Jsoup WhiteList to allow comments

1 Answers

This is not possible with the standard jsoup classes, and it does not depend on the Whitelist; the limitation is in org.jsoup.safety.Cleaner. The cleaner uses a node traverser that allows only elements and text nodes. Also, only the body is processed, so the head and doctype are ignored completely. To achieve this you'll have to create a custom cleaner. For example, if you have HTML like this:

<!DOCTYPE html>
<html>
    <head>
    <!-- This is a script -->
    <script type="text/javascript">
        function newFun() {
            alert(1);
        }
    </script>
    </head>
    <body>
        <map name="diagram_map">
            <area id="area1" />
            <area id="area2" />
        </map>
        <!-- This is another comment. -->
        <div>Test</div>
    </body>
</html>

First, create a custom cleaner by copying the original one. Please note that the class must be in the package org.jsoup.safety, because the cleaner uses some of Whitelist's protected methods. Also, there is no point in extending Cleaner, as almost all of its methods are private and the inner node traverser is final.

package org.jsoup.safety;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

public class CustomCleaner {
    private Whitelist whitelist;

    public CustomCleaner(Whitelist whitelist) {
    Validate.notNull(whitelist);
    this.whitelist = whitelist;
    }

    public Document clean(Document dirtyDocument) {
    Validate.notNull(dirtyDocument);

    Document clean = Document.createShell(dirtyDocument.baseUri());
    copyDocType(dirtyDocument, clean);
    if (dirtyDocument.head() != null) 
        copySafeNodes(dirtyDocument.head(), clean.head());
    if (dirtyDocument.body() != null) // frameset documents won't have a body. the clean doc will have empty body.
        copySafeNodes(dirtyDocument.body(), clean.body());

    return clean;
    }

    private void copyDocType(Document dirtyDocument, Document clean) {
    dirtyDocument.traverse(new NodeVisitor() {
        public void head(Node node, int depth) {
        if (node instanceof DocumentType) {
            clean.prependChild(node);
        }
        }
        public void tail(Node node, int depth) { }
    });
    }

    public boolean isValid(Document dirtyDocument) {
    Validate.notNull(dirtyDocument);

    Document clean = Document.createShell(dirtyDocument.baseUri());
    int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
    return numDiscarded == 0;
    }

    private final class CleaningVisitor implements NodeVisitor {
    private int numDiscarded = 0;
    private final Element root;
    private Element destination; // current element to append nodes to

    private CleaningVisitor(Element root, Element destination) {
        this.root = root;
        this.destination = destination;
    }

    public void head(Node source, int depth) {
        if (source instanceof Element) {
        Element sourceEl = (Element) source;

        if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
            ElementMeta meta = createSafeElement(sourceEl);
            Element destChild = meta.el;
            destination.appendChild(destChild);

            numDiscarded += meta.numAttribsDiscarded;
            destination = destChild;
        } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
            numDiscarded++;
        }
        } else if (source instanceof TextNode) {
        TextNode sourceText = (TextNode) source;
        TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());
        destination.appendChild(destText);
        } else if (source instanceof Comment) {
        Comment sourceComment = (Comment) source;
        Comment destComment = new Comment(sourceComment.getData(), source.baseUri());
        destination.appendChild(destComment);
        } else if (source instanceof DataNode) {
        DataNode sourceData = (DataNode) source;
        DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri());
        destination.appendChild(destData);
        } else { // else, we don't care about comments, xml proc instructions, etc
        numDiscarded++;
        }
    }

    public void tail(Node source, int depth) {
        if (source instanceof Element && whitelist.isSafeTag(source.nodeName())) {
        destination = destination.parent(); // would have descended, so pop destination stack
        }
    }
    }

    private int copySafeNodes(Element source, Element dest) {
    CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
    NodeTraversor traversor = new NodeTraversor(cleaningVisitor);
    traversor.traverse(source);
    return cleaningVisitor.numDiscarded;
    }

    private ElementMeta createSafeElement(Element sourceEl) {
    String sourceTag = sourceEl.tagName();
    Attributes destAttrs = new Attributes();
    Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
    int numDiscarded = 0;

    Attributes sourceAttrs = sourceEl.attributes();
    for (Attribute sourceAttr : sourceAttrs) {
        if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
        destAttrs.put(sourceAttr);
        else
        numDiscarded++;
    }
    Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
    destAttrs.addAll(enforcedAttrs);

    return new ElementMeta(dest, numDiscarded);
    }

    private static class ElementMeta {
    Element el;
    int numAttribsDiscarded;

    ElementMeta(Element el, int numAttribsDiscarded) {
        this.el = el;
        this.numAttribsDiscarded = numAttribsDiscarded;
    }
    }

}

Once you have both, you can do the cleaning as normal, like this:

import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.CustomCleaner;
import org.jsoup.safety.Whitelist;

/**
 * Small command-line demo: parses an HTML file, runs it through {@link CustomCleaner} with the
 * relaxed whitelist extended by {@code script}, and prints the cleaned document to standard
 * output.
 */
public class CustomJsoupSanitizer {

    /** File that is parsed when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "t2.html";

    /**
     * Runs the demo.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to clean. When absent,
     *             {@code t2.html} in the working directory is used.
     */
    public static void main(String[] args) {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        try {
            Document doc = Jsoup.parse(new File(inputPath), "UTF-8");
            CustomCleaner cleaner = new CustomCleaner(Whitelist.relaxed().addTags("script"));
            Document doc2 = cleaner.clean(doc);
            System.out.println(doc2.html());
        } catch (IOException e) {
            // Name the file so the failure is actionable, then keep the full trace for debugging.
            System.err.println("Could not read " + inputPath + ": " + e.getMessage());
            e.printStackTrace();
        }

    }

}

This will give you the following sanitized output for the above HTML:

<!DOCTYPE html>
<html>
 <head> 
  <!-- This is a script --> 
  <script>
        function newFun() {
            alert(1);
        }
  </script> 
 </head>
 <body>     
  <!-- This is another comment. --> 
  <div>
   Test
  </div>   
 </body>
</html>

You can customize the cleaner to match your requirements, e.g. to skip the head node or the script tag.

191

answered Oct 13 '22 00:10

Syam S

Related questions
                            
                                SSL mutual authentication FAIL on Android Client accepts servers certificate but server does not get the client cert
                            
                                Native Library bluecove_arm not available
                            
                                SDK name of PluginsSDK not set?
                            
                                MediaPlayer seekTo() doesn't update SurfaceView
                            
                                Example 8.1.2-1 Of Java Language Specification(Mutually Recursive Type Variable Bounds)
                            
                                how to create gwt gae with app engine modules using google eclipse plugin
                            
                                Why does Glass/GDK based VoiceListener only catch VoiceCommand once in XE16?
                            
                                variable has private access
                            
                                Can I configure HTMLUnit to only run specific javascript processes and not the whole thing?
                            
                                Edit Line Numbers in Eclipse
                            
                                Java: How to read file from different module?
                            
                                openjdk-1.7.0_55 on ubuntu: Could not load the property file 'output_xml.properties' for output method 'xml'
                            
                                Mocking Couchbase
                            
                                JPA @Version overflow
                            
                                Hibernate Java rearranging parentheses in SQL
                            
                                How to create custom URL in Struts 2? Like www.twitter.com/goodyzain
                            
                                JScrollPane - visual glitch when scrolling
                            
                                Memory Management Recursion Java
                            
                                Guarding the initialization of a non-volatile field with a lock?
                            
                                Where are the source files for forge?

Donate For Us

If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!

Donate Us With

Jsoup WhiteList to allow comments

Tags:

java

jsoup

Genry

People also ask

1 Answers

Syam S

Recent Activity

Donate For Us