Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

how to extract main text from html using Tika

I just want to know how I can extract the main text and plain text from HTML using Tika.

Maybe one possible solution is to use BoilerpipeContentHandler, but do you have some sample/demo code to show it?

thanks very much in advance

like image 343
user2651995 Avatar asked May 14 '14 11:05

user2651995


2 Answers

The BodyContentHandler class doesn't use the Boilerpipe code, so you'll have to explicitly wrap it in a BoilerpipeContentHandler. The following code worked for me:

/**
 * Parses {@code test.html} with Tika's auto-detecting parser, routing the SAX
 * events through a {@link BoilerpipeContentHandler} before they reach the
 * {@link BodyContentHandler} that collects the text.
 *
 * @return a three-element array: index 0 is {@code "Title: "} followed by the
 *         title metadata value, index 1 is {@code "Body: "} followed by the
 *         collected text, and index 2 is never assigned. If opening or parsing
 *         the file fails, the stack trace is printed and the entries that were
 *         not yet assigned remain {@code null}.
 */
public String[] tika_autoParser() {
    String[] result = new String[3];
    // try-with-resources: the parser is not relied on to close the stream, so
    // close it here on every path (success or exception) to avoid leaking it.
    try (InputStream input = new FileInputStream(new File("test.html"))) {
        ContentHandler textHandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        AutoDetectParser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        // BodyContentHandler alone does not apply Boilerpipe, so wrap it explicitly.
        parser.parse(input, new BoilerpipeContentHandler(textHandler), metadata, context);
        // TITLE is a static constant; reference it through the class, not the instance.
        result[0] = "Title: " + metadata.get(Metadata.TITLE);
        result[1] = "Body: " + textHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        // FileNotFoundException is an IOException, so a missing file lands here too.
        e.printStackTrace();
    }

    return result;
}
like image 148
peatb Avatar answered Jan 02 '23 12:01

peatb


Here is a sample:

/**
 * Parses the file at a fixed path with Tika's auto-detecting parser and
 * collects its text through a plain {@link BodyContentHandler}.
 *
 * @return a three-element array: index 0 is {@code "Title: "} followed by the
 *         title metadata value, index 1 is {@code "Body: "} followed by the
 *         collected text, and index 2 is never assigned. If opening or parsing
 *         the file fails, the stack trace is printed and the entries that were
 *         not yet assigned remain {@code null}.
 */
public String[] tika_autoParser() {
    String[] result = new String[3];
    // try-with-resources: the parser is not relied on to close the stream, so
    // close it here on every path (success or exception) to avoid leaking it.
    try (InputStream input = new FileInputStream(new File("/Users/nazanin/Books/Web crawler.pdf"))) {
        ContentHandler textHandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        AutoDetectParser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        // TITLE is a static constant; reference it through the class, not the instance.
        result[0] = "Title: " + metadata.get(Metadata.TITLE);
        result[1] = "Body: " + textHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        // FileNotFoundException is an IOException, so a missing file lands here too.
        e.printStackTrace();
    }

    return result;
}
like image 40
UserNeD Avatar answered Jan 02 '23 11:01

UserNeD