Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Lucene indexing of HTML files

Dear users, I am working with Apache Lucene for indexing and searching. I have to index HTML files stored on the local disk of a computer, indexing both the file name and the contents of each HTML file. I am able to store the file names in the Lucene index, but not the HTML file contents; the index should cover not only the text but the entire page, including images, links and URLs. How can I access the contents of those indexed files? For indexing I am using the following code:

    File indexDir = new File(indexpath);
    File dataDir = new File(datapath);
    // Only files whose name ends with this suffix are indexed. Note that a name
    // such as "page.html" does not end with ".htm" and is therefore skipped.
    String suffix = ".htm";
    // Third argument (create) is true: a new index is created in indexDir,
    // replacing any index that already exists there.
    IndexWriter indexWriter = new IndexWriter(
            FSDirectory.open(indexDir),
            new SimpleAnalyzer(),
            true,
            IndexWriter.MaxFieldLength.LIMITED);
    try {
        indexWriter.setUseCompoundFile(false);
        indexDirectory(indexWriter, dataDir, suffix);

        numIndexed = indexWriter.maxDoc();
        indexWriter.optimize();
    } finally {
        // Close on every path so the writer (and its index lock) is released
        // even when counting or optimizing throws.
        indexWriter.close();
    }


/**
 * Recursively walks {@code dataDir}: sub-directories are descended into and
 * every other entry is passed to {@code indexFileWithIndexWriter}.
 *
 * <p>Failures are reported on standard output and do not abort the caller;
 * this best-effort behavior is kept from the original code.
 *
 * @param indexWriter writer that receives the documents
 * @param dataDir     directory to scan
 * @param suffix      file-name suffix forwarded to the per-file method
 * @throws IOException declared for callers; failures inside the walk are
 *                     caught and printed rather than propagated
 */
private void indexDirectory(IndexWriter indexWriter, File dataDir, String suffix) throws IOException {
    try {
        // File.listFiles() returns null (not an empty array) when the path is
        // not a directory or cannot be read, so check before iterating.
        File[] entries = dataDir.listFiles();
        if (entries == null) {
            System.out.println("cannot list directory " + dataDir);
            return;
        }
        for (File f : entries) {
            if (f.isDirectory()) {
                indexDirectory(indexWriter, f, suffix);
            } else {
                indexFileWithIndexWriter(indexWriter, f, suffix);
            }
        }
    } catch (Exception ex) {
        System.out.println("exception 2 is" + ex);
    }
}

/**
 * Adds one file to the index as a document with two fields:
 * "contents" (built from a reader over the file) and "filename"
 * (the file's name, stored and analyzed).
 *
 * <p>Hidden, unreadable, missing or directory entries are skipped, as are
 * files whose name does not end with {@code suffix} when a suffix is given.
 * Failures are printed to standard output and not propagated.
 *
 * @param indexWriter writer that receives the document
 * @param f           file to index
 * @param suffix      required file-name suffix, or {@code null} to accept any name
 * @throws IOException declared for callers; failures are caught and printed
 */
private void indexFileWithIndexWriter(IndexWriter indexWriter, File f,
    String suffix) throws IOException {
    try {
        if (f.isHidden() || f.isDirectory() || !f.canRead() || !f.exists()) {
            return;
        }
        if (suffix != null && !f.getName().endsWith(suffix)) {
            return;
        }
        Document doc = new Document();
        // Reader-based field: the text is indexed for searching but is not
        // stored, so it cannot be read back from the index afterwards.
        doc.add(new Field("contents", new FileReader(f)));
        // java.io.File has no getFileName(); getName() returns the last path segment.
        doc.add(new Field("filename", f.getName(),
                Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(doc);
    } catch (Exception ex) {
        System.out.println("exception 4 is" + ex);
    }
}

Thanks in advance.

like image 574
adesh Avatar asked Dec 15 '22 19:12

adesh


1 Answer

This line of code is the reason why your contents are not being stored:

doc.add(new Field("contents", new FileReader(f)));

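This constructor tokenizes and indexes the text read from the Reader, but it DOES NOT STORE that text, so the contents cannot be retrieved from the index afterwards.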

If you are trying to index HTML files, try using JTidy. It will make the process much easier.

Sample code:

/**
 * Turns an HTML stream into a Lucene document: the markup is parsed into a DOM
 * with JTidy and the text found under the {@code <body>} element is indexed.
 */
public class JTidyHTMLHandler {

    /**
     * Parses the HTML read from {@code is} and builds a Lucene document from it.
     * A "contents" field is added only when the extracted body text is non-empty.
     *
     * @param is stream supplying the HTML markup
     * @return a new Lucene document, possibly without any field
     * @throws DocumentHandlerException declared for callers of this handler
     */
    public org.apache.lucene.document.Document getDocument(InputStream is) throws DocumentHandlerException {
        Tidy tidy = new Tidy();
        // Silence JTidy's diagnostic output while parsing.
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        org.w3c.dom.Document root = tidy.parseDOM(is, null);
        Element rawDoc = root.getDocumentElement();

        org.apache.lucene.document.Document doc =
                new org.apache.lucene.document.Document();

        String body = getBody(rawDoc);

        if (body != null && !body.isEmpty()) {
            // Field.Store.NO: the text is analyzed and searchable but is not kept
            // in the index, so it cannot be read back from a search hit. Use
            // Field.Store.YES instead if the text must be retrievable.
            doc.add(new Field("contents", body, Field.Store.NO, Field.Index.ANALYZED));
        }

        return doc;
    }

    /**
     * Returns the text of the first {@code <title>} element under {@code rawDoc}.
     *
     * @param rawDoc root element to search; may be {@code null}
     * @return {@code null} when {@code rawDoc} is {@code null}; otherwise the title
     *         text, or an empty string when there is no title or its first child
     *         is not a text node
     */
    protected String getTitle(Element rawDoc) {
        if (rawDoc == null) {
            return null;
        }

        String title = "";

        NodeList children = rawDoc.getElementsByTagName("title");
        if (children.getLength() > 0) {
            Node first = children.item(0).getFirstChild();
            // Check the node kind instead of casting blindly: a title whose first
            // child is not text would otherwise fail with ClassCastException.
            if (first instanceof Text) {
                title = ((Text) first).getData();
            }
        }
        return title;
    }

    /**
     * Returns the concatenated text of the first {@code <body>} element under
     * {@code rawDoc}.
     *
     * @param rawDoc root element to search; may be {@code null}
     * @return {@code null} when {@code rawDoc} is {@code null}; otherwise the body
     *         text, or an empty string when there is no body element
     */
    protected String getBody(Element rawDoc) {
        if (rawDoc == null) {
            return null;
        }

        String body = "";
        NodeList children = rawDoc.getElementsByTagName("body");
        if (children.getLength() > 0) {
            body = getText(children.item(0));
        }
        return body;
    }

    /**
     * Collects the text beneath {@code node} in document order. Text of a child
     * element is gathered recursively and followed by a single space; data of a
     * child text node is appended as is. Other node kinds are ignored.
     *
     * @param node node whose children are scanned
     * @return the accumulated text, empty when there is none
     */
    protected String getText(Node node) {
        NodeList children = node.getChildNodes();
        // Local, single-threaded accumulator: StringBuilder avoids the
        // synchronization overhead of StringBuffer.
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            switch (child.getNodeType()) {
                case Node.ELEMENT_NODE:
                    sb.append(getText(child));
                    sb.append(" ");
                    break;
                case Node.TEXT_NODE:
                    sb.append(((Text) child).getData());
                    break;
                default:
                    // Comments, processing instructions, etc. contribute no text.
                    break;
            }
        }
        return sb.toString();
    }
}

To get an InputStream from a URL:

// Build a URL from the page address, open a connection to it, and take the
// connection's input stream; that stream can be passed to getDocument(InputStream).
// NOTE(review): the caller presumably closes the stream after parsing — confirm.
URL url = new URL(htmlURLlocation);
URLConnection connection = url.openConnection();
InputStream stream = connection.getInputStream();

To get an InputStream from a File:

// Open the HTML file named by htmlFile as a byte stream; the stream can be
// passed to getDocument(InputStream).
// NOTE(review): the caller presumably closes the stream after parsing — confirm.
InputStream stream = new FileInputStream(new File (htmlFile));
like image 121
Lai Xin Chu Avatar answered Jan 02 '23 07:01

Lai Xin Chu