 

Why does Lucene.NET cause OutOfMemoryException when indexing large files?

I have added the code shown below for the IndexWriter.

I have set

writer.SetRAMBufferSizeMB(32);
writer.MergeFactor = 1000;
writer.SetMaxFieldLength(Int32.MaxValue);
writer.UseCompoundFile = false;

all of these properties to avoid an OutOfMemoryException (OOM exception).

In this code, the line writer.AddDocument(document); throws the OOM exception.

Can you tell me why I am getting this error?
Can anyone help me solve it?

My machine's configuration:
System type : 64-bit operating system.
RAM : 4 GB (3.86 GB usable)
Processor : Intel i5 - 3230M CPU @ 2.60GHz

using System;
using System.Data.SqlClient;
using Lucene.Net.Documents;
using System.Data;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.QueryParsers;

namespace ConsoleApplication1
{
    class Program
    {
        static String searchTerm = "";
        static void Main(string[] args) {
            /**
             * This will create dataset according to
             * connectingString and query
             **/
            Console.WriteLine("Connecting to Sql database server.");
            String connectionString = "Data Source=proxy-pc;Initial Catalog=Snomed;User ID=SA;password=admin";
            String query = "SELECT * FROM DESCRIPTION";
            String INDEX_DIRECTORY = "c:\\DatabaseIndex";

            Console.WriteLine("Creating dataset.");
            DataSet dataSet = createDataset(connectionString, query);
            Console.WriteLine("Created dataset successfully.");

            Console.WriteLine("Creating document.");
            Document document = createDocument(dataSet);
            Console.WriteLine("Created document successfully.");

            var version = Lucene.Net.Util.Version.LUCENE_30;
            var length = Lucene.Net.Index.IndexWriter.MaxFieldLength.LIMITED;
            Lucene.Net.Analysis.Standard.StandardAnalyzer analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(version);
            Lucene.Net.Store.Directory directory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(@INDEX_DIRECTORY));
            Lucene.Net.Index.IndexWriter writer = new Lucene.Net.Index.IndexWriter(directory, analyzer, length);
            writer.SetMergeScheduler(new Lucene.Net.Index.SerialMergeScheduler());
            writer.SetRAMBufferSizeMB(32);
            writer.MergeFactor = 1000;
            writer.SetMaxFieldLength(Int32.MaxValue);
            writer.UseCompoundFile = false;
            Console.WriteLine("Before Adding document");
            writer.AddDocument(document); // <-- OutOfMemoryException is thrown here
            Console.WriteLine("Indexing...");
            writer.Optimize();
            writer.Dispose();
            Console.WriteLine("Indexing finished");

            if (searchTerm == "")
            {
                searchTerm = "(keyword)";
            }

            Console.WriteLine("Searching '" + searchTerm + "'...");

            var occurance = searchKeyword(INDEX_DIRECTORY, version, searchTerm);

            if (occurance != -1)
            {
                Console.WriteLine("Your search found : " + occurance);
            }
            else
            {
                Console.WriteLine("Invalid index directory.");
            }

            Console.Read();
        }

        /**
         *   Method works as a searcher
        **/
        private static int searchKeyword(String index_Directory_Path, Lucene.Net.Util.Version version, String searchWord) {
            if (index_Directory_Path != null)
            {
                var standAnalyzer = new StandardAnalyzer(version);
                IndexSearcher searcher = new IndexSearcher(FSDirectory.Open(index_Directory_Path));

                // parse the query, "term" is the default field to search
                var parser = new QueryParser(version, "term", standAnalyzer);
                Query searchQuery = parser.Parse(searchWord);

                // search
                TopDocs hits = searcher.Search(searchQuery, 100);
                var total = hits.TotalHits;
                return total;
            }

            else
            {
                return -1;
            }
        }

        static DataSet createDataset(String connectionString, String query) {
            DataSet ds = new DataSet();

            using (SqlConnection connection = new SqlConnection(connectionString))
            using (SqlCommand command = new SqlCommand(query, connection))
            using (SqlDataAdapter adapter = new SqlDataAdapter(command))
            {
                adapter.Fill(ds);
            }

            return ds;
        }

        static Lucene.Net.Documents.Document createDocument(DataSet dataSet) {
            Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
            using (dataSet)
            {
                foreach (DataTable table in dataSet.Tables)
                {
                    foreach (DataRow row in table.Rows)
                    {
                        String id = row["id"].ToString();
                        String rTime = row["rTime"].ToString();
                        String active = row["active"].ToString();
                        String mId = row["mId"].ToString();
                        String cId = row["cId"].ToString();
                        String lCode = row["lCode"].ToString();
                        String tId = row["tId"].ToString();
                        String detail = row["detail"].ToString();
                        String sId = row["sId"].ToString();

                        Field idField = new Field("id", id, Field.Store.YES, Field.Index.ANALYZED);
                        Field rTimeField = new Field("rTime", rTime, Field.Store.YES, Field.Index.ANALYZED);
                        Field activeField = new Field("active", active, Field.Store.YES, Field.Index.ANALYZED);
                        Field mIdField = new Field("mId", mId, Field.Store.YES, Field.Index.ANALYZED);
                        Field cIdField = new Field("cId", cId, Field.Store.YES, Field.Index.ANALYZED);
                        Field lCodeField = new Field("lCode", lCode, Field.Store.YES, Field.Index.ANALYZED);
                        Field tIdField = new Field("tId", tId, Field.Store.YES, Field.Index.ANALYZED);
                        Field detailField = new Field("detail", detail, Field.Store.YES, Field.Index.ANALYZED);
                        Field sIdField = new Field("sId", sId, Field.Store.YES, Field.Index.ANALYZED);

                        doc.Add(idField);
                        doc.Add(rTimeField);
                        doc.Add(activeField);
                        doc.Add(mIdField);
                        doc.Add(cIdField);
                        doc.Add(lCodeField);
                        doc.Add(tIdField);
                        doc.Add(detailField);
                        doc.Add(sIdField);
                    }
                }
            }

            return doc;
        }
    }
}
Asked Jan 27 '14 by Sanket Thakkar


1 Answer

It looks like you're adding the entire database as a single document.

Have you tried adding each row as a separate document? You could perhaps change "createDocument" to "createDocuments" and yield a single Lucene.Net document per row. That'd leave most of your current code unchanged...
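For illustration, here is a minimal sketch of that idea against the Lucene.Net 3.0.3 API already used in the question. The createDocuments iterator and the loop below are just one way you might wire it up (it also needs using System.Collections.Generic; for IEnumerable); treat it as a sketch rather than a drop-in patch tested against your database:

static IEnumerable<Document> createDocuments(DataSet dataSet)
{
    using (dataSet)
    {
        foreach (DataTable table in dataSet.Tables)
        {
            foreach (DataRow row in table.Rows)
            {
                // One small document per row instead of one document holding every row.
                Document doc = new Document();
                doc.Add(new Field("id", row["id"].ToString(), Field.Store.YES, Field.Index.ANALYZED));
                doc.Add(new Field("detail", row["detail"].ToString(), Field.Store.YES, Field.Index.ANALYZED));
                // ...add the remaining columns (rTime, active, mId, cId, lCode, tId, sId) the same way...
                yield return doc;
            }
        }
    }
}

Then, in Main, replace the single AddDocument call with a loop:

foreach (Document doc in createDocuments(dataSet))
{
    writer.AddDocument(doc); // each call now indexes one row's worth of fields
}

That way the writer can flush segments to disk as it goes (bounded by the 32 MB RAM buffer you already configured), instead of having to hold one enormous document in memory.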

Hope this helps,

Answered by Adrian Conlon