Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Lucene.net: Querying and using a filter to limit results

As usual, I turn to the massive brain power that is the Stackoverflow user base to help solve a Lucene.NET problem I am battling with. First off, I am a complete noob when it comes to Lucene and Lucene.NET, and by using the scattered tutorials and code snippets online, I have cobbled together the following solution for my scenario.

The Scenario

I have an index of the following structure:

---------------------------------------------------------
| id  |    date    | security |           text          |
---------------------------------------------------------
|  1  | 2011-01-01 | -1-12-4- | some analyzed text here |
---------------------------------------------------------
|  2  | 2011-01-01 |  -11-3-  | some analyzed text here |
---------------------------------------------------------
|  3  | 2011-01-01 |    -1-   | some analyzed text here |
---------------------------------------------------------

I need to be able to query the text field, but restrict the results to documents that users with specific role IDs are allowed to see.

What I came up with to accomplish this (after many, many trips to Google) is to use a "security field" and a Lucene filter to restrict the result set as outlined below:

/// <summary>
/// Filter that admits only documents whose stored "security" field contains
/// a given delimited role token (for example "-1-").
/// </summary>
/// <remarks>
/// The stored fields of every live document are loaded to build the bit set,
/// so the cost of each call grows with the size of the index.
/// </remarks>
class SecurityFilter : Lucene.Net.Search.Filter
{
    // Delimited role token searched for inside the "security" field.
    private readonly string securityToken;

    /// <summary>Creates a filter for the role token "-1-".</summary>
    public SecurityFilter() : this("-1-")
    {
    }

    /// <summary>Creates a filter for an arbitrary role token.</summary>
    /// <param name="securityToken">Delimited role id to look for, e.g. "-12-".</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="securityToken"/> is null.
    /// </exception>
    public SecurityFilter(string securityToken)
    {
        if (securityToken == null)
        {
            throw new System.ArgumentNullException("securityToken");
        }

        this.securityToken = securityToken;
    }

    /// <summary>
    /// Builds a bit per document slot; a bit is set when that document's
    /// "security" field contains the role token.
    /// </summary>
    /// <param name="indexReader">Reader over the index being searched.</param>
    /// <returns>A bit array of length <c>indexReader.MaxDoc()</c>.</returns>
    public override System.Collections.BitArray Bits(Lucene.Net.Index.IndexReader indexReader)
    {
        System.Collections.BitArray bitarray = new System.Collections.BitArray(indexReader.MaxDoc());

        for (int i = 0; i < bitarray.Length; i++)
        {
            // MaxDoc() also counts deleted slots, and Document() fails on them.
            if (indexReader.IsDeleted(i))
            {
                continue;
            }

            // Get() yields null when the document has no "security" field.
            string security = indexReader.Document(i).Get("security");
            if (security != null && security.Contains(securityToken))
            {
                bitarray.Set(i, true);
            }
        }

        return bitarray;
    }
}

... and then ...

// Analyzer shared by the query parser for the analyzed "text" field.
var analyzer = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);

// Open the index directory; the second argument is passed as true (read-only searcher).
var searcher = new Lucene.Net.Search.IndexSearcher(Lucene.Net.Store.FSDirectory.Open(indexDirectory), true);

// Turn the search phrase into a query against "text".
var parser = new Lucene.Net.QueryParsers.QueryParser(Lucene.Net.Util.Version.LUCENE_29, "text", analyzer);
var query = parser.Parse("some search phrase");

// Restrict by role and order by the "date" field (second SortField argument is true).
var filter = new SecurityFilter();
var sort = new Lucene.Net.Search.Sort(new Lucene.Net.Search.SortField("date", true));
var hits = searcher.Search(query, filter, sort);

This works as expected and returns only the documents with ids 1 and 3. The problem is that on large indexes this process becomes very slow.

Finally, my question... Does anyone out there have any tips on how to speed it up, or have an alternate solution that would be more efficient than the one I have presented here?

like image 923
nokturnal Avatar asked Sep 30 '11 15:09

nokturnal


2 Answers

If you index your security field as analyzed (such that it splits your security string as 1 12 4 ...)

you can create a filter like this

// The field name must match the indexed field exactly ("security", no trailing space),
// otherwise the term never matches any document.
Filter filter = new QueryFilter(new TermQuery(new Term("security", "1")));

or

form a query like some text +security:1

like image 121
L.B Avatar answered Oct 13 '22 10:10

L.B


I changed my answer to a simple example that explains what I meant in my previous answer.

I wrote this quickly and it doesn't follow best practices, but it should give you the idea.

Note that the security field will need to be tokenized so that each ID in it is a separate token, using the WhitespaceAnalyzer for example.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Search;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Analysis.Standard;
using System.IO;

namespace ConsoleApplication1
{
    class Program
    {
        /// <summary>
        /// Static lookup from role name to the filter previously built for that role.
        /// </summary>
        /// <remarks>No locking is performed around the underlying dictionary.</remarks>
        public class RoleFilterCache
        {
            /// <summary>Backing store, keyed by role name.</summary>
            public static Dictionary<string, Filter> Cache = new Dictionary<string, Filter>();

            /// <summary>Looks up the filter stored for a role.</summary>
            /// <param name="role">Role name; null is treated as a cache miss.</param>
            /// <returns>The stored filter, or null when nothing is stored for the role.</returns>
            public static Filter Get(string role)
            {
                // Dictionary rejects null keys with ArgumentNullException; report a
                // miss instead so Get agrees with Put, which ignores null roles.
                if (role == null)
                {
                    return null;
                }

                Filter cached;
                return Cache.TryGetValue(role, out cached) ? cached : null;
            }

            /// <summary>Stores (or replaces) the filter for a role.</summary>
            /// <param name="role">Role name; a null role is ignored.</param>
            /// <param name="filter">Filter to associate with the role.</param>
            public static void Put(string role, Filter filter)
            {
                if (role == null)
                {
                    return;
                }

                Cache[role] = filter;
            }
        }

        /// <summary>A user together with the names of the roles they hold.</summary>
        public class User
        {
            /// <summary>Display name of the user.</summary>
            public string Username;

            /// <summary>
            /// Role names held by the user. Starts out empty rather than null so that
            /// code enumerating the roles of a freshly constructed user does not fail.
            /// </summary>
            public List<string> Roles = new List<string>();
        }

        /// <summary>
        /// Combines the per-role filters of every role in <c>u.Roles</c> into a single
        /// boolean filter, adding each one as a SHOULD clause.
        /// </summary>
        /// <param name="u">User whose roles determine the filter.</param>
        /// <returns>The combined filter.</returns>
        public static Filter GetFilterForUser(User u)
        {
            BooleanFilter combined = new BooleanFilter();

            foreach (string role in u.Roles)
            {
                // Per-role filters come from GetFilterForRole.
                Filter perRole = GetFilterForRole(role);
                BooleanFilterClause clause = new BooleanFilterClause(perRole, BooleanClause.Occur.SHOULD);
                combined.Add(clause);
            }

            return combined;
        }

        /// <summary>
        /// Returns the filter for a single role, building it and storing it in
        /// <c>RoleFilterCache</c> the first time the role is requested.
        /// </summary>
        /// <param name="role">Role name, matched as a term of the "security" field.</param>
        /// <returns>The cached or newly built filter.</returns>
        public static Filter GetFilterForRole(string role)
        {
            Filter existing = RoleFilterCache.Get(role);
            if (existing != null)
            {
                return existing;
            }

            // Built from an index term query rather than by iterating stored
            // document content, which is much faster.
            Term roleTerm = new Term("security", role);
            Filter termFilter = new QueryWrapperFilter(new TermQuery(roleTerm));

            // The caching wrapper makes it cache the BitSet per segment reader.
            Filter cachingFilter = new CachingWrapperFilter(termFilter);

            RoleFilterCache.Put(role, cachingFilter);
            return cachingFilter;
        }


        // Maximum number of hits requested from each search.
        private const int MaxResults = 25;

        /// <summary>
        /// Demo entry point: builds a four-document index, then runs the same phrase
        /// query for three users and prints the documents each user's roles allow.
        /// </summary>
        /// <param name="args">Unused.</param>
        static void Main(string[] args)
        {
            const string indexPath = "C:\\example\\";

            IndexWriter iw = new IndexWriter(new FileInfo(indexPath), new StandardAnalyzer(), true);
            try
            {
                AddExampleDocument(iw, "Only one can see.", "1");
                AddExampleDocument(iw, "One and two can see.", "1 2");
                AddExampleDocument(iw, "One and two can see.", "1 2");
                AddExampleDocument(iw, "Only two can see.", "2");
            }
            finally
            {
                // Close the writer even if indexing fails, so it is not left open.
                iw.Close();
            }

            User userone = new User()
            {
                Username = "User one",
                Roles = new List<string> { "1" }
            };
            User usertwo = new User()
            {
                Username = "User two",
                Roles = new List<string> { "2" }
            };
            User userthree = new User()
            {
                Username = "User three",
                Roles = new List<string> { "1", "2" }
            };

            PhraseQuery phraseQuery = new PhraseQuery();
            phraseQuery.Add(new Term("content", "can"));
            phraseQuery.Add(new Term("content", "see"));

            IndexSearcher searcher = new IndexSearcher(indexPath, true);
            try
            {
                PrintResultsForUser(searcher, phraseQuery, userone, "User One Results:");
                PrintResultsForUser(searcher, phraseQuery, usertwo, "User two Results:");
                PrintResultsForUser(searcher, phraseQuery, userthree, "User three Results (should see everything):");
            }
            finally
            {
                // Close the searcher once all three searches are done.
                searcher.Close();
            }

            Console.ReadKey();
        }

        // Adds one document with a stored "content" field and an unstored "security" field.
        private static void AddExampleDocument(IndexWriter writer, string content, string security)
        {
            Document doc = new Document();
            doc.Add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("security", security, Field.Store.NO, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }

        // Runs the query under the user's security filter and prints the heading plus each hit's "content".
        private static void PrintResultsForUser(IndexSearcher searcher, Query query, User user, string heading)
        {
            Filter securityFilter = GetFilterForUser(user);
            TopDocs results = searcher.Search(query, securityFilter, MaxResults);

            Console.WriteLine(heading);
            foreach (var aResult in results.ScoreDocs)
            {
                Console.WriteLine(
                    searcher.Doc(aResult.doc).
                    Get("content")
                );
            }
            Console.WriteLine("\n\n");
        }
    }
}
like image 43
Jf Beaulac Avatar answered Oct 13 '22 09:10

Jf Beaulac