Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Lucene Returning Documents with non positive score

We have recently upgraded a CMS we work on and had to move from Lucene.net V2.3.1.301 to V2.9.4.1

We used a CustomScoreQuery in our original solution, which did various filtering that couldn't be achieved with the built-in queries (geo, multiple date ranges, etc.).

Since moving from the old version to the new version of Lucene, it has started returning documents even though they have a zero or even negative score when we inspect the results.

enter image description here Below is a sample of the refactored code to demonstrate the issue:

    /// <summary>
    /// Opens the site-search index, runs the event date query for
    /// 23-25 November 2015 and writes the number of hits to the console,
    /// then waits for a key press.
    /// </summary>
    public LuceneTest()
    {
        Lucene.Net.Store.Directory luceneIndexDirectory = FSDirectory.Open(new System.IO.DirectoryInfo(@"C:\inetpub\wwwroot\Project\build\Data\indexes\all_site_search_en"));
        try
        {
            IndexSearcher searcher = new IndexSearcher(luceneIndexDirectory, true);
            try
            {
                // Static, process-wide setting; kept from the original sample.
                BooleanQuery.SetMaxClauseCount(10000);

                // The search is executed on the date query directly. The analyzer,
                // query parser and wrapping BooleanQuery that the original sample
                // constructed were never passed to Search(), so they are omitted.
                Query dateQuery = ComposeEventDateQuery(new DateTime(2015, 11, 23), new DateTime(2015, 11, 25), searcher);

                TopDocs hitsFound = searcher.Search(dateQuery, 1000);
                System.Console.WriteLine(String.Format("Found {0} matches with the date filters", hitsFound.TotalHits));
                System.Console.ReadKey();
            }
            finally
            {
                // Release the searcher even when the search throws.
                searcher.Close();
            }
        }
        finally
        {
            // Release the directory handle regardless of the outcome above.
            luceneIndexDirectory.Close();
        }
    }



    /// <summary>
    /// Builds a <see cref="BooleanQuery"/> containing a single required
    /// <see cref="EventDateQuery1"/> clause. The clause wraps a term query on
    /// <c>_language</c> = <c>en</c> and is created with
    /// <c>shouldMatchNonEvents</c> set to <c>false</c>.
    /// </summary>
    /// <param name="fromDate">Start of the date window handed to the event date query.</param>
    /// <param name="ToDate">End of the date window handed to the event date query.</param>
    /// <param name="MySearcher">Searcher handed to the event date query.</param>
    /// <returns>The boolean query with the event date clause added as MUST.</returns>
    public static Query ComposeEventDateQuery(DateTime fromDate, DateTime ToDate, IndexSearcher MySearcher)
    {
        Lucene.Net.Index.Term languageTerm = new Lucene.Net.Index.Term("_language", "en");
        Query languageQuery = new TermQuery(languageTerm);
        Query dateWindowQuery = new EventDateQuery1(languageQuery, MySearcher, fromDate, ToDate, false);

        BooleanQuery wrapper = new BooleanQuery();
        wrapper.Add(dateWindowQuery, BooleanClause.Occur.MUST);
        return wrapper;
    }


    /// <summary>
    /// A <see cref="CustomScoreQuery"/> over a sub-query whose custom scores are
    /// supplied by an <see cref="EventDateQueryCustomScoreProvider"/> configured
    /// with this query's from/to dates and non-event flag.
    /// </summary>
    public class EventDateQuery1 : CustomScoreQuery
    {
        private const string DateFormat = "yyyyMMdd";

        // Stored from the constructor; not read anywhere inside this class.
        private readonly Searcher _searcher;
        private readonly DateTime _fromDT;
        private readonly DateTime _toDT;
        private readonly bool _shouldMatchNonEvents;

        /// <summary>
        /// Creates a query whose window starts at <paramref name="fromDT"/> and ends
        /// <paramref name="dateRange"/> days later.
        /// </summary>
        /// <param name="subQuery">Query passed to the base <see cref="CustomScoreQuery"/>.</param>
        /// <param name="searcher">Searcher stored on the instance.</param>
        /// <param name="fromDT">Window start; only the date part is kept.</param>
        /// <param name="shouldMatchNonEvents">Flag forwarded to the score provider.</param>
        /// <param name="dateRange">Number of days added to <paramref name="fromDT"/> to obtain the window end.</param>
        public EventDateQuery1(Query subQuery, Searcher searcher, DateTime fromDT, bool shouldMatchNonEvents, int dateRange = 14)
            : this(subQuery, searcher, fromDT, fromDT.AddDays(dateRange), shouldMatchNonEvents)
        {
        }

        /// <summary>
        /// Creates a query with an explicit window start and end.
        /// </summary>
        /// <param name="subQuery">Query passed to the base <see cref="CustomScoreQuery"/>.</param>
        /// <param name="searcher">Searcher stored on the instance.</param>
        /// <param name="fromDT">Window start; only the date part is kept.</param>
        /// <param name="toDT">Window end; only the date part is kept.</param>
        /// <param name="shouldMatchNonEvents">Flag forwarded to the score provider.</param>
        public EventDateQuery1(Query subQuery, Searcher searcher, DateTime fromDT, DateTime toDT, bool shouldMatchNonEvents)
            : base(subQuery)
        {
            _searcher = searcher;
            _fromDT = fromDT.Date;
            _toDT = toDT.Date;
            _shouldMatchNonEvents = shouldMatchNonEvents;
        }

        /// <summary>Returns the same text as <see cref="GenerateUniqueKey"/>.</summary>
        public override string ToString()
        {
            return GenerateUniqueKey();
        }

        /// <summary>Returns the same text as <see cref="GenerateUniqueKey"/>; <paramref name="field"/> is ignored.</summary>
        public override string ToString(string field)
        {
            return GenerateUniqueKey();
        }

        /// <summary>Returns the same text as <see cref="GenerateUniqueKey"/>.</summary>
        public override string Name()
        {
            return GenerateUniqueKey();
        }

        /// <summary>
        /// Builds the key <c>EventDateQuery_{from}_{to}_{flag}</c>, with both dates
        /// formatted as <c>yyyyMMdd</c>.
        /// </summary>
        /// <returns>The key string for this query's dates and flag.</returns>
        public string GenerateUniqueKey()
        {
            // Format with the invariant culture so the key does not change with the
            // thread's current culture (some cultures default to a non-Gregorian
            // calendar, which would alter the year digits).
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
            return String.Format(
                invariant,
                "EventDateQuery_{0}_{1}_{2}",
                _fromDT.ToString(DateFormat, invariant),
                _toDT.ToString(DateFormat, invariant),
                _shouldMatchNonEvents.ToString());
        }

        /// <summary>
        /// Creates the score provider for <paramref name="reader"/>, passing on this
        /// query's dates and non-event flag.
        /// </summary>
        protected override CustomScoreProvider GetCustomScoreProvider(IndexReader reader)
        {
            return new EventDateQueryCustomScoreProvider(reader, _fromDT, _toDT, _shouldMatchNonEvents);
        }
    }

    /// <summary>
    /// Score provider that holds a from/to date pair and a non-event flag.
    /// In this sample the score depends only on the document number
    /// (see <see cref="myScore"/>); the stored dates and flag are not consulted.
    /// </summary>
    public class EventDateQueryCustomScoreProvider : CustomScoreProvider
    {
        private readonly string _dateFormat = "yyyyMMdd";
        private float MatchFloat = 1f;
        private float NoMatchFloat = 0f;
        private bool _shouldMatchNonEvents = true;
        private DateTime _fromDT;
        private DateTime _toDT;

        /// <summary>
        /// Stores the date part of both bounds together with the non-event flag.
        /// </summary>
        /// <param name="reader">Index reader passed to the base provider.</param>
        /// <param name="fromDT">Window start; only the date part is kept.</param>
        /// <param name="toDT">Window end; only the date part is kept.</param>
        /// <param name="shouldMatchNonEvents">Flag stored on the instance.</param>
        public EventDateQueryCustomScoreProvider(IndexReader reader, DateTime fromDT, DateTime toDT, bool shouldMatchNonEvents)
            : base(reader)
        {
            _shouldMatchNonEvents = shouldMatchNonEvents;
            _toDT = toDT.Date;
            _fromDT = fromDT.Date;
        }

        /// <summary>
        /// Returns <see cref="myScore"/> for <paramref name="doc"/>; the sub-query
        /// score and value-source score are ignored.
        /// </summary>
        public override float CustomScore(int doc, float subQueryScore, float valSrcScore)
        {
            float score = myScore(doc);
            return score;
        }

        /// <summary>
        /// Returns <see cref="myScore"/> for <paramref name="doc"/>; the sub-query
        /// score and value-source scores are ignored.
        /// </summary>
        public override float CustomScore(int doc, float subQueryScore, float[] valSrcScores)
        {
            float score = myScore(doc);
            return score;
        }

        /// <summary>
        /// Placeholder scoring used only to demonstrate the run: documents numbered
        /// below 10 score 1, every other document scores 0.
        /// </summary>
        /// <param name="doc">Document number.</param>
        /// <returns><c>1F</c> when <paramref name="doc"/> is less than 10, otherwise <c>0F</c>.</returns>
        public float myScore(int doc)
        {
            bool isAmongFirstTen = doc < 10;
            return isAmongFirstTen ? 1F : 0F;
        }
    }

Any suggestions on how to have Lucene not return these documents would be great. Thanks in advance.

like image 856
Ettienne Avatar asked Oct 27 '15 06:10

Ettienne


1 Answer

You can write a custom Collector that collects only documents with >0 score. And then pass an instance of this collector to the Search() method. There is an implementation of such a Collector here.

However, the documentation advises against this solution if you don't need all the results, which is probably the case here, as you are only selecting the top 1000 documents.

like image 184
Tamas Avatar answered Nov 07 '22 05:11

Tamas