Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Solr - case-insensitive search does not work

I want to apply case-insensitive search for field myfield in solr.

I googled a bit and found that I need to apply LowerCaseFilterFactory to the field type, and that the field should be of type solr.TextField.

I applied that in my schema.xml and re-indexed the data, but my search still seems to be case-sensitive.

Below is the search that I perform.

http://localhost:8080/solr/select?q=myfield:"cloud university"&hl=on&hl.snippets=99&hl.fl=myfield

Below is definition for field type

 <!--
   Field type "text_en_splitting": a solr.TextField with separate index-time
   and query-time analyzer chains. Both chains use the whitespace tokenizer and
   then apply, in this order: stop-word filter, word-delimiter filter,
   lowercase filter, keyword marker (protwords.txt) and Porter stem filter.
   The query chain additionally applies the synonym filter (synonyms.txt)
   directly after the tokenizer.
 -->
 <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
      <!-- Analyzer chain applied when documents are indexed. -->
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <!-- in this example, we will only use synonyms at query time
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
          add enablePositionIncrements=true in both the index and query
          analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords_en.txt"
                enablePositionIncrements="true"
                />
        <!-- NOTE(review): this filter has splitOnCaseChange="1" and is placed
             before LowerCaseFilterFactory, so tokens still carry their original
             case when they reach it. Confirm this ordering is intended if
             case-insensitive matching is the goal.
             Index time sets catenateWords="1" and catenateNumbers="1"; the
             query analyzer below sets both to "0". -->
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
      <!-- Analyzer chain applied to query text. -->
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <!-- Synonyms are applied at query time only; the index-time synonym
             filter above is commented out. -->
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords_en.txt"
                enablePositionIncrements="true"
                />
        <!-- NOTE(review): same ordering as the index analyzer, with
             splitOnCaseChange="1" ahead of the lowercase filter. -->
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
    </fieldType>

and below is my field definition

 <!-- Declares "myfield" with the text_en_splitting field type; the field is both indexed and stored. -->
 <field name="myfield" type="text_en_splitting" indexed="true" stored="true" />

I am not sure what is wrong with this. Please help me resolve it.

Thanks

EDIT

Debug Query

<lst name="debug">
    <str name="rawquerystring">
        "cloud university" AND guid:268406b6-db65-49da-848a-c59248f170db
    </str>
    <str name="querystring">
        "cloud university" AND guid:268406b6-db65-49da-848a-c59248f170db
    </str>
    <str name="parsedquery">
        +PhraseQuery(CC:"cloud univers") +guid:268406b6-db65-49da-848a-c59248f170db
    </str>
    <str name="parsedquery_toString">
        +CC:"cloud univers" +guid:268406b6-db65-49da-848a-c59248f170db
    </str>
    <lst name="explain">
        <str name="KSYS_20120805_1100">
            12.572915 = (MATCH) sum of: 0.03595598 = weight(CC:"cloud univers" in 1560524), product of: 0.51819557 = queryWeight(CC:"cloud univers"), product of: 8.881522 = idf(CC: cloud=4798 univers=625207) 0.05834536 = queryNorm 0.06938689 = fieldWeight(CC:"cloud univers" in 1560524), product of: 1.0 = tf(phraseFreq=1.0) 8.881522 = idf(CC: cloud=4798 univers=625207) 0.0078125 = fieldNorm(field=CC, doc=1560524) 12.536959 = (MATCH) weight(guid:268406b6-db65-49da-848a-c59248f170db in 1560524), product of: 0.85526216 = queryWeight(guid:268406b6-db65-49da-848a-c59248f170db), product of: 14.658615 = idf(docFreq=1, maxDocs=1709587) 0.05834536 = queryNorm 14.658615 = (MATCH) fieldWeight(guid:268406b6-db65-49da-848a-c59248f170db in 1560524), product of: 1.0 = tf(termFreq(guid:268406b6-db65-49da-848a-c59248f170db)=1) 14.658615 = idf(docFreq=1, maxDocs=1709587) 1.0 = fieldNorm(field=guid, doc=1560524)
        </str>
    </lst>
    <str name="QParser">LuceneQParser</str>
    <lst name="timing">
        <double name="time">60.0</double>
        <lst name="prepare">
            <double name="time">1.0</double>
            <lst name="org.apache.solr.handler.component.QueryComponent">
                <double name="time">0.0</double>
            </lst>
            <lst name="org.apache.solr.handler.component.FacetComponent">
                <double name="time">0.0</double>
            </lst>
            <lst name="org.apache.solr.handler.component.MoreLikeThisComponent">
                <double name="time">0.0</double>
            </lst>
            <lst name="org.apache.solr.handler.component.HighlightComponent">
                <double name="time">0.0</double>
            </lst>
            <lst name="org.apache.solr.handler.component.StatsComponent">
                <double name="time">0.0</double>
            </lst>
            <lst name="org.apache.solr.handler.component.DebugComponent">
                <double name="time">0.0</double>
            </lst>
        </lst>
        <lst name="process">
            <double name="time">59.0</double>
            <lst name="org.apache.solr.handler.component.QueryComponent">
                <double name="time">0.0</double>
            </lst>
            <lst name="org.apache.solr.handler.component.FacetComponent">
                <double name="time">0.0</double>
            </lst>
            <lst name="org.apache.solr.handler.component.MoreLikeThisComponent">
                <double name="time">0.0</double>
            </lst>
            <lst name="org.apache.solr.handler.component.HighlightComponent">
                <double name="time">57.0</double>
            </lst>
            <lst name="org.apache.solr.handler.component.StatsComponent">
                <double name="time">0.0</double>
            </lst>
            <lst name="org.apache.solr.handler.component.DebugComponent">
                <double name="time">2.0</double>
            </lst>
        </lst>
    </lst>
</lst>
like image 723
meghana Avatar asked Aug 22 '12 10:08

meghana


1 Answer

You should put solr.LowerCaseFilterFactory before solr.WordDelimiterFilterFactory, because an uppercase letter in the middle of lowercase letters (or vice versa) triggers the word delimiter's split on case change (splitOnCaseChange="1").

like image 59
Bob Yoplait Avatar answered Sep 23 '22 09:09

Bob Yoplait