I'm using Django 1.5 with django-haystack 2.0 and an elasticsearch backend. I'm trying to search by an exact attribute match. However, I'm getting "similar" results even though I'm using both the __exact operator and the Exact() class. How can I prevent this behavior?
For example:
# models.py
class Person(models.Model):
name = models.TextField()
# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.CharField(document=True, use_template=True)
name = indexes.CharField(model_attr="name")
def get_model(self):
return Person
def index_queryset(self, using=None):
return self.get_model().objects.all()
# templates/search/indexes/people/person_text.txt
{{ object.name }}
>>> p1 = Person(name="Simon")
>>> p1.save()
>>> p2 = Person(name="Simons")
>>> p2.save()
$ ./manage.py rebuild_index
>>> person_sqs = SearchQuerySet().models(Person)
>>> person_sqs.filter(name__exact="Simons")
[<SearchResult: people.person (name=u'Simon')>
<SearchResult: people.person (name=u'Simons')>]
>>> person_sqs.filter(name=Exact("Simons", clean=True))
[<SearchResult: people.person (name=u'Simon')>
<SearchResult: people.person (name=u'Simons')>]
I only want the search result for "Simons" - the "Simon" result should not show up.
Python3, Django 1.10, Elasticsearch 2.4.4.
TL;DR: define custom tokenizer (not filter)
Verbose explanation
a) use EdgeNgramField:
# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.EdgeNgramField(document=True, use_template=True)
...
b) template:
# templates/search/indexes/people/person_text.txt
{{ object.name }}
c) create custom search backend:
# backends.py
from django.conf import settings
from haystack.backends.elasticsearch_backend import (
ElasticsearchSearchBackend,
ElasticsearchSearchEngine,
)
class CustomElasticsearchSearchBackend(ElasticsearchSearchBackend):
def __init__(self, connection_alias, **connection_options):
super(CustomElasticsearchSearchBackend, self).__init__(
connection_alias, **connection_options)
setattr(self, 'DEFAULT_SETTINGS', settings.ELASTICSEARCH_INDEX_SETTINGS)
class CustomElasticsearchSearchEngine(ElasticsearchSearchEngine):
backend = CustomElasticsearchSearchBackend
d) define custom tokenizer (not filter!):
# settings.py
HAYSTACK_CONNECTIONS = {
'default': {
'ENGINE': 'apps.persons.backends.CustomElasticsearchSearchEngine',
'URL': 'http://127.0.0.1:9200/',
'INDEX_NAME': 'haystack',
},
}
ELASTICSEARCH_INDEX_SETTINGS = {
"settings": {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"type": "custom",
"tokenizer": "custom_ngram_tokenizer",
"filter": ["asciifolding", "lowercase"]
},
"edgengram_analyzer": {
"type": "custom",
"tokenizer": "custom_edgengram_tokenizer",
"filter": ["asciifolding", "lowercase"]
}
},
"tokenizer": {
"custom_ngram_tokenizer": {
"type": "nGram",
"min_gram": 3,
"max_gram": 12,
"token_chars": ["letter", "digit"]
},
"custom_edgengram_tokenizer": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 12,
"token_chars": ["letter", "digit"]
}
}
}
}
}
HAYSTACK_DEFAULT_OPERATOR = 'AND'
e) use AutoQuery (more versatile):
# views.py
search_value = 'Simons'
...
person_sqs = \
SearchQuerySet().models(Person).filter(
content=AutoQuery(search_value)
)
f) reindex after changes:
$ ./manage.py rebuild_index
I was facing a similar problem. if you change the settings of your haystacks elasticsearch back end like:
DEFAULT_SETTINGS = {
'settings': {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["haystack_ngram", "lowercase"]
},
"edgengram_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["haystack_edgengram", "lowercase"]
}
},
"tokenizer": {
"haystack_ngram_tokenizer": {
"type": "nGram",
"min_gram": 6,
"max_gram": 15,
},
"haystack_edgengram_tokenizer": {
"type": "edgeNGram",
"min_gram": 6,
"max_gram": 15,
"side": "front"
}
},
"filter": {
"haystack_ngram": {
"type": "nGram",
"min_gram": 6,
"max_gram": 15
},
"haystack_edgengram": {
"type": "edgeNGram",
"min_gram": 6,
"max_gram": 15
}
}
}
}
}
Then it will tokenize only when the query is more than 6 character.
If you want results like "xyzsimonsxyz", then you would need to use ngram analyzer instead of EdgeNGram or you could use both depending on your requirements. EdgeNGram generates tokens only from the beginning.
with NGram 'simons' will be one of the generated tokens for term xyzsimonsxyz assuming max_gram >=6 and you will get expected results, also search_analyzer needs to be different or you will get weird results.
Also index size might get pretty big with ngram if you have huge chunk of text
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With