Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Elastic search: find word parts in nested object

I'm not able to find parts of a word in a nested object. Only the full word is found. My analyzer configuration is as follows:

{
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "filter": {
        "word_part_filter": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 15
        },
        "word_part_front_filter": {
          "type": "edgeNGram",
          "min_gram": 2,
          "max_gram": 15
        },
        "codeid_filter": {
          "type": "pattern_replace",
          "pattern": "[-/.:]",
          "replacement": "",
          "preserve_original": true
        }
      },
      "char_filter": {
        "umlaut_char_filter": {
          "type": "mapping",
          "mappings": [
            "ö=>oe",
            "ä=>ae",
            "ü=>ue",
            "ß=>ss",
            "Ö=>Oe",
            "Ä=>Ae",
            "Ü=>Ue"
          ]
        }
      },
      "analyzer": {
        "description_analyser_query": {
          "type": "custom",
          "char_filter": [
            "html_strip"
          ],
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "stop",
            "asciifolding"
          ]
        },
        "description_analyser_idx": {
          "type": "custom",
          "char_filter": [
            "html_strip"
          ],
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "stop",
            "asciifolding",
            "word_part_filter"
          ]
        },
        "name_analyser_query": {
          "type": "custom",
          "char_filter": [
            "umlaut_char_filter"
          ],
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        },
        "name_analyser_idx": {
          "type": "custom",
          "char_filter": [
            "umlaut_char_filter"
          ],
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding",
            "word_part_filter"
          ]
        },
        "codeid_analyser_query": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": [
            "lowercase",
            "codeid_filter"
          ]
        },
        "codeid_analyser_idx_front": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": [
            "lowercase",
            "codeid_filter",
            "word_part_front_filter"
          ]
        },
        "codeid_analyser_idx_any": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": [
            "lowercase",
            "codeid_filter",
            "word_part_filter"
          ]
        }
      }
    }
  }
}

This is the nested object mapping (extracted):

{
  "properties": {    
    "aid": {
      "type": "nested",
      "properties": {
        "tpid": {
          "type": "string",
          "analyzer": "codeid_analyser_idx_any"
        },
        "aid": {
          "type": "string",
          "analyzer": "codeid_analyser_idx_any"
        }
      }    
    }
  }
}

I'm searching with this query (extract). Only the "nested" part is essential here:

{
  "query": {
    "bool": {
      "must": [
        {
          "bool": {
            "should": [
              {
                "nested": {
                  "path": "aid",
                  "query": {
                    "bool": {
                      "must": {
                        "match": {
                          "aid.aid": {
                            "query": "1200",
                            "analyzer": "codeid_analyser_query"
                          }
                        }
                      },
                      "filter": {
                        "or": [
                          {
                            "match": {
                              "aid.tpid": "buyer_specific"
                            }
                          },
                          {
                            "match": {
                              "aid.tpid": "mytpid"
                            }
                          }
                        ]
                      }
                    }
                  }
                }
              }
            ],
            "minimum_should_match": 1
          }
        }
      ]
    }
  }
}

There is an indexed element with aid=120000008.

When using the analyzers in the fields, it finds nothing. When using no analyzers at all in the nested object mapping and query, only full words (like "120000008") will be found, but not "1200". Any ideas?

like image 794
Rooboo Avatar asked Mar 03 '17 11:03

Rooboo


1 Answer

Actually, using Elasticsearch 5.2 with an index called test, and applying the mapping to a type called "product" (rewriting only the filter part so that it complies with the current version of the query DSL), I obtain the correct result. The query:

GET test/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "bool": {
            "should": [
              {
                "nested": {
                  "path": "aid",
                  "query": {
                    "bool": {
                      "must": {
                        "match": {
                          "aid.aid": {
                            "query": "1200",
                            "analyzer": "codeid_analyser_query"
                          }
                        }
                      },
                      "filter": {
                        "terms": {
                          "aid.tpid": [
                            "mytpid",
                            "buyer_specific"
                          ]
                        }
                      }
                    }
                  }
                }
              }
            ],
            "minimum_should_match": 1
          }
        }
      ]
    }
  }
}

The index contents (the search response):

GET test/_search

{
  "took": 8,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 1,
    "hits": [
      {
        "_index": "test",
        "_type": "product",
        "_id": "AVrJ1CSd-NyeQ4r64kP6",
        "_score": 1,
        "_source": {
          "aid": {
            "aid": "120000008",
            "tpid": "mytpid"
          }
        }
      }
    ]
  }
}

The analyzer settings (I removed the umlaut char filter because it was unreadable on my computer; this does not change the result, since it is not used in these tests):

PUT test
{
  "settings": {
     "analysis": {
      "filter": {
        "word_part_filter": {
          "type": "ngram",
          "min_gram": 3,
          "max_gram": 15
        },
        "word_part_front_filter": {
          "type": "edgeNGram",
          "min_gram": 2,
          "max_gram": 15
        },
        "codeid_filter": {
          "type": "pattern_replace",
          "pattern": "[-/.:]",
          "replacement": "",
          "preserve_original": true
        }
      },

      "analyzer": {
        "description_analyser_query": {
          "type": "custom",
          "char_filter": [
            "html_strip"
          ],
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "stop",
            "asciifolding"
          ]
        },
        "description_analyser_idx": {
          "type": "custom",
          "char_filter": [
            "html_strip"
          ],
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "stop",
            "asciifolding",
            "word_part_filter"
          ]
        },

        "codeid_analyser_query": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": [
            "lowercase",
            "codeid_filter"
          ]
        },
        "codeid_analyser_idx_front": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": [
            "lowercase",
            "codeid_filter",
            "word_part_front_filter"
          ]
        },
        "codeid_analyser_idx_any": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": [
            "lowercase",
            "codeid_filter",
            "word_part_filter"
          ]
        }
      }
     }
  }
}

The mapping on product:

PUT test/_mapping/product
{

  "properties": {    
    "aid": {
      "type": "nested",
      "properties": {
        "tpid": {
          "type": "string",
          "analyzer": "codeid_analyser_idx_any"
        },
        "aid": {
          "type": "string",
          "analyzer": "codeid_analyser_idx_any"
        }
      }    
    }
  }
}
like image 101
Adonis Avatar answered Nov 07 '22 21:11

Adonis