Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Elasticsearch Sorting fields anomaly

Trying to sort a list on certain fields. firstName and lastName but I have noticed some inconstant result.

I am running a simple query

//Return all the employees from a specific company ordering by lastName asc | desc

GET employee-index-sorting
{
  "query": {
    "bool": {
      "filter": {
        "term": {
          "companyId": 3179
        }
      }
    }
  },
  "sort": [
    {
      "lastName.keyword": { <-- Should this be keyword? or not_analyzed
        "order": "desc"
      }
    }
  ]
}

In the result why would van der Mescht and van Breda be before Zwane and Zwezwe?

I suspect there is something wrong with my mappings

{
        "_index": "employee-index",
        "_type": "_doc",
        "_id": "637467",
        "_score": null,
        "_source": {
          "companyId": 3179,
          "firstName": "Name",
          "lastName": "van der Mescht",
        },
        "sort": [
          "van der Mescht"
        ]
      },
      {
        "_index": "employee-index",
        "_type": "_doc",
        "_id": "678335",
        "_score": null,
        "_source": {
          "companyId": 3179,
          "firstName": "Name3",
          "lastName": "van Breda",
        },
        "sort": [
          "van Breda"
        ]
      },
      {
        "_index": "employee-index",
        "_type": "_doc",
        "_id": "113896",
        "_score": null,
        "_source": {
          "companyId": 3179,
          "firstName": "Name2",
          "lastName": "Zwezwe",
        },
        "sort": [
          "Zwezwe"
        ]
      },
      {
        "_index": "employee-index",
        "_type": "_doc",
        "_id": "639639",
        "_score": null,
        "_source": {
          "companyId": 3179,
          "firstName": "Name1",
          "lastName": "Zwane",
        },
        "sort": [
          "Zwane"
        ]
      }

Mappings

Posting the entire map because I am not sure if there might be something else wrong with it.

How should i change the lastName and firstName propery to allow for sorting on them?

PUT employee-index-sorting
{
  "settings": {
    "index": {
      "analysis": {
        "filter": {},
        "analyzer": {
          "keyword_analyzer": {
            "filter": [
              "lowercase",
              "asciifolding",
              "trim"
            ],
            "char_filter": [],
            "type": "custom",
            "tokenizer": "keyword"
          },
          "edge_ngram_analyzer": {
            "filter": [
              "lowercase"
            ],
            "tokenizer": "edge_ngram_tokenizer"
          },
          "edge_ngram_search_analyzer": {
            "tokenizer": "lowercase"
          }
        },
        "tokenizer": {
          "edge_ngram_tokenizer": {
            "type": "edge_ngram",
            "min_gram": 2,
            "max_gram": 5,
            "token_chars": [
              "letter"
            ]
          }
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "employeeId": {
          "type": "keyword"
        },
        "companyGroupId": {
          "type": "keyword"
        },
        "companyId": {
          "type": "keyword"
        },
        "number": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "preferredName": {
          "type": "text",
          "index": false
        },
        "firstName": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "middleName": {
          "type": "text",
          "index": false
        },
        "lastName": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "fullName": {
          "type": "text",
          "fields": {
            "keywordstring": {
              "type": "text",
              "analyzer": "keyword_analyzer"
            },
            "edgengram": {
              "type": "text",
              "analyzer": "edge_ngram_analyzer",
              "search_analyzer": "edge_ngram_search_analyzer"
            }
          },
          "analyzer": "standard"
        },
        "terminationDate": {
          "type": "date"
        },
        "companyName": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "email": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "idNumber": {
          "type": "text"
        },
        "description": {
          "type": "text",
          "index": false
        },
        "jobNumber": {
          "type": "keyword"
        },
        "frequencyId": {
          "type": "long"
        },
        "frequencyCode": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "ignore_above": 256
            }
          }
        },
        "frequencyAccess": {
          "type": "boolean"
        }
      }
    }
  }
}
like image 615
R4nc1d Avatar asked Nov 15 '18 20:11

R4nc1d


1 Answers

For sorting you need to use lastName.keyword, that's correct, no need to change anything there.

The reason why van der Mescht and van Breda are before Zwane and Zwezwe is because sorting on strings happens on a lexicographical level, i.e. basically using the ASCII table and uppercase characters happen before lowercase ones, so words are sorted in that same order. But since you're sorting in desc mode, that's exactly the opposite:

  • z...
  • ...
  • van der Mescht
  • ...
  • van Breda
  • ...
  • a...
  • ...
  • Zwezwe
  • ...
  • Zwane
  • ...
  • Z...
  • ...
  • A...

To fix this, what you simply need to do is to add a normalizer to your lastName.keyword field, i.e. change your mapping to this and it will work:

{
  "settings": {
    "index": {
      "analysis": {
        "filter": {},
        "analyzer": { 
          ...
        },
        "tokenizer": {
          ...
        },
        "normalizer": {             <-- add this
          "lowersort": {
            "type": "custom",
            "filter": [
              "lowercase"
            ]
          }
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        ...
        "lastName": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword",
              "normalizer": "lowersort",   <-- add this
              "ignore_above": 256
            }
          }
        },
        ...
      }
    }
  }
}
like image 192
Val Avatar answered Oct 20 '22 08:10

Val