Elasticsearch bucket aggregation returning wrong doc_count

Question

I am working with Elasticsearch bucket aggregation. I have a set of documents with each document having a category field which is an array. I need to get the count of each category along with the search results. But currently, I am getting the wrong doc_count.

This is my aggregation query

{

"aggs" : {
    "category" : {
        "terms" : { 
            "field" : "category.keyword"
        }
    }
}

}

Below is the result I am getting with the wrong doc_count.

{
"took": 1,
"timed_out": false,
"_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
},
"hits": {
    "total": {
        "value": 112,
        "relation": "eq"
    },
    "max_score": 1.0,
    "hits": [
        {
            "_index": "pages",
            "_type": "_doc",
            "_id": "ljrbn3ABGDJu_xG-W2Mm",
            "_score": 1.0,
            "_source": {
                "title": "testing code.",
                "location_en": "Europe",
                "location_fr": "Europe-Fr",
                "start_date": "2020-03-03 10:05:17",
                "end_date": "2020-03-13 10:05:17",
                "category": [
                    "job",
                    "university",
                    "europe",
                    "researcher",
                    "law"
                ]
            }
        },
        {
            "_index": "pages",
            "_type": "_doc",
            "_id": "lzrbn3ABGDJu_xG-W2PX",
            "_score": 1.0,
            "_source": {
                "title": "Test",
                "slug": "Test",
                "location_en": "Asia",
                "location_fr": "Asia-Fr",
                "start_date": "2020-03-03 10:05:18",
                "end_date": "2020-03-13 10:05:18",
                "category": [
                    "job",
                    "uppsala-university",
                    "asia",
                    "PhD",
                    "history"
                ]
            }
        },
        {
            "_index": "pages",
            "_type": "_doc",
            "_id": "mDrbn3ABGDJu_xG-W2Pl",
            "_score": 1.0,
            "_source": {
                "title": "Test",
                "slug": "Test",
                "location_en": "Europe",
                "location_fr": "Europe-Fr",
                "start_date": "2020-03-03 10:05:18",
                "end_date": "2020-03-13 10:05:18",
                "category": [
                    "job",
                    "university",
                    "europe",
                    "researcher",
                    "law"
                ]
            }
        },
        {
            "_index": "pages",
            "_type": "_doc",
            "_id": "mTrbn3ABGDJu_xG-XGOO",
            "_score": 1.0,
            "_source": {
                "title": "Test",
                "slug": "Test",
                "location_en": "Asia",
                "location_fr": "Asia-Fr",
                "start_date": "2020-03-03 10:05:18",
                "end_date": "2020-03-13 10:05:18",
                "category": [
                    "job",
                    "university",
                    "asia",
                    "PhD",
                    "history"
                ]
            }
        },
        {
            "_index": "pages",
            "_type": "_doc",
            "_id": "mjrbn3ABGDJu_xG-XGOb",
            "_score": 1.0,
            "_source": {
                "title": "testing world",
                "location_en": "Europe",
                "location_fr": "Europe-Fr",
                "start_date": "2020-03-03 10:05:18",
                "end_date": "2020-03-13 10:05:18",
                "category": [
                    "job",
                    "university",
                    "europe",
                    "researcher",
                    "law"
                ]
            }
        },
        {
            "_index": "pages",
            "_type": "_doc",
            "_id": "mzrbn3ABGDJu_xG-XWNG",
            "_score": 1.0,
            "_source": {
                "title": "hello",
                "slug": "Helloo",
                "short_description_en": "Helloo",
                "location_en": "Asia",
                "location_fr": "Asia-Fr",
                "start_date": "2020-03-03 10:05:18",
                "end_date": "2020-03-13 10:05:18",
                "category": [
                    "job",
                    "university",
                    "asia",
                    "PhD",
                    "history"
                ]
            }
        },
        {
            "_index": "pages",
            "_type": "_doc",
            "_id": "nDrbn3ABGDJu_xG-XWNU",
            "_score": 1.0,
            "_source": {
                "title": "Hello",
                "slug": "helloo",
                "short_description_en": "hello worldf",
                "location_en": "Europe",
                "location_fr": "Europe-Fr",
                "start_date": "2020-03-03 10:05:18",
                "end_date": "2020-03-13 10:05:18",
                "category": [
                    "job",
                    "university",
                    "europe",
                    "researcher",
                    "law"
                ]
            }
        },
        {
            "_index": "pages",
            "_type": "_doc",
            "_id": "nTrbn3ABGDJu_xG-XmMF",
            "_score": 1.0,
            "_source": {
                "title": "Test",
                "slug": "test",
                "short_description_en": "Test",
                "location_en": "Asia",
                "location_fr": "Asia-Fr",
                "start_date": "2020-03-03 10:05:18",
                "end_date": "2020-03-13 10:05:18",
                "category": [
                    "job",
                    "university",
                    "asia",
                    "PhD",
                    "history"
                ]
            }
        },
        {
            "_index": "pages",
            "_type": "_doc",
            "_id": "njrbn3ABGDJu_xG-XmMS",
            "_score": 1.0,
            "_source": {
                "title": "Test",
                "slug": "test",
                "short_description_en": "Test",
                "location_en": "Europe",
                "location_fr": "Europe-Fr",
                "start_date": "2020-03-03 10:05:18",
                "end_date": "2020-03-13 10:05:18",
                "category": [
                    "job",
                    "university",
                    "europe",
                    "researcher",
                    "law"
                ]
            }
        },
        {
            "_index": "pages",
            "_type": "_doc",
            "_id": "nzrbn3ABGDJu_xG-XmPZ",
            "_score": 1.0,
            "_source": {
                "title": "Researcher position in accelerator mass spectrometry (AMS)",
                "slug": "researcher-position-in-accelerator-mass-spectrometry-ams",
                "short_description_en": "Uppsala University is a comprehensive research-intensive university with a strong international standing. Our mission is to pursue top-quality research and education and to interact constructively with society. Our most important assets are all the individuals whose curiosity and...",
                "location_en": "Asia",
                "location_fr": "Asia-Fr",
                "start_date": "2020-03-03 10:05:18",
                "end_date": "2020-03-13 10:05:18",
                "category": [
                    "job",
                    "university",
                    "asia",
                    "PhD",
                    "history"
                ]
            }
        }
    ]
},
"aggregations": {
    "category": {
        "doc_count_error_upper_bound": 0,
        "sum_other_doc_count": 0,
        "buckets": [
            {
                "key": "job",
                "doc_count": 112
            },
            {
                "key": "university",
                "doc_count": 112
            },
            {
                "key": "PhD",
                "doc_count": 56
            },
            {
                "key": "asia",
                "doc_count": 56
            },
            {
                "key": "europe",
                "doc_count": 56
            },
            {
                "key": "history",
                "doc_count": 56
            },
            {
                "key": "law",
                "doc_count": 56
            },
            {
                "key": "researcher",
                "doc_count": 56
            }
        ]
    }
}
}

Joe - Elasticsearch Handbook · Accepted Answer

Getting unique (doc) counts out of arrays is known to cause headaches. Try using a scripted metric aggregation which:

- iterates through all your docs in which it finds the category field,
- iterates through all the individual categories of each doc,
- and finally saves their counts in a hash map,

which I think corresponds to the occurrence counts that you're after.

GET pages/_search
{
  "size": 0,
  "aggs": {
    "scripted_non_uniques": {
      "scripted_metric": {
        "init_script": "state.map = [:];",
        "map_script": """
              // Runs once per matching document: tally every value of
              // category.keyword into the per-shard map created by init_script.
              // The guard checks the same field that is read below, so an index
              // without the category.keyword sub-field is skipped, not an error.
              if (doc.containsKey('category.keyword')) {
                for (def val : doc['category.keyword']) {
                  if (state.map.containsKey(val)) {
                    // increment if existing
                    state.map[val] += 1;
                  } else {
                    // initialize to increment later
                    state.map[val] = 1;
                  }
                }
              }
              // NOTE: combine_script/reduce_script below hand back the per-shard
              // states unmerged, so the result holds one map per shard.
        """,
        "combine_script": "              return state",
        "reduce_script": "              return states"
      }
    }
  }
}

which yields (based on the 10 hits from your example) the following:

{
  "took" : 17,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 10,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "scripted_non_uniques" : {
      "value" : [
        {
          "map" : {
            "law" : 5,
            "researcher" : 5,
            "university" : 9,
            "asia" : 5,
            "uppsala-university" : 1,
            "history" : 5,
            "job" : 10,
            "europe" : 5,
            "PhD" : 5
          }
        }
      ]
    }
  }
}

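Note that this script counts each distinct category once per document: `keyword` doc values are deduplicated per document, so a category repeated within one array is not counted multiple times. If you adapt the script to a field type whose doc values keep duplicates (e.g. numeric fields), repeated values would be counted multiple times, so you should think about this edge case too.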

You can debug the script by calling Debug.explain(val), for example inside the for-loop. You can read more about the scripting contexts in the Painless documentation.

Elasticsearch bucket aggregation returning wrong doc_count

Tags:

elasticsearch

Arjun Sankar

1 Answer

Joe - Elasticsearch Handbook

Recent Activity

Donate For Us

Elasticsearch bucket aggregation returning wrong doc_count

Tags:

elasticsearch

Arjun Sankar

1 Answer

Joe - Elasticsearch Handbook

Related questions

Recent Activity

Donate For Us