The only close thing that I've found was: Multiple group-by in Elasticsearch
Basically I'm trying to get the ES equivalent of the following MySql
query:
select gender, age_range, count(distinct profile_id) as count FROM TABLE group by age_range, gender
The age and gender by themselves were easy to get:
{ "query": { "match_all": {} }, "facets": { "ages": { "terms": { "field": "age_range", "size": 20 } }, "gender_by_age": { "terms": { "fields": [ "age_range", "gender" ] } } }, "size": 0 }
which gives:
{ "ages": { "_type": "terms", "missing": 0, "total": 193961, "other": 0, "terms": [ { "term": 0, "count": 162643 }, { "term": 3, "count": 10683 }, { "term": 4, "count": 8931 }, { "term": 5, "count": 4690 }, { "term": 6, "count": 3647 }, { "term": 2, "count": 3247 }, { "term": 1, "count": 120 } ] }, "total_gender": { "_type": "terms", "missing": 0, "total": 193961, "other": 0, "terms": [ { "term": 1, "count": 94799 }, { "term": 2, "count": 62645 }, { "term": 0, "count": 36517 } ] } }
But now I need something that looks like this:
[breakdown_gender] => Array ( [1] => Array ( [0] => 264 [1] => 1 [2] => 6 [3] => 67 [4] => 72 [5] => 40 [6] => 23 ) [2] => Array ( [0] => 153 [2] => 2 [3] => 21 [4] => 35 [5] => 22 [6] => 11 ) )
Please note that 0,1,2,3,4,5,6
are "mappings" for the age ranges so they actually mean something :) and not just numbers. e.g. Gender[1] (which is "male") breaks down into age range [0] (which is "under 18") with a count of 246.
Starting from version 1.0 of ElasticSearch
, the new aggregations API allows grouping by multiple fields, using sub-aggregations. Suppose you want to group by fields field1
, field2
and field3
:
{ "aggs": { "agg1": { "terms": { "field": "field1" }, "aggs": { "agg2": { "terms": { "field": "field2" }, "aggs": { "agg3": { "terms": { "field": "field3" } } } } } } } }
Of course this can go on for as many fields as you'd like.
Update:
For completeness, here is how the output of the above query looks. Also below is python code for generating the aggregation query and flattening the result into a list of dictionaries.
{ "aggregations": { "agg1": { "buckets": [{ "doc_count": <count>, "key": <value of field1>, "agg2": { "buckets": [{ "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, { "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, ... ] }, { "doc_count": <count>, "key": <value of field1>, "agg2": { "buckets": [{ "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, { "doc_count": <count>, "key": <value of field2>, "agg3": { "buckets": [{ "doc_count": <count>, "key": <value of field3> }, { "doc_count": <count>, "key": <value of field3> }, ... ] }, ... ] }, ... ] } } }
The following python code performs the group-by given the list of fields. I you specify include_missing=True
, it also includes combinations of values where some of the fields are missing (you don't need it if you have version 2.0 of Elasticsearch thanks to this)
def group_by(es, fields, include_missing): current_level_terms = {'terms': {'field': fields[0]}} agg_spec = {fields[0]: current_level_terms} if include_missing: current_level_missing = {'missing': {'field': fields[0]}} agg_spec[fields[0] + '_missing'] = current_level_missing for field in fields[1:]: next_level_terms = {'terms': {'field': field}} current_level_terms['aggs'] = { field: next_level_terms, } if include_missing: next_level_missing = {'missing': {'field': field}} current_level_terms['aggs'][field + '_missing'] = next_level_missing current_level_missing['aggs'] = { field: next_level_terms, field + '_missing': next_level_missing, } current_level_missing = next_level_missing current_level_terms = next_level_terms agg_result = es.search(body={'aggs': agg_spec})['aggregations'] return get_docs_from_agg_result(agg_result, fields, include_missing) def get_docs_from_agg_result(agg_result, fields, include_missing): current_field = fields[0] buckets = agg_result[current_field]['buckets'] if include_missing: buckets.append(agg_result[(current_field + '_missing')]) if len(fields) == 1: return [ { current_field: bucket.get('key'), 'doc_count': bucket['doc_count'], } for bucket in buckets if bucket['doc_count'] > 0 ] result = [] for bucket in buckets: records = get_docs_from_agg_result(bucket, fields[1:], include_missing) value = bucket.get('key') for record in records: record[current_field] = value result.extend(records) return result
As you only have 2 fields a simple way is doing two queries with single facets. For Male:
{ "query" : { "term" : { "gender" : "Male" } }, "facets" : { "age_range" : { "terms" : { "field" : "age_range" } } } }
And for female:
{ "query" : { "term" : { "gender" : "Female" } }, "facets" : { "age_range" : { "terms" : { "field" : "age_range" } } } }
Or you can do it in a single query with a facet filter (see this link for further information)
{ "query" : { "match_all": {} }, "facets" : { "age_range_male" : { "terms" : { "field" : "age_range" }, "facet_filter":{ "term": { "gender": "Male" } } }, "age_range_female" : { "terms" : { "field" : "age_range" }, "facet_filter":{ "term": { "gender": "Female" } } } } }
Update:
As facets are about to be removed. This is the solution with aggregations:
{ "query": { "match_all": {} }, "aggs": { "male": { "filter": { "term": { "gender": "Male" } }, "aggs": { "age_range": { "terms": { "field": "age_range" } } } }, "female": { "filter": { "term": { "gender": "Female" } }, "aggs": { "age_range": { "terms": { "field": "age_range" } } } } } }
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With