Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

mongo $sum compounded when doing $unwind and then $group on multiple fields

I have the following document structure

{
    "app_id": "DHJFK67JDSJjdasj909",
    "date": ISODate("2014-08-07T00:00:00.000Z"),
    "event_count": 100,
    "events": [
        { "type": 0,  "value": 12  },
        { "type": 10, "value": 24 },
        { "type": 20, "value": 36  },
        { "type": 30, "value": 43 }
    ],
    "unique_events": [
        { "type": 0,  "value": 5  },
        { "type": 10, "value": 8 },
        { "type": 20, "value": 12  },
        { "type": 30, "value": 56 }
    ]
}

I am trying to get a sum of event_counts and also the values for unique_events and events per type. This is the kind of output I am expecting where the event_count and each events and unique_events value has been summed per type.

{
    "app_id": "DHJFK67JDSJjdasj909",
    "date": ISODate("2014-08-07T00:00:00.000Z"),
    "event_count": 4345,
    "events": [
        { "type": 0,  "value": 624  },
        { "type": 10, "value": 234 },
        { "type": 20, "value": 353 },
        { "type": 30, "value": 472 }
    ],
    "unique_events": [
        { "type": 0,  "value": 433  },
        { "type": 10, "value": 554 },
        { "type": 20, "value": 645  },
        { "type": 30, "value": 732 }
    ]
}

This is my query

db.events.aggregate([
    { "$unwind": "$events" },
    { "$group": {
        "_id": { 
            "app_id": "$app_id",
            "type": "$events.type"
            "unique_type": "$unique_events.type"
        },
        "event_count": { "$sum": "$event_count" },
        "event_value": { "$sum": "$events.value" },
        "unique_event_value": { "$sum": "$unique_events.value" }
    }},
    { "$group": {
        "_id": "$_id.app_id",
        "event_count": { "$sum": "$event_count" },
        "events": { "$push": { "type": "$_id.type", "value": "$event_value" } }
        "unique_events": { "$push": { "type": "$_id.unique_type", "value": "$unique_event_value" } }
    }}
]) 

The problem is that using two $unwinds and then grouping by both events and unique_events is causing the $sum to be compounded and much too large. Is there some way I can fix this using mongo or do I have to run two queries and then merge the two result sets in code.

Thanks

Irfan

like image 479
Irfan Avatar asked Aug 08 '14 10:08

Irfan


People also ask

How do I unwind multiple fields in MongoDB?

To unwind, use $unwind. The $unwind deconstructs an array field from the input documents to output a document for each element.

Can we specify more than one aggregate function simultaneously in MongoDB?

Db. collection. aggregate () can use several channels at the same time for data processing.

What field is mandatory in a $Group operation in MongoDB?

_id: This field is mandatory for grouping. If you specify the value of the _id field as null or a constant, the $group operator counts the accumulated values for all input documents as a whole.


2 Answers

This is simple enough really, to sum the results for each array it's just a matter of discerning between which is which and "combining the elements". In short you should probably be doing this in you documents anyway as should be evident from the first pipeline stage(s).

So for MongoDB 2.6 and greater there are some helper methods:

db.events.aggregate([
    { "$project": {
        "app_id": 1,
        "event_count": 1,
        "all_events": {
            "$setUnion": [
                { "$map": {
                    "input": "$events",
                    "as": "el",
                    "in": {
                        "type": "$$el.type",
                        "value": "$$el.value",
                        "class": { "$literal": "A" }
                    }
                }},
                { "$map": {
                    "input": "$unique_events",
                    "as": "el",
                    "in": {
                        "type": "$$el.type",
                        "value": "$$el.value",
                        "class": { "$literal": "B" }
                    }
                }}
            ]
        }
    }},
    { "$unwind": "$all_events" },
    { "$group": {
        "_id": {
            "app_id": "$app_id",
            "class": "$all_events.class",
            "type": "$all_events.type"
        },
        "event_count": { "$sum": "$event_count" },
        "value": { "$sum": "$all_events.value" }
    }},
    { "$group": {
        "_id": "$_id.app_id",
        "event_count": { "$sum": "$event_count" },
        "events": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$_id.class", "A" ] },
                    { "type": "$_id.type", "value": "$value" },
                    false
                ]
            }
        },
        "unique_events": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$_id.class", "B" ] },
                    { "type": "$_id.type", "value": "$value" },
                    false
                ]
            }
        }
    }},
    { "$project": {
        "event_count": 1,
        "events": { "$setDifference": [ "$events", [false] ] },
        "unique_events": {
            "$setDifference": [ "$unique_events", [false] ]
        }
    }}
])

Mostly in the $setUnion and $setDifference operators. The other ccase is $map, which processes arrays in place. The whole thing there is doing operations on arrays without the use of $unwind. But those can of course be done in prior versions, it just takes a bit more work:

db.events.aggregate([
    { "$unwind": "$events" },
    { "$group": {
        "_id": "$_id",
        "app_id": { "$first": "$app_id" },
        "event_count": { "$first": "$event_count" },
        "events": {
            "$push": {
                "type": "$events.type",
                "value": "$events.value",
                "class": { "$const": "A" }
            }
        },
        "unique_events": { "$first": "$unique_events" }            
    }},
    { "$unwind": "$unique_events" },
    { "$group": {
        "_id": "$_id",
        "app_id": { "$first": "$app_id" },
        "event_count": { "$first": "$event_count" },
        "events": { "$first": "$events" },
        "unique_events": {
            "$push": {
                "type": "$unique_events.type",
                "value": "$unique_events.value",
                "class": { "$const": "B" }
            }
        }
    }},
    { "$project": {
        "app_id": 1,
        "event_count": 1,
        "events": 1,
        "unique_events": 1,
        "type": { "$const": [ "A","B" ] }
    }},
    { "$unwind": "$type" },
    { "$unwind": "$events" },
    { "$unwind": "$unique_events" },
    { "$group": {
        "_id": "$_id",
        "app_id": { "$first": "$app_id" },
        "event_count": { "$first": "$event_count" },
        "all_events": {
            "$addToSet": {
                "$cond": [
                     { "$eq": [ "$events.class", "$type" ] },
                     {
                         "type": "$events.type",
                         "value": "$events.value",
                         "class": "$events.class"
                     },
                     {
                         "type": "$unique_events.type",
                         "value": "$unique_events.value",
                         "class": "$unique_events.class"
                     }
                ]
            }
        }
    }},
    { "$unwind": "$all_events" },
   { "$group": {
        "_id": {
            "app_id": "$app_id",
            "class": "$all_events.class",
            "type": "$all_events.type"
        },
        "event_count": { "$sum": "$event_count" },
        "value": { "$sum": "$all_events.value" }
    }},
    { "$group": {
        "_id": "$_id.app_id",
        "event_count": { "$sum": "$event_count" },
        "events": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$_id.class", "A" ] },
                    { "type": "$_id.type", "value": "$value" },
                    false
                ]
            }
        },
        "unique_events": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$_id.class", "B" ] },
                    { "type": "$_id.type", "value": "$value" },
                    false
                ]
            }
        }
    }},
    { "$unwind": "$events" },
    { "$match": { "events": { "$ne": false } } },
    { "$group": {
        "_id": "$_id",
        "event_count": { "$first": "$event_count" },
        "events": { "$push": "$events" },
        "unique_events": { "$first": "$unique_events" }
    }},
    { "$unwind": "$unique_events" },
    { "$match": { "unique_events": { "$ne": false } } },
    { "$group": {
       "_id": "$_id",
        "event_count": { "$first": "$event_count" },
        "events": { "$first": "$events" },
        "unique_events": { "$push": "$unique_events" }
    }}
])

That gets you the results you want with each array being "summed" together as well as the master "event_count" with the correct result.

You probably should consider combining both of those arrays with a similar identifier to what has been used in the pipelines as demonstrated. This part is half of the work. The other half is considering that you probably should store pre-aggregated results in a collection somewhere for the best application performance.

like image 105
Neil Lunn Avatar answered Nov 11 '22 18:11

Neil Lunn


You can perform Following Map reduce:
Its not a dynamic solution, i have created variable for each events and unique_events.
I have saved output in different collection using out: "session_stat" in mapReduce Function.

var mapFunction = function() {
                      var key = this.app_id;
                      var value = {                                 
                                    event_count: this.event_count,
                                    events: this.events,
                                    unique_events: this.unique_events
                                   };

                      emit( key, value );
                  };

var reduceFunction = function(key, values) {

                        var reducedObject = {
                                              app_id: key,
                                              events_wise_total: 0,
                                              unique_events_wise_total:0,
                                              total_event_count:0
                                            };

                        var events_wise_total = [];
                        var events_0_total = { type:0, value :0};
                        var events_10_total = { type:10, value :0};
                        var events_20_total = { type:20, value :0};
                        var events_30_total = { type:30, value :0};

                        var unique_events_wise_total = [];
                        var unique_events_0_total = { type:0, value :0};
                        var unique_events_10_total = { type:10, value :0};
                        var unique_events_20_total = { type:20, value :0};
                        var unique_events_30_total = { type:30, value :0};

                        var total_event_count = 0;
                        values.forEach( function(value) {
                                total_event_count += value.event_count;
                                var events = value.events;

                                events.forEach(function(event){
                                                if(event.type == 0){events_0_total.value += event.value;}
                                                if(event.type == 10){events_10_total.value += event.value;}
                                                if(event.type == 20){events_20_total.value += event.value;}
                                                if(event.type == 30){events_30_total.value += event.value;}
                                        });

                                var unique_events = value.unique_events;

                                unique_events.forEach(function(unique_event){
                                                if(unique_event.type == 0){unique_events_0_total.value += unique_event.value;}
                                                if(unique_event.type == 10){unique_events_10_total.value += unique_event.value;}
                                                if(unique_event.type == 20){unique_events_20_total.value += unique_event.value;}
                                                if(unique_event.type == 30){unique_events_30_total.value += unique_event.value;}
                                        }); 
                            }
                          );
                        events_wise_total.push(events_0_total);
                        events_wise_total.push(events_10_total);
                        events_wise_total.push(events_20_total);
                        events_wise_total.push(events_30_total);

                        unique_events_wise_total.push(unique_events_0_total);
                        unique_events_wise_total.push(unique_events_10_total);
                        unique_events_wise_total.push(unique_events_20_total);
                        unique_events_wise_total.push(unique_events_30_total);

                        reducedObject.events_wise_total = events_wise_total;
                        reducedObject.unique_events_wise_total = unique_events_wise_total;
                        reducedObject.total_event_count = total_event_count;

                        return reducedObject;
                     };

var finalizeFunction = function (key, reducedValue) {
                          return reducedValue;
                       };                    

db.GroupBy.mapReduce(
                       mapFunction,
                       reduceFunction,
                       {
                         out: "session_stat",
                         finalize: finalizeFunction
                       });

Hope it may be useful

like image 36
Harsh Patel Avatar answered Nov 11 '22 16:11

Harsh Patel