Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Mongodb Aggregation by Day then Hour

I am using MongoDB aggregation to aggregate a set of data. My situation is a bit complex. I have a collection as follows:

{
  startTime: ISODate("2014-12-31T10:20:30Z"),
  customerId: 123,
  ping: "2",
  link: "3"
}

Now I want to aggregate data to another collection as following:

{
_id: {
 day: ISODate("2014-12-31T00:00:00Z"),
 customerId: 123
 },
hours: [
  {
   hour: ISODate("2014-12-31T10:00:00Z"),
   pings: 2,
   links: 3
  },
  {
   hour: ISODate("2014-12-31T11:00:00Z"),
   pings: 5,
   links: 6
  }
 ]
}

As you can see, the data is grouped by day first and then by hour. I've got the following aggregation query to group them by day, but how do I group them further by hour? Any ideas?

// Aggregation pipeline: totals of `ping` and `link` per customer per day,
// where "day" is startTime with its hour/minute/second/millisecond removed.
var pipeline = [
    // Filter first. Every later stage passes startTime through unchanged, so
    // matching up front selects the same documents, shrinks the input of the
    // $project stages, and lets $match use an index on startTime if one exists.
    {
        $match: {
            startTime: {
                $gte: new ISODate("2013-12-01T07:00:00Z"),
                $lte: new ISODate("2014-01-01T08:00:00Z")
            }
        }
    },
    // Split the time-of-day of startTime into its components.
    {
        $project: {
            startTime: 1,
            customerId: 1,
            ping: 1,
            link: 1,
            date: "$startTime",
            h: { $hour: "$startTime" },
            m: { $minute: "$startTime" },
            s: { $second: "$startTime" },
            ml: { $millisecond: "$startTime" }
        }
    },
    // Subtract the time-of-day (expressed in milliseconds) from the date,
    // leaving midnight of the same day.
    {
        $project: {
            customerId: 1,
            ping: 1,
            link: 1,
            date: {
                $subtract: [
                    "$date",
                    {
                        $add: [
                            "$ml",
                            { $multiply: ["$s", 1000] },
                            { $multiply: ["$m", 60, 1000] },
                            { $multiply: ["$h", 60, 60, 1000] }
                        ]
                    }
                ]
            }
        }
    },
    // Aggregate the data
    {
        $group: {
            _id: { day: "$date", customerId: "$customerId" },
            pings: { $sum: "$ping" },
            // The projected field is `link`; summing the non-existent `links`
            // field would always yield 0.
            links: { $sum: "$link" }
        }
    }
];
like image 242
user3756522 Avatar asked Jun 19 '14 13:06

user3756522


1 Answers

What you basically want is a double grouping, but you do not get the entire date object back using the date aggregation operators, just the relevant parts:

// Two-pass grouping: first per customer/day/hour, then fold the hourly
// buckets into one document per customer/day.
db.collection.aggregate([
    // Pass 1: sum the counters for every (customer, day-of-year, hour-of-day).
    // NOTE: $dayOfYear carries no year, so the same day number taken from
    // different years ends up in the same bucket.
    {
        $group: {
            _id: {
                customerId: "$customerId",
                day: { $dayOfYear: "$startTime" },
                hour: { $hour: "$startTime" }
            },
            pings: { $sum: "$ping" },
            links: { $sum: "$link" }
        }
    },
    // Pass 2: regroup on (customer, day) and collect each hourly total
    // into the `hours` array.
    {
        $group: {
            _id: {
                customerId: "$_id.customerId",
                day: "$_id.day"
            },
            hours: {
                $push: {
                    hour: "$_id.hour",
                    pings: "$pings",
                    links: "$links"
                }
            }
        }
    }
])

The double $group gives you the format you want by placing the results into an array per day. There is only a single document in the sample, but you basically get results like this:

{
    "_id" : {
            "customerId" : 123,
            "day" : 365
    },
    "hours" : [
            {
                    "hour" : 10,
                    "pings" : 2,
                    "links" : 3
            }
    ]
}

If you find the results of the date operators too difficult to deal with, or want a simplified "pass-through" result for date objects, then you could cast as epoch timestamps instead:

// Same two-pass grouping, but the day and hour keys are epoch-millisecond
// values truncated to the start of the day / hour.
db.collection.aggregate([
    // Pass 1: bucket per (customer, start-of-day, start-of-hour).
    // Subtracting the epoch date from startTime yields milliseconds since
    // the epoch; removing the remainder ($mod) of the interval length
    // rounds that value down to the interval boundary.
    {
        $group: {
            _id: {
                customerId: "$customerId",
                day: {
                    $subtract: [
                        { $subtract: ["$startTime", new Date("1970-01-01")] },
                        {
                            $mod: [
                                { $subtract: ["$startTime", new Date("1970-01-01")] },
                                1000 * 60 * 60 * 24 // milliseconds per day
                            ]
                        }
                    ]
                },
                hour: {
                    $subtract: [
                        { $subtract: ["$startTime", new Date("1970-01-01")] },
                        {
                            $mod: [
                                { $subtract: ["$startTime", new Date("1970-01-01")] },
                                1000 * 60 * 60 // milliseconds per hour
                            ]
                        }
                    ]
                }
            },
            pings: { $sum: "$ping" },
            links: { $sum: "$link" }
        }
    },
    // Pass 2: regroup on (customer, day) and collect each hourly total
    // into the `hours` array.
    {
        $group: {
            _id: {
                customerId: "$_id.customerId",
                day: "$_id.day"
            },
            hours: {
                $push: {
                    hour: "$_id.hour",
                    pings: "$pings",
                    links: "$links"
                }
            }
        }
    }
])

The trick in there is when you $subtract one date object from another you get the "epoch" value back as a result. In this case we use the "epoch" start date to get the whole timestamp value and just provide the "date math" to correct the times to the required intervals. So the result:

{
    "_id" : {
            "customerId" : 123,
            "day" : NumberLong("1419984000000")
    },
    "hours" : [
            {
                    "hour" : NumberLong("1420020000000"),
                    "pings" : 2,
                    "links" : 3
            }
    ]
}

Which might be more palatable to you than what the date operators provide as a result depending on your needs.

You can also add a little shorthand for this with MongoDB 2.6 via the $let operator, which allows you to declare "variables" for scoped operations:

// Epoch-millisecond bucketing again, with $let naming the shared
// sub-expressions instead of repeating them.
db.event.aggregate([
    // Pass 1: bucket per (customer, start-of-day, start-of-hour).
    {
        $group: {
            _id: {
                $let: {
                    vars: {
                        // startTime as milliseconds since the epoch
                        date: { $subtract: ["$startTime", new Date("1970-01-01")] },
                        // interval lengths in milliseconds
                        day: 1000 * 60 * 60 * 24,
                        hour: 1000 * 60 * 60
                    },
                    // Round the timestamp down to each interval boundary by
                    // removing the remainder of the interval length.
                    in: {
                        customerId: "$customerId",
                        day: {
                            $subtract: ["$$date", { $mod: ["$$date", "$$day"] }]
                        },
                        hour: {
                            $subtract: ["$$date", { $mod: ["$$date", "$$hour"] }]
                        }
                    }
                }
            },
            pings: { $sum: "$ping" },
            links: { $sum: "$link" }
        }
    },
    // Pass 2: regroup on (customer, day) and collect each hourly total
    // into the `hours` array.
    {
        $group: {
            _id: {
                customerId: "$_id.customerId",
                day: "$_id.day"
            },
            hours: {
                $push: {
                    hour: "$_id.hour",
                    pings: "$pings",
                    links: "$links"
                }
            }
        }
    }
])

Also, I nearly forgot to mention that your values for "ping" and "link" are actually strings, unless that is a typo. If it is not, then make sure you convert them to numbers first.

like image 59
Neil Lunn Avatar answered Nov 13 '22 19:11

Neil Lunn