I have a collection of decision documents in a form similar to:
{
    _id: ObjectId("23de23802fe925b6ef7162a4"),
    userId: "6de4",
    decision: true,
    datetime: ISODate("2016-07-27T08:22:47.169Z")
},
{
    _id: ObjectId("507f1f77bcf86cd799439011"),
    userId: "23f4",
    decision: true,
    datetime: ISODate("2016-02-03T11:48:50.456Z")
},
...
I'm struggling to figure out a way of grouping these documents by contiguous datetimes, i.e. a document should belong to a particular group if it is less than, say, 5 minutes away from at least one other document in that group.
The aim is to end up with groups of decisions that were made in a single "session". Further insights could then be derived from these "session" groups using aggregation (such as the average time per decision, etc.).
If this is not possible with MongoDB's aggregation framework, can it be done with map-reduce or by some other means? I am open to suggestions.
Another way of picturing the problem is to apply the following algorithm to the collection of documents.
This would leave the collection with the required "session" groupings. Of course this is just one way of picturing the problem; I am not aware of any way to traverse an ordered collection while grouping it like this in MongoDB.
Can it be done this way? Is there another way to achieve the same result using MongoDB?
With the algorithm you described, the grouping decision for each document always depends on another document. I don't see a way to do this with map-reduce, the aggregation framework, or a single MongoDB query. The only solution I see is to follow your algorithm strictly: read each document and decide whether it belongs to the current group or should start a new one.
Loading all documents into memory is not recommended, since the collection could be very large, so I used a stream to read one document at a time.
Create a cursor that finds all documents and orders them by date, then use cursor.on('data', function(doc) { ... });
to process each document individually.
var groups = {}; // grouped documents, keyed by the timestamp of each group's first document
var currentGroupKey;
var groupInterval = 5 * 60 * 1000; // five minutes in milliseconds

var cursor = db.collection("documents").find({}).sort({ date: 1 });

cursor.on('data', function(doc) {
    var timestamp = doc.date.getTime();
    if (currentGroupKey != null && currentGroupKey + groupInterval >= timestamp) {
        // within five minutes of the current group's start: add it to the current group
        groups[currentGroupKey].push(doc);
    } else {
        // otherwise start a new group keyed by this document's timestamp
        groups[timestamp] = [doc];
        currentGroupKey = timestamp;
    }
});

cursor.once('end', function() {
    // called after the last document has been read
    console.log(groups); // print the grouped documents
    db.close();
});
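For completeness, here is one way the db handle used above could be obtained. This is only a minimal sketch, assuming the Node.js MongoDB driver (the mongodb npm package, 2.x API, where connect hands back a db object); the connection URL and database name are placeholders.
// Minimal setup sketch (assumption: Node.js "mongodb" driver, 2.x API).
var MongoClient = require('mongodb').MongoClient;

MongoClient.connect('mongodb://localhost:27017/test', function(err, db) {
    if (err) throw err;

    var cursor = db.collection('documents').find({}).sort({ date: 1 });

    // attach the 'data' and 'end' handlers shown above to this cursor
});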
For these documents:
[ { _id: 57f59acb8e73d9634ac8c7b0,
index: 3,
date: Wed Oct 05 2016 21:02:29 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7ae,
index: 1,
date: Wed Oct 05 2016 21:04:02 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7b3,
index: 6,
date: Wed Oct 05 2016 21:07:43 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7b4,
index: 7,
date: Wed Oct 05 2016 21:10:26 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7b2,
index: 5,
date: Wed Oct 05 2016 21:14:23 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7b5,
index: 8,
date: Wed Oct 05 2016 21:17:39 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7b6,
index: 9,
date: Wed Oct 05 2016 21:21:07 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7ad,
index: 0,
date: Wed Oct 05 2016 21:24:19 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7af,
index: 2,
date: Wed Oct 05 2016 21:25:50 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7b1,
index: 4,
date: Wed Oct 05 2016 21:28:13 GMT-0300 (BRT) } ]
The final groups object is:
{ '1475712149573':
[ { _id: 57f59acb8e73d9634ac8c7b0,
index: 3,
date: Wed Oct 05 2016 21:02:29 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7ae,
index: 1,
date: Wed Oct 05 2016 21:04:02 GMT-0300 (BRT) } ],
'1475712463238':
[ { _id: 57f59acb8e73d9634ac8c7b3,
index: 6,
date: Wed Oct 05 2016 21:07:43 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7b4,
index: 7,
date: Wed Oct 05 2016 21:10:26 GMT-0300 (BRT) } ],
'1475712863890':
[ { _id: 57f59acb8e73d9634ac8c7b2,
index: 5,
date: Wed Oct 05 2016 21:14:23 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7b5,
index: 8,
date: Wed Oct 05 2016 21:17:39 GMT-0300 (BRT) } ],
'1475713267412':
[ { _id: 57f59acb8e73d9634ac8c7b6,
index: 9,
date: Wed Oct 05 2016 21:21:07 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7ad,
index: 0,
date: Wed Oct 05 2016 21:24:19 GMT-0300 (BRT) },
{ _id: 57f59acb8e73d9634ac8c7af,
index: 2,
date: Wed Oct 05 2016 21:25:50 GMT-0300 (BRT) } ],
'1475713693672':
[ { _id: 57f59acb8e73d9634ac8c7b1,
index: 4,
date: Wed Oct 05 2016 21:28:13 GMT-0300 (BRT) } ] }
EDIT
Since the grouping logic should always compare against the last document read, I modified the algorithm to fit that. It also now tags each document with a group key, so it doesn't keep all documents in memory.
var lastDocumentTimestamp;
var groupIndex = 0;
var groupInterval = 5 * 60 * 1000; // five minutes in milliseconds

var cursor = db.collection("documents").find({}).sort({ date: 1 });

cursor.on('data', function(doc) {
    var timestamp = doc.date.getTime();
    // start a new group when this document is more than five minutes after the previous one
    // (on the first document the comparison is false, so it stays in group 0)
    if (lastDocumentTimestamp + groupInterval < timestamp) {
        groupIndex++;
    }
    lastDocumentTimestamp = timestamp;
    // tag the document with its group index
    db.collection("documents").update({ _id: doc._id }, { $set: { group: groupIndex } });
});

cursor.once('end', function() {
    // called after the last document has been read
    db.close();
});
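One caveat: the update calls above are asynchronous, so the 'end' handler can fire (and close the connection) before every update has finished. A hedged sketch of one way around this, assuming the driver returns a promise from update() when no callback is passed, is to collect the promises and wait for them before closing:
// Variation of the pass above that waits for all updates before closing.
// Assumption: update() returns a promise when called without a callback.
var lastDocumentTimestamp;
var groupIndex = 0;
var groupInterval = 5 * 60 * 1000;
var pendingUpdates = [];

var cursor = db.collection("documents").find({}).sort({ date: 1 });

cursor.on('data', function(doc) {
    var timestamp = doc.date.getTime();
    if (lastDocumentTimestamp + groupInterval < timestamp) {
        groupIndex++;
    }
    lastDocumentTimestamp = timestamp;
    pendingUpdates.push(
        db.collection("documents").update({ _id: doc._id }, { $set: { group: groupIndex } })
    );
});

cursor.once('end', function() {
    Promise.all(pendingUpdates).then(function() {
        db.close();
    });
});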
After that, you can use the aggregation framework to group the documents by their group key:
db.collection("documents").aggregate([{
    $group: {
        _id: "$group",
        count: { $sum: 1 },
        docs: { $push: "$date" }
    }
}])
This produces a result like:
[ { _id: 0,
count: 1,
docs: [ Thu Oct 06 2016 22:00:20 GMT-0300 (BRT) ] },
{ _id: 1,
count: 4,
docs:
[ Thu Oct 06 2016 22:20:31 GMT-0300 (BRT),
Thu Oct 06 2016 22:22:52 GMT-0300 (BRT),
Thu Oct 06 2016 22:25:34 GMT-0300 (BRT),
Thu Oct 06 2016 22:27:15 GMT-0300 (BRT) ] },
{ _id: 2,
count: 5,
docs:
[ Thu Oct 06 2016 22:33:27 GMT-0300 (BRT),
Thu Oct 06 2016 22:35:45 GMT-0300 (BRT),
Thu Oct 06 2016 22:38:45 GMT-0300 (BRT),
Thu Oct 06 2016 22:40:02 GMT-0300 (BRT),
Thu Oct 06 2016 22:44:20 GMT-0300 (BRT) ] } ]
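Since the question mentions deriving figures like the average time per decision per "session", here is a hedged sketch of how that could be computed once the group field is in place: take the span between each session's first and last document and divide it by the number of gaps between decisions. Field names (date, group) follow the examples above; averageMsPerDecision is just an illustrative output name.
// Rough per-session statistics, assuming each document now carries a "group" field.
// averageMsPerDecision = session span / (count - 1); single-document sessions yield null.
db.collection("documents").aggregate([
    { $group: {
        _id: "$group",
        count: { $sum: 1 },
        first: { $min: "$date" },
        last: { $max: "$date" }
    } },
    { $project: {
        count: 1,
        averageMsPerDecision: {
            $cond: [
                { $gt: ["$count", 1] },
                { $divide: [{ $subtract: ["$last", "$first"] }, { $subtract: ["$count", 1] }] },
                null
            ]
        }
    } }
])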