I am trying to make an aggregate query with "group"
to get the total of results.
The total of "requested_items"
(my results) are +- 1.900.000.
If I execute with "group", the query is very slow (+- 300sec).
If I execute without "group", the query is very fast (+- 1sec).
What am I doing wrong?
The sample code is below.
SLOW QUERY
db.minute.aggregate([
{ $match: {
$and: [
{ "status": "Homologado" },
{ "requested_items.status": /aceito/i },
]
} },
{ $sort: {'_id': 1}},
{ $unwind: "$requested_items" },
{ $unwind: "$requested_items.winner" },
{ $match: {
$and: [
{ "status": "Homologado" },
{ "requested_items.status": /aceito/i },
]
} },
{ $project: {
"_id": 1
} },
{ $group: {
"_id" : null,
"total" : {$sum: 1},
} },
], {allowDiskUse: true});
FAST QUERY
db.minute.aggregate([
{ $match: {
$and: [
{ "status": "Homologado" },
{ "requested_items.status": /aceito/i },
]
} },
{ $sort: {'_id': 1}},
{ $unwind: "$requested_items" },
{ $unwind: "$requested_items.winner" },
{ $match: {
$and: [
{ "status": "Homologado" },
{ "requested_items.status": /aceito/i },
]
} },
{ $project: {
"_id": 1
} },
], {allowDiskUse: true});
DB STRUCTURE
{
"_id" : "12345678ABCD",
"field_1" : [
{
"a" : null,
"b" : "ABC"
},
{
"code" : null,
"b" : "ABCD"
}
],
"status" : "Homologado",
"initial_date" : ISODate("2016-05-24T11:31:00.000Z"),
"field_2" : [
{
"a" : "ABC",
"b" : "ABCDE"
},
{
"a" : "ABCF",
"b" : "ABCDEF"
}
],
"field_3" : "Lorem ipsum dolor sit amet...",
"field_4" : [
{
"date" : ISODate("2016-05-24T13:54:48.000Z"),
"a" : "Text",
"b" : "More text..."
}
],
"field_4" : 12312321,
"field_5" : ISODate("2016-05-24T13:55:00.000Z"),
"field_6" : "ABCD",
"requested_items" : [
{
"status" : " Aceito e Habilitado",
"field_a" : "Text...",
"winner" : [
{
"a" : "23213.213213.23/232-23",
"b" : 130446,
"c" : 543223,
"d" : NumberLong(2),
"e" : "ABC 123 FULANO",
"f" : "text",
"g" : {
"description" : "TEXT TEXT TEXT"
}
},
{
"a" : "23213.213213.23/232-23",
"b" : 130446,
"c" : 543223,
"d" : NumberLong(2),
"e" : "ABC 123 FULANO",
"f" : "text",
"g" : {
"description" : "TEXT TEXT TEXT"
}
}
],
"field_c" : {
"_id" : ObjectId("5744dd3271af88052f0cc343"),
"a" : "TEXT",
"b" : "TEXT"
},
"field_d" : NumberLong(2),
"field_e" : 5223,
"field_f" : "Não",
"field_g" : "-",
"field_h" : {
"field_a1" : [
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
},
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
}
],
"field_a2" : [
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
},
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
}
],
"field_a3" : {},
"field_a4" : [
{
"date" : ISODate("2016-05-24T11:34:32.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:12:54.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:48:21.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:55:38.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:55:47.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T13:01:36.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T13:15:02.000Z"),
"A" : "TEXT",
"B" : "TEXT"
}
]
},
"field_i" : "Não",
"field_j" : 1
},
{
"status" : " Aceito e Habilitado",
"field_a" : "Text...",
"winner" : [
{
"a" : "23213.213213.23/232-23",
"b" : 130446,
"c" : 543223,
"d" : NumberLong(2),
"e" : "ABC 123 FULANO",
"f" : "text",
"g" : {
"description" : "TEXT TEXT TEXT"
}
}
],
"field_c" : {
"_id" : ObjectId("5744dd3271af88052f0cc343"),
"a" : "TEXT",
"b" : "TEXT"
},
"field_d" : NumberLong(2),
"field_e" : 5223,
"field_f" : "Não",
"field_g" : "-",
"field_h" : {
"field_a1" : [
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
},
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
}
],
"field_a2" : [
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
},
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
}
],
"field_a3" : {},
"field_a4" : [
{
"date" : ISODate("2016-05-24T11:34:32.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:12:54.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:48:21.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:55:38.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:55:47.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T13:01:36.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T13:15:02.000Z"),
"A" : "TEXT",
"B" : "TEXT"
}
]
},
"field_i" : "Não",
"field_j" : 2
},
{
"status" : " Aceito e Habilitado",
"field_a" : "Text...",
"winner" : [
{
"a" : "23213.213213.23/232-23",
"b" : 130446,
"c" : 543223,
"d" : NumberLong(2),
"e" : "ABC 123 FULANO",
"f" : "text",
"g" : {
"description" : "TEXT TEXT TEXT"
}
}
],
"field_c" : {
"_id" : ObjectId("5744dd3271af88052f0cc343"),
"a" : "TEXT",
"b" : "TEXT"
},
"field_d" : NumberLong(2),
"field_e" : 5223,
"field_f" : "Não",
"field_g" : "-",
"field_h" : {
"field_a1" : [
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
},
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
}
],
"field_a2" : [
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
},
{
"a" : "23213.213213.23/232-23",
"b" : ISODate("2016-05-23T23:54:21.000Z"),
"c" : 103432446,
"d" : 522343,
"e" : "Sim",
"f" : NumberLong(2),
"g" : "TEXT TEXT TEXT",
"h" : "Sim",
"i" : {
"a" : "TEXT TEXT TEXT"
}
}
],
"field_a3" : {},
"field_a4" : [
{
"date" : ISODate("2016-05-24T11:34:32.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:12:54.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:48:21.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:55:38.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T12:55:47.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T13:01:36.000Z"),
"A" : "TEXT",
"B" : "TEXT"
},
{
"date" : ISODate("2016-05-24T13:15:02.000Z"),
"A" : "TEXT",
"B" : "TEXT"
}
]
},
"field_i" : "Não",
"field_j" : 3
},
],
"field_7" : "TEXT",
"field_8" : {
"a" : "TEXT",
"b" : "TEXT",
"c" : "324234",
"d" : "TEXT TEXT TEXT TEXT"
},
"field_9" : 43234
}
EXPLAIN
{
"waitedMS" : NumberLong(0),
"stages" : [
{
"$cursor" : {
"query" : {
"$and" : [
{
"status" : "Homologado"
},
{
"requested_items.status" : /aceito/i
}
]
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "module_database.minute",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"status" : {
"$eq" : "Homologado"
}
},
{
"requested_items.status" : /aceito/i
}
]
},
"winningPlan" : {
"stage" : "COLLSCAN",
"filter" : {
"$and" : [
{
"status" : {
"$eq" : "Homologado"
}
},
{
"requested_items.status" : /aceito/i
}
]
},
"direction" : "forward"
},
"rejectedPlans" : []
}
}
},
{
"$unwind" : {
"path" : "$requested_items"
}
},
{
"$unwind" : {
"path" : "$requested_items.winner"
}
},
{
"$match" : {
"$and" : [
{
"status" : "Homologado"
},
{
"requested_items.status" : /aceito/i
}
]
}
},
{
"$group" : {
"_id" : {
"$const" : null
},
"numberOfdocs" : {
"$sum" : {
"$const" : 1
}
}
}
}
],
"ok" : 1
}
My server is:
OS: UBUNTU14/64
CPU: 6
RAM: 16 GB
Total Storage: 80 GB
Running just the tests of my question.
So an aggregation that uses `$lookup` without indexes can lead to extremely slow trains, which can lead to performance degradation on other queries.
In MongoDB, aggregation operations process the data records/documents and return computed results. It collects values from various documents and groups them together and then performs different types of operations on that grouped data like sum, average, minimum, maximum, etc to return a computed result.
This aggregate works fast for my database of 10,000 people - in fact, it completed this in 273.199ms. However, how will it fare for a MongoDB of 10 million entries? If these rates are proportional [100k:2.7s, 1m:27s, 10m:4m30s], it could take 4 minutes and 30 seconds.
A group key is often a field, or group of fields. The group key can also be the result of an expression. Use the _id field in the $group pipeline stage to set the group key.
Finally solved the problem on my query with group. It was an error of design patterns. Thinking in SQL world, I designed the collections before thinking about my app. As a result, slow queries.
To resolve it I had to redesign my collections and put the relevant data in first level of my docs. In my searches, I found that on Aggregation, index needs to be in first stage of pipeline. If I use a field with index after the stage $unwind, it is not considered.
Besides that, I created an int hash for text fields using the package https://github.com/darkskyapp/string-hash. So, my text fields can be indexed.
So my queries changed from 300s to 5s.
it is hard to determine speed as we don't have environment details. What you could try to see how explain is predicting you query by adding:
{
explain:true
}
to your aggregation query db.coll.aggregate([pipeline], {explain:true},{allowDiskUse: true})
.
What also need to be considered that $unwind
doubles amount of documents to process.
As you re going to count amount of documents -> it could be faster just take size of an array (after first unwind) and sum it later
db.inventory.aggregate(
[
{
$group: {
_id: null,
numberOfdocs: { $sum:{$size: "$requested_items.winner" }}
}
}
]
)
after playing with this query I was able to reduce it execution time circa 45%.
The main point is to skip second $match
as this scans full result set , so last $group
contains all data and we can filter out what's needed at the end as this operation is done on a small result set.
db.coll.aggregate([{
$match : {
"status" : "Homologado"
}
}, {
$unwind : "$requested_items"
}, {
$unwind : "$requested_items.winner"
}, {
$project : {
x : "$requested_items.status",
}
}, {
$group : {
_id : "$x",
numberOfdocs : {
$sum : 1
}
}
}, {
$match : {
"_id" : /acesssito/i
}
}
], {
allowDiskUse: true
});
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With