In each document,
the "records" field is an array containing many duplicated objects,
and each "buy_items" array also contains many duplicated items.
How can I clean up the duplicated items?
Original documents:
{
"_id": "0005d116qwwewdq82a1b84f148fa6027d429f3e",
"records": [
{
"DATE": new Date("1996-02-08T08:00:00+0800"),
"buy_items": [
"5210 ",
"5210 ",
"5210 "
]
},
{
"DATE": new Date("1996-02-08T08:00:00+0800"),
"buy_items": [
"5210 ",
"5210 ",
"5210 "
]
},
{
"DATE": new Date("2012-12-08T08:00:00+0800"),
"buy_items": [
"5210 ",
"1234 ",
" "
]
}
]
}
Expected Output:
{
"_id": "0005d116qwwewdq82a1b84f148fa6027d429f3e",
"records": [
{
"DATE": new Date("1996-02-08T08:00:00+0800"),
"buy_items": [
"5210 "
]
},
{
"DATE": new Date("2012-12-08T08:00:00+0800"),
"buy_items": [
"5210 ",
"1234 ",
" "
]
}
]
}
With Michael's solution, the output might look like this:
{
"_id": "0005d116qwwewdq82a1b84f148fa6027d429f3e",
"records": [
"date": new Date("1996-02-08T08:00:00+0800"),
"buy_items": [
"5210 "
"1234 ",
" "
]
]
}
You can remove duplicated objects using the aggregation framework
// Deduplicate buy_items within each (document, date) pair, then persist
// the result with $out (which replaces the target collection's contents).
db.collection.aggregate([
    // One output document per record, then one per buy_item.
    { "$unwind": "$records" },
    { "$unwind": "$records.buy_items" },
    // $addToSet collapses duplicate items for the same document + date.
    { "$group": {
        "_id": { "id": "$_id", "date": "$records.DATE" },
        "buy_items": { "$addToSet": "$records.buy_items" }
    }},
    // Reassemble the records array for each original document.
    { "$group": {
        "_id": "$_id.id",
        "records": { "$push": { "date": "$_id.date", "buy_items": "$buy_items" } }
    }},
    { "$sort": { "records.0.date": 1 }},
    { "$out": "collection" }
]);
The $out operator lets you write your aggregation result to a specified collection, replacing the existing collection if it already exists.
Even better using "Bulk" operations
// Deduplicate buy_items per (document, date) and write the cleaned
// records back in-place using bulk updates, batched 500 at a time.
// Fix: the original `var bulk = bulk = db...` double assignment wrote to
// an undeclared global before the var binding existed.
var bulk = db.collection.initializeOrderedBulkOp();
var count = 0;

db.collection.aggregate([
    // One document per record.
    { "$unwind": "$records" },
    // $setIntersection over a single array expression returns the array
    // with duplicates removed.
    { "$project": {
        "date": "$records.DATE",
        "buy_items": { "$setIntersection": "$records.buy_items" }
    }},
    // Regroup per (document, date), merging records that share a date.
    { "$unwind": "$buy_items" },
    { "$group": {
        "_id": { "id": "$_id", "date": "$date" },
        "buy_items": { "$addToSet": "$buy_items" }
    }},
    // Reassemble the records array for each original document.
    { "$group": {
        "_id": "$_id.id",
        "records": { "$push": {
            "date": "$_id.date",
            "buy_items": "$buy_items"
        }}
    }}
]).forEach(function(doc) {
    bulk.find({ "_id": doc._id }).updateOne({
        "$set": { "records": doc.records }
    });
    count++;
    // Flush every 500 queued operations to cap client-side memory use.
    if (count % 500 === 0) {
        bulk.execute();
        bulk = db.collection.initializeOrderedBulkOp();
    }
});

// Flush any operations still queued after the loop.
if (count % 500 !== 0) {
    bulk.execute();
}
Result:
{
"_id" : "0005d116qwwewdq82a1b84f148fa6027d429f3e",
"records" : [
{
"date" : ISODate("2012-12-08T00:00:00Z"),
"buy_items" : [
" ",
"1234 ",
"5210 "
]
},
{
"date" : ISODate("1996-02-08T00:00:00Z"),
"buy_items" : [
"5210 "
]
}
]
}
If you want to update your current collection in place, without creating a new collection and dropping the previous one, you can try the following — but note that it requires running two separate update commands.
First update records with distinct like this :
db.collectionName.update({},{"$set":{"records":db.collectionName.distinct('records')}})
and second update for buy_items with distinct like this :
db.collectionName.update({},{"$set":{"records.0.buy_items":db.collectionName.distinct('records.buy_items')}})
If you want to avoid running two update queries, follow Michael's answer instead.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With