Suppose we have a collection of raw data:
{ "person": "David, age 102"}
{ "person": "Max, age 8" }
and we'd like to transform that collection to:
{ "age": 102 }
{ "age": 8 }
using only the mongo(d) engine. (If all person names or ages had equal lengths, $substr could do the job.) Is it possible?
Suppose regex is trivial /\d+/
This version of mongod (3.4 or newer — see the note on 3.2 below)
provides the $split
operator which, of course, splits the string as shown here.
We then assign the newly computed value to a variable using the $let
variable operator. The new value can then be used in the in expression to return the "name" and the "age" values using the $arrayElemAt
operator to return the element at a specified index; 0
for the first element and -1
for the last element.
Note that in the in expression we need to split the last element again in order to isolate the integer as a string.
Finally we need to iterate the Cursor
object and convert the integer string to a numeric value using Number
or parseInt
and use bulk operation and the bulkWrite()
method to $set
the values for those fields for maximum efficiency.
// Queue of updateOne operations, flushed to the server in batches of 500.
// NOTE: this binding is intentionally reused by the mapReduce-based
// fallback further down, so it must stay module-level.
let requests = [];

db.coll.aggregate([
    { "$project": {
        "person": {
            "$let": {
                "vars": {
                    // "David, age 102" -> ["David", " age 102"]
                    "infos": { "$split": [ "$person", "," ] }
                },
                "in": {
                    // First comma token is the name.
                    "name": { "$arrayElemAt": [ "$$infos", 0 ] },
                    // Split the last comma token on spaces and keep the
                    // last piece, e.g. " age 102" -> "102" (still a string).
                    "age": {
                        "$arrayElemAt": [
                            { "$split": [
                                { "$arrayElemAt": [ "$$infos", -1 ] },
                                " "
                            ]},
                            -1
                        ]
                    }
                }
            }
        }
    }}
]).forEach(document => {
    requests.push({
        "updateOne": {
            "filter": { "_id": document._id },
            "update": {
                "$set": {
                    "name": document.person.name,
                    // $split produced a string; coerce client-side.
                    "age": Number(document.person.age)
                },
                "$unset": { "person": " " }
            }
        }
    });
    if (requests.length === 500) {
        // Execute per 500 ops and re-init
        db.coll.bulkWrite(requests);
        requests = [];
    }
});

// Clean up queues: flush whatever is left (fewer than 500 ops).
if (requests.length > 0) {
    db.coll.bulkWrite(requests);
}
MongoDB 3.2 deprecates the old Bulk()
API and its associated methods and provides the bulkWrite()
method, but it does not provide the $split
operator, so the only option we have there is to use the mapReduce()
method to transform the data, then update the collection with a bulk operation.
// Map phase: parse "Name, age N" into { name, age } emitted under _id.
// Splitting on commas and/or runs of whitespace turns "David, age 102"
// into ["David", "age", "102"]: index 0 is the name, index 2 the age
// (kept as a string here; numeric coercion happens client-side later).
var mapFunction = function() {
    var tokens = this.person.split(/[,\s]+/);
    emit(this._id, { name: tokens[0], age: tokens[2] });
};
// Run the map-reduce with inline output (no result collection written).
// The reduce function is a deliberate no-op: every emitted key is a
// unique _id, so reduce is never actually invoked by the server.
var results = db.coll.mapReduce(
    mapFunction,
    function(key, val) {},
    { "out": { "inline": 1 } }
).results;
// Re-initialize the batch queue before reuse: `requests` may still hold
// operations queued by the aggregation-based script above, and replaying
// them here would issue duplicate updates against the collection.
requests = [];

results.forEach(document => {
    requests.push({
        "updateOne": {
            "filter": { "_id": document._id },
            "update": {
                "$set": {
                    "name": document.value.name,
                    // mapReduce emitted the age as a string; coerce it.
                    "age": Number(document.value.age)
                },
                "$unset": { "person": " " }
            }
        }
    });
    if (requests.length === 500) {
        // Execute per 500 operations and re-init
        db.coll.bulkWrite(requests);
        requests = [];
    }
});

// Clean up queues: flush whatever is left (fewer than 500 ops).
if (requests.length > 0) {
    db.coll.bulkWrite(requests);
}
For MongoDB servers older than 3.2 (no bulkWrite()), we need to use the now-deprecated Bulk() API.
// Fallback using the legacy (deprecated) unordered Bulk API, batched
// in groups of 500 operations.
var bulkOp = db.coll.initializeUnorderedBulkOp();
var count = 0;

results.forEach(function(document) {
    bulkOp.find({ "_id": document._id }).updateOne({
        "$set": {
            "name": document.value.name,
            "age": Number(document.value.age)
        },
        "$unset": { "person": " " }
    });
    count++;
    if (count === 500) {
        // Execute per 500 operations and re-init
        bulkOp.execute();
        bulkOp = db.coll.initializeUnorderedBulkOp();
        // BUG FIX: reset the counter. Without this, `count === 500`
        // can never match again, and when the result count is an exact
        // multiple of 500 the final execute() below would run on an
        // empty batch and throw ("no operations specified").
        count = 0;
    }
});

// Clean up queues: flush any remaining (fewer than 500) operations.
if (count > 0) {
    bulkOp.execute();
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With