Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Reshape documents by splitting a field value

Suppose we have a collection of raw data:

{ "person": "David, age 102"}
{ "person": "Max, age 8" }

and we'd like to transform that collection to:

{ "age": 102 }
{ "age": 8 }

using only mongo(d) engine. (If all person names or ages had equal lengths, $substr could do the job, ) Is it possible?

Suppose regex is trivial /\d+/

like image 299
Fake Fish Avatar asked Oct 19 '22 10:10

Fake Fish


1 Answers

The optimal way in MongoDB version 3.4.

This version of mongod provides the $split operator which, of course split the string as shown here.

We then assign the the newly computed value to a variable using the $let variable operator. The new value can then be use in the in expression to return the "name" and the "age" values using the $arrayElemAt operator to return the element at a specified index; 0 for the first element and -1 for the last element.

Note that in the in expression we need to split the last element in order to return the string of integer.

Finally we need to iterate the Cursor object and cast the convert the string of integer to numeric using Number or parseInt and use bulk operation and the bulkWrite() method to $set the value for those field for maximum efficiency.

let requests = [];
db.coll.aggregate(
    [
        { "$project": {  
            "person": { 
                "$let": { 
                    "vars": { 
                        "infos":  { "$split": [ "$person", "," ] } 
                    }, 
                    "in": { 
                        "name": { "$arrayElemAt": [ "$$infos", 0 ] }, 
                        "age": { 
                            "$arrayElemAt": [ 
                                { "$split": [ 
                                    { "$arrayElemAt": [ "$$infos", -1 ] }, 
                                    " " 
                                ]}, 
                                -1 
                            ] 
                        } 
                    } 
                } 
            }  
        }}
    ] 
).forEach(document => { 
    requests.push({ 
        "updateOne": { 
            "filter": { "_id": document._id }, 
            "update": { 
                "$set": { 
                    "name": document.person.name, 
                    "age": Number(document.person.age) 
                },
                "$unset": { "person": " " }
            } 
        } 
    }); 
    if ( requests.length === 500 ) { 
        // Execute per 500 ops and re-init
        db.coll.bulkWrite(requests); 
        requests = []; 
    }} 
);

 // Clean up queues
if(requests.length > 0) {
    db.coll.bulkWrite(requests);
}

MongoDB 3.2 or newer.

MongoDB 3.2 deprecates the old Bulk() API and its associated methods and provides the bulkWrite() method but it doesn't provide the $split operator so the only option we have here is to use the mapReduce() method to transform our data then update the collection using bulk operation.

var mapFunction = function() { 
    var person = {}, 
    infos = this.person.split(/[,\s]+/); 
    person["name"] = infos[0]; 
    person["age"] = infos[2]; 
    emit(this._id, person); 
};

var results = db.coll.mapReduce(
    mapFunction, 
    function(key, val) {}, 
    { "out": { "inline": 1 } }
)["results"];

results.forEach(document => { 
    requests.push({ 
        "updateOne": { 
            "filter": { "_id": document._id }, 
            "update": { 
                "$set": { 
                    "name": document.value.name, 
                    "age": Number(document.value.age) 
                }, 
                "$unset": { "person": " " }
            } 
        } 
    }); 
    if ( requests.length === 500 ) { 
        // Execute per 500 operations and re-init
        db.coll.bulkWrite(requests); 
        requests = []; 
    }} 
);

// Clean up queues
if(requests.length > 0) {
    db.coll.bulkWrite(requests);
}

MongoDB version 2.6 or 3.0.

We need to use the now deprecated Bulk API.

var bulkOp = db.coll.initializeUnorderedBulkOp();
var count = 0;

results.forEach(function(document) { 
    bulkOp.find({ "_id": document._id}).updateOne(
        { 
            "$set": { 
                "name": document.value.name, 
                "age": Number(document.value.age)
            },
            "$unset": { "person": " " }
        }
    );
    count++;
    if (count === 500 ) {
        // Execute per 500 operations and re-init
        bulkOp.execute();
        bulkOp = db.coll.initializeUnorderedBulkOp();
    }
});

// clean up queues
if (count > 0 ) {
    bulkOp.execute();
}
like image 159
styvane Avatar answered Oct 21 '22 04:10

styvane