Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Join two collections with MapReduce in MongoDB

I already know that MongoDB doesn't support join operations, but i have to simulate a $lookup (from the aggregation framework) with the mapReduce paradigm.

My two collections are:

// Employees sample 
{
  "_id" : "1234",
  "first_name" : "John",
  "last_name" : "Bush",
  "departments" : 
  [ 
    { "dep_id" : "d001", "hire_date" : "date001" },
    { "dep_id" : "d004", "hire_date" : "date004" }
  ]
}
{ 
  "_id" : "5678", 
  "first_name" : "Johny", 
  "last_name" : "Cash", 
  "departments" : [ { "dep_id" : "d001", "hire_date" : "date03" } ] 
}
{ 
  "_id" : "9012", 
  "first_name" : "Susan", 
  "last_name" : "Bowdy", 
  "departments" : [ { "dep_id" : "d004", "hire_date" : "date04" } ] 
}

// Departments sample 
{
  "_id" : "d001",
  "dep_name" : "Sales",
  "employees" : [ "1234", "5678" ]
},
{
  "_id" : "d004",
  "name" : "Quality M",
  "employees" : [ "1234", "9012" ]
}

And actually I'd like to have a result like this:

{
  "_id" : "1234",
  "value" : 
  {
    "first_name" : "John",
    "departments" :
    [
      { "dep_id" : "d001", "dep_name" : "Sales" },
      { "dep_id" : "d004", "dep_name" : "Quality M" }
    ]
  }
}
{ 
  "_id" : "5678", 
  "value" : 
  { 
    "first_name" : "Johnny", 
    "departments" : [ { "dep_id" : "d001", "dep_name" : "Sales" } ]
  } 
}
{ 
  "_id" : "9012", 
  "value" : 
  { 
    "first_name" : "Susan", 
    "departments" : [ { "dep_id" : "d004", "dep_name" : "Quality M" } ] 
  } 
}

The common fields are dep_id (from Employees) and _id (from Departments).

My code is the next one, but it doesn't work as I need.

var mapD = function() {
  for (var i=0; i<this.employees.length; i++) {
    emit(this.employees[i], { dep_id: 0, dep_name: this.dep_name });
  }
}

var mapE = function() {
  for (var i=0; i<this.departments.length; i++) {
    emit(this._id, { dep_id: this.departments[i].dep_id, dep_name: 0 });
  }
}

var reduceLookUp = function(key, values) {
  var result = {dep_id: 0, dep_name: 0};
  values.forEach(function(value) {
    if (value.dep_name !== null && value.dep_name !== undefined) {
      result.dep_name = values.dep_name;
    }
    if (value.dep_id !== null && value.dep_id !== undefined) {
      result.dep_id = value.dep_id;
    }
  });
  return result;
};

db.Departments.mapReduce(mapD, reduceLookUp, { out: { reduce: "joined" } });
db.Employees.mapReduce(mapE, reduceLookUp, { out: { reduce: "joined" } });

I'll really apreciate your help! Thanks in advance.

like image 729
Vzqivan Avatar asked Dec 25 '22 02:12

Vzqivan


1 Answers

In your problem first_name can be fetched only from Employees collection and dep_name can be fetched only from Departments collection.

You can achieve it both with MapReduce and aggregation framework.

1. MapReduce solution

If you modify your map and reduce functions as follows

var mapD = function() {
  for (var i=0; i<this.employees.length; i++)
    emit(this.employees[i], { dep_id: this._id, dep_name: this.dep_name });  
}

var mapE = function() { emit(this._id, { first_name: this.first_name }); }

var reduceLookUp = function(key, values) {
  var results = {};
  var departments = [];
  values.forEach(function(value) {
    var department = {};
    if (value.dep_id !== undefined) department["dep_id"] = value.dep_id;
    if (value.dep_name !== undefined) department["dep_name"] = value.dep_name;
    if (Object.keys(department).length > 0) departments.push(department);
    if (value.first_name !== undefined) results["first_name"] = value.first_name;
    if (value.departments !== undefined) results["departments"] = value.departments;
  });
  if (Object.keys(departments).length > 0) results["departments"] = departments;
  return results;
}

then first MapReduce call

db.Departments.mapReduce(mapD, reduceLookUp, { out: { reduce: "joined" } });

will insert into joined collection

{ 
  "_id" : "1234", 
  "value" : 
  {
    "departments" : 
    [ 
      { "dep_id" : "d001", "dep_name" : "Sales" }, 
      { "dep_id" : "d004", "dep_name" : "Quality M" } 
    ] 
  }
}

while second call

db.Employees.mapReduce(mapE, reduceLookUp, { out: { reduce: "joined" } });

should insert

{ "_id" : "1234", "value" : { "first_name" : "John" } }

but, according to documentation, reduce output option will

Merge the new result with the existing result if the output collection already exists. If an existing document has the same key as the new result, apply the reduce function to both the new and the existing documents and overwrite the existing document with the result

Thus, reduce function will be called again in your case with parameters

key = "1234",
values =
[
  {
    "departments" : 
    [ 
      { "dep_id" : "d001", "dep_name" : "Sales" }, 
      { "dep_id" : "d004", "dep_name" : "Quality M" } 
    ] 
  },
  { "first_name" : "John" }
]

and final result is

{ 
  "_id" : "1234", 
  "value" : 
  { 
    "first_name" : "John", 
    "departments" : 
    [ 
      { "dep_id" : "d001", "dep_name" : "Sales" }, 
      { "dep_id" : "d004", "dep_name" : "Quality M" }
    ] 
  } 
}

2. Aggregation framework solution

A better solution for your problem is to use aggregation framework instead of Map-Reduce. Here you would use $lookup stage to fetch some data from Employees

db.Departments.aggregate([
  { $unwind: "$employees" },
  { 
    $lookup: 
      { 
        from: "Employees", 
        localField: "employees", 
        foreignField: "_id", 
        as: "employee"
      }
  },
  { $unwind: "$employee" },
  { 
    $group: 
      { 
        "_id": "$employees",
        "first_name": { $first: "$employee.first_name" }, 
        "departments": { $push: { dep_id: "$_id", dep_name: "$dep_name" } } 
      } 
  } 
]);

that will result into

{ 
  "_id" : "1234",
  "first_name" : "John",
  "departments" : 
    [ 
      { "dep_id" : "d001", "dep_name" : "Sales" }, 
      { "dep_id" : "d004", "dep_name" : "Quality M" } 
    ] 
}
like image 87
tarashypka Avatar answered Jan 03 '23 04:01

tarashypka