A simple way to use map reduce to pre-join data sets for analysis. Enjoy!(尽量避免在做实际计算的时候去做join操作,所以可以用这种方法事先将关注的fields join到同一张表中http://stackoverflow.com/questions/4067197/mongodb-and-joins)
Have you ever wanted to join 2 collections of data in MongoDB? I’ve worked on a project that had half a dozen systems each dumping data into their own collection and then needed to do some analytics over all the data joined together. Using the new incremental MapReduce features in MongoDB 1.8 I can pre-join all these data sets together into rich, nested documents that can then be queried by an analytics tool in literally hundreds of different ways and provide the user with instant results. Its amazing. I thought I’d take the opportunity to give a simple demonstration of how this is possible.
There are 2 main tricks to do this. The first is to realize that the fields you perform a join on will be keys what you emit from your mapper function. The second trick is to have the mapping functions for each collection output data in the same format. Then you can use the same reduction function for all of the collections and run mapReduce using the “reduce” output method.
Lets walk through a simple but interesting example. We will find out the amount of money the US goverment gave to the countries with the lowest life expectancy in 2009. The data sets I will be using are:
To import the above files into mongo:
mongoimport --file us_economic_assistance.csv --headerline --type csv -d mr_demo -c us_economic_assistance --drop --ignoreBlanks mongoimport --file life_expectancy.tsv --headerline --type tsv -d mr_demo -c life_expectancy --drop --ignoreBlanks
The map function for the life_expectancy collection:
life_expect_map = function() { // Simply emit the age and 0 for the dollar amount. // The dollar amount will come from the other collection. emit(this.country, {life_expectancy: this.age, dollars: 0}); }
The map function for the US Economic Aid in 2009:
us_econ_map = function() { // The data set contains grant amounts going back to 1946. I // am only interested in 2009 grants. if(this.FY2009 !== undefined && this.FY2009 !== null) { emit(this.country_name, { dollars:this.FY2009, life_expectancy: 0 }); } }
You’ll note that the key that is being emitted by both mappers is the country’s name. This is the field I am joining on. Second, notice that the shape of the emitted documents is the same in both map functions. Here is the reduction function that we will use for both collections:
r = function(key, values) { varresult = {dollars: 0, life_expectancy: 0}; values.forEach(function(value) { // Sum up all the money from all the 2009 grants for this // country (key) result.dollars += (value.dollars !== null) ? value.dollars : 0; // Only set life expectancy once if(result.life_expectancy === 0 && value.life_expectancy !== null ) { result.life_expectancy = value.life_expectancy; } }); returnresult; }
Now run mapReduce over both collections, using the reduce output option so that if a document already exists in the output collection, the reduce function will be called with the 2 values with the result being written back to the output collection.
> res = db.life_expectancy.mapReduce(life_expect_map, r, {out: {reduce: 'joined'}})
{
"result": "joined",
"timeMillis": 22,
"counts": {
"input": 222,
"emit": 222,
"output": 222
},
"ok": 1,
}
> res = db.us_economic_assistance.mapReduce(us_econ_map, r, {out: {reduce: 'joined'}})
{
"result": "joined",
"timeMillis": 184,
"counts": {
"input": 2446,
"emit": 1353,
"output": 252
},
"ok": 1,
}
> db.joined.find({'value.dollars': {$gt:0}, 'value.life_expectancy': {$gt:0}}).sort({'value.life_expectancy':1}).limit(10)
{"_id": "Angola","value": { "dollars": 54227598, "life_expectancy": 38.76 } }
{"_id": "Afghanistan","value": { "dollars": 3046294847, "life_expectancy": 45.02 } }
{"_id": "Nigeria","value": { "dollars": 498281606, "life_expectancy": 47.56 } }
{"_id": "Chad","value": { "dollars": 221842294, "life_expectancy": 48.33 } }
{"_id": "Swaziland","value": { "dollars": 22843193, "life_expectancy": 48.66 } }
{"_id": "Guinea-Bissau","value": { "dollars": 1857000, "life_expectancy": 48.7 } }
{"_id": "South Africa","value": { "dollars": 569625000, "life_expectancy": 49.33 } }
{"_id": "Zimbabwe","value": { "dollars": 285470926, "life_expectancy": 49.64 } }
{"_id": "Central African Republic","value": { "dollars": 36109353, "life_expectancy": 50.07 } }
{"_id": "Somalia","value": { "dollars": 179253637, "life_expectancy": 50.4 } }
Download the example data sets and try it yourself. There ya have it.
下面是我基于上面文章写的nodejs测试:
Mongodb collection:
xuanhouysh@localhost:~/data/masterdb$ mongo localhost MongoDB shell version: 1.8.2 connecting to: localhost > use mr_demo. mr_demo.0 mr_demo.1 mr_demo.ns > use mr_demo switched to db mr_demo > show collections joined life_expectancy system.indexes us_economic_assistance > db.life_expectancy.find() { "_id" : ObjectId("4e9e85016ab9cd886d9271bb"), "id" : 1, "country" : "Monaco", "age" : 89.73 } { "_id" : ObjectId("4e9e85016ab9cd886d9271bc"), "id" : 2, "country" : "Macau", "age" : 84.41 } { "_id" : ObjectId("4e9e85016ab9cd886d9271bd"), "id" : 3, "country" : "San Marino", "age" : 83.01 } { "_id" : ObjectId("4e9e85016ab9cd886d9271be"), "id" : 4, "country" : "Andorra", "age" : 82.43 } { "_id" : ObjectId("4e9e85016ab9cd886d9271bf"), "id" : 5, "country" : "Japan", "age" : 82.25 } { "_id" : ObjectId("4e9e85016ab9cd886d9271c0"), "id" : 6, "country" : "Guernsey", "age" : 82.16 } { "_id" : ObjectId("4e9e85016ab9cd886d9271c1"), "id" : 7, "country" : "Singapore", "age" : 82.14 } { "_id" : ObjectId("4e9e85016ab9cd886d9271c2"), "id" : 8, "country" : "Hong Kong", "age" : 82.04 } { "_id" : ObjectId("4e9e85016ab9cd886d9271c3"), "id" : 9, "country" : "Australia", "age" : 81.81 } { "_id" : ObjectId("4e9e85016ab9cd886d9271c4"), "id" : 10, "country" : "Italy", "age" : 81.77 } { "_id" : ObjectId("4e9e85016ab9cd886d9271c5"), "id" : 11, "country" : "Jersey", "age" : 81.38 } { "_id" : ObjectId("4e9e85016ab9cd886d9271c6"), "id" : 12, "country" : "Canada", "age" : 81.38 } { "_id" : ObjectId("4e9e85016ab9cd886d9271c7"), "id" : 13, "country" : "France", "age" : 81.19 } { "_id" : ObjectId("4e9e85016ab9cd886d9271c8"), "id" : 14, "country" : "Spain", "age" : 81.17 } { "_id" : ObjectId("4e9e85016ab9cd886d9271c9"), "id" : 15, "country" : "Switzerland", "age" : 81.07 } { "_id" : ObjectId("4e9e85016ab9cd886d9271ca"), "id" : 16, "country" : "Sweden", "age" : 81.07 } { "_id" : ObjectId("4e9e85016ab9cd886d9271cb"), "id" : 17, "country" : "Israel", "age" : 80.96 } { "_id" : ObjectId("4e9e85016ab9cd886d9271cc"), "id" : 18, "country" : "Iceland", "age" : 80.9 } { "_id" : ObjectId("4e9e85016ab9cd886d9271cd"), "id" : 19, "country" : "Anguilla", "age" : 80.87 } { "_id" : ObjectId("4e9e85016ab9cd886d9271ce"), "id" : 20, "country" : "Bermuda", "age" : 80.71 } has more > db.us_economic_assistance.find() { "_id" : ObjectId("4e9e84f36ab9cd886d92682d"), "country_name" : "Afghanistan", "program_name" : "Child Survival and Health", "FY2002" : 2150000, "FY2003" : NumberLong(47945019), "FY2004" : NumberLong(35000000), "FY2005" : NumberLong(35785000), "FY2006" : NumberLong(37974000), "FY2007" : NumberLong(69444000), "FY2008" : NumberLong(27813000) } { "_id" : ObjectId("4e9e84f36ab9cd886d92682e"), "country_name" : "Afghanistan", "program_name" : "Department of Defense Security Assistance", "FY2002" : 2671000, "FY2003" : 231000, "FY2004" : NumberLong(113777303), "FY2005" : NumberLong(354880927), "FY2006" : NumberLong(290974654), "FY2007" : NumberLong(445490416), "FY2008" : NumberLong(689463607), "FY2009" : NumberLong(810885866) } { "_id" : ObjectId("4e9e84f36ab9cd886d92682f"), "country_name" : "Afghanistan", "program_name" : "Development Assistance", "FY2001" : 3361305, "FY2002" : 7283228, "FY2003" : NumberLong(46279941), "FY2004" : NumberLong(157126174), "FY2005" : NumberLong(173989599), "FY2006" : NumberLong(197646281), "FY2007" : NumberLong(165774243), "FY2008" : NumberLong(147431872), "FY2009" : 3647457 } { "_id" : ObjectId("4e9e84f36ab9cd886d926830"), "country_name" : "Afghanistan", "program_name" : "Economic Support Fund/Security Support Assistance", "FY1962" : NumberLong(31100000), "FY1993" : NumberLong(10000000), "FY1994" : 1995000, "FY2001" : 50000, "FY2002" : NumberLong(26455294), "FY2003" : NumberLong(289621552), "FY2004" : NumberLong(892527672), "FY2005" : NumberLong(1040289537), "FY2006" : NumberLong(1261962171), "FY2007" : NumberLong(1212809530), "FY2008" : NumberLong(1371420099), "FY2009" : NumberLong(1407978550) } { "_id" : ObjectId("4e9e84f36ab9cd886d926831"), "country_name" : "Afghanistan", "program_name" : "Food For Education", "FY2003" : 3358043, "FY2004" : 2271528, "FY2005" : 2924785, "FY2006" : 359596 } { "_id" : ObjectId("4e9e84f36ab9cd886d926832"), "country_name" : "Afghanistan", "program_name" : "Global Health and Child Survival", "FY2008" : NumberLong(61767000), "FY2009" : 1750933 } { "_id" : ObjectId("4e9e84f36ab9cd886d926833"), "country_name" : "Afghanistan", "program_name" : "Inactive Programs", "FY1950" : 1000, "FY1951" : 100000 } { "_id" : ObjectId("4e9e84f36ab9cd886d926834"), "country_name" : "Afghanistan", "program_name" : "Migration and Refugee Assistance", "FY1996" : 1000000, "FY1997" : 2466064, "FY1998" : 5774376, "FY1999" : 2876857, "FY2000" : NumberLong(11146395), "FY2001" : NumberLong(12800000), "FY2002" : NumberLong(38014479), "FY2003" : NumberLong(47164955), "FY2004" : NumberLong(12858672), "FY2005" : 7261668, "FY2006" : NumberLong(11586488), "FY2007" : NumberLong(15297750), "FY2008" : NumberLong(13099261), "FY2009" : NumberLong(18540220) } { "_id" : ObjectId("4e9e84f36ab9cd886d926835"), "country_name" : "Afghanistan", "program_name" : "Narcotics Control", "FY1979" : 24000, "FY1992" : 332000, "FY1993" : 100000, "FY1994" : 100000, "FY1995" : 115000, "FY1996" : 300000, "FY1997" : 270000, "FY1998" : 504000, "FY1999" : 1046000, "FY2002" : NumberLong(63000000), "FY2004" : NumberLong(170000000), "FY2005" : 1408466, "FY2006" : 1408466, "FY2007" : NumberLong(120507593), "FY2008" : NumberLong(287039049), "FY2009" : NumberLong(575622108) } { "_id" : ObjectId("4e9e84f36ab9cd886d926836"), "country_name" : "Afghanistan", "program_name" : "Nonproliferation, Anti-Terrorism, Demining and Related", "FY1997" : 1000000, "FY1998" : 2200000, "FY1999" : 2615000, "FY2000" : 3000000, "FY2001" : 2800000, "FY2002" : NumberLong(43434000), "FY2003" : NumberLong(96818000), "FY2004" : NumberLong(60716326), "FY2005" : NumberLong(32968585), "FY2006" : NumberLong(22332988), "FY2007" : NumberLong(26903894), "FY2008" : NumberLong(28890549), "FY2009" : NumberLong(34786591) } { "_id" : ObjectId("4e9e84f36ab9cd886d926837"), "country_name" : "Afghanistan", "program_name" : "Other Active Grant Programs", "FY1999" : 1405286, "FY2000" : 2315941, "FY2001" : 2408168, "FY2002" : 7594745, "FY2003" : NumberLong(10079383), "FY2004" : 9696886, "FY2005" : NumberLong(10942562), "FY2006" : 9238647, "FY2007" : NumberLong(16900800), "FY2008" : 9540073, "FY2009" : NumberLong(10607669) } { "_id" : ObjectId("4e9e84f36ab9cd886d926838"), "country_name" : "Afghanistan", "program_name" : "Other State Assistance", "FY1990" : 139050, "FY1991" : 57068, "FY1992" : 61344, "FY1993" : 24000, "FY1994" : 24000, "FY1995" : 48000, "FY1996" : 48000, "FY1997" : 98000, "FY1998" : 48000, "FY1999" : 58000, "FY2001" : 85000, "FY2002" : 717001, "FY2003" : 1219382, "FY2004" : 546445, "FY2005" : 1458435, "FY2006" : 1447768, "FY2007" : 2279328, "FY2008" : 2028680, "FY2009" : 5669015 } { "_id" : ObjectId("4e9e84f36ab9cd886d926839"), "country_name" : "Afghanistan", "program_name" : "Other USAID Assistance", "FY1952" : 300000, "FY1953" : 2200000, "FY1954" : 2500000, "FY1955" : 2000000, "FY1956" : NumberLong(18300000), "FY1957" : NumberLong(14400000), "FY1958" : 8000000, "FY1959" : NumberLong(19400000), "FY1960" : NumberLong(10400000), "FY1961" : NumberLong(12800000), "FY1962" : 7500000, "FY1963" : NumberLong(17400000), "FY1964" : NumberLong(22200000), "FY1965" : NumberLong(10800000), "FY1966" : NumberLong(11000000), "FY1967" : NumberLong(25300000), "FY1968" : 8600000, "FY1969" : 8800000, "FY1970" : 6800000, "FY1971" : 8800000, "FY1972" : 9600000, "FY1973" : NumberLong(27100000), "FY1974" : NumberLong(12500000), "FY1975" : NumberLong(16200000), "FY1976" : 6300000, "FY1976tq" : 1500000, "FY1977" : NumberLong(20010000), "FY1978" : 4989000, "FY1979" : 3074000, "FY1985" : 3369000, "FY1993" : NumberLong(10000000), "FY1994" : 343000, "FY2001" : 9040859, "FY2002" : NumberLong(156845236), "FY2003" : NumberLong(130714501), "FY2004" : NumberLong(34307462), "FY2005" : NumberLong(39432331), "FY2006" : NumberLong(11102253), "FY2007" : NumberLong(21853934), "FY2008" : NumberLong(43962332), "FY2009" : NumberLong(82768828) } { "_id" : ObjectId("4e9e84f36ab9cd886d92683a"), "country_name" : "Afghanistan", "program_name" : "Other USDA Assistance", "FY2003" : 62191, "FY2004" : 361113, "FY2005" : 431968, "FY2008" : 120000 } { "_id" : ObjectId("4e9e84f36ab9cd886d92683b"), "country_name" : "Afghanistan", "program_name" : "Peace Corps", "FY1962" : 1000, "FY1963" : 300000, "FY1964" : 600000, "FY1965" : 1300000, "FY1966" : 1500000, "FY1967" : 1300000, "FY1968" : 1600000, "FY1969" : 1600000, "FY1970" : 900000, "FY1971" : 1100000, "FY1972" : 1300000, "FY1973" : 1400000, "FY1974" : 1500000, "FY1975" : 1200000, "FY1976" : 900000, "FY1976tq" : 200000, "FY1977" : 746000, "FY1978" : 789000, "FY1979" : 323000 } { "_id" : ObjectId("4e9e84f36ab9cd886d92683c"), "country_name" : "Afghanistan", "program_name" : "Section 416(b)/ Commodity Credit Corporation Food for Progress", "FY1999" : NumberLong(21021419), "FY2000" : NumberLong(23632010), "FY2001" : NumberLong(45007163), "FY2002" : 1528454, "FY2004" : 8484351, "FY2005" : 4708302, "FY2006" : NumberLong(17399050), "FY2008" : NumberLong(20546250) } { "_id" : ObjectId("4e9e84f36ab9cd886d92683d"), "country_name" : "Afghanistan", "program_name" : "Title I", "FY1965" : 600000, "FY1967" : 1900000, "FY1968" : 6200000, "FY1969" : 3500000, "FY1971" : 3000000, "FY1972" : 6100000, "FY1973" : 6500000, "FY1976tq" : 2500000, "FY1978" : 5100000, "FY2003" : 5329494, "FY2006" : 7888987, "FY2007" : NumberLong(13061009), "FY2009" : NumberLong(18884710) } { "_id" : ObjectId("4e9e84f36ab9cd886d92683e"), "country_name" : "Afghanistan", "program_name" : "Title II", "FY1956" : 1000, "FY1957" : 6600000, "FY1958" : 6200000, "FY1959" : 7600000, "FY1960" : 100000, "FY1961" : NumberLong(18700000), "FY1962" : 800000, "FY1963" : 100000, "FY1964" : NumberLong(19100000), "FY1965" : NumberLong(21300000), "FY1966" : NumberLong(20800000), "FY1967" : 4200000, "FY1969" : 1500000, "FY1970" : 800000, "FY1971" : 800000, "FY1972" : NumberLong(18700000), "FY1973" : 3600000, "FY1974" : 100000, "FY1975" : 3900000, "FY1976" : 1500000, "FY1976tq" : 300000, "FY1977" : 736000, "FY1978" : 642000, "FY1979" : 7195000, "FY1986" : 8897000, "FY1987" : 2581000, "FY1988" : NumberLong(28946000), "FY1989" : NumberLong(57382000), "FY1990" : NumberLong(15083000), "FY1991" : NumberLong(20083000), "FY1992" : NumberLong(31200000), "FY1993" : NumberLong(18000000), "FY1995" : NumberLong(12390400), "FY1996" : NumberLong(15200000), "FY1997" : NumberLong(28720600), "FY1999" : 6359700, "FY2000" : NumberLong(13977700), "FY2001" : NumberLong(30318600), "FY2002" : NumberLong(160101900), "FY2003" : NumberLong(48111300), "FY2004" : NumberLong(50349700), "FY2005" : NumberLong(57233800), "FY2006" : NumberLong(61496700), "FY2007" : NumberLong(59996000), "FY2008" : NumberLong(177133500), "FY2009" : NumberLong(75152900) } { "_id" : ObjectId("4e9e84f36ab9cd886d92683f"), "country_name" : "Albania", "program_name" : "Child Survival and Health", "FY2004" : 1000000, "FY2005" : 150000, "FY2006" : 825000, "FY2007" : 1712179, "FY2008" : 1350000 } { "_id" : ObjectId("4e9e84f36ab9cd886d926840"), "country_name" : "Albania", "program_name" : "Department of Defense Security Assistance", "FY1994" : 4576386, "FY1995" : 5639826, "FY1996" : 1120683, "FY1997" : 2167567, "FY2004" : NumberLong(15544975), "FY2005" : 7384862, "FY2006" : 9999114, "FY2007" : 4664191, "FY2008" : 477180, "FY2009" : 1139526 } has more > exit bye
var Db = require('mongodb').Db , Connection = require('mongodb').Connection , Server = require('mongodb').Server , host = 'localhost' , port = 27017 , mr_demo = 'mr_demo' , life_expectancy = 'life_expectancy' , us_economic_assistance = 'us_economic_assistance'; /** * node-mongodb-native version: "0.9.6-22" */ var server = new Server(host, port, {auto_reconnect: true}); /** * From V0.8.0 to V0.9.6.9, the Javascript bson parser was * slower than an optional C/C++ bson parser. As of V0.9.6.9+, * due to performance improvements in the Javascript parser, * the C/C++ perser is deprecated and is not installed by default anymore. * * If you are running a version of this library has the C/C++ parser compiled, * to enable the driver to use the C/C++ bson parser pass it the option * native_parser:true like below */ var db = new Db(mr_demo, server, {native_parser : false}); /** * { "_id" : ObjectId("4e9e85016ab9cd886d9271bb"), "id" : 1, "country" : "Monaco", "age" : 89.73 } */ var life_expect_map = function() { // Simply emit the age and 0 for the dollar amount. // The dollar amount will come from the other collection. emit(this.country, {life_expectancy: this.age, dollars: 0}); } /** * "_id" : ObjectId("4e9e84f36ab9cd886d926840"), "country_name" : "Albania", * "program_name" : "Department of Defense Security Assistance", "FY1994" : 4576386, * "FY1995" : 5639826, "FY1996" : 1120683, "FY1997" : 2167567, "FY2004" : NumberLong(15544975), * "FY2005" : 7384862, "FY2006" : 9999114, "FY2007" : 4664191, "FY2008" : 477180, * "FY2009" : 1139526 */ var us_econ_map = function() { // The data set contains grant amounts going back to 1946. I // am only interested in 2009 grants. if (this.FY2009 !== undefined && this.FY2009 !== null) { emit(this.country_name, { dollars: this.FY2009, life_expectancy: 0 }); } } var reduce = function(key, values) { var result = {dollars: 0, life_expectancy: 0}; values.forEach(function(value) { // Sum up all the money from all the 2009 grants for this // country (key) result.dollars += (value.dollars !== null) ? value.dollars : 0; // Only set life expectancy once if (result.life_expectancy === 0 && value.life_expectancy !== null) { result.life_expectancy = value.life_expectancy; } }); return result; } var life_expectancy_command = { mapreduce : life_expectancy , map : life_expect_map.toString() , reduce : reduce.toString() , out : {reduce : 'joined'} }; var us_economic_assistance_command = { mapreduce: us_economic_assistance , map : us_econ_map.toString() , reduce : reduce.toString() , out : {reduce : 'joined'} }; db.open(function(err, db){ db.executeDbCommand(life_expectancy_command, function(err, result){ db.executeDbCommand(us_economic_assistance_command, function(err, result){ console.log("Map/reduce test done!"); db.close(); }); }); });