mongodb是NoSQL领域里非常流行的一款非关系型数据库,提供了强大的分片存储与查询功能,用来做历史数据(日志)存储与查询比较适合,本身也提供了mapreduce功能。但是并不是任何时候Mongodb的使用者都会使用分片功能,更大的可能是使用副本集的方式(有时候机器并不多),而Hadoop提供了HDFS和分布式计算的功能,我们可以利用hadoop的MapReduce来取代Mongodb的MapReduce,用Mongodb的副本集来取代Hadoop的HDFS,那么就有了Hadoop与Mongodb之间的连接器(adapter)mongo-hadoop-master项目(目前在github上可以下载到)。
一 :下载地址:https://github.com/mongodb/mongo-hadoop
二: 下载之后解压:
- [root@bigdata2 software]# cd mongo-hadoop-master
- [root@bigdata2 mongo-hadoop-master]# ll
- total 140
- drwxr-xr-x 3 root root 4096 Oct 15 11:53 bin
- -rw-r--r-- 1 root root 5848 Oct 15 11:53 BSON_README.md
- drwxr-xr-x 4 root root 4096 Nov 30 13:06 build
- -rwxr-xr-x 1 root root 168 Oct 15 11:53 build-all.sh
- -rw-r--r-- 1 root root 12731 Oct 15 11:53 build.gradle
- drwxr-xr-x 2 root root 4096 Oct 15 11:53 clusterConfigs
- drwxr-xr-x 2 root root 4096 Oct 15 11:53 config
- -rw-r--r-- 1 root root 7458 Oct 15 11:53 CONFIG.md
- drwxr-xr-x 4 root root 4096 Nov 30 13:06 core
- drwxr-xr-x 6 root root 4096 Oct 15 11:53 docs
- drwxr-xr-x 7 root root 4096 Oct 15 11:53 examples
- drwxr-xr-x 3 root root 4096 Oct 15 11:53 flume
- drwxr-xr-x 3 root root 4096 Oct 15 11:53 gradle
- -rwxr-xr-x 1 root root 5080 Oct 15 11:53 gradlew
- -rw-r--r-- 1 root root 2314 Oct 15 11:53 gradlew.bat
- -rw-r--r-- 1 root root 1862 Oct 15 11:53 History.md
- drwxr-xr-x 3 root root 4096 Oct 15 11:53 hive
- drwxr-xr-x 3 root root 4096 Oct 15 11:53 integration-tests
- -rw-r--r-- 1 root root 6764 Oct 15 11:53 mongo-defaults.xml
- -rw------- 1 root root 4843 Nov 30 13:12 nohup.out
- drwxr-xr-x 3 root root 4096 Oct 15 11:53 pig
- -rw-r--r-- 1 root root 5106 Oct 15 11:53 README.md
- -rw-r--r-- 1 root root 137 Oct 15 11:53 settings.gradle
- drwxr-xr-x 5 root root 4096 Oct 15 11:53 streaming
- -rwxr-xr-x 1 root root 682 Oct 15 11:53 test.sh
- drwxr-xr-x 2 root root 4096 Oct 15 11:53 tools
- [root@bigdata2 mongo-hadoop-master]#
其中examples目录是自带的测试案例,我这里会采用mongo-hadoop-master/examples/treasury_yield 这个案例里面的src/main/resources/下面的json数据(yield_historical_in.json):
{ "_id" : { "$date" : 631324800000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 7.96, "bc5Year" : 7.92, "bc10Year" : 7.99, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.94, "bc3Month" : 7.89, "bc30Year" : 8.039999999999999, "bc1Year" : 7.85, "bc7Year" : 8.039999999999999, "bc6Month" : 7.94 }
{ "_id" : { "$date" : 631411200000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 7.93, "bc5Year" : 7.91, "bc10Year" : 7.98, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.92, "bc3Month" : 7.84, "bc30Year" : 8.039999999999999, "bc1Year" : 7.82, "bc7Year" : 8.02, "bc6Month" : 7.9 }
{ "_id" : { "$date" : 631497600000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 7.94, "bc5Year" : 7.92, "bc10Year" : 7.99, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.9, "bc3Month" : 7.79, "bc30Year" : 8.06, "bc1Year" : 7.79, "bc7Year" : 8.029999999999999, "bc6Month" : 7.85 }
{ "_id" : { "$date" : 631756800000 }, "dayOfWeek" : "MONDAY", "bc3Year" : 7.95, "bc5Year" : 7.92, "bc10Year" : 8.02, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.9, "bc3Month" : 7.79, "bc30Year" : 8.09, "bc1Year" : 7.81, "bc7Year" : 8.050000000000001, "bc6Month" : 7.88 }
{ "_id" : { "$date" : 631843200000 }, "dayOfWeek" : "TUESDAY", "bc3Year" : 7.94, "bc5Year" : 7.92, "bc10Year" : 8.02, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.91, "bc3Month" : 7.8, "bc30Year" : 8.1, "bc1Year" : 7.78, "bc7Year" : 8.050000000000001, "bc6Month" : 7.82 }
{ "_id" : { "$date" : 631929600000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 7.95, "bc5Year" : 7.92, "bc10Year" : 8.029999999999999, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.91, "bc3Month" : 7.75, "bc30Year" : 8.109999999999999, "bc1Year" : 7.77, "bc7Year" : 8, "bc6Month" : 7.78 }
{ "_id" : { "$date" : 632016000000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 7.95, "bc5Year" : 7.94, "bc10Year" : 8.039999999999999, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.91, "bc3Month" : 7.8, "bc30Year" : 8.109999999999999, "bc1Year" : 7.77, "bc7Year" : 8.01, "bc6Month" : 7.8 }
{ "_id" : { "$date" : 632102400000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 7.98, "bc5Year" : 7.99, "bc10Year" : 8.1, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 7.93, "bc3Month" : 7.74, "bc30Year" : 8.17, "bc1Year" : 7.76, "bc7Year" : 8.07, "bc6Month" : 7.81 }
{ "_id" : { "$date" : 632448000000 }, "dayOfWeek" : "TUESDAY", "bc3Year" : 8.130000000000001, "bc5Year" : 8.109999999999999, "bc10Year" : 8.199999999999999, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.1, "bc3Month" : 7.89, "bc30Year" : 8.25, "bc1Year" : 7.92, "bc7Year" : 8.18, "bc6Month" : 7.99 }
{ "_id" : { "$date" : 632534400000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 8.109999999999999, "bc5Year" : 8.109999999999999, "bc10Year" : 8.19, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.09, "bc3Month" : 7.97, "bc30Year" : 8.25, "bc1Year" : 7.91, "bc7Year" : 8.17, "bc6Month" : 7.97 }
{ "_id" : { "$date" : 632620800000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 8.279999999999999, "bc5Year" : 8.27, "bc10Year" : 8.32, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.25, "bc3Month" : 8.039999999999999, "bc30Year" : 8.35, "bc1Year" : 8.050000000000001, "bc7Year" : 8.31, "bc6Month" : 8.08 }
{ "_id" : { "$date" : 632707200000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 8.23, "bc5Year" : 8.199999999999999, "bc10Year" : 8.26, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.199999999999999, "bc3Month" : 8, "bc30Year" : 8.289999999999999, "bc1Year" : 8, "bc7Year" : 8.24, "bc6Month" : 8.01 }
{ "_id" : { "$date" : 632966400000 }, "dayOfWeek" : "MONDAY", "bc3Year" : 8.199999999999999, "bc5Year" : 8.19, "bc10Year" : 8.27, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.18, "bc3Month" : 7.99, "bc30Year" : 8.31, "bc1Year" : 7.98, "bc7Year" : 8.25, "bc6Month" : 7.99 }
{ "_id" : { "$date" : 633052800000 }, "dayOfWeek" : "TUESDAY", "bc3Year" : 8.199999999999999, "bc5Year" : 8.18, "bc10Year" : 8.26, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.18, "bc3Month" : 7.93, "bc30Year" : 8.289999999999999, "bc1Year" : 7.97, "bc7Year" : 8.23, "bc6Month" : 7.97 }
{ "_id" : { "$date" : 633139200000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 8.289999999999999, "bc5Year" : 8.279999999999999, "bc10Year" : 8.380000000000001, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.199999999999999, "bc3Month" : 7.93, "bc30Year" : 8.41, "bc1Year" : 8, "bc7Year" : 8.34, "bc6Month" : 7.99 }
{ "_id" : { "$date" : 633225600000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 8.32, "bc5Year" : 8.31, "bc10Year" : 8.42, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.24, "bc3Month" : 7.95, "bc30Year" : 8.460000000000001, "bc1Year" : 8.029999999999999, "bc7Year" : 8.390000000000001, "bc6Month" : 8.01 }
{ "_id" : { "$date" : 633312000000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 8.380000000000001, "bc5Year" : 8.380000000000001, "bc10Year" : 8.49, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.279999999999999, "bc3Month" : 7.93, "bc30Year" : 8.550000000000001, "bc1Year" : 8.07, "bc7Year" : 8.449999999999999, "bc6Month" : 8.039999999999999 }
{ "_id" : { "$date" : 633571200000 }, "dayOfWeek" : "MONDAY", "bc3Year" : 8.390000000000001, "bc5Year" : 8.390000000000001, "bc10Year" : 8.5, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.300000000000001, "bc3Month" : 8, "bc30Year" : 8.539999999999999, "bc1Year" : 8.08, "bc7Year" : 8.449999999999999, "bc6Month" : 8.09 }
{ "_id" : { "$date" : 633657600000 }, "dayOfWeek" : "TUESDAY", "bc3Year" : 8.390000000000001, "bc5Year" : 8.43, "bc10Year" : 8.51, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.300000000000001, "bc3Month" : 8, "bc30Year" : 8.550000000000001, "bc1Year" : 8.09, "bc7Year" : 8.470000000000001, "bc6Month" : 8.140000000000001 }
{ "_id" : { "$date" : 633744000000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 8.359999999999999, "bc5Year" : 8.35, "bc10Year" : 8.43, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.279999999999999, "bc3Month" : 8, "bc30Year" : 8.460000000000001, "bc1Year" : 8.08, "bc7Year" : 8.390000000000001, "bc6Month" : 8.130000000000001 }
{ "_id" : { "$date" : 633830400000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 8.35, "bc5Year" : 8.35, "bc10Year" : 8.42, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.279999999999999, "bc3Month" : 8.02, "bc30Year" : 8.44, "bc1Year" : 8.09, "bc7Year" : 8.380000000000001, "bc6Month" : 8.130000000000001 }
{ "_id" : { "$date" : 633916800000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 8.43, "bc5Year" : 8.42, "bc10Year" : 8.5, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.369999999999999, "bc3Month" : 8.07, "bc30Year" : 8.51, "bc1Year" : 8.130000000000001, "bc7Year" : 8.460000000000001, "bc6Month" : 8.17 }
{ "_id" : { "$date" : 634176000000 }, "dayOfWeek" : "MONDAY", "bc3Year" : 8.43, "bc5Year" : 8.44, "bc10Year" : 8.529999999999999, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.369999999999999, "bc3Month" : 8.08, "bc30Year" : 8.529999999999999, "bc1Year" : 8.15, "bc7Year" : 8.48, "bc6Month" : 8.18 }
{ "_id" : { "$date" : 634262400000 }, "dayOfWeek" : "TUESDAY", "bc3Year" : 8.43, "bc5Year" : 8.49, "bc10Year" : 8.57, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.42, "bc3Month" : 8.09, "bc30Year" : 8.58, "bc1Year" : 8.15, "bc7Year" : 8.52, "bc6Month" : 8.17 }
{ "_id" : { "$date" : 634348800000 }, "dayOfWeek" : "WEDNESDAY", "bc3Year" : 8.43, "bc5Year" : 8.51, "bc10Year" : 8.52, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.42, "bc3Month" : 8.08, "bc30Year" : 8.57, "bc1Year" : 8.17, "bc7Year" : 8.529999999999999, "bc6Month" : 8.19 }
{ "_id" : { "$date" : 634435200000 }, "dayOfWeek" : "THURSDAY", "bc3Year" : 8.390000000000001, "bc5Year" : 8.449999999999999, "bc10Year" : 8.49, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.369999999999999, "bc3Month" : 8.08, "bc30Year" : 8.5, "bc1Year" : 8.130000000000001, "bc7Year" : 8.48, "bc6Month" : 8.18 }
{ "_id" : { "$date" : 634521600000 }, "dayOfWeek" : "FRIDAY", "bc3Year" : 8.24, "bc5Year" : 8.289999999999999, "bc10Year" : 8.31, "bc20Year" : null, "bc1Month" : null, "bc2Year" : 8.25, "bc3Month" : 8.02, "bc30Year" : 8.359999999999999, "bc1Year" : 8.029999999999999, "bc7Year" : 8.34, "bc6Month" : 8.09 }
三: 我们查看它的README.md,可以看出需要先编译:
- ## Building
- The mongo-hadoop connector currently supports the following versions of hadoop: 0.23, 1.0, 1.1, 2.2, 2.3, 2.4,
- and CDH 4 and 5. The default build version will build against the last Apache Hadoop (currently 2.4). If you would like to build
- against a specific version of Hadoop you simply need to pass `-PclusterVersion=<clusterVersion>` to gradlew when building.
- Run `./gradlew jar` to build the jars. The jars will be placed in to `build/libs` for each module. e.g. for the core module,
- it will be generated in the `core/build/libs` directory.
- After successfully building, you must copy the jars to the lib directory on each node in your hadoop cluster. This is usually one of the
- following locations, depending on which Hadoop release you are using:
- * `$HADOOP_HOME/lib/`
- * `$HADOOP_HOME/share/hadoop/mapreduce/`
- * `$HADOOP_HOME/share/hadoop/lib/`
- ## Supported Distributions of Hadoop
- | Hadoop Version | Build Parameter |
- | :----------------------------------: | :---------------------: |
- | Apache Hadoop 0.23 | -PclusterVersion='0.23' |
- | Apache Hadoop 1.0 | -PclusterVersion='1.0' |
- | Apache Hadoop 1.1 | -PclusterVersion='1.1' |
- | Apache Hadoop 2.2 | -PclusterVersion='2.2' |
- | Apache Hadoop 2.3 | -PclusterVersion='2.3' |
- | Apache Hadoop 2.4 | -PclusterVersion='2.4' |
- --More--(49%)
我们按照下面指令编译:
- ./gradlew jar
编译过程比较缓慢,其中要下载的一个较大的依赖是amazon的s3,有250多M。完成以后,会在core/build/libs目录下生成Jar包 mongo-hadoop-core-1.4.0-SNAPSHOT.jar(最大的战斗成果。。),我们带上Java连接MongoDB的驱动(mongo-java-driver),一起拷贝到$HADOOP_HOME/lib里面,当然也可以采用运行时加载的方法:
- DistributedCache.addFileToClassPath(new Path("/root/software/mongo-java-driver-2.11.1.jar"), conf);
- DistributedCache.addFileToClassPath(new Path("/root/software/mongo-hadoop-core-1.4.0-SNAPSHOT.jar"), conf);
有了编译好的驱动,我们就可以用它来连接Mongodb了。
四:首先我们准备数据,把刚才的数据导入到mongodb
- mongoimport --host 127.0.0.1 --port 27017 -d testmr -c example --file ./yield_historical_in.json
查看数据:
example
mongotest
system.indexes
> db.example.find().limit(2);
{ "_id" : ISODate("1990-01-02T00:00:00Z"), "dayOfWeek" : "TUESDAY", "bc3Year" :
7.9, "bc5Year" : 7.87, "bc10Year" : 7.94, "bc20Year" : null, "bc1Month" : null,
"bc2Year" : 7.87, "bc3Month" : 7.83, "bc30Year" : 8, "bc1Year" : 7.81, "bc7Year"
: 7.98, "bc6Month" : 7.89 }
{ "_id" : ISODate("1990-01-03T00:00:00Z"), "dayOfWeek" : "WEDNESDAY", "bc3Year"
: 7.96, "bc5Year" : 7.92, "bc10Year" : 7.99, "bc20Year" : null, "bc1Month" : nul
l, "bc2Year" : 7.94, "bc3Month" : 7.89, "bc30Year" : 8.04, "bc1Year" : 7.85, "bc
7Year" : 8.04, "bc6Month" : 7.94 }
>
五:新建一个MapReduce工程
- import java.io.IOException;
- import java.util.Date;
- import org.apache.hadoop.io.DoubleWritable;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.bson.BSONObject;
- public class MongoTestMapper extends Mapper
- @Override
- public void map(final Object pkey, final BSONObject pvalue,final Context context)
- {
- final int year = ((Date)pvalue.get("_id")).getYear()+1990;
- double bdyear = ((Number)pvalue.get("bc10Year")).doubleValue();
- try {
- context.write( new IntWritable( year ), new DoubleWritable( bdyear ));
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (InterruptedException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- }
- /**
-  * Reducer for the treasury-yield example.
-  *
-  * <p>Averages all {@link DoubleWritable} values received for a key and writes a BSON
-  * document of the form {@code {"avg": <mean>}} under that same key.
-  */
- public class MongoTestReducer extends Reducer<IntWritable, DoubleWritable, IntWritable, BSONWritable> {
-     /**
-      * Computes the arithmetic mean of {@code pValues} and writes it out.
-      *
-      * @param pKey     the key shared by all values; reused unchanged as the output key
-      * @param pValues  the values to average
-      * @param pContext used to write the output pair
-      * @throws IOException          if writing the output pair fails
-      * @throws InterruptedException if the task is interrupted while writing
-      */
-     @Override
-     public void reduce(final IntWritable pKey, final Iterable<DoubleWritable> pValues, final Context pContext)
-             throws IOException, InterruptedException {
-         int count = 0;
-         double sum = 0.0;
-         for (final DoubleWritable value : pValues) {
-             sum += value.get();
-             count++;
-         }
-         // Floating-point division: an empty pValues would yield NaN rather than throw.
-         final double avg = sum / count;
-         final BasicBSONObject out = new BasicBSONObject();
-         out.put("avg", avg);
-         pContext.write(pKey, new BSONWritable(out));
-     }
- }
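这是一个计算平均值的例子的部分代码,之后在Hadoop环境上运行,可以看到输出到Mongodb的结果(注意:下面结果中的_id比真实年份大90,例如2080实际对应1990年。这是因为产生这些结果时Mapper里年份是按getYear()+1990计算的,而Date.getYear()返回的是"年份减1900",正确的偏移量应为1900):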
{ "_id" : 2080, "avg" : 8.552400000000002 }
{ "_id" : 2081, "avg" : 7.8623600000000025 }
{ "_id" : 2082, "avg" : 7.008844621513946 }
{ "_id" : 2083, "avg" : 5.866279999999999 }
{ "_id" : 2084, "avg" : 7.085180722891565 }
{ "_id" : 2085, "avg" : 6.573920000000002 }
{ "_id" : 2086, "avg" : 6.443531746031742 }
{ "_id" : 2087, "avg" : 6.353959999999992 }
{ "_id" : 2088, "avg" : 5.262879999999994 }
{ "_id" : 2089, "avg" : 5.646135458167332 }
{ "_id" : 2090, "avg" : 6.030278884462145 }
{ "_id" : 2091, "avg" : 5.02068548387097 }
{ "_id" : 2092, "avg" : 4.61308 }
{ "_id" : 2093, "avg" : 4.013879999999999 }
{ "_id" : 2094, "avg" : 4.271320000000004 }
{ "_id" : 2095, "avg" : 4.288880000000001 }
{ "_id" : 2096, "avg" : 4.7949999999999955 }
{ "_id" : 2097, "avg" : 4.634661354581674 }
{ "_id" : 2098, "avg" : 3.6642629482071714 }
{ "_id" : 2099, "avg" : 3.2641200000000037 }
Type "it" for more