hadoop-mongo map/reduce java

官方 http://docs.mongodb.org/ecosystem/tutorial/getting-started-with-hadoop/

mongo-hadoop项目地址 https://github.com/mongodb/mongo-hadoop

该代码托管 https://github.com/cclient/mongo_hadoop_map-reduce

 

原分析 由nodejs+async编写

 

用游标迭代查询mongo数据库,分析数据

 

因数据量较大,目前执行分析任务耗时4个小时,这只是极限数据量的1%

为优化,采用hadoop-mongo 方案

优点:mongo只能单机单线程(不作shard的情况),hadoop-mongo可以集群处理。

缺点:查了些资料,发现不能设计mongodb 的输入条件,猜想是因为该中间件是直接读数据文件,并根据数据文件分割数据,加查询条件则不容易分割,只能作全表分析。

    若要设定条件,有以下两种思路。

   1可以先以query,dump下来mongo表(collection)A中的数据,再restore到单独的表B中,对表B执行hadoop任务。

   2可以在mapper任务中,验证条件,符合条件才context.write(Writable,Writable),不符合的跳过。

   3按1的思路拓展mongo-hadoop中间件,在map之前,添加任务,先分析出一张临时表,在reducer之后,删除该表(……时间充分再提代码)。

 

完成代码

 

近期一直写的脚本语言,再回头写点JAVA,好悲催,感觉很受限制。

JAVA和.net都是静态语言,但.net在编译层面实现了很多动态语言的特性,java太简陋了。

没有.net的访问器.net可以[index]访问,java 只能getIndex()

 

初步代码 很粗糙

MAIN 入口

 1 package group.artifactid;

 2 

 3 //cc MaxTemperature Application to find the maximum temperature in the weather dataset

 4 //vv MaxTemperature

 5 import org.apache.hadoop.conf.Configuration;

 6 import org.apache.hadoop.fs.Path;

 7 import org.apache.hadoop.io.MapWritable;

 8 import org.apache.hadoop.io.Text;

 9 import org.apache.hadoop.mapreduce.Job;

10 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

11 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

12 

13 import com.mongodb.hadoop.MongoConfig;

14 import com.mongodb.hadoop.io.BSONWritable;

15 import com.mongodb.hadoop.util.MongoTool;

16 

17 import com.mongodb.hadoop.MongoConfig;

18 import com.mongodb.hadoop.MongoInputFormat;

19 import com.mongodb.hadoop.MongoOutputFormat;

20 import com.mongodb.hadoop.util.MapredMongoConfigUtil;

21 import com.mongodb.hadoop.util.MongoConfigUtil;

22 import com.mongodb.hadoop.util.MongoTool;

23 import org.apache.hadoop.conf.Configuration;

24 import org.apache.hadoop.io.IntWritable;

25 import org.apache.hadoop.util.ToolRunner;

26 

27 public class MongoMaxTemperature extends MongoTool {

28     public MongoMaxTemperature() {

29         Configuration conf = new Configuration();

30         MongoConfig config = new MongoConfig(conf);

31         setConf(conf);

32         MongoConfigUtil.setInputFormat(getConf(), MongoInputFormat.class);

33         MongoConfigUtil.setOutputFormat(getConf(), MongoOutputFormat.class);

34         config.setInputURI("mongodb://localhost:27017/db1.collection1");

35         config.setMapper(MongoMaxTemperatureMapper.class);

36         // Combiner

37         config.setCombiner(MongoMaxTemperatureCombine.class);

38         // config.setReducer(MongoMaxTemperatureReducer.class);

39         config.setReducer(MongoMaxTemperatureReducerCombine.class);

40         config.setMapperOutputKey(Text.class);

41         config.setMapperOutputValue(Text.class);

42         config.setOutputKey(Text.class);

43         config.setOutputValue(BSONWritable.class);

44         config.setOutputURI("mongodb://localhost:27017/db2.collection2");

45     }

46 

47     public static void main(String[] args) throws Exception {

48         System.exit(ToolRunner.run(new MongoMaxTemperature(), args));

49     }

50 }

 

MAPPER代码

package group.artifactid;



import java.io.IOException;



import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

import org.bson.BSONObject;



import com.mongodb.hadoop.io.BSONWritable;



public class MongoMaxTemperatureMapper extends
        Mapper<Object, BSONObject, Text, Text> {

    /**
     * Emits one (apMac, clientMac + host) pair per HTTP record.
     *
     * The value is the client MAC (17 chars) immediately followed by the
     * URL's host part; downstream reducers split it again at index 17.
     * Records whose proto is not "http" or whose url is empty are skipped.
     *
     * Fix: Mongo documents may lack any of these fields; the original code
     * dereferenced proto/url without null checks and threw a
     * NullPointerException on such documents.
     */
    @Override
    public void map(final Object key, BSONObject val, Context context)
            throws IOException, InterruptedException {
        String apmac = (String) val.get("apMac");
        String clientmac = (String) val.get("clientMac");
        String url = (String) val.get("url");
        String proto = (String) val.get("proto");
        // Skip incomplete documents instead of crashing the task.
        if (apmac == null || clientmac == null || url == null || proto == null) {
            return;
        }
        if (proto.equals("http") && !url.isEmpty()) {
            // Strip the scheme prefix so only host[:port][/path] remains.
            if (url.startsWith("http://")) {
                url = url.substring(7);
            }
            // Drop the path part; keep only the host.
            int firstSlash = url.indexOf('/');
            if (firstSlash > -1) {
                url = url.substring(0, firstSlash);
            }
            // '.' is illegal in BSON key names; temporarily encode it as '}'.
            url = url.replace('.', '}');
            context.write(new Text(apmac), new Text(clientmac + url));
        }
    }
}

 

COMBINE代码

package group.artifactid;



import java.io.IOException;

import java.util.ArrayList;

import java.util.Collections;

import java.util.Comparator;

import java.util.List;

import java.util.Map;

import com.mongodb.hadoop.io.BSONWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Reducer;

import org.bson.BasicBSONObject;



public class MongoMaxTemperatureReducerCombine extends
        Reducer<Text, Text, Text, BSONWritable> {

    /** Simple (url, count) pair used while ranking URLs. */
    public class UrlCount {
        public UrlCount(String url, int count) {
            this.Url = url;
            this.Count = count;
        }
        String Url;
        int Count;
    }

    /**
     * Returns at most {@code topnum} entries of {@code topobj} as a list
     * sorted by descending count (busiest URLs first).
     */
    public List<UrlCount> compresstopobj(BasicBSONObject topobj, int topnum) {
        List<UrlCount> ranked = new ArrayList<UrlCount>();
        for (Map.Entry<String, Object> entry : topobj.entrySet()) {
            ranked.add(new UrlCount(entry.getKey(),
                    Integer.parseInt(entry.getValue().toString())));
        }
        // Descending by count; Integer.compare avoids the overflow-prone
        // subtraction idiom and keeps the Comparator contract.
        Collections.sort(ranked, new Comparator<UrlCount>() {
            @Override
            public int compare(UrlCount o1, UrlCount o2) {
                return Integer.compare(o2.Count, o1.Count);
            }
        });
        if (ranked.size() > topnum) {
            ranked = ranked.subList(0, topnum);
        }
        return ranked;
    }

    /** Rebuilds a BSON map holding only the top-100 URLs of {@code urlmap}. */
    private BasicBSONObject compressToMap(BasicBSONObject urlmap) {
        List<UrlCount> top = compresstopobj(urlmap, 100);
        BasicBSONObject compressed = new BasicBSONObject();
        for (int i = 0; i < top.size(); i++) {
            UrlCount cuc = top.get(i);
            compressed.put(cuc.Url, cuc.Count);
        }
        return compressed;
    }

    /**
     * Aggregates, per client MAC, how often each URL was seen, keeping only
     * the top 100 URLs per client, and writes one BSON document per AP MAC.
     *
     * Each value is: clientMac (17 chars) + url, optionally followed by
     * "|count" when a combiner already pre-aggregated the pair. The '|' is
     * kept as part of the url key, matching the original output format.
     */
    @Override
    public void reduce(Text apmac, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        BasicBSONObject clientmacmap = new BasicBSONObject();
        int count = 0;
        for (Text value : values) {
            String subline = value.toString();
            String clientmac = subline.substring(0, 17);
            int sepindex = subline.indexOf("|");
            int maplastcount = 1;
            String url;
            if (sepindex > -1) {
                sepindex++;
                url = subline.substring(17, sepindex);
                maplastcount = Integer.parseInt(subline.substring(sepindex));
            } else {
                url = subline.substring(17);
            }
            BasicBSONObject urlmap = (BasicBSONObject) clientmacmap
                    .get(clientmac);
            if (urlmap == null) {
                urlmap = new BasicBSONObject();
                clientmacmap.put(clientmac, urlmap);
            }
            Object eveurl = urlmap.get(url);
            // Fix: the original tested (eveurl == null && !url.equals(" "))
            // and then dereferenced eveurl in the else branch, throwing a
            // NullPointerException whenever eveurl was null and url was " ".
            if (eveurl == null) {
                urlmap.put(url, maplastcount);
            } else {
                urlmap.put(url, Integer.parseInt(eveurl.toString())
                        + maplastcount);
            }
            count++;
            if (count == 10000) {
                // Cap memory by keeping only the current top 100 URLs.
                // Fix: the original assigned the compressed map to the local
                // variable only and never stored it back into clientmacmap,
                // so the compression had no effect; it also never reset the
                // counter, so this branch could run at most once.
                clientmacmap.put(clientmac, compressToMap(urlmap));
                count = 0;
            }
        }
        // Final pass: shrink every client's map to its top 100 URLs.
        for (Map.Entry<String, Object> entry : clientmacmap.entrySet()) {
            BasicBSONObject urlmap = (BasicBSONObject) entry.getValue();
            // Fix: store the compressed map back via the entry (the original
            // rebound a local variable and discarded the result).
            entry.setValue(compressToMap(urlmap));
        }
        context.write(apmac, new BSONWritable(clientmacmap));
    }
}

REDUCER代码

package group.artifactid;



import java.io.DataOutputStream;

import java.io.IOException;

import java.util.ArrayList;

import java.util.Collections;

import java.util.Comparator;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.TreeSet;



import com.mongodb.hadoop.io.BSONWritable;



import org.apache.commons.io.output.ByteArrayOutputStream;

import org.apache.hadoop.io.ArrayWritable;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.MapWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.Writable;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.util.StringUtils;

import org.apache.zookeeper.server.util.SerializeUtils;

import org.bson.BasicBSONObject;



public class MongoMaxTemperatureReducer extends
        Reducer<Text, Text, Text, BSONWritable> {

    /** Simple (url, count) pair used while ranking URLs. */
    public class UrlCount {
        public UrlCount(String url, int count) {
            this.Url = url;
            this.Count = count;
        }
        String Url;
        int Count;
    }

    /** Orders UrlCount entries by descending count (busiest first). */
    class SortByCount implements Comparator<UrlCount> {
        @Override
        public int compare(UrlCount s1, UrlCount s2) {
            // Fix: the original raw comparator returned only 1 or 0, which
            // violates the Comparator contract (not antisymmetric) and
            // effectively sorted ascending — so the subsequent top-N cut
            // kept the LEAST-visited URLs instead of the most-visited.
            return Integer.compare(s2.Count, s1.Count);
        }
    }

    /**
     * Returns at most {@code topnum} entries of {@code topobj} as a list
     * sorted by descending count. (Removed the per-entry System.out debug
     * print from the original.)
     */
    public List<UrlCount> compresstopobj(BasicBSONObject topobj, int topnum) {
        List<UrlCount> ranked = new ArrayList<UrlCount>();
        for (Map.Entry<String, Object> entry : topobj.entrySet()) {
            ranked.add(new UrlCount(entry.getKey(),
                    Integer.parseInt(entry.getValue().toString())));
        }
        Collections.sort(ranked, new SortByCount());
        if (ranked.size() > topnum) {
            ranked = ranked.subList(0, topnum);
        }
        return ranked;
    }

    /**
     * Aggregates, per client MAC, how many times each URL was seen and
     * writes one BSON document per AP MAC.
     *
     * NOTE(review): each value is treated as raw mapper output
     * (clientMac + url, one record per hit, count 1); the "|count" suffix
     * appended by the combiner is NOT parsed here — use this reducer only
     * when no combiner is configured. TODO confirm against the job driver.
     */
    @Override
    public void reduce(Text apmac, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        BasicBSONObject clientmacmap = new BasicBSONObject();
        int count = 0;
        for (Text value : values) {
            String subline = value.toString();
            // First 17 chars = client MAC, remainder = url key.
            String clientmac = subline.substring(0, 17);
            String url = subline.substring(17);
            BasicBSONObject urlmap = (BasicBSONObject) clientmacmap
                    .get(clientmac);
            if (urlmap == null) {
                urlmap = new BasicBSONObject();
                clientmacmap.put(clientmac, urlmap);
            }
            Object eveurl = urlmap.get(url);
            // Fix: the original tested (eveurl == null && !url.equals(" "))
            // and then dereferenced eveurl in the else branch, throwing a
            // NullPointerException whenever eveurl was null and url was " ".
            if (eveurl == null) {
                urlmap.put(url, 1);
            } else {
                urlmap.put(url, Integer.parseInt(eveurl.toString()) + 1);
            }
            count++;
            if (count == 1000) {
                // Cap memory by keeping only the current top 100 URLs.
                // Fix: the original never stored the compressed map back into
                // clientmacmap nor reset the counter, so this compression
                // silently did nothing.
                List<UrlCount> arr = compresstopobj(urlmap, 100);
                BasicBSONObject newurlcmap = new BasicBSONObject();
                for (int i = 0; i < arr.size(); i++) {
                    UrlCount cuc = arr.get(i);
                    newurlcmap.put(cuc.Url, cuc.Count);
                }
                clientmacmap.put(clientmac, newurlcmap);
                count = 0;
            }
        }
        context.write(apmac, new BSONWritable(clientmacmap));
    }
}

 

Mongo collection 数据格式

{

    "_id" : ObjectId("54d83f3548c9bc218e056ce6"),"apMac" : "aa:bb:cc:dd:ee:ff","proto" : "http",

    "url" : "extshort.weixin.qq.comhttp",

    "clientMac" : "ff:ee:dd:cc:bb:aa"

}

 

clientMac和url 先拼在一起,再按mac长度分割

数据流程 

orgin->map

map:[{"aa:bb:cc:dd:ee:ff":[ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp]}]

 

假如是多条数据则 

map:[{"aa:bb:cc:dd:ee:ff":["ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp","ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp1","ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp2"]}]

map->combine

如果有相同的client+url 则统计个数,以|分隔

combine:[{"aa:bb:cc:dd:ee:ff":[ff:ee:dd:cc:bb:aaextshort.weixin.qq.comhttp|100]}]

combine->reducer

reducer中 按mac长度分割出 clientMac url 再按“|”分割出 个数

统计前每个clientMac的前100条

reduce:

{

    "_id": "00:21:26:00:0A:FF",

    "aa:bb:cc:1c:b9:8f": {

        "c}tieba}baidu}com|": 1,

        "short}weixin}qq}comhttp:|": 1,

        "get}sogou}com|": 1,

        "md}openapi}360}cn|": 1,

        "74}125}235}224|": 1,

        "mmbiz}qpic}cn|": 1,

        "tb}himg}baidu}com|": 1

    },

    "cc:bb:aa:d5:30:8a": {

        "captive}apple}com|": 2,

        "www}airport}us|": 1,

        "www}itools}info|": 2,

        "www}thinkdifferent}us|": 1,

        "www}ibook}info|": 1

    },

    "ee:ee:bb:78:31:74": {

        "www}itools}info|": 1,

        "www}ibook}info|": 1

    }

    

}

 

你可能感兴趣的:(hadoop)