统计网站的点击次数,并且输出前五名的网站以及对应的次数:
数据格式:36.63.116.201|sdk.conf.igexin.com|20170207161935|61.147.218.24;222.186.20.109;222.186.20.123|0
数据介绍:
示例数据集dns_log.txt是某公司网站访问的日志,其中第1个字段为访问的源IP地址,第2个字段为访问的网站地址,字段之间的分隔符为竖线“|”。
思路与解析:这道题如果用MapReduce来做,可以参照WordCount的思路:在map阶段以网站地址为key、以常量1为value直接输出;在reduce阶段按key分组,把同一个网站的次数累加起来;最后用一个TreeMap对累加后的结果排序。为了方便排序,TreeMap以次数为key、以网站为value(次数相同的网站会合并到同一条记录中),并按次数降序排列,取前五条输出。
假设分析:map阶段处理好的结果:
sdk.conf.igexin.com 1
sdk.conf.igexin.com 1
www.163.com 1
www.163.com 1
www.163.com 1
假设分析:reduce阶段处理好的结果:
开始叠加:
sdk.conf.igexin.com 2
www.163.com 3
假设分析:TreeMap阶段处理好的结果:
www.163.com 3
sdk.conf.igexin.com 2
import java.io.IOException;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.util.Comparator;
/**
 * MapReduce job that counts how often each site appears in a DNS access log and
 * writes the five highest hit counts together with the sites that reached them.
 *
 * <p>Input lines are '|'-separated and the second field is the site, e.g.
 * {@code 36.63.116.201|sdk.conf.igexin.com|20170207161935|61.147.218.24|0}.
 * Sites sharing the same hit count are written as one record, joined with "--".
 */
public class paixu_top5 {

    /**
     * Configures the job, runs it, and exits with status 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException if the job cannot be set up or submitted
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws InterruptedException if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.err.println("Usage: paixu_top5 <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, paixu_top5.class.getSimpleName());
        job.setJarByClass(paixu_top5.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        job.setMapperClass(Top5_Mapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(Top5_Reducer.class);
        // The ranking is held in the reducer's memory, so a global top five
        // requires that every site is seen by the same (single) reducer.
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate job failure to the caller instead of always exiting with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Emits {@code (site, 1)} for every log line that has a non-empty second field.
     */
    public static class Top5_Mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Constant count written for every occurrence of a site. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Reused output key, so no new object is allocated per record. */
        private final Text site = new Text();

        /**
         * Splits the line on '|' and writes the second field with a count of one.
         * Lines without a second field, or with an empty one, are skipped.
         *
         * @param key     input key supplied by the framework (not used)
         * @param value   one line of the log
         * @param context sink for the {@code (site, 1)} pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // e.g. 36.63.116.201|sdk.conf.igexin.com|20170207161935|61.147.218.24;222.186.20.109|0
            String[] fields = value.toString().split("\\|");
            if (fields.length < 2 || fields[1].isEmpty()) {
                // Blank or malformed line: there is no site to count.
                return;
            }
            site.set(fields[1]);
            context.write(site, ONE);
        }
    }

    /**
     * Sums the hits per site, keeps the highest distinct hit counts in memory,
     * and writes them in descending order once all keys have been reduced.
     */
    public static class Top5_Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /** Number of distinct hit counts written to the output. */
        private static final int TOP_N = 5;

        /** Separator between sites that share the same hit count. */
        private static final String SITE_SEPARATOR = "--";

        // Hit count -> site(s) with that count. TreeMap sorts keys ascending by
        // default, so a reversing comparator is supplied to get descending counts.
        private final TreeMap<Integer, String> treeMap =
                new TreeMap<Integer, String>(new Comparator<Integer>() {
                    @Override
                    public int compare(Integer x, Integer y) {
                        return y.compareTo(x);
                    }
                });

        /**
         * Adds up the counts of one site and records the total in the ranking
         * instead of writing it to the context.
         *
         * @param key     the site
         * @param values  the per-occurrence counts emitted for the site
         * @param context unused here; output is written in {@link #cleanup}
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }

            // Sites with the same total share one entry, joined with "--".
            String sitesWithSameCount = treeMap.get(sum);
            if (sitesWithSameCount == null) {
                treeMap.put(sum, key.toString());
            } else {
                treeMap.put(sum, sitesWithSameCount + SITE_SEPARATOR + key.toString());
            }

            // Keep only the TOP_N largest counts. With the reversed ordering the
            // last entry is the smallest count; once TOP_N larger counts exist it
            // can never be part of the output, so dropping it is safe.
            if (treeMap.size() > TOP_N) {
                treeMap.pollLastEntry();
            }
        }

        /**
         * Writes the retained entries as {@code (sites, count)}, highest count first.
         *
         * @param context sink for the final records
         */
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            int written = 0;
            for (Integer count : treeMap.keySet()) {
                if (written == TOP_N) {
                    break;
                }
                context.write(new Text(treeMap.get(count)), new IntWritable(count));
                written++;
            }
        }
    }
}
结果截图:
部分数据:
36.63.116.201|sdk.conf.igexin.com|20170207161935|61.147.218.24;222.186.20.109;222.186.20.123|0
36.63.123.215|cm052.getui.igexin.com|20170207161935|183.131.1.82|0
36.63.132.38|mmbiz.qpic.cn|20170207161935|122.228.72.152;115.231.191.141;122.228.72.165;122.228.72.151;122.228.72.147;115.231.191.143;122.228.72.163;122.228.72.159;115.231.191.144;122.228.56.157;122.228.72.166;122.228.56.155;122.228.72.164;122.228.56.156;115.231.191.142;122.228.72.148|0
117.70.249.121|punch.p2p.qq.com|20170207161935|14.17.43.40|0
114.102.113.19|omgmta.play.t002.ottcn.com|20170207161935|123.151.179.173|0
36.63.40.131|pop.sjk.ijinshan.com|20170207161935|60.169.76.70;61.132.239.147;61.132.239.146|0
36.5.84.35|bird.sns.iqiyi.com|20170207161935|106.38.219.54;106.38.219.34|0
36.4.13.244|tx2.a.yximgs.com|20170207161935|61.191.60.17;61.191.60.16;61.191.60.19;61.191.60.18|0
36.4.151.103|r.vip.qq.com|20170207161935|14.215.138.24|0
223.244.111.107|supportcmsecurity1.ksmobile.com|20170207161935|221.228.204.21;119.147.146.70|0
36.63.218.133|www-cdn.icloud.com.akadns.net|20170207161935|104.70.216.102|0
60.174.104.22|fs_conn_other_doctor.qq.com|20170207161935|59.37.96.205|0
183.162.152.168|kwmsg.kuwo.cn|20170207161935|60.28.205.55;60.28.204.138;60.28.220.109|0
36.63.121.216|inews.gtimg.com|20170207161935|122.228.56.157;115.231.191.146;122.228.72.148;122.228.72.147;122.228.56.155;122.228.72.151;115.231.191.141;122.228.72.152;115.231.191.143;122.228.72.166;122.228.72.159;115.231.191.144;122.228.72.164;115.231.191.142;122.228.72.165;122.228.72.163|0
223.247.34.248|www.qchannel01.cn|20170207161935|114.112.103.16;114.112.103.15|0
36.4.18.149|ctr.datacld.com|20170207161935||0
36.4.137.183|www.163.com|20170207161935|61.132.238.103;60.174.243.159|0
36.4.83.33|clock.redhat.com|20170207161935||0
36.4.44.143|coral.qq.com|20170207161935|101.226.49.10|0
117.57.215.147|s0z.pstatp.com|20170207161935|114.80.174.21;114.80.174.20|0
36.57.51.241|commdata.v.qq.com|20170207161935|183.61.38.150;183.61.38.145;101.226.225.148;183.61.38.160;123.151.78.140;123.151.78.142|0
36.62.45.83|hxbesp.tmall.com|20170207161935|106.11.95.1|0
60.169.40.52|mdws.openapi.360.cn|20170207161935|106.120.160.177;218.30.118.213|0
114.106.81.246|gss0.baidu.com|20170207161935||0
36.63.160.18|xd-q.mediav.com|20170207161935|180.163.255.159|0
183.165.211.195|secure.wiair.com|20170207161935||0
220.179.251.130|c.xdwscache.ourglb0.com|20170207161935||0
117.66.65.171|p9.pstatp.com|20170207161935|122.228.9.42;122.228.9.41;218.92.225.215;122.228.9.40;61.190.149.199;122.228.9.39;61.190.149.196;122.228.9.38;122.228.9.37;218.92.225.207|0
36.63.194.107|hub5p.sandai.net|20170207161935|121.9.209.191;121.9.209.131;121.9.209.132|0
36.62.96.74|www.taobao.com|20170207161935|124.112.127.48|0
60.175.244.88|iseiya.taobao.com|20170207161935|140.205.230.49|0
36.63.171.1|fastly.tampermonkey.net|20170207161935|151.101.72.204|0
183.161.29.82|www.demaxiya.com|20170207161935|58.211.137.89|0
36.4.149.128|monitor.uu.qq.com|20170207161935|113.108.67.15;183.60.9.140;183.60.9.139;113.108.80.220;183.60.9.146;113.108.67.42;113.108.67.46;113.108.67.45|0
36.57.173.151|imgcache.qq.com|20170207161935|221.233.41.20;115.231.191.154;221.233.41.27;122.228.72.162;116.211.185.144;221.233.41.28;60.167.138.18;122.228.56.145;61.155.220.194;116.211.185.143;116.211.185.141;116.211.185.142;122.228.56.147;122.228.72.160;122.228.56.154;122.228.72.161|0
36.62.11.254|vod76.t18.lixian.vip.xunlei.com|20170207161935||0
36.57.159.245|clock.redhat.com|20170207161935||0
117.70.140.150|www.icloud.com|20170207161935|104.70.216.102|0
117.68.79.219|alog.umeng.com|20170207161935|110.173.196.36|0
36.4.40.31|pubserver5.bizport.cn|20170207161935|120.55.204.207|0
183.166.187.130|m.qpic.cn|20170207161935|117.41.243.15;117.41.243.16;117.41.243.18;117.41.243.12;117.41.243.11;117.41.243.14;117.41.243.13;117.41.243.17|0
223.246.186.35|zichan.wacai.com|20170207161935|183.129.207.211|0
114.101.98.231|www.yahoo.com|20170207161935|116.214.12.74|0
183.161.180.226|monitor.uu.qq.com|20170207161935|183.60.9.140;183.60.9.139;113.108.80.220;183.60.9.146;113.108.67.42;113.108.67.46;113.108.67.45;113.108.67.15|0
36.63.201.56|auth4.voole.com|20170207161935|119.57.155.108|0
36.63.47.52|mongotv.download.p2phash.yfp2p.net|20170207161935|122.226.188.8;122.226.188.61;122.226.188.58;122.226.188.41|0
183.165.248.34|oth.eve.mdt.qq.com|20170207161935|61.151.225.40;101.226.76.166|0
36.63.34.52|ubmcmm.baidustatic.com|20170207161935|180.97.66.45|0
36.4.148.149|ac.o2.qq.com|20170207161935|14.215.138.58|0
36.4.148.149|game.qq.com|20170207161935|221.233.41.27;221.233.41.28;122.228.56.147;122.228.72.161;61.155.220.194;116.211.185.142;116.211.185.144;116.211.185.141;60.167.138.18;122.228.56.154;122.228.72.160;116.211.185.143;122.228.72.162;115.231.191.154;122.228.56.145;221.233.41.20|0
60.171.244.5|sys.ahsz.gov.cn|20170207161935|60.171.247.234|0
183.166.112.56|p5.ssl.qhimg.com|20170207161935|101.227.5.23;101.227.5.22|0
60.171.92.17|clients3.google.com|20170207161935|74.125.23.102;74.125.23.101;74.125.23.138;74.125.23.100;74.125.23.113;74.125.23.139|0
60.175.202.12|cm.jd.com|20170207161935|106.39.167.232|0
183.162.36.210|sysupgrade.vivo.com.cn|20170207161935|182.34.127.87;58.223.166.230;180.96.71.124|0
124.112.251.164|init-p01st.push.apple.com|20170207161935|63.243.242.163;63.243.241.43|0
60.173.85.52|dyn.wps.cn|20170207161935|183.61.70.57;183.61.70.130;183.61.70.103|0
223.247.63.14|mobilemsg.youku.com|20170207161935|59.82.5.32|0
61.190.205.114|eweb.ahrcu.com|20170207161935|218.22.24.11|0
36.56.67.70|vweixinf.tc.qq.com|20170207161935|60.174.156.15;60.174.156.27;60.174.156.25;60.174.156.26;60.174.156.24;60.174.156.22|0
183.160.3.54|swa.gtimg.com|20170207161935|221.228.67.163;58.216.6.17;58.216.6.18;221.228.67.164;221.228.67.162;221.228.67.161;60.167.138.27;61.191.60.35;58.216.6.19;60.167.138.26;58.216.6.14;221.228.67.145|0
60.174.165.4|promo.lu.com|20170207161935|222.73.100.26|0
36.62.180.122|mobile.meituan.com|20170207161935|103.37.152.3|0
60.175.8.203|cwx.qlogo.cn|20170207161935|180.163.26.115;180.163.26.111;101.227.160.54;180.163.26.112;61.151.186.31;101.226.90.164;180.163.21.101;180.97.8.36;180.163.21.155;180.97.8.101;180.97.8.25|0
36.4.76.58|zyjc.sec.qq.com|20170207161935|101.226.68.81|0
220.180.235.74|localhost|20170207161935|127.0.0.1|0
36.4.212.59|msg.71.am|20170207161935|116.211.188.80;116.211.188.81;116.211.188.82;116.211.188.83;116.211.188.84;116.211.188.85;116.211.188.86;116.211.188.87;116.211.188.78;116.211.188.79|0
114.104.145.131|vv.video.qq.com|20170207161935|14.18.245.179|0
36.63.55.218|qzs.qq.com|20170207161935|115.231.191.154;122.228.72.162;221.233.41.28;116.211.185.142;116.211.185.144;60.167.138.18;122.228.56.145;221.233.41.27;221.233.41.20;116.211.185.143;122.228.56.154;116.211.185.141;61.155.220.194;122.228.56.147;122.228.72.161;122.228.72.160|0
36.5.20.72|tracker.openbittorrent.com|20170207161935||0
223.244.131.221|teredo.ipv6.microsoft.com|20170207161935||3
117.66.181.61|cm072.getui.igexin.com|20170207161935|183.131.1.81|0
223.241.130.218|m1.ifengimg.com|20170207161935|183.134.53.224;122.225.28.145|0
36.62.12.179|restapi.amap.com|20170207161935|140.205.174.76|0
36.63.68.39|dns.msftncsi.com|20170207161935|131.107.255.255|0
36.63.32.173|data.hicloud.com|20170207161935|117.78.58.31;118.194.57.31|0
183.165.75.60|ip.taobao.com|20170207161935|42.120.226.92;42.120.147.1;140.205.140.33;140.205.157.1|0
183.167.75.140|www.taobao.com|20170207161935|124.112.127.48|0
114.103.244.114|friendapi.uc108.org|20170207161935|115.236.49.168|0
36.5.197.98|dm.toutiao.com|20170207161935|60.28.208.141;60.28.208.142;60.28.208.140;60.28.208.143;60.28.208.144;60.28.208.145|0
10.70.231.34|ts.ah.cloutropy.cn|20170207161935|117.71.34.127|0
223.243.75.174|sum.comment.service.kugou.com|20170207161935|14.18.234.254;14.18.236.180|0
114.101.159.95|gm.mmstat.com|20170207161935|140.205.94.22|0
60.167.133.148|pasta.dianxinos.com|20170207161935|115.239.211.176;180.97.33.177|0
223.244.62.109|irs01.com|20170207161935|180.169.19.133;180.169.19.135;180.169.19.136;180.169.18.133;180.169.18.134;180.169.18.135;180.169.18.136|0
117.71.60.46|srf.qq.com|20170207161935|106.120.151.33;180.149.156.37;180.149.156.34;106.120.151.169|0
117.70.124.55|cm.poll.keke.cn|20170207161935|115.231.102.164|0
36.63.145.54|s.jpush.cn|20170207161935|118.145.3.81;113.31.17.108;223.202.132.110;121.46.20.38;111.13.48.104;113.31.80.134;121.46.20.41;223.202.132.115;118.145.3.76|0
36.62.75.141|qex.f.360.cn|20170207161935|103.28.8.182;220.181.158.28|0
223.241.130.218|m0.ifengimg.com|20170207161935|183.134.53.224;122.225.28.145|0
223.241.25.46|www.jovetech.com|20170207161935|58.56.111.41;58.56.111.27;58.56.111.40|0
36.62.54.252|clock.redhat.com|20170207161935||0
36.62.82.122|weibo.com|20170207161935|180.149.134.141|0
218.22.184.222|pool.ntp.org|20170207161935|83.168.200.199;212.47.249.141;61.216.153.107;46.31.185.18|0
36.63.151.227|m.baidu.com|20170207161935||0
220.178.29.178|researchnet.qqlive.qq.com|20170207161935|113.142.9.51|0
36.4.153.231|kkhulu.youyuan.com|20170207161935|123.103.60.4|0
117.57.81.254|clock.redhat.com|20170207161935||0
36.62.214.60|cu004.www.duba.net|20170207161935|115.231.27.129|0