1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
The first few digits of a mobile number identify its home region and carrier, so a prefix lookup table can map each phone number to its carrier.
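A minimal sketch of that prefix lookup, using a tiny, made-up prefix table (the ProviderPartitioner further below applies the same idea inside a MapReduce job):
import java.util.HashMap;
import java.util.Map;

public class CarrierLookup {
    // Illustrative prefix table only; real systems load this mapping from a database.
    private static final Map<String, Integer> PREFIX_TO_CARRIER = new HashMap<String, Integer>();
    static {
        PREFIX_TO_CARRIER.put("135", 1); // 1: China Mobile
        PREFIX_TO_CARRIER.put("150", 2); // 2: China Unicom
        PREFIX_TO_CARRIER.put("182", 3); // 3: China Telecom
    }

    public static int carrierOf(String phone) {
        // Look up the carrier by the first three digits; 0 means "unknown".
        Integer code = PREFIX_TO_CARRIER.get(phone.substring(0, 3));
        return code == null ? 0 : code;
    }

    public static void main(String[] args) {
        System.out.println(carrierOf("13560439658")); // prints 1
    }
}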
package liuxun.hadoop.mr.dc;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class DataBean implements Writable {
private String tel;
private long upPayLoad;
private long downPayLoad;
private long totalPayLoad;
public DataBean() {
}
public DataBean(String tel, long upPayLoad, long downPayLoad) {
this.tel = tel;
this.upPayLoad = upPayLoad;
this.downPayLoad = downPayLoad;
this.totalPayLoad = upPayLoad + downPayLoad;
}
@Override
public String toString() {
return this.upPayLoad + "\t" + this.downPayLoad + "\t" + this.totalPayLoad;
}
public void readFields(DataInput in) throws IOException {
this.tel = in.readUTF();
this.upPayLoad = in.readLong();
this.downPayLoad = in.readLong();
this.totalPayLoad = in.readLong();
}
// Note two things: the order of the writes and their types must match readFields exactly
public void write(DataOutput out) throws IOException {
out.writeUTF(tel);
out.writeLong(upPayLoad);
out.writeLong(downPayLoad);
out.writeLong(totalPayLoad);
}
public String getTel() {
return tel;
}
public void setTel(String tel) {
this.tel = tel;
}
public long getUpPayLoad() {
return upPayLoad;
}
public void setUpPayLoad(long upPayLoad) {
this.upPayLoad = upPayLoad;
}
public long getDownPayLoad() {
return downPayLoad;
}
public void setDownPayLoad(long downPayLoad) {
this.downPayLoad = downPayLoad;
}
public long getTotalPayLoad() {
return totalPayLoad;
}
public void setTotalPayLoad(long totalPayLoad) {
this.totalPayLoad = totalPayLoad;
}
}
package liuxun.hadoop.mr.dc;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DataCountPartition {
public static class DCMapper extends Mapper<LongWritable, Text, Text, DataBean> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// accept
String line = value.toString();
// split
String[] fields = line.split("\t");
String tel = fields[1];
long up = Long.parseLong(fields[8]);
long down = Long.parseLong(fields[9]);
DataBean bean = new DataBean(tel, up, down);
// send
context.write(new Text(tel), bean);
}
}
public static class DCReducer extends Reducer<Text, DataBean, Text, DataBean> {
@Override
protected void reduce(Text key, Iterable<DataBean> values, Context context)
throws IOException, InterruptedException {
long up_sum = 0;
long down_sum = 0;
for (DataBean bean : values) {
up_sum += bean.getUpPayLoad();
down_sum += bean.getDownPayLoad();
}
DataBean bean = new DataBean("", up_sum, down_sum);
context.write(key, bean);
}
}
public static class ProviderPartitioner extends Partitioner<Text, DataBean> {
private static Map<String, Integer> providerMap = new HashMap<String, Integer>();
static {
// In real projects this mapping would be loaded from a database
// 1: China Mobile  2: China Unicom  3: China Telecom
providerMap.put("135", 1);
providerMap.put("136", 1);
providerMap.put("137", 1);
providerMap.put("150", 2);
providerMap.put("159", 2);
providerMap.put("182", 3);
providerMap.put("183", 3);
}
// The return value of this method is the partition number
// key: the key emitted by the mapper, here the phone number
// value: the value emitted by the mapper, here a DataBean
// numPartitions: the number of partitions, determined by the number of reducers; launching N reducers yields N partitions
@Override
public int getPartition(Text key, DataBean value, int numPartitions) {
// Derive the carrier from the phone number. Here we partition by key; in practice you could also partition by value.
String account = key.toString();
String sub_acc = account.substring(0, 3);
Integer code = providerMap.get(sub_acc);
if (code == null) {
code = 0;
}
return code;
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(DataCountPartition.class);
job.setMapperClass(DCMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DataBean.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
job.setReducerClass(DCReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DataBean.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setPartitionerClass(ProviderPartitioner.class);
// Set the number of reducers to launch
job.setNumReduceTasks(Integer.parseInt(args[2]));
job.waitForCompletion(true);
}
}
[email protected] 6000 0 2014-02-20
[email protected] 2000 0 2014-02-20
[email protected] 0 100 2014-02-20
[email protected] 3000 0 2014-02-20
[email protected] 9000 0 2014-02-20
[email protected] 0 200 2014-02-20
Requirement: compute each user's total income, total expenses, and total surplus, then sort the results: first by income in descending order; when incomes are equal, sort by expenses in ascending order.
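Worked through for the sample records above, the per-account totals (income, expenses, surplus) and the expected ordering by descending income would be:
[email protected]	12000.0	200.0	11800.0
[email protected]	8000.0	100.0	7900.0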
package liuxun.hadoop.mr.sort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class InfoBean implements WritableComparable<InfoBean> {
private String account; // account
private double income; // income
private double expenses;// expenses
private double surplus; // surplus (income - expenses)
public void set(String account,double income,double expenses) {
this.account = account;
this.income = income;
this.expenses = expenses;
this.surplus = this.income - this.expenses;
}
// Serialization
public void write(DataOutput out) throws IOException {
out.writeUTF(account);
out.writeDouble(income);
out.writeDouble(expenses);
out.writeDouble(surplus);
}
// Deserialization
public void readFields(DataInput in) throws IOException {
this.account = in.readUTF();
this.income = in.readDouble();
this.expenses = in.readDouble();
this.surplus = in.readDouble();
}
public int compareTo(InfoBean o) {
if (this.income == o.getIncome()) {
return this.expenses > o.getExpenses() ? 1 : -1;
}else {
return this.income > o.getIncome() ? -1 : 1;
}
}
public String getAccount() {
return account;
}
public void setAccount(String account) {
this.account = account;
}
public double getIncome() {
return income;
}
public void setIncome(double income) {
this.income = income;
}
public double getExpenses() {
return expenses;
}
public void setExpenses(double expenses) {
this.expenses = expenses;
}
public double getSurplus() {
return surplus;
}
public void setSurplus(double surplus) {
this.surplus = surplus;
}
// Note: toString determines the column order in which the bean is written to the output file
@Override
public String toString() {
return income+"\t"+expenses+"\t"+surplus+"\t";
}
}
② Write the MR job that computes the totals (SumStep)
package liuxun.hadoop.mr.sort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SumStep {
public static class SumMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
private Text k = new Text();
private InfoBean v = new InfoBean();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
String account = fields[0];
double in = Double.parseDouble(fields[1]);
double out = Double.parseDouble(fields[2]);
k.set(account);
v.set(account, in, out);
context.write(k, v);
}
}
public static class SumReducer extends Reducer<Text, InfoBean, Text, InfoBean> {
private InfoBean v = new InfoBean();
@Override
protected void reduce(Text key, Iterable<InfoBean> value, Context context)
throws IOException, InterruptedException {
double in_sum = 0;
double out_sum = 0;
for (InfoBean bean : value) {
in_sum += bean.getIncome();
out_sum += bean.getExpenses();
}
v.set("", in_sum, out_sum);
context.write(key, v);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SumStep.class);
job.setMapperClass(SumMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(InfoBean.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
job.setReducerClass(SumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(InfoBean.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
③ Write the MR job that sorts the statistics (SortStep)
package liuxun.hadoop.mr.sort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SortStep {
public static class SortMapper extends Mapper<LongWritable, Text, InfoBean, NullWritable> {
private InfoBean k = new InfoBean();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
String account = fields[0];
double in = Double.parseDouble(fields[1]);
double out = Double.parseDouble(fields[2]);
k.set(account, in, out);
context.write(k, NullWritable.get());
}
}
public static class SortReducer extends Reducer<InfoBean, NullWritable, Text, InfoBean> {
private Text k = new Text();
@Override
protected void reduce(InfoBean bean, Iterable<NullWritable> value, Context context)
throws IOException, InterruptedException {
String account = bean.getAccount();
k.set(account);
context.write(k, bean);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(SortStep.class);
job.setMapperClass(SortMapper.class);
job.setMapOutputKeyClass(InfoBean.class);
job.setMapOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
job.setReducerClass(SortReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(InfoBean.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Execution steps: run SumStep on the raw billing data first, then run SortStep using SumStep's output directory as its input.
Note:
With large input a Combiner improves efficiency; with very small input it may actually be slower, since it adds one more stage to the pipeline.
If the Combiner is pluggable (the job produces the same result with or without it), then the Combiner's logic is identical to the Reducer's.
A Combiner can also be non-pluggable; in practice Combiners are often used to filter data on the map side.
For example, when computing an overall average, the Combiner cannot simply reuse the Reducer's logic.
If it did:
a.txt 3 6 3   Combiner: 12/3 = 4
b.txt 2 5     Combiner: 7/2 = 3.5
Reducer: (4 + 3.5)/2 = 3.75 != 3.8
It must instead be computed as follows:
a.txt 3 6 3   Combiner: 12 3
b.txt 2 5     Combiner: 7 2
Reducer: (12 + 7)/(3 + 2) = 3.8
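A minimal code sketch of that idea, assuming the Mapper emits each number as a Text value of the form "number<TAB>1" (the class names and value format here are illustrative, not part of the original program): the Combiner forwards partial sums and counts, and only the Reducer performs the final division.
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class AvgCombineSketch {

    public static class AvgCombiner extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            long count = 0;
            for (Text t : values) {
                String[] parts = t.toString().split("\t");
                sum += Double.parseDouble(parts[0]);
                count += Long.parseLong(parts[1]);
            }
            // Emit the partial sum and partial count, NOT the partial average.
            context.write(key, new Text(sum + "\t" + count));
        }
    }

    public static class AvgReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            long count = 0;
            for (Text t : values) {
                String[] parts = t.toString().split("\t");
                sum += Double.parseDouble(parts[0]);
                count += Long.parseLong(parts[1]);
            }
            // Divide once, over the grand totals.
            context.write(key, new Text(String.valueOf(sum / count)));
        }
    }
}
In the driver, AvgCombiner would be registered with job.setCombinerClass(...) and AvgReducer with job.setReducerClass(...), the same way IndexCombiner is wired up in the InverseIndex job below.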
Example: WordCount is a case where the Combiner and the Reducer share the same logic, so the Combiner is pluggable. The inverted-index job below also uses a Combiner; its flow is analyzed as follows:
package liuxun.hadoop.mr.ii;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* Inverted index
*
* @author liuxun
*
*/
public class InverseIndex {
public static class IndexMapper extends Mapper<LongWritable, Text, Text, Text> {
private Text k = new Text();
private Text v = new Text();
@Override
protected void map(LongWritable key, Text value,Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] words = line.split(" ");
// The information about the input split currently being read can be obtained from the context
FileSplit inputSplit = (FileSplit) context.getInputSplit();
String path = inputSplit.getPath().toString();// format: hdfs://hostName:9000/directory/.../filename
// the /directory/.../filename portion could be extracted from this if needed
for (String word : words) {
k.set(word+"->"+path);
v.set("1");
context.write(k, v);
}
}
}
public static class IndexCombiner extends Reducer<Text, Text, Text, Text> {
private Text k = new Text();
private Text v = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String[] wordAndPath = key.toString().split("->");
String word = wordAndPath[0];
String path = wordAndPath[1];
int counter = 0;
for (Text t : values) {
counter += Integer.parseInt(t.toString());
}
k.set(word);
v.set(path+"->"+counter);
context.write(k, v);
}
}
public static class IndexReducer extends Reducer<Text, Text, Text, Text> {
private Text v = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
StringBuilder resultBuilder = new StringBuilder();
for (Text t : values) {
resultBuilder.append(t.toString()).append("\t");
}
v.set(resultBuilder.toString());
context.write(key, v);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job =Job.getInstance(conf);
job.setJarByClass(InverseIndex.class);
job.setMapperClass(IndexMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
job.setCombinerClass(IndexCombiner.class);
job.setReducerClass(IndexReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);
}
}
Package the jar and run a test: OK.
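For reference, a job such as DataCountPartition would typically be packaged into a jar and launched roughly like this (the jar name and HDFS paths are placeholders; the last argument is the number of reducers):
hadoop jar datacount.jar liuxun.hadoop.mr.dc.DataCountPartition /input/flowdata /output/flowcount 4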