1. Partitioning
Steps to implement partitioning (a minimal skeleton distilling these steps is sketched right after this list; the full example follows):
1.1 Analyze the specific business logic first and decide roughly how many partitions are needed.
1.2 Write a class that extends org.apache.hadoop.mapreduce.Partitioner.
1.3 Override public int getPartition; based on the business logic (for example, looking the key up in a database or in configuration), return a partition number so that records belonging to the same partition get the same number.
1.4 In the main method, set the partitioner class: job.setPartitionerClass(DataPartitioner.class);
1.5 Set the number of reducers: job.setNumReduceTasks(6);
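A minimal skeleton that distills steps 1.2 through 1.5 (the key/value types and the hash-based partition number below are placeholders, not from the original; the concrete partitioner in the example that follows uses a lookup table instead):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Placeholder key/value types; replace with the job's actual map output types.
public class DataPartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        // Records that must land in the same output file return the same number.
        // In a real job this mapping could come from a database or a configuration file.
        return (key.toString().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

// In main():
// job.setPartitionerClass(DataPartitioner.class);
// job.setNumReduceTasks(6); // one reducer per partition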
The following example computes the upstream traffic, downstream traffic, and total traffic for each phone number, and requires the output to be partitioned (here, by phone-number prefix).
DataInfo.java

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class DataInfo implements Writable {

    private String tel;    // phone number
    private long upFlow;   // upstream traffic
    private long downFlow; // downstream traffic
    private long sumFlow;  // total traffic

    public DataInfo() {}

    public DataInfo(String tel, long upFlow, long downFlow) {
        this.tel = tel;
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.sumFlow = upFlow + downFlow;
    }

    @Override // serialize the fields to the output stream
    public void write(DataOutput out) throws IOException {
        out.writeUTF(tel);
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(sumFlow);
    }

    @Override // deserialize back into an object; the field order must match write()
    public void readFields(DataInput in) throws IOException {
        this.tel = in.readUTF();
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + sumFlow;
    }

    public String getTel() { return tel; }
    public void setTel(String tel) { this.tel = tel; }
    public long getUpFlow() { return upFlow; }
    public void setUpFlow(long upFlow) { this.upFlow = upFlow; }
    public long getDownFlow() { return downFlow; }
    public void setDownFlow(long downFlow) { this.downFlow = downFlow; }
    public long getSumFlow() { return sumFlow; }
    public void setSumFlow(long sumFlow) { this.sumFlow = sumFlow; }
}
DataCount.java

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DataCount {

    // Map: parse one line into (phone number, DataInfo)
    public static class DCMapper extends Mapper<LongWritable, Text, Text, DataInfo> {
        private Text text = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] str = line.split(" ");
            String tel = str[0];
            long up = Long.parseLong(str[1]);
            long down = Long.parseLong(str[2]);
            DataInfo data = new DataInfo(tel, up, down);
            text.set(tel);
            context.write(text, data);
        }
    }

    // Partition: route records to reducers by phone-number prefix
    public static class DCPartitioner extends Partitioner<Text, DataInfo> {
        private static Map<String, Integer> provider = new HashMap<String, Integer>();
        static {
            // In practice this mapping could be read from a database or a configuration file
            provider.put("134", 1);
            provider.put("135", 2);
            provider.put("136", 3);
        }

        @Override
        public int getPartition(Text key, DataInfo data, int numPartitions) {
            String prefix = key.toString().substring(0, 3);
            Integer num = provider.get(prefix);
            if (num == null) {
                num = 0; // unknown prefixes go to partition 0
            }
            return num;
        }
    }

    // Reducer: sum upstream and downstream traffic per phone number
    public static class DCReducer extends Reducer<Text, DataInfo, Text, DataInfo> {
        @Override
        protected void reduce(Text key, Iterable<DataInfo> values, Context context)
                throws IOException, InterruptedException {
            long up = 0;
            long down = 0;
            for (DataInfo data : values) {
                up += data.getUpFlow();
                down += data.getDownFlow();
            }
            DataInfo dataInfo = new DataInfo("", up, down);
            context.write(key, dataInfo);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "partition");

        job.setJarByClass(DataCount.class);

        job.setMapperClass(DCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DataInfo.class);

        job.setReducerClass(DCReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DataInfo.class);

        job.setPartitionerClass(DCPartitioner.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Number of reduce tasks = number of partitions; at least 4 here,
        // because getPartition can return 0, 1, 2, or 3
        job.setNumReduceTasks(Integer.parseInt(args[2]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
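Assuming the two classes above are packaged into a jar (the jar name and the paths below are hypothetical), the job could be launched like this, with the third argument giving the number of reduce tasks:

    hadoop jar datacount.jar DataCount /data/flow/input /data/flow/output 4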
2. Sorting
MapReduce sorts by k2 (the map output key) by default. To define a custom sort order, the class being sorted must implement the WritableComparable interface and put the ordering rule in its compareTo method (the MapReduce shuffle calls this method automatically); then use that object as k2 and the output will be sorted accordingly.
Part of the code is shown below:
@Override
public int compareTo(InfoBean o) {
    if (this.income == o.getIncome()) {
        if (this.expenses == o.getExpenses()) {
            return 0; // equal income and expenses compare as equal
        }
        return this.expenses > o.getExpenses() ? 1 : -1;
    }
    return this.income > o.getIncome() ? 1 : -1;
}
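For context, here is a minimal sketch of what the complete key class might look like, assuming InfoBean only carries the income and expenses fields used in the snippet above (the long field types and the serialization order are assumptions, not from the original):

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Minimal sketch: a custom k2 that sorts by income, then by expenses.
public class InfoBean implements WritableComparable<InfoBean> {

    private long income;
    private long expenses;

    public InfoBean() {}

    public InfoBean(long income, long expenses) {
        this.income = income;
        this.expenses = expenses;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(income);
        out.writeLong(expenses);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.income = in.readLong();   // same order as write()
        this.expenses = in.readLong();
    }

    @Override // the shuffle calls this to order the keys
    public int compareTo(InfoBean o) {
        if (this.income == o.income) {
            return Long.compare(this.expenses, o.expenses);
        }
        return Long.compare(this.income, o.income);
    }

    public long getIncome() { return income; }
    public long getExpenses() { return expenses; }

    @Override
    public String toString() {
        return income + "\t" + expenses;
    }
}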
3. Combiner
A combiner performs a local merge of the map output on the map side (it is essentially a reducer) in order to reduce the amount of data transferred to the reducers.
Without a combiner, all aggregation is done in the reduce phase, which is comparatively inefficient; with one, each map task that finishes aggregates its own output locally, which speeds the job up.
Note: the combiner's output becomes the reducer's input, and because the combiner is optional (the framework may apply it zero or more times), adding one must never change the final result. A combiner should therefore only be used when the reducer's input key/value types match its output key/value types and the operation tolerates partial aggregation, for example summation or taking a maximum.
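As a sketch of the safe case described above (a summation whose input and output key/value types are identical), a word-count style job can simply reuse a summing reducer as its combiner. The class below is illustrative and not part of this post's examples:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums the counts for one word; input and output are both <Text, IntWritable>,
// so running it as a combiner cannot change the final result.
public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}

// In the driver, the same class serves as both combiner and reducer:
// job.setCombinerClass(IntSumReducer.class);
// job.setReducerClass(IntSumReducer.class);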
Below is an inverted index example.
a.txt: hello tom
hello jerry
.....
b.txt: hello jerry
hello tom
....
Output:
hello    a.txt->2 b.txt->2
......
---------------------------------
Map phase
<0,"hello tom">
....
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->a.txt",1);
context.write("hello->b.txt",1);
context.write("hello->b.txt",1);
context.write("hello->b.txt",1);
--------------------------------------------------------
Combiner phase
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->a.txt",1>
<"hello->b.txt",1>
<"hello->b.txt",1>
<"hello->b.txt",1>
context.write("hello","a.txt->5");
context.write("hello","b.txt->3");
--------------------------------------------------------
Reducer phase
<"hello",{"a.txt->5","b.txt->3"}>
context.write("hello","a.txt->5 b.txt->3");
-------------------------------------------------------
Result:
hello "a.txt->5 b.txt->3"
tom "a.txt->2 b.txt->1"
kitty "a.txt->1"
.......
Combine.java

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Combine {

    // Map: emit <"word->fileName", "1"> for every word
    public static class CBMapper extends Mapper<Object, Text, Text, Text> {
        private Text k = new Text();
        private Text v = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] str = line.split(" ");
            // The input split, obtained through the context, tells us which file the line came from
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String fileName = inputSplit.getPath().getName(); // e.g. "a.txt"
            for (String word : str) {
                k.set(word + "->" + fileName);
                v.set("1");
                context.write(k, v);
            }
        }
    }

    // Combiner (first reduce step): sum the counts per "word->fileName"
    // and re-emit them as <"word", "fileName->count">
    public static class CBCombiner extends Reducer<Text, Text, Text, Text> {
        private Text k = new Text();
        private Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String line = key.toString();
            String[] str = line.split("->");
            String word = str[0];
            String fileName = str[1];
            int count = 0;
            for (Text t : values) {
                count += Integer.parseInt(t.toString());
            }
            k.set(word);
            v.set(fileName + "->" + count);
            context.write(k, v);
        }
    }

    // Reduce (second reduce step): concatenate the per-file counts for each word
    public static class CBReducer extends Reducer<Text, Text, Text, Text> {
        private Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String result = "";
            for (Text t : values) {
                result += t.toString() + "\t";
            }
            v.set(result);
            context.write(key, v);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Build the job object
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Combiner");

        // Set the class that contains main()
        job.setJarByClass(Combine.class);

        // Mapper settings
        job.setMapperClass(CBMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Combiner setting
        job.setCombinerClass(CBCombiner.class);

        // Reducer settings
        job.setReducerClass(CBReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}