Goal:
Input data:
1. sort1 1
2. sort2 3
3. sort2 88
4. sort2 54
5. sort1 2
6. sort6 22
7. sort6 888
8. sort6 58
Output data:
1. sort1 1,2
2. sort2 3,54,88
3. sort6 22,58,888
Since this is a secondary sort, the input records above that share the same first field must end up in the same partition; otherwise we cannot get the secondary-sort effect we want. The MapReduce framework by default sorts only on the key, so here we need a custom comparison method for the key.
First, understand how MapReduce processes data:
(I first saw this explained on another expert's blog: http://blog.csdn.net/lzm1340458776/article/details/42875751)
We should start with a solid understanding of the full MapReduce data flow; without that foundation there is no way to find a line of attack on this problem. Roughly, it goes like this: the framework first splits the input file via getSplits(), and each split is handled by one MapTask. Each InputSplit's records are fed into the map() function; the intermediate results are sorted in the circular buffer, then partitioned, secondary-sorted by the custom comparator (if one is configured), and merged. The Shuffle phase then transfers the data to the reduce tasks, where it is again buffered and merge-sorted between memory and disk. The data is then grouped by key, and reduce() is invoked once per group before the final results are written out. I sketched the rough flow in the diagram below:
Solution outline:
Map-side sorting:
First, get the input records into the same partition and sort them there. Because MapReduce sorts only on the key, we need a composite key that carries both fields, in the form [key,value]. The map output then has the structure {[key,value],value}:
1. {[sort1,1],1}
2. {[sort2,3],3}
3. {[sort2,88],88}
4. {[sort2,54],54}
5. {[sort1,2],2}
6. {[sort6,22],22}
7. {[sort6,888],888}
8. {[sort6,58],58}
Tip: make absolutely sure the map-side sort result is correct; otherwise no grouping scheme will produce the result you want (I fell into this trap myself and spent a long time finding the cause).
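One way to avoid that trap is to sanity-check the composite key's compareTo() locally before submitting a job. A minimal sketch (the class name SortOrderCheck is just for illustration; it uses the SecondKeySort class defined later in this post):

package com.secondSortJob;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/** Local sanity check for the composite key's sort order. */
public class SortOrderCheck {
    public static void main(String[] args) {
        List<SecondKeySort> keys = new ArrayList<SecondKeySort>();
        keys.add(new SecondKeySort(new Text("sort2"), new IntWritable(88)));
        keys.add(new SecondKeySort(new Text("sort1"), new IntWritable(2)));
        keys.add(new SecondKeySort(new Text("sort2"), new IntWritable(3)));
        keys.add(new SecondKeySort(new Text("sort1"), new IntWritable(1)));

        // Collections.sort uses SecondKeySort.compareTo, the same ordering
        // the framework applies to map output keys
        Collections.sort(keys);

        for (SecondKeySort k : keys) {
            System.out.println(k);
        }
        // expected output:
        // sort1    1
        // sort1    2
        // sort2    3
        // sort2    88
    }
}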
The map-side sorted result should look like this:
sort1:1,
sort1:2,
sort2:3,
sort2:54,
sort2:88,
sort6:22,
sort6:58,
sort6:888,
Reduce-side grouping:
As mentioned earlier, the number of times the reduce() method runs is determined by the number of groups. On the reduce side we group by the first field of the composite key and then process the corresponding values, which gives us exactly the result we want (a quick local check of this grouping rule follows the list below):
1. sort1 1,2
2. sort2 3,54,88
3. sort6 22,58,888
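To make the grouping contract concrete, here is a small local check (GroupingCheck is a name I made up; it uses the SecondKeySort and GroupingComparator classes defined later in this post). A return value of 0 means two composite keys belong to the same group and therefore to the same reduce() call:

package com.secondSortJob;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

/** Local check of the grouping rule: only the first field matters. */
public class GroupingCheck {
    public static void main(String[] args) {
        GroupingComparator gc = new GroupingComparator();
        SecondKeySort a = new SecondKeySort(new Text("sort2"), new IntWritable(3));
        SecondKeySort b = new SecondKeySort(new Text("sort2"), new IntWritable(88));
        SecondKeySort c = new SecondKeySort(new Text("sort1"), new IntWritable(1));

        System.out.println(gc.compare(a, b)); // 0: same group despite different counts
        System.out.println(gc.compare(a, c)); // non-zero: different groups
    }
}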
The concrete code implementation follows:
Custom sort (composite key) class:
package com.secondSortJob;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Composite key: the natural key plus the count.
 * compareTo() orders by the natural key first, then by the count numerically.
 * Created by Administrator on 2018/1/28 0028.
 */
public class SecondKeySort implements WritableComparable<SecondKeySort> {
    private Text key = new Text();
    private IntWritable count = new IntWritable();

    public SecondKeySort(Text key, IntWritable count) {
        this.key = key;
        this.count = count;
    }

    public SecondKeySort() {
    }

    public Text getKey() {
        return key;
    }

    public void setKey(Text key) {
        this.key = key;
    }

    public IntWritable getCount() {
        return count;
    }

    public void setCount(IntWritable count) {
        this.count = count;
    }

    public void write(DataOutput dataOutput) throws IOException {
        this.key.write(dataOutput);
        this.count.write(dataOutput);
    }

    public void readFields(DataInput dataInput) throws IOException {
        this.key.readFields(dataInput);
        this.count.readFields(dataInput);
    }

    public int compareTo(SecondKeySort o) {
        int cmp = this.key.compareTo(o.getKey());
        if (cmp != 0) {
            // different natural keys: order by the first field
            return cmp;
        }
        // same natural key: order by the numeric count
        return Integer.compare(this.count.get(), o.getCount().get());
    }

    @Override
    public String toString() {
        return this.key.toString() + "\t" + this.count.toString();
    }
}
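A note on the design: compareTo() above defines the full map-side sort order (first field, then count), while the grouping comparator later in this post collapses that order down to the first field only. This split between sort order and group order is what delivers the values to reduce() already sorted within each group.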
Mapper class:
package com.secondSortJob;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Emits {[key,count], count}: the composite key carries both fields,
 * the value repeats the count.
 * Created by Administrator on 2018/1/28 0028.
 */
public class SecondSortMapper extends Mapper<LongWritable, Text, SecondKeySort, IntWritable> {
    // reusing one key object is safe: context.write() serializes it immediately
    private SecondKeySort secondKeySort = new SecondKeySort();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // each input line looks like "sort1\t1"
        String[] arrs = value.toString().split("\t");
        int count = Integer.parseInt(arrs[1].trim());
        secondKeySort.setKey(new Text(arrs[0]));
        secondKeySort.setCount(new IntWritable(count));
        context.write(secondKeySort, new IntWritable(count));
    }
}
Reducer class:
package com.secondSortJob;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Called once per group; concatenates the (already sorted) values.
 * Created by Administrator on 2018/1/28 0028.
 */
public class SecondSortReducer extends Reducer<SecondKeySort, IntWritable, Text, NullWritable> {
    @Override
    protected void reduce(SecondKeySort key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // read the natural key before iterating: Hadoop reuses the key object,
        // so its count field changes as we walk through the values
        StringBuilder str = new StringBuilder(key.getKey().toString()).append(":");
        for (IntWritable value : values) {
            str.append(value).append(",");
        }
        context.write(new Text(str.toString()), NullWritable.get());
    }
}
Custom partitioner class: (ensures records with the same first field land in the same partition; the number of partitions equals the number of reduce tasks)
package com.secondSortJob;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitions on the natural key only, so all records that share a first
 * field go to the same reducer.
 * Created by Administrator on 2018/1/29 0029.
 */
public class GroupingPartitioner extends Partitioner<SecondKeySort, IntWritable> {
    @Override
    public int getPartition(SecondKeySort secondKeySort, IntWritable intWritable, int numPartitions) {
        // mask off the sign bit: hashCode() can be negative, and a negative
        // partition number would make the job fail
        return (secondKeySort.getKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
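Partitioning on getKey() alone, rather than on the whole composite key, is the crucial point here: if [sort2,3] and [sort2,88] could hash to different partitions, they would reach different reducers and no grouping comparator could bring them back together.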
Grouping comparator class: (you can either implement the RawComparator interface or extend the WritableComparator class)
package com.secondSortJob;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

import java.io.IOException;

/**
 * Groups composite keys by their first (Text) field only.
 * Created by Administrator on 2018/1/28 0028.
 */
public class GroupingComparator implements RawComparator<SecondKeySort> {

    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        try {
            // A serialized SecondKeySort is a Text key (vint length + bytes)
            // followed by the IntWritable count. Compare only the key bytes;
            // comparing a fixed number of bytes would only work by accident
            // while every key happens to have the same length.
            int keyLen1 = WritableComparator.readVInt(b1, s1);
            int keyLen2 = WritableComparator.readVInt(b2, s2);
            int vintLen1 = WritableUtils.decodeVIntSize(b1[s1]);
            int vintLen2 = WritableUtils.decodeVIntSize(b2[s2]);
            return WritableComparator.compareBytes(b1, s1 + vintLen1, keyLen1,
                                                   b2, s2 + vintLen2, keyLen2);
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
    }

    public int compare(SecondKeySort o1, SecondKeySort o2) {
        return o1.getKey().compareTo(o2.getKey());
    }
}
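The parenthetical above mentions that extending WritableComparator is the other option. For completeness, a sketch of what that alternative could look like (GroupingComparator2 is a name I made up; it trades the speed of raw-byte comparison for simplicity, since the framework deserializes both keys before calling compare):

package com.secondSortJob;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/** Alternative grouping comparator: deserialize, then compare the first field. */
public class GroupingComparator2 extends WritableComparator {
    protected GroupingComparator2() {
        // true => create key instances so the object-level compare below is used
        super(SecondKeySort.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        SecondKeySort k1 = (SecondKeySort) a;
        SecondKeySort k2 = (SecondKeySort) b;
        return k1.getKey().compareTo(k2.getKey());
    }
}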
Driver (main) class:
package com.secondSortJob;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;

/**
 * Secondary sort driver.
 * Created by Administrator on 2018/1/28 0028.
 */
public class SecondSortJob {
    private static Configuration configuration = new Configuration();
    // the resource manager hostname is a bare host, not an hdfs:// URI
    private static String master = "192.168.8.222";
    private static String masterFS = "hdfs://192.168.8.222:9000";
    private static FileSystem fs = null;

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        configuration.set("yarn.resourcemanager.hostname", master);
        fs = FileSystem.get(URI.create(masterFS), configuration);
        Job job = Job.getInstance(configuration);

        String inputPath = "/lsw/secondSort.txt";
        String secondSortResult = "/lsw/secondSortResult";
        // remove a leftover output directory, or the job will refuse to start
        if (fs.isDirectory(new Path(masterFS + secondSortResult))) {
            fs.delete(new Path(masterFS + secondSortResult), true);
        }

        job.setMapOutputKeyClass(SecondKeySort.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setJarByClass(SecondSortJob.class);
        job.setMapperClass(SecondSortMapper.class);
        job.setReducerClass(SecondSortReducer.class);
        job.setPartitionerClass(GroupingPartitioner.class);
        job.setGroupingComparatorClass(GroupingComparator.class);
        // job.setNumReduceTasks(3);

        FileInputFormat.addInputPath(job, new Path(masterFS + inputPath));
        FileOutputFormat.setOutputPath(job, new Path(masterFS + secondSortResult));
        job.waitForCompletion(true);
    }
}
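If everything is wired up correctly, the job should leave a single output file (part-r-00000) under /lsw/secondSortResult whose contents, given the reducer's string building (note the trailing commas), should be:
sort1:1,2,
sort2:3,54,88,
sort6:22,58,888,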