Define a custom data type: its public int compareTo(ThirdSortClass o) method is the key to the three-field sort.
Custom key:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class ThirdSortClass implements WritableComparable<ThirdSortClass> {

    /**
     * The fields of the custom type. In this example every field takes part
     * in the sort; later examples will add fields that serve other purposes.
     */
    private String first;
    private String second;
    private String third;

    public ThirdSortClass() {}

    public ThirdSortClass(String first, String second, String third) {
        this.first = first;
        this.second = second;
        this.third = third;
    }

    /**
     * Deserialization: rebuild the custom key from the binary data in the stream.
     */
    @Override
    public void readFields(DataInput input) throws IOException {
        this.first = input.readUTF();
        this.second = input.readUTF();
        this.third = input.readUTF();
    }

    /**
     * Serialization: turn the custom key into binary data that can be sent over the stream.
     */
    @Override
    public void write(DataOutput output) throws IOException {
        output.writeUTF(first);
        output.writeUTF(second);
        output.writeUTF(third);
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((first == null) ? 0 : first.hashCode());
        result = prime * result + ((second == null) ? 0 : second.hashCode());
        result = prime * result + ((third == null) ? 0 : third.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) return true;
        if (obj == null) return false;
        if (getClass() != obj.getClass()) return false;
        ThirdSortClass other = (ThirdSortClass) obj;
        if (first == null) {
            if (other.first != null) return false;
        } else if (!first.equals(other.first)) return false;
        if (second == null) {
            if (other.second != null) return false;
        } else if (!second.equals(other.second)) return false;
        if (third == null) {
            if (other.third != null) return false;
        } else if (!third.equals(other.third)) return false;
        return true;
    }

    /**
     * Used for sorting in both the map and reduce phases, and for grouping
     * in the reduce phase. This is the key to the three-field sort: the whole
     * feature is implemented in this one method.
     */
    @Override
    public int compareTo(ThirdSortClass o) {
        if (!this.first.equals(o.getFirst())) {
            return this.first.compareTo(o.getFirst());
        } else if (!this.second.equals(o.getSecond())) {
            return this.second.compareTo(o.getSecond());
        } else if (!this.third.equals(o.getThird())) {
            return this.third.compareTo(o.getThird());
        }
        return 0;
    }

    public String getFirst() { return first; }
    public void setFirst(String first) { this.first = first; }
    public String getSecond() { return second; }
    public void setSecond(String second) { this.second = second; }
    public String getThird() { return third; }
    public void setThird(String third) { this.third = third; }
}
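Since compareTo() is plain Java, the ordering it defines can be checked outside of Hadoop. Below is a minimal sketch (the class ThirdSortClassDemo and the sample values are made up for this illustration). Note that String.compareTo() sorts lexicographically, so "10" would sort before "2"; numeric fields would need a numeric comparison.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class ThirdSortClassDemo {
    public static void main(String[] args) {
        List<ThirdSortClass> keys = new ArrayList<ThirdSortClass>();
        keys.add(new ThirdSortClass("b", "1", "2"));
        keys.add(new ThirdSortClass("a", "2", "1"));
        keys.add(new ThirdSortClass("a", "1", "3"));
        keys.add(new ThirdSortClass("a", "1", "2"));
        // Collections.sort() uses compareTo(): first, then second, then third
        Collections.sort(keys);
        for (ThirdSortClass k : keys) {
            System.out.println(k.getFirst() + "," + k.getSecond() + "," + k.getThird());
        }
        // prints, one per line: a,1,2  a,1,3  a,2,1  b,1,2
    }
}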
Map phase:
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ThirdMapper extends Mapper<LongWritable, Text, ThirdSortClass, Text> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.length() > 0) {
            // expect records of the form "first,second,third"
            String[] arr = line.split(",");
            if (arr.length == 3) {
                // the whole record becomes the composite key; the value keeps "second,third"
                context.write(new ThirdSortClass(arr[0], arr[1], arr[2]),
                        new Text(arr[1] + "," + arr[2]));
            }
        }
    }
}
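Before moving on to the reduce phase, it helps to see what this mapper emits. As a minimal illustration, assume a hypothetical four-line input file (the data is made up for this example, the same records as in the demo above):

b,1,2
a,2,1
a,1,3
a,1,2

For each line the mapper emits the whole record as the composite key and "second,third" as the value; for example, the line a,2,1 becomes the key (a,2,1) with the value "2,1". During the shuffle the framework then sorts these composite keys using compareTo().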
Reduce phase:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ThirdSortReducer extends Reducer<ThirdSortClass, Text, Text, Text> {

    private Text Okey = new Text();

    @Override
    protected void reduce(ThirdSortClass key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // for (Text val : values) {
        //     context.write(new Text(key.getFirst()), val);
        // }
        Okey.set(key.getFirst());
        // no grouping comparator is set, so each composite key forms its own
        // group and the first value is the only value
        context.write(Okey, values.iterator().next());
    }
}
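Why is it safe to take only values.iterator().next()? Because no grouping comparator is set, reduce-side grouping also uses the key's compareTo(), which distinguishes all three fields, so every composite key forms a group with exactly one value. With the hypothetical input above, the reducer is invoked four times, e.g. once with the key (a,1,2) and the single value "1,2". The commented-out loop would only be needed once a grouping comparator collapses several composite keys into one group.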
Driver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMain {

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = new Job(configuration, "third-sort-job");
        job.setJarByClass(JobMain.class);

        job.setMapperClass(ThirdMapper.class);
        job.setMapOutputKeyClass(ThirdSortClass.class);
        job.setMapOutputValueClass(Text.class);

        /**
         * No partitioner is used here; all the tests so far run with a single reducer.
         * Jobs with multiple reducers will be covered in detail when we discuss global
         * sorting, and a later example (summing the odd and even lines) will show how
         * to write a custom Partitioner.
         */
        // job.setPartitionerClass(ThirdSortPatitioner.class);

        job.setReducerClass(ThirdSortReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        /**
         * To keep this post from getting too long, setGroupingComparatorClass() is not
         * covered here either, because once that comparator is used the reducer has to
         * be written somewhat differently. A dedicated post will explain when to use it
         * and how the reduce-side grouping performs its comparisons.
         */
        // job.setGroupingComparatorClass(ThirdSortGroupingComparator.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path outputDir = new Path(args[1]);
        FileSystem fs = FileSystem.get(configuration);
        // remove any previous output so the job can be rerun
        if (fs.exists(outputDir)) {
            fs.delete(outputDir, true);
        }
        FileOutputFormat.setOutputPath(job, outputDir);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Run results:
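The original console output is not reproduced here. As a minimal illustration, assuming the hypothetical four-line input above, a single reducer, and the default TextOutputFormat (tab between key and value), the output file would contain the records fully sorted on all three fields:

a	1,2
a	1,3
a	2,1
b	1,2

A typical way to launch the job (the jar name and paths are assumptions):

hadoop jar third-sort.jar JobMain /input/third-sort /output/third-sort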
This post leaves one question open: the sorting throughout the whole flow relies on the key's compareTo() method, but that approach does not work in every situation, and the cleaner practice is to implement each comparison in its own comparator class. It was done this way here to keep the article concise and to highlight the one point being explained, without letting other concerns distract from it. Each of the topics skipped here will be covered as its own technique in later posts; the next one shows how to write a custom GroupingComparatorClass and SortComparatorClass.
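As a rough preview, a standalone sort comparator could look like the sketch below (the class name ThirdSortComparator is an assumption; it simply delegates to compareTo(), whereas a real implementation would typically compare the deserialized fields, or even the raw bytes, directly):

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class ThirdSortComparator extends WritableComparator {

    protected ThirdSortComparator() {
        // true: let the parent class create key instances for compare()
        super(ThirdSortClass.class, true);
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        return ((ThirdSortClass) a).compareTo((ThirdSortClass) b);
    }
}

It would be registered in the driver with job.setSortComparatorClass(ThirdSortComparator.class).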