Customization steps:
1. How to define a custom key sort comparator class. This is where the keys are compared for sorting. The comparator must extend WritableComparator.
public static class SortComparator extends WritableComparator
It must have a no-argument constructor and must override public int compare(WritableComparable w1, WritableComparable w2).
Alternatively, the comparator can implement the RawComparator interface (see the sketch after these steps).
Register it on the job with setSortComparatorClass.
2. How to define a custom grouping comparator class.
In the reduce phase, when the value iterator for a key is built, all keys whose first field is equal are treated as one group and their values are placed in the same iterator. The grouping comparator must extend WritableComparator.
public static class GroupingComparator extends WritableComparator
As with the key sort comparator class, it must have a no-argument constructor and must override public int compare(WritableComparable w1, WritableComparable w2).
As with the key sort comparator class, the grouping comparator can alternatively implement the RawComparator interface (again, see the sketch below).
Register it on the job with setGroupingComparatorClass.
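The WritableComparator approach used in this post deserializes both keys before comparing them. The RawComparator alternative mentioned in steps 1 and 2 compares the serialized bytes directly and avoids that cost. Below is a minimal sketch of what such a sort comparator could look like, assuming the key is serialized exactly as SecondSortClass.write does (writeUTF for first, then writeInt for second); the byte-level string comparison matches String.compareTo only for ASCII data.

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

public class RawSortComparator implements RawComparator<SecondSortClass> {

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // SecondSortClass.write emits writeUTF(first) (a 2-byte length prefix
        // followed by the UTF-8 bytes) and then writeInt(second) (4 bytes).
        int firstLen1 = WritableComparator.readUnsignedShort(b1, s1);
        int firstLen2 = WritableComparator.readUnsignedShort(b2, s2);
        // compare the first field byte by byte (equivalent to String.compareTo for ASCII data)
        int cmp = WritableComparator.compareBytes(b1, s1 + 2, firstLen1, b2, s2 + 2, firstLen2);
        if (cmp != 0) {
            return cmp;
        }
        // equal first fields: order the second field in descending order
        int second1 = WritableComparator.readInt(b1, s1 + 2 + firstLen1);
        int second2 = WritableComparator.readInt(b2, s2 + 2 + firstLen2);
        return -Integer.compare(second1, second2);
    }

    @Override
    public int compare(SecondSortClass a, SecondSortClass b) {
        // object-level comparison, used when the keys are already deserialized
        if (!a.getFirst().equals(b.getFirst())) {
            return a.getFirst().compareTo(b.getFirst());
        }
        return -Integer.compare(a.getSecond(), b.getSecond());
    }
}

It is registered the same way, via job.setSortComparatorClass(RawSortComparator.class); a raw grouping comparator would compare only the first-field bytes. The class name RawSortComparator is only for illustration.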
Custom key:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class SecondSortClass implements WritableComparable<SecondSortClass> {
    /**
     * Fields of the composite key; in this example both are used for sorting.
     * Later posts will add fields for other purposes.
     */
    private String first;
    private int second;

    public SecondSortClass() {}

    public SecondSortClass(String first, int second) {
        this.first = first;
        this.second = second;
    }

    /**
     * Deserialization: rebuild the custom key from the binary stream.
     */
    @Override
    public void readFields(DataInput input) throws IOException {
        this.first = input.readUTF();
        this.second = input.readInt();
    }

    /**
     * Serialization: turn the custom key into binary for transmission.
     */
    @Override
    public void write(DataOutput output) throws IOException {
        output.writeUTF(first);
        output.writeInt(second);
    }

    public String getFirst() {
        return first;
    }

    public void setFirst(String first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    /**
     * Not implemented here; the ordering is supplied by SortComparator instead.
     */
    @Override
    public int compareTo(SecondSortClass o) {
        return 0;
    }
}
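One point worth noting: the driver below does not set a custom partitioner, and SecondSortClass does not override hashCode, so with more than one reducer the default HashPartitioner would hash on Object.hashCode() and records for the same user could land on different reducers. A minimal sketch, assuming you want the first field alone to decide the partition, would be to add to SecondSortClass:

@Override
public int hashCode() {
    // hash on the natural key only, so that all records for the same user
    // reach the same reducer under the default HashPartitioner
    return first.hashCode();
}

@Override
public boolean equals(Object obj) {
    if (!(obj instanceof SecondSortClass)) {
        return false;
    }
    SecondSortClass other = (SecondSortClass) obj;
    return first.equals(other.first) && second == other.second;
}

With the default single reducer the original class works as-is.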
Custom sort comparator:
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class SortComparator extends WritableComparator {
    public SortComparator() {
        super(SecondSortClass.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        SecondSortClass a1 = (SecondSortClass) a;
        SecondSortClass b1 = (SecondSortClass) b;
        /**
         * Sort by the first field first, then by the second field (descending).
         */
        if (!a1.getFirst().equals(b1.getFirst())) {
            return a1.getFirst().compareTo(b1.getFirst());
        } else {
            return -(a1.getSecond() - b1.getSecond());
        }
    }
}
Custom grouping comparator:
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class GroupingComparator extends WritableComparator {
    public GroupingComparator() {
        super(SecondSortClass.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        SecondSortClass a1 = (SecondSortClass) a;
        SecondSortClass b1 = (SecondSortClass) b;
        /**
         * Group by the first field only.
         */
        return a1.getFirst().compareTo(b1.getFirst());
    }
}
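To see how the two comparators work together, take a few hypothetical map output keys such as (user2, 125), (user2, 300) and (user1, 80). The sort comparator orders them as (user1, 80), (user2, 300), (user2, 125): ascending by the first field, descending by the second. The grouping comparator then treats the two user2 keys as equal, so the reducer receives a single call for user2 whose value iterator yields 300 and then 125, i.e. the amounts arrive already sorted from highest to lowest.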
Map phase:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SecondMapper extends Mapper<LongWritable, Text, SecondSortClass, IntWritable> {
    private IntWritable cost = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // each input line is assumed to hold three comma-separated fields,
        // with the user in the second field and the cost in the third
        String line = value.toString().trim();
        if (line.length() > 0) {
            String[] arr = line.split(",");
            if (arr.length == 3) {
                cost.set(Integer.valueOf(arr[2]));
                context.write(new SecondSortClass(arr[1], Integer.valueOf(arr[2])), cost);
            }
        }
    }
}
Reduce phase:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SecondReducer extends Reducer<SecondSortClass, IntWritable, Text, Text> {
    private Text okey = new Text();
    private Text ovalue = new Text();

    @Override
    protected void reduce(SecondSortClass key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        StringBuffer sb = new StringBuffer();
        /**
         * Concatenate all amounts of the same user; thanks to the sort
         * comparator they already arrive in descending order.
         */
        for (IntWritable value : values) {
            sb.append(",");
            sb.append(value.get());
        }
        // drop the leading comma
        sb.delete(0, 1);
        okey.set(key.getFirst());
        ovalue.set(sb.toString());
        context.write(okey, ovalue);
    }
}
Driver:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMain {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = new Job(configuration, "sort-grouping-job");
        job.setJarByClass(JobMain.class);

        job.setMapperClass(SecondMapper.class);
        job.setMapOutputKeyClass(SecondSortClass.class);
        job.setMapOutputValueClass(IntWritable.class);

        // set the sort comparator
        job.setSortComparatorClass(SortComparator.class);
        // set the grouping comparator
        job.setGroupingComparatorClass(GroupingComparator.class);

        job.setReducerClass(SecondReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path outputDir = new Path(args[1]);
        FileSystem fs = FileSystem.get(configuration);
        // remove the output directory if it already exists
        if (fs.exists(outputDir)) {
            fs.delete(outputDir, true);
        }
        FileOutputFormat.setOutputPath(job, outputDir);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Run results:
Conclusion:
This post has only briefly shown how to customize the sort comparator and the grouping comparator. The example could also be extended with a TopK feature, i.e. outputting each user's N highest purchases; that is not implemented here, but if you are interested you can build it yourself along the lines of MapReduce-TopK. The next post will show how to customize a partitioner, using an example that separately sums the odd-numbered and even-numbered lines of the data.