Customization steps:
1. How to write a custom key comparator class. This comparator determines how keys are sorted, and it must extend WritableComparator.
public static class SortComparator extends WritableComparator
It must have a constructor (calling super(SecondSortClass.class, true), as in the code below) and override public int compare(WritableComparable w1, WritableComparable w2).
An alternative is to implement the RawComparator interface (see the sketch after this list).
Register it on the job with setSortComparatorClass.
2. How to write a custom grouping comparator class.
In the reduce phase, when the framework builds the value iterator for a key, all records whose first field is equal are treated as one group and put into the same value iterator. This is also a comparator and must extend WritableComparator.
public static class GroupingComparator extends WritableComparator
As with the key comparator class, it must have a constructor and override public int compare(WritableComparable w1, WritableComparable w2).
As with the key comparator class, the grouping comparator can alternatively implement the RawComparator interface.
Register it on the job with setGroupingComparatorClass.
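Both steps mention RawComparator as an alternative. A byte-level comparator avoids deserializing the keys during the sort. The following sketch is not from the original post: it assumes the serialization order used by SecondSortClass defined below (writeUTF for first, then writeInt for second), and raw byte comparison of the UTF-8 data only matches String.compareTo for ASCII first fields.
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;
// Hypothetical byte-level sort comparator: first ascending, second descending,
// without deserializing the keys. It would be registered with job.setSortComparatorClass(RawSortComparator.class).
public class RawSortComparator implements RawComparator<SecondSortClass> {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // first was written with writeUTF: a 2-byte length prefix followed by the UTF-8 bytes
        int len1 = WritableComparator.readUnsignedShort(b1, s1);
        int len2 = WritableComparator.readUnsignedShort(b2, s2);
        int cmp = WritableComparator.compareBytes(b1, s1 + 2, len1, b2, s2 + 2, len2);
        if (cmp != 0) {
            return cmp;
        }
        // second was written with writeInt: the 4 bytes after the string, compared in descending order
        int v1 = WritableComparator.readInt(b1, s1 + 2 + len1);
        int v2 = WritableComparator.readInt(b2, s2 + 2 + len2);
        return Integer.compare(v2, v1);
    }
    @Override
    public int compare(SecondSortClass a, SecondSortClass b) {
        if (!a.getFirst().equals(b.getFirst())) {
            return a.getFirst().compareTo(b.getFirst());
        }
        return Integer.compare(b.getSecond(), a.getSecond());
    }
}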
Custom key:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class SecondSortClass implements WritableComparable<SecondSortClass> {
/**
 * Fields contained in the custom key type; in this example both fields are used for sorting.
 * Later examples will add fields that serve other purposes.
 */
private String first;
private int second;
public SecondSortClass() {}
public SecondSortClass(String first, int second) {
this.first = first;
this.second = second;
}
/**
 * Deserialization: rebuild the custom key from the binary data in the stream.
 */
@Override
public void readFields(DataInput input) throws IOException {
this.first = input.readUTF();
this.second = input.readInt();
}
/**
 * Serialization: turn the custom key into the binary form sent over the stream.
 */
@Override
public void write(DataOutput output) throws IOException {
output.writeUTF(first);
output.writeInt(second);
}
public String getFirst() {
return first;
}
public void setFirst(String first) {
this.first = first;
}
public int getSecond() {
return second;
}
public void setSecond(int second) {
this.second = second;
}
/**
 * Not implemented here; the comparison logic is supplied by SortComparator instead.
 */
@Override
public int compareTo(SecondSortClass o) {
return 0;
}
}
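One caveat not covered in the original post: the driver below runs with the default single reduce task, where everything works as-is. With several reduce tasks, Hadoop's default HashPartitioner partitions on key.hashCode(), so two keys with the same first but different second could be sent to different reducers and would never land in the same group. One fix (the other being a custom partitioner, the topic of the next post) is to hash and test equality on first only; the two overrides below are assumed, illustrative additions inside SecondSortClass, not part of the original code:
// Hypothetical additions inside SecondSortClass: hash and compare equality on the first field only,
// so that the default HashPartitioner sends all records of one user to the same reducer.
@Override
public int hashCode() {
    return first == null ? 0 : first.hashCode();
}
@Override
public boolean equals(Object obj) {
    if (!(obj instanceof SecondSortClass)) {
        return false;
    }
    return this.first.equals(((SecondSortClass) obj).first);
}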
Custom sort comparator:
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class SortComparator extends WritableComparator {
public SortComparator() {
super(SecondSortClass.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
SecondSortClass a1 = (SecondSortClass)a;
SecondSortClass b1 = (SecondSortClass)b;
/**
 * Sort by the first field; when the first fields are equal, sort by the second field in descending order.
 */
if(!a1.getFirst().equals(b1.getFirst())) {
return a1.getFirst().compareTo(b1.getFirst());
} else {
// Integer.compare avoids int overflow and sorts the second field in descending order
return Integer.compare(b1.getSecond(), a1.getSecond());
}
}
}
Custom grouping comparator:
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class GroupingComparator extends WritableComparator {
public GroupingComparator() {
super(SecondSortClass.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
SecondSortClass a1 = (SecondSortClass) a;
SecondSortClass b1 = (SecondSortClass) b;
/**
 * Group only by the first field.
 */
return a1.getFirst().compareTo(b1.getFirst());
}
}
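To see the two comparators side by side, here is a throwaway local check that is not part of the original post; the user name and amounts are made up purely for illustration:
// Hypothetical local check: same user, two different costs.
public class ComparatorDemo {
    public static void main(String[] args) {
        SecondSortClass a = new SecondSortClass("user-a", 10);
        SecondSortClass b = new SecondSortClass("user-a", 30);
        // SortComparator: equal first fields, so the larger cost sorts first
        System.out.println(new SortComparator().compare(a, b));     // positive: b (30) sorts ahead of a (10)
        // GroupingComparator: only the first field matters
        System.out.println(new GroupingComparator().compare(a, b)); // 0: both keys fall into the same reduce group
    }
}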
Map phase:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class SecondMapper extends Mapper<LongWritable, Text, SecondSortClass, IntWritable> {
private IntWritable cost = new IntWritable();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString().trim();
if(line.length() > 0) {
// the input is comma-separated with three fields; the second field is the user and the third the amount spent
String[] arr = line.split(",");
if(arr.length == 3) {
int costValue = Integer.parseInt(arr[2]);
cost.set(costValue);
// the cost is carried both inside the key (for sorting) and as the plain value (for output in the reducer)
context.write(new SecondSortClass(arr[1], costValue), cost);
}
}
}
}
Reduce phase:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class SecondReducer extends Reducer<SecondSortClass, IntWritable, Text, Text> {
private Text okey = new Text();
private Text ovalue = new Text();
@Override
protected void reduce(SecondSortClass key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
StringBuffer sb = new StringBuffer();
/**
 * Concatenate all of this user's spending amounts into a single line.
 */
for ( IntWritable value : values ) {
sb.append(",");
sb.append(value.get());
}
// remove the leading comma
sb.delete(0, 1);
okey.set(key.getFirst());
ovalue.set(sb.toString());
context.write(okey, ovalue);
}
}
Job driver:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobMain {
public static void main(String[] args) throws Exception{
Configuration configuration = new Configuration();
Job job = new Job(configuration, "sort-grouping-job");
job.setJarByClass(JobMain.class);
job.setMapperClass(SecondMapper.class);
job.setMapOutputKeyClass(SecondSortClass.class);
job.setMapOutputValueClass(IntWritable.class);
// set the sort comparator
job.setSortComparatorClass(SortComparator.class);
// set the grouping comparator
job.setGroupingComparatorClass(GroupingComparator.class);
job.setReducerClass(SecondReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
Path outputDir = new Path(args[1]);
FileSystem fs = FileSystem.get(configuration);
// delete the output directory if it already exists so that reruns do not fail
if(fs.exists(outputDir)) {
fs.delete(outputDir, true);
}
FileOutputFormat.setOutputPath(job, outputDir);
System.exit(job.waitForCompletion(true)? 0: 1);
}
}
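To run the job (the jar name and paths here are placeholders, not from the original post), package the classes into a jar and launch it with something like: hadoop jar second-sort.jar JobMain /input/consumption /output/consumption. Because the driver removes an existing output directory first, rerunning the job does not fail because the output directory already exists.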
Run result:
Conclusion:
This post only briefly shows how to write a custom sort comparator and grouping comparator. The example could also be extended with Top-K functionality, i.e., for each user output the N largest purchases. That is not implemented here (only a rough sketch follows below); if you are interested, you can build it yourself along the lines of MapReduce-TopK. The next post will explain how to write a custom partitioner, using an example that separately computes the sum of the odd-numbered lines and the sum of the even-numbered lines of the input.
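The original post deliberately leaves Top-K unimplemented; the sketch below is only a rough illustration of the idea, not the author's code. Because SortComparator already delivers each user's costs in descending order, the reducer merely stops after the first N values. The class name TopKReducer and the constant N are hypothetical.
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// Hypothetical Top-K variant of SecondReducer: keep only the N largest purchases per user.
public class TopKReducer extends Reducer<SecondSortClass, IntWritable, Text, Text> {
    private static final int N = 3; // number of top purchases to keep (illustrative value)
    private Text okey = new Text();
    private Text ovalue = new Text();
    @Override
    protected void reduce(SecondSortClass key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();
        int count = 0;
        // values arrive sorted in descending order of cost thanks to SortComparator,
        // so the first N values are exactly the N largest purchases
        for (IntWritable value : values) {
            if (count++ == N) {
                break;
            }
            if (sb.length() > 0) {
                sb.append(",");
            }
            sb.append(value.get());
        }
        okey.set(key.getFirst());
        ovalue.set(sb.toString());
        context.write(okey, ovalue);
    }
}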