First we need to understand how sorting works in Java. In Java (almost) everything is an object, and each of the basic types carries its own comparison strategy, so every comparison operation delegates to that strategy. The problem therefore reduces to implementing a strategy of our own: the strategy pattern.
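As a quick, self-contained illustration (plain Java, nothing Hadoop-specific; the class name StrategyDemo is made up): java.util.Arrays.sort accepts a Comparator, which is exactly such a pluggable comparison strategy.
import java.util.Arrays;
import java.util.Comparator;

public class StrategyDemo {
    public static void main(String[] args) {
        String[] words = { "pear", "fig", "banana" };
        // Supply the comparison strategy as an object: order by length
        // instead of String's natural lexicographic order.
        Arrays.sort(words, new Comparator<String>() {
            public int compare(String a, String b) {
                return a.length() - b.length();
            }
        });
        System.out.println(Arrays.toString(words)); // [fig, pear, banana]
    }
}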
Hadoop's I/O system defines many Writable types, such as IntWritable and Text. Each of these implements the WritableComparable interface and supplies its comparison strategy through a WritableComparator, a base class that in turn implements RawComparator, the most primitive comparator interface. RawComparator's byte-level compare method is the hook Hadoop leaves us for performance: it lets us compare records directly in their serialized form, so if only the first 4 bytes of some type interest us, we can compare just those 4 bytes instead of deserializing the whole record.
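For instance, a raw comparator that looks only at the first 4 serialized bytes of each record might be sketched like this (Prefix4Comparator is a hypothetical name, not a Hadoop class):
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

// Sketch: order records by their first 4 serialized bytes only,
// without deserializing anything.
public class Prefix4Comparator<T> implements RawComparator<T> {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // Only the 4-byte prefix matters; the rest of each record is skipped.
        return WritableComparator.compareBytes(b1, s1, 4, b2, s2, 4);
    }

    @Override
    public int compare(T o1, T o2) {
        // The object-level path is not needed for this sketch.
        throw new UnsupportedOperationException();
    }
}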
Hadoop: The Definitive Guide has an example about finding the maximum temperature. I varied it a little and ran some experiments. By implementing a partitioning (Partitioner) strategy, keys sharing some characteristic can be sent to the same reducer; by implementing a grouping strategy, the values of keys sharing some characteristic are gathered together (once gathered, they form the Iterable seen in the reduce phase); finally, a sort strategy is implemented, and that is the crux of the matter: it lets us control how keys are ordered. A small worked example follows.
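To make the three strategies concrete, here is a tiny worked example with made-up keys. Suppose the map phase emits the composite keys (1,3), (2,4), (1,9), (1,5), and the sort strategy orders the first field ascending and the second descending, as KeyComparator below does. The partitioning and grouping strategies both look only at the first field, so one reduce call sees the group for first = 1 with its keys already ordered as (1,9), (1,5), (1,3), and another sees (2,4). The first key of each group therefore already carries that group's maximum second value.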
At work I once wrote an MR job that estimates how long users stay online. I gave no thought to efficiency at the time and pushed all of the sorting into the reduce phase, which in hindsight is a real hazard. Given the time, it would be worth rewriting it to use secondary sort.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class IntPair implements WritableComparable<IntPair> {
    private int first = 0;
    private int second = 0;

    public String toString() {
        return first + "\t" + second;
    }
    /**
     * Set the left and right values.
     */
    public void set(int left, int right) {
        first = left;
        second = right;
    }

    public int getFirst() {
        return first;
    }

    public int getSecond() {
        return second;
    }
    /**
     * Read the two integers.
     * Encoded as: MIN_VALUE -> 0, 0 -> -MIN_VALUE, MAX_VALUE -> -1
     * Shifting by MIN_VALUE maps signed ints onto unsigned order, so a
     * lexicographic comparison of the serialized bytes (compareBytes)
     * agrees with numeric order.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        first = in.readInt() + Integer.MIN_VALUE;
        second = in.readInt() + Integer.MIN_VALUE;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(first - Integer.MIN_VALUE);
        out.writeInt(second - Integer.MIN_VALUE);
    }
    @Override
    public int hashCode() {
        return first * 157 + second;
    }

    @Override
    public boolean equals(Object right) {
        if (right instanceof IntPair) {
            IntPair r = (IntPair) right;
            return r.first == first && r.second == second;
        } else {
            return false;
        }
    }
    /** A Comparator that compares serialized IntPair records byte by byte. */
    public static class Comparator extends WritableComparator {
        public Comparator() {
            super(IntPair.class);
        }

        public int compare(byte[] b1, int s1, int l1,
                           byte[] b2, int s2, int l2) {
            return compareBytes(b1, s1, l1, b2, s2, l2);
        }
    }

    static { // register this comparator as the default for IntPair
        WritableComparator.define(IntPair.class, new Comparator());
    }
    @Override
    public int compareTo(IntPair o) {
        if (first != o.first) {
            return first < o.first ? -1 : 1;
        } else if (second != o.second) {
            return second < o.second ? -1 : 1;
        } else {
            return 0;
        }
    }

    /** Compare two ints without the overflow risk of subtraction. */
    public static int compare(int left, int right) {
        return left > right ? 1 : (left == right ? 0 : -1);
    }
}
The MapReduce code follows. Note that the driver's setSortComparatorClass call replaces IntPair's registered default comparator with KeyComparator for this job:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class UseSecondarySort extends Configured implements Tool {
    public static class Map extends Mapper<LongWritable, Text, IntPair, NullWritable> {
        private final IntPair key = new IntPair();

        public void map(LongWritable inkey, Text invalue, Context context)
                throws IOException, InterruptedException {
            // Parse the first two whitespace-separated integers on the line.
            StringTokenizer itr = new StringTokenizer(invalue.toString());
            int left = 0;
            int right = 0;
            if (itr.hasMoreTokens()) {
                left = Integer.parseInt(itr.nextToken());
                if (itr.hasMoreTokens()) {
                    right = Integer.parseInt(itr.nextToken());
                }
            }
            key.set(left, right);
            context.write(key, NullWritable.get());
        }
    }
    public static class Reduce extends Reducer<IntPair, NullWritable, IntPair, Text> {
        private static final IntPair sign = new IntPair();

        public void reduce(IntPair key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // Emit a marker row before each group; writing this.toString()
            // makes it visible which Reducer instance handled the group.
            sign.set(888888, 888888);
            context.write(sign, new Text(this.toString()));
            for (NullWritable it : values)
                context.write(key, new Text(this.toString()));
        }
    }
    public static class FirstPartitioner extends Partitioner<IntPair, NullWritable> {
        @Override
        public int getPartition(IntPair key, NullWritable value, int numPartitions) {
            // Partition on the first field only. Mask with Integer.MAX_VALUE
            // rather than Math.abs, which returns a negative number when
            // key.getFirst() * 127 overflows to Integer.MIN_VALUE.
            return (key.getFirst() * 127 & Integer.MAX_VALUE) % numPartitions;
        }
    }
    public static class FirstGroupingComparator implements RawComparator<IntPair> {
        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            // Group on the first field only: compare just its 4 serialized bytes.
            return WritableComparator.compareBytes(b1, s1, Integer.SIZE / 8,
                                                   b2, s2, Integer.SIZE / 8);
        }

        @Override
        public int compare(IntPair o1, IntPair o2) {
            int l = o1.getFirst();
            int r = o2.getFirst();
            return l == r ? 0 : (l < r ? -1 : 1);
        }
    }
    public static class KeyComparator extends WritableComparator {
        protected KeyComparator() {
            super(IntPair.class, true); // true: allocate IntPair instances to deserialize into
        }

        // Sort by the first field ascending, then the second field descending.
        // Deliberately no byte-level override here: a blanket compareBytes over
        // the whole record would bypass this method and sort the second field
        // ascending as well. Without the override, WritableComparator
        // deserializes both keys and delegates to this compare.
        @Override
        public int compare(WritableComparable o1, WritableComparable o2) {
            IntPair ip1 = (IntPair) o1;
            IntPair ip2 = (IntPair) o2;
            int cmp = IntPair.compare(ip1.getFirst(), ip2.getFirst());
            if (cmp != 0)
                return cmp;
            return -IntPair.compare(ip1.getSecond(), ip2.getSecond());
        }
    }
    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new UseSecondarySort(), args);
        System.exit(ret);
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJobName("UseSecondarySort");
        job.setJarByClass(UseSecondarySort.class);
        job.setMapOutputKeyClass(IntPair.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(IntPair.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setNumReduceTasks(2);
        // The three strategies: partition, group, and sort.
        job.setPartitionerClass(FirstPartitioner.class);
        job.setGroupingComparatorClass(FirstGroupingComparator.class);
        job.setSortComparatorClass(KeyComparator.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }
}
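Assuming the two classes are packaged into a jar (the jar name below is made up), the job is submitted in the usual way, with the input and output paths as arguments:

hadoop jar secondarysort.jar UseSecondarySort /path/to/input /path/to/output

With setNumReduceTasks(2), the output directory will contain two part files, one per reducer, and the 888888 marker rows show which Reducer instance handled each group.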
Questions and discussion are welcome.