MapReduce - Custom Comparators

Following up on the previous post, MapReduce-三次排序-曾经想不通的二次排序, this post changes that example's requirements to show how to customize the GroupingComparatorClass and the SortComparatorClass.
The rest of the post walks through the solution.
The test data is as follows:
id,name,cost
1,mr1,3234
2,mr2,123
3,mr3,9877
4,mr4,348
5,mr5,12345
6,mr6,6646
7,mr7,98
8,mr8,12345
1,mr1,334
2,mr2,3123
3,mr3,97
4,mr4,231
5,mr5,122
6,mr6,3455
7,mr7,1222
8,mr8,12345
4,mr4,123
Goal:
For each user, collect all of their costs and list them from high to low:
mr1     3234,334
mr2     3123,123
mr3     9877,97
mr4     348,231,123
mr5     12345,122
mr6     6646,3455
mr7     1222,98
mr8     12345,12345

Customization steps:

1. How to define a custom key sort comparator class. This comparator determines the order in which keys are sorted during the shuffle's sort phase. It extends WritableComparator:
public static class SortComparator extends WritableComparator
It must have a constructor and must override public int compare(WritableComparable w1, WritableComparable w2).
An alternative is to implement the RawComparator interface (see the sketch after this list).
Register it on the job with setSortComparatorClass.
2. How to define a custom grouping comparator class.
In the reduce phase, when the value iterator for a key is constructed, all records whose first field is equal belong to the same group and end up in one value iterator. This is also a comparator, and it likewise extends WritableComparator:
public static class GroupingComparator extends WritableComparator
As with the sort comparator class, it must have a constructor and must override public int compare(WritableComparable w1, WritableComparable w2).
As with the sort comparator class, the grouping comparator can alternatively implement the RawComparator interface.
Register it on the job with setGroupingComparatorClass.
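
Below is a sketch of the RawComparator alternative mentioned above. A raw comparator works on the serialized bytes directly and avoids deserializing the keys, which is why Hadoop's built-in comparators use it. The class is my own illustration, not part of the original example: it reads the 2-byte length prefix that writeUTF emits, compares the name bytes (byte order matches String.compareTo for ASCII names like mr1..mr8), then compares the trailing int in descending order. It would be registered the same way, via job.setSortComparatorClass(RawSortComparator.class).

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

public class RawSortComparator implements RawComparator<SecondSortClass> {
	@Override
	public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
		// writeUTF stores a 2-byte unsigned length followed by the string bytes
		int strLen1 = ((b1[s1] & 0xff) << 8) | (b1[s1 + 1] & 0xff);
		int strLen2 = ((b2[s2] & 0xff) << 8) | (b2[s2 + 1] & 0xff);
		// compare the name bytes; for ASCII names this matches String.compareTo
		int cmp = WritableComparator.compareBytes(b1, s1 + 2, strLen1, b2, s2 + 2, strLen2);
		if (cmp != 0) {
			return cmp;
		}
		// the int written by writeInt follows the string bytes; negate for descending order
		int second1 = WritableComparator.readInt(b1, s1 + 2 + strLen1);
		int second2 = WritableComparator.readInt(b2, s2 + 2 + strLen2);
		return -Integer.compare(second1, second2);
	}
	@Override
	public int compare(SecondSortClass a, SecondSortClass b) {
		int cmp = a.getFirst().compareTo(b.getFirst());
		return cmp != 0 ? cmp : -Integer.compare(a.getSecond(), b.getSecond());
	}
}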

Custom key:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class SecondSortClass implements WritableComparable<SecondSortClass> {
	/**
	 * Fields carried by the custom key type; in this example both fields
	 * participate in sorting. Later examples will add fields for other purposes.
	 */
	private String first;
	private int second;
	public SecondSortClass() {}
	public SecondSortClass(String first, int second) {
		this.first = first;
		this.second = second;
	}
	/**
	 * Deserialization: rebuild the custom key from the binary stream
	 */
	@Override
	public void readFields(DataInput input) throws IOException {
		this.first = input.readUTF();
		this.second = input.readInt();
	}
	/**
	 * Serialization: turn the custom key into binary form for transmission
	 */
	@Override
	public void write(DataOutput output) throws IOException {
		output.writeUTF(first);
		output.writeInt(second);
	}

	public String getFirst() {
		return first;
	}

	public void setFirst(String first) {
		this.first = first;
	}

	public int getSecond() {
		return second;
	}

	public void setSecond(int second) {
		this.second = second;
	}
	/**
	 * Intentionally left unimplemented; ordering is delegated to the
	 * SortComparator registered on the job. If no sort comparator were set,
	 * returning 0 here would leave the keys effectively unsorted.
	 */
	@Override
	public int compareTo(SecondSortClass o) {
		return 0;
	}
}
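
One caveat the original example does not cover: if the job runs with more than one reducer, the default HashPartitioner partitions on the key's hashCode(), and SecondSortClass inherits Object.hashCode(), so records for the same user could be scattered across reducers. A minimal sketch of the two methods you would add to SecondSortClass in that case (my addition; with the single default reducer it is not needed):

	// Hash only on the grouping field so the default HashPartitioner routes
	// all records of one user to the same reducer
	@Override
	public int hashCode() {
		return first.hashCode();
	}
	@Override
	public boolean equals(Object obj) {
		if (!(obj instanceof SecondSortClass)) {
			return false;
		}
		SecondSortClass other = (SecondSortClass) obj;
		return first.equals(other.first) && second == other.second;
	}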
Custom sort comparator:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class SortComparator extends WritableComparator {
	public SortComparator() {
		super(SecondSortClass.class, true);
	}
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		SecondSortClass a1 = (SecondSortClass)a;
		SecondSortClass b1 = (SecondSortClass)b;
		/**
		 * Sort by the first field first; when the first fields are equal,
		 * sort by the second field in descending order
		 */
		if(!a1.getFirst().equals(b1.getFirst())) {
			return a1.getFirst().compareTo(b1.getFirst());
		} else {
			// Integer.compare avoids the overflow risk of int subtraction;
			// the leading minus gives descending order
			return -Integer.compare(a1.getSecond(), b1.getSecond());
		}
	}
}
Custom grouping comparator:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class GroupingComparator extends WritableComparator {
	public GroupingComparator() {
		super(SecondSortClass.class, true);
	}
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		SecondSortClass a1 = (SecondSortClass) a;
		SecondSortClass b1 = (SecondSortClass) b;
		/**
		 * Group only by the first field
		 */
		return a1.getFirst().compareTo(b1.getFirst());
	}
}
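
To see how the two comparators interact on the test data: after the sort phase, user mr4's keys arrive as (mr4,348), (mr4,231), (mr4,123), because SortComparator orders equal names by descending cost. GroupingComparator then compares only the first field, so these three distinct keys form a single group; one reduce call receives the value iterator 348, 231, 123 and produces the output line mr4 348,231,123.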
Map phase:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SecondMapper extends Mapper<LongWritable, Text, SecondSortClass, IntWritable> {
	private IntWritable cost = new IntWritable();
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString().trim();
		if(line.length() > 0) {
			String[] arr = line.split(",");
			if(arr.length == 3) {
				// arr[1] is the user name, arr[2] the cost; the cost serves both as
				// part of the composite key (for sorting) and as the map output value
				int costValue = Integer.parseInt(arr[2]);
				cost.set(costValue);
				context.write(new SecondSortClass(arr[1], costValue), cost);
			}
		}
	}
}
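
A side note, not from the original post: context.write() serializes the key into the map output buffer immediately, so it would also be safe to reuse one SecondSortClass instance across map() calls (as is already done for cost) instead of allocating a new key per record; on large inputs that reduces garbage-collection pressure.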
Reduce phase:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SecondReducer extends Reducer<SecondSortClass, IntWritable, Text, Text> {

	private Text okey = new Text();
	private Text ovalue = new Text();
	@Override
	protected void reduce(SecondSortClass key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		StringBuilder sb = new StringBuilder();
		/**
		 * Concatenate all of this user's costs; the grouping comparator
		 * guarantees they arrive in one reduce call, already sorted descending
		 */
		for ( IntWritable value : values  ) {
			sb.append(",");
			sb.append(value.get());
		}
		// drop the leading comma
		sb.delete(0, 1);
		okey.set(key.getFirst());
		ovalue.set(sb.toString());
		context.write(okey, ovalue);
	}
}
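
One subtlety to be aware of when a grouping comparator merges distinct keys into a single reduce call: Hadoop reuses the key object and updates it in place as the value iterator advances, so key always reflects the record behind the current value. A small sketch (my addition) of a loop body that would demonstrate this inside the reduce() above:

		// `key` is updated in place as the iterator advances; for user mr4
		// this would print 348=348, then 231=231, then 123=123
		for (IntWritable value : values) {
			System.out.println(key.getSecond() + "=" + value.get());
		}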
Driver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMain {
	public static void main(String[] args) throws Exception{
		Configuration configuration = new Configuration();
		// Job.getInstance replaces the constructor new Job(...), which is deprecated in Hadoop 2.x
		Job job = Job.getInstance(configuration, "sort-grouping-job");
		job.setJarByClass(JobMain.class);
		job.setMapperClass(SecondMapper.class);
		job.setMapOutputKeyClass(SecondSortClass.class);
		job.setMapOutputValueClass(IntWritable.class);
		// register the sort comparator
		job.setSortComparatorClass(SortComparator.class);
		// register the grouping comparator
		job.setGroupingComparatorClass(GroupingComparator.class);
		job.setReducerClass(SecondReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		Path outputDir = new Path(args[1]);
		FileSystem fs = FileSystem.get(configuration);
		// delete a pre-existing output directory so reruns do not fail
		if(fs.exists(outputDir)) {
			fs.delete(outputDir, true);
		}
		FileOutputFormat.setOutputPath(job, outputDir);
		System.exit(job.waitForCompletion(true)? 0: 1);
	}
}
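
Assuming the classes are packaged into a jar (the jar name and HDFS paths below are hypothetical), the job could be launched like this:

hadoop jar second-sort.jar JobMain /input/cost.txt /output/second-sort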
Run result: (screenshot omitted; the output matches the expected result listed above)

Conclusion:
This post only briefly shows how to customize the sort comparator and the grouping comparator. The example could also be extended with TopK functionality, i.e. emitting only the N highest costs per user; that is not implemented here, but if you are interested you can build it yourself based on MapReduce-TopK. The next post will show how to customize a partitioner, using an example that separately sums the odd-numbered and even-numbered lines of the data.

