hadoop mapreduce作业通过组合key实现二次排序的过程中,只要实现组合key的类就可以了。mapreduce框架本身会基于key对输出进行排序。
而partition函数只是为了在数据规模较大时对map的输出进行分区,为启动多个reduce任务做准备。
group函数也是可有可无的。
group函数的作用是对key进行分组,例如对于map的结果:
[(k1, k21), v1]
[(k1, k22), v2]
[(k1, k23), v3]
通过设定group函数可以做到按组合key的k1进行分组:
[[(k1, k21), (k1, k22), (k1, k23)], [v1, v2, v3]]
下面验证结果记录:
输入文件file1:
1 2
6 4
1 4
2 5
3 1
4 3
2 6
4 1
5 5
2 1
6 8
1 9
6 1
package hadoop;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValue.RawBytesComparator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class DDSort {
public static class Map extends Mapper{
@Override
protected void map(
LongWritable key,
Text value,
Mapper.Context context)
throws IOException, InterruptedException {
String str = value.toString();
String [] sz = str.split("\t");
if(sz.length == 2){
int v1 = Integer.parseInt(sz[0]);
int v2 = Integer.parseInt(sz[1]);
DataPair dp = new DataPair(v1, v2);
//context.write(new IntWritable(v1), new IntWritable(v2));
context.write(dp, new IntWritable(v2));
}
}
}
public static class DataPair implements WritableComparable {
private int v1;
private int v2;
public DataPair(){
}
public DataPair(int arg1, int arg2){
v1 = arg1;
v2 = arg2;
}
public int getV1() {
return v1;
}
public void setV1(int v1) {
this.v1 = v1;
}
public int getV2() {
return v2;
}
public void setV2(int v2) {
this.v2 = v2;
}
@Override
public String toString() {
return new Integer(v1).toString() + " " + new Integer(v2).toString();
}
@Override
public void readFields(DataInput in) throws IOException {
// TODO Auto-generated method stub
v1 = in.readInt();
v2 = in.readInt();
}
@Override
public void write(DataOutput out) throws IOException {
// TODO Auto-generated method stub
out.writeInt(v1);
out.writeInt(v2);
}
@Override
public int compareTo(DataPair o) {
int tmp = v1 - o.getV1();
if(tmp != 0){
return tmp;
}
return v2 - o.getV2();
}
static {
//WritableComparator.define(DataPair.class, new Comparator());
}
}
// public class Comparator extends WritableComparator{
// public Comparator() {
// super(DataPair.class);
// }
//
// @Override
// public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3,
// int arg4, int arg5) {
// // TODO Auto-generated method stub
// return super.compare(arg0, arg1, arg2, arg3, arg4, arg5);
// }
// }
public static class Prt extends Partitioner{
@Override
public int getPartition(DataPair arg0, IntWritable arg1, int arg2) {
return arg0.getV1()%arg2;
}
}
public static class Grp implements RawComparator{
@Override
public int compare(DataPair o1, DataPair o2) {
return o1.v1 - o2.v1;
}
@Override
public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3,
int arg4, int arg5) {
return WritableComparator.compareBytes(arg0, arg1, 4, arg3, arg4, 4);
}
}
public static class Grp1 extends WritableComparator{
protected Grp1() {
super(DataPair.class);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
DataPair d1 = (DataPair)a;
DataPair d2 = (DataPair)b;
return Integer.compare(d1.getV1(), d2.getV1());
}
}
public static class Reduce extends Reducer{
@Override
protected void reduce(
DataPair key,
Iterable value,
Reducer.Context context)
throws IOException, InterruptedException {
//context.write(key, new IntWritable(1));
int test = 1000; //for test
for(IntWritable i : value){
context.write(key, i);
}
System.out.println(++test); //for test
context.write(key, new IntWritable(test)); //for test
}
}
private static String inputPath = "in-ddsort";
private static String outputPath = "out-ddsort";
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
Job job = new Job(conf, "DDSort");
job.setJarByClass(DDSort.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setPartitionerClass(Prt.class);
job.setNumReduceTasks(6);
//job.setGroupingComparatorClass(Grp.class);
job.setMapOutputKeyClass(DataPair.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(DataPair.class);
job.setOutputValueClass(IntWritable.class);
FileSystem fs = FileSystem.get(conf);
Path outPath = new Path(outputPath);
if (fs.exists(outPath)) {
fs.delete(outPath, true);
}
FileInputFormat.addInputPath(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
1、不配值partition和group函数的情况:
// Experiment 1: neither the partitioner nor the grouping comparator is set,
// and the reduce-task count is left at its default (a single reducer here).
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount ");
System.exit(2);
}
Job job = new Job(conf, "DDSort");
job.setJarByClass(DDSort.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
// Partitioner, reducer count and grouping comparator intentionally left
// unset for this experiment:
//job.setPartitionerClass(Prt.class);
//job.setNumReduceTasks(6);
//job.setGroupingComparatorClass(Grp.class);
job.setMapOutputKeyClass(DataPair.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(DataPair.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
对应的输出结果为只有一个输出文件:
-rw-r--r-- 3 root supergroup 0 2017-10-10 21:02 /user/root/out-ddsort21/_SUCCESS
drwxr-xr-x - root supergroup 0 2017-10-10 21:01 /user/root/out-ddsort21/_logs
-rw-r--r-- 3 root supergroup 78 2017-10-10 21:02 /user/root/out-ddsort21/part-r-00000
part-r-00000内容为:
1 2 2
1 2 1001
1 4 4
1 4 1001
1 9 9
1 9 1001
2 1 1
2 1 1001
2 5 5
2 5 1001
2 6 6
2 6 1001
3 1 1
3 1 1001
4 1 1
4 1 1001
4 3 3
4 3 1001
5 5 5
5 5 1001
6 1 1
6 1 1001
6 4 4
6 4 1001
6 8 8
6 8 1001
可见一个[(k1, k21), v1]属于一组。
2、只配置partition
// Experiment 2: custom partitioner plus 6 reduce tasks, but still no grouping
// comparator — each distinct (v1, v2) key forms its own reduce group.
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount ");
System.exit(2);
}
Job job = new Job(conf, "DDSort");
job.setJarByClass(DDSort.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setPartitionerClass(Prt.class);
job.setNumReduceTasks(6);
// Grouping comparator still disabled in this experiment:
//job.setGroupingComparatorClass(Grp.class);
job.setMapOutputKeyClass(DataPair.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(DataPair.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Found 8 items
-rw-r--r-- 3 root supergroup 0 2017-10-10 21:34 /user/root/out-ddsort22/_SUCCESS
drwxr-xr-x - root supergroup 0 2017-10-10 21:33 /user/root/out-ddsort22/_logs
-rw-r--r-- 3 root supergroup 18 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00000
-rw-r--r-- 3 root supergroup 18 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00001
-rw-r--r-- 3 root supergroup 18 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00002
-rw-r--r-- 3 root supergroup 6 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00003
-rw-r--r-- 3 root supergroup 12 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00004
-rw-r--r-- 3 root supergroup 6 2017-10-10 21:33 /user/root/out-ddsort22/part-r-00005
part-r-00000:
6 1 1
6 1 1001
6 4 4
6 4 1001
6 8 8
6 8 1001
part-r-00001:
1 2 2
1 2 1001
1 4 4
1 4 1001
1 9 9
1 9 1001
part-r-00003:
2 1 1
2 1 1001
2 5 5
2 5 1001
2 6 6
2 6 1001
.......
可见一个[(k1, k21), v1]属于一组。
3、配置partition和group
// Experiment 3: partitioner AND grouping comparator both set — keys sharing
// v1 now collapse into a single reduce group per reducer.
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount ");
System.exit(2);
}
Job job = new Job(conf, "DDSort");
job.setJarByClass(DDSort.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setPartitionerClass(Prt.class);
job.setNumReduceTasks(6);
// Group values by v1 only (compares the first 4 serialized bytes of the key):
job.setGroupingComparatorClass(Grp.class);
job.setMapOutputKeyClass(DataPair.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(DataPair.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Found 8 items
-rw-r--r-- 3 root supergroup 0 2017-10-10 22:04 /user/root/out-ddsort23/_SUCCESS
drwxr-xr-x - root supergroup 0 2017-10-10 22:03 /user/root/out-ddsort23/_logs
-rw-r--r-- 3 root supergroup 18 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00000
-rw-r--r-- 3 root supergroup 18 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00001
-rw-r--r-- 3 root supergroup 18 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00002
-rw-r--r-- 3 root supergroup 6 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00003
-rw-r--r-- 3 root supergroup 12 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00004
-rw-r--r-- 3 root supergroup 6 2017-10-10 22:03 /user/root/out-ddsort23/part-r-00005
part-r-00000:
6 1 1
6 4 4
6 8 8
6 8 1001
part-r-00001:
1 2 2
1 4 4
1 9 9
1 9 1001
part-r-00002:
2 1 1
2 5 5
2 6 6
2 6 1001
.......
可见k1相同的属于一组,以下的属于同一组:
[(k1, k21), v1]
[(k1, k22), v2]
[(k1, k23), v3]
这就是group函数的作用。注意:
public static class Reduce extends Reducer{
@Override
protected void reduce(
DataPair key,
Iterable value,
Reducer.Context context)
throws IOException, InterruptedException {
//context.write(key, new IntWritable(1));
int test = 1000; //for test
for(IntWritable i : value){ //只迭代了value
context.write(key, i);
}
System.out.println(++test); //for test
context.write(key, new IntWritable(test)); //for test
}
}
此处只迭代了value,可从输出来看,for循环中key的值也发生了变化!?
原因是Hadoop在reduce端为了减少对象创建而复用对象:整个分组迭代过程中始终只有同一个key实例,框架在迭代values时把组内下一条记录反序列化进这个key实例,因此随着迭代推进,key的字段也被同步更新。若需要保留某次迭代时的key,应自行拷贝一份。