The usual way to implement a join in MapReduce is, in principle, to tag records from different sources with distinct markers.
For example, the map phase emits
R1, fromA \t original value
R2, fromB \t original value
and the reduce phase
branches on the fromA/fromB marker inside each value to handle the two sources differently.
The drawback is that every record carries an extra copy of this source information; with very large data volumes that is wasteful and also slows the job down.
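A minimal sketch of this tagged approach, for contrast (class names and the assumption that the first tab-separated field is the join key are hypothetical; the mapper for source B is identical except for the "fromB" prefix, and the imports are the same as in the full listing further down):

// Tagged join (hypothetical sketch): the mapper prefixes every value with
// a source marker; the mapper for source B uses "fromB" instead.
public class TaggedMapperA extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable key, Text value,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        // assume the first tab-separated field is the join key
        String[] parts = value.toString().split("\t", 2);
        if (parts.length < 2) return;
        output.collect(new Text(parts[0]), new Text("fromA\t" + parts[1]));
    }
}

// The reducer has to inspect the marker carried by every single value.
public class TaggedReduce extends MapReduceBase
        implements Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterator<Text> values,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        while (values.hasNext()) {
            String v = values.next().toString();
            if (v.startsWith("fromA\t")) {
                // handle a record from source A
            } else if (v.startsWith("fromB\t")) {
                // handle a record from source B
            }
        }
    }
}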
Incidentally, a look at how Hive performs joins shows that it likewise appends a tag-like marker to the value to identify the source table.
Here we use GenericWritable to improve on this. The principle is still the same, but we exploit the fact that Hadoop records the type of each value anyway, and distinguish the sources by class instead of by an extra tag field.
The following code was written against Hadoop 0.19 and has not been tested; it is only meant to convey the idea.
JoinTestWithGenericWritable.java
package join;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class JoinTestWithGenericWritable extends Configured implements Tool {
    public static class JoinMapperA extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, TestGenericWritable> {
        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<Text, TestGenericWritable> output, Reporter reporter)
                throws IOException {
            // ... parse the real join key and payload from the input record
            String realkey = "";
            ClassA a = new ClassA();
            a.set("");
            // wrapping the value in ClassA marks it as coming from source A
            output.collect(new Text(realkey), new TestGenericWritable(a));
        }
    }
    public static class JoinMapperB extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, TestGenericWritable> {
        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<Text, TestGenericWritable> output,
                Reporter reporter) throws IOException {
            // ... parse the real join key and payload from the input record
            String realkey = "";
            ClassB b = new ClassB();
            b.set("");
            // wrapping the value in ClassB marks it as coming from source B
            output.collect(new Text(realkey), new TestGenericWritable(b));
        }
    }
    public static class JoinReduce extends MapReduceBase implements
            Reducer<Text, TestGenericWritable, Text, Text> {
        private static Text outkey = new Text();
        private static Text outcontent = new Text();
        @Override
        public void reduce(Text key, Iterator<TestGenericWritable> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            while (values.hasNext()) {
                // unwrap the GenericWritable to recover the concrete instance;
                // instanceof on the wrapper itself would never match
                Writable oriData = values.next().get();
                if (oriData instanceof ClassA) {
                    // do logic A
                } else if (oriData instanceof ClassB) {
                    // do logic B
                }
            }
        }
    }
    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), JoinTestWithGenericWritable.class);
        job.setJarByClass(JoinTestWithGenericWritable.class);
        job.setJobName("Join Test With GenericWritable...");
        job.setNumReduceTasks(5);
        // map output values are the GenericWritable wrapper, not Text
        job.setMapOutputValueClass(TestGenericWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setReducerClass(JoinReduce.class);
        job.setOutputFormat(TextOutputFormat.class);
        // each source gets its own mapper; no tag field is needed in the values
        MultipleInputs.addInputPath(job, new Path(args[0]),
                TextInputFormat.class, JoinMapperA.class);
        MultipleInputs.addInputPath(job, new Path(args[1]),
                TextInputFormat.class, JoinMapperB.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new JoinTestWithGenericWritable(), args);
        System.exit(ret);
    }
}
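Assuming the classes are packaged into a jar (the jar name and paths below are invented for illustration), the job would be launched as:

hadoop jar join.jar join.JoinTestWithGenericWritable /path/to/inputA /path/to/inputB /path/to/output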
TestGenericWritable.java
package join;
import org.apache.hadoop.io.GenericWritable;
import org.apache.hadoop.io.Writable;
public class TestGenericWritable extends GenericWritable {
    // the concrete classes this wrapper may carry; GenericWritable
    // serializes a value as the index of its class in this array,
    // followed by the value itself
    private static Class<? extends Writable>[] CLASSES = null;
    static {
        CLASSES = (Class<? extends Writable>[]) new Class[] { ClassA.class,
                ClassB.class };
    }
    public TestGenericWritable() {
    }
    public TestGenericWritable(Writable instance) {
        set(instance);
    }
    @Override
    protected Class<? extends Writable>[] getTypes() {
        return CLASSES;
    }
}
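To make the saving concrete, here is a hypothetical standalone round-trip check (the class name GenericWritableRoundTrip and the "payload" string are invented for illustration). GenericWritable writes a single byte, the index of the instance's class in getTypes(), followed by the wrapped value itself, so the per-record cost of identifying the source drops to one byte:

package join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
public class GenericWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        ClassA a = new ClassA();
        a.set("payload");
        TestGenericWritable written = new TestGenericWritable(a);
        DataOutputBuffer out = new DataOutputBuffer();
        written.write(out); // one type-index byte, then the Text payload

        TestGenericWritable read = new TestGenericWritable();
        read.setConf(new Configuration()); // needed to instantiate the wrapped type
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        read.readFields(in);
        System.out.println(read.get() instanceof ClassA); // prints "true"
    }
}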
ClassA.java
package join;
import org.apache.hadoop.io.Text;
public class ClassA extends Text {
}
ClassB.java
package join;
import org.apache.hadoop.io.Text;
public class ClassB extends Text {
}
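ClassA and ClassB carry no logic of their own: GenericWritable tells records apart by their concrete class, so each source needs its own Writable subclass even though both are just Text underneath.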
Reference: http://www.lichun.cc/blog/2012/05/hadoop-genericwritable-sample-usage/