Using GenericWritable in Hadoop to reduce intermediate data and speed up joins

The usual way to implement a join in MapReduce is to attach a different source marker to records from each input.
For example, the mapper stage emits:
R1,  fromA \t original value
R2,  fromB \t original value
and the reducer stage applies different logic depending on whether a value carries fromA or fromB.
The drawback: every single record now carries an extra copy of this source information, which is wasteful when the data volume is large, and it slows the job down.
Incidentally, Hive's join works the same way: a tag-like field is appended to the value to mark its origin.
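For reference, here is a minimal sketch of such a tagged mapper (old mapred API; the "key<TAB>payload" input layout is a hypothetical assumption):

public static class TaggedMapperA extends MapReduceBase
		implements Mapper<LongWritable, Text, Text, Text> {
	@Override
	public void map(LongWritable key, Text value,
			OutputCollector<Text, Text> output, Reporter reporter)
			throws IOException {
		String[] parts = value.toString().split("\t", 2); // hypothetical layout
		if (parts.length < 2) {
			return; // skip malformed lines
		}
		// the tag "fromA" is serialized into every single output value
		output.collect(new Text(parts[0]), new Text("fromA\t" + parts[1]));
	}
}

The reducer on the other side then has to split the tag back off every value before it can do anything with the record.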

Here we use GenericWritable to improve on this. The principle is unchanged, but we exploit the fact that Hadoop itself records each value's concrete type, so the reducer can tell the sources apart by class instead of by an extra tag field.
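The saving comes from GenericWritable's wire format: it prefixes the wrapped record with a single byte, the index of the record's class in getTypes(), instead of a tag string. Paraphrased (not verbatim; field names are assumed) from the Hadoop source, the core looks roughly like this:

// Assumed fields inside GenericWritable:
//   private byte type;          // index into getTypes()
//   private Writable instance;  // the wrapped record
public void write(DataOutput out) throws IOException {
	out.writeByte(type);   // one byte identifies the source class
	instance.write(out);   // then the wrapped Writable itself
}

public void readFields(DataInput in) throws IOException {
	type = in.readByte();
	instance = ReflectionUtils.newInstance(getTypes()[type], getConf());
	instance.readFields(in);
}

So where the tagged scheme spends the six bytes of "fromA\t" on every record (plus the string parsing in the reducer), the per-record overhead here is a single byte.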

The following code is written against Hadoop 0.19 and is untested; it is meant to convey the idea rather than be production-ready.
JoinTestWithGenericWritable.java


package join;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class JoinTestWithGenericWritable extends Configured implements Tool {
	public static class JoinMapperA extends MapReduceBase implements
			Mapper<LongWritable, Text, Text, TestGenericWritable> {
		@Override
		public void map(LongWritable key, Text value,
				OutputCollector<Text, TestGenericWritable> output,
				Reporter reporter) throws IOException {
			// ... parse the real join key and payload out of value
			String realkey = "";
			ClassA a = new ClassA();
			a.set("");
			// the concrete class (ClassA) itself marks the record's source
			output.collect(new Text(realkey), new TestGenericWritable(a));
		}
	}

	public static class JoinMapperB extends MapReduceBase implements
			Mapper<LongWritable, Text, Text, TestGenericWritable> {
		@Override
		public void map(LongWritable key, Text value,
				OutputCollector<Text, TestGenericWritable> output,
				Reporter reporter) throws IOException {
			// ... parse the real join key and payload out of value
			String realkey = "";
			ClassB b = new ClassB();
			b.set("");
			// the concrete class (ClassB) itself marks the record's source
			output.collect(new Text(realkey), new TestGenericWritable(b));
		}
	}

	public static class JoinReduce extends MapReduceBase implements
			Reducer<Text, TestGenericWritable, Text, Text> {
		// reusable output buffers (unused in this skeleton)
		private static Text outkey = new Text();
		private static Text outcontent = new Text();

		@Override
		public void reduce(Text key, Iterator<TestGenericWritable> values,
				OutputCollector<Text, Text> output, Reporter reporter)
				throws IOException {
			while (values.hasNext()) {
				// unwrap the GenericWritable to recover the original record;
				// its class tells us which input it came from
				Writable oriData = values.next().get();
				if (oriData instanceof ClassA) {
					// do logic A
				} else if (oriData instanceof ClassB) {
					// do logic B
				}
			}
		}
	}

	@Override
	public int run(String[] args) throws Exception {
		JobConf job = new JobConf(getConf(), JoinTestWithGenericWritable.class);
		job.setJarByClass(JoinTestWithGenericWritable.class);
		job.setJobName("Join Test With GenericWritable...");
		job.setNumReduceTasks(5);

		job.setMapOutputValueClass(TestGenericWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		job.setReducerClass(JoinReduce.class);
		job.setOutputFormat(TextOutputFormat.class);
		MultipleInputs.addInputPath(job, new Path(args[0]), 
				TextInputFormat.class, JoinMapperA.class);
		MultipleInputs.addInputPath(job, new Path(args[1]), 
				TextInputFormat.class, JoinMapperB.class);

		FileOutputFormat.setOutputPath(job, new Path(args[2]));

		JobClient.runJob(job);
		return 0;
	}

	public static void main(String[] args) throws Exception {
		int ret = ToolRunner.run(new JoinTestWithGenericWritable(), args);
		System.exit(ret);
	}
}
TestGenericWritable.java

package join;

import org.apache.hadoop.io.GenericWritable;
import org.apache.hadoop.io.Writable;

public class TestGenericWritable extends GenericWritable {

	// The serialized type byte is an index into this array, so the order of
	// CLASSES must stay fixed once data has been written with it.
	@SuppressWarnings("unchecked")
	private static final Class<? extends Writable>[] CLASSES =
			(Class<? extends Writable>[]) new Class[] { ClassA.class, ClassB.class };

	public TestGenericWritable() {
	}

	public TestGenericWritable(Writable instance) {
		set(instance);
	}

	@Override
	protected Class<? extends Writable>[] getTypes() {
		return CLASSES;
	}
}

ClassA.java


package join;
import org.apache.hadoop.io.Text;

// Marker type: its class identity (rather than an extra tag field) tells
// the reducer that a record came from source A.
public class ClassA extends Text {

}

ClassB.java


package join;
import org.apache.hadoop.io.Text;

// Marker type for records coming from source B.
public class ClassB extends Text {

}
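With the four classes above packed into a jar (the jar name and paths below are hypothetical), the job would be launched along these lines:

hadoop jar join.jar join.JoinTestWithGenericWritable /path/inputA /path/inputB /path/output

args[0] and args[1] are the inputs that MultipleInputs wires to JoinMapperA and JoinMapperB; args[2] is the output directory.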
 
  
Reference: http://www.lichun.cc/blog/2012/05/hadoop-genericwritable-sample-usage/
