MapReduce Join - Intermediate Optimization - Using Hadoop's Built-in datajoin Package

Following up on the previous post, "MapReduce Join Operations - Basic Optimization", this post continues the discussion of joins in MapReduce, this time using the datajoin package that ships with Hadoop. In Hadoop 1.x the package lives under ${HADOOP_HOME}/contrib/datajoin; in Hadoop 2.x it is under ${HADOOP_HOME}/share/hadoop/tools/lib. Add the jar to your project and it is ready to use.
The data processed in this post is listed below; to make the comparison with the previous two posts easier, the same data set is used:
uid,name,phoneid
1,tom,40
2,jack,20
3,seven,30
4,lee,10
5,smith,20
6,张三,10
7,李四,30
8,王五,20

goodid,name
10,苹果
20,三星
30,LG
40,华为

Expected output:
lee 苹果
张三 苹果
jack 三星
smith 三星
王五 三星
seven LG
李四 LG
tom 华为
Now for the basic usage of the datajoin package.
First, the map side.
The mapper must extend the DataJoinMapperBase class:
public abstract class DataJoinMapperBase extends JobBase 

and implement the following abstract methods:

/**
   * Determine the source tag based on the input file name.
   * 
   * @param inputFile
   * @return the source tag computed from the given file name.
   */
  protected abstract Text generateInputTag(String inputFile);

  /**
   * Generate a tagged map output value. The user code can also perform
   * projection/filtering. If it decides to discard the input record when
   * certain conditions are met, it can simply return a null.
   * 
   * @param value
   * @return an object of TaggedMapOutput computed from the given value.
   */
  protected abstract TaggedMapOutput generateTaggedMapOutput(Object value);

  /**
   * Generate a map output key. The user code can compute the key
   * programmatically, not just selecting the values of some fields. In this
   * sense, it is more general than the joining capabilities of SQL.
   * 
   * @param aRecord
   * @return the group key for the given record
   */
  protected abstract Text generateGroupKey(TaggedMapOutput aRecord);
Here is what configure() and map() look like inside DataJoinMapperBase:

public void configure(JobConf job) {
    super.configure(job);
    this.job = job;
    this.inputFile = job.get(MRJobConfig.MAP_INPUT_FILE);
    // Generate the tag for this mapper's input, based on the input file name
    this.inputTag = generateInputTag(this.inputFile);
  }

  public void map(Object key, Object value,
                  OutputCollector output, Reporter reporter) throws IOException {
    if (this.reporter == null) {
      this.reporter = reporter;
    }
    // Count all input records
    addLongValue("totalCount", 1);
    // Wrap the raw input line in a TaggedMapOutput object
    TaggedMapOutput aRecord = generateTaggedMapOutput(value);
    if (aRecord == null) {
      // Count discarded (filtered-out) records
      addLongValue("discardedCount", 1);
      return;
    }
    Text groupKey = generateGroupKey(aRecord);
    if (groupKey == null) {
      // Count records whose group key is null
      addLongValue("nullGroupKeyCount", 1);
      return;
    }
    // Emit the group key together with the tagged record
    output.collect(groupKey, aRecord);
    addLongValue("collectedCount", 1);
  }
  // Increment the in-memory counter identified by name by inc
  protected Long addLongValue(Object name, long inc) {
    Long val = this.longCounters.get(name);
    Long retv = null;
    if (val == null) {
      retv = Long.valueOf(inc);
    } else {
      retv = Long.valueOf(val.longValue() + inc);
    }
    this.longCounters.put(name, retv);
    return retv;
  }
With the map() flow understood, the first thing to write is the value class that carries the tagged records; it must extend TaggedMapOutput. Reference code with comments follows:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class TaggedWritable extends TaggedMapOutput {
	/**
	 * With a naive TaggedWritable (for example, no no-arg constructor, or the
	 * data field left uninitialized during deserialization) the reducer fails
	 * with:
	 *   Error: java.lang.NullPointerException
	 *     at com.seven.mapreduce.join.TaggedWritable.readFields(TaggedWritable.java:32)
	 *     at org.apache.hadoop.contrib.utils.join.DataJoinReducerBase.regroup(DataJoinReducerBase.java:106)
	 * See http://stackoverflow.com/questions/10201500/hadoop-reduce-side-join-using-datajoin
	 * for the fix applied in this class.
	 */
	private Writable data;

	public TaggedWritable() {
		this.tag = new Text();
	}
	public TaggedWritable(Writable data) { 
		this.tag = new Text("");
        this.data = data;  
    }
	public void setData(Writable data) {
		this.data = data;
	}
	public void readFields(DataInput arg0) throws IOException {
		this.tag.readFields(arg0);
		String dataClz = arg0.readUTF();
		/**
		 * Re-create the data instance from the class name written during
		 * serialization, then deserialize into it.
		 */
        if (this.data == null
                || !this.data.getClass().getName().equals(dataClz)) {
            try {
				this.data = (Writable) ReflectionUtils.newInstance(Class.forName(dataClz), null);
			} catch (ClassNotFoundException e) {
				e.printStackTrace();
			}
        }
		this.data.readFields(arg0);
	}
	public void write(DataOutput arg1) throws IOException {
		this.tag.write(arg1);
		/**
		 * Write the class name so that readFields() can re-create the
		 * correct type on deserialization.
		 */
		arg1.writeUTF(this.data.getClass().getName());
		this.data.write(arg1);
	}
	@Override
	public Writable getData() {
		return data;
	}
}
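To convince yourself that the no-arg constructor plus the written class name really avoids the NullPointerException during deserialization, a quick local round-trip check such as the sketch below can help (a hypothetical standalone main(), not part of the job itself):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.Text;

public class TaggedWritableRoundTrip {
	public static void main(String[] args) throws Exception {
		// Serialize a tagged record the same way the map output is written.
		TaggedWritable out = new TaggedWritable(new Text("1,tom,40"));
		out.setTag(new Text("12"));
		ByteArrayOutputStream bytes = new ByteArrayOutputStream();
		out.write(new DataOutputStream(bytes));

		// Deserialize into a fresh instance created via the no-arg
		// constructor, exactly as the reducer side does; without the
		// class-name trick this readFields() call is where the NPE occurs.
		TaggedWritable in = new TaggedWritable();
		in.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
		System.out.println(in.getTag() + " -> " + in.getData()); // 12 -> 1,tom,40
	}
}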
With TaggedWritable in place, the map-side program looks like this:

import org.apache.hadoop.contrib.utils.join.DataJoinMapperBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
import org.apache.hadoop.io.Text;

public class JoinMapper extends DataJoinMapperBase {
	@Override
	protected Text generateInputTag(String inputFile) {
		/**
		 * Generate the tag for this mapper's input: here, simply the name of
		 * the input file.
		 */
		String tagTmp = inputFile.substring(inputFile.lastIndexOf("/") + 1);
		return new Text(tagTmp); 
	}
	@Override
	protected TaggedMapOutput generateTaggedMapOutput(Object value) {
		TaggedWritable retv = new TaggedWritable((Text) value);
		/**
		 * inputTag is inherited from DataJoinMapperBase and is initialized
		 * from the input file name in configure().
		 */
        retv.setTag(this.inputTag);  
        return retv;
	}
	@Override
	protected Text generateGroupKey(TaggedMapOutput aRecord) {
		/**
		 * Generate the group key. When several files join on different
		 * columns, use inputTag here to decide which column to pick. In this
		 * example the user file is named "12" and the goods file "122".
		 */
		String line = ((Text) aRecord.getData()).toString();  
        String[] tokens = line.split(","); 
        String groupKey = null;
        if(this.inputTag.toString().equals("12")){
        	groupKey = tokens[2];
        } else if (this.inputTag.toString().equals("122")){
        	groupKey = tokens[0];
		}
        return new Text(groupKey); 
	}
}
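Note that the tags "12" and "122" above are literally the names of the two input files in this example, so the group-key logic is tied to those file names. If you would rather not hard-code file names, a variant along the lines of the sketch below tags records by a file-name prefix instead (the file names users.txt and goods.txt and the class name PrefixTagJoinMapper are only illustrative assumptions):

import org.apache.hadoop.contrib.utils.join.DataJoinMapperBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
import org.apache.hadoop.io.Text;

public class PrefixTagJoinMapper extends DataJoinMapperBase {
	@Override
	protected Text generateInputTag(String inputFile) {
		// Tag by file-name prefix instead of the full, hard-coded file name.
		String name = inputFile.substring(inputFile.lastIndexOf("/") + 1);
		return new Text(name.startsWith("users") ? "users" : "goods");
	}
	@Override
	protected TaggedMapOutput generateTaggedMapOutput(Object value) {
		TaggedWritable retv = new TaggedWritable((Text) value);
		retv.setTag(this.inputTag);
		return retv;
	}
	@Override
	protected Text generateGroupKey(TaggedMapOutput aRecord) {
		String[] tokens = ((Text) aRecord.getData()).toString().split(",");
		// users.txt: uid,name,phoneid -> join on phoneid (index 2)
		// goods.txt: goodid,name      -> join on goodid  (index 0)
		return new Text("users".equals(this.inputTag.toString()) ? tokens[2] : tokens[0]);
	}
}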
Next comes the reduce side. The internals of DataJoinReducerBase are not covered here; the next post analyzes the execution flow of the whole package in detail. For now it is enough to extend DataJoinReducerBase and implement the combine() method.
import org.apache.hadoop.contrib.utils.join.DataJoinReducerBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
import org.apache.hadoop.io.Text;
public class JoinReducer extends DataJoinReducerBase {
	/**
	 * combine() filters out unwanted combinations, which is how the desired
	 * join semantics (inner join, left join, etc.) are obtained. It is also
	 * the place to format the result for output (column order, de-duplication
	 * and so on).
	 */
	@Override
	protected TaggedMapOutput combine(Object[] tags, Object[] values) {
		/**
		 * Inner join: only emit a result when the key appears in both files.
		 */
        if (tags.length < 2) return null;
        String joinedStr = "";   
        for (int i=0; i<values.length; i++) {  
            if (i > 0) joinedStr += ",";  
            TaggedWritable tw = (TaggedWritable) values[i];
            String line = ((Text) tw.getData()).toString();
            String[] tokens = line.split(",");
            /**
             * Depending on the tag, take the field to join from each file:
             * "12" is the user information file and "122" is the phone (goods)
             * information file. In both files the name is the second column,
             * so both branches happen to pick tokens[1].
             */
            if (tw.getTag().toString().equals("12")) {
                joinedStr += tokens[1];   // user name
            } else {
                joinedStr += tokens[1];   // phone name
            }
        }  
        TaggedWritable retv = new TaggedWritable(new Text(joinedStr));  
        retv.setTag((Text) tags[0]);   
        return retv;  
	}
}
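Since combine() is where the join semantics are decided, switching to another join type only means changing this one method. For example, a left outer join that keeps every user record even when no phone record matches could look roughly like the sketch below (LeftJoinReducer is an illustrative name; the file-name tags "12" and "122" are the same as above):

import org.apache.hadoop.contrib.utils.join.DataJoinReducerBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
import org.apache.hadoop.io.Text;

public class LeftJoinReducer extends DataJoinReducerBase {
	@Override
	protected TaggedMapOutput combine(Object[] tags, Object[] values) {
		// Left outer join: keep every user record, even when no goods record
		// matches its phoneid.
		String userName = null;
		String goodsName = null;
		for (int i = 0; i < values.length; i++) {
			TaggedWritable tw = (TaggedWritable) values[i];
			String[] tokens = ((Text) tw.getData()).toString().split(",");
			if ("12".equals(tw.getTag().toString())) {
				userName = tokens[1];   // user file: uid,name,phoneid
			} else {
				goodsName = tokens[1];  // goods file: goodid,name
			}
		}
		if (userName == null) {
			return null;                // goods record with no matching user: drop it
		}
		String joined = userName + "," + (goodsName == null ? "NULL" : goodsName);
		TaggedWritable retv = new TaggedWritable(new Text(joined));
		retv.setTag((Text) tags[0]);
		return retv;
	}
}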
The driver program:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class JobMain extends Configured implements Tool {
	public int run(String[] args) throws Exception {
		Configuration conf = getConf();  
        JobConf job = new JobConf(conf, JobMain.class);  
        Path in = new Path(args[0]);  
        Path out = new Path(args[1]);  
        FileInputFormat.setInputPaths(job, in);
        /**
         * To join files that live under several directories, the input paths
         * can be passed together, e.g.:
         */
        //FileInputFormat.setInputPaths(job, args[0]+ "," + args[1]);
        FileOutputFormat.setOutputPath(job, out);  
        job.setJobName("DataJoin");  
        job.setMapperClass(JoinMapper.class);  
        job.setReducerClass(JoinReducer.class);  
        job.setInputFormat(TextInputFormat.class);  
        job.setOutputFormat(TextOutputFormat.class);  
        job.setOutputKeyClass(Text.class);  
        job.setOutputValueClass(TaggedWritable.class);  
        job.set("mapred.textoutputformat.separator", ",");  
        JobClient.runJob(job);   
        return 0;
	}
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(
				new Configuration(),  
                new JobMain(),  
                args);  
		System.exit(res); 
	}
}
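One detail in the driver worth noting: in this example both input files sit in the same directory, so a single input path is enough. If the two datasets lived in different directories, the commented-out line in run() hints at the alternative; FileInputFormat.setInputPaths also accepts several Path arguments, for example (hypothetical paths):

        // Hypothetical: the user file and the goods file kept in separate directories.
        FileInputFormat.setInputPaths(job,
                new Path("/input/users"),
                new Path("/input/goods"));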
Running the job:

(Screenshot: user information input table)

(Screenshot: phone information input table)

Run command:

 ./hadoop jar mr.jar com.seven.mapreduce.join.JobMain /input/eight /output/night00

Run result: (screenshot)


Summary:

This is the join approach that ships with Hadoop (the datajoin package). It is a general-purpose way of writing joins: once you are familiar with it, a join can be put together quickly, but there is still room to improve execution efficiency. The next post covers the optimized implementation of this functionality described in "Hadoop in Practice" (《Hadoop硬实战》).
