and implement the following methods:
/**
 * Determine the source tag based on the input file name.
 *
 * @param inputFile
 * @return the source tag computed from the given file name.
 */
protected abstract Text generateInputTag(String inputFile);

/**
 * Generate a tagged map output value. The user code can also perform
 * projection/filtering. If it decides to discard the input record when
 * certain conditions are met, it can simply return a null.
 *
 * @param value
 * @return an object of TaggedMapOutput computed from the given value.
 */
protected abstract TaggedMapOutput generateTaggedMapOutput(Object value);

/**
 * Generate a map output key. The user code can compute the key
 * programmatically, not just selecting the values of some fields. In this
 * sense, it is more general than the joining capabilities of SQL.
 *
 * @param aRecord
 * @return the group key for the given record
 */
protected abstract Text generateGroupKey(TaggedMapOutput aRecord);

Next, let's look at how configure() and map() are executed:
public void configure(JobConf job) {
    super.configure(job);
    this.job = job;
    this.inputFile = job.get(MRJobConfig.MAP_INPUT_FILE);
    // Generate the tag for the data handled by this map task
    this.inputTag = generateInputTag(this.inputFile);
}

public void map(Object key, Object value,
                OutputCollector output, Reporter reporter) throws IOException {
    if (this.reporter == null) {
        this.reporter = reporter;
    }
    // Count the total number of input records
    addLongValue("totalCount", 1);
    // Wrap the raw line in a TaggedMapOutput object
    TaggedMapOutput aRecord = generateTaggedMapOutput(value);
    if (aRecord == null) {
        // Count the records that were discarded
        addLongValue("discardedCount", 1);
        return;
    }
    Text groupKey = generateGroupKey(aRecord);
    if (groupKey == null) {
        // Count the records whose group key is null
        addLongValue("nullGroupKeyCount", 1);
        return;
    }
    // Emit the group key and the TaggedMapOutput object
    output.collect(groupKey, aRecord);
    addLongValue("collectedCount", 1);
}

// Increment the in-memory counter registered under the given name
protected Long addLongValue(Object name, long inc) {
    Long val = this.longCounters.get(name);
    Long retv = null;
    if (val == null) {
        retv = Long.valueOf(inc);
    } else {
        retv = Long.valueOf(val.longValue() + inc);
    }
    this.longCounters.put(name, retv);
    return retv;
}

Now that we know how map() processes each record, we can implement the value type it emits, TaggedWritable:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class TaggedWritable extends TaggedMapOutput {
    /**
     * Declaring only this field (without re-creating it in readFields()) caused
     * the following exception:
     *   Error: java.lang.NullPointerException
     *     at com.seven.mapreduce.join.TaggedWritable.readFields(TaggedWritable.java:32)
     *     at org.apache.hadoop.contrib.utils.join.DataJoinReducerBase.regroup(DataJoinReducerBase.java:106)
     * See http://stackoverflow.com/questions/10201500/hadoop-reduce-side-join-using-datajoin
     * for the fix.
     */
    private Writable data;

    public TaggedWritable() {
        this.tag = new Text();
    }

    public TaggedWritable(Writable data) {
        this.tag = new Text("");
        this.data = data;
    }

    public void setData(Writable data) {
        this.data = data;
    }

    public void readFields(DataInput arg0) throws IOException {
        this.tag.readFields(arg0);
        String dataClz = arg0.readUTF();
        // Deserialize using the concrete type recorded at serialization time
        if (this.data == null || !this.data.getClass().getName().equals(dataClz)) {
            try {
                this.data = (Writable) ReflectionUtils.newInstance(Class.forName(dataClz), null);
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            }
        }
        this.data.readFields(arg0);
    }

    public void write(DataOutput arg1) throws IOException {
        this.tag.write(arg1);
        // Write the class name so readFields() knows which type to instantiate
        arg1.writeUTF(this.data.getClass().getName());
        this.data.write(arg1);
    }

    @Override
    public Writable getData() {
        return data;
    }
}
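As a quick sanity check (my own addition, not from the original post), the sketch below serializes a TaggedWritable and reads it back, which shows why write() records the concrete class name. The record content is made up for illustration; it only assumes the TaggedWritable class above is on the classpath.

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;

public class TaggedWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        // Serialize a tagged record (the line content is a made-up example)
        TaggedWritable original = new TaggedWritable(new Text("13599990000,110"));
        original.setTag(new Text("122"));
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);   // writes the tag, then the class name, then the data

        // Deserialize into a fresh instance, as the framework does on the reduce side
        TaggedWritable copy = new TaggedWritable();
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        copy.readFields(in);   // the class name read back tells it to instantiate a Text

        System.out.println(copy.getTag() + " -> " + copy.getData());
    }
}

Running it should simply print the tag followed by the original line, confirming that the data field is re-created with the right type.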
With TaggedWritable in place, let's write the map-side program:

import org.apache.hadoop.contrib.utils.join.DataJoinMapperBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
import org.apache.hadoop.io.Text;

public class JoinMapper extends DataJoinMapperBase {

    @Override
    protected Text generateInputTag(String inputFile) {
        // Generate the tag for this map task from the input file name
        String tagTmp = inputFile.substring(inputFile.lastIndexOf("/") + 1);
        return new Text(tagTmp);
    }

    @Override
    protected TaggedMapOutput generateTaggedMapOutput(Object value) {
        TaggedWritable retv = new TaggedWritable((Text) value);
        // inputTag is inherited from DataJoinMapperBase and is initialized
        // from the file name in configure()
        retv.setTag(this.inputTag);
        return retv;
    }

    @Override
    protected Text generateGroupKey(TaggedMapOutput aRecord) {
        // Build the group key. When the files keep the join column in
        // different positions, use inputTag to decide which field to take.
        String line = ((Text) aRecord.getData()).toString();
        String[] tokens = line.split(",");
        String groupKey = null;
        if (this.inputTag.toString().equals("12")) {
            groupKey = tokens[2];
        } else if (this.inputTag.toString().equals("122")) {
            groupKey = tokens[0];
        }
        return new Text(groupKey);
    }
}
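The column choices above ("12" takes the third field, "122" takes the first) only make sense for a particular record layout. The standalone snippet below is my own illustration, with made-up sample lines and an assumed layout (user file: id,name,phone; phone file: phone,attribute); it mirrors generateGroupKey() so the key extraction can be checked without running a job.

public class GroupKeyDemo {
    // Mirrors JoinMapper.generateGroupKey(); tag "12" = user file, "122" = phone file
    static String groupKey(String tag, String line) {
        String[] tokens = line.split(",");
        if (tag.equals("12")) {
            return tokens[2];      // assumed layout: id,name,phone -> join on phone
        } else if (tag.equals("122")) {
            return tokens[0];      // assumed layout: phone,attribute -> join on phone
        }
        return null;
    }

    public static void main(String[] args) {
        System.out.println(groupKey("12", "1,tom,13599990000"));    // prints 13599990000
        System.out.println(groupKey("122", "13599990000,beijing")); // prints 13599990000
    }
}

Under these assumptions both sides group on the phone number, so matching user and phone records end up in the same reduce group.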
Next, the reduce-side code. I will not go into the details of how DataJoinReducerBase executes here; the next post will analyze the whole execution flow of that package separately.

import org.apache.hadoop.contrib.utils.join.DataJoinReducerBase;
import org.apache.hadoop.contrib.utils.join.TaggedMapOutput;
import org.apache.hadoop.io.Text;

public class JoinReducer extends DataJoinReducerBase {

    /**
     * combine() filters out unwanted combinations to obtain the desired join
     * semantics (inner join, left join, and so on), and shapes the result into
     * a suitable output format (field ordering, de-duplication, etc.).
     */
    @Override
    protected TaggedMapOutput combine(Object[] tags, Object[] values) {
        // Implement an inner join: drop groups that do not have both sides
        if (tags.length < 2) return null;
        String joinedStr = "";
        for (int i = 0; i < values.length; i++) {
            if (i > 0) joinedStr += ",";
            TaggedWritable tw = (TaggedWritable) values[i];
            String line = ((Text) tw.getData()).toString();
            String[] tokens = line.split(",");
            // Pick the output field according to the tag: "12" is the user info
            // file name, "122" is the phone info file name (here both sides
            // happen to contribute their second field)
            if (tw.getTag().toString().equals("12")) {
                joinedStr += tokens[1];
            } else {
                joinedStr += tokens[1];
            }
        }
        TaggedWritable retv = new TaggedWritable(new Text(joinedStr));
        retv.setTag((Text) tags[0]);
        return retv;
    }
}
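The inner join above drops any group that is missing one of the two sides. As a hedged sketch of my own (not from the original post), the same combine() hook could express a left outer join roughly as follows, assuming tag "12" marks the left-hand (user) file; it would replace the combine() method inside JoinReducer above.

@Override
protected TaggedMapOutput combine(Object[] tags, Object[] values) {
    // Keep a single-sided group only when that side is the left ("12") file
    if (tags.length < 2 && !tags[0].toString().equals("12")) return null;
    StringBuilder joined = new StringBuilder();
    for (int i = 0; i < values.length; i++) {
        if (i > 0) joined.append(",");
        TaggedWritable tw = (TaggedWritable) values[i];
        joined.append(((Text) tw.getData()).toString().split(",")[1]);
    }
    // When the right side is missing, the row simply ends after the left fields;
    // padding an empty column here would be another option.
    TaggedWritable retv = new TaggedWritable(new Text(joined.toString()));
    retv.setTag((Text) tags[0]);
    return retv;
}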
The driver program:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class JobMain extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        JobConf job = new JobConf(conf, JobMain.class);
        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.setInputPaths(job, in);
        // To join files located under multiple directories, pass the input
        // paths as a comma-separated list:
        //FileInputFormat.setInputPaths(job, args[0] + "," + args[1]);
        FileOutputFormat.setOutputPath(job, out);
        job.setJobName("DataJoin");
        job.setMapperClass(JoinMapper.class);
        job.setReducerClass(JoinReducer.class);
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(TaggedWritable.class);
        job.set("mapred.textoutputformat.separator", ",");
        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new JobMain(), args);
        System.exit(res);
    }
}
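A side note on inputs (my own addition, not in the original driver): the old mapred FileInputFormat also accepts a comma-separated list of paths, which is what the commented-out line above relies on. With hypothetical directory names, the relevant lines inside run() would look like this; the file names inside those directories still need to be 12 and 122, because generateInputTag() tags records by file name.

// Hypothetical layout: user file under /input/users, phone file under /input/phones
FileInputFormat.setInputPaths(job, "/input/users,/input/phones");
FileOutputFormat.setOutputPath(job, new Path("/output/join00"));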
Now run the job and check the results.

[Figure: user information table]
./hadoop jar mr.jar com.seven.mapreduce.join.JobMain /input/eight /output/night00
[Figure: run results]
This is the join facility that ships with Hadoop (the DataJoin contrib package). It is a general-purpose approach: once you are familiar with it, you can implement join functionality quickly, but there is still room to improve its execution efficiency. The next post will cover the optimized implementation of this feature described in Hadoop in Practice (《hadoop硬实战》).