Running MapReduce on HBase has long had an annoying limitation: using multiple HTables plus multiple Scans as the map input. In the past, developers had to write their own TableInputFormat subclass to get this.
HBase 0.94.6 added a new class: MultiTableInputFormatBase. (A reminder: HBase 0.94.6 has a critical bug, so use 0.94.7 instead.)
This class lets MapReduce developers quickly set up multi-table, multi-Scan input.
MultiTableInputFormatBase comes from HBASE-3996: "Support multiple tables and scanners as input to the mapper in map/reduce jobs".
Along with MultiTableInputFormatBase, the patch also adjusted TableMapReduceUtil and added the following method:
// Multi-table input method
void org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(
    List<Scan> scans,
    Class<? extends TableMapper> mapper,
    Class<? extends WritableComparable> outputKeyClass,
    Class<? extends Writable> outputValueClass,
    Job job
) throws IOException

// Single-table input method
void org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(
    String table,
    Scan scan,
    Class<? extends TableMapper> mapper,
    Class<? extends WritableComparable> outputKeyClass,
    Class<? extends Writable> outputValueClass,
    Job job
) throws IOException

The biggest difference between the multi-input method and the single-input one: the table name parameter is gone, replaced by a List<Scan>.
Now the question: the new method only takes Scans, with no HTable name. How does MapReduce know which tables to read the data from?
After some digging, I found the answer at line 116 of the MultiTableInputFormatBase.java source:
byte[] tableName = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
if (tableName == null)
  throw new IOException("A scan object did not have a table name");
HTable table = new HTable(context.getConfiguration(), tableName);

So the table name is read from the Scan's SCAN_ATTRIBUTES_TABLE_NAME attribute.
All you need to do is set the table name on each Scan object via the SCAN_ATTRIBUTES_TABLE_NAME attribute.
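To make that concrete, here is a minimal sketch of building a List<Scan> that spans two tables and handing it to the multi-input overload. The table names "table_a" and "table_b", the mapper class MyMapper, and the already-configured job object are placeholders of mine, not part of the patch; a full, working job setup is in the source at the end of this post.

// Placeholders: "table_a"/"table_b", MyMapper and job are assumed to be defined elsewhere.
List<Scan> scans = new ArrayList<Scan>();
for (String tableName : new String[] { "table_a", "table_b" }) {
    Scan scan = new Scan();
    // This attribute is how MultiTableInputFormatBase later learns which table the Scan belongs to
    scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(tableName));
    scans.add(scan);
}
TableMapReduceUtil.initTableMapperJob(
    scans, MyMapper.class, Text.class, Text.class, job);

Because each Scan carries its own table name, a single job can mix Scans over completely different tables.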
This post is an original by CSDN user 撸大湿; please credit the source if you repost it. Thanks.
If you disagree with anything, leave a comment or find me on the CSDN Hadoop forum. Criticism welcome.
Finally, here is the MapReduce source I wrote today using the multi-table input:
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;

// Note: StaticObject (constants such as PLAYER_LOGINZONE, PLAYER_LOGOUT, DEFAULT_DELIMITER,
// DEFAULT_ZERO_TYPE), MyHBase, and FormatTime are project-specific helpers not shown in this post.
public class mr_ccu implements StaticObject {

    public static class CCUMapper extends TableMapper<Text, Text> {
        Text outputkey = new Text();
        Text outputvalue = new Text();
        int i = 0;

        public void map(ImmutableBytesWritable row, Result value, Context context)
                throws IOException, InterruptedException {
            int LogType = Integer.valueOf(new String(value.getValue("_0".getBytes(), "lgtp".getBytes())));
            if (LogType == PLAYER_LOGINZONE || LogType == PLAYER_LOGOUT) {
                // Key: area / zero-padded world id / player id
                String key = new String(value.getValue("_0".getBytes(), "area".getBytes()))
                        + DEFAULT_DELIMITER
                        + ("00" + new String(value.getValue("_0".getBytes(), "wid".getBytes()))).substring(1)
                        + DEFAULT_DELIMITER
                        + new String(value.getValue("_0".getBytes(), "pid".getBytes()));
                outputkey.set(key);
                // Value: event timestamp + log type
                String val = new String(value.getValue("_0".getBytes(), "uts".getBytes()))
                        + DEFAULT_DELIMITER + LogType;
                outputvalue.set(val);
                context.write(outputkey, outputvalue);
                i++;
            }
        }

        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.getCounter("CCU", "Mapper Count").setValue(i);
        }
    }

    public static class CCUCombiner extends Reducer<Text, Text, Text, Text> {
        Text outputkey = new Text();
        Text outputvalue = new Text();
        int i = 0;

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Keep only the latest event per key
            Long LastLoginTime = 0L;
            for (Text val : values) {
                if (LastLoginTime < Long.valueOf(val.toString().split(DEFAULT_DELIMITER)[0])) {
                    LastLoginTime = Long.valueOf(val.toString().split(DEFAULT_DELIMITER)[0]);
                    outputvalue.set(val);
                }
            }
            context.write(key, outputvalue);
            i++;
        }

        protected void cleanup(org.apache.hadoop.mapreduce.Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            context.getCounter("CCU", "Combiner Count").setValue(i);
        }
    }

    public static class CCUReducer extends TableReducer<Text, Text, ImmutableBytesWritable> {
        HashMap<String, Integer> CCUMap = new HashMap<String, Integer>();
        String CCUTime_StringTime = "";
        String CCUTime_UnixTime = "";

        protected void setup(
                org.apache.hadoop.mapreduce.Reducer<Text, Text, ImmutableBytesWritable, org.apache.hadoop.io.Writable>.Context context)
                throws IOException, InterruptedException {
            CCUTime_StringTime = context.getConfiguration().get("CCUTime_StringTime");
            CCUTime_UnixTime = String.valueOf(
                    Long.MAX_VALUE - Long.valueOf(context.getConfiguration().get("CCUTime_UnixTime")));
        }

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Find the latest event for this player; count them if they are still logged in
            Long LastLoginTime = 0L;
            String value = "";
            for (Text val : values) {
                if (LastLoginTime < Long.valueOf(val.toString().split(DEFAULT_DELIMITER)[0])) {
                    LastLoginTime = Long.valueOf(val.toString().split(DEFAULT_DELIMITER)[0]);
                    value = val.toString();
                }
            }
            if (Integer.valueOf(value.split(DEFAULT_DELIMITER)[1]) == PLAYER_LOGINZONE) {
                int CCU = DEFAULT_ZERO_TYPE;
                String[] tmpStr = key.toString().split(DEFAULT_DELIMITER);
                String MapKey = tmpStr[0] + DEFAULT_DELIMITER + tmpStr[1] + DEFAULT_DELIMITER + CCUTime_UnixTime;
                if (CCUMap.containsKey(MapKey))
                    CCU = CCUMap.get(MapKey) + 1;
                CCUMap.put(MapKey, CCU);
            }
        }

        protected void cleanup(Context context) throws java.io.IOException, java.lang.InterruptedException {
            // Flush accumulated CCU counts to the output table
            Iterator<Entry<String, Integer>> iter = CCUMap.entrySet().iterator();
            while (iter.hasNext()) {
                Entry<String, Integer> PutData = iter.next();
                Put put = new Put(PutData.getKey().getBytes());
                put.add(Bytes.toBytes("_0"), Bytes.toBytes("ccu"), Bytes.toBytes(String.valueOf(PutData.getValue())));
                put.add(Bytes.toBytes("_0"), Bytes.toBytes("ts"), Bytes.toBytes(String.valueOf(CCUTime_StringTime)));
                context.write(null, put);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        MyHBase myHBase = new MyHBase();
        String TempStr = "";
        String AreaID = "";
        String StartTime = "";
        String StopTime = "";
        if (args.length < 3) {
            // No arguments given: fall back to a hard-coded test range
            TempStr = "" + "00,01,02" + "\t" + "20130528000000" + "\t" + "20130528210000";
            AreaID = TempStr.split("\t")[0];
            StartTime = TempStr.split("\t")[1];
            StopTime = TempStr.split("\t")[2];
        } else {
            AreaID = args[0];
            StartTime = args[1];
            StopTime = args[2];
        }
        String sourceTable = "ps2cb.login.event";
        String targetTable = "ps2cb.tbl.ccu";

        // One Scan per area; each Scan is bound to its source table via SCAN_ATTRIBUTES_TABLE_NAME
        List<Scan> ScanList = new ArrayList<Scan>();
        for (String aid : AreaID.split(",")) {
            String StartRow = aid + DEFAULT_DELIMITER + StartTime;
            String StopRow = aid + DEFAULT_DELIMITER + StopTime;
            Scan scan = new Scan(StartRow.getBytes(), StopRow.getBytes());
            scan.setCaching(5000);
            scan.setCacheBlocks(false);
            scan.addColumn("_0".getBytes(), "lgtp".getBytes());
            scan.addColumn("_0".getBytes(), "area".getBytes());
            scan.addColumn("_0".getBytes(), "wid".getBytes());
            scan.addColumn("_0".getBytes(), "pid".getBytes());
            scan.addColumn("_0".getBytes(), "uts".getBytes());
            scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, sourceTable.getBytes());
            ScanList.add(scan);
        }

        myHBase.myConf.set("CCUTime_StringTime", FormatTime.getStringDatetoString(StopTime));
        myHBase.myConf.set("CCUTime_UnixTime", FormatTime.toUnixTimeBySecond(StopTime));

        Job job = new Job(myHBase.myConf, "CCU");
        job.setJarByClass(mr_ccu.class);
        job.setCombinerClass(CCUCombiner.class);
        TableMapReduceUtil.setScannerCaching(job, 5000);
        TableMapReduceUtil.initTableMapperJob(
                ScanList,
                CCUMapper.class,
                Text.class,
                Text.class,
                job);
        TableMapReduceUtil.initTableReducerJob(
                targetTable,      // output table
                CCUReducer.class, // reducer class
                job);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}