HBase MapReduce MultiTableInput: A First Test

Running MapReduce over HBase has one annoying problem: using multiple HTables plus multiple Scans as the Map input. In the past, developers had to write their own TableInputFormat subclass to handle this.

HBase 0.94.6 added a new class: MultiTableInputFormatBase. (A reminder here: HBase 0.94.6 has a fatal bug, so please use 0.94.7 instead.)

This class lets MapReduce developers quickly wire up a multi-HTable, multi-Scan input.

MultiTableInputFormatBase comes from HBASE-3996: "Support multiple tables and scanners as input to the mapper in map/reduce jobs".

Along with MultiTableInputFormatBase, the patch also adjusted TableMapReduceUtil and added the following methods:

// Multi-input method
void org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(
	List<Scan> scans, 
	Class<? extends TableMapper> mapper, 
	Class<? extends WritableComparable> outputKeyClass, 
	Class<? extends Writable> outputValueClass, Job job
	) throws IOException

// Single-input method
void org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil.initTableMapperJob(
	String table, 
	Scan scan, Class<? extends TableMapper> mapper, 
	Class<? extends WritableComparable> outputKeyClass, 
	Class<? extends Writable> outputValueClass, Job job
	) throws IOException

The biggest difference between the multi-input method and the single-input method is that the table name parameter is gone, replaced by a List<Scan>.

Now here is the question: the new method only takes Scans, with no HTable names. How does MapReduce know which tables to read the data from?

After some digging, I found the answer around line 116 of the MultiTableInputFormatBase.java source:

      byte[] tableName = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
      if (tableName == null) 
        throw new IOException("A scan object did not have a table name");
      HTable table = new HTable(context.getConfiguration(), tableName);

So the table name is read from the Scan attribute Scan.SCAN_ATTRIBUTES_TABLE_NAME.

All you need to do is set the table name on each Scan object via the SCAN_ATTRIBUTES_TABLE_NAME attribute.
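
To make this concrete, below is a minimal sketch of a two-table job, assuming HBase 0.94.7. The class name MultiTableScanDemo, the DemoMapper, and the table names table_a / table_b are placeholders for illustration only, not part of the real job further down:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

public class MultiTableScanDemo {

	// A do-nothing mapper, only here so the example compiles
	public static class DemoMapper extends TableMapper<Text, Text> {
		public void map(ImmutableBytesWritable row, Result value, Context context)
				throws IOException, InterruptedException {
			// rows from both tables arrive here
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = HBaseConfiguration.create();
		Job job = new Job(conf, "MultiTableInput demo");
		job.setJarByClass(MultiTableScanDemo.class);

		List<Scan> scans = new ArrayList<Scan>();

		// one Scan per source table; the table name rides along as a Scan attribute
		Scan scanA = new Scan();
		scanA.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, "table_a".getBytes());
		scans.add(scanA);

		Scan scanB = new Scan();
		scanB.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, "table_b".getBytes());
		scans.add(scanB);

		// the multi-input overload: no table name parameter, it is read
		// from Scan.SCAN_ATTRIBUTES_TABLE_NAME of each Scan
		TableMapReduceUtil.initTableMapperJob(scans, DemoMapper.class, Text.class, Text.class, job);

		// map-only demo: discard the output
		job.setNumReduceTasks(0);
		job.setOutputFormatClass(NullOutputFormat.class);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}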

This is an original article by CSDN user 撸大湿; please credit the source when reposting, thanks.

If you disagree with anything, leave a comment or find me on the CSDN Hadoop forum. Feedback is welcome~



Finally, here is the MapReduce source I wrote today using MultiTableInput:

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;

// Constants such as PLAYER_LOGINZONE, PLAYER_LOGOUT, DEFAULT_DELIMITER and
// DEFAULT_ZERO_TYPE come from the StaticObject interface (not shown here).
public class mr_ccu implements StaticObject {

	// Mapper: for each login/logout record, emits key "area|worldId|playerId" and value "unixTime|logType"
	public static class CCUMapper extends TableMapper<Text, Text>
	{
		Text outputkey = new Text();
		Text outputvalue = new Text();
		int i = 0;

		public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException {
			int LogType = Integer.valueOf(new String(value.getValue("_0".getBytes(), "lgtp".getBytes())));
			if (LogType == PLAYER_LOGINZONE || LogType == PLAYER_LOGOUT) {
				String key = new String(value.getValue("_0".getBytes(), "area".getBytes()))
						+ DEFAULT_DELIMITER
						+ ("00" + new String(value.getValue("_0".getBytes(), "wid".getBytes()))).substring(1)
						+ DEFAULT_DELIMITER
						+ new String(value.getValue("_0".getBytes(), "pid".getBytes()));
				outputkey.set(key);
				String val = new String(value.getValue("_0".getBytes(), "uts".getBytes()))
						+ DEFAULT_DELIMITER
						+ LogType;
				outputvalue.set(val);
				context.write(outputkey, outputvalue);
				i++;
			}
		}

		// record how many login/logout rows this mapper emitted
		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.getCounter("CCU", "Mapper Count").setValue(i);
		}
	}

	// Combiner: for each player, forward only the record with the latest timestamp
	public static class CCUCombiner extends Reducer<Text, Text, Text, Text>
	{
		Text outputkey = new Text();
		Text outputvalue = new Text();
		int i = 0;

		public void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			Long LastLoginTime = 0L;
			for (Text val : values) {
				if (LastLoginTime < Long.valueOf(val.toString().split(DEFAULT_DELIMITER)[0])) {
					LastLoginTime = Long.valueOf(val.toString().split(DEFAULT_DELIMITER)[0]);
					outputvalue.set(val);
				}
			}
			context.write(key, outputvalue);
			i++;
		}

		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.getCounter("CCU", "Combiner Count").setValue(i);
		}
	}

	// Reducer: derives the concurrent-user count (CCU) per area|world and writes it to the target table
	public static class CCUReducer extends TableReducer<Text, Text, ImmutableBytesWritable>
	{

		HashMap<String, Integer> CCUMap = new HashMap<String, Integer>();
		String CCUTime_StringTime = "";
		String CCUTime_UnixTime = "";

		// read the reporting timestamps that main() stored in the job configuration
		protected void setup(Context context) throws IOException, InterruptedException {
			CCUTime_StringTime = context.getConfiguration().get("CCUTime_StringTime");
			CCUTime_UnixTime = String.valueOf(Long.MAX_VALUE - Long.valueOf(context.getConfiguration().get("CCUTime_UnixTime")));
		}

		public void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// keep only this player's most recent login/logout event
			Long LastLoginTime = 0L;
			String value = "";
			for (Text val : values) {
				if (LastLoginTime < Long.valueOf(val.toString().split(DEFAULT_DELIMITER)[0])) {
					LastLoginTime = Long.valueOf(val.toString().split(DEFAULT_DELIMITER)[0]);
					value = val.toString();
				}
			}
			// if the last event is a login, the player is online: bump the CCU counter for area|world|time
			if (Integer.valueOf(value.split(DEFAULT_DELIMITER)[1]) == PLAYER_LOGINZONE) {
				int CCU = DEFAULT_ZERO_TYPE;
				String[] tmpStr = key.toString().split(DEFAULT_DELIMITER);
				String MapKey = tmpStr[0] + DEFAULT_DELIMITER + tmpStr[1] + DEFAULT_DELIMITER + CCUTime_UnixTime;
				if (CCUMap.containsKey(MapKey))
					CCU = CCUMap.get(MapKey) + 1;
				CCUMap.put(MapKey, CCU);
			}
		}

		// write one HBase row per area|world|time with the final CCU value
		protected void cleanup(Context context) throws IOException, InterruptedException
		{
			Iterator<Entry<String, Integer>> iter = CCUMap.entrySet().iterator();
			while (iter.hasNext()) {
				Entry<String, Integer> PutData = iter.next();
				Put put = new Put(PutData.getKey().getBytes());
				put.add(Bytes.toBytes("_0"), Bytes.toBytes("ccu"), Bytes.toBytes(String.valueOf(PutData.getValue())));
				put.add(Bytes.toBytes("_0"), Bytes.toBytes("ts"), Bytes.toBytes(String.valueOf(CCUTime_StringTime)));
				context.write(null, put);
			}
		}
	}

	public static void main(String[] args) throws Exception {
		// MyHBase and FormatTime are my own helper classes (not shown): HBase configuration and time formatting
		MyHBase myHBase = new MyHBase();

		String TempStr = "";
		String AreaID = "";
		String StartTime = "";
		String StopTime = "";
		if (args.length < 3) {
			// no command-line arguments supplied: fall back to hard-coded test values
			TempStr = "" +
					"00,01,02" + "\t" +
					"20130528000000" + "\t" +
					"20130528210000";
			AreaID = TempStr.split("\t")[0];
			StartTime = TempStr.split("\t")[1];
			StopTime = TempStr.split("\t")[2];
		} else {
			AreaID = args[0];
			StartTime = args[1];
			StopTime = args[2];
		}

		String sourceTable = "ps2cb.login.event";
		String targetTable = "ps2cb.tbl.ccu";
		List<Scan> ScanList = new ArrayList<Scan>();
		// build one Scan per area ID; all of them read from the same source table
		for (String aid : AreaID.split(",")) {
			String StartRow = aid + DEFAULT_DELIMITER + StartTime;
			String StopRow = aid + DEFAULT_DELIMITER + StopTime;
			Scan scan = new Scan(StartRow.getBytes(), StopRow.getBytes());
			scan.setCaching(5000);
			scan.setCacheBlocks(false);
			scan.addColumn("_0".getBytes(), "lgtp".getBytes());
			scan.addColumn("_0".getBytes(), "area".getBytes());
			scan.addColumn("_0".getBytes(), "wid".getBytes());
			scan.addColumn("_0".getBytes(), "pid".getBytes());
			scan.addColumn("_0".getBytes(), "uts".getBytes());
			// tell MultiTableInputFormat which table this Scan reads from
			scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, sourceTable.getBytes());
			ScanList.add(scan);
		}
		myHBase.myConf.set("CCUTime_StringTime", FormatTime.getStringDatetoString(StopTime));
		myHBase.myConf.set("CCUTime_UnixTime", FormatTime.toUnixTimeBySecond(StopTime));
		Job job = new Job(myHBase.myConf, "CCU");
		job.setJarByClass(mr_ccu.class);
		job.setCombinerClass(CCUCombiner.class);
		TableMapReduceUtil.setScannerCaching(job, 5000);
		// multi-Scan input: each Scan carries its table name as an attribute
		TableMapReduceUtil.initTableMapperJob(
				ScanList,
				CCUMapper.class,
				Text.class,
				Text.class,
				job);
		TableMapReduceUtil.initTableReducerJob(
				targetTable, // output table
				CCUReducer.class, // reducer class
				job);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

