hbase 自定义 endpoint coprocessor

hbase 自带的AggregationClient只能对单一列族的单一列进行聚合。如果想对多个列进行聚合的话,比如后面例子中说的salecount(销售量)和salemoney(销售金额),用AggregationClient只能调用两次,这样难免效率会比较低,而且两次调用一致性也不能保证(可能你sum完salecount后,再sum salemoney之前又插入了数据)。

所以只能实现一个自定义的endpoint coprocessor了。


首先自定义一个实现 Writable 的类MyMutiSum,因为要在hadoop集群中进行传输,所以必须实现 Writable。该类用来返回每个列sum后的结果,实现如下:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Writable;

/**
 * Serializable container for the per-column sums produced by the endpoint
 * coprocessor. Index {@code i} holds the running sum of the i-th requested
 * column. Implements {@link Writable} so it can travel over Hadoop RPC.
 */
public class MyMutiSum implements Writable {

	// Per-column sums; size is fixed by the number of requested columns.
	private List<Long> resultList = new ArrayList<Long>();

	/** No-arg constructor required by the Writable deserialization machinery. */
	public MyMutiSum() {
	}

	/**
	 * Creates a holder for {@code resultSize} columns, each sum initialized
	 * to zero so it can be accumulated into via {@link #setSum(int, Long)}.
	 *
	 * @param resultSize number of columns being summed
	 */
	public MyMutiSum(int resultSize) {
		for (int i = 0; i < resultSize; i++) {
			resultList.add(0L);
		}
	}

	/**
	 * @param i column index
	 * @return the current sum for column {@code i}
	 */
	public Long getSum(int i) {
		return resultList.get(i);
	}

	/**
	 * Replaces the sum stored for column {@code i}.
	 *
	 * @param i   column index
	 * @param sum new sum value
	 */
	public void setSum(int i, Long sum) {
		resultList.set(i, sum);
	}

	/** @return the number of column sums held by this object */
	public int getResultSize() {
		return resultList.size();
	}

	/** Writes the list size followed by each sum, mirroring readFields. */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(resultList.size());
		for (Long v : resultList) {
			out.writeLong(v);
		}
	}

	/** Reads the serialized form produced by {@link #write(DataOutput)}. */
	@Override
	public void readFields(DataInput in) throws IOException {
		// Hadoop may reuse a Writable instance across RPC calls, so the list
		// must be cleared first; otherwise sums deserialized on a previous
		// call would remain in the list and corrupt the result.
		resultList.clear();
		int size = in.readInt();
		for (int i = 0; i < size; i++) {
			resultList.add(in.readLong());
		}
	}

}
然后自定义一个RPC协议,里面有个方法的参数columns是将你要进行sum的多个列都传过去作为参数:

import java.io.IOException;

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.ipc.CoprocessorProtocol;

/**
 * Custom RPC protocol for the multi-column sum endpoint. Each region that
 * implements this protocol sums several columns in a single scan pass.
 */
public interface MyCoprocessorProtocol extends CoprocessorProtocol {

	/**
	 * Sums the given columns over the rows selected by {@code scan} within
	 * one region.
	 *
	 * @param columns qualifiers of the columns to sum
	 * @param scan    row/column selection to evaluate on the region
	 * @return per-column sums, indexed in the same order as {@code columns}
	 * @throws IOException if the region scan fails
	 */
	MyMutiSum getMutiSum(String[] columns, Scan scan) throws IOException;
}

然后实现这个RPC协议的服务(方法):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.BaseEndpointCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.regionserver.InternalScanner;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Server-side implementation of {@link MyCoprocessorProtocol}. Runs inside
 * each region server and sums all requested columns in a single scan over
 * the hosting region.
 */
public class MyEndpointImpl extends BaseEndpointCoprocessor implements
		MyCoprocessorProtocol {
	protected static Log log = LogFactory.getLog(MyEndpointImpl.class);

	/**
	 * Scans this region once and accumulates the long-encoded values of every
	 * requested column qualifier.
	 *
	 * @param columns qualifiers to sum; matched against KeyValue qualifiers only
	 *                (column family is assumed to be restricted by the scan)
	 * @param scan    selection of rows/columns to read
	 * @return per-column sums for this region, in the order of {@code columns}
	 * @throws IOException if the region scanner fails
	 */
	@Override
	public MyMutiSum getMutiSum(String[] columns, Scan scan) throws IOException {
		MyMutiSum result = new MyMutiSum(columns.length);
		InternalScanner scanner = ((RegionCoprocessorEnvironment) getEnvironment())
				.getRegion().getScanner(scan);

		List<KeyValue> keyValues = new ArrayList<KeyValue>();
		try {
			boolean hasMoreRows = false;
			do {
				// Fetch the next row's cells, then accumulate each requested
				// column found in that row.
				hasMoreRows = scanner.next(keyValues);
				for (int i = 0; i < columns.length; i++) {

					String column = columns[i];
					for (KeyValue kv : keyValues) {
						if (column.equals(Bytes.toString(kv.getQualifier()))) {
							byte[] value = kv.getValue();
							// Skip absent/empty cells; only well-formed values
							// contribute to the sum.
							if (value != null && value.length > 0) {
								Long tValue = Bytes.toLong(value);
								result.setSum(i, result.getSum(i) + tValue);
							}
							// Qualifier located for this row; move to the
							// next requested column.
							break;
						}
					}
				}

				keyValues.clear();
			} while (hasMoreRows);
		} finally {
			// Always release the region scanner, even on error.
			scanner.close();
		}
		log.debug("Sum from this region is "
				+ ((RegionCoprocessorEnvironment) getEnvironment()).getRegion()
						.getRegionNameAsString() + ": ");

		for (int i = 0; i < columns.length; i++) {
			log.debug(columns[i] + " " + result.getSum(i));
		}
		// Return the per-region partial sums to the client.
		return result;
	}

}

接下来我们可以实现一个rpc client:

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.coprocessor.Batch;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Client-side helper that fans a multi-column sum request out to every
 * region covering the scan range and merges the per-region partial sums.
 */
public class MyEndpointClient {
	protected static Log log = LogFactory.getLog(MyEndpointClient.class);

	private Configuration conf;

	/** @param conf HBase client configuration used to open the table */
	public MyEndpointClient(Configuration conf) {
		this.conf = conf;
	}

	/**
	 * Sums the given columns of {@code cf} across all regions in the scan
	 * range by invoking the endpoint coprocessor on each region.
	 *
	 * @param tableName table to query
	 * @param cf        column family holding the columns
	 * @param columns   qualifiers to sum
	 * @param scan      row range to cover; NOTE: mutated here — the requested
	 *                  columns are added to it before execution
	 * @return merged sums across all regions, in the order of {@code columns}
	 * @throws Throwable if the coprocessor execution fails
	 */
	public MyMutiSum mutiSum(String tableName, String cf,
			final String[] columns, final Scan scan) throws Throwable {
		// Callback that merges the partial result of each region RPC.
		class MutiSumCallBack implements Batch.Callback<MyMutiSum> {

			MyMutiSum sumVal = null;

			public MyMutiSum getSumResult() {
				return sumVal;
			}

			/** Called once per region RPC; folds its result into sumVal. */
			@Override
			public void update(byte[] region, byte[] row, MyMutiSum result) {
				sumVal = add(sumVal, result);
			}

			/** Element-wise sum of two partial results; null-tolerant. */
			public MyMutiSum add(MyMutiSum l1, MyMutiSum l2) {
				if (l1 == null ^ l2 == null) {
					return (l1 == null) ? l2 : l1; // either of one is null.
				} else if (l1 == null) // both are null
					return null;

				MyMutiSum mutiSum = new MyMutiSum(columns.length);

				for (int i = 0; i < columns.length; i++) {
					mutiSum.setSum(i, l1.getSum(i) + l2.getSum(i));
				}

				return mutiSum;
			}

		}

		MutiSumCallBack sumCallBack = new MutiSumCallBack();

		HTable table = null;

		// Restrict the scan to exactly the columns being summed.
		for (int i = 0; i < columns.length; i++) {
			scan.addColumn(Bytes.toBytes(cf), Bytes.toBytes(columns[i]));
		}

		try {
			table = new HTable(conf, tableName);
			// The startRow/stopRow pair determines which regions (RPC servers)
			// participate: one RPC is issued per region in that range. After
			// each RPC returns, sumCallBack.update() accumulates that region's
			// partial result into sumCallBack.sumVal.
			table.coprocessorExec(MyCoprocessorProtocol.class,
					scan.getStartRow(), scan.getStopRow(),
					new Batch.Call<MyCoprocessorProtocol, MyMutiSum>() {

						@Override
						public MyMutiSum call(MyCoprocessorProtocol instance)
								throws IOException {
							// instance.getMutiSum is translated into an RPC
							// call to the MyCoprocessorProtocol implementation
							// deployed on the target region.
							return instance.getMutiSum(columns, scan);
						}

					}, sumCallBack);
		} finally {
			if (table != null) {
				table.close();
			}
		}
		// Return sumCallBack's sumVal: the accumulation of every region's
		// result via update().
		return sumCallBack.getSumResult();
	}


}

ok,现在我们将这三个类打包成MutiSum.jar,上传到hdfs上去,我这里传的目录为hdfs://master24:9000/user/hadoop/jars/MutiSum.jar。

接下来我们将这个自定义的cp设置到member4这个表上去,通过API来实现:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * One-shot utility that attaches the MyEndpointImpl coprocessor (packaged in
 * an HDFS-hosted jar) to the "member4" table via the HBase admin API.
 */
public class SetCoprocessor {

	/**
	 * @param args unused
	 * @throws Exception if the table cannot be modified
	 * @throws MasterNotRunningException if the HBase master is unreachable
	 */
	public static void main(String[] args) throws MasterNotRunningException,
			Exception {
		byte[] tableName = Bytes.toBytes("member4");
		Configuration conf = HBaseConfiguration.create();
		HBaseAdmin admin = new HBaseAdmin(conf);
		// The table must be disabled before its descriptor can be changed.
		admin.disableTable(tableName);
		try {
			HTableDescriptor htd = admin.getTableDescriptor(tableName);
			htd.addCoprocessor("com.besttone.coprocessor.MyEndpointImpl", new Path(
					"hdfs://master24:9000/user/hadoop/jars/MutiSum.jar"), 1001,
					null);
			admin.modifyTable(tableName, htd);
		} finally {
			// Re-enable the table even if the modification failed, so it is
			// not left disabled, and always release the admin connection.
			admin.enableTable(tableName);
			admin.close();
		}
	}

}

所有都准备就绪了,接下来就可以写一个main函数来测试调用一下这个cp了:

	/**
	 * Demonstrates calling the custom endpoint: sums the "salecount" (units
	 * sold) and "salemoney" (revenue) columns of the info family of table
	 * member4 in a single coprocessor round of RPCs.
	 *
	 * @param args unused
	 * @throws Throwable if the coprocessor invocation fails
	 */
	public static void main(String[] args) throws Throwable {
		final String[] columns = new String[] { "salecount", "salemoney" };
		Configuration conf = HBaseConfiguration.create();

		final Scan scan;
		scan = new Scan();

		// No scan.addColumn calls needed here: MyEndpointClient.mutiSum adds
		// every requested column of the given family to the scan itself, so
		// repeating them would be redundant.

		MyEndpointClient client = new MyEndpointClient(conf);

		MyMutiSum mutiSum = client.mutiSum("member4", "info", columns, scan);

		for (int i = 0; i < columns.length; i++) {
			System.out.println(columns[i] + " sum is :" + mutiSum.getSum(i));
		}

	}

针对以下代码补充说明一下:

table.coprocessorExec(MyCoprocessorProtocol.class,
					scan.getStartRow(), scan.getStopRow(),
					new Batch.Call<MyCoprocessorProtocol, MyMutiSum>() {

						@Override
						public MyMutiSum call(MyCoprocessorProtocol instance)
								throws IOException {
							// TODO Auto-generated method stub
							// instance.getMutiSum会转化成对region上的指定的MyCoprocessorProtocol的实现类的该方法的rpc调用
							return instance.getMutiSum(columns, scan);
						}

					}, sumCallBack);

hbase 权威指南上的例子是在call方法内部调用多个instance的方法,然后return 一个Pair,这种也是可行的,不过还是尽量封装成调用instance 的一个方法发起一个RPC,调用多个方法其实发起的RPC调用更多。


你可能感兴趣的:(hbase 自定义 endpoint coprocessor)