HBase's built-in AggregationClient can only aggregate a single column in a single column family. To aggregate several columns at once, such as salecount (units sold) and salemoney (sales revenue) in the example below, you would have to call AggregationClient once per column. That is inefficient, and the two calls are not guaranteed to be consistent with each other: new rows may be inserted after you sum salecount but before you sum salemoney.
So the only real option is to implement a custom endpoint coprocessor.
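For reference, the two-call approach being replaced looks roughly like this. This is a sketch against the 0.94-era API, and it assumes the built-in AggregateImplementation coprocessor is already loaded on the table:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.coprocessor.AggregationClient;
import org.apache.hadoop.hbase.client.coprocessor.LongColumnInterpreter;
import org.apache.hadoop.hbase.util.Bytes;

public class TwoCallSum {
    public static void main(String[] args) throws Throwable {
        Configuration conf = HBaseConfiguration.create();
        AggregationClient ac = new AggregationClient(conf);

        // First round of RPCs: sum salecount only.
        Scan scan1 = new Scan();
        scan1.addColumn(Bytes.toBytes("info"), Bytes.toBytes("salecount"));
        long salecount = ac.sum(Bytes.toBytes("member4"),
                new LongColumnInterpreter(), scan1);

        // Second round of RPCs: sum salemoney. Rows written between the two
        // calls make the pair of results mutually inconsistent.
        Scan scan2 = new Scan();
        scan2.addColumn(Bytes.toBytes("info"), Bytes.toBytes("salemoney"));
        long salemoney = ac.sum(Bytes.toBytes("member4"),
                new LongColumnInterpreter(), scan2);

        System.out.println(salecount + " " + salemoney);
    }
}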
First, define a class MyMutiSum that implements Writable. Implementing Writable is mandatory because the object is shipped across the Hadoop cluster; it carries the per-column sums back to the client. The class looks like this:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Writable;

public class MyMutiSum implements Writable {

    // One running sum per requested column, in the order the columns were passed in.
    private List<Long> resultList = new ArrayList<Long>();

    public MyMutiSum() {
    }

    public MyMutiSum(int resultSize) {
        for (int i = 0; i < resultSize; i++) {
            resultList.add(0L);
        }
    }

    public Long getSum(int i) {
        return resultList.get(i);
    }

    public void setSum(int i, Long sum) {
        resultList.set(i, sum);
    }

    public int getResultSize() {
        return resultList.size();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(resultList.size());
        for (Long v : resultList) {
            out.writeLong(v);
        }
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Hadoop may reuse Writable instances across calls, so clear any
        // previous state before deserializing; otherwise the list keeps growing.
        resultList.clear();
        int size = in.readInt();
        for (int i = 0; i < size; i++) {
            resultList.add(in.readLong());
        }
    }
}
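A quick way to sanity-check the Writable contract is a local round trip through a byte stream. This is a hypothetical test, not part of the coprocessor itself:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class MyMutiSumRoundTrip {
    public static void main(String[] args) throws Exception {
        MyMutiSum original = new MyMutiSum(2);
        original.setSum(0, 100L);   // e.g. salecount
        original.setSum(1, 2500L);  // e.g. salemoney

        // Serialize exactly as the HBase RPC layer would.
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bos));

        // Deserialize into a fresh instance and verify the sums survive.
        MyMutiSum copy = new MyMutiSum();
        copy.readFields(new DataInputStream(
                new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(copy.getSum(0) + ", " + copy.getSum(1)); // 100, 2500
    }
}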
Next, define a custom RPC protocol. Its getMutiSum method takes a columns parameter through which every column to be summed is passed in one call:

import java.io.IOException;

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.ipc.CoprocessorProtocol;

public interface MyCoprocessorProtocol extends CoprocessorProtocol {

    MyMutiSum getMutiSum(String[] columns, Scan scan) throws IOException;
}
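As an aside, a CoprocessorProtocol can also be invoked against a single region through a client-side proxy. A minimal sketch, assuming HTable.coprocessorProxy from the same 0.94-era API (the row key row1 and the class name SingleRegionSum are made up for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class SingleRegionSum {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "member4");
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("salecount"));
        scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("salemoney"));
        // The proxy is bound to whichever region contains "row1"; the call
        // below executes getMutiSum on that one region only.
        MyCoprocessorProtocol proxy = table.coprocessorProxy(
                MyCoprocessorProtocol.class, Bytes.toBytes("row1"));
        MyMutiSum oneRegion = proxy.getMutiSum(
                new String[] { "salecount", "salemoney" }, scan);
        System.out.println(oneRegion.getSum(0) + ", " + oneRegion.getSum(1));
        table.close();
    }
}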
The endpoint implementation extends BaseEndpointCoprocessor and executes the protocol method inside each region:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.BaseEndpointCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.regionserver.InternalScanner;
import org.apache.hadoop.hbase.util.Bytes;

public class MyEndpointImpl extends BaseEndpointCoprocessor implements
        MyCoprocessorProtocol {

    protected static Log log = LogFactory.getLog(MyEndpointImpl.class);

    @Override
    public MyMutiSum getMutiSum(String[] columns, Scan scan) throws IOException {
        MyMutiSum result = new MyMutiSum(columns.length);
        InternalScanner scanner = ((RegionCoprocessorEnvironment) getEnvironment())
                .getRegion().getScanner(scan);
        List<KeyValue> keyValues = new ArrayList<KeyValue>();
        try {
            boolean hasMoreRows = false;
            do {
                // Fetch the next row's KeyValues, then walk the requested columns.
                hasMoreRows = scanner.next(keyValues);
                for (int i = 0; i < columns.length; i++) {
                    String column = columns[i];
                    for (KeyValue kv : keyValues) {
                        if (column.equals(Bytes.toString(kv.getQualifier()))) {
                            byte[] value = kv.getValue();
                            if (value != null && value.length > 0) {
                                // Accumulate this row's value onto the column's
                                // running sum.
                                result.setSum(i, result.getSum(i) + Bytes.toLong(value));
                            }
                            break;
                        }
                    }
                }
                keyValues.clear();
            } while (hasMoreRows);
        } finally {
            scanner.close();
        }
        log.debug("Sum from this region is "
                + ((RegionCoprocessorEnvironment) getEnvironment()).getRegion()
                        .getRegionNameAsString() + ": ");
        for (int i = 0; i < columns.length; i++) {
            log.debug(columns[i] + " " + result.getSum(i));
        }
        // Return the per-region, per-column sums as the custom Writable.
        return result;
    }
}
The client-side helper fans the call out to the regions and merges their results:

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.coprocessor.Batch;
import org.apache.hadoop.hbase.util.Bytes;

public class MyEndpointClient {

    protected static Log log = LogFactory.getLog(MyEndpointClient.class);

    private Configuration conf;

    public MyEndpointClient(Configuration conf) {
        this.conf = conf;
    }

    public MyMutiSum mutiSum(String tableName, String cf, final String[] columns,
            final Scan scan) throws Throwable {

        class MutiSumCallBack implements Batch.Callback<MyMutiSum> {
            MyMutiSum sumVal = null;

            public MyMutiSum getSumResult() {
                return sumVal;
            }

            @Override
            public void update(byte[] region, byte[] row, MyMutiSum result) {
                sumVal = add(sumVal, result);
            }

            public MyMutiSum add(MyMutiSum l1, MyMutiSum l2) {
                if (l1 == null ^ l2 == null) {
                    return (l1 == null) ? l2 : l1; // exactly one is null
                } else if (l1 == null) { // both are null
                    return null;
                }
                MyMutiSum mutiSum = new MyMutiSum(columns.length);
                for (int i = 0; i < columns.length; i++) {
                    mutiSum.setSum(i, l1.getSum(i) + l2.getSum(i));
                }
                return mutiSum;
            }
        }

        MutiSumCallBack sumCallBack = new MutiSumCallBack();
        HTable table = null;
        for (int i = 0; i < columns.length; i++) {
            scan.addColumn(Bytes.toBytes(cf), Bytes.toBytes(columns[i]));
        }
        try {
            table = new HTable(conf, tableName);
            // The start and stop rows determine which regions (the RPC servers)
            // are contacted, so this call issues one RPC per region in the
            // startRow~stopRow range. As each RPC returns, sumCallBack.update()
            // folds that region's result into sumVal.
            table.coprocessorExec(MyCoprocessorProtocol.class, scan.getStartRow(),
                    scan.getStopRow(),
                    new Batch.Call<MyCoprocessorProtocol, MyMutiSum>() {
                        @Override
                        public MyMutiSum call(MyCoprocessorProtocol instance)
                                throws IOException {
                            // instance.getMutiSum becomes an RPC to the
                            // MyCoprocessorProtocol implementation loaded on the region.
                            return instance.getMutiSum(columns, scan);
                        }
                    }, sumCallBack);
        } finally {
            if (table != null) {
                table.close();
            }
        }
        // sumVal now holds the results of all regions, accumulated via update().
        return sumCallBack.getSumResult();
    }
}
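Because the start and stop rows decide which regions are contacted, the same helper also works for partial-range sums. A hypothetical ranged call (the class name RangedSum and the row keys row100/row200 are made up for illustration):

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class RangedSum {
    public static void main(String[] args) throws Throwable {
        MyEndpointClient client = new MyEndpointClient(HBaseConfiguration.create());
        // Only regions overlapping [row100, row200) receive an RPC;
        // mutiSum adds the column qualifiers to the scan itself.
        Scan scan = new Scan();
        scan.setStartRow(Bytes.toBytes("row100"));
        scan.setStopRow(Bytes.toBytes("row200"));
        MyMutiSum partial = client.mutiSum("member4", "info",
                new String[] { "salecount", "salemoney" }, scan);
        System.out.println(partial.getSum(0) + ", " + partial.getSum(1));
    }
}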
Next, attach this custom coprocessor to the member4 table, through the API:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.util.Bytes;

public class SetCoprocessor {

    public static void main(String[] args) throws MasterNotRunningException,
            Exception {
        byte[] tableName = Bytes.toBytes("member4");
        Configuration conf = HBaseConfiguration.create();
        HBaseAdmin admin = new HBaseAdmin(conf);
        // The table must be disabled before its descriptor can be modified.
        admin.disableTable(tableName);
        HTableDescriptor htd = admin.getTableDescriptor(tableName);
        htd.addCoprocessor("com.besttone.coprocessor.MyEndpointImpl",
                new Path("hdfs://master24:9000/user/hadoop/jars/MutiSum.jar"),
                1001, null);
        admin.modifyTable(tableName, htd);
        admin.enableTable(tableName);
        admin.close();
    }
}
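To confirm the endpoint was registered, a couple of lines could be added before admin.close(). This assumes HTableDescriptor.hasCoprocessor, which checks a class name against the table descriptor, and that the modifyTable alter has finished applying:

        // Should print true once the descriptor update has taken effect.
        HTableDescriptor check = admin.getTableDescriptor(tableName);
        System.out.println(check.hasCoprocessor("com.besttone.coprocessor.MyEndpointImpl"));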
Finally, the test client. The test table member4 has two columns in its info column family, salecount (units sold) and salemoney (sales revenue); the custom coprocessor returns both totals in a single call:

public static void main(String[] args) throws Throwable {
    final String[] columns = new String[] { "salecount", "salemoney" };
    Configuration conf = HBaseConfiguration.create();
    final Scan scan = new Scan();
    scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("salecount"));
    scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("salemoney"));
    MyEndpointClient client = new MyEndpointClient(conf);
    MyMutiSum mutiSum = client.mutiSum("member4", "info", columns, scan);
    for (int i = 0; i < columns.length; i++) {
        System.out.println(columns[i] + " sum is :" + mutiSum.getSum(i));
    }
}