Writing to HBase from a Hive UDF

I recently ran into a new requirement: results produced by a Hive script had to be stored in HBase. I wrote a generic Hive UDF to cover this; the full code is below:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Copyright (C), 2015,
 *
 * @author
 * @version 0.0.1
 * @desc Load Hive data into HBase. Register with: CREATE TEMPORARY FUNCTION hive2HBase as 'xxx.etl.hive2hbase.UDFHbaseMerge';
 * @date 2/23/16
 */
@Description(name = "hive2HBase", value = "FUNC(zookeeperQuorum, hbaseTable, CF, rowKey, c1, c2, c3, …) - reads rows from hive and writes them into HBase, "
        + "returns success of the import.", extended = "The first argument is zookeeperQuorum, "
        + "the second argument is the hbase table, "
        + "the third argument is the CF, "
        + "the fourth argument is the rowKey, "
        + "the fifth argument is the column-name list, separated by ',', and the remaining arguments are the corresponding values. "
        + "example: select FUNC('zookeeperQuorum', 'tableName', 'columnFamily', key, 'columnName1,columnName2', columnName1value, columnName2value) from dual;")
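// deterministic = false: this UDF has side effects (HBase writes), so Hive must
// evaluate it for every row instead of caching or collapsing duplicate calls.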
@UDFType(deterministic = false)
public class UDFHbaseMerge extends GenericUDF {

    private static final Logger logger = LoggerFactory.getLogger(UDFHbaseMerge.class);

    // ObjectInspectors for the input arguments
    protected transient ObjectInspector[] argumentOI;
    protected transient String hbaseTable;
    protected BufferedMutator mutator;
    protected Connection connection;
    // Column family and column names, parsed once from the first row
    // (instance fields, so concurrent UDF instances do not clobber each other).
    protected String cf = "F";
    protected String[] cols;
    protected final static String NULL_FLAG = "";
    protected final Text result = new Text();
    protected String zookeeperQuorum;

    @Override
    public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
        argumentOI = objectInspectors;

        // Validate that the first three arguments (zookeeperQuorum, hbaseTable, CF) are strings;
        // otherwise throw.
        for (int i = 0; i < 3; i++) {
            ObjectInspector oi = objectInspectors[i];
            if (oi.getCategory() != ObjectInspector.Category.PRIMITIVE
                    || ((PrimitiveObjectInspector) oi).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
                throw new UDFArgumentTypeException(i, "The argument of function should be \"" + serdeConstants.STRING_TYPE_NAME
                        + "\", but \"" + oi.getTypeName() + "\" is found");
            }
        }
        // Validate that the fourth and subsequent arguments (rowKey, column list, values) are primitive types;
        // otherwise throw.
        for (int i = 3; i < objectInspectors.length; i++) {
            if (objectInspectors[i].getCategory() != ObjectInspector.Category.PRIMITIVE) {
                throw new UDFArgumentTypeException(i, "The argument of function should be primitive"
                        + ", but \"" + objectInspectors[i].getTypeName() + "\" is found");
            }
        }

        // The return type is a string reporting success or the error.
        return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
    }

    @Override
    public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
        try {
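            // Lazily set up the HBase connection on the first row; the quorum, table,
            // column family, and column-name list are read once and assumed constant.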
            if (mutator == null) {
                zookeeperQuorum = getDeferredObject(deferredObjects, 0);
                hbaseTable = getDeferredObject(deferredObjects, 1);
                cf = getDeferredObject(deferredObjects, 2);
                cols = getDeferredObject(deferredObjects, 4).split(",");
                Configuration conf = HBaseConfiguration.create();
                conf.set("hbase.zookeeper.quorum", zookeeperQuorum);
                conf.set("hbase.zookeeper.property.clientPort", "2181");
                conf.set("mapred.task.timeout", "3600000");
                conf.set("dfs.socket.timeout", "3600000");
                conf.set("dfs.datanode.socket.write.timeout", "3600000");

                connection = ConnectionFactory.createConnection(conf);
                mutator = connection.getBufferedMutator(TableName.valueOf(hbaseTable));
            }
            Put put = getPut(deferredObjects);
            try {
                mutator.mutate(put);
            } catch (IOException e) {
                logger.error(Bytes.toString(mutator.getName().getName()) + "  put error " + e.getMessage());
            }
            result.set("success");
        } catch (Exception ex) {
            logger.error("evaluate failed", ex);
            result.set(ex.toString());
            this.close();
        }
        return result;
    }

    @Override
    public String getDisplayString(String[] children) {
        StringBuilder sb = new StringBuilder();
        sb.append("hive2HBase(");
        if (children.length > 0) {
            sb.append(children[0]);
            for (int i = 1; i < children.length; i++) {
                sb.append(",");
                sb.append(children[i]);
            }
        }
        sb.append(")");
        return sb.toString();
    }

    // Read argument `index` as a string, mapping SQL NULL to the empty string.
    protected String getDeferredObject(DeferredObject[] arguments, int index) throws HiveException {
        if (arguments[index].get() == null) {
            return NULL_FLAG;
        }
        return ((PrimitiveObjectInspector) argumentOI[index]).getPrimitiveJavaObject(arguments[index].get()).toString();
    }

    // Build the Put for the current row: argument 3 is the row key, arguments 5+ are the column values.
    protected Put getPut(DeferredObject[] arguments) throws Exception {
        String rowKey = getDeferredObject(arguments, 3);
        Put put = new Put(Bytes.toBytes(rowKey));
        for (int i = 0; i < cols.length; i++) {
            put.addColumn(Bytes.toBytes(cf), Bytes.toBytes(cols[i]), Bytes.toBytes(getDeferredObject(arguments, i + 5)));
        }
        return put;
    }

    @Override
    public void close() {
        try {
            super.close();
            // Flush any buffered mutations before releasing the connection.
            if (mutator != null) {
                mutator.flush();
                mutator.close();
                connection.close();
            }
        } catch (Exception e) {
            // Avoid dereferencing mutator here: it may still be null if initialization failed.
            logger.error(hbaseTable + " close error " + e.getMessage());
        }
    }

}
This UDF extends GenericUDF and implements initialize, evaluate, and the other required methods; each call to evaluate builds a Put and hands it to the BufferedMutator.
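A note on tuning: getBufferedMutator(TableName) uses the client's default write buffer (2 MB, from hbase.client.write.buffer). If rows are small and throughput matters, a larger buffer can be requested explicitly. A minimal sketch, assuming the HBase 1.x client API; the 8 MB value is only an illustrative assumption, not a measured optimum:

// Instead of connection.getBufferedMutator(TableName.valueOf(hbaseTable)):
BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf(hbaseTable))
        .writeBufferSize(8 * 1024 * 1024); // flush once ~8 MB of Puts have accumulated
mutator = connection.getBufferedMutator(params);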
Usage is as follows:
set mapred.reduce.tasks=20;
add jar /home/xxx/xxx/hive2hbase/xxx-etl-0.0.1.jar;
CREATE TEMPORARY FUNCTION hive2HBase as 'xxx.etl.hive2hbase.UDFHbaseMerge';
Create the target table in HBase:
create 'mobile_nature', {NAME => 'n', VERSIONS => 1, COMPRESSION => 'SNAPPY'}, {NUMREGIONS => 3, SPLITALGO => 'HexStringSplit'}
drop table if exists hive2hbase_tmp;
create table hive2hbase_tmp as select hive2HBase('zookeeper Quorum', 'mobile_nature', 'n', mobile, 'n1,n2', 11_, 58_) from mobile_nature;
drop table if exists hive2hbase_tmp;
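The create table ... as select wrapper is only there to force Hive to materialize the query so the UDF runs for every row; the temporary table itself holds nothing useful and is dropped right away. To spot-check the import afterwards, a quick scan from the HBase shell (the LIMIT is just illustrative):
scan 'mobile_nature', {LIMIT => 5}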

This UDF uses the new BufferedMutator API; with the old Table API throughput drops dramatically. Below are measured results against a Hive table of roughly 6 million rows:

                  old API                         new API
read requests     10k-20k/s                       3k-4k/s
write requests    4k/s                            40k/s
elapsed time      40 min                          4 min
cluster impact    puts the cluster at some risk   no impact on the cluster
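For comparison, the old-API version simply replaces the mutator with a per-row Table.put. A minimal sketch of the difference, assuming the same connection setup as in the UDF above:

// Old API: every put() is its own RPC round trip per row,
// which is what drove the 40-minute run above.
Table table = connection.getTable(TableName.valueOf(hbaseTable));
table.put(put);
table.close();

// New API: Puts accumulate in a client-side buffer and are shipped in batches,
// sent when the write buffer fills or on flush()/close().
BufferedMutator mutator = connection.getBufferedMutator(TableName.valueOf(hbaseTable));
mutator.mutate(put);
mutator.flush();
mutator.close();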

The results show that the new batched API is the one to use; our read requests will not reach 3k/s.


Reference: http://blog.csdn.net/zzuiezhangqihui/article/details/47259465
