Hive UDF Tutorial (Part 1)
Hive UDF Tutorial (Part 2)
Hive UDF Tutorial (Part 3)
The previous two parts covered basic UDFs and UDTFs. This part turns to the most involved of the three: the user-defined aggregate function (UDAF). A UDAF takes zero or more columns across zero or more rows and returns a single value, just like the built-in sum() and count(). To implement one, we extend AbstractGenericUDAFResolver and override its getEvaluator() method:
public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException;
getEvaluator() returns an evaluator that extends GenericUDAFEvaluator; the evaluator does the actual aggregation work through the following methods:

    // Both input and output are described by ObjectInspectors.
    public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException;

    // The AggregationBuffer holds the intermediate result of the aggregation.
    abstract AggregationBuffer getNewAggregationBuffer() throws HiveException;

    // Reset the AggregationBuffer so it can be reused.
    public void reset(AggregationBuffer agg) throws HiveException;

    // Process one input row.
    public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException;

    // Return the partial aggregation of the rows processed so far.
    public Object terminatePartial(AggregationBuffer agg) throws HiveException;

    // Merge a partial aggregation into the current buffer.
    public void merge(AggregationBuffer agg, Object partial) throws HiveException;

    // Return the final result.
    public Object terminate(AggregationBuffer agg) throws HiveException;
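The Mode argument passed to init() tells the evaluator which stage of the MapReduce pipeline it is running in: PARTIAL1 (map side: iterate() then terminatePartial()), PARTIAL2 (combiner: merge() then terminatePartial()), FINAL (reduce side: merge() then terminate()), and COMPLETE (map-only: iterate() then terminate()). To see how these methods fit together before the full example below, here is a minimal, untested sketch of an evaluator that sums double values; the class and field names (SumDoubleEvaluator, SumBuffer) are illustrative and not part of this tutorial's code, and the resolver that would return it is omitted:

    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;

    @SuppressWarnings("deprecation")
    public class SumDoubleEvaluator extends GenericUDAFEvaluator {

        private PrimitiveObjectInspector inputOI;

        // Holds the running sum between calls.
        static class SumBuffer implements AggregationBuffer {
            double sum;
            boolean empty;
        }

        @Override
        public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
            super.init(m, parameters);
            // PARTIAL1/COMPLETE: parameters[0] describes the original column.
            // PARTIAL2/FINAL: parameters[0] describes the partial sum from terminatePartial().
            // Either way it is a single primitive value here.
            inputOI = (PrimitiveObjectInspector) parameters[0];
            // Partial and final results are both plain doubles.
            return PrimitiveObjectInspectorFactory.javaDoubleObjectInspector;
        }

        @Override
        public AggregationBuffer getNewAggregationBuffer() throws HiveException {
            SumBuffer buf = new SumBuffer();
            reset(buf);
            return buf;
        }

        @Override
        public void reset(AggregationBuffer agg) throws HiveException {
            SumBuffer buf = (SumBuffer) agg;
            buf.sum = 0;
            buf.empty = true;
        }

        @Override
        public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
            // Called once per input row (PARTIAL1/COMPLETE).
            if (parameters[0] != null) {
                merge(agg, parameters[0]);
            }
        }

        @Override
        public Object terminatePartial(AggregationBuffer agg) throws HiveException {
            // The partial result has the same shape as the final one here.
            return terminate(agg);
        }

        @Override
        public void merge(AggregationBuffer agg, Object partial) throws HiveException {
            // Called with partial sums (PARTIAL2/FINAL); also reused by iterate() above.
            if (partial != null) {
                SumBuffer buf = (SumBuffer) agg;
                buf.sum += PrimitiveObjectInspectorUtils.getDouble(partial, inputOI);
                buf.empty = false;
            }
        }

        @Override
        public Object terminate(AggregationBuffer agg) throws HiveException {
            SumBuffer buf = (SumBuffer) agg;
            return buf.empty ? null : Double.valueOf(buf.sum);
        }
    }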
Now for a real example: a collect() function that gathers the values of a column into a list. Combined with concat_ws(), it reproduces the behavior of MySQL's group_concat(). The code is as follows:
    package edu.wzm.hive.udaf;

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.hive.ql.exec.Description;
    import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.parse.SemanticException;
    import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
    import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

    @Description(
        name = "collect",
        value = "_FUNC_(col) - The parameter is a column name. "
              + "The return value is a set of the column.",
        extended = "Example:\n"
                 + " > SELECT _FUNC_(col) from src;"
    )
    public class GenericUDAFCollect extends AbstractGenericUDAFResolver {

        private static final Log LOG = LogFactory.getLog(GenericUDAFCollect.class.getName());

        public GenericUDAFCollect() {
        }

        @Override
        public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
            // Accept exactly one argument, and it must be a primitive type.
            if (parameters.length != 1) {
                throw new UDFArgumentTypeException(parameters.length - 1,
                    "Exactly one argument is expected.");
            }
            if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
                throw new UDFArgumentTypeException(0,
                    "Only primitive type arguments are accepted but "
                    + parameters[0].getTypeName() + " was passed as parameter 1.");
            }
            return new GenericUDAFCollectEvaluator();
        }

        @SuppressWarnings("deprecation")
        public static class GenericUDAFCollectEvaluator extends GenericUDAFEvaluator {

            private PrimitiveObjectInspector inputOI;
            private StandardListObjectInspector internalMergeOI;
            private StandardListObjectInspector loi;

            @Override
            public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
                super.init(m, parameters);
                if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) {
                    // Input is the original column; output is a list of its values.
                    inputOI = (PrimitiveObjectInspector) parameters[0];
                    return ObjectInspectorFactory.getStandardListObjectInspector(
                        (PrimitiveObjectInspector) ObjectInspectorUtils
                            .getStandardObjectInspector(inputOI));
                }
                else if (m == Mode.PARTIAL2 || m == Mode.FINAL) {
                    // Input is a partial list produced by terminatePartial(); output is again a list.
                    internalMergeOI = (StandardListObjectInspector) parameters[0];
                    inputOI = (PrimitiveObjectInspector) internalMergeOI.getListElementObjectInspector();
                    loi = ObjectInspectorFactory.getStandardListObjectInspector(inputOI);
                    return loi;
                }
                return null;
            }

            // The buffer simply accumulates the values seen so far.
            static class ArrayAggregationBuffer implements AggregationBuffer {
                List<Object> container;
            }

            @Override
            public AggregationBuffer getNewAggregationBuffer() throws HiveException {
                ArrayAggregationBuffer ret = new ArrayAggregationBuffer();
                reset(ret);
                return ret;
            }

            @Override
            public void reset(AggregationBuffer agg) throws HiveException {
                ((ArrayAggregationBuffer) agg).container = new ArrayList<Object>();
            }

            @Override
            public void iterate(AggregationBuffer agg, Object[] param) throws HiveException {
                Object p = param[0];
                if (p != null) {
                    putIntoList(p, (ArrayAggregationBuffer) agg);
                }
            }

            @Override
            public void merge(AggregationBuffer agg, Object partial) throws HiveException {
                ArrayAggregationBuffer myAgg = (ArrayAggregationBuffer) agg;
                ArrayList<Object> partialResult = (ArrayList<Object>) this.internalMergeOI.getList(partial);
                for (Object obj : partialResult) {
                    putIntoList(obj, myAgg);
                }
            }

            @Override
            public Object terminate(AggregationBuffer agg) throws HiveException {
                ArrayAggregationBuffer myAgg = (ArrayAggregationBuffer) agg;
                ArrayList<Object> list = new ArrayList<Object>();
                list.addAll(myAgg.container);
                return list;
            }

            @Override
            public Object terminatePartial(AggregationBuffer agg) throws HiveException {
                ArrayAggregationBuffer myAgg = (ArrayAggregationBuffer) agg;
                ArrayList<Object> list = new ArrayList<Object>();
                list.addAll(myAgg.container);
                return list;
            }

            // Copy the value out of its ObjectInspector-backed form before storing it.
            public void putIntoList(Object param, ArrayAggregationBuffer myAgg) {
                Object pCopy = ObjectInspectorUtils.copyToStandardObject(param, this.inputOI);
                myAgg.container.add(pCopy);
            }
        }
    }
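Before packaging the jar and registering it in Hive, it can help to drive the evaluator by hand from a plain main() method or unit test. Below is a rough, untested sketch that exercises the COMPLETE code path (iterate() straight into terminate()); it assumes GenericUDAFCollect lives in the edu.wzm.hive.udaf package used when the function is registered below, and the test class name is made up:

    import java.util.Arrays;

    import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

    import edu.wzm.hive.udaf.GenericUDAFCollect.GenericUDAFCollectEvaluator;

    public class CollectEvaluatorSmokeTest {

        public static void main(String[] args) throws Exception {
            GenericUDAFCollectEvaluator eval = new GenericUDAFCollectEvaluator();
            ObjectInspector stringOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector;

            // COMPLETE mode: original rows in, final result out, no partial aggregation.
            eval.init(GenericUDAFEvaluator.Mode.COMPLETE, new ObjectInspector[]{stringOI});

            AggregationBuffer buf = eval.getNewAggregationBuffer();
            for (String name : Arrays.asList("John Doe", "Mary Smith", "Todd Jones")) {
                eval.iterate(buf, new Object[]{name});
            }

            // Expected output: [John Doe, Mary Smith, Todd Jones]
            System.out.println(eval.terminate(buf));
        }
    }

With that sanity check out of the way, the function can be registered and run in Hive as shown next.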
    hive (mydb)> ADD jar /root/experiment/hive/hive-0.0.1-SNAPSHOT.jar;
    hive (mydb)> CREATE TEMPORARY FUNCTION collect AS "edu.wzm.hive.udaf.GenericUDAFCollect";
    hive (mydb)> SELECT collect(name) FROM employee;
    Query ID = root_20160117221111_c8b88dc9-170c-4957-b665-15b99eb9655a
    Total jobs = 1
    Launching Job 1 out of 1
    Number of reduce tasks determined at compile time: 1
    In order to change the average load for a reducer (in bytes):
      set hive.exec.reducers.bytes.per.reducer=<number>
    In order to limit the maximum number of reducers:
      set hive.exec.reducers.max=<number>
    In order to set a constant number of reducers:
      set mapreduce.job.reduces=<number>
    Starting Job = job_1453096763931_0001, Tracking URL = http://master:8088/proxy/application_1453096763931_0001/
    Kill Command = /root/install/hadoop-2.4.1/bin/hadoop job -kill job_1453096763931_0001
    Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
    2016-01-17 22:11:49,360 Stage-1 map = 0%, reduce = 0%
    2016-01-17 22:12:01,388 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.76 sec
    2016-01-17 22:12:16,830 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 2.95 sec
    MapReduce Total cumulative CPU time: 2 seconds 950 msec
    Ended Job = job_1453096763931_0001
    MapReduce Jobs Launched:
    Stage-Stage-1: Map: 1  Reduce: 1  Cumulative CPU: 2.95 sec  HDFS Read: 1040  HDFS Write: 80  SUCCESS
    Total MapReduce CPU Time Spent: 2 seconds 950 msec
    OK
    ["John Doe","Mary Smith","Todd Jones","Bill King","Boss Man","Fred Finance","Stacy Accountant"]
    Time taken: 44.302 seconds, Fetched: 1 row(s)
    hive (mydb)> SELECT salary,concat_ws(',', collect(name)) FROM employee GROUP BY salary;
    Query ID = root_20160117222121_dedd4981-e050-4aac-81cb-c449639c721b
    Total jobs = 1
    Launching Job 1 out of 1
    Number of reduce tasks not specified. Estimated from input data size: 1
    In order to change the average load for a reducer (in bytes):
      set hive.exec.reducers.bytes.per.reducer=<number>
    In order to limit the maximum number of reducers:
      set hive.exec.reducers.max=<number>
    In order to set a constant number of reducers:
      set mapreduce.job.reduces=<number>
    Starting Job = job_1453096763931_0003, Tracking URL = http://master:8088/proxy/application_1453096763931_0003/
    Kill Command = /root/install/hadoop-2.4.1/bin/hadoop job -kill job_1453096763931_0003
    Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
    2016-01-17 22:21:59,627 Stage-1 map = 0%, reduce = 0%
    2016-01-17 22:22:07,207 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.2 sec
    2016-01-17 22:22:14,700 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 2.8 sec
    MapReduce Total cumulative CPU time: 2 seconds 800 msec
    Ended Job = job_1453096763931_0003
    MapReduce Jobs Launched:
    Stage-Stage-1: Map: 1  Reduce: 1  Cumulative CPU: 2.8 sec  HDFS Read: 1040  HDFS Write: 131  SUCCESS
    Total MapReduce CPU Time Spent: 2 seconds 800 msec
    OK
    60000.0     Bill King,Stacy Accountant
    70000.0     Todd Jones
    80000.0     Mary Smith
    100000.0    John Doe
    150000.0    Fred Finance
    200000.0    Boss Man
    Time taken: 24.928 seconds, Fetched: 6 row(s)
To recap, implementing a UDAF boils down to a handful of methods: getEvaluator() on the resolver side, plus init(), getNewAggregationBuffer(), reset(), iterate(), terminatePartial(), merge(), and terminate() on the evaluator side.
The full source code is hosted on GitHub: https://github.com/GatsbyNewton/hive_udf