I recently got a taste of how powerful Hive's UDF mechanism is: besides the many UDFs that already ship with Hive, you can define your own to fit your business scenarios. Below I describe how to write UDF/UDAF/UDTF functions, as a beginner's introduction.
First, you need to create a new class that extends UDF, with one or more methods named evaluate.
package com.example.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public final class Lower extends UDF {
  public Text evaluate(final Text s) {
    if (s == null) {
      return null;
    }
    return new Text(s.toString().toLowerCase());
  }
}
After compiling your code into a jar, you need to add it to the Hive classpath:
add jar my_jar.jar;
Once Hive is started up with your jar in the classpath, the final step is to register your function:
create temporary function my_lower as 'com.example.hive.udf.Lower';
The steps above describe the whole process of implementing a UDF: first implement the UDF class itself, then compile it into a jar and add it to Hive's classpath, and finally register a temporary function name so it can be called from Hive queries.
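Having registered my_lower, you can verify it directly from a query. A minimal sketch, assuming a hypothetical table src with a string column title (the table and column names are illustrative, not part of the walkthrough above):

-- Assumes a table `src` with a string column `title` (hypothetical names).
select my_lower(title) from src;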
A few examples:
package org.apache.hadoop.hive.contrib.udf.example;

import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * UDFExampleAdd.
 */
public class UDFExampleAdd extends UDF {

  public Integer evaluate(Integer... a) {
    int total = 0;
    for (Integer element : a) {
      if (element != null) {
        total += element;
      }
    }
    return total;
  }

  public Double evaluate(Double... a) {
    double total = 0;
    for (Double element : a) {
      if (element != null) {
        total += element;
      }
    }
    return total;
  }
}
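Registration and use follow the same pattern as my_lower. Note that evaluate is overloaded: Hive picks the Integer or Double version based on the argument types at the call site. A minimal sketch, where the jar name, the function name example_add, and the table src are my own illustrative choices:

add jar udf_example.jar;
create temporary function example_add as 'org.apache.hadoop.hive.contrib.udf.example.UDFExampleAdd';
select example_add(1, 2, 3) from src;        -- Integer overload, returns 6
select example_add(1.1, 2.2, 3.3) from src;  -- Double overload, returns ~6.6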
package org.apache.hadoop.hive.contrib.udaf.example;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;

/**
 * This is a simple UDAF that calculates average.
 *
 * It should be very easy to follow and can be used as an example for writing
 * new UDAFs.
 *
 * Note that Hive internally uses a different mechanism (called GenericUDAF) to
 * implement built-in aggregation functions, which are harder to program but
 * more efficient.
 */
public final class UDAFExampleAvg extends UDAF {

  /**
   * The internal state of an aggregation for average.
   *
   * Note that this is only needed if the internal state cannot be represented
   * by a primitive.
   *
   * The internal state can also contain fields with types like
   * ArrayList<String> and HashMap<String,Double> if needed.
   */
  public static class UDAFAvgState {
    private long mCount;
    private double mSum;
  }

  /**
   * The actual class for doing the aggregation. Hive will automatically look
   * for all internal classes of the UDAF that implement UDAFEvaluator.
   */
  public static class UDAFExampleAvgEvaluator implements UDAFEvaluator {

    UDAFAvgState state;

    public UDAFExampleAvgEvaluator() {
      super();
      state = new UDAFAvgState();
      init();
    }

    /**
     * Reset the state of the aggregation.
     */
    public void init() {
      state.mSum = 0;
      state.mCount = 0;
    }

    /**
     * Iterate through one row of original data.
     *
     * The number and type of arguments need to be the same as when we call
     * this UDAF from the Hive command line.
     *
     * This function should always return true.
     */
    public boolean iterate(Double o) {
      if (o != null) {
        state.mSum += o;
        state.mCount++;
      }
      return true;
    }

    /**
     * Terminate a partial aggregation and return the state. If the state is a
     * primitive, just return primitive Java classes like Integer or String.
     */
    public UDAFAvgState terminatePartial() {
      // This is the SQL standard - the average of zero items should be null.
      return state.mCount == 0 ? null : state;
    }

    /**
     * Merge with a partial aggregation.
     *
     * This function should always have a single argument, which has the same
     * type as the return value of terminatePartial().
     */
    public boolean merge(UDAFAvgState o) {
      if (o != null) {
        state.mSum += o.mSum;
        state.mCount += o.mCount;
      }
      return true;
    }

    /**
     * Terminates the aggregation and returns the final result.
     */
    public Double terminate() {
      // This is the SQL standard - the average of zero items should be null.
      return state.mCount == 0 ? null : Double.valueOf(state.mSum / state.mCount);
    }
  }

  private UDAFExampleAvg() {
    // prevent instantiation
  }
}
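Calling a UDAF looks like calling any built-in aggregate; Hive drives the evaluator's init/iterate/terminatePartial/merge/terminate methods across the map and reduce sides. A minimal sketch, again with hypothetical table and column names:

create temporary function example_avg as 'org.apache.hadoop.hive.contrib.udaf.example.UDAFExampleAvg';
-- One row per group; a group with zero non-null values yields NULL, per the SQL standard.
select dept, example_avg(salary) from employees group by dept;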
package org.apache.hadoop.hive.contrib.udtf.example;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

/**
 * GenericUDTFExplode2.
 */
@Description(name = "explode2", value = "_FUNC_(a) - like explode, but outputs two identical columns (for testing purposes)")
public class GenericUDTFExplode2 extends GenericUDTF {

  ListObjectInspector listOI = null;

  @Override
  public void close() throws HiveException {
  }

  @Override
  public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
    if (args.length != 1) {
      throw new UDFArgumentException("explode() takes only one argument");
    }
    if (args[0].getCategory() != ObjectInspector.Category.LIST) {
      throw new UDFArgumentException("explode() takes an array as a parameter");
    }
    listOI = (ListObjectInspector) args[0];

    // Describe the output struct: two columns, both typed as the element type of the input array.
    ArrayList<String> fieldNames = new ArrayList<String>();
    ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
    fieldNames.add("col1");
    fieldNames.add("col2");
    fieldOIs.add(listOI.getListElementObjectInspector());
    fieldOIs.add(listOI.getListElementObjectInspector());
    return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
  }

  Object[] forwardObj = new Object[2];

  @Override
  public void process(Object[] o) throws HiveException {
    List<?> list = listOI.getList(o[0]);
    for (Object r : list) {
      // Emit one output row per array element, duplicating the element into both columns.
      forwardObj[0] = r;
      forwardObj[1] = r;
      forward(forwardObj);
    }
  }

  @Override
  public String toString() {
    return "explode";
  }
}
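A UDTF is typically invoked via LATERAL VIEW, which joins each generated row back to the source row. A minimal sketch using explode2 (the table src and the alias names are illustrative):

create temporary function explode2 as 'org.apache.hadoop.hive.contrib.udtf.example.GenericUDTFExplode2';
-- Each array element produces one row with two identical columns.
select t.col1, t.col2 from src lateral view explode2(array(1, 2, 3)) t as col1, col2;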
Ref: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF