Hive can run user-defined functions to process data; you can use show functions to list the functions Hive currently supports.
Hive supports three types of user-defined functions:
A UDF processes a single row and returns a single value (one row in, one value out); a UDAF aggregates values across many rows into a single result (many rows in, one value out); a UDTF takes a single row and generates multiple output rows and columns (one row in, many rows out).
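Before writing your own function, it helps to check what is already built in. A minimal sketch using standard Hive commands (the built-in ceil is used purely for illustration):

show functions;                      -- list every registered function
describe function ceil;              -- short description of one function
describe function extended ceil;     -- description plus usage examples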
UDF example
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
/**
 * @program: udf-function
 * @description: Returns the smallest integer that is not less than the input value.
 * @author zhangchenguang
 *
 */
public class CeilUDF extends UDF {
    public Long evaluate(String value) {
        try {
            if (StringUtils.isNotBlank(value)) {
                // Parse the input, round it up, and return the result as a Long
                double resultDouble = Double.parseDouble(value);
                Double resultCeil = Math.ceil(resultDouble);
                return resultCeil.longValue();
            } else {
                return null;
            }
        } catch (Exception e) {
            // Unparseable input yields NULL instead of failing the query
            return null;
        }
    }
}
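Once the class is packaged into a jar, it can be registered and called from HiveQL roughly as follows (the jar path and the function name my_ceil are placeholders; the class has no package, so the bare class name is used):

add jar /path/to/udf-function.jar;               -- placeholder path
create temporary function my_ceil as 'CeilUDF';
select my_ceil('3.14');                          -- returns 4
select my_ceil('abc');                           -- returns NULL (parse failure is swallowed)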
UDAF example:
import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
/**
 * UDAF implementing an enhanced MIN function.
 * Each nested evaluator handles one input type; Hive picks the matching
 * evaluator by reflection based on the argument type.
 * @author zhangchenguang
 *
 */
public class MinUDF extends UDAF {

    /** Evaluator for SMALLINT (short) inputs. */
    static public class MinShortEvaluator implements UDAFEvaluator {
        private short mMin;
        private boolean mEmpty;

        public MinShortEvaluator() {
            super();
            init();
        }

        // Reset the aggregation state
        public void init() {
            mMin = 0;
            mEmpty = true;
        }

        // Process one input row
        public boolean iterate(ShortWritable o) {
            if (o != null) {
                if (mEmpty) {
                    mMin = o.get();
                    mEmpty = false;
                } else {
                    mMin = (short) Math.min(mMin, o.get());
                }
            }
            return true;
        }

        // Return the partial aggregation state
        public ShortWritable terminatePartial() {
            return mEmpty ? null : new ShortWritable(mMin);
        }

        // Merge a partial result from another task
        public boolean merge(ShortWritable o) {
            return iterate(o);
        }

        // Return the final result
        public ShortWritable terminate() {
            return mEmpty ? null : new ShortWritable(mMin);
        }
    }

    /** Evaluator for INT inputs. */
    static public class MinIntEvaluator implements UDAFEvaluator {
        private int mMin;
        private boolean mEmpty;

        public MinIntEvaluator() {
            super();
            init();
        }

        public void init() {
            mMin = 0;
            mEmpty = true;
        }

        public boolean iterate(IntWritable o) {
            if (o != null) {
                if (mEmpty) {
                    mMin = o.get();
                    mEmpty = false;
                } else {
                    mMin = Math.min(mMin, o.get());
                }
            }
            return true;
        }

        public IntWritable terminatePartial() {
            return mEmpty ? null : new IntWritable(mMin);
        }

        public boolean merge(IntWritable o) {
            return iterate(o);
        }

        public IntWritable terminate() {
            return mEmpty ? null : new IntWritable(mMin);
        }
    }

    /** Evaluator for BIGINT (long) inputs. */
    static public class MinLongEvaluator implements UDAFEvaluator {
        private long mMin;
        private boolean mEmpty;

        public MinLongEvaluator() {
            super();
            init();
        }

        public void init() {
            mMin = 0;
            mEmpty = true;
        }

        public boolean iterate(LongWritable o) {
            if (o != null) {
                if (mEmpty) {
                    mMin = o.get();
                    mEmpty = false;
                } else {
                    mMin = Math.min(mMin, o.get());
                }
            }
            return true;
        }

        public LongWritable terminatePartial() {
            return mEmpty ? null : new LongWritable(mMin);
        }

        public boolean merge(LongWritable o) {
            return iterate(o);
        }

        public LongWritable terminate() {
            return mEmpty ? null : new LongWritable(mMin);
        }
    }

    /** Evaluator for FLOAT inputs. */
    static public class MinFloatEvaluator implements UDAFEvaluator {
        private float mMin;
        private boolean mEmpty;

        public MinFloatEvaluator() {
            super();
            init();
        }

        public void init() {
            mMin = 0;
            mEmpty = true;
        }

        public boolean iterate(FloatWritable o) {
            if (o != null) {
                if (mEmpty) {
                    mMin = o.get();
                    mEmpty = false;
                } else {
                    mMin = Math.min(mMin, o.get());
                }
            }
            return true;
        }

        public FloatWritable terminatePartial() {
            return mEmpty ? null : new FloatWritable(mMin);
        }

        public boolean merge(FloatWritable o) {
            return iterate(o);
        }

        public FloatWritable terminate() {
            return mEmpty ? null : new FloatWritable(mMin);
        }
    }

    /** Evaluator for DOUBLE inputs. */
    static public class MinDoubleEvaluator implements UDAFEvaluator {
        private double mMin;
        private boolean mEmpty;

        public MinDoubleEvaluator() {
            super();
            init();
        }

        public void init() {
            mMin = 0;
            mEmpty = true;
        }

        public boolean iterate(DoubleWritable o) {
            if (o != null) {
                if (mEmpty) {
                    mMin = o.get();
                    mEmpty = false;
                } else {
                    mMin = Math.min(mMin, o.get());
                }
            }
            return true;
        }

        public DoubleWritable terminatePartial() {
            return mEmpty ? null : new DoubleWritable(mMin);
        }

        public boolean merge(DoubleWritable o) {
            return iterate(o);
        }

        public DoubleWritable terminate() {
            return mEmpty ? null : new DoubleWritable(mMin);
        }
    }

    /** Evaluator for STRING inputs; this is where the "enhanced" behavior lives. */
    static public class MinStringEvaluator implements UDAFEvaluator {
        private Text mMin;
        private boolean mEmpty;

        public MinStringEvaluator() {
            super();
            init();
        }

        public void init() {
            mMin = null;
            mEmpty = true;
        }

        public boolean iterate(Text o) {
            if (o != null && o.getLength() > 0) {
                if (mEmpty) {
                    // Skip blank strings instead of accepting them as the first minimum
                    if (o.toString().trim().length() <= 0) {
                        return false;
                    }
                    mMin = new Text(o);
                    mEmpty = false;
                } else if (mMin.compareTo(o) > 0) {
                    mMin.set(o);
                }
            }
            return true;
        }

        public Text terminatePartial() {
            return mEmpty ? null : mMin;
        }

        public boolean merge(Text o) {
            return iterate(o);
        }

        public Text terminate() {
            return mEmpty ? null : mMin;
        }
    }
}
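Registration works the same way as for the UDF. A sketch of a call, where the jar path, function name and the table/column names are all placeholders:

add jar /path/to/udaf-min.jar;                   -- placeholder path
create temporary function my_min as 'MinUDF';
-- for string columns, empty values are skipped instead of being returned as the minimum
select dept_id, my_min(emp_name) from employee group by dept_id;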
UDTF example template:
package com.dtwave.udfs.xin.news;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
 * Splits the fourth input column on commas and emits one output row per
 * element (a row-to-rows transformation), producing four string columns.
 *
 * @author zhangchenguang
 *
 */
public class CusDepictOptionsConvert extends GenericUDTF {

    private PrimitiveObjectInspector stringOI = null;

    @Override
    public void close() throws HiveException {
    }

    @Override
    public void process(Object[] args) throws HiveException {
        String proCode = args[0].toString();
        String cusId = args[1].toString();
        String cusDepictCfgId = args[2].toString();
        String cusDepictOptionIds = args[3].toString();
        String[] cusDepictOptionIdArray = cusDepictOptionIds.split(",");
        // Emit one output row per comma-separated option id
        for (String s : cusDepictOptionIdArray) {
            Object[] arg = new Object[] { proCode, cusId, cusDepictCfgId, s };
            forward(arg);
        }
    }

    @Override
    public StructObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
        // Argument validation
        if (objectInspectors.length != 4) {
            throw new UDFArgumentException("CusDepictOptionsConvert() takes exactly four arguments");
        }
        if (objectInspectors[0].getCategory() != ObjectInspector.Category.PRIMITIVE
                || ((PrimitiveObjectInspector) objectInspectors[0])
                        .getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            throw new UDFArgumentException("CusDepictOptionsConvert() takes string arguments");
        }
        // Input
        stringOI = (PrimitiveObjectInspector) objectInspectors[0];
        // Output: four string columns named c1..c4
        List<String> fieldNames = new ArrayList<>(4);
        List<ObjectInspector> fieldOIs = new ArrayList<>(4);
        fieldNames.add("c1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("c2");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("c3");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("c4");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }
}
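A UDTF is either called on its own in the select list or, more commonly, combined with LATERAL VIEW so the generated rows can sit next to the original columns. A sketch assuming a table cus_depict with matching columns (the jar path, function name, table and column names are all placeholders):

add jar /path/to/udtf-convert.jar;               -- placeholder path
create temporary function depict_explode as 'com.dtwave.udfs.xin.news.CusDepictOptionsConvert';

-- standalone call: the UDTF's four columns form the whole select list
select depict_explode(pro_code, cus_id, cus_depict_cfg_id, cus_depict_option_ids) from cus_depict;

-- lateral view: keep original columns alongside the generated c1..c4
select t.cus_id, v.c1, v.c2, v.c3, v.c4
from cus_depict t
lateral view depict_explode(t.pro_code, t.cus_id, t.cus_depict_cfg_id, t.cus_depict_option_ids) v as c1, c2, c3, c4;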
In the next post, I will go through some thoughts and lessons learned from writing Hive custom functions.