UDF Development

Hive offers two different interfaces for writing UDFs: the plain UDF interface, which operates on simple data types, and the GenericUDF interface, which operates on complex types.

The plain UDF interface is not covered in detail here.
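For orientation only, here is a minimal sketch of what the simple interface looks like; the ToUpper class below is illustrative and not part of the example that follows. Hive resolves the evaluate() method by reflection, which is why this interface handles only simple types:

package com.paic.gbd.udfarray;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

// Simple UDF interface: one class, one (or more) evaluate() methods
public class ToUpper extends UDF {
	public Text evaluate(Text input) {
		if (input == null) {
			return null;
		}
		return new Text(input.toString().toUpperCase());
	}
}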

Let's look at an example.

DDL for the ary table:

create table ary(
array1 array<struct<id:string,name:string>>,
array2 array<struct<addr:string,id:string,dt:string>>
)
row format delimited fields terminated by '\t'
map keys terminated by ',';
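With this row format, the fields inside each struct are separated by commas (the map-keys delimiter) while array elements fall back to Hive's default collection delimiter (\002). Assuming one struct per array, as in the sample data below, the raw input file behind the table could look like this, with a tab between the two columns:

001,明明	杨浦,001,20180703
002,阿达	黄浦,004,20180629
003,阿珂	浦东,003,20180817
004,小宝	松江,002,20180623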

The data in the ary table:

hive > select * from ary;
OK
ary.array1      ary.array2
[{"id":"001","name":"明明"}]    [{"addr":"杨浦","id":"001","dt":"20180703"}]
[{"id":"002","name":"阿达"}]    [{"addr":"黄浦","id":"004","dt":"20180629"}]
[{"id":"003","name":"阿珂"}]    [{"addr":"浦东","id":"003","dt":"20180817"}]
[{"id":"004","name":"小宝"}]    [{"addr":"松江","id":"002","dt":"20180623"}]

The result we want:

hive > select aa(array1,array2,'id') from ary;                
OK
_c0
[{"id":"001","name":"明明","addr":"杨浦","dt":"20180703"}]
[{"id":"NULL","name":"NULL","addr":"NULL","dt":"NULL"}]
[{"id":"003","name":"阿珂","addr":"浦东","dt":"20180817"}]
[{"id":"NULL","name":"NULL","addr":"NULL","dt":"NULL"}]

Function logic:

The function operates on one row at a time. Using the field name passed in as the third argument, it matches the elements of the two struct arrays on that field's value: when the values are equal it returns all fields of the matched pair (the key matches at most one element); otherwise every output field is set to the string "NULL". In the first row above, for example, id 001 in array1 matches id 001 in array2, so the output struct carries id, name, addr and dt together; in the second row 002 does not match 004, so all four fields come back as "NULL".

Writing the custom GenericUDF:

package com.paic.gbd.udfarray;

import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 
public class GenericArrayUnion extends GenericUDF {

	// Logger for debugging; output goes to hive.log, whose location is
	// configured in hive-log4j.properties
	static final Log LOG = LogFactory.getLog(GenericArrayUnion.class.getName());

	// Input definitions
	private static final int ARG_COUNT = 3;                 // number of arguments to this UDF
	private static final String FUNC_NAME = "array_union";  // external name
	private ListObjectInspector arrayOI;
	private ListObjectInspector arrayOI2;
	private ObjectInspector strOI;

	private StructObjectInspector structOI;
	private StructObjectInspector structOI2;

	// Output schema, computed in initialize(): the union of both structs' field names
	private ArrayList<String> outputFieldNames = new ArrayList<String>();
	private int outputFieldCount = 0;

	@Override
	public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
		// Check the number of arguments
		if (arguments.length != ARG_COUNT) {
			throw new UDFArgumentLengthException("The function " + FUNC_NAME
					+ " accepts exactly " + ARG_COUNT + " arguments.");
		}
		// Arguments 1 and 2 must be arrays
		if (!arguments[0].getCategory().equals(Category.LIST)
				|| !arguments[1].getCategory().equals(Category.LIST)) {
			throw new UDFArgumentTypeException(0,
					"\"" + org.apache.hadoop.hive.serde.serdeConstants.LIST_TYPE_NAME + "\" "
					+ "expected at function arguments, but \"" + arguments[0].getTypeName()
					+ "\" and \"" + arguments[1].getTypeName() + "\" found.");
		}
		// Both arguments are known to be lists, so the casts are now safe
		arrayOI = (ListObjectInspector) arguments[0];
		arrayOI2 = (ListObjectInspector) arguments[1];

		// The elements of both arrays must be structs
		ObjectInspector elementOI = arrayOI.getListElementObjectInspector();
		ObjectInspector elementOI2 = arrayOI2.getListElementObjectInspector();
		if (elementOI.getCategory() != Category.STRUCT
				|| elementOI2.getCategory() != Category.STRUCT) {
			throw new UDFArgumentException(
					"\"" + org.apache.hadoop.hive.serde.serdeConstants.STRUCT_TYPE_NAME + "\" "
					+ "expected as the ARRAY element type, but \"" + elementOI.getCategory()
					+ "\" and \"" + elementOI2.getCategory() + "\" found.");
		}
		structOI = (StructObjectInspector) elementOI;
		structOI2 = (StructObjectInspector) elementOI2;
		// Argument 3 (the field name to match on) must be a primitive (string)
		if (!arguments[2].getCategory().equals(Category.PRIMITIVE)) {
			throw new UDFArgumentTypeException(2,
					"Primitive type expected at argument 3, but \""
					+ arguments[2].getTypeName() + "\" found.");
		}
		strOI = arguments[2];
          
		// Define the output type: array<struct<...>> whose fields are the
		// union of the two input structs' fields (the join key appears once).
		// Every output field is a string so unmatched rows can carry "NULL".
		ArrayList<String> structFieldNames = new ArrayList<String>();
		ArrayList<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
		for (StructField field : structOI.getAllStructFieldRefs()) {
			structFieldNames.add(field.getFieldName());
			structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
		}
		for (StructField field : structOI2.getAllStructFieldRefs()) {
			if (!structFieldNames.contains(field.getFieldName())) {
				structFieldNames.add(field.getFieldName());
				structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
			}
		}
		outputFieldNames = structFieldNames;
		outputFieldCount = structFieldNames.size();

		return ObjectInspectorFactory.getStandardListObjectInspector(
				ObjectInspectorFactory.getStandardStructObjectInspector(
						structFieldNames, structFieldObjectInspectors));
	}

	@Override
	public Object evaluate(DeferredObject[] arguments) throws HiveException {
		Object array1 = arguments[0].get();
		Object array2 = arguments[1].get();
		Object key = arguments[2].get();
		if (array1 == null || array2 == null || key == null) {
			return null;
		}
		String keyName = key.toString();

		ArrayList<Object[]> result = new ArrayList<Object[]>();
		int length1 = arrayOI.getListLength(array1);
		int length2 = arrayOI2.getListLength(array2);
		for (int i = 0; i < length1; i++) {
			Object struct1 = arrayOI.getListElement(array1, i);
			String keyValue = fieldAsString(struct1, structOI, keyName);
			// Look for the element of array2 whose key field carries the same
			// value; the key matches at most one element
			Object matched = null;
			for (int j = 0; j < length2; j++) {
				Object candidate = arrayOI2.getListElement(array2, j);
				if (keyValue != null
						&& keyValue.equals(fieldAsString(candidate, structOI2, keyName))) {
					matched = candidate;
					break;
				}
			}
			Object[] outFields = new Object[outputFieldCount];
			if (matched == null) {
				// No match: per the expected output above, every output field
				// becomes the literal string "NULL"
				for (int k = 0; k < outputFieldCount; k++) {
					outFields[k] = "NULL";
				}
			} else {
				// Match: take each output field from struct1 when it has one,
				// otherwise from the matched element of array2
				for (int k = 0; k < outputFieldCount; k++) {
					String name = outputFieldNames.get(k);
					String value = fieldAsString(struct1, structOI, name);
					if (value == null) {
						value = fieldAsString(matched, structOI2, name);
					}
					outFields[k] = value;
				}
			}
			result.add(outFields);
		}
		return result;
	}

	// Read one struct field by name and render it as a Java String;
	// returns null when the struct has no such field or the value is null
	private String fieldAsString(Object struct, StructObjectInspector oi, String fieldName) {
		StructField field;
		try {
			field = oi.getStructFieldRef(fieldName);
		} catch (RuntimeException e) {
			return null;
		}
		if (field == null) {
			return null;
		}
		Object data = oi.getStructFieldData(struct, field);
		if (data == null) {
			return null;
		}
		Object javaObject = ObjectInspectorUtils.copyToStandardJavaObject(
				data, field.getFieldObjectInspector());
		return javaObject == null ? null : javaObject.toString();
	}

	@Override
	public String getDisplayString(String[] children) {
		StringBuilder sb = new StringBuilder();
		sb.append(FUNC_NAME).append("(");
		// Show every argument, not just the first one
		for (int i = 0; i < children.length; i++) {
			if (i > 0) {
				sb.append(", ");
			}
			sb.append(children[i]);
		}
		sb.append(")");
		return sb.toString();
	}

}
 
  

Package the code into a jar, upload it to the Linux host, then add the jar in the Hive CLI and create the function.
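Only hive-exec (plus the Hadoop common jars for Text and commons-logging) needs to be on the compile classpath. A sketch, assuming a standard $HIVE_HOME/$HADOOP_HOME layout:

javac -cp "$HIVE_HOME/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/common/lib/*" com/paic/gbd/udfarray/GenericArrayUnion.java
jar cf aa.jar com/paic/gbd/udfarray/*.class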

hive > add jar /home/hadoop/aa.jar;                                    
Added /home/hadoop/aa.jar to class path
Added resource: /home/hadoop/aa.jar
hive > create function aa as 'com.paic.gbd.udfarray.GenericArrayUnion';
OK
Time taken: 0.031 seconds
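While iterating on the code, create temporary function is often more convenient: it registers the class for the current session only, so a recompiled jar does not leave a stale permanent function behind in the metastore:

hive > create temporary function aa as 'com.paic.gbd.udfarray.GenericArrayUnion';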

With that, the function produces exactly the result we set out for at the top.
