-- Demo fixture: drop and recreate the `test` table so the script can be re-run.
DROP TABLE IF EXISTS test;

-- ind  : row identifier
-- col  : comma-separated list used as the keys in the walkthrough
-- col1 : comma-separated list used as the values; may be shorter than col or NULL
CREATE TABLE test (
    ind  INT,
    col  STRING,
    col1 STRING
);

-- Three sample rows: both lists present, values missing, and both missing.
INSERT INTO test VALUES (1, 'a,b,c', '1,2');
INSERT INTO test VALUES (2, 'j,k', NULL);
INSERT INTO test VALUES (3, NULL, NULL);
对第一行需要输出如下结果:
Ind | Key | Value |
---|---|---|
1 | a | 1 |
1 | b | 2 |
1 | c | Null |
其它行都要输出类似数据,如果输入数据为null,则没输出。
编写UDTF(User-Defined Table-Generating Functions),需要继承GenericUDTF类,类中部分代码如下:
/** * A Generic User-defined Table Generating Function (UDTF) * * Generates a variable number of output rows for a single input row. Useful for * explode(array)... */ public abstract class GenericUDTF { public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException { List extends StructField> inputFields = argOIs.getAllStructFieldRefs(); ObjectInspector[] udtfInputOIs = new ObjectInspector[inputFields.size()]; for (int i = 0; i < inputFields.size(); i++) { udtfInputOIs[i] = inputFields.get(i).getFieldObjectInspector(); } return initialize(udtfInputOIs); } /** * Give a set of arguments for the UDTF to process. * * @param args * object array of arguments */ public abstract void process(Object[] args) throws HiveException; /** * Called to notify the UDTF that there are no more rows to process. * Clean up code or additional forward() calls can be made here. */ public abstract void close() throws HiveException; }
继承GenericUDTF需要实现以上方法,其中initialize方法和UDF中类似,主要是判断输入类型并确定返回的字段类型。process方法对udtf函数输入的每一行进行操作,通过调用forward方法返回一行或多行数据。close方法在process调用结束后调用,用于进行其它一些额外操作,只执行一次。
package com.practice.hive.udtf;
import java.util.List;
import com.google.common.collect.Lists;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
/**
 * Hive UDTF that pairs two comma-separated strings position by position.
 *
 * <p>The first argument supplies the keys and the second the values. One output
 * row {@code (key, value)} is forwarded per key; when the value list is
 * {@code null} or shorter than the key list, the missing values are emitted as
 * {@code null}. A {@code null} key argument produces no output rows.
 *
 * @author liufeifei
 * @date 2018/06/20
 */
public class ArrToMapUDTF extends GenericUDTF {

    /** Separator used to split both the key list and the value list. */
    private static final String DELIMITER = ",";

    /** Two-slot output row (key, value); the same array is refilled for every forward() call. */
    private final String[] obj = new String[2];

    /**
     * Declares the output schema: two string columns named "key" and "value".
     *
     * @param argOIs struct inspector describing the arguments passed to the function
     * @return struct inspector for the (key, value) output rows
     * @throws UDFArgumentException if fewer than two arguments are supplied
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // process() reads args[0] and args[1]; reject a short argument list here
        // instead of failing later with ArrayIndexOutOfBoundsException.
        int argCount = argOIs.getAllStructFieldRefs().size();
        if (argCount < 2) {
            throw new UDFArgumentException(
                "ArrToMapUDTF requires two arguments (keys, values) but got " + argCount);
        }

        List<String> colName = Lists.newArrayList();
        colName.add("key");
        colName.add("value");

        List<ObjectInspector> resType = Lists.newArrayList();
        resType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        resType.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

        // Column names and column types are matched by position.
        return ObjectInspectorFactory.getStandardStructObjectInspector(colName, resType);
    }

    /**
     * Splits both arguments on commas and forwards one (key, value) row per key.
     *
     * @param args args[0] is the comma-separated key list, args[1] the
     *             comma-separated value list; either may be null
     * @throws HiveException declared by the overridden method; presumably raised by forward() — confirm
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // No keys means nothing to emit for this input row.
        if (args[0] == null) {
            return;
        }

        String[] keys = args[0].toString().split(DELIMITER);
        String[] values = (args[1] == null) ? null : args[1].toString().split(DELIMITER);

        for (int i = 0; i < keys.length; i++) {
            obj[0] = keys[i];
            // Keys without a value at the same position get a null value.
            obj[1] = (values != null && i < values.length) ? values[i] : null;
            forward(obj);
        }
    }

    /**
     * Nothing to flush or release: every row is forwarded from process().
     */
    @Override
    public void close() throws HiveException {
    }
}
udtf使用
执行效果
-- 原始数据
hive> select * from test;
OK
1 a,b,c 1,2
2 j,k NULL
3 NULL NULL
Time taken: 0.051 seconds, Fetched: 3 row(s)
-- 执行效果
hive> add jar /Users/liufeifei/hive/jar/hive.jar;
Added [/Users/liufeifei/hive/jar/hive.jar] to class path
Added resources: [/Users/liufeifei/hive/jar/hive.jar]
hive> create temporary function get_map as 'com.practice.hive.udtf.ArrToMapUDTF';
OK
Time taken: 0.005 seconds
hive> select get_map(col,col1) from test;
OK
a 1
b 2
c NULL
j NULL
k NULL
Time taken: 1.008 seconds, Fetched: 5 row(s)
以上为get_map函数的使用方法,该方法局限性为使用时无法引用其它列。结合lateral view关键词使用可以达到预期效果。
hive> select t.ind,t.col,t.col1,t1.key,t1.value
> from test t
> lateral view get_map(col,col1) t1 as key,value;
OK
1 a,b,c 1,2 a 1
1 a,b,c 1,2 b 2
1 a,b,c 1,2 c NULL
2 j,k NULL j NULL
2 j,k NULL k NULL
Time taken: 0.045 seconds, Fetched: 5 row(s)
该使用方法中涉及到t、t1两张表。lateral view将两张表进行join操作,过程如下:
对输入的test表中的col,col1列进行udtf操作,将得到的数据集命名为t1,并将列命名为key,value
将t表和t1表进行join操作,得到结果数据集
可以看到lateral view 和join操作类似。另外lateral view也支持谓词下推和outer join操作,当udtf不返回值而左侧表有值,此时outer关键词登场了,类似于left outer join操作