Hive Official Documentation
Hive Built-in Functions
A quick way to test the various built-in functions:
1. Create a dual table:
create table dual(id string);
2. load data local inpath '/home/hadoop/dual.dat' into table dual;   -- load a file containing a single line with one space into the dual table
3. select substr('angelababy',2,3) from dual;
select substr('angelababy',0,3) from dual;   ==> returns exactly the same as the next statement
select substr('angelababy',1,3) from dual;   ==> returns exactly the same as the previous statement (in Hive, start positions 0 and 1 both mean the first character)
4. select concat('a','b') from dual;
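A few more built-ins can be tried the same way (these extra examples are not from the original notes, but the functions are standard Hive built-ins):
select concat_ws('-','a','b','c') from dual;   ==> 'a-b-c'
select split('a,b,c',',')[1] from dual;        ==> 'b'
select upper('angelababy') from dual;          ==> 'ANGELABABY'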
When Hive's built-in functions cannot meet your business needs, you can turn to user-defined functions (UDF: user-defined function).
1. Create a Java project. Extract apache-hive-1.2.2-bin.tar.gz, locate the lib folder under the installation directory, and add its jar files to the project.
2. Create the package cn.itcast.bigdata.udf.
3. Create a custom class ToLowerCase that converts the incoming string to lower case:
package cn.itcast.bigdata.udf;
package cn.itcast.bigdata.udf;

import java.util.HashMap;

import org.apache.hadoop.hive.ql.exec.UDF;

public class ToLowerCase extends UDF {

    public static HashMap<String, String> provinceMap = new HashMap<String, String>();
    static {
        provinceMap.put("136", "beijing");
        provinceMap.put("137", "shanghai");
        provinceMap.put("138", "shenzhen");
    }

    // must be public -- Hive looks up the method named evaluate by reflection
    public String evaluate(String field) {
        return field.toLowerCase();
    }

    // must be public -- overloading evaluate with different parameter types causes no conflict
    public String evaluate(int phonenbr) {
        String pnb = String.valueOf(phonenbr);
        String province = provinceMap.get(pnb.substring(0, 3));
        return province == null ? "huoxing" : province;
    }
}
4. Build the project into a jar, upload it to a server with the Hive client, add the jar in Hive, and register a function name that maps to the class:
hive> add JAR /home/hadoop/udf.jar;
hive> create temporary function tolow as 'cn.itcast.bigdata.udf.ToLowerCase';   -- tolow is whatever function name you choose
5. Check the table: select * from t_p;
6. Add a row: insert into t_p values(13,'ANGELA');
7. Check again: select * from t_p;   -- the name is stored in upper case
8. select id, tolow(name) from t_p;   -- the name now comes back in lower case, because the UDF is applied to the name column in the select
hive> create temporary function getprovince as 'cn.itcast.bigdata.udf.ToLowerCase';   -- same class registered under a second name, used for the int overload
select phonenbr, getprovince(phonenbr), flow from t_flow;
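Note: functions created with create temporary function only live for the current Hive session. To remove one inside the session (this command is not in the original notes, but it is standard HiveQL):
hive> drop temporary function tolow;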
A UDF acts on a single data row and produces a single row as output (e.g. math functions, string functions).
A UDAF (user-defined aggregate function) takes multiple input rows and produces a single output row (e.g. count, max).
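The notes below only walk through UDFs. As a rough sketch of the UDAF side (this example is my own addition, not part of the original material; it uses the older org.apache.hadoop.hive.ql.exec.UDAF API shipped with Hive 1.2.x, and the class and package names are assumed), a hand-rolled max() for integers could look like:
package cn.itcast.bigdata.udf;

import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;

// Illustrative sketch only: a hand-rolled "max of ints", mirroring the built-in max()
public class MaxInt extends UDAF {
    public static class MaxIntEvaluator implements UDAFEvaluator {
        private Integer max;

        public void init() { max = null; }                 // reset aggregation state

        public boolean iterate(Integer value) {            // called once per input row
            if (value != null && (max == null || value > max)) { max = value; }
            return true;
        }

        public Integer terminatePartial() { return max; }  // partial result from the map side

        public boolean merge(Integer other) { return iterate(other); }  // combine partial results

        public Integer terminate() { return max; }         // final result
    }
}
It is packaged, added with add JAR, and registered with create temporary function exactly like a UDF.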
Simple UDF example
1. Develop a Java class that extends UDF and defines an evaluate method:
package cn.itcast.bigdata.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public final class Lower extends UDF {
    public Text evaluate(final Text s) {
        if (s == null) { return null; }
        return new Text(s.toString().toLowerCase());
    }
}
2. Build it into a jar and upload it to the server.
3. Add the jar to Hive's classpath.
4. Create a temporary function associated with the Java class:
hive> create temporary function <your_function_name> as 'cn.itcast.bigdata.udf.Lower';
5. The custom function can now be used in HQL:
select <your_function_name>(name), age from t_test;
Data preparation
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}
{"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}
{"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}
{"movie":"919","rate":"4","timeStamp":"978301368","uid":"1"}
{"movie":"595","rate":"5","timeStamp":"978824268","uid":"1"}
{"movie":"938","rate":"4","timeStamp":"978301752","uid":"1"}
{"movie":"2398","rate":"4","timeStamp":"978302281","uid":"1"}
{"movie":"2918","rate":"4","timeStamp":"978302124","uid":"1"}
{"movie":"1035","rate":"5","timeStamp":"978301753","uid":"1"}
{"movie":"2791","rate":"4","timeStamp":"978302188","uid":"1"}
{"movie":"2687","rate":"3","timeStamp":"978824268","uid":"1"}
{"movie":"2018","rate":"4","timeStamp":"978301777","uid":"1"}
{"movie":"3105","rate":"5","timeStamp":"978301713","uid":"1"}
{"movie":"2797","rate":"4","timeStamp":"978302039","uid":"1"}
1. create table t_json(line string) row format delimited;
2. load data local inpath '/home/hadoop/rating.json' into table t_json;
3. select * from t_json limit 10;
4. Write a custom class JsonParser:
package cn.itcast.bigdata.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import parquet.org.codehaus.jackson.map.ObjectMapper; // Jackson classes bundled with the Hive lib jars added to the project earlier

public class JsonParser extends UDF {

    public String evaluate(String jsonLine) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            MovieRateBean bean = objectMapper.readValue(jsonLine, MovieRateBean.class);
            return bean.toString();
        } catch (Exception e) {
            // fall through: lines that cannot be parsed simply yield an empty string
        }
        return "";
    }
}
MovieRateBean
package cn.itcast.bigdata.udf;

// {"movie":"1721","rate":"3","timeStamp":"965440048","uid":"5114"} -- field names must match the JSON keys exactly
public class MovieRateBean {

    private String movie;
    private String rate;
    private String timeStamp;
    private String uid;

    // getters and setters omitted here for brevity

    @Override
    public String toString() {
        // return the parsed record as a tab-separated string
        return movie + "\t" + rate + "\t" + timeStamp + "\t" + uid;
    }
}
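For objectMapper.readValue to populate these private fields, the bean does need the omitted setters (or public fields). One pair as a sketch of the pattern; the other three fields follow it, and the methods belong inside the class body above:
    public String getMovie() { return movie; }
    public void setMovie(String movie) { this.movie = movie; }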
5. Build the jar, upload it to the server, and add it to Hive's classpath.
6. hive> create temporary function parsejson as 'cn.itcast.bigdata.udf.JsonParser';
7. View the parsed data: select parsejson(line) from t_json limit 10;
8. Another approach:
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;
drop table if exists t_rating;
create table t_rating(movieid string,rate int,timestring string,uid string)
row format delimited fields terminated by '\t';
insert overwrite table t_rating   -- t_rating is the table just created
select split(parsejson(line),'\t')[0] as movieid,   -- split returns an array; parsejson is the custom UDF registered above
split(parsejson(line),'\t')[1] as rate,
split(parsejson(line),'\t')[2] as timestring,
split(parsejson(line),'\t')[3] as uid
from rat_json limit 10;
Hive's TRANSFORM keyword lets you call your own scripts from within SQL.
It suits cases where the functionality is missing from Hive and you do not want to write a UDF.
Example 1: the SQL below uses weekday_mapper.py to process the data.
CREATE TABLE u_data_new (
movieid INT,
rating INT,
weekday INT,
userid INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';
add FILE weekday_mapper.py;   -- for a Java program you would use add JAR instead
INSERT OVERWRITE TABLE u_data_new
SELECT
TRANSFORM (movieid, rate, timestring, uid)   -- the 4 selected columns are handed to the Python script
USING 'python weekday_mapper.py'
AS (movieid, rating, weekday, userid)   -- AS names the 4 output columns; timestring has been converted to a weekday
FROM t_rating;
The content of weekday_mapper.py is as follows:
#!/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])  # re-join the fields with tab separators
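The script can be smoke-tested locally on one sample record before wiring it into Hive (this test command is my own suggestion; the weekday printed depends on the machine's timezone):
echo -e "1193\t5\t978300760\t1" | python weekday_mapper.py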
Summary of the steps:
TRANSFORM example:
1. First load the rating.json file into a raw Hive table rat_json:
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;
2. Parse the JSON data into four fields and insert them into a new table t_rating:
insert overwrite table t_rating
select get_json_object(line,'$.movie') as movieid,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timeStamp') as timestring,
       get_json_object(line,'$.uid') as uid
from rat_json;
3. Use TRANSFORM + Python to convert the unixtime into a weekday.
First edit a Python script file:
######## Python script ########
vi weekday_mapper.py
#!/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])
Save the file.
Then add the file in Hive so it is shipped to the worker nodes:
hive>add FILE /home/hadoop/weekday_mapper.py;
hive> create table u_data_new as   -- the new table is populated from the query below
SELECT
TRANSFORM (movieid, rate, timestring, uid)   -- columns of t_rating, passed as input to the Python script
USING 'python weekday_mapper.py'
AS (movieid, rate, weekday, uid)   -- output column names, which become the columns of the new table u_data_new
FROM t_rating;
select distinct(weekday) from u_data_new limit 10;