Parse the JSON strings stored in the rat_json table and load the structured result into the t_rating table.
Sample JSON line from rating.json:
{"movie":"1721","rate":"3","timeStamp":"965440048","uid":"5114"}
Table definitions:
hive>create table rat_json(line string) row format delimited;
hive>load data local inpath '/home/bingo/data/hive/rating.json' into table rat_json;
hive>drop table if exists t_rating;
hive>create table t_rating(movieid string,rate int,timestring string,uid string)
>row format delimited fields terminated by '\t';
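Optionally, confirm the raw JSON lines actually landed in rat_json before moving on (a quick sanity check, not part of the conversion itself):
hive>select * from rat_json limit 3;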
UDF code:
package com.hiveapp.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
// Jackson classes shaded inside the parquet jar that ships with Hive
import parquet.org.codehaus.jackson.map.ObjectMapper;

public class JsonParser extends UDF {

    // Parse one JSON line into a MovieRateBean and return its tab-separated form
    public String evaluate(String jsonLine) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            MovieRateBean bean = objectMapper.readValue(jsonLine, MovieRateBean.class);
            return bean.toString();
        } catch (Exception e) {
            // malformed JSON: ignore and fall through to the empty result
        }
        return "";
    }
}
package com.hiveapp.udf;

public class MovieRateBean {

    private String movie;
    private String rate;
    private String timeStamp;
    private String uid;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public String getRate() {
        return rate;
    }

    public void setRate(String rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return movie + "\t" + rate + "\t" + timeStamp + "\t" + uid;
    }
}
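Before packaging, the parser can be exercised locally with a throwaway main class. This is only a sketch (the class name JsonParserTest is illustrative, the input is the sample record above), and it assumes the parquet jar that bundles the shaded Jackson classes is on the compile classpath:
package com.hiveapp.udf;

// Local check: feed the sample record through the UDF's evaluate().
// Expected output: the four values 1721, 3, 965440048, 5114 separated by tabs.
public class JsonParserTest {
    public static void main(String[] args) {
        String line = "{\"movie\":\"1721\",\"rate\":\"3\",\"timeStamp\":\"965440048\",\"uid\":\"5114\"}";
        System.out.println(new JsonParser().evaluate(line));
    }
}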
Package the classes into a jar, upload it to the server, and add the jar to Hive's classpath:
hive>add JAR /home/bingo/hiveapp.jar;
Create a temporary function bound to the Java class:
hive>create temporary function parsejson as 'com.hiveapp.udf.JsonParser';
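Before running the full insert, it helps to confirm the temporary function parses the raw lines as expected (an optional check):
hive>select parsejson(line) from rat_json limit 3;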
Run the conversion:
hive>insert overwrite table t_rating
>select split(parsejson(line),'\t')[0] as movieid,
>split(parsejson(line),'\t')[1] as rate,
>split(parsejson(line),'\t')[2] as timestring,
>split(parsejson(line),'\t')[3] as uid
>from rat_json limit 10;
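Note that the statement above calls parsejson(line) once per output column. A variant that parses each line only once, via a subquery, would look like this (a sketch with the same intended result):
hive>insert overwrite table t_rating
>select f[0] as movieid, f[1] as rate, f[2] as timestring, f[3] as uid
>from (select split(parsejson(line),'\t') as f from rat_json limit 10) t;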
Built-in JSON function: alternatively, Hive's get_json_object can extract fields directly:
select get_json_object(line,'$.movie') as movie,get_json_object(line,'$.rate') as rate from rat_json limit 5;
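Hive also provides the json_tuple UDTF, which extracts several keys in a single pass over each line; a sketch using LATERAL VIEW against the same table:
hive>select t.movieid, t.rate, t.timestring, t.uid
>from rat_json lateral view json_tuple(line, 'movie', 'rate', 'timeStamp', 'uid') t as movieid, rate, timestring, uid
>limit 5;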
Hive's TRANSFORM keyword lets you call your own scripts from within SQL.
It suits cases where Hive lacks the functionality you need and you do not want to write a UDF.
Example: convert the timestring column of the t_rating table produced above into the day of the week.
weekday_mapper.py
vi /home/bingo/data/hive/weekday_mapper.py
#!/usr/bin/env python
import sys
import datetime

# Read tab-separated rows from stdin and replace the unix timestamp with the ISO weekday
for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])
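The script can be checked locally before handing it to Hive by piping one tab-separated row through it (the values come from the sample record; the third column of the output should become a digit 1-7):
printf '1721\t3\t965440048\t5114\n' | python weekday_mapper.py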
Add the script file to Hive's classpath (distributed cache), then execute:
hive>add FILE /home/bingo/data/hive/weekday_mapper.py;
hive>create TABLE u_data_new as
>SELECT TRANSFORM (movieid, rate, timestring,uid)
>USING 'python weekday_mapper.py' AS (movieid, rate, weekday,uid)
>FROM t_rating;
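A final spot check of the generated table (the weekday column should hold values 1 through 7):
hive>select * from u_data_new limit 5;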