hive udf与transform

udf

将存储在 rat_json 表的json字符串格式化存储到 t_rating 表
json字符串rating.json

{"movie":"1721","rate":"3","timeStamp":"965440048","uid":"5114"}

表结构

hive>create table rat_json(line string) row format delimited;
hive>load data local inpath '/home/bingo/data/hive/rating.json' into table rat_json;

hive>drop table if exists t_rating;
hive>create table t_rating(movieid string,rate int,timestring string,uid string)
    >row format delimited fields terminated by '\t';

udf函数

package com.hiveapp.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import parquet.org.codehaus.jackson.map.ObjectMapper;

/**
 * Hive UDF that parses one JSON rating record into a tab-separated string
 * "movie\trate\ttimeStamp\tuid" (see MovieRateBean.toString()).
 * Unparsable or null input yields "" so a single bad record does not
 * fail the whole query.
 */
public class JsonParser extends UDF {

    // ObjectMapper is thread-safe and expensive to construct; build it once
    // instead of once per row (evaluate() is called for every input record).
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /**
     * @param jsonLine one JSON record, e.g.
     *                 {"movie":"1721","rate":"3","timeStamp":"965440048","uid":"5114"}
     * @return tab-separated field values, or "" when the line cannot be parsed
     */
    public String evaluate(String jsonLine) {
        if (jsonLine == null) {
            // Original behavior: null input also came back as "" (via NPE in readValue).
            return "";
        }
        try {
            MovieRateBean bean = OBJECT_MAPPER.readValue(jsonLine, MovieRateBean.class);
            return bean.toString();
        } catch (Exception e) {
            // Keep the best-effort contract (don't abort the query), but leave
            // evidence on stderr instead of silently swallowing the failure.
            System.err.println("JsonParser: skipping unparsable record: " + jsonLine);
            return "";
        }
    }
}

/**
 * Plain bean mirroring one rating record from rating.json:
 * {"movie":"...","rate":"...","timeStamp":"...","uid":"..."}.
 * Field names must match the JSON keys for Jackson's default mapping.
 */
public class MovieRateBean {

    private String movie;     // movie id
    private String rate;      // rating value (kept as string, as in the source JSON)
    private String timeStamp; // epoch seconds, as string
    private String uid;       // user id

    public String getMovie() { return movie; }

    public void setMovie(String movie) { this.movie = movie; }

    public String getRate() { return rate; }

    public void setRate(String rate) { this.rate = rate; }

    public String getTimeStamp() { return timeStamp; }

    public void setTimeStamp(String timeStamp) { this.timeStamp = timeStamp; }

    public String getUid() { return uid; }

    public void setUid(String uid) { this.uid = uid; }

    /**
     * Tab-separated form consumed by the t_rating load; unset fields render
     * as "null", matching plain string concatenation.
     */
    @Override
    public String toString() {
        return String.join("\t", movie, rate, timeStamp, uid);
    }
}

打成jar包上传到服务器,将jar包添加到hive的classpath

hive>add JAR /home/bingo/hiveapp.jar;

创建临时函数与开发好的java class关联

hive>create temporary function parsejson as 'com.hiveapp.udf.JsonParser';

执行

hive>insert overwrite table t_rating
    >select split(parsed, '\t')[0] as movieid,
    >split(parsed, '\t')[1] as rate,
    >split(parsed, '\t')[2] as timestring,
    >split(parsed, '\t')[3] as uid
    >from (select parsejson(line) as parsed from rat_json) t limit 10;

内置json函数

select get_json_object(line, '$.movie') as movie, get_json_object(line, '$.rate') as rate from rat_json limit 5;

transform

Hive的 TRANSFORM 关键字提供了在SQL中调用自写脚本的功能
适合实现Hive中没有的功能又不想写UDF的情况
示例:将上一步生成的表t_rating 的timestring转成星期的形式显示

weekday_mapper.py

vi /home/bingo/data/hive/weekday_mapper.py
#!/usr/bin/env python
# Hive TRANSFORM mapper: rewrites the timestring column of t_rating
# (epoch seconds) into an ISO weekday number (1=Monday .. 7=Sunday).
import sys
import datetime


def transform_line(line):
    """Convert one tab-separated input record to its output record.

    Input:  "movieid\trate\tepoch_seconds\tuid" (trailing newline ok)
    Output: "movieid\trate\tweekday\tuid"
    Raises ValueError if the line does not have exactly four fields.
    """
    movieid, rating, unixtime, userid = line.strip().split('\t')
    # fromtimestamp() interprets the epoch in the local timezone,
    # matching the original script's behavior.
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    return '\t'.join([movieid, rating, str(weekday), userid])


if __name__ == '__main__':
    # print(...) is valid in both Python 2 and 3; the original
    # print-statement form was Python-2-only.
    for line in sys.stdin:
        print(transform_line(line))

将脚本文件作为资源加入hive(分发到各执行节点),然后执行

hive>add FILE /home/bingo/data/hive/weekday_mapper.py;

hive>create TABLE u_data_new as
    >SELECT TRANSFORM (movieid, rate, timestring,uid)
    >USING 'python weekday_mapper.py' AS (movieid, rate, weekday,uid)
    >FROM t_rating;

你可能感兴趣的:(hive)