Implementation statements:
a. HQL script
DROP TABLE IF EXISTS ods_start_log;
CREATE EXTERNAL TABLE ods_start_log(
line STRING
)
PARTITIONED BY (`dt` STRING)
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_start_log';
LOAD DATA INPATH '<path of the data to load>' INTO TABLE ods_start_log PARTITION (dt='<date>');
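For concreteness, a hedged example of the load plus a partition check; the HDFS path and date below are illustrative, not taken from the source:
-- Hypothetical path and date
LOAD DATA INPATH '/origin_data/gmall/log/topic_start/2019-02-10'
INTO TABLE ods_start_log PARTITION (dt='2019-02-10');
-- Verify that the partition was registered
SHOW PARTITIONS ods_start_log;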
b. Shell script
#!/bin/bash
database=<database name>
data_dir=<data path>
# -n is true when the first argument is a non-empty string, false when it is empty or unset
if [ -n "$1" ]; then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
hive -e "LOAD DATA INPATH '${data_dir}' INTO TABLE ${database}.ods_start_log PARTITION (dt='${log_date}');"
PS: the event table is handled the same way.
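A minimal sketch of that parallel event-log table, assuming only the name and location change:
DROP TABLE IF EXISTS ods_event_log;
CREATE EXTERNAL TABLE ods_event_log(
line STRING
)
PARTITIONED BY (`dt` STRING)
STORED AS
INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_event_log';
The loader script can then serve either table; for example `./ods_log.sh 2019-02-10` (script name hypothetical) backfills a given day, while running it with no argument defaults to yesterday.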
Implementation statements:
a. HQL script
DROP TABLE IF EXISTS dwd_base_start_log;
CREATE EXTERNAL TABLE dwd_base_start_log(
`mid_id` STRING,
`user_id` STRING,
`version_code` STRING,
`version_name` STRING,
`lang` STRING,
`source` STRING,
`os` STRING,
`area` STRING,
`model` STRING,
`brand` STRING,
`sdk_version` STRING,
`gmail` STRING,
`height_width` STRING,
`app_time` STRING,
`network` STRING,
`lng` STRING,
`lat` STRING,
`event_name` STRING,
`event_json` STRING,
`server_time` STRING)
PARTITIONED BY (`dt` STRING)
STORED AS parquet
LOCATION '/warehouse/gmall/dwd/dwd_base_start_log';
-- mid_id through lat are split out of the cm (common) object
-- event_name comes from en
-- event_json comes from the et object
-- server_time comes from ett
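To make this mapping concrete, here is a probe over a hypothetical raw payload (values invented) showing where cm, en, et, and ett live; get_json_object is the stock Hive UDF also used later in this section:
SELECT
get_json_object(j,'$.cm.mid') AS mid_id,
get_json_object(j,'$.et[0].en') AS event_name,
get_json_object(j,'$.et[0].ett') AS event_time
FROM (
SELECT '{"cm":{"mid":"m1","uid":"u1"},"et":[{"ett":"1541217850328","en":"display","kv":{"action":"1"}}]}' AS j
) t;
-- returns: m1, display, 1541217850328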
b. UDF for parsing the common fields
// The org.apache.hive hive-exec dependency must be added to pom.xml
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;
public class BaseFieldUDF extends UDF {
    public String evaluate(String line, String jsonkeysString) {
        // 0. StringBuilder that accumulates the result
        StringBuilder sb = new StringBuilder();
        // 1. Split jsonkeys; yields mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,nw,ln,la,t
        String[] jsonkeys = jsonkeysString.split(",");
        // 2. Split the line: server_time|json
        String[] logContents = line.split("\\|");
        // 3. Validity check
        if (logContents.length != 2 || StringUtils.isBlank(logContents[1])) {
            return "";
        }
        // 4. Parse the json
        try {
            // The outer json object holds three nested members: et, ap, cm
            JSONObject jsonObject = new JSONObject(logContents[1]);
            // Get the cm (common) object; bail out if it is missing
            JSONObject base = jsonObject.optJSONObject("cm");
            if (base == null) {
                return "";
            }
            // Loop over the requested keys and append each value, tab-separated
            for (int i = 0; i < jsonkeys.length; i++) {
                String fieldName = jsonkeys[i].trim();
                if (base.has(fieldName)) {
                    sb.append(base.getString(fieldName)).append("\t");
                } else {
                    sb.append("").append("\t");
                }
            }
            // Append the raw et array and the leading server time
            sb.append(jsonObject.getJSONArray("et").toString()).append("\t");
            sb.append(logContents[0]).append("\t");
        } catch (JSONException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }
}
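A quick smoke test from the Hive CLI, with a hypothetical jar path and log line (neither is from the source):
add jar /path/to/hivefunction.jar;
create temporary function base_analizer as 'udf.BaseFieldUDF';
SELECT base_analizer(
'1541217850324|{"cm":{"mid":"m1","uid":"u1"},"ap":"gmall","et":[]}',
'mid,uid');
-- should return m1 and u1, then the raw et array and 1541217850324, tab-separated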
c. UDTF for parsing the event-specific fields
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import java.util.ArrayList;
import java.util.List;
public class EventJsonUDTF extends GenericUDTF {
    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        // Define the names and types of the output columns
        List<String> fieldNames = new ArrayList<>();
        List<ObjectInspector> fieldOIs = new ArrayList<>();
        fieldNames.add("event_name");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("event_json");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }
    @Override
    public void process(Object[] objects) throws HiveException {
        // Get the input string (the et array produced by base_analizer)
        String input = objects[0].toString();
        // Validity check
        if (StringUtils.isBlank(input)) {
            return;
        }
        try {
            // Parse the et array
            JSONArray ja = new JSONArray(input);
            // Emit one (event_name, event_json) row per event
            for (int i = 0; i < ja.length(); i++) {
                String[] result = new String[2];
                result[0] = ja.getJSONObject(i).getString("en");
                result[1] = ja.getJSONObject(i).toString();
                forward(result);
            }
        } catch (JSONException e) {
            // Skip a malformed array rather than failing the query
            e.printStackTrace();
        }
    }

    @Override
    public void close() throws HiveException {
    }
}
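Once registered, the UDTF can be exercised on its own; a minimal sketch with an invented et array:
SELECT tmp.event_name, tmp.event_json
FROM (
SELECT '[{"ett":"1541217850328","en":"display","kv":{"action":"1"}}]' AS ops
) t
LATERAL VIEW flat_analizer(ops) tmp AS event_name, event_json;
-- one output row per event: event_name=display, event_json={"ett":"1541217850328","en":"display","kv":{"action":"1"}}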
d. Shell script that loads the DWD base tables
#!/bin/bash
DB=gmall
if [ -n "$1" ] ; then
log_date=$1
else
log_date=`date -d "-1 day" +%F`
fi
hql="
add jar <jar path>;
CREATE FUNCTION base_analizer AS 'udf.BaseFieldUDF';
CREATE FUNCTION flat_analizer AS 'udtf.EventJsonUDTF';
SET hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE ${DB}.dwd_base_start_log PARTITION(dt)
SELECT
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
event_name,
event_json,
server_time,
dt
FROM
(
SELECT
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0] AS mid_id,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1] AS user_id,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2] AS version_code,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3] AS version_name,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4] AS lang,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5] AS source,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6] AS os,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7] AS area,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8] AS model,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9] AS brand,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10] AS sdk_version,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11] AS gmail,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12] AS height_width,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13] AS app_time,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14] AS network,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15] AS lng,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16] AS lat,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17] AS ops,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18] AS server_time,
dt
FROM "${DB}".ods_start_log
WHERE
dt='${log_date}'
AND BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
) sdk_log
LATERAL VIEW FLAT_ANALIZER(ops) tmp_k AS event_name,event_json;
INSERT OVERWRITE TABLE "${DB}".dwd_base_event_log PARTITION(dt)
SELECT
mid_id,
user_id,
version_code,
version_name,
lang,
source,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
event_name,
event_json,
server_time,
dt
FROM
(
SELECT
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[0] AS mid_id,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[1] AS user_id,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[2] AS version_code,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[3] AS version_name,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[4] AS lang,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[5] AS source,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[6] AS os,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[7] AS area,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[8] AS model,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[9] AS brand,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[10] AS sdk_version,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[11] AS gmail,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[12] AS height_width,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[13] AS app_time,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[14] AS network,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[15] AS lng,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[16] AS lat,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[17] AS ops,
split(BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la'),'\t')[18] AS server_time,
dt
FROM "${DB}".ods_event_log
WHERE
dt='${log_date}'
AND BASE_ANALIZER(line,'mid,uid,vc,vn,l,sr,os,ar,md,ba,sv,g,hw,t,nw,ln,la')<>''
) sdk_log
LATERAL VIEW FLAT_ANALIZER(ops) tmp_k AS event_name,event_json;
"
hive -e "${hql}"
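After the script runs, a quick per-event row count confirms the load (the date is illustrative):
SELECT event_name, count(*) AS cnt
FROM gmall.dwd_base_event_log
WHERE dt='2019-02-10'
GROUP BY event_name;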
e. DWD event tables (using the display table as the example)
-- DDL (besides display there are also newsdetail (product detail page), loading (product list page), ad (advertisement), notification (message notification), foreground (user foreground activity), background (user background activity), comment (comments), favorites (favorites), praise (likes), start (startup log), and error (error log) tables)
DROP TABLE IF EXISTS dwd_display_log;
CREATE EXTERNAL TABLE `dwd_display_log`(
`mid_id` STRING,
`user_id` STRING,
`version_code` STRING,
`lang` STRING,
`os` STRING,
`area` STRING,
`model` STRING,
`brand` STRING,
`sdk_version` STRING,
`gmail` STRING,
`height_width` STRING,
`app_time` STRING,
`network` STRING,
`lng` STRING,
`lat` STRING,
-- Event-specific fields for the display table; each event table swaps in its own group here
`action` STRING,
`newsid` STRING,
`place` STRING,
`extendl` STRING,
`category` STRING,
-- Start log table fields would instead be: entry, open_ad_type, action, loading_time, detail, extendl
-- Error log table fields would instead be: errorBrief, errorDetail
`server_time` STRING
)
PARTITIONED BY (dt STRING)
LOCATION '/warehouse/gmall/dwd/dwd_display_log/';
-- Load the data (dt is a dynamic partition, so nonstrict mode is required, as in the script above)
SET hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE dwd_display_log PARTITION(dt)
SELECT
mid_id,
user_id,
version_code,
lang,
os,
area,
model,
brand,
sdk_version,
gmail,
height_width,
app_time,
network,
lng,
lat,
-- Event-specific fields for the display table; each event table selects its own group here
get_json_object(event_json,'$.kv.action') action,
get_json_object(event_json,'$.kv.newsid') newsid,
get_json_object(event_json,'$.kv.place') place,
get_json_object(event_json,'$.kv.extendl') extendl,
get_json_object(event_json,'$.kv.category') category,
-- Start log table fields: entry, open_ad_type, action, loading_time, detail, extendl (each via get_json_object on the matching '$.kv.' path)
-- Error log table fields: errorBrief, errorDetail
`server_time`,
dt
FROM dwd_base_event_log
WHERE
dt='2019-02-10'
AND event_name='display';
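As a final probe, the $.kv.* paths above resolve against the event_json column emitted by the UDTF; a hypothetical value makes the behavior visible:
SELECT get_json_object('{"ett":"1541217850328","en":"display","kv":{"action":"1","newsid":"n100"}}','$.kv.newsid');
-- returns: n100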