hive (gmall)> DROP TABLE IF EXISTS ods_base_dic;
-- ODS dictionary table: external, partitioned by dt, tab-delimited text
-- read through the LZO text input format.
CREATE EXTERNAL TABLE ods_base_dic (
    `dic_code`     STRING COMMENT '编号',
    `dic_name`     STRING COMMENT '编码名称',
    `parent_code`  STRING COMMENT '父编码',
    `create_time`  STRING COMMENT '创建日期',
    `operate_time` STRING COMMENT '操作日期'
)
COMMENT '编码字典表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS
    INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_base_dic/';
3.3.24 ODS 层加载数据脚本
1)在/home/dw/bin 目录下创建脚本 hdfs_to_ods_db.sh
[dw@dw1 bin]$ vim hdfs_to_ods_db.sh
#!/bin/bash
# hdfs_to_ods_db.sh - load one day of business-database exports from HDFS
# into the ODS Hive tables.
#
# Usage: hdfs_to_ods_db.sh {first|all} [date]
#   first  load the dt-partitioned tables, then base_province and base_region
#   all    load the dt-partitioned tables only
#   date   source directory / target partition (e.g. 2020-03-10);
#          defaults to yesterday

APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date passed as $2 when present; otherwise fall back to yesterday.
if [ -n "$2" ]; then
    do_date=$2
else
    do_date=$(date -d "-1 day" +%F)
fi

# Source tables loaded into partition dt='$do_date' of ${APP}.ods_<table>,
# listed in load order.
partitioned_tables="order_info order_detail sku_info user_info payment_info \
base_category1 base_category2 base_category3 base_trademark \
activity_info activity_order cart_info comment_info coupon_info coupon_use \
favor_info order_refund_info order_status_log spu_info activity_rule base_dic"

# Build one LOAD statement per partitioned table.
sql1=""
for table in $partitioned_tables; do
    sql1="${sql1}
load data inpath '/origin_data/$APP/db/${table}/$do_date' OVERWRITE into table ${APP}.ods_${table} partition(dt='$do_date');"
done

# base_province and base_region are loaded without a partition clause.
sql2="
load data inpath '/origin_data/$APP/db/base_province/$do_date' OVERWRITE into table ${APP}.ods_base_province;
load data inpath '/origin_data/$APP/db/base_region/$do_date' OVERWRITE into table ${APP}.ods_base_region;
"

case $1 in
"first")
    $hive -e "$sql1"
    $hive -e "$sql2"
    ;;
"all")
    $hive -e "$sql1"
    ;;
*)
    # An unknown mode used to fall through silently; report it instead.
    echo "Usage: $0 {first|all} [yyyy-MM-dd]" >&2
    exit 1
    ;;
esac
2)修改权限
[dw@dw1 bin]$ chmod 777 hdfs_to_ods_db.sh
3)初次导入
[dw@dw1 bin]$ hdfs_to_ods_db.sh first 2020-03-10
4)每日导入
[dw@dw1 bin]$ hdfs_to_ods_db.sh all 2020-03-11
5)测试数据是否导入成功
hive (gmall)> select * from ods_order_detail where dt='2020-03-11';
hive (gmall)>
-- Register base_analizer and flat_analizer; both classes are loaded from the
-- same jar on HDFS.
CREATE FUNCTION base_analizer
    AS 'com.atguigu.udf.BaseFieldUDF'
    USING JAR 'hdfs://dw1:9000/user/hive/jars/hivefunction-1.0-SNAPSHOT.jar';
CREATE FUNCTION flat_analizer
    AS 'com.atguigu.udtf.EventJsonUDTF'
    USING JAR 'hdfs://dw1:9000/user/hive/jars/hivefunction-1.0-SNAPSHOT.jar';
5)注意:如果修改了自定义函数重新生成 jar 包怎么处理?只需要替换 HDFS 路径上的旧 jar 包,然后重启 Hive 客户端即可。
4.2.4 解析事件日志基础明细表
1)解析事件日志基础明细表
hive (gmall)> INSERT OVERWRITE TABLE dwd_base_event_log
PARTITION (dt = '2020-03-10')
-- The LATERAL VIEW over flat_analizer(<'et' field>) supplies event_name and
-- event_json; every other column is extracted from the raw line by base_analizer.
SELECT
    base_analizer(line, 'mid') AS mid_id,
    base_analizer(line, 'uid') AS user_id,
    base_analizer(line, 'vc')  AS version_code,
    base_analizer(line, 'vn')  AS version_name,
    base_analizer(line, 'l')   AS lang,
    base_analizer(line, 'sr')  AS source,
    base_analizer(line, 'os')  AS os,
    base_analizer(line, 'ar')  AS area,
    base_analizer(line, 'md')  AS model,
    base_analizer(line, 'ba')  AS brand,
    base_analizer(line, 'sv')  AS sdk_version,
    base_analizer(line, 'g')   AS gmail,
    base_analizer(line, 'hw')  AS height_width,
    base_analizer(line, 't')   AS app_time,
    base_analizer(line, 'nw')  AS network,
    base_analizer(line, 'ln')  AS lng,
    base_analizer(line, 'la')  AS lat,
    event_name,
    event_json,
    base_analizer(line, 'st')  AS server_time
FROM ods_event_log
LATERAL VIEW flat_analizer(base_analizer(line, 'et')) tmp_flat AS event_name, event_json
WHERE dt = '2020-03-10'
  AND base_analizer(line, 'et') <> '';
2)测试
hive (gmall)> select * from dwd_base_event_log where dt='2020-03-10' limit 2;
4.2.5 DWD 层数据解析脚本
1)在 dw1 的/home/dw/bin 目录下创建脚本
[dw@dw1 bin]$ vim ods_to_dwd_base_log.sh
#!/bin/bash
# ods_to_dwd_base_log.sh - parse one day of ods_event_log into dwd_base_event_log.
#
# Usage: ods_to_dwd_base_log.sh [date]
#   date   partition to process (e.g. 2020-03-10); defaults to yesterday

APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date passed as $1 when present; otherwise fall back to yesterday.
if [ -n "$1" ]; then
    do_date=$1
else
    do_date=$(date -d '-1 day' +%F)
fi

# The statement relies on the current database for ods_event_log and for the
# base_analizer / flat_analizer functions, so switch to ${APP} first.
sql="
use ${APP};
insert overwrite table ${APP}.dwd_base_event_log
partition(dt='$do_date')
select
    base_analizer(line,'mid') as mid_id,
    base_analizer(line,'uid') as user_id,
    base_analizer(line,'vc') as version_code,
    base_analizer(line,'vn') as version_name,
    base_analizer(line,'l') as lang,
    base_analizer(line,'sr') as source,
    base_analizer(line,'os') as os,
    base_analizer(line,'ar') as area,
    base_analizer(line,'md') as model,
    base_analizer(line,'ba') as brand,
    base_analizer(line,'sv') as sdk_version,
    base_analizer(line,'g') as gmail,
    base_analizer(line,'hw') as height_width,
    base_analizer(line,'t') as app_time,
    base_analizer(line,'nw') as network,
    base_analizer(line,'ln') as lng,
    base_analizer(line,'la') as lat,
    event_name,
    event_json,
    base_analizer(line,'st') as server_time
from ods_event_log lateral view flat_analizer(base_analizer(line,'et')) tmp_flat as event_name,event_json
where dt='$do_date' and base_analizer(line,'et')<>'';
"

$hive -e "$sql"
hive (gmall)> INSERT OVERWRITE TABLE dwd_dim_base_province
-- Province rows joined to their region to pick up region_name.
SELECT
    bp.id,
    bp.name,
    bp.area_code,
    bp.iso_code,
    bp.region_id,
    br.region_name
FROM ods_base_province AS bp
INNER JOIN ods_base_region AS br
    ON bp.region_id = br.id;
3)查询加载结果
hive (gmall)> select * from dwd_dim_base_province;
4.4.5 时间维度表(特殊)(预留)
1)建表语句
hive (gmall)> DROP TABLE IF EXISTS `dwd_dim_date_info`;
-- Date dimension table: external, tab-delimited text, no partitions.
-- NOTE(review): is_workday carries COMMENT '是否是周末' ("is weekend"), which
-- does not match the column name - confirm which meaning is intended.
CREATE EXTERNAL TABLE `dwd_dim_date_info` (
    `date_id`    STRING COMMENT '日',
    `week_id`    INT    COMMENT '周',
    `week_day`   INT    COMMENT '周的第几天',
    `day`        INT    COMMENT '每月的第几天',
    `month`      INT    COMMENT '第几月',
    `quarter`    INT    COMMENT '第几季度',
    `year`       INT    COMMENT '年',
    `is_workday` INT    COMMENT '是否是周末',
    `holiday_id` INT    COMMENT '是否是节假日'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_dim_date_info/';
hive (gmall)> INSERT OVERWRITE TABLE dwd_fact_order_detail
PARTITION (dt = '2020-03-10')
-- The day's order-detail rows, with province_id taken from the matching order
-- and a last column computed as order_price * sku_num.
SELECT
    od.id,
    od.order_id,
    od.user_id,
    od.sku_id,
    od.sku_name,
    od.order_price,
    od.sku_num,
    od.create_time,
    oi.province_id,
    od.order_price * od.sku_num
FROM (
    SELECT * FROM ods_order_detail WHERE dt = '2020-03-10'
) AS od
INNER JOIN (
    SELECT * FROM ods_order_info WHERE dt = '2020-03-10'
) AS oi
    ON od.order_id = oi.id;
3)查询加载结果
hive (gmall)> select * from dwd_fact_order_detail where dt='2020-03-10';
注意:dt 是按照优惠券领用时间 get_time 作为分区。
2)数据装载
注意:需要先执行 set hive.exec.dynamic.partition.mode=nonstrict;(动态分区非严格模式),才能使用动态分区。
hive (gmall)> set hive.exec.dynamic.partition.mode=nonstrict;
-- Merge the day's ods_coupon_use rows ("new") into the existing fact rows
-- ("old") of every partition those rows map to. Each column prefers the new
-- value and falls back to the old one; the trailing column is the dynamic
-- partition value, derived from get_time.
INSERT OVERWRITE TABLE dwd_fact_coupon_use
PARTITION (dt)
SELECT
    coalesce(new.id, old.id),
    coalesce(new.coupon_id, old.coupon_id),
    coalesce(new.user_id, old.user_id),
    coalesce(new.order_id, old.order_id),
    coalesce(new.coupon_status, old.coupon_status),
    coalesce(new.get_time, old.get_time),
    coalesce(new.using_time, old.using_time),
    coalesce(new.used_time, old.used_time),
    date_format(coalesce(new.get_time, old.get_time), 'yyyy-MM-dd')
FROM (
    SELECT
        id, coupon_id, user_id, order_id,
        coupon_status, get_time, using_time, used_time
    FROM dwd_fact_coupon_use
    WHERE dt IN (
        SELECT date_format(get_time, 'yyyy-MM-dd')
        FROM ods_coupon_use
        WHERE dt = '2020-03-10'
    )
) AS old
FULL OUTER JOIN (
    SELECT
        id, coupon_id, user_id, order_id,
        coupon_status, get_time, using_time, used_time
    FROM ods_coupon_use
    WHERE dt = '2020-03-10'
) AS new
    ON old.id = new.id;
3)查询加载结果
hive (gmall)> select * from dwd_fact_coupon_use where dt='2020-03-10';
框架
from
(
)old
full outer join
(
)new
on
hive (gmall)> set hive.exec.dynamic.partition.mode=nonstrict;
-- Merge the day's orders ("new") into the existing fact rows ("old") of the
-- partitions they belong to. new.tms maps order_status -> operate_time, built
-- from the day's ods_order_status_log rows of each order. The trailing column
-- is the dynamic partition value.
INSERT OVERWRITE TABLE dwd_fact_order_info
PARTITION (dt)
SELECT
    coalesce(new.id, old.id),
    coalesce(new.order_status, old.order_status),
    coalesce(new.user_id, old.user_id),
    coalesce(new.out_trade_no, old.out_trade_no),
    coalesce(new.tms['1001'], old.create_time),         -- 1001 = unpaid status
    coalesce(new.tms['1002'], old.payment_time),
    coalesce(new.tms['1003'], old.cancel_time),
    coalesce(new.tms['1004'], old.finish_time),
    coalesce(new.tms['1005'], old.refund_time),
    coalesce(new.tms['1006'], old.refund_finish_time),
    coalesce(new.province_id, old.province_id),
    coalesce(new.activity_id, old.activity_id),
    coalesce(new.original_total_amount, old.original_total_amount),
    coalesce(new.benefit_reduce_amount, old.benefit_reduce_amount),
    coalesce(new.feight_fee, old.feight_fee),
    coalesce(new.final_total_amount, old.final_total_amount),
    date_format(coalesce(new.tms['1001'], old.create_time), 'yyyy-MM-dd')
FROM (
    SELECT *
    FROM dwd_fact_order_info
    WHERE dt IN (
        SELECT date_format(create_time, 'yyyy-MM-dd')
        FROM ods_order_info
        WHERE dt = '2020-03-10'
    )
) AS old
FULL OUTER JOIN (
    SELECT
        info.id,
        info.order_status,
        info.user_id,
        info.out_trade_no,
        info.province_id,
        act.activity_id,
        log.tms,
        info.original_total_amount,
        info.benefit_reduce_amount,
        info.feight_fee,
        info.final_total_amount
    FROM (
        SELECT
            order_id,
            str_to_map(concat_ws(',', collect_set(concat(order_status, '=', operate_time))), ',', '=') AS tms
        FROM ods_order_status_log
        WHERE dt = '2020-03-10'
        GROUP BY order_id
    ) AS log
    INNER JOIN (
        SELECT * FROM ods_order_info WHERE dt = '2020-03-10'
    ) AS info
        ON log.order_id = info.id
    LEFT JOIN (
        SELECT * FROM ods_activity_order WHERE dt = '2020-03-10'
    ) AS act
        ON log.order_id = act.order_id
) AS new
    ON old.id = new.id;
6)查询加载结果
hive (gmall)> select * from dwd_fact_order_info where dt='2020-03-10';
hive (gmall)> INSERT OVERWRITE TABLE dwd_dim_user_info_his_tmp
-- Rebuild the history into the tmp table from two branches:
--   1) the day's ods_user_info rows, opened with end_date '9999-99-99';
--   2) the existing history rows, where a still-open row whose user appears
--      in the day's data has its end_date set to the day before.
SELECT *
FROM (
    SELECT
        id,
        name,
        birthday,
        gender,
        email,
        user_level,
        create_time,
        operate_time,
        '2020-03-11' AS start_date,
        '9999-99-99' AS end_date
    FROM ods_user_info
    WHERE dt = '2020-03-11'

    UNION ALL

    SELECT
        uh.id,
        uh.name,
        uh.birthday,
        uh.gender,
        uh.email,
        uh.user_level,
        uh.create_time,
        uh.operate_time,
        uh.start_date,
        CASE
            WHEN ui.id IS NOT NULL AND uh.end_date = '9999-99-99' THEN date_add(ui.dt, -1)
            ELSE uh.end_date
        END AS end_date
    FROM dwd_dim_user_info_his AS uh
    LEFT JOIN (
        SELECT * FROM ods_user_info WHERE dt = '2020-03-11'
    ) AS ui
        ON uh.id = ui.id
) AS his
ORDER BY his.id, start_date;
步骤 3:把临时表覆盖给拉链表
1)导入数据
hive (gmall)> INSERT OVERWRITE TABLE dwd_dim_user_info_his
-- Replace the history table with the contents of the tmp table.
SELECT *
FROM dwd_dim_user_info_his_tmp;
2)查询导入数据
hive (gmall)> select
id,
start_date,
end_date
from dwd_dim_user_info_his;
4.4.15 DWD 层数据导入脚本
1)在/home/dw/bin 目录下创建脚本 ods_to_dwd_db.sh
[dw@dw1 bin]$ vim ods_to_dwd_db.sh
#!/bin/bash
# ods_to_dwd_db.sh - build the DWD tables from the ODS tables for one day.
#
# Usage: ods_to_dwd_db.sh {first|all} [date]
#   $1  mode, evaluated by the case statement at the end of the script
#   $2  partition date to process (e.g. 2020-03-10); defaults to yesterday

APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date passed as $2 when present; otherwise fall back to yesterday.
if [ -n "$2" ]; then
    do_date=$2
else
    do_date=$(date -d "-1 day" +%F)
fi
# sql1 starts here: the statements that are run for both the "first" and the
# "all" mode. First statement: SKU dimension, joined to trademark, SPU and the
# three category levels of the same day.
sql1="
INSERT OVERWRITE TABLE ${APP}.dwd_dim_sku_info
PARTITION (dt = '$do_date')
SELECT
    sku.id,
    sku.spu_id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.tm_id,
    bt.tm_name,
    bc3.id   AS category3_id,
    bc2.id   AS category2_id,
    bc1.id   AS category1_id,
    bc3.name AS category3_name,
    bc2.name AS category2_name,
    bc1.name AS category1_name,
    spu.spu_name,
    sku.create_time
FROM (
    SELECT * FROM ${APP}.ods_sku_info WHERE dt = '$do_date'
) AS sku
INNER JOIN (
    SELECT * FROM ${APP}.ods_base_trademark WHERE dt = '$do_date'
) AS bt
    ON sku.tm_id = bt.tm_id
INNER JOIN (
    SELECT * FROM ${APP}.ods_spu_info WHERE dt = '$do_date'
) AS spu
    ON spu.id = sku.spu_id
INNER JOIN (
    SELECT * FROM ${APP}.ods_base_category3 WHERE dt = '$do_date'
) AS bc3
    ON sku.category3_id = bc3.id
INNER JOIN (
    SELECT * FROM ${APP}.ods_base_category2 WHERE dt = '$do_date'
) AS bc2
    ON bc3.category2_id = bc2.id
INNER JOIN (
    SELECT * FROM ${APP}.ods_base_category1 WHERE dt = '$do_date'
) AS bc1
    ON bc2.category1_id = bc1.id;
insert overwrite table ${APP}.dwd_dim_coupon_info
partition(dt='$do_date')
select
id,
coupon_name,
coupon_type,
condition_amount,
condition_num,
activity_id,
benefit_amount,
benefit_discount,
create_time,
range_type,
spu_id, tm_id,
category3_id,
limit_num,
operate_time,
expire_time
from ${APP}.ods_coupon_info
where dt='$do_date';
insert overwrite table ${APP}.dwd_dim_activity_info
partition(dt='$do_date')
select
info.id,
info.activity_name,
info.activity_type,
rule.condition_amount,
rule.condition_num,
rule.benefit_amount,
rule.benefit_discount,
rule.benefit_level,
info.start_time,
info.end_time,
info.create_time from
(
select
*
from
${APP}.ods_activity_info
where dt='$do_date'
)info
left join
(
select
*
from
${APP}.ods_activity_rule
where dt='$do_date'
)rule on info.id = rule.activity_id;
insert overwrite table ${APP}.dwd_fact_order_detail
partition(dt='$do_date')
select od.id,
od.order_id,
od.user_id,
od.sku_id,
od.sku_name,
od.order_price,
od.sku_num,
od.create_time,
oi.province_id,
od.order_price*od.sku_num
from (
select
*
from
${APP}.ods_order_detail where dt='$do_date'
) od
join (
select
*
from
${APP}.ods_order_info
where dt='$do_date'
) oi
on od.order_id=oi.id;
insert overwrite table ${APP}.dwd_fact_payment_info
partition(dt='$do_date')
select
pi.id,
pi.out_trade_no,
pi.order_id,
pi.user_id,
pi.alipay_trade_no,
pi.total_amount,
pi.subject,
pi.payment_type,
pi.payment_time,
oi.province_id
from (
select
*
from ${APP}.ods_payment_info
where dt='$do_date'
)pi
join (
select
id,
province_id
from
${APP}.ods_order_info
where dt='$do_date'
)oi
on pi.order_id = oi.id;
insert overwrite table ${APP}.dwd_fact_order_refund_info
partition(dt='$do_date')
select id,
user_id,
order_id,
sku_id,
refund_type,
refund_num,
refund_amount,
refund_reason_type,
create_time
from ${APP}.ods_order_refund_info
where dt='$do_date';
insert overwrite table ${APP}.dwd_fact_comment_info
partition(dt='$do_date')
select
id,
user_id,
sku_id,
spu_id,
order_id,
appraise,
create_time
from ${APP}.ods_comment_info
where dt='$do_date';
insert overwrite table ${APP}.dwd_fact_cart_info
partition(dt='$do_date')
select id,
user_id,
sku_id,
cart_price,
sku_num,
sku_name,
create_time,
operate_time,
is_ordered,
order_time
from ${APP}.ods_cart_info
where dt='$do_date';
insert overwrite table ${APP}.dwd_fact_favor_info
partition(dt='$do_date')
select id,
user_id,
sku_id,
spu_id,
is_cancel,
create_time,
cancel_time
from ${APP}.ods_favor_info
where dt='$do_date';
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE ${APP}.dwd_fact_coupon_use
PARTITION (dt)
SELECT
    coalesce(new.id, old.id),
    coalesce(new.coupon_id, old.coupon_id),
    coalesce(new.user_id, old.user_id),
    coalesce(new.order_id, old.order_id),
    coalesce(new.coupon_status, old.coupon_status),
    coalesce(new.get_time, old.get_time),
    coalesce(new.using_time, old.using_time),
    coalesce(new.used_time, old.used_time),
    date_format(coalesce(new.get_time, old.get_time), 'yyyy-MM-dd')
FROM (
    SELECT
        id, coupon_id, user_id, order_id,
        coupon_status, get_time, using_time, used_time
    FROM ${APP}.dwd_fact_coupon_use
    WHERE dt IN (
        SELECT date_format(get_time, 'yyyy-MM-dd')
        FROM ${APP}.ods_coupon_use
        WHERE dt = '$do_date'
    )
) AS old
FULL OUTER JOIN (
    SELECT
        id, coupon_id, user_id, order_id,
        coupon_status, get_time, using_time, used_time
    FROM ${APP}.ods_coupon_use
    WHERE dt = '$do_date'
) AS new
    ON old.id = new.id;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE ${APP}.dwd_fact_order_info
PARTITION (dt)
SELECT
    coalesce(new.id, old.id),
    coalesce(new.order_status, old.order_status),
    coalesce(new.user_id, old.user_id),
    coalesce(new.out_trade_no, old.out_trade_no),
    coalesce(new.tms['1001'], old.create_time),
    coalesce(new.tms['1002'], old.payment_time),
    coalesce(new.tms['1003'], old.cancel_time),
    coalesce(new.tms['1004'], old.finish_time),
    coalesce(new.tms['1005'], old.refund_time),
    coalesce(new.tms['1006'], old.refund_finish_time),
    coalesce(new.province_id, old.province_id),
    coalesce(new.activity_id, old.activity_id),
    coalesce(new.original_total_amount, old.original_total_amount),
    coalesce(new.benefit_reduce_amount, old.benefit_reduce_amount),
    coalesce(new.feight_fee, old.feight_fee),
    coalesce(new.final_total_amount, old.final_total_amount),
    date_format(coalesce(new.tms['1001'], old.create_time), 'yyyy-MM-dd')
FROM (
    SELECT *
    FROM ${APP}.dwd_fact_order_info
    WHERE dt IN (
        SELECT date_format(create_time, 'yyyy-MM-dd')
        FROM ${APP}.ods_order_info
        WHERE dt = '$do_date'
    )
) AS old
FULL OUTER JOIN (
    SELECT
        info.id,
        info.order_status,
        info.user_id,
        info.out_trade_no,
        info.province_id,
        act.activity_id,
        log.tms,
        info.original_total_amount,
        info.benefit_reduce_amount,
        info.feight_fee,
        info.final_total_amount
    FROM (
        SELECT
            order_id,
            str_to_map(concat_ws(',', collect_set(concat(order_status, '=', operate_time))), ',', '=') AS tms
        FROM ${APP}.ods_order_status_log
        WHERE dt = '$do_date'
        GROUP BY order_id
    ) AS log
    INNER JOIN (
        SELECT * FROM ${APP}.ods_order_info WHERE dt = '$do_date'
    ) AS info
        ON log.order_id = info.id
    LEFT JOIN (
        SELECT * FROM ${APP}.ods_activity_order WHERE dt = '$do_date'
    ) AS act
        ON log.order_id = act.order_id
) AS new
    ON old.id = new.id;
INSERT OVERWRITE TABLE ${APP}.dwd_dim_user_info_his_tmp
SELECT *
FROM (
    SELECT
        id,
        name,
        birthday,
        gender,
        email,
        user_level,
        create_time,
        operate_time,
        '$do_date'   AS start_date,
        '9999-99-99' AS end_date
    FROM ${APP}.ods_user_info
    WHERE dt = '$do_date'

    UNION ALL

    SELECT
        uh.id,
        uh.name,
        uh.birthday,
        uh.gender,
        uh.email,
        uh.user_level,
        uh.create_time,
        uh.operate_time,
        uh.start_date,
        CASE
            WHEN ui.id IS NOT NULL AND uh.end_date = '9999-99-99' THEN date_add(ui.dt, -1)
            ELSE uh.end_date
        END AS end_date
    FROM ${APP}.dwd_dim_user_info_his AS uh
    LEFT JOIN (
        SELECT * FROM ${APP}.ods_user_info WHERE dt = '$do_date'
    ) AS ui
        ON uh.id = ui.id
) AS his
ORDER BY his.id, start_date;
INSERT OVERWRITE TABLE ${APP}.dwd_dim_user_info_his
SELECT *
FROM ${APP}.dwd_dim_user_info_his_tmp;
"
# sql2: province dimension, built from ods_base_province joined to
# ods_base_region; no dt filter is applied to either table.
sql2="
INSERT OVERWRITE TABLE ${APP}.dwd_dim_base_province
SELECT
    bp.id,
    bp.name,
    bp.area_code,
    bp.iso_code,
    bp.region_id,
    br.region_name
FROM ${APP}.ods_base_province AS bp
INNER JOIN ${APP}.ods_base_region AS br
    ON bp.region_id = br.id;
"
# Dispatch on the mode argument: "first" runs sql1 and then sql2,
# "all" runs sql1 only.
case $1 in
"first")
    $hive -e "$sql1"
    $hive -e "$sql2"
    ;;
"all")
    $hive -e "$sql1"
    ;;
*)
    # An unknown mode used to fall through silently; report it instead.
    echo "Usage: $0 {first|all} [yyyy-MM-dd]" >&2
    exit 1
    ;;
esac
2)增加脚本执行权限
[dw@dw1 bin]$ chmod 777 ods_to_dwd_db.sh
3)执行脚本导入数据
[dw@dw1 bin]$ ods_to_dwd_db.sh all 2020-03-11
4)查看导入数据
hive (gmall)>
select * from dwd_fact_order_info where dt='2020-03-11';
select * from dwd_fact_order_detail where dt='2020-03-11';
select * from dwd_fact_comment_info where dt='2020-03-11';
select * from dwd_fact_order_refund_info where dt='2020-03-11';
hive (gmall)>
-- Per-user daily metrics. Each CTE aggregates one kind of action; the final
-- query zero-pads every CTE to the same eight columns, unions them and sums
-- the result per user_id.
WITH
tmp_login AS (
    SELECT
        user_id,
        count(*) AS login_count
    FROM dwd_start_log
    WHERE dt = '2020-03-10'
      AND user_id IS NOT NULL
    GROUP BY user_id
),
tmp_cart AS (
    SELECT
        user_id,
        count(*)                  AS cart_count,
        sum(cart_price * sku_num) AS cart_amount
    FROM dwd_fact_cart_info
    WHERE dt = '2020-03-10'
      AND user_id IS NOT NULL
      AND date_format(create_time, 'yyyy-MM-dd') = '2020-03-10'
    GROUP BY user_id
),
tmp_order AS (
    SELECT
        user_id,
        count(*)                AS order_count,
        sum(final_total_amount) AS order_amount
    FROM dwd_fact_order_info
    WHERE dt = '2020-03-10'
    GROUP BY user_id
),
tmp_payment AS (
    SELECT
        user_id,
        count(*)            AS payment_count,
        sum(payment_amount) AS payment_amount
    FROM dwd_fact_payment_info
    WHERE dt = '2020-03-10'
    GROUP BY user_id
)
INSERT OVERWRITE TABLE dws_user_action_daycount
PARTITION (dt = '2020-03-10')
SELECT
    user_actions.user_id,
    sum(user_actions.login_count),
    sum(user_actions.cart_count),
    sum(user_actions.cart_amount),
    sum(user_actions.order_count),
    sum(user_actions.order_amount),
    sum(user_actions.payment_count),
    sum(user_actions.payment_amount)
FROM (
    SELECT user_id, login_count,
           0 AS cart_count, 0 AS cart_amount,
           0 AS order_count, 0 AS order_amount,
           0 AS payment_count, 0 AS payment_amount
    FROM tmp_login
    UNION ALL
    SELECT user_id, 0 AS login_count,
           cart_count, cart_amount,
           0 AS order_count, 0 AS order_amount,
           0 AS payment_count, 0 AS payment_amount
    FROM tmp_cart
    UNION ALL
    SELECT user_id, 0 AS login_count,
           0 AS cart_count, 0 AS cart_amount,
           order_count, order_amount,
           0 AS payment_count, 0 AS payment_amount
    FROM tmp_order
    UNION ALL
    SELECT user_id, 0 AS login_count,
           0 AS cart_count, 0 AS cart_amount,
           0 AS order_count, 0 AS order_amount,
           payment_count, payment_amount
    FROM tmp_payment
) AS user_actions
GROUP BY user_id;
3)查询加载结果
hive (gmall)> select * from dws_user_action_daycount where dt='2020-03-10';
hive (gmall)> INSERT OVERWRITE TABLE dws_activity_info_daycount
PARTITION (dt = '2020-03-10')
-- Per-activity counts of orders created and orders paid on the day, read from
-- the day's and the previous day's order partitions, joined to the activity
-- dimension of the day.
SELECT
    oi.activity_id,
    ai.activity_name,
    ai.activity_type,
    ai.start_time,
    ai.end_time,
    ai.create_time,
    oi.order_count,
    oi.payment_count
FROM (
    SELECT
        activity_id,
        sum(if(date_format(create_time, 'yyyy-MM-dd') = '2020-03-10', 1, 0))  AS order_count,
        sum(if(date_format(payment_time, 'yyyy-MM-dd') = '2020-03-10', 1, 0)) AS payment_count
    FROM dwd_fact_order_info
    WHERE (dt = '2020-03-10' OR dt = date_add('2020-03-10', -1))
      AND activity_id IS NOT NULL
    GROUP BY activity_id
) AS oi
INNER JOIN (
    SELECT * FROM dwd_dim_activity_info WHERE dt = '2020-03-10'
) AS ai
    ON oi.activity_id = ai.id;
hive (gmall)> INSERT OVERWRITE TABLE dws_sale_detail_daycount
PARTITION (dt = '2020-03-10')
-- Per (user, sku) order totals of the day, enriched with the user's open
-- history row (end_date = '9999-99-99') and the SKU dimension of the day.
SELECT
    op.user_id,
    op.sku_id,
    ui.gender,
    months_between('2020-03-10', ui.birthday) / 12 AS age,
    ui.user_level,
    si.price,
    si.sku_name,
    si.tm_id,
    si.category3_id,
    si.category2_id,
    si.category1_id,
    si.category3_name,
    si.category2_name,
    si.category1_name,
    si.spu_id,
    op.sku_num,
    op.order_count,
    op.order_amount
FROM (
    SELECT
        user_id,
        sku_id,
        sum(sku_num)      AS sku_num,
        count(*)          AS order_count,
        sum(total_amount) AS order_amount
    FROM dwd_fact_order_detail
    WHERE dt = '2020-03-10'
    GROUP BY user_id, sku_id
) AS op
INNER JOIN (
    SELECT * FROM dwd_dim_user_info_his WHERE end_date = '9999-99-99'
) AS ui
    ON op.user_id = ui.id
INNER JOIN (
    SELECT * FROM dwd_dim_sku_info WHERE dt = '2020-03-10'
) AS si
    ON op.sku_id = si.id;
3)查询加载结果
hive (gmall)> select * from dws_sale_detail_daycount where dt='2020-03-10';
5.5 DWS 层数据导入脚本
1)在/home/dw/bin 目录下创建脚本 dwd_to_dws.sh
#!/bin/bash
# dwd_to_dws.sh - build the DWS daily aggregate tables from the DWD tables.
#
# Usage: dwd_to_dws.sh [date]
#   date   partition to process (e.g. 2020-03-10); defaults to yesterday

APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date passed as $1 when present; otherwise fall back to yesterday.
if [ -n "$1" ]; then
    do_date=$1
else
    do_date=$(date -d "-1 day" +%F)
fi
# sql starts here. First statement: one row per mid_id, each attribute being
# the '|'-joined set of values seen for that device in the day's start log.
sql="
INSERT OVERWRITE TABLE ${APP}.dws_uv_detail_daycount
PARTITION (dt = '$do_date')
SELECT
    mid_id,
    concat_ws('|', collect_set(user_id))      AS user_id,
    concat_ws('|', collect_set(version_code)) AS version_code,
    concat_ws('|', collect_set(version_name)) AS version_name,
    concat_ws('|', collect_set(lang))         AS lang,
    concat_ws('|', collect_set(source))       AS source,
    concat_ws('|', collect_set(os))           AS os,
    concat_ws('|', collect_set(area))         AS area,
    concat_ws('|', collect_set(model))        AS model,
    concat_ws('|', collect_set(brand))        AS brand,
    concat_ws('|', collect_set(sdk_version))  AS sdk_version,
    concat_ws('|', collect_set(gmail))        AS gmail,
    concat_ws('|', collect_set(height_width)) AS height_width,
    concat_ws('|', collect_set(app_time))     AS app_time,
    concat_ws('|', collect_set(network))      AS network,
    concat_ws('|', collect_set(lng))          AS lng,
    concat_ws('|', collect_set(lat))          AS lat,
    count(*)                                  AS login_count
FROM ${APP}.dwd_start_log
WHERE dt = '$do_date'
GROUP BY mid_id;
WITH
tmp_login AS (
    SELECT
        user_id,
        count(*) AS login_count
    FROM ${APP}.dwd_start_log
    WHERE dt = '$do_date'
      AND user_id IS NOT NULL
    GROUP BY user_id
),
tmp_cart AS (
    SELECT
        user_id,
        count(*)                  AS cart_count,
        sum(cart_price * sku_num) AS cart_amount
    FROM ${APP}.dwd_fact_cart_info
    WHERE dt = '$do_date'
      AND user_id IS NOT NULL
      AND date_format(create_time, 'yyyy-MM-dd') = '$do_date'
    GROUP BY user_id
),
tmp_order AS (
    SELECT
        user_id,
        count(*)                AS order_count,
        sum(final_total_amount) AS order_amount
    FROM ${APP}.dwd_fact_order_info
    WHERE dt = '$do_date'
    GROUP BY user_id
),
tmp_payment AS (
    SELECT
        user_id,
        count(*)            AS payment_count,
        sum(payment_amount) AS payment_amount
    FROM ${APP}.dwd_fact_payment_info
    WHERE dt = '$do_date'
    GROUP BY user_id
)
INSERT OVERWRITE TABLE ${APP}.dws_user_action_daycount
PARTITION (dt = '$do_date')
SELECT
    user_actions.user_id,
    sum(user_actions.login_count),
    sum(user_actions.cart_count),
    sum(user_actions.cart_amount),
    sum(user_actions.order_count),
    sum(user_actions.order_amount),
    sum(user_actions.payment_count),
    sum(user_actions.payment_amount)
FROM (
    SELECT user_id, login_count,
           0 AS cart_count, 0 AS cart_amount,
           0 AS order_count, 0 AS order_amount,
           0 AS payment_count, 0 AS payment_amount
    FROM tmp_login
    UNION ALL
    SELECT user_id, 0 AS login_count,
           cart_count, cart_amount,
           0 AS order_count, 0 AS order_amount,
           0 AS payment_count, 0 AS payment_amount
    FROM tmp_cart
    UNION ALL
    SELECT user_id, 0 AS login_count,
           0 AS cart_count, 0 AS cart_amount,
           order_count, order_amount,
           0 AS payment_count, 0 AS payment_amount
    FROM tmp_order
    UNION ALL
    SELECT user_id, 0 AS login_count,
           0 AS cart_count, 0 AS cart_amount,
           0 AS order_count, 0 AS order_amount,
           payment_count, payment_amount
    FROM tmp_payment
) AS user_actions
GROUP BY user_id;
WITH
tmp_order AS (
    SELECT
        sku_id,
        count(*)          AS order_count,
        sum(sku_num)      AS order_num,
        sum(total_amount) AS order_amount
    FROM ${APP}.dwd_fact_order_detail
    WHERE dt = '$do_date'
    GROUP BY sku_id
),
tmp_payment AS (
    SELECT
        sku_id,
        count(*)          AS payment_count,
        sum(sku_num)      AS payment_num,
        sum(total_amount) AS payment_amount
    FROM ${APP}.dwd_fact_order_detail
    WHERE dt = '$do_date'
      AND order_id IN (
          SELECT id
          FROM ${APP}.dwd_fact_order_info
          WHERE (dt = '$do_date' OR dt = date_add('$do_date', -1))
            AND date_format(payment_time, 'yyyy-MM-dd') = '$do_date'
      )
    GROUP BY sku_id
),
tmp_refund AS (
    SELECT
        sku_id,
        count(*)           AS refund_count,
        sum(refund_num)    AS refund_num,
        sum(refund_amount) AS refund_amount
    FROM ${APP}.dwd_fact_order_refund_info
    WHERE dt = '$do_date'
    GROUP BY sku_id
),
tmp_cart AS (
    SELECT
        sku_id,
        count(*)     AS cart_count,
        sum(sku_num) AS cart_num
    FROM ${APP}.dwd_fact_cart_info
    WHERE dt = '$do_date'
      AND date_format(create_time, 'yyyy-MM-dd') = '$do_date'
    GROUP BY sku_id
),
tmp_favor AS (
    SELECT
        sku_id,
        count(*) AS favor_count
    FROM ${APP}.dwd_fact_favor_info
    WHERE dt = '$do_date'
      AND date_format(create_time, 'yyyy-MM-dd') = '$do_date'
    GROUP BY sku_id
),
tmp_appraise AS (
    SELECT
        sku_id,
        sum(if(appraise = '1201', 1, 0)) AS appraise_good_count,
        sum(if(appraise = '1202', 1, 0)) AS appraise_mid_count,
        sum(if(appraise = '1203', 1, 0)) AS appraise_bad_count,
        sum(if(appraise = '1204', 1, 0)) AS appraise_default_count
    FROM ${APP}.dwd_fact_comment_info
    WHERE dt = '$do_date'
    GROUP BY sku_id
)
INSERT OVERWRITE TABLE ${APP}.dws_sku_action_daycount
PARTITION (dt = '$do_date')
SELECT
    sku_id,
    sum(order_count),
    sum(order_num),
    sum(order_amount),
    sum(payment_count),
    sum(payment_num),
    sum(payment_amount),
    sum(refund_count),
    sum(refund_num),
    sum(refund_amount),
    sum(cart_count),
    sum(cart_num),
    sum(favor_count),
    sum(appraise_good_count),
    sum(appraise_mid_count),
    sum(appraise_bad_count),
    sum(appraise_default_count)
FROM (
    SELECT sku_id,
           order_count, order_num, order_amount,
           0 AS payment_count, 0 AS payment_num, 0 AS payment_amount,
           0 AS refund_count, 0 AS refund_num, 0 AS refund_amount,
           0 AS cart_count, 0 AS cart_num,
           0 AS favor_count,
           0 AS appraise_good_count, 0 AS appraise_mid_count,
           0 AS appraise_bad_count, 0 AS appraise_default_count
    FROM tmp_order
    UNION ALL
    SELECT sku_id,
           0 AS order_count, 0 AS order_num, 0 AS order_amount,
           payment_count, payment_num, payment_amount,
           0 AS refund_count, 0 AS refund_num, 0 AS refund_amount,
           0 AS cart_count, 0 AS cart_num,
           0 AS favor_count,
           0 AS appraise_good_count, 0 AS appraise_mid_count,
           0 AS appraise_bad_count, 0 AS appraise_default_count
    FROM tmp_payment
    UNION ALL
    SELECT sku_id,
           0 AS order_count, 0 AS order_num, 0 AS order_amount,
           0 AS payment_count, 0 AS payment_num, 0 AS payment_amount,
           refund_count, refund_num, refund_amount,
           0 AS cart_count, 0 AS cart_num,
           0 AS favor_count,
           0 AS appraise_good_count, 0 AS appraise_mid_count,
           0 AS appraise_bad_count, 0 AS appraise_default_count
    FROM tmp_refund
    UNION ALL
    SELECT sku_id,
           0 AS order_count, 0 AS order_num, 0 AS order_amount,
           0 AS payment_count, 0 AS payment_num, 0 AS payment_amount,
           0 AS refund_count, 0 AS refund_num, 0 AS refund_amount,
           cart_count, cart_num,
           0 AS favor_count,
           0 AS appraise_good_count, 0 AS appraise_mid_count,
           0 AS appraise_bad_count, 0 AS appraise_default_count
    FROM tmp_cart
    UNION ALL
    SELECT sku_id,
           0 AS order_count, 0 AS order_num, 0 AS order_amount,
           0 AS payment_count, 0 AS payment_num, 0 AS payment_amount,
           0 AS refund_count, 0 AS refund_num, 0 AS refund_amount,
           0 AS cart_count, 0 AS cart_num,
           favor_count,
           0 AS appraise_good_count, 0 AS appraise_mid_count,
           0 AS appraise_bad_count, 0 AS appraise_default_count
    FROM tmp_favor
    UNION ALL
    SELECT sku_id,
           0 AS order_count, 0 AS order_num, 0 AS order_amount,
           0 AS payment_count, 0 AS payment_num, 0 AS payment_amount,
           0 AS refund_count, 0 AS refund_num, 0 AS refund_amount,
           0 AS cart_count, 0 AS cart_num,
           0 AS favor_count,
           appraise_good_count, appraise_mid_count,
           appraise_bad_count, appraise_default_count
    FROM tmp_appraise
) AS tmp
GROUP BY sku_id;
insert overwrite table ${APP}.dws_coupon_use_daycount
partition(dt='$do_date')
select
cu.coupon_id,
ci.coupon_name,
ci.coupon_type,
ci.condition_amount,
ci.condition_num,
ci.activity_id,
ci.benefit_amount,
ci.benefit_discount,
ci.create_time,
ci.range_type,
ci.spu_id,
ci.tm_id,
ci.category3_id,
ci.limit_num,
cu.get_count,
cu.using_count,
cu.used_count
from (
select
coupon_id,
sum(if(date_format(get_time,'yyyy-MM-dd')='$do_date',1,0)) get_count,
sum(if(date_format(using_time,'yyyy-MM-dd')='$do_date',1,0)) using_count,
sum(if(date_format(used_time,'yyyy-MM-dd')='$do_date',1,0)) used_count
from
${APP}.dwd_fact_coupon_use
where dt='$do_date'
group by coupon_id
)cu left join (
select
*
from ${APP}.dwd_dim_coupon_info
where dt='$do_date'
)ci on cu.coupon_id=ci.id;
insert overwrite table ${APP}.dws_activity_info_daycount
partition(dt='$do_date')
select
oi.activity_id,
ai.activity_name,
ai.activity_type,
ai.start_time,
ai.end_time,
ai.create_time,
oi.order_count,
oi.payment_count
from (
select
activity_id,
sum(if(date_format(create_time,'yyyy-MM-dd')='$do_date',1,0)) order_count,
sum(if(date_format(payment_time,'yyyy-MM-dd')='$do_date',1,0)) payment_count
from ${APP}.dwd_fact_order_info
where (dt='$do_date' or dt=date_add('$do_date',-1)) and activity_id is not null
group by activity_id
)oi join (
select * from ${APP}.dwd_dim_activity_info
where dt='$do_date'
)ai on oi.activity_id=ai.id;
-- Per user and SKU daily sales detail, enriched with user and SKU attributes.
insert overwrite table ${APP}.dws_sale_detail_daycount
partition(dt='$do_date')
select
    sales.user_id,
    sales.sku_id,
    usr.gender,
    -- Age in years as a fraction (months / 12). NOTE(review): confirm the target column accepts a non-integer.
    months_between('$do_date', usr.birthday)/12 as age,
    usr.user_level,
    sku.price,
    sku.sku_name,
    sku.tm_id,
    sku.category3_id,
    sku.category2_id,
    sku.category1_id,
    sku.category3_name,
    sku.category2_name,
    sku.category1_name,
    sku.spu_id,
    sales.sku_num,
    sales.order_count,
    sales.order_amount
from (
    select
        user_id,
        sku_id,
        sum(sku_num) as sku_num,
        count(*) as order_count,
        sum(total_amount) as order_amount
    from ${APP}.dwd_fact_order_detail
    where dt='$do_date'
    group by user_id, sku_id
) sales
inner join (
    -- Keeps only rows whose end_date equals the 9999-99-99 sentinel.
    select *
    from ${APP}.dwd_dim_user_info_his
    where end_date='9999-99-99'
) usr
    on sales.user_id = usr.id
inner join (
    select *
    from ${APP}.dwd_dim_sku_info
    where dt='$do_date'
) sku
    on sales.sku_id = sku.id;
"
# The closing quote above ends the SQL string. The space before -e is required:
# without it the shell looks for a command literally named ".../hive-e".
$hive -e "$sql"
2)增加脚本执行权限
chmod 777 dwd_to_dws.sh
3)执行脚本导入数据
dwd_to_dws.sh 2020-03-11
4)查看导入数据
hive (gmall)> select * from dws_uv_detail_daycount where dt='2020-03-11' limit 2;
select * from dws_user_action_daycount where dt='2020-03-11' limit 2;
select * from dws_sku_action_daycount where dt='2020-03-11' limit 2;
select * from dws_sale_detail_daycount where dt='2020-03-11' limit 2;
select * from dws_coupon_use_daycount where dt='2020-03-11' limit 2;
select * from dws_activity_info_daycount where dt='2020-03-11' limit 2;
hive (gmall)> insert into table ads_uv_count
-- One row for the statistics date: devices last seen that day, that week
-- (Monday through Sunday) and that month, plus week-end / month-end flags.
select
    '2020-03-10',
    sum(case when login_date_last='2020-03-10' then 1 else 0 end),
    sum(case
            when login_date_last between date_add(next_day('2020-03-10','monday'),-7)
                                     and date_add(next_day('2020-03-10','monday'),-1)
            then 1
            else 0
        end),
    sum(case when date_format(login_date_last,'yyyy-MM') = date_format('2020-03-10','yyyy-MM') then 1 else 0 end),
    -- 'Y' when the statistics date is the Sunday that closes its week.
    if('2020-03-10' = date_add(next_day('2020-03-10','monday'),-1),'Y','N'),
    -- 'Y' when the statistics date is the last day of its month.
    if('2020-03-10' = last_day('2020-03-10'),'Y','N')
from dwt_uv_topic;
3)查询导入结果
hive (gmall)> select * from ads_uv_count;
7.1.2 每日新增设备
1)建表语句
hive (gmall)> drop table if exists ads_new_mid_count;
-- Daily number of newly added devices; rows are tab-delimited.
create external table ads_new_mid_count (
    `create_date`   string comment '创建时间',
    `new_mid_count` bigint comment '新增设备数量'
) comment '每日新增设备信息数量'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_new_mid_count/';
2)导入数据
hive (gmall)>insert into table ads_new_mid_count
-- Form 1: counts devices whose first login date is the statistics date.
-- With no group by, this always yields exactly one row (count 0 when nothing matches).
select
'2020-03-10',
count(*)
from dwt_uv_topic
where login_date_first='2020-03-10';
-- Form 2: same filter, but the date comes from the grouped column, so no row
-- is produced when nothing matches.
-- Both forms append to the same table: run only one of them, otherwise the
-- day is written twice.
insert into table ads_new_mid_count
select
login_date_first,
count(*)
from dwt_uv_topic
where login_date_first='2020-03-10'
group by login_date_first;
3)查询导入数据
hive (gmall)> select * from ads_new_mid_count;
7.1.3 沉默用户数
需求定义:
沉默用户:只在安装当天启动过,且启动时间是在 7 天前
1)建表语句
hive (gmall)> drop table if exists ads_silent_count;
-- Number of silent devices per statistics date; rows are tab-delimited.
create external table ads_silent_count (
    `dt`           string comment '统计日期',
    `silent_count` bigint comment '沉默设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_silent_count';
2)导入数据
hive (gmall)> insert into table ads_silent_count
-- Devices whose first and last login dates coincide and lie at least
-- 7 days before the statistics date.
select
    '2020-03-10',
    count(*)
from dwt_uv_topic
where login_date_last <= date_add('2020-03-10',-7)
  and login_date_first = login_date_last;
3)查询导入数据
hive (gmall)> select * from ads_silent_count;
7.1.4 本周回流用户数
需求定义:本周回流用户指上周未活跃、本周活跃,且不是本周新增的设备。
1)建表语句
hive (gmall)> drop table if exists ads_back_count;
-- Returning-device count per statistics date and week; rows are tab-delimited.
-- NOTE(review): the count column is named wastage_count although its comment
-- says it holds returning devices; the name is kept because loaders insert by position.
create external table ads_back_count (
    `dt`            string comment '统计日期',
    `wk_dt`         string comment '统计日期所在周',
    `wastage_count` bigint comment '回流设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_back_count';
2)导入数据
hive (gmall)> insert into table ads_back_count
-- Returning devices: active this week, first seen before this week, and
-- absent from last week's activity (anti-join via left join + is null).
select
    '2020-03-10',
    concat(date_add(next_day('2020-03-10','MO'),-7),'_',date_add(next_day('2020-03-10','MO'),-1)),
    count(*)
from
(
    -- Devices last seen within the current Monday-to-Sunday week whose first
    -- login predates this week's Monday (i.e. not new this week).
    select
        mid_id
    from dwt_uv_topic
    where login_date_last>=date_add(next_day('2020-03-10','MO'),-7)
    and login_date_last<= date_add(next_day('2020-03-10','MO'),-1)
    and login_date_first<date_add(next_day('2020-03-10','MO'),-7)
)current_wk
left join
(
    -- Devices with any activity in the previous Monday-to-Sunday week.
    select
        mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('2020-03-10','MO'),-7*2)
    and dt<= date_add(next_day('2020-03-10','MO'),-7-1)
    group by mid_id
)last_wk
on current_wk.mid_id=last_wk.mid_id
where last_wk.mid_id is null;
7.1.5 流失用户数
需求定义:流失用户指最近 7 天未活跃的设备。
1)建表语句
hive (gmall)> drop table if exists ads_wastage_count;
-- Number of churned devices per statistics date; rows are tab-delimited.
create external table ads_wastage_count (
    `dt`            string comment '统计日期',
    `wastage_count` bigint comment '流失设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_wastage_count';
2)导入数据
hive (gmall)> insert into table ads_wastage_count
-- Devices whose last login is 7 or more days before the statistics date.
select
    '2020-03-10',
    count(*)
from dwt_uv_topic
where login_date_last <= date_add('2020-03-10',-7);
-- 说明:dwt 层每条记录对应一个设备 id,因此上面的写法不需要 group by 去重;下面带 group by 的写法结果相同,两种写法只需执行其一,否则同一天会被写入两次。
hive (gmall)> insert into table ads_wastage_count
-- Variant that groups by mid_id before counting, i.e. counts distinct
-- devices whose last login is 7 or more days before the statistics date.
select
    '2020-03-10',
    count(*)
from (
    select mid_id
    from dwt_uv_topic
    where login_date_last <= date_add('2020-03-10',-7)
    group by mid_id
) inactive;
-- Same churn count, computed for statistics date 2020-03-11.
insert into table ads_wastage_count
select
    '2020-03-11',
    count(*)
from dwt_uv_topic
where login_date_last <= date_add('2020-03-11',-7);
7.1.6 留存率
明确一天所算的任务03-14
1)建表语句
hive (gmall)> drop table if exists ads_user_retention_day_rate;
-- Retention per (statistics date, cohort date) pair; rows are tab-delimited.
create external table ads_user_retention_day_rate (
    `stat_date`       string        comment '统计日期',
    `create_date`     string        comment '设备新增日期',
    `retention_day`   int           comment '截止当前日期留存天数',
    `retention_count` bigint        comment '留存数量',
    `new_mid_count`   bigint        comment '设备新增数量',
    `retention_ratio` decimal(10,2) comment '留存率'
) comment '每日用户留存情况'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
2)导入数据
hive (gmall)>
-- Retention measured on 2020-03-13 for the cohorts first seen 3, 2 and 1 days
-- earlier: one output row per cohort. A device counts as retained when its
-- last login date equals the statistics date.
insert into table ads_user_retention_day_rate
select
    '2020-03-13',
    '2020-03-10',
    3,
    sum(case when login_date_first='2020-03-10' and login_date_last='2020-03-13' then 1 else 0 end),
    sum(case when login_date_first='2020-03-10' then 1 else 0 end),
    sum(case when login_date_first='2020-03-10' and login_date_last='2020-03-13' then 1 else 0 end)
        / sum(case when login_date_first='2020-03-10' then 1 else 0 end)
from dwt_uv_topic
union all
select
    '2020-03-13',
    '2020-03-11',
    2,
    sum(case when login_date_first='2020-03-11' and login_date_last='2020-03-13' then 1 else 0 end),
    sum(case when login_date_first='2020-03-11' then 1 else 0 end),
    sum(case when login_date_first='2020-03-11' and login_date_last='2020-03-13' then 1 else 0 end)
        / sum(case when login_date_first='2020-03-11' then 1 else 0 end)
from dwt_uv_topic
union all
select
    '2020-03-13',
    '2020-03-12',
    1,
    sum(case when login_date_first='2020-03-12' and login_date_last='2020-03-13' then 1 else 0 end),
    sum(case when login_date_first='2020-03-12' then 1 else 0 end),
    sum(case when login_date_first='2020-03-12' and login_date_last='2020-03-13' then 1 else 0 end)
        / sum(case when login_date_first='2020-03-12' then 1 else 0 end)
from dwt_uv_topic;
#脚本示例
select
'$do_date',
date_add('$do_date',-3),
3,
sum(if(login_date_first=date_add('$do_date',-3) and login_date_last='$do_date',1,0)),
sum(if(login_date_first=date_add('$do_date',-3),1,0)),
sum(if(login_date_first=date_add('$do_date',-3) and login_date_last='$do_date',1,0))/sum(if(login_date_first=date_add('$do_date',-3),1,0))
from dwt_uv_topic
3)查询导入数据
hive (gmall)>select * from ads_user_retention_day_rate;
7.1.7 最近连续三周活跃用户数
1)建表语句
hive (gmall)> drop table if exists ads_continuity_wk_count;
-- Count of devices active in three consecutive weeks; rows are tab-delimited.
-- The dt column comment had a stray space inside the word for "date"
-- (a line-wrap artifact), removed here.
create external table ads_continuity_wk_count(
    `dt`               string COMMENT '统计日期,一般用结束周周日日期,如果每天计算一次,可用当天日期',
    `wk_dt`            string COMMENT '持续时间',
    `continuity_count` bigint COMMENT '活跃次数'
)row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_continuity_wk_count';
2)导入数据
hive (gmall)> insert into table ads_continuity_wk_count
-- Counts devices active in each of the three most recent Monday-to-Sunday
-- weeks, the last of which contains the statistics date.
select
    '2020-03-10',
    -- Fixed: the second date literal previously carried a leading space
    -- (' 2020-03-10'), which is not a valid date string for next_day.
    concat(date_add(next_day('2020-03-10','MO'),-7*3),'_',date_add(next_day('2020-03-10','MO'),-1)),
    count(*)
from
(
    select
        mid_id
    from
    (
        -- Current week.
        select
            mid_id
        from dws_uv_detail_daycount
        where dt>=date_add(next_day('2020-03-10','monday'),-7) and dt<=date_add(next_day('2020-03-10','monday'),-1)
        group by mid_id
        union all
        -- Previous week.
        select
            mid_id
        from dws_uv_detail_daycount
        where dt>=date_add(next_day('2020-03-10','monday'),-7*2) and dt<=date_add(next_day('2020-03-10','monday'),-7-1)
        group by mid_id
        union all
        -- Two weeks back.
        select
            mid_id
        from dws_uv_detail_daycount
        where dt>=date_add(next_day('2020-03-10','monday'),-7*3) and dt<=date_add(next_day('2020-03-10','monday'),-7*2-1)
        group by mid_id
    )t1
    group by mid_id
    -- Each branch emits a device at most once, so 3 rows means all three weeks.
    having count(*)=3
)t2;
3)查询
hive (gmall)> select * from ads_continuity_wk_count;
7.1.8 最近七天内连续三天活跃用户数
1)建表语句
hive (gmall)> drop table if exists ads_continuity_uv_count;
-- Count of continuously active devices per statistics date; rows are tab-delimited.
create external table ads_continuity_uv_count (
    `dt`               string comment '统计日期',
    `wk_dt`            string comment '最近 7 天日期',
    `continuity_count` bigint
) comment '连续活跃设备数'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_continuity_uv_count';
2)导入数据(思路 1:使用开窗函数)
hive (gmall)>
-- Devices active on at least 3 consecutive days within the 7-day window that
-- ends on the statistics date.
insert into table ads_continuity_uv_count
select
    '2020-03-10',
    concat(date_add('2020-03-10',-6),'_','2020-03-10'),
    count(*)
from
(
    -- Collapse to one row per device, however many qualifying streaks it has.
    select
        mid_id
    from
    (
        -- A streak of consecutive days shares one diff value; keep streaks of 3+.
        select
            mid_id
        from
        (
            -- Subtracting the per-device rank from the date yields a constant
            -- for consecutive dates.
            select
                mid_id,
                date_add(dt,-rk) diff
            from
            (
                select
                    mid_id,
                    dt,
                    rank() over(partition by mid_id order by dt) rk
                from dws_uv_detail_daycount
                -- Upper bound added so a re-run after later partitions exist
                -- still covers exactly the 7 days ending on the statistics date.
                where dt>=date_add('2020-03-10',-6) and dt<='2020-03-10'
            )t1
        )t2
        group by mid_id,diff
        having count(*) >=3
    )t3
    group by mid_id
)t4;
-- Daily funnel: visitors, users who carted, ordered and paid, with each step
-- expressed as a percentage of the previous one.
insert into table ads_user_action_convert_day
select
    funnel.dt,
    visits.day_count,
    cart_count,
    cart_count / visits.day_count * 100,
    order_count,
    order_count / cart_count * 100,
    payment_count,
    payment_count / order_count * 100
from (
    -- Number of users with at least one cart / order / payment action that day.
    select
        '2020-03-10' as dt,
        sum(case when cart_count>0 then 1 else 0 end) as cart_count,
        sum(case when order_count>0 then 1 else 0 end) as order_count,
        sum(case when payment_count>0 then 1 else 0 end) as payment_count
    from dws_user_action_daycount
    where dt='2020-03-10'
) funnel
inner join ads_uv_count visits
    on funnel.dt = visits.dt;
7.3 商品主题
7.3.1 当天商品个数信息
1)建表语句
hive (gmall)> drop table if exists ads_product_info;
-- SKU and SPU totals per statistics date; rows are tab-delimited.
create external table ads_product_info (
    `dt`      string comment '统计日期',
    `sku_num` string comment 'sku 个数',
    `spu_num` string comment 'spu 个数'
) comment '商品个数信息'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_info';
2)导入数据
-- 不要这么写:count(distinct) 在 Hive 中通常只用一个 reducer 去重,数据量大时很慢;推荐使用下面基于 group by 子查询的写法。
select
count(*),
count(distinct(spu_id))
from dwt_sku_topic;
-- SKU total and SPU total for the statistics date. The two single-row
-- aggregates are joined on a constant date column.
insert into table ads_product_info
select
    '2020-03-10' as dt,
    sku_num,
    spu_num
from (
    select
        '2020-03-10' as dt,
        count(*) as sku_num
    from dwt_sku_topic
) sku_total
inner join (
    -- Distinct SPUs are obtained with group by, then counted.
    select
        '2020-03-10' as dt,
        count(*) as spu_num
    from (
        select spu_id
        from dwt_sku_topic
        group by spu_id
    ) spu_ids
) spu_total
    on sku_total.dt = spu_total.dt;
7.3.2 当天商品销量排名top10
1)建表语句
hive (gmall)> drop table if exists ads_product_sale_topN;
-- Top-N products by daily sales; rows are tab-delimited.
-- The table comment was a copy of the ads_product_info comment and is
-- corrected to describe this table.
create external table ads_product_sale_topN(
    `dt`             string COMMENT '统计日期',
    `sku_id`         string COMMENT '商品 ID',
    `payment_amount` bigint COMMENT '当日销量'
) COMMENT '商品销量 TopN'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_sale_topN';
2)导入数据
-- Ten SKUs with the highest payment_amount on the statistics date.
insert into ads_product_sale_topN
select
    '2020-03-10',
    sku_id,
    payment_amount
from dws_sku_action_daycount
where dt = '2020-03-10'
order by payment_amount desc
limit 10;
7.3.3 当天商品收藏排名top10
1)建表语句
hive (gmall)> drop table if exists ads_product_favor_topN;
-- Top-N products by daily favorites; rows are tab-delimited.
create external table ads_product_favor_topN (
    `dt`          string comment '统计日期',
    `sku_id`      string comment '商品 ID',
    `favor_count` bigint comment '当日收藏量'
) comment '商品收藏 TopN'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_favor_topN';
2)导入数据
-- Ten SKUs with the highest favor_count on the statistics date.
insert into ads_product_favor_topN
select
    '2020-03-10',
    sku_id,
    favor_count
from dws_sku_action_daycount
where dt = '2020-03-10'
order by favor_count desc
limit 10;
7.3.4 当天商品加入购物车个数排名top10
1)建表语句
hive (gmall)> drop table if exists ads_product_cart_topN;
-- Top-N products by daily add-to-cart quantity; rows are tab-delimited.
create external table ads_product_cart_topN (
    `dt`       string comment '统计日期',
    `sku_id`   string comment '商品 ID',
    `cart_num` bigint comment '加入购物车数量'
) comment '商品加入购物车 TopN'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_cart_topN';
2)导入数据
-- Ten SKUs with the highest cart_num on the statistics date.
insert into ads_product_cart_topN
select
    '2020-03-10',
    sku_id,
    cart_num
from dws_sku_action_daycount
where dt = '2020-03-10'
order by cart_num desc
limit 10;
7.3.5 商品退款率排名(最近 30 天)
1)建表语句
hive (gmall)> drop table if exists ads_product_refund_topN;
-- Top-N products by refund ratio; rows are tab-delimited.
create external table ads_product_refund_topN (
    `dt`           string        comment '统计日期',
    `sku_id`       string        comment '商品 ID',
    `refund_ratio` decimal(10,2) comment '退款率'
) comment '商品退款率 TopN'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_refund_topN';
2)导入数据
-- Ten SKUs with the highest 30-day refund ratio
-- (refund count / payment count, as a percentage).
insert into table ads_product_refund_topN
select
    '2020-03-10',
    sku_id,
    refund_last_30d_count / payment_last_30d_count * 100 as refund_ratio
from dwt_sku_topic
order by refund_ratio desc
limit 10;
7.3.6 当天商品差评率
1)建表语句
hive (gmall)> drop table if exists ads_appraise_bad_topN;
-- Top-N products by bad-review ratio; rows are tab-delimited.
create external table ads_appraise_bad_topN (
    `dt`                 string        comment '统计日期',
    `sku_id`             string        comment '商品 ID',
    `appraise_bad_ratio` decimal(10,2) comment '差评率'
) comment '商品差评率 TopN'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_appraise_bad_topN';
2)导入数据
-- Ten SKUs with the highest share of bad reviews among the good, bad and
-- default reviews of the statistics date, as a percentage.
insert into table ads_appraise_bad_topN
select
    '2020-03-10',
    sku_id,
    appraise_bad_count
        / (appraise_good_count + appraise_bad_count + appraise_default_count)
        * 100 as appraise_bad_ratio
from dws_sku_action_daycount
where dt = '2020-03-10'
order by appraise_bad_ratio desc
limit 10;
7.4 营销主题(用户+商品+购买行为)
7.4.1 当天下单数目统计
需求分析:统计每日下单数、下单金额及下单用户数。
1)建表语句
hive (gmall)> drop table if exists ads_order_daycount;
-- Daily order totals; rows are tab-delimited.
-- NOTE(review): order_amount is declared bigint; confirm whether fractional
-- amounts need to be preserved.
create external table ads_order_daycount (
    dt           string comment '统计日期',
    order_count  bigint comment '单日下单笔数',
    order_amount bigint comment '单日下单金额',
    order_users  bigint comment '单日下单用户数'
) comment '每日订单总计表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_order_daycount';
2)导入数据
-- Order count, order amount and number of ordering users for the statistics date.
insert into table ads_order_daycount
select
    '2020-03-10',
    sum(order_count),
    sum(order_amount),
    sum(case when order_count>0 then 1 else 0 end)
from dws_user_action_daycount
where dt = '2020-03-10';
-- Daily payment summary assembled from three single-row aggregates that are
-- joined on a constant date column.
insert into table ads_payment_daycount
select
    pay.dt,
    pay.payment_count,
    pay.payment_amount,
    pay.payment_user_count,
    sku.payment_sku_count,
    lag_time.payment_avg_time
from (
    -- Payment count, amount and number of paying users.
    select
        '2020-03-10' as dt,
        sum(payment_count) as payment_count,
        sum(payment_amount) as payment_amount,
        sum(case when payment_count>0 then 1 else 0 end) as payment_user_count
    from dws_user_action_daycount
    where dt='2020-03-10'
) pay
inner join (
    -- Number of SKUs with a positive payment amount.
    select
        '2020-03-10' as dt,
        sum(case when payment_amount>0 then 1 else 0 end) as payment_sku_count
    from dws_sku_action_daycount
    where dt='2020-03-10'
) sku
    on pay.dt = sku.dt
inner join (
    -- Mean delay between order creation and payment: seconds / row count / 60.
    select
        '2020-03-10' as dt,
        sum(unix_timestamp(payment_time)-unix_timestamp(create_time))/count(*)/60 as payment_avg_time
    from dwd_fact_order_info
    where dt='2020-03-10'
      and payment_time is not null
) lag_time
    on pay.dt = lag_time.dt;
-- Monthly repurchase statistics per brand and level-1 category: number of
-- buyers, buyers with 2+ and 3+ orders, and their ratios to all buyers.
insert into table ads_sale_tm_category1_stat_mn
select
    sku_category1_id,
    sku_category1_name,
    sku_tm_id,
    sum(if(order_count>=1,1,0)) buycount,
    sum(if(order_count>=2,1,0)) buyTwiceLast,
    sum(if(order_count>=3,1,0)) buy3timeLast,
    sum(if(order_count>=2,1,0))/sum( if(order_count>=1,1,0)) buyTwiceLastRatio,
    sum(if(order_count>=3,1,0))/sum( if(order_count>=1,1,0)) buy3timeLastRatio,
    date_format('$do_date' ,'yyyy-MM') stat_mn,
    '$do_date' stat_date
from
(
    -- Orders per user within the month of the run date, by brand and category.
    select
        sku_category1_id,
        sku_category1_name,
        user_id,
        sku_tm_id,
        sum(order_count) order_count
    from dws_sale_detail_daycount
    -- Both sides now use the same 'yyyy-MM' pattern (the left side read 'yyy-MM').
    where date_format(dt,'yyyy-MM')=date_format('$do_date','yyyy-MM')
    group by user_id,sku_tm_id,sku_category1_id,sku_category1_name
)t1
group by sku_tm_id,sku_category1_id,sku_category1_name;
7.5 ADS 层导入脚本
1)在/home/dw/bin 目录下创建脚本 dwt_to_ads.sh
vim dwt_to_ads.sh
#!/bin/bash
hive=/opt/module/hive/bin/hive
if[-n "$1"];then
do_date=$1else
do_date=`date -d "-1 day"+%F`
fi
sql="
use gmall;
-- Daily, weekly (Monday through Sunday) and monthly active devices for the
-- run date, plus flags marking the last day of the week and of the month.
insert into table ads_uv_count
select
    '$do_date',
    sum(case when login_date_last='$do_date' then 1 else 0 end),
    sum(case
            when login_date_last between date_add(next_day('$do_date','monday'),-7)
                                     and date_add(next_day('$do_date','monday'),-1)
            then 1
            else 0
        end),
    sum(case when date_format(login_date_last,'yyyy-MM') = date_format('$do_date','yyyy-MM') then 1 else 0 end),
    if('$do_date' = date_add(next_day('$do_date','monday'),-1),'Y','N'),
    if('$do_date' = last_day('$do_date'),'Y','N')
from dwt_uv_topic;
-- New devices on the run date. A single insert is issued so that each run
-- appends one row for the day. The earlier second insert (grouped by
-- login_date_first) wrote the same figure again.
insert into table ads_new_mid_count
select
    '$do_date',
    count(*)
from dwt_uv_topic
where login_date_first='$do_date';
-- Devices whose first and last login dates coincide and lie at least
-- 7 days before the run date.
insert into table ads_silent_count
select
    '$do_date',
    count(*)
from dwt_uv_topic
where login_date_last <= date_add('$do_date',-7)
  and login_date_first = login_date_last;
-- Returning devices: active this week, first seen before this week, and
-- absent from the activity of the previous week (left join plus is null).
insert into table ads_back_count
select
    '$do_date',
    concat(date_add(next_day('$do_date','MO'),-7),'_',date_add(next_day('$do_date','MO'),-1)),
    count(*)
from
(
    -- Last login within the current Monday-to-Sunday week and first login
    -- before the Monday of this week. The first-login predicate was garbled
    -- and is restored here.
    select
        mid_id
    from dwt_uv_topic
    where login_date_last>=date_add(next_day('$do_date','MO'),-7)
    and login_date_last<= date_add(next_day('$do_date','MO'),-1)
    and login_date_first<date_add(next_day('$do_date','MO'),-7)
)current_wk
left join
(
    -- Devices with any activity in the previous Monday-to-Sunday week.
    select
        mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('$do_date','MO'),-7*2)
    and dt<= date_add(next_day('$do_date','MO'),-7-1)
    group by mid_id
)last_wk
on current_wk.mid_id=last_wk.mid_id
where last_wk.mid_id is null;
-- Devices whose last login is 7 or more days before the run date.
insert into table ads_wastage_count
select
    '$do_date',
    count(*)
from dwt_uv_topic
where login_date_last <= date_add('$do_date',-7);
-- Retention on the run date for the cohorts first seen 3, 2 and 1 days
-- earlier: one output row per cohort.
insert into table ads_user_retention_day_rate
select
    '$do_date',
    date_add('$do_date',-3),
    3,
    sum(if(login_date_first=date_add('$do_date',-3) and login_date_last='$do_date',1,0)),
    sum(if(login_date_first=date_add('$do_date',-3),1,0)),
    sum(if(login_date_first=date_add('$do_date',-3) and login_date_last='$do_date',1,0))
        / sum(if(login_date_first=date_add('$do_date',-3),1,0))
from dwt_uv_topic
union all
select
    '$do_date',
    date_add('$do_date',-2),
    2,
    sum(if(login_date_first=date_add('$do_date',-2) and login_date_last='$do_date',1,0)),
    sum(if(login_date_first=date_add('$do_date',-2),1,0)),
    sum(if(login_date_first=date_add('$do_date',-2) and login_date_last='$do_date',1,0))
        / sum(if(login_date_first=date_add('$do_date',-2),1,0))
from dwt_uv_topic
union all
select
    '$do_date',
    date_add('$do_date',-1),
    1,
    sum(if(login_date_first=date_add('$do_date',-1) and login_date_last='$do_date',1,0)),
    sum(if(login_date_first=date_add('$do_date',-1),1,0)),
    sum(if(login_date_first=date_add('$do_date',-1) and login_date_last='$do_date',1,0))
        / sum(if(login_date_first=date_add('$do_date',-1),1,0))
from dwt_uv_topic;
-- Devices active in each of the three most recent Monday-to-Sunday weeks,
-- the last of which contains the run date.
insert into table ads_continuity_wk_count
select
    '$do_date',
    concat(date_add(next_day('$do_date','MO'),-7*3),'_',date_add(next_day('$do_date','MO'),-1)),
    count(*)
from (
    select mid_id
    from (
        -- Current week.
        select mid_id
        from dws_uv_detail_daycount
        where dt>=date_add(next_day('$do_date','monday'),-7)
          and dt<=date_add(next_day('$do_date','monday'),-1)
        group by mid_id
        union all
        -- Previous week.
        select mid_id
        from dws_uv_detail_daycount
        where dt>=date_add(next_day('$do_date','monday'),-7*2)
          and dt<=date_add(next_day('$do_date','monday'),-7-1)
        group by mid_id
        union all
        -- Two weeks back.
        select mid_id
        from dws_uv_detail_daycount
        where dt>=date_add(next_day('$do_date','monday'),-7*3)
          and dt<=date_add(next_day('$do_date','monday'),-7*2-1)
        group by mid_id
    ) weekly
    group by mid_id
    -- Each branch emits a device at most once, so 3 rows means all three weeks.
    having count(*)=3
) all_three;
-- Devices active on at least 3 consecutive days within the 7 days ending on
-- the run date.
insert into table ads_continuity_uv_count
select
    '$do_date',
    concat(date_add('$do_date',-6),'_','$do_date'),
    count(*)
from (
    -- One row per device, however many qualifying streaks it has.
    select mid_id
    from (
        -- Consecutive dates share one diff value. Keep streaks of 3 or more.
        select mid_id
        from (
            select
                mid_id,
                date_add(dt,-rk) as diff
            from (
                select
                    mid_id,
                    dt,
                    rank() over(partition by mid_id order by dt) as rk
                from dws_uv_detail_daycount
                where dt>=date_add('$do_date',-6)
                  and dt<='$do_date'
            ) ranked
        ) shifted
        group by mid_id, diff
        having count(*)>=3
    ) streaks
    group by mid_id
) devices;
-- User summary for the run date: users whose last login, first login or first
-- payment is the run date, users with any payment, the total row count, and
-- three ratios derived from those figures.
insert into table ads_user_topic
select
    '$do_date',
    sum(if(login_date_last='$do_date',1,0)),
    sum(if(login_date_first='$do_date',1,0)),
    sum(if(payment_date_first='$do_date',1,0)),
    sum(if(payment_count>0,1,0)),
    count(*),
    sum(if(login_date_last='$do_date',1,0)) / count(*),
    sum(if(payment_count>0,1,0)) / count(*),
    sum(if(login_date_first='$do_date',1,0)) / sum(if(login_date_last='$do_date',1,0))
from dwt_user_topic;
-- Daily funnel: visitors, users who carted, ordered and paid, with each step
-- expressed as a percentage of the previous one.
insert into table ads_user_action_convert_day
select
    funnel.dt,
    visits.day_count,
    cart_count,
    cart_count / visits.day_count * 100,
    order_count,
    order_count / cart_count * 100,
    payment_count,
    payment_count / order_count * 100
from (
    select
        '$do_date' as dt,
        sum(if(cart_count>0,1,0)) as cart_count,
        sum(if(order_count>0,1,0)) as order_count,
        sum(if(payment_count>0,1,0)) as payment_count
    from dws_user_action_daycount
    where dt='$do_date'
) funnel
inner join ads_uv_count visits
    on funnel.dt = visits.dt;
-- SKU total and SPU total for the run date. The two single-row aggregates are
-- joined on a constant date column.
insert into table ads_product_info
select
    '$do_date' as dt,
    sku_num,
    spu_num
from (
    select
        '$do_date' as dt,
        count(*) as sku_num
    from dwt_sku_topic
) sku_total
inner join (
    -- Distinct SPUs are obtained with group by, then counted.
    select
        '$do_date' as dt,
        count(*) as spu_num
    from (
        select spu_id
        from dwt_sku_topic
        group by spu_id
    ) spu_ids
) spu_total
    on sku_total.dt = spu_total.dt;
-- Ten SKUs with the highest payment_amount on the run date.
insert into ads_product_sale_topN
select
    '$do_date',
    sku_id,
    payment_amount
from dws_sku_action_daycount
where dt = '$do_date'
order by payment_amount desc
limit 10;
-- Ten SKUs with the highest favor_count on the run date.
insert into ads_product_favor_topN
select
    '$do_date',
    sku_id,
    favor_count
from dws_sku_action_daycount
where dt = '$do_date'
order by favor_count desc
limit 10;
-- Ten SKUs with the highest cart_num on the run date.
insert into ads_product_cart_topN
select
    '$do_date',
    sku_id,
    cart_num
from dws_sku_action_daycount
where dt = '$do_date'
order by cart_num desc
limit 10;
-- Ten SKUs with the highest 30-day refund ratio, as a percentage.
insert into table ads_product_refund_topN
select
    '$do_date',
    sku_id,
    refund_last_30d_count / payment_last_30d_count * 100 as refund_ratio
from dwt_sku_topic
order by refund_ratio desc
limit 10;
-- Ten SKUs with the highest share of bad reviews on the run date, as a percentage.
insert into table ads_appraise_bad_topN
select
    '$do_date',
    sku_id,
    appraise_bad_count
        / (appraise_good_count + appraise_bad_count + appraise_default_count)
        * 100 as appraise_bad_ratio
from dws_sku_action_daycount
where dt = '$do_date'
order by appraise_bad_ratio desc
limit 10;
-- Order count, order amount and number of ordering users for the run date.
insert into table ads_order_daycount
select
    '$do_date',
    sum(order_count),
    sum(order_amount),
    sum(if(order_count>0,1,0))
from dws_user_action_daycount
where dt = '$do_date';
-- Daily payment summary assembled from three single-row aggregates that are
-- joined on a constant date column.
insert into table ads_payment_daycount
select
    pay.dt,
    pay.payment_count,
    pay.payment_amount,
    pay.payment_user_count,
    sku.payment_sku_count,
    lag_time.payment_avg_time
from (
    -- Payment count, amount and number of paying users.
    select
        '$do_date' as dt,
        sum(payment_count) as payment_count,
        sum(payment_amount) as payment_amount,
        sum(if(payment_count>0,1,0)) as payment_user_count
    from dws_user_action_daycount
    where dt='$do_date'
) pay
inner join (
    -- Number of SKUs with a positive payment amount.
    select
        '$do_date' as dt,
        sum(if(payment_amount>0,1,0)) as payment_sku_count
    from dws_sku_action_daycount
    where dt='$do_date'
) sku
    on pay.dt = sku.dt
inner join (
    -- Mean delay between order creation and payment: seconds / row count / 60.
    select
        '$do_date' as dt,
        sum(unix_timestamp(payment_time)-unix_timestamp(create_time))/count(*)/60 as payment_avg_time
    from dwd_fact_order_info
    where dt='$do_date'
      and payment_time is not null
) lag_time
    on pay.dt = lag_time.dt;
-- Monthly repurchase statistics per brand and level-1 category: number of
-- buyers, buyers with 2+ and 3+ orders, and their ratios to all buyers.
insert into table ads_sale_tm_category1_stat_mn
select
    sku_category1_id,
    sku_category1_name,
    sku_tm_id,
    sum(if(order_count>=1,1,0)) buycount,
    sum(if(order_count>=2,1,0)) buyTwiceLast,
    sum(if(order_count>=3,1,0)) buy3timeLast,
    sum(if(order_count>=2,1,0))/sum( if(order_count>=1,1,0)) buyTwiceLastRatio,
    sum(if(order_count>=3,1,0))/sum( if(order_count>=1,1,0)) buy3timeLastRatio,
    date_format('$do_date' ,'yyyy-MM') stat_mn,
    '$do_date' stat_date
from
(
    -- Orders per user within the month of the run date, by brand and category.
    select
        sku_category1_id,
        sku_category1_name,
        user_id,
        sku_tm_id,
        sum(order_count) order_count
    from dws_sale_detail_daycount
    where date_format(dt,'yyyy-MM')=date_format('$do_date','yyyy-MM')
    group by user_id,sku_tm_id,sku_category1_id,sku_category1_name
)t1
group by sku_tm_id,sku_category1_id,sku_category1_name;
"
# The space before -e is required: without it the shell looks for a command
# literally named ".../hive-e".
$hive -e "$sql"
第 8 章 Azkaban 调度
8.1 Azkaban 部署
8.1.1 安装前准备
将 Azkaban Web 服务器、Azkaban 执行服务器、Azkaban 的 sql 执行脚本及 MySQL 安装包拷贝到 dw1 虚拟机的 /opt/software 目录下。
参考:http://blog.csdn.net/qingfeilee/article/details/7052736
org.hibernate.NonUniqueResultException: query did not return a unique result: 2
在项目中出现了org.hiber
由Oracle通信技术部门主导的演示项目并没有在本月较早前法国南斯举行的行业集团TM论坛大会中获得嘉奖。但是,Oracle通信官员解雇致力于打造一个支持零干预分配和编制功能的网络即服务(NaaS)平台,帮助企业以更灵活和更适合云的方式实现通信服务提供商(CSP)的连接产品。这个Oracle主导的项目属于TM Forum Live!活动上展示的Catalyst计划的19个项目之一。Catalyst计
Spring MVC提供了非常方便的文件上传功能。
1,配置Spring支持文件上传:
DispatcherServlet本身并不知道如何处理multipart的表单数据,需要一个multipart解析器把POST请求的multipart数据中抽取出来,这样DispatcherServlet就能将其传递给我们的控制器了。为了在Spring中注册multipart解析器,需要声明一个实现了Mul
the CollabNet user information center http://help.collab.net/
How do I create a new Wiki page?
A CollabNet TeamForge project can have any number of Wiki pages. All Wiki pages are linked, and
package beautyOfCoding;
import java.util.Arrays;
import java.util.Random;
public class MaxSubArraySum2 {
/**
* 编程之美 子数组之和的最大值(二维)
*/
private static final int ROW = 5;
private stat
示例程序,swap_1和swap_2都是错误的,推理从1开始推到2,2没完成,推到3就完成了
# include <stdio.h>
void swap_1(int, int);
void swap_2(int *, int *);
void swap_3(int *, int *);
int main(void)
{
int a = 3;
int b =
同步工具类包括信号量(Semaphore)、栅栏(barrier)、闭锁(CountDownLatch)
闭锁(CountDownLatch)
public class RunMain {
public long timeTasks(int nThreads, final Runnable task) throws InterruptedException {
fin
不止一次,看到很多讲技术的文章里面出现过这个词语。今天终于弄懂了——通过朋友给的浏览软件,上了wiki。
我再一次感到,没有辞典能像WiKi一样,给出这样体贴人心、一清二楚的解释了。为了表达我对WiKi的喜爱,只好在此一一中英对照,给大家上次课。
In computer science, bleeding edge is a term that