1、建表语句
-- ADS table with one row of GMV totals per statistics date:
-- number of orders, total order amount and total payment amount.
-- Stored as tab-delimited text under the ADS warehouse directory.
-- The table comment describes GMV (it used to carry the text of another table).
CREATE TABLE ads_gmv_sum_day(
    `dt`          STRING         COMMENT '统计日期',
    `gmv_count`   BIGINT         COMMENT '当日gmv订单个数',
    `gmv_amount`  DECIMAL(16,2)  COMMENT '当日gmv订单总金额',
    `gmv_payment` DECIMAL(16,2)  COMMENT '当日支付金额'
) COMMENT '每日GMV统计'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ads/ads_gmv_sum_day/'
;
2、数据导入
-- Append the GMV row for 2019-02-10.
-- Sums order_count, order_amount and payment_amount over the dws_user_action
-- rows whose dt is 2019-02-10.
-- Values reach the target columns by position. The aliases are for readers only.
INSERT INTO TABLE ads_gmv_sum_day
SELECT
    '2019-02-10'        AS dt,
    SUM(order_count)    AS gmv_count,
    SUM(order_amount)   AS gmv_amount,
    SUM(payment_amount) AS gmv_payment
FROM dws_user_action
WHERE dt = '2019-02-10'
GROUP BY dt
;
1、建表语句
-- ADS table with one row per statistics date: active devices, newly added
-- devices, and the percentage of new devices among the active ones.
-- The table is dropped first so that this definition can be re-applied.
-- The table comment describes the new-device ratio (it used to carry the
-- text of another table).
DROP TABLE IF EXISTS ads_user_convert_day;
CREATE TABLE ads_user_convert_day(
    `dt`          STRING         COMMENT '统计日期',
    `uv_m_count`  BIGINT         COMMENT '当日活跃设备',
    `new_m_count` BIGINT         COMMENT '当日新增设备',
    `new_m_ratio` DECIMAL(10,2)  COMMENT '当日新增占日活的比率'
) COMMENT '每日新增设备占日活比率'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ads/ads_user_convert_day/'
;
2、数据导入
-- Append the 2019-02-10 row of ads_user_convert_day.
-- The derived table stacks two sources with UNION ALL:
--   * day_count from ads_uv_count (dt 2019-02-10), with 0 as the new-device value
--   * new_mid_count from ads_new_mid_count (create_date 2019-02-10), with 0 as the active value
-- Summing both columns over the stacked rows yields the two totals, and the
-- ratio is new devices divided by active devices, times 100, as DECIMAL(10,2).
INSERT INTO TABLE ads_user_convert_day
SELECT
    '2019-02-10',
    SUM(src.dc)  AS sum_dc,
    SUM(src.nmc) AS sum_nmc,
    CAST(SUM(src.nmc) / SUM(src.dc) * 100 AS DECIMAL(10,2)) AS new_m_ratio
FROM
(
    SELECT
        day_count AS dc,
        0         AS nmc
    FROM ads_uv_count
    WHERE dt = '2019-02-10'
    UNION ALL
    SELECT
        0             AS dc,
        new_mid_count AS nmc
    FROM ads_new_mid_count
    WHERE create_date = '2019-02-10'
) src;
1、建表语句
-- ADS table with one row per statistics date describing the user behaviour
-- funnel: total visitors, users who ordered, users who paid, and the two
-- conversion ratios between those steps.
-- The LOCATION is a directory of its own. It must not be the directory of
-- ads_user_convert_day, otherwise both tables read and write the same files.
CREATE TABLE ads_user_action_convert_day(
    `dt`                          STRING         COMMENT '统计日期',
    `total_visitor_m_count`       BIGINT         COMMENT '总访问人数',
    `order_u_count`               BIGINT         COMMENT '下单人数',
    `visitor2order_convert_ratio` DECIMAL(10,2)  COMMENT '访问到下单转化率',
    `payment_u_count`             BIGINT         COMMENT '支付人数',
    `order2payment_convert_ratio` DECIMAL(10,2)  COMMENT '下单到支付的转化率'
) COMMENT '每日用户行为转化率统计'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ads/ads_user_action_convert_day/'
;
2、数据导入
-- Append the 2019-02-10 row of ads_user_action_convert_day.
-- ua: a single row counting the dws_user_action rows (dt 2019-02-10) that have
--     at least one order, and those that have at least one payment.
-- uv: the ads_uv_count rows for the same date, supplying day_count as the
--     visitor total.
-- The two inputs share no join key, so they are combined with a cross join
-- and uv is restricted to the statistics date in the WHERE clause.
INSERT INTO TABLE ads_user_action_convert_day
SELECT
    '2019-02-10',
    uv.day_count,
    ua.order_count,
    CAST(ua.order_count / uv.day_count * 100 AS DECIMAL(10,2))   AS visitor2order_convert_ratio,
    ua.payment_count,
    CAST(ua.payment_count / ua.order_count * 100 AS DECIMAL(10,2)) AS order2payment_convert_ratio
FROM
(
    SELECT
        SUM(CASE WHEN order_count > 0 THEN 1 ELSE 0 END)   AS order_count,
        SUM(CASE WHEN payment_count > 0 THEN 1 ELSE 0 END) AS payment_count
    FROM dws_user_action
    WHERE dt = '2019-02-10'
) ua
CROSS JOIN ads_uv_count uv
WHERE uv.dt = '2019-02-10'
;
1、建表语句
-- DWS external table: per day, one row for each (user, SKU) pair that was
-- ordered, widened with user attributes and SKU / brand / category attributes.
-- Partitioned by dt and stored as snappy-compressed Parquet.
-- order_count and order_amount are numeric (BIGINT and DECIMAL) because the
-- loader writes a row count and a decimal sum into them and downstream
-- statistics aggregate them.
-- NOTE(review): the LOCATION directory name (dws_user_sale_detail_daycount)
-- differs from the table name. It is kept unchanged so existing data stays reachable.
CREATE EXTERNAL TABLE dws_sale_detail_daycount
(
    user_id             STRING         COMMENT '用户 id',
    sku_id              STRING         COMMENT '商品 Id',
    user_gender         STRING         COMMENT '用户性别',
    user_age            STRING         COMMENT '用户年龄',
    user_level          STRING         COMMENT '用户等级',
    order_price         DECIMAL(10,2)  COMMENT '订单价格',
    sku_name            STRING         COMMENT '商品名称',
    sku_tm_id           STRING         COMMENT '品牌id',
    sku_category3_id    STRING         COMMENT '商品三级品类id',
    sku_category2_id    STRING         COMMENT '商品二级品类id',
    sku_category1_id    STRING         COMMENT '商品一级品类id',
    sku_category3_name  STRING         COMMENT '商品三级品类名称',
    sku_category2_name  STRING         COMMENT '商品二级品类名称',
    sku_category1_name  STRING         COMMENT '商品一级品类名称',
    spu_id              STRING         COMMENT '商品 spu',
    sku_num             INT            COMMENT '购买个数',
    order_count         BIGINT         COMMENT '当日下单单数',
    order_amount        DECIMAL(16,2)  COMMENT '当日下单金额'
) COMMENT '用户购买商品明细表'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dws/dws_user_sale_detail_daycount/'
TBLPROPERTIES ("parquet.compression"="snappy");
2、数据导入
-- Rebuild the 2019-02-10 partition of dws_sale_detail_daycount.
-- Step 1 (CTE detail_agg): aggregate the ods_order_detail rows of that date
--   per (user_id, sku_id): total quantity, number of detail rows, and the
--   sum of order_price times sku_num. Rows without a user_id are skipped.
-- Step 2: attach user attributes (dwd_user_info) and SKU attributes
--   (dwd_sku_info) of the same date with left joins, so pairs without a
--   matching user or SKU row are kept with NULL attributes.
-- Age is months between the statistics date and the birthday, divided by 12.
-- The unqualified columns (price, sku_name, tm_id, category ids and names,
-- spu_id) are presumably supplied by dwd_sku_info - confirm against its schema.
WITH
detail_agg AS
(
    SELECT
        user_id,
        sku_id,
        SUM(sku_num)                  AS sku_num,
        COUNT(*)                      AS order_count,
        SUM(od.order_price * sku_num) AS order_amount
    FROM ods_order_detail od
    WHERE od.dt = '2019-02-10'
      AND user_id IS NOT NULL
    GROUP BY user_id, sku_id
)
INSERT OVERWRITE TABLE dws_sale_detail_daycount PARTITION (dt = '2019-02-10')
SELECT
    d.user_id,
    d.sku_id,
    u.gender,
    months_between('2019-02-10', u.birthday) / 12 AS age,
    u.user_level,
    price,
    sku_name,
    tm_id,
    category3_id,
    category2_id,
    category1_id,
    category3_name,
    category2_name,
    category1_name,
    spu_id,
    d.sku_num,
    d.order_count,
    d.order_amount
FROM detail_agg d
LEFT JOIN dwd_user_info u
    ON u.id = d.user_id AND u.dt = '2019-02-10'
LEFT JOIN dwd_sku_info s
    ON d.sku_id = s.id AND s.dt = '2019-02-10'
;
3、数据导入脚本
#!/bin/bash
# Rebuilds one daily partition of dws_sale_detail_daycount through the Hive CLI.
#
# Usage: <script> [yyyy-MM-dd]
#   $1  optional statistics date; when omitted, yesterday's date is used.

# Settings kept at the top so they are easy to change.
APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date given as $1 when present, otherwise fall back to yesterday.
# "$1" has to be quoted: unquoted and empty, the test collapses to `[ -n ]`,
# which is always true, so the fallback branch would never be taken and
# log_date would stay empty.
if [ -n "$1" ]; then
    log_date=$1
else
    log_date=$(date -d "-1 day" +%F)
fi

# The statement aggregates the day's order details per (user_id, sku_id) and
# left-joins user and SKU attributes of the same date before overwriting the
# partition for $log_date.
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
with
tmp_detail as
(
    select
        user_id,
        sku_id,
        sum(sku_num) sku_num,
        count(*) order_count,
        sum(od.order_price*sku_num) order_amount
    from ${APP}.ods_order_detail od
    where od.dt='$log_date' and user_id is not null
    group by user_id, sku_id
)
insert overwrite table ${APP}.dws_sale_detail_daycount partition(dt='$log_date')
select
    tmp_detail.user_id,
    tmp_detail.sku_id,
    u.gender,
    months_between('$log_date', u.birthday)/12 age,
    u.user_level,
    price,
    sku_name,
    tm_id,
    category3_id,
    category2_id,
    category1_id,
    category3_name,
    category2_name,
    category1_name,
    spu_id,
    tmp_detail.sku_num,
    tmp_detail.order_count,
    tmp_detail.order_amount
from tmp_detail
left join ${APP}.dwd_user_info u
    on u.id=tmp_detail.user_id and u.dt='$log_date'
left join ${APP}.dwd_sku_info s
    on tmp_detail.sku_id=s.id and s.dt='$log_date';
"

"$hive" -e "$sql"
1、建表语句
-- ADS repurchase-rate table: one row per brand, level-1 category, statistics
-- month and statistics date, holding the number of buyers, the number of
-- buyers with two or more and with three or more purchases, and the two
-- ratios derived from those counts.
-- Stored as tab-delimited text under the ADS warehouse directory.
CREATE TABLE ads_sale_tm_category1_stat_mn
(
    tm_id                 STRING         COMMENT '品牌id ',
    category1_id          STRING         COMMENT '1级品类id ',
    category1_name        STRING         COMMENT '1级品类名称 ',
    buycount              BIGINT         COMMENT '购买人数',
    buy_twice_last        BIGINT         COMMENT '两次以上购买人数',
    buy_twice_last_ratio  DECIMAL(10,2)  COMMENT '单次复购率',
    buy_3times_last       BIGINT         COMMENT '三次以上购买人数',
    buy_3times_last_ratio DECIMAL(10,2)  COMMENT '多次复购率',
    stat_mn               STRING         COMMENT '统计月份',
    stat_date             STRING         COMMENT '统计日期'
) COMMENT '复购率统计'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/'
;
2、数据导入
-- Append the repurchase statistics for the month that contains 2019-02-10.
-- Inner query (mn): total order_count per user within each brand and level-1
--   category, restricted to the partitions of the statistics month. The
--   filter is an equality on yyyy-MM because a less-or-equal comparison
--   would also pull in every earlier month.
-- Outer query: per brand and category, count the users with at least 1, 2
--   and 3 orders and divide to get the two repurchase ratios.
-- buy_3times_last uses order_count >= 3, the same condition as the
--   numerator of buy_3times_last_ratio, so count and ratio agree.
-- Aliases follow the target column names. The insert itself is positional.
INSERT INTO TABLE ads_sale_tm_category1_stat_mn
SELECT
    mn.sku_tm_id,
    mn.sku_category1_id,
    mn.sku_category1_name,
    SUM(if(mn.order_count >= 1, 1, 0)) AS buycount,
    SUM(if(mn.order_count >= 2, 1, 0)) AS buy_twice_last,
    SUM(if(mn.order_count >= 2, 1, 0)) / SUM(if(mn.order_count >= 1, 1, 0)) AS buy_twice_last_ratio,
    SUM(if(mn.order_count >= 3, 1, 0)) AS buy_3times_last,
    SUM(if(mn.order_count >= 3, 1, 0)) / SUM(if(mn.order_count >= 1, 1, 0)) AS buy_3times_last_ratio,
    date_format('2019-02-10', 'yyyy-MM') AS stat_mn,
    '2019-02-10' AS stat_date
FROM
(
    SELECT
        od.sku_tm_id,
        od.sku_category1_id,
        od.sku_category1_name,
        user_id,
        SUM(order_count) AS order_count
    FROM dws_sale_detail_daycount od
    WHERE date_format(dt, 'yyyy-MM') = date_format('2019-02-10', 'yyyy-MM')
    GROUP BY
        od.sku_tm_id, od.sku_category1_id, user_id, od.sku_category1_name
) mn
GROUP BY mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name
;
3、数据导入脚本
#!/bin/bash
# Appends the monthly brand / level-1 category repurchase statistics to
# ads_sale_tm_category1_stat_mn through the Hive CLI.
#
# Usage: <script> [yyyy-MM-dd]
#   $1  optional statistics date; when omitted, yesterday's date is used.

# Settings kept at the top so they are easy to change.
APP=gmall
hive=/opt/module/hive/bin/hive

# Use the date given as $1 when present, otherwise fall back to yesterday.
# "$1" has to be quoted: unquoted and empty, the test collapses to `[ -n ]`,
# which is always true, so the fallback branch would never be taken and
# log_date would stay empty.
if [ -n "$1" ]; then
    log_date=$1
else
    log_date=$(date -d "-1 day" +%F)
fi

# Inner query: total order_count per user within each brand and category for
# the month of $log_date (equality on yyyy-MM, so earlier months are excluded).
# Outer query: users with at least 1, 2 and 3 orders and the two ratios.
# buy_3times_last uses >= 3, matching the numerator of its ratio.
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table ${APP}.ads_sale_tm_category1_stat_mn
select
    mn.sku_tm_id,
    mn.sku_category1_id,
    mn.sku_category1_name,
    sum(if(mn.order_count>=1,1,0)) buycount,
    sum(if(mn.order_count>=2,1,0)) buy_twice_last,
    sum(if(mn.order_count>=2,1,0))/sum(if(mn.order_count>=1,1,0)) buy_twice_last_ratio,
    sum(if(mn.order_count>=3,1,0)) buy_3times_last,
    sum(if(mn.order_count>=3,1,0))/sum(if(mn.order_count>=1,1,0)) buy_3times_last_ratio,
    date_format('$log_date','yyyy-MM') stat_mn,
    '$log_date' stat_date
from
(
    select
        od.sku_tm_id,
        od.sku_category1_id,
        od.sku_category1_name,
        user_id,
        sum(order_count) order_count
    from ${APP}.dws_sale_detail_daycount od
    where date_format(dt,'yyyy-MM')=date_format('$log_date','yyyy-MM')
    group by od.sku_tm_id, od.sku_category1_id, user_id, od.sku_category1_name
) mn
group by mn.sku_tm_id, mn.sku_category1_id, mn.sku_category1_name;
"

"$hive" -e "$sql"
1、MySQL中建表
-- MySQL target table for the Sqoop export of the Hive table with the same name.
-- Columns are in the same order as in the Hive definition.
-- Counts and ratios use numeric types that match the Hive column types.
-- The unique key covers the columns passed to the export as --update-key.
-- With --update-mode allowinsert the MySQL connector writes rows with
-- INSERT ... ON DUPLICATE KEY UPDATE, which can only update an existing row
-- when a unique key identifies it. Without such a key each export run
-- appends duplicate rows.
-- The key columns are kept short so the composite index stays inside the
-- InnoDB index key length limit.
create table ads_sale_tm_category1_stat_mn
(
    tm_id                 varchar(64)   comment '品牌id ',
    category1_id          varchar(64)   comment '1级品类id ',
    category1_name        varchar(200)  comment '1级品类名称 ',
    buycount              bigint        comment '购买人数',
    buy_twice_last        bigint        comment '两次以上购买人数',
    buy_twice_last_ratio  decimal(10,2) comment '单次复购率',
    buy_3times_last       bigint        comment '三次以上购买人数',
    buy_3times_last_ratio decimal(10,2) comment '多次复购率',
    stat_mn               varchar(16)   comment '统计月份',
    stat_date             varchar(16)   comment '统计日期',
    unique key uk_tm_category1_stat (tm_id, category1_id, stat_mn, stat_date)
);
2、sqoop导出数据脚本
#!/bin/bash
# Exports ADS result tables from their warehouse directory on HDFS into the
# MySQL database of the same name with Sqoop.
#
# Usage: <script> {ads_sale_tm_category1_stat_mn|all}
# Any other argument (or none) prints the usage and exits with status 1
# instead of silently doing nothing.

db_name=gmall

# export_data TABLE
# Exports /warehouse/$db_name/ads/TABLE (tab-delimited, \N as NULL marker)
# into the MySQL table TABLE, in update-or-insert mode keyed on
# tm_id, category1_id, stat_mn and stat_date.
# NOTE(review): the password is passed on the command line and is therefore
# visible in the process list; consider Sqoop's --password-file option.
export_data() {
    /opt/module/sqoop/bin/sqoop export \
    --connect "jdbc:mysql://hadoop102:3306/${db_name}?useUnicode=true&characterEncoding=utf-8" \
    --username root \
    --password 000000 \
    --table "$1" \
    --num-mappers 1 \
    --export-dir "/warehouse/$db_name/ads/$1" \
    --input-fields-terminated-by "\t" \
    --update-key "tm_id,category1_id,stat_mn,stat_date" \
    --update-mode allowinsert \
    --input-null-string '\\N' \
    --input-null-non-string '\\N'
}

case "$1" in
    "ads_sale_tm_category1_stat_mn" | "all")
        # Only one table is exported so far, so both selectors run the same export.
        export_data "ads_sale_tm_category1_stat_mn"
        ;;
    *)
        echo "Usage: $0 {ads_sale_tm_category1_stat_mn|all}" >&2
        exit 1
        ;;
esac
1、生成10条原始订单数据
-- Generates the 10 raw order records for 2019-02-13 that the step heading refers to.
-- NOTE(review): init_data is not defined in this section, so the meaning of
-- the remaining arguments (5, 10, TRUE) should be checked against its definition.
CALL init_data('2019-02-13',10,5,10,TRUE);
[atguigu@hadoop102 bin]$ sqoop_import.sh all 2019-02-13
[atguigu@hadoop102 bin]$ ods_db.sh 2019-02-13
[atguigu@hadoop102 bin]$ dwd_db.sh 2019-02-13
2、建立拉链表
-- Order zipper (history) table: each row is one version of an order, with
-- start_date and end_date giving the dates between which that version is valid.
-- Stored as snappy-compressed Parquet in the DWD warehouse directory.
CREATE TABLE dwd_order_info_his(
    `id`           STRING         COMMENT '订单编号',
    `total_amount` DECIMAL(10,2)  COMMENT '订单金额',
    `order_status` STRING         COMMENT '订单状态',
    `user_id`      STRING         COMMENT '用户id',
    `payment_way`  STRING         COMMENT '支付方式',
    `out_trade_no` STRING         COMMENT '支付流水号',
    `create_time`  STRING         COMMENT '创建时间',
    `operate_time` STRING         COMMENT '操作时间',
    `start_date`   STRING         COMMENT '有效开始日期',
    `end_date`     STRING         COMMENT '有效结束日期'
) COMMENT '订单拉链表'
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_order_info_his/'
TBLPROPERTIES ("parquet.compression"="snappy");
3、初始化拉链表
-- Initialise the zipper table from the ods_order_info rows with dt 2019-02-13.
-- Every order becomes an open version: start_date is 2019-02-13 and end_date
-- is the sentinel 9999-99-99 that marks a version as still current.
-- INSERT OVERWRITE replaces whatever the table held before.
INSERT OVERWRITE TABLE dwd_order_info_his
SELECT
    src.id,
    src.total_amount,
    src.order_status,
    src.user_id,
    src.payment_way,
    src.out_trade_no,
    src.create_time,
    src.operate_time,
    '2019-02-13',
    '9999-99-99'
FROM ods_order_info src
WHERE src.dt = '2019-02-13';
1、如何获得每日变动表
(1)最好表内有创建时间和变动时间(Lucky!)
(2)如果没有,可以利用第三方工具监控比如canal,监控MySQL的实时变化进行记录(麻烦)。
(3)逐行对比前后两天的数据, 检查md5(concat(全部有可能变化的字段))是否相同(low)
(4)要求业务数据库提供变动流水(人品,颜值)
2、因为dwd_order_info本身导入过来就是新增变动明细的表,所以不用处理
(1)2019-02-14日新增2条订单数据
-- Generates the 2 new order records for 2019-02-14 that the step heading refers to.
-- NOTE(review): init_data is not defined in this section, so the meaning of
-- the remaining arguments (5, 10, TRUE) should be checked against its definition.
CALL init_data('2019-02-14',2,5,10,TRUE);
(2)通过Sqoop把2019-02-14日所有数据导入
sqoop_import.sh all 2019-02-14
(3)ODS层数据导入
ods_db.sh 2019-02-14
(4)DWD层数据导入
dwd_db.sh 2019-02-14
1、建立临时表
-- Staging table with the same columns as the order zipper table
-- dwd_order_info_his. The merged history is written here first and then
-- copied back, so the zipper table is never read and overwritten in one statement.
-- External table, stored as snappy-compressed Parquet.
CREATE EXTERNAL TABLE dwd_order_info_his_tmp(
    `id`           STRING         COMMENT '订单编号',
    `total_amount` DECIMAL(10,2)  COMMENT '订单金额',
    `order_status` STRING         COMMENT '订单状态',
    `user_id`      STRING         COMMENT '用户id',
    `payment_way`  STRING         COMMENT '支付方式',
    `out_trade_no` STRING         COMMENT '支付流水号',
    `create_time`  STRING         COMMENT '创建时间',
    `operate_time` STRING         COMMENT '操作时间',
    `start_date`   STRING         COMMENT '有效开始日期',
    `end_date`     STRING         COMMENT '有效结束日期'
) COMMENT '订单拉链临时表'
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_order_info_his_tmp/'
TBLPROPERTIES ("parquet.compression"="snappy");
2、导入数据
-- Merge the changes of 2019-02-14 into the order history and write the result
-- to the staging table.
-- Branch 1: every dwd_order_info row with dt 2019-02-14 becomes a new open
--   version (start_date 2019-02-14, end_date 9999-99-99).
-- Branch 2: every existing history row is carried over. A row that is still
--   open (end_date 9999-99-99) and whose order id appears in the 2019-02-14
--   data is closed by setting end_date to the day before that dt. All other
--   rows keep their end_date.
-- The combined rows are ordered by id and then start_date.
INSERT OVERWRITE TABLE dwd_order_info_his_tmp
SELECT
    merged.id,
    merged.total_amount,
    merged.order_status,
    merged.user_id,
    merged.payment_way,
    merged.out_trade_no,
    merged.create_time,
    merged.operate_time,
    merged.start_date,
    merged.end_date
FROM
(
    SELECT
        id,
        total_amount,
        order_status,
        user_id,
        payment_way,
        out_trade_no,
        create_time,
        operate_time,
        '2019-02-14' AS start_date,
        '9999-99-99' AS end_date
    FROM dwd_order_info
    WHERE dt = '2019-02-14'
    UNION ALL
    SELECT
        hist.id,
        hist.total_amount,
        hist.order_status,
        hist.user_id,
        hist.payment_way,
        hist.out_trade_no,
        hist.create_time,
        hist.operate_time,
        hist.start_date,
        if(chg.id IS NULL, hist.end_date, date_add(chg.dt, -1)) AS end_date
    FROM dwd_order_info_his hist
    LEFT JOIN
    (
        SELECT
            id,
            dt
        FROM dwd_order_info
        WHERE dt = '2019-02-14'
    ) chg
        ON hist.id = chg.id AND hist.end_date = '9999-99-99'
) merged
ORDER BY merged.id, merged.start_date;
-- Replace the content of the zipper table with the merged history that was
-- prepared in the staging table.
INSERT OVERWRITE TABLE dwd_order_info_his
SELECT
    id,
    total_amount,
    order_status,
    user_id,
    payment_way,
    out_trade_no,
    create_time,
    operate_time,
    start_date,
    end_date
FROM dwd_order_info_his_tmp;