ODS层的数据与源数据的格式基本相同,在ODS数据库中建立用户启动日志信息表【ods_start_log】,如下所示:
-- ODS landing table for the app start log.
-- Single string column `str`; external table over /user/data/logs/start,
-- partitioned by the string column `dt`.
USE ODS;

CREATE EXTERNAL TABLE ods.ods_start_log (
    `str` STRING
)
COMMENT '用户启动日志信息'
PARTITIONED BY (`dt` STRING)
LOCATION '/user/data/logs/start';
加载启动日志数据:
手动新建分区:alter table ods.ods_start_log add partition(dt='2020-07-21');
但实际中分区数量很多,不可能每次都手动新建分区,因此可以使用脚本按日期添加分区、加载启动日志数据,脚本【ods_load_log.sh】如下:
#!/bin/bash
# Registers one daily partition on <APP>.ods_start_log.
# Usage: ods_load_log.sh [YYYY-MM-DD]
#   With no argument the partition date defaults to yesterday.
APP=ODS
hive=/opt/apps/hive-2.3.7/bin/hive

# Partition date: first argument if given, otherwise yesterday (date +%F => YYYY-MM-DD).
if [ -n "$1" ]
then
    do_date=$1
else
    do_date=$(date -d "-1 day" +%F)
fi

# SQL to execute.
# "if not exists" makes a re-run for the same date a no-op; a plain
# "add partition" fails when the partition is already registered.
sql="alter table ${APP}.ods_start_log add if not exists partition(dt='${do_date}');"

"$hive" -e "$sql"
【ods_start_log】表中现有数据如下:
hive (ods)> SELECT * FROM ods_start_log LIMIT 1;
OK
ods_start_log.str ods_start_log.dt
2021-09-16 16:55:01.203 [main] INFO com.lagou.ecommerce.AppStart - {"app_active":{"name":"app_active","json":{"entry":"1","action":"0","error_code":"0"},"time":1595260800000},"attr":{"area":"连云港","uid":"2F10092A1","app_v":"1.1.8","event_type":"common","device_id":"1FB872-9A1001","os_type":"0.43","channel":"PN","language":"chinese","brand":"iphone-7"}} 2020-07-21
Time taken: 0.376 seconds, Fetched: 1 row(s)
目前在ODS层中存放的还是原始数据,此时还需要进行json数据解析,保留有效数据,并将数据展开,形成每日启动明细表
创建DWD层表:表的格式:parquet、分区表
-- DWD detail table for parsed start-log records.
-- Recreated from scratch on every run: any existing table is dropped first.
-- All columns are strings; stored as parquet and partitioned by dt.
USE DWD;

DROP TABLE IF EXISTS dwd.dwd_start_log;

CREATE TABLE dwd.dwd_start_log (
    `device_id`  STRING,
    `area`       STRING,
    `uid`        STRING,
    `app_v`      STRING,
    `event_type` STRING,
    `os_type`    STRING,
    `channel`    STRING,
    `language`   STRING,
    `brand`      STRING,
    `entry`      STRING,
    `action`     STRING,
    `error_code` STRING
)
PARTITIONED BY (dt STRING)
STORED AS PARQUET;
从ODS层加载数据到DWD层脚本如下【dwd_start_log】:
#!/bin/bash
# Parses one day of raw start-log lines from ods.ods_start_log and overwrites
# the matching dt partition of dwd.dwd_start_log with the extracted fields.
# Usage: <script> [YYYY-MM-DD]
#   With no argument the date defaults to yesterday.
source /etc/profile

# Business date: first argument if given, otherwise yesterday.
if [ -n "$1" ]
then
    do_date=$1
else
    do_date=$(date -d "-1 day" +%F)
fi

# SQL to execute.
# The JSON payload is taken as everything from the first '{' of the line.
# Picking a fixed token index after splitting on spaces depends on the exact
# spacing of the log prefix and truncates the JSON whenever a value contains
# a space.
# NOTE(review): for a line without '{', instr() returns 0 and the whole line
# is passed on; get_json_object is then expected to return NULL for every
# field -- confirm on the target Hive version.
# JSON paths are written as \$.x so the shell can never treat the dollar
# sign as the start of an expansion.
sql="
with tmp as (
    select substr(str, instr(str, '{')) as line
    from ods.ods_start_log
    where dt = '$do_date'
)
insert overwrite table dwd.dwd_start_log
partition (dt = '$do_date')
select
    get_json_object(line, '\$.attr.device_id'),
    get_json_object(line, '\$.attr.area'),
    get_json_object(line, '\$.attr.uid'),
    get_json_object(line, '\$.attr.app_v'),
    get_json_object(line, '\$.attr.event_type'),
    get_json_object(line, '\$.attr.os_type'),
    get_json_object(line, '\$.attr.channel'),
    get_json_object(line, '\$.attr.language'),
    get_json_object(line, '\$.attr.brand'),
    get_json_object(line, '\$.app_active.json.entry'),
    get_json_object(line, '\$.app_active.json.action'),
    get_json_object(line, '\$.app_active.json.error_code')
from tmp;
"

hive -e "$sql"
此时【dwd_start_log】表中数据如下:
hive (dwd)> SELECT * FROM dwd_start_log LIMIT 5;
OK
dwd_start_log.device_id dwd_start_log.area dwd_start_log.uid dwd_start_log.app_v dwd_start_log.event_type dwd_start_log.os_type dwd_start_log.channel dwd_start_log.language dwd_start_log.brand dwd_start_log.entry dwd_start_log.action dwd_start_log.error_code dwd_start_log.dt
1FB872-9A1001 连云港 2F10092A1 1.1.8 common 0.43 PN chinese iphone-7 1 0 0 2020-07-21
1FB872-9A1002 金昌 2F10092A2 1.1.5 common 5.8.7 OF chinese xiaomi-0 1 1 0 2020-07-21
1FB872-9A1003 句容 2F10092A3 1.1.16 common 0.99 YI chinese iphone-3 1 0 0 2020-07-21
1FB872-9A1004 肇庆 2F10092A4 1.1.6 common 6.6.2 CD chinese xiaomi-6 1 1 0 2020-07-21
1FB872-9A1005 武汉 2F10092A5 1.1.2 common 6.9 WG chinese xiaomi-1 1 1 0 2020-07-21
Time taken: 0.239 seconds, Fetched: 5 row(s)
将DWD中的原始日志数据处理为活跃用户的明细数据,处理过程如下:
DWS层建表:用户日启动日志汇总【dws_member_start_day】,用户周启动汇总表【dws_member_start_week】,用户月启动汇总表【dws_member_start_month】
-- DWS daily start summary: device_id plus seven string attribute columns.
-- Stored as parquet, partitioned by dt.
CREATE TABLE dws.dws_member_start_day (
    `device_id` STRING,
    `uid`       STRING,
    `app_v`     STRING,
    `os_type`   STRING,
    `language`  STRING,
    `channel`   STRING,
    `area`      STRING,
    `brand`     STRING
)
COMMENT '用户日启动汇总'
PARTITIONED BY (dt STRING)
STORED AS PARQUET;
-- DWS weekly start summary: same columns as the daily summary plus `week`.
-- Stored as parquet, partitioned by dt.
CREATE TABLE dws.dws_member_start_week (
    `device_id` STRING,
    `uid`       STRING,
    `app_v`     STRING,
    `os_type`   STRING,
    `language`  STRING,
    `channel`   STRING,
    `area`      STRING,
    `brand`     STRING,
    `week`      STRING
)
COMMENT '用户周启动汇总'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET;
-- DWS monthly start summary: same columns as the daily summary plus `month`.
-- Stored as parquet, partitioned by dt.
CREATE TABLE dws.dws_member_start_month (
    `device_id` STRING,
    `uid`       STRING,
    `app_v`     STRING,
    `os_type`   STRING,
    `language`  STRING,
    `channel`   STRING,
    `area`      STRING,
    `brand`     STRING,
    `month`     STRING
)
COMMENT '用户月启动汇总'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET;
活跃用户DWS层加载数据
#!/bin/bash
# Rebuilds the DWS start summaries for one business date:
#   1. dws_member_start_day   from dwd.dwd_start_log (that date only)
#   2. dws_member_start_week  from the daily summary, Monday of that week .. date
#   3. dws_member_start_month from the daily summary, first of that month .. date
# Each statement overwrites the dt partition of its target table.
# Usage: <script> [YYYY-MM-DD]
source /etc/profile

# Business date: first argument when non-empty, otherwise yesterday.
do_date=${1:-$(date -d "-1 day" +%F)}

# One row per device_id; every other attribute is the set of distinct values
# seen for that device, joined with '|'.
sql="
insert overwrite table dws.dws_member_start_day
partition (dt = '$do_date')
select
    device_id,
    concat_ws('|', collect_set(uid)),
    concat_ws('|', collect_set(app_v)),
    concat_ws('|', collect_set(os_type)),
    concat_ws('|', collect_set(language)),
    concat_ws('|', collect_set(channel)),
    concat_ws('|', collect_set(area)),
    concat_ws('|', collect_set(brand))
from dws_src_placeholder
where dt = '$do_date'
group by device_id;
-- weekly active members, aggregated from the daily summary
insert overwrite table dws.dws_member_start_week
partition (dt = '$do_date')
select
    device_id,
    concat_ws('|', collect_set(uid)),
    concat_ws('|', collect_set(app_v)),
    concat_ws('|', collect_set(os_type)),
    concat_ws('|', collect_set(language)),
    concat_ws('|', collect_set(channel)),
    concat_ws('|', collect_set(area)),
    concat_ws('|', collect_set(brand)),
    date_add(next_day('$do_date', 'mo'), -7)
from dws.dws_member_start_day
where dt >= date_add(next_day('$do_date', 'mo'), -7)
  and dt <= '$do_date'
group by device_id;
-- monthly active members, aggregated from the daily summary
insert overwrite table dws.dws_member_start_month
partition (dt = '$do_date')
select
    device_id,
    concat_ws('|', collect_set(uid)),
    concat_ws('|', collect_set(app_v)),
    concat_ws('|', collect_set(os_type)),
    concat_ws('|', collect_set(language)),
    concat_ws('|', collect_set(channel)),
    concat_ws('|', collect_set(area)),
    concat_ws('|', collect_set(brand)),
    date_format('$do_date', 'yyyy-MM')
from dws.dws_member_start_day
where dt >= date_format('$do_date', 'yyyy-MM-01')
  and dt <= '$do_date'
group by device_id;
"
# The daily statement reads the DWD detail table.
sql=${sql/dws_src_placeholder/dwd.dwd_start_log}

hive -e "$sql"
DWS层中是每日、每周、每月活跃会员的汇总表,在ADS层中要计算当天、当周、当月的活跃用户数量,结果保存到表【ads_member_active_count】:
-- ADS result table: three int counters per dt partition.
-- Stored as delimited text with ',' as the field separator.
CREATE TABLE ads.ads_member_active_count (
    `day_count`   INT COMMENT '当日会员数量',
    `week_count`  INT COMMENT '当周会员数量',
    `month_count` INT COMMENT '当月会员数量'
)
COMMENT '活跃会员数'
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
加载ADS层数据【ads_load_member_active.sh】
#!/bin/bash
# Writes the day / week / month row counts of the three DWS start summaries
# into the dt partition of ads.ads_member_active_count.
# Usage: ads_load_member_active.sh [YYYY-MM-DD]
source /etc/profile

# Business date: first argument when non-empty, otherwise yesterday.
do_date=$1
if [ -z "$do_date" ]; then
    do_date=$(date -d "-1 day" +%F)
fi

# Each subquery counts the rows of one summary table for the date; the three
# results are inner-joined on dt, so nothing is selected unless all three
# tables have rows for that date.
sql="
insert overwrite table ads.ads_member_active_count
partition (dt = '$do_date')
select daycnt, weekcnt, monthcnt
from (
    select dt, count(*) as daycnt
    from dws.dws_member_start_day
    where dt = '$do_date'
    group by dt
) day_agg
join (
    select dt, count(*) as weekcnt
    from dws.dws_member_start_week
    where dt = '$do_date'
    group by dt
) week_agg
    on day_agg.dt = week_agg.dt
join (
    select dt, count(*) as monthcnt
    from dws.dws_member_start_month
    where dt = '$do_date'
    group by dt
) month_agg
    on day_agg.dt = month_agg.dt;
"

hive -e "$sql"