After loading the raw data in the ODS layer and parsing it in the DWD layer, this section analyzes concrete metrics: the DWS layer lightly aggregates the DWD data, and the ADS layer computes the final statistics.
User activity topic
-
DWS layer goal: produce a detail record of every device that is active on the current day, in the current week, and in the current month.
-
Daily active device detail
- Create table statement
hive (gmall)> drop table if exists dws_uv_detail_day;
create external table dws_uv_detail_day(
    `mid_id` string COMMENT 'unique device identifier',
    `user_id` string COMMENT 'user identifier',
    `version_code` string COMMENT 'app version code',
    `version_name` string COMMENT 'app version name',
    `lang` string COMMENT 'system language',
    `source` string COMMENT 'channel id',
    `os` string COMMENT 'Android OS version',
    `area` string COMMENT 'region',
    `model` string COMMENT 'phone model',
    `brand` string COMMENT 'phone brand',
    `sdk_version` string COMMENT 'sdkVersion',
    `gmail` string COMMENT 'gmail',
    `height_width` string COMMENT 'screen height and width',
    `app_time` string COMMENT 'time the client log was generated',
    `network` string COMMENT 'network mode',
    `lng` string COMMENT 'longitude',
    `lat` string COMMENT 'latitude'
)
partitioned by(dt string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_day';
- Insert data
hive (gmall)> set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_day partition(dt='2020-02-03')
select
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang)) lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area,
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat
from dwd_start_log
where dt='2020-02-03'
group by mid_id;
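The insert collapses all of a device's start-log records for the day into a single row: group by mid_id deduplicates by device, while concat_ws('|', collect_set(col)) keeps each distinct value of the remaining columns, joined with '|'. A self-contained sketch with made-up values (hypothetical data, not from the project tables, and assuming a Hive version that allows FROM-less SELECT in subqueries) shows the effect:
hive (gmall)> select
                  mid_id,
                  concat_ws('|', collect_set(user_id)) user_id,
                  concat_ws('|', collect_set(version_code)) version_code
              from (
                  select 'mid_1' mid_id, 'u_1' user_id, '1' version_code
                  union all
                  select 'mid_1' mid_id, 'u_1' user_id, '2' version_code
              ) t
              group by mid_id;
-- expected: mid_1   u_1   1|2   (collect_set keeps distinct values but does not guarantee order)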
- Check the result
hive (gmall)> select * from dws_uv_detail_day limit 1;
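As a quick cross-check (an illustrative query, assuming dwd_start_log holds the 2020-02-03 partition), the two counts below should be equal, because the DWS table keeps exactly one row per active device:
hive (gmall)> select count(distinct mid_id) from dwd_start_log where dt='2020-02-03';
hive (gmall)> select count(*) from dws_uv_detail_day where dt='2020-02-03';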
-
Weekly active device detail
- Create table statement
hive (gmall)> drop table if exists dws_uv_detail_wk;
create external table dws_uv_detail_wk(
    `mid_id` string COMMENT 'unique device identifier',
    `user_id` string COMMENT 'user identifier',
    `version_code` string COMMENT 'app version code',
    `version_name` string COMMENT 'app version name',
    `lang` string COMMENT 'system language',
    `source` string COMMENT 'channel id',
    `os` string COMMENT 'Android OS version',
    `area` string COMMENT 'region',
    `model` string COMMENT 'phone model',
    `brand` string COMMENT 'phone brand',
    `sdk_version` string COMMENT 'sdkVersion',
    `gmail` string COMMENT 'gmail',
    `height_width` string COMMENT 'screen height and width',
    `app_time` string COMMENT 'time the client log was generated',
    `network` string COMMENT 'network mode',
    `lng` string COMMENT 'longitude',
    `lat` string COMMENT 'latitude',
    `monday_date` string COMMENT 'Monday of the week',
    `sunday_date` string COMMENT 'Sunday of the week'
) COMMENT 'weekly detail of active users'
PARTITIONED BY (`wk_dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_wk/';
- Insert data
hive (gmall)> set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_wk partition(wk_dt)
select
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang)) lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area,
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat,
    date_add(next_day('2020-02-03','mo'),-7),
    date_add(next_day('2020-02-03','mo'),-1),
    concat(date_add(next_day('2020-02-03','mo'),-7),'_',date_add(next_day('2020-02-03','mo'),-1))
from dws_uv_detail_day
where dt>=date_add(next_day('2020-02-03','mo'),-7) and dt<=date_add(next_day('2020-02-03','mo'),-1)
group by mid_id;
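The week boundaries come from next_day: next_day('2020-02-03','mo') is the first Monday after 2020-02-03, so stepping back 7 days gives the Monday of the current week and stepping back 1 day gives its Sunday; the concatenated pair also becomes the wk_dt partition value. A standalone query (for illustration only) shows the computed dates:
hive (gmall)> select
                  next_day('2020-02-03','mo')              next_monday,
                  date_add(next_day('2020-02-03','mo'),-7) monday_date,
                  date_add(next_day('2020-02-03','mo'),-1) sunday_date;
-- expected: 2020-02-10    2020-02-03    2020-02-09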
- Query the imported result
select * from dws_uv_detail_wk limit 1;
-
Monthly active device detail
- Create table statement
hive (gmall)> drop table if exists dws_uv_detail_mn;
create external table dws_uv_detail_mn(
    `mid_id` string COMMENT 'unique device identifier',
    `user_id` string COMMENT 'user identifier',
    `version_code` string COMMENT 'app version code',
    `version_name` string COMMENT 'app version name',
    `lang` string COMMENT 'system language',
    `source` string COMMENT 'channel id',
    `os` string COMMENT 'Android OS version',
    `area` string COMMENT 'region',
    `model` string COMMENT 'phone model',
    `brand` string COMMENT 'phone brand',
    `sdk_version` string COMMENT 'sdkVersion',
    `gmail` string COMMENT 'gmail',
    `height_width` string COMMENT 'screen height and width',
    `app_time` string COMMENT 'time the client log was generated',
    `network` string COMMENT 'network mode',
    `lng` string COMMENT 'longitude',
    `lat` string COMMENT 'latitude'
) COMMENT 'monthly detail of active users'
PARTITIONED BY (`mn` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_mn/';
- Insert data
hive (gmall)> set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_mn partition(mn)
select
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang)) lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area,
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat,
    date_format('2020-02-03', 'yyyy-MM')
from dws_uv_detail_day
where date_format(dt, 'yyyy-MM')=date_format('2020-02-03', 'yyyy-MM')
group by mid_id;
- Query the result
hive (gmall)> select * from dws_uv_detail_mn limit 2;
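The month key date_format(dt,'yyyy-MM') does double duty: it filters every daily partition of the month out of dws_uv_detail_day and it supplies the value of the dynamic partition mn. To see how many devices ended up in each monthly partition (an illustrative query):
hive (gmall)> select mn, count(*) device_count
              from dws_uv_detail_mn
              group by mn;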
- Import the remaining data
-
-
ADS layer goal: compute the number of active devices for the current day, week, and month.
- Create table statement
hive (gmall)> drop table if exists ads_uv_count;
create external table ads_uv_count(
    `dt` string COMMENT 'statistics date',
    `day_count` bigint COMMENT 'user count for the day',
    `wk_count` bigint COMMENT 'user count for the week',
    `mn_count` bigint COMMENT 'user count for the month',
    `is_weekend` string COMMENT 'Y/N, whether the date is the last day of the week; used to obtain the final weekly figure',
    `is_monthend` string COMMENT 'Y/N, whether the date is the last day of the month; used to obtain the final monthly figure'
) COMMENT 'active device counts'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_uv_count/';
- Load data
hive (gmall)> insert into table ads_uv_count
select
    '2020-02-03',
    daycount.ct,
    weekcount.ct,
    mncount.ct,
    if(date_add(next_day('2020-02-03','mo'),-1)='2020-02-03', 'Y', 'N'),
    if(last_day('2020-02-03')='2020-02-03', 'Y', 'N')
from
(
    select '2020-02-03' dt, count(*) ct
    from dws_uv_detail_day
    where dt='2020-02-03'
) daycount
join
(
    select '2020-02-03' dt, count(*) ct
    from dws_uv_detail_wk
    where wk_dt=concat(date_add(next_day('2020-02-03','mo'),-7),'_',date_add(next_day('2020-02-03','mo'),-1))
) weekcount on daycount.dt=weekcount.dt
join
(
    select '2020-02-03' dt, count(*) ct
    from dws_uv_detail_mn
    where mn=date_format('2020-02-03','yyyy-MM')
) mncount on daycount.dt=mncount.dt;
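The two flags simply compare the statistics date with the Sunday of its week and with the last day of its month; the row flagged 'Y' is the one whose weekly or monthly count is final. For 2020-02-03 both comparisons fail, so both flags are 'N'. A standalone check (for illustration only):
hive (gmall)> select
                  date_add(next_day('2020-02-03','mo'),-1) week_end,
                  last_day('2020-02-03')                   month_end;
-- expected: 2020-02-09    2020-02-29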
- Load the remaining data
[hadoop@hadoop151 bin]$ ads_uv_log.sh 2020-01-01 2020-01-31
-
Query the data in the ADS layer.
hive (gmall)> select * from ads_uv_count;
OK
ads_uv_count.dt  ads_uv_count.day_count  ads_uv_count.wk_count  ads_uv_count.mn_count  ads_uv_count.is_weekend  ads_uv_count.is_monthend
2020-02-03  741   741   741   N  N
2020-01-01  521   990   1000  N  N
2020-01-10  728   999   1000  N  N
2020-01-11  763   999   1000  N  N
2020-01-12  742   999   1000  Y  N
2020-01-13  444   1000  1000  N  N
2020-01-14  757   1000  1000  N  N
2020-01-15  757   1000  1000  N  N
2020-01-16  756   1000  1000  N  N
2020-01-17  744   1000  1000  N  N
2020-01-18  746   1000  1000  N  N
2020-01-19  722   1000  1000  Y  N
2020-01-02  506   990   1000  N  N
2020-01-20  751   1000  1000  N  N
2020-01-21  742   1000  1000  N  N
2020-01-22  760   1000  1000  N  N
2020-01-23  750   1000  1000  N  N
2020-01-24  784   1000  1000  N  N
2020-01-25  578   1000  1000  N  N
2020-01-26  866   1000  1000  Y  N
2020-01-27  750   999   1000  N  N
2020-01-28  757   999   1000  N  N
2020-01-29  745   999   1000  N  N
2020-01-03  736   990   1000  N  N
2020-01-30  766   999   1000  N  N
2020-01-31  657   999   1000  N  Y
2020-01-04  502   990   1000  N  N
2020-01-05  759   990   1000  Y  N
2020-01-06  762   999   1000  N  N
2020-01-07  772   999   1000  N  N
2020-01-08  735   999   1000  N  N
2020-01-09  29    999   1000  N  N
Time taken: 0.042 seconds, Fetched: 32 row(s)
The weekly active device count coming out close to the monthly count is most likely an artifact of the generated tracking data; since no real production data is available, a result like this is to be expected.