1.进行数据清理 MapReduce
#!/bin/bash
# Daily log-cleaning driver: runs the AppLogDataClean MapReduce job over
# yesterday's raw app logs and writes the cleaned output for the Hive ODS layer.
#
# Bug fixed: the original `date -d '-1 day' + '%Y-%m-%d'` put a space between
# `+` and the format string; date then treats them as separate operands and
# errors out. The format specifier must be attached directly to `+`.
day_str=$(date -d '-1 day' +'%Y-%m-%d')

# Raw input written by the collectors; cleaned output goes to a sibling
# "<day>-clean" directory that the downstream Hive load statements consume.
inpath=/app-log-data/data/$day_str
outpath=/app-log-data/clean/${day_str}-clean

echo "准备清洗$day_str数据..."
# Quote the paths so the job arguments survive any unexpected whitespace.
/root/apps/hadoop-2.8.3/bin/hadoop jar /root/data-clean.jar cn.edu360.app.log.mr.AppLogDataClean "$inpath" "$outpath"
---------------------------------------------------------------------------------------------------------------------------
2.原始数据建模
2.1原始数据表
-- ODS (operational data store) table over the cleaned app logs.
-- EXTERNAL: dropping the table leaves the HDFS files in place; data is
-- attached per (day, os) partition from /app-log-data/clean.
CREATE EXTERNAL TABLE ods_app_log (
    sdk_ver            string,
    time_zone          string,
    commit_id          string,
    commit_time        string,
    pid                string,
    app_token          string,
    app_id             string,
    device_id          string,
    device_id_type     string,
    release_channel    string,
    app_ver_name       string,
    app_ver_code       string,
    os_name            string,
    os_ver             string,
    language           string,
    country            string,
    manufacture        string,
    device_model       string,
    resolution         string,
    net_type           string,
    account            string,
    app_device_id      string,
    mac                string,
    android_id         string,
    imei               string,
    cid_sn             string,
    build_num          string,
    mobile_data_type   string,
    promotion_channel  string,
    carrier            string,
    city               string,
    user_id            string
)
PARTITIONED BY (day string, os string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LOCATION '/app-log-data/clean';
-- Attach the day's data to ods_app_log partitions.
-- Fix: the ADD PARTITION locations must point at the directories the cleaning
-- job actually writes, /app-log-data/clean/<day>-clean/<os> (see the driver
-- script's outpath and the LOAD DATA paths below); the original locations
-- omitted the "-clean" suffix, so the partitions referenced empty paths.
ALTER TABLE ods_app_log ADD PARTITION (day = '2018-05-19', os = 'android') LOCATION '/app-log-data/clean/2018-05-19-clean/android';
ALTER TABLE ods_app_log ADD PARTITION (day = '2018-05-19', os = 'ios') LOCATION '/app-log-data/clean/2018-05-19-clean/ios';
-- Alternative loading method: LOAD DATA INPATH *moves* the files into the
-- partition's location under the table directory. Use either ADD PARTITION
-- or LOAD DATA for a given day, not both, or the second statement will
-- operate on an already-relocated directory.
load data inpath '/app-log-data/clean/2018-05-19-clean/android' into table ods_app_log partition(day = '2018-05-19',os = 'android');
load data inpath '/app-log-data/clean/2018-05-19-clean/ios' into table ods_app_log partition(day = '2018-05-19',os = 'ios');
2.2日活数据表
-- Daily-active-user table: one row per user_id per day (the user's earliest
-- event of the day), partitioned by day. Same column layout as ods_app_log
-- minus the os partition column.
CREATE TABLE etl_user_active_day (
    sdk_ver            string,
    time_zone          string,
    commit_id          string,
    commit_time        string,
    pid                string,
    app_token          string,
    app_id             string,
    device_id          string,
    device_id_type     string,
    release_channel    string,
    app_ver_name       string,
    app_ver_code       string,
    os_name            string,
    os_ver             string,
    language           string,
    country            string,
    manufacture        string,
    device_model       string,
    resolution         string,
    net_type           string,
    account            string,
    app_device_id      string,
    mac                string,
    android_id         string,
    imei               string,
    cid_sn             string,
    build_num          string,
    mobile_data_type   string,
    promotion_channel  string,
    carrier            string,
    city               string,
    user_id            string
)
PARTITIONED BY (day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001';
-- Populate the daily-active table: for each user_id seen on the day, keep
-- only that user's earliest event (ROW_NUMBER ordered by commit_time).
INSERT INTO TABLE etl_user_active_day PARTITION (day = '2018-05-19')
SELECT
    sdk_ver,
    time_zone,
    commit_id,
    commit_time,
    pid,
    app_token,
    app_id,
    device_id,
    device_id_type,
    release_channel,
    app_ver_name,
    app_ver_code,
    os_name,
    os_ver,
    language,
    country,
    manufacture,
    device_model,
    resolution,
    net_type,
    account,
    app_device_id,
    mac,
    android_id,
    imei,
    cid_sn,
    build_num,
    mobile_data_type,
    promotion_channel,
    carrier,
    city,
    user_id
FROM (
    -- Rank each user's events by commit time within the day's partition.
    SELECT
        *,
        row_number() OVER (PARTITION BY user_id ORDER BY commit_time) AS event_rank
    FROM ods_app_log
    WHERE day = '2018-05-19'
) ranked
WHERE event_rank = 1;
2.3.维度日活数据表
-- Dimension roll-up table for daily-active counts. Each (day, dim) partition
-- holds the counts for one combination of the four dimensions; a dimension
-- rolled up to "all" is stored as the literal string 'all'.
CREATE TABLE dim_user_active_day (
    os_name          string,
    city             string,
    release_channel  string,
    app_ver_name     string,
    cnts             int
)
PARTITIONED BY (day string, dim string);
-- Hive multi-insert: one pass over etl_user_active_day computes the
-- daily-active user count for every combination of the four dimensions
-- (os_name, city, release_channel, app_ver_name) and writes each result
-- into its own (day, dim) partition of dim_user_active_day.
-- "dim" is a 4-character bit mask over the dimensions, in the order
-- os_name / city / release_channel / app_ver_name:
--   '1' = grouped by that dimension, '0' = rolled up to the literal 'all'.
from etl_user_active_day
-- dim 0000: grand total (no dimensions)
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '0000')
select 'all','all','all','all',count(1)
where day ='2018-05-19'
-- single-dimension roll-ups
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '1000')
select os_name,'all','all','all',count(1)
where day ='2018-05-19'
group by os_name
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '0100')
select 'all',city,'all','all',count(1)
where day ='2018-05-19'
group by city
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '0010')
select 'all','all',release_channel,'all',count(1)
where day ='2018-05-19'
group by release_channel
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '0001')
select 'all','all','all',app_ver_name,count(1)
where day ='2018-05-19'
group by app_ver_name
-- two-dimension combinations
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '1100')
select os_name,city,'all','all',count(1)
where day = '2018-05-19'
group by os_name,city
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '1010')
select os_name,'all',release_channel,'all',count(1)
where day = '2018-05-19'
group by os_name,release_channel
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '1001')
select os_name,'all','all',app_ver_name,count(1)
where day = '2018-05-19'
group by os_name,app_ver_name
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '0110')
select 'all',city,release_channel,'all',count(1)
where day = '2018-05-19'
group by city,release_channel
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '0101')
select 'all',city,'all',app_ver_name,count(1)
where day = '2018-05-19'
group by city,app_ver_name
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day = '2018-05-19'
group by release_channel,app_ver_name
-- three-dimension combinations
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '1110')
select os_name,city,release_channel,'all',count(1)
where day = '2018-05-19'
group by os_name,city,release_channel
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '0111')
select 'all',city,release_channel,app_ver_name,count(1)
where day = '2018-05-19'
group by city,release_channel,app_ver_name
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '1101')
select os_name,city,'all',app_ver_name,count(1)
where day = '2018-05-19'
group by os_name,city,app_ver_name
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '1011')
select os_name,'all',release_channel,app_ver_name,count(1)
where day = '2018-05-19'
group by os_name,release_channel,app_ver_name
-- all four dimensions
insert into table dim_user_active_day partition(day = '2018-05-19',dim = '1111')
select os_name,city,release_channel,app_ver_name,count(1)
where day = '2018-05-19'
group by os_name,city,release_channel,app_ver_name;
3.日新数据表建模
-- 1. History table: one row per user_id ever seen; used as the anti-join
--    target when computing each day's new users.
-- 2. Daily new-user table: same layout as the daily-active table (earliest
--    event row per user), with a day partition column.
-- IF NOT EXISTS makes this one-time setup DDL safe to re-run.
create table if not exists etl_user_history(user_id string);
create table if not exists etl_user_new_day like etl_user_active_day;
-- New-user statistics ------------------------------------------------------
-- 3. Active-today minus history = today's new users, written to the day
--    partition of etl_user_new_day. The LEFT JOIN plus the
--    "hist.user_id is null" filter is a left anti-join: only active users
--    with no row in the history table survive.
insert into etl_user_new_day partition(day = '2018-05-20')
select
    act.sdk_ver,
    act.time_zone,
    act.commit_id,
    act.commit_time,
    act.pid,
    act.app_token,
    act.app_id,
    act.device_id,
    act.device_id_type,
    act.release_channel,
    act.app_ver_name,
    act.app_ver_code,
    act.os_name,
    act.os_ver,
    act.language,
    act.country,
    act.manufacture,
    act.device_model,
    act.resolution,
    act.net_type,
    act.account,
    act.app_device_id,
    act.mac,
    act.android_id,
    act.imei,
    act.cid_sn,
    act.build_num,
    act.mobile_data_type,
    act.promotion_channel,
    act.carrier,
    act.city,
    act.user_id
from etl_user_active_day act
left join etl_user_history hist on act.user_id = hist.user_id
where act.day = '2018-05-20'
    and hist.user_id is null;
-- 4. Append today's new user_ids to the history table so that tomorrow's
--    run no longer counts them as new.
insert into table etl_user_history
select user_id
from etl_user_new_day
where day = '2018-05-20';
4.日新维度数据表建模
-- Dimension roll-up table for daily-new-user counts; same layout and
-- partitioning scheme as dim_user_active_day.
create table dim_user_new_day (
    os_name          string,
    city             string,
    release_channel  string,
    app_ver_name     string,
    cnts             int
)
partitioned by (day string, dim string);
-- Hive multi-insert: one pass over etl_user_new_day computes the new-user
-- count for every combination of the four dimensions
-- (os_name, city, release_channel, app_ver_name) and writes each result
-- into its own (day, dim) partition of dim_user_new_day.
-- "dim" is a 4-character bit mask over the dimensions, in the order
-- os_name / city / release_channel / app_ver_name:
--   '1' = grouped by that dimension, '0' = rolled up to the literal 'all'.
from etl_user_new_day
-- dim 0000: grand total (no dimensions)
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '0000')
select 'all','all','all','all',count(1)
where day ='2018-05-20'
-- single-dimension roll-ups
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '1000')
select os_name,'all','all','all',count(1)
where day ='2018-05-20'
group by os_name
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '0100')
select 'all',city,'all','all',count(1)
where day ='2018-05-20'
group by city
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '0010')
select 'all','all',release_channel,'all',count(1)
where day ='2018-05-20'
group by release_channel
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '0001')
select 'all','all','all',app_ver_name,count(1)
where day ='2018-05-20'
group by app_ver_name
-- two-dimension combinations
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '1100')
select os_name,city,'all','all',count(1)
where day = '2018-05-20'
group by os_name,city
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '1010')
select os_name,'all',release_channel,'all',count(1)
where day = '2018-05-20'
group by os_name,release_channel
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '1001')
select os_name,'all','all',app_ver_name,count(1)
where day = '2018-05-20'
group by os_name,app_ver_name
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '0110')
select 'all',city,release_channel,'all',count(1)
where day = '2018-05-20'
group by city,release_channel
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '0101')
select 'all',city,'all',app_ver_name,count(1)
where day = '2018-05-20'
group by city,app_ver_name
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day = '2018-05-20'
group by release_channel,app_ver_name
-- three-dimension combinations
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '1110')
select os_name,city,release_channel,'all',count(1)
where day = '2018-05-20'
group by os_name,city,release_channel
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '0111')
select 'all',city,release_channel,app_ver_name,count(1)
where day = '2018-05-20'
group by city,release_channel,app_ver_name
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '1101')
select os_name,city,'all',app_ver_name,count(1)
where day = '2018-05-20'
group by os_name,city,app_ver_name
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '1011')
select os_name,'all',release_channel,app_ver_name,count(1)
where day = '2018-05-20'
group by os_name,release_channel,app_ver_name
-- all four dimensions
insert into table dim_user_new_day partition(day = '2018-05-20',dim = '1111')
select os_name,city,release_channel,app_ver_name,count(1)
where day = '2018-05-20'
group by os_name,city,release_channel,app_ver_name;