APP数据模拟处理流程—[总结篇]


1.使用MapReduce进行数据清洗


#!/bin/bash
# Daily cleaning job: runs the AppLogDataClean MapReduce class from data-clean.jar
# on the previous day's raw log directory and writes to a per-day "-clean" directory.

# Previous day's date formatted as YYYY-MM-DD.
# The format operand must be ONE word that starts with '+'. The original
# "+ '%Y-%m-%d'" (with a space) passes an empty format plus an extra operand,
# which makes GNU date fail and leaves day_str empty.
day_str=$(date -d '-1 day' +'%Y-%m-%d')

# Input and output paths handed to the job as its two arguments.
inpath=/app-log-data/data/$day_str
outpath=/app-log-data/clean/${day_str}-clean

# Braces delimit the variable name explicitly, because it is directly
# followed by non-ASCII text.
echo "准备清洗${day_str}数据..."

/root/apps/hadoop-2.8.3/bin/hadoop jar /root/data-clean.jar cn.edu360.app.log.mr.AppLogDataClean "$inpath" "$outpath"


---------------------------------------------------------------------------------------------------------------------------


2.原始数据建模


2.1原始数据表


-- ODS layer: external table over the cleaned app log files.
-- Every field is declared as string. Rows are partitioned by day and os,
-- and fields are delimited by the \001 control character.
create external table ods_app_log (
    sdk_ver             string,
    time_zone           string,
    commit_id           string,
    commit_time         string,
    pid                 string,
    app_token           string,
    app_id              string,
    device_id           string,
    device_id_type      string,
    release_channel     string,
    app_ver_name        string,
    app_ver_code        string,
    os_name             string,
    os_ver              string,
    language            string,
    country             string,
    manufacture         string,
    device_model        string,
    resolution          string,
    net_type            string,
    account             string,
    app_device_id       string,
    mac                 string,
    android_id          string,
    imei                string,
    cid_sn              string,
    build_num           string,
    mobile_data_type    string,
    promotion_channel   string,
    carrier             string,
    city                string,
    user_id             string
)
partitioned by (
    day string,
    os  string
)
row format delimited
    fields terminated by '\001'
location '/app-log-data/clean';
	
	
	
-- Register the android and ios partitions of ods_app_log for 2018-05-19,
-- each with an explicit storage location.
alter table ods_app_log
    add partition (day = '2018-05-19', os = 'android')
    location '/app-log-data/clean/2018-05-19/android';

alter table ods_app_log
    add partition (day = '2018-05-19', os = 'ios')
    location '/app-log-data/clean/2018-05-19/ios';






-- Load the android and ios directories under 2018-05-19-clean
-- into the matching ods_app_log partitions.
load data
    inpath '/app-log-data/clean/2018-05-19-clean/android'
    into table ods_app_log
    partition (day = '2018-05-19', os = 'android');

load data
    inpath '/app-log-data/clean/2018-05-19-clean/ios'
    into table ods_app_log
    partition (day = '2018-05-19', os = 'ios');




2.2日活数据表


-- Daily-active table: same 32 string columns as ods_app_log,
-- partitioned by day only, fields delimited by the \001 control character.
create table etl_user_active_day (
    sdk_ver             string,
    time_zone           string,
    commit_id           string,
    commit_time         string,
    pid                 string,
    app_token           string,
    app_id              string,
    device_id           string,
    device_id_type      string,
    release_channel     string,
    app_ver_name        string,
    app_ver_code        string,
    os_name             string,
    os_ver              string,
    language            string,
    country             string,
    manufacture         string,
    device_model        string,
    resolution          string,
    net_type            string,
    account             string,
    app_device_id       string,
    mac                 string,
    android_id          string,
    imei                string,
    cid_sn              string,
    build_num           string,
    mobile_data_type    string,
    promotion_channel   string,
    carrier             string,
    city                string,
    user_id             string
)
partitioned by (day string)
row format delimited
    fields terminated by '\001';




-- Populate the 2018-05-19 partition of etl_user_active_day from ods_app_log.
-- One row is kept per user_id: the first row when the rows of that user are
-- ordered by commit_time (a string column, so this is a string comparison).
insert into table etl_user_active_day partition (day = '2018-05-19')
select
    ranked.sdk_ver,
    ranked.time_zone,
    ranked.commit_id,
    ranked.commit_time,
    ranked.pid,
    ranked.app_token,
    ranked.app_id,
    ranked.device_id,
    ranked.device_id_type,
    ranked.release_channel,
    ranked.app_ver_name,
    ranked.app_ver_code,
    ranked.os_name,
    ranked.os_ver,
    ranked.language,
    ranked.country,
    ranked.manufacture,
    ranked.device_model,
    ranked.resolution,
    ranked.net_type,
    ranked.account,
    ranked.app_device_id,
    ranked.mac,
    ranked.android_id,
    ranked.imei,
    ranked.cid_sn,
    ranked.build_num,
    ranked.mobile_data_type,
    ranked.promotion_channel,
    ranked.carrier,
    ranked.city,
    ranked.user_id
from (
    select
        src.*,
        row_number() over (
            partition by src.user_id
            order by src.commit_time
        ) as row_rank
    from ods_app_log src
    where src.day = '2018-05-19'
) ranked
where ranked.row_rank = 1;




2.3维度日活数据表


-- Daily-active counts per dimension combination.
-- cnts holds the count for the four dimension columns of the row.
-- Partitioned by day and by dim.
create table dim_user_active_day (
    os_name         string,
    city            string,
    release_channel string,
    app_ver_name    string,
    cnts            int
)
partitioned by (
    day string,
    dim string
);


	


-- Daily-active user counts for 2018-05-19 over every combination of the four
-- dimensions os_name, city, release_channel and app_ver_name, written by a
-- single multi-insert statement with one shared FROM clause.
-- The dim partition value is a 4-character flag string in that column order:
-- 1 means the column is a grouping key, 0 means it is replaced by the literal all.
from etl_user_active_day

-- dim 0000: overall total
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '0000')
select 'all', 'all', 'all', 'all', count(1)
where day = '2018-05-19'

-- dim 1000: by os_name
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '1000')
select os_name, 'all', 'all', 'all', count(1)
where day = '2018-05-19'
group by os_name

-- dim 0100: by city
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '0100')
select 'all', city, 'all', 'all', count(1)
where day = '2018-05-19'
group by city

-- dim 0010: by release_channel
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '0010')
select 'all', 'all', release_channel, 'all', count(1)
where day = '2018-05-19'
group by release_channel

-- dim 0001: by app_ver_name
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '0001')
select 'all', 'all', 'all', app_ver_name, count(1)
where day = '2018-05-19'
group by app_ver_name

-- dim 1100: by os_name and city
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '1100')
select os_name, city, 'all', 'all', count(1)
where day = '2018-05-19'
group by os_name, city

-- dim 1010: by os_name and release_channel
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '1010')
select os_name, 'all', release_channel, 'all', count(1)
where day = '2018-05-19'
group by os_name, release_channel

-- dim 1001: by os_name and app_ver_name
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '1001')
select os_name, 'all', 'all', app_ver_name, count(1)
where day = '2018-05-19'
group by os_name, app_ver_name

-- dim 0110: by city and release_channel
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '0110')
select 'all', city, release_channel, 'all', count(1)
where day = '2018-05-19'
group by city, release_channel

-- dim 0101: by city and app_ver_name
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '0101')
select 'all', city, 'all', app_ver_name, count(1)
where day = '2018-05-19'
group by city, app_ver_name

-- dim 0011: by release_channel and app_ver_name
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '0011')
select 'all', 'all', release_channel, app_ver_name, count(1)
where day = '2018-05-19'
group by release_channel, app_ver_name

-- dim 1110: by os_name, city and release_channel
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '1110')
select os_name, city, release_channel, 'all', count(1)
where day = '2018-05-19'
group by os_name, city, release_channel

-- dim 0111: by city, release_channel and app_ver_name
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '0111')
select 'all', city, release_channel, app_ver_name, count(1)
where day = '2018-05-19'
group by city, release_channel, app_ver_name

-- dim 1101: by os_name, city and app_ver_name
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '1101')
select os_name, city, 'all', app_ver_name, count(1)
where day = '2018-05-19'
group by os_name, city, app_ver_name

-- dim 1011: by os_name, release_channel and app_ver_name
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '1011')
select os_name, 'all', release_channel, app_ver_name, count(1)
where day = '2018-05-19'
group by os_name, release_channel, app_ver_name

-- dim 1111: by all four dimensions
insert into table dim_user_active_day
    partition (day = '2018-05-19', dim = '1111')
select os_name, city, release_channel, app_ver_name, count(1)
where day = '2018-05-19'
group by os_name, city, release_channel, app_ver_name;






3.日新数据表建模


-- Step 1: history table holding a single user_id column.
create table etl_user_history (
    user_id string
);






-- Step 2: daily new-user table, created with the same definition as
-- etl_user_active_day (all log columns plus the day partition column).
create table etl_user_new_day
    like etl_user_active_day;




-- 统计实现 *********************************


-- Step 3: new users for 2018-05-20.
-- Takes the rows of that day from etl_user_active_day and keeps only those whose
-- user_id has no match in etl_user_history (left join, history side is null).
insert into table etl_user_new_day partition (day = '2018-05-20')
select
    act.sdk_ver,
    act.time_zone,
    act.commit_id,
    act.commit_time,
    act.pid,
    act.app_token,
    act.app_id,
    act.device_id,
    act.device_id_type,
    act.release_channel,
    act.app_ver_name,
    act.app_ver_code,
    act.os_name,
    act.os_ver,
    act.language,
    act.country,
    act.manufacture,
    act.device_model,
    act.resolution,
    act.net_type,
    act.account,
    act.app_device_id,
    act.mac,
    act.android_id,
    act.imei,
    act.cid_sn,
    act.build_num,
    act.mobile_data_type,
    act.promotion_channel,
    act.carrier,
    act.city,
    act.user_id
from etl_user_active_day act
left join etl_user_history hist
    on act.user_id = hist.user_id
where act.day = '2018-05-20'
  and hist.user_id is null;




-- Step 4: append the user_id values of the 2018-05-20 new-user partition
-- to the history table.
insert into table etl_user_history
select
    new_users.user_id
from etl_user_new_day new_users
where new_users.day = '2018-05-20';






4.日新维度数据表建模


-- Step 1: daily new-user counts per dimension combination.
-- Same column layout as dim_user_active_day, partitioned by day and dim.
create table dim_user_new_day (
    os_name         string,
    city            string,
    release_channel string,
    app_ver_name    string,
    cnts            int
)
partitioned by (
    day string,
    dim string
);


-- Step 2: new-user counts for 2018-05-20 over every combination of the four
-- dimensions os_name, city, release_channel and app_ver_name, written by a
-- single multi-insert statement with one shared FROM clause.
-- The dim partition value is a 4-character flag string in that column order:
-- 1 means the column is a grouping key, 0 means it is replaced by the literal all.
from etl_user_new_day

-- dim 0000: overall total
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '0000')
select 'all', 'all', 'all', 'all', count(1)
where day = '2018-05-20'

-- dim 1000: by os_name
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '1000')
select os_name, 'all', 'all', 'all', count(1)
where day = '2018-05-20'
group by os_name

-- dim 0100: by city
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '0100')
select 'all', city, 'all', 'all', count(1)
where day = '2018-05-20'
group by city

-- dim 0010: by release_channel
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '0010')
select 'all', 'all', release_channel, 'all', count(1)
where day = '2018-05-20'
group by release_channel

-- dim 0001: by app_ver_name
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '0001')
select 'all', 'all', 'all', app_ver_name, count(1)
where day = '2018-05-20'
group by app_ver_name

-- dim 1100: by os_name and city
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '1100')
select os_name, city, 'all', 'all', count(1)
where day = '2018-05-20'
group by os_name, city

-- dim 1010: by os_name and release_channel
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '1010')
select os_name, 'all', release_channel, 'all', count(1)
where day = '2018-05-20'
group by os_name, release_channel

-- dim 1001: by os_name and app_ver_name
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '1001')
select os_name, 'all', 'all', app_ver_name, count(1)
where day = '2018-05-20'
group by os_name, app_ver_name

-- dim 0110: by city and release_channel
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '0110')
select 'all', city, release_channel, 'all', count(1)
where day = '2018-05-20'
group by city, release_channel

-- dim 0101: by city and app_ver_name
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '0101')
select 'all', city, 'all', app_ver_name, count(1)
where day = '2018-05-20'
group by city, app_ver_name

-- dim 0011: by release_channel and app_ver_name
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '0011')
select 'all', 'all', release_channel, app_ver_name, count(1)
where day = '2018-05-20'
group by release_channel, app_ver_name

-- dim 1110: by os_name, city and release_channel
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '1110')
select os_name, city, release_channel, 'all', count(1)
where day = '2018-05-20'
group by os_name, city, release_channel

-- dim 0111: by city, release_channel and app_ver_name
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '0111')
select 'all', city, release_channel, app_ver_name, count(1)
where day = '2018-05-20'
group by city, release_channel, app_ver_name

-- dim 1101: by os_name, city and app_ver_name
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '1101')
select os_name, city, 'all', app_ver_name, count(1)
where day = '2018-05-20'
group by os_name, city, app_ver_name

-- dim 1011: by os_name, release_channel and app_ver_name
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '1011')
select os_name, 'all', release_channel, app_ver_name, count(1)
where day = '2018-05-20'
group by os_name, release_channel, app_ver_name

-- dim 1111: by all four dimensions
insert into table dim_user_new_day
    partition (day = '2018-05-20', dim = '1111')
select os_name, city, release_channel, app_ver_name, count(1)
where day = '2018-05-20'
group by os_name, city, release_channel, app_ver_name;

你可能感兴趣的:(学习笔记)