APP
,那这个用户定义为新增用户;卸载再安装的设备,不会算作一次新增。新增用户包括日新增用户、周新增用户、月新增用户。说明: 将同一分组的不同行数据聚合成一个集合。
举例:
创建数据表:
-- Demo table for the collect_set examples below:
-- one row per (student, area, course) with an integer score.
drop table if exists stu;
create table stu (name string,
area string,
course string,
score int);
插入数据:
insert into table stu values('zs', 'bj', 'math', 88);
insert into table stu values('ls', 'bj', 'math', 99);
insert into table stu values('ww', 'sh', 'chinese', 92);
insert into table stu values('zl', 'sh', 'chinese', 54);
insert into table stu values('tq', 'bj', 'chinese', 91);
查询数据:
select * from stu;
把同一分组的不同行数据聚合成一个集合:
select course , collect_set(area) , avg(score) from stu group by course;
用下标可以取某一个:
select course , collect_set(area) , avg(score)[0] from stu group by course;
date_format
函数(根据格式整理日期)
select date_format('2020-05-22','yyyy-MM');
-- 2020-05
date_add
函数(加减日期)
select date_add('2020-05-22',1);
-- 2020-05-23
select date_add('2020-05-22',-1);
-- 2020-05-21
next_day
函数
-- 取当前天的下一个周一
select next_day('2020-05-22','MO');
-- 2020-05-25
-- 取当前周的周一
select date_add(next_day('2020-05-22','MO'),-7);
-- 2020-05-18
last_day
函数(求当月最后一天日期)
select last_day('2020-05-22');
-- 2020-05-31
跟设备唯一标识 mid_id
有关,且与启动表 dwd_start_log
有关。
建表语句:
-- DWS-layer daily active device detail: one row per active device (mid_id)
-- per day. Multi-valued device attributes are collapsed at load time with
-- collect_set + concat_ws('|', ...), hence every column is string-typed.
drop table if exists dws_uv_detail_day;
create external table dws_uv_detail_day (
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`net_work` string,
`lng` string,
`lat` string
)
partitioned by(`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_day/';
插入数据
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_day
partition(dt='2020-05-11')
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code ,
concat_ws('|', collect_set(version_name)) version_name ,
concat_ws('|', collect_set(lang)) lang ,
concat_ws('|', collect_set(source)) source ,
concat_ws('|', collect_set(os)) os ,
concat_ws('|', collect_set(area)) area ,
concat_ws('|', collect_set(model)) model ,
concat_ws('|', collect_set(brand)) brand ,
concat_ws('|', collect_set(sdk_version)) sdk_version ,
concat_ws('|', collect_set(gmail)) gmail ,
concat_ws('|', collect_set(height_width)) height_width ,
concat_ws('|', collect_set(app_time)) app_time ,
concat_ws('|', collect_set(net_work)) net_work ,
concat_ws('|', collect_set(lng)) lng ,
concat_ws('|', collect_set(lat)) lat
from dwd_start_log
where dt='2020-05-11'
group by mid_id;
查询导入结果
select * from dws_uv_detail_day limit 2;
select count(*) from dws_uv_detail_day;
跟设备唯一标识 mid_id
有关,且与日活表 dws_uv_detail_day
有关。
建表语句
drop table if exists dws_uv_detail_week;
create external table dws_uv_detail_week (
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`net_work` string,
`lng` string,
`lat` string,
`monday_date` string,
`sunday_date` string
)
partitioned by(`wk_dt` string) -- wk_dt 相当于表中的一个字段,且在最后
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_week/';
插入数据
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_week
partition(wk_dt)
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code ,
concat_ws('|', collect_set(version_name)) version_name ,
concat_ws('|', collect_set(lang)) lang ,
concat_ws('|', collect_set(source)) source ,
concat_ws('|', collect_set(os)) os ,
concat_ws('|', collect_set(area)) area ,
concat_ws('|', collect_set(model)) model ,
concat_ws('|', collect_set(brand)) brand ,
concat_ws('|', collect_set(sdk_version)) sdk_version ,
concat_ws('|', collect_set(gmail)) gmail ,
concat_ws('|', collect_set(height_width)) height_width ,
concat_ws('|', collect_set(app_time)) app_time ,
concat_ws('|', collect_set(net_work)) net_work ,
concat_ws('|', collect_set(lng)) lng ,
concat_ws('|', collect_set(lat)) lat ,
date_add(next_day('2020-05-11', 'MO') , -7),
date_add(next_day('2020-05-11', 'MO') , -1),
concat(date_add(next_day('2020-05-11', 'MO') , -7), '_', date_add(next_day('2020-05-11', 'MO') , -1))
from dws_uv_detail_day
where dt>=date_add(next_day('2020-05-11', 'MO') , -7) and dt<=date_add(next_day('2020-05-11', 'MO') , -1)
group by mid_id;
查询导入结果
select * from dws_uv_detail_week limit 2;
select count(*) from dws_uv_detail_week;
跟设备唯一标识 mid_id
有关,且与日活表 dws_uv_detail_day
有关。
建表语句
drop table if exists dws_uv_detail_month;
create external table dws_uv_detail_month (
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`net_work` string,
`lng` string,
`lat` string
)
partitioned by(`mt_dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_month/';
插入数据
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_month
partition(mt_dt)
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code ,
concat_ws('|', collect_set(version_name)) version_name ,
concat_ws('|', collect_set(lang)) lang ,
concat_ws('|', collect_set(source)) source ,
concat_ws('|', collect_set(os)) os ,
concat_ws('|', collect_set(area)) area ,
concat_ws('|', collect_set(model)) model ,
concat_ws('|', collect_set(brand)) brand ,
concat_ws('|', collect_set(sdk_version)) sdk_version ,
concat_ws('|', collect_set(gmail)) gmail ,
concat_ws('|', collect_set(height_width)) height_width ,
concat_ws('|', collect_set(app_time)) app_time ,
concat_ws('|', collect_set(net_work)) net_work ,
concat_ws('|', collect_set(lng)) lng ,
concat_ws('|', collect_set(lat)) lat ,
date_format('2020-05-11', 'yyyy-MM')
from dws_uv_detail_day
where date_format(dt, 'yyyy-MM') = date_format('2020-05-11', 'yyyy-MM')
group by mid_id;
查询导入结果
select * from dws_uv_detail_month limit 2;
select count(*) from dws_uv_detail_month;
在 hadoop101
的 /home/zgl/bin/
目录下创建脚本 dws_uv_log.sh
。
#!/bin/bash
# dws_uv_log.sh -- load the daily/weekly/monthly active-device detail
# tables (DWS layer) from the DWD start log.
# Usage: dws_uv_log.sh [yyyy-MM-dd]; defaults to yesterday when omitted.
APP=gmall
hive=/opt/module/hive-1.2.1/bin/hive
# Quote "$1": with an unquoted empty $1 the test collapses to [ -n ],
# which is always true, so the default-date branch could never run.
if [ -n "$1" ] ; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$APP".dws_uv_detail_day
partition(dt='$do_date')
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code ,
concat_ws('|', collect_set(version_name)) version_name ,
concat_ws('|', collect_set(lang)) lang ,
concat_ws('|', collect_set(source)) source ,
concat_ws('|', collect_set(os)) os ,
concat_ws('|', collect_set(area)) area ,
concat_ws('|', collect_set(model)) model ,
concat_ws('|', collect_set(brand)) brand ,
concat_ws('|', collect_set(sdk_version)) sdk_version ,
concat_ws('|', collect_set(gmail)) gmail ,
concat_ws('|', collect_set(height_width)) height_width ,
concat_ws('|', collect_set(app_time)) app_time ,
concat_ws('|', collect_set(net_work)) net_work ,
concat_ws('|', collect_set(lng)) lng ,
concat_ws('|', collect_set(lat)) lat
from "$APP".dwd_start_log
where dt='$do_date'
group by mid_id;
insert overwrite table "$APP".dws_uv_detail_week
partition(wk_dt)
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code ,
concat_ws('|', collect_set(version_name)) version_name ,
concat_ws('|', collect_set(lang)) lang ,
concat_ws('|', collect_set(source)) source ,
concat_ws('|', collect_set(os)) os ,
concat_ws('|', collect_set(area)) area ,
concat_ws('|', collect_set(model)) model ,
concat_ws('|', collect_set(brand)) brand ,
concat_ws('|', collect_set(sdk_version)) sdk_version ,
concat_ws('|', collect_set(gmail)) gmail ,
concat_ws('|', collect_set(height_width)) height_width ,
concat_ws('|', collect_set(app_time)) app_time ,
concat_ws('|', collect_set(net_work)) net_work ,
concat_ws('|', collect_set(lng)) lng ,
concat_ws('|', collect_set(lat)) lat ,
date_add(next_day('$do_date', 'MO') , -7),
date_add(next_day('$do_date', 'MO') , -1),
concat(date_add(next_day('$do_date', 'MO') , -7), '_', date_add(next_day('$do_date', 'MO') , -1))
from "$APP".dws_uv_detail_day
where dt>=date_add(next_day('$do_date', 'MO') , -7) and dt<=date_add(next_day('$do_date', 'MO') , -1)
group by mid_id;
insert overwrite table "$APP".dws_uv_detail_month
partition(mt_dt)
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code ,
concat_ws('|', collect_set(version_name)) version_name ,
concat_ws('|', collect_set(lang)) lang ,
concat_ws('|', collect_set(source)) source ,
concat_ws('|', collect_set(os)) os ,
concat_ws('|', collect_set(area)) area ,
concat_ws('|', collect_set(model)) model ,
concat_ws('|', collect_set(brand)) brand ,
concat_ws('|', collect_set(sdk_version)) sdk_version ,
concat_ws('|', collect_set(gmail)) gmail ,
concat_ws('|', collect_set(height_width)) height_width ,
concat_ws('|', collect_set(app_time)) app_time ,
concat_ws('|', collect_set(net_work)) net_work ,
concat_ws('|', collect_set(lng)) lng ,
concat_ws('|', collect_set(lat)) lat ,
date_format('$do_date', 'yyyy-MM')
from "$APP".dws_uv_detail_day
where date_format(dt, 'yyyy-MM') = date_format('$do_date', 'yyyy-MM')
group by mid_id;
"
$hive -e "$sql"
增加脚本执行权限
chmod 777 dws_uv_log.sh
脚本使用
dws_uv_log.sh 2020-05-12
查询是否导入成功
select count(*) from dws_uv_detail_month where mt_dt='2020-05-12';
目标:当日、当周、当月设备活跃数
建表语句
drop table if exists ads_uv_count;
create external table ads_uv_count (
`dt` string, -- 统计日期
`day_count` bigint, -- 当天活跃用户数
`week_count` bigint, -- 当周活跃用户数
`month_count` bigint, -- 当月活跃用户数
`is_weekend` string, -- Y/N代表是否是周末,用于得到本周的最终结果
`is_monthend` string -- Y/N代表是否是月末,用于得到本月的最终结果
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_uv_count_day/';
插入数据
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table ads_uv_count
select
'2020-05-11' dt,
daycount.ct day_count,
weekcount.ct week_count,
monthcount.ct month_count,
if(date_add(next_day('2020-05-11','MO'),-1)='2020-05-11', 'Y', 'N'),
if(last_day('2020-05-11')='2020-05-11', 'Y', 'N')
from (
select
'2020-05-11' dt, count(*) ct
from
dws_uv_detail_day
where
dt='2020-05-11'
) daycount
join
(
select
'2020-05-11' dt, count(*) ct
from
dws_uv_detail_week
where
wk_dt=concat(date_add(next_day('2020-05-11','MO'),-7), '_', date_add(next_day('2020-05-11','MO'),-1) )
) weekcount on daycount.dt=weekcount.dt
join
(
select
'2020-05-11' dt, count(*) ct
from
dws_uv_detail_month
where
mt_dt=date_format('2020-05-11' , 'yyyy-MM')
) monthcount on daycount.dt=monthcount.dt ;
查看导入是否成功
select * from ads_uv_count;
在 hadoop101
的 /home/zgl/bin/
的目录下创建脚本 ads_uv_log.sh
#!/bin/bash
APP=gmall
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert into table "$APP".ads_uv_count
select
'$do_date' dt,
daycount.ct day_count,
weekcount.ct week_count,
monthcount.ct month_count,
if(date_add(next_day('$do_date','MO'),-1)='$do_date', 'Y', 'N'),
if(last_day('$do_date')='$do_date', 'Y', 'N')
from (
select
'$do_date' dt, count(*) ct
from
"$APP".dws_uv_detail_day
where
dt='$do_date'
) daycount
join
(
select
'$do_date' dt, count(*) ct
from
"$APP".dws_uv_detail_week
where
wk_dt=concat(date_add(next_day('$do_date','MO'),-7), '_', date_add(next_day('$do_date','MO'),-1) )
) weekcount on daycount.dt=weekcount.dt
join
(
select
'$do_date' dt, count(*) ct
from
"$APP".dws_uv_detail_month
where
mt_dt=date_format('$do_date' , 'yyyy-MM')
) monthcount on daycount.dt=monthcount.dt ;
"
$hive -e "$sql"
给脚本增加可执行权限
chmod 777 ads_uv_log.sh
执行脚本
ads_uv_log.sh 2020-05-12
查看脚本执行结果
select * from ads_uv_count;
与设备的唯一标识 mid_id
有关,需要使用到日活表 dws_uv_detail_day
。
建表语句
drop table if exists dws_new_mid_day;
create external table dws_new_mid_day (
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`net_work` string,
`lng` string,
`lat` string,
`create_date` string
)
stored as parquet
location '/warehouse/gmall/dws/dws_new_mid_day/';
插入数据
-- New-device detection for 2020-05-11: left anti-join today's active
-- devices against the cumulative new-device table. A device whose mid_id
-- has never been recorded in dws_new_mid_day (nm.mid_id is null) is new;
-- reinstalls keep the same mid_id, so they are not counted again.
insert into table dws_new_mid_day
select
ud.mid_id,
ud.user_id,
ud.version_code,
ud.version_name,
ud.lang,
ud.source,
ud.os,
ud.area,
ud.model,
ud.brand,
ud.sdk_version,
ud.gmail,
ud.height_width,
ud.app_time,
ud.net_work,
ud.lng,
ud.lat,
'2020-05-11'
from
dws_uv_detail_day ud left join dws_new_mid_day nm
on ud.mid_id=nm.mid_id
-- active today AND absent from the right side -> first-ever appearance
where ud.dt='2020-05-11' and nm.mid_id is null;
查询导入的数据
select * from dws_new_mid_day;
在 hadoop101
的 /home/zgl/bin/
目录下创建 dws_new_mid_day_log.sh
#!/bin/bash
APP=gmall
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert into table "$APP".dws_new_mid_day
select
ud.mid_id,
ud.user_id,
ud.version_code,
ud.version_name,
ud.lang,
ud.source,
ud.os,
ud.area,
ud.model,
ud.brand,
ud.sdk_version,
ud.gmail,
ud.height_width,
ud.app_time,
ud.net_work,
ud.lng,
ud.lat,
'$do_date'
from
"$APP".dws_uv_detail_day ud left join "$APP".dws_new_mid_day nm
on ud.mid_id=nm.mid_id
where ud.dt='$do_date' and nm.mid_id is null;
"
$hive -e "$sql"
给脚本添加执行权限
chmod 777 dws_new_mid_day_log.sh
执行脚本
dws_new_mid_day_log.sh 2020-05-12
查看脚本执行情况
-- Verify the load by querying the TABLE (dws_new_mid_day); the original
-- accidentally used the shell script's file name (dws_new_mid_day_log.sh)
-- as the table name, which fails to parse.
select * from dws_new_mid_day limit 2;
select count(*) from dws_new_mid_day;
建表语句
drop table if exists ads_new_mid_count;
create external table ads_new_mid_count (
`create_date` string,
`new_mid_count` bigint
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_new_mid_count';
插入数据
insert into table ads_new_mid_count
select
'2020-05-11',
count(*)
from dws_new_mid_day
where create_date='2020-05-11';
查看是否插入成功
select * from ads_new_mid_count;
在 hadoop101
的 /home/zgl/bin/
目录下创建脚本 ads_new_mid_count_log.sh
。
#!/bin/bash
APP=gmall
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert into table "$APP".ads_new_mid_count
select
'$do_date',
count(*)
from "$APP".dws_new_mid_day
where create_date='$do_date';
"
$hive -e "$sql"
给脚本添加执行权限
chmod 777 ads_new_mid_count_log.sh
执行脚本
ads_new_mid_count_log.sh 2020-05-12
查看脚本执行结果
select * from ads_new_mid_count;
left join
每日新增表,新增表 id
为 null
的为 11日的新增设备。join
12日的活跃表,且新增日期是11日,活跃日期是12日。建表语句
drop table if exists dws_user_retention_day;
create external table dws_user_retention_day (
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`net_work` string,
`lng` string,
`lat` string,
`create_date` string, -- 新增日期
`retention_day` string -- 截至当前日期留存天数
)
partitioned by(`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_retention_day';
导入数据:(计算前一天的新用户访问留存明细)
insert overwrite table dws_user_retention_day
partition (dt='2020-05-12')
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.net_work,
nm.lng,
nm.lat,
nm.create_date,
1 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm
on ud.mid_id = nm.mid_id
where ud.dt='2020-05-12' and nm.create_date=date_add('2020-05-12',-1); -- 今日活跃 and 前一天新增
查询导入结果
select count(*) from dws_user_retention_day;
导入数据 (每天计算前 1,2,3,n 天的新用户访问留存明细)
insert overwrite table dws_user_retention_day
partition (dt='2020-05-12')
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.net_work,
nm.lng,
nm.lat,
nm.create_date,
1 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm
on ud.mid_id = nm.mid_id
where ud.dt='2020-05-12' and nm.create_date=date_add('2020-05-12',-1)
union all
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.net_work,
nm.lng,
nm.lat,
nm.create_date,
2 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm
on ud.mid_id = nm.mid_id
where ud.dt='2020-05-12' and nm.create_date=date_add('2020-05-12',-2)
union all
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.net_work,
nm.lng,
nm.lat,
nm.create_date,
3 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm
on ud.mid_id = nm.mid_id
where ud.dt='2020-05-12' and nm.create_date=date_add('2020-05-12',-3);
查询导入结果
select retention_day, count(*) from dws_user_retention_day group by retention_day;
union
会将联合的结果集去重,效率较 union all
差。union all
不会对结果集去重,所以效率较高。建表语句
drop table if exists ads_user_retention_day_count;
create external table ads_user_retention_day_count (
`create_date` string,
`retention_day` string,
`retention_count` bigint
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_count/';
导入数据
insert into table ads_user_retention_day_count
select
create_date,
retention_day,
count(*) retention_count
from dws_user_retention_day
where dt='2020-05-12'
group by create_date,retention_day;
查询导入结果
select * from ads_user_retention_day_count;
建表语句
drop table if exists ads_user_retention_day_rate;
create external table ads_user_retention_day_rate (
`stat_date` string,
`create_date` string,
`retention_day` string,
`retention_count` bigint,
`new_mid_count` bigint,
`retention_ratio` decimal(10,2)
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
导入数据
insert into table ads_user_retention_day_rate
select
'2020-05-12',
ur.create_date,
ur.retention_day,
ur.retention_count,
nc.new_mid_count,
ur.retention_count/nc.new_mid_count*100
from
(
select
create_date,
retention_day,
count(*) retention_count
from dws_user_retention_day
where dt='2020-05-12'
group by create_date,retention_day
) ur join ads_new_mid_count nc on nc.create_date=ur.create_date;
查询导入结果
select * from ads_user_retention_day_rate;
为了分析沉默用户、本周回流用户数、流失用户、最近连续3周活跃用户、最近七天内连续三天活跃用户数,需要准备 2020-05-13、2020-05-21日的数据。
2020-05-13 数据准备
2.1 修改日志事件
dt.sh 2020-05-13
2.2 启动集群
cluster.sh start
2.3 生成日志数据
lg.sh
2.4 将 HDFS 数据导入 ODS 层
ods_log.sh 2020-05-13
2.5 将 ODS 层数据导入 DWD 层
dwd_start_log.sh 2020-05-13
dwd_base_log.sh 2020-05-13
dwd_event_log.sh 2020-05-13
2.6 将 DWD 层数据导入到 DWS 层
dws_uv_log.sh 2020-05-13
2.7 验证
select * from dws_uv_detail_day where dt='2020-05-13' limit 2;
2020-05-21 数据准备
3.1 修改日志事件
dt.sh 2020-05-21
3.2 启动集群
cluster.sh start
3.3 生成日志数据
lg.sh
3.4 将 HDFS 数据导入 ODS 层
ods_log.sh 2020-05-21
3.5 将 ODS 层数据导入 DWD 层
dwd_start_log.sh 2020-05-21
dwd_base_log.sh 2020-05-21
dwd_event_log.sh 2020-05-21
3.6 将 DWD 层数据导入到 DWS 层
dws_uv_log.sh 2020-05-21
3.7 验证
select * from dws_uv_detail_day where dt='2020-05-21' limit 2;
dws_uv_detail_day
作为 DWS
层数据。建表语句
drop table if exists ads_slient_count;
create external table ads_slient_count (
`dt` string ,
`slient_count` bigint
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_slient_count';
导入数据
insert into table ads_slient_count
select
'2020-05-21' dt,
count(*) slient_count
from
(
select mid_id
from dws_uv_detail_day
where dt<='2020-05-21'
group by mid_id
having count(*)=1 and min(dt) < date_add('2020-05-21', -7)
) t1;
查询导入结果
select * from ads_slient_count;
在 hadoop101
的 /home/zgl/bin/
目录下创建脚本 ads_slient_log.sh
#!/bin/bash
APP=gmall
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert into table "$APP".ads_slient_count
select
'$do_date' dt,
count(*) slient_count
from
(
select mid_id
from "$APP".dws_uv_detail_day
where dt<='$do_date'
group by mid_id
having count(*)=1 and min(dt) < date_add('$do_date', -7)
) t1;
"
$hive -e "$sql"
增加脚本执行权限
chmod 777 ads_slient_log.sh
执行脚本
ads_slient_log.sh 2020-05-21
查询结果
select * from ads_slient_count;
dws_uv_detail_day
作为 DWS
层数据。建表语句
drop table if exists ads_back_count;
create external table ads_back_count (
`dt` string,
`wk_dt` string,
`wastage_count` bigint
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_back_count';
导入数据
insert into table ads_back_count
select
'2020-05-21' dt,
concat(date_add(next_day('2020-05-21','MO'),-7), '_', date_add(next_day('2020-05-21','MO'),-1)) wk_dt,
count(*)
from
(
select t1.mid_id
from
(
select mid_id
from dws_uv_detail_week
where wk_dt=concat(date_add(next_day('2020-05-21','MO'),-7),'_',date_add(next_day('2020-05-21','MO'),-1))
) t1
left join
(
select mid_id
from dws_new_mid_day
where create_date<=date_add(next_day('2020-05-21','MO'),-1) and create_date>=date_add(next_day('2020-05-21','MO'),-7)
)t2
on t1.mid_id=t2.mid_id
left join
(
select mid_id
from dws_uv_detail_week
where wk_dt=concat(date_add(next_day('2020-05-21','MO'),-7*2),'_',date_add(next_day('2020-05-21','MO'),-7-1))
)t3
on t1.mid_id=t3.mid_id
where t2.mid_id is null and t3.mid_id is null
) t4;
查询结果
select * from ads_back_count;
在 hadoop101
的 /home/zgl/bin/
目录下创建 ads_back_log.sh
#!/bin/bash
APP=gmall
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert into table "$APP".ads_back_count
select
'$do_date' dt,
concat(date_add(next_day('$do_date','MO'),-7), '_', date_add(next_day('$do_date','MO'),-1)) wk_dt,
count(*)
from
(
select t1.mid_id
from
(
select mid_id
from "$APP".dws_uv_detail_week
where wk_dt=concat(date_add(next_day('$do_date','MO'),-7),'_',date_add(next_day('$do_date','MO'),-1))
) t1
left join
(
select mid_id
from "$APP".dws_new_mid_day
where create_date<=date_add(next_day('$do_date','MO'),-1) and create_date>=date_add(next_day('$do_date','MO'),-7)
)t2
on t1.mid_id=t2.mid_id
left join
(
select mid_id
from "$APP".dws_uv_detail_week
where wk_dt=concat(date_add(next_day('$do_date','MO'),-7*2),'_',date_add(next_day('$do_date','MO'),-7-1))
)t3
on t1.mid_id=t3.mid_id
where t2.mid_id is null and t3.mid_id is null
) t4;
"
$hive -e "$sql"
给脚本添加执行权限
chmod 777 ads_back_log.sh
执行脚本
ads_back_log.sh 2020-05-21
查询结果
select * from ads_back_count;
dws_uv_detail_day
作为 DWS
层数据。建表语句
drop table if exists ads_wastage_count;
create external table ads_wastage_count(
`dt` string COMMENT '统计日期',
`wastage_count` bigint COMMENT '流失设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_wastage_count';
导入数据:
insert into table ads_wastage_count
select
'2020-05-21',
count(*)
from
(
select mid_id
from dws_uv_detail_day
group by mid_id
having max(dt)<=date_add('2020-05-21',-7)
)t1;
查询导入结果
select * from ads_wastage_count;
在 hadoop101
的 /home/zgl/bin/
目录下创建 ads_wastage_log.sh
#!/bin/bash
APP=gmall
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert into table "$APP".ads_wastage_count
select
'$do_date',
count(*)
from
(
select mid_id
from "$APP".dws_uv_detail_day
group by mid_id
having max(dt)<=date_add('$do_date',-7)
)t1;
"
$hive -e "$sql"
给脚本添加执行权限
chmod 777 ads_wastage_log.sh
执行脚本
ads_wastage_log.sh 2020-05-21
查询结果
select * from ads_wastage_count;
dws_uv_detail_week
作为 DWS
层数据。建表语句
drop table if exists ads_continuity_wk_count;
create external table ads_continuity_wk_count(
`dt` string,
`wk_dt` string,
`continuity_count` bigint
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_continuity_wk_count';
导入 2020-05-21 所在周的数据
insert into table ads_continuity_wk_count
select
'2020-05-21',
concat(date_add(next_day('2020-05-21','MO'),-7*3),'_',date_add(next_day('2020-05-21','MO'),-1)),
count(*)
from
(
select mid_id
from dws_uv_detail_week
where wk_dt>=concat(date_add(next_day('2020-05-21','MO'),-7*3),'_',date_add(next_day('2020-05-21','MO'),-7*2-1))
and wk_dt<=concat(date_add(next_day('2020-05-21','MO'),-7),'_',date_add(next_day('2020-05-21','MO'),-1))
group by mid_id
having count(*)=3
)t1;
查询
select * from ads_continuity_wk_count;
在 hadoop101
的 /home/zgl/bin/
下创建脚本:ads_continuity_wk_log.sh
#!/bin/bash
APP=gmall
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert into table "$APP".ads_continuity_wk_count
select
'$do_date', concat(date_add(next_day('$do_date','MO'),-7*3),'_',date_add(next_day('$do_date','MO'),-1)),
count(*)
from
(
select mid_id
from "$APP".dws_uv_detail_week
where wk_dt>=concat(date_add(next_day('$do_date','MO'),-7*3),'_',date_add(next_day('$do_date','MO'),-7*2-1))
and wk_dt<=concat(date_add(next_day('$do_date','MO'),-7),'_',date_add(next_day('$do_date','MO'),-1))
group by mid_id
having count(*)=3
)t1;
"
$hive -e "$sql"
给脚本添加执行权限
chmod 777 ads_continuity_wk_log.sh
执行脚本
ads_continuity_wk_log.sh 2020-05-21
查看运行结果
select * from ads_continuity_wk_count;
dws_uv_detail_day
作为 DWS
层数据。建表语句
drop table if exists ads_continuity_uv_count;
create external table ads_continuity_uv_count(
`dt` string ,
`wk_dt` string ,
`continuity_count` bigint
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_continuity_uv_count';
导入数据
-- Devices active on >=3 consecutive days within the last 7 days
-- (2020-05-15 .. 2020-05-21), using the "date minus rank" trick:
-- within one mid_id, consecutive activity dates make dt - rank() a
-- constant, so grouping by that difference groups each consecutive run.
insert into table ads_continuity_uv_count
select
'2020-05-21',
concat(date_add('2020-05-21',-6),'_','2020-05-21'),
count(*)
from
(
select mid_id
from
(
select mid_id
from
(
select
mid_id,
date_sub(dt,rank) date_dif
from
(
select
mid_id,
dt,
rank() over(partition by mid_id order by dt) rank
from dws_uv_detail_day
where dt>=date_add('2020-05-21',-6) and dt<='2020-05-21'
)t1
)t2
-- a run of N consecutive days yields N rows sharing one date_dif
group by mid_id,date_dif
having count(*)>=3
)t3
-- dedupe: a device with several qualifying runs is counted once
group by mid_id
)t4;
查询结果:
select * from ads_continuity_uv_count;
在 hadoop101
的 /home/zgl/bin
目录下创建脚本: ads_continuity_uv_log.sh
。
#!/bin/bash
APP=gmall
hive=/opt/module/hive-1.2.1/bin/hive
if [ -n "$1" ]; then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
insert into table "$APP".ads_continuity_uv_count
select
'$do_date',
concat(date_add('$do_date',-6),'_','$do_date'),
count(*)
from
(
select mid_id
from
(
select mid_id
from
(
select
mid_id,
date_sub(dt,rank) date_dif
from
(
select
mid_id,
dt,
rank() over(partition by mid_id order by dt) rank
from "$APP".dws_uv_detail_day
where dt>=date_add('$do_date',-6) and dt<='$do_date'
)t1
)t2
group by mid_id,date_dif
having count(*)>=3
)t3
group by mid_id
)t4;
"
$hive -e "$sql"
给脚本添加执行权限
chmod 777 ads_continuity_uv_log.sh
执行脚本
ads_continuity_uv_log.sh 2020-05-21
查看结果
select * from ads_continuity_uv_count;