用户访问路径分析:
用户访问路径明细记录表
源表:DWD_APP_TFC_DTL_DEMO
目标表:DWD_APP_RUT_DTL_DEMO
源表DWD_APP_TFC_DTL_DEMO表结构:
hive>create table DWD_APP_TFC_DTL_DEMO(
guid bigint,
eventid String,
event Map,
uid String,
imei String,
mac String,
imsi String,
osName String,
osVer String,
androidId String,
resolution String,
deviceType String,
deviceId String,
uuid String,
appid String,
appVer String,
release_ch String,
promotion_ch String,
areacode String,
longtitude Double,
latitude Double,
carrier String,
netType String,
cid_sn String,
ip String,
sessionId String,
`timestamp` bigint,
province String,
city String,
district String,
year string,
month string,
day string,
datestr string
)
partitioned by (dt string);
+--------------------------+-----------------------+-----------------------+--+
| col_name | data_type | comment |
+--------------------------+-----------------------+-----------------------+--+
| guid | bigint | |
| eventid | string | |
| event | map<string,string> | |
| uid | string | |
| imei | string | |
| mac | string | |
| imsi | string | |
| osname | string | |
| osver | string | |
| androidid | string | |
| resolution | string | |
| devicetype | string | |
| deviceid | string | |
| uuid | string | |
| appid | string | |
| appver | string | |
| release_ch | string | |
| promotion_ch | string | |
| areacode | string | |
| longtitude | double | |
| latitude | double | |
| carrier | string | |
| nettype | string | |
| cid_sn | string | |
| ip | string | |
| sessionid | string | |
| timestamp | bigint | |
| province | string | |
| city | string | |
| district | string | |
| year | string | |
| month | string | |
| day | string | |
| datestr | string | |
| dt | string | |
| | NULL | NULL |
| # Partition Information | NULL | NULL |
| # col_name | data_type | comment |
| | NULL | NULL |
| dt | string | |
+--------------------------+-----------------------+-----------------------+--+
DWD_APP_TFC_DTL_DEMO表数据:
链接:https://pan.baidu.com/s/1AP_yeVZQL31QumXi8DN4OQ
提取码:wjl5
导入:
#数据导入
-- Load the local sample file into partition dt='2023-03-01' of the source table.
hive>load data local inpath '/root/tmp_data/data/file.txt'
into table DWD_APP_TFC_DTL_DEMO
partition (dt = '2023-03-01');
#数据导出
-- Export every row of the source table to a local directory.
-- No row format is given, so Hive writes its default field delimiter.
hive>insert overwrite local directory '/root/tmp_data/data/table'
select *
from DWD_APP_TFC_DTL_DEMO;
#数据导出,指定分隔符为 ','
-- Export every row of the source table to a local directory as comma-separated text.
hive>insert overwrite local directory '/root/tmp_data/data/table'
row format delimited
fields terminated by ','
select *
from DWD_APP_TFC_DTL_DEMO;
数据示例:
9334571961830607macos10.0百度手机助手MI_MIX2FXnvkTfIDK8r51hCML6l3qcn.kgc.mall2.2.8百度手机助手0765282510285.7073300343655138.47361498737002ISP06WIFI44345302633837.51.202.77sid-388b01cb-f158-4ef7-8d9d-91869f7468c31575562642000unkownunkownunkown20191262019-12-062023-03-01-479747698pgviewEventutm_sourceurlhttp://www.kgcedu.cn/abi/pg416referrer_hosthttp://www.kgcedu.cn/aba/pg758utm_contentpgid416utm_campaignreferrertitleutm_mediumutm_term0598972660742431a0-f8-e2-07-46-50-c9
9334571961830607macos10.0百度手机助手MI_MIX2FXnvkTfIDK8r51hCML6l3qcn.kgc.mall2.2.8百度手机助手0765282510285.7073300343655138.47361498737002ISP06WIFI44345302633837.51.202.77sid-388b01cb-f158-4ef7-8d9d-91869f7468c31575563314000unkownunkownunkown20191262019-12-062023-03-01-479747698pgviewEventutm_sourceurlhttp://www.kgcedu.cn/aai/pg406referrer_hosthttp://www.kgcedu.cn/abi/pg416utm_contentpgid406utm_campaignreferrertitleutm_mediumutm_term0598972660742431a0-f8-e2-07-46-50-c9
9334571961830607macos10.0百度手机助手MI_MIX2FXnvkTfIDK8r51hCML6l3qcn.kgc.mall2.2.8百度手机助手0765282510285.7073300343655138.47361498737002ISP06WIFI44345302633837.51.202.77sid-388b01cb-f158-4ef7-8d9d-91869f7468c31575571978000unkownunkownunkown20191262019-12-062023-03-01
-- Access-path detail table: ORC storage, partitioned by dt.
hive>create table DWD_APP_RUT_DTL_DEMO (
    guid       bigint,
    sessionid  string,
    url        string,
    stepno     int,     -- which step of the visit this page is
    referral   string,  -- previous page
    stay_times bigint   -- time spent on the page
)
partitioned by (dt string)
stored as orc;
源表记录:
-- Preview the source columns used by the path analysis for partition dt='2023-03-01'.
hive>select
    src.guid,
    src.sessionid,
    src.event['url'] as url,
    src.`timestamp` as ts
from DWD_APP_TFC_DTL_DEMO src
where src.dt = '2023-03-01';
结果:
+--------------+-------------------------------------------+---------------------------------+----------------+--+
| guid | sessionid | url | ts |
+--------------+-------------------------------------------+---------------------------------+----------------+--+
| -1522483296 | sid-99fe7648-d8e4-4cbe-86af-17b5b3c3a7fc | http://www.kgcedu.cn/acd/pg939 | 1575555829000 |
| -1589815556 | sid-34565b53-3d1e-4d7b-8e9c-6afb94ca81a2 | http://www.kgcedu.cn/aab/pg460 | 1575522543000 |
| -1589815556 | sid-34565b53-3d1e-4d7b-8e9c-6afb94ca81a2 | http://www.kgcedu.cn/abd/pg497 | 1575529492000 |
| -1589815556 | sid-34565b53-3d1e-4d7b-8e9c-6afb94ca81a2 | http://www.kgcedu.cn/aao/pg125 | 1575532020000 |
| -1589815556 | sid-34565b53-3d1e-4d7b-8e9c-6afb94ca81a2 | http://www.kgcedu.cn/abo/pg920 | 1575532834000 |
+--------------+-------------------------------------------+---------------------------------+----------------+--+
计算用户访问路径分析:
-- For each (guid, sessionid), order the events by `timestamp` and derive:
--   stepno   : position of the event within the session
--   referral : url of the preceding event (NULL for the first event)
--   after_ts : `timestamp` of the following event (NULL for the last event)
-- All three functions share one window, declared once in the window clause.
-- The statement is terminated with ';' so the CLI actually executes it.
hive>select
    guid,
    sessionid,
    event['url'] as url,
    row_number() over w as stepno,
    lag(event['url'], 1, null) over w as referral,
    `timestamp` as ts,
    lead(`timestamp`, 1, null) over w as after_ts
from DWD_APP_TFC_DTL_DEMO
where dt = '2023-03-01'
window w as (partition by guid, sessionid order by `timestamp`);
结果:
+--------------+-------------------------------------------+---------------------------------+---------+---------------------------------+----------------+----------------+--+
| guid | sessionid | url | stepno | referral | ts | after_ts |
+--------------+-------------------------------------------+---------------------------------+---------+---------------------------------+----------------+----------------+--+
| -2139243149 | sid-5fb31973-cc47-4d02-a66a-686afc769f0a | http://www.kgcedu.cn/aco/pg906 | 1 | NULL | 1575525663000 | 1575526312000 |
| -2139243149 | sid-5fb31973-cc47-4d02-a66a-686afc769f0a | http://www.kgcedu.cn/aaf/pg112 | 2 | http://www.kgcedu.cn/aco/pg906 | 1575526312000 | 1575529711000 |
| -2139243149 | sid-5fb31973-cc47-4d02-a66a-686afc769f0a | http://www.kgcedu.cn/acd/pg418 | 3 | http://www.kgcedu.cn/aaf/pg112 | 1575529711000 | 1575535940000 |
| -2139243149 | sid-5fb31973-cc47-4d02-a66a-686afc769f0a | http://www.kgcedu.cn/abd/pg300 | 4 | http://www.kgcedu.cn/acd/pg418 | 1575535940000 | 1575537311000 |
| -2139243149 | sid-5fb31973-cc47-4d02-a66a-686afc769f0a | http://www.kgcedu.cn/aad/pg367 | 5 | http://www.kgcedu.cn/abd/pg300 | 1575537311000 | 1575537902000 |
+--------------+-------------------------------------------+---------------------------------+---------+---------------------------------+----------------+----------------+--+
-- Same per-session ordering as the previous query, but the next event's
-- `timestamp` is turned into a duration: stay_times = next `timestamp` - current `timestamp`.
-- stay_times is NULL for the last event of a session because lead() has no next row.
hive>
select
    guid,
    sessionid,
    event['url'] as url,
    row_number() over w as stepno,
    lag(event['url'], 1, null) over w as referral,
    `timestamp` as ts,
    lead(`timestamp`, 1, null) over w - `timestamp` as stay_times
from DWD_APP_TFC_DTL_DEMO
where dt = '2023-03-01'
window w as (partition by guid, sessionid order by `timestamp`);
结果:
+--------------+-------------------------------------------+---------------------------------+---------+---------------------------------+----------------+-------------+--+
| guid | sessionid | url | stepno | referral | ts | stay_times |
+--------------+-------------------------------------------+---------------------------------+---------+---------------------------------+----------------+-------------+--+
| -2139243149 | sid-5fb31973-cc47-4d02-a66a-686afc769f0a | http://www.kgcedu.cn/aco/pg906 | 1 | NULL | 1575525663000 | 649000 |
| -2139243149 | sid-5fb31973-cc47-4d02-a66a-686afc769f0a | http://www.kgcedu.cn/aaf/pg112 | 2 | http://www.kgcedu.cn/aco/pg906 | 1575526312000 | 3399000 |
| -2139243149 | sid-5fb31973-cc47-4d02-a66a-686afc769f0a | http://www.kgcedu.cn/acd/pg418 | 3 | http://www.kgcedu.cn/aaf/pg112 | 1575529711000 | 6229000 |
| -2139243149 | sid-5fb31973-cc47-4d02-a66a-686afc769f0a | http://www.kgcedu.cn/abd/pg300 | 4 | http://www.kgcedu.cn/acd/pg418 | 1575535940000 | 1371000 |
| -2139243149 | sid-5fb31973-cc47-4d02-a66a-686afc769f0a | http://www.kgcedu.cn/aad/pg367 | 5 | http://www.kgcedu.cn/abd/pg300 | 1575537311000 | 591000 |
+--------------+-------------------------------------------+---------------------------------+---------+---------------------------------+----------------+-------------+--+
最终计算:
-- Populate partition dt='2023-03-01' of the access-path detail table.
-- insert overwrite replaces the partition, so re-running the load does not
-- append a second copy of the same rows.
-- The last event of a session has no following event, so its stay time
-- defaults to 30000.
-- NOTE(review): presumably 30000 means 30 seconds in epoch milliseconds; confirm the unit of `timestamp`.
hive>insert overwrite table DWD_APP_RUT_DTL_DEMO partition (dt = '2023-03-01')
select
    guid,
    sessionid,
    event['url'] as url,
    row_number() over w as stepno,
    lag(event['url'], 1, null) over w as referral,
    coalesce(lead(`timestamp`, 1, null) over w - `timestamp`, 30000) as stay_times
from DWD_APP_TFC_DTL_DEMO
where dt = '2023-03-01'
window w as (partition by guid, sessionid order by `timestamp`);
访问路径概况统计报表
横表计算
源表:DWD_APP_RUT_DTL_DEMO
目标:ADS_APP_RUT_OVW_DEMO
-- Access-path overview report table: ORC storage, partitioned by dt.
-- One row per (stepno, url, referral) with counts at three levels.
hive>create table ADS_APP_RUT_OVW_DEMO (
    stepno       int,
    url          string,
    referral     string,
    route_counts int,  -- count for the (stepno, url, referral) combination
    step_counts  int,  -- count for the (url, stepno) combination
    page_counts  int   -- count for the url
)
partitioned by (dt string)
stored as orc;
计算逻辑:先算出各种路径会话数,然后基于上面的结果,利用sum() over() 进行page+step的计算累加,得到步骤会话数,
再利用sum() over() 进行窗口page的累加,得到页面会话数
-- Wide-format overview: count rows per (stepno, url, referral), then roll the
-- counts up to (url, stepno) and to url with window sums.
-- With no order by and no frame, a window covers its whole partition, so each
-- sum is the partition total.
-- NOTE(review): page_counts adds up route counts, so a session that reaches
-- the same url at several steps is counted once per step; confirm whether
-- distinct sessions are intended.
hive>with route_stats as (
    select
        stepno,
        url,
        referral,
        count(sessionid) as route_counts
    from DWD_APP_RUT_DTL_DEMO
    where dt = '2023-03-01'
    group by stepno, url, referral
)
select
    stepno,
    url,
    referral,
    route_counts,
    sum(route_counts) over (partition by url, stepno) as step_counts,
    sum(route_counts) over (partition by url) as page_counts
from route_stats;
结果:
+---------+---------------------------------+---------------------------------+---------------+--------------+--------------+--+
| stepno | url | referral | route_counts | step_counts | page_counts |
+---------+---------------------------------+---------------------------------+---------------+--------------+--------------+--+
| 2 | http://www.kgcedu.cn/aaa/pg011 | http://www.kgcedu.cn/aac/pg410 | 1 | 1 | 3 |
| 29 | http://www.kgcedu.cn/aaa/pg011 | http://www.kgcedu.cn/aaa/pg483 | 1 | 1 | 3 |
| 17 | http://www.kgcedu.cn/aaa/pg011 | http://www.kgcedu.cn/abb/pg485 | 1 | 1 | 3 |
| 3 | http://www.kgcedu.cn/aaa/pg016 | http://www.kgcedu.cn/aag/pg099 | 1 | 1 | 5 |
| 8 | http://www.kgcedu.cn/aaa/pg016 | http://www.kgcedu.cn/aah/pg825 | 1 | 2 | 5 |
+---------+---------------------------------+---------------------------------+---------------+--------------+--------------+--+
模拟数据
使用count() over() 进行计算
-- Small comma-delimited text table holding the simulated access-path rows.
hive>create table RUT_DEMO (
    sessionid string,
    stepno    int,
    url       string,
    referral  string
)
row format delimited
fields terminated by ',';
-- Load the simulated path rows from the local url.txt file.
hive>load data local inpath '/root/tmp_data/data/url.txt'
into table RUT_DEMO;
linux>vi url.txt
sessionid,stepno,url,referral
1,1,A,NULL
1,2,B,A
1,3,C,B
1,4,D,C
2,1,A,NULL
2,2,B,A
2,3,C,B
2,4,D,C
3,1,D,NULL
3,2,B,D
3,3,C,B
3,4,X,C
3,5,F,X
--count() over()
-- 页面 会话数
--步骤+页面 会话数
--步骤+页面+前页 会话数
-- 注意:部分低版本 Hive(2.1.0 之前)不支持 count(distinct sessionid) over();vagrant 环境下可以执行
-- Session counts at three levels, one output row per (sessionid, url, stepno, referral):
--   page_counts  : distinct sessions that visited the url
--   step_counts  : sessions that reached the url at this step
--   route_counts : sessions that reached the url at this step from this referral
-- page_counts uses size(collect_set(...) over (...)) instead of
-- count(distinct ...) over (...). The distinct form is not accepted by every
-- Hive version and, combined with this group by, was observed to return 1 on
-- every row rather than the per-url session count.
hive>select
    stepno,
    url,
    referral,
    size(collect_set(sessionid) over (partition by url)) as page_counts,
    count(sessionid) over (partition by url, stepno) as step_counts,
    count(sessionid) over (partition by url, stepno, referral) as route_counts
from RUT_DEMO
group by sessionid, url, stepno, referral;
结果:
+---------+------+-----------+--------------+--------------+---------------+--+
| stepno | url | referral | page_counts | step_counts | route_counts |
+---------+------+-----------+--------------+--------------+---------------+--+
| 1 | A | NULL | 2 | 2 | 2 |
| 1 | A | NULL | 2 | 2 | 2 |
| 2 | B | A | 3 | 3 | 2 |
| 2 | B | A | 3 | 3 | 2 |
| 2 | B | D | 3 | 3 | 1 |
| 3 | C | B | 3 | 3 | 3 |
| 3 | C | B | 3 | 3 | 3 |
| 3 | C | B | 3 | 3 | 3 |
| 1 | D | NULL | 3 | 1 | 1 |
| 4 | D | C | 3 | 2 | 2 |
| 4 | D | C | 3 | 2 | 2 |
| 5 | F | X | 1 | 1 | 1 |
| 4 | X | C | 1 | 1 | 1 |
+---------+------+-----------+--------------+--------------+---------------+--+
去掉完全重复的记录
-- Same three-level session counts, collapsed to one row per distinct
-- (stepno, url, referral, counts) combination.
-- page_counts uses size(collect_set(...) over (...)) because
-- count(distinct ...) over (...) combined with group by was observed to
-- return 1 on every row.
-- The outer query lists its columns explicitly: select * with group by made
-- Hive label the columns "(tok_table_or_col ...)". The order by makes the row
-- order deterministic.
hive>with session_counts as (
    select
        stepno,
        url,
        referral,
        size(collect_set(sessionid) over (partition by url)) as page_counts,
        count(sessionid) over (partition by url, stepno) as step_counts,
        count(sessionid) over (partition by url, stepno, referral) as route_counts
    from RUT_DEMO
    group by sessionid, url, stepno, referral
)
select distinct
    stepno,
    url,
    referral,
    page_counts,
    step_counts,
    route_counts
from session_counts
order by stepno, url, referral;
结果:
+---------+------+-----------+--------------+--------------+---------------+--+
| stepno | url | referral | page_counts | step_counts | route_counts |
+---------+------+-----------+--------------+--------------+---------------+--+
| 1 | A | NULL | 2 | 2 | 2 |
| 1 | D | NULL | 3 | 1 | 1 |
| 2 | B | A | 3 | 3 | 2 |
| 2 | B | D | 3 | 3 | 1 |
| 3 | C | B | 3 | 3 | 3 |
| 4 | D | C | 3 | 2 | 2 |
| 4 | X | C | 1 | 1 | 1 |
| 5 | F | X | 1 | 1 | 1 |
+---------+------+-----------+--------------+--------------+---------------+--+
使用 grouping sets((url),(url,stepno),(url,stepno,referral)) 计算的是竖表
-- Long-format session counts: one row per grouping level and key combination.
-- Levels are (url), (url, stepno) and (url, stepno, referral).
-- Columns that are not part of a row's grouping level come back as NULL.
-- NOTE(review): first-step rows also show NULL in referral, so they look the
-- same as the (url, stepno) rollup rows; selecting grouping__id would tell
-- the levels apart.
hive>select
    stepno,
    url,
    referral,
    count(distinct sessionid) as session_counts
from RUT_DEMO
group by
    stepno,
    url,
    referral
grouping sets (
    (url),
    (url, stepno),
    (url, stepno, referral)
);
结果:
+---------+------+-----------+-----------------+--+
| stepno | url | referral | session_counts |
+---------+------+-----------+-----------------+--+
| NULL | A | NULL | 2 |
| NULL | B | NULL | 3 |
| NULL | C | NULL | 3 |
| NULL | D | NULL | 3 |
| NULL | F | NULL | 1 |
| NULL | X | NULL | 1 |
| 1 | A | NULL | 2 |
| 1 | A | NULL | 2 |
| 1 | D | NULL | 1 |
| 1 | D | NULL | 1 |
| 2 | B | NULL | 3 |
| 2 | B | A | 2 |
| 2 | B | D | 1 |
| 3 | C | NULL | 3 |
| 3 | C | B | 3 |
| 4 | D | NULL | 2 |
| 4 | D | C | 2 |
| 4 | X | NULL | 1 |
| 4 | X | C | 1 |
| 5 | F | NULL | 1 |
| 5 | F | X | 1 |
+---------+------+-----------+-----------------+--+