Presto分析模型SQL和UDF函数

  • 分析模型SQL
    • 事件分析
      • 无分组多个指标查询SQL
      • 有分组多个指标查询SQL
    • 留存分析
      • 无分组留存人数/设备SQL
      • 有分组留存人数/设备SQL
      • 无分组流失人数/设备SQL
      • 有分组流失人数/设备SQL
    • LTV分析
      • 无分组LTV分析
      • 有分组LTV分析
    • 分布分析
      • 无分组数组分布分析SQL
      • 有分组分布分析SQL
    • 漏斗分析
      • 无分组漏斗SQL
      • 有分组漏斗SQL
    • 路径分析
      • 以某事件开头的路径分析
      • 以某事件结尾的路径分析
    • 事件明细分析
    • SQL分析
  • UDF函数
    • 标量函数
      • 除法函数
      • 分段函数
    • 聚合函数
      • 分段聚合函数
  • Presto自定义函数部署

将日常的分析指标划分为八大分析模型, 根据此分析模型的SQL模版, 只需要修改事件名和字段名, 便可以快速地进行指标统计。

分析模型SQL

事件分析

多个事件或事件属性的统计分析

无分组多个指标查询SQL
-- Event analysis (no grouping): several metrics per time bucket in one scan.
with result(day,idx0,idx1,idx2,idx3,idx4,idx5) as (
select
cast(e.day as varchar),  -- time bucket (day, hour, minute, month, ...)
-- Metric expressions (swap in whichever metric you need)
-- idx0: distinct users triggering the event
count(distinct case when e.event = 'ta_mp_show' then e.uid else null end), 
-- idx1: total occurrences of the event
count(case when e.event = 'ta_mp_show' then '*' else null end),
-- idx2: occurrences per user (custom divide UDF: zero-safe, 2 decimals)
divide(count(case when e.event = 'ta_mp_show' then '*' else null end), count(distinct case when e.event = 'ta_mp_show' then e.uid else null end)), 
-- idx3: distinct devices
count(distinct case when e.event = 'ta_mp_show' then e.properties['device_no'] else null end), 
-- idx4: occurrences per device
divide(count(case when e.event = 'ta_mp_show' then '*' else null end), count(distinct case when e.event = 'ta_mp_show' then e.properties['device_no'] else null end)),
-- idx5: average of a numeric property (sum/max/min work the same way)
cast(avg(case when e.event = 'ta_mp_show' then cast(element_at(e.properties,'scene') as DOUBLE) else 0 end) as decimal(18,2)) 
from event.action e
-- time range and event filter
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai')))) and e.event in ('ta_mp_show','ta_mp_show','ta_mp_show','ta_mp_show','ta_mp_show','ta_mp_show')
group by e.day
order by (e.day) desc
)
select day, idx0,idx1,idx2,idx3,idx4,idx5 from result

有分组多个指标查询SQL
-- Event analysis (grouped): same metrics, broken down by a group-by property.
with result(day,g0,row_id,idx0,idx1,idx2,idx3,idx4,idx5) as (
select
cast(e.day as varchar),   -- time bucket (day, hour, minute, month, ...)
element_at(e.properties,'#mp_platform'),   -- group-by property
-- rank groups so oversized result sets can be truncated below
row_number() over(partition by e.day order by (element_at(e.properties,'#mp_platform')) desc),
-- Metric expressions (swap in whichever metric you need)
-- idx0: distinct users triggering the event
count(distinct case when e.event = 'ta_mp_show' then e.uid else null end), 
-- idx1: total occurrences of the event
count(case when e.event = 'ta_mp_show' then '*' else null end),
-- idx2: occurrences per user (custom divide UDF: zero-safe, 2 decimals)
divide(count(case when e.event = 'ta_mp_show' then '*' else null end), count(distinct case when e.event = 'ta_mp_show' then e.uid else null end)), 
-- idx3: distinct devices
count(distinct case when e.event = 'ta_mp_show' then e.properties['device_no'] else null end), 
-- idx4: occurrences per device
divide(count(case when e.event = 'ta_mp_show' then '*' else null end), count(distinct case when e.event = 'ta_mp_show' then e.properties['device_no'] else null end)),
-- idx5: average of a numeric property (sum/max/min work the same way)
cast(avg(case when e.event = 'ta_mp_show' then cast(element_at(e.properties,'scene') as DOUBLE) else 0 end) as decimal(18,2)) 
from event.action e
-- time range and event filter
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai')))) and e.event in ('ta_mp_show','ta_mp_show','ta_mp_show','ta_mp_show','ta_mp_show','ta_mp_show')
-- group and sort
group by e.day, element_at(e.properties,'#mp_platform')
order by (e.day) desc
)

select day,g0, idx0,idx1,idx2,idx3,idx4,idx5,
-- derived metric rendered as a percentage string
cast(cast(divide(idx0, idx1) as decimal(18,2)) * 100 as varchar) || '%' as aggIdx0,  
-- derived metric rendered as an integer
cast(cast(divide(idx0, idx1) as decimal(18,2)) as bigint) as aggIdx1, 
-- derived metric rendered with 2 decimal places
cast(divide(idx0, idx1) as decimal(18,2)) as aggIdx2 from result
-- keep only the first 100 groups per day
where row_id <= 100
image.png

留存分析

初始事件到目标事件留存或流失的用户分析

无分组留存人数/设备SQL
-- Retention (no grouping): users/devices doing the start event who come back
-- with the target event n days later.
with number(n) as(
values (1),(3),(5)     -- retention offsets: day 1, day 3, day 5
),tmp1 (day, uid)as(
select distinct
e.day,
-- retained users
e.uid  
-- for device retention use element_at(e.properties, 'device_no') instead
from event.action e
-- start-event filter, e.g. register for register->login retention
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = '$register'
),tmp2(day, uid, idx) as(
select distinct
e.day, 
-- retained users
e.uid,  
-- for device retention use element_at(e.properties, 'device_no') instead
sum(1)  
from event.action e
-- target-event filter, e.g. login for register->login retention
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = '$login'
group by e.day, e.uid
),result(day, n, user_count, idx) as(
select
e.day, 
n, 
count(distinct e.uid),  -- users/devices with the start event
count(distinct ee.uid) -- of those, users/devices retained at day n
from tmp1 e
cross join number
left join tmp2 ee
on e.uid = ee.uid
and ee.day = date_add('day', n, e.day)
where n <= 5  -- must not exceed the largest retention offset
group by e.day, n
order by (e.day, n) asc
)
select
cast(e.day as varchar),
max(user_count),
array_agg(idx)
from result e
group by e.day
order by (e.day) desc
有分组留存人数/设备SQL
-- Retention (grouped): same as above, broken down by a group-by property.
with number(n) as(
values (1),(3),(5)    -- retention offsets: day 1, day 3, day 5
),tmp1 (day, g0, uid)as(
select distinct
e.day, 
element_at(e.properties,'media'),  -- group-by property
-- retained users
e.uid  
-- for device retention use element_at(e.properties, 'device_no') instead
from event.action e
-- start-event filter, e.g. register for register->login retention
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = '$register'
),tmp2(day, g0, uid, idx) as(
select distinct
e.day, element_at(e.properties,'media'), 
-- retained users
e.uid,
-- for device retention use element_at(e.properties, 'device_no') instead
sum(1)
from event.action e
-- target-event filter, e.g. login for register->login retention
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = '$login'
group by e.day, element_at(e.properties,'media'), e.uid
),result(day, g0, n, user_count, idx) as(
select
e.day, e.g0, n, count(distinct e.uid), count(distinct ee.uid)
from tmp1 e
cross join number
left join tmp2 ee
on e.uid = ee.uid
and ee.day = date_add('day', n, e.day) and e.g0 = ee.g0
where n <= 5  -- must not exceed the largest retention offset
group by e.day, e.g0, n
order by (e.day, n) asc
), order_result(day, g0, row_id, user_count, idxList) as (
select
cast(e.day as varchar), e.g0,
-- rank groups so oversized result sets can be truncated below
row_number() over(partition by day order by (e.g0) desc) row_id,
max(user_count),
array_agg(idx)
from result e
-- group and sort
group by e.day, e.g0
order by (e.day, e.g0) desc
)
select day, g0, user_count, idxList from order_result
where row_id <= 100  -- keep only the first 100 groups
无分组流失人数/设备SQL
-- Churn (no grouping): users/devices doing the start event who do NOT come
-- back with the target event n days later.
with number(n) as(
values (1),(3),(5)     -- churn offsets: day 1, day 3, day 5
),tmp1 (day, uid)as(
select distinct
e.day,
-- churned users
e.uid  
-- for device churn use element_at(e.properties, 'device_no') instead
from event.action e
-- start-event filter, e.g. register for register->login churn
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = '$register'
),tmp2(day, uid, idx) as(
select distinct
e.day, 
-- churned users
e.uid,  
-- for device churn use element_at(e.properties, 'device_no') instead
sum(1)  
from event.action e
-- target-event filter, e.g. login for register->login churn
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = '$login'
group by e.day, e.uid
),result(day, n, user_count, idx) as(
select
e.day, 
n, 
count(distinct e.uid),  -- users/devices with the start event
count(distinct case when ee.uid is null then e.uid else null end) -- of those, users/devices with no target event at day n (churned)
from tmp1 e
cross join number
left join tmp2 ee
on e.uid = ee.uid
and ee.day = date_add('day', n, e.day)
where n <= 5  -- must not exceed the largest churn offset
group by e.day, n
order by (e.day, n) asc
)
select
cast(e.day as varchar),
max(user_count),
array_agg(idx)
from result e
group by e.day
order by (e.day) desc
有分组流失人数/设备SQL
-- Churn (grouped): same as above, broken down by a group-by property.
-- (Original comments said "retention"; this template counts churned users.)
with number(n) as(
values (1),(3),(5)    -- churn offsets: day 1, day 3, day 5
),tmp1 (day, g0, uid)as(
select distinct
e.day, 
element_at(e.properties,'media'),  -- group-by property
-- churned users
e.uid  
-- for device churn use element_at(e.properties, 'device_no') instead
from event.action e
-- start-event filter, e.g. register for register->login churn
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = '$register'
),tmp2(day, g0, uid, idx) as(
select distinct
e.day, element_at(e.properties,'media'), 
-- churned users
e.uid,
-- for device churn use element_at(e.properties, 'device_no') instead
sum(1)
from event.action e
-- target-event filter, e.g. login for register->login churn
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = '$login'
group by e.day, element_at(e.properties,'media'), e.uid
),result(day, g0, n, user_count, idx) as(
select
e.day, e.g0, n, count(distinct e.uid),
count(distinct case when ee.uid is null then e.uid else null end) -- users/devices with no target event at day n (churned)
from tmp1 e
cross join number
left join tmp2 ee
on e.uid = ee.uid
and ee.day = date_add('day', n, e.day) and e.g0 = ee.g0
where n <= 5  -- must not exceed the largest churn offset
group by e.day, e.g0, n
order by (e.day, n) asc
), order_result(day, g0, row_id, user_count, idxList) as (
select
cast(e.day as varchar), e.g0,
-- rank groups so oversized result sets can be truncated below
row_number() over(partition by day order by (e.g0) desc) row_id,
max(user_count),
array_agg(idx)
from result e
-- group and sort
group by e.day, e.g0
order by (e.day, e.g0) desc
)
select day, g0, user_count, idxList from order_result
where row_id <= 100  -- keep only the first 100 groups
image.png

LTV分析

初始事件到营收事件的用户收益分析

无分组LTV分析
-- LTV (no grouping): cumulative revenue from users within n days of the start event.
with number(n) as(
values (1),(2),(3) -- LTV horizons: day 1 (same day), day 2, day 3
),tmp1 (day, uid)as(
select distinct
e.day, e.uid
from event.action e
-- start-event filter, e.g. register for register->payment LTV
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = '$register'
),tmp2(day, uid, idx) as(
select
e.day, e.uid, 
-- per-user daily revenue summed from this property
sum(cast(element_at(e.properties,'#vp@revenue_amount') as DOUBLE))
from event.action e
-- revenue-event filter, e.g. payment for register->payment LTV
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = 'payment'
group by e.day, e.uid
),result(day, n, user_count, idx) as(
select
e.day, n, 
count(distinct e.uid),  -- users with the start event
coalesce(sum(idx),0)  -- revenue generated within the first n days
from tmp1 e
cross join number
left join tmp2 ee
on e.uid = ee.uid
and ee.day between e.day and date_add('day', n-1, e.day)
where n <= 3
group by e.day, n
order by (e.day, n) asc
)
select
cast(e.day as varchar),
max(user_count),
array_agg(idx)
from result e
group by e.day
order by (e.day) desc
有分组LTV分析
-- LTV (grouped): same as above, broken down by a group-by property.
with number(n) as(
values (1),(2),(3)   -- LTV horizons: day 1 (same day), day 2, day 3
),tmp1 (day, g0, uid)as(
select distinct
e.day, 
element_at(e.properties,'media'),  -- group-by property
e.uid
from event.action e
-- start-event filter, e.g. register for register->payment LTV
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = '$register'
),tmp2(day, g0, uid, idx) as(
select
e.day, element_at(e.properties,'media'), e.uid, 
-- per-user daily revenue summed from this property
sum(cast(element_at(e.properties,'#vp@revenue_amount') as DOUBLE))
from event.action e
-- revenue-event filter, e.g. payment for register->payment LTV
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day between date_add('day', -3, current_date) and date_add('day', -1, current_date)
and e.event = 'payment'
group by e.day, element_at(e.properties,'media'), e.uid
),result(day, g0, n, user_count, idx) as(
select
e.day, e.g0, n, 
count(distinct e.uid),  -- users with the start event
coalesce(sum(idx),0)  -- revenue generated within the first n days
from tmp1 e
cross join number
left join tmp2 ee
on e.uid = ee.uid
and ee.day between e.day and date_add('day', n-1, e.day) and e.g0 = ee.g0
where n <= 3
group by e.day, e.g0, n
order by (e.day, n) asc
), order_result(day, g0, row_id, user_count, idxList) as (
select
cast(e.day as varchar), e.g0,
-- rank groups so oversized result sets can be truncated below
row_number() over(partition by day order by (e.g0) desc) row_id,
max(user_count),
array_agg(idx)
from result e
group by e.day, e.g0
order by (e.day, e.g0) desc
)
select day, g0, user_count, idxList from order_result
where row_id <= 100
image.png

分布分析

用户事件发生频次或属性数值区间分布分析

无分组数组分布分析SQL
-- Distribution (no grouping): bucket each user's aggregated value into
-- intervals defined by a boundary array.
with number(n,array_data) as (
SELECT
CAST(subscript AS INTEGER) subscrip, ARRAY[60,100,200] -- bucket boundaries
FROM (VALUES(sequence(2,4+1,1))) AS t1(array_subscript) -- 4 = bucket count = boundary array length + 1
CROSS JOIN UNNEST(array_subscript) AS t2(subscript)
), tmp1(day, uid, idx) as (
select
e.day,
e.uid,
-- per-user total of the measured property
sum(COALESCE(cast(element_at(e.properties,'scene') as DOUBLE),0))
from event.action e
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai')))) and e.event = 'ta_mp_show'
group by e.day, e.uid
),tmp2(day, user,min_value,max_value,arr_value) as (
select
day,
count(distinct uid),
min(idx),
max(idx),
array_agg(idx)
from tmp1
group by day
)
select
-- group-by field
cast(day as varchar) "日期",
max(user) "总人数",
array_agg(concat(
case when n-1=1 then '(-∞' else concat('[',cast(array_data[n-2] as varchar)) end,
',',
case when n-1=4 then '+∞)' else concat(cast(array_data[n-1] as varchar),')') end) -- 4 = bucket count (boundary array length + 1)
) gap_list,
array_agg(cardinality(filter(
arr_value, x -> IF(n-1=1,min_value-1,array_data[n-2]) <= x and x< IF(n-1<4,array_data[n-1],max_value+1) ))
) value_list
from tmp2
cross join number
where n-1 <= 4   -- 4 = bucket count (boundary array length + 1)
group by day order by (day) desc
有分组分布分析SQL
-- Distribution (grouped): same as above, broken down by a group-by property.
with number(n,array_data) as (
SELECT
CAST(subscript AS INTEGER) subscrip, ARRAY[60,100,200] -- bucket boundaries
FROM (VALUES(sequence(2,4+1,1))) AS t1(array_subscript) -- 4 = bucket count = boundary array length + 1
CROSS JOIN UNNEST(array_subscript) AS t2(subscript)
), tmp1(day, g0, row_id, uid, idx) as (
select
e.day, 
-- group-by property
element_at(e.properties,'#mp_platform'), 
-- rank groups so only the first 100 are kept, to avoid result-set blow-up
row_number() over(partition by (e.uid, e.day) order by (element_at(e.properties,'#mp_platform')) desc),
e.uid,
-- per-user total of the measured property
sum(COALESCE(cast(element_at(e.properties,'scene') as DOUBLE),0))
from event.action e
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai')))) and e.event = 'ta_mp_show'
group by e.day, element_at(e.properties,'#mp_platform'), e.uid
),tmp2(day, g0, user,min_value,max_value,arr_value) as (
select
day, g0,
count(distinct uid),
min(idx),
max(idx),
array_agg(idx)
from tmp1
where row_id <= 100
group by day, g0
)
select
-- group-by field
cast(day as varchar) "日期", g0,
max(user) "总人数",
array_agg(concat(
case when n-1=1 then '(-∞' else concat('[',cast(array_data[n-2] as varchar)) end,
',',
case when n-1=4 then '+∞)' else concat(cast(array_data[n-1] as varchar),')') end) -- 4 = bucket count (boundary array length + 1)
) gap_list,
array_agg(cardinality(filter(
arr_value, x -> IF(n-1=1,min_value-1,array_data[n-2]) <= x and x< IF(n-1<4,array_data[n-1],max_value+1) ))
) value_list
from tmp2
cross join number
where n-1 <= 4 -- 4 = bucket count (boundary array length + 1)
group by day, g0 order by (day, g0) desc
image.png

漏斗分析

用户在多个事件之间的转换分析

无分组漏斗SQL
-- Funnel (no grouping): tmp0, tmp1, tmpX are the funnel steps, in order.
with
tmp0 ( uid, t) as (
select
e.uid, e.time
from event.action e
where e.event = 'ta_mp_show' and e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
),tmp1 ( uid, t) as (
select
e.uid, e.time
from event.action e
where e.event = 'ta_mp_show' and e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
)
select
count(distinct e0.uid) s0, 
count(distinct e1.uid) s1  
from tmp0 e0
-- interval '1440' minute: the next step must happen within this window after the previous step to count as a conversion
left join tmp1 e1 on e1.uid = e0.uid and e1.t between e0.t and (e0.t + interval '1440' minute)
有分组漏斗SQL
-- Funnel (grouped): tmp0, tmp1, tmpX are the funnel steps, in order.
with
tmp0 (g0, uid, t) as (
select
element_at(e.properties,'#mp_platform'), e.uid, e.time
from event.action e
where e.event = 'ta_mp_show' and e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
),
tmp1 (g0, uid, t) as (
select
element_at(e.properties,'#mp_platform'), e.uid, e.time
from event.action e
where e.event = 'ta_mp_show' and e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
)
select
e0.g0,
count(distinct e0.uid) s0, count(distinct e1.uid) s1
from tmp0 e0
-- interval '1440' minute: the next step must happen within this window after the previous step to count as a conversion
left join tmp1 e1 on e1.uid = e0.uid and e1.t between e0.t and (e0.t + interval '1440' minute) and e0.g0 = e1.g0
group by e0.g0 order by (e0.g0) desc limit 100

image.png

路径分析

以某个事件为开始或结束在多个事件之间的转换分析

以某事件开头的路径分析
-- Path analysis anchored on a STARTING event: builds per-user event sequences
-- and keeps only paths beginning with the anchor event ('用户注册').
with tmp(uid, action, t) as (
select distinct
e.uid,
-- map raw event names to display labels
if(e.event='$register','用户注册', if(e.event='$login','用户登录', if(e.event='payment','内购', '未知' ))),
e.time
from event.action e
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
and e.event in ('$register', '$login', 'payment')
), tmp1(uid, action, t, row_id) as (
-- number each user's events in time order
select
uid, action, t,
row_number() over(partition by uid order by (t,action))
from tmp
order by (uid, t, action)
), tmp2(uid, action, t, lead_action, lead_t, max_row_id) as (
-- pair each event with its successor; sentinel values for the last event
select
uid, action, t,
lead(action, 1, '默认事件') over(partition by uid order by (t,action)),
lead(t, 1, cast('2999-12-31 23:59:59' as timestamp)) over(partition by uid order by (t,action)),
max(row_id) over(partition by uid) max_row_id
from tmp1
left join (values(1),(1)) t(rid) on row_id = rid
), tmp3(uid,path) as (
select uid,
filter(
reduce(
transform(
-- sort by time
zip(
array_sort(zip(array_agg(t),array_agg(action)),
(x, y) -> IF(cast(x[1] as timestamp) < cast(y[1] as timestamp),-1,
IF (cast(x[1] as timestamp) = cast(y[1] as timestamp), 0, 1))),
array_sort(zip(array_agg(lead_t),array_agg(lead_action)),
(x, y) -> IF(cast(x[1] as timestamp) < cast(y[1] as timestamp),-1,
IF (cast(x[1] as timestamp) = cast(y[1] as timestamp), 0, 1)))),
-- tag each step: '0' = anchor (path start), '1' = valid continuation, '-1' = break
x -> ARRAY[
cast(x[2][1] as varchar), x[2][2],
IF(x[2][2] = '用户注册',
'0',
IF(cast(x[1][1] as timestamp) >= (cast(x[2][1] as timestamp) + interval '-1440' minute) and x[1][2]<>x[2][2] and x[2][2] <> '默认事件',
'1',
'-1')
)
]
),
-- 3-D array sentinel to seed the reduce accumulator
array[array[array['2020-10-01 10:00:00.000', '默认事件', '-1']]],
(state, value) -> case
when value[3] = '0' or value[3] = '-1' then
concat(state, array[value])
when value[3] = '1' then
concat(slice(state, 1, cardinality(state) - 1), array[concat(slice(state, -1, 1)[1], array[value])])
end,
state -> slice(state, 2, cardinality(state) - 1)
),
-- drop paths that do not start with the anchor event or have only one step
x -> x[1][3] = '0' and cardinality(x) > 1
) path
from tmp2
where max_row_id>1
group by uid
), result as(
-- flatten the nested array into 'a->b->c' strings
SELECT
uid, replace(ltrim(regexp_replace(str,'([0-9]*)(\p{Punct}*)','')),' ','->') path
FROM tmp3
CROSS JOIN UNNEST(SPLIT(json_format(cast(path as json)) ,']],[[')) t(str)
)
select
path,
count(*) over() all_path_count,
count(*) path_count
from result
where path<>'用户注册' and path<>''
group by path
order by (path_count, path) desc
limit 100
以某事件结尾的路径分析
-- Path analysis anchored on an ENDING event: labels are reversed so the same
-- starting-event machinery applies, then reversed back at output.
with tmp(uid, action, t) as (
select distinct
e.uid,
-- map raw event names to display labels, reversed for end-anchored matching
reverse(if(e.event='$register','用户注册', if(e.event='$login','用户登录', if(e.event='payment','内购', '未知' )))),
e.time
from event.action e
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and ((e.day between date_add('day',-3, date(now() AT Time Zone 'Asia/Shanghai')) and date_add('day', -1, date(now() AT Time Zone 'Asia/Shanghai'))))
and e.event in ('$register', '$login', 'payment')
),tmp1(uid, action, t, row_id) as (
-- number each user's events in REVERSE time order
select
uid, action, t,
row_number() over(partition by uid order by (t,action) desc)
from tmp
),tmp2(uid, action, t, lead_action, lead_t, max_row_id) as (
-- pair each event with its predecessor; sentinel values for the first event
select
uid, action, t,
lead(action, 1, '默认事件') over(partition by uid order by (t,action) desc),
lead(t, 1, cast('1970-01-01 00:00:00' as timestamp)) over(partition by uid order by (t,action) desc),
max(row_id) over(partition by uid) max_row_id
from tmp1
left join (values(1),(1)) t(rid) on row_id = rid
),tmp3(uid,path) as (
select uid,
filter(
reduce(
transform(
-- sort by time (descending)
zip(
array_sort(zip(array_agg(t),array_agg(action)),
(x, y) -> IF(cast(x[1] as timestamp) > cast(y[1] as timestamp),-1,
IF (cast(x[1] as timestamp) = cast(y[1] as timestamp), 0, 1))),
array_sort(zip(array_agg(lead_t),array_agg(lead_action)),
(x, y) -> IF(cast(x[1] as timestamp) > cast(y[1] as timestamp),-1,
IF (cast(x[1] as timestamp) = cast(y[1] as timestamp), 0, 1)))),
-- tag each step: '0' = anchor (path end), '1' = valid continuation, '-1' = break
x -> ARRAY[
cast(x[2][1] as varchar), x[2][2],
IF(x[2][2] = reverse('内购'),
'0',
IF(cast(x[1][1] as timestamp) <= (cast(x[2][1] as timestamp) + interval '1440' minute) and x[1][2]<>x[2][2] and x[2][2] <> '默认事件',
'1',
'-1')
)
]
),
-- 3-D array sentinel to seed the reduce accumulator
array[array[array['2020-10-01 10:00:00.000', '默认事件', '-1']]],
(state, value) -> case
when value[3] = '0' or value[3] = '-1' then
concat(state, array[value])
when value[3] = '1' then
concat(slice(state, 1, cardinality(state) - 1), array[concat(slice(state, -1, 1)[1], array[value])])
end,
state -> slice(state, 2, cardinality(state) - 1)
),
-- drop paths that do not end with the anchor event or have only one step
x -> x[1][3] = '0' and cardinality(x) > 1
) path
from tmp2
where max_row_id>1
group by uid
), result as(
-- flatten the nested array; '>-' becomes '->' once the string is reversed below
SELECT
uid,replace(ltrim(regexp_replace(str,'([0-9]*)(\p{Punct}*)','')),' ','>-') path
FROM tmp3
CROSS JOIN UNNEST(SPLIT(json_format(cast(path as json)) ,']],[[')) t(str)
)
select
reverse(path) path,
count(*) over() all_path_count,
count(*) path_count
from result
where path<>reverse('内购') and path<>''
group by path
order by (path_count, path) desc
limit 100
image.png

事件明细分析

查看和多维度筛选事件日志

-- Event detail query: raw event log with typed property columns, filterable
-- by any property; limited to the latest 100 rows.
select
e.uid "账号ID",
e.uid_type "账号类型",
element_at(e.properties,'#mp_platform') "小程序平台",
element_at(e.properties,'#os') "操作系统",
cast(element_at(e.properties,'#zone_offset') as DOUBLE) "时区偏移",
cast(element_at(e.properties,'life_time') as INTEGER) "生命周期天数",
cast(element_at(e.properties,'scene') as DOUBLE) "场景值",

format_datetime(e.time, 'yyyy-MM-dd HH:mm:ss') "时间",
e.timezone "时区",
format_datetime(e.timestamp, 'yyyy-MM-dd HH:mm:ss') "采集时间"
from event.action e
where e.game_id = 9 and e.timezone in ('Asia/Shanghai')
and e.day >= date_add('day',-2, date(now() AT Time Zone 'Asia/Shanghai')) and e.minute <= format_datetime(date_add('minute', -1, now() AT Time Zone 'Asia/Shanghai'),'yyyy-MM-dd HH:mm') and e.event = 'ta_mp_show'
and (element_at(e.properties,'#mp_platform') = '平台1')  -- property filter
order by (e.time) desc
limit 100
image.png

SQL分析

支持对全量数据的SQL查询和可视化,在通用的分析模型不支持的时候, 可以使用SQL制作复杂报表

image.png

UDF函数

标量函数

除法函数

除0特殊处理, 除法保留2位小数, 支持不同数值类型相除

package scalarFunctions;

import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.spi.function.Description;
import com.facebook.presto.spi.function.ScalarFunction;
import com.facebook.presto.spi.function.SqlType;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;

import static com.facebook.presto.common.type.StandardTypes.DOUBLE;
import static com.facebook.presto.common.type.StandardTypes.INTEGER;

public class avgPrestoFunction {
    /**
     * Rounds a value to 2 decimal places.
     *
     * Replaces the original Double.parseDouble(String.format("%.2f", v)) round-trip:
     * String.format uses the JVM default locale, so in locales with a comma decimal
     * separator (e.g. de_DE) it produces "0,50" and parseDouble throws
     * NumberFormatException at query time. Math.round is locale-independent.
     * (For values near Long.MAX_VALUE / 100 the multiply could overflow, but metric
     * ratios in these templates are nowhere near that range.)
     */
    private static double round2(double value) {
        return Math.round(value * 100.0) / 100.0;
    }

    /** divide(int, int): zero-safe division, result rounded to 2 decimals. */
    @ScalarFunction("divide")//function name
    @Description("(double) (a/b)")//function description
    @SqlType(DOUBLE)//return type
    public static double avg(@SqlType(INTEGER) long numerator, // dividend
                             @SqlType(INTEGER) long denominator) { // divisor
        if (denominator == 0) {
            // Division by zero yields 0 instead of failing the whole query.
            return 0.0;
        }
        return round2((double) numerator / denominator);
    }

    /** divide(int, double): zero-safe division, result rounded to 2 decimals. */
    @ScalarFunction("divide")//function name
    @Description("(double) (a/b)")//function description
    @SqlType(DOUBLE)//return type
    public static double avg(@SqlType(INTEGER) long numerator, // dividend
                             @SqlType(DOUBLE) double denominator) { // divisor
        if (denominator == 0) {
            return 0.0;
        }
        return round2(numerator / denominator);
    }

    /** divide(double, int): zero-safe division, result rounded to 2 decimals. */
    @ScalarFunction("divide")//function name
    @Description("(double) (a/b)")//function description
    @SqlType(DOUBLE)//return type
    public static double avg(@SqlType(DOUBLE) double numerator, // dividend
                             @SqlType(INTEGER) long denominator) { // divisor
        if (denominator == 0) {
            return 0.0;
        }
        return round2(numerator / denominator);
    }

    /** divide(double, double): zero-safe division, result rounded to 2 decimals. */
    @ScalarFunction("divide")//function name
    @Description("(double) (a/b)")//function description
    @SqlType(DOUBLE)//return type
    public static double avg(@SqlType(DOUBLE) double numerator, // dividend
                             @SqlType(DOUBLE) double denominator) { // divisor
        if (denominator == 0) {
            return 0.0;
        }
        return round2(numerator / denominator);
    }
}

分段函数

将当前数据划分到某个区间端, 用于分布分析模型当中

package scalarFunctions;

import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.spi.function.Description;
import com.facebook.presto.spi.function.ScalarFunction;
import com.facebook.presto.spi.function.SqlType;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;

import java.text.SimpleDateFormat;
import java.util.Date;

public class toIntervalFunction {
    /**
     * toIntervalUDF(input, n, gap): maps a value onto one of n+1 interval labels:
     * (-∞,gap), [gap,2*gap), ..., [n*gap,+∞). Used by the distribution model.
     *
     * Fix: the original used `input <= gap` for the first branch, which wrongly
     * placed input == gap into "(-∞,gap)" even though that label excludes gap.
     * `<` matches both the label semantics and the identical bucketing logic in
     * specialIntervalFunction.output().
     */
    @ScalarFunction("toIntervalUDF")//function name
    @Description("Number and spacing of specified segments, int to Interval function")//function description
    @SqlType(StandardTypes.VARCHAR)//return type
    public static Slice toInterval(@SqlType(StandardTypes.BIGINT) long input, // value to bucket
                                   @SqlType(StandardTypes.BIGINT) long n,     // number of segments
                                   @SqlType(StandardTypes.BIGINT) long gap) {  // segment width
        String interval = "";
        if (input < gap) {
            interval = "(-∞," + gap +")";
        } else {
            int i = (int) (input / gap);
            if (i == 0) {
                interval = "(-∞," + gap +")";
            } else if (i < n) {
                interval = "[" + i * gap + "," + (i+1) * gap + ")";
            } else {
                interval = "[" + n * gap + "," + "+∞)";
            }
        }
        return Slices.utf8Slice(interval);
    }
}

聚合函数

分段聚合函数

统计出该列的最小值,最大值,按照指定段数进行分段

package aggregationFunctions;

import com.facebook.presto.common.block.BlockBuilder;
import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.common.type.VarcharType;
import com.facebook.presto.spi.function.*;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;

import static com.facebook.presto.common.type.StandardTypes.VARCHAR;

@AggregationFunction("toIntervalUDAF")//函数名称
@Description("参数:(聚合操作名,聚合值,段数,区间间隔) 指定聚合操作进行聚合,将聚合结果转为指定区间范围")
public class specialIntervalFunction {
  // 累计状态接口
  public interface specialState extends AccumulatorState {
    // 最大值
    double getMax();

    void setMax(double value);

    // 最小值
    double getMin();

    void setMin(double value);

    // 总和
    double getSum();

    void setSum(double sum);

    // 段数
    long getNum();

    void setNum(long num);

    // 区间间隔
    long getGap();

    void setGap(long gap);

    // 总个数
    long getCount();

    void setCount(long value);

    // 聚合操作类型
    Slice getOperation();

    void setOperation(Slice operation);
  }

  @InputFunction
  public static void input(specialState state,
                           @SqlType(VARCHAR) Slice operation,  // 聚合操作名: sum, max, min, avg
                           @SqlType(StandardTypes.BIGINT) long input, // 需要聚合的数值
                           @SqlType(StandardTypes.BIGINT) long n,     // 指定段数
                           @SqlType(StandardTypes.BIGINT) long gap) {  // 指定区间间隔
    // 第一次初始化state
    if (state.getCount() <= 0) {
      double value = Double.parseDouble(String.valueOf(input));
      // 初始化聚合操作类型
      state.setOperation(operation);
      // 初始化段数
      state.setNum(n);
      // 初始化最大值
      state.setMax(value);
      // 初始化最小值
      state.setMin(value);
      // 初始化区间间隔
      state.setGap(gap);
      // 统计总和
      state.setSum(state.getSum() + value);
      // 计数+1
      state.setCount(state.getCount() + 1);
    } else {// 非第一次
      double value = Double.parseDouble(String.valueOf(input));
      if (state.getOperation().toStringUtf8().toLowerCase().equals("max")) {
        // 记录最大值
        if (state.getMax() < value) state.setMax(value);
      } else if (state.getOperation().toStringUtf8().toLowerCase().equals("min")) {
        // 记录最小值
        if (state.getMin() > value) state.setMin(value);
      } else {
        // 统计总和 --sum和avg都需要求总和
        state.setSum(state.getSum() + value);
      }
      // 计数+1
      state.setCount(state.getCount() + 1);
    }
  }

  @InputFunction
  public static void input2(specialState state,
                           @SqlType(VARCHAR) Slice operation,  // 聚合操作名: sum, max, min, avg
                           @SqlType(StandardTypes.INTEGER) long input, // 需要聚合的数值
                           @SqlType(StandardTypes.BIGINT) long n,     // 指定段数
                           @SqlType(StandardTypes.BIGINT) long gap) {  // 指定区间间隔
    // 第一次初始化state
    if (state.getCount() <= 0) {
      double value = Double.parseDouble(String.valueOf(input));
      // 初始化聚合操作类型
      state.setOperation(operation);
      // 初始化段数
      state.setNum(n);
      // 初始化最大值
      state.setMax(value);
      // 初始化最小值
      state.setMin(value);
      // 初始化区间间隔
      state.setGap(gap);
      // 统计总和
      state.setSum(state.getSum() + value);
      // 计数+1
      state.setCount(state.getCount() + 1);
    } else {// 非第一次
      double value = Double.parseDouble(String.valueOf(input));
      if (state.getOperation().toStringUtf8().toLowerCase().equals("max")) {
        // 记录最大值
        if (state.getMax() < value) state.setMax(value);
      } else if (state.getOperation().toStringUtf8().toLowerCase().equals("min")) {
        // 记录最小值
        if (state.getMin() > value) state.setMin(value);
      } else {
        // 统计总和 --sum和avg都需要求总和
        state.setSum(state.getSum() + value);
      }
      // 计数+1
      state.setCount(state.getCount() + 1);
    }
  }


  @InputFunction
  public static void input(specialState state,
                           @SqlType(VARCHAR) Slice operation,  // 聚合操作名: sum, max, min, avg
                           @SqlType(StandardTypes.DOUBLE) double input, // 需要聚合的数值
                           @SqlType(StandardTypes.BIGINT) long n,     // 指定段数
                           @SqlType(StandardTypes.BIGINT) long gap) {  // 指定区间间隔
    // 第一次初始化state
    if (state.getCount() <= 0) {
      // 初始化聚合操作类型
      state.setOperation(operation);
      // 初始化段数
      state.setNum(n);
      // 初始化最大值
      state.setMax(input);
      // 初始化最小值
      state.setMin(input);
      // 初始化区间间隔
      state.setGap(gap);
      // 统计总和
      state.setSum(state.getSum() + input);
      // 计数+1
      state.setCount(state.getCount() + 1);
    } else {// 非第一次
      if (state.getOperation().toStringUtf8().toLowerCase().equals("max")) {
        // 记录最大值
        if (state.getMax() < input) state.setMax(input);
      } else if (state.getOperation().toStringUtf8().toLowerCase().equals("min")) {
        // 记录最小值
        if (state.getMin() > input) state.setMin(input);
      } else {
        // 统计总和 --sum和avg都需要求总和
        state.setSum(state.getSum() + input);
      }
      // 计数+1
      state.setCount(state.getCount() + 1);
    }
  }

  @CombineFunction
  public static void combine(specialState state, specialState otherState) {
    if (state.getCount() != 0) {
      state.setCount(state.getCount() + otherState.getCount());//合并个数
      state.setMax(Math.max(state.getMax(), otherState.getMax()));//合并最大值
      state.setMin(Math.min(state.getMin(), otherState.getMin()));//合并最小值
      state.setSum(state.getSum() + otherState.getSum());//合并总和
      state.setNum(state.getNum());//合并段数:段数都一样
      state.setGap(state.getGap());//合并区间间隔: 区间间隔都一样
      state.setOperation(state.getOperation());//合并操作类型: 操作类型都一样
    } else {
      state.setCount(otherState.getCount());//个数
      state.setMax(otherState.getMax());//最大值
      state.setMin(otherState.getMin());//最小值
      state.setSum(otherState.getSum());//总和
      state.setNum(otherState.getNum());//段数
      state.setGap(otherState.getGap());//区间间隔
      state.setOperation(otherState.getOperation());//操作类型
    }
  }

  @OutputFunction(StandardTypes.VARCHAR)
  public static void output(specialState state, BlockBuilder out) {
    String operation = state.getOperation().toStringUtf8();//操作类型
    long n = state.getNum();//段数
    long gap = state.getGap();//区间
    long count = state.getCount();//个数
    double max = state.getMax();//最大值
    double min = state.getMin();//最小值
    double sum = state.getSum();//总和

    // 没有数据返回null
    if (count <= 0) {
      out.appendNull();
      return;
    }

    double value;

    if (operation.toLowerCase().equals("avg")) {
      double v = sum / count;
      value =  v;
    } else if (operation.toLowerCase().equals("max")) {
      value = max;
    } else if (operation.toLowerCase().equals("min")) {
      value = min;
    } else {
      value = sum;
    }

    String interval = "";
    if (value < gap) {
      interval = "(-∞," + gap + ")";
    } else {
      int i = (int) (value / gap);
      if (i == 0) {
        interval = "(-∞," + gap + ")";
      } else if (i < n) {
        interval = "[" + i * gap + "," + (i + 1) * gap + ")";
      } else {
        interval = "[" + n * gap + "," + "+∞)";
      }
    }
    Slice slice = Slices.utf8Slice(interval);
    VarcharType.VARCHAR.writeSlice(out, slice);
  }
}

Presto自定义函数部署

将jar放在presto插件位置/usr/lib/presto/plugin, 重启presto即可

系列文章

第一篇: Ambari自动化部署
第二篇: 数据埋点设计和SDK源码
第三篇: 数据采集和验证方案
第四篇: ETL实时方案: Kafka->Flink->Hive
第五篇: ETL用户数据处理: kafka->spark->kudu
第六篇: Presto分析模型SQL和UDF函数
第七篇: 用户画像和留存预测模型

你可能感兴趣的:(Presto分析模型SQL和UDF函数)