1、背景:通过业务埋点数据,统计用户在页面的停留时间
样例数据,样例数据存入表tmp,
有如下字段用户ID:uid、动作时间戳:time、页面名称pn、动作名称action
SELECT '12345' AS uid, '1695613731020' AS time, '搜索' AS pn, 'click' AS action
UNION ALL
SELECT '12345' AS uid, '1695613732021' AS time, '搜索' AS pn, 'click' AS action
UNION ALL
SELECT '12345' AS uid, '1695613734024' AS time, '搜索' AS pn, 'click' AS action
UNION ALL
SELECT '12345' AS uid, '1695613737036' AS time, '列表' AS pn, 'click' AS action
UNION ALL
SELECT '12345' AS uid, '1695613738037' AS time, '列表' AS pn, 'click' AS action
UNION ALL
SELECT '12345' AS uid, '1695613740040' AS time, '列表' AS pn, 'click' AS action
uid | time | pn | action |
12345 | 1695613731020 | 搜索 | click |
12345 | 1695613732021 | 搜索 | click |
12345 | 1695613734024 | 搜索 | click |
12345 | 1695613737036 | 列表 | click |
12345 | 1695613738037 | 列表 | click |
12345 | 1695613740040 | 列表 | click |
思路:以用户维度按时间进行升序排列,通过lag函数找到上一个时间动作last_pn
SELECT uid, time, pn, row_number() OVER (PARTITION BY uid ORDER BY time DESC) AS rn
, lag(pn, 1) OVER (PARTITION BY uid ORDER BY time ASC) AS last_pn
FROM tmp
rn排序的作用是找到最后一个动作,然后进行剔除
uid | time | pn | rn | last_pn |
12345 | 1695613731020 | 搜索 | 6 | |
12345 | 1695613732021 | 搜索 | 5 | 搜索 |
12345 | 1695613734024 | 搜索 | 4 | 搜索 |
12345 | 1695613737036 | 列表 | 3 | 搜索 |
12345 | 1695613738037 | 列表 | 2 | 列表 |
12345 | 1695613740040 | 列表 | 1 | 列表 |
然后将发生页面变化的节点进行标记,
SELECT *, if(pn <> nvl(last_pn, '空') OR rn = 1, 1, 0) AS label
FROM (
SELECT uid, time, pn, row_number() OVER (PARTITION BY uid ORDER BY time DESC) AS rn
, lag(pn, 1) OVER (PARTITION BY uid ORDER BY time ASC) AS last_pn
FROM tmp
) t
uid | time | pn | rn | last_pn | label |
12345 | 1695613731020 | 搜索 | 6 | 1 | |
12345 | 1695613732021 | 搜索 | 5 | 搜索 | 0 |
12345 | 1695613734024 | 搜索 | 4 | 搜索 | 0 |
12345 | 1695613737036 | 列表 | 3 | 搜索 | 1 |
12345 | 1695613738037 | 列表 | 2 | 列表 | 0 |
12345 | 1695613740040 | 列表 | 1 | 列表 | 1 |
之后统计停留时间就可以只看label =1的日志之间的时间差即可,全部代码如下,
WITH tmp AS (
SELECT '12345' AS uid, '1695613731020' AS time, '搜索' AS pn, 'click' AS action
UNION ALL
SELECT '12345' AS uid, '1695613732021' AS time, '搜索' AS pn, 'click' AS action
UNION ALL
SELECT '12345' AS uid, '1695613734024' AS time, '搜索' AS pn, 'click' AS action
UNION ALL
SELECT '12345' AS uid, '1695613737036' AS time, '列表' AS pn, 'click' AS action
UNION ALL
SELECT '12345' AS uid, '1695613738037' AS time, '列表' AS pn, 'click' AS action
UNION ALL
SELECT '12345' AS uid, '1695613740040' AS time, '列表' AS pn, 'click' AS action
)
SELECT *
FROM (
SELECT uid, pn, time, CAST((lead(time, 1) OVER (PARTITION BY uid ORDER BY time ASC) - time) / 1000 AS BIGINT) AS stay_time
FROM (
SELECT *
, if(pn <> nvl(last_pn, '空')
OR rn = 1, 1, 0) AS label
FROM (
SELECT uid, time, pn, row_number() OVER (PARTITION BY uid ORDER BY time DESC) AS rn
, lag(pn, 1) OVER (PARTITION BY uid ORDER BY time ASC) AS last_pn
FROM tmp
) t
) tt
WHERE label = 1
) ttt
WHERE stay_time IS NOT NULL
最终统计结果如下
uid | pn | time | stay_time |
12345 | 搜索 | 1695613731020 | 6 |
12345 | 列表 | 1695613737036 | 3 |