间断连续登录天数问题
问题:
统计每个用户的最大连续登录天数。相邻两次登录日期相差不超过 2 天(即中间最多间隔一天)也算连续登录; 例如: 用户在 1、3、5、8 日登录, 其中 1~5 日算作连续登录 5 天(8 日与 5 日相差 3 天, 不计入该段)。
1- 数据准备
-- Sample login fixture: one row per login event, as (user_id, active_date).
-- Both columns are string literals; active_date is written as 'YYYY-MM-DD'.
-- The data deliberately contains repeated rows (e.g. 10003 on 2023-02-04 and
-- 10004 on 2023-02-08) and is not sorted by date.
WITH user_active_info AS (
    SELECT
        user_id
        , active_date
    FROM (
        VALUES
            ('10001', '2023-02-01')
            , ('10001', '2023-02-03')
            , ('10001', '2023-02-08')
            , ('10001', '2023-02-05')
            , ('10002', '2023-02-02')
            , ('10002', '2023-02-10')
            , ('10002', '2023-02-04')
            , ('10002', '2023-02-05')
            , ('10002', '2023-02-07')
            , ('10003', '2023-02-02')
            , ('10003', '2023-02-03')
            , ('10003', '2023-02-04')
            , ('10003', '2023-02-04')
            , ('10003', '2023-02-06')
            , ('10003', '2023-02-09')
            , ('10003', '2023-02-08')
            , ('10004', '2023-02-03')
            , ('10004', '2023-02-04')
            , ('10004', '2023-02-06')
            , ('10004', '2023-02-09')
            , ('10004', '2023-02-08')
            , ('10004', '2023-02-08')
            , ('10005', '2023-02-02')
            , ('10005', '2023-02-05')
            , ('10005', '2023-02-06')
            , ('10005', '2023-02-09')
    ) AS login_rows(user_id, active_date)
)
2- 代码实现
-- t_1: one row per distinct (user_id, active_date), paired with that user's
-- previous login date. A user's earliest login has no predecessor, so LAG
-- falls back to the sentinel '1970-01-01'.
, t_1 AS (
    SELECT
        dedup.user_id
        , dedup.active_date
        , LAG(dedup.active_date, 1, '1970-01-01') OVER (
            PARTITION BY dedup.user_id
            ORDER BY dedup.active_date
        ) AS lag1_date
    FROM (
        -- Collapse repeated logins on the same day into a single row.
        SELECT DISTINCT
            user_id
            , active_date
        FROM user_active_info
    ) AS dedup
)
| user_id | active_date | lag1_date |
| --- | --- | --- |
| 10001 | 2023-02-01 | 1970-01-01 |
| 10001 | 2023-02-03 | 2023-02-01 |
| 10001 | 2023-02-05 | 2023-02-03 |
| 10001 | 2023-02-08 | 2023-02-05 |
| 10002 | 2023-02-02 | 1970-01-01 |
| 10002 | 2023-02-04 | 2023-02-02 |
| 10002 | 2023-02-05 | 2023-02-04 |
| 10002 | 2023-02-07 | 2023-02-05 |
| 10002 | 2023-02-10 | 2023-02-07 |
| 10003 | 2023-02-02 | 1970-01-01 |
| 10003 | 2023-02-03 | 2023-02-02 |
| 10003 | 2023-02-04 | 2023-02-03 |
| … | … | … |
-- t_2: assign a streak number (flag) to every login row of t_1 and build a
-- per-user streak key (user_id_flag = '<user_id>_<flag>').
, t_2 AS (
    SELECT
        user_id
        , active_date
        , lag1_date
        , flag
        , CONCAT(user_id, '_' ,flag) AS user_id_flag
    FROM (
        SELECT
            user_id
            , active_date
            , lag1_date
            -- Running count of "breaks": a login more than 2 days after the
            -- previous one opens a new streak. Each user's first row is
            -- compared against the '1970-01-01' sentinel from t_1, which is
            -- why the sample output numbers streaks from 1.
            , SUM(
                CASE
                    WHEN DATEDIFF(active_date, lag1_date) > 2 THEN 1
                    ELSE 0
                END
            ) OVER (PARTITION BY user_id ORDER BY active_date) AS flag
        FROM t_1
    ) AS streaks
)
| user_id | active_date | lag1_date | flag | user_id_flag |
| --- | --- | --- | --- | --- |
| 10001 | 2023-02-01 | 1970-01-01 | 1 | 10001_1 |
| 10001 | 2023-02-03 | 2023-02-01 | 1 | 10001_1 |
| 10001 | 2023-02-05 | 2023-02-03 | 1 | 10001_1 |
| 10001 | 2023-02-08 | 2023-02-05 | 2 | 10001_2 |
| 10002 | 2023-02-02 | 1970-01-01 | 1 | 10002_1 |
| 10002 | 2023-02-04 | 2023-02-02 | 1 | 10002_1 |
| 10002 | 2023-02-05 | 2023-02-04 | 1 | 10002_1 |
| 10002 | 2023-02-07 | 2023-02-05 | 1 | 10002_1 |
| 10002 | 2023-02-10 | 2023-02-07 | 2 | 10002_2 |
| 10003 | 2023-02-02 | 1970-01-01 | 1 | 10003_1 |
| 10003 | 2023-02-03 | 2023-02-02 | 1 | 10003_1 |
| … | … | … | … | … |
-- streak_span: length of each streak in calendar days, measured from its
-- first to its last login inclusive (hence the + 1). Gap days inside a streak
-- are counted too, e.g. logins on the 1st, 3rd and 5th give 5 days.
, streak_span AS (
    SELECT
        user_id
        , user_id_flag
        , DATEDIFF(MAX(active_date), MIN(active_date)) + 1 AS continue_days
    FROM t_2
    GROUP BY
        user_id
        , user_id_flag
)
-- Final answer: the longest streak per user.
SELECT
    user_id
    , MAX(continue_days) AS max_continue_days
FROM streak_span
GROUP BY
    user_id
;
中间结果(按 user_id、user_id_flag 分组后的每段连续天数):

| user_id | user_id_flag | continue_days |
| --- | --- | --- |
| 10001 | 10001_1 | 5 |
| 10001 | 10001_2 | 1 |
| 10002 | 10002_1 | 6 |
| 10002 | 10002_2 | 1 |
| 10003 | 10003_1 | 8 |
| 10004 | 10004_1 | 7 |
| 10005 | 10005_1 | 1 |
| 10005 | 10005_2 | 2 |
| 10005 | 10005_3 | 1 |
最终结果(每个用户的最大连续登录天数):

| user_id | max_continue_days |
| --- | --- |
| 10001 | 5 |
| 10002 | 6 |
| 10003 | 8 |
| 10004 | 7 |
| 10005 | 2 |
3- 总结
- 本题是"连续登录天数"问题的升级版: 允许相邻登录之间最多间隔一天
- 解法可结合"浏览窗口(会话)划分"的思路: 先用 LAG 取上一次登录日期, 间隔超过阈值时标记为新分段, 再对标记做累计求和得到分段编号
end