MySql实现用户留存率分析


  • 明确概念:某一用户在第N天仍然登录的比例,称为第N日留存率,一般会按照每隔1单位时间(例日、周、月)来进行统计

  • 创建测试表,插入测试数据
# 建立用户注册表(id,用户名,注册时间)
CREATE TABLE t_user (id int AUTO_INCREMENT PRIMARY KEY,username nvarchar(25) NOT NULL,register_time datetime NOT NULL);
#改大循环次数
SET SESSION cte_max_recursion_depth=50000; 
#插入递归生成的测试用户
INSERT INTO t_user
WITH RECURSIVE cte AS (SELECT 1 n, '2022-06-16 00:00:00' d UNION ALL
	SELECT n+1,d+INTERVAL '10' MINUTE FROM cte WHERE n<3000
)
SELECT NULL,concat('user',n),d FROM cte;
# 建立用户登录表(id,用户id,登录时间)
CREATE TABLE t_user_login(id int AUTO_INCREMENT PRIMARY KEY,uid int NOT NULL,login_time datetime NOT NULL);
#插入递归生成的随机登录数据
INSERT INTO t_user_login
WITH RECURSIVE cte AS (SELECT 1 n,CEIL(rand()*3000) id,'2022-06-16 00:00:00' d UNION ALL
	SELECT n+1,CEIL(rand()*3000) id,d+INTERVAL CEIL(n/200) SECOND FROM cte WHERE n<50000)
SELECT NULL,CEIL(id),d FROM cte;
# 删除不符合逻辑的数据(登录时间在注册时间之前)【不删除也不影响测试结果的准确度】
DELETE FROM t_user_login t2 WHERE EXISTS (SELECT * FROM t_user t1 WHERE t1.id=t2.uid AND login_time<=register_time);
  • 准备工作完成,开始进行用户留存率分析
-- 原始方法:使用连接查询,通过count统计人数,多次join效率低下
SELECT date(tu.register_time) reg_date,
	100*count(DISTINCT(tul1.uid))/count(DISTINCT(tu.id)) rr1,
	100*count(DISTINCT(tul3.uid))/count(DISTINCT(tu.id)) rr3,
	100*count(DISTINCT(tul7.uid))/count(DISTINCT(tu.id)) rr7,
	100*count(DISTINCT(tul30.uid))/count(DISTINCT(tu.id)) rr30
	FROM t_user tu 
	LEFT JOIN t_user_login tul1 ON tu.id=tul1.uid AND date(tul1.login_time)=date(register_time) + INTERVAL '1' day
	LEFT JOIN t_user_login tul3 ON tu.id=tul3.uid AND date(tul3.login_time)=date(register_time) + INTERVAL '3' day
	LEFT JOIN t_user_login tul7 ON tu.id=tul7.uid AND date(tul7.login_time)=date(register_time) + INTERVAL '7' day
	LEFT JOIN t_user_login tul30 ON tu.id=tul30.uid AND date(tul30.login_time)=date(register_time) + INTERVAL '30' day
	GROUP BY date(tu.register_time);
-- 使用窗口函数,通过计数的最后一位统计总人数,效率更高[dense_rank()与rank()相比有一个去重的作用]
WITH t1 AS(
SELECT tu.id,date(register_time) reg_date,date(login_time) log_date,tul.uid,
	DENSE_RANK() over(PARTITION BY date(register_time) ORDER BY tu.id) daily_reg,
	DENSE_RANK() over(PARTITION BY date(register_time),date(login_time) ORDER BY tul.uid) daily_log
	FROM t_user tu LEFT JOIN t_user_login tul ON tu.id=tul.uid 
	AND (date(tul.login_time)=date(register_time) + INTERVAL '1' DAY OR 
		 date(tul.login_time)=date(register_time) + INTERVAL '3' DAY OR 
		 date(tul.login_time)=date(register_time) + INTERVAL '7' DAY OR 
		 date(tul.login_time)=date(register_time) + INTERVAL '30' DAY)),
	 t2 AS(
SELECT reg_date,log_date,max(daily_reg) daily_reg,max(daily_log) daily_log FROM t1 GROUP BY reg_date,log_date)
SELECT reg_date,
	100*max(CASE WHEN log_date=reg_date+ INTERVAL '1' DAY THEN daily_log END)/max(daily_reg) rr1,
	100*max(CASE WHEN log_date=reg_date+ INTERVAL '3' DAY THEN daily_log END)/max(daily_reg) rr3,
	100*max(CASE WHEN log_date=reg_date+ INTERVAL '7' DAY THEN daily_log END)/max(daily_reg) rr7,
	100*max(CASE WHEN log_date=reg_date+ INTERVAL '30' DAY THEN daily_log END)/max(daily_reg) rr30
	FROM t2 GROUP BY reg_date;
  • 知识点总结:
    1.相关函数和关键字:随机函数rand()、向上取整函数ceil()、日期时间间隔关键字interval、取日期函数date();
    2.MySQL8.0新增功能:cte公共表达式(common table expression),使用with as关键字实现递归和临时表的建立;
    3.MySQL8.0新增功能:窗口函数:rank/dense_rank/row_number/聚合函数/…() over(partition by 列名分区 order by 列名排序) 【通过窗口函数,可方便实现排名,计数,统计等功能】
    4、使用case when… then… end实现行列转换。

  • 参考视频:SQL面试题:用户留存率分析 by:董旭阳TonyDong

-- 补充方法:连接一次,通过datediff()函数判断间隔天数。
SELECT date(tu.register_time) reg_date,
	100*count(DISTINCT(IF(datediff(login_time,register_time)=1,tul.uid,null)))/count(DISTINCT(tu.id)) rr1,
	100*count(DISTINCT(IF(datediff(login_time,register_time)=3,tul.uid,null)))/count(DISTINCT(tu.id)) rr3,
	100*count(DISTINCT(IF(datediff(login_time,register_time)=7,tul.uid,null)))/count(DISTINCT(tu.id)) rr7,
	100*count(DISTINCT(IF(datediff(login_time,register_time)=30,tul.uid,null)))/count(DISTINCT(tu.id)) rr30
	FROM t_user tu 
	LEFT JOIN t_user_login tul ON tu.id=tul.uid GROUP BY date(tu.register_time);
  • 参考文章:MySQL留存分析 by:Lin王发林

你可能感兴趣的:(SQL,mysql,sql,数据库)