笔者从事大数据行业快三年时间,在用户行为分析方向也有所沉淀。现在整理一些常用的 SQL ,希望本文对用户行为分析的初学者的 SQL 学习有所帮助。
行业内用的数据系统各式各样,在 《华为数据之道》一书中描述的 “华为” 的数据中台的五种数据中台联接方式:以业务流(事件)为中心联接、以对象(主体)为中心联接、智能标签、报告数据、算法模型。大部分企业的最基础的数据模型就是 “用户行为” 和 "用户主体"。
现在假设大家的数据系统里都有两张大宽表,一张记录 “用户行为”,我们称 events 表,一张记录 “用户主体”,我们称 users 表。表的结构大致是:
Events 表
user_id | event | date | time |
---|---|---|---|
1 | visit | 2021-01-01 | 2021-01-01 01:31:07.474 |
2 | visit | 2021-01-01 | 2021-01-01 01:32:03.674 |
3 | payorder | 2021-01-01 | 2021-01-01 01:33:12.444 |
Users 表
id | first_id | second_id | age | sex | city |
---|---|---|---|---|---|
1 | android_1 | red | 13 | 男 | 合肥,淮北 |
2 | android_1 | green | 14 | 男 | 北京,上海 |
3 | ios_1 | blue | 22 | 女 | 深圳 |
日访问量( PV ):
SELECT
COUNT(*) AS "今日总访问次数"
FROM
EVENTS
WHERE
date=CURRENT_DATE() AND event='visit'
日活跃用户数( UV ):
SELECT
COUNT(DISTINCT user_id) AS "今日独立用户数"
FROM
EVENTS
WHERE
date=CURRENT_DATE() AND event='visit'
最近七天日活:
SELECT
date,COUNT(DISTINCT user_id) AS "今日独立用户数"
FROM
EVENTS
WHERE
event='visit' AND date BETWEEN CURRENT_DATE() - INTERVAL '7' DAY AND CURRENT_DATE()
GROUP BY date
分时活跃:
SELECT
HOUR(time) AS "小时数",
COUNT(DISTINCT user_id) AS "独立用户数"
FROM
EVENTS
WHERE
event='visit' AND date=CURRENT_DATE()
GROUP BY HOUR(time)
查询每天上午 10 点至 11 点的下单用户数
SELECT
COUNT(*) AS "独立用户数"
FROM
EVENTS
WHERE
EXTRACT(HOUR FROM time) IN (10, 11) AND event = 'payorder'
GROUP BY 1
List 类型的查询,包含 xx 的 List 有哪些:
SELECT
city AS "城市"
FROM
users
WHERE
CONTAINS('合肥', city);
List 里的元素个数:
SELECT
city,
length(city)-length(replace(city, '\n',''))+1 AS "元素个数"
FROM
users
WHERE
city IS NOT NULL
根据生日得到年龄
SELECT
id,
YEAR(NOW())-YEAR(TO_TIMESTAMP(birthday)) AS "年龄"
FROM
users
漏斗用户:
visit(访问)—addtocart(加购)—payorder(支付)(窗口期 48 小时且严格满足事件先后顺序)
SELECT
COUNT(DISTINCT s3.user_id) AS "漏斗独立用户数"
FROM
events s3
INNER JOIN (
SELECT
s2.user_id as user_id,
s2.time as time,
s1.endtime
FROM
EVENTS s2//s2 得到第二步用户、加购时间、窗口期时间
INNER JOIN (
SELECT user_id,time,HOURS_ADD(time, 48) as endtime //窗口期 2 天(+ 48 小时) FROM EVENTS
WHERE event = 'visit' AND date between '2021-01-01' and '2021-01-02' //时间范围是 1 号到 3 号
) s1 //s1 得到用户、访问时间、访问时间+48 小时
ON s2.user_id = s1.user_id AND s2.time > s1.time AND s2.time <= endtime
WHERE
s2.event = 'addtocart'
) a
ON
s3.user_id = a.user_id AND s3.time > a.time AND s3.time <= a.endtime
WHERE
s3.event = 'payorder'
连续 n 天访问的用户(n=1,2,3,4,...):
SELECT
COUNT(DISTINCT user_id) AS "连续 n 天访问的用户"
FROM
(SELECT
user_id,
date,
TO_DATE(DAY_TO_DATE(LEAD(date,n-1,null)OVER(PARTITION BY user_id ORDER BY date ASC))) AS newdate
FROM
(SELECT user_id, date FROM EVENTS WHERE event='visit' GROUP BY user_id,date) a
) b
WHERE DATEDIFF(newdate,date)=n-1
PS:注意把 SQL 里的 n 替换为具体的天数
用户连续 n 天做某事(主要不同用户得 n)--感谢同事提供:
SELECT
user_id,
continuous_days AS "连续访问天数"
FROM
(-- 原理:得到用户 ymd1-ROW_NUMBER1=ymd2-ROW_NUMBER2 的数量,则为连续天数
SELECT
user_id,
continuous_group,
COUNT(1) AS continuous_days
FROM
(-- 原理:ROW_NUMBER 和日期天一样是步长为 1 的数列
-- 如果用户是连续日期做这件事,日期减去ROW_NUMBER一定是相等的,以此构建分组
SELECT
ymd - ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY ymd ASC) AS continuous_group,
user_id,
event
FROM
(-- 把用户做这个事件的日期取出来
SELECT
DISTINCT
-- 由于数字格式在跨月时步长不相等,如 20191101-20191031>1,需要把日期换算成连续天数列
-- FROM_UNIXTIME(UNIX_TIMESTAMP(date),'yyyyMMdd') AS ymd,
DATEDIFF(date,'1970-01-01') AS ymd,
user_id,
event
FROM
events
WHERE
event='visit'
) event_log
) event_group
GROUP BY
user_id,
continuous_group
) continuous_groups
WHERE
continuous_days>1
查询一段时间内的用户下单次数分布情况
SELECT
CASE
WHEN c < 10 THEN '<10'
WHEN c < 20 THEN '<20'
WHEN c < 100 THEN '<100'
ELSE '>100'
END,
COUNT(*)
FROM (
SELECT user_id, COUNT(*) AS c FROM events
WHERE date BETWEEN '2015-09-01' AND '2015-09-20' AND event = 'payorderr'
GROUP BY 1
)a
GROUP BY 1
做过 addtocart (加购) 且没有 payorder (支付)的用户:
SELECT
COUNT(DISTINCT a.user_id) AS "做过 addtocart (加购) 且没有 payorder (支付)的用户"
FROM
(SELECT user_id FROM EVENTS WHERE event='addtocart') a
LEFT JOIN
(SELECT user_id FROM EVENTS WHERE event='payorder') b
ON
a.user_id=b.user_id
WHERE
b.user_id IS NULL
做过 addtocart (加购) 且没有 payorder (支付)的用户和商品:
SELECT
COUNT(DISTINCT addcart.mapid) AS "做过 addtocart (加购) 且没有 payorder (支付)的用户和商品"
FROM
(SELECT user_id, CONCAT(CAST(user_id AS STRING), commodityID) AS mapid FROM EVENTS WHERE event = 'addtocart') addcart
LEFT JOIN
(SELECT user_id, CONCAT(CAST(user_id AS STRING), commodityID) AS mapid FROM EVENTS WHERE event = 'payorder') pay
ON
addcart.mapid = pay.mapid
WHERE
pay.user_id is null
不同支付方式的支付金额总和(微信支付+支付宝支付):
SELECT
user_id,
SUM(value) AS "支付金额总和"
FROM
(
-- 支付宝支付金额
SELECT
user_id,
SUM(events.alipay_amount) AS value
FROM
events
WHERE
date BETWEEN '[baseTime]'-INTERVAL 30 DAY AND '[baseTime]'
AND events.event='alipay_detail'
GROUP BY
user_id
UNION ALL
-- 微信支付金额
SELECT
user_id,
SUM(events.wechat_amount) AS value
FROM
events
WHERE
date BETWEEN '[baseTime]'-INTERVAL 30 DAY AND '[baseTime]'
AND events.event='wechat_pay_detail'
GROUP BY
user_id
) t
GROUP BY
user_id
正则表达式之使用 QQ 邮箱为邮件的用户数
SELECT
COUNT(*) AS "独立用户数"
FROM
users
WHERE
regexp_like(email, '@qq.com$')
正则表达式之匹配 ID 规则
-- users 表的 first_id 记录用户的设备 id,second_id 来记录用户的登录 id
-- 匹配 first_id 为 Android_id,Android_id 一般是 16 位字母和数字的组合
SELECT
id,
first_id AS "安卓设备 ID"
FROM
users
WHERE
REGEXP_LIKE(first_id, '^([0-9a-z]{1,16})$')
-- 匹配 first_id 为 IDFA/IDFV,一般是 32 位字母和数字的组合
SELECT
id,
first_id AS " 设备 ID"
FROM
users
WHERE
REGEXP_LIKE(first_id, '^([0-9A-Z]{8})(([/\s-][0-9A-Z]{4}){3})([/\s-][0-9A-Z]{12})$')
-- 匹配 first_id 为小程序的 open_id
SELECT
id,
first_id AS "小程序设备 ID"
FROM
users
WHERE
REGEXP_LIKE(first_id, '^o[0-9a-zA-Z_-]{27}$')
--这里不再一一列举,只列举几个常用的
^\d{n}$ (验证 n 位数字,n 输入具体的值)
^[0-9a-zA-Z]{n,m}$ (n ~m 个数字、字母组成的字符串)
^[a-zA-Z]{n,m}$(n ~m 个字母组成的字符串)
^([A-Za-z0-9_\-\.])+\@[a-zA-Z0-9_\-]+([a-zA-Z0-9_\-\.])+$(验证是否是邮箱)
间隔计算,计算两个事件的间隔时间(超过 10 分钟则不计算)
SELECT
user_id,
SUM(
CASE WHEN
end_time - begin_time < 600
THEN
end_time - begin_time
ELSE
0
END
) AS "间隔时长(秒)"
FROM (
SELECT
user_id,
EXTRACT(EPOCH FROM time) AS end_time,
LAG(EXTRACT(EPOCH FROM time), 1, NULL) OVER (PARTITION BY user_id ORDER BY time ASC) AS begin_time
FROM events ) a
GROUP BY 1
计算支付行为间隔天数
SELECT
user_id,
datex,
DATEDIFF(datex, LAG(datex,1,NULL) OVER(PARTITION BY user_id ORDER BY datex ASC)) AS diff
FROM
(SELECT user_id,trunc(time, 'DD') AS datex FROM events WHERE event = 'payorder' GROUP BY user_id,datex) a
超级微笑曲线
SELECT
visit_days AS "访问天数",
COUNT(user_id) AS "独立用户数"
FROM
(SELECT user_id, COUNT(DISTINCT date) AS visit_days FROM events WHERE date BETWEEN CURRENT_DATE() - INTERVAL '30' DAY AND CURRENT_DATE() - INTERVAL '1' DAY GROUP BY 1) a
GROUP BY 1
用户首个购买和第二次购买的日期间隔
SELECT
user_id,
DATEDIFF(first_time_value(time,next_time),MIN(time)) AS "首次购买和第二次购买的时间差"
FROM
(SELECT user_id,time,LEAD(time, 1, NULL) OVER (PARTITION BY user_id ORDER BY time asc) AS next_time FROM events WHERE event='payorder' ) a
GROUP BY
user_id
PS:first_time_value 是自定义的函数,使用 first_time_value(time, 其他属性) 聚合函数来获取第一次发生某行为时的相关属性,建议把这个函数内置。如果没有的话,需要参考:
SELECT
user_id,
time,
next_time,
DATEDIFF(next_time,time)
FROM
(SELECT user_id,time,next_time,ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY time ASC) AS rank FROM (SELECT
user_id,time,distinct_id,
LEAD(time, 1, NULL) OVER (PARTITION BY user_id ORDER BY time asc) AS next_time
FROM events WHERE event='payOrder') a
WHERE next_time IS NOT NULL)b
WHERE rank=1
寻找流失用户:最近两次访问时间间隔在 30 天以上
SELECT
COUNT(user_id) AS "独立用户数"
FROM (
SELECT user_id,(UNIX_TIMESTAMP(MAX(t2)) - UNIX_TIMESTAMP(MAX(t1))) / 86400 AS d1
FROM (
SELECT user_id,LAG(time,1) OVER(PARTITION BY user_id ORDER BY time ASC) AS t1,time AS t2 FROM events
WHERE event='visit' ) t
GROUP BY user_id ) r
过去 7 天浏览偏好的商品类型(前 3)
/* 假设当前基准时间为 2019-06-19 */
/* 集合类型标签*/
SELECT id, MAX(distinct_id) AS distinct_id,
GROUP_CONCAT(product_type, '\n') AS value
FROM (
SELECT id, distinct_id, product_type,
RANK() OVER (PARTITION BY id ORDER BY cnt DESC) AS rank_num
FROM (
SELECT user_id AS id, product_type,
MAX(distinct_id) AS distinct_id, COUNT(*) AS cnt
FROM events
WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1'DAY
AND event = 'payorder'
GROUP BY 1, 2
) a
) b
WHERE rank_num <= 3
GROUP BY 1
/* 其中 group_concat(product_type, '\n') 表示用户前三的商品类型。 */
/* 返回值是 list 类型,需要创建为集合类型的标签 */
过去 7 天中用户最近一次访问距今时间(天)
/* 假设当前基准时间为 2019-06-19 */
/* 数值类型标签 */
SELECT id, distinct_id, DATEDIFF(now(), time) AS value
FROM (
SELECT user_id AS id, MAX(distinct_id) AS distinct_id, MAX(time) AS time
FROM events
WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1' DAY
AND event = 'View'
GROUP BY 1
) a
/* 其中 View 为用户访事件,datediff(now(), time) as value 表示事件发生的距今天数 */
过去 7 天浏览最多的商品类型
/* 字符串类型标签 */
SELECT id, distinct_id, product_type AS value
FROM (
SELECT id, distinct_id, product_type,
ROW_NUMBER() OVER (PARTITION BY id ORDER BY cnt DESC) AS row_num
FROM (
SELECT user_id AS id, product_type, MAX(distinct_id) AS distinct_id, COUNT(*) AS cnt
FROM events
WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1' DAY
AND event = 'ProductDetails'
GROUP BY 1, 2
) a
) b
WHERE row_num <= 1
暂时就这么多,我会不定期补充的~