在支持窗口函数的 SQL 中,善用窗口函数,能降低 SQL 编写复杂度并提高 SQL 执行效率。
窗口函数
function() over(partition by col1 order by col2 )
1. 聚合型窗口函数
sum(),min(),max(),avg(),count(),collect_set()
rows between 可以定义窗口范围
n PRECEDING : 前n行
n FOLLOWING:后n行
CURRENT ROW : 当前行
如果不想限制具体的行数,可以将 n 替换为 UNBOUNDED。比如从起始行到当前行:rows between unbounded preceding and current row
-- Average total_assets per customer over the most recent 7 rows: the current
-- row plus the 6 rows before it, ordered by dt within each user_no.
-- (The frame "7 PRECEDING AND CURRENT ROW" would span 8 rows, not 7.)
-- NOTE(review): ROWS counts rows, not calendar days. This is a true 7-day
-- window only if user_assets has exactly one row per user_no per dt -- confirm.
SELECT
    user_no,
    dt,
    total_assets,
    AVG(total_assets) OVER (
        PARTITION BY user_no
        ORDER BY dt
        ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS recently_asset
FROM user_assets;
无法使用count(distinct )over()
但是可以通过 size(collect_set()over())来实现
-- Running distinct-user count as dt advances, used as a stand-in for
-- COUNT(DISTINCT ...) OVER (...): gather the distinct user_no values in the
-- window into a set, then take the size of that set.
SELECT
    dt,
    SIZE(
        COLLECT_SET(user_no) OVER (ORDER BY dt)
    ) AS asset_user
FROM user_assets
WHERE dt BETWEEN 20200525 AND 20200528;
-- Measured to be very slow in practice; not recommended.
2. 分析型窗口函数
ntile(),row_number(),rank(),dense_rank()
- ntile 表示对数据分成n个区,返回对应的区编号
- row_number(),rank(),dense_rank()都是排序函数
- row_number():即使排序值相同,也为每一行分配唯一且连续的序号;rank():排序值相同的行序号相同,但其后的序号会跳号(不连续);dense_rank():排序值相同的行序号相同,且序号保持连续
-- Side-by-side comparison of the bucketing and ranking functions, computed
-- independently for each dt.
SELECT
    dt,
    user_no,
    total_assets,
    -- Split each day's rows into 10 buckets by total_assets (largest first)
    -- and return the bucket number of the current row.
    NTILE(10) OVER (PARTITION BY dt ORDER BY total_assets DESC) AS nt,
    -- The three ranking functions share one window: per dt, ascending total_assets.
    ROW_NUMBER() OVER per_day_by_assets AS rk1,
    RANK() OVER per_day_by_assets AS rk2,
    DENSE_RANK() OVER per_day_by_assets AS rk3
FROM user_assets
WHERE dt BETWEEN 20200525 AND 20200528
WINDOW per_day_by_assets AS (PARTITION BY dt ORDER BY total_assets);
cume_dist(),percent_rank()
- cume_dist() 小于等于当前值的行数/分组内总行数
- percent_rank() (分组内当前行的RANK值-1)/(分组内总行数-1)
-- CUME_DIST demo: the fraction of rows in the window whose sal is less than
-- or equal to the current row's sal.
SELECT
    dept,
    userid,
    sal,
    -- Over the whole result set (no partitioning).
    CUME_DIST() OVER (
        ORDER BY sal
    ) AS rn1,
    -- Within each dept.
    CUME_DIST() OVER (
        PARTITION BY dept
        ORDER BY sal
    ) AS rn2
FROM tbname;
dept userid sal rn1 rn2
-------------------------------------------
d1 user1 1000 0.2 0.3333333333333333
d1 user2 2000 0.4 0.6666666666666666
d1 user3 3000 0.6 1.0
d2 user4 4000 0.8 0.5
d2 user5 5000 1.0 1.0
-- PERCENT_RANK demo. rn11 and rn12 expose the two inputs of the formula
-- (RANK - 1) / (row count - 1) for the unpartitioned case, so rn1 can be
-- checked by hand.
SELECT
    dept,
    userid,
    sal,
    -- Percent rank over the whole result set.
    PERCENT_RANK() OVER (
        ORDER BY sal
    ) AS rn1,
    -- RANK value over the whole result set.
    RANK() OVER (
        ORDER BY sal
    ) AS rn11,
    -- Total number of rows (a single partition covering every row).
    SUM(1) OVER (
        PARTITION BY NULL
    ) AS rn12,
    -- Percent rank within each dept.
    PERCENT_RANK() OVER (
        PARTITION BY dept
        ORDER BY sal
    ) AS rn2
FROM tbname;
dept userid sal rn1 rn11 rn12 rn2
---------------------------------------------------
d1 user1 1000 0.0 1 5 0.0
d1 user2 2000 0.25 2 5 0.5
d1 user3 3000 0.5 3 5 1.0
d2 user4 4000 0.75 4 5 0.0
d2 user5 5000 1.0 5 5 1.0
3. 取值型窗口函数
lead(),lag(),first_value(),last_value()
-- Neighbouring and boundary trading days per stock; handy when price rows of
-- the same stock have to be related across dates.
-- Each derived column has its own alias (the four columns previously all
-- shared the alias next_dt, which made them ambiguous).
SELECT
    dt,
    stock_id,
    close_price,
    -- Next trading day; on the stock's last row, fall back to today's date
    -- formatted as yyyyMMdd.
    LEAD(dt, 1, DATE_FORMAT(CURRENT_DATE(), 'yyyyMMdd')) OVER (
        PARTITION BY stock_id
        ORDER BY dt
    ) AS next_dt,
    -- Previous trading day; NULL on the stock's first row.
    LAG(dt, 1) OVER (
        PARTITION BY stock_id
        ORDER BY dt
    ) AS prev_dt,
    -- Earliest trading day of the stock.
    FIRST_VALUE(dt) OVER (
        PARTITION BY stock_id
        ORDER BY dt
    ) AS first_dt,
    -- Latest trading day of the stock. The explicit full-partition frame is
    -- required: with ORDER BY and no frame, the default frame ends at the
    -- current row, so LAST_VALUE would just return the current row's dt.
    LAST_VALUE(dt) OVER (
        PARTITION BY stock_id
        ORDER BY dt
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_dt
FROM stock_price;
4. GROUPING SETS, with CUBE, with ROLLUP:限定 group by 按哪些维度组合进行聚合
-
- GROUPING SETS: 根据不同的维度组合进行聚合,等价于将不同维度的GROUP BY结果集进行UNION ALL
-- Distinct-cookie count (uv) per month and, separately, per day in a single
-- statement. GROUPING__ID marks which grouping set produced each row.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM per_d_report
GROUP BY
    month,
    day
GROUPING SETS (
    month,
    day
)
ORDER BY
    GROUPING__ID;
month day uv GROUPING__ID
------------------------------------------------
2015-03 NULL 5 1
2015-04 NULL 6 1
NULL 2015-03-10 4 2
NULL 2015-03-12 1 2
NULL 2015-04-12 2 2
NULL 2015-04-13 3 2
NULL 2015-04-15 2 2
NULL 2015-04-16 2 2
等价于
-- Hand-written form of the GROUPING SETS (month, day) query: one aggregate
-- per grouping set, each tagged with a literal GROUPING__ID, glued together
-- with UNION ALL.

-- Grouping set 1: by month.
SELECT
    month,
    NULL,
    COUNT(DISTINCT cookieid) AS uv,
    1 AS GROUPING__ID
FROM per_d_report
GROUP BY month

UNION ALL

-- Grouping set 2: by day.
SELECT
    NULL,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    2 AS GROUPING__ID
FROM per_d_report
GROUP BY day;
-
- with cube: 根据GROUP BY的维度的所有组合进行聚合
-- Distinct-cookie count (uv) for every combination of the GROUP BY
-- dimensions: grand total, per month, per day, and per (month, day).
-- GROUPING__ID marks which combination produced each row.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM per_d_report
GROUP BY
    month,
    day
WITH CUBE
ORDER BY
    GROUPING__ID;
month day uv GROUPING__ID
--------------------------------------------
NULL NULL 7 0
2015-03 NULL 5 1
2015-04 NULL 6 1
NULL 2015-04-12 2 2
NULL 2015-04-13 3 2
NULL 2015-04-15 2 2
NULL 2015-04-16 2 2
NULL 2015-03-10 4 2
NULL 2015-03-12 1 2
2015-03 2015-03-10 4 3
2015-03 2015-03-12 1 3
2015-04 2015-04-16 2 3
2015-04 2015-04-12 2 3
2015-04 2015-04-13 3 3
2015-04 2015-04-15 2 3
等价于
-- Hand-written form of the WITH CUBE query: one aggregate per dimension
-- combination, each tagged with a literal GROUPING__ID, glued together with
-- UNION ALL.

-- Combination 0: grand total (no grouping).
SELECT
    NULL,
    NULL,
    COUNT(DISTINCT cookieid) AS uv,
    0 AS GROUPING__ID
FROM per_d_report

UNION ALL

-- Combination 1: by month.
SELECT
    month,
    NULL,
    COUNT(DISTINCT cookieid) AS uv,
    1 AS GROUPING__ID
FROM per_d_report
GROUP BY month

UNION ALL

-- Combination 2: by day.
SELECT
    NULL,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    2 AS GROUPING__ID
FROM per_d_report
GROUP BY day

UNION ALL

-- Combination 3: by month and day.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    3 AS GROUPING__ID
FROM per_d_report
GROUP BY
    month,
    day;
-
- with rollup: 是CUBE的子集,以最左侧的维度为主,从该维度进行层级聚合
-- Hierarchical distinct-cookie count (uv) rolled up from the left-most
-- dimension: grand total, per month, and per (month, day).
-- GROUPING__ID marks which level produced each row.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM per_d_report
GROUP BY
    month,
    day
WITH ROLLUP
ORDER BY
    GROUPING__ID;
month day uv GROUPING__ID
---------------------------------------------------
NULL NULL 7 0
2015-03 NULL 5 1
2015-04 NULL 6 1
2015-03 2015-03-10 4 3
2015-03 2015-03-12 1 3
2015-04 2015-04-12 2 3
2015-04 2015-04-13 3 3
2015-04 2015-04-15 2 3
2015-04 2015-04-16 2 3
等价于
-- Hand-written form of the WITH ROLLUP query: one aggregate per hierarchy
-- level, each tagged with a literal GROUPING__ID, glued together with
-- UNION ALL.

-- Level 0: grand total (no grouping).
SELECT
    NULL,
    NULL,
    COUNT(DISTINCT cookieid) AS uv,
    0 AS GROUPING__ID
FROM per_d_report

UNION ALL

-- Level 1: by month.
SELECT
    month,
    NULL,
    COUNT(DISTINCT cookieid) AS uv,
    1 AS GROUPING__ID
FROM per_d_report
GROUP BY month

UNION ALL

-- Level 3: by month and day.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    3 AS GROUPING__ID
FROM per_d_report
GROUP BY
    month,
    day;