hive窗口函数盘点

在支持窗口函数的 SQL 引擎中，善用窗口函数能降低 SQL 的编写复杂度，并提高 SQL 的执行效率。

窗口函数

function() over(partition by col1 order by col2 )

1. 聚合型窗口函数

sum(),min(),max(),avg(),count(),collect_set()

rows between 可以定义窗口范围
n PRECEDING : 前n行
n FOLLOWING：后n行
CURRENT ROW ：当前行
如果不想限制具体的行数，可以将 n 替换为 UNBOUNDED。比如从起始行到当前行：rows between unbounded preceding and current row


-- Average total assets per customer over the most recent 7 rows: the current
-- row plus the 6 rows before it, ordered by dt within each user_no.
-- The previous frame "7 preceding and current row" spanned 8 rows, one more
-- than the 7-day window this example describes.
-- NOTE(review): a ROWS frame counts rows, not calendar days; it equals
-- "7 days" only if user_assets holds exactly one row per user_no per dt -- confirm.
select
    user_no,
    dt,
    total_assets,
    avg(total_assets) over (
        partition by user_no
        order by dt
        rows between 6 preceding and current row
    ) as recently_asset
from user_assets;

在较早的 Hive 版本中无法使用 count(distinct ) over()（Hive 2.1.0 起才开始支持在窗口函数中使用 distinct），
但是可以通过 size(collect_set() over()) 来实现

-- Emulates count(distinct user_no) over (order by dt): collect the distinct
-- user_no values seen in the window, then take the size of that set.
-- One output row is produced per input row of the filtered range.
-- Author's note: measured performance was too poor; not recommended.
select
    dt,
    size(
        collect_set(user_no) over (order by dt)
    ) as asset_user
from user_assets
where dt between 20200525 and 20200528;

2. 分析型窗口函数

ntile(),row_number(),rank(),dense_rank()

ntile(n) 将分区内的数据按排序切分成 n 个桶，返回当前行所在桶的编号
row_number()、rank()、dense_rank() 都是排序函数
row_number() 对并列值也分配不同的序号；rank() 对并列值给出相同序号，但之后的序号会跳号（不连续）；dense_rank() 对并列值给出相同序号，且序号保持连续


-- Side-by-side comparison of ntile / row_number / rank / dense_rank per day.
select
    dt,
    user_no,
    total_assets,
    -- split each day's customers into 10 buckets by assets (largest first)
    -- and return the bucket number of the current row
    ntile(10) over (partition by dt order by total_assets desc) as nt,
    row_number() over by_dt_assets as rk1,
    rank() over by_dt_assets as rk2,
    dense_rank() over by_dt_assets as rk3
from user_assets
where dt between 20200525 and 20200528
-- shared window for the three ranking functions: per day, ascending assets
window by_dt_assets as (partition by dt order by total_assets);

cume_dist(),percent_rank()

cume_dist()：小于等于当前值的行数 / 分组内总行数
percent_rank()：(分组内当前行的 RANK 值 - 1) / (分组内总行数 - 1)

-- CUME_DIST: (rows with sal <= current sal) / (total rows in the partition).
SELECT
    dept,
    userid,
    sal,
    -- rn1: computed over the whole result set (no PARTITION BY)
    CUME_DIST() OVER (ORDER BY sal) AS rn1,
    -- rn2: computed within each dept
    CUME_DIST() OVER (PARTITION BY dept ORDER BY sal) AS rn2
FROM tbname;
 
dept    userid   sal   rn1       rn2 
-------------------------------------------
d1      user1   1000    0.2     0.3333333333333333
d1      user2   2000    0.4     0.6666666666666666
d1      user3   3000    0.6     1.0
d2      user4   4000    0.8     0.5
d2      user5   5000    1.0     1.0


-- PERCENT_RANK: (RANK - 1) / (rows in partition - 1).
-- rn11 and rn12 expose the two inputs that produce rn1.
SELECT
    dept,
    userid,
    sal,
    -- rn1: percent rank over the whole result set
    PERCENT_RANK() OVER (ORDER BY sal) AS rn1,
    -- rn11: RANK value over the whole result set
    RANK() OVER (ORDER BY sal) AS rn11,
    -- rn12: total row count (everything falls into a single partition)
    SUM(1) OVER (PARTITION BY NULL) AS rn12,
    -- rn2: percent rank within each dept
    PERCENT_RANK() OVER (PARTITION BY dept ORDER BY sal) AS rn2
FROM tbname;
 
dept    userid   sal    rn1    rn11     rn12    rn2
---------------------------------------------------
d1      user1   1000    0.0     1       5       0.0
d1      user2   2000    0.25    2       5       0.5
d1      user3   3000    0.5     3       5       1.0
d2      user4   4000    0.75    4       5       0.0
d2      user5   5000    1.0     5       5       1.0

3. 取值型窗口函数

lead(),lag(),first_value(),last_value()

-- Per-stock trading-day navigation; useful when joining stock prices across days.
-- Two defects of the earlier version are fixed here:
--   1. every derived column was aliased next_dt, giving four identically
--      named output columns;
--   2. last_value() relied on the default frame, which ends at the current
--      row, so it returned the current dt instead of the latest trading day.
select
    dt,
    stock_id,
    close_price,
    -- next trading day; falls back to the current date formatted as yyyyMMdd
    -- NOTE(review): assumes dt is stored in the same yyyyMMdd form -- confirm
    lead(dt, 1, date_format(current_date(), 'yyyyMMdd')) over (
        partition by stock_id
        order by dt
    ) as next_dt,
    -- previous trading day; NULL when there is none
    lag(dt, 1) over (
        partition by stock_id
        order by dt
    ) as prev_dt,
    -- earliest trading day of the stock
    first_value(dt) over (
        partition by stock_id
        order by dt
    ) as first_dt,
    -- latest trading day of the stock; the frame must reach the end of the partition
    last_value(dt) over (
        partition by stock_id
        order by dt
        rows between unbounded preceding and unbounded following
    ) as last_dt
from stock_price;

4. GROUPING SETS、WITH CUBE、WITH ROLLUP：控制 GROUP BY 的聚合维度组合（严格来说它们不是窗口函数，而是 GROUP BY 的增强聚合语法）

1. GROUPING SETS: 根据不同的维度组合进行聚合，等价于将不同维度的GROUP BY结果集进行UNION ALL


-- Distinct-cookie count (uv) per month and, separately, per day in one query.
-- GROUPING__ID identifies which grouping set produced each row.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM per_d_report
GROUP BY
    month,
    day
GROUPING SETS (month, day)
ORDER BY GROUPING__ID;
 
month      day            uv      GROUPING__ID
------------------------------------------------
2015-03    NULL            5       1
2015-04    NULL            6       1
NULL       2015-03-10      4       2
NULL       2015-03-12      1       2
NULL       2015-04-12      2       2
NULL       2015-04-13      3       2
NULL       2015-04-15      2       2
NULL       2015-04-16      2       2
 
 
等价于 
-- Hand-written equivalent of GROUPING SETS (month, day):
-- one aggregate per dimension, concatenated with UNION ALL.
SELECT
    month,
    NULL,
    COUNT(DISTINCT cookieid) AS uv,
    1 AS GROUPING__ID
FROM per_d_report
GROUP BY month
UNION ALL
SELECT
    NULL,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    2 AS GROUPING__ID
FROM per_d_report
GROUP BY day;

2. with cube: 根据GROUP BY的维度的所有组合进行聚合

-- Distinct-cookie count (uv) for every combination of the GROUP BY dimensions:
-- grand total, per month, per day, and per (month, day).
-- NOTE(review): the GROUPING__ID values in the sample output (0 = grand total,
-- 3 = month + day) look like the numbering used before Hive 2.3.0, where the
-- numbering changed -- confirm against the target Hive version.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM per_d_report
GROUP BY
    month,
    day
WITH CUBE
ORDER BY GROUPING__ID;
 
 
month               day             uv     GROUPING__ID
--------------------------------------------
NULL            NULL            7       0
2015-03         NULL            5       1
2015-04         NULL            6       1
NULL            2015-04-12      2       2
NULL            2015-04-13      3       2
NULL            2015-04-15      2       2
NULL            2015-04-16      2       2
NULL            2015-03-10      4       2
NULL            2015-03-12      1       2
2015-03         2015-03-10      4       3
2015-03         2015-03-12      1       3
2015-04         2015-04-16      2       3
2015-04         2015-04-12      2       3
2015-04         2015-04-13      3       3
2015-04         2015-04-15      2       3
 
 
 
等价于
-- Hand-written equivalent of WITH CUBE over (month, day):
-- grand total, per month, per day, per (month, day).
SELECT
    NULL,
    NULL,
    COUNT(DISTINCT cookieid) AS uv,
    0 AS GROUPING__ID
FROM per_d_report
UNION ALL
SELECT
    month,
    NULL,
    COUNT(DISTINCT cookieid) AS uv,
    1 AS GROUPING__ID
FROM per_d_report
GROUP BY month
UNION ALL
SELECT
    NULL,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    2 AS GROUPING__ID
FROM per_d_report
GROUP BY day
UNION ALL
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    3 AS GROUPING__ID
FROM per_d_report
GROUP BY
    month,
    day;

3. with rollup: 是CUBE的子集，以最左侧的维度为主，从该维度进行层级聚合

-- Hierarchical distinct-cookie count (uv), rolling up from the leftmost
-- dimension: grand total, per month, per (month, day). No per-day-only level.
-- NOTE(review): the GROUPING__ID values in the sample output (0 = grand total,
-- 3 = month + day) look like the numbering used before Hive 2.3.0, where the
-- numbering changed -- confirm against the target Hive version.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM per_d_report
GROUP BY
    month,
    day
WITH ROLLUP
ORDER BY GROUPING__ID;
 
month               day             uv     GROUPING__ID
---------------------------------------------------
NULL             NULL            7       0
2015-03          NULL            5       1
2015-04          NULL            6       1
2015-03          2015-03-10      4       3
2015-03          2015-03-12      1       3
2015-04          2015-04-12      2       3
2015-04          2015-04-13      3       3
2015-04          2015-04-15      2       3
2015-04          2015-04-16      2       3

等价于
-- Hand-written equivalent of WITH ROLLUP over (month, day):
-- grand total, per month, per (month, day).
SELECT
    NULL,
    NULL,
    COUNT(DISTINCT cookieid) AS uv,
    0 AS GROUPING__ID
FROM per_d_report
UNION ALL
SELECT
    month,
    NULL,
    COUNT(DISTINCT cookieid) AS uv,
    1 AS GROUPING__ID
FROM per_d_report
GROUP BY month
UNION ALL
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    3 AS GROUPING__ID
FROM per_d_report
GROUP BY
    month,
    day;