Reference
- Hive
- Hive窗口函数
- Hive Wiki WindowingAndAnalytics
- Oracle
- Window Function
- Analytic Functions
- MySQL
- MySQL窗口函数
- Window Functions
概述
-- Demo table and sample rows used by the window-function examples in these notes.
-- NOTE(review): the statement below was left commented out in the notes; presumably an
-- earlier table named `order` (a reserved word) was renamed to order_infos -- confirm.
-- ALTER TABLE order RENAME TO order_infos
-- NOTE(review): the example queries read testtest.order_infos, so this presumably has
-- to be run with testtest as the current database -- confirm.
create table order_infos (
    name       varchar(20),
    order_date date,
    cost       int
);

-- Explicit column list so the load does not depend on the table's column order.
-- Each statement is terminated so the script can be run as a whole.
insert into order_infos (name, order_date, cost)
select 'jack', cast('2015-01-01' as date), 10
union all
select 'tony', cast('2015-01-02' as date), 15
union all
select 'jack', cast('2015-02-03' as date), 23
union all
select 'tony', cast('2015-01-04' as date), 29
union all
select 'jack', cast('2015-01-05' as date), 46
union all
select 'jack', cast('2015-04-06' as date), 42
union all
select 'tony', cast('2015-01-07' as date), 50
union all
select 'jack', cast('2015-01-08' as date), 55
union all
select 'mart', cast('2015-04-08' as date), 62
union all
select 'mart', cast('2015-04-09' as date), 68
union all
select 'neil', cast('2015-05-10' as date), 12
union all
select 'mart', cast('2015-04-11' as date), 75
union all
select 'neil', cast('2015-06-12' as date), 80
union all
select 'mart', cast('2015-04-13' as date), 94;
- 当同一个
select
查询中存在多个窗口函数时,它们相互之间是没有影响的.每个窗口函数应用自己的规则.
- window子句
PRECEDING
:往前
FOLLOWING
:往后
CURRENT ROW
:当前行
UNBOUNDED
:无边界,与PRECEDING/FOLLOWING组合使用,表示分区的起点或终点
UNBOUNDED PRECEDING
:从前面的起点
UNBOUNDED FOLLOWING
:到后面的终点
- 如果只使用
partition by
子句,未指定order by
的话,我们的聚合是分组内的聚合
- 使用了
order by
子句,未使用window
子句的情况下,默认从起点到当前行
-- Demonstrates how partition by, order by and the frame (window) clause change
-- what sum(cost) aggregates over. Every sampleN column is appended to each row.
select
    name,
    order_date,
    cost,
    -- sample1: total over all rows
    sum(cost) over () as sample1,
    -- sample2: total within each name group
    sum(cost) over (partition by name) as sample2,
    -- sample3: running total within each name group, ordered by order_date
    sum(cost) over (partition by name order by order_date) as sample3,
    -- sample4: same as sample3, with the frame written out: start of partition to current row
    sum(cost) over (
        partition by name
        order by order_date
        rows between unbounded preceding and current row
    ) as sample4,
    -- sample5: current row plus the one row before it
    sum(cost) over (
        partition by name
        order by order_date
        rows between 1 preceding and current row
    ) as sample5,
    -- sample6: current row plus one row before and one row after
    sum(cost) over (
        partition by name
        order by order_date
        rows between 1 preceding and 1 following
    ) as sample6,
    -- sample7: current row plus all rows after it
    sum(cost) over (
        partition by name
        order by order_date
        rows between current row and unbounded following
    ) as sample7
from testtest.order_infos;
- 排序
row_number()
:从1开始,按照顺序,生成分组内记录的序列,row_number()
的值不会存在重复,当排序的值相同时,并列记录之间的先后顺序是不确定的(如需稳定结果,应在order by中增加可区分的列)
RANK()
: 生成数据项在分组中的排名,排名相等会在名次中留下空位 (排序相同时不连续)。RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn
DENSE_RANK()
: 生成数据项在分组中的排名,排名相等时不会在名次中留下空位(排序连续)
- LAG(向上,向前)和LEAD(向下,向后)函数:可以返回上下数据行的数据
-- LAG example: for each row, fetch order_date from earlier rows of the same name,
-- ordered by order_date.
select
    name,
    order_date,
    cost,
    -- time1: order_date of the previous row; the default is used when there is no
    -- previous row. The default is cast to date so it has the same type as order_date
    -- instead of relying on implicit string-to-date conversion.
    lag(order_date, 1, cast('1900-01-01' as date)) over (
        partition by name
        order by order_date
    ) as time1,
    -- time2: order_date two rows back; no default is given, so it is NULL when
    -- fewer than two earlier rows exist.
    lag(order_date, 2) over (
        partition by name
        order by order_date
    ) as time2
from testtest.order_infos;
- first_value取分组内排序后,截止到当前行,第一个值
- last_value取分组内排序后,截止到当前行,最后一个值
-- FIRST_VALUE / LAST_VALUE example over each name group ordered by order_date.
-- No frame clause is given on purpose: with order by and no frame, the window runs
-- from the start of the partition up to the current row, so last_value returns the
-- value at the current row (or its peers), not the last value of the whole partition.
select
    name,
    order_date,
    cost,
    -- time1: earliest order_date seen so far in the group
    first_value(order_date) over (
        partition by name
        order by order_date
    ) as time1,
    -- time2: latest order_date seen so far in the group (up to the current row)
    last_value(order_date) over (
        partition by name
        order by order_date
    ) as time2
from testtest.order_infos;
- MySQL
- 说明:MySQL从8.0开始支持窗口函数
- 常见窗口函数
row_number()
rank()
dense_rank()
percent_rank()
:当前记录排序的(rank() - 1)
除以 (分组总记录数 - 1)的百分比
cume_dist()
:按分组排序后,排在当前记录之前(含与当前记录并列)的记录数
除以分组总记录数的百分比
lag(col,N)
:按照分组排序,显示该记录的前N
个的col
值
lead(col,N)
first_value(col)
:按照分组排序,显示排序第一的col
值
last_value(col)
nth_value(col,N)
:按照分组排序,截止到当前记录第N
排序的col
值
ntile(N)
:按照分组排序,将所有记录分为N
份
-- row_number() per class for the Math subject, highest score first (MySQL 8.0+).
select
    row_number() over w as row_num,
    class_num,
    user_id,
    score
from score
where subject_name = 'Math'
window w as (partition by class_num order by score desc);

-- Emulation of the query above with user variables, for MySQL versions without
-- window functions. @r is the running counter and @class remembers the previous
-- row's class; the counter restarts at 1 whenever the class changes.
-- The initializer uses := for both variables (a plain = is a comparison, not an
-- assignment), and the same subject filter as the window version is applied.
-- NOTE(review): this idiom depends on the select list being evaluated left to right
-- in sorted order, which MySQL does not guarantee -- prefer the window version.
select
    `row_number`,
    class_num,
    user_id,
    score
from (
    select
        if(@class = class_num, @r := @r + 1, @r := 1) as `row_number`,
        @class := class_num,
        class_num,
        user_id,
        score
    from score
    cross join (select @r := 0, @class := NULL) temp
    where subject_name = 'Math'
    order by class_num, score desc
) tmp2;
-- dense_rank() per class for the Math subject, highest score first (MySQL 8.0+).
select
    dense_rank() over w as `dense_rank`,
    class_num,
    user_id,
    score
from score
where subject_name = 'Math'
window w as (partition by class_num order by score desc);

-- Emulation of the query above with user variables, for MySQL versions without
-- window functions. @r is the current rank, @s the previous row's score and @class
-- the previous row's class. Within a class the rank is kept when the score repeats
-- and incremented otherwise; it restarts at 1 when the class changes.
-- The initializer uses := for every variable (a plain = is a comparison, not an
-- assignment). The increment is the ELSE branch rather than `when @s := score`,
-- because that condition is false for a score of 0 and would yield a NULL rank;
-- @s is still updated by the `@s := score` column that follows.
-- NOTE(review): this idiom depends on the select list being evaluated left to right
-- in sorted order, which MySQL does not guarantee -- prefer the window version.
select
    `dense_rank`,
    class_num,
    user_id,
    score
from (
    select
        if(
            @class = class_num,
            case
                when @s = score then @r
                else @r := @r + 1
            end,
            @r := 1
        ) as `dense_rank`,
        @class := class_num,
        @s := score,
        class_num,
        user_id,
        score
    from score
    cross join (select @r := 0, @s := NULL, @class := NULL) temp
    where subject_name = 'Math'
    order by class_num, score desc
) temp2;
-- rank() over all Math rows (no partition), highest score first (MySQL 8.0+).
select
    rank() over w as `rank`,
    class_num,
    user_id,
    score
from score
where subject_name = 'Math'
window w as (order by score desc);

-- Emulation of the query above with user variables, for MySQL versions without
-- window functions. @c counts rows starting at 1, @s is the previous row's score and
-- @r the current rank: the rank is kept when the score repeats, otherwise it jumps
-- to the current row count, which leaves gaps after ties.
-- The initializer uses := for every variable (a plain = is a comparison, not an
-- assignment).
-- NOTE(review): this idiom depends on the select list being evaluated left to right
-- in sorted order, which MySQL does not guarantee -- prefer the window version.
select
    `rank`,
    class_num,
    user_id,
    score
from (
    select
        @r := if(@s = score, @r, @c) as `rank`,
        @s := score,
        @c := @c + 1,
        class_num,
        user_id,
        score
    from score
    cross join (select @r := 0, @s := NULL, @c := 1) r
    where subject_name = 'Math'
    order by score desc
) temp;