row_number()
rank()
dense_rank()
percent_rank()
cume_dist()
ntile()
lag()
lead()
first_value()
last_value()
nth_value()
举例:
-- Dump the sample table used by the examples below, sorted by name only;
-- rows that share a name come back in no particular order.
select * from test order by name;
id | name
-----+---------
100 | apple
2 | apple
2 | apple
2 | apple
3 | apple
1 | apple
3 | banana
4 | orange
5 | orange
5 | pumpkin
-- Compare the ranking functions side by side. All five share one window:
-- rows are grouped by name and ordered by id inside each group.
select
    *,
    row_number()   over w,
    rank()         over w,
    dense_rank()   over w,
    percent_rank() over w,
    cume_dist()    over w
from test
window w as (partition by name order by id);
id | name | row_number | rank | dense_rank | percent_rank | cume_dist
-----+---------+------------+------+------------+--------------+-------------------
1 | apple | 1 | 1 | 1 | 0 | 0.166666666666667
2 | apple | 2 | 2 | 2 | 0.2 | 0.666666666666667
2 | apple | 3 | 2 | 2 | 0.2 | 0.666666666666667
2 | apple | 4 | 2 | 2 | 0.2 | 0.666666666666667
3 | apple | 5 | 5 | 3 | 0.8 | 0.833333333333333
100 | apple | 6 | 6 | 4 | 1 | 1
3 | banana | 1 | 1 | 1 | 0 | 1
4 | orange | 1 | 1 | 1 | 0 | 0.5
5 | orange | 2 | 2 | 2 | 1 | 1
5 | pumpkin | 1 | 1 | 1 | 0 | 1
(10 rows)
row_number()分组返回行号,示例表示按name分组,可以看到name分组的内部排序,apple是1~6,orange是1~2
rank列表示按name分组后,组内按id排序得到的名次:1,2,2,2,5,6。id相同的行名次相同,其后的名次会跳过相应的位数(跳级处理)
dense_rank同rank,但是不会跳级处理,所以值是1,2,2,2,3,4是连续的
percent_rank:(rank-1)/(row-1),rank表示当前行的rank值,row表示分组的总行数。如rank=1的,值肯定都是0;rank=5时,(5-1)/(6-1)=0.8;分组只有一行时(如banana),结果为0
cume_dist:分组中排在当前行之前的行数加上与当前行同值的行数(含当前行),再除以分组的总行数。如apple组:rank=1时为1/6=0.166666666666667;rank=2的三行同值,都算到其中最后一行,即分组的第四行,所以是4/6=0.666666666666667;rank=5是分组的第五行,5/6=0.833333333333333
窗口函数也可以和一些普通聚合函数配合使用
-- Ordinary aggregates used as window functions: every row keeps its own id and name
-- and additionally carries the sum/avg/max/min of id over all rows with the same name.
select
    sum(id) over w,
    avg(id) over w,
    max(id) over w,
    min(id) over w,
    *
from test
window w as (partition by name);
sum | avg | max | min | id | name
-----+---------------------+-----+-----+-----+---------
110 | 18.3333333333333333 | 100 | 1 | 100 | apple
110 | 18.3333333333333333 | 100 | 1 | 2 | apple
110 | 18.3333333333333333 | 100 | 1 | 2 | apple
110 | 18.3333333333333333 | 100 | 1 | 2 | apple
110 | 18.3333333333333333 | 100 | 1 | 3 | apple
110 | 18.3333333333333333 | 100 | 1 | 1 | apple
3 | 3.0000000000000000 | 3 | 3 | 3 | banana
9 | 4.5000000000000000 | 5 | 4 | 4 | orange
9 | 4.5000000000000000 | 5 | 4 | 5 | orange
5 | 5.0000000000000000 | 5 | 5 | 5 | pumpkin
窗口函数实际用例:
有下面一张表tb1,求每门科目的第一名,得出表2
id | name | subject | score
----+--------+---------+-------
6 | 刘德华 | 数学 | 89.5
7 | 刘德华 | 语文 | 99.5
8 | 刘德华 | 英语 | 79.5
9 | 刘德华 | 物理 | 89.5
10 | 刘德华 | 化学 | 69.5
11 | 张学友 | 数学 | 89.5
12 | 张学友 | 语文 | 91.5
13 | 张学友 | 英语 | 92.5
14 | 张学友 | 物理 | 93.5
15 | 张学友 | 化学 | 94.5
1 | 郭富城 | 数学 | 99.5
2 | 郭富城 | 语文 | 89.5
3 | 郭富城 | 英语 | 79.5
4 | 郭富城 | 物理 | 99.5
5 | 郭富城 | 化学 | 98.5
id | name | subject | score
----+--------+---------+-------
5 | 郭富城 | 化学 | 98.5
1 | 郭富城 | 数学 | 99.5
4 | 郭富城 | 物理 | 99.5
13 | 张学友 | 英语 | 92.5
7 | 刘德华 | 语文 | 99.5
-- Top scorer(s) of every subject in tb1.
-- rank() is used instead of row_number() so that students tied for the highest score
-- are all returned; row_number() would keep an arbitrary one of them.
-- id is appended to the final ORDER BY so that tied rows come back in a stable order.
SELECT id, name, subject, score
FROM (
    SELECT
        rank() OVER (PARTITION BY subject ORDER BY score DESC) AS rn,
        *
    FROM tb1
) AS t
WHERE rn = 1
ORDER BY subject, id;
接上表如何得出下表,每个人每门成绩与第一名的差距,以及第一名,排名
ord | id | name | subject | score | top1 | diff
-----+----+--------+---------+-------+------+------
1 | 5 | 郭富城 | 化学 | 98.5 | 98.5 | 0.0
2 | 15 | 张学友 | 化学 | 94.5 | 98.5 | 4.0
3 | 10 | 刘德华 | 化学 | 69.5 | 98.5 | 29.0
1 | 1 | 郭富城 | 数学 | 99.5 | 99.5 | 0.0
2 | 6 | 刘德华 | 数学 | 89.5 | 99.5 | 10.0
3 | 11 | 张学友 | 数学 | 89.5 | 99.5 | 10.0
1 | 4 | 郭富城 | 物理 | 99.5 | 99.5 | 0.0
2 | 14 | 张学友 | 物理 | 93.5 | 99.5 | 6.0
3 | 9 | 刘德华 | 物理 | 89.5 | 99.5 | 10.0
1 | 13 | 张学友 | 英语 | 92.5 | 92.5 | 0.0
2 | 3 | 郭富城 | 英语 | 79.5 | 92.5 | 13.0
3 | 8 | 刘德华 | 英语 | 79.5 | 92.5 | 13.0
1 | 7 | 刘德华 | 语文 | 99.5 | 99.5 | 0.0
2 | 12 | 张学友 | 语文 | 91.5 | 99.5 | 8.0
3 | 2 | 郭富城 | 语文 | 89.5 | 99.5 | 10.0
-- Per-subject position (ord), the subject's best score (top1) and each student's
-- gap to that best score (diff).
-- id is added as a tie-breaker so that students with equal scores always receive the
-- same ord on every run, and the final ORDER BY fixes the order of the result rows.
SELECT
    row_number() OVER (PARTITION BY subject ORDER BY score DESC, id) AS ord,
    id,
    name,
    subject,
    score,
    max(score) OVER (PARTITION BY subject) AS top1,
    max(score) OVER (PARTITION BY subject) - score AS diff
FROM tb1
ORDER BY subject, ord;
取出A表的每天的新增用户
-- Show the sample rows of table a that the next example works on.
select * from a;
day_key | user_id | app_name
----------+---------+-------------
20170102 | user1 | feijidazhan
20170102 | user2 | feijidazhan
20170102 | user3 | feijidazhan
20170102 | user4 | feijidazhan
20170103 | user1 | feijidazhan
20170103 | user3 | feijidazhan
20170103 | user5 | feijidazhan
20170101 | user1 | feijidazhan
20170101 | user2 | feijidazhan
(9 rows)
-- New users per day: number each user's rows by ascending day_key and keep only the
-- first one, i.e. the earliest day on which the user appears in table a.
select
    c.day_key,
    c.user_id,
    c.app_name
from (
    select
        day_key,
        user_id,
        app_name,
        row_number() over (partition by user_id order by day_key) as num
    from a
) as c
where c.num = 1
order by c.day_key;
day_key | user_id | app_name
----------+---------+-------------
20170101 | user1 | feijidazhan
20170101 | user2 | feijidazhan
20170102 | user4 | feijidazhan
20170102 | user3 | feijidazhan
20170103 | user5 | feijidazhan
(5 rows)
从此表中取出连续三天(及以上)登录的用户
-- Show the sample rows of table aa that the next example works on.
select * from aa;
day_key | sky_id
----------+--------
20170101 | skya
20170101 | skyb
20170101 | skyc
20170105 | skya
20170105 | skyb
20170105 | skyc
20170104 | skya
20170104 | skyb
20170103 | skya
20170103 | skyb
20170102 | skyb
20170102 | skyc
(12 rows)
-- Users who logged in on at least three consecutive calendar days.
-- day_key is a YYYYMMDD number, so it is converted to a real date first: plain integer
-- arithmetic would treat 20170131 and 20170201 as 70 days apart and miss streaks that
-- cross a month boundary.
select distinct sky_id              -- a user with several streaks is listed once
from (
    select
        sky_id,
        min(login_date) as start_day,
        max(login_date) as end_day
    from (
        -- Inside one run of consecutive dates, (date - row_number) is constant,
        -- so it serves as the identifier of the streak.
        select
            sky_id,
            login_date,
            login_date
                - (row_number() over (partition by sky_id order by login_date))::int as streak
        from (
            -- De-duplicate so that several logins on the same day count once.
            select distinct
                sky_id,
                to_date(day_key::text, 'YYYYMMDD') as login_date
            from aa
            where day_key >= 20170101
        ) as d
    ) as r
    group by sky_id, streak
) as s
where end_day - start_day >= 2      -- date - date yields a number of days
order by sky_id;
sky_id
--------
skya
skyb
(2 rows)
ntile(num_buckets integer):把分组内的行按排序尽可能平均地分成num_buckets个桶,返回当前行所在桶的编号(从1到num_buckets);行数少于桶数时,每行各占一个桶,如下:
-- Split every name group, ordered by id, into 6 buckets and show each row's bucket number.
select
    ntile(6) over (partition by name order by id),
    *
from test;
ntile | id | name
-------+-----+---------
1 | 1 | apple
2 | 2 | apple
3 | 2 | apple
4 | 2 | apple
5 | 3 | apple
6 | 100 | apple #apple分组有6行,每行领一个号
1 | 3 | banana
1 | 4 | orange
2 | 5 | orange
1 | 5 | pumpkin
(10 rows)
-- Split every name group, ordered by id, into 2 buckets and show each row's bucket number.
select
    ntile(2) over (partition by name order by id),
    *
from test;
ntile | id | name
-------+-----+---------
1 | 1 | apple
1 | 2 | apple
1 | 2 | apple
2 | 2 | apple #apple分组6行,这里1~2,所以平均3行领一个号
2 | 3 | apple
2 | 100 | apple
1 | 3 | banana
1 | 4 | orange
2 | 5 | orange
1 | 5 | pumpkin
(10 rows)
-- Split every name group, ordered by id, into 3 buckets and show each row's bucket number.
select
    ntile(3) over (partition by name order by id),
    *
from test;
ntile | id | name
-------+-----+---------
1 | 1 | apple
1 | 2 | apple
2 | 2 | apple
2 | 2 | apple
3 | 3 | apple
3 | 100 | apple
1 | 3 | banana
1 | 4 | orange
2 | 5 | orange
1 | 5 | pumpkin
lag(value any [, offset integer [, default any ]]):偏移量函数,取分组内当前行之前第offset行的value值,offset不写默认为1;如lag(column_name,2,0)表示取往前第2行的值,不存在该行时返回default值(这里是0),default不写默认是null
lead(value any [, offset integer [, default any ]]):偏移量函数,取分组内当前行之后第offset行的value值,参数用法同lag
-- For every row show the id of the next row (lead) and of the previous row (lag) inside
-- the same name group, ordered by id; null where no such row exists.
select
    lead(id, 1, null) over w,
    lag(id, 1, null)  over w,
    *
from test
window w as (partition by name order by id);
lead | lag | id | name
------+-----+-----+---------
2 | | 1 | apple
2 | 1 | 2 | apple
2 | 2 | 2 | apple
3 | 2 | 2 | apple
100 | 2 | 3 | apple
| 3 | 100 | apple
| | 3 | banana
5 | | 4 | orange
| 4 | 5 | orange
| | 5 | pumpkin
(10 rows)
first_value(value any):返回窗口框架中第一行的值
last_value(value any):返回窗口框架中最后一行的值(窗口中不写order by时,框架是整个分组,但组内行的先后顺序没有保证)
-- First and last id of every name group. The window has no ORDER BY, so "first" and
-- "last" simply follow the order in which the rows reach the window function, which
-- the database does not guarantee.
select
    first_value(id) over w,
    last_value(id)  over w,
    *
from test
window w as (partition by name);
first_value | last_value | id | name
-------------+------------+-----+---------
100 | 1 | 100 | apple
100 | 1 | 2 | apple
100 | 1 | 2 | apple
100 | 1 | 2 | apple
100 | 1 | 3 | apple
100 | 1 | 1 | apple
3 | 3 | 3 | banana
4 | 5 | 4 | orange
4 | 5 | 5 | orange
5 | 5 | 5 | pumpkin
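以下注意:窗口中加上order by后,默认的窗口框架是range between unbounded preceding and current row(从分组第一行到当前行的最后一个同值行),所以last_value只会返回当前行(或其同值行)的值;要取整个分组的最后一个值,需要使用rows between unbounded preceding and unbounded following,见下面两个查询的区别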
-- With ORDER BY and no explicit frame, the frame ends at the current row's last peer,
-- so last_value() yields the current row's own id instead of the group's largest id.
select
    last_value(id) over (partition by name order by id),
    *
from test;
last_value | id | name
------------+-----+---------
1 | 1 | apple
2 | 2 | apple
2 | 2 | apple
2 | 2 | apple
3 | 3 | apple
100 | 100 | apple
3 | 3 | banana
4 | 4 | orange
5 | 5 | orange
5 | 5 | pumpkin
-- Same query with the frame widened to the whole partition, so last_value() now
-- returns the id of the last row of each name group.
select
    last_value(id) over (
        partition by name
        order by id
        rows between unbounded preceding and unbounded following
    ),
    *
from test;
last_value | id | name
------------+-----+---------
100 | 1 | apple
100 | 2 | apple
100 | 2 | apple
100 | 2 | apple
100 | 3 | apple
100 | 100 | apple
3 | 3 | banana
5 | 4 | orange
5 | 5 | orange
5 | 5 | pumpkin
(10 rows)
nth_value(value any, nth integer):返回窗口框架中第nth行的value值(从1开始计数),不存在该行时返回null。如nth_value(id,2)表示返回框架内第二行的id;下例带order by且使用默认框架,框架只到当前行(含同值行),所以每组id最小的那一行(没有同值行时)框架里还没有第二行,结果为null
-- id of the second row of the current frame inside each name group, ordered by id.
-- The default frame stops at the current row's last peer, so the result is null
-- while the frame still holds fewer than two rows.
select
    nth_value(id, 2) over (partition by name order by id),
    *
from test;
nth_value | id | name
-----------+-----+---------
| 1 | apple
2 | 2 | apple
2 | 2 | apple
2 | 2 | apple
2 | 3 | apple
2 | 100 | apple
| 3 | banana
| 4 | orange
5 | 5 | orange
| 5 | pumpkin