排名函数主要有row_number、rank和dense_rank。
row_number是行号,不会重复,rank数据相同的,给出并列排名,但是会跳跃,dense_rank类似于rank,但不会跳跃,通过下面的案例来看一下区别。
准备数据
drop table if exists test.test_zw;
CREATE TABLE if not exists test.test_zw(
name string COMMENT '姓名',
score double comment '成绩'
)
COMMENT '测试表'
STORED as parquet TBLPROPERTIES('parquet.compression'='SNAPPY');
-- 插入数据
insert into test.test_zw values
('小王',100),
('小李',90),
('小张',90),
('小红',80);
简单查询
select name,score
from test.test_zw
order by score desc
按照成绩降序排名
select name,score
,row_number() over( order by score desc) `row_number排名`
,rank() over ( order by score desc) `rank排名`
,dense_rank() over ( order by score desc) `dense_rank排名`
from test.test_zw
order by score desc
over里面使用partition by可以指定分组,给出分组排名。例如查看数学和语文的排名。
准备数据
drop table if exists test.test_zw;
CREATE TABLE if not exists test.test_zw(
name string COMMENT '姓名',
course string comment '课程',
score double comment '成绩'
)
COMMENT '测试表'
STORED as parquet TBLPROPERTIES('parquet.compression'='SNAPPY');
-- 插入数据
insert into test.test_zw values
('小王','数学',100),
('小李','数学',90),
('小张','数学',90),
('小红','数学',80),
('小王','语文',60),
('小李','语文',80),
('小张','语文',80),
('小红','语文',100);
简单查询
select name,course,score
from test.test_zw
order by course,score desc
先按course分组,再按score降序排名
select name,course,score
,row_number() over( partition by course order by score desc) `row_number排名`
,rank() over ( partition by course order by score desc) `rank排名`
,dense_rank() over ( partition by course order by score desc) `dense_rank排名`
from test.test_zw
order by course, score desc
需求:分别获取数学和语文前两名的学生
分析:先使用rank函数按照course分组score降序排名,然后取排名小于等于2的数据。注意,使用rank可以跑出并列排名,最终可能导致跑出的数据多于要求的数据。
select name,course,score,`rank排名`
from (
select name,course,score
,row_number() over( partition by course order by score desc) `row_number排名`
,rank() over ( partition by course order by score desc) `rank排名`
,dense_rank() over ( partition by course order by score desc) `dense_rank排名`
from test.test_zw
) t1
where `rank排名`<=2
order by course, score desc
需求:有如下数据,取出每个用户最高连续签到次数。
准备数据:
drop table if exists test.test_zw;
CREATE TABLE if not exists test.test_zw(
name string COMMENT '姓名',
sign_date string comment '签到日期'
)
COMMENT '测试表'
STORED as parquet TBLPROPERTIES('parquet.compression'='SNAPPY');
-- 插入数据
insert into test.test_zw values
('老王','2021-01-01'),
('老王','2021-01-02'),
('老王','2021-01-03'),
('老王','2021-01-07'),
('小张','2021-01-01'),
('小张','2021-01-03'),
('小张','2021-01-04');
简单查询:从数据上看,老王最高连续签到3次,小张最高连续签到2次。
select name,sign_date
from test.test_zw
order by name,sign_date
第一步:给每一组连续签到打标,方法是签到日期减去排名,如果是连续签到,返回的日期相同。比如老王1号、2号、3号连续签到,返回2020-12-31。
select name
,sign_date
,rank() over(partition by name order by sign_date asc) as `排名`
,date_sub(sign_date, rank() over(partition by name order by sign_date asc))as label -- 连续签到分组标签
from test.test_zw
order by name,sign_date
第二步:按照姓名和连续签到分组标签分组,签到日期升序排名。
select name,sign_date,label
,rank() over(partition by name,label order by sign_date asc) as `连续签到排名`
from (
select name
,sign_date
,rank() over(partition by name order by sign_date asc) as `排名`
,date_sub(sign_date, rank() over(partition by name order by sign_date asc))as label -- 连续签到分组标签
from test.test_zw
) t1
order by name,sign_date
select name,max(`连续签到排名`) as `最大连续签到次数`
from (
select name,sign_date,label
,rank() over(partition by name,label order by sign_date asc) as `连续签到排名`
from (
select name
,sign_date
,rank() over(partition by name order by sign_date asc) as `排名`
,date_sub(sign_date, rank() over(partition by name order by sign_date asc))as label -- 连续签到分组标签
from test.test_zw
) t1
) t2
group by name