---------开窗函数(窗口函数)-分组排序开窗函数------------------------------
-- 序号函数:row_number,rank,dense_rank
-- 1.1准备数据test1.txt
/*
cookie1,2018-04-10,1
cookie1,2018-04-11,5
cookie1,2018-04-12,7
cookie1,2018-04-13,3
cookie1,2018-04-14,2
cookie1,2018-04-15,4
cookie1,2018-04-16,4
cookie2,2018-04-10,2
cookie2,2018-04-11,3
cookie2,2018-04-12,5
cookie2,2018-04-13,6
cookie2,2018-04-14,3
cookie2,2018-04-15,9
cookie2,2018-04-16,7
*/
create table test_ordinal_func
(
cookie_id string comment 'cookie id',
create_time string comment '创建时间',
pv int comment '访问次数'
) row format delimited fields terminated by ',';
load data local inpath '/export/data/test_window.txt' into table test_ordinal_func;
select *
from test_ordinal_func;
-- 使用开窗函数来实现分组并组内排序
-- 需求1:按照cookie_id进行分组,并且在每一组中按照pv进行降序排序
-- partition by:分组
select *,
-- 1 2 3 4 5
row_number() over (partition by cookie_id order by pv desc ) row_number,
-- 1 2 3 3 5
rank() over (partition by cookie_id order by pv desc ) rank,
-- 1 2 3 3 4
dense_rank() over (partition by cookie_id order by pv desc ) dense_rank
from test_ordinal_func;
-- 需求:按照cookie_id进行分组,并且在每一组中按照pv进行降序排序,选出每一组排名前三的信息(TopN问题)
-- 以下写法报错,因为SQL的执行顺序:from where select,where的时候,别名还没有出现
/*
select *,
dense_rank() over (partition by cookie_id order by pv desc ) as rk3 -- 1 2 3 3 4
from test_ordinal_func
where rk3 <= 3;
*/
-- 通过子查询解决
select *
from (
select *,
dense_rank() over (partition by cookie_id order by pv desc ) as rk3 -- 1 2 3 3 4
from test_ordinal_func
) as dense_rank
where dense_rank.rk3 <= 3;
-- 需求:查询新冠疫情数据表中每一个州确诊人数最多的县TopN
select *
from (
select *,
-- partition by:后面可跟多个字段(同州同县分一组)
-- order by:后面可跟多个字段(主要条件相同比较次要条件。按确诊病例降序,确证病例相同则按死亡病例降序)
dense_rank() over (partition by state, county order by cases desc, deaths desc) as rk
from covid2
) as dense_rank
where dense_rank.rk = 1;
-- 分组和排序的后边都可以指定多个字段
/*
select *,
dense_rank() over (partition by cookie_id,pv order by cookie_id,create_time desc ) as rk3 -- 1 2 3 3 4
from test_ordinal_func;
*/
---------开窗函数(窗口函数)-指定区间进行统计开窗函数(聚合开窗函数)------------------------------
-- 3、使用聚合开窗函数进行统计
-- 累加区间:从第1行累加到当前行(默认)
select cookie_id,
create_time,
pv,
sum(pv) over (partition by cookie_id order by create_time) as pv1
from test_ordinal_func;
-- 等价上边的写法
select cookie_id,
create_time,
pv,
sum(pv) over (partition by cookie_id order by create_time
rows between unbounded preceding and current row) as pv2
from test_ordinal_func;
-- 累加区间:从前3行累加到当前行
select cookie_id,
create_time,
pv,
sum(pv) over (partition by cookie_id order by create_time
rows between 3 preceding and current row)
from test_ordinal_func;
-- 累加区间:从前3行累加到下1行
select cookie_id,
create_time,
pv,
sum(pv) over (partition by cookie_id order by create_time
rows between 3 preceding and 1 following) as pv5
from test_ordinal_func;
-- 累加区间:从当前行加到组的最后
select cookie_id,
create_time,
pv,
sum(pv) over (partition by cookie_id order by create_time
rows between current row and unbounded following) as pv6
from test_ordinal_func;
-- 以上的sum可以替换为avg,max,min
select cookie_id,
create_time,
pv,
max(pv) over (partition by cookie_id order by create_time) as pv1
from test_ordinal_func;
select cookie_id,
create_time,
pv,
min(pv) over (partition by cookie_id order by create_time) as pv1
from test_ordinal_func;
select cookie_id,
create_time,
pv,
avg(pv) over (partition by cookie_id order by create_time) as pv1
from test_ordinal_func;
-- 前后函数:lag lead
-- lag
-- 将上1行数据放在当前行
select cookie_id,
create_time,
pv,
lag(create_time, 1) over (partition by cookie_id order by create_time)
from test_ordinal_func;
-- 将上2行数据放在当前行
select cookie_id,
create_time,
pv,
lag(create_time, 2) over (partition by cookie_id order by create_time)
from test_ordinal_func;
-- lead
-- 将下1行数据放在当前行
select cookie_id,
create_time,
pv,
lead(create_time, 1) over (partition by cookie_id order by create_time)
from test_ordinal_func;
-- 将下2数据放在当前行
select cookie_id,
create_time,
pv,
lead(create_time, 2) over (partition by cookie_id order by create_time)
from test_ordinal_func;