-- 2022-03-07 开窗函数 (window functions)

---------开窗函数(窗口函数)-分组排序开窗函数------------------------------
-- 序号函数:row_number,rank,dense_rank

-- 1.1 Prepare the sample data file test_window.txt (the load data statement below reads /export/data/test_window.txt)
/*
cookie1,2018-04-10,1
cookie1,2018-04-11,5
cookie1,2018-04-12,7
cookie1,2018-04-13,3
cookie1,2018-04-14,2
cookie1,2018-04-15,4
cookie1,2018-04-16,4
cookie2,2018-04-10,2
cookie2,2018-04-11,3
cookie2,2018-04-12,5
cookie2,2018-04-13,6
cookie2,2018-04-14,3
cookie2,2018-04-15,9
cookie2,2018-04-16,7
*/

-- Sample table used by the window-function examples in this script.
-- Each input line is comma-separated: cookie_id,create_time,pv
-- "if not exists" keeps the script re-runnable when the table is already there.
create table if not exists test_ordinal_func
(
    cookie_id   string comment 'cookie id',
    create_time string comment '创建时间',
    pv          int comment '访问次数'
) row format delimited fields terminated by ',';

-- "overwrite" replaces the table contents, so running the script again
-- does not append a second copy of the sample rows.
load data local inpath '/export/data/test_window.txt' overwrite into table test_ordinal_func;

-- Sanity check: show what was loaded.
select cookie_id,
       create_time,
       pv
from test_ordinal_func;

-- Group and sort inside each group with window functions.
-- Requirement 1: group by cookie_id and, inside each group, order by pv descending.
-- partition by defines the groups; the sequences in the comments below show how
-- each function numbers five rows whose 3rd and 4th rows tie on pv.
-- The result columns are named row_num / rank_num / dense_rank_num so that they
-- do not shadow the built-in functions they are computed with.
select cookie_id,
       create_time,
       pv,
       -- 1 2 3 4 5 (always consecutive; tied rows are numbered in an unspecified order)
       row_number() over (partition by cookie_id order by pv desc) as row_num,
       -- 1 2 3 3 5 (ties share a number, the next number is skipped)
       rank() over (partition by cookie_id order by pv desc)       as rank_num,
       -- 1 2 3 3 4 (ties share a number, no gap afterwards)
       dense_rank() over (partition by cookie_id order by pv desc) as dense_rank_num
from test_ordinal_func;

-- 需求:按照cookie_id进行分组,并且在每一组中按照pv进行降序排序,选出每一组排名前三的信息(TopN问题)

-- The query below fails: where is evaluated before the select list (and before window functions), so the alias rk3 does not exist yet when where runs
/*
select *,
       dense_rank() over (partition by cookie_id order by pv desc ) as rk3 -- 1 2 3 3 4
from test_ordinal_func
where rk3 <= 3;
*/

-- A window-function alias cannot be filtered on in the same query level,
-- so the rank is computed first and the filter is applied one level up.
-- dense_rank numbers ties identically (1 2 3 3 4), so a group can return
-- more than three rows when pv values tie.
with ranked_pv as (
    select *,
           dense_rank() over (partition by cookie_id order by pv desc) as rk3
    from test_ordinal_func
)
select *
from ranked_pv
where ranked_pv.rk3 <= 3;


-- Requirement: for every state, find the county with the most confirmed cases
-- (top-N with N = 1; counties that tie on both cases and deaths are all returned).
-- NOTE(review): assumes covid2 holds one row per (state, county). If it keeps one
-- row per reporting date, restrict it to a single date first -- confirm the schema.
select *
from (
         select *,
                -- Partition by state only, so that counties are ranked against the
                -- other counties of the same state. (Partitioning by state, county
                -- would rank rows inside each county instead.)
                -- order by accepts several keys: cases descending first, then
                -- deaths descending to break ties on cases.
                dense_rank() over (partition by state order by cases desc, deaths desc) as rk
         from covid2
     ) as ranked_county
where ranked_county.rk = 1;

-- 分组和排序的后边都可以指定多个字段
/*
select *,
       dense_rank() over (partition by cookie_id,pv order by cookie_id,create_time desc ) as rk3 -- 1 2 3 3 4
from test_ordinal_func;
*/


---------开窗函数(窗口函数)-指定区间进行统计开窗函数(聚合开窗函数)------------------------------

-- 3. Statistics with aggregate window functions.
-- Running total of pv per cookie_id in create_time order. No frame is written,
-- so the default frame applies: range between unbounded preceding and current row.
select cookie_id,
       create_time,
       pv,
       sum(pv) over w as pv1
from test_ordinal_func
window w as (partition by cookie_id order by create_time);

-- Running total of pv per cookie_id with the frame spelled out in rows:
-- from the first row of the partition up to the current row.
-- This matches the frame-less form only while create_time is unique inside a
-- cookie_id; the default frame is range-based and also includes rows that tie
-- with the current row on the order key.
select cookie_id,
       create_time,
       pv,
       sum(pv) over w as pv2
from test_ordinal_func
window w as (
    partition by cookie_id
    order by create_time
    rows between unbounded preceding and current row
);

-- Frame: from 3 rows before the current row up to the current row
-- (fewer rows at the start of a partition, where 3 predecessors do not exist).
-- The result column is aliased like the other frame examples instead of
-- being left with an engine-generated name.
select cookie_id,
       create_time,
       pv,
       sum(pv) over (partition by cookie_id order by create_time
           rows between 3 preceding and current row) as pv4
from test_ordinal_func;

-- Frame: from 3 rows before the current row through 1 row after it.
select cookie_id,
       create_time,
       pv,
       sum(pv) over w as pv5
from test_ordinal_func
window w as (
    partition by cookie_id
    order by create_time
    rows between 3 preceding and 1 following
);

-- Frame: from the current row through the last row of the partition.
select cookie_id,
       create_time,
       pv,
       sum(pv) over w as pv6
from test_ordinal_func
window w as (
    partition by cookie_id
    order by create_time
    rows between current row and unbounded following
);

-- The sum used in the queries above can be swapped for avg, max or min.
-- Here: max of pv per cookie_id in create_time order (no frame given, default frame applies).
select cookie_id,
       create_time,
       pv,
       max(pv) over w as pv1
from test_ordinal_func
window w as (partition by cookie_id order by create_time);

-- min of pv per cookie_id in create_time order (no frame given, default frame applies).
select cookie_id,
       create_time,
       pv,
       min(pv) over w as pv1
from test_ordinal_func
window w as (partition by cookie_id order by create_time);

-- avg of pv per cookie_id in create_time order (no frame given, default frame applies).
select cookie_id,
       create_time,
       pv,
       avg(pv) over w as pv1
from test_ordinal_func
window w as (partition by cookie_id order by create_time);


-- Offset functions: lag and lead.
-- lag(col, n) reads col from the row n positions before the current row.
-- Here: put the previous row's create_time (same cookie_id, create_time order)
-- next to the current row. No default is passed, so rows without a
-- predecessor get NULL.
select cookie_id,
       create_time,
       pv,
       lag(create_time, 1) over (partition by cookie_id order by create_time) as prev_create_time
from test_ordinal_func;

-- Put the create_time from 2 rows earlier (same cookie_id, create_time order)
-- next to the current row; rows with fewer than 2 predecessors get NULL.
select cookie_id,
       create_time,
       pv,
       lag(create_time, 2) over (partition by cookie_id order by create_time) as prev2_create_time
from test_ordinal_func;

-- lead(col, n) reads col from the row n positions after the current row.
-- Here: put the next row's create_time (same cookie_id, create_time order)
-- next to the current row; the last row of each partition gets NULL.
select cookie_id,
       create_time,
       pv,
       lead(create_time, 1) over (partition by cookie_id order by create_time) as next_create_time
from test_ordinal_func;

-- Put the create_time from 2 rows later (same cookie_id, create_time order)
-- next to the current row; rows with fewer than 2 successors get NULL.
select cookie_id,
       create_time,
       pv,
       lead(create_time, 2) over (partition by cookie_id order by create_time) as next2_create_time
from test_ordinal_func;

-- 你可能感兴趣的:(2022-03-07 开窗函数)