常用窗口函数总结

目录

 

SUM,AVG,MIN,MAX,NTILE,

ROW_NUMBER,RANK,DENSE_RANK,

CUME_DIST,PERCENT_RANK,LAG,LEAD,

FIRST_VALUE,LAST_VALUE,GROUPING SETS,GROUPING__ID,

CUBE,ROLLUP

--Hive分析窗口函数(一) SUM,AVG,MIN,MAX

--Hive分析窗口函数(二) NTILE,ROW_NUMBER,RANK,DENSE_RANK

--HIVE窗口函数(三)

--Hive分析窗口函数(四) 


汇总:
SUM,AVG,MIN,MAX,NTILE,
ROW_NUMBER,RANK,DENSE_RANK,
CUME_DIST,PERCENT_RANK,
LAG,LEAD,FIRST_VALUE,LAST_VALUE,
GROUPING SETS,GROUPING__ID,CUBE,ROLLUP


-- Workspace setup for the window-function examples.
-- IF NOT EXISTS keeps the script re-runnable: a plain CREATE DATABASE fails
-- when the database is already there.
CREATE DATABASE IF NOT EXISTS win;

USE win;

SHOW TABLES;

-- Drop any previous definition of win1 before it is recreated further down.
DROP TABLE IF EXISTS win1;
-- Default window frame: with ORDER BY inside OVER the function works from the
-- start of the partition up to the current row, without ORDER BY it works on
-- every row of the partition.

-- Hive analytic window functions (1): SUM, AVG, MIN, MAX

-- Sample table for the SUM examples, backed by comma-delimited text files.
CREATE EXTERNAL TABLE win1 (
    cookieid   STRING,
    createtime STRING,   -- day
    pv         INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/winLearn/win1/';

DESCRIBE win1;

-- OVERWRITE replaces whatever the table already holds.
LOAD DATA LOCAL INPATH '/root/win1' OVERWRITE INTO TABLE win1;

SELECT * FROM win1;

-- SUM as a window aggregate, one column per frame variant.
-- Frame keywords:
--   PRECEDING            rows before the current row
--   FOLLOWING            rows after the current row
--   CURRENT ROW          the current row
--   UNBOUNDED PRECEDING  the first row of the partition
--   UNBOUNDED FOLLOWING  the last row of the partition
SELECT
    cookieid,
    createtime,
    pv,
    -- default frame: partition start through the current row
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime) AS pv1,
    -- explicit running total, equals pv1 as long as createtime has no ties
    -- inside a cookie (the default frame is RANGE, which includes tied rows)
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime
                  ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2,
    -- every row of the partition
    SUM(pv) OVER (PARTITION BY cookieid) AS pv3,
    -- current row plus the 3 rows before it
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime
                  ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,
    -- current row plus the 3 rows before it and the 1 row after it
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime
                  ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,
    -- current row plus every row after it
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime
                  ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6
FROM win1
-- Sort by the partition key first so each cookie is listed as one contiguous,
-- deterministically ordered run (createtime alone interleaves the cookies).
ORDER BY cookieid, createtime;

-- Hive analytic window functions (2): NTILE, ROW_NUMBER, RANK, DENSE_RANK

-- Sample table for the ranking examples, backed by comma-delimited text files.
-- IF NOT EXISTS keeps the script re-runnable (nothing drops win2 beforehand).
CREATE EXTERNAL TABLE IF NOT EXISTS win2 (
    cookieid   STRING,
    createtime STRING,   -- day
    pv         INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/winLearn/win2/';

-- OVERWRITE (as used for win1 and win4) stops a second run from appending a
-- duplicate copy of the rows, which would distort every ranking below.
LOAD DATA LOCAL INPATH '/root/win2' OVERWRITE INTO TABLE win2;

SELECT * FROM win2;

-- NTILE(n): splits the ordered rows of a window into n buckets and returns the
-- bucket number of each row. The assignment follows the window ORDER BY, it is
-- not random.
SELECT
    cookieid,
    createtime,
    pv,
    -- 2 buckets per cookie
    NTILE(2) OVER per_cookie AS rn1,
    -- 3 buckets per cookie
    NTILE(3) OVER per_cookie AS rn2,
    -- 4 buckets across the whole table
    NTILE(4) OVER (ORDER BY createtime) AS rn3
FROM win2
WINDOW per_cookie AS (PARTITION BY cookieid ORDER BY createtime)
ORDER BY
    cookieid,
    createtime;

-- ROW_NUMBER(): numbers the rows of each partition 1, 2, 3 ... in the window
-- order, here from the highest pv down within each cookie.
SELECT
    cookieid,
    createtime,
    pv,
    ROW_NUMBER() OVER (
        PARTITION BY cookieid
        ORDER BY pv DESC
    ) AS rn
FROM win2;

-- RANK vs DENSE_RANK vs ROW_NUMBER on the same window.
--   RANK()        ties share a rank and the following ranks are skipped
--   DENSE_RANK()  ties share a rank and no rank is skipped
--   ROW_NUMBER()  every row gets its own number
SELECT
    cookieid,
    createtime,
    pv,
    RANK()       OVER by_pv_desc AS rn1,
    DENSE_RANK() OVER by_pv_desc AS rn2,
    ROW_NUMBER() OVER by_pv_desc AS rn3
FROM win2
WHERE cookieid = 'cookie1'
WINDOW by_pv_desc AS (PARTITION BY cookieid ORDER BY pv DESC);

-- Hive window functions (3): CUME_DIST, PERCENT_RANK

-- Sample salary table, backed by comma-delimited text files.
-- IF NOT EXISTS keeps the script re-runnable (nothing drops win3 beforehand).
-- NOTE(review): this table lives under /tmp while win1, win2 and win4 live
-- under /winLearn - confirm the location is intended.
CREATE EXTERNAL TABLE IF NOT EXISTS win3 (
    dept   STRING,
    userid STRING,
    sal    INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/tmp/win3/';

-- OVERWRITE (as used for win1 and win4) stops a second run from appending a
-- duplicate copy of the rows, which would change the distribution values below.
LOAD DATA LOCAL INPATH '/root/win3' OVERWRITE INTO TABLE win3;

SELECT * FROM win3;

-- CUME_DIST(): share of rows in the window whose sal is less than or equal to
-- the sal of the current row.
SELECT
    dept,
    userid,
    sal,
    -- across the whole table
    CUME_DIST() OVER (ORDER BY sal) AS rn1,
    -- within each department
    CUME_DIST() OVER (
        PARTITION BY dept
        ORDER BY sal
    ) AS rn2
FROM win3;

-- PERCENT_RANK(): relative position of the current row after ranking, that is
-- (rank - 1) / (rows in the window - 1). rn11 and rn12 expose the two inputs
-- of that formula for the whole-table case.
SELECT
    dept,
    userid,
    sal,
    -- percent rank across the whole table
    PERCENT_RANK() OVER (ORDER BY sal) AS rn1,
    -- plain rank across the whole table
    RANK() OVER (ORDER BY sal) AS rn11,
    -- total number of rows in the table
    COUNT(*) OVER () AS rn12,
    -- percent rank within each department
    PERCENT_RANK() OVER (
        PARTITION BY dept
        ORDER BY sal
    ) AS rn2
FROM win3;

-- Hive analytic window functions (4): LAG, LEAD, FIRST_VALUE, LAST_VALUE

-- Sample page-visit table, backed by comma-delimited text files.
-- IF NOT EXISTS keeps the script re-runnable (nothing drops win4 beforehand).
CREATE EXTERNAL TABLE IF NOT EXISTS win4 (
    cookieid   STRING,
    createtime STRING,   -- time of the page visit
    url        STRING    -- page that was visited
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/winLearn/win4/';

-- OVERWRITE replaces whatever the table already holds.
LOAD DATA LOCAL INPATH '/root/win4' OVERWRITE INTO TABLE win4;

SELECT * FROM win4;

-- LAG(col, n, default): value of col from the row n positions before the
-- current row in the window, or default when there is no such row
-- (NULL when no default is given).
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER visits AS rn,
    -- previous visit time, epoch placeholder for the first visit
    LAG(createtime, 1, '1970-01-01 00:00:00') OVER visits AS last_1_time,
    -- visit time two rows back, NULL when missing
    LAG(createtime, 2) OVER visits AS last_2_time
FROM win4
WINDOW visits AS (PARTITION BY cookieid ORDER BY createtime);


-- LEAD(col, n, default): value of col from the row n positions after the
-- current row in the window, or default when there is no such row
-- (NULL when no default is given).
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER visits AS rn,
    -- next visit time, epoch placeholder for the last visit
    LEAD(createtime, 1, '1970-01-01 00:00:00') OVER visits AS next_1_time,
    -- visit time two rows ahead, NULL when missing
    LEAD(createtime, 2) OVER visits AS next_2_time
FROM win4
WINDOW visits AS (PARTITION BY cookieid ORDER BY createtime);

-- FIRST_VALUE and LAST_VALUE need ORDER BY inside OVER, otherwise the values
-- are not taken in time order.
-- With ORDER BY the frame stops at the current row, so LAST_VALUE does not
-- return the last value of the whole partition. To get that value, sort in
-- descending order and use FIRST_VALUE instead.

-- FIRST_VALUE(): first url of each cookie in createtime order.
-- Reads win4, the table created for this section (the original referenced
-- lxw1234, a table this script never creates).
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER (PARTITION BY cookieid ORDER BY createtime) AS rn,
    FIRST_VALUE(url) OVER (PARTITION BY cookieid ORDER BY createtime) AS first1
FROM win4;

-- LAST_VALUE(): last url of each cookie within the frame, which by default
-- ends at the current row rather than at the end of the partition.
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER visits AS rn,
    LAST_VALUE(url) OVER visits AS last1
FROM win4
WINDOW visits AS (PARTITION BY cookieid ORDER BY createtime);

-- Window functions (5): GROUPING SETS, GROUPING__ID, CUBE, ROLLUP

-- Sample visit table (month, day, cookie), backed by comma-delimited text files.
-- IF NOT EXISTS keeps the script re-runnable (nothing drops win5 beforehand).
-- NOTE(review): this table lives under /tmp while win1, win2 and win4 live
-- under /winLearn - confirm the location is intended.
CREATE EXTERNAL TABLE IF NOT EXISTS win5 (
    month    STRING,
    day      STRING,
    cookieid STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/tmp/win5/';

-- OVERWRITE (as used for win1 and win4) stops a second run from appending a
-- duplicate copy of the rows.
LOAD DATA LOCAL INPATH '/root/win5' OVERWRITE INTO TABLE win5;

SELECT * FROM win5;
-- GROUPING SETS: aggregates once per listed grouping (here by month and,
-- separately, by day) and returns the union of the results. GROUPING__ID
-- identifies the grouping set each row came from and is used as the sort key.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM win5
GROUP BY
    month,
    day
GROUPING SETS (month, day)
ORDER BY GROUPING__ID;

-- WITH CUBE: aggregates over every combination of the GROUP BY columns
-- - (month, day), (month), (day) and the grand total - in effect GROUPING SETS
-- with all combinations listed. Only valid together with GROUP BY.
-- GROUPING__ID identifies the combination each row came from.
-- Reads win5, the table created for this section (the original referenced
-- lxw1234, a table this script never creates).
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM win5
GROUP BY month, day
WITH CUBE
ORDER BY GROUPING__ID;

-- WITH ROLLUP: aggregates along the hierarchy led by the leftmost GROUP BY
-- column - (month, day), (month) and the grand total - which is the subset of
-- the CUBE combinations that keep the leading columns.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM win5
GROUP BY
    month,
    day
WITH ROLLUP
ORDER BY GROUPING__ID;
 

你可能感兴趣的:(常用窗口函数总结)