OVER():开窗子句(本身不是函数),跟在聚合函数、排名函数等之后一起使用,在括号中指定开窗条件,即该函数计算时所使用的数据窗口。如果开窗条件为空,那么聚合的是过滤(以及分组)后的整张表。
一个窗口函数就会启动一个MR程序。
PARTITION BY:指定分组条件。
ORDER BY:指定组内排序条件。
CURRENT ROW:当前行。
n PRECEDING:前n行。
n FOLLOWING:后n行。
UNBOUNDED:无限的。UNBOUNDED PRECEDING 第一行,UNBOUNDED FOLLOWING最后一行。
LAG(字段名,n):某一列当前行的前n行。
LEAD(字段名,n):某一列当前行的后n行。
NTILE(n):把分区内排好序的行尽量均匀地分成n组,返回当前行所在的组号(从1开始)。
RANK():值相同的行名次并列,并列之后的名次会跳号(如1,1,3),即“总数不会变”。
DENSE_RANK():值相同的行名次并列,并列之后的名次连续不跳号(如1,1,2),即“总数会减少”。
ROW_NUMBER():按顺序排名,无重复。
-- Sample order table used by the window-function queries on `business`.
-- Columns: customer name, purchase date kept as text (the sample rows use
-- yyyy-MM-dd), and the amount spent. Rows are stored as comma-delimited text.
CREATE TABLE business (
    name      STRING,
    orderdate STRING,
    cost      INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- 数据如下
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94
-- Customers who ordered in 2017-04, plus how many such customers there are.
-- The empty OVER () is evaluated after WHERE and GROUP BY, so COUNT(*) counts
-- the grouped rows (one per name) and repeats that total on every output row.
SELECT
    name,
    COUNT(*) OVER () AS cnt
FROM business
WHERE SUBSTRING(orderdate, 1, 7) = '2017-04'
GROUP BY name;
-- Monthly total: partition by the yyyy-MM prefix of orderdate and sum cost
-- inside each month. Every detail row is kept and carries its month's total.
SELECT
    name,
    orderdate,
    cost,
    SUM(cost) OVER (PARTITION BY SUBSTRING(orderdate, 1, 7)) AS mtotal
FROM business;
-- Grand total: with an empty OVER () the window is the whole table, so every
-- row shows the sum of cost over all rows.
SELECT
    name,
    orderdate,
    cost,
    SUM(cost) OVER () AS total
FROM business;
-- Per-customer total: partition by name and sum cost within each customer.
-- No ORDER BY is given, so the window covers the customer's entire partition.
SELECT
    name,
    orderdate,
    cost,
    SUM(cost) OVER (PARTITION BY name) AS ev_cost
FROM business;
-- Running total per customer, from the first row of the partition up to the
-- current row in orderdate order.
-- No frame is written here, so Hive applies its default for ORDER BY windows:
-- RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW. Rows of one customer that
-- share the same orderdate are peers under RANGE and receive the same total.
SELECT
    name,
    orderdate,
    cost,
    SUM(cost) OVER (
        PARTITION BY name
        ORDER BY orderdate
    ) AS ev_cur_cost
FROM business;
-- Running total per customer with the frame spelled out: every physical row
-- from the start of the partition through the current row, in orderdate order.
SELECT
    name,
    orderdate,
    cost,
    SUM(cost) OVER (
        PARTITION BY name
        ORDER BY orderdate
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS ev_cur_cost2
FROM business;
-- Per customer, add the previous row's cost to the current row's cost
-- (rows ordered by orderdate). The first row of each customer has no
-- previous row, so its frame contains only itself.
SELECT
    name,
    orderdate,
    cost,
    SUM(cost) OVER (
        PARTITION BY name
        ORDER BY orderdate
        ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
    ) AS ev_pre_cost
FROM business;
-- Per customer, sum three neighbouring purchases: the previous row, the
-- current row and the next row (rows ordered by orderdate). At either end of
-- a partition the frame simply holds fewer rows.
SELECT
    name,
    orderdate,
    cost,
    SUM(cost) OVER (
        PARTITION BY name
        ORDER BY orderdate
        ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
    ) AS ev_con_cost
FROM business;
-- Per customer, sum the current row and every later row of the partition
-- (rows ordered by orderdate), i.e. a "remaining total" that shrinks row by row.
SELECT
    name,
    orderdate,
    cost,
    SUM(cost) OVER (
        PARTITION BY name
        ORDER BY orderdate
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
    ) AS ev_foll_cost
FROM business;
-- Previous purchase date, deliberately without PARTITION BY: all rows are
-- ordered by orderdate and each row shows the orderdate of the row before it,
-- regardless of customer. The first row has no predecessor and no default is
-- passed to LAG, so it gets NULL.
SELECT
    name,
    orderdate,
    cost,
    LAG(orderdate, 1) OVER (ORDER BY orderdate) AS lag_cost_time
FROM business;
-- Split all rows into 5 buckets by orderdate and keep only the first bucket,
-- i.e. the earliest purchases. The bucket number is computed in a subquery
-- because a window function result cannot be filtered in the same SELECT's
-- WHERE clause.
SELECT *
FROM (
    SELECT
        name,
        orderdate,
        cost,
        NTILE(5) OVER (ORDER BY orderdate) AS n
    FROM business
) tbl
WHERE n = 1;
需求:找出哪些顾客连续两(或n)天来过我的店。
-- Which customers came on two (or n) consecutive days?
--
-- Technique: number each customer's visit days in date order, then subtract
-- the row number from the date. Days that are consecutive all map to the same
-- "anchor" date, so counting rows per (name, anchor) gives the streak length.
--
-- The trick only works with ONE row per customer per day. Several purchases on
-- the same day would shift the row numbers, and non-adjacent dates could then
-- share an anchor (day 1, day 1, day 3 -> anchors day 0, day -1, day 0).
-- GROUP BY name, orderdate therefore collapses same-day purchases first; the
-- window function is evaluated after the GROUP BY.

-- Step 1: one row per customer per day, numbered by date within each customer.
SELECT
    name,
    orderdate,
    ROW_NUMBER() OVER (PARTITION BY name ORDER BY orderdate) AS n
FROM business
GROUP BY name, orderdate;

-- Step 2: subtract the row number from the date. Rows of one customer with
-- the same ds belong to the same run of consecutive days.
SELECT
    name,
    orderdate,
    n,
    DATE_SUB(orderdate, n) AS ds
FROM (
    SELECT
        name,
        orderdate,
        ROW_NUMBER() OVER (PARTITION BY name ORDER BY orderdate) AS n
    FROM business
    GROUP BY name, orderdate
) t1;

-- Step 3: count the days in each run; a count of at least 2 (or n) means the
-- customer came on that many consecutive days. A customer with several
-- qualifying runs appears once per run.
SELECT
    name,
    COUNT(*) AS c
FROM (
    SELECT
        name,
        DATE_SUB(orderdate, n) AS ds
    FROM (
        SELECT
            name,
            orderdate,
            ROW_NUMBER() OVER (PARTITION BY name ORDER BY orderdate) AS n
        FROM business
        GROUP BY name, orderdate
    ) t1
) t2
GROUP BY name, ds
HAVING COUNT(*) >= 2;
有如下数据:
name subject score
孙悟空 语文 87
孙悟空 数学 95
孙悟空 英语 68
大海 语文 94
大海 数学 56
大海 英语 84
宋宋 语文 64
宋宋 数学 86
宋宋 英语 84
婷婷 语文 65
婷婷 数学 85
婷婷 英语 78
-- Rank the rows of each subject by score, highest first, in three ways:
--   rk  : RANK()       - ties share a rank and the following ranks are skipped
--   drk : DENSE_RANK() - ties share a rank and no ranks are skipped
--   rnm : ROW_NUMBER() - consecutive numbers with no repeats, even for ties
-- All three functions use the same window, so it is declared once in the
-- WINDOW clause and referenced by name.
SELECT
    subject,
    name,
    score,
    RANK()       OVER w AS rk,
    DENSE_RANK() OVER w AS drk,
    ROW_NUMBER() OVER w AS rnm
FROM score
WINDOW w AS (PARTITION BY subject ORDER BY score DESC);