-- 窗口函数用于对 -Query出来的数据- 进行 -分窗口- 的 -动作-.
-- 这句话有3个重点:
-- 1. Query出来的数据
-- 2. 分窗口
-- 3. 动作
-- 我们来看下标准sql的写法(方括号内为可选部分):
-- select *,action(col) over([partition by col] [order by col] [rows between UNBOUNDED PRECEDING and CURRENT ROW])
-- from table;
-- 从上面的sql来看, 窗口(over)默认覆盖整个查询结果, 但是可以对窗口进行分区(partition by)、排序(order by)、限定行范围(rows between), 然后对窗口内部的数据执行action操作.
-- 其中, action操作可以是sum()、count()等聚合操作.
-- 也可以是LAG(col, n)、LEAD(col, n)这种取窗口片段的操作.
-- 还可以是NTILE(n)这种把窗口内的数据按顺序分成n份的操作.
-- 相关sql伪代码如下:
-- select *,
-- sum(col) over(),
-- count(col) over(),
-- lag(col,n) over(partition by col order by col),
-- lead(col,n) over(partition by col order by col),
-- max(col) over(partition by col order by col rows between UNBOUNDED PRECEDING and CURRENT ROW),
-- ntile(n) over(order by col)
-- from table;
-- 1.相关函数说明
-- OVER():指定分析函数工作的数据窗口大小,这个数据窗口大小可能会随着行的变化而变化
-- CURRENT ROW:当前行
-- n PRECEDING:往前n行数据
-- n FOLLOWING:往后n行数据
-- UNBOUNDED:起点,UNBOUNDED PRECEDING 表示从前面的起点, UNBOUNDED FOLLOWING表示到后面的终点
-- LAG(col,n):往前第n行数据
-- LEAD(col,n):往后第n行数据
-- NTILE(n):把有序分区中的行分发到指定数量的组中,各个组有编号,编号从1开始,对于每一行,NTILE返回此行所属的组的编号。注意:n必须为int类型。
-- 3.需求
-- (1)查询在2017年4月份购买过的顾客及总人数
-- (2)查询顾客的购买明细及月购买总额
-- (3)上述的场景,要将cost按照日期进行累加
-- (4)查询顾客上次的购买时间
-- (5)查询前20%时间的订单信息
-- table: business
-- name,orderdate,cost
-- jack,2017-01-01,10
-- tony,2017-01-02,15
-- jack,2017-02-03,23
-- tony,2017-01-04,29
-- jack,2017-01-05,46
-- jack,2017-04-06,42
-- tony,2017-01-07,50
-- jack,2017-01-08,55
-- mart,2017-04-08,62
-- mart,2017-04-09,68
-- neil,2017-05-10,12
-- mart,2017-04-11,75
-- neil,2017-06-12,80
-- mart,2017-04-13,94
-- 1.相关函数说明
-- OVER():指定分析函数工作的数据窗口大小,这个数据窗口大小可能会随着行的变化而变化
-- CURRENT ROW:当前行
-- n PRECEDING:往前n行数据
-- n FOLLOWING:往后n行数据
-- UNBOUNDED:起点,UNBOUNDED PRECEDING 表示从前面的起点, UNBOUNDED FOLLOWING表示到后面的终点
-- LAG(col,n):往前第n行数据
-- LEAD(col,n):往后第n行数据
-- NTILE(n):把有序分区中的行分发到指定数量的组中,各个组有编号,编号从1开始,对于每一行,NTILE返回此行所属的组的编号。注意:n必须为int类型。
-- Purchase table queried by the statements in this file: one row per purchase.
-- Stored as a comma-delimited text file whose fields arrive in the order
-- name,orderdate,cost.
-- IF NOT EXISTS keeps the script re-runnable: without it a second run aborts
-- here because the table is already present.
create table if not exists business(
    name string,       -- customer name
    orderdate string,  -- purchase date kept as text in yyyy-MM-dd form (e.g. 2017-01-01)
    cost int           -- purchase amount
) row format delimited fields terminated by ','
stored as textfile;
-- 1)查询在2017年4月份购买过的顾客及总人数
-- 1.1) 找出2017年4月购买过的人的明细
-- Purchase rows dated April 2017. One output row per purchase, so a customer
-- who bought several times that month appears several times.
select
    name,
    orderdate
from business
where year(orderdate) = 2017
  and month(orderdate) = 4;
-- 也可以用substring(orderdate,1,7)='2017-04';
-- Customers who bought in April 2017, each paired with the number of such customers.
-- GROUP BY reduces the input to one row per name before the window is evaluated, so
-- the window has to span the whole grouped result. Partitioning by name again would
-- leave exactly one row in every partition and the count would always be 1.
select
    name,
    count(*) over () as customer_cnt
from business
where substring(orderdate, 1, 7) = '2017-04'
group by name;
-- 1.2) 统计总人数
-- 窗口函数在 group by 之后才计算: 不分组时 count(*) over() 数的是4月的购买记录数, 先按 name 分组后数的才是购买过的顾客人数
-- No GROUP BY: every April 2017 purchase row is returned, and the empty OVER ()
-- counts those rows, i.e. purchases rather than distinct customers.
select
    name,
    count(*) over ()
from business
where year(orderdate) = 2017
  and month(orderdate) = 4;
-- Same filter, but grouped by name first: one row per customer, and the empty
-- OVER () now counts the grouped rows, i.e. the customers who bought in April 2017.
select
    name,
    count(*) over ()
from business
where year(orderdate) = 2017
  and month(orderdate) = 4
group by name;
-- 2)查询顾客的购买明细及月购买总额
-- Monthly purchase totals via plain aggregation. GROUP BY collapses each month to a
-- single row, so only the grouping expression and aggregates can be selected; the
-- per-purchase columns (name, orderdate, cost) cannot be returned by this form.
select
    substring(orderdate, 1, 7) as order_month,
    sum(cost) as month_cost
from business
group by substring(orderdate, 1, 7);

-- Window-function warm-up: every purchase row alongside the overall maximum cost
-- and the orderdate of the preceding row. LAG is given an explicit ORDER BY because
-- "preceding" is not defined for an unordered window.
select
    *,
    max(cost) over (),
    lag(orderdate, 1) over (order by orderdate)
from business;
-- Purchase details plus the total spent in the same calendar month (yyyy-MM prefix).
-- The window deliberately has no ORDER BY: with one, the default frame ends at the
-- current row and its peers, which turns the monthly total into a running total.
select
    *,
    sum(cost) over (partition by substring(orderdate, 1, 7)) as month_cost
from business;
-- 3)上述的场景,要将cost按照日期进行累加
-- Baseline before accumulating: the empty OVER () puts the grand total of cost on
-- every row; the trailing ORDER BY only sorts the output by date.
select
    *,
    sum(cost) over ()
from business
order by orderdate;
-- Running total over all purchases in date order: first row through the current row.
select
    *,
    sum(cost) over (
        order by orderdate
        rows between unbounded preceding and current row
    )
from business;
-- Three-row sliding sum in date order: previous row, current row and next row.
select
    *,
    sum(cost) over (
        order by orderdate
        rows between 1 preceding and 1 following
    )
from business;
-- Two-row leading sum in date order: current row plus the next row.
select
    *,
    sum(cost) over (
        order by orderdate
        rows between current row and 1 following
    )
from business;
-- Two-row trailing sum in date order: the previous row plus the current row.
-- This is the mirror image of the "current row and 1 following" frame, which this
-- statement would otherwise just repeat verbatim.
select
    *,
    sum(cost) over (
        order by orderdate
        rows between 1 preceding and current row
    )
from business;
-- Per-customer running total: within each name, sum cost from that customer's
-- first purchase through the current one, in date order.
select
    *,
    sum(cost) over (
        partition by name
        order by orderdate
        rows between unbounded preceding and current row
    )
from business;
-- Per-customer remaining total: within each name, sum cost from the current
-- purchase through that customer's last one, in date order.
select
    *,
    sum(cost) over (
        partition by name
        order by orderdate
        rows between current row and unbounded following
    )
from business;
--4)查询顾客上次的购买时间
-- Each purchase with the same customer's previous purchase date
-- (one row back within the name partition, ordered by orderdate).
select
    *,
    lag(orderdate, 1) over (
        partition by name
        order by orderdate
    )
from business;
-- 查询顾客下次的购买时间
-- Each purchase with the same customer's next purchase date
-- (one row ahead within the name partition, ordered by orderdate).
select
    *,
    lead(orderdate, 1) over (
        partition by name
        order by orderdate
    )
from business;
-- Same look-back with an offset of 6: the orderdate six purchases earlier for the
-- same customer. No default value is passed to LAG, so rows that do not have six
-- earlier purchases in their partition get NULL.
select
    *,
    lag(orderdate, 6) over (
        partition by name
        order by orderdate
    )
from business;
-- 同时查询顾客上一次和下一次的购买时间
-- Previous and next purchase date side by side. Both functions read the same
-- window (per customer, in date order), so it is declared once and shared.
select
    *,
    lag(orderdate, 1) over w,
    lead(orderdate, 1) over w
from business
window w as (partition by name order by orderdate);
--5)查询前20%时间的订单信息
-- Earliest 20% of orders: NTILE(5) splits the rows, ordered by orderdate, into five
-- numbered buckets starting at 1; bucket 1 holds the earliest fifth.
-- A window result cannot be filtered in the same SELECT's WHERE, so the bucket
-- number is computed in a CTE first and filtered in the outer query.
with t as (
    select
        *,
        ntile(5) over (order by orderdate) as ntile
    from business
)
select *
from t
where t.ntile = 1;