数据格式
zhm,2018-04-10,12
zhm,2018-04-11,51
zhm,2018-04-12,72
zhm,2018-04-13,31
zhm,2018-04-14,21
zhm,2018-04-15,14
zhm,2018-04-16,44
创建表
create table fun_table1
(username string, createtime string, pv int)
row format delimited
fields terminated by ',';
加入本地数据
load data local inpath "/home/zhm/test1.txt" into table fun_table1;
select
username,
createtime,
pv,
pv1: 分组内从起点到当前行的pv累积,如,11号的pv1=10号的pv+11号的pv, 12号=10号+11号+12号
sum(pv) over (partition by username order by createtime rows between unbounded preceding and current row) as pv1,
pv2: 同pv1
sum(pv) over (partition by username order by createtime) as pv2,
pv3: 分组内(cookie1)所有的pv累加
sum(pv) over (partition by username) as pv3,
pv4: 分组内当前行+往前3行,如,11号=10号+11号, 12号=10号+11号+12号, 13号=10号+11号+12号+13号, 14号=11号+12号+13号+14号
sum(pv) over (partition by username order by createtime rows between 3 preceding and current row) as pv4,
pv5: 分组内当前行+往前3行+往后1行,如,14号=11号+12号+13号+14号+15号=5+7+3+2+4=21
sum(pv) over (partition by username order by createtime rows between 3 preceding and 1 following) as pv5,
pv6: 分组内当前行+往后所有行,如,13号=13号+14号+15号+16号=3+2+4+4=13,14号=14号+15号+16号=2+4+4=10
sum(pv) over (partition by username order by createtime rows between current row and unbounded following) as pv6
from fun_table1;
如果不指定ROWS BETWEEN,默认为从起点到当前行;
如果不指定ORDER BY,则将分组内所有值累加;
关键是理解ROWS BETWEEN含义,也叫做WINDOW子句:
PRECEDING:往前
FOLLOWING:往后
CURRENT ROW:当前行
UNBOUNDED:起点,
UNBOUNDED PRECEDING 表示从前面的起点,
UNBOUNDED FOLLOWING:表示到后面的终点
其他AVG,MIN,MAX,和SUM用法一样
ROW_NUMBER() 从1开始,按照顺序,生成分组内记录的序列
比如,按照pv降序排列,生成分组内每天的pv名次
分组排序
select
username,
createtime,
pv,
row_number() over (partition by username order by pv desc) as rn
from fun_table1;
RANK 和 DENSE_RANK
—RANK() 生成数据项在分组中的排名,排名相等会在名次中留下空位
—DENSE_RANK() 生成数据项在分组中的排名,排名相等会在名次中不会留下空位
select
username,
createtime,
pv,
rank() over (partition by username order by pv desc) as rn1,
dense_rank() over (partition by username order by pv desc) as rn2,
row_number() over (partition by username order by pv desc) as rn3
from fun_table1
where cookieid='zhm';
row_number: 按顺序编号,不留空位
rank: 按顺序编号,相同的值编相同号,留空位
dense_rank: 按顺序编号,相同的值编相同的号,不留空位
CUME_DIST : 小于等于当前值的行数/分组内总行数
比如,统计小于等于当前薪水的人数,所占总人数的比例
create table cookie3(dept string, userid string, sal int)
row format delimited fields terminated by ',';
select
dept,
userid,
sal,
cume_dist() over (order by sal) as rn1,
cume_dist() over (partition by dept order by sal) as rn2
from cookie.cookie3;