1、sum()over(partition by ...) (累加求和)
-- Running and sliding sums of pv within each cookieid, ordered by create_time.
-- Each column shows a different window frame ("rows between ..." clause).
select
    cookieid,
    create_time,
    pv,
    -- No explicit frame: defaults to "partition start .. current row".
    -- The default frame is RANGE-based, so rows tied on create_time are summed together.
    sum(pv) over(partition by cookieid order by create_time) as pv1,
    -- Explicit frame from partition start to current row; equals pv1 when create_time has no ties.
    sum(pv) over(partition by cookieid order by create_time rows between unbounded preceding and current row) as pv2,
    -- Current row plus the 3 rows before it.
    sum(pv) over(partition by cookieid order by create_time rows between 3 preceding and current row) as pv3,
    -- Current row plus the 3 rows before it and the 1 row after it.
    sum(pv) over(partition by cookieid order by create_time rows between 3 preceding and 1 following) as pv4,
    -- Current row plus every row after it.
    sum(pv) over(partition by cookieid order by create_time rows between current row and unbounded following) as pv5
from table_test;
注释:rows between 的含义(也叫做 window 子句):
preceding:往前
following:往后
current row:当前行
unbounded:无边界,unbounded preceding 表示从最前面的起点开始, unbounded following:表示到最后面的终点
-- 其他聚合函数 avg、min、max 的用法和 sum 一样
2、row_number()over(partition by ...) 按顺序排序 (排序eg:1,2,3,4,5,6)
-- Number the rows of each cookieid from 1, newest create_time first.
select
    cookieid,
    row_number() over(
        partition by cookieid
        order by create_time desc
    ) as rn
from table_test;
3、rank()over(partition by ...) (并列时名次相同,之后跳号,排序eg:1,2,2,4)
dense_rank()over(partition by ...) (并列时名次相同,之后不跳号,排序eg:1,2,2,3,3,4,5)
-- Compare the three ranking functions on the same ordering (pv descending, per cookieid).
select
    cookieid,
    create_time,
    pv,
    -- Ties share a rank and the following rank is skipped (1,2,2,4).
    rank() over(partition by cookieid order by pv desc) as rank1,
    -- Ties share a rank and no rank is skipped (1,2,2,3).
    dense_rank() over(partition by cookieid order by pv desc) as d_rank2,
    -- Every row gets a distinct sequential number, even on ties.
    row_number() over(partition by cookieid order by pv desc) as rn3
from table_test;
4、lag 和 lead 函数
lag(col,n,default) 用于取窗口内往上第n行的值
第一个参数为列名,第二个参数为往上第n行(可选,默认为1),第三个参数为默认值(当往上第n行不存在、即超出窗口范围时,取默认值;如不指定默认值,则为null)
-- Look back at earlier rows of the same cookieid, ordered by create_time.
select
    cookieid,
    create_time,
    pv,
    row_number() over(partition by cookieid order by create_time) as rn,
    -- create_time of the previous row; '1970-01-01' when there is no previous row.
    lag(create_time, 1, '1970-01-01') over(partition by cookieid order by create_time) as lag1,
    -- create_time of the row two positions back; null when there is no such row.
    lag(create_time, 2) over(partition by cookieid order by create_time) as lag2
from table_test;
lead 函数则与 lag 相反:
lead(col,n,default) 用于取窗口内往下第n行的值
第一个参数为列名,第二个参数为往下第n行(可选,默认为1),第三个参数为默认值(当往下第n行不存在、即超出窗口范围时,取默认值;如不指定默认值,则为null)
5、cume_dist
percent_rank
first_value 和 last_value