-- 目录
-- SUM,AVG,MIN,MAX,NTILE,
-- ROW_NUMBER,RANK,DENSE_RANK,
-- CUME_DIST,PERCENT_RANK,LAG,LEAD,
-- FIRST_VALUE,LAST_VALUE,GROUPING SETS,GROUPING__ID,
-- CUBE,ROLLUP
--Hive分析窗口函数(一) SUM,AVG,MIN,MAX
--Hive分析窗口函数(二) NTILE,ROW_NUMBER,RANK,DENSE_RANK
--HIVE窗口函数(三)
--Hive分析窗口函数(四)
-- Create the tutorial database and switch to it.
-- IF NOT EXISTS keeps the script re-runnable: a plain CREATE DATABASE fails
-- when the database is already present.
CREATE DATABASE IF NOT EXISTS win;
USE win;
-- List the tables currently present in the database.
SHOW TABLES;
-- Source table for the SUM / AVG / MIN / MAX window examples.
-- Default window frames: with ORDER BY inside OVER the function covers the
-- partition start up to the current row, and without ORDER BY it covers every
-- row of the partition.
DROP TABLE IF EXISTS win1;

CREATE EXTERNAL TABLE win1 (
    cookieid   STRING,
    createtime STRING, -- day
    pv         INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/winLearn/win1/';

-- Show the resulting schema.
DESC win1;

-- Replace the table contents with the local sample file.
LOAD DATA LOCAL INPATH '/root/win1' OVERWRITE INTO TABLE win1;

SELECT * FROM win1;
-- SUM as a window function under different frame specifications.
--   PRECEDING            rows before the current row
--   FOLLOWING            rows after the current row
--   CURRENT ROW          the current row
--   UNBOUNDED PRECEDING  the start of the partition
--   UNBOUNDED FOLLOWING  the end of the partition
SELECT
    cookieid,
    createtime,
    pv,
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime) AS pv1, -- default frame: partition start to current row
    SUM(pv) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS pv2, -- explicit start-to-current frame, equals pv1 when createtime has no ties within a cookie
    SUM(pv) OVER (PARTITION BY cookieid) AS pv3, -- all rows of the partition
    SUM(pv) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
    ) AS pv4, -- current row plus the 3 rows before it
    SUM(pv) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
        ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING
    ) AS pv5, -- current row plus 3 rows before and 1 row after
    SUM(pv) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
    ) AS pv6 -- current row plus every row after it
FROM win1
ORDER BY createtime;
-- Source table for the NTILE / ROW_NUMBER / RANK / DENSE_RANK examples.
-- Dropping the table first keeps the script re-runnable (same approach as the
-- win1 setup): CREATE would otherwise fail once win2 exists.
DROP TABLE IF EXISTS win2;

CREATE EXTERNAL TABLE win2 (
    cookieid   STRING,
    createtime STRING, -- day
    pv         INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/winLearn/win2/';

-- OVERWRITE replaces the table contents, so running the script again does not
-- append the same sample rows a second time.
LOAD DATA LOCAL INPATH '/root/win2' OVERWRITE INTO TABLE win2;

SELECT * FROM win2;
-- NTILE(n) splits the ordered rows of each partition into n buckets and
-- returns the bucket number of every row.
SELECT
    cookieid,
    createtime,
    pv,
    NTILE(2) OVER (PARTITION BY cookieid ORDER BY createtime) AS rn1, -- 2 buckets per cookie
    NTILE(3) OVER (PARTITION BY cookieid ORDER BY createtime) AS rn2, -- 3 buckets per cookie
    NTILE(4) OVER (ORDER BY createtime)                       AS rn3  -- 4 buckets over all rows
FROM win2
ORDER BY
    cookieid,
    createtime;
-- ROW_NUMBER() numbers the rows inside each partition, here from the highest
-- pv to the lowest within every cookie.
SELECT
    cookieid,
    createtime,
    pv,
    ROW_NUMBER() OVER (
        PARTITION BY cookieid
        ORDER BY pv DESC
    ) AS rn
FROM win2;
-- RANK versus DENSE_RANK, with ROW_NUMBER for comparison.
--   RANK()        ties share a rank and the following ranks are skipped
--   DENSE_RANK()  ties share a rank and no ranks are skipped
--   ROW_NUMBER()  every row gets its own number, ties included
SELECT
    cookieid,
    createtime,
    pv,
    RANK()       OVER (PARTITION BY cookieid ORDER BY pv DESC) AS rn1,
    DENSE_RANK() OVER (PARTITION BY cookieid ORDER BY pv DESC) AS rn2,
    ROW_NUMBER() OVER (PARTITION BY cookieid ORDER BY pv DESC) AS rn3
FROM win2
WHERE cookieid = 'cookie1';
-- Source table for the CUME_DIST / PERCENT_RANK examples.
-- Dropping the table first keeps the script re-runnable (same approach as the
-- win1 setup): CREATE would otherwise fail once win3 exists.
DROP TABLE IF EXISTS win3;

CREATE EXTERNAL TABLE win3 (
    dept   STRING,
    userid STRING,
    sal    INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/tmp/win3/';

-- OVERWRITE replaces the table contents, so running the script again does not
-- append the same sample rows a second time.
LOAD DATA LOCAL INPATH '/root/win3' OVERWRITE INTO TABLE win3;

SELECT * FROM win3;
-- CUME_DIST: share of rows in the window whose sal is less than or equal to
-- the sal of the current row.
SELECT
    dept,
    userid,
    sal,
    CUME_DIST() OVER (ORDER BY sal)                   AS rn1, -- over all rows
    CUME_DIST() OVER (PARTITION BY dept ORDER BY sal) AS rn2  -- within each dept
FROM win3;
-- PERCENT_RANK: relative position of the current row by RANK, computed as
-- (rank - 1) / (rows in the window - 1). rn11 and rn12 expose the two inputs
-- behind rn1.
SELECT
    dept,
    userid,
    sal,
    PERCENT_RANK() OVER (ORDER BY sal)                   AS rn1,  -- percent rank over all rows
    RANK()         OVER (ORDER BY sal)                   AS rn11, -- RANK over all rows
    SUM(1)         OVER (PARTITION BY NULL)              AS rn12, -- total number of rows
    PERCENT_RANK() OVER (PARTITION BY dept ORDER BY sal) AS rn2   -- percent rank within each dept
FROM win3;
-- Source table for the LAG / LEAD / FIRST_VALUE / LAST_VALUE examples.
-- Dropping the table first keeps the script re-runnable (same approach as the
-- win1 setup): CREATE would otherwise fail once win4 exists.
DROP TABLE IF EXISTS win4;

CREATE EXTERNAL TABLE win4 (
    cookieid   STRING,
    createtime STRING, -- page visit time
    url        STRING  -- visited page
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/winLearn/win4/';

-- Replace the table contents with the local sample file.
LOAD DATA LOCAL INPATH '/root/win4' OVERWRITE INTO TABLE win4;

SELECT * FROM win4;
-- LAG(column, n, default): value of the column n rows before the current row
-- inside the window. The default is returned when no such row exists, and
-- NULL when no default is given.
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER visits                             AS rn,
    LAG(createtime, 1, '1970-01-01 00:00:00') OVER visits AS last_1_time,
    LAG(createtime, 2) OVER visits                       AS last_2_time
FROM win4
WINDOW visits AS (PARTITION BY cookieid ORDER BY createtime);
-- LEAD(column, n, default): value of the column n rows after the current row
-- inside the window. The default is returned when no such row exists, and
-- NULL when no default is given.
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER visits                              AS rn,
    LEAD(createtime, 1, '1970-01-01 00:00:00') OVER visits AS next_1_time,
    LEAD(createtime, 2) OVER visits                       AS next_2_time
FROM win4
WINDOW visits AS (PARTITION BY cookieid ORDER BY createtime);
-- FIRST_VALUE and LAST_VALUE need ORDER BY inside OVER: without it the rows of
-- a partition are not sorted by time, so the first and last values returned
-- do not follow time order.
-- To get the last value of the whole partition, use FIRST_VALUE with the
-- ordering reversed.
-- FIRST_VALUE(): first value of the column within the partition, taken over
-- the rows up to the current row.
-- Reads from win4, the table with cookieid / createtime / url that this
-- script creates (the previous table name lxw1234 is never created here).
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER (PARTITION BY cookieid ORDER BY createtime) AS rn,
    FIRST_VALUE(url) OVER (PARTITION BY cookieid ORDER BY createtime) AS first1
FROM win4;
-- LAST_VALUE(): last value of the column within the partition, taken over the
-- rows up to the current row (default frame with ORDER BY).
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER() OVER (
        PARTITION BY cookieid
        ORDER BY createtime
    ) AS rn,
    LAST_VALUE(url) OVER (
        PARTITION BY cookieid
        ORDER BY createtime
    ) AS last1
FROM win4;
-- Window functions, part 5: GROUPING SETS, GROUPING__ID, CUBE, ROLLUP.
-- Source table for these examples.
-- Dropping the table first keeps the script re-runnable (same approach as the
-- win1 setup): CREATE would otherwise fail once win5 exists.
DROP TABLE IF EXISTS win5;

CREATE EXTERNAL TABLE win5 (
    month    STRING,
    day      STRING,
    cookieid STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/tmp/win5/';

-- OVERWRITE replaces the table contents, so running the script again does not
-- append the same sample rows a second time.
LOAD DATA LOCAL INPATH '/root/win5' OVERWRITE INTO TABLE win5;

SELECT * FROM win5;
-- GROUPING SETS: aggregate once per listed grouping (here by month and by day)
-- and return the union of the results. GROUPING__ID tells which grouping a
-- row belongs to and is used as the sort key.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM win5
GROUP BY
    month,
    day
GROUPING SETS (month, day)
ORDER BY GROUPING__ID;
-- WITH CUBE: aggregate over every combination of the GROUP BY columns, an
-- extended form of GROUPING SETS. GROUPING__ID is selected alongside to tell
-- the grouping levels apart and is available because the query uses GROUP BY.
-- Reads from win5, the table with month / day / cookieid that this script
-- creates (the previous table name lxw1234 is never created here).
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM win5
GROUP BY month, day
WITH CUBE
ORDER BY GROUPING__ID;
-- WITH ROLLUP: aggregate hierarchically, led by the leftmost GROUP BY column.
-- The result is the subset of the CUBE groupings that keep the left-hand
-- column prefix.
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM win5
GROUP BY
    month,
    day
WITH ROLLUP
ORDER BY GROUPING__ID;