Hive&Mysql开窗函数

文章目录

          • SUM AVG MIN MAX
          • rank row_number dense_rank
          • ntile - 将数据按照指定的顺序分成几部分
          • PERCENT_RANK 百分比rank
          • CUME_DIST 小于等于自己的比例
          • LAST_VALUE & FIRST_VALUE 截止到当前最后一个值
          • LAG & LEAD取前几行的值
          • Mysql实现开窗 [不好用建议迁移oracle或者impala]

SUM AVG MIN MAX
-- Running, whole-partition and sliding sums of pv per id, ordered by date_time.
-- `表` is the placeholder table name this article uses; substitute the real table.
SELECT
    id,
    date_time,
    pv,

    -- ORDER BY with no explicit frame: the default frame is
    -- RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, so this is a running
    -- sum from the first row of the partition to the current row
    -- (rows that tie on date_time are peers and are summed together).
    SUM(pv) OVER (PARTITION BY id ORDER BY date_time) AS pv1,

    -- Same running sum, with the frame spelled out row by row.
    SUM(pv) OVER (
        PARTITION BY id ORDER BY date_time
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS pv2,

    -- No ORDER BY: the frame is the whole partition (first row to last row).
    SUM(pv) OVER (PARTITION BY id) AS pv3,

    -- From 3 rows before the current row up to the current row.
    SUM(pv) OVER (
        PARTITION BY id ORDER BY date_time
        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
    ) AS pv4,

    -- From 3 rows before the current row up to 1 row after it.
    SUM(pv) OVER (
        PARTITION BY id ORDER BY date_time
        ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING
    ) AS pv5,

    -- From the current row to the last row of the partition.
    SUM(pv) OVER (
        PARTITION BY id ORDER BY date_time
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
    ) AS pv6
FROM 表;
 
id date_time     pv      pv1     pv2     pv3     pv4     pv5      pv6 
-----------------------------------------------------------------------------
1  20191101      1       1       1       26      1       6       26
1  20191102      5       6       6       26      6       13      25
1  20191103      7       13      13      26      13      16      20
1  20191104      3       16      16      26      16      18      13
1  20191105      2       18      18      26      17      21      10
1  20191106      4       22      22      26      16      20      8
1  20191107      4       26      26      26      13      13      4

-- 讲解
ROWS BETWEEN ... AND
-- 限定前几行 ... 后几行
	PRECEDING 往前
	FOLLOWING 往后
	CURRENT ROW 当前行
	UNBOUNDED PRECEDING 表示第一行
	UNBOUNDED FOLLOWING 表示最后一行
ORDER BY
-- 排序方式;不指定 ORDER BY 时,窗口默认为整个分区,即对分区内所有值求和(如上面的 pv3)
rank row_number dense_rank
-- Compare the three ranking functions over the same window:
-- rows of each id ordered by pv, highest first.
SELECT
    id,
    date_time,
    pv,
    -- RANK: tied rows share a rank and the following rank is skipped.
    RANK()       OVER (PARTITION BY id ORDER BY pv DESC) AS rk1,
    -- DENSE_RANK: tied rows share a rank and no rank is skipped.
    DENSE_RANK() OVER (PARTITION BY id ORDER BY pv DESC) AS rk2,
    -- ROW_NUMBER: every row gets a distinct sequential number, ties included.
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY pv DESC) AS rk3
FROM 表
 
id date_time           pv       rk1     rk2     rk3 
-------------------------------------------------- 
1 20191101      7       1       1       1
1 20191102      5       2       2       2
1 20191103      4       3       3       3
1 20191104      4       3       3       4
1 20191105      3       5       4       5
1 20191106      2       6       5       6
1 20191107      1       7       6       7
ntile - 将数据按照指定的顺序分成几部分
-- Split the ordered rows of each window into N numbered buckets with NTILE.
-- `表` is the placeholder table name this article uses; substitute the real table.
SELECT
    id,
    date_time,
    pv,
    -- NTILE(n) numbers the buckets 1..n. When the rows do not divide evenly,
    -- the leftover rows go one apiece to the lowest-numbered buckets
    -- (e.g. 7 rows into 3 buckets -> sizes 3, 2, 2), not all to bucket 1.
    NTILE(2) OVER (PARTITION BY id ORDER BY date_time) AS n1,
    NTILE(3) OVER (PARTITION BY id ORDER BY date_time) AS n2,
    -- No PARTITION BY: all rows of the result form a single window.
    NTILE(4) OVER (ORDER BY date_time) AS n3
FROM 表
-- Evaluation order noted by the author:
-- from -> where -> group -> aggregate functions -> having -> arithmetic functions -> select -> order
ORDER BY id, date_time;
 
id date_time           pv       n1     n2     n3
-------------------------------------------------
1 20191101      1       1       1       1
1 20191102      5       1       1       1
1 20191103      7       1       1       2
1 20191104      3       1       2       2
1 20191105      2       2       2       3
1 20191106      4       2       3       3
1 20191107      4       2       3       4
2 20191101      2       1       1       1
2 20191102      3       1       1       1
2 20191103      5       1       1       2
2 20191104      6       1       2       2
2 20191105      3       2       2       3
2 20191106      9       2       3       4
2 20191107      7       2       3       4
PERCENT_RANK 百分比rank
-- PERCENT_RANK of sal over the whole result (n1) and within each dept (n2),
-- with the rank (n11) and total row count (n12) that n1 is derived from.
-- `表` is the placeholder table name this article uses; substitute the real table.
SELECT
    dept,
    id,
    sal,
    -- (rank - 1) / (rows in window - 1) over all rows.
    PERCENT_RANK() OVER (ORDER BY sal) AS n1,
    -- Rank over all rows: the numerator input of n1.
    RANK() OVER (ORDER BY sal) AS n11,
    -- Total row count: the constant partition key puts every row in one
    -- window, and summing 1 per row counts them.
    SUM(1) OVER (PARTITION BY NULL) AS n12,
    -- Same percentage rank, computed separately inside each dept.
    PERCENT_RANK() OVER (PARTITION BY dept ORDER BY sal) AS n2
FROM 表;
 
dept    id   sal    n1    n11     n12    n2
---------------------------------------------------
1      1   1000    0.0     1       5       0.0
1      2   2000    0.25    2       5       0.5
1      3   3000    0.5     3       5       1.0
2      4   4000    0.75    4       5       0.0
2      5   5000    1.0     5       5       1.0
 
-- n1 = (n11-1) / (n12-1) 
-- (1-1)/(5-1)=0/4=0
-- (2-1)/(5-1)=1/4=0.25
-- (4-1)/(5-1)=3/4=0.75
CUME_DIST 小于等于自己的比例
-- CUME_DIST: fraction of rows in the window whose sal is <= the current row's sal.
-- `表` is the placeholder table name this article uses; substitute the real table.
SELECT
    dept,
    id,
    sal,
    -- Over all rows (no PARTITION BY, so the result set is one window).
    CUME_DIST() OVER (ORDER BY sal) AS n1,
    -- Within each dept.
    CUME_DIST() OVER (PARTITION BY dept ORDER BY sal) AS n2
FROM 表;
 
dept    id   sal   n1        n2 
-------------------------------------------
1      1   1000    0.2     0.3333333333333333
1      2   2000    0.4     0.6666666666666666
1      3   3000    0.6     1.0
2      4   4000    0.8     0.5
2      5   5000    1.0     1.0
 
n1: 没有partition,所有数据均为1组,总行数为5,
     第一行:小于等于1000的行数为1,因此,1/5=0.2
     第三行:小于等于3000的行数为3,因此,3/5=0.6
n2: 按照部门分组,dept=1的行数为3,
     第二行:小于等于2000的行数为2,因此,2/3=0.6666666666666666
LAST_VALUE & FIRST_VALUE 截止到当前最后一个值
-- LAST_VALUE of url per id in date_time order, next to the row number.
-- `表` is the placeholder table name this article uses; substitute the real table.
SELECT
    id,
    date_time,
    url,
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY date_time) AS rn,
    -- With ORDER BY and no explicit frame, the frame ends at the current row
    -- (default RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW), so this is
    -- the last value "so far", not the last value of the partition.
    -- For the partition's final value, add
    -- ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.
    LAST_VALUE(url) OVER (PARTITION BY id ORDER BY date_time) AS last1
FROM 表;
 
 
id  date_time    url    rn       last1  
------------------------------------------------
1 20191001     url1    1       url1
1 20191001     url2    2       url2
1 20191001     url3   3       url3
1 20191001     url4    4       url4
1 20191001     url5    5       url5
1 20191001     url6    6       url6
1 20191001     url7    7       url7
2 20191001     url11   1       url11
2 20191001     url22   2       url22
2 20191001     url33  3       url33
2 20191001     url44   4       url44
2 20191001     url55   5       url55
2 20191001     url66   6       url66
2 20191001     url77   7       url77
LAG & LEAD取前几行的值
-- LAG: read date_time from earlier rows of the same id, in date_time order.
-- `表` is the placeholder table name this article uses; substitute the real table.
SELECT
    id,
    date_time,
    url,
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY date_time) AS rn,
    -- Value from 1 row back; when no such row exists, use the supplied default.
    LAG(date_time, 1, '1970-01-01 00:00:00')
        OVER (PARTITION BY id ORDER BY date_time) AS t1,
    -- Value from 2 rows back; no default given, so a missing row yields NULL.
    LAG(date_time, 2)
        OVER (PARTITION BY id ORDER BY date_time) AS t2
FROM 表;
 
 
id date_time             url    rn       t1             t2
-------------------------------------------------------------------------------------------
1 2019-11-10 10:00:00     url1    1       1970-01-01 00:00:00     NULL
1 2019-11-10 10:00:02     url2    2       2019-11-10 10:00:00     NULL
1 2019-11-10 10:03:04     url3   3       2019-11-10 10:00:02     2019-11-10 10:00:00
1 2019-11-10 10:10:00     url4    4       2019-11-10 10:03:04     2019-11-10 10:00:02
1 2019-11-10 10:50:01     url5    5       2019-11-10 10:10:00     2019-11-10 10:03:04
1 2019-11-10 10:50:05     url6    6       2019-11-10 10:50:01     2019-11-10 10:10:00
1 2019-11-10 11:00:00     url7    7       2019-11-10 10:50:05     2019-11-10 10:50:01
2 2019-11-10 10:00:00     url11   1       1970-01-01 00:00:00     NULL
2 2019-11-10 10:00:02     url22   2       2019-11-10 10:00:00     NULL
2 2019-11-10 10:03:04     url33  3       2019-11-10 10:00:02     2019-11-10 10:00:00
2 2019-11-10 10:10:00     url44   4       2019-11-10 10:03:04     2019-11-10 10:00:02
2 2019-11-10 10:50:01     url55   5       2019-11-10 10:10:00     2019-11-10 10:03:04
2 2019-11-10 10:50:05     url66   6       2019-11-10 10:50:01     2019-11-10 10:10:00
2 2019-11-10 11:00:00     url77   7       2019-11-10 10:50:05     2019-11-10 10:50:01
 
 
t1: 指定了往上第1行的值,default'1970-01-01 00:00:00'  
             1第一行,往上1行为NULL,因此取默认值 1970-01-01 00:00:00
             1第三行,往上1行值为第二行值,2019-11-10 10:00:02
             1第六行,往上1行值为第五行值,2019-11-10 10:50:01
t2: 指定了往上第2行的值,未指定默认值
		1第一行,往上2行为NULL
		1第二行,往上2行为NULL
		1第四行,往上2行为第二行值,2019-11-10 10:00:02
		1第七行,往上2行为第五行值,2019-11-10 10:50:01
Mysql实现开窗 [不好用建议迁移oracle或者impala]
-- Emulates a row number in MySQL with a user variable: the derived table
-- sorts the rows and sets @rowNum to 0, and the outer query increments it
-- once per row.
-- `表` is the placeholder table name this article uses; substitute the real table.
-- NOTE(review): MySQL 8.0 provides ROW_NUMBER() natively and deprecates
-- assigning user variables inside expressions; prefer the window function
-- where it is available.
-- NOTE(review): this relies on the derived table being fully evaluated, in
-- its ORDER BY order, before the outer SELECT list runs — confirm on the
-- target MySQL version.
select
pagemountedtime
,@rowNum:=@rowNum+1 -- running counter, one step per output row
from
(
	select
	pagemountedtime
	,@rowNum :=0 -- reset the counter to zero
	from 表
	order by
	pagemountedtime
) t

你可能感兴趣的:(03,Hive及数仓)