【Hive】SQL窗口函数实践

一、准备

-- Start from a clean slate so the whole setup script can be re-run.
DROP TABLE IF EXISTS db_test.merchants_turnover;

-- Demo table of turnover per merchant and date, stored as \001-delimited text.
CREATE TABLE IF NOT EXISTS db_test.merchants_turnover (
    merchant_name STRING         COMMENT '店铺名称',
    turnover      DECIMAL(10, 2) COMMENT '营业额',
    dt            STRING         COMMENT '日期yyyy-MM-dd'
)
COMMENT '商户营业额'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\001'
STORED AS TEXTFILE;

-- OVERWRITE replaces whatever the table currently holds with the local file's contents.
LOAD DATA LOCAL INPATH '/home/hadoop/merchants_turnover.txt'
    OVERWRITE INTO TABLE db_test.merchants_turnover;

按上述语句创建测试表(如下图),并加载测试数据。假数据由 Python 的 Faker 库生成,完整内容附在文末。
【Hive】SQL窗口函数实践_第1张图片

二、大纲

  1. SUM(),MIN(),MAX(),AVG()等聚合函数。
  2. NTILE(),ROW_NUMBER(),RANK(),DENSE_RANK()函数生成序号列。
  3. LAG(),LEAD(),FIRST_VALUE(),LAST_VALUE()函数在窗口中取指定位置的值。
  4. GROUPING SETS,WITH CUBE,WITH ROLLUP对GROUP BY的分组组合进行控制。

三、实践

1、SUM(),MIN(),MAX(),AVG()

-- 1. Aggregates such as SUM(), MIN(), MAX() and AVG() can be evaluated per window by appending OVER().
SELECT
    merchant_name,
    turnover,
    -- Month-to-date running total: restarts for every merchant and calendar month.
    SUM(turnover) OVER (
        PARTITION BY merchant_name, DATE_FORMAT(dt, 'yyyy-MM')
        ORDER BY dt
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS accu_turnover,
    -- Current row plus the three rows before it.
    SUM(turnover) OVER (
        PARTITION BY merchant_name
        ORDER BY dt
        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
    ) AS last_3_turnover,
    -- Everything from the merchant's first row up to and including the next row.
    SUM(turnover) OVER (
        PARTITION BY merchant_name
        ORDER BY dt
        ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING
    ) AS accu_next_turnover,
    dt
FROM db_test.merchants_turnover;
-- OVER() makes SUM() aggregate inside a bounded window of rows.
-- The three frames above are: start of month .. current row, three rows back .. current row,
-- and first row .. next row.
-- Frame syntax: ROWS BETWEEN <start> AND <end>, with rows ordered by the ORDER BY key (here dt).
-- Boundaries: n PRECEDING (n rows back), n FOLLOWING (n rows ahead), CURRENT ROW,
-- UNBOUNDED PRECEDING (no limit backwards).
-- Main difference from GROUP BY: with GROUP BY every other selected column must also be
-- grouped or aggregated, whereas OVER() partitions a single expression on its own.
-- NOTE: ROWS counts rows, not calendar days, so "three days back" and "next day" only hold
-- when each merchant has exactly one row per day.

【Hive】SQL窗口函数实践_第2张图片

2、NTILE(),ROW_NUMBER(),RANK(),DENSE_RANK()

-- 2. NTILE(), ROW_NUMBER(), RANK() and DENSE_RANK() add a sequence-number column to the result.
SELECT merchant_name,
       turnover,
       -- Splits each merchant's rows, ordered by dt, into 3 buckets and returns the bucket number.
       NTILE(3)
             OVER (PARTITION BY merchant_name ORDER BY dt)                  AS partition_num,
       -- 1234567: consecutive numbers. dt is added as a tie-breaker so that rows with the
       -- same turnover get a repeatable number instead of an arbitrary one.
       ROW_NUMBER() OVER (PARTITION BY merchant_name ORDER BY turnover, dt) AS row_num,
       -- 1233567: equal sort values share a number and the following number is skipped.
       RANK() OVER (PARTITION BY merchant_name ORDER BY turnover)           AS rank_num,
       -- 1233456: equal sort values share a number and no number is skipped.
       DENSE_RANK() OVER (PARTITION BY merchant_name ORDER BY turnover)     AS d_r_num,
       dt
FROM db_test.merchants_turnover
;

【Hive】SQL窗口函数实践_第3张图片

3、LAG(),LEAD(),FIRST_VALUE(),LAST_VALUE()

-- 3. LAG(), LEAD(), FIRST_VALUE() and LAST_VALUE() pick a value from another row of the window.
SELECT merchant_name,
       dt,
       turnover,
       -- Date of the merchant's previous row; NULL when there is none.
       LAG(dt, 1, NULL) OVER (PARTITION BY merchant_name ORDER BY dt)          AS last_dt,
       -- Date of the merchant's next row; '2020-01-01' when there is none.
       LEAD(dt, 1, '2020-01-01') OVER (PARTITION BY merchant_name ORDER BY dt) AS next_dt,
       -- Date of the merchant's earliest row.
       FIRST_VALUE(dt) OVER (PARTITION BY merchant_name ORDER BY dt)           AS first_dt,
       -- Date of the merchant's latest row. The frame has to be spelled out: with ORDER BY and
       -- no frame clause the window stops at the current row, so LAST_VALUE would simply
       -- return the current row's dt.
       LAST_VALUE(dt) OVER (PARTITION BY merchant_name ORDER BY dt
           ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)           AS latest_dt
FROM db_test.merchants_turnover
;
-- LAG(x, y, z) returns x from the row y rows before the current one, or z when that row is missing.
-- LEAD(x, y, z) does the same for the row y rows after the current one.

【Hive】SQL窗口函数实践_第4张图片

4、GROUPING SETS,WITH CUBE,WITH ROLLUP

-- 4. GROUPING SETS, WITH CUBE and WITH ROLLUP control which grouping combinations GROUP BY produces.
-- WITH CUBE emits every combination of the grouping keys: (), (mth), (mth, dt), (dt),
-- where mth stands for SUBSTRING(dt, 1, 7).
SELECT
    SUM(turnover),
    SUBSTRING(dt, 1, 7),
    dt,
    GROUPING__ID
FROM db_test.merchants_turnover
GROUP BY
    SUBSTRING(dt, 1, 7),
    dt
WITH CUBE
ORDER BY GROUPING__ID;

【Hive】SQL窗口函数实践_第5张图片

-- WITH ROLLUP builds the combinations by adding keys from left to right: (), (mth), (mth, dt).
-- Compared with WITH CUBE the (dt) combination is missing, because ROLLUP is driven by the
-- leftmost key: whenever mth is rolled up (NULL in the output), dt is rolled up as well,
-- so dt never appears on its own.
SELECT
    SUM(turnover),
    SUBSTRING(dt, 1, 7) AS mth,
    dt,
    GROUPING__ID
FROM db_test.merchants_turnover
GROUP BY
    SUBSTRING(dt, 1, 7),
    dt
WITH ROLLUP
ORDER BY GROUPING__ID;

【Hive】SQL窗口函数实践_第6张图片

-- GROUPING SETS lists the wanted combinations explicitly: (mth, dt), (mth), (dt).
SELECT
    SUM(turnover),
    SUBSTRING(dt, 1, 7) AS mth,
    dt,
    GROUPING__ID
FROM db_test.merchants_turnover
GROUP BY
    SUBSTRING(dt, 1, 7),
    dt
GROUPING SETS (
    (SUBSTRING(dt, 1, 7), dt),
    (SUBSTRING(dt, 1, 7)),
    (dt)
)
ORDER BY GROUPING__ID;

【Hive】SQL窗口函数实践_第7张图片

四、熄灯

假数据《merchants_turnover.txt》(按建表语句,字段之间应以不可见字符 \001 分隔;若复制后分隔符丢失,需要重新补上)

Jon Cole53378.772022-01-01
Rachel Davis8315.462022-01-01
Russell Reynolds32494.892022-01-01
April Griffin51568.672022-01-01
Crystal Landry73128.972022-01-01
Amanda Johnson97011.662022-01-01
Teresa James43795.962022-01-01
Javier Johnson88190.372022-01-01
Jeffrey Simpson64941.262022-01-01
Jon Cole10352.242022-01-02
Rachel Davis91306.422022-01-02
Russell Reynolds29295.832022-01-02
April Griffin15734.902022-01-02
Crystal Landry93258.332022-01-02
Amanda Johnson61599.422022-01-02
Teresa James8468.662022-01-02
Javier Johnson15524.492022-01-02
Jeffrey Simpson72736.712022-01-02
Jon Cole71824.462022-01-03
Rachel Davis44352.932022-01-03
Russell Reynolds81634.882022-01-03
April Griffin32915.072022-01-03
Crystal Landry40093.052022-01-03
Amanda Johnson12233.902022-01-03
Teresa James89980.672022-01-03
Javier Johnson8383.152022-01-03
Jeffrey Simpson62089.642022-01-03
Jon Cole71824.462022-01-04
Rachel Davis75260.852022-01-04
Russell Reynolds49006.272022-01-04
April Griffin36897.232022-01-04
Crystal Landry80926.792022-01-04
Amanda Johnson12233.902022-01-04
Teresa James86023.652022-01-04
Javier Johnson31650.912022-01-04
Jeffrey Simpson35506.942022-01-04
Jon Cole64822.892022-01-05
Rachel Davis20906.332022-01-05
Russell Reynolds71062.842022-01-05
April Griffin87299.632022-01-05
Crystal Landry45118.172022-01-05
Amanda Johnson8658.012022-01-05
Teresa James74095.722022-01-05
Javier Johnson47024.812022-01-05
Jeffrey Simpson91166.812022-01-05
Jon Cole65892.972022-01-06
Rachel Davis79864.772022-01-06
Russell Reynolds33983.202022-01-06
April Griffin40772.342022-01-06
Crystal Landry84212.812022-01-06
Amanda Johnson22623.642022-01-06
Teresa James52836.502022-01-06
Javier Johnson32401.392022-01-06
Jeffrey Simpson6908.022022-01-06
Jon Cole90876.252022-01-07
Rachel Davis63978.412022-01-07
Russell Reynolds84048.042022-01-07
April Griffin1551.872022-01-07
Crystal Landry40084.552022-01-07
Amanda Johnson34741.362022-01-07
Teresa James67296.762022-01-07
Javier Johnson37818.662022-01-07
Jeffrey Simpson33920.272022-01-07

你可能感兴趣的:(大数据,hive,sql,大数据,开窗函数,OVER)