-- Demo fixture: rebuild the merchant turnover table from scratch and load the
-- sample rows from a local text file.
DROP TABLE IF EXISTS db_test.merchants_turnover;

-- Columns, in file order: merchant (shop) name, turnover amount, and the date
-- kept as a yyyy-MM-dd string. The table comment reads "merchant turnover".
-- Fields in the backing text file are separated by the \001 control character.
CREATE TABLE IF NOT EXISTS db_test.merchants_turnover (
    merchant_name STRING         COMMENT '店铺名称',
    turnover      DECIMAL(10, 2) COMMENT '营业额',
    dt            STRING         COMMENT '日期yyyy-MM-dd'
)
COMMENT '商户营业额'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\001'
STORED AS TEXTFILE;

-- OVERWRITE replaces whatever the table currently holds with the file contents.
LOAD DATA LOCAL INPATH '/home/hadoop/merchants_turnover.txt'
OVERWRITE INTO TABLE db_test.merchants_turnover;
-- 创建上述测试表,并加载测试数据(假数据使用 Python 的 Faker 库生成,数据附在文末)。
-- 1. Aggregate functions such as SUM(), MIN(), MAX() and AVG() can be paired
--    with OVER() so that they aggregate inside a partition.
--
-- OVER() limits the aggregate to a window of rows. The frame is written as
-- ROWS BETWEEN <start> AND <end>, and rows are ordered here by date. Bounds:
--   n PRECEDING          n rows before the current row
--   n FOLLOWING          n rows after the current row
--   CURRENT ROW          the current row
--   UNBOUNDED PRECEDING  no limit going backwards
-- The three windows used below are: first row of the month up to the current
-- row, three rows back up to the current row, and the first row up to the row
-- after the current one.
-- The main difference from GROUP BY: after GROUP BY every other selected
-- column must also be computed per group, whereas OVER() lets one expression
-- be partitioned on its own.
SELECT
    merchant_name,
    turnover,
    -- Month-to-date running total per merchant
    SUM(turnover) OVER (
        PARTITION BY merchant_name, DATE_FORMAT(dt, 'yyyy-MM')
        ORDER BY dt
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS accu_turnover,
    -- Current row plus the three rows before it (four rows in total)
    SUM(turnover) OVER (
        PARTITION BY merchant_name
        ORDER BY dt
        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
    ) AS last_3_turnover,
    -- Everything from the first row through the row after the current one
    SUM(turnover) OVER (
        PARTITION BY merchant_name
        ORDER BY dt
        ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING
    ) AS accu_next_turnover,
    dt
FROM db_test.merchants_turnover;
-- 2. NTILE(), ROW_NUMBER(), RANK() and DENSE_RANK() add a sequence-number
--    column to the result set.
SELECT
    merchant_name,
    turnover,
    -- Split each merchant's rows, ordered by date, into 3 buckets and return
    -- the bucket number of the row
    NTILE(3) OVER (
        PARTITION BY merchant_name
        ORDER BY dt
    ) AS partition_num,
    -- 1,2,3,4,5,6,7: consecutive numbers with no repeats. dt is added as a
    -- tie-breaker so rows with equal turnover are numbered the same way on
    -- every run
    ROW_NUMBER() OVER (
        PARTITION BY merchant_name
        ORDER BY turnover, dt
    ) AS row_num,
    -- 1,2,3,3,5,6,7: equal turnover shares a number and the next number is
    -- skipped
    RANK() OVER (
        PARTITION BY merchant_name
        ORDER BY turnover
    ) AS rank_num,
    -- 1,2,3,3,4,5,6: equal turnover shares a number and no number is skipped
    DENSE_RANK() OVER (
        PARTITION BY merchant_name
        ORDER BY turnover
    ) AS d_r_num,
    dt
FROM db_test.merchants_turnover
;
-- 3. LAG(), LEAD(), FIRST_VALUE() and LAST_VALUE() return the value found at a
--    specific position relative to the current row or the window.
--
-- LAG(x, y, z) returns x from the row y rows before the current one, or z when
-- there is no such row. LEAD(x, y, z) does the same for the row y rows after.
SELECT
    merchant_name,
    dt,
    turnover,
    -- Date of the previous record for this merchant, NULL when there is none
    LAG(dt, 1, NULL) OVER (
        PARTITION BY merchant_name
        ORDER BY dt
    ) AS last_dt,
    -- Date of the next record for this merchant, '2020-01-01' when there is
    -- none
    LEAD(dt, 1, '2020-01-01') OVER (
        PARTITION BY merchant_name
        ORDER BY dt
    ) AS next_dt,
    -- Earliest date for this merchant
    FIRST_VALUE(dt) OVER (
        PARTITION BY merchant_name
        ORDER BY dt
    ) AS first_dt,
    -- Latest date for this merchant. The explicit frame is required: with
    -- ORDER BY and no frame, the default window stops at the current row, so
    -- LAST_VALUE would just return the current row's own date
    LAST_VALUE(dt) OVER (
        PARTITION BY merchant_name
        ORDER BY dt
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS latest_dt
FROM db_test.merchants_turnover
;
-- 4. GROUPING SETS, WITH CUBE and WITH ROLLUP control which grouping
--    combinations a GROUP BY produces.
--
-- WITH CUBE yields every combination of the grouping expressions:
-- (), (mth), (mth, dt), (dt), where mth is the yyyy-MM prefix of dt.
SELECT
    SUM(turnover),
    SUBSTRING(dt, 1, 7),
    dt,
    GROUPING__ID
FROM db_test.merchants_turnover
GROUP BY
    SUBSTRING(dt, 1, 7),
    dt
WITH CUBE
ORDER BY GROUPING__ID;
-- WITH ROLLUP yields progressively finer combinations: (), (mth), (mth, dt).
-- Compared with WITH CUBE the (dt) combination is missing, because ROLLUP is
-- driven by the left-most dimension: once mth on the left is rolled up (NULL),
-- dt on the right must be rolled up as well.
SELECT
    SUM(turnover),
    SUBSTRING(dt, 1, 7) AS mth,
    dt,
    GROUPING__ID
FROM db_test.merchants_turnover
GROUP BY
    SUBSTRING(dt, 1, 7),
    dt
WITH ROLLUP
ORDER BY GROUPING__ID;
-- GROUPING SETS lists the wanted combinations by hand: (mth, dt), (mth), (dt).
SELECT
    SUM(turnover),
    SUBSTRING(dt, 1, 7) AS mth,
    dt,
    GROUPING__ID
FROM db_test.merchants_turnover
GROUP BY
    SUBSTRING(dt, 1, 7),
    dt
GROUPING SETS (
    (SUBSTRING(dt, 1, 7), dt),
    (SUBSTRING(dt, 1, 7)),
    (dt)
)
ORDER BY GROUPING__ID;
Jon Cole53378.772022-01-01
Rachel Davis8315.462022-01-01
Russell Reynolds32494.892022-01-01
April Griffin51568.672022-01-01
Crystal Landry73128.972022-01-01
Amanda Johnson97011.662022-01-01
Teresa James43795.962022-01-01
Javier Johnson88190.372022-01-01
Jeffrey Simpson64941.262022-01-01
Jon Cole10352.242022-01-02
Rachel Davis91306.422022-01-02
Russell Reynolds29295.832022-01-02
April Griffin15734.902022-01-02
Crystal Landry93258.332022-01-02
Amanda Johnson61599.422022-01-02
Teresa James8468.662022-01-02
Javier Johnson15524.492022-01-02
Jeffrey Simpson72736.712022-01-02
Jon Cole71824.462022-01-03
Rachel Davis44352.932022-01-03
Russell Reynolds81634.882022-01-03
April Griffin32915.072022-01-03
Crystal Landry40093.052022-01-03
Amanda Johnson12233.902022-01-03
Teresa James89980.672022-01-03
Javier Johnson8383.152022-01-03
Jeffrey Simpson62089.642022-01-03
Jon Cole71824.462022-01-04
Rachel Davis75260.852022-01-04
Russell Reynolds49006.272022-01-04
April Griffin36897.232022-01-04
Crystal Landry80926.792022-01-04
Amanda Johnson12233.902022-01-04
Teresa James86023.652022-01-04
Javier Johnson31650.912022-01-04
Jeffrey Simpson35506.942022-01-04
Jon Cole64822.892022-01-05
Rachel Davis20906.332022-01-05
Russell Reynolds71062.842022-01-05
April Griffin87299.632022-01-05
Crystal Landry45118.172022-01-05
Amanda Johnson8658.012022-01-05
Teresa James74095.722022-01-05
Javier Johnson47024.812022-01-05
Jeffrey Simpson91166.812022-01-05
Jon Cole65892.972022-01-06
Rachel Davis79864.772022-01-06
Russell Reynolds33983.202022-01-06
April Griffin40772.342022-01-06
Crystal Landry84212.812022-01-06
Amanda Johnson22623.642022-01-06
Teresa James52836.502022-01-06
Javier Johnson32401.392022-01-06
Jeffrey Simpson6908.022022-01-06
Jon Cole90876.252022-01-07
Rachel Davis63978.412022-01-07
Russell Reynolds84048.042022-01-07
April Griffin1551.872022-01-07
Crystal Landry40084.552022-01-07
Amanda Johnson34741.362022-01-07
Teresa James67296.762022-01-07
Javier Johnson37818.662022-01-07
Jeffrey Simpson33920.272022-01-07