在日常数据统计过程中,不可避免地会用到窗口函数。本篇文章将以PostgreSQL数据库为例,介绍一下窗口函数。
function_name ([expression [, expression ... ]]) [ FILTER ( WHERE filter_clause ) ] OVER window_name
function_name ([expression [, expression ... ]]) [ FILTER ( WHERE filter_clause ) ] OVER ( window_definition )
function_name ( * ) [ FILTER ( WHERE filter_clause ) ] OVER window_name
function_name ( * ) [ FILTER ( WHERE filter_clause ) ] OVER ( window_definition )
window_definition的语法如下:
[ existing_window_name ]
[ PARTITION BY expression [, ...] ]
[ ORDER BY expression [ ASC | DESC | USING operator ] [ NULLS { FIRST | LAST } ] [, ...] ]
[ frame_clause ]
可选的frame_clause是下列之一
{ RANGE | ROWS | GROUPS } frame_start [ frame_exclusion ]
{ RANGE | ROWS | GROUPS } BETWEEN frame_start AND frame_end [ frame_exclusion ]
简而言之
<窗口函数> over (partition by <分组的字段名称>
order by <排序的字段名称>)
rows的滚动写法:
窗口函数最大的特点就是不改变数据的行数。
窗口函数如下:
常见写法举例
sum(salary) OVER ()
sum(salary) over (partition by null )
sum(salary) OVER (ORDER BY salary)
cume_dist() over (partition by null order by salary)
avg(salary) OVER (PARTITION BY depname)
rank() OVER (PARTITION BY depname ORDER BY salary DESC)
-- When several window functions share the same window, declare it once in a
-- WINDOW clause and reference it by name instead of repeating the definition.
SELECT
    sum(salary) OVER dept_by_salary,
    avg(salary) OVER dept_by_salary
FROM empsalary
WINDOW dept_by_salary AS (PARTITION BY depname ORDER BY salary DESC);
为方便进行展示,在这里构建演示的数据集,数据颗粒度到月份。由于本文会主要介绍窗口函数的使用,因此本文在解决某一问题时,仅考虑使用窗口函数。
-- Demo table for the examples in this article: one row per product per month.
-- Dropped first so the script can be re-run from scratch.
DROP TABLE IF EXISTS fruit_sale;

CREATE TABLE fruit_sale (
    statistical_date date,                    -- month of the row (fixture rows use the 1st of the month)
    product          character varying(255),  -- fruit name
    year             character varying(5),    -- calendar year, stored as text
    qty              numeric(8, 0),           -- quantity sold in that month
    amount           numeric(8, 0)            -- sales amount in that month
);
-- Fixture rows: 18 months (2018-01 .. 2019-06) for each of the two products.
-- The column list is spelled out so the load does not depend on the physical
-- column order of fruit_sale. Rows are kept in their original order.
INSERT INTO fruit_sale (statistical_date, product, year, qty, amount)
VALUES
    ('2018-01-01', '西瓜', '2018', 1721, 253541),
    ('2018-03-01', '西瓜', '2018', 4496, 203471),
    ('2018-04-01', '西瓜', '2018', 7359, 206686),
    ('2018-05-01', '西瓜', '2018', 5678, 129644),
    ('2018-08-01', '西瓜', '2018', 9187, 220605),
    ('2018-09-01', '西瓜', '2018', 4029, 119187),
    ('2018-10-01', '西瓜', '2018', 3129, 137928),
    ('2018-11-01', '西瓜', '2018', 2633, 175737),
    ('2018-12-01', '西瓜', '2018', 8646, 267718),
    ('2019-01-01', '西瓜', '2019', 1223, 113053),
    ('2019-02-01', '西瓜', '2019', 9079, 200716),
    ('2019-03-01', '西瓜', '2019', 3437, 104221),
    ('2019-05-01', '西瓜', '2019', 8963, 122630),
    ('2019-06-01', '西瓜', '2019', 1991, 167150),
    ('2018-01-01', '苹果', '2018', 5559, 269419),
    ('2018-02-01', '苹果', '2018', 5832, 142631),
    ('2018-03-01', '苹果', '2018', 5276, 120441),
    ('2018-04-01', '苹果', '2018', 5590, 182167),
    ('2018-05-01', '苹果', '2018', 1392, 249027),
    ('2018-06-01', '苹果', '2018', 9694, 179832),
    ('2018-07-01', '苹果', '2018', 3852, 130764),
    ('2018-08-01', '苹果', '2018', 8625, 235426),
    ('2018-09-01', '苹果', '2018', 7249, 286565),
    ('2018-10-01', '苹果', '2018', 3490, 125275),
    ('2018-11-01', '苹果', '2018', 8614, 170263),
    ('2018-12-01', '苹果', '2018', 2140, 139439),
    ('2019-01-01', '苹果', '2019', 5558, 156995),
    ('2019-02-01', '苹果', '2019', 2246, 216573),
    ('2019-03-01', '苹果', '2019', 6545, 238608),
    ('2019-04-01', '苹果', '2019', 9992, 157696),
    ('2019-05-01', '苹果', '2019', 6319, 282352),
    ('2019-06-01', '苹果', '2019', 1274, 150122),
    ('2018-06-01', '西瓜', '2018', 7434, 206686),
    ('2018-02-01', '西瓜', '2018', 5530, 129644),
    ('2018-07-01', '西瓜', '2018', 4711, 129644),
    ('2019-04-01', '西瓜', '2019', 6524, 206686);
问题a.统计所有水果的销售量:
-- Grand total of qty over the whole table, repeated on every row.
-- The three statements are interchangeable: a constant partition key
-- (NULL or '') puts every row into a single partition, just like OVER ().

-- Variant 1: constant NULL partition key
SELECT
    fruit_sale.*,
    SUM(qty) OVER (PARTITION BY NULL) AS total_qty
FROM fruit_sale;

-- Variant 2: constant empty-string partition key
SELECT
    fruit_sale.*,
    SUM(qty) OVER (PARTITION BY '') AS total_qty
FROM fruit_sale;

-- Variant 3: empty window definition
SELECT
    fruit_sale.*,
    SUM(qty) OVER () AS total_qty
FROM fruit_sale;
问题b.按水果种类统计水果的销售量,并依据销量排序:
-- Total qty per product, attached to every row of that product.
-- The task also asks for the result to be sorted by sales volume, so the rows
-- are ordered by the per-product total (highest first); statistical_date keeps
-- the order within one product stable.
SELECT
    fruit_sale.*,
    SUM(qty) OVER (PARTITION BY product) AS category_total_qty
FROM fruit_sale
ORDER BY
    category_total_qty DESC,
    statistical_date;
问题a.求各种水果的历史销量最大值:
-- All-time highest monthly qty of each product, repeated on each of its rows.
SELECT
    fruit_sale.*,
    MAX(qty) OVER per_product AS category_total_qty_max
FROM fruit_sale
WINDOW per_product AS (PARTITION BY product);
问题b.求各种水果在各年度的最高销量:
-- Highest monthly qty of each product within each year.
SELECT
    fruit_sale.*,
    MAX(qty) OVER per_product_year AS category_year_total_qty_max
FROM fruit_sale
WINDOW per_product_year AS (PARTITION BY product, year);
问题a:求每个月各水果的销量占当年水果总销量比值
-- Share of each row's qty in the total qty of all fruit sold in the same year.
-- Step 1 (CTE): attach the yearly total to every row; this is the denominator.
--               The partition key is year only, so despite its name
--               product_year_qty covers every product of that year.
-- Step 2: divide the row's own qty by that total. The ratio gets an explicit
--         alias so it does not surface under the default column name "round".
WITH t AS (
    SELECT
        fruit_sale.*,
        SUM(qty) OVER (PARTITION BY year) AS product_year_qty
    FROM fruit_sale
)
SELECT
    t.*,
    ROUND(t.qty / t.product_year_qty, 4) AS qty_rate
FROM t;
结果部分截图如下:
问题b:求各水果年初至本月的销量占当年水果总销量的占比,即求累计占比
-- Cumulative share: qty accumulated from the first month of the year up to the
-- current month, divided by the full-year qty of the same product.
-- Note: this is a ratio of summed values, whereas cume_dist() is a ratio of
-- row counts.
WITH t AS (
    SELECT
        fruit_sale.*,
        -- running total inside (year, product), in date order
        SUM(qty) OVER (per_product_year ORDER BY statistical_date) AS product_mtd_qty,
        -- full total of the same (year, product) partition
        SUM(qty) OVER per_product_year AS product_year_qty
    FROM fruit_sale
    WINDOW per_product_year AS (PARTITION BY year, product)
)
SELECT
    t.*,
    ROUND(t.product_mtd_qty / t.product_year_qty, 4) AS accumulate_rate
FROM t;
问题:对各类水果的历史各年度销量排序,按照由高至低。
-- Rank every (product, year) group by its total qty, highest first.
-- The window function is evaluated after GROUP BY, which is why it can order
-- by the aggregate SUM(qty).
SELECT
    product,
    year,
    SUM(qty) AS total_product_qty_y,
    DENSE_RANK() OVER (ORDER BY SUM(qty) DESC) AS qty_rank
FROM fruit_sale
GROUP BY product, year;
问题:对各月份的西瓜销售额进行排名,按照由高至低。
-- Rank the watermelon rows by amount, highest first. Equal amounts share a
-- rank and DENSE_RANK leaves no gaps after ties.
-- The output column keeps its original name total_qty although it holds a rank.
SELECT
    fruit_sale.*,
    DENSE_RANK() OVER by_amount_desc AS total_qty
FROM fruit_sale
WHERE product = '西瓜'
WINDOW by_amount_desc AS (ORDER BY amount DESC);
问题:求各月份的西瓜销售额在各年度的排名情况,按照销售额由高至低。
-- Rank the watermelon rows by amount inside each year, highest first.
-- The output column keeps its original name total_qty although it holds a rank.
SELECT
    fruit_sale.*,
    DENSE_RANK() OVER yearly_by_amount_desc AS total_qty
FROM fruit_sale
WHERE product = '西瓜'
WINDOW yearly_by_amount_desc AS (PARTITION BY year ORDER BY amount DESC);
问题:求每年度西瓜销售额最高的3个月份。
-- The three best watermelon months of every year by amount.
-- A window function cannot be referenced in WHERE at the same query level, so
-- the rank is computed in a subquery and filtered outside.
-- DENSE_RANK gives tied amounts the same rank, so a year can return more than
-- three rows when amounts tie.
SELECT a.*
FROM (
    SELECT
        fruit_sale.*,
        DENSE_RANK() OVER (PARTITION BY year ORDER BY amount DESC) AS total_qty_rank
    FROM fruit_sale
    WHERE product = '西瓜'
) AS a
WHERE a.total_qty_rank <= 3;
问题a:求各水果在各个月份的历史累计金额
-- Running total of amount for each product over its whole history, in date
-- order. The frame is written out; it is the default frame that applies when
-- ORDER BY is present and no frame clause is given.
SELECT
    fruit_sale.*,
    SUM(amount) OVER (
        PARTITION BY product
        ORDER BY statistical_date
        RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS total_qty
FROM fruit_sale;
部分结果截图,如下:
问题b: 求各水果在各年度的月份销售额累计情况
-- Running total of amount for each product, restarting at every year.
SELECT
    fruit_sale.*,
    SUM(amount) OVER per_product_year_by_date AS total_qty
FROM fruit_sale
WINDOW per_product_year_by_date AS (
    PARTITION BY year, product
    ORDER BY statistical_date
);
部分结果截图如下:
问题c: 求水果在历史月份的销售额累计情况
-- Running total of amount across all fruit, accumulated in
-- (statistical_date, product) order. The constant partition key keeps every
-- row in one partition.
SELECT
    fruit_sale.*,
    SUM(amount) OVER all_rows_by_date AS total_qty
FROM fruit_sale
WINDOW all_rows_by_date AS (
    PARTITION BY NULL
    ORDER BY statistical_date, product
);
提前了解一些关于滚动的小知识:
UNBOUNDED PRECEDING:表示该分组的第一行
UNBOUNDED FOLLOWING:表示该分组的最后一行
CURRENT ROW:表示当前行。
n PRECEDING:表示从当前行往前数n数量的行
n FOLLOWING:表示从当前行往后数n数量的行
问题a:求各种类水果,连续三个月的滚动销售额(本行前两个月+本行所在月)
-- Rolling 3-month amount per product: the two preceding rows plus the current
-- row. A frame of N rows ending at the current row is written
-- "ROWS BETWEEN N-1 PRECEDING AND CURRENT ROW", hence 2 PRECEDING
-- (3 PRECEDING would cover four rows).
-- ROWS counts physical rows, so this matches calendar months only when no
-- month is missing for the product. At the start of a partition the frame
-- simply contains fewer rows.
SELECT
    fruit_sale.*,
    SUM(amount) OVER (
        PARTITION BY product
        ORDER BY statistical_date
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS accumulate_total_qty
FROM fruit_sale;
结果部分截图如下:
问题b:滚动计算各种类水果在3个月(前两月+本月)+下1个月的销售额
-- Rolling amount per product over four rows: the two preceding rows, the
-- current row and the next row. "Two previous months + this month" means
-- 2 PRECEDING; 3 PRECEDING would pull in a third previous month.
-- ROWS counts physical rows, so months must be contiguous per product for the
-- frame to line up with calendar months.
SELECT
    fruit_sale.*,
    SUM(amount) OVER (
        PARTITION BY product
        ORDER BY statistical_date
        ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING
    ) AS accumulate_total_qty
FROM fruit_sale;
部分结果截图如下:
问题c:滚动计算各水果从当前月至末尾月的销售额总和
-- Sum of amount from the current row through the last row of the product's
-- partition, in ascending date order. Reversing the sort direction would turn
-- this into "current row back to the first month".
SELECT
    fruit_sale.*,
    SUM(amount) OVER remaining_months AS accumulate_total_qty
FROM fruit_sale
WINDOW remaining_months AS (
    PARTITION BY product
    ORDER BY statistical_date ASC
    ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
);
注意:利用本方法在计算同环比时,月份与月份之间要连续,不能缺失
问题:分别求各水果的销量环比
-- Month-over-month growth of qty for each product.
-- With rows sorted by date descending, LEAD(qty, 1) returns the qty of the row
-- one position earlier in time. The result is the previous month only when
-- the product has no missing months. The earliest row of each product has no
-- earlier row, so lead_qty and the ratio are NULL there.
WITH fs AS (
    SELECT
        statistical_date,
        product,
        qty,
        LEAD(qty, 1) OVER (
            PARTITION BY product
            ORDER BY statistical_date DESC
        ) AS lead_qty
    FROM fruit_sale
)
SELECT
    fs.*,
    fs.qty / lead_qty - 1 AS huanbi
FROM fs;
问题:分别求各水果的销量同比
-- Year-over-year growth of qty for each product.
-- With rows sorted by date descending, LEAD(qty, 12) returns the qty twelve
-- rows earlier in time. That is the same month of the previous year only when
-- the product has no missing months. Rows with fewer than twelve earlier rows
-- get NULL.
WITH fs AS (
    SELECT
        statistical_date,
        product,
        qty,
        LEAD(qty, 12) OVER (
            PARTITION BY product
            ORDER BY statistical_date DESC
        ) AS lead_qty
    FROM fruit_sale
)
SELECT
    fs.*,
    fs.qty / lead_qty - 1 AS tongbi
FROM fs;
问题a:将各类水果按照月份分别分为两类,日期越早,序号越小
-- Split each product's rows into 2 buckets by date; earlier dates get the
-- smaller bucket number.
-- Even split: each product has 18 rows, so the buckets hold 9 + 9.
SELECT
    fruit_sale.*,
    NTILE(2) OVER per_product_by_date AS slice
FROM fruit_sale
WINDOW per_product_by_date AS (PARTITION BY product ORDER BY statistical_date);
部分结果截图如下:
问题b:将各类水果按照月份分别分为4类,日期越早,序号越大
-- Split each product's rows into 4 buckets by date descending; earlier dates
-- get the larger bucket number.
-- Uneven split: 18 rows cannot be divided by 4, so the buckets hold 5 + 5 + 4 + 4.
SELECT
    fruit_sale.*,
    NTILE(4) OVER per_product_by_date_desc AS slice
FROM fruit_sale
WINDOW per_product_by_date_desc AS (
    PARTITION BY product
    ORDER BY statistical_date DESC
);
问题a:求水果销量排名最高的前50%月份的总销量和平均销量
-- Total and average qty of the top 50% of rows by qty, across all fruit.
-- NTILE(2) over qty descending puts the higher half in bucket 1, so keeping
-- bucket 1 selects the top 50%. (NTILE(5) with ranking < 5 would keep the
-- top 80% instead.)
SELECT
    SUM(fs.qty),
    AVG(fs.qty)
FROM (
    SELECT
        fruit_sale.*,
        NTILE(2) OVER (ORDER BY qty DESC) AS ranking
    FROM fruit_sale
) AS fs
WHERE fs.ranking = 1;
结果截图如下:
问题b:求各类水果销量排名最高的前80%月份的总销量
-- Total and average qty of the top 80% of rows by qty, per product.
-- Approach: split each product's rows into 5 buckets by qty descending, then
-- keep buckets 1 through 4.
WITH fs AS (
    SELECT
        fruit_sale.*,
        NTILE(5) OVER (PARTITION BY product ORDER BY qty DESC) AS ranking
    FROM fruit_sale
)
SELECT
    fs.product,
    SUM(fs.qty),
    AVG(fs.qty)
FROM fs
WHERE fs.ranking < 5
GROUP BY fs.product;
主要有cume_dist和percent_rank两个函数,两个都是求计数的百分比。cume_dist包含当前行,但是percent_rank不包含当前行。
本次例子中以cume_dist作为例子。
问题a:求水果的销量低于或等于当前行销量的月份数量百分比
-- Fraction of all rows whose qty is less than or equal to the current row's
-- qty. CUME_DIST returns double precision, so it is cast to numeric before
-- rounding to 4 decimals.
SELECT
    fruit_sale.*,
    ROUND(CAST(CUME_DIST() OVER (ORDER BY qty) AS numeric), 4) AS qty_less_rate
FROM fruit_sale;
问题b:求各水果的销量低于或等于当前行销量的月份数量百分比
-- Same measure computed per product: fraction of the product's rows whose qty
-- is less than or equal to the current row's qty, rounded to 4 decimals.
SELECT
    fruit_sale.*,
    ROUND(
        CAST(CUME_DIST() OVER (PARTITION BY product ORDER BY qty) AS numeric),
        4
    ) AS qty_less_rate
FROM fruit_sale;
问题: 求各水果上一个月的销量
-- qty of the previous row (previous month in the data) for each product.
-- The first row of each product has no predecessor and gets NULL.

-- Option 1 (LAG, recommended): look one row back in ascending date order.
SELECT
    fruit_sale.*,
    LAG(qty, 1) OVER per_product_asc AS last_month_qty
FROM fruit_sale
WINDOW per_product_asc AS (PARTITION BY product ORDER BY statistical_date);

-- Option 2 (LEAD on reversed order): look one row ahead in descending date order.
SELECT
    fruit_sale.*,
    LEAD(qty, 1) OVER per_product_desc AS last_month_qty
FROM fruit_sale
WINDOW per_product_desc AS (PARTITION BY product ORDER BY statistical_date DESC);
虽然两种方法展现结果的形式不太一样,但数据是一致的,lag截图如下:
问题:求各水果次月的销量
-- qty of the following row (next month in the data) for each product.
-- The last row of each product has no successor and gets NULL.

-- Option 1 (LEAD, recommended): look one row ahead in ascending date order.
SELECT
    fruit_sale.*,
    LEAD(qty, 1) OVER per_product_asc AS next_month_qty
FROM fruit_sale
WINDOW per_product_asc AS (PARTITION BY product ORDER BY statistical_date);

-- Option 2 (LAG on reversed order): look one row back in descending date order.
SELECT
    fruit_sale.*,
    LAG(qty, 1) OVER per_product_desc AS next_month_qty
FROM fruit_sale
WINDOW per_product_desc AS (PARTITION BY product ORDER BY statistical_date DESC);
虽然两种方法展现结果的形式不太一样,但数据是一致的,lead截图如下:
该方法可以指定数据生效的行范围。first_value函数是在组内排序后,截止到当前行,第一个值。
问题:求各水果在首月的销量。
-- qty of each product's earliest month, repeated on every row of the product.
-- The default frame starts at the first row of the partition, so FIRST_VALUE
-- needs no explicit frame clause here.
SELECT
    fruit_sale.*,
    FIRST_VALUE(qty) OVER per_product_by_date AS first_month_sale
FROM fruit_sale
WINDOW per_product_by_date AS (PARTITION BY product ORDER BY statistical_date);
该函数可以指定数据生效的行范围。last_value函数是在组内排序后,截止到当前行的最后一个值;因此若要取整个分组的最后一个值,需要显式指定窗口范围到分组末尾。
问题:求各水果在最新一个月(尾月)的销量。
-- qty of each product's latest month, repeated on every row of the product.
-- With ORDER BY the default frame ends at the current row, so LAST_VALUE would
-- just return the current row's qty; the explicit frame extends it to the end
-- of the partition.
-- The column is named last_month_sale because it holds the last month's value,
-- not the first month's.
SELECT
    fruit_sale.*,
    LAST_VALUE(qty) OVER (
        PARTITION BY product
        ORDER BY statistical_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_month_sale
FROM fruit_sale;
以上就是窗口函数的一些运用案例,后面如果有更多有趣的统计方法,会进行补充。