作为一名开发人员也好,作为一个数据分析师也罢,或者其他使用数据库的职位,都要熟练掌握SQL语句。下面是一些比较困难的案例,敢来挑战一下么?
场景描述:按照统计粒度,取多条记录中最新日期且某列字段不为null的数据;若只有一条记录时取该条记录即可,无需判断该列字段是否为null。
演示数据如下:
-- For every department, return the row dated on the latest "month" whose
-- money is not NULL; when a department has no non-NULL money at all, fall
-- back to the row on its overall latest "month".
WITH test_table AS (
    -- Inline demo data: three equal-length arrays unnested side by side.
    -- "month" holds text in yyyy-mm-dd form, so MAX() below compares strings;
    -- that matches chronological order only because of this fixed format.
    SELECT
        UNNEST(ARRAY['财务', '行政', '销售', '财务', '行政', '行政', '客服']) AS "depart",
        UNNEST(ARRAY[10, 20, 30, 50, 40, NULL, NULL]) AS "money",
        UNNEST(ARRAY['2021-12-01', '2021-12-03', '2022-01-04', '2022-03-15', '2022-05-02', '2022-07-01', '2022-06-01']) AS "month"
),
latest_per_depart AS (
    -- One row per department carrying the date of the record to keep.
    SELECT
        depart,
        COALESCE(
            MAX(CASE WHEN money IS NOT NULL THEN "month" END),  -- latest date that has a value
            MAX("month")                                        -- fallback: latest date overall
        ) AS "month"
    FROM test_table
    GROUP BY depart
)
SELECT
    picked.depart,
    src.money,
    picked."month"
FROM latest_per_depart AS picked
-- Join back to the detail rows to fetch the money recorded on the chosen date.
LEFT JOIN test_table AS src
    ON src."month" = picked."month"
    AND src.depart = picked.depart;
代码思想解读:
场景描述:在进行数据聚合时,需要将某列字符串按照聚合的维度汇总到一行一列,并用特定符号分隔开。
-- Collapse the student names of each class into a single comma-separated
-- string, one output row per class.
WITH student AS (
    -- Inline demo data: three equal-length arrays unnested side by side.
    SELECT
        UNNEST(ARRAY['1班', '2班', '1班', '3班', '3班', '1班']) AS class_name,
        UNNEST(ARRAY['于雨信', '陈星津', '贾书雪', '叶向萍', '曹绮丽', '汪光熙']) AS student_name,
        UNNEST(ARRAY['男', '男', '女', '女', '女', '男']) AS sex
)
SELECT
    s.class_name,
    -- No ORDER BY inside the aggregate, so the order of names within
    -- name_list is whatever order the rows reach the aggregate in.
    STRING_AGG(s.student_name, ',') AS name_list
FROM student AS s
GROUP BY s.class_name
ORDER BY s.class_name;
场景描述:依据产品的活动信息表和销售订单明细表,将各产品的活动信息追加到记录里,当同一时间段内有多条活动时,以后面的活动为准。
活动信息表(activity):
构建数据表语句:
-- Build the activity (promotion) table used by the order-matching example.
-- start_dt / end_dt delimit the promotion period; rebate is the per-unit
-- rebate amount (the matching query multiplies it by the order quantity).
DROP TABLE IF EXISTS "activity";
CREATE TABLE "activity" (
    "id" int8,
    "name" varchar(255) COLLATE "pg_catalog"."default",
    "start_dt" date,
    "end_dt" date,
    "product" varchar(255) COLLATE "pg_catalog"."default",
    "participating_store" varchar(255) COLLATE "pg_catalog"."default",
    "rebate" numeric(13,2)
);

-- Explicit column list: the seed data no longer depends on the physical
-- column order of the table.
INSERT INTO "activity"
    ("id", "name", "start_dt", "end_dt", "product", "participating_store", "rebate")
VALUES
    (1, 'A促', '2022-01-01', '2022-12-31', 'A衣服', '甲门店', 10.00),
    (2, 'A促', '2022-02-01', '2022-06-01', 'B衣服', '乙门店', 20.00),
    (3, 'A促', '2022-01-01', '2022-12-31', 'A衣服', '丙门店', 10.00),
    (4, 'A促', '2022-03-01', '2022-12-31', 'A衣服', '甲门店', 20.00),
    (5, 's促', '2022-07-01', '2022-07-30', 'A衣服', '甲门店', 30.00),
    (6, 's促', '2022-07-01', '2022-09-30', 'B衣服', '乙门店', 50.00);
-- Build the order line table used by the order-matching example.
DROP TABLE IF EXISTS "order_detail";
CREATE TABLE "order_detail" (
    "order_id" int8,
    "store_name" varchar(255) COLLATE "pg_catalog"."default",
    "product" varchar(255) COLLATE "pg_catalog"."default",
    "order_date" date,
    "qty" numeric(8),
    "amount" numeric(10,2)
);

-- Explicit column list: the seed data no longer depends on the physical
-- column order of the table. Orders 5 and 7 share store, product and date,
-- which is the "several orders on one day" case the matching query handles.
INSERT INTO "order_detail"
    ("order_id", "store_name", "product", "order_date", "qty", "amount")
VALUES
    (1, '甲门店', 'A衣服', '2021-12-01', 1, 150.00),
    (3, '甲门店', 'A衣服', '2022-01-05', 3, 450.00),
    (5, '甲门店', 'A衣服', '2022-08-01', 3, 450.00),
    (6, '甲门店', 'C衣服', '2022-08-01', 1, 200.00),
    (7, '甲门店', 'A衣服', '2022-08-01', 1, 150.00),
    (2, '乙门店', 'B衣服', '2022-01-05', 2, 600.00),
    (4, '乙门店', 'B衣服', '2022-07-25', 1, 300.00);
用于实现的SQL语句,如下:
-- Attach to every order line the activity that was in force for its product
-- and store on the order date. When several activities cover the same date,
-- the one with the highest activity id wins. Orders without any matching
-- activity are kept, with NULL activity columns.
-- The helper columns order_day_num and recent_id are left in the output on
-- purpose; keep or drop them as needed.
WITH numbered_order AS (
    -- order_day_num tells apart several orders for the same product, store
    -- and date, so the per-order partition below does not rely on order_id.
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY product, store_name, order_date
            ORDER BY order_id DESC
        ) AS order_day_num
    FROM order_detail
),
ranked AS (
    SELECT
        d.*,
        a.id,
        a.name,
        a.start_dt,
        a.end_dt,
        a.rebate,
        a.rebate * d.qty AS order_rebate,
        -- 1 = the matching activity with the highest id for this order line.
        ROW_NUMBER() OVER (
            PARTITION BY d.product, d.store_name, d.order_date, d.order_day_num
            ORDER BY a.id DESC
        ) AS recent_id
    FROM numbered_order AS d
    LEFT JOIN activity AS a
        ON a.product = d.product
        AND a.participating_store = d.store_name
        AND d.order_date >= a.start_dt
        AND d.order_date <= a.end_dt
)
SELECT
    *
FROM ranked
WHERE recent_id = 1
-- The original sort listed order_date twice; product and order_id now break
-- ties so that same-day orders of one store come back in a stable order.
ORDER BY
    order_date,
    store_name,
    product,
    order_id;
代码思想介绍:
其他说明:
本SQL语句考虑了同一天多笔订单的情况,同时也考虑了活动的生效情况。本例中如果将订单明细表里的order_id作为partition by的信息,就可以不增加order_day_num标识;之所以仍然新增这个标识,是因为增加标识后,不仅可以应对没有order_id的情况,还可以更好地查看当天(product, store_name, order_date)信息重复的情况。
场景描述:依据历史设置的活动信息,将参加活动的门店和产品按时间生效的范围生成对应的活动日历。注意,同一时间段内只能有一个活动,若有多个活动时,按照后设置的为准,即id最大的活动。
构建需要的数据信息表:
-- Build the activity (promotion) table used by the activity-calendar example.
-- start_dt / end_dt delimit the promotion period of one product in one store.
DROP TABLE IF EXISTS "activity";
CREATE TABLE "activity" (
    "id" int8,
    "name" varchar(255) COLLATE "pg_catalog"."default",
    "start_dt" date,
    "end_dt" date,
    "product" varchar(255) COLLATE "pg_catalog"."default",
    "participating_store" varchar(255) COLLATE "pg_catalog"."default",
    "rebate" numeric(13,2)
);

-- Explicit column list: the seed data no longer depends on the physical
-- column order of the table.
INSERT INTO "activity"
    ("id", "name", "start_dt", "end_dt", "product", "participating_store", "rebate")
VALUES
    (1, 'A促', '2022-01-01', '2022-12-31', 'A衣服', '甲门店', 10.00),
    (2, 'A促', '2022-02-01', '2022-06-01', 'B衣服', '乙门店', 20.00),
    (3, 'A促', '2022-01-01', '2022-12-31', 'A衣服', '丙门店', 10.00),
    (4, 'A促', '2022-03-01', '2022-12-31', 'A衣服', '甲门店', 20.00),
    (5, 's促', '2022-07-01', '2022-09-30', 'A衣服', '甲门店', 30.00),
    (6, 's促', '2022-07-01', '2022-09-30', 'B衣服', '乙门店', 50.00);
-- Build the activity calendar: for every product and store, cut the time
-- line into consecutive, non-overlapping date ranges and report for each
-- range the activity in force. When several activities cover a range, the
-- one with the highest id wins. Ranges covered by no activity are omitted.
--
-- Activity periods are treated as inclusive on both ends. To keep adjacent
-- ranges from sharing a day, the cut points are each activity's start_dt and
-- the day AFTER each end_dt; a range then runs from one cut point up to the
-- day before the next one. This also keeps one-day activities
-- (start_dt = end_dt), which would otherwise collapse into a single cut point.
WITH boundary AS (
    -- Distinct cut points per product and store (UNION removes duplicates).
    SELECT
        product,
        participating_store,
        start_dt AS boundary_dt
    FROM activity
    UNION
    SELECT
        product,
        participating_store,
        end_dt + 1 AS boundary_dt
    FROM activity
),
segment AS (
    -- Turn consecutive cut points into inclusive [start_dt, end_dt] ranges.
    SELECT
        product,
        participating_store,
        boundary_dt AS start_dt,
        LEAD(boundary_dt) OVER (
            PARTITION BY product, participating_store
            ORDER BY boundary_dt
        ) - 1 AS end_dt
    FROM boundary
),
ranked AS (
    SELECT
        a.id,                        -- id of the activity in force
        s.product,
        s.participating_store,
        s.start_dt,
        s.end_dt,
        a.rebate,
        a.start_dt AS origin_start,  -- original start of the activity in force
        a.end_dt AS origin_end,      -- original end of the activity in force
        ROW_NUMBER() OVER (
            PARTITION BY s.product, s.participating_store, s.start_dt, s.end_dt
            ORDER BY a.id DESC
        ) AS recent_id
    FROM segment AS s
    -- Inner join: ranges that no activity fully covers are dropped.
    INNER JOIN activity AS a
        ON a.product = s.product
        AND a.participating_store = s.participating_store
        AND a.start_dt <= s.start_dt
        AND a.end_dt >= s.end_dt
    -- The last cut point of each group has no successor, hence no range.
    WHERE s.end_dt IS NOT NULL
)
SELECT
    *
FROM ranked
WHERE recent_id = 1
ORDER BY
    product,
    participating_store,
    start_dt;
代码思想注释:
在进行数据统计时,往往需要使用到日期的维表数据,因此构建一个维表的重要性不言而喻。
日期维表部分效果图展示:
依据本方法构建的维表,基本可以满足95%的日期维度需要,实现代码如下:
-- Date dimension: one row per calendar day from 2000-01-01 to 2100-12-31
-- with the usual day / week / month / quarter / year attributes.
--
-- All date arithmetic is done on "timestamp without time zone" values.
-- date_trunc() and generate_series() applied to a plain date resolve to the
-- timestamptz variants, whose month-length arithmetic depends on the session
-- time zone: in a zone with daylight saving time the spring-forward month is
-- one hour short of a whole number of days, which made days_in_month and
-- is_month_lastday one day off for that month.
WITH dim_date AS (
    SELECT
        day_ts::date AS date_name
    FROM generate_series(
        TIMESTAMP '2000-01-01',
        TIMESTAMP '2100-12-31',
        INTERVAL '1 day'
    ) AS gs (day_ts)
),
dim_date_ext AS (
    -- month_end: last calendar day of the month that date_name falls in.
    SELECT
        date_name,
        (date_trunc('month', date_name::timestamp) + INTERVAL '1 month' - INTERVAL '1 day')::date AS month_end
    FROM dim_date
)
SELECT
    to_char(date_name, 'yyyymmdd')::int AS id,
    to_char(date_name, 'yyyy-mm-dd') AS date_name,
    to_char(date_name, 'yyyy年mm月dd日') AS date_name_cn,
    date_name AS calendar_date,
    EXTRACT(DAY FROM date_name) AS current_day,
    to_char(date_name, 'yyyymm')::int AS month_id,
    to_char(date_name, 'yyyy-mm-01') AS month_name,
    to_char(date_name, 'yyyy年mm月') AS month_name_cn,
    EXTRACT(MONTH FROM date_name) AS month_number,
    concat(EXTRACT(MONTH FROM date_name), '月') AS month_name_short_cn,
    -- Day number of the month's last day = number of days in the month.
    EXTRACT(DAY FROM month_end) AS days_in_month,
    -- NOTE(review): the alias is misspelled ("firtst"); it is kept as is
    -- because consumers may already reference the column by this name.
    to_char(date_name, 'yyyymm01') AS firtst_of_month,
    to_char(date_name - INTERVAL '1 month', 'yyyymm')::int AS last_month_id,
    to_char(month_end, 'yyyymmdd')::int AS month_end_id,
    to_char(month_end, 'yyyy-mm-dd') AS month_end_date,
    to_char(date_name, 'yyyyq')::int AS quarter_id,
    to_char(date_name, 'yyyy-0q') AS quarter_name,
    to_char(date_name, 'yyyy年q季度') AS quarter_name_cn,
    to_char(date_name, 'q') AS quarter_name_short,
    to_char(date_name, 'q季度') AS quarter_name_short_cn,
    EXTRACT(YEAR FROM date_name) AS year_id,
    to_char(date_name, 'yyyy') AS year_name,
    to_char(date_name, 'yyyy年') AS year_name_cn,
    -- ISO day of week, Monday = 1 (pattern 'D' would give Sunday = 1 instead).
    to_char(date_name, 'ID') AS day_of_week_cn,
    to_char(date_name, 'DD') AS day_of_month,    -- day of the month
    to_char(date_name, 'DDD') AS day_of_year,    -- day of the year
    to_char(date_name, 'W') AS week_of_month,    -- week of the month, counted from the 1st
    -- ISO 8601 week number, 01-53; week 1 is the week holding the year's
    -- first Thursday (pattern 'WW' would count from January 1st instead).
    to_char(date_name, 'IW') AS week_of_year_cn,
    EXTRACT(WEEK FROM date_name) AS week_of_year_cn_num,
    -- The ISO week number must be paired with the ISO year (IYYY), not the
    -- calendar year: 2024-12-30 lies in ISO week 01 of 2025, and pairing it
    -- with 'yyyy' would label it as week 01 of 2024.
    to_char(date_name, 'IYYY年IW周') AS year_week_name,
    to_char(date_trunc('week', date_name::timestamp), 'yyyy-mm-dd') AS week_begin,
    to_char(date_trunc('week', date_name::timestamp) + INTERVAL '6 days', 'yyyy-mm-dd') AS week_end,
    CASE
        WHEN EXTRACT(MONTH FROM date_name) > 6 THEN concat(to_char(date_name, 'yyyy年'), '下半年')
        ELSE concat(to_char(date_name, 'yyyy年'), '上半年')
    END AS half_year,
    -- Monday to Friday count as working days; public holidays are not considered.
    CASE
        WHEN EXTRACT(ISODOW FROM date_name) < 6 THEN '工作日'
        ELSE '非工作日'
    END AS workday_flag,
    CASE
        WHEN date_name = month_end THEN '是'
        ELSE '否'
    END AS is_month_lastday,
    CASE EXTRACT(ISODOW FROM date_name)
        WHEN 1 THEN '星期一'
        WHEN 2 THEN '星期二'
        WHEN 3 THEN '星期三'
        WHEN 4 THEN '星期四'
        WHEN 5 THEN '星期五'
        WHEN 6 THEN '星期六'
        ELSE '星期日'
    END AS week_name
FROM dim_date_ext;
场景描述:在进行统计时,需要将某一维度字段的取值转为列进行展示(行转列)。
演示数据如下:
期望得到的效果如下:
实现代码如下:
-- Method 1: pivot with conditional SUM. Each student contributes 1 to the
-- column matching their sex and 0 to the other one.
WITH student AS (
    -- Inline demo data: three equal-length arrays unnested side by side.
    SELECT
        UNNEST(ARRAY['1班', '2班', '1班', '3班', '3班', '1班']) AS class_name,
        UNNEST(ARRAY['于雨信', '陈星津', '贾书雪', '叶向萍', '曹绮丽', '汪光熙']) AS student_name,
        UNNEST(ARRAY['男', '男', '女', '女', '女', '男']) AS sex
)
SELECT
    s.class_name,
    SUM(CASE s.sex WHEN '男' THEN 1 ELSE 0 END) AS 男,
    SUM(CASE s.sex WHEN '女' THEN 1 ELSE 0 END) AS 女
FROM student AS s
GROUP BY s.class_name;
方法2:
-- Method 2: pivot with filtered aggregates. Each column counts only the
-- rows of the class whose sex matches the column.
WITH student AS (
    -- Inline demo data: three equal-length arrays unnested side by side.
    SELECT
        UNNEST(ARRAY['1班', '2班', '1班', '3班', '3班', '1班']) AS class_name,
        UNNEST(ARRAY['于雨信', '陈星津', '贾书雪', '叶向萍', '曹绮丽', '汪光熙']) AS student_name,
        UNNEST(ARRAY['男', '男', '女', '女', '女', '男']) AS sex
)
SELECT
    s.class_name,
    -- A row passing the filter necessarily has a non-NULL sex, so COUNT(*)
    -- yields the same number as counting the sex column.
    COUNT(*) FILTER (WHERE s.sex = '男') AS 男,
    COUNT(*) FILTER (WHERE s.sex = '女') AS 女
FROM student AS s
GROUP BY s.class_name;
场景描述:
根据产品在多个平台的销售情况,对各产品销售情况进行排序,优先对销售额进行排序,若产品销售额相同,则对销量进行排序。
演示数据(product_plat_order)如下:
实现效果如下:
实现代码如下:
-- Rank the rows of each product: highest sales amount first; for equal
-- amounts, highest quantity first.
-- NULLS LAST replaces the former "treat NULL amount as -1" sentinel, which
-- ranked a NULL amount above any amount below -1 and tied it with a real -1.
-- It is applied to qty as well: with a plain DESC sort PostgreSQL would put
-- a NULL quantity ahead of every known quantity.
SELECT
    *,
    ROW_NUMBER() OVER (
        PARTITION BY product
        ORDER BY amount DESC NULLS LAST, qty DESC NULLS LAST
    ) AS rank_num
FROM product_plat_order;