本文目录:
一、行列转换
二、排名中取他值
三、累计求值
四、窗口大小控制
五、产生连续数值
六、数据扩充与收缩
七、合并与拆分
八、模拟循环操作
九、不使用distinct或group by去重
十、容器--反转内容
十一、多容器--成对提取数据
十二、多容器--转多行
十三、抽象分组--断点排序
十四、业务逻辑的分类与抽象--时效
十五、时间序列--进度及剩余
十六、时间序列--构造日期
十七、时间序列--构造累积日期
十八、时间序列--构造连续日期
十九、时间序列--取多个字段最新的值
二十、时间序列--补全数据
二十一、时间序列--取最新完成状态的前一个状态
二十二、非等值连接--范围匹配
二十三、非等值连接--最近匹配
二十四、N指标--累计去重
十四、业务逻辑的分类与抽象--时效
日期表:d_date
表字段及内容:
date_id is_work
2017-04-13 1
2017-04-14 1
2017-04-15 0
2017-04-16 0
2017-04-17 1
工作日:周一至周五09:30-18:30
客户申请表:t14
表字段及内容:
a b c
1 申请 2017-04-14 18:03:00
1 通过 2017-04-17 09:43:00
2 申请 2017-04-13 17:02:00
2 通过 2017-04-15 09:42:00
问题一:计算上表中从申请到通过占用的工作时长
输出结果如下所示:
a d
1 0.67h
2 10.47h
参考答案:
-- Working hours elapsed between each customer's application ('申请') and
-- approval ('通过') in t14. Only the 09:30-18:30 window of days that d_date
-- flags with is_work = 1 is counted; the result is in hours, rounded to 2 places.
with request_span as (
    -- One row per customer: application timestamp and approval timestamp.
    select
        a,
        max(case when b = '申请' then c end) as apply_time,
        max(case when b = '通过' then c end) as pass_time
    from t14
    group by a
),
request_day as (
    -- One row per calendar day from the application date through the approval
    -- date. space(n) split on ' ' yields n + 1 elements, so pos runs 0..n.
    select
        s.a,
        s.apply_time,
        s.pass_time,
        date_add(substr(s.apply_time, 1, 10), d.pos) as dates
    from request_span s
    lateral view posexplode(
        split(space(datediff(substr(s.pass_time, 1, 10), substr(s.apply_time, 1, 10))), ' ')
    ) d as pos, val
),
worked as (
    -- Seconds worked on each day = overlap of [apply_time, pass_time] with that
    -- day's [09:30, 18:30] window. Clamping both ends covers the first day, the
    -- last day, full middle days and the same-day case with a single expression,
    -- and never goes negative when a timestamp lies outside working hours.
    select
        r.a,
        case
            when w.is_work = 1 then
                greatest(
                    least(
                        unix_timestamp(r.pass_time, 'yyyy-MM-dd HH:mm:ss'),
                        unix_timestamp(concat(r.dates, ' 18:30:00'), 'yyyy-MM-dd HH:mm:ss')
                    )
                    - greatest(
                        unix_timestamp(r.apply_time, 'yyyy-MM-dd HH:mm:ss'),
                        unix_timestamp(concat(r.dates, ' 09:30:00'), 'yyyy-MM-dd HH:mm:ss')
                    ),
                    cast(0 as bigint)
                )
            else 0
        end as diff
    from request_day r
    inner join d_date w
        on r.dates = w.date_id
)
select
    a,
    round(sum(diff) / 3600, 2) as d
from worked
group by a;
十五、时间序列--进度及剩余
表名:t15
表字段及内容:
date_id is_work
2017-07-30 0
2017-07-31 1
2017-08-01 1
2017-08-02 1
2017-08-03 1
2017-08-04 1
2017-08-05 0
2017-08-06 0
2017-08-07 1
问题一:求每天的累计周工作日,剩余周工作日
输出结果如下所示:
date_id week_to_work week_left_work
2017-07-31 1 4
2017-08-01 2 3
2017-08-02 3 2
2017-08-03 4 1
2017-08-04 5 0
2017-08-05 5 0
2017-08-06 5 0
参考答案:
此处给出两种解法,其一:
-- Working days elapsed and remaining in the week, derived purely from the
-- weekday number (1 = Monday .. 7 = Sunday). Assumes a fixed Monday-Friday
-- working week; the is_work flag is not consulted.
select
    date_id,
    least(day_of_week, 5)     as week_to_work,   -- Saturday/Sunday stay at 5
    5 - least(day_of_week, 5) as week_left_work
from (
    select
        date_id,
        cast(date_format(date_id, 'u') as int) as day_of_week
    from t15
) d;
其二:
-- Working days elapsed and remaining in the week, counted from the is_work flag.
select
    date_id,
    week_to_work,
    week_sum_work - week_to_work as week_left_work
from (
    select
        date_id,
        -- running count of working days from Monday up to the current day
        sum(is_work) over (partition by week_start order by date_id) as week_to_work,
        -- total working days in the whole week
        sum(is_work) over (partition by week_start) as week_sum_work
    from (
        select
            date_id,
            is_work,
            -- Monday of the week containing date_id. Grouping by this date keeps a
            -- week that straddles New Year in one partition, which a
            -- (year, weekofyear) key does not: e.g. 2019-12-30 is ISO week 1 and
            -- would share a partition with the first week of January 2019.
            date_add(next_day(date_id, 'MO'), -7) as week_start
        from t15
    ) ta
) tb
order by date_id;
十六、时间序列--构造日期
问题一:直接使用SQL实现一张日期维度表,包含以下字段:
date string 日期
d_week string 年内第几周
weeks string 周几
w_start string 周开始日
w_end string 周结束日
d_month string 第几月
m_start string 月开始日
m_end string 月结束日
d_quarter int 第几季
q_start string 季开始日
q_end string 季结束日
d_year int 年份
y_start string 年开始日
y_end string 年结束日
参考答案:
-- Recreate the date dimension table from scratch; one row per calendar day.
DROP TABLE IF EXISTS dim_date;

CREATE TABLE IF NOT EXISTS dim_date (
    `date`    STRING COMMENT '日期',
    d_week    STRING COMMENT '年内第几周',
    weeks     STRING COMMENT '周几',
    w_start   STRING COMMENT '周开始日',
    w_end     STRING COMMENT '周结束日',
    d_month   STRING COMMENT '第几月',
    m_start   STRING COMMENT '月开始日',
    m_end     STRING COMMENT '月结束日',
    d_quarter INT    COMMENT '第几季',
    q_start   STRING COMMENT '季开始日',
    q_end     STRING COMMENT '季结束日',
    d_year    INT    COMMENT '年份',
    y_start   STRING COMMENT '年开始日',
    y_end     STRING COMMENT '年结束日'
);
-- Populate dim_date with one row per calendar day from 2021-04-01 through
-- 2022-03-31, both inclusive.
-- A calendar month always runs from the 1st to the last day of that month,
-- whether the month has 28, 29, 30 or 31 days.
insert overwrite table dim_date
select
    `date`,
    d_week,                                                       -- week of the year
    case weekid
        when 0 then '周日'
        when 1 then '周一'
        when 2 then '周二'
        when 3 then '周三'
        when 4 then '周四'
        when 5 then '周五'
        when 6 then '周六'
    end as weeks,                                                 -- day-of-week label
    date_add(next_day(`date`, 'MO'), -7) as w_start,              -- Monday of this week
    date_add(next_day(`date`, 'MO'), -1) as w_end,                -- Sunday of this week
    concat('第', monthid, '月') as d_month,
    m_start,
    m_end,
    quarterid as d_quarter,
    -- Quarter bounds are derived with month arithmetic from Jan 1st, so the
    -- fourth quarter never has to spell out a non-existent month 13.
    add_months(y_start, (quarterid - 1) * 3) as q_start,          -- first day of the quarter
    date_sub(add_months(y_start, quarterid * 3), 1) as q_end,     -- last day of the quarter
    d_year,
    y_start,
    y_end
from (
    select
        `date`,
        pmod(datediff(`date`, '2012-01-01'), 7) as weekid,        -- 2012-01-01 was a Sunday, so 0 = Sunday
        month(`date`) as monthid,
        case
            when month(`date`) <= 3 then 1
            when month(`date`) <= 6 then 2
            when month(`date`) <= 9 then 3
            else 4
        end as quarterid,
        year(`date`) as d_year,
        trunc(`date`, 'YYYY') as y_start,                         -- first day of the year
        date_sub(trunc(add_months(`date`, 12), 'YYYY'), 1) as y_end,  -- last day of the year
        trunc(`date`, 'MM') as m_start,                           -- first day of the month
        last_day(`date`) as m_end,                                -- last day of the month
        weekofyear(`date`) as d_week
    from (
        -- '2021-04-01' is the start date and '2022-03-31' the end date.
        -- space(n) split on ' ' yields n + 1 elements, so pos runs 0..n and
        -- the end date itself is included.
        select date_add('2021-04-01', pos) as `date`
        from (
            select posexplode(
                split(space(datediff('2022-03-31', '2021-04-01')), ' ')
            ) as (pos, val)
        ) day_offsets
    ) calendar
) enriched;
十七、时间序列--构造累积日期
表名:t17
表字段及内容:
date_id
2017-08-01
2017-08-02
2017-08-03
问题一:每一日期,都扩展成月初至当天
输出结果如下所示:
date_id date_to_day
2017-08-01 2017-08-01
2017-08-02 2017-08-01
2017-08-02 2017-08-02
2017-08-03 2017-08-01
2017-08-03 2017-08-02
2017-08-03 2017-08-03
这种累积型的表,常用作桥接表。
参考答案:
-- Expand every date_id in t17 into one row per day from the first of its
-- month up to and including date_id itself.
select
    m.date_id,
    date_add(m.date_start_id, t.pos) as date_to_day
from (
    select
        date_id,
        date_sub(date_id, dayofmonth(date_id) - 1) as date_start_id   -- first day of the month
    from t17
) m
-- space(n) split on a single space yields exactly n + 1 elements (pos 0..n).
-- Splitting on an explicit delimiter avoids the empty-pattern split, whose
-- element count differs between Java 7 and Java 8+ runtimes.
lateral view posexplode(
    split(space(datediff(m.date_id, m.date_start_id)), ' ')
) t as pos, val;
十八、时间序列--构造连续日期
表名:t18
表字段及内容:
a b c
101 2018-01-01 10
101 2018-01-03 20
101 2018-01-06 40
102 2018-01-02 20
102 2018-01-04 30
102 2018-01-07 60
问题一:构造连续日期
问题描述:将表中数据的b字段扩充至范围[2018-01-01, 2018-01-07],并累积对c求和。
b字段的值是较稀疏的。
输出结果如下所示:
a b c d
101 2018-01-01 10 10
101 2018-01-02 0 10
101 2018-01-03 20 30
101 2018-01-04 0 30
101 2018-01-05 0 30
101 2018-01-06 40 70
101 2018-01-07 0 70
102 2018-01-01 0 0
102 2018-01-02 20 20
102 2018-01-03 0 20
102 2018-01-04 30 50
102 2018-01-05 0 50
102 2018-01-06 0 50
102 2018-01-07 60 110
参考答案:
-- For every a in t18, emit one row per day in [2018-01-01, 2018-01-07],
-- taking c from t18 where a row exists for that day (0 otherwise), together
-- with the running total d of c per a.
select
    a,
    b,
    c,
    sum(c) over (partition by a order by b) as d
from (
    select
        t1.a,
        t1.b,
        case
            when t18.b is not null then t18.c
            else 0                                  -- no source row for this day
        end as c
    from (
        -- Calendar scaffold: every a crossed with every day of the range.
        select
            m.a,
            date_add(m.s, t.pos) as b
        from (
            select
                a,
                '2018-01-01' as s,
                '2018-01-07' as r
            from (select a from t18 group by a) ta
        ) m
        -- space(n) split on a single space yields exactly n + 1 elements
        -- (pos 0..n); an explicit delimiter avoids the empty-pattern split,
        -- whose element count differs between Java 7 and Java 8+ runtimes.
        lateral view posexplode(split(space(datediff(m.r, m.s)), ' ')) t as pos, val
    ) t1
    left join t18
        on t1.a = t18.a
        and t1.b = t18.b
) ts;
十九、时间序列--取多个字段最新的值
表名:t19
表字段及内容:
date_id a b c
2014 AB 12 bc
2015 NULL 23 NULL
2016 NULL NULL d
2017 BC NULL NULL
问题一:如何一并取出a、b、c各字段最新的非空值及其对应的日期
输出结果如下所示:
date_a a date_b b date_c c
2017 BC 2015 23 2016 d
参考答案:
此处给出三种解法,其一:
-- For each of the columns a, b and c: the date_id of its top-ranked row and
-- the value found there, all returned in a single output row.
with ranked as (
    select
        date_id,
        a,
        b,
        c,
        -- Rank the rows once per column, newest date_id first. A row whose
        -- column is null gets sort key 0, which a descending sort places
        -- behind any positive date_id.
        row_number() over (order by case when a is null then 0 else date_id end desc) as rn_a,
        row_number() over (order by case when b is null then 0 else date_id end desc) as rn_b,
        row_number() over (order by case when c is null then 0 else date_id end desc) as rn_c
    from t19
)
select
    max(case when rn_a = 1 then date_id else 0 end) as date_a,
    max(case when rn_a = 1 then a end)              as a,
    max(case when rn_b = 1 then date_id else 0 end) as date_b,
    max(case when rn_b = 1 then b end)              as b,
    max(case when rn_c = 1 then date_id else 0 end) as date_c,
    max(case when rn_c = 1 then c end)              as c
from ranked
where rn_a = 1
   or rn_b = 1
   or rn_c = 1;
其二:
-- For each of the columns a, b and c: the most recent date_id on which the
-- column is non-null, plus the value on that date, all in one output row.
select
    la.date_id as date_a,
    la.a,
    lb.date_id as date_b,
    lb.b,
    lc.date_id as date_c,
    lc.c
from (
    -- newest row with a non-null a
    select date_id, a
    from t19
    where a is not null
      and date_id is not null
    order by date_id desc
    limit 1
) la
-- Each side holds at most one row; "on 1 = 1" simply places them side by
-- side, and the left join keeps the row when b or c has no non-null value.
left join (
    -- newest row with a non-null b
    select date_id, b
    from t19
    where b is not null
      and date_id is not null
    order by date_id desc
    limit 1
) lb
    on 1 = 1
left join (
    -- newest row with a non-null c
    select date_id, c
    from t19
    where c is not null
      and date_id is not null
    order by date_id desc
    limit 1
) lc
    on 1 = 1;
其三:
-- For each of the columns a, b and c: pick the row(s) whose date_id equals the
-- greatest date_id among rows where that column is non-null, then place the
-- three results side by side.
with latest_a as (
    select
        v.date_id as date_a,
        v.a
    from t19 v
    inner join (
        select max(date_id) as date_id from t19 where a is not null
    ) newest
        on v.date_id = newest.date_id
    where v.a is not null
),
latest_b as (
    select
        v.date_id as date_b,
        v.b
    from t19 v
    inner join (
        select max(date_id) as date_id from t19 where b is not null
    ) newest
        on v.date_id = newest.date_id
    where v.b is not null
),
latest_c as (
    select
        v.date_id as date_c,
        v.c
    from t19 v
    inner join (
        select max(date_id) as date_id from t19 where c is not null
    ) newest
        on v.date_id = newest.date_id
    where v.c is not null
)
select
    latest_a.date_a,
    latest_a.a,
    latest_b.date_b,
    latest_b.b,
    latest_c.date_c,
    latest_c.c
from latest_a
cross join latest_b
cross join latest_c;