当 count(distinct ...) over(partition by ... order by ...) 无法使用时(下文示例中该写法会报错),可以替换成 size(collect_set(...) over(partition by ... order by ...)) 来实现,含义为求分组后的去重个数。
测试数据:
-- Fixture table for the running-distinct-count examples.
--   id : person identifier
--   m  : month
--   k  : an additional grouping key
-- Every branch repeats the column aliases so all UNION ALL inputs expose the
-- same column names.
CREATE TABLE test_distinct AS
SELECT '1' AS id, '201808' AS m, 'a' AS k UNION ALL
SELECT '2' AS id, '201808' AS m, 'a' AS k UNION ALL
SELECT '1' AS id, '201809' AS m, 'a' AS k UNION ALL
SELECT '1' AS id, '201808' AS m, 'b' AS k UNION ALL
SELECT '2' AS id, '201809' AS m, 'b' AS k;
id 代表人员编号,m 代表月份(格式 yyyyMM),k 代表其他分组键。
需求:统计截至当月的累计去重人数(即 9 月份的人数要包含 9 月以前出现过的客户)。
-- Side-by-side comparison of collecting the distinct ids seen so far within
-- each k (ordered by month) with and without an explicit ROWS frame.
-- Alternatives noted while exploring:
--   count(distinct id) over (partition by k) cnt                    -- raises an error
--   size(collect_set(id) over (partition by k order by m asc)) cnt  -- same idea without a frame clause
SELECT
    test_distinct.*,
    -- 1st unnamed column: no frame clause
    collect_set(id) OVER (PARTITION BY k ORDER BY m ASC),
    -- 2nd unnamed column: explicit ROWS frame ending at the current row
    collect_set(id) OVER (
        PARTITION BY k
        ORDER BY m ASC
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ),
    -- number of distinct ids inside the explicit ROWS frame
    size(
        collect_set(id) OVER (
            PARTITION BY k
            ORDER BY m ASC
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        )
    ) AS cnt
FROM test_distinct;
结果:
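发现在 m=201808 且 k=a 时,_wcol0 的值是 ["2", "1"],而 _wcol1 的值是 ["2"];在 m=201808 且 k=b 时,值是 ["1"]。这就是窗口加不加
rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW 的区别:只写 order by 而不写窗口子句时,默认范围是 RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW,会把 m 相同的行一并算入;显式写 rows 后只统计到当前行为止。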
因此对每个 (k, m) 只需要取 cnt 最大的那一条数据就可以了:
-- Cumulative distinct person count per (k, m).
--
-- Innermost query: running distinct count of id within each k, ordered by
--   month, using an explicit ROWS frame. Rows that share the same month can
--   therefore see different counts, depending on their position in the frame.
-- Middle query: rank the rows of each (k, m) by that count, largest first.
-- Outer query: keep the top-ranked row, i.e. the count reached once every row
--   up to and including that month has been collected.
--
-- cnt is carried through every level so the final result actually contains the
-- cumulative count; rk is kept in the output (it is always 1 after filtering).
select
    tb.k,
    tb.m,
    tb.rk,
    tb.cnt
from
(
    select
        temp.k,
        temp.m,
        temp.cnt,
        row_number() over (partition by temp.k, temp.m order by temp.cnt desc) as rk
    from
    (
        select
            k,
            m,
            size(
                collect_set(id) over (
                    partition by k
                    order by m asc
                    rows between unbounded preceding and current row
                )
            ) as cnt
        from test_distinct
    ) temp
) tb
where tb.rk = 1;
使用笛卡尔积实现:
-- Step 1 (preview): flag the first month in which each id appears within a k.
-- flag = 1 marks the row where that id is new for that k.
select
    k,
    m,
    row_number() over (partition by k, id order by m asc) as flag,
    id
from test_distinct;

-- Step 2: new and cumulative distinct person counts per (k, m).
--   months : every (k, m) present in the data, so a month without any new id
--            still gets a result row.
--   fresh  : number of ids whose first appearance within k falls in month m.
-- The two are joined on k, and only fresh months up to and including the
-- reporting month are kept. The month comparison is on the raw m values, so it
-- relies on m sorting chronologically (true for fixed-width yyyyMM strings).
-- The equality on k sits in ON and the range condition in WHERE, which also
-- works on engines that only accept equality conditions in ON.
select
    months.k,
    months.m,
    sum(case when fresh.m = months.m then fresh.cnt else 0 end) as ins, -- new in this month
    sum(fresh.cnt) as total -- cumulative up to and including this month
from
(
    select k, m
    from test_distinct
    group by k, m
) months
inner join
(
    select k, m, count(*) as cnt
    from
    (
        select
            k,
            m,
            row_number() over (partition by k, id order by m asc) as flag
        from test_distinct
    ) first_seen
    where flag = 1
    group by k, m
) fresh
    on months.k = fresh.k
where fresh.m <= months.m
group by months.k, months.m;