Hive如何实现 count(distinct ) over (partition by )?

一、方式1:

count(distinct ) over(partition by order by) 替换成 size(collect_set() over(partition by order by)) 来实现, 含义为求分组后的去重个数。

测试数据:

-- Sample data for the distinct-count examples.
--   id = person number, m = month, k = an additional grouping key.
CREATE TABLE test_distinct AS
SELECT '1' AS id, '201808' AS m, 'a' AS k
UNION ALL
SELECT '2' AS id, '201808' AS m, 'a' AS k
UNION ALL
SELECT '1' AS id, '201809' AS m, 'a' AS k
UNION ALL
SELECT '1' AS id, '201808' AS m, 'b' AS k
UNION ALL
SELECT '2' AS id, '201809' AS m, 'b' AS k;

id代表人编号, m代表月份,k代表其他key键。

需求:本月累计人数(即9月份的客户要包含9月以前的客户数)

第一步:
    -- For every row of test_distinct, show the ids collected within k in month order,
    -- once without an explicit frame and once with an explicit ROWS frame.
    -- count(distinct id) over(partition by k) raised an error here, so
    -- size(collect_set(id) over(...)) is used to obtain the distinct count instead.
    SELECT
        test_distinct.*,
        collect_set(id) OVER w_no_frame,
        collect_set(id) OVER w_rows_frame,
        size(collect_set(id) OVER w_rows_frame) AS cnt
    FROM test_distinct
    WINDOW
        -- ordered window with no frame clause
        w_no_frame AS (PARTITION BY k ORDER BY m ASC),
        -- same ordering, frame limited to the rows from the partition start up to the current row
        w_rows_frame AS (PARTITION BY k ORDER BY m ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW);

结果:
发现 _wcol0 在 m=201808 且 k=a 时,它的值是["2", "1"],_wcol1 的值是["2"];在 m=201808 且 k=b 时,它的值是["1"]。这就是窗口加不加 rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW 的区别:
不加时,默认窗口会把 m 值相同的行全部算进来;加上之后,只累计到当前这一行为止。
Hive如何实现 count(distinct ) over (partition by )?_第1张图片

第二步:

只需要在每个 (k, m) 分组内取 cnt 最大的那一条数据就可以了(它对应截至该月的累计去重人数)

-- Cumulative distinct-id count per (k, m).
-- The running count is computed row by row, so rows of the same (k, m) can carry
-- different values; the largest one is the count that covers the whole month.
-- Result columns: k, m, cnt (the helper rank rk is always 1 and is not returned).
select
    tb.k,
    tb.m,
    tb.cnt
from
(
    select
        temp.k,
        temp.m,
        temp.cnt,
        -- rk = 1 is the row with the largest running count inside each (k, m)
        row_number() over(PARTITION BY temp.k, temp.m ORDER BY temp.cnt desc) as rk
    from
    (
        select
            k,
            m,
            -- number of distinct ids seen within k, in month order, up to the current row
            size(collect_set(id) over(partition by k ORDER BY m asc rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)) as cnt
        from test_distinct
    ) temp
) tb
where tb.rk = 1;

结果:
Hive如何实现 count(distinct ) over (partition by )?_第2张图片

二、方式2:

使用笛卡尔积实现:

第一步:
-- flag = 1 marks the earliest month (smallest m) in which each id appears within k.
select k, id, m, ROW_NUMBER() OVER(PARTITION BY k, id ORDER BY m ASC) AS flag from test_distinct;
第二步:
-- Cumulative distinct ids per (k, m), computed with a self-join instead of collect_set.
--   ins   : ids whose earliest month within k is exactly m (new in this month)
--   total : ids whose earliest month within k is m or any earlier month
-- m is compared as a value of the column's own type; with the sample data it is a
-- 'yyyyMM' string, for which string order matches chronological order.
select
	months.k,
	months.m,
	sum(case when firsts.m = months.m then firsts.cnt else 0 end) as ins,   -- new in the current month
	sum(firsts.cnt) as total                                                -- cumulative up to the current month
from
(
	-- every (k, m) present in the data, so a month without any new id is still reported
	select k, m
	from test_distinct
	group by k, m
) months
inner join
(
	-- per (k, m): how many ids have m as their earliest month within k
	select k, m, count(*) as cnt
	from
		(
			-- flag = 1 marks the earliest month of each id within k
			select k, m, ROW_NUMBER() OVER(PARTITION BY k, id ORDER BY m ASC) AS flag
			from test_distinct
		) a
	where flag = 1
	group by k, m
) firsts
	on months.k = firsts.k           -- never mix counts of different keys
where firsts.m <= months.m           -- only first appearances up to the reported month
group by months.k, months.m;

你可能感兴趣的:(hive,hive,大数据)