业务需求中往往需要将多表全量数据合并,数据行合并时用union、union all,列合并时则用full join。示例如下:
create table tmp.tmp_fulljoin_tablea (key1 string,key2 string,tcola string);
create table tmp.tmp_fulljoin_tableb (key1 string,key2 string,tcolb string);
create table tmp.tmp_fulljoin_tablec (key1 string,key2 string,tcolc string);
insert into table tmp.tmp_fulljoin_tablea
select '111','aaa','111aaa'
union all
select '222','bbb','222bbb'
union all
select '333',null,'333ccc';
insert into table tmp.tmp_fulljoin_tableb
select '111','aaa','111aaa'
union all
select '444','ddd','444ddd'
union all
select null,'ccc','333ccc';
insert into table tmp.tmp_fulljoin_tablec
select '111','aaa','111aaa'
union all
select '444','ddd','444ddd'
union all
select '333','ccc','333ccc';
a表与b表通过key1进行全关联全字段
select *
from tmp.tmp_fulljoin_tablea a
full join tmp.tmp_fulljoin_tableb b
on a.key1 = b.key1;
实际应用
select COALESCE(a.key1,b.key1) key1,
COALESCE(a.key2,b.key2) key2,
tcola,tcolb
from tmp.tmp_fulljoin_tablea a
full join tmp.tmp_fulljoin_tableb b
on a.key1 = b.key1;
a、b、c表通过key1进行全关联全字段
select *
from tmp.tmp_fulljoin_tablea a
full join tmp.tmp_fulljoin_tableb b
on a.key1 = b.key1
full join tmp.tmp_fulljoin_tablec c
on COALESCE(a.key1,b.key1) = c.key1;
注意不要将c表关联条件写成on a.key1 = c.key1;否则达不到多表关联效果
实际应用
select COALESCE(a.key1,b.key1,c.key1) key1,
COALESCE(a.key2,b.key2,c.key2) key2,
tcola,tcolb,tcolc
from tmp.tmp_fulljoin_tablea a
full join tmp.tmp_fulljoin_tableb b
on a.key1 = b.key1
full join tmp.tmp_fulljoin_tablec c
on COALESCE(a.key1,b.key1) = c.key1;
select COALESCE(a.key1,b.key1,c.key1) key1,
COALESCE(a.key2,b.key2,c.key2) key2,
tcola,tcolb,tcolc
from tmp.tmp_fulljoin_tablea a
full join tmp.tmp_fulljoin_tableb b
on a.key1 = b.key1 and a.key2 = b.key2
full join tmp.tmp_fulljoin_tablec c
on COALESCE(a.key1,b.key1) = c.key1 and COALESCE(a.key2,b.key2) = c.key2
结合执行计划可以看出先是a表和b表关联,然后得出的结果再与c表关联,使用coalesce也不会出现c表的333eee和a表的333、b表的ccc错表关联。
当数据量过大时可以尝试将所有表的关联键落表,然后left join。根据实际环境执行表现选择最佳方法。
create table as tmp_key as
select key1,key2 from tmp.tmp_fulljoin_tablea
union
select key1,key2 from tmp.tmp_fulljoin_tableb
union
select key1,key2 from tmp.tmp_fulljoin_tablec;
select k.*,
tcola,tcolb,tcolc
from tmp_key k
left join tmp.tmp_fulljoin_tablea a
on k.key1=a.key1 and k.key2=a.key2
left join tmp.tmp_fulljoin_tableb b
on k.key1=b.key1 and k.key2=b.key2
left join tmp.tmp_fulljoin_tablec c
on k.key1=c.key1 and k.key2=c.key2
--注意key值有null的情况