拉链表会在维度表中添加两个字段,一个是该行记录的生效时间,一个是该行记录的失效时间,用于记录每条信息的生命周期;
在这种缓慢变化维表 + 每日存储全量历史数据 的应用场景下,使用拉链表可以减少冗余数据,提高存储效率;
如下是每日全量同步和使用拉链表进行对比:
表中存储两种数据:1.失效数据 2.有效数据
缓慢变化维表中 大部分是有效数据,使用拉链表我们只需要存储一份有效数据即可;否则有效数据每日都要存储一分冗余太多
如何使用拉链表就是说如何从拉链表获取想要的数据
场景1:获取一份最新的全量数据
(2)第二天及往后
dim dt = 9999
的数据和 ods dt = 当天
的数据full join如下图所示:
insert overwrite table dim_user_info partition(dt='9999-99-99')
select
id,
login_name,
nick_name,
md5(name),
md5(phone_num),
md5(email),
user_level,
birthday,
gender,
create_time,
operate_time,
'2020-06-14',
'9999-99-99'
from ods_user_info
where dt='2020-06-14';
with
tmp as
(
select
old.id old_id,
old.login_name old_login_name,
old.nick_name old_nick_name,
old.name old_name,
old.phone_num old_phone_num,
old.email old_email,
old.user_level old_user_level,
old.birthday old_birthday,
old.gender old_gender,
old.create_time old_create_time,
old.operate_time old_operate_time,
old.start_date old_start_date,
old.end_date old_end_date,
new.id new_id,
new.login_name new_login_name,
new.nick_name new_nick_name,
new.name new_name,
new.phone_num new_phone_num,
new.email new_email,
new.user_level new_user_level,
new.birthday new_birthday,
new.gender new_gender,
new.create_time new_create_time,
new.operate_time new_operate_time,
new.start_date new_start_date,
new.end_date new_end_date
from
( --dim有效数据
select
id,
login_name,
nick_name,
name,
phone_num,
email,
user_level,
birthday,
gender,
create_time,
operate_time,
start_date,
end_date
from dim_user_info
where dt='9999-99-99'
)old
full outer join -- full join
( --ods当天全量数据是新增及变化的
select
id,
login_name,
nick_name,
md5(name) name,
md5(phone_num) phone_num,
md5(email) email,
user_level,
birthday,
gender,
create_time,
operate_time,
'2020-06-15' start_date,
'9999-99-99' end_date
from ods_user_info
where dt='2020-06-15'
)new
on old.id=new.id
)
insert overwrite table dim_user_info partition(dt)
select
-- ods的不为null的最新数据 和 ods为null dim不为null的数据 也就是有效数据
--装载到dim dt = 999
nvl(new_id,old_id),
nvl(new_login_name,old_login_name),
nvl(new_nick_name,old_nick_name),
nvl(new_name,old_name),
nvl(new_phone_num,old_phone_num),
nvl(new_email,old_email),
nvl(new_user_level,old_user_level),
nvl(new_birthday,old_birthday),
nvl(new_gender,old_gender),
nvl(new_create_time,old_create_time),
nvl(new_operate_time,old_operate_time),
nvl(new_start_date,old_start_date),
nvl(new_end_date,old_end_date),
nvl(new_end_date,old_end_date) dt
from tmp
union all --union 去重 union all不去重
select
--装载到dt = 14的数据
old_id,
old_login_name,
old_nick_name,
old_name,
old_phone_num,
old_email,
old_user_level,
old_birthday,
old_gender,
old_create_time,
old_operate_time,
old_start_date,
cast(date_add('2020-06-15',-1) as string),
cast(date_add('2020-06-15',-1) as string) dt --动态分区
from tmp
where new_id is not null and old_id is not null; -- join上的数据为过期的