储存两年就是 2 x 365 x 1000万 = 7300000000(73亿)条记录,如果储存更长时间,所需的存储空间将难以估算。而用拉链表存储,每日只需向表中写入新增和发生变化的数据,每日不过20万条,
储存2年也只需要 2 x 365 x 200000 = 146000000(1.46亿)条记录的存储空间。
-- Source table: holds the current state of every user.
-- "if exists" keeps the script re-runnable on a database where the table
-- has not been created yet (a bare "drop table" would raise an error).
drop table if exists t_userinfo_src;
create table t_userinfo_src(
    user_id     int,                 -- user key; also the distribution key
    user_name   character varying,
    user_no     integer,
    phone_no    character varying,
    create_date date,                -- day the row was first created
    update_date date                 -- day the row was last modified
) distribute by hash(user_id);
-- Zipper (history) table: one row per version of a user.
-- A version is valid from effective_date until invalid_date; the version that
-- is still current carries invalid_date = 2999-12-31.
-- "if exists" keeps the script re-runnable on a database where the table
-- has not been created yet (a bare "drop table" would raise an error).
drop table if exists t_userinfo_zipper;
create table t_userinfo_zipper(
    user_id        int,                 -- user key; also the distribution key
    user_name      character varying,
    user_no        integer,
    phone_no       character varying,
    effective_date date,                -- first day this version is valid
    invalid_date   date                 -- day this version was closed, or 2999-12-31 while open
) distribute by hash(user_id);
2019年11月12日 新增了三个用户,
则这三条记录的生效时间为当天,由于到 2019年11月12日 为止,这三条记录还没有被修改过,所以失效时间为无穷大(2999-12-31)。
-- Seed the source table with the three users created on 2019-11-12.
-- The VALUES list mirrors the rows shown by the select that follows.
insert into t_userinfo_src(user_id,user_name,user_no,phone_no,create_date,update_date)
values
    (1001,'se7en.shi',110,'13000000001',to_date('2019-11-12','yyyy-mm-dd'),to_date('2019-11-12','yyyy-mm-dd')),
    (1002,'eleven',120,'13000000002',to_date('2019-11-12','yyyy-mm-dd'),to_date('2019-11-12','yyyy-mm-dd')),
    (1003,'rose',120,'13000000003',to_date('2019-11-12','yyyy-mm-dd'),to_date('2019-11-12','yyyy-mm-dd'));
postgres=> select * from t_userinfo_src;
user_id | user_name | user_no | phone_no | create_date | update_date
1002 | eleven | 120 | 13000000002 | 2019-11-12 00:00:00 | 2019-11-12 00:00:00
1001 | se7en.shi | 110 | 13000000001 | 2019-11-12 00:00:00 | 2019-11-12 00:00:00
1003 | rose | 120 | 13000000003 | 2019-11-12 00:00:00 | 2019-11-12 00:00:00
(3 rows)
-- Run dated 2019-11-13: the function processes the previous day (2019-11-12),
-- so the three users created on 2019-11-12 are loaded into the zipper table.
select * from fn_userinfo_zipper('2019-11-13');
postgres=> select * from t_userinfo_zipper;
user_id | user_name | user_no | phone_no | effective_date | invalid_date
1003 | rose | 120 | 13000000003 | 2019-11-12 00:00:00 | 2999-12-31 00:00:00
1001 | se7en.shi | 110 | 13000000001 | 2019-11-12 00:00:00 | 2999-12-31 00:00:00
1002 | eleven | 120 | 13000000002 | 2019-11-12 00:00:00 | 2999-12-31 00:00:00
(3 rows)
用户 1001 被删除,
用户 1002 的电话号码被修改成 13000000004,同时新增了用户 1004。
为了保留历史状态,用户 1001 的失效时间被修改成 2019-11-13;用户 1002 则变成两条记录:一条是失效时间为 2019-11-13 的旧记录,另一条是生效时间为 2019-11-13、失效时间为无穷大的新记录。
-- Changes made to the source table on 2019-11-13.
-- User 1001 is physically deleted from the source.
delete from t_userinfo_src where user_id=1001;
-- User 1002 gets a new phone number; update_date records the day of the change.
update t_userinfo_src set phone_no='13000000004',update_date='2019-11-13' where user_id=1002;
-- User 1004 is newly created; the VALUES list mirrors the row shown by the
-- select that follows.
insert into t_userinfo_src(user_id,user_name,user_no,phone_no,create_date,update_date)
values
    (1004,'jack',110,'13000000005',to_date('2019-11-13','yyyy-mm-dd'),to_date('2019-11-13','yyyy-mm-dd'));
postgres=> select * from t_userinfo_src;
user_id | user_name | user_no | phone_no | create_date | update_date
1003 | rose | 120 | 13000000003 | 2019-11-12 00:00:00 | 2019-11-12 00:00:00
1004 | jack | 110 | 13000000005 | 2019-11-13 00:00:00 | 2019-11-13 00:00:00
1002 | eleven | 120 | 13000000004 | 2019-11-12 00:00:00 | 2019-11-13 00:00:00
postgres=> select * from t_userinfo_zipper;
user_id | user_name | user_no | phone_no | effective_date | invalid_date
1003 | rose | 120 | 13000000003 | 2019-11-12 00:00:00 | 2999-12-31 00:00:00 --拉链表中,14号执行后,应该保持不变
1001 | se7en.shi | 110 | 13000000001 | 2019-11-12 00:00:00 | 2999-12-31 00:00:00 --拉链表中,14号执行后,应该被标记为删除,失效时间是2019-11-13 00:00:00
1002 | eleven | 120 | 13000000002 | 2019-11-12 00:00:00 | 2999-12-31 00:00:00 --拉链表中,14号执行后,应该被标记为无效,无效时间是2019-11-13 00:00:00,并新增一条更新后的记录
-- Run dated 2019-11-14: the function processes the previous day (2019-11-13),
-- capturing the delete of 1001, the update of 1002 and the new user 1004.
select * from fn_userinfo_zipper('2019-11-14');
postgres=> select * from t_userinfo_src;
user_id | user_name | user_no | phone_no | create_date | update_date
1003 | rose | 120 | 13000000003 | 2019-11-12 00:00:00 | 2019-11-12 00:00:00
1004 | jack | 110 | 13000000005 | 2019-11-13 00:00:00 | 2019-11-13 00:00:00
1002 | eleven | 120 | 13000000004 | 2019-11-12 00:00:00 | 2019-11-13 00:00:00
(3 rows)
postgres=> select * from t_userinfo_zipper;
user_id | user_name | user_no | phone_no | effective_date | invalid_date
1003 | rose | 120 | 13000000003 | 2019-11-12 00:00:00 | 2999-12-31 00:00:00
1001 | se7en.shi | 110 | 13000000001 | 2019-11-12 00:00:00 | 2019-11-13 00:00:00 --被标记为删除,invalid_date为2019-11-13 00:00:00
1004 | jack | 110 | 13000000005 | 2019-11-13 00:00:00 | 2999-12-31 00:00:00 --新增数据
1002 | eleven | 120 | 13000000002 | 2019-11-12 00:00:00 | 2019-11-13 00:00:00 --被标记为无效
1002 | eleven | 120 | 13000000004 | 2019-11-13 00:00:00 | 2999-12-31 00:00:00 --更新后的数据
(5 rows)
1,如果要查询最新的数据,那么只要查询失效时间为 2999-12-31 的数据即可
postgres=> select * from t_userinfo_zipper where invalid_date='2999-12-31';
user_id | user_name | user_no | phone_no | effective_date | invalid_date
1004 | jack | 110 | 13000000005 | 2019-11-13 00:00:00 | 2999-12-31 00:00:00
1003 | rose | 120 | 13000000003 | 2019-11-12 00:00:00 | 2999-12-31 00:00:00
1002 | eleven | 120 | 13000000004 | 2019-11-13 00:00:00 | 2999-12-31 00:00:00
2,如果要查询 2019年11月12号 的历史数据,则筛选生效时间 <= 2019-11-12 并且失效时间 > 2019-11-12 的数据即可;
postgres=> select * from t_userinfo_zipper where effective_date<='2019-11-12' and invalid_date >'2019-11-12';
user_id | user_name | user_no | phone_no | effective_date | invalid_date
1003 | rose | 120 | 13000000003 | 2019-11-12 00:00:00 | 2999-12-31 00:00:00
1002 | eleven | 120 | 13000000002 | 2019-11-12 00:00:00 | 2019-11-13 00:00:00
1001 | se7en.shi | 110 | 13000000001 | 2019-11-12 00:00:00 | 2019-11-13 00:00:00
(3 rows)
create or replace function fn_userinfo_zipper(IN cur_date text)
returns void
as $$
/*
 * Records inserts, updates and deletes found in t_userinfo_src into the
 * zipper (history) table t_userinfo_zipper.
 *
 * The version of a user that is currently valid has invalid_date = 2999-12-31.
 *
 * Parameter:
 *   cur_date  run date as text in 'yyyy-mm-dd' format. The job run on a given
 *             day processes the previous day's data, so the business date
 *             used throughout is to_date(cur_date,'yyyy-mm-dd') - 1.
 *
 * Steps:
 *   1.   Keys absent from the zipper table and created on the business date
 *        are inserted as new, open versions.
 *   2.   Open versions whose key no longer exists in the source are closed
 *        (captured delete).
 *   3.1  Close the chain: open versions whose attributes differ from the
 *        source are closed with the business date.
 *   3.2  Open the chain: a new open version is inserted for every source row
 *        that has history in the zipper table but no open version.
 *
 * Each step only touches or creates open versions, so closed history rows are
 * never rewritten and re-running the function for the same date is a no-op.
 *
 * @author: se7en.shi
 * @date: 2019-11-13
 */
declare
    -- Business date being processed: the day before the run date.
    v_biz_date date := to_date(cur_date,'yyyy-mm-dd') - 1;
    -- Sentinel end date that marks a version as still valid.
    v_open_end date := to_date('2999-12-31','yyyy-mm-dd');
begin
    -- A NULL date would make step 2/3.1 write NULL into invalid_date.
    if v_biz_date is null then
        raise exception 'cur_date must be a non-null date string in yyyy-mm-dd format';
    end if;

    -- 1. Key not present in the target table: treat as a new record.
    insert into t_userinfo_zipper(user_id,user_name,user_no,phone_no,effective_date,invalid_date)
    select s.user_id,s.user_name,s.user_no,s.phone_no,s.create_date,v_open_end
    from t_userinfo_src s
    where s.create_date = v_biz_date
      and not exists(
            select 1 from t_userinfo_zipper z
            where z.user_id = s.user_id);
    raise notice 'finish new increasing ...';

    -- 2. Key no longer present in the source: close the open version.
    --    Restricting to open versions keeps the invalid_date of history rows
    --    that were closed by earlier runs intact.
    update t_userinfo_zipper z set invalid_date = v_biz_date
    where z.invalid_date = v_open_end
      and not exists(
            select 1 from t_userinfo_src s
            where s.user_id = z.user_id);
    raise notice 'finish delete data capture ...';

    -- 3. Capture modified rows.
    -- 3.1 Close the chain: the key exists in both tables but the attributes
    --     differ, so the open version ends on the business date.
    --     "is distinct from" also detects changes to or from NULL.
    update t_userinfo_zipper z set invalid_date = v_biz_date
    where z.invalid_date = v_open_end
      and exists(
            select 1 from t_userinfo_src s
            where s.user_id = z.user_id
              and s.create_date < v_biz_date
              and (   s.user_name is distinct from z.user_name
                   or s.user_no   is distinct from z.user_no
                   or s.phone_no  is distinct from z.phone_no));
    raise notice 'finish modifyed data capture lable invalid...';

    -- 3.2 Open the chain: insert the current source row as a new open version
    --     for every key that has history but no open version left.
    --     Checking for a missing open version (rather than for any closed
    --     version) prevents a duplicate from being inserted on later runs.
    insert into t_userinfo_zipper(user_id,user_name,user_no,phone_no,effective_date,invalid_date)
    select s.user_id,s.user_name,s.user_no,s.phone_no,v_biz_date,v_open_end
    from t_userinfo_src s
    where s.create_date <= v_biz_date
      and exists(
            select 1 from t_userinfo_zipper z
            where z.user_id = s.user_id)
      and not exists(
            select 1 from t_userinfo_zipper z
            where z.user_id = s.user_id
              and z.invalid_date = v_open_end);
    raise notice 'finish modifyed data capture new insert';
end;
$$ language plpgsql;