-- 创建测试表
-- 變更的數據:
-- 1. 新增數據
-- 2. 變更數據:
--    (1) 歷史數據存在,回溯
--    (2) 當日新增,當日變更
-- 3. 刪除數據:
--    (1) 歷史數據存在,回溯過程
--    (2) 當日新增,當日刪除
-- ==============================================測試數據集===============================================
-- Source table for the test data set. Each dt partition holds the change
-- records loaded for that day, and one id may appear more than once inside a
-- partition. is_delete = 1 marks a deletion record.
--
-- The table description is declared with COMMENT. It was previously passed to
-- NULL DEFINED AS, which makes Hive use that text as the stored representation
-- of NULL values instead of describing the table.
use default;
drop table if exists test3;
CREATE TABLE if not exists test3(
    id string,
    name string,
    create_date string,
    last_modified_date string,
    amount double,
    is_delete int
)
COMMENT '增量測試數據'
partitioned by (dt string)
row format delimited fields terminated by '\001'
LOCATION '/user/hive/warehouse/test3';
-- 插入测试数据
-- Test fixtures for test3, one multi-row statement per dt partition.
-- Column order: id, name, create_date, last_modified_date, amount, is_delete.
--
-- Two timestamps were corrected so the fixtures stay consistent over time:
--   * id 1 in dt=2023-10-04 carried last_modified_date 2023-10-10, which is
--     later than its 2023-10-05 update and would make the stale row win any
--     latest-by-last_modified_date selection on the following day
--   * id 5 in dt=2023-10-05 carried create_date 2023-10-10, which is after its
--     own last_modified_date and differs from the create_date it was loaded
--     with on 2023-10-04

-- dt=2023-10-03: initial load. id 4 arrives already flagged as deleted.
insert overwrite table test3 partition(dt='2023-10-03') values
    ('1','手機','2023-10-01 10:11:11','2023-10-01 10:11:11',3,0),
    ('2','手機','2023-10-01 10:11:11','2023-10-01 10:11:11',2,0),
    ('3','電腦','2023-10-02 10:11:11','2023-10-02 10:11:11',3,0),
    ('4','儀器','2023-10-02 10:11:11','2023-10-02 10:11:11',10,1);

-- dt=2023-10-04:
--   id 1     changed, present in the previous day
--   id 2     changed twice on the same day
--   id 3     deleted, present in the previous day
--   id 5     newly created
--   id 6     created and deleted on the same day
--   id 7     created and changed on the same day
insert overwrite table test3 partition(dt='2023-10-04') values
    ('1','手機','2023-10-01 10:11:11','2023-10-04 10:11:11',2,0),
    ('2','手機','2023-10-01 10:11:11','2023-10-04 11:11:11',11,0),
    ('2','手機','2023-10-01 10:11:11','2023-10-04 16:11:11',6,0),
    ('3','電腦','2023-10-02 10:11:11','2023-10-04 10:11:11',3,1),
    ('5','電腦','2023-10-04 10:11:11','2023-10-04 10:11:11',30,0),
    ('6','電腦','2023-10-04 10:11:11','2023-10-04 10:11:11',30,0),
    ('6','電腦','2023-10-04 10:11:11','2023-10-04 12:11:11',30,1),
    ('7','電腦','2023-10-04 10:11:11','2023-10-04 10:11:11',11,0),
    ('7','電腦','2023-10-04 10:11:11','2023-10-04 12:11:11',10,0);

-- dt=2023-10-05:
--   id 1, 2, 7   changed, present in the previous day
--   id 5         deleted, present in the previous day
--   id 6, 8      created and deleted on the same day
insert overwrite table test3 partition(dt='2023-10-05') values
    ('1','手機','2023-10-01 10:11:11','2023-10-05 10:11:11',1,0),
    ('2','手機','2023-10-01 10:11:11','2023-10-05 16:11:11',3,0),
    ('5','電腦','2023-10-04 10:11:11','2023-10-05 10:11:11',30,1),
    ('6','電腦','2023-10-05 10:11:11','2023-10-05 10:11:11',2,0),
    ('6','電腦','2023-10-05 10:11:11','2023-10-05 12:11:11',2,1),
    ('7','電腦','2023-10-04 10:11:11','2023-10-05 12:11:11',7,0),
    ('8','電腦','2023-10-05 10:11:11','2023-10-05 10:11:11',10,0),
    ('8','電腦','2023-10-05 10:11:11','2023-10-05 12:11:11',10,1);
-- 建一張表:每日全量數據,首日過濾出有效數據
-- Daily full-snapshot table, one partition per day. It has the same columns
-- as test3 plus an integer flag column.
--
-- The table description is declared with COMMENT. It was previously passed to
-- NULL DEFINED AS, which makes Hive use that text as the stored representation
-- of NULL values instead of describing the table.
drop table if exists test_full;
CREATE TABLE if not exists test_full(
    id string,
    name string,
    create_date string,
    last_modified_date string,
    amount double,
    is_delete int,
    flag int
)
COMMENT '最新全量數據'
partitioned by (dt string)
row format delimited fields terminated by '\001'
LOCATION '/user/hive/warehouse/test_full';
-- 插入首日数据
-- First day: seed the 2023-10-03 partition of test_full with the rows of the
-- same test3 partition whose is_delete is 0, each tagged with flag = 1.
INSERT OVERWRITE TABLE test_full PARTITION (dt = '2023-10-03')
SELECT
    src.id,
    src.name,
    src.create_date,
    src.last_modified_date,
    src.amount,
    src.is_delete,
    1 AS flag
FROM test3 AS src
WHERE src.is_delete = 0
  AND src.dt = '2023-10-03';
-- 建一張表:每日变更数据
-- Daily change-detail table, one partition per day. It has the same columns
-- as test_full, including the integer flag column.
--
-- The table description is declared with COMMENT. It was previously passed to
-- NULL DEFINED AS, which makes Hive use that text as the stored representation
-- of NULL values instead of describing the table.
drop table if exists test_detail;
CREATE TABLE if not exists test_detail(
    id string,
    name string,
    create_date string,
    last_modified_date string,
    amount double,
    is_delete int,
    flag int
)
COMMENT '每日变更数据'
partitioned by (dt string)
row format delimited fields terminated by '\001'
LOCATION '/user/hive/warehouse/test_detail';
-- 每日变更表插入数据
-- Build the change-detail rows for dt = 2023-10-04 and write them to the
-- matching partition of test_detail.
--
-- Inputs:
--   test3      partition dt = 2023-10-04 (records received on the day)
--   test_full  partition of the previous day (rows with is_delete = 0)
--
-- Every output row carries a flag:
--    1  current version of a record that is new or changed on the day
--   -1  either the previous-snapshot version of a changed record, or a
--       deletion of a record that exists in the previous snapshot
--    0  deletion of a record that is absent from the previous snapshot
--       (created and deleted on the same day)
--
-- Fixes over the earlier revision: the target clause read
-- test_detailpartition (missing space), and the final projection used
-- select * instead of naming the columns.
with extra as (
    -- Latest version of each id among the records received on the day.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete
    from (
        select id,
               name,
               create_date,
               last_modified_date,
               amount,
               is_delete,
               row_number() over (partition by id order by last_modified_date desc) rn
        from test3
        where dt = '2023-10-04'
    ) t1
    where rn = 1
)
, old as (
    -- Rows with is_delete = 0 from the previous day partition of test_full.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           flag
    from test_full
    where dt = date_sub('2023-10-04', 1)
      and is_delete = 0
)
, deleted_data as (
    -- Records whose latest version on the day is a deletion.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete
    from extra
    where is_delete = 1
)
, flag_deleted as (
    -- Check whether each deleted record exists in the previous snapshot.
    select deleted_data.id,
           deleted_data.name,
           deleted_data.create_date,
           deleted_data.last_modified_date,
           deleted_data.amount,
           deleted_data.is_delete,
           -- No match in old means created and deleted on the same day: flag 0.
           if(old.id is null, 0, -1) flag
    from deleted_data
    left join old
           on deleted_data.id = old.id
)
, update_data as (
    -- Non-deleted records whose last_modified_date differs from create_date.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete
    from extra
    where is_delete = 0
      and date_format(last_modified_date, 'yyyy-MM-dd HH:mm:ss') <> date_format(create_date, 'yyyy-MM-dd HH:mm:ss')
)
, flag_updated as (
    -- Pair every changed record with its previous-snapshot version, if any.
    select update_data.id,
           update_data.name,
           update_data.create_date,
           update_data.last_modified_date,
           update_data.amount,
           update_data.is_delete,
           old.id                 old_id,
           old.name               old_name,
           old.create_date        old_create_date,
           old.last_modified_date old_last_modified_date,
           old.amount             old_amount,
           old.is_delete          old_is_delete
    from update_data
    left join old
           on update_data.id = old.id
)
, one_update as (
    -- Changed records without a previous version: a single row with flag 1.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           1 flag
    from flag_updated
    where old_id is null
)
, two_update as (
    -- Changed records with a previous version: the new values get flag 1 and
    -- the previous values get flag2 = -1. Both are emitted as rows in result.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           1 flag,
           old_id,
           old_name,
           old_create_date,
           old_last_modified_date,
           old_amount,
           old_is_delete,
           -1 flag2
    from flag_updated
    where old_id is not null
)
, result as (
    -- Deletions.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           flag
    from flag_deleted
    union all
    -- Changes without a previous version.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           flag
    from one_update
    union all
    -- Changes with a previous version: new values.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           flag
    from two_update
    union all
    -- Changes with a previous version: previous values.
    select old_id,
           old_name,
           old_create_date,
           old_last_modified_date,
           old_amount,
           old_is_delete,
           flag2
    from two_update
    union all
    -- Non-deleted records whose last_modified_date equals create_date.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           1 flag
    from extra
    where is_delete = 0
      and date_format(last_modified_date, 'yyyy-MM-dd HH:mm:ss') = date_format(create_date, 'yyyy-MM-dd HH:mm:ss')
)
insert overwrite table test_detail partition (dt='2023-10-04')
select id,
       name,
       create_date,
       last_modified_date,
       amount,
       is_delete,
       flag
from result;
-- 每日全量数据
-- Daily run: rebuild the 2023-10-04 partition of test_full.
-- Candidate rows are the previous day rows of test_full with is_delete = 0
-- plus all rows of the 2023-10-04 partition of test_detail. For each id only
-- the candidate with the greatest last_modified_date is kept.
WITH previous_full AS (
    SELECT
        id,
        name,
        create_date,
        last_modified_date,
        amount,
        is_delete,
        flag
    FROM test_full
    WHERE is_delete = 0
      AND dt = date_sub('2023-10-04', 1)
),
day_changes AS (
    SELECT
        id,
        name,
        create_date,
        last_modified_date,
        amount,
        is_delete,
        flag
    FROM test_detail
    WHERE dt = '2023-10-04'
),
candidates AS (
    SELECT id, name, create_date, last_modified_date, amount, is_delete, flag
    FROM previous_full
    UNION ALL
    SELECT id, name, create_date, last_modified_date, amount, is_delete, flag
    FROM day_changes
),
ranked AS (
    -- Number the candidates of each id from newest to oldest.
    SELECT
        c.id,
        c.name,
        c.create_date,
        c.last_modified_date,
        c.amount,
        c.is_delete,
        c.flag,
        row_number() OVER (PARTITION BY c.id ORDER BY c.last_modified_date DESC) AS rn
    FROM candidates c
)
INSERT OVERWRITE TABLE test_full PARTITION (dt = '2023-10-04')
SELECT
    id,
    name,
    create_date,
    last_modified_date,
    amount,
    is_delete,
    flag
FROM ranked
WHERE rn = 1;