hive变更数据过程

创建测试表

變更的數據:
    1.新增數據
    2.變更數據:
        (1)歷史數據存在,回溯
        (2)當日新增,當日變更
    3.刪除數據:
        (1)歷史數據存在,回溯過程
        (2)當日新增,當日刪除
-- ==============================================測試數據集===============================================

-- Source table for the incremental-load walkthrough.
-- Each dt partition holds the raw records delivered on that load date; the same
-- id may appear several times within one partition (one row per version).
use default;
drop table if exists test3;
create table if not exists test3 (
    id                 string,
    name               string,
    -- Timestamps are stored as text; the seed data uses 'yyyy-MM-dd HH:mm:ss'.
    create_date        string,
    last_modified_date string,
    amount             double,
    -- The load queries treat 0 as a live row and 1 as a deleted row.
    is_delete          int
)
-- The description belongs in the table COMMENT. It was previously passed to
-- NULL DEFINED AS, which would have written every NULL into the data files as
-- this text and read any equal string back as NULL.
comment '增量測試數據'
partitioned by (dt string)
row format delimited fields terminated by '\001'
location '/user/hive/warehouse/test3';

插入测试数据

-- Seed partition dt='2023-10-03': four ids, one version each.
-- Id 4 arrives already flagged as deleted (is_delete = 1).
insert overwrite table test3 partition (dt = '2023-10-03')
values
    ('1', '手機', '2023-10-01 10:11:11', '2023-10-01 10:11:11', 3, 0),
    ('2', '手機', '2023-10-01 10:11:11', '2023-10-01 10:11:11', 2, 0),
    ('3', '電腦', '2023-10-02 10:11:11', '2023-10-02 10:11:11', 3, 0),
    ('4', '儀器', '2023-10-02 10:11:11', '2023-10-02 10:11:11', 10, 1);

-- Seed partition dt='2023-10-04'. Covers: updates to existing ids (1, 2),
-- several versions of one id in the same day (2, 6, 7), deletion of an
-- existing id (3), a plain new id (5), and an id created and deleted on the
-- same day (6).
-- NOTE(review): id 1 carries last_modified_date '2023-10-10', which is later
-- than the partition date. Looks like a typo for '2023-10-04'; confirm.
insert overwrite table test3 partition (dt = '2023-10-04')
values
    ('1', '手機', '2023-10-01 10:11:11', '2023-10-10 10:11:11', 2, 0),
    ('2', '手機', '2023-10-01 10:11:11', '2023-10-04 11:11:11', 11, 0),
    ('2', '手機', '2023-10-01 10:11:11', '2023-10-04 16:11:11', 6, 0),
    ('3', '電腦', '2023-10-02 10:11:11', '2023-10-04 10:11:11', 3, 1),
    ('5', '電腦', '2023-10-04 10:11:11', '2023-10-04 10:11:11', 30, 0),
    ('6', '電腦', '2023-10-04 10:11:11', '2023-10-04 10:11:11', 30, 0),
    ('6', '電腦', '2023-10-04 10:11:11', '2023-10-04 12:11:11', 30, 1),
    ('7', '電腦', '2023-10-04 10:11:11', '2023-10-04 10:11:11', 11, 0),
    ('7', '電腦', '2023-10-04 10:11:11', '2023-10-04 12:11:11', 10, 0);
-- Seed partition dt='2023-10-05': further updates (1, 2, 7), a deletion of an
-- existing id (5), and ids created and deleted on the same day (6, 8).
-- NOTE(review): id 5 has create_date '2023-10-10' here, but '2023-10-04' in
-- the dt='2023-10-04' partition, and it is later than its own
-- last_modified_date. Probably a typo; confirm.
insert overwrite table test3 partition (dt = '2023-10-05')
values
    ('1', '手機', '2023-10-01 10:11:11', '2023-10-05 10:11:11', 1, 0),
    ('2', '手機', '2023-10-01 10:11:11', '2023-10-05 16:11:11', 3, 0),
    ('5', '電腦', '2023-10-10 10:11:11', '2023-10-05 10:11:11', 30, 1),
    ('6', '電腦', '2023-10-05 10:11:11', '2023-10-05 10:11:11', 2, 0),
    ('6', '電腦', '2023-10-05 10:11:11', '2023-10-05 12:11:11', 2, 1),
    ('7', '電腦', '2023-10-04 10:11:11', '2023-10-05 12:11:11', 7, 0),
    ('8', '電腦', '2023-10-05 10:11:11', '2023-10-05 10:11:11', 10, 0),
    ('8', '電腦', '2023-10-05 10:11:11', '2023-10-05 12:11:11', 10, 1);

建一張表:每日全量数据,過濾出首日有效數據

-- Daily full snapshot: one partition per load date holding the latest known
-- version of each id as of that date.
drop table if exists test_full;
create table if not exists test_full (
    id                 string,
    name               string,
    create_date        string,
    last_modified_date string,
    amount             double,
    -- The load queries treat 0 as a live row and 1 as a deleted row.
    is_delete          int,
    -- Coefficient assigned by the load jobs (the first-day load writes 1).
    flag               int
)
-- The description belongs in the table COMMENT. It was previously passed to
-- NULL DEFINED AS, which would have written every NULL into the data files as
-- this text and read any equal string back as NULL.
comment '最新全量數據'
partitioned by (dt string)
row format delimited fields terminated by '\001'
location '/user/hive/warehouse/test_full';

插入首日数据

-- First-day load: copy every live row (is_delete = 0) of the 2023-10-03
-- source partition into the full snapshot, with flag fixed at 1.
insert overwrite table test_full partition (dt = '2023-10-03')
select src.id,
       src.name,
       src.create_date,
       src.last_modified_date,
       src.amount,
       src.is_delete,
       1 as flag
from test3 src
where src.is_delete = 0
  and src.dt = '2023-10-03';

建一張表:每日变更数据

-- Daily change set: one partition per load date holding the rows that were
-- added, changed or deleted on that date, each tagged with a flag coefficient.
drop table if exists test_detail;
create table if not exists test_detail (
    id                 string,
    name               string,
    create_date        string,
    last_modified_date string,
    amount             double,
    -- The load queries treat 0 as a live row and 1 as a deleted row.
    is_delete          int,
    -- Coefficient written by the change-set query: 1, -1 or 0.
    flag               int
)
-- The description belongs in the table COMMENT. It was previously passed to
-- NULL DEFINED AS, which would have written every NULL into the data files as
-- this text and read any equal string back as NULL.
comment '每日变更数据'
partitioned by (dt string)
row format delimited fields terminated by '\001'
location '/user/hive/warehouse/test_detail';

每日变更表插入数据

-- Build the change set for load date 2023-10-04 and write it to test_detail.
-- Every output row carries a flag coefficient:
--    1  row is new, or is the new version of an updated row
--   -1  previous version of an updated row, or deletion of a row that existed
--       in yesterday's snapshot
--    0  row was created and deleted on the same day
with extra as (
    -- Latest version of each id delivered on the load date; earlier versions
    -- from the same day are discarded.
    -- NOTE(review): versions that tie on last_modified_date are ranked in an
    -- arbitrary order.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete
    from (
             select id,
                    name,
                    create_date,
                    last_modified_date,
                    amount,
                    is_delete,
                    row_number() over (partition by id order by last_modified_date desc) as rn
             from test3
             where dt = '2023-10-04'
         ) ranked
    where rn = 1
)
, old as (
    -- Live rows of the previous day's full snapshot.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           flag
    from test_full
    where dt = date_sub('2023-10-04', 1)
      and is_delete = 0
)
, deleted_data as (
    -- Rows whose latest version today is a deletion.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete
    from extra
    where is_delete = 1
)
, flag_deleted as (
    -- Check whether each deleted row existed in yesterday's snapshot.
    select deleted_data.id,
           deleted_data.name,
           deleted_data.create_date,
           deleted_data.last_modified_date,
           deleted_data.amount,
           deleted_data.is_delete,
           -- Not in yesterday's snapshot: created and deleted the same day,
           -- so the coefficient is simply 0. Otherwise it is -1.
           if(old.id is null, 0, -1) as flag
    from deleted_data
    left join old
        on deleted_data.id = old.id
)
, update_data as (
    -- Live rows that were modified after creation.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete
    from extra
    where is_delete = 0
      and date_format(last_modified_date, 'yyyy-MM-dd HH:mm:ss') <> date_format(create_date, 'yyyy-MM-dd HH:mm:ss')
)
, flag_updated as (
    -- Pair every updated row with its previous version, when there is one.
    select update_data.id,
           update_data.name,
           update_data.create_date,
           update_data.last_modified_date,
           update_data.amount,
           update_data.is_delete,
           old.id                 as old_id,
           old.name               as old_name,
           old.create_date        as old_create_date,
           old.last_modified_date as old_last_modified_date,
           old.amount             as old_amount,
           old.is_delete          as old_is_delete
    from update_data
    left join old
        on update_data.id = old.id
)
, one_update as (
    -- Updated rows with no previous version: emit only the new version.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           1 as flag
    from flag_updated
    where old_id is null
)
, two_update as (
    -- Updated rows that have a previous version: the new version is emitted
    -- with flag 1 and the previous version with flag2 = -1.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           1 as flag,
           old_id,
           old_name,
           old_create_date,
           old_last_modified_date,
           old_amount,
           old_is_delete,
           -1 as flag2
    from flag_updated
    where old_id is not null
)
, result as (
    -- Deletions.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           flag
    from flag_deleted
    union all
    -- Updates without a previous version.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           flag
    from one_update
    union all
    -- Updates with a previous version: new side.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           flag
    from two_update
    union all
    -- Updates with a previous version: previous side (columns map by position).
    select old_id,
           old_name,
           old_create_date,
           old_last_modified_date,
           old_amount,
           old_is_delete,
           flag2
    from two_update
    union all
    -- Live rows never modified after creation.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           1 as flag
    from extra
    where is_delete = 0
      and date_format(last_modified_date, 'yyyy-MM-dd HH:mm:ss') = date_format(create_date, 'yyyy-MM-dd HH:mm:ss')
)
-- The target previously read "test_detailpartition"; the missing space kept the
-- statement from writing to test_detail.
insert overwrite table test_detail partition (dt = '2023-10-04')
select id,
       name,
       create_date,
       last_modified_date,
       amount,
       is_delete,
       flag
from result;

每日全量数据

-- Daily run: rebuild the full snapshot for load date 2023-10-04.
-- Yesterday's live snapshot rows are stacked with today's change set, and for
-- each id only the row with the greatest last_modified_date is kept.
with merged as (
    -- Live rows of the previous day's full snapshot.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           flag
    from test_full
    where dt = date_sub('2023-10-04', 1)
      and is_delete = 0
    union all
    -- Today's change set.
    select id,
           name,
           create_date,
           last_modified_date,
           amount,
           is_delete,
           flag
    from test_detail
    where dt = '2023-10-04'
)
, ranked as (
    -- Number each id's rows from newest to oldest.
    select merged.id,
           merged.name,
           merged.create_date,
           merged.last_modified_date,
           merged.amount,
           merged.is_delete,
           merged.flag,
           row_number() over (partition by merged.id order by merged.last_modified_date desc) as rn
    from merged
)
insert overwrite table test_full partition (dt = '2023-10-04')
select ranked.id,
       ranked.name,
       ranked.create_date,
       ranked.last_modified_date,
       ranked.amount,
       ranked.is_delete,
       ranked.flag
from ranked
where ranked.rn = 1;

你可能感兴趣的:(HQL,hive,数据仓库)