今天接到一个需求:在已上线的分区报表中新增一个字段,并要求把它放到指定的列位置,然后重刷历史分区的数据。现将模拟实现过程记录如下:
创建测试表
-- Demo table: two string columns, partitioned by day, stored as tab-delimited text.
-- EXTERNAL means a later DROP TABLE leaves the data files in place.
CREATE EXTERNAL TABLE test.table_add_column_test (
    original_column1 STRING COMMENT '原始数据1',
    original_column2 STRING COMMENT '原始数据2'
)
COMMENT 'add_column的测试表'
PARTITIONED BY (
    `daystr` STRING COMMENT '日期'
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
OK
Time taken: 0.266 seconds
插入测试数据
-- Seed one row per partition; the first uses SELECT, the second uses VALUES.
INSERT INTO TABLE test.table_add_column_test PARTITION (daystr = '20190107')
SELECT
    '测试数据1_0107',
    '测试数据2_0107';

INSERT INTO TABLE test.table_add_column_test PARTITION (daystr = '20190108')
VALUES ('测试数据1_0108', '测试数据2_0108');
查看现有数据
-- Baseline read: two data columns plus the partition column, before any schema change.
SELECT *
FROM test.table_add_column_test;
OK
测试数据1_0107 测试数据2_0107 20190107
测试数据1_0108 测试数据2_0108 20190108
Time taken: 0.226 seconds, Fetched: 2 row(s)
官网添加列的语法
ALTER TABLE table_name
[PARTITION partition_spec] -- (Note: Hive 0.14.0 and later)
ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
[CASCADE|RESTRICT] -- (Note: Hive 1.1.0 and later)
注意: 默认模式为RESTRICT,只修改表的元数据,不会修改已有分区的元数据;CASCADE则在修改表元数据的同时,把同样的变更级联到所有已有分区的元数据。只有加了CASCADE,重刷旧分区数据后新添加的字段才会有值;否则旧分区仍沿用旧的列定义,即使重刷数据,超出旧列定义的列读出来也是NULL
增加一列,指定增加到原始的两列中间
先添加一列
-- Append the new column; CASCADE also applies the change to existing partitions' metadata.
ALTER TABLE test.table_add_column_test
    ADD COLUMNS (added_column STRING COMMENT '新添加的列')
    CASCADE;
OK
Time taken: 0.328 seconds
再调整列的位置(注意:必须添加cascade关键字,这样列的变更才会同步到已有分区的元数据;不加则只修改表的元数据,旧分区即使重刷数据也读不到完整的列)
-- Move added_column to sit right after original_column1 (name and type unchanged).
-- This only rewrites metadata; CASCADE extends it to the existing partitions.
ALTER TABLE test.table_add_column_test
    CHANGE COLUMN added_column added_column STRING
    AFTER original_column1
    CASCADE;
OK
Time taken: 0.219 seconds
查看建表语句,是否将新的列添加在原始的两列中间
-- Print the current DDL to confirm the new column order.
SHOW CREATE TABLE test.table_add_column_test;
OK
CREATE EXTERNAL TABLE `test.table_add_column_test`(
`original_column1` string COMMENT '原始数据1',
`added_column` string COMMENT '新添加的列',
`original_column2` string COMMENT '原始数据2')
COMMENT 'add_column的测试表'
PARTITIONED BY (
`daystr` string COMMENT '日期')
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='\t',
'serialization.format'='\t')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://emr-header-1.cluster-67230:9000/user/hive/warehouse/test.db/table_add_column_test'
TBLPROPERTIES (
'last_modified_by'='hdfs',
'last_modified_time'='1546872372',
'transient_lastDdlTime'='1546872372')
Time taken: 0.137 seconds, Fetched: 22 row(s)
查看数据(注意: 虽然元数据中列的顺序变了,但HDFS上的文件内容并没有变化;文本文件是按位置解析的,所以第二列显示的仍是原来original_column2的数据,第三列为NULL)
-- Read back after the ALTERs, before any partition has been rewritten.
SELECT *
FROM test.table_add_column_test;
OK
测试数据1_0107 测试数据2_0107 NULL 20190107
测试数据1_0108 测试数据2_0108 NULL 20190108
Time taken: 0.235 seconds, Fetched: 2 row(s)
重刷旧分区数据(此时按位置读取,added_column读到的其实是原第二列original_column2的数据,所以把它写到第三列的位置,第二列写入新增列的数据)
-- Rewrite each old partition in the new three-column layout.
-- The text files are parsed by position, so at this point added_column still returns
-- the value originally stored as original_column2; it is written to the third slot
-- and the new value goes into the second.
INSERT OVERWRITE TABLE test.table_add_column_test PARTITION (daystr = '20190107')
SELECT
    original_column1,
    '新增列数据_0107',
    added_column
FROM test.table_add_column_test
WHERE daystr = '20190107';

INSERT OVERWRITE TABLE test.table_add_column_test PARTITION (daystr = '20190108')
SELECT
    original_column1,
    '新增列数据_0108',
    added_column
FROM test.table_add_column_test
WHERE daystr = '20190108';
查看数据(旧分区数据有更新)
-- Read back after both partitions were rewritten.
SELECT *
FROM test.table_add_column_test;
OK
测试数据1_0107 新增列数据_0107 测试数据2_0107 20190107
测试数据1_0108 新增列数据_0108 测试数据2_0108 20190108
Time taken: 0.189 seconds, Fetched: 2 row(s)
删除表
-- The table was created as EXTERNAL, so DROP TABLE removes only the metastore entry;
-- the data files stay on HDFS.
drop table test.table_add_column_test;
# Shell command (not HiveQL): delete the leftover data directory so the table
# recreated below does not pick up the old partition files.
hdfs dfs -rm -r /user/hive/warehouse/test.db/table_add_column_test
创建测试表
-- Recreate the same demo table for the run without CASCADE:
-- two string columns, partitioned by day, stored as tab-delimited text.
CREATE EXTERNAL TABLE test.table_add_column_test (
    original_column1 STRING COMMENT '原始数据1',
    original_column2 STRING COMMENT '原始数据2'
)
COMMENT 'add_column的测试表'
PARTITIONED BY (
    `daystr` STRING COMMENT '日期'
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
插入测试数据
-- Seed one row per partition; the first uses SELECT, the second uses VALUES.
INSERT INTO TABLE test.table_add_column_test PARTITION (daystr = '20190107')
SELECT
    '测试数据1_0107',
    '测试数据2_0107';

INSERT INTO TABLE test.table_add_column_test PARTITION (daystr = '20190108')
VALUES ('测试数据1_0108', '测试数据2_0108');
添加列(不加关键字cascade)
-- Same two ALTERs, this time without CASCADE. The default mode is RESTRICT, which
-- changes the table metadata only and leaves existing partitions' metadata untouched.
ALTER TABLE test.table_add_column_test
    ADD COLUMNS (added_column STRING COMMENT '新添加的列');

ALTER TABLE test.table_add_column_test
    CHANGE COLUMN added_column added_column STRING
    AFTER original_column1;
重刷旧分区数据
-- Rewrite each old partition with three values per row, after the non-cascading ALTERs.
INSERT OVERWRITE TABLE test.table_add_column_test PARTITION (daystr = '20190107')
SELECT
    original_column1,
    '新增列数据_0107',
    added_column
FROM test.table_add_column_test
WHERE daystr = '20190107';

INSERT OVERWRITE TABLE test.table_add_column_test PARTITION (daystr = '20190108')
SELECT
    original_column1,
    '新增列数据_0108',
    added_column
FROM test.table_add_column_test
WHERE daystr = '20190108';
查看数据(未加cascade时旧分区的元数据没有更新,重刷后第二列虽然写入了新数据,但第三列读出来为NULL,原original_column2的数据读不到了)
-- Read back after the rewrite that followed the non-cascading ALTERs.
SELECT *
FROM test.table_add_column_test;
OK
测试数据1_0107 新增列数据_0107 NULL 20190107
测试数据1_0108 新增列数据_0108 NULL 20190108
Time taken: 0.195 seconds, Fetched: 2 row(s)