The scenario evaluated in this proof of concept is mainly kafka => flink sql => iceberg => hive => hdfs => trino (presto).
Name | Version | Description |
---|---|---|
flink | 1.12.1 | Deployed into CDH 6.3.2 via a parcel package |
cdh | 6.3.2 | Open-source release |
hive | 2.3.7 | Bundled with CDH (upgraded by replacing jars) |
hadoop | 3.0.0 | CDH-native version |
presto | 2.591 | Open-source release |
trino | 360 | Open-source release |
iceberg | 0.12.0 | Released on August 15, 2021 |
https://iceberg.apache.org/releases/#0120-release-notes
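The following jars are placed on the Flink SQL client classpath (loaded via the -l option shown below):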
iceberg-flink-runtime-0.12.0.jar
flink-sql-connector-hive-2.3.6_2.12-1.12.1.jar
flink-sql-connector-kafka_2.12-1.12.1.jar
flink-json-1.12.1.jar
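# Catalog definition in sql-client-defaults.yaml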
catalogs:
  - name: iceberg
    type: iceberg
    property-version: 2
    warehouse: hdfs://nameservice1/data/iceberg
    uri: thrift://cdh2:9083
    catalog-type: hive
## Enable checkpointing for the sql-client; without it, the kafka -> iceberg job never commits metadata
configuration:
  execution.checkpointing.interval: 60000
  state.checkpoints.num-retained: 10
  execution.checkpointing.mode: EXACTLY_ONCE
  execution.checkpointing.externalized-checkpoint-retention: RETAIN_ON_CANCELLATION
  state.backend: rocksdb
  state.checkpoints.dir: hdfs:///user/flink/checkpoints
  state.savepoints.dir: hdfs:///user/flink/checkpoints
sql-client.sh embedded -l /flink/soft/
Testing Iceberg on the Hive catalog
## If sql-client-defaults.yaml is not configured, the catalog can also be created with a DDL statement
CREATE CATALOG iceberg WITH (
'type'='iceberg',
'catalog-type'='hive',
'uri'='thrift://cdh2:9083',
'clients'='5',
'property-version'='1',
'warehouse'='hdfs://nameservice1/data/iceberg'
);
## List all catalogs
show catalogs;
## Switch to the iceberg catalog
use catalog iceberg;
## List all databases
show databases;
## Create a database
create database iceberg_test;
## Use the iceberg_test database
use iceberg_test;
## Create an Iceberg v1 table
CREATE TABLE students_v1 (
addre string,
lxdh string,
xm string,
dz string,
byyx string,
id string COMMENT 'id',
age int
) PARTITIONED BY (age)
WITH (
'connector'='iceberg',
'write.format.default'='orc',
'format-version'='1', -- table format version v1
'write.metadata.delete-after-commit.enabled'='true' -- delete the oldest metadata files on commit; the most recent 100 versions are kept by default
);
## List all tables
show tables;
## Drop a table
drop table table_name;
## Insert data
insert into iceberg.iceberg_test.students_v1 values('郑巷141号','17341533143','丁晟睿','福海市','西南大学','1',31);
insert into iceberg.iceberg_test.students_v1 values('郑巷141号','17341533143','丁晟','福海市','西南大学','2',30);
## Test queries
select * from students_v1;
select * from students_v1 where age = 31;
## Create an Iceberg v2 table
CREATE TABLE students_v2 (
addre string,
lxdh string,
xm string,
dz string,
byyx string,
id string PRIMARY KEY NOT ENFORCED COMMENT 'PRIMARY KEY',
age int
)
WITH (
'connector'='iceberg',
'format-version'='2', -- table format version v2; v2 does not support ORC writes
'write.distribution-mode'='hash', -- avoid producing small files
'write.metadata.delete-after-commit.enabled'='true' -- delete the oldest metadata files on commit; the most recent 100 versions are kept by default
);
## Insert data
insert into iceberg.iceberg_test.students_v2 values('郑巷141号','17341533143','丁晟睿','福海市','西南大学','1',31);
insert into iceberg.iceberg_test.students_v2 values('郑巷141号','17341533143','丁晟','福海市','西南大学','2',30);
## Test queries
select * from students_v2;
select * from students_v2 where age = 31;
Setting up Kafka -> Flink
Common Kafka commands
kafka-topics --list --zookeeper cdh2:2181,cdh3:2181
kafka-topics --create --topic iceberg --partitions 3 --replication-factor 1 --zookeeper cdh3:2181
# With --from-beginning, all messages are consumed from the start
kafka-console-consumer --bootstrap-server cdh2:9092 --topic iceberg --from-beginning
kafka-console-producer --broker-list cdh2:9092 --topic iceberg
{"addre":"孔桥90714号","lxdh":"17298981101","xm":"张弘文","dz":"包宁市","byyx":"东北技术大学","id":49,"age":76}
{"addre":"韦巷57841号","lxdh":"18660990864","xm":"戴建辉","dz":"西阳市","byyx":"西北科技大学","id":50,"age":49}
{"addre":"宋路4975号","lxdh":"15974172825","xm":"汪智辉","dz":"济乡县","byyx":"西南大学","id":51,"age":95}
{"addre":"侯巷6339号","lxdh":"17596361794","xm":"夏明轩","dz":"衡京市","byyx":"南技术大学","id":51,"age":95}
{"addre":"秦街6371号","lxdh":"15110361098","xm":"王立辉","dz":"安沙市","byyx":"东科技大学","id":51,"age":35}
{"addre":"贺中心10436号","lxdh":"13305794462","xm":"侯智辉","dz":"海都市","byyx":"西南体育大学","id":54,"age":87}
{"addre":"黄中心0号","lxdh":"13956637216","xm":"宋浩轩","dz":"珠阳市","byyx":"西北农业大学","id":55,"age":57}
kafka-topics --zookeeper cdh2:2181 --delete --topic iceberg
CREATE TABLE kafka_test (
addre string,
lxdh string,
xm string,
dz string,
byyx string,
id int,
age int
) WITH (
'connector.type' = 'kafka', -- use the kafka connector
'connector.version' = 'universal', -- kafka version; universal supports 0.11 and above
'connector.topic' = 'iceberg', -- kafka topic name
'connector.startup-mode' = 'earliest-offset', -- start reading from the earliest offset
'connector.properties.bootstrap.servers' = 'cdh2:9092', -- kafka broker addresses
'connector.properties.group.id' = 'testgroup1',
'format.type' = 'json',
'format.ignore-parse-errors' = 'true' -- skip records that fail to parse
);
select * from kafka_test;
## Test inserting into the v1 table
insert into iceberg.iceberg_test.students_v1 select addre,lxdh,xm,dz,byyx,cast(id as string) as id,age from default_catalog.default_database.kafka_test;
## Test inserting into the v2 table
insert into iceberg.iceberg_test.students_v2 select addre,lxdh,xm,dz,byyx,cast(id as string) as id,age from default_catalog.default_database.kafka_test;
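# Trino-side Iceberg connector catalog properties (typically a file such as etc/catalog/iceberg.properties)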
connector.name=iceberg
hive.metastore.uri=thrift://cdh2:9083
iceberg.file-format=orc
hive.config.resources=/etc/alternatives/hadoop-conf/core-site.xml,/etc/alternatives/hadoop-conf/hdfs-site.xml
Querying the v1 table
## Query all rows via trino
select * from iceberg.iceberg_test.students_v1
## Delete by partition
delete from iceberg_test.students_v1 where age = 30
## List all snapshots
SELECT * FROM iceberg_test."students_v1$snapshots" ORDER BY committed_at
## Roll back to a snapshot
CALL iceberg.system.rollback_to_snapshot('iceberg_test', 'students_v1', 512151592674641227)
Querying the v2 table currently fails
## Query all rows via trino
select * from iceberg_test.students_v2
## Delete by partition
delete from iceberg_test.students_v2 where age = 30
## List all snapshots
SELECT * FROM iceberg_test."students_v2$snapshots" ORDER BY committed_at
## Roll back to a snapshot
CALL iceberg.system.rollback_to_snapshot('iceberg_test', 'students_v2', 512151592674641227)
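The Flink job itself is packaged with the Maven shade plugin (main class com.wk.iceberg.iceberg.KafkaOnIceberg):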
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.1.1</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <artifactSet>
                            <excludes>
                                <exclude>com.google.code.findbugs:jsr305</exclude>
                                <exclude>org.slf4j:*</exclude>
                                <exclude>log4j:*</exclude>
                            </excludes>
                        </artifactSet>
                        <filters>
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                        <transformers>
                            <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                <mainClass>com.wk.iceberg.iceberg.KafkaOnIceberg</mainClass>
                            </transformer>
                            <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
                        </transformers>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
// Compact small files towards the configured target file size
Actions.forTable(table)
    .rewriteDataFiles()
    .maxParallelism(5)
    .targetSizeInBytes(128 * 1024 * 1024)
    .execute();
// Automatically expire old snapshots and delete the data files of previous versions
Snapshot snapshot = table.currentSnapshot();
if (snapshot != null) {
    long time = snapshot.timestampMillis();
    table.expireSnapshots()
        .expireOlderThan(time)
        .commit();
}
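For reference, a minimal standalone sketch of this maintenance job, assuming the table is loaded from the Hive catalog configured above via Iceberg's CatalogUtil (class name, catalog/database/table names reuse the ones created earlier):

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.flink.actions.Actions;

public class IcebergMaintenance {
    public static void main(String[] args) {
        // Assumed: the same HMS uri and warehouse as the catalog created above.
        Map<String, String> props = new HashMap<>();
        props.put("uri", "thrift://cdh2:9083");
        props.put("warehouse", "hdfs://nameservice1/data/iceberg");

        // Load the Hive-backed Iceberg catalog and one of the tables written by the Flink job.
        Catalog catalog = CatalogUtil.loadCatalog(
                "org.apache.iceberg.hive.HiveCatalog", "iceberg", props, new Configuration());
        Table table = catalog.loadTable(TableIdentifier.of("iceberg_test", "students_v1"));

        // Compact small data files towards a 128 MB target size.
        Actions.forTable(table)
                .rewriteDataFiles()
                .maxParallelism(5)
                .targetSizeInBytes(128L * 1024 * 1024)
                .execute();

        // Expire all snapshots older than the current one, removing unreferenced data files.
        Snapshot snapshot = table.currentSnapshot();
        if (snapshot != null) {
            table.expireSnapshots()
                    .expireOlderThan(snapshot.timestampMillis())
                    .commit();
        }
    }
}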
Summary:
 | V1 | V2 |
---|---|---|
Use case | insert streams | upsert streams |
trino (presto) | Y | Not supported |
Read mode | batch + streaming | batch |
Storage format | orc/parquet | parquet |
Small-file compaction | Y | N |
Snapshot rollback | Y | Not yet tested (jar conflict) |
Expired-data cleanup | Y | N |
Partitioning | Y | Y (rows with the same primary key + partition are not shown) |