| Component | Version |
| --------- | ------- |
| Java      | 1.8.251 |
| Scala     | 1.12.14 |
| Flink     | 1.12.5  |
| Iceberg   | 0.12.0  |
| Hadoop    | 2.9.2   |
| Hive      | 2.3.6   |
Place hdfs-site.xml, core-site.xml, and hive-site.xml under the resources directory.
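A quick sanity check (a minimal sketch; the object name is arbitrary) that the three files are actually visible on the classpath before submitting a job:

object CheckClasspathConfigs {
  def main(args: Array[String]): Unit = {
    // The Hadoop/Hive clients load these files from the classpath at runtime.
    Seq("hdfs-site.xml", "core-site.xml", "hive-site.xml").foreach { name =>
      val url = getClass.getClassLoader.getResource(name)
      println(s"$name -> ${Option(url).getOrElse("NOT FOUND on classpath")}")
    }
  }
}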
import org.apache.flink.table.api.{EnvironmentSettings, TableEnvironment}

object TestFlinkSQLOptIcebergHadoopCatalog {

  def main(args: Array[String]): Unit = {
    // val env = StreamExecutionEnvironment.getExecutionEnvironment

    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inBatchMode()
      .build()

    val tableEnv = TableEnvironment.create(settings)

    val DDL =
      """
        |create catalog hadoop_catalog with (
        |  'type' = 'iceberg',
        |  'catalog-type' = 'hadoop',
        |  'property-version' = '1',
        |  'warehouse' = 'hdfs:///user/hive/warehouse/'
        |)
        |""".stripMargin
    tableEnv.executeSql(DDL)

    // Two equivalent ways to switch catalogs
    // tableEnv.executeSql("use catalog hadoop_catalog")
    tableEnv.useCatalog("hadoop_catalog")

    tableEnv.executeSql("create database if not exists iceberg_db")
    // tableEnv.executeSql("use iceberg_db")
    tableEnv.useDatabase("iceberg_db")

    tableEnv.executeSql("show databases").print()
    tableEnv.executeSql("show tables").print()

    // 1. Create a table
    // val tableDDL =
    //   """
    //     |create table if not exists iceberg_test_table (
    //     |  id bigint comment 'unique id',
    //     |  data string
    //     |) comment 'iceberg test table'
    //     |  partitioned by (data)
    //     |""".stripMargin
    // tableEnv.executeSql(tableDDL)
    // tableEnv.executeSql("show tables").print()

    // *** 2. Rename a table. The hadoop catalog does not support renaming yet;
    //        only updating table properties and dropping tables are supported.
    // tableEnv.executeSql("alter table iceberg_test_table rename to iceberg_test_table2")
    // tableEnv.executeSql("show tables").print()

    // 3. Drop a table
    // tableEnv.executeSql("drop table if exists iceberg_test_table")
    // tableEnv.executeSql("show tables").print()

    // 4. List tables
    // tableEnv.executeSql("show tables").print()

    // 5. Create a new table from an existing one with LIKE
    tableEnv.executeSql("create table iceberg_test_like like iceberg_test_table")
    tableEnv.executeSql("show tables").print()

    // 6. Alter table properties (supported since Flink 1.11)
    // tableEnv.executeSql("""alter table test_like set ('write.format.default'='avro')""")

    // 7. Write data
    // tableEnv.executeSql("insert into test_hadoop_table values (1, 'a')")
    // tableEnv.executeSql("insert overwrite test_hadoop_table values (2, 'a')")
    // tableEnv.executeSql("insert overwrite test_table PARTITION(data='b') SELECT 6")

    // 8. Read data
    // tableEnv.executeSql("select * from test_hadoop_table").print()

    // 9. Insert from another table
    // val insert =
    //   """
    //     |insert into test_like
    //     |select
    //     |  id, data
    //     |from test_hadoop_table
    //     |""".stripMargin
    // tableEnv.executeSql(insert)
  }
}
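The same warehouse can also be opened directly with Iceberg's Java API, which is handy for checking what the DDL above actually created. A minimal sketch, assuming iceberg-flink-runtime 0.12.0 and the Hadoop configs from resources are on the classpath (the object name is illustrative):

import org.apache.hadoop.conf.Configuration
import org.apache.iceberg.catalog.{Namespace, TableIdentifier}
import org.apache.iceberg.hadoop.HadoopCatalog
import scala.collection.JavaConverters._

object InspectHadoopCatalog {
  def main(args: Array[String]): Unit = {
    // Same warehouse path as in the CREATE CATALOG DDL above.
    val catalog = new HadoopCatalog(new Configuration(), "hdfs:///user/hive/warehouse/")

    // Namespaces correspond to the databases created through Flink SQL.
    catalog.listTables(Namespace.of("iceberg_db")).asScala.foreach(println)

    val table = catalog.loadTable(TableIdentifier.of("iceberg_db", "iceberg_test_table"))
    println(table.location()) // table root directory on HDFS
    println(table.schema())   // current schema
    println(table.spec())     // partition spec
  }
}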
import org.apache.log4j.{Level, Logger}
import org.slf4j.LoggerFactory

object TestFlinkSQLOptIcebergHiveCatalog {

  private var logger: org.slf4j.Logger = _

  def main(args: Array[String]): Unit = {
    logger = LoggerFactory.getLogger(this.getClass.getSimpleName)
    Logger.getLogger("org.apache").setLevel(Level.INFO)
    Logger.getLogger("hive.metastore").setLevel(Level.INFO)
    Logger.getLogger("akka").setLevel(Level.INFO)

    val tableEnv = FlinkUtils.initStreamTableEnvironment()

    // val env = StreamExecutionEnvironment.getExecutionEnvironment
    //
    // val settings = EnvironmentSettings
    //   .newInstance()
    //   .useBlinkPlanner()
    //   .inStreamingMode()
    //   .build()
    //
    // streaming TableEnvironment
    // val tableEnv = StreamTableEnvironment.create(env, settings)

    // batch TableEnvironment
    // val settings = EnvironmentSettings
    //   .newInstance()
    //   .useBlinkPlanner()
    //   .inBatchMode()
    //   .build()
    // val tableEnv = TableEnvironment.create(settings)

    // val catalog_name = "hive_catalog"
    // val database = "iceberg_test_db"
    // val hiveConf = "F:\\workspace\\realtime-lakehouse\\test\\src\\main\\resources"
    //
    // val hiveCatalog = new HiveCatalog(
    //   catalog_name,
    //   null,
    //   hiveConf
    // )
    // tableEnv.registerCatalog(catalog_name, hiveCatalog)
    // tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE)

    // catalog
    // val catalogDDL =
    //   """
    //     |create catalog hive_catalog with (
    //     |  'type' = 'iceberg',
    //     |  'catalog-type' = 'hive',
    //     |  'uri' = 'thrift://test-lakehouse:9083',
    //     |  'clients' = '5',
    //     |  'property-version' = '1',
    //     |  'warehouse' = 'hdfs://test-lakehouse:9000/user/hive/warehouse/'
    //     |)
    //     |""".stripMargin
    // tableEnv.executeSql(catalogDDL)

    // Two equivalent ways to switch catalogs
    // // tableEnv.executeSql("use catalog hive_catalog")
    // tableEnv.useCatalog("hive_catalog")
    // tableEnv.executeSql("show catalogs").print()
    //
    // val databaseDDL = "create database if not exists iceberg_test_db"
    // tableEnv.executeSql(databaseDDL)
    //
    // tableEnv.useDatabase("iceberg_test_db")
    // println(s"current database: ${tableEnv.getCurrentDatabase}")
    // tableEnv.executeSql("show databases").print()
    // println("list catalogs:")
    // tableEnv.listCatalogs().foreach(println)
    // tableEnv.listDatabases()
    // 1. Create a table
    // val tableDDL =
    //   """
    //     |create table if not exists iceberg_test_table (
    //     |  id bigint comment 'unique id',
    //     |  data string,
    //     |  primary key (id) not enforced
    //     |) comment 'iceberg test table'
    //     |  partitioned by (id)
    //     |""".stripMargin
    // tableEnv.executeSql(tableDDL)
    // tableEnv.executeSql("show tables").print()

    // 2. Rename a table
    // tableEnv.executeSql("alter table iceberg_test_table rename to iceberg_test_table2")
    // tableEnv.executeSql("show tables").print()
    //
    // 3. Drop a table
    // tableEnv.executeSql("drop table if exists iceberg_test_table")
    // tableEnv.executeSql("show tables").print()

    // 4. List tables
    // tableEnv.executeSql("show tables").print()

    // 5. Create a new table from an existing one with LIKE
    // tableEnv.executeSql("create table iceberg_test_like like iceberg_test_table")
    // tableEnv.executeSql("show tables").print()

    // 6. Alter table properties (supported since Flink 1.11)
    // tableEnv.executeSql("alter table iceberg_test_like set ('write.format.default'='avro')")

    // 7. Write data
    // Table API
    // val statementSet = tableEnv.createStatementSet()
    // statementSet.addInsertSql("insert into iceberg_test_table values (1, 'a')")
    // statementSet.execute()

    // tableEnv.executeSql("insert into iceberg_test_table values (1, 'a'), (2, 'b')")
    //+----------------------+--------------------------------+
    //| id | data |
    //+----------------------+--------------------------------+
    //| 1 | a |
    //| 2 | b |
    //+----------------------+--------------------------------+

    // tableEnv.executeSql("insert overwrite iceberg_test_table values (111, 'b')")
    //+----------------------+--------------------------------+
    //| id | data |
    //+----------------------+--------------------------------+
    //| 1 | aaa |
    //| 2 | b |
    //+----------------------+--------------------------------+

    // tableEnv.executeSql("insert overwrite iceberg_test_table partition(data='b') select 888")
    //+----------------------+--------------------------------+
    //| id | data |
    //+----------------------+--------------------------------+
    //| 2 | b |
    //| 1 | ccc |
    //+----------------------+--------------------------------+
    // 8. Read data
    // tableEnv.executeSql("select * from iceberg_test_table").print()
    // val table = tableEnv.sqlQuery("select * from iceberg_test_table")
    // table.printSchema()
    // table.execute().print()

    // 9. Insert from another table
    // val insert =
    //   """
    //     |insert into iceberg_test_like
    //     |select
    //     |  id, data
    //     |from iceberg_test_table
    //     |""".stripMargin
    // tableEnv.executeSql(insert)
    // tableEnv.executeSql("select * from iceberg_test_like").print()

    // 10. Streaming read
    // val config = tableEnv.getConfig.getConfiguration
    // config.setBoolean(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true)
    //
    // // read all the records
    // val readAllDML =
    //   """
    //     |select * from iceberg_test_table
    //     |/*+ options('streaming'='true', 'monitor-interval'='1s')*/
    //     |""".stripMargin
    // tableEnv.executeSql(readAllDML).print()
    //
    // // read incremental data
    // val readIncrementalDML =
    //   """
    //     |select * from iceberg_test_table
    //     |/*+ options('streaming'='true', 'monitor-interval'='1s', 'start-snapshot-id'='8116368287341314212')*/
    //     |""".stripMargin
    // tableEnv.executeSql(readIncrementalDML).print()

    // 11. CDC
    tableEnv.useCatalog("default_catalog")

    val cdcDDL =
      """
        |create table if not exists iceberg_cdc_source (
        |  id int,
        |  data string,
        |  primary key (id) not enforced
        |) with (
        |  'connector' = 'mysql-cdc',
        |  'hostname' = 'test-lakehouse',
        |  'port' = '3306',
        |  'username' = 'test',
        |  'password' = '123456',
        |  'database-name' = 'test_db',
        |  'table-name' = 'test',
        |  'server-time-zone' = 'Asia/Shanghai'
        |)
        """.stripMargin
    tableEnv.executeSql(cdcDDL)
    // output
    // tableEnv.executeSql("select * from iceberg_cdc_source").print()
    // val printSinkSql =
    //   """
    //     |create table if not exists print_sink (
    //     |  id int,
    //     |  data string,
    //     |  primary key (id) not enforced
    //     |) with (
    //     |  'connector' = 'print'
    //     |)
    //     """.stripMargin
    // tableEnv.executeSql(printSinkSql)
    //
    // tableEnv.executeSql("insert into print_sink select * from iceberg_cdc_source")

    // catalog
    val catalogDDL =
      """
        |create catalog hive_catalog with (
        |  'type' = 'iceberg',
        |  'catalog-type' = 'hive',
        |  'uri' = 'thrift://test-lakehouse:9083',
        |  'clients' = '5',
        |  'property-version' = '1',
        |  'warehouse' = 'hdfs://test-lakehouse:9000/user/hive/warehouse/'
        |)
        |""".stripMargin
    tableEnv.executeSql(catalogDDL)

    val databaseDDL = "create database if not exists hive_catalog.iceberg_test_db"
    tableEnv.executeSql(databaseDDL)

    // tableEnv.executeSql("drop table if exists hive_catalog.iceberg_test_db.iceberg_cdc_test")

    val tableDDL =
      """
        |create table if not exists hive_catalog.iceberg_test_db.iceberg_cdc_test (
        |  id bigint comment 'unique id',
        |  data string,
        |  primary key (id) not enforced
        |) comment 'iceberg test table'
        |  partitioned by (id)
        |  with (
        |    'iceberg.format.version' = '2'
        |    -- ,'write.metadata.delete-after-commit.enabled' = 'true'
        |    -- ,'write.metadata.previous-versions-max' = '100'
        |  )
        |""".stripMargin
    tableEnv.executeSql(tableDDL)

    val cdcDML =
      """
        |insert into hive_catalog.iceberg_test_db.iceberg_cdc_test
        |select * from default_catalog.default_database.iceberg_cdc_source
        |""".stripMargin
    tableEnv.executeSql(cdcDML)

    // After the CDC job is stopped:
    // tableEnv.executeSql("select * from iceberg_cdc_test").print()
    // val config = tableEnv.getConfig.getConfiguration
    // config.setBoolean(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true)
    // tableEnv.executeSql(
    //   """
    //     |select * from iceberg_cdc_test
    //     |/*+ options('streaming'='true', 'monitor-interval'='1s')*/
    //     """.stripMargin).print()
  }
}
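FlinkUtils.initStreamTableEnvironment() is a small helper of the author's that is not shown in the post. A plausible, purely illustrative version is sketched below; the key point is that checkpointing must be enabled, because the Iceberg sink only commits data files when a checkpoint completes:

import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object FlinkUtils {
  // Hypothetical implementation of the helper used above.
  def initStreamTableEnvironment(): StreamTableEnvironment = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Iceberg commits on checkpoints; without this the CDC insert never becomes visible.
    env.enableCheckpointing(60 * 1000L, CheckpointingMode.EXACTLY_ONCE)

    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    StreamTableEnvironment.create(env, settings)
  }
}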
After creating the table:
2021-09-24 14:40:31,948 INFO - Successfully committed to table hive_catalog.iceberg_test_db.iceberg_test_table in 2008 ms
+--------------------+
| table name |
+--------------------+
| iceberg_test_table |
+--------------------+
Data on HDFS
Before any data is written, the table directory only contains the metadata directory.
Metadata file (*.metadata.json) contents:
{
"format-version" : 1,
"table-uuid" : "efbc787a-6eed-46ef-a2a8-c04b8cbcf1c2",
"location" : "hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table",
"last-updated-ms" : 1632715958040,
"last-column-id" : 2,
"schema" : {
"type" : "struct",
"schema-id" : 0,
"identifier-field-ids" : [ 1 ],
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : true,
"type" : "long"
}, {
"id" : 2,
"name" : "data",
"required" : false,
"type" : "string"
} ]
},
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"identifier-field-ids" : [ 1 ],
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : true,
"type" : "long"
}, {
"id" : 2,
"name" : "data",
"required" : false,
"type" : "string"
} ]
} ],
"partition-spec" : [ {
"name" : "id",
"transform" : "identity",
"source-id" : 1,
"field-id" : 1000
} ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ {
"name" : "id",
"transform" : "identity",
"source-id" : 1,
"field-id" : 1000
} ]
} ],
"last-partition-id" : 1000,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : { },
"current-snapshot-id" : -1,
"snapshots" : [ ],
"snapshot-log" : [ ],
"metadata-log" : [ ]
}
After modifying the data:
......
"properties" : { },
"current-snapshot-id" : 3357358225130025285,
"snapshots" : [ {
"snapshot-id" : 750183960105471040,
"timestamp-ms" : 1632715970291,
"summary" : {
"operation" : "append",
"flink.job-id" : "c79435a3ae5097eba8842a1816409be5",
"flink.max-committed-checkpoint-id" : "9223372036854775807",
"added-data-files" : "2",
"added-records" : "2",
"added-files-size" : "1354",
"changed-partition-count" : "2",
"total-records" : "2",
"total-files-size" : "1354",
"total-data-files" : "2",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
"manifest-list" : "hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table/metadata/snap-750183960105471040-1-bc72ec07-52b5-4352-9c6b-1db44c8f85e9.avro",
"schema-id" : 0
}, {
"snapshot-id" :
......
},
"manifest-list" : "hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table/metadata/snap-3357358225130025285-1-9f5c0553-7a4b-42c7-8199-1f7cff77f3ac.avro",
"schema-id" : 0
} ],
"snapshot-log" : [ {
"timestamp-ms" : 1632715970291,
"snapshot-id" : 750183960105471040
}, {
......
} ],
"metadata-log" : [ {
"timestamp-ms" : 1632715958040,
"metadata-file" : "hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table/metadata/00000-7f31a7d0-6bd9-45a4-82f6-210ea2aa5f10.metadata.json"
}, {
......
} ]
}
hdfs dfs -text /.../iceberg_test_db.db/iceberg_test_table/metadata/snap-3357358225130025285-1-9f5c0553-7a4b-42c7-8199-1f7cff77f3ac.avro
{"manifest_path":"hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table/metadata/9f5c0553-7a4b-42c7-8199-1f7cff77f3ac-m1.avro","manifest_length":6030,"partition_spec_id":0,"added_snapshot_id":{"long":3357358225130025285},"added_data_files_count":{"int":1},"existing_data_files_count":{"int":0},"deleted_data_files_count":{"int":0},"partitions":{"array":[{"contains_null":false,"contains_nan":{"boolean":false},"lower_bound":{"bytes":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},"upper_bound":{"bytes":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"}}]},"added_rows_count":{"long":1},"existing_rows_count":{"long":0},"deleted_rows_count":{"long":0}}
hdfs dfs -text /.../iceberg_test_db.db/iceberg_test_table/metadata/9f5c0553-7a4b-42c7-8199-1f7cff77f3ac-m0.avro
{"status":2,"snapshot_id":{"long":3357358225130025285},"data_file":{"file_path":"hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table/data/id=1/00007-0-3ccc043d-9d03-4b5c-8268-55c09827927b-00001.parquet","file_format":"PARQUET","partition":{"id":{"long":1}},"record_count":1,"file_size_in_bytes":691,"block_size_in_bytes":67108864,"column_sizes":{"array":[{"key":1,"value":46},{"key":2,"value":54}]},"value_counts":{"array":[{"key":1,"value":1},{"key":2,"value":1}]},"null_value_counts":{"array":[{"key":1,"value":0},{"key":2,"value":0}]},"nan_value_counts":{"array":[]},"lower_bounds":{"array":[{"key":1,"value":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":2,"value":"aaa"}]},"upper_bounds":{"array":[{"key":1,"value":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":2,"value":"aaa"}]},"key_metadata":null,"split_offsets":{"array":[4]},"sort_order_id":{"int":0}}}
With the Hive catalog, the Hive metastore records the Iceberg table name and the location of the current Iceberg metadata file (metadata_location).
Renaming an Iceberg table only updates the Hive metastore entry; the Iceberg metadata is untouched, i.e. the table directory name and the metadata JSON on HDFS do not change.
Dropping a table removes the Hive metastore entry and the metadata files under the table's metadata directory on HDFS, but the table directory itself is left in place.
An overwrite writes new parquet files; the old files are not cleaned up right away.
Hive metastore entry (screenshot):
After the change (screenshot):
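One way to look at that metadata_location pointer without the Hive UI is to read the table parameters straight from the metastore; a minimal sketch, assuming hive-site.xml from resources is on the classpath (the object name is illustrative):

import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient

object ShowMetadataLocation {
  def main(args: Array[String]): Unit = {
    // hive-site.xml on the classpath supplies hive.metastore.uris.
    val client = new HiveMetaStoreClient(new HiveConf())
    val hiveTable = client.getTable("iceberg_test_db", "iceberg_test_table")

    // Iceberg keeps the pointer to the current metadata json as Hive table parameters.
    println(hiveTable.getParameters.get("metadata_location"))
    println(hiveTable.getParameters.get("previous_metadata_location"))
    client.close()
  }
}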
After an overwrite, the earlier snapshots can no longer be read incrementally:
Found overwrite operation, cannot support incremental data in snapshots (8116368287341314212, 3591005179391590033]
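A streaming read can be resumed by moving 'start-snapshot-id' forward to the overwrite snapshot (the upper end of the range in the error message), so that the incremental scan starts after it. A sketch, reusing the tableEnv and hint style from the examples above:

// tableEnv is the StreamTableEnvironment created earlier; the snapshot id is the
// overwrite snapshot reported in the error message above.
val resumeDML =
  """
    |select * from iceberg_test_table
    |/*+ options('streaming'='true', 'monitor-interval'='1s', 'start-snapshot-id'='3591005179391590033')*/
    |""".stripMargin
tableEnv.executeSql(resumeDML).print()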
Additionally, there is a Flink CDC issue with MySQL 8.x:
Public Key Retrieval is not allowed
MySQL 8.0 changed its authentication mechanism and uses caching_sha2_password as the default plugin; switch the user to the mysql_native_password plugin instead:
alter user 'test'@'%' identified with mysql_native_password by '123456';
Iceberg format v1 does not support deletions for CDC: CDC data can only be loaded when the table is initialized, and the job fails as soon as a delete event arrives. The v2 format is still under development and not yet exposed, so the CDC path is not really usable at the moment.
pom.xml

<properties>
    <!-- Property names are inferred from how they are referenced below; the original post only listed the values. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <scala.maven.plugin.version>3.2.2</scala.maven.plugin.version>
    <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
    <maven.assembly.plugin.version>3.1.1</maven.assembly.plugin.version>
    <java.version>1.8</java.version>
    <scala.version>2.12.13</scala.version>
    <scala.binary.version>2.12</scala.binary.version>
    <hadoop.version>2.9.2</hadoop.version>
    <flink.version>1.12.5</flink.version>
    <iceberg.version>0.12.0</iceberg.version>
    <hive.version>2.3.9</hive.version>
    <scope.type>compile</scope.type>
</properties>

<dependencies>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-core</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-scala_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-common</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-scala-bridge_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-csv</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-json</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-orc_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-statebackend-rocksdb_2.11</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-sql-connector-kafka_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-hive_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>com.alibaba.ververica</groupId>
        <artifactId>flink-sql-connector-mysql-cdc</artifactId>
        <version>1.2.0</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.iceberg</groupId>
        <artifactId>iceberg-flink-runtime</artifactId>
        <version>${iceberg.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>${hadoop.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
        <scope>${scope.type}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>${hive.version}</version>
        <scope>${scope.type}</scope>
        <exclusions>
            <exclusion>
                <groupId>org.apache.logging.log4j</groupId>
                <artifactId>log4j-slf4j-impl</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.apache.hive</groupId>
                <artifactId>hive-llap-tez</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.antlr</groupId>
        <artifactId>antlr-runtime</artifactId>
        <version>3.5.2</version>
    </dependency>
</dependencies>

<build>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>${scala.maven.plugin.version}</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>${maven.assembly.plugin.version}</version>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>