Flink Iceberg Test

Component versions

Component   Version
Java        1.8.251
Scala       2.12.13
Flink       1.12.5
Iceberg     0.12.0
Hadoop      2.9.2
Hive        2.3.6

Place hdfs-site.xml, core-site.xml, and hive-site.xml under the resources directory.
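Assuming the standard Maven project layout, the test module's resources directory would look roughly like this (a sketch, file names as above):

src/main/resources/
├── core-site.xml
├── hdfs-site.xml
└── hive-site.xml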

hadoop_catalog

import org.apache.flink.table.api.{EnvironmentSettings, TableEnvironment}

object TestFlinkSQLOptIcebergHadoopCatalog {
  def main(args: Array[String]): Unit = {
    //   val env = StreamExecutionEnvironment.getExecutionEnvironment
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inBatchMode()
      .build()

    val tableEnv = TableEnvironment.create(settings)

    val DDL =
      """
        |create catalog hadoop_catalog with (
        | 'type' = 'iceberg',
        | 'catalog-type' = 'hadoop',
        | 'property-version' = '1',
        | 'warehouse' = 'hdfs:///user/hive/warehouse/'
        |)
        |""".stripMargin
    tableEnv.executeSql(DDL)

    //    Two equivalent ways to switch the current catalog
    //    tableEnv.executeSql("use catalog hadoop_catalog")
    tableEnv.useCatalog("hadoop_catalog")

    tableEnv.executeSql("create database if not exists iceberg_db")
    //    tableEnv.executeSql("use iceberg_db")
    tableEnv.useDatabase("iceberg_db")

    tableEnv.executeSql("show databases").print()
    tableEnv.executeSql("show tables").print()

    // 1. Create a table
    //    val tableDDL =
    //      """
    //        |create table if not exists iceberg_test_table (
    //        | id bigint comment 'unique id',
    //        | data string
    //        |) comment 'iceberg test table'
    //        | partitioned by (data)
    //        |""".stripMargin
    //    tableEnv.executeSql(tableDDL)
    //    tableEnv.executeSql("show tables").print()

    // *** 2. Rename table: the Hadoop catalog does not yet support renaming tables;
    //        currently it only supports updating table properties and dropping tables
    //    tableEnv.executeSql("alter table iceberg_test_table rename to iceberg_test_table2")
    //    tableEnv.executeSql("show tables").print()

    // 3. Drop a table
    //    tableEnv.executeSql("drop table if exists iceberg_test_table")
    //    tableEnv.executeSql("show tables").print()

    // 4. Show tables
    //    tableEnv.executeSql("show tables").print()

    // 5. Create a new table from an existing one with LIKE (requires iceberg_test_table from step 1)
    tableEnv.executeSql("create table iceberg_test_like like iceberg_test_table")
    tableEnv.executeSql("show tables").print()

    // 6. Modify table properties (supported since Flink 1.11)
    //    tableEnv.executeSql("""alter table test_like set ('write.format.default'='avro')""")

    // 7. Write data
    //    tableEnv.executeSql("insert into test_hadoop_table values (1, 'a')")
    //    tableEnv.executeSql("insert overwrite test_hadoop_table values (2, 'a') ")
    //    tableEnv.executeSql("insert overwrite test_table PARTITION(data='b') SELECT 6")

    // 8. Read data
    //    tableEnv.executeSql("select * from test_hadoop_table").print()

    // 9. Insert into one table from another
    //    val insert =
    //      """
    //        |insert into test_like
    //        |select
    //        | id, data
    //        |from test_hadoop_table
    //        |""".stripMargin
    //    tableEnv.executeSql(insert)
  }
}

hive_catalog
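The example below obtains its table environment from a project helper, FlinkUtils.initStreamTableEnvironment(), whose source is not included here. A minimal sketch of what such a helper might look like (streaming mode with checkpointing enabled, since the Iceberg sink only commits snapshots when a checkpoint completes; the one-minute interval is an assumption):

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object FlinkUtils {
  def initStreamTableEnvironment(): StreamTableEnvironment = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Iceberg commits one snapshot per checkpoint, so checkpointing must be enabled
    env.enableCheckpointing(60 * 1000)

    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    StreamTableEnvironment.create(env, settings)
  }
}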

import org.apache.log4j.{Level, Logger}
import org.slf4j.LoggerFactory

object TestFlinkSQLOptIcebergHiveCatalog {
  private var logger: org.slf4j.Logger = _

  def main(args: Array[String]): Unit = {
    logger = LoggerFactory.getLogger(this.getClass.getSimpleName)
    Logger.getLogger("org.apache").setLevel(Level.INFO)
    Logger.getLogger("hive.metastore").setLevel(Level.INFO)
    Logger.getLogger("akka").setLevel(Level.INFO)

    val tableEnv = FlinkUtils.initStreamTableEnvironment()

    //    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //
    //    val settings = EnvironmentSettings
    //      .newInstance()
    //      .useBlinkPlanner()
    //      .inStreamingMode()
    //      .build()
    //
    // StreamTableEnvironment (streaming mode)
    //    val tableEnv = StreamTableEnvironment.create(env, settings)

    // Batch TableEnvironment
    //    val settings = EnvironmentSettings
    //      .newInstance()
    //      .useBlinkPlanner()
    //      .inBatchMode()
    //      .build()
    //    val tableEnv = TableEnvironment.create(settings)

    //        val catalog_name = "hive_catalog"
    //        val database = "iceberg_test_db"
    //        val hiveConf = "F:\\workspace\\realtime-lakehouse\\test\\src\\main\\resources"
    //
    //        val hiveCatalog = new HiveCatalog(
    //          catalog_name,
    //          null,
    //          hiveConf
    //        )
    //        tableEnv.registerCatalog(catalog_name, hiveCatalog)
    //        tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE)

    // catalog
    //    val catalogDDL =
    //      """
    //        |create catalog hive_catalog with (
    //        | 'type' = 'iceberg',
    //        | 'catalog-type' = 'hive',
    //        | 'uri' = 'thrift://test-lakehouse:9083',
    //        | 'clients' = '5',
    //        | 'property-version' = '1',
    //        | 'warehouse' = 'hdfs://test-lakehouse:9000/user/hive/warehouse/'
    //        |)
    //        |""".stripMargin
    //    tableEnv.executeSql(catalogDDL)

    //    Two equivalent ways to switch the current catalog
    //    //    tableEnv.executeSql("use catalog hive_catalog")
    //    tableEnv.useCatalog("hive_catalog")
    //    tableEnv.executeSql("show catalogs").print()
    //
    //    val databaseDDL = "create database if not exists iceberg_test_db"
    //    tableEnv.executeSql(databaseDDL)
    //
    //    tableEnv.useDatabase("iceberg_test_db")
    //    println(s"current database: ${tableEnv.getCurrentDatabase}")

    //    tableEnv.executeSql("show databases").print()

    //    println("list catalogs:")
    //    tableEnv.listCatalogs().foreach(println)
    //    tableEnv.listDatabases()

    //    1. Create a table
    //    val tableDDL =
    //      """
    //        |create table if not exists iceberg_test_table (
    //        | id bigint comment 'unique id',
    //        | data string,
    //        | primary key (id) not enforced
    //        |) comment 'iceberg test table'
    //        | partitioned by (id)
    //        |""".stripMargin
    //    tableEnv.executeSql(tableDDL)
    //    tableEnv.executeSql("show tables").print()

    //  2. Rename a table
    //    tableEnv.executeSql("alter table iceberg_test_table rename to iceberg_test_table2")
    //    tableEnv.executeSql("show tables").print()
    //

    // 3. Drop a table
    //    tableEnv.executeSql("drop table if exists iceberg_test_table")
    //    tableEnv.executeSql("show tables").print()

    // 4. Show tables
    //    tableEnv.executeSql("show tables").print()

    // 5. Create a new table from an existing one with LIKE
    //    tableEnv.executeSql("create table iceberg_test_like like iceberg_test_table")
    //    tableEnv.executeSql("show tables").print()

    // 6. Modify table properties (supported since Flink 1.11)
    //    tableEnv.executeSql("alter table iceberg_test_like set ('write.format.default'='avro')")

    // 7. Write data
    // tableAPI
    //    val statementSet = tableEnv.createStatementSet()
    //    statementSet.addInsertSql("insert into iceberg_test_table values (1, 'a')")
    //    statementSet.execute()

    //    tableEnv.executeSql("insert into iceberg_test_table values (1, 'a'), (2, 'b')")
    //+----------------------+--------------------------------+
    //|                   id |                           data |
    //+----------------------+--------------------------------+
    //|                    1 |                              a |
    //|                    2 |                              b |
    //+----------------------+--------------------------------+

    //    tableEnv.executeSql("insert overwrite iceberg_test_table values (111, 'b')")
    //+----------------------+--------------------------------+
    //|                   id |                           data |
    //+----------------------+--------------------------------+
    //|                    1 |                            aaa |
    //|                    2 |                              b |
    //+----------------------+--------------------------------+

    //    tableEnv.executeSql("insert overwrite iceberg_test_table partition(data='b') select 888")
    //+----------------------+--------------------------------+
    //|                   id |                           data |
    //+----------------------+--------------------------------+
    //|                    2 |                              b |
    //|                    1 |                            ccc |
    //+----------------------+--------------------------------+

    // 8. Read data
    //    tableEnv.executeSql("select * from iceberg_test_table").print()
    //    val table = tableEnv.sqlQuery("select * from iceberg_test_table")
    //    table.printSchema()
    //    table.execute().print()

    // 9. Insert into one table from another
    //    val insert =
    //      """
    //        |insert into iceberg_test_like
    //        |select
    //        | id, data
    //        |from iceberg_test_table
    //        |""".stripMargin
    //    tableEnv.executeSql(insert)
    //    tableEnv.executeSql("select * from iceberg_test_like").print()

    // 10. Streaming read
    //    val config = tableEnv.getConfig.getConfiguration
    //    config.setBoolean(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true)
    //
    //    // read all the records
    //    val readAllDML =
    //    """
    //      |select * from iceberg_test_table
    //      |/*+ options('streaming'='true', 'monitor-interval'='1s')*/
    //      |""".stripMargin
    //    tableEnv.executeSql(readAllDML).print()
    //
    //    // read incremental data
    //    val readIncrementalDML =
    //      """
    //        |select * from iceberg_test_table
    //        |/*+ options('streaming'='true', 'monitor-interval'='1s', 'start-snapshot-id'='8116368287341314212')*/
    //        |""".stripMargin
    //    tableEnv.executeSql(readIncrementalDML).print()

    // 11. cdc
    tableEnv.useCatalog("default_catalog")
    val cdcDDL =
      """
        |create table if not exists iceberg_cdc_source (
        |  id int,
        |  data string,
        |  primary key (id) not enforced
        |) with (
        | 'connector' = 'mysql-cdc',
        | 'hostname' = 'test-lakehouse',
        | 'port' = '3306',
        | 'username' = 'test',
        | 'password' = '123456',
        | 'database-name' = 'test_db',
        | 'table-name' = 'test',
        | 'server-time-zone' = 'Asia/Shanghai'
        |)
      """.stripMargin
    tableEnv.executeSql(cdcDDL)

    // output
    //    tableEnv.executeSql("select * from iceberg_cdc_source").print()

    //        val printSinkSql =
    //          """
    //            |create table if not exists print_sink (
    //            |  id int,
    //            |  data string,
    //            |  primary key (id) not enforced
    //            |) with (
    //            | 'connector' = 'print'
    //            |)
    //          """.stripMargin
    //        tableEnv.executeSql(printSinkSql)
    //
    //        tableEnv.executeSql("insert into print_sink select * from iceberg_cdc_source")

    //   catalog
    val catalogDDL =
      """
        |create catalog hive_catalog with (
        | 'type' = 'iceberg',
        | 'catalog-type' = 'hive',
        | 'uri' = 'thrift://test-lakehouse:9083',
        | 'clients' = '5',
        | 'property-version' = '1',
        | 'warehouse' = 'hdfs://test-lakehouse:9000/user/hive/warehouse/'
        |)
        |""".stripMargin
    tableEnv.executeSql(catalogDDL)

    val databaseDDL = "create database if not exists hive_catalog.iceberg_test_db"
    tableEnv.executeSql(databaseDDL)

    //    tableEnv.executeSql("drop table if exists hive_catalog.iceberg_test_db.iceberg_cdc_test")

    // Additional table properties that could also be set here:
    //   'write.metadata.delete-after-commit.enabled' = 'true'
    //   'write.metadata.previous-versions-max' = '100'
    val tableDDL =
      """
        |create table if not exists hive_catalog.iceberg_test_db.iceberg_cdc_test (
        | id bigint comment 'unique id',
        | data string,
        | primary key (id) not enforced
        |) comment 'iceberg test table'
        | partitioned by (id)
        | with (
        |  'iceberg.format.version' = '2'
        | )
        |""".stripMargin
    tableEnv.executeSql(tableDDL)

    val cdcDML =
      """
        |insert into hive_catalog.iceberg_test_db.iceberg_cdc_test
        |select * from default_catalog.default_database.iceberg_cdc_source
        |""".stripMargin
    tableEnv.executeSql(cdcDML)

    // run this after the cdc job has been stopped
    // tableEnv.executeSql("select * from iceberg_cdc_test").print()

    //    val config = tableEnv.getConfig.getConfiguration
    //    config.setBoolean(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true)
    //    tableEnv.executeSql(
    //      """
    //        |select * from iceberg_cdc_test
    //        |/*+ options('streaming'='true', 'monitor-interval'='1s')*/
    //      """.stripMargin).print()
  }
}

Checking the data

After creating the table

2021-09-24 14:40:31,948 INFO - Successfully committed to table hive_catalog.iceberg_test_db.iceberg_test_table in 2008 ms

+--------------------+
|     table name     |
+--------------------+
| iceberg_test_table |
+--------------------+

HDFS data

Before any data is written, only the metadata exists.

Metadata file contents

{
  "format-version" : 1,
  "table-uuid" : "efbc787a-6eed-46ef-a2a8-c04b8cbcf1c2",
  "location" : "hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table",
  "last-updated-ms" : 1632715958040,
  "last-column-id" : 2,
  "schema" : {
    "type" : "struct",
    "schema-id" : 0,
    "identifier-field-ids" : [ 1 ],
    "fields" : [ {
      "id" : 1,
      "name" : "id",
      "required" : true,
      "type" : "long"
    }, {
      "id" : 2,
      "name" : "data",
      "required" : false,
      "type" : "string"
    } ]
  },
  "current-schema-id" : 0,
  "schemas" : [ {
    "type" : "struct",
    "schema-id" : 0,
    "identifier-field-ids" : [ 1 ],
    "fields" : [ {
      "id" : 1,
      "name" : "id",
      "required" : true,
      "type" : "long"
    }, {
      "id" : 2,
      "name" : "data",
      "required" : false,
      "type" : "string"
    } ]
  } ],
  "partition-spec" : [ {
    "name" : "id",
    "transform" : "identity",
    "source-id" : 1,
    "field-id" : 1000
  } ],
  "default-spec-id" : 0,
  "partition-specs" : [ {
    "spec-id" : 0,
    "fields" : [ {
      "name" : "id",
      "transform" : "identity",
      "source-id" : 1,
      "field-id" : 1000
    } ]
  } ],
  "last-partition-id" : 1000,
  "default-sort-order-id" : 0,
  "sort-orders" : [ {
    "order-id" : 0,
    "fields" : [ ]
  } ],
  "properties" : { },
  "current-snapshot-id" : -1,
  "snapshots" : [ ],
  "snapshot-log" : [ ],
  "metadata-log" : [ ]
}

After modifying the data

    ......

  "properties" : { },
  "current-snapshot-id" : 3357358225130025285,
  "snapshots" : [ {
    "snapshot-id" : 750183960105471040,
    "timestamp-ms" : 1632715970291,
    "summary" : {
      "operation" : "append",
      "flink.job-id" : "c79435a3ae5097eba8842a1816409be5",
      "flink.max-committed-checkpoint-id" : "9223372036854775807",
      "added-data-files" : "2",
      "added-records" : "2",
      "added-files-size" : "1354",
      "changed-partition-count" : "2",
      "total-records" : "2",
      "total-files-size" : "1354",
      "total-data-files" : "2",
      "total-delete-files" : "0",
      "total-position-deletes" : "0",
      "total-equality-deletes" : "0"
    },
    "manifest-list" : "hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table/metadata/snap-750183960105471040-1-bc72ec07-52b5-4352-9c6b-1db44c8f85e9.avro",
    "schema-id" : 0
  }, {
    "snapshot-id" :

       ......

    },
    "manifest-list" : "hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table/metadata/snap-3357358225130025285-1-9f5c0553-7a4b-42c7-8199-1f7cff77f3ac.avro",
    "schema-id" : 0
  } ],
  "snapshot-log" : [ {
    "timestamp-ms" : 1632715970291,
    "snapshot-id" : 750183960105471040
  }, {
    
    ......

  } ],

  "metadata-log" : [ {
    "timestamp-ms" : 1632715958040,
    "metadata-file" : "hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table/metadata/00000-7f31a7d0-6bd9-45a4-82f6-210ea2aa5f10.metadata.json"
  }, {

    ......

  } ]
}
hdfs dfs -text /.../iceberg_test_db.db/iceberg_test_table/metadata/snap-3357358225130025285-1-9f5c0553-7a4b-42c7-8199-1f7cff77f3ac.avro

{"manifest_path":"hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table/metadata/9f5c0553-7a4b-42c7-8199-1f7cff77f3ac-m1.avro","manifest_length":6030,"partition_spec_id":0,"added_snapshot_id":{"long":3357358225130025285},"added_data_files_count":{"int":1},"existing_data_files_count":{"int":0},"deleted_data_files_count":{"int":0},"partitions":{"array":[{"contains_null":false,"contains_nan":{"boolean":false},"lower_bound":{"bytes":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},"upper_bound":{"bytes":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"}}]},"added_rows_count":{"long":1},"existing_rows_count":{"long":0},"deleted_rows_count":{"long":0}} 

hdfs dfs -text /.../iceberg_test_db.db/iceberg_test_table/metadata/9f5c0553-7a4b-42c7-8199-1f7cff77f3ac-m0.avro


{"status":2,"snapshot_id":{"long":3357358225130025285},"data_file":{"file_path":"hdfs://test-lakehouse:9000/user/hive/warehouse/iceberg_test_db.db/iceberg_test_table/data/id=1/00007-0-3ccc043d-9d03-4b5c-8268-55c09827927b-00001.parquet","file_format":"PARQUET","partition":{"id":{"long":1}},"record_count":1,"file_size_in_bytes":691,"block_size_in_bytes":67108864,"column_sizes":{"array":[{"key":1,"value":46},{"key":2,"value":54}]},"value_counts":{"array":[{"key":1,"value":1},{"key":2,"value":1}]},"null_value_counts":{"array":[{"key":1,"value":0},{"key":2,"value":0}]},"nan_value_counts":{"array":[]},"lower_bounds":{"array":[{"key":1,"value":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":2,"value":"aaa"}]},"upper_bounds":{"array":[{"key":1,"value":"\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":2,"value":"aaa"}]},"key_metadata":null,"split_offsets":{"array":[4]},"sort_order_id":{"int":0}}}

With a Hive catalog, the Hive metastore records the Iceberg table name and the location of the Iceberg metadata (metadata_location).

[Figure 1]

Renaming an Iceberg table only updates the Hive metastore; the Iceberg metadata is unchanged, i.e. the table directory name and the JSON metadata on HDFS stay the same.

Dropping a table removes the Hive metastore entry and the Iceberg metadata-location pointer, and deletes the metadata files under the table's metadata directory on HDFS, but the table directory itself is not removed.

An overwrite writes new Parquet files; the old files are not cleaned up right away.
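A hedged sketch of cleaning them up by expiring old snapshots through the Iceberg Java API (not part of the original test; it targets the Hadoop catalog table from the first example, and the one-day retention window is an arbitrary assumption — for a Hive catalog table the same ExpireSnapshots API applies once the table is loaded through a Hive catalog instead):

import org.apache.hadoop.conf.Configuration
import org.apache.iceberg.catalog.TableIdentifier
import org.apache.iceberg.hadoop.HadoopCatalog

object ExpireOldSnapshots {
  def main(args: Array[String]): Unit = {
    // same warehouse and database as the hadoop_catalog example above
    val catalog = new HadoopCatalog(new Configuration(), "hdfs:///user/hive/warehouse/")
    val table = catalog.loadTable(TableIdentifier.of("iceberg_db", "iceberg_test_table"))

    // keep roughly one day of snapshots; data files that are no longer
    // referenced by any remaining snapshot are deleted during the commit
    val oneDayMs = 24L * 60 * 60 * 1000
    table.expireSnapshots()
      .expireOlderThan(System.currentTimeMillis() - oneDayMs)
      .commit()
  }
}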

Hive metastore information

[Figure 2]

After modification

[Figure 3]

After an overwrite, the earlier snapshots can no longer be read incrementally:

Found overwrite operation, cannot support incremental data in snapshots (8116368287341314212, 3591005179391590033]
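A hedged workaround sketch (not from the original): restart the incremental read from a snapshot at or after the overwrite so the rejected range is skipped. The snapshot id below is simply the upper bound reported in the error message, tableEnv is the environment from the example above, and note that rows rewritten by the overwrite itself will not be replayed:

// import org.apache.flink.table.api.config.TableConfigOptions
// dynamic table options must be enabled for the /*+ options(...) */ hint, as in step 10
val config = tableEnv.getConfig.getConfiguration
config.setBoolean(TableConfigOptions.TABLE_DYNAMIC_TABLE_OPTIONS_ENABLED, true)

tableEnv.executeSql(
  """
    |select * from iceberg_test_table
    |/*+ options('streaming'='true', 'monitor-interval'='1s', 'start-snapshot-id'='3591005179391590033') */
    |""".stripMargin).print()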

In addition, there is a Flink CDC issue with MySQL 8.x:

Public Key Retrieval is not allowed

MySQL 8.0 changed its authentication mechanism and defaults to the caching_sha2_password plugin; switch the user to the mysql_native_password plugin so the connection can authenticate:

alter user 'test'@'%' identified with mysql_native_password by '123456';

Iceberg format v1 does not support deletes for CDC: data can only be loaded when the CDC job is first initialized, and any delete operations arriving afterwards cause errors. The v2 format is still under development and not yet generally available, so the CDC path is not really usable at the moment.

pom file

    
        
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>

        <scala.maven.plugin.version>3.2.2</scala.maven.plugin.version>
        <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
        <maven.assembly.plugin.version>3.1.1</maven.assembly.plugin.version>

        <java.version>1.8</java.version>
        <scala.version>2.12.13</scala.version>
        <scala.binary.version>2.12</scala.binary.version>

        <hadoop.version>2.9.2</hadoop.version>
        <flink.version>1.12.5</flink.version>
        <iceberg.version>0.12.0</iceberg.version>
        <hive.version>2.3.9</hive.version>

        <scope.type>compile</scope.type>
    </properties>

    
        
        
    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>${scope.type}</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-csv</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-json</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-orc_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-statebackend-rocksdb_2.11</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-sql-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-hive_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>com.alibaba.ververica</groupId>
            <artifactId>flink-sql-connector-mysql-cdc</artifactId>
            <version>1.2.0</version>
            <scope>${scope.type}</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.iceberg</groupId>
            <artifactId>iceberg-flink-runtime</artifactId>
            <version>${iceberg.version}</version>
            <scope>${scope.type}</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
            <scope>${scope.type}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
            <scope>${scope.type}</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
            <scope>${scope.type}</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.logging.log4j</groupId>
                    <artifactId>log4j-slf4j-impl</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hive</groupId>
                    <artifactId>hive-llap-tez</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.antlr</groupId>
            <artifactId>antlr-runtime</artifactId>
            <version>3.5.2</version>
        </dependency>
    </dependencies>

    
        
            
    <build>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>${scala.maven.plugin.version}</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>${maven.assembly.plugin.version}</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
