Although synchronizing with the SQL approach is very convenient, it submits a separate Application to the cluster for every Iceberg target table being loaded, which consumes a lot of cluster resources.
With the DataStream API, the MySQL source only needs to be read once to synchronize multiple tables from multiple databases: Flink's Side Output splits the stream into several sub-streams, and each sub-stream is then written to a different Iceberg table.
The example here only demonstrates synchronizing a single table: the MySQL table d_general.org_main is synchronized to the Iceberg table d_general.org_main, with Iceberg using a HiveCatalog.
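Before the single-table job, here is a minimal sketch of the Side Output fan-out described above: the Debezium JSON stream is read once and split per source table. The second table d_general.org_dept, the tag names, and the helper object are illustrative assumptions only, and the MySqlSource's tableList would also have to include every table you want to capture.

import com.alibaba.fastjson2.JSON
import org.apache.flink.api.common.typeinfo.Types
import org.apache.flink.streaming.api.datastream.{DataStream, DataStreamSource, SingleOutputStreamOperator}
import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.util.{Collector, OutputTag}

object SideOutputSketch {
  // One OutputTag per target table (names are illustrative assumptions)
  val orgMainTag = new OutputTag[String]("d_general.org_main", Types.STRING)
  val orgDeptTag = new OutputTag[String]("d_general.org_dept", Types.STRING)

  def splitByTable(sourceDataStream: DataStreamSource[String]): SingleOutputStreamOperator[String] = {
    sourceDataStream.process(new ProcessFunction[String, String] {
      override def processElement(value: String,
                                  ctx: ProcessFunction[String, String]#Context,
                                  out: Collector[String]): Unit = {
        // The Debezium JSON carries the originating database and table in its "source" block
        val source = JSON.parseObject(value).getJSONObject("source")
        val fullTableName = source.getString("db") + "." + source.getString("table")
        fullTableName match {
          case "d_general.org_main" => ctx.output(orgMainTag, value)
          case "d_general.org_dept" => ctx.output(orgDeptTag, value)
          case _ => // ignore tables that are not synchronized
        }
      }
    })
  }
}

// Usage: each side output is an independent stream, one per Iceberg sink
// val splitStream = SideOutputSketch.splitByTable(sourceDataStream)
// val orgMainStream: DataStream[String] = splitStream.getSideOutput(SideOutputSketch.orgMainTag)
// val orgDeptStream: DataStream[String] = splitStream.getSideOutput(SideOutputSketch.orgDeptTag)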
package org.mq.streamWarehouse.ODS.iceberg
import com.alibaba.fastjson2.{JSON, JSONObject}
import com.ververica.cdc.connectors.mysql.source.MySqlSource
import com.ververica.cdc.debezium.JsonDebeziumDeserializationSchema
import org.apache.flink.api.common.eventtime.WatermarkStrategy
import org.apache.flink.api.common.functions.FlatMapFunction
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.datastream.{DataStreamSource, SingleOutputStreamOperator}
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.table.data.{GenericRowData, RowData, StringData, TimestampData}
import org.apache.flink.types.RowKind
import org.apache.flink.util.Collector
import org.apache.hadoop.conf.Configuration
import org.apache.iceberg.catalog.{Namespace, TableIdentifier}
import org.apache.iceberg.flink.sink.FlinkSink
import org.apache.iceberg.flink.{CatalogLoader, TableLoader}
import org.apache.iceberg.hive.HiveCatalog
import org.apache.iceberg.types.Types
import org.apache.iceberg.{PartitionSpec, Schema, Table}
import scala.collection.JavaConverters.bufferAsJavaListConverter
import scala.collection.mutable.ArrayBuffer
object MysqlCDC2IcebergDataStream {
  // ========== Build the MySQL CDC source stream ==========
  def getMysqlCDCSource(senv: StreamExecutionEnvironment): DataStreamSource[String] = {
    val mySqlSource: MySqlSource[String] = MySqlSource.builder[String]()
      .hostname("192.168.8.124")
      .port(3306)
      .databaseList("d_general")
      .tableList("d_general.org_main")
      .username("hnmqet")
      .password("hnmq123456")
      // Convert each SourceRecord into a JSON string
      .deserializer(new JsonDebeziumDeserializationSchema())
      .build()

    val watermarkStrategy: WatermarkStrategy[String] = WatermarkStrategy.noWatermarks()
    val sourceDataStream: DataStreamSource[String] = senv.fromSource[String](mySqlSource, watermarkStrategy, "Mysql Source")
    sourceDataStream
  }
  // ========== Convert the String binlog records into RowData, based on the operation type ==========
  def transformSource2RowDataStream(sourceDataStream: DataStreamSource[String]): SingleOutputStreamOperator[RowData] = {
    // Some records produce no RowData (unhandled operation types), so flatMap is used instead of map
    val rowDataStream: SingleOutputStreamOperator[RowData] = sourceDataStream.flatMap(new FlatMapFunction[String, RowData] {
      override def flatMap(input: String, collector: Collector[RowData]): Unit = {
        val binlogJsonObject: JSONObject = JSON.parseObject(input)
        // Get the DML operation type
        val mysqlDMLType = binlogJsonObject.getString("op")
        // Decide whether to read the before or the after image, based on the operation type
        var beforeOrAfter: String = null
        var rowKindValue: String = null
        if (mysqlDMLType == "r" || mysqlDMLType == "c") {
          // Snapshot reads and inserts: use the after image
          beforeOrAfter = "after"
          rowKindValue = "INSERT"
        } else if (mysqlDMLType == "u") {
          // Updates: use the after image
          beforeOrAfter = "after"
          rowKindValue = "UPDATE_AFTER"
        } else if (mysqlDMLType == "d") {
          // Deletes: use the before image
          beforeOrAfter = "before"
          rowKindValue = "DELETE"
        }
        // If beforeOrAfter was set, extract the relevant image
        if (beforeOrAfter != null) {
          val beforeOrAfterJsonObject = binlogJsonObject.getJSONObject(beforeOrAfter)
          val id = beforeOrAfterJsonObject.getLong("id")
          val org_name = beforeOrAfterJsonObject.getString("org_name")
          val create_time = beforeOrAfterJsonObject.getLong("create_time")
          val rowData: GenericRowData = new GenericRowData(3)
          // Set the row kind so the sink knows how to apply the change
          rowData.setRowKind(RowKind.valueOf(rowKindValue))
          rowData.setField(0, id)
          rowData.setField(1, StringData.fromString(org_name))
          rowData.setField(2, TimestampData.fromEpochMillis(create_time))
          collector.collect(rowData)
        }
      }
    })
    rowDataStream
  }
  // ========== Initialize the Hive catalog ==========
  def initialHiveCatalog(): (HiveCatalog, java.util.HashMap[String, String]) = {
    val hiveCatalog: HiveCatalog = new HiveCatalog()
    hiveCatalog.setConf(new Configuration())
    val hiveCatalogProperties: java.util.HashMap[String, String] =
      new java.util.HashMap[String, String]()
    hiveCatalogProperties.put("warehouse", "hdfs://nnha/user/hive/warehouse")
    hiveCatalogProperties.put("uri", "thrift://bigdata003:9083")
    hiveCatalogProperties.put("clients", "2") // Hive client connection pool size
    hiveCatalogProperties.put("property-version", "1")
    // The first argument is the catalog name
    hiveCatalog.initialize("hive_catalog", hiveCatalogProperties)
    (hiveCatalog, hiveCatalogProperties)
  }
  // ========== Create the database and table ==========
  def createHiveDbTable(hiveCatalog: HiveCatalog): (Table, TableIdentifier) = {
    val tableSchema: Schema = new Schema(
      // A Schema built through the Java API must assign a unique ID to each field
      Types.NestedField.required(1, "id", Types.LongType.get()),
      Types.NestedField.required(2, "org_name", Types.StringType.get()),
      Types.NestedField.optional(3, "create_time", Types.TimestampType.withoutZone())
    )
    // Fails with AlreadyExistsException if the namespace already exists
    hiveCatalog.createNamespace(Namespace.of("d_general"))
    val partitionSpec: PartitionSpec = PartitionSpec.unpartitioned()
    val tableProperties: java.util.Map[String, String] = new java.util.HashMap[String, String]()
    tableProperties.put("format-version", "2")
    // The arguments are the database name and the table name
    val hiveCatalogTableName: TableIdentifier = TableIdentifier.of("d_general", "org_main")
    val hiveCatalogTable = hiveCatalog.createTable(hiveCatalogTableName, tableSchema, partitionSpec, tableProperties)
    (hiveCatalogTable, hiveCatalogTableName)
  }
  def main(args: Array[String]): Unit = {
    val senv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment()
    senv.enableCheckpointing(10000L, CheckpointingMode.EXACTLY_ONCE)
    senv.setParallelism(1)

    val sourceDataStream: DataStreamSource[String] = getMysqlCDCSource(senv)
    val rowDataStream: SingleOutputStreamOperator[RowData] = transformSource2RowDataStream(sourceDataStream)
    val (hiveCatalog, hiveCatalogProperties): (HiveCatalog, java.util.HashMap[String, String]) = initialHiveCatalog()
    val (hiveCatalogTable, hiveCatalogTableName): (Table, TableIdentifier) = createHiveDbTable(hiveCatalog)

    // ========== Write the data into the table in upsert mode ==========
    val icebergTableLoader: TableLoader = TableLoader.fromCatalog(CatalogLoader.hive("hive_catalog", new Configuration(), hiveCatalogProperties), hiveCatalogTableName)
    FlinkSink.forRowData(rowDataStream)
      .table(hiveCatalogTable)
      .tableLoader(icebergTableLoader)
      // upsert requires equality field columns: insert when the id does not exist, update when it does
      .equalityFieldColumns(ArrayBuffer("id").asJava)
      .upsert(true)
      // overwrite cannot be used together with upsert; each commit would overwrite at the partition level
      .overwrite(false)
      .append()

    senv.execute("MysqlCDC2IcebergDataStream")
  }
}
Running the job first synchronizes the MySQL snapshot data; subsequent insert, update, and delete operations in MySQL are then applied to the Iceberg table as well. Querying the Iceberg data with Trino returns exactly the same result as querying MySQL.
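If Trino is not at hand, one alternative way to spot-check the Iceberg table contents from the JVM is the Iceberg generics read API. This is not part of the original pipeline; it is a minimal sketch that assumes the iceberg-data module is on the classpath and reuses the catalog settings from initialHiveCatalog().

import org.apache.hadoop.conf.Configuration
import org.apache.iceberg.catalog.TableIdentifier
import org.apache.iceberg.data.IcebergGenerics
import org.apache.iceberg.hive.HiveCatalog
import scala.collection.JavaConverters.iterableAsScalaIterableConverter

object VerifyOrgMain {
  def main(args: Array[String]): Unit = {
    // Same Hive catalog settings as initialHiveCatalog()
    val catalog = new HiveCatalog()
    catalog.setConf(new Configuration())
    val props = new java.util.HashMap[String, String]()
    props.put("warehouse", "hdfs://nnha/user/hive/warehouse")
    props.put("uri", "thrift://bigdata003:9083")
    catalog.initialize("hive_catalog", props)

    val table = catalog.loadTable(TableIdentifier.of("d_general", "org_main"))
    // Full scan of the current snapshot; intended only for small tables
    val records = IcebergGenerics.read(table).build()
    try records.asScala.foreach(println)
    finally records.close()
  }
}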