I. The pros and cons of the Flink DataStream API need no further elaboration here.
The benefit of a custom deserializer is that you get the data in exactly the format you want.
Let's look at three simple deserializer examples, tested against a MySQL data source.
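All three examples implement the flink-cdc connector's DebeziumDeserializationSchema interface, which has roughly the following shape (a paraphrase for orientation, not the connector's exact source):

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.source.SourceRecord;
import java.io.Serializable;

// Roughly the contract implemented below: turn one Debezium SourceRecord into
// zero or more records of type T, and tell Flink what T's type information is.
public interface DebeziumDeserializationSchema<T> extends Serializable {
    void deserialize(SourceRecord record, Collector<T> out) throws Exception;
    TypeInformation<T> getProducedType();
}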
1. Wrapping the record into a JSON object. Deserializer: CustomDebeziumDeserializationSchema
package com.sgd;
import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.kafka.connect.source.SourceRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/**
* @author lzl
* @create 2023-05-12 18:14
* @name CustomDebeziumDeserializationSchema
*/
public class CustomDebeziumDeserializationSchema implements DebeziumDeserializationSchema<JSONObject> {
private static final Logger LOGGER = LoggerFactory.getLogger(CustomDebeziumDeserializationSchema.class);
private static final long serialVersionUID = 7906905121308228264L;
public CustomDebeziumDeserializationSchema() {
}
/**
* Insert: SourceRecord{sourcePartition={server=mysql_binlog_source}, sourceOffset={file=mysql-bin.000220, pos=16692, row=1, snapshot=true}} ConnectRecord{topic='mysql_binlog_source.Flink_cdc.flink_cdc', kafkaPartition=null, key=Struct{id=2}, keySchema=Schema{mysql_binlog_source.Flink_cdc.student.Key:STRUCT}, value=Struct{after=Struct{id=2,name=刘蓓,age=18,dt=2023-05-15},source=Struct{version=1.2.1.Final,connector=mysql,name=mysql_binlog_source,ts_ms=0,snapshot=true,db=Flink_cdc,table=student,server_id=1,file=mysql-bin.000220,pos=16692,row=0},op=c,ts_ms=1603357255749}, valueSchema=Schema{mysql_binlog_source.Flink_cdc.student.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}
* Update: SourceRecord{sourcePartition={server=mysql_binlog_source}, sourceOffset={ts_sec=1603357705, file=mysql-bin.000220, pos=22964, row=1, server_id=1, event=2}} ConnectRecord{topic='mysql_binlog_source.Flink_cdc.student', kafkaPartition=null, key=Struct{id=8}, keySchema=Schema{mysql_binlog_source.Flink_cdc.student.Key:STRUCT}, value=Struct{before=Struct{id=8,name=关羽,age=15,dt=2023-05-17},after=Struct{id=8,name=张菲,age=16,dt=2023-05-18},source=Struct{version=1.2.1.Final,connector=mysql,name=mysql_binlog_source,ts_ms=1603357705000,db=Flink_cdc,table=student,server_id=1,file=mysql-bin.000220,pos=23109,row=0,thread=41},op=u,ts_ms=1603357705094}, valueSchema=Schema{mysql_binlog_source.Flink_cdc.student.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}
* Delete: SourceRecord{sourcePartition={server=mysql_binlog_source}, sourceOffset={ts_sec=1603357268, file=mysql-bin.000220, pos=18510, row=1, server_id=1, event=2}} ConnectRecord{topic='mysql_binlog_source.Flink_cdc.student', kafkaPartition=null, key=Struct{id=4}, keySchema=Schema{mysql_binlog_source.Flink_cdc.student.Key:STRUCT}, value=Struct{before=Struct{id=4,name=赵芸,age=15,dt=2023-05-15},source=Struct{version=1.2.1.Final,connector=mysql,name=mysql_binlog_source,ts_ms=1603357268010,db=Flink_cdc,table=student,server_id=1,file=mysql-bin.000220,pos=18655,row=0,thread=41},op=d,ts_ms=1603357268728}, valueSchema=Schema{mysql_binlog_source.Flink_cdc.student.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}
*
* @param sourceRecord sourceRecord
* @param collector out
*/
@Override
public void deserialize(SourceRecord sourceRecord, Collector<JSONObject> collector) {
JSONObject resdata = new JSONObject();
try {
Struct valueStruct = (Struct) sourceRecord.value();
Struct afterStruct = valueStruct.getStruct("after");
Struct beforeStruct = valueStruct.getStruct("before");
// Note: if valueStruct has only "after", the record is an insert; only "before" means a delete; both "before" and "after" means an update
if (afterStruct != null && beforeStruct != null) {
// update
System.out.println("Updating ==>>>>>>>");
LOGGER.info("Updated, ignored ...");
} else if (afterStruct != null) {
// insert: copy every field of the "after" image into the result JSON
System.out.println("Inserting ==>>>>>>>");
List<Field> fields = afterStruct.schema().fields();
String name;
Object value;
for (Field field : fields) {
name = field.name();
value = afterStruct.get(name);
resdata.put(name, value);
}
} else if (beforeStruct != null) {
// delete
System.out.println("Deleting ==>>>>>>>");
LOGGER.info("Deleted, ignored ...");
} else {
System.out.println("Unknown operation ...");
LOGGER.warn("Unknown operation ...");
}
} catch (Exception e) {
System.out.println("Deserialize throws exception:");
LOGGER.error("Deserialize throws exception:", e);
}
// updates and deletes are ignored above, so resdata may still be empty; only emit rows that carry data
if (!resdata.isEmpty()) {
collector.collect(resdata);
}
}
@Override
public TypeInformation<JSONObject> getProducedType() {
return BasicTypeInfo.of(JSONObject.class);
}
}
1.2 Main class (MySQL source): MySqlBinlogCdcMySql
package com.sgd;
import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.connectors.mysql.MySqlSource;
import com.ververica.cdc.connectors.mysql.table.StartupOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.Properties;
/**
* @author lzl
* @create 2023-05-12 18:34
* @name MySqlBinlogCdcMySql
*/
public class MySqlBinlogCdcMySql {
public static void main(String[] args) throws Exception {
//TODO 1. Get the Flink execution environment
Configuration configuration = new Configuration();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(configuration);
env.setParallelism(1);
// TODO 2. Enable checkpointing (left unimplemented here; a sketch follows)
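// A minimal checkpoint setup would look like this (sketch: the 60 s interval and storage path are assumptions, not from the original job):
// env.enableCheckpointing(60_000L);
// env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); // needs org.apache.flink.streaming.api.CheckpointingMode
// env.getCheckpointConfig().setCheckpointStorage("file:///tmp/flink-cdc-ck");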
// TODO 3. Create the Flink-MySQL-CDC source
Properties props = new Properties();
props.setProperty("scan.startup.mode", "initial");
SourceFunction<JSONObject> sourceFunction = MySqlSource.<JSONObject>builder()
// SourceFunction sourceFunction = MySqlSource.builder()
.hostname("10.110.17.52")
.port(3306)
.databaseList("flink_cdc") //订阅的库
.tableList("flink_cdc.student")//监控的表名,记住表签一定要加库名
.username("root")
.password("xxb@5196")
.startupOptions(StartupOptions.initial()) // full snapshot first, then binlog streaming
.debeziumProperties(props)
.deserializer(new CustomDebeziumDeserializationSchema()) // the example-1 schema above (the output below matches it)
.build();
//4. Read data from MySQL with the CDC source
// DataStreamSource dataStream = env.addSource(sourceFunction);
DataStreamSource<JSONObject> dataStream = env.addSource(sourceFunction);
//5. Print the data
dataStream.print("===>");
//6. Write the data into another MySQL instance
// dataStream.addSink(new MysqlWriter());
// System.out.println("MySQL write succeeded!");
//7. Launch the job
env.execute();
}
}
The output format is:
Inserting ==>>>>>>>
===>> {"dt":"2023-05-15","name":"刘蓓","id":1,"age":20}
Inserting ==>>>>>>>
===>> {"dt":"2023-05-15","name":"关雨","id":2,"age":20}
Inserting ==>>>>>>>
===>> {"dt":"2023-05-15","name":"张菲","id":3,"age":18}
Inserting ==>>>>>>>
===>> {"dt":"2023-05-16","name":"赵芸","id":4,"age":19}
2. Custom deserializer: CustomerDeserializationSchema, wrapping into a JSON object enriched with metadata.
package com.sgd;
import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;
import java.util.List;
/**
* @author lzl
* @create 2023-05-16 18:11
* @name CustomerDeserializationSchema
*/
public class CustomerDeserializationSchema implements DebeziumDeserializationSchema<JSONObject> {
private static final long serialVersionUID = -3168848963265670603L;
public CustomerDeserializationSchema() {
}
@Override
public void deserialize(SourceRecord record, Collector<JSONObject> out) {
Struct dataRecord = (Struct) record.value();
Struct afterStruct = dataRecord.getStruct("after");
Struct beforeStruct = dataRecord.getStruct("before");
/*
todo 1. If both beforeStruct and afterStruct are present, the record is an update
2. Only beforeStruct present: a delete
3. Only afterStruct present: an insert
*/
JSONObject logJson = new JSONObject();
String data_type = "";
List<Field> fieldsList = null;
if (afterStruct != null && beforeStruct != null) {
System.out.println("This is update data");
data_type = "update";
fieldsList = afterStruct.schema().fields();
// extract field names and values
for (Field field : fieldsList) {
String fieldName = field.name();
Object fieldValue = afterStruct.get(fieldName);
logJson.put(fieldName, fieldValue);
}
} else if (afterStruct != null) {
System.out.println("This is insert data");
data_type = "insert";
fieldsList = afterStruct.schema().fields();
// extract field names and values
for (Field field : fieldsList) {
String fieldName = field.name();
Object fieldValue = afterStruct.get(fieldName);
logJson.put(fieldName, fieldValue);
}
} else if (beforeStruct != null) {
System.out.println("This is delete data");
data_type = "delete";
fieldsList = beforeStruct.schema().fields();
// extract field names and values
for (Field field : fieldsList) {
String fieldName = field.name();
Object fieldValue = beforeStruct.get(fieldName);
logJson.put(fieldName, fieldValue);
}
} else {
System.out.println("Failed to sync data!");
}
// get the database and table info from the source struct
Struct source = dataRecord.getStruct("source");
Object db = source.get("db");
Object table = source.get("table");
Object ts_ms = source.get("ts_ms");
logJson.put("data_database", db);
logJson.put("data_table", table);
logJson.put("data_ts", ts_ms);
logJson.put("data_type", data_type);
// get the topic
String topic = record.topic();
System.out.println("topic = " + topic);
// extract the primary-key fields and derive a partition number from their hash
Struct pk = (Struct) record.key();
List<Field> pkFieldList = pk.schema().fields();
int partitionerNum = 0;
for (Field field : pkFieldList) {
Object pkValue = pk.get(field.name());
partitionerNum += pkValue.hashCode();
}
// the partition count is hard-coded to 3 here
int hash = Math.abs(partitionerNum) % 3;
logJson.put("pk_hashcode", hash);
out.collect(logJson);
}
@Override
public TypeInformation<JSONObject> getProducedType() {
return BasicTypeInfo.of(JSONObject.class);
}
}
2.1 Output format
This is insert data
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-15","data_type":"insert","data_table":"student","name":"刘蓓","id":1,"data_ts":1684287256536,"age":20,"data_database":"flink_cdc","pk_hashcode":1}
This is insert data
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-15","data_type":"insert","data_table":"student","name":"关雨","id":2,"data_ts":1684287256544,"age":20,"data_database":"flink_cdc","pk_hashcode":2}
This is insert data
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-15","data_type":"insert","data_table":"student","name":"张菲","id":3,"data_ts":1684287256546,"age":18,"data_database":"flink_cdc","pk_hashcode":0}
This is insert data
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-16","data_type":"insert","data_table":"student","name":"赵芸","id":4,"data_ts":1684287256546,"age":19,"data_database":"flink_cdc","pk_hashcode":1}
This is update data
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-16","data_type":"update","data_table":"student","name":"关雨","id":2,"data_ts":1684281997000,"age":20,"data_database":"flink_cdc","pk_hashcode":2}
This is delete data
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-17","data_type":"delete","data_table":"student","name":"刘璨","id":5,"data_ts":1684232217000,"age":15,"data_database":"flink_cdc","pk_hashcode":2}
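For reference, pk_hashcode is just the primary key's hashCode() modulo the hard-coded partition count of 3; this standalone sketch reproduces the values above:

// Reproduces the pk_hashcode values above; for an Integer key, hashCode() is the int value itself
public class PkHashDemo {
    public static void main(String[] args) {
        for (int id : new int[]{1, 2, 3, 4, 5}) {
            int hash = Math.abs(Integer.hashCode(id)) % 3; // 3 = hard-coded partition count
            System.out.println("id=" + id + " -> pk_hashcode=" + hash); // prints 1, 2, 0, 1, 2
        }
    }
}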
3. Custom deserializer: CustomerDeserialization, wrapping into a String
package com.sgd;
/**
* @author lzl
* @create 2023-05-16 17:29
* @name CustomerDeserialization
*/
import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import io.debezium.data.Envelope;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;
import java.util.List;
/**
 * The wrapped data format:
 * {
 *   "database": "",
 *   "tableName": "",
 *   "before": {"id": "", "tm_name": "", ...},
 *   "after": {"id": "", "tm_name": "", ...},
 *   "type": "c u d",
 *   //"ts": 156456135615
 * }
 */
public class CustomerDeserialization implements DebeziumDeserializationSchema<String> {
private static final long serialVersionUID = -3168848963265670603L;
public CustomerDeserialization() {
}
@Override
public void deserialize(SourceRecord sourceRecord, Collector<String> collector) throws Exception {
//1. Create a JSON object to hold the final record
JSONObject result = new JSONObject();
//2. Get the database and table name from the topic (format: mysql_binlog_source.flink_cdc.student)
String topic = sourceRecord.topic();
// the delimiter must be written as \\. because split() takes a regex, and a bare "." matches any character
String[] split = topic.split("\\.");
String database = split[1];
String tableName = split[2];
Struct dataRecord = (Struct) sourceRecord.value();
//3. Get the "before" image
Struct before = dataRecord.getStruct("before");
// JSON object holding the "before" values
JSONObject beforeData = new JSONObject();
if (before != null) {
Schema beforeSchema = before.schema();
List<Field> beforeFields = beforeSchema.fields();
for (Field beforeField : beforeFields) {
Object o = before.get(beforeField);
beforeData.put(beforeField.name(), o);
}
}
//4. Get the "after" image
Struct after = dataRecord.getStruct("after");
// JSON object holding the "after" values
JSONObject afterData = new JSONObject();
if (after != null) {
Schema afterSchema = after.schema();
List<Field> afterFields = afterSchema.fields();
for (Field afterField : afterFields) {
Object o = after.get(afterField);
afterData.put(afterField.name(), o);
}
}
//5. Derive the operation type (READ/CREATE/UPDATE/DELETE)
Envelope.Operation operation = Envelope.operationFor(sourceRecord);
String type = operation.toString().toLowerCase();
if (type.equals("create")) {
type = "insert";
}
//6. Put the fields into the JSON object
result.put("database", database);
result.put("tableName", tableName);
result.put("before", beforeData);
result.put("after", afterData);
result.put("operation", operation);
result.put("type", type);
//7. Emit the data
collector.collect(result.toJSONString());
}
@Override
public TypeInformation<String> getProducedType() {
return BasicTypeInfo.STRING_TYPE_INFO;
}
}
3.1 Output format:
===>> {"database":"flink_cdc","before":{},"after":{"dt":"2023-05-15","name":"刘蓓","id":1,"age":20},"type":"read","operation":"READ","tableName":"student"}
===>> {"database":"flink_cdc","before":{},"after":{"dt":"2023-05-15","name":"关雨","id":2,"age":18},"type":"read","operation":"READ","tableName":"student"}
===>> {"database":"flink_cdc","before":{},"after":{"dt":"2023-05-15","name":"张菲","id":3,"age":18},"type":"read","operation":"READ","tableName":"student"}
===>> {"database":"flink_cdc","before":{},"after":{"dt":"2023-05-16","name":"赵芸","id":4,"age":19},"type":"read","operation":"READ","tableName":"student"}
===>> {"database":"flink_cdc","before":{"dt":"2023-05-15","name":"关雨","id":2,"age":18},"after":{"dt":"2023-05-15","name":"关雨","id":2,"age":19},"type":"update","operation":"UPDATE","tableName":"student"}
===>> {"database":"flink_cdc","before":{},"after":{"dt":"2023-06-15","name":"刘璨","id":5,"age":13},"type":"insert","operation":"CREATE","tableName":"student"}
===>> {"database":"flink_cdc","before":{"dt":"2023-06-15","name":"刘璨","id":5,"age":13},"after":{},"type":"delete","operation":"DELETE","tableName":"student"}