Flink CDC DataStream: Custom Deserialization

I. I won't dwell on the pros and cons of the Flink DataStream API here.
The benefit of a custom deserialization schema is that you get the data in exactly the format you want.
Let's look at three simple deserialization examples, using MySQL as the test data source.
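All three examples implement the same two-method contract of com.ververica.cdc.debezium.DebeziumDeserializationSchema: deserialize(...) converts each Debezium SourceRecord into the target type, and getProducedType() tells Flink what that type is. Here is a minimal skeleton (the class name MyStringSchema is only a placeholder for illustration); every example below just fills in these two methods.

import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.source.SourceRecord;

// Minimal skeleton of a custom deserializer (placeholder class name, for illustration only).
public class MyStringSchema implements DebeziumDeserializationSchema<String> {
    @Override
    public void deserialize(SourceRecord record, Collector<String> out) throws Exception {
        // Convert the Debezium change record into the target format and emit it.
        out.collect(record.toString());
    }

    @Override
    public TypeInformation<String> getProducedType() {
        // Tell Flink which type this schema produces.
        return BasicTypeInfo.STRING_TYPE_INFO;
    }
}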

1. Wrapping the record into a JSON object. Deserializer: CustomDebeziumDeserializationSchema

package com.sgd;

import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.kafka.connect.source.SourceRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;

/**
 * @author lzl
 * @create 2023-05-12 18:14
 * @name CustomDebeziumDeserializationSchema
 */
public class CustomDebeziumDeserializationSchema implements DebeziumDeserializationSchema<JSONObject> {
    private static final Logger LOGGER = LoggerFactory.getLogger(CustomDebeziumDeserializationSchema.class);
    private static final long serialVersionUID = 7906905121308228264L;

    public CustomDebeziumDeserializationSchema() {
    }

    /**
     * Insert: SourceRecord{sourcePartition={server=mysql_binlog_source}, sourceOffset={file=mysql-bin.000220, pos=16692, row=1, snapshot=true}} ConnectRecord{topic='mysql_binlog_source.Flink_cdc.flink_cdc', kafkaPartition=null, key=Struct{id=2}, keySchema=Schema{mysql_binlog_source.Flink_cdc.student.Key:STRUCT}, value=Struct{after=Struct{id=2,name=刘蓓,age=18,dt=2023-05-15},source=Struct{version=1.2.1.Final,connector=mysql,name=mysql_binlog_source,ts_ms=0,snapshot=true,db=Flink_cdc,table=student,server_id=1,file=mysql-bin.000220,pos=16692,row=0},op=c,ts_ms=1603357255749}, valueSchema=Schema{mysql_binlog_source.Flink_cdc.student.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}
     * Update: SourceRecord{sourcePartition={server=mysql_binlog_source}, sourceOffset={ts_sec=1603357705, file=mysql-bin.000220, pos=22964, row=1, server_id=1, event=2}} ConnectRecord{topic='mysql_binlog_source.Flink_cdc.student', kafkaPartition=null, key=Struct{id=8}, keySchema=Schema{mysql_binlog_source.Flink_cdc.student.Key:STRUCT}, value=Struct{before=Struct{id=8,name=关羽,age=15,dt=2023-05-17},after=Struct{id=8,name=张菲,age=16,dt=2023-05-18},source=Struct{version=1.2.1.Final,connector=mysql,name=mysql_binlog_source,ts_ms=1603357705000,db=Flink_cdc,table=student,server_id=1,file=mysql-bin.000220,pos=23109,row=0,thread=41},op=u,ts_ms=1603357705094}, valueSchema=Schema{mysql_binlog_source.Flink_cdc.student.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}
     * Delete: SourceRecord{sourcePartition={server=mysql_binlog_source}, sourceOffset={ts_sec=1603357268, file=mysql-bin.000220, pos=18510, row=1, server_id=1, event=2}} ConnectRecord{topic='mysql_binlog_source.Flink_cdc.student', kafkaPartition=null, key=Struct{id=4}, keySchema=Schema{mysql_binlog_source.Flink_cdc.student.Key:STRUCT}, value=Struct{before=Struct{id=4,name=赵芸,age=15,dt=2023-05-15},source=Struct{version=1.2.1.Final,connector=mysql,name=mysql_binlog_source,ts_ms=1603357268010,db=Flink_cdc,table=student,server_id=1,file=mysql-bin.000220,pos=18655,row=0,thread=41},op=d,ts_ms=1603357268728}, valueSchema=Schema{mysql_binlog_source.Flink_cdc.student.Envelope:STRUCT}, timestamp=null, headers=ConnectHeaders(headers=)}
     *
     * @param sourceRecord sourceRecord
     * @param collector    out
     */
    @Override
    public void deserialize(SourceRecord sourceRecord, Collector<JSONObject> collector) {
        JSONObject resdata  = new JSONObject();
        try {
            Struct valueStruct = (Struct) sourceRecord.value();
            Struct afterStruct = valueStruct.getStruct("after");
            Struct beforeStruct = valueStruct.getStruct("before");
            // Note: if the value only contains "after", the record is an insert; if it only contains "before",
            // it is a delete; if both "before" and "after" are present, it is an update.
            if (afterStruct != null && beforeStruct != null) {
                // Update
                System.out.println("Updating ==>>>>>>>");
                LOGGER.info("Updated, ignored ...");
            }else if (afterStruct != null) {
                // Insert
                System.out.println("Inserting ==>>>>>>>");
                List<Field> fields = afterStruct.schema().fields();
                String name;
                Object value;
                for (Field field : fields) {
                    name = field.name();
                    value = afterStruct.get(name);
                    resdata.put(name, value);
                }
            }else if (beforeStruct != null) {
                // Delete
                System.out.println("Deleting ==>>>>>>>");
                LOGGER.info("Deleted, ignored ...");
            } else {
                System.out.println("No this operation ...");
                LOGGER.warn("No this operation ...");
            }
        }catch (Exception e){
                System.out.println("Deserialize throws exception:");
                LOGGER.error("Deserialize throws exception:", e);
        }
        // Only inserts populate resdata; skip emitting an empty object for ignored updates/deletes.
        if (!resdata.isEmpty()) {
            collector.collect(resdata);
        }
    }

    @Override
    public TypeInformation<JSONObject> getProducedType() {
        return BasicTypeInfo.of(JSONObject.class);
    }
}
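A small note on the produced type: the class above declares it through BasicTypeInfo. For readability you can also declare it directly with TypeInformation.of; a minimal alternative sketch (behavior is the same, JSONObject is treated as a generic type either way):

    @Override
    public TypeInformation<JSONObject> getProducedType() {
        // Declare the produced type explicitly; equivalent to the version above.
        return TypeInformation.of(JSONObject.class);
    }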

1.2 Main class (MySQL source): MySqlBinlogCdcMySql

package com.sgd;

import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.connectors.mysql.MySqlSource;
import com.ververica.cdc.connectors.mysql.table.StartupOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.Properties;

/**
 * @author lzl
 * @create 2023-05-12 18:34
 * @name MySqlBinlogCdcMySql
 */
public class MySqlBinlogCdcMySql {
    public static void main(String[] args) throws Exception {
        //TODO 1. Get the Flink execution environment
        Configuration configuration = new Configuration();
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(configuration);
        env.setParallelism(1);

        // TODO 2. Enable checkpointing (a minimal sketch follows after this class)

        // TODO 3. Create the Flink-MySQL-CDC source
        Properties props = new Properties();
        props.setProperty("scan.startup.mode", "initial");
        SourceFunction<JSONObject> sourceFunction = MySqlSource.<JSONObject>builder()
                .hostname("10.110.17.52")
                .port(3306)
                .databaseList("flink_cdc") // database(s) to subscribe to
                .tableList("flink_cdc.student") // table(s) to monitor; remember to prefix the table name with the database name
                .username("root")
                .password("xxb@5196")
                .startupOptions(StartupOptions.initial()) // full snapshot first, then continue from the binlog
                .debeziumProperties(props)
                .deserializer(new CustomDebeziumDeserializationSchema()) // the deserializer from section 1; swap in the ones from sections 2 and 3 as needed
                .build();

        //4. Read data from MySQL with the CDC source
        DataStreamSource<JSONObject> dataStream = env.addSource(sourceFunction);
        //5. Print the data
        dataStream.print("===>");
        //6. Optionally write the data to another MySQL instance
        // dataStream.addSink(new MysqlWriter());
        //7. Start the job
        env.execute();
    }
}
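The TODO 2 step in the main class is left empty. A minimal checkpointing sketch could look like the following (the interval and timeout values are placeholders I chose for illustration, not from the original); with MySQL CDC, checkpoints are what make the binlog offset recoverable after a failure:

        // TODO 2. Enable checkpointing (placeholder values; tune for your environment)
        // import org.apache.flink.streaming.api.CheckpointingMode;
        env.enableCheckpointing(5000L);                                              // checkpoint every 5 s
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointTimeout(60000L);                      // abort checkpoints stuck for 60 s
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(1000L);              // breathing room between checkpoints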

The output format is:

Inserting ==>>>>>>>
===>> {"dt":"2023-05-15","name":"刘蓓","id":1,"age":20}
Inserting ==>>>>>>>
===>> {"dt":"2023-05-15","name":"关雨","id":2,"age":20}
Inserting ==>>>>>>>
===>> {"dt":"2023-05-15","name":"张菲","id":3,"age":18}
Inserting ==>>>>>>>
===>> {"dt":"2023-05-16","name":"赵芸","id":4,"age":19}

2. Custom deserializer: CustomerDeserializationSchema, wrapping the record into a JSON object with extra metadata.

package com.sgd;

import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;
import java.util.List;

/**
 * @author lzl
 * @create 2023-05-16 18:11
 * @name CustomerDeserializationSchema
 */
public class CustomerDeserializationSchema implements DebeziumDeserializationSchema<JSONObject> {
    private static final long serialVersionUID = -3168848963265670603L;
    public CustomerDeserializationSchema() {
    }

    @Override
    public void deserialize(SourceRecord record, Collector<JSONObject> out) {
        Struct dataRecord = (Struct) record.value();

        Struct afterStruct = dataRecord.getStruct("after");
        Struct beforeStruct = dataRecord.getStruct("before");
        /*
          todo 1. If both beforeStruct and afterStruct are present, the record is an update.
               2. If only beforeStruct is present, it is a delete.
               3. If only afterStruct is present, it is an insert.
         */

        JSONObject logJson = new JSONObject();

        String data_type = "";
        List<Field> fieldsList = null;
        if (afterStruct != null && beforeStruct != null) {
            System.out.println("This is an update record");
            data_type = "update";
            fieldsList = afterStruct.schema().fields();
            // Collect field names and values
            for (Field field : fieldsList) {
                String fieldName = field.name();
                Object fieldValue = afterStruct.get(fieldName);
                logJson.put(fieldName, fieldValue);
            }
        } else if (afterStruct != null) {
            System.out.println("This is an insert record");
            data_type = "insert";
            fieldsList = afterStruct.schema().fields();
            // Collect field names and values
            for (Field field : fieldsList) {
                String fieldName = field.name();
                Object fieldValue = afterStruct.get(fieldName);
                logJson.put(fieldName, fieldValue);
            }
        } else if (beforeStruct != null) {
            System.out.println("This is a delete record");
            data_type = "delete";
            fieldsList = beforeStruct.schema().fields();
            // Collect field names and values
            for (Field field : fieldsList) {
                String fieldName = field.name();
                Object fieldValue = beforeStruct.get(fieldName);
                logJson.put(fieldName, fieldValue);
            }
        } else {
            System.out.println("Failed to recognize the change record!");
        }

        // Get the database, table, and timestamp from the source metadata
        Struct source = dataRecord.getStruct("source");
        Object db = source.get("db");
        Object table = source.get("table");
        Object ts_ms = source.get("ts_ms");

        logJson.put("data_database", db);
        logJson.put("data_table", table);
        logJson.put("data_ts", ts_ms);
        logJson.put("data_type", data_type);

        // Get the topic
        String topic = record.topic();
        System.out.println("topic = " + topic);

        // Get the primary-key fields and derive a partition number from their combined hash
        Struct pk = (Struct) record.key();
        List<Field> pkFieldList = pk.schema().fields();
        int partitionerNum = 0;
        for (Field field : pkFieldList) {
            Object pkValue = pk.get(field.name());
            partitionerNum += pkValue.hashCode();
        }
        int hash = Math.abs(partitionerNum) % 3;
        logJson.put("pk_hashcode", hash);
        out.collect(logJson);
    }
    @Override
    public TypeInformation<JSONObject> getProducedType() {
        return BasicTypeInfo.of(JSONObject.class);
    }
}
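The pk_hashcode field is the primary-key hash modulo 3, so it gives every change a stable partition number. A small usage sketch (assuming dataStream is the DataStreamSource<JSONObject> built in the main class of 1.2 with this deserializer plugged in):

        // Key the stream by pk_hashcode so all changes that hash to the same bucket
        // are handled by the same subtask (or routed to the same Kafka partition). Sketch only.
        dataStream
                .keyBy(json -> json.getIntValue("pk_hashcode"))
                .print("keyed");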

2.1 The output format

This is an insert record
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-15","data_type":"insert","data_table":"student","name":"刘蓓","id":1,"data_ts":1684287256536,"age":20,"data_database":"flink_cdc","pk_hashcode":1}
This is an insert record
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-15","data_type":"insert","data_table":"student","name":"关雨","id":2,"data_ts":1684287256544,"age":20,"data_database":"flink_cdc","pk_hashcode":2}
This is an insert record
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-15","data_type":"insert","data_table":"student","name":"张菲","id":3,"data_ts":1684287256546,"age":18,"data_database":"flink_cdc","pk_hashcode":0}
This is an insert record
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-16","data_type":"insert","data_table":"student","name":"赵芸","id":4,"data_ts":1684287256546,"age":19,"data_database":"flink_cdc","pk_hashcode":1}
This is an update record
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-16","data_type":"update","data_table":"student","name":"关雨","id":2,"data_ts":1684281997000,"age":20,"data_database":"flink_cdc","pk_hashcode":2}
This is a delete record
topic = mysql_binlog_source.flink_cdc.student
===>> {"dt":"2023-05-17","data_type":"delete","data_table":"student","name":"刘璨","id":5,"data_ts":1684232217000,"age":15,"data_database":"flink_cdc","pk_hashcode":2}

3. Custom deserializer 3: CustomerDeserialization, wrapping the record into a JSON String

package com.sgd;

/**
 * @author lzl
 * @create 2023-05-16 17:29
 * @name CustomerDeserialization
 */

import com.alibaba.fastjson.JSONObject;
import com.ververica.cdc.debezium.DebeziumDeserializationSchema;
import io.debezium.data.Envelope;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.source.SourceRecord;

import java.util.List;

/**
 * The wrapped data format:
 *
 * {
 *   "database": "",
 *   "tableName": "",
 *   "before": {"id": "", "tm_name": "", ...},
 *   "after": {"id": "", "tm_name": "", ...},
 *   "type": "c u d",
 *   //"ts": 156456135615
 * }
 */
public class CustomerDeserialization implements DebeziumDeserializationSchema<String> {
    private static final long serialVersionUID = -3168848963265670603L;

    public CustomerDeserialization() {
    }
    
    @Override
    public void deserialize(SourceRecord sourceRecord, Collector<String> collector) throws Exception {
        //1. Create a JSON object to hold the final result
        JSONObject result = new JSONObject();
        //2. Get the database and table name from the topic
        String topic = sourceRecord.topic();
        // The dot must be escaped as \\. because split() takes a regular expression
        String[] split = topic.split("\\.");
        String database = split[1];
        String tableName = split[2];
        Struct dataRecord = (Struct) sourceRecord.value();

        //3. Get the "before" data
        Struct before = dataRecord.getStruct("before");
        // JSON object to hold the "before" field values
        JSONObject beforeData = new JSONObject();
        if (before != null) {
            Schema beforeSchema = before.schema();
            List<Field> beforeFields = beforeSchema.fields();
            for (Field beforeField : beforeFields) {
                Object o = before.get(beforeField);
                beforeData.put(beforeField.name(), o);
            }
        }

        //4. Get the "after" data
        Struct after = dataRecord.getStruct("after");
        // JSON object to hold the "after" field values
        JSONObject afterData = new JSONObject();
        if (after != null) {
            Schema afterSchema = after.schema();
            List<Field> afterFields = afterSchema.fields();
            for (Field afterField : afterFields) {
                Object o = after.get(afterField);
                afterData.put(afterField.name(), o);
            }
        }

        //5. Get the operation type (READ/CREATE/UPDATE/DELETE) and normalize "create" to "insert"
        Envelope.Operation operation = Envelope.operationFor(sourceRecord);
        String type = operation.toString().toLowerCase();
        if (type.equals("create")) {
            type = "insert";
        }

        //6. Put the fields into the result JSON object
        result.put("database", database);
        result.put("tableName", tableName);
        result.put("before", beforeData);
        result.put("after", afterData);
        result.put("operation", operation);
        result.put("type", type);

        //7. Emit the data as a JSON string
        collector.collect(result.toJSONString());
    }

    @Override
    public TypeInformation<String> getProducedType() {
        return BasicTypeInfo.STRING_TYPE_INFO;
    }
}
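To use this String-producing deserializer, the builder in the main class changes its type parameter. A minimal sketch, reusing the connection settings from 1.2 for illustration:

        // Fragment of the main class, switching to the String deserializer from this section.
        SourceFunction<String> sourceFunction = MySqlSource.<String>builder()
                .hostname("10.110.17.52")
                .port(3306)
                .databaseList("flink_cdc")
                .tableList("flink_cdc.student")
                .username("root")
                .password("xxb@5196")
                .startupOptions(StartupOptions.initial())
                .deserializer(new CustomerDeserialization())   // produces JSON strings
                .build();

        DataStreamSource<String> dataStream = env.addSource(sourceFunction);
        dataStream.print("===>");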

3.2 The output format:

===>> {"database":"flink_cdc","before":{},"after":{"dt":"2023-05-15","name":"刘蓓","id":1,"age":20},"type":"read","operation":"READ","tableName":"student"}
===>> {"database":"flink_cdc","before":{},"after":{"dt":"2023-05-15","name":"关雨","id":2,"age":18},"type":"read","operation":"READ","tableName":"student"}
===>> {"database":"flink_cdc","before":{},"after":{"dt":"2023-05-15","name":"张菲","id":3,"age":18},"type":"read","operation":"READ","tableName":"student"}
===>> {"database":"flink_cdc","before":{},"after":{"dt":"2023-05-16","name":"赵芸","id":4,"age":19},"type":"read","operation":"READ","tableName":"student"}
===>> {"database":"flink_cdc","before":{"dt":"2023-05-15","name":"关雨","id":2,"age":18},"after":{"dt":"2023-05-15","name":"关雨","id":2,"age":19},"type":"update","operation":"UPDATE","tableName":"student"}
===>> {"database":"flink_cdc","before":{},"after":{"dt":"2023-06-15","name":"刘璨","id":5,"age":13},"type":"insert","operation":"CREATE","tableName":"student"}
===>> {"database":"flink_cdc","before":{"dt":"2023-06-15","name":"刘璨","id":5,"age":13},"after":{},"type":"delete","operation":"DELETE","tableName":"student"}
