Flink CDC, writing to ClickHouse, and multi-way output

1. Scenario

Log data is consumed from Kafka. For each record the job:

1. Joins against a dictionary table to enrich the log data.

2. Classifies the record by log level and fans it out to multiple sinks (a minimal sketch of the mechanism follows this list):

Low level: write to ClickHouse only.

High level: write to ClickHouse and, at the same time, push to Kafka for a secondary processing pipeline.

The dictionary table is ingested as a Flink CDC changelog stream and distributed to all subtasks via broadcast state; the level-based fan-out is implemented with a side output (OutputTag).
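
Before walking through the full job, here is a minimal, self-contained sketch of Flink's side-output mechanism, which is what implements the multi-way output here. The topic names, the "high:"/"low:" level rule, and the sample records are illustrative only, not from the original job:

import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

public class SideOutputDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Tag for the secondary stream; the anonymous subclass lets Flink
        // capture the generic type at runtime.
        final OutputTag<String> highLevel = new OutputTag<String>("high-level") {};

        SingleOutputStreamOperator<String> mainStream = env
                .fromElements("low:a", "high:b", "low:c")
                .process(new ProcessFunction<String, String>() {
                    @Override
                    public void processElement(String value, Context ctx, Collector<String> out) {
                        if (value.startsWith("high:")) {
                            ctx.output(highLevel, value); // extra copy to the side stream
                        }
                        out.collect(value);               // every record to the main stream
                    }
                });

        mainStream.print("main");                          // stand-in for the ClickHouse sink
        mainStream.getSideOutput(highLevel).print("side"); // stand-in for the Kafka sink
        env.execute("side-output-demo");
    }
}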

2. Implementation

package com.ws.kafka2clickhouse;

import cn.hutool.json.JSONUtil;
import com.ws.kafka2clickhouse.bean.CompanyInfo;
import com.ws.kafka2clickhouse.bean.LogEvent;
import com.ws.kafka2clickhouse.sink.MyClickHouseSink;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcExecutionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import org.apache.flink.connector.jdbc.JdbcStatementBuilder;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.streaming.connectors.kafka.KafkaSerializationSchema;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import org.apache.flink.types.RowKind;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import org.apache.kafka.clients.producer.ProducerRecord;

import javax.annotation.Nullable;
import java.nio.charset.StandardCharsets;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Properties;

public class Kafka2ClickHouse {


    public static void main(String[] args) throws Exception {
        System.setProperty("java.net.preferIPv4Stack", "true");
        System.setProperty("HADOOP_USER_NAME", "hdfs");

        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration());
//        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(8);
//        env.enableCheckpointing(1000, CheckpointingMode.AT_LEAST_ONCE);
//        env.getCheckpointConfig().setCheckpointStorage("file:///D:/out_test/ck");
//        env.getCheckpointConfig().setCheckpointStorage("hdfs://hdp01:8020/tmp/kafka2hdfs/");
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env);

        // 1. Read the main log stream from Kafka
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setTopics("dataSource")
                .setGroupId("group1")
                .setBootstrapServers("hdp01:6667")
                .setStartingOffsets(OffsetsInitializer.latest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();
        DataStreamSource<String> kafka = env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka");
        // 2. Deserialize the JSON payload of the main stream into POJOs
        SingleOutputStreamOperator<LogEvent> beans =
                kafka.map((MapFunction<String, LogEvent>) s -> JSONUtil.toBean(s, LogEvent.class));
        // 3. Load the dictionary table as a MySQL CDC changelog stream
        tenv.executeSql(
                "CREATE TABLE dmpt_base_oper_log (\n" +
                        "  id BIGINT,\n" +
                        "  title STRING,\n" +
                        "  PRIMARY KEY (id) NOT ENFORCED\n" +
                        ") WITH (\n" +
                        "'connector' = 'mysql-cdc',\n" +
                        "'hostname' = 'localhost',\n" +
                        "'port' = '3306',\n" +
                        "'username' = 'root',\n" +
                        "'password' = 'root',\n" +
                        "'database-name' = 'test',\n" +
                        "'table-name' = 'test_recursive'\n" +
                        ")"
        );
        Table result = tenv.sqlQuery("select * from dmpt_base_oper_log");
        DataStream<Row> dict = tenv.toChangelogStream(result);
        dict.print();
        // 4. Transform the dictionary rows, attaching the CDC change type
        SingleOutputStreamOperator<CompanyInfo> companyDict = dict.map(new RichMapFunction<Row, CompanyInfo>() {
            @Override
            public CompanyInfo map(Row row) throws Exception {
                Long id = (Long) row.getField("id");
                String title = (String) row.getField("title");
                // Carry the CDC change type (insert / delete / update) with the record
                RowKind kind = row.getKind();
                return new CompanyInfo(id, title, kind);
            }
        });
        // 5. Broadcast the dictionary stream
        MapStateDescriptor<Long, CompanyInfo> company_info_desc =
                new MapStateDescriptor<>("company_info_dict", Long.class, CompanyInfo.class);
        BroadcastStream<CompanyInfo> broadcastStream = companyDict.broadcast(company_info_desc);
        // 6. Create the side-output tag (anonymous subclass so Flink keeps the generic type)
        OutputTag<String> tokafka = new OutputTag<String>("tokafka") {
        };


        SingleOutputStreamOperator<LogEvent> beans_company = beans.connect(broadcastStream)
                .process(new BroadcastProcessFunction<LogEvent, CompanyInfo, LogEvent>() {
            @Override
            public void processElement(LogEvent logEvent, ReadOnlyContext readOnlyContext, Collector<LogEvent> collector) throws Exception {
                // Called for every record on the main data stream
                ReadOnlyBroadcastState<Long, CompanyInfo> broadcastState = readOnlyContext.getBroadcastState(company_info_desc);
                CompanyInfo companyInfo = broadcastState.get(logEvent.getMessageId());
                // 7. If company info is present, this is a high-level record: besides the
                // main stream, emit a copy to the side output so it also reaches Kafka
                if (companyInfo != null) {
                    logEvent.setCompanyInfo(companyInfo);
                    readOnlyContext.output(tokafka, JSONUtil.toJsonStr(logEvent));
                }
                collector.collect(logEvent);
            }

            @Override
            public void processBroadcastElement(CompanyInfo companyInfo, Context context, Collector<LogEvent> collector) throws Exception {
                // Called for every record on the broadcast (dictionary) stream
                BroadcastState<Long, CompanyInfo> broadcastState = context.getBroadcastState(company_info_desc);
                if (companyInfo.getRowKind() == RowKind.INSERT) {
                    // insert
                    broadcastState.put(companyInfo.getId(), companyInfo);
                } else if (companyInfo.getRowKind() == RowKind.DELETE) {
                    // delete
                    broadcastState.remove(companyInfo.getId());
                } else {
                    // update (UPDATE_BEFORE / UPDATE_AFTER): overwrite the entry
                    broadcastState.put(companyInfo.getId(), companyInfo);
                }
            }
        });

        // SQL for inserting rows into ClickHouse
        String insertIntoCkSql = "insert into default.dns_logs values(?,?,?,?,?,?,?,?,?,?,?,?,?)";
        // Build the ClickHouse sink
        SinkFunction<LogEvent> sink = JdbcSink.sink(
                // insert statement
                insertIntoCkSql,
                // bind each LogEvent to the statement parameters
                new JdbcStatementBuilder<LogEvent>() {
                    @Override
                    public void accept(PreparedStatement preparedStatement, LogEvent logEvent) throws SQLException {
                        // let SQLException propagate: swallowing it here would
                        // silently drop failed batches instead of retrying them
                        preparedStatement.setString(1, logEvent.getMessageType());
                        preparedStatement.setLong(2, logEvent.getMessageId());
                        preparedStatement.setString(3, logEvent.getDeviceId());
                        preparedStatement.setString(4, logEvent.getCol1());
                        preparedStatement.setString(5, logEvent.getCol2());
                        preparedStatement.setString(6, logEvent.getCol3());
                        preparedStatement.setString(7, logEvent.getCol4());
                        preparedStatement.setString(8, logEvent.getHeaders().getDeviceTime());
                        preparedStatement.setLong(9, logEvent.getHeaders().get_uid());
                        preparedStatement.setString(10, logEvent.getHeaders().getProductId());
                        preparedStatement.setString(11, logEvent.getHeaders().getOrgId());
                        if (logEvent.getCompanyInfo() != null) {
                            preparedStatement.setString(12, logEvent.getCompanyInfo().getTitle());
                        } else {
                            preparedStatement.setString(12, null);
                        }
                        preparedStatement.setString(13, logEvent.getRegion());
                    }
                },
                // batching options
                new JdbcExecutionOptions.Builder()
                        // batch size (default 5000)
                        .withBatchSize(10000)
                        // flush interval in milliseconds
                        .withBatchIntervalMs(5000)
                        .withMaxRetries(3)
                        .build(),
                // ClickHouse connection options
                new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                        .withDriverName("ru.yandex.clickhouse.ClickHouseDriver")
                        .withUrl("jdbc:clickhouse://192.16.10.118:1111")
                        .withUsername("default")
                        .withPassword("xxxx")
                        .build()
        );
        // 8. All records go to the base ClickHouse table
        beans_company.addSink(sink);
        beans_company.print("base-clickhouse");
        // 9. High-level records are additionally pushed to the analysis Kafka topic
        DataStream<String> sideOutput = beans_company.getSideOutput(tokafka);
        sideOutput.print("enriched-kafka");
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "hdp01:6667");
        // EXACTLY_ONCE uses Kafka transactions: keep the transaction timeout within
        // the broker's transaction.max.timeout.ms (15 minutes by default)
        properties.setProperty("transaction.timeout.ms", "900000");
        // 10. Build the Kafka sink
        KafkaSerializationSchema<String> serializationSchema = new KafkaSerializationSchema<String>() {
            @Override
            public ProducerRecord<byte[], byte[]> serialize(String element, @Nullable Long timestamp) {
                return new ProducerRecord<>(
                        "dataZengQiang", // target topic
                        element.getBytes(StandardCharsets.UTF_8)); // record contents
            }
        };

        FlinkKafkaProducer<String> myProducer = new FlinkKafkaProducer<>(
                "dataZengQiang",        // default target topic
                serializationSchema,    // serialization schema
                properties,             // producer config
                FlinkKafkaProducer.Semantic.EXACTLY_ONCE); // fault-tolerance guarantee
        // 11. Write to Kafka
        sideOutput.addSink(myProducer);
        env.execute();
    }
}
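
The LogEvent, Headers, and CompanyInfo beans under com.ws.kafka2clickhouse.bean are not shown in the post. As a reference, here is a sketch of CompanyInfo reconstructed from how the job uses it (the three-argument constructor and the getters called above); LogEvent follows the same POJO pattern with messageType, messageId, deviceId, col1..col4, region, a Headers object (deviceTime, _uid, productId, orgId), and the companyInfo field. Field types are inferred from the PreparedStatement setters, not confirmed:

package com.ws.kafka2clickhouse.bean;

import org.apache.flink.types.RowKind;

// Hypothetical reconstruction; only the members exercised by the job are shown.
public class CompanyInfo {
    private Long id;
    private String title;
    private RowKind rowKind; // CDC change type carried along with the row

    public CompanyInfo() {
        // Flink POJO serialization requires a public no-arg constructor
    }

    public CompanyInfo(Long id, String title, RowKind rowKind) {
        this.id = id;
        this.title = title;
        this.rowKind = rowKind;
    }

    public Long getId() { return id; }
    public String getTitle() { return title; }
    public RowKind getRowKind() { return rowKind; }
    public void setId(Long id) { this.id = id; }
    public void setTitle(String title) { this.title = title; }
    public void setRowKind(RowKind rowKind) { this.rowKind = rowKind; }
}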


    
3. pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>test</artifactId>
        <groupId>org.example</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>flink</artifactId>
    <properties>
        <flink.version>1.13.2</flink.version>
        <scala.version>2.11</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-runtime-web_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- Table API / SQL, needed for the CDC dictionary table -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java-bridge_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-jdbc_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>2.8.1</version>
        </dependency>

        <!-- ClickHouse JDBC driver -->
        <dependency>
            <groupId>ru.yandex.clickhouse</groupId>
            <artifactId>clickhouse-jdbc</artifactId>
            <version>0.2.4</version>
        </dependency>
        <!-- MySQL CDC connector, loaded from a local jar via system scope -->
        <dependency>
            <groupId>com.ws</groupId>
            <artifactId>mysql-cdc</artifactId>
            <version>2.2.0</version>
            <scope>system</scope>
            <systemPath>${project.basedir}/lib/flink-connector-mysql-cdc-2.3-SNAPSHOT.jar</systemPath>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <!-- Bundle the job and its dependencies into a fat jar -->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.3.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>