Aggregating each day's data with Flink

Use Flink to aggregate each day's data from Kafka, recomputing the result on every incoming record and persisting the result.

public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        Properties props = new Properties();
        props.setProperty("group.id", "test005");
        props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("enable.auto.commit", false);
        props.put("auto.offset.reset", "latest ");


        // Kafka bootstrap server list; it does not need to include every broker
        props.setProperty("bootstrap.servers", "192.168.0.45:9092,192.168.0.46:9092,192.168.0.47:9092");

        FlinkKafkaConsumer<String> kafkaConsumer = new FlinkKafkaConsumer<>("audit_alarm", new SimpleStringSchema(), props);

        DataStream<String> stream = env.addSource(kafkaConsumer);

        DataStream<DailyData> dailyData = stream.map(new MapFunction<String, DailyData>() {
            @Override
            public DailyData map(String value) throws Exception {
                // Parse the raw record; the parsing logic is omitted in the original post.
                // Assuming a JSON payload with "day", "message" and "timestamp" fields:
                cn.hutool.json.JSONObject json = JSONUtil.parseObj(value);
                String day = json.getStr("day");
                String message = json.getStr("message");
                long timestamp = json.getLong("timestamp");
                return new DailyData(day, message, timestamp);
            }
        }).assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<DailyData>(Time.seconds(1)) {
            @Override
            public long extractTimestamp(DailyData element) {
                return element.getTimestamp();
            }
        });

        DataStream<AlarmGatherResponse> dailyDataCount = dailyData
                .keyBy(DailyData::getKey)
                .window(TumblingEventTimeWindows.of(Time.days(1)))
                //.trigger(EventTimeTrigger.create()) // fires only once the watermark passes the window end
                .trigger(CountTrigger.of(1)) // fires a computation for every incoming record
                //.trigger(ContinuousProcessingTimeTrigger.of(Time.minutes(10))) // fires periodically by processing time
                .process(new DailyDataCountProcessProcess());

        // url, username and password are assumed to be defined elsewhere
        dailyDataCount.addSink(new MySQLSink(url, username, password));
        env.execute("Daily Data Count");
    }
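
The DailyData POJO and the DailyDataCountProcessProcess window function are referenced above but were not included in the original post. Below is a minimal, hypothetical sketch (two separate source files) of what they could look like, assuming the accessors used elsewhere in this post (getKey(), getTimestamp()) and an AlarmGatherResponse output with a setKey() setter:

import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

// Hypothetical sketch; the real classes are not shown in the original post.
public class DailyData {
    private String day;       // day the record belongs to
    private String message;   // raw alarm payload
    private long timestamp;   // event time in milliseconds

    public DailyData(String day, String message, long timestamp) {
        this.day = day;
        this.message = message;
        this.timestamp = timestamp;
    }

    public String getKey() { return day; }           // grouping key; assumed to be the day
    public long getTimestamp() { return timestamp; }
    public String getMessage() { return message; }
}

// Aggregates every record currently in the window into one AlarmGatherResponse.
public class DailyDataCountProcessProcess
        extends ProcessWindowFunction<DailyData, AlarmGatherResponse, String, TimeWindow> {
    @Override
    public void process(String key, Context context, Iterable<DailyData> elements,
                        Collector<AlarmGatherResponse> out) {
        AlarmGatherResponse response = new AlarmGatherResponse();
        response.setKey(key); // assumed setter matching the row.getKey() used in MySQLSink
        // ... fill the remaining fields (time range, src/dst lists, etc.) from elements
        out.collect(response);
    }
}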

I had originally been trying to fire the trigger based on time, and only later realized that because the environment uses event-time (EventTime) semantics, the trigger must be an EventTimeTrigger rather than a ProcessingTimeTrigger.

Changing .trigger(ProcessingTimeTrigger.create()) to .trigger(EventTimeTrigger.create()) therefore makes the trigger take effect.

With EventTimeTrigger, however, the window does not fire on every incoming record: it fires only once the watermark advances past the window's end time. To compute immediately on each record, consider a CountTrigger, e.g. .trigger(CountTrigger.of(1)), which fires whenever an element arrives. Note that this significantly increases computation and data-transfer overhead.

There is also a time-based trigger variant:

trigger(ContinuousProcessingTimeTrigger.of(Time.minutes(10)))

which sets how often the operator fires (here, every 10 minutes of processing time).
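
For example, swapping this trigger into the pipeline above trades per-record updates for periodic ones; a sketch reusing the same classes:

DataStream<AlarmGatherResponse> periodicCount = dailyData
        .keyBy(DailyData::getKey)
        .window(TumblingEventTimeWindows.of(Time.days(1)))
        // emit an updated aggregate every 10 minutes of processing time
        .trigger(ContinuousProcessingTimeTrigger.of(Time.minutes(10)))
        .process(new DailyDataCountProcessProcess());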

Addendum:

MySQLSink writes the results to MySQL as an upsert. Using the generated group key, it first attempts an UPDATE; if no row is affected, it performs an INSERT instead. This keeps each key's aggregated row unique within a day.
package utils;

import cn.hutool.json.JSONUtil;
import mode.AlarmGatherResponse;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.sql.Connection;
import java.sql.Date;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class MySQLSink extends RichSinkFunction<AlarmGatherResponse> {
    private Connection connection;
    private PreparedStatement insertStatement;
    private PreparedStatement updateStatement;

    private String jdbcUrl;
    private String username;
    private String password;

    public MySQLSink(String jdbcUrl, String username, String password) {
        this.jdbcUrl = jdbcUrl;
        this.username = username;
        this.password = password;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        connection = DriverManager.getConnection(jdbcUrl, username, password);
        // Create the insert and update PreparedStatements
        String insertSql = "insert into t_flink_dailyDataCount(uuid," +
                "start_time," +
                "end_time," +
                "src_list," +
                "dst_list," +
                "attacker_list," +
                "attacked_list," +
                "alarm_type," +
                "priority," +
                "attack_direct," +
                "attack_chain," +
                "update_time," +
                "related_uuid_list,group_key ) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?)";
        String updateSql = "UPDATE t_flink_dailyDataCount " +
                "SET end_time = ?," +
                "src_list = ?," +
                "dst_list = ?," +
                "attacker_list = ?," +
                "attacked_list = ?," +
                "alarm_type = ?," +
                "priority = ?," +
                "attack_direct = ?," +
                "attack_chain = ?," +
                "update_time = ? " +
                " WHERE group_key = ?";
        insertStatement = connection.prepareStatement(insertSql);
        updateStatement = connection.prepareStatement(updateSql);
    }

    @Override
    public void close() throws Exception {
        super.close();
        // Close the PreparedStatements and the connection
        if (insertStatement != null) {
            insertStatement.close();
        }
        if (updateStatement != null) {
            updateStatement.close();
        }
        if (connection != null) {
            connection.close();
        }
    }

    @Override
    public void invoke(AlarmGatherResponse row, Context context) throws Exception {
        // Try to update an existing row first
        updateStatement.setDate(1, row.getEndTime());
        updateStatement.setString(2, JSONUtil.toJsonStr(row.getSrcList()));
        updateStatement.setString(3, JSONUtil.toJsonStr(row.getDstList()));
        updateStatement.setString(4, JSONUtil.toJsonStr(row.getAttackerList()));
        updateStatement.setString(5, JSONUtil.toJsonStr(row.getAttackedList()));
        updateStatement.setString(6, row.getAlarmType());
        updateStatement.setString(7, row.getPriority());
        updateStatement.setString(8, row.getAttackDirect());
        updateStatement.setInt(9, row.getAttackChain()==null?0:row.getAttackChain());
        updateStatement.setTimestamp(10, new java.sql.Timestamp(System.currentTimeMillis()));
        updateStatement.setString(11, row.getKey());
        int updatedRows = updateStatement.executeUpdate();

        if (updatedRows == 0) {
            // If nothing was updated, insert a new row
            insertStatement.setString(1, row.getUuid());
            insertStatement.setDate(2, row.getStartTime());
            insertStatement.setDate(3, row.getEndTime());
            insertStatement.setString(4, JSONUtil.toJsonStr(row.getSrcList()));
            insertStatement.setString(5, JSONUtil.toJsonStr(row.getDstList()));
            insertStatement.setString(6, JSONUtil.toJsonStr(row.getAttackerList()));
            insertStatement.setString(7, JSONUtil.toJsonStr(row.getAttackedList()));
            insertStatement.setString(8, row.getAlarmType());
            insertStatement.setString(9, row.getPriority());
            insertStatement.setString(10, row.getAttackDirect());
            insertStatement.setInt(11, row.getAttackChain()==null?0:row.getAttackChain());
            insertStatement.setTimestamp(12, new java.sql.Timestamp(System.currentTimeMillis()));
            insertStatement.setString(13, JSONUtil.toJsonStr(row.getRelatedUuidList()));
            insertStatement.setString(14, row.getKey());
            insertStatement.executeUpdate();
        }
    }
}
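
As an aside, MySQL can collapse this update-then-insert pattern into a single statement with INSERT ... ON DUPLICATE KEY UPDATE, provided t_flink_dailyDataCount has a UNIQUE index on group_key. A sketch of that alternative (not what the original sink uses):

// Requires: ALTER TABLE t_flink_dailyDataCount ADD UNIQUE KEY uk_group_key (group_key);
String upsertSql = "insert into t_flink_dailyDataCount(uuid, start_time, end_time, src_list, dst_list, " +
        "attacker_list, attacked_list, alarm_type, priority, attack_direct, attack_chain, " +
        "update_time, related_uuid_list, group_key) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?) " +
        "on duplicate key update " +
        "end_time = values(end_time), src_list = values(src_list), dst_list = values(dst_list), " +
        "attacker_list = values(attacker_list), attacked_list = values(attacked_list), " +
        "alarm_type = values(alarm_type), priority = values(priority), " +
        "attack_direct = values(attack_direct), attack_chain = values(attack_chain), " +
        "update_time = values(update_time)";

This halves the database round trips and avoids the race where two parallel subtasks both see zero updated rows and insert duplicate rows for the same key.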
