Flink 1.11 Event Time, Watermark, and WatermarkStrategy: A Detailed Example

Preface

Flink 1.11 introduces a new API for assigning watermarks: WatermarkStrategy. This post covers one common case (event-time semantics enabled in code, periodic watermarks, and the event-time field extracted from the data itself) with a simple demo as an introduction. A detailed write-up of the new event-time and watermark API will follow when time permits.
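Before the full demo, here is a minimal sketch of the shape of the new API; MyEvent and its getEventTimeMillis() are placeholders, not classes from the demo:

import java.time.Duration;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.DataStream;

// assign event timestamps and periodic watermarks to a stream of MyEvent records
static DataStream<MyEvent> withEventTime(DataStream<MyEvent> stream) {
    return stream.assignTimestampsAndWatermarks(
            WatermarkStrategy
                    // built-in periodic strategy for streams that are out of order
                    // by at most 5 seconds; forMonotonousTimestamps() is the variant
                    // for streams whose timestamps only ever increase
                    .<MyEvent>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                    // tell Flink where the event time lives inside each record
                    .withTimestampAssigner((event, recordTs) -> event.getEventTimeMillis()));
}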

The data in the code below comes from Metricbeat: Metricbeat runs on the local machine and ships its data into Kafka, and Flink consumes that Kafka topic, which is how the program below came about.

Each Metricbeat record is JSON and carries a time field named @timestamp.
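A trimmed, hypothetical record (showing only the fields this demo touches) might look like:

{"@timestamp":"2020-11-25T15:10:00.123Z","process":{"name":"metricbeat"}}

The @timestamp format is what the yyyy-MM-dd'T'HH:mm:ss.SSS'Z' pattern in the code below parses.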

Code

package it.kenn.eventtime;

import com.alibaba.fastjson.JSONObject;
import it.kenn.util.DateUtils;
import org.apache.flink.api.common.eventtime.*;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.StringDeserializer;

import java.time.Duration;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import java.util.Iterator;
import java.util.Properties;


/**
 * Demo of event time and watermarks in Flink 1.11.
 */
public class EventTimeDemo {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);// event-time semantics (processing time is still the default in Flink 1.11)
        env.setParallelism(6);
        Properties properties = new Properties();
        properties.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        properties.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "1test_34fldink182ddddd344356");
        properties.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        properties.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

        SingleOutputStreamOperator<JSONObject> kafkaSource = env.addSource(new FlinkKafkaConsumer<>("metric-topic", new SimpleStringSchema(), properties)).map(JSONObject::parseObject);

        kafkaSource
                .assignTimestampsAndWatermarks(WatermarkStrategy
                        // periodic watermark strategy tolerating up to 5s of out-of-orderness
                        .<JSONObject>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                        .withTimestampAssigner((record, ts) -> {
                            DateTimeFormatter pattern = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
//                            LocalDateTime parse = LocalDateTime.parse(record.getString("@timestamp"), pattern).plusHours(8);
//                            return parse.toInstant(ZoneOffset.of("+8")).toEpochMilli();
                            return DateUtils.parseStringToLong(record.getString("@timestamp"), pattern, 8, ChronoUnit.HOURS);
                        })// extract the event-time field from each record
                        .withIdleness(Duration.ofMinutes(1))// mark a source partition idle after 1 minute without data, so an idle stream does not hold back the watermark
                )
                .keyBy(new KeySelector<JSONObject, String>() {
                    @Override
                    public String getKey(JSONObject record){
                        if (record.containsKey("process") && record.getJSONObject("process").containsKey("name")){
                            return record.getJSONObject("process").getString("name");
                        }else {
                            return "unknown-process";
                        }
                    }
                })
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                // the four type parameters are the input type, the output type, the key type, and the window type;
                // this process function sees every record that landed in one 5s window
                .process(new ProcessWindowFunction<JSONObject, Tuple2<String, Long>, String, TimeWindow>() {
                    @Override
                    public void process(String key, Context context, Iterable<JSONObject> iterable, Collector<Tuple2<String, Long>> collector) throws Exception {
                        String time = null;
                        Long ts = 0L;
                        Iterator<JSONObject> iterator = iterable.iterator();
                        if (iterator.hasNext()){
                            JSONObject next = iterator.next();
                            time = next.getString("@timestamp");
                            DateTimeFormatter pattern = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
//                            time = LocalDateTime.parse(time, pattern).plusHours(8).toString().replace("T"," ");
                            ts = DateUtils.parseStringToLong(time, pattern, 8, ChronoUnit.HOURS);
                        }
                        collector.collect(new Tuple2<>(key,ts));
                    }
                })
                .print();
//        kafkaSource.print();
        env.execute();
    }
}
The DateUtils helper class used above:

package it.kenn.util;

import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.temporal.TemporalUnit;

/**
 * Date/time utility class.
 *
 * @author kenn
 * 2020-11-25 23:10
 */
public final class DateUtils {

    public static Long parseStringToLong(String time, DateTimeFormatter pattern, int offset, TemporalUnit unit) {
//        DateTimeFormatter pattern = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
        LocalDateTime dateTime = null;
        if (offset > 0){
            dateTime = LocalDateTime.parse(time, pattern).plus(offset, unit);
        }else if (offset < 0){
            dateTime = LocalDateTime.parse(time, pattern).minus(Math.abs(offset), unit);
        }else {
            dateTime = LocalDateTime.parse(time, pattern);
        }
        // interpret the (possibly shifted) local time as UTC+8 when converting to epoch millis
        return dateTime.toInstant(ZoneOffset.of("+8")).toEpochMilli();
    }

    public static Long parseStringToLong(String time, DateTimeFormatter pattern) {
        return parseStringToLong(time, pattern, 0, null);
    }

    public static Long parseStringToLong(String time) {
        return parseStringToLong(time, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"));
    }

    public static LocalDateTime parseStringToDateTime(String time, DateTimeFormatter pattern) {
        return LocalDateTime.parse(time, pattern);
    }

    public static LocalDateTime parseStringToDateTime(String time) {
        return parseStringToDateTime(time, DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"));
    }
}
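A quick sanity check of the helper (a hypothetical usage sketch, with the expected value worked out by hand rather than taken from program output): adding 8 hours to the parsed local time and then converting at UTC+8 cancels out, so the result equals the epoch timestamp the original UTC string denotes.

DateTimeFormatter pattern = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
// "2020-11-25T15:10:00.123Z" is parsed as local 15:10, shifted to 23:10,
// then read as UTC+8, i.e. 15:10 UTC again: 1606317000123L
Long ts = DateUtils.parseStringToLong("2020-11-25T15:10:00.123Z", pattern, 8, ChronoUnit.HOURS);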

The code itself is not particularly difficult, but I still have some doubts about the new watermark-generation strategy. I will add more once I have figured it out.

Update, 2020-12-19

Consider the following code:

// The code below assigns timestamps and watermarks to a data source. As shown, if the
// timestamp-assigner line is commented out, some programs fail while others run without
// errors; even when no error is raised, the job does not produce the results I want.
// This is why the earlier sections questioned whether assigning timestamps is necessary:
// the program seemed to run fine without it, simply because I had seen too few examples.
// To get correct event-time behavior out of assignTimestampsAndWatermarks you must supply
// both the timestamp assigner and the watermark strategy. (Without an assigner, Flink falls
// back to the timestamp already attached to each record, e.g. by the Kafka source, which is
// why some jobs appear to work; a plain custom source attaches no usable timestamp.)
SingleOutputStreamOperator<Tuple2<String, Long>> source = env.addSource(new ForJoinSource1())
                .assignTimestampsAndWatermarks(WatermarkStrategy
                        .<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofMillis(100))
//                        .withTimestampAssigner((e, ts) -> e._2())
                );
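For reference, a version of this snippet that behaves as intended simply keeps the assigner enabled (ForJoinSource1 is the custom source used here, assumed to emit Tuple2<String, Long> with the event time in the second field):

SingleOutputStreamOperator<Tuple2<String, Long>> source = env.addSource(new ForJoinSource1())
        .assignTimestampsAndWatermarks(WatermarkStrategy
                .<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofMillis(100))
                // without this line the custom source leaves records without timestamps,
                // so event-time windows downstream cannot work correctly
                .withTimestampAssigner((e, ts) -> e._2()));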
