Flink 1.11 引入了新版的 WatermarkStrategy API,用于为数据流分配时间戳并生成 watermark。这里先针对其中一种情况(在代码中指定事件时间语义、使用周期性水印、从数据中抽取事件时间字段)写一个简单的 demo 作为入门;关于新版事件时间和 watermark 的详细说明,有时间再补充。
下面代码的数据来自metricbeat,本机启动metricbeat将数据接入kafka,flink消费kafka中的数据,于是有了下面程序。
metricbeat中的数据为json格式,有一个时间字段名为@timestamp。
package it.kenn.eventtime;
import com.alibaba.fastjson.JSONObject;
import it.kenn.util.DateUtils;
import org.apache.flink.api.common.eventtime.*;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.time.Duration;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import java.util.Iterator;
import java.util.Properties;
/**
* 主要是event time、watermark的知识
*/
public class EventTimeDemo {
public static void main(String[] args) throws Exception {
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.setParallelism(6);
Properties properties = new Properties();
properties.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
properties.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "1test_34fldink182ddddd344356");
properties.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
properties.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
SingleOutputStreamOperator kafkaSource = env.addSource(new FlinkKafkaConsumer<>("metric-topic", new SimpleStringSchema(), properties)).map(JSONObject::parseObject);
kafkaSource
.assignTimestampsAndWatermarks(WatermarkStrategy
.forBoundedOutOfOrderness(Duration.ofSeconds(5))//水印策略
.withTimestampAssigner((record, ts) -> {
DateTimeFormatter pattern = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
// LocalDateTime parse = LocalDateTime.parse(record.getString("@timestamp"), pattern).plusHours(8);
// return parse.toInstant(ZoneOffset.of("+8")).toEpochMilli();
return DateUtils.parseStringToLong(record.getString("@timestamp"),pattern,8, ChronoUnit.HOURS);
})//解析事件时间
.withIdleness(Duration.ofMinutes(1))//对于很久不来的流(空闲流,即可能一段时间内某源没有流来数据)如何处置
)
.keyBy(new KeySelector() {
@Override
public String getKey(JSONObject record){
if (record.containsKey("process") && record.getJSONObject("process").containsKey("name")){
return record.getJSONObject("process").getString("name");
}else {
return "unknown-process";
}
}
})
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
//四个泛型分别是输入类型,输出类型,key和TimeWindow,这个process函数处理的数据是这个5s窗口中的所有数据
.process(new ProcessWindowFunction, String, TimeWindow>() {
@Override
public void process(String key, Context context, Iterable iterable, Collector> collector) throws Exception {
String time = null;
Long ts = 0L;
Iterator iterator = iterable.iterator();
if (iterator.hasNext()){
JSONObject next = iterator.next();
time = next.getString("@timestamp");
DateTimeFormatter pattern = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
// time = LocalDateTime.parse(time, pattern).plusHours(8).toString().replace("T"," ");
ts = DateUtils.parseStringToLong(time, pattern, 8, ChronoUnit.HOURS);
}
collector.collect(new Tuple2<>(key,ts));
}
})
.print();
// kafkaSource.print();
env.execute();
}
}
package it.kenn.util;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.temporal.TemporalUnit;
/**
 * Time utilities for parsing timestamp strings.
 *
 * <p>Strings are parsed into a {@link LocalDateTime} (no zone information). Conversions to epoch
 * milliseconds always interpret that local date-time at the fixed offset {@code +8}.
 *
 * @author kenn
 * 2020-11-25 23:10
 */
public final class DateUtils {

    /**
     * Default pattern used by the single-argument overloads. The trailing {@code 'Z'} is a quoted
     * literal character, not a zone designator.
     */
    private static final DateTimeFormatter DEFAULT_PATTERN =
            DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");

    /** Fixed offset at which parsed local date-times are converted to an instant. */
    private static final ZoneOffset ZONE = ZoneOffset.of("+8");

    private DateUtils() {
        // Static utility class; not meant to be instantiated.
    }

    /**
     * Parses {@code time}, shifts it by {@code offset} units, and returns the epoch milliseconds
     * of the shifted local date-time interpreted at offset {@code +8}.
     *
     * @param time    the text to parse
     * @param pattern the formatter describing {@code time}
     * @param offset  amount to add; negative values subtract, {@code 0} leaves the value unshifted
     * @param unit    unit of {@code offset}; only consulted when {@code offset} is non-zero, so it
     *                may be {@code null} when {@code offset} is {@code 0}
     * @return epoch milliseconds
     * @throws java.time.format.DateTimeParseException if {@code time} does not match {@code pattern}
     */
    public static Long parseStringToLong(String time, DateTimeFormatter pattern, int offset, TemporalUnit unit) {
        LocalDateTime dateTime = LocalDateTime.parse(time, pattern);
        if (offset != 0) {
            // Add the signed amount as a long. Negating via Math.abs would leave
            // Integer.MIN_VALUE negative and shift the value in the wrong direction.
            dateTime = dateTime.plus((long) offset, unit);
        }
        return dateTime.toInstant(ZONE).toEpochMilli();
    }

    /**
     * Parses {@code time} with {@code pattern} and returns its epoch milliseconds at offset
     * {@code +8}, without any shift.
     *
     * @param time    the text to parse
     * @param pattern the formatter describing {@code time}
     * @return epoch milliseconds
     */
    public static Long parseStringToLong(String time, DateTimeFormatter pattern) {
        return parseStringToLong(time, pattern, 0, null);
    }

    /**
     * Parses {@code time} with the default pattern {@code yyyy-MM-dd'T'HH:mm:ss.SSS'Z'} and
     * returns its epoch milliseconds at offset {@code +8}.
     *
     * @param time the text to parse
     * @return epoch milliseconds
     */
    public static Long parseStringToLong(String time) {
        return parseStringToLong(time, DEFAULT_PATTERN);
    }

    /**
     * Parses {@code time} with {@code pattern} into a {@link LocalDateTime}.
     *
     * @param time    the text to parse
     * @param pattern the formatter describing {@code time}
     * @return the parsed local date-time
     */
    public static LocalDateTime parseStringToDateTime(String time, DateTimeFormatter pattern) {
        return LocalDateTime.parse(time, pattern);
    }

    /**
     * Parses {@code time} with the default pattern {@code yyyy-MM-dd'T'HH:mm:ss.SSS'Z'} into a
     * {@link LocalDateTime}.
     *
     * @param time the text to parse
     * @return the parsed local date-time
     */
    public static LocalDateTime parseStringToDateTime(String time) {
        return parseStringToDateTime(time, DEFAULT_PATTERN);
    }
}
单就代码本身而言并不难,但我对新版 watermark 生成策略还有些疑问,等搞清楚了再补充。
例如下面这段代码:
// The code below assigns timestamps and watermarks to the data source. As shown, when the
// timestamp-assigner call is commented out, some programs do not fail while others do. Even when
// the program does not fail, it does not produce the result I want.
// This is why, in the sections above, I questioned whether assigning timestamps is necessary:
// even without a timestamp assigner the program seemed to run normally. I had not seen enough
// examples while learning; when calling [assignTimestampsAndWatermarks], both the timestamp and
// the watermark must be specified to get the desired behaviour.
// NOTE(review): the generic type arguments on the next lines appear to have been lost
// (a dangling `>` after SingleOutputStreamOperator and before forBoundedOutOfOrderness);
// restore them from the element type of ForJoinSource1 before compiling -- TODO confirm.
SingleOutputStreamOperator> source = env.addSource(new ForJoinSource1())
.assignTimestampsAndWatermarks(WatermarkStrategy
.>forBoundedOutOfOrderness(Duration.ofMillis(100))
// .withTimestampAssigner((e, ts) -> e._2())
);