For the single-parallelism source case, see https://blog.csdn.net/xu470438000/article/details/83271123
Conclusions:
### Window trigger conditions
1. watermark >= window_end_time
2. There is at least one record in [window_start_time, window_end_time) (see the arithmetic sketch below)
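To make the two conditions concrete, here is a minimal arithmetic sketch. It assumes the epoch-aligned 3-second tumbling windows and the 10-second bounded out-of-orderness used in the job further down; the class name is made up for this illustration.

public class WindowTriggerMath {
    public static void main(String[] args) {
        long ts = 1538359882000L;          // first record of the test data
        long windowSize = 3000L;           // TumblingEventTimeWindows.of(Time.seconds(3))
        long outOfOrderness = 10_000L;     // forBoundedOutOfOrderness(Duration.ofSeconds(10))

        // epoch-aligned tumbling window that the record falls into
        long windowStart = ts - (ts % windowSize);   // 1538359881000
        long windowEnd = windowStart + windowSize;   // 1538359884000

        // condition 1: the window fires once the watermark reaches window_end_time,
        // i.e. once some record with event time >= windowEnd + outOfOrderness has been seen
        long firstTriggeringEventTime = windowEnd + outOfOrderness; // 1538359894000, matching the table below

        System.out.printf("window = [%d, %d), fires after an event >= %d arrives%n",
                windowStart, windowEnd, firstTriggeringEventTime);
    }
}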
### With allowedLateness(Time.seconds(N)) set
1. The first firing happens when watermark >= window_end_time
2. Further firings happen each time late data for the window arrives while watermark < window_end_time + allowedLateness
With parallelism > 1, watermark alignment takes the minimum watermark across all input channels; the small sketch below illustrates the effect.
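A minimal sketch of that alignment rule, again assuming the 10-second bounded out-of-orderness; the per-channel values are hypothetical and only illustrate why the 3-parallelism run fires its first window later than the 1-parallelism run.

import java.util.Arrays;

public class WatermarkAlignment {
    public static void main(String[] args) {
        // per-channel watermark = (max event time seen on that channel) - 10s out-of-orderness
        long[] channelWatermarks = {
                1538359884000L,   // channel 0
                1538359882000L,   // channel 1
                1538359876000L    // channel 2
        };
        // the window operator's event-time clock is the minimum over its input channels,
        // so a window ending at 1538359884000 cannot fire until every channel has reached it
        long alignedWatermark = Arrays.stream(channelWatermarks).min().getAsLong();
        System.out.println("aligned watermark = " + alignedWatermark); // 1538359876000
    }
}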
The data is pasted out here so you don't have to type it in yourself; the parallelism-1 watermark derivation is covered in the blog above, so it is not repeated.
| data | watermark (parallelism = 1) | watermark (parallelism = 3) |
| --- | --- | --- |
| 0001,1538359882000 | 1538359872000 | |
| 0001,1538359886000 | 1538359876000 | |
| 0001,1538359892000 | 1538359882000 | 1538359872000 |
| 0001,1538359893000 | 1538359883000 | 1538359876000 |
| 0001,1538359894000 (first window firing at parallelism 1) | 1538359884000 | 1538359882000 |
| 0001,1538359896000 | | 1538359883000 |
| 0001,1538359897000 (first window firing at parallelism 3) | | 1538359884000 |
| 0001,1538359899000 | | |
| 0001,1538359891000 | | |
| 0001,1538359903000 | | |
## Test data for late arrivals (a replay sketch follows the list)
0001,1538359890000
0001,1538359903000
0001,1538359890000
0001,1538359891000
0001,1538359892000
0001,1538359904000
0001,1538359890000
0001,1538359891000
0001,1538359892000
0001,1538359905000
0001,1538359890000
0001,1538359891000
0001,1538359892000
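For convenience, a hedged sketch of a small producer that replays these lines into the topic the job consumes (kafka-clients is already in the pom below). The topic name and broker address are taken from the job; the input file name test-data.txt and the 1-second pause between records are assumptions for this illustration.

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Properties;

public class TestDataProducer {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "172.31.117.101:9092");
        props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // each line looks like "0001,1538359890000": key + event-time timestamp
            for (String line : Files.readAllLines(Paths.get("test-data.txt"))) {
                producer.send(new ProducerRecord<>("zjf_topic_003", line));
                producer.flush();
                Thread.sleep(1000); // pause so the watermark progression per record is easy to observe
            }
        }
    }
}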
The code is basically the same as the single-source version; only the source and the watermark generation were changed.
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.util.Collector;

import java.text.SimpleDateFormat;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;

public class StreamingWindowWatermark {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        FlinkKafkaConsumer010<String> kafkaConsumer = new FlinkKafkaConsumer010<>("zjf_topic_003", new SimpleStringSchema(), getKafkaProperties());
        SingleOutputStreamOperator<String> text = env.addSource(kafkaConsumer).uid("gateway-source").setParallelism(1);
        // override the source parallelism to 3 for the multi-parallelism test
        text.setParallelism(3);
        // use event time (the default is processing time)
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        // global parallelism = 3 (the default is the number of CPU cores on this machine)
        env.setParallelism(3);
        // the single-parallelism test read from a socket instead
        // DataStream<String> text = env.socketTextStream("172.31.120.110", 8999, "\n");
        // split each line into key + event-time timestamp
        SingleOutputStreamOperator<Tuple2<String, Long>> input = text.map(new MapFunction<String, Tuple2<String, Long>>() {
            @Override
            public Tuple2<String, Long> map(String value) throws Exception {
                String[] arr = value.split(",");
                return new Tuple2<>(arr[0], Long.parseLong(arr[1]));
            }
        });
        // generate watermarks
        SingleOutputStreamOperator<Tuple2<String, Long>> watermarks = input.assignTimestampsAndWatermarks(
                WatermarkStrategy.<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(10))
                        .withTimestampAssigner(new SerializableTimestampAssigner<Tuple2<String, Long>>() {
                            @Override
                            public long extractTimestamp(Tuple2<String, Long> element, long recordTimestamp) {
                                return element.f1; // the field that carries the event time
                            }
                        })
        );
        SingleOutputStreamOperator<String> window = watermarks.keyBy(0)
                .window(TumblingEventTimeWindows.of(Time.seconds(3))) // assign windows by event time; same effect as timeWindow(Time.seconds(3))
                // .allowedLateness(Time.seconds(2))
                .apply(new WindowFunction<Tuple2<String, Long>, String, Tuple, TimeWindow>() {
                    /**
                     * Sort the data inside the window to keep it in event-time order.
                     * @param tuple
                     * @param window
                     * @param input
                     * @param out
                     * @throws Exception
                     */
                    @Override
                    public void apply(Tuple tuple, TimeWindow window, Iterable<Tuple2<String, Long>> input, Collector<String> out) throws Exception {
                        String key = tuple.toString();
                        List<Long> arrarList = new ArrayList<Long>();
                        Iterator<Tuple2<String, Long>> it = input.iterator();
                        while (it.hasNext()) {
                            Tuple2<String, Long> next = it.next();
                            arrarList.add(next.f1);
                        }
                        Collections.sort(arrarList);
                        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
                        String result = key + "---------" + arrarList.size() + "-----------" + sdf.format(arrarList.get(0)) + "," + sdf.format(arrarList.get(arrarList.size() - 1))
                                + "," + sdf.format(window.getStart()) + "," + sdf.format(window.getEnd());
                        out.collect(result);
                    }
                });
        window.print();
        env.execute("watermark demo");
    }

    private static Properties getKafkaProperties() {
        Properties properties = new Properties();
        // 2020-01-09: switched to the VPC environment
        // properties.setProperty("bootstrap.servers", "172.21.164.59:9092,172.21.147.215:9092,172.21.243.86:9092");
        properties.setProperty("bootstrap.servers", "172.31.117.101:9092");
        properties.setProperty("group.id", "flink_01");
        return properties;
    }
}
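One way to actually watch the per-subtask watermark values shown in the table above is to tap the stream right after the assigner. This is not part of the original job, just a hedged sketch to drop into main() after assignTimestampsAndWatermarks (it additionally needs org.apache.flink.streaming.api.functions.ProcessFunction); note that watermarks are emitted periodically, so the printed value may lag by the auto-watermark interval (200 ms by default).

// optional debugging aid: log the watermark each parallel subtask currently sees
watermarks.process(new ProcessFunction<Tuple2<String, Long>, String>() {
    @Override
    public void processElement(Tuple2<String, Long> value, Context ctx, Collector<String> out) {
        out.collect("event " + value.f1 + " -> current watermark " + ctx.timerService().currentWatermark());
    }
}).print();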
pom dependencies (groupId:artifactId:version):

org.apache.flink:flink-connector-jdbc_2.11:1.11.2
org.apache.commons:commons-dbcp2:2.1.1
com.alibaba.ververica:flink-format-changelog-json:1.0.0
com.alibaba.ververica:flink-connector-mysql-cdc:1.0.0
org.apache.flink:flink-cep_${scala.version}:${flink.version}
com.google.guava:guava:22.0
org.apache.flink:flink-scala_${scala.version}:${flink.version}
org.apache.flink:flink-clients_2.11:${flink.version}
org.scala-lang:scala-library:2.11.8
org.apache.logging.log4j:log4j-api-scala_2.11:11.0
org.apache.logging.log4j:log4j-core:2.8.2
org.apache.logging.log4j:log4j-api:2.8.2
org.apache.flink:flink-table-api-java-bridge_2.11:${flink.version}
org.apache.flink:flink-table-api-scala-bridge_2.11:${flink.version}
org.apache.flink:flink-table-common:${flink.version}
org.apache.flink:flink-table-planner_2.11:${flink.version}
org.apache.flink:flink-table-planner-blink_2.11:${flink.version}
org.apache.flink:flink-connector-kafka-0.11_2.11:${flink.version}
org.apache.flink:flink-connector-elasticsearch7_2.11:${flink.version}
org.projectlombok:lombok:1.18.10
org.apache.kafka:kafka-clients:2.3.0
org.apache.kafka:kafka_2.12:2.3.0
com.typesafe:config:1.2.1
com.alibaba:fastjson:1.2.47
mysql:mysql-connector-java:5.1.38
org.slf4j:slf4j-api:1.7.25
org.slf4j:slf4j-simple:1.7.25
com.alibaba:easyexcel:2.2.6
junit:junit:RELEASE
ru.ivi.opensource:flink-clickhouse-sink:1.1.0
    excluding org.apache.flink:flink-java, org.apache.flink:flink-core,
    org.apache.flink:flink-hadoop-fs, org.apache.flink:flink-streaming-java_2.11
org.apache.flink:flink-runtime-web_2.11:${flink.version}