假如我们需要基于 processing time 处理数据,使用 10 分钟的滚动窗口。伪代码如下:
// Pseudo-code: a 10-minute tumbling processing-time window. With the default trigger the
// whole window content is handed to process(...) once, when the window ends.
window(TumblingProcessingTimeWindows.of(Time.minutes(10)))
        .process(new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
            @Override
            public void process(String s, Context context, Iterable<Tuple2<String, Long>> elements, Collector<String> out) throws Exception {
                // Emit everything buffered in the window as a single record.
                out.collect(elements.toString());
            }
        });
// The sink therefore receives one (potentially very large) record per key per window.
s.addSink(new MySink<>("jieguo",true));
思考下上面的代码有什么问题:我们可以看到 window 窗口是十分钟,也就是 MySink 每次处理的是十分钟内的全部数据,如果数据流量过大,会对下游的写入造成很大的压力。我们很容易想到:如果有一种策略,使得数据达到一定条数时也触发窗口计算就好了。计算后的数据从窗口中移除,等窗口结束时再将剩余未参与计算的数据计算一次。这样窗口内的数据被分批计算,就可以解决下游压力过大的问题。此功能需要自定义 Trigger。
package com.test.demo.stream.自定义trigger;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.state.ReducingState;
import org.apache.flink.api.common.state.ReducingStateDescriptor;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.ProcessingTimeTrigger;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A trigger for processing-time windows that fires in batches.
 *
 * <p>The window is evaluated and purged as soon as {@code maxCount} elements have been
 * buffered since the last firing, and once more when the processing-time timer registered
 * for the end of the window fires, so that the remaining elements are emitted as well.
 *
 * @param <T> type of the elements in the window
 */
class ProcessTimeCountTrigger<T> extends Trigger<T, TimeWindow> {

    private static final long serialVersionUID = 1L;

    /** Number of buffered elements that causes an early firing. */
    private final int maxCount;

    /** Descriptor of the state that counts the elements seen since the last firing. */
    private final ReducingStateDescriptor<Long> countStateDescriptor =
            new ReducingStateDescriptor<>("counter", new Sum(), LongSerializer.INSTANCE);

    /**
     * @param maxCount number of elements after which the window is fired and purged
     */
    public ProcessTimeCountTrigger(int maxCount) {
        this.maxCount = maxCount;
    }

    /**
     * Counts the element and fires the window once {@code maxCount} elements are buffered.
     *
     * @return {@link TriggerResult#FIRE_AND_PURGE} when the count threshold is reached,
     *     otherwise {@link TriggerResult#CONTINUE}
     */
    @Override
    public TriggerResult onElement(T element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
        ReducingState<Long> countState = ctx.getPartitionedState(countStateDescriptor);
        countState.add(1L);
        if (countState.get() >= maxCount) {
            System.out.println("触发count计数, Fire");
            // Start counting from zero again for the next batch.
            countState.clear();
            // FIRE_AND_PURGE evaluates the window immediately and removes the elements that
            // took part in the evaluation, so no element is emitted twice.
            // Returning TriggerResult.FIRE instead would keep the elements in the window until
            // it ends, so every later firing would re-emit the earlier elements; try both and
            // compare the output. The built-in EventTimeTrigger uses FIRE so that late
            // elements can be re-evaluated together with the earlier ones to correct the
            // first result.
            return TriggerResult.FIRE_AND_PURGE;
        }
        // Register a timer for the end of the window; when it fires, onProcessingTime is
        // called and the elements still buffered are evaluated. According to the original
        // author's note, the window state is then removed by WindowOperator#clearAllState.
        ctx.registerProcessingTimeTimer(window.maxTimestamp());
        return TriggerResult.CONTINUE;
    }

    /**
     * Fires the window for the elements that are still buffered and resets the counter.
     */
    @Override
    public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        System.out.println("clicking..........");
        ctx.getPartitionedState(countStateDescriptor).clear();
        return TriggerResult.FIRE_AND_PURGE;
    }

    /** Event time is not used by this trigger. */
    @Override
    public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
        return TriggerResult.CONTINUE;
    }

    /**
     * Releases everything this trigger holds for the window: the counter state and the
     * processing-time timer registered in {@link #onElement}.
     */
    @Override
    public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
        System.out.println("CLEAR state");
        ctx.getPartitionedState(countStateDescriptor).clear();
        // onElement registers a processing-time timer, so that is the timer to delete here.
        ctx.deleteProcessingTimeTimer(window.maxTimestamp());
    }

    /** Adds two partial counts. */
    static class Sum implements ReduceFunction<Long> {
        private static final long serialVersionUID = 1L;

        @Override
        public Long reduce(Long value1, Long value2) throws Exception {
            return value1 + value2;
        }
    }
}
package com.test.demo.stream.自定义trigger;
import cn.hutool.core.collection.IterUtil;
import com.test.demo.stream.wondow.MyDataSource;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.state.ReducingState;
import org.apache.flink.api.common.state.ReducingStateDescriptor;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
/**
 * Demo job for {@code ProcessTimeCountTrigger}: elements are keyed by their first field,
 * collected in 4-second tumbling processing-time windows, and each window is fired either
 * when 2 elements are buffered or when the window ends.
 */
class Main {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // The single input stream of (key, value) pairs.
        DataStream<Tuple2<String, Long>> stream = env.addSource(new MyDataSource());

        // No assignTimestampsAndWatermarks(...) call: the windows below are processing-time
        // windows, and the previous call discarded the stream it returned, so it had no
        // effect on this pipeline.
        SingleOutputStreamOperator<String> result = stream
                .keyBy(new KeySelector<Tuple2<String, Long>, String>() {
                    @Override
                    public String getKey(Tuple2<String, Long> value) throws Exception {
                        return value.f0;
                    }
                })
                .window(TumblingProcessingTimeWindows.of(Time.seconds(4)))
                .trigger(new ProcessTimeCountTrigger<>(2))
                .process(new ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow>() {
                    @Override
                    public void process(String key, Context context, Iterable<Tuple2<String, Long>> elements, Collector<String> out) throws Exception {
                        // Emit the elements of this firing as one record.
                        out.collect(elements.toString());
                    }
                });

        result.addSink(new PrintSinkFunction<>("jieguo", true));
        env.execute();
    }
}
先看EventTimeTrigger
package org.apache.flink.streaming.api.windowing.triggers;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
/**
* A {@link Trigger} that fires once the watermark passes the end of the window to which a pane
* belongs.
*
* @see org.apache.flink.streaming.api.watermark.Watermark
*/
@PublicEvolving
public class EventTimeTrigger extends Trigger<Object, TimeWindow> {
private static final long serialVersionUID = 1L;
// Private constructor: instances are obtained through create().
private EventTimeTrigger() {}
/**
* Called for every element added to the window. Fires immediately if the current watermark
* has already reached the end of the window; otherwise registers an event-time timer for the
* end of the window and keeps waiting.
*/
@Override
public TriggerResult onElement(
Object element, long timestamp, TimeWindow window, TriggerContext ctx)
throws Exception {
if (window.maxTimestamp() <= ctx.getCurrentWatermark()) {
// The watermark is already at or past the end of the window: evaluate the window once now.
return TriggerResult.FIRE;
} else {
// Register an event-time timer for the end of the window; onEventTime is invoked when it fires.
ctx.registerEventTimeTimer(window.maxTimestamp());
return TriggerResult.CONTINUE;
}
}
/**
* Called when a registered event-time timer fires. Fires only for the timer whose timestamp
* equals the end of the window.
*/
@Override
public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) {
// TriggerResult.FIRE evaluates the window contents once and passes the result downstream.
return time == window.maxTimestamp() ? TriggerResult.FIRE : TriggerResult.CONTINUE;
}
/** Processing time is ignored by this trigger. */
@Override
public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx)
throws Exception {
return TriggerResult.CONTINUE;
}
/** Removes the event-time timer registered for the end of the window. */
@Override
public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
ctx.deleteEventTimeTimer(window.maxTimestamp());
}
/** This trigger supports merging windows. */
@Override
public boolean canMerge() {
return true;
}
@Override
public void onMerge(TimeWindow window, OnMergeContext ctx) {
// only register a timer if the watermark is not yet past the end of the merged window
// this is in line with the logic in onElement(). If the watermark is past the end of
// the window onElement() will fire and setting a timer here would fire the window twice.
long windowMaxTimestamp = window.maxTimestamp();
if (windowMaxTimestamp > ctx.getCurrentWatermark()) {
ctx.registerEventTimeTimer(windowMaxTimestamp);
}
}
@Override
public String toString() {
return "EventTimeTrigger()";
}
/**
* Creates an event-time trigger that fires once the watermark passes the end of the window.
*
* Once the trigger fires all elements are discarded. Elements that arrive late immediately
* trigger window evaluation with just this one element.
*
* NOTE(review): onElement and onEventTime above return FIRE, not FIRE_AND_PURGE, so this
* trigger does not purge by itself; presumably the discarding described here is done by the
* window operator's cleanup - confirm.
*/
public static EventTimeTrigger create() {
return new EventTimeTrigger();
}
}
再看看 WindowOperator 源码中有以下几个核心片段:
// Excerpt: for an event-time window assigner, when the timer that fired carries the window's
// cleanup time, all state kept for that window is cleared.
if (windowAssigner.isEventTime()
&& isCleanupTime(triggerContext.window, timer.getTimestamp())) {
clearAllState(triggerContext.window, windowState, mergingWindows);
}
// Registers a cleanup timer for the window's data; when that timer fires, the isCleanupTime
// check below matches its timestamp. The timer is an event-time or a processing-time timer
// depending on the window assigner.
protected void registerCleanupTimer(W window) {
long cleanupTime = cleanupTime(window);
if (cleanupTime == Long.MAX_VALUE) {
// don't set a GC timer for "end of time"
return;
}
if (windowAssigner.isEventTime()) {
triggerContext.registerEventTimeTimer(cleanupTime);
} else {
triggerContext.registerProcessingTimeTimer(cleanupTime);
}
}
// Returns true if the given time equals the cleanup time of the given window.
protected final boolean isCleanupTime(W window, long time) {
// `time` is the timestamp of the timer that fired, e.g. the window.maxTimestamp() that a
// trigger registered through ctx.registerEventTimeTimer(window.maxTimestamp()).
return time == cleanupTime(window);
}
// Computes the time at which the window's state is cleaned up: for event time this is
// window.maxTimestamp() + allowedLateness, for processing time it is window.maxTimestamp().
private long cleanupTime(W window) {
if (windowAssigner.isEventTime()) {
long cleanupTime = window.maxTimestamp() + allowedLateness;
// A sum smaller than maxTimestamp() means the addition wrapped around (assuming
// allowedLateness is non-negative); Long.MAX_VALUE is returned in that case.
return cleanupTime >= window.maxTimestamp() ? cleanupTime : Long.MAX_VALUE;
} else {
return window.maxTimestamp();
}
}
可以看到当时间来到 window.maxTimestamp()的时候触发函数:onEventTime ,此时返回FIRE, 此时窗口中的所有数据参与一次计算。
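如果设置了 allowedLateness,则在 window.maxTimestamp() 时刻 isCleanupTime 为 false,不会清除窗口数据。随着时间往后推移,当迟到的数据到达时,onElement 中满足:
window.maxTimestamp() <= ctx.getCurrentWatermark(),此时返回 FIRE,窗口中的所有数据(包括之前已经计算过的数据)再参与一次计算。
直到时间到达 window.maxTimestamp() + allowedLateness,isCleanupTime 才为 true,此时会触发窗口状态清除。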
不明白的欢迎留言,有点小复杂。