Flink 自定义trigger

自定义trigger的主要目的是为了等待数据到齐:

代码如下; flink版本1.6  

public class WatermarkTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", GlobalConstants.KAFKA_BROKER);
        properties.setProperty("group.id", "crm_stream_window");
        properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");
        DataStream stream =
                env.addSource(new FlinkKafkaConsumer011<>("test", new SimpleStringSchema(), properties));
        DataStream> inputMap = stream.map(new MapFunction>() {
            private static final long serialVersionUID = -8812094804806854937L;

            @Override
            public Tuple3 map(String value) throws Exception {
                return new Tuple3<>(value.split("\\W+")[0], Long.valueOf(value.split("\\W+")[1]), Integer.valueOf(value.split("\\W+")[2]));
            }
        });
        DataStream> watermark =
                inputMap.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks>() {

                    private static final long serialVersionUID = 8252616297345284790L;
                    Long currentMaxTimestamp = 0L;
                    Long maxOutOfOrderness = 2000L;//最大允许的乱序时间是10s
                    Watermark watermark = null;
                    SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");

                    @Nullable
                    @Override
                    public Watermark getCurrentWatermark() {
                        watermark = new Watermark(currentMaxTimestamp - maxOutOfOrderness);
                        return watermark;
                    }

                    @Override
                    public long extractTimestamp(Tuple3 element, long previousElementTimestamp) {
                        Long timestamp = element.f1;
                        currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp);
                        System.out.println("timestamp : " + element.f1 + "|" + format.format(element.f1) + " currentMaxTimestamp : " + currentMaxTimestamp + "|" + format.format(currentMaxTimestamp) + "," + " watermark : " + watermark.getTimestamp() + "|" + format.format(watermark.getTimestamp()));
                        return timestamp;
                    }
                });

        OutputTag> lateOutputTag = new OutputTag>("late-data") {
            private static final long serialVersionUID = -1552769100986888698L;
        };

        SingleOutputStreamOperator resultStream = watermark
                .keyBy(0)
                .window(TumblingEventTimeWindows.of(Time.seconds(3)))
                .trigger(new Trigger, TimeWindow>() {
                    private static final long serialVersionUID = 2742133264310093792L;
                    ValueStateDescriptor sumStateDescriptor = new ValueStateDescriptor("sum", Integer.class);

                    @Override
                    public TriggerResult onElement(Tuple3 element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
                        ValueState sumState = ctx.getPartitionedState(sumStateDescriptor);
                        if (null == sumState.value()) {
                            sumState.update(0);
                        }
                        sumState.update(element.f2 + sumState.value());
                        if (sumState.value() >= 2) {
                            //这里可以选择手动处理状态
                            //  默认的trigger发送是TriggerResult.FIRE 不会清除窗口数据
                            return TriggerResult.FIRE_AND_PURGE;
                        }
                        return TriggerResult.CONTINUE;
                    }

                    @Override
                    public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
                        return TriggerResult.CONTINUE;
                    }

                    @Override
                    public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
                        return TriggerResult.CONTINUE;
                    }

                    @Override
                    public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
                        System.out.println("清理窗口状态  窗口内保存值为" + ctx.getPartitionedState(sumStateDescriptor).value());
                        ctx.getPartitionedState(sumStateDescriptor).clear();
                    }
                })
                //如果使用allowedLateness会有重复计算的效果
                //默认的trigger情况下
                // 在event time>window_end_time+watermark+allowedLateness时会触发窗口的clear
                // 后续数据如果属于该窗口而且数据的event_time>watermark-allowedLateness 会触发重新计算
                //
                //在使用自定义的trigger情况下
                //同一个窗口内只要满足要求可以不停的触发窗口数据往下流
                //在event time>window_end_time+watermark+allowedLateness时会触发窗口clear
                //后续数据如果属于该窗口而且数据的event_time>watermark-allowedLateness 会触发重新计算
                //
                //窗口状态的clear只和时间有关与是否自定义trigger无关
                .allowedLateness(Time.seconds(3))
                .sideOutputLateData(lateOutputTag)
                .apply(new WindowFunction, String, Tuple, TimeWindow>() {
                    private static final long serialVersionUID = 7813420265419629362L;

                    @Override
                    public void apply(Tuple tuple, TimeWindow window, Iterable> input, Collector out) throws Exception {
                        for (Tuple3 stringLongTuple2 : input) {
                            System.out.println(stringLongTuple2.f1);
                        }
                        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
                        out.collect("window  " + format.format(window.getStart()) + "   window  " + format.format(window.getEnd()));
                    }
                });

        resultStream.print();
//        resultStream.getSideOutput(lateOutputTag).print();
        env.execute("window test");
    }

比较了自定义trigger和默认的trigger在event time的前提下,watermark和allowedLateness对trigger的影响。

默认trigger加allowedLateness: 会导致窗口原来的数据也会触发

timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000
1461756862000
1461756863000
8> window  2016-04-27 19:34:21.000   window  2016-04-27 19:34:24.000
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756864000|2016-04-27 19:34:24.000
1461756862000
1461756863000
1461756862000
8> window  2016-04-27 19:34:21.000   window  2016-04-27 19:34:24.000

自定义trigger加allowedLateness: 会将落后的数据直接往下发送

timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000
1461756862000
1461756863000
8> window  2016-04-27 19:34:21.000   window  2016-04-27 19:34:24.000
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756864000|2016-04-27 19:34:24.000
1461756862000
8> window  2016-04-27 19:34:21.000   window  2016-04-27 19:34:24.000

可以发现两者的不同:默认的trigger会将窗口中之前的数据一起发出,而自定义的trigger不会重发之前的数据,只把落后的数据单独往后发送,从而避免了数据重复的问题。原因在于两者触发时返回的结果不同:默认trigger返回FIRE(发送后不清除窗口数据),自定义trigger返回FIRE_AND_PURGE(发送后清除窗口数据)。

默认trigger加allowedLateness: 会导致窗口原来的数据也会触发

timestamp : 1461756861000|2016-04-27 19:34:21.000 currentMaxTimestamp : 1461756861000|2016-04-27 19:34:21.000, watermark : -2000|1970-01-01 07:59:58.000
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : 1461756859000|2016-04-27 19:34:19.000
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000
timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756863000|2016-04-27 19:34:23.000
1461756861000
1461756862000
1461756863000
1461756862000
8> window  2016-04-27 19:34:21.000   window  2016-04-27 19:34:24.000
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000
timestamp : 1461756867000|2016-04-27 19:34:27.000 currentMaxTimestamp : 1461756867000|2016-04-27 19:34:27.000, watermark : 1461756864000|2016-04-27 19:34:24.000
timestamp : 1461756868000|2016-04-27 19:34:28.000 currentMaxTimestamp : 1461756868000|2016-04-27 19:34:28.000, watermark : 1461756865000|2016-04-27 19:34:25.000
timestamp : 1461756869000|2016-04-27 19:34:29.000 currentMaxTimestamp : 1461756869000|2016-04-27 19:34:29.000, watermark : 1461756866000|2016-04-27 19:34:26.000
清理窗口状态  窗口内保存值为4

默认trigger clear()的调用时间: 在29s的时候触发的默认的clear方法,默认执行的类名(EventTimeTrigger)

timestamp : 146175682000|1974-08-20 04:21:22.000 currentMaxTimestamp : 146175682000|1974-08-20 04:21:22.000, watermark : -2000|1970-01-01 07:59:58.000
timestamp : 146175683000|1974-08-20 04:21:23.000 currentMaxTimestamp : 146175683000|1974-08-20 04:21:23.000, watermark : 146175680000|1974-08-20 04:21:20.000
timestamp : 146175684000|1974-08-20 04:21:24.000 currentMaxTimestamp : 146175684000|1974-08-20 04:21:24.000, watermark : 146175681000|1974-08-20 04:21:21.000
timestamp : 146175685000|1974-08-20 04:21:25.000 currentMaxTimestamp : 146175685000|1974-08-20 04:21:25.000, watermark : 146175682000|1974-08-20 04:21:22.000
timestamp : 146175686000|1974-08-20 04:21:26.000 currentMaxTimestamp : 146175686000|1974-08-20 04:21:26.000, watermark : 146175683000|1974-08-20 04:21:23.000
146175682000
146175683000
8> window  1974-08-20 04:21:21.000   window  1974-08-20 04:21:24.000
timestamp : 146175687000|1974-08-20 04:21:27.000 currentMaxTimestamp : 146175687000|1974-08-20 04:21:27.000, watermark : 146175684000|1974-08-20 04:21:24.000
timestamp : 146175688000|1974-08-20 04:21:28.000 currentMaxTimestamp : 146175688000|1974-08-20 04:21:28.000, watermark : 146175685000|1974-08-20 04:21:25.000
timestamp : 146175689000|1974-08-20 04:21:29.000 currentMaxTimestamp : 146175689000|1974-08-20 04:21:29.000, watermark : 146175686000|1974-08-20 04:21:26.000

自定义trigger clear()的调用时间:

timestamp : 1461756862000|2016-04-27 19:34:22.000 currentMaxTimestamp : 1461756862000|2016-04-27 19:34:22.000, watermark : -2000|1970-01-01 07:59:58.000
timestamp : 1461756863000|2016-04-27 19:34:23.000 currentMaxTimestamp : 1461756863000|2016-04-27 19:34:23.000, watermark : 1461756860000|2016-04-27 19:34:20.000
1461756862000
1461756863000
timestamp : 1461756864000|2016-04-27 19:34:24.000 currentMaxTimestamp : 1461756864000|2016-04-27 19:34:24.000, watermark : 1461756861000|2016-04-27 19:34:21.000
timestamp : 1461756865000|2016-04-27 19:34:25.000 currentMaxTimestamp : 1461756865000|2016-04-27 19:34:25.000, watermark : 1461756862000|2016-04-27 19:34:22.000
timestamp : 1461756866000|2016-04-27 19:34:26.000 currentMaxTimestamp : 1461756866000|2016-04-27 19:34:26.000, watermark : 1461756863000|2016-04-27 19:34:23.000
timestamp : 1461756867000|2016-04-27 19:34:27.000 currentMaxTimestamp : 1461756867000|2016-04-27 19:34:27.000, watermark : 1461756864000|2016-04-27 19:34:24.000
timestamp : 1461756868000|2016-04-27 19:34:28.000 currentMaxTimestamp : 1461756868000|2016-04-27 19:34:28.000, watermark : 1461756865000|2016-04-27 19:34:25.000
timestamp : 1461756869000|2016-04-27 19:34:29.000 currentMaxTimestamp : 1461756869000|2016-04-27 19:34:29.000, watermark : 1461756866000|2016-04-27 19:34:26.000
清理窗口状态  窗口内保存值为2

通过自定义trigger和默认的trigger的比较,可以发现clear()方法的调用只和时间有关

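当watermark ≥ window_end_time - 1ms + allowedLateness时调用;换算成event time,即最大event time ≥ window_end_time + maxOutOfOrderness(watermark的延迟) + allowedLateness - 1ms。本例中为24s + 2s + 3s,所以在29s的数据到来之后触发。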

进入event time默认的trigger看看:

/**
 * Flink's default trigger for event-time windows: fires when the watermark reaches the
 * window's max timestamp, and fires immediately for elements that arrive after that.
 */
@PublicEvolving
public class EventTimeTrigger extends Trigger<Object, TimeWindow> {
	private static final long serialVersionUID = 1L;

	private EventTimeTrigger() {}

	@Override
	public TriggerResult onElement(Object element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
		if (window.maxTimestamp() <= ctx.getCurrentWatermark()) {
			// if the watermark is already past the window fire immediately
			return TriggerResult.FIRE;
		} else {
			// Register an event-time timer; when it fires, onEventTime() is invoked.
			ctx.registerEventTimeTimer(window.maxTimestamp());
			return TriggerResult.CONTINUE;
		}
	}

	@Override
	public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) {
		// When the timer for the window's max timestamp fires, the result is FIRE
		// (not FIRE_AND_PURGE): the window contents are only emitted downstream here.
		return time == window.maxTimestamp() ?
			TriggerResult.FIRE :
			TriggerResult.CONTINUE;
	}

	@Override
	public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception {
		return TriggerResult.CONTINUE;
	}

	@Override
	public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
		// Delete the event-time timer; this method does not touch window contents.
		ctx.deleteEventTimeTimer(window.maxTimestamp());
	}

	@Override
	public boolean canMerge() {
		return true;
	}

	@Override
	public void onMerge(TimeWindow window,
			OnMergeContext ctx) {
		// only register a timer if the watermark is not yet past the end of the merged window
		// this is in line with the logic in onElement(). If the watermark is past the end of
		// the window onElement() will fire and setting a timer here would fire the window twice.
		long windowMaxTimestamp = window.maxTimestamp();
		if (windowMaxTimestamp > ctx.getCurrentWatermark()) {
			ctx.registerEventTimeTimer(windowMaxTimestamp);
		}
	}

	@Override
	public String toString() {
		return "EventTimeTrigger()";
	}

	/**
	 * Creates an event-time trigger that fires once the watermark passes the end of the window.
	 *
	 * <p>Once the trigger fires all elements are discarded. Elements that arrive late immediately
	 * trigger window evaluation with just this one element.
	 */
	public static EventTimeTrigger create() {
		return new EventTimeTrigger();
	}
}

到此有些疑问窗口中元素的清除是在什么类中实现的?何时清除的?(自我理解:按理说是应该在调用clear()方法时清除窗口数据,因为此时窗口结束时间已经比watermark-allowedLateness小了)

努力吧,皮卡丘

你可能感兴趣的:(Flink)