在《0基础学习PyFlink——个数滚动窗口(Tumbling Count Windows)》和《0基础学习PyFlink——个数滑动窗口(Sliding Count Windows)》中,我们发现如果窗口中元素个数没有把窗口填满,则不会触发计算。
为了解决长期不计算的问题,我们引入了《0基础学习PyFlink——时间滚动窗口(Tumbling Time Windows)》和《0基础学习PyFlink——时间滑动窗口(Sliding Time Windows)》中介绍的方案。但是这个方案引入了另外一个问题,就是每次处理同一批数据得到的结果可能不尽相同。这是因为它们使用了“处理时间”(Processing Time)来作为窗口划分的参考系,而每次程序的处理时间会根据当前负载情况有很大的不同。这样我们对同一批数据做处理时,可能会得出不同的Window切分方案。
于是我们引入《0基础学习PyFlink——事件时间和运行时间的窗口》方案。它可以使用源自数据本身的“事件时间”(Event Time)作为Time Window的参考系,这样在不同负载、不同时间,相同数据的时间参考系是一样的,进而可以得出一致的结果。
但是现实中,我们没法保证上述数据是按照上面的顺序到达Flink的。
比如下面这个例子,红色部分都是乱序的,那么Flink如何处理这些数据呢?
只有两种可能性:
这些既有别于Count Window,也有别于Time Window。这个时候就要引入水位线(watermark)技术来解决这个问题。
在详细讲解之前,我们需要明确一些基本知识:
import time
from pyflink.common import Duration, WatermarkStrategy, Time, Types
from pyflink.datastream.window import TumblingEventTimeWindows, TimeWindow, TumblingProcessingTimeWindows
from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream import StreamExecutionEnvironment,RuntimeExecutionMode, TimeCharacteristic
from pyflink.table import StreamTableEnvironment, TableDescriptor, Schema, DataTypes
from pyflink.datastream.functions import AllWindowFunction, ProcessFunction, ProcessAllWindowFunction, KeyedProcessFunction
from pyflink.table.expressions import lit, col
from pyflink.table.window import Tumble
from pyflink.common.time import Instant
from pyflink.table.udf import udf
from pyflink.common import Row
class WindowFunc(AllWindowFunction[tuple, tuple, TimeWindow]):
    """All-window function that logs every fired window and tags its elements.

    For each window it prints the window bounds together with the collected
    elements, then re-emits every element paired with those bounds.
    """

    def apply(self, window, inputs):
        """Print the fired window and yield one tuple per contained element.

        Args:
            window: The fired window; its ``start`` and ``end`` attributes are
                converted with ``Instant.of_epoch_milli``.
            inputs: The elements that were assigned to the window.

        Yields:
            ``(element, window_start, window_end)`` where both bounds are
            ``Instant`` objects.
        """
        banner = "**************************WindowFunc**************************"
        start_ms = window.start
        end_ms = window.end

        window_start = Instant.of_epoch_milli(start_ms)
        window_end = Instant.of_epoch_milli(end_ms)
        print(f"{banner}\nwindow: start:{window_start} end:{window_end} \ninputs: {inputs}\n{banner}")

        # Fresh Instant objects are built for every emitted tuple.
        for element in inputs:
            yield (
                element,
                Instant.of_epoch_milli(start_ms),
                Instant.of_epoch_milli(end_ms),
            )
class TimestampAssignerAdapter(TimestampAssigner):
    """Timestamp assigner that derives the event time from a record's second field."""

    def extract_timestamp(self, value, record_timestamp: int):
        """Return ``value[1]`` multiplied by 1000.

        Args:
            value: The record; only its element at index 1 is read.
            record_timestamp: Timestamp previously attached to the record
                (unused here).

        Returns:
            The scaled value, used as the record's event timestamp.
        """
        # Presumably value[1] is in seconds and the result is in
        # milliseconds -- confirm against the upstream schema.
        event_seconds = value[1]
        return event_seconds * 1000
class TimestampAssignerProcessFunctionAdapter(ProcessFunction):
    """Process function that logs each element with its time-related context.

    For every element it prints, and then emits, the element's timestamp, the
    current processing time and the current watermark as ``Instant`` objects.
    """

    def process_element(self, value, ctx: 'ProcessFunction.Context'):
        """Print a diagnostic block for ``value`` and yield it with its time info.

        Args:
            value: The incoming element.
            ctx: The process-function context supplying the element timestamp
                and the timer service.

        Yields:
            ``(value, timestamp, current_processing_time, current_watermark)``
            with the three time values wrapped in ``Instant``.
        """

        def read_times():
            # Take a fresh reading of timestamp / processing time / watermark.
            timers = ctx.timer_service()
            return (
                Instant.of_epoch_milli(ctx.timestamp()),
                Instant.of_epoch_milli(timers.current_processing_time()),
                Instant.of_epoch_milli(timers.current_watermark()),
            )

        header = "-----------------------TimestampAssignerProcessFunctionAdapter {}-----------------------"
        detail = "\nvalue: {} \ttimestamp: {} \tcurrent_processing_time: {} \tcurrent_watermark: {}"
        footer = "\n-----------------------TimestampAssignerProcessFunctionAdapter-----------------------"

        wall_clock_seconds = int(time.time())
        event_time, processing_time, watermark = read_times()
        print(
            header.format(wall_clock_seconds)
            + detail.format(value, event_time, processing_time, watermark)
            + footer
        )

        # The emitted tuple uses a second, independent reading of the clocks.
        yield (value, *read_times())
def gen_random_int_and_timestamp():
    """Build and run the out-of-order event-time window demo.

    Steps:
      1. Create a ``datagen`` table that emits the sequence ``in_ord`` = 0..10
         at one row per second.
      2. Map every ``in_ord`` to a shuffled ``calc_order`` (lookup value + 100)
         so the values used as event time arrive out of order.
      3. Convert the table to a DataStream and assign timestamps with
         ``TimestampAssignerAdapter`` plus a bounded-out-of-orderness watermark
         strategy with a 0 second bound.
      4. Log each element through ``TimestampAssignerProcessFunctionAdapter``
         and each 5-second tumbling event-time window through ``WindowFunc``.
      5. Execute the job.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    # env.set_runtime_mode(RuntimeExecutionMode.STREAMING)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env.set_parallelism(1)
    # Periodic watermark emission interval (milliseconds per the PyFlink API).
    env.get_config().set_auto_watermark_interval(2)
    table_env = StreamTableEnvironment.create(stream_execution_environment=env)

    first_ordinal = 0
    last_ordinal = 10
    rate_per_second = 1

    source_schema = Schema.new_builder().column('in_ord', DataTypes.INT()).build()

    connector_options = (
        ('fields.in_ord.kind', 'sequence'),
        ('fields.in_ord.start', str(first_ordinal)),
        ('fields.in_ord.end', str(last_ordinal)),
        ('rows-per-second', str(rate_per_second)),
    )
    descriptor_builder = TableDescriptor.for_connector('datagen').schema(source_schema)
    for option_key, option_value in connector_options:
        descriptor_builder = descriptor_builder.option(option_key, option_value)
    table_env.create_temporary_table('source', descriptor_builder.build())

    source_table = table_env.from_path('source')

    result_row_type = DataTypes.ROW([
        DataTypes.FIELD("in_ord", DataTypes.INT()),
        DataTypes.FIELD("calc_order", DataTypes.INT()),
    ])

    @udf(result_type=result_row_type, input_types=[DataTypes.INT()])
    def colFunc(oneCol):
        # Arrival ordinal -> shuffled value; this is what makes the stream out of order.
        ordinal_num_data_map = {0: 1, 1: 0, 2: 3, 3: 4, 4: 8, 5: 6, 6: 7, 7: 2, 8: 9, 9: 10, 10: 5}
        # Alternative, longer mapping:
        # ordinal_num_data_map = {0: 16, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9,
        #                         10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 0, 17: 17, 18: 18, 19: 19,
        #                         20: 20, 21: 121, 22: 122, 23: 123, 24: 124, 25: 125, 26: 126, 27: 127, 28: 128, 29: 129,}
        return Row(oneCol, ordinal_num_data_map[oneCol] + 100)

    mapped_table = source_table.map(colFunc(col('in_ord')))
    datastream = table_env.to_data_stream(mapped_table)

    ###############################################################################################
    # Variant 1: processing-time tumbling window.
    # datastream.window_all(TumblingProcessingTimeWindows.of(Time.milliseconds(10))) \
    #           .apply(WindowFunc())
    ###############################################################################################
    # Variant 2: event-time window with timestamps but without watermarks.
    # watermark_strategy = WatermarkStrategy.no_watermarks().with_timestamp_assigner(TimestampAssignerAdapter())
    # datastream_with_watermark=datastream.assign_timestamps_and_watermarks(watermark_strategy)
    # datastream_with_watermark.process(TimestampAssignerProcessFunctionAdapter())
    # datastream_with_watermark.window_all(TumblingEventTimeWindows.of(Time.milliseconds(10))) \
    #                             .apply(WindowFunc())
    ###############################################################################################
    # Variant 3 (active): bounded-out-of-orderness watermarks.
    # watermark_strategy = WatermarkStrategy.for_monotonous_timestamps().with_timestamp_assigner(TimestampAssignerAdapter())
    out_of_order_bound = Duration.of_seconds(0)
    strategy = WatermarkStrategy.for_bounded_out_of_orderness(out_of_order_bound)
    strategy = strategy.with_timestamp_assigner(TimestampAssignerAdapter())
    stamped_stream = datastream.assign_timestamps_and_watermarks(strategy)

    # Per-element diagnostics.
    stamped_stream.process(TimestampAssignerProcessFunctionAdapter())

    # Per-window diagnostics over 5-second tumbling event-time windows.
    windowed_stream = stamped_stream.window_all(TumblingEventTimeWindows.of(Time.seconds(5)))
    windowed_stream.apply(WindowFunc())
    ###############################################################################################

    env.execute()
# Script entry point: run the demo pipeline only when executed directly.
if __name__ == "__main__":
    gen_random_int_and_timestamp()
-----------------------TimestampAssignerProcessFunctionAdapter 1699856800-----------------------
value: Row(in_ord=0, calc_order=101) timestamp: Instant<101, 0> current_processing_time: Instant<1699856800, 705000000> current_watermark: Instant<-9223372036854776, 192000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
-----------------------TimestampAssignerProcessFunctionAdapter 1699856802-----------------------
value: Row(in_ord=1, calc_order=100) timestamp: Instant<100, 0> current_processing_time: Instant<1699856802, 700000000> current_watermark: Instant<100, 999000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
-----------------------TimestampAssignerProcessFunctionAdapter 1699856802-----------------------
value: Row(in_ord=2, calc_order=103) timestamp: Instant<103, 0> current_processing_time: Instant<1699856802, 702000000> current_watermark: Instant<100, 999000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
-----------------------TimestampAssignerProcessFunctionAdapter 1699856804-----------------------
value: Row(in_ord=3, calc_order=104) timestamp: Instant<104, 0> current_processing_time: Instant<1699856804, 700000000> current_watermark: Instant<102, 999000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
-----------------------TimestampAssignerProcessFunctionAdapter 1699856804-----------------------
value: Row(in_ord=4, calc_order=108) timestamp: Instant<108, 0> current_processing_time: Instant<1699856804, 709000000> current_watermark: Instant<102, 999000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
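**************************WindowFunc**************************
window: start:Instant<100, 0> end:Instant<105, 0>
inputs: [Row(in_ord=0, calc_order=101), Row(in_ord=1, calc_order=100), Row(in_ord=2, calc_order=103), Row(in_ord=3, calc_order=104)]
**************************WindowFunc**************************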
-----------------------TimestampAssignerProcessFunctionAdapter 1699856806-----------------------
value: Row(in_ord=5, calc_order=106) timestamp: Instant<106, 0> current_processing_time: Instant<1699856806, 701000000> current_watermark: Instant<107, 999000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
-----------------------TimestampAssignerProcessFunctionAdapter 1699856806-----------------------
value: Row(in_ord=6, calc_order=107) timestamp: Instant<107, 0> current_processing_time: Instant<1699856806, 705000000> current_watermark: Instant<107, 999000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
-----------------------TimestampAssignerProcessFunctionAdapter 1699856808-----------------------
value: Row(in_ord=7, calc_order=102) timestamp: Instant<102, 0> current_processing_time: Instant<1699856808, 700000000> current_watermark: Instant<107, 999000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
-----------------------TimestampAssignerProcessFunctionAdapter 1699856808-----------------------
value: Row(in_ord=8, calc_order=109) timestamp: Instant<109, 0> current_processing_time: Instant<1699856808, 701000000> current_watermark: Instant<107, 999000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
-----------------------TimestampAssignerProcessFunctionAdapter 1699856809-----------------------
value: Row(in_ord=9, calc_order=110) timestamp: Instant<110, 0> current_processing_time: Instant<1699856809, 440000000> current_watermark: Instant<108, 999000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
-----------------------TimestampAssignerProcessFunctionAdapter 1699856809-----------------------
value: Row(in_ord=10, calc_order=105) timestamp: Instant<105, 0> current_processing_time: Instant<1699856809, 441000000> current_watermark: Instant<108, 999000000>
-----------------------TimestampAssignerProcessFunctionAdapter-----------------------
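**************************WindowFunc**************************
window: start:Instant<105, 0> end:Instant<110, 0>
inputs: [Row(in_ord=4, calc_order=108), Row(in_ord=5, calc_order=106), Row(in_ord=6, calc_order=107), Row(in_ord=8, calc_order=109), Row(in_ord=10, calc_order=105)]
**************************WindowFunc**************************
**************************WindowFunc**************************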
window: start:Instant<110, 0> end:Instant<115, 0>
inputs: [Row(in_ord=9, calc_order=110)]