
1. 背景

笔者带领的团队在自研的大数据平台XSailboat 上进行专家策略规则的计算管道开发时,遇到这样一种情形:




2. 解决方案探索

2.1 方案1:Kafka源节点增加拉取延时



  • org.apache.flink.connector.kafka.source.KafkaSource(入口)
 // Public entry point for creating the source reader: forwards to the two-argument
 // createReader overload, passing a no-op lambda as the split-finished hook.
 // NOTE: abridged listing — the method's closing brace is not shown.
 public SourceReader<OUT, KafkaPartitionSplit> createReader(SourceReaderContext readerContext)
         throws Exception {
     return createReader(readerContext, (ignore) -> {});

 // Builds the KafkaSourceReader for this source.
 // NOTE: abridged listing — several statements and closing braces are elided, so the
 // fragments below do not form a complete method body.
 SourceReader<OUT, KafkaPartitionSplit> createReader(
         SourceReaderContext readerContext, Consumer<Collection<String>> splitFinishedHook)
         throws Exception {
     // Queue of fetched record batches; it is handed to the fetcher manager further down.
     FutureCompletingBlockingQueue<RecordsWithSplitIds<ConsumerRecord<byte[], byte[]>>>
             elementsQueue = new FutureCompletingBlockingQueue<>();
             // Initialization context for the deserializer (the statement consuming it is elided):
             // it exposes a "deserializer" metric sub-group and the user-code class loader,
             // both taken from the reader context.
             new DeserializationSchema.InitializationContext() {
                 public MetricGroup getMetricGroup() {
                     return readerContext.metricGroup().addGroup("deserializer");

                 public UserCodeClassLoader getUserCodeClassLoader() {
                     return readerContext.getUserCodeClassLoader();
     final KafkaSourceReaderMetrics kafkaSourceReaderMetrics =
             new KafkaSourceReaderMetrics(readerContext.metricGroup());

     // Supplier that creates a new KafkaPartitionSplitReader each time it is invoked.
     Supplier<KafkaPartitionSplitReader> splitReaderSupplier =
             () -> new KafkaPartitionSplitReader(props, readerContext, kafkaSourceReaderMetrics);
     KafkaRecordEmitter<OUT> recordEmitter = new KafkaRecordEmitter<>(deserializationSchema);

     return new KafkaSourceReader<>(
// KafkaSourceFetcherManager is the manager that pulls the data; this is the part to focus on.
             new KafkaSourceFetcherManager(
                     elementsQueue, splitReaderSupplier::get, splitFinishedHook),
  • org.apache.flink.connector.kafka.source.reader.fetcher.KafkaSourceFetcherManager的祖先类:org.apache.flink.connector.base.source.reader.fetcher.SingleThreadFetcherManager
// SplitT is one of the parallelizable work items obtained by splitting the source per Kafka partition.
// NOTE: abridged listing — the else branch and the closing braces are not shown.
    public void addSplits(List<SplitT> splitsToAdd) {
        SplitFetcher<E, SplitT> fetcher = getRunningFetcher();
        if (fetcher == null) {
// No fetcher is currently running, so a new one is created.
            fetcher = createSplitFetcher();
            // Add the splits to the fetchers.
// (elided in this listing) the fetch tasks are added to the fetcher here,
// (elided in this listing) then the fetcher is started; see startFetcher for the start code.
        } else {
  • org.apache.flink.connector.kafka.source.reader.fetcher.KafkaSourceFetcherManager的根类:org.apache.flink.connector.base.source.reader.fetcher.SplitFetcherManager
// NOTE: abridged listing — the body's statements and closing brace are not shown; only the
// author's remarks about the body remain.
protected void startFetcher(SplitFetcher<E, SplitT> fetcher) {
// (elided in this listing) the fetcher is submitted to the thread executor here.
// If the fetcher should wait n seconds before it starts pulling data, a timer could be used to
// submit it only after n seconds. The delay parameter n would have to be passed down layer by layer.
// For example:
// Executors.newScheduledThreadPool(1).schedule(()->executors.submit(fetcher) , n , TimeUnit.SECONDS) ;


2.2 方案2: 在合流的时候让数据流处理挂起等待配置流先处理



  • org.apache.flink.streaming.api.functions.co.CoProcessFunction
public abstract class CoProcessFunction<IN1, IN2, OUT> extends AbstractRichFunction {

    private static final long serialVersionUID = 1L;

     * This method is called for each element in the first of the connected streams.

This function can output zero or more elements using the {@link Collector} parameter and * also update internal state or set timers using the {@link Context} parameter. * * @param value The stream element * @param ctx A {@link Context} that allows querying the timestamp of the element, querying the * {@link TimeDomain} of the firing timer and getting a {@link TimerService} for registering * timers and querying the time. The context is only valid during the invocation of this * method, do not store it. * @param out The collector to emit resulting elements to * @throws Exception The function may throw exceptions which cause the streaming program to fail * and go into recovery. */ public abstract void processElement1(IN1 value, Context ctx, Collector<OUT> out) throws Exception; /** * This method is called for each element in the second of the connected streams. * *

This function can output zero or more elements using the {@link Collector} parameter and * also update internal state or set timers using the {@link Context} parameter. * * @param value The stream element * @param ctx A {@link Context} that allows querying the timestamp of the element, querying the * {@link TimeDomain} of the firing timer and getting a {@link TimerService} for registering * timers and querying the time. The context is only valid during the invocation of this * method, do not store it. * @param out The collector to emit resulting elements to * @throws Exception The function may throw exceptions which cause the streaming program to fail * and go into recovery. */ public abstract void processElement2(IN2 value, Context ctx, Collector<OUT> out) throws Exception; /** * Called when a timer set using {@link TimerService} fires. * * @param timestamp The timestamp of the firing timer. * @param ctx An {@link OnTimerContext} that allows querying the timestamp of the firing timer, * querying the {@link TimeDomain} of the firing timer and getting a {@link TimerService} * for registering timers and querying the time. The context is only valid during the * invocation of this method, do not store it. * @param out The collector for returning result values. * @throws Exception This method may throw exceptions. Throwing an exception will cause the * operation to fail and may trigger recovery. */ public void onTimer(long timestamp, OnTimerContext ctx, Collector<OUT> out) throws Exception {} }



// Local demo job: two in-memory row streams are connected (one of them broadcast) and handled by a
// CoProcessFunction; presumably used to observe, via breakpoints, which input gets processed when.
// NOTE: abridged listing — braces and method bodies are not shown.
public class Test1
	// Appends 100 rows with fields f0, f1, f2 to aInputs and returns the matching row type info
	// (STRING, INT, STRING).
	static ERowTypeInfo initInputs(List<Row> aInputs)
		for(int i=0 ; i<100 ; i++)
			Row row_0 = Row.withNames() ;
			// (int)(Math.random()*1) always evaluates to 0, so f0 is always "k0".
			row_0.setField("f0" , "k" + (int)(Math.random()*1)) ;
			row_0.setField("f1" , i) ;
			row_0.setField("f2" , "c"+i) ;
			aInputs.add(row_0) ;
		return new ERowTypeInfo(new TypeInformation[] {
				BasicTypeInfo.STRING_TYPE_INFO ,
				BasicTypeInfo.INT_TYPE_INFO ,
				BasicTypeInfo.STRING_TYPE_INFO ,
		} , new String[] {"f0" , "f1" , "f2"}) ;

	// Builds both inputs with initInputs, runs the job with parallelism 2 and prints the output.
	public static void main(String[] args) throws Exception
		StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment() ;
		List<Row> inputs1 = CS.arrayList() ;
		List<Row> inputs2 = CS.arrayList() ;
		ERowTypeInfo rowTypeInfo1 = initInputs(inputs1) ;
		ERowTypeInfo rowTypeInfo2 = initInputs(inputs2) ;
		streamEnv.setParallelism(2) ;
		DataStreamSource<Row> ds1 = streamEnv.fromCollection(inputs1 , rowTypeInfo1) ;
		DataStreamSource<Row> ds2 = streamEnv.fromCollection(inputs2 , rowTypeInfo2) ;
		// ds2 is the stream connect() is called on; ds1 is broadcast and passed as the other input.
		ds2.connect(ds1.broadcast()).process(new _CoProcessFunction()).print() ;
		streamEnv.execute() ;
	// Co-process function for the two connected Row streams; its method bodies are not shown here.
	static class _CoProcessFunction extends CoProcessFunction<Row, Row, Row>{

		private static final long serialVersionUID = 1L;
		public _CoProcessFunction()

		public void processElement1(Row aArg0, CoProcessFunction<Row, Row, Row>.Context aArg1, Collector<Row> aArg2)
				throws Exception
		// Set breakpoint 1 here.

		public void processElement2(Row aArg0, CoProcessFunction<Row, Row, Row>.Context aArg1, Collector<Row> aArg2)
				throws Exception
		// Set breakpoint 2 here.




  • org.apache.flink.streaming.runtime.io.StreamMultipleInputProcessor
// Selects one input, lets that input's processor handle it, and returns the updated status.
// NOTE: abridged listing — closing braces and some statements are not shown.
public DataInputStatus processInput() throws Exception {
	// Index of the input to read from: the first stream or the second one.
    int readingInputIndex;
    if (isPrepared) {
        readingInputIndex = selectNextReadingInputIndex();
    } else {
        // the preparations here are not placed in the constructor because all work in it
        // must be executed after all operators are opened.
        readingInputIndex = selectFirstReadingInputIndex();
    // Nothing to read right now: report that no input is available.
    if (readingInputIndex == InputSelection.NONE_AVAILABLE) {
        return DataInputStatus.NOTHING_AVAILABLE;

    lastReadInputIndex = readingInputIndex;
    // Hand over to the processor that belongs to the selected input.
    DataInputStatus inputStatus = inputProcessors[readingInputIndex].processInput();
    return inputSelectionHandler.updateStatusAndSelection(inputStatus, readingInputIndex);

2.3 方案3(可行): 在从Kafka中读取数据第1次反序列化的时候,加延时。



// Excerpt of the Kafka message deserializer: on the first read, when a positive read delay is
// configured, it logs the delay and clears the first-read flag.
// NOTE: abridged listing — braces, the actual wait and the return statement are not shown.
public Row deserialize(byte[] message) throws IOException
	if (mFirstRead && mDelayReadMs > 0)
		// Pass the configured delay so the "{}" placeholder in the message is actually filled in.
		mLogger.info("设置了读取延时({} ms),现在是第一次读取,延时等待。", mDelayReadMs);
		mFirstRead = false ;

