Source Code Analysis of StreamGraph, JobGraph, and ExecutionGraph Generation

Analysis of the StreamGraph generation functions

We start from the StreamGraphGenerator.generate() method and work our way down:

public static StreamGraph generate(StreamExecutionEnvironment env, List<StreamTransformation<?>> transformations) {
        return new StreamGraphGenerator(env).generateInternal(transformations);
    }

    // Note that the StreamGraph is generated starting from the sinks
    private StreamGraph generateInternal(List<StreamTransformation<?>> transformations) {
        for (StreamTransformation<?> transformation: transformations) {
            transform(transformation);
        }
        return streamGraph;
    }

    // The core logic of this method is to determine which kind of transformation was passed in
    // and run the corresponding handling; see the big if-else block below
    private Collection<Integer> transform(StreamTransformation<?> transform) {

        if (alreadyTransformed.containsKey(transform)) {
            return alreadyTransformed.get(transform);
        }

        LOG.debug("Transforming " + transform);

        if (transform.getMaxParallelism() <= 0) {

            // if the max parallelism hasn't been set, then first use the job wide max parallelism
            // from the ExecutionConfig.
            int globalMaxParallelismFromConfig = env.getConfig().getMaxParallelism();
            if (globalMaxParallelismFromConfig > 0) {
                transform.setMaxParallelism(globalMaxParallelismFromConfig);
            }
        }

        // call at least once to trigger exceptions about MissingTypeInfo
        transform.getOutputType();

        Collection<Integer> transformedIds;
        // Determine the concrete type of the transformation and dispatch to the corresponding handler.
        // In short, each handler recursively adds this node and its upstream nodes to the graph.
        if (transform instanceof OneInputTransformation<?, ?>) {
            transformedIds = transformOneInputTransform((OneInputTransformation<?, ?>) transform);
        } else if (transform instanceof TwoInputTransformation<?, ?, ?>) {
            transformedIds = transformTwoInputTransform((TwoInputTransformation<?, ?, ?>) transform);
        } else if (transform instanceof SourceTransformation<?>) {
            transformedIds = transformSource((SourceTransformation<?>) transform);
        } else if (transform instanceof SinkTransformation<?>) {
            transformedIds = transformSink((SinkTransformation<?>) transform);
        } else if (transform instanceof UnionTransformation<?>) {
            transformedIds = transformUnion((UnionTransformation<?>) transform);
        } else if (transform instanceof SplitTransformation<?>) {
            transformedIds = transformSplit((SplitTransformation<?>) transform);
        } else if (transform instanceof SelectTransformation<?>) {
            transformedIds = transformSelect((SelectTransformation<?>) transform);
        } else if (transform instanceof FeedbackTransformation<?>) {
            transformedIds = transformFeedback((FeedbackTransformation<?>) transform);
        } else if (transform instanceof CoFeedbackTransformation<?>) {
            transformedIds = transformCoFeedback((CoFeedbackTransformation<?>) transform);
        } else if (transform instanceof PartitionTransformation<?>) {
            transformedIds = transformPartition((PartitionTransformation<?>) transform);
        } else if (transform instanceof SideOutputTransformation<?>) {
            transformedIds = transformSideOutput((SideOutputTransformation<?>) transform);
        } else {
            throw new IllegalStateException("Unknown transformation: " + transform);
        }

        // Note: this mirrors the check at the beginning of the method; in a directed graph we must take care not to create cycles
        // need this check because the iterate transformation adds itself before
        // transforming the feedback edges
        if (!alreadyTransformed.containsKey(transform)) {
            alreadyTransformed.put(transform, transformedIds);
        }

        if (transform.getBufferTimeout() > 0) {
            streamGraph.setBufferTimeout(transform.getId(), transform.getBufferTimeout());
        }
        if (transform.getUid() != null) {
            streamGraph.setTransformationUID(transform.getId(), transform.getUid());
        }
        if (transform.getUserProvidedNodeHash() != null) {
            streamGraph.setTransformationUserHash(transform.getId(), transform.getUserProvidedNodeHash());
        }

        if (transform.getMinResources() != null && transform.getPreferredResources() != null) {
            streamGraph.setResources(transform.getId(), transform.getMinResources(), transform.getPreferredResources());
        }

        return transformedIds;
    }
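For context, it helps to see where the transformations list handled above comes from. The following is a minimal sketch of a user DataStream program (the class name and pipeline are made up for illustration): every map/filter/print call registers a StreamTransformation with the environment, and those transformations are exactly what generateInternal() loops over.

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class StreamGraphDemo {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Each call below registers a StreamTransformation with the environment:
        env.fromElements(1, 2, 3, 4)                 // SourceTransformation
            .map(i -> i * 2).returns(Types.INT)      // OneInputTransformation
            .filter(i -> i > 2)                      // OneInputTransformation
            .print();                                // SinkTransformation

        // getExecutionPlan() builds the StreamGraph through StreamGraphGenerator.generate(...)
        // and renders it as JSON; env.execute() would build the same graph before submitting the job.
        System.out.println(env.getExecutionPlan());
    }
}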

Since common operations such as map and filter use OneInputStreamOperator, let's take a look at the transformOneInputTransform((OneInputTransformation<?, ?>) transform) method.

private <IN, OUT> Collection<Integer> transformOneInputTransform(OneInputTransformation<IN, OUT> transform) {

        Collection<Integer> inputIds = transform(transform.getInput());

        // While recursing through the graph, this node may already have been handled via another downstream node; skip it in that case
        if (alreadyTransformed.containsKey(transform)) {
            return alreadyTransformed.get(transform);
        }

        // Determine the slotSharingGroup. The group defines which operators the operator we are currently
        // handling may be placed together with in one slot,
        // because sometimes we may not be happy with the grouping Flink chooses for us.
        // A slot is the basic container in which a task is executed.
        String slotSharingGroup = determineSlotSharingGroup(transform.getSlotSharingGroup(), inputIds);

        // Add the operator to the graph
        streamGraph.addOperator(transform.getId(),
                slotSharingGroup,
                transform.getOperator(),
                transform.getInputType(),
                transform.getOutputType(),
                transform.getName());
        
        // For a KeyedStream we also record its KeySelector.
        // Flink does not actually store a key for each keyed stream; instead, whenever the key is needed it is computed with the KeySelector.
        // A user-defined KeySelector must therefore be deterministic (idempotent);
        // we will come back to this when we introduce key groups.
        if (transform.getStateKeySelector() != null) {
            TypeSerializer<?> keySerializer = transform.getStateKeyType().createSerializer(env.getConfig());
            streamGraph.setOneInputStateKey(transform.getId(), transform.getStateKeySelector(), keySerializer);
        }

        streamGraph.setParallelism(transform.getId(), transform.getParallelism());
        streamGraph.setMaxParallelism(transform.getId(), transform.getMaxParallelism());
        
        // Create edges between this node and the nodes it depends on.
        // This is where the logical select/union/partition nodes mentioned earlier get folded into the edges.
        for (Integer inputId: inputIds) {
            streamGraph.addEdge(inputId, transform.getId(), 0);
        }

        return Collections.singleton(transform.getId());
    }
    
    public void addEdge(Integer upStreamVertexID, Integer downStreamVertexID, int typeNumber) {
        addEdgeInternal(upStreamVertexID,
                downStreamVertexID,
                typeNumber,
                null,
                new ArrayList<String>(),
                null);
    }
    // The implementation behind addEdge; it folds the virtual (logical) nodes into the edge
    private void addEdgeInternal(Integer upStreamVertexID,
            Integer downStreamVertexID,
            int typeNumber,
            StreamPartitioner<?> partitioner,
            List<String> outputNames,
            OutputTag outputTag) {
        // If the upstream node is a virtual side-output node, use the side-output node's input as the input of this edge and recurse
        if (virtualSideOutputNodes.containsKey(upStreamVertexID)) {
            int virtualId = upStreamVertexID;
            upStreamVertexID = virtualSideOutputNodes.get(virtualId).f0;
            if (outputTag == null) {
                outputTag = virtualSideOutputNodes.get(virtualId).f1;
            }
            addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, null, outputTag);
            // If the upstream node is a virtual select node, use the select node's input as the input of this edge
        } else if (virtualSelectNodes.containsKey(upStreamVertexID)) {
            int virtualId = upStreamVertexID;
            upStreamVertexID = virtualSelectNodes.get(virtualId).f0;
            if (outputNames.isEmpty()) {
                // selections that happen downstream override earlier selections
                outputNames = virtualSelectNodes.get(virtualId).f1;
            }
            addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, outputNames, outputTag);
            // If the upstream node is a virtual partition node
        } else if (virtualPartitionNodes.containsKey(upStreamVertexID)) {
            int virtualId = upStreamVertexID;
            upStreamVertexID = virtualPartitionNodes.get(virtualId).f0;
            if (partitioner == null) {
                partitioner = virtualPartitionNodes.get(virtualId).f1;
            }
            addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, outputNames, outputTag);
        } else {
        // Normal edge handling
            StreamNode upstreamNode = getStreamNode(upStreamVertexID);
            StreamNode downstreamNode = getStreamNode(downStreamVertexID);

            // If no partitioner was specified and the parallelism of upstream and downstream
            // operator matches use forward partitioning, use rebalance otherwise.
            if (partitioner == null && upstreamNode.getParallelism() == downstreamNode.getParallelism()) {
                partitioner = new ForwardPartitioner<Object>();
            } else if (partitioner == null) {
                partitioner = new RebalancePartitioner<Object>();
            }

            if (partitioner instanceof ForwardPartitioner) {
                if (upstreamNode.getParallelism() != downstreamNode.getParallelism()) {
                    throw new UnsupportedOperationException("Forward partitioning does not allow " +
                            "change of parallelism. Upstream operation: " + upstreamNode + " parallelism: " + upstreamNode.getParallelism() +
                            ", downstream operation: " + downstreamNode + " parallelism: " + downstreamNode.getParallelism() +
                            " You must use another partitioning strategy, such as broadcast, rebalance, shuffle or global.");
                }
            }

            StreamEdge edge = new StreamEdge(upstreamNode, downstreamNode, typeNumber, outputNames, partitioner, outputTag);

            getStreamNode(edge.getSourceId()).addOutEdge(edge);
            getStreamNode(edge.getTargetId()).addInEdge(edge);
        }
    }
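The forward/rebalance defaulting above is visible from ordinary user code. Below is a minimal sketch (the class name and pipeline are made up for illustration); the ship strategy printed in the execution plan should show REBALANCE on the 1-to-4 edge and FORWARD on the 4-to-4 edge:

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class PartitionerDefaultsDemo {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Non-parallel source: parallelism 1
        DataStream<Integer> src = env.fromElements(1, 2, 3);

        // 1 -> 4: parallelism differs, so addEdgeInternal falls back to a RebalancePartitioner
        DataStream<Integer> mapped = src.map(i -> i + 1).returns(Types.INT).setParallelism(4);

        // 4 -> 4: same parallelism and no explicit partitioner, so a ForwardPartitioner is chosen
        mapped.filter(i -> i > 1).setParallelism(4).print().setParallelism(4);

        // The chosen partitioners appear as the ship strategies of the StreamGraph edges
        System.out.println(env.getExecutionPlan());
    }
}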
 
 

Generating the JobGraph

Similar to the StreamGraph, the entry point for the JobGraph is StreamingJobGraphGenerator.createJobGraph(). Let's go straight to the source:

private JobGraph createJobGraph() {

        // Set the schedule mode so that all vertices are deployed eagerly right at the start
        jobGraph.setScheduleMode(ScheduleMode.EAGER);

        // Generate a deterministic hash id for every node
        Map<Integer, byte[]> hashes = defaultStreamGraphHasher.traverseStreamGraphAndGenerateHashes(streamGraph);

        // Legacy hashes, generated to stay compatible with older versions
        List<Map<Integer, byte[]>> legacyHashes = new ArrayList<>(legacyStreamGraphHashers.size());
        for (StreamGraphHasher hasher : legacyStreamGraphHashers) {
            legacyHashes.add(hasher.traverseStreamGraphAndGenerateHashes(streamGraph));
        }

        Map<Integer, List<Tuple2<byte[], byte[]>>> chainedOperatorHashes = new HashMap<>();
        // Create the JobVertices and chain operators together, etc.
        // Roughly: walk through the nodes one by one; if a node is the head of a chain, create a JobVertex for it;
        // if it is not a head, merge its configuration into the head node and connect the head to this node's output edges;
        // a node that cannot be chained is simply treated as a chain consisting only of a head node
        setChaining(hashes, legacyHashes, chainedOperatorHashes);
        // Set up the physical input edges
        setPhysicalEdges();
        // Set up the slot sharing groups
        setSlotSharing();
        // Configure checkpointing
        configureCheckpointing();

        // Re-register any cached files that were configured earlier
        for (Tuple2<String, DistributedCache.DistributedCacheEntry> e : streamGraph.getEnvironment().getCachedFiles()) {
            DistributedCache.writeFileInfoToConfig(e.f0, e.f1, jobGraph.getJobConfiguration());
        }

        // Pass on the execution environment's configuration (ExecutionConfig)
        try {
            jobGraph.setExecutionConfig(streamGraph.getExecutionConfig());
        }
        catch (IOException e) {
            throw new IllegalConfigurationException("Could not serialize the ExecutionConfig." +
                    "This indicates that non-serializable types (like custom serializers) were registered");
        }

        return jobGraph;
    }
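For completeness, this whole translation from StreamGraph to JobGraph can be triggered from client code. A minimal sketch follows (the class name and pipeline are illustrative; the API shown is the one from the 1.4/1.5 series):

import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.graph.StreamGraph;

public class JobGraphDemo {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);                       // keep the whole pipeline at parallelism 1
        env.fromElements("a", "b", "c").print();     // a trivial illustrative pipeline

        StreamGraph streamGraph = env.getStreamGraph();
        // getJobGraph() delegates to StreamingJobGraphGenerator.createJobGraph()
        JobGraph jobGraph = streamGraph.getJobGraph();
        for (JobVertex vertex : jobGraph.getVertices()) {
            // After chaining, a single JobVertex may wrap several chained operators
            System.out.println(vertex.getName() + " (parallelism " + vertex.getParallelism() + ")");
        }
    }
}

With this trivial pipeline, the source and the print sink end up chained into a single JobVertex, which is exactly the chaining logic discussed next.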

The operator chain logic

To execute programs more efficiently in a distributed setting, Flink chains operator subtasks together into tasks wherever possible, and each task is executed by a single thread. Chaining operators into tasks is a very effective optimization: it reduces thread-to-thread handover, message serialization/deserialization, and data exchanges through buffers, lowering latency while increasing overall throughput.


(Figure 1: an example of operator chaining)

In the figure above, the KeyAggregation and Sink operators are merged, because merging them does not change the overall topology. However, not just any two operators can be chained together; the conditions are fairly strict:

  • The parallelism of the upstream and downstream operators is the same
  • The in-degree of the downstream node is 1 (that is, the downstream node has no input from any other node)
  • The upstream and downstream nodes are in the same slot sharing group (slot groups are explained below)
  • The chaining strategy of the downstream node is ALWAYS (it may be chained with both its upstream and its downstream; map, flatMap, filter, etc. default to ALWAYS)
  • The chaining strategy of the upstream node is ALWAYS or HEAD (HEAD may only chain with its downstream, not with its upstream; sources default to HEAD)
  • The data partitioning between the two nodes is forward (see the discussion of stream partitioning)
  • The user has not disabled chaining
    Flink's chaining logic is a very common design; Spring's interceptors, for example, are implemented in a similar way. By fusing operators into one larger operator, Flink avoids the overhead of serializing records and sending them to other nodes over the network, which greatly improves efficiency. The chainability check itself is sketched below.
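These conditions map almost one-to-one onto the chainability check in StreamingJobGraphGenerator. The sketch below reconstructs that check from memory of the 1.4/1.5-era source; the exact accessor names (for example, how the source and target StreamNode of an edge are looked up, or isSameSlotSharingGroup) are assumptions that may differ slightly between Flink versions:

public static boolean isChainable(StreamEdge edge, StreamGraph streamGraph) {
    StreamNode upStreamVertex = streamGraph.getSourceVertex(edge);
    StreamNode downStreamVertex = streamGraph.getTargetVertex(edge);

    StreamOperator<?> headOperator = upStreamVertex.getOperator();
    StreamOperator<?> outOperator = downStreamVertex.getOperator();

    return downStreamVertex.getInEdges().size() == 1                           // downstream in-degree is 1
            && outOperator != null
            && headOperator != null
            && upStreamVertex.isSameSlotSharingGroup(downStreamVertex)         // same slot sharing group
            && outOperator.getChainingStrategy() == ChainingStrategy.ALWAYS    // downstream strategy is ALWAYS
            && (headOperator.getChainingStrategy() == ChainingStrategy.HEAD ||
                headOperator.getChainingStrategy() == ChainingStrategy.ALWAYS) // upstream is HEAD or ALWAYS
            && (edge.getPartitioner() instanceof ForwardPartitioner)           // forward partitioning
            && upStreamVertex.getParallelism() == downStreamVertex.getParallelism() // equal parallelism
            && streamGraph.isChainingEnabled();                                // chaining not disabled by the user
}

If any of these conditions fails, the two operators end up in separate JobVertices, and the record exchange between them goes through serialization instead of a simple method call.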

Submitting the JobGraph

As mentioned earlier, submitting the JobGraph relies on asynchronous communication between the JobClient and the JobManager, as shown in the figure:


(Figure 2: asynchronous communication between JobClient and JobManager)

In the submitJobAndWait method, the client first creates an ActorRef for a JobClientActor and then sends it a SubmitJobAndWait message, which hands the JobGraph instance over to the JobClientActor. The message is sent with the ask pattern, meaning that a reply is expected.

// Ask pattern: the client expects an answer and blocks (here with an "infinite" timeout) until it arrives
Future<Object> future = Patterns.ask(jobClientActor, new JobClientMessages.SubmitJobAndWait(jobGraph), new Timeout(AkkaUtils.INF_TIMEOUT()));
answer = Await.result(future, AkkaUtils.INF_TIMEOUT());
 
 

Once the JobClientActor has received the SubmitJobAndWait message, the actual submission is eventually triggered by a call to tryToSubmitJob. When the JobManager's actor receives the request from the client, it executes a submitJob method that mainly does the following:

  • Register the job with the BlobLibraryCacheManager;
  • Build the ExecutionGraph object;
  • Initialize every vertex in the JobGraph;
  • Topologically sort the DAG starting from the sources and attach the sorted vertex set to the ExecutionGraph object;
  • Read the checkpoint-related configuration and set it on the ExecutionGraph object;
  • Register the relevant listeners with the ExecutionGraph;
  • Either perform recovery, or write the JobGraph to the SubmittedJobGraphStore so it can be used for recovery later;
  • Reply to the client with a JobSubmitSuccess message;
  • Schedule the ExecutionGraph for execution;
    Finally, the JobManager sends a message back to the JobClient telling it whether the job was submitted successfully.

Generating the ExecutionGraph

Unlike the StreamGraph and the JobGraph, the ExecutionGraph is not generated in our client program but on the server side (at the JobManager); note that Flink maintains only a single (active) JobManager. The entry point is ExecutionGraphBuilder.buildGraph(...).
That method is over 200 lines long, more than half of which deals with checkpointing; we skip that for now and go straight to the core call, executionGraph.attachJobGraph(sortedTopology).
Since the ExecutionGraph essentially only transforms each individual JobGraph node without changing the overall topology, the code simply iterates over the JobVertices and processes them one by one:

for (JobVertex jobVertex : topologiallySorted) {

            if (jobVertex.isInputVertex() && !jobVertex.isStoppable()) {
                this.isStoppable = false;
            }

            // Here each node of the ExecutionGraph is created.
            // First a series of assignments hands the job information to the graph node being created, sets its parallelism, and so on.
            // Then the node's IntermediateResults are created; how many are created depends on the number of downstream nodes.
            // Finally, the ExecutionVertices that will actually run the tasks are created according to the configured parallelism.
            // If the job has input splits configured, they are assigned here as well.
            ExecutionJobVertex ejv = new ExecutionJobVertex(
                this,
                jobVertex,
                1,
                rpcCallTimeout,
                globalModVersion,
                createTimestamp);
            
            // Handle all of the JobEdges here.
            // For each edge, look up the corresponding IntermediateResult and record it as an input of this node.
            // Finally, each ExecutionVertex is connected to the corresponding IntermediateResult partitions.
            ejv.connectToPredecessors(this.intermediateResults);

            ExecutionJobVertex previousTask = this.tasks.putIfAbsent(jobVertex.getID(), ejv);
            if (previousTask != null) {
                throw new JobException(String.format("Encountered two job vertices with ID %s : previous=[%s] / new=[%s]",
                        jobVertex.getID(), ejv, previousTask));
            }

            for (IntermediateResult res : ejv.getProducedDataSets()) {
                IntermediateResult previousDataSet = this.intermediateResults.putIfAbsent(res.getId(), res);
                if (previousDataSet != null) {
                    throw new JobException(String.format("Encountered two intermediate data set with ID %s : previous=[%s] / new=[%s]",
                            res.getId(), res, previousDataSet));
                }
            }

            this.verticesInCreationOrder.add(ejv);
            this.numVerticesTotal += ejv.getParallelism();
            newExecJobVertices.add(ejv);
        }

At this point, the ExecutionGraph is complete.
