StreamGraph、JobGraph、ExecutionGraph生成源码分析

StreamGraph生成函数分析

我们从StreamGraphGenerator.generate()方法往下看：

public static StreamGraph generate(StreamExecutionEnvironment env, List> transformations) {
        return new StreamGraphGenerator(env).generateInternal(transformations);
    }
    
    //注意，StreamGraph的生成是从sink开始的
    private StreamGraph generateInternal(List> transformations) {
        for (StreamTransformation transformation: transformations) {
            transform(transformation);
        }
        return streamGraph;
    }
    
    //这个方法的核心逻辑就是判断传入的steamOperator是哪种类型，并执行相应的操作，详情见下面那一大堆if-else
    private Collection transform(StreamTransformation transform) {

        if (alreadyTransformed.containsKey(transform)) {
            return alreadyTransformed.get(transform);
        }

        LOG.debug("Transforming " + transform);

        if (transform.getMaxParallelism() <= 0) {

            // if the max parallelism hasn't been set, then first use the job wide max parallelism
            // from theExecutionConfig.
            int globalMaxParallelismFromConfig = env.getConfig().getMaxParallelism();
            if (globalMaxParallelismFromConfig > 0) {
                transform.setMaxParallelism(globalMaxParallelismFromConfig);
            }
        }

        // call at least once to trigger exceptions about MissingTypeInfo
        transform.getOutputType();

        Collection transformedIds;
        //这里对操作符的类型进行判断，并以此调用相应的处理逻辑.简而言之，处理的核心无非是递归的将该节点和节点的上游节点加入图
        if (transform instanceof OneInputTransformation) {
            transformedIds = transformOneInputTransform((OneInputTransformation) transform);
        } else if (transform instanceof TwoInputTransformation) {
            transformedIds = transformTwoInputTransform((TwoInputTransformation) transform);
        } else if (transform instanceof SourceTransformation) {
            transformedIds = transformSource((SourceTransformation) transform);
        } else if (transform instanceof SinkTransformation) {
            transformedIds = transformSink((SinkTransformation) transform);
        } else if (transform instanceof UnionTransformation) {
            transformedIds = transformUnion((UnionTransformation) transform);
        } else if (transform instanceof SplitTransformation) {
            transformedIds = transformSplit((SplitTransformation) transform);
        } else if (transform instanceof SelectTransformation) {
            transformedIds = transformSelect((SelectTransformation) transform);
        } else if (transform instanceof FeedbackTransformation) {
            transformedIds = transformFeedback((FeedbackTransformation) transform);
        } else if (transform instanceof CoFeedbackTransformation) {
            transformedIds = transformCoFeedback((CoFeedbackTransformation) transform);
        } else if (transform instanceof PartitionTransformation) {
            transformedIds = transformPartition((PartitionTransformation) transform);
        } else if (transform instanceof SideOutputTransformation) {
            transformedIds = transformSideOutput((SideOutputTransformation) transform);
        } else {
            throw new IllegalStateException("Unknown transformation: " + transform);
        }

        //注意这里和函数开始时的方法相对应，在有向图中要注意避免循环的产生
        // need this check because the iterate transformation adds itself before
        // transforming the feedback edges
        if (!alreadyTransformed.containsKey(transform)) {
            alreadyTransformed.put(transform, transformedIds);
        }

        if (transform.getBufferTimeout() > 0) {
            streamGraph.setBufferTimeout(transform.getId(), transform.getBufferTimeout());
        }
        if (transform.getUid() != null) {
            streamGraph.setTransformationUID(transform.getId(), transform.getUid());
        }
        if (transform.getUserProvidedNodeHash() != null) {
            streamGraph.setTransformationUserHash(transform.getId(), transform.getUserProvidedNodeHash());
        }

        if (transform.getMinResources() != null && transform.getPreferredResources() != null) {
            streamGraph.setResources(transform.getId(), transform.getMinResources(), transform.getPreferredResources());
        }

        return transformedIds;
    }

因为map，filter等常用操作都是OneInputStreamOperator,我们就来看看transformOneInputTransform((OneInputTransformation) transform)方法。

private  Collection transformOneInputTransform(OneInputTransformation transform) {

        Collection inputIds = transform(transform.getInput());

        // 在递归处理节点过程中，某个节点可能已经被其他子节点先处理过了，需要跳过
        if (alreadyTransformed.containsKey(transform)) {
            return alreadyTransformed.get(transform);
        }

        //这里是获取slotSharingGroup。这个group用来定义当前我们在处理的这个操作符可以跟什么操作符chain到一个slot里进行操作
        //因为有时候我们可能不满意flink替我们做的chain聚合
        //一个slot就是一个执行task的基本容器
        String slotSharingGroup = determineSlotSharingGroup(transform.getSlotSharingGroup(), inputIds);

        //把该operator加入图
        streamGraph.addOperator(transform.getId(),
                slotSharingGroup,
                transform.getOperator(),
                transform.getInputType(),
                transform.getOutputType(),
                transform.getName());
        
        //对于keyedStream，我们还要记录它的keySelector方法
        //flink并不真正为每个keyedStream保存一个key，而是每次需要用到key的时候都使用keySelector方法进行计算
        //因此，我们自定义的keySelector方法需要保证幂等性
        //到后面介绍keyGroup的时候我们还会再次提到这一点
        if (transform.getStateKeySelector() != null) {
            TypeSerializer keySerializer = transform.getStateKeyType().createSerializer(env.getConfig());
            streamGraph.setOneInputStateKey(transform.getId(), transform.getStateKeySelector(), keySerializer);
        }

        streamGraph.setParallelism(transform.getId(), transform.getParallelism());
        streamGraph.setMaxParallelism(transform.getId(), transform.getMaxParallelism());
        
        //为当前节点和它的依赖节点建立边
        //这里可以看到之前提到的select union partition等逻辑节点被合并入edge的过程
        for (Integer inputId: inputIds) {
            streamGraph.addEdge(inputId, transform.getId(), 0);
        }

        return Collections.singleton(transform.getId());
    }
    
    public void addEdge(Integer upStreamVertexID, Integer downStreamVertexID, int typeNumber) {
        addEdgeInternal(upStreamVertexID,
                downStreamVertexID,
                typeNumber,
                null,
                new ArrayList(),
                null);

    }
    //addEdge的实现，会合并一些逻辑节点
    private void addEdgeInternal(Integer upStreamVertexID,
            Integer downStreamVertexID,
            int typeNumber,
            StreamPartitioner partitioner,
            List outputNames,
            OutputTag outputTag) {
        //如果输入边是侧输出节点，则把side的输入边作为本节点的输入边，并递归调用
        if (virtualSideOutputNodes.containsKey(upStreamVertexID)) {
            int virtualId = upStreamVertexID;
            upStreamVertexID = virtualSideOutputNodes.get(virtualId).f0;
            if (outputTag == null) {
                outputTag = virtualSideOutputNodes.get(virtualId).f1;
            }
            addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, null, outputTag);
            //如果输入边是select，则把select的输入边作为本节点的输入边
        } else if (virtualSelectNodes.containsKey(upStreamVertexID)) {
            int virtualId = upStreamVertexID;
            upStreamVertexID = virtualSelectNodes.get(virtualId).f0;
            if (outputNames.isEmpty()) {
                // selections that happen downstream override earlier selections
                outputNames = virtualSelectNodes.get(virtualId).f1;
            }
            addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, outputNames, outputTag);
            //如果是partition节点
        } else if (virtualPartitionNodes.containsKey(upStreamVertexID)) {
            int virtualId = upStreamVertexID;
            upStreamVertexID = virtualPartitionNodes.get(virtualId).f0;
            if (partitioner == null) {
                partitioner = virtualPartitionNodes.get(virtualId).f1;
            }
            addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, outputNames, outputTag);
        } else {
        //正常的edge处理逻辑
            StreamNode upstreamNode = getStreamNode(upStreamVertexID);
            StreamNode downstreamNode = getStreamNode(downStreamVertexID);

            // If no partitioner was specified and the parallelism of upstream and downstream
            // operator matches use forward partitioning, use rebalance otherwise.
            if (partitioner == null && upstreamNode.getParallelism() == downstreamNode.getParallelism()) {
                partitioner = new ForwardPartitioner

StreamGraph、JobGraph、ExecutionGraph生成源码分析

StreamGraph生成函数分析

JobGraph的生成

operator chain的逻辑

JobGraph的提交

ExecutionGraph的生成

你可能感兴趣的:(StreamGraph、JobGraph、ExecutionGraph生成源码分析)