09_Flink Streaming execute

The job is launched by calling env.execute(). The env (StreamExecutionEnvironment) has four concrete implementations; LocalStreamEnvironment, for example, simulates a Flink cluster with multiple threads inside the local JVM. All four implementations obtain the DAG object, a StreamGraph, through getStreamGraph().
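
To make this concrete, here is a minimal sketch against the Flink 1.x DataStream API (host, port and job name are placeholders): every call before execute() only records a transformation, and nothing runs until execute() is invoked.

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ExecuteDemo {
	public static void main(String[] args) throws Exception {
		// Standalone invocation: getExecutionEnvironment() falls back to a LocalStreamEnvironment,
		// i.e. a multi-threaded mini cluster inside this JVM.
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		env.socketTextStream("localhost", 9999)        // placeholder source
			.map(new MapFunction<String, String>() {   // each call only records a transformation
				@Override
				public String map(String value) {
					return value.toUpperCase();
				}
			})
			.print();                                  // sink

		// Only here is the StreamGraph built from the recorded transformations and the job started.
		env.execute("execute-demo");
	}
}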

The StreamGraph stores the node/edge relationships plus the context objects it needs, so it can reproduce the semantics of the API program. From it a JobGraph object is obtained, and that JobGraph is what gets submitted to the cluster. The submission is carried out by Client, JobClient and JobClientMessages, which communicate through Akka's Patterns to start the job.
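
In code the hand-off looks roughly like this (a sketch using the internal Flink 1.x classes named above; they are not part of the stable user API and may differ between versions):

import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.graph.StreamGraph;

public class GraphPeek {

	// Sketch of how an execute() implementation derives the submittable JobGraph.
	static JobGraph toJobGraph(StreamExecutionEnvironment env) {
		StreamGraph streamGraph = env.getStreamGraph(); // DAG rebuilt from the recorded transformations
		return streamGraph.getJobGraph();               // logical plan -> job plan handed to the Client
	}
}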

Every DataStream operation ultimately goes through the transform() method, which produces a transformation plus an operator. transform() then calls getExecutionEnvironment().addOperator(resultTransform), so the results accumulate step by step in the environment's List<StreamTransformation<?>> transformations.
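
The shape of that path, paraphrased from the Flink 1.x DataStream.transform() source (abbreviated, not a verbatim copy):

// Inside DataStream<T> (paraphrase of Flink 1.x, details abbreviated):
public <R> SingleOutputStreamOperator<R> transform(
		String operatorName,
		TypeInformation<R> outTypeInfo,
		OneInputStreamOperator<T, R> operator) {

	// Wrap the upstream transformation, the new operator and its output type into a graph node.
	OneInputTransformation<T, R> resultTransform = new OneInputTransformation<>(
			this.transformation, operatorName, operator, outTypeInfo, environment.getParallelism());

	@SuppressWarnings({"unchecked", "rawtypes"})
	SingleOutputStreamOperator<R> returnStream =
			new SingleOutputStreamOperator(environment, resultTransform);

	// Register the node with the environment; this is how it ends up in
	// the env's List<StreamTransformation<?>> transformations.
	getExecutionEnvironment().addOperator(resultTransform);

	return returnStream;
}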

Finally, StreamGraphGenerator.generate(this, transformations) builds the StreamGraph DAG from the environment and the collected transformations: its transform() method walks the transformations stored in the env one by one and draws them into the StreamGraph.
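
A trimmed-down view of that loop, paraphrasing StreamGraphGenerator in Flink 1.x (heavily abbreviated):

// Paraphrase of StreamGraphGenerator (Flink 1.x), heavily abbreviated:
public static StreamGraph generate(StreamExecutionEnvironment env,
		List<StreamTransformation<?>> transformations) {
	return new StreamGraphGenerator(env).generateInternal(transformations);
}

private StreamGraph generateInternal(List<StreamTransformation<?>> transformations) {
	for (StreamTransformation<?> transformation : transformations) {
		// transform() dispatches on the concrete type (source, one-input, sink, union, ...)
		// and adds the corresponding nodes and edges to the StreamGraph.
		transform(transformation);
	}
	return streamGraph;
}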

The StreamGraph captures the execution logic of the code, while the JobGraph obtained from it is the DAG that the tasks actually run against. createJobGraph() (in StreamingJobGraphGenerator) converts this logical plan into the job plan.

Conclusion: execute() turns the program logic into a StreamGraph object, converts that StreamGraph into a JobGraph, and submits the JobGraph to the cluster.
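
A convenient way to inspect the StreamGraph that execute() would build, without actually running the job, is getExecutionPlan(), which renders the streaming plan as JSON (the pipeline below is just a placeholder):

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class PlanPreview {
	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		env.fromElements(1, 2, 3)   // placeholder pipeline
			.print();

		// Prints the StreamGraph as a JSON execution plan instead of executing the job.
		System.out.println(env.getExecutionPlan());
	}
}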



/**
	 * Creates an execution environment that represents the context in which the
	 * program is currently executed. If the program is invoked standalone, this
	 * method returns a local execution environment, as returned by
	 * {@link #createLocalEnvironment()}.
	 *
	 * @return The execution environment of the context in which the program is
	 * executed.
	 */
	public static StreamExecutionEnvironment getExecutionEnvironment() {
		if (contextEnvironmentFactory != null) {
			return contextEnvironmentFactory.createExecutionEnvironment();
		}

		// because the streaming project depends on "flink-clients" (and not the other way around)
		// we currently need to intercept the data set environment and create a dependent stream env.
		// this should be fixed once we rework the project dependencies
		
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		if (env instanceof ContextEnvironment) {
			return new StreamContextEnvironment((ContextEnvironment) env);
		} else if (env instanceof OptimizerPlanEnvironment | env instanceof PreviewPlanEnvironment) {
			return new StreamPlanEnvironment(env);
		} else {
			return createLocalEnvironment();
		}
	}
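
For completeness, the other environments can also be requested explicitly instead of relying on the context detection above (standard StreamExecutionEnvironment factory methods; host, port and jar path below are placeholders):

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ExplicitEnvironments {
	public static void main(String[] args) {
		// Local: a multi-threaded mini cluster in this JVM, here with parallelism 4.
		StreamExecutionEnvironment local = StreamExecutionEnvironment.createLocalEnvironment(4);

		// Remote: ships the job and the given jar to a running cluster (placeholder host/port/jar).
		StreamExecutionEnvironment remote = StreamExecutionEnvironment.createRemoteEnvironment(
				"jobmanager-host", 6123, "/path/to/job.jar");
	}
}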

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.java;

import com.esotericsoftware.kryo.Serializer;

import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.Public;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.Plan;
import org.apache.flink.api.common.cache.DistributedCache.DistributedCacheEntry;
import org.apache.flink.api.common.io.FileInputFormat;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.api.common.operators.OperatorInformation;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.hadoop.mapred.HadoopInputFormat;
import org.apache.flink.api.java.io.CollectionInputFormat;
import org.apache.flink.api.java.io.CsvReader;
import org.apache.flink.api.java.io.IteratorInputFormat;
import org.apache.flink.api.java.io.ParallelIteratorInputFormat;
import org.apache.flink.api.java.io.PrimitiveInputFormat;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.api.java.io.TextValueInputFormat;
import org.apache.flink.api.java.operators.DataSink;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.Operator;
import org.apache.flink.api.java.operators.OperatorTranslation;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.PojoTypeInfo;
import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.api.java.typeutils.ValueTypeInfo;
import org.apache.flink.api.java.typeutils.runtime.kryo.Serializers;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.types.StringValue;
import org.apache.flink.util.NumberSequenceIterator;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.SplittableIterator;
import org.apache.flink.util.Visitor;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

/**
 * The ExecutionEnvironment is the context in which a program is executed. A
 * {@link LocalEnvironment} will cause execution in the current JVM, a
 * {@link RemoteEnvironment} will cause execution on a remote setup.
 * 

* The environment provides methods to control the job execution (such as setting the parallelism) * and to interact with the outside world (data access). *

* Please note that the execution environment needs strong type information for the input and return types * of all operations that are executed. This means that the environments needs to know that the return * value of an operation is for example a Tuple of String and Integer. * Because the Java compiler throws much of the generic type information away, most methods attempt to re- * obtain that information using reflection. In certain cases, it may be necessary to manually supply that * information to some of the methods. * * @see LocalEnvironment * @see RemoteEnvironment */ @Public public abstract class ExecutionEnvironment { /** The logger used by the environment and its subclasses */ protected static final Logger LOG = LoggerFactory.getLogger(ExecutionEnvironment.class); /** The environment of the context (local by default, cluster if invoked through command line) */ private static ExecutionEnvironmentFactory contextEnvironmentFactory; /** The default parallelism used by local environments */ private static int defaultLocalDop = Runtime.getRuntime().availableProcessors(); // -------------------------------------------------------------------------------------------- private final List> sinks = new ArrayList<>(); private final List> cacheFile = new ArrayList<>(); private final ExecutionConfig config = new ExecutionConfig(); /** Result from the latest execution, to make it retrievable when using eager execution methods */ protected JobExecutionResult lastJobExecutionResult; /** The ID of the session, defined by this execution environment. Sessions and Jobs are same in * Flink, as Jobs can consist of multiple parts that are attached to the growing dataflow graph */ protected JobID jobID; /** The session timeout in seconds */ protected long sessionTimeout; /** Flag to indicate whether sinks have been cleared in previous executions */ private boolean wasExecuted = false; /** * Creates a new Execution Environment. */ protected ExecutionEnvironment() { jobID = JobID.generate(); } // -------------------------------------------------------------------------------------------- // Properties // -------------------------------------------------------------------------------------------- /** * Gets the config object that defines execution parameters. * * @return The environment's execution configuration. */ public ExecutionConfig getConfig() { return config; } /** * Gets the parallelism with which operation are executed by default. Operations can * individually override this value to use a specific parallelism via * {@link Operator#setParallelism(int)}. Other operations may need to run with a different * parallelism - for example calling * {@link DataSet#reduce(org.apache.flink.api.common.functions.ReduceFunction)} over the entire * set will insert eventually an operation that runs non-parallel (parallelism of one). * * @return The parallelism used by operations, unless they override that value. This method * returns {@link ExecutionConfig#PARALLELISM_DEFAULT}, if the environment's default parallelism should be used. */ public int getParallelism() { return config.getParallelism(); } /** * Sets the parallelism for operations executed through this environment. * Setting a parallelism of x here will cause all operators (such as join, map, reduce) to run with * x parallel instances. *

* This method overrides the default parallelism for this environment. * The {@link LocalEnvironment} uses by default a value equal to the number of hardware * contexts (CPU cores / threads). When executing the program via the command line client * from a JAR file, the default parallelism is the one configured for that setup. * * @param parallelism The parallelism */ public void setParallelism(int parallelism) { config.setParallelism(parallelism); } <<<<<<< HEAD /** * Sets the restart strategy configuration. The configuration specifies which restart strategy * will be used for the execution graph in case of a restart. * * @param restartStrategyConfiguration Restart strategy configuration to be set */ @PublicEvolving public void setRestartStrategy(RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration) { config.setRestartStrategy(restartStrategyConfiguration); } /** * Returns the specified restart strategy configuration. * * @return The restart strategy configuration to be used */ @PublicEvolving public RestartStrategies.RestartStrategyConfiguration getRestartStrategy() { return config.getRestartStrategy(); } ======= >>>>>>> 644c27504ad6fb89372e3b39123a4f896013e1ad /** * Sets the number of times that failed tasks are re-executed. A value of zero * effectively disables fault tolerance. A value of {@code -1} indicates that the system * default value (as defined in the configuration) should be used. * * @param numberOfExecutionRetries The number of times the system will try to re-execute failed tasks. * * @deprecated This method will be replaced by {@link #setRestartStrategy}. The * {@link RestartStrategies.FixedDelayRestartStrategyConfiguration} contains the number of * execution retries. */ @Deprecated @PublicEvolving public void setNumberOfExecutionRetries(int numberOfExecutionRetries) { config.setNumberOfExecutionRetries(numberOfExecutionRetries); } /** * Gets the number of times the system will try to re-execute failed tasks. A value * of {@code -1} indicates that the system default value (as defined in the configuration) * should be used. * * @return The number of times the system will try to re-execute failed tasks. * * @deprecated This method will be replaced by {@link #getRestartStrategy}. The * {@link RestartStrategies.FixedDelayRestartStrategyConfiguration} contains the number of * execution retries. */ @Deprecated @PublicEvolving public int getNumberOfExecutionRetries() { return config.getNumberOfExecutionRetries(); } /** * Returns the {@link org.apache.flink.api.common.JobExecutionResult} of the last executed job. * * @return The execution result from the latest job execution. */ public JobExecutionResult getLastJobExecutionResult(){ return this.lastJobExecutionResult; } // -------------------------------------------------------------------------------------------- // Session Management // -------------------------------------------------------------------------------------------- /** * Gets the JobID by which this environment is identified. The JobID sets the execution context * in the cluster or local environment. * * @return The JobID of this environment. * @see #getIdString() */ @PublicEvolving public JobID getId() { return this.jobID; } /** * Gets the JobID by which this environment is identified, as a string. * * @return The JobID as a string. * @see #getId() */ @PublicEvolving public String getIdString() { return this.jobID.toString(); } /** * Sets the session timeout to hold the intermediate results of a job. 
This only * applies the updated timeout in future executions. * * @param timeout The timeout, in seconds. */ @PublicEvolving public void setSessionTimeout(long timeout) { throw new IllegalStateException("Support for sessions is currently disabled. " + "It will be enabled in future Flink versions."); // Session management is disabled, revert this commit to enable //if (timeout < 0) { // throw new IllegalArgumentException("The session timeout must not be less than zero."); //} //this.sessionTimeout = timeout; } /** * Gets the session timeout for this environment. The session timeout defines for how long * after an execution, the job and its intermediate results will be kept for future * interactions. * * @return The session timeout, in seconds. */ @PublicEvolving public long getSessionTimeout() { return sessionTimeout; } /** * Starts a new session, discarding the previous data flow and all of its intermediate results. */ @PublicEvolving public abstract void startNewSession() throws Exception; // -------------------------------------------------------------------------------------------- // Registry for types and serializers // -------------------------------------------------------------------------------------------- /** * Adds a new Kryo default serializer to the Runtime. * * Note that the serializer instance must be serializable (as defined by java.io.Serializable), * because it may be distributed to the worker nodes by java serialization. * * @param type The class of the types serialized with the given serializer. * @param serializer The serializer to use. */ public & Serializable>void addDefaultKryoSerializer(Class type, T serializer) { config.addDefaultKryoSerializer(type, serializer); } /** * Adds a new Kryo default serializer to the Runtime. * * @param type The class of the types serialized with the given serializer. * @param serializerClass The class of the serializer to use. */ public void addDefaultKryoSerializer(Class type, Class> serializerClass) { config.addDefaultKryoSerializer(type, serializerClass); } /** * Registers the given type with a Kryo Serializer. * * Note that the serializer instance must be serializable (as defined by java.io.Serializable), * because it may be distributed to the worker nodes by java serialization. * * @param type The class of the types serialized with the given serializer. * @param serializer The serializer to use. */ public & Serializable>void registerTypeWithKryoSerializer(Class type, T serializer) { config.registerTypeWithKryoSerializer(type, serializer); } /** * Registers the given Serializer via its class as a serializer for the given type at the KryoSerializer * * @param type The class of the types serialized with the given serializer. * @param serializerClass The class of the serializer to use. */ public void registerTypeWithKryoSerializer(Class type, Class> serializerClass) { config.registerTypeWithKryoSerializer(type, serializerClass); } /** * Registers the given type with the serialization stack. If the type is eventually * serialized as a POJO, then the type is registered with the POJO serializer. If the * type ends up being serialized with Kryo, then it will be registered at Kryo to make * sure that only tags are written. * * @param type The class of the type to register. 
*/ public void registerType(Class type) { if (type == null) { throw new NullPointerException("Cannot register null type class."); } TypeInformation typeInfo = TypeExtractor.createTypeInfo(type); if (typeInfo instanceof PojoTypeInfo) { config.registerPojoType(type); } else { config.registerKryoType(type); } } // -------------------------------------------------------------------------------------------- // Data set creations // -------------------------------------------------------------------------------------------- // ---------------------------------- Text Input Format --------------------------------------- /** * Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise. * The file will be read with the system's default character set. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path"). * @return A {@link DataSet} that represents the data read from the given file as text lines. */ public DataSource readTextFile(String filePath) { Preconditions.checkNotNull(filePath, "The file path may not be null."); return new DataSource<>(this, new TextInputFormat(new Path(filePath)), BasicTypeInfo.STRING_TYPE_INFO, Utils.getCallLocationName()); } /** * Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise. * The {@link java.nio.charset.Charset} with the given name will be used to read the files. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path"). * @param charsetName The name of the character set used to read the file. * @return A {@link DataSet} that represents the data read from the given file as text lines. */ public DataSource readTextFile(String filePath, String charsetName) { Preconditions.checkNotNull(filePath, "The file path may not be null."); TextInputFormat format = new TextInputFormat(new Path(filePath)); format.setCharsetName(charsetName); return new DataSource<>(this, format, BasicTypeInfo.STRING_TYPE_INFO, Utils.getCallLocationName()); } // -------------------------- Text Input Format With String Value------------------------------ /** * Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise. * This method is similar to {@link #readTextFile(String)}, but it produces a DataSet with mutable * {@link StringValue} objects, rather than Java Strings. StringValues can be used to tune implementations * to be less object and garbage collection heavy. *

* The file will be read with the system's default character set. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path"). * @return A {@link DataSet} that represents the data read from the given file as text lines. */ public DataSource readTextFileWithValue(String filePath) { Preconditions.checkNotNull(filePath, "The file path may not be null."); return new DataSource<>(this, new TextValueInputFormat(new Path(filePath)), new ValueTypeInfo<>(StringValue.class), Utils.getCallLocationName()); } /** * Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise. * This method is similar to {@link #readTextFile(String, String)}, but it produces a DataSet with mutable * {@link StringValue} objects, rather than Java Strings. StringValues can be used to tune implementations * to be less object and garbage collection heavy. *

* The {@link java.nio.charset.Charset} with the given name will be used to read the files. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path"). * @param charsetName The name of the character set used to read the file. * @param skipInvalidLines A flag to indicate whether to skip lines that cannot be read with the given character set. * * @return A DataSet that represents the data read from the given file as text lines. */ public DataSource readTextFileWithValue(String filePath, String charsetName, boolean skipInvalidLines) { Preconditions.checkNotNull(filePath, "The file path may not be null."); TextValueInputFormat format = new TextValueInputFormat(new Path(filePath)); format.setCharsetName(charsetName); format.setSkipInvalidLines(skipInvalidLines); return new DataSource<>(this, format, new ValueTypeInfo<>(StringValue.class), Utils.getCallLocationName()); } // ----------------------------------- Primitive Input Format --------------------------------------- /** * Creates a {@link DataSet} that represents the primitive type produced by reading the given file line wise. * This method is similar to {@link #readCsvFile(String)} with single field, but it produces a DataSet not through * {@link org.apache.flink.api.java.tuple.Tuple1}. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path"). * @param typeClass The primitive type class to be read. * @return A {@link DataSet} that represents the data read from the given file as primitive type. */ public DataSource readFileOfPrimitives(String filePath, Class typeClass) { Preconditions.checkNotNull(filePath, "The file path may not be null."); return new DataSource<>(this, new PrimitiveInputFormat<>(new Path(filePath), typeClass), TypeExtractor.getForClass(typeClass), Utils.getCallLocationName()); } /** * Creates a {@link DataSet} that represents the primitive type produced by reading the given file in delimited way. * This method is similar to {@link #readCsvFile(String)} with single field, but it produces a DataSet not through * {@link org.apache.flink.api.java.tuple.Tuple1}. * * @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path"). * @param delimiter The delimiter of the given file. * @param typeClass The primitive type class to be read. * @return A {@link DataSet} that represents the data read from the given file as primitive type. */ public DataSource readFileOfPrimitives(String filePath, String delimiter, Class typeClass) { Preconditions.checkNotNull(filePath, "The file path may not be null."); return new DataSource<>(this, new PrimitiveInputFormat<>(new Path(filePath), delimiter, typeClass), TypeExtractor.getForClass(typeClass), Utils.getCallLocationName()); } // ----------------------------------- CSV Input Format --------------------------------------- /** * Creates a CSV reader to read a comma separated value (CSV) file. The reader has options to * define parameters and field types and will eventually produce the DataSet that corresponds to * the read and parsed CSV input. * * @param filePath The path of the CSV file. * @return A CsvReader that can be used to configure the CSV input. 
*/ public CsvReader readCsvFile(String filePath) { return new CsvReader(filePath, this); } // ------------------------------------ File Input Format ----------------------------------------- public DataSource readFile(FileInputFormat inputFormat, String filePath) { if (inputFormat == null) { throw new IllegalArgumentException("InputFormat must not be null."); } if (filePath == null) { throw new IllegalArgumentException("The file path must not be null."); } inputFormat.setFilePath(new Path(filePath)); try { return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat)); } catch (Exception e) { throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " + "Please specify the TypeInformation of the produced type explicitly by using the " + "'createInput(InputFormat, TypeInformation)' method instead."); } } // ----------------------------------- Generic Input Format --------------------------------------- /** * Generic method to create an input {@link DataSet} with in {@link InputFormat}. The DataSet will not be * immediately created - instead, this method returns a DataSet that will be lazily created from * the input format once the program is executed. *

* Since all data sets need specific information about their types, this method needs to determine * the type of the data produced by the input format. It will attempt to determine the data type * by reflection, unless the input format implements the {@link ResultTypeQueryable} interface. * In the latter case, this method will invoke the {@link ResultTypeQueryable#getProducedType()} * method to determine data type produced by the input format. * * @param inputFormat The input format used to create the data set. * @return A {@link DataSet} that represents the data created by the input format. * * @see #createInput(InputFormat, TypeInformation) */ public DataSource createInput(InputFormat inputFormat) { if (inputFormat == null) { throw new IllegalArgumentException("InputFormat must not be null."); } try { return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat)); } catch (Exception e) { throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " + "Please specify the TypeInformation of the produced type explicitly by using the " + "'createInput(InputFormat, TypeInformation)' method instead.", e); } } /** * Generic method to create an input DataSet with in {@link InputFormat}. The {@link DataSet} will not be * immediately created - instead, this method returns a {@link DataSet} that will be lazily created from * the input format once the program is executed. *

* The {@link DataSet} is typed to the given TypeInformation. This method is intended for input formats that * where the return type cannot be determined by reflection analysis, and that do not implement the * {@link ResultTypeQueryable} interface. * * @param inputFormat The input format used to create the data set. * @return A {@link DataSet} that represents the data created by the input format. * * @see #createInput(InputFormat) */ public DataSource createInput(InputFormat inputFormat, TypeInformation producedType) { if (inputFormat == null) { throw new IllegalArgumentException("InputFormat must not be null."); } if (producedType == null) { throw new IllegalArgumentException("Produced type information must not be null."); } return new DataSource<>(this, inputFormat, producedType, Utils.getCallLocationName()); } // ----------------------------------- Hadoop Input Format --------------------------------------- /** * Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapred.FileInputFormat}. The * given inputName is set on the given job. */ @PublicEvolving public DataSource> readHadoopFile(org.apache.hadoop.mapred.FileInputFormat mapredInputFormat, Class key, Class value, String inputPath, JobConf job) { DataSource> result = createHadoopInput(mapredInputFormat, key, value, job); org.apache.hadoop.mapred.FileInputFormat.addInputPath(job, new org.apache.hadoop.fs.Path(inputPath)); return result; } /** * Creates a {@link DataSet} from {@link org.apache.hadoop.mapred.SequenceFileInputFormat} * A {@link org.apache.hadoop.mapred.JobConf} with the given inputPath is created. */ <<<<<<< HEAD @PublicEvolving ======= >>>>>>> 644c27504ad6fb89372e3b39123a4f896013e1ad public DataSource> readSequenceFile(Class key, Class value, String inputPath) throws IOException { return readHadoopFile(new org.apache.hadoop.mapred.SequenceFileInputFormat(), key, value, inputPath); } /** * Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapred.FileInputFormat}. A * {@link org.apache.hadoop.mapred.JobConf} with the given inputPath is created. */ @PublicEvolving public DataSource> readHadoopFile(org.apache.hadoop.mapred.FileInputFormat mapredInputFormat, Class key, Class value, String inputPath) { return readHadoopFile(mapredInputFormat, key, value, inputPath, new JobConf()); } /** * Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapred.InputFormat}. */ @PublicEvolving public DataSource> createHadoopInput(org.apache.hadoop.mapred.InputFormat mapredInputFormat, Class key, Class value, JobConf job) { HadoopInputFormat hadoopInputFormat = new HadoopInputFormat<>(mapredInputFormat, key, value, job); return this.createInput(hadoopInputFormat); } /** * Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}. The * given inputName is set on the given job. */ @PublicEvolving public DataSource> readHadoopFile(org.apache.hadoop.mapreduce.lib.input.FileInputFormat mapreduceInputFormat, Class key, Class value, String inputPath, Job job) throws IOException { DataSource> result = createHadoopInput(mapreduceInputFormat, key, value, job); org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new org.apache .hadoop.fs.Path(inputPath)); return result; } /** * Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}. A * {@link org.apache.hadoop.mapreduce.Job} with the given inputPath is created. 
*/ @PublicEvolving public DataSource> readHadoopFile(org.apache.hadoop.mapreduce.lib.input.FileInputFormat mapreduceInputFormat, Class key, Class value, String inputPath) throws IOException { return readHadoopFile(mapreduceInputFormat, key, value, inputPath, Job.getInstance()); } /** * Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapreduce.InputFormat}. */ @PublicEvolving public DataSource> createHadoopInput(org.apache.hadoop.mapreduce.InputFormat mapreduceInputFormat, Class key, Class value, Job job) { org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat hadoopInputFormat = new org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat<>(mapreduceInputFormat, key, value, job); return this.createInput(hadoopInputFormat); } // ----------------------------------- Collection --------------------------------------- /** * Creates a DataSet from the given non-empty collection. The type of the data set is that * of the elements in the collection. *

* The framework will try and determine the exact type from the collection elements. * In case of generic elements, it may be necessary to manually supply the type information * via {@link #fromCollection(Collection, TypeInformation)}. *

* Note that this operation will result in a non-parallel data source, i.e. a data source with * a parallelism of one. * * @param data The collection of elements to create the data set from. * @return A DataSet representing the given collection. * * @see #fromCollection(Collection, TypeInformation) */ public DataSource fromCollection(Collection data) { if (data == null) { throw new IllegalArgumentException("The data must not be null."); } if (data.size() == 0) { throw new IllegalArgumentException("The size of the collection must not be empty."); } X firstValue = data.iterator().next(); TypeInformation type = TypeExtractor.getForObject(firstValue); CollectionInputFormat.checkCollection(data, type.getTypeClass()); return new DataSource<>(this, new CollectionInputFormat<>(data, type.createSerializer(config)), type, Utils.getCallLocationName()); } /** * Creates a DataSet from the given non-empty collection. Note that this operation will result * in a non-parallel data source, i.e. a data source with a parallelism of one. *

* The returned DataSet is typed to the given TypeInformation. * * @param data The collection of elements to create the data set from. * @param type The TypeInformation for the produced data set. * @return A DataSet representing the given collection. * * @see #fromCollection(Collection) */ public DataSource fromCollection(Collection data, TypeInformation type) { return fromCollection(data, type, Utils.getCallLocationName()); } private DataSource fromCollection(Collection data, TypeInformation type, String callLocationName) { CollectionInputFormat.checkCollection(data, type.getTypeClass()); return new DataSource<>(this, new CollectionInputFormat<>(data, type.createSerializer(config)), type, callLocationName); } /** * Creates a DataSet from the given iterator. Because the iterator will remain unmodified until * the actual execution happens, the type of data returned by the iterator must be given * explicitly in the form of the type class (this is due to the fact that the Java compiler * erases the generic type information). *

* Note that this operation will result in a non-parallel data source, i.e. a data source with * a parallelism of one. * * @param data The collection of elements to create the data set from. * @param type The class of the data produced by the iterator. Must not be a generic class. * @return A DataSet representing the elements in the iterator. * * @see #fromCollection(Iterator, TypeInformation) */ public DataSource fromCollection(Iterator data, Class type) { return fromCollection(data, TypeExtractor.getForClass(type)); } /** * Creates a DataSet from the given iterator. Because the iterator will remain unmodified until * the actual execution happens, the type of data returned by the iterator must be given * explicitly in the form of the type information. This method is useful for cases where the type * is generic. In that case, the type class (as given in {@link #fromCollection(Iterator, Class)} * does not supply all type information. *

* Note that this operation will result in a non-parallel data source, i.e. a data source with * a parallelism of one. * * @param data The collection of elements to create the data set from. * @param type The TypeInformation for the produced data set. * @return A DataSet representing the elements in the iterator. * * @see #fromCollection(Iterator, Class) */ public DataSource fromCollection(Iterator data, TypeInformation type) { return new DataSource<>(this, new IteratorInputFormat<>(data), type, Utils.getCallLocationName()); } /** * Creates a new data set that contains the given elements. The elements must all be of the same type, * for example, all of the {@link String} or {@link Integer}. The sequence of elements must not be empty. *

* The framework will try and determine the exact type from the collection elements. * In case of generic elements, it may be necessary to manually supply the type information * via {@link #fromCollection(Collection, TypeInformation)}. *

* Note that this operation will result in a non-parallel data source, i.e. a data source with * a parallelism of one. * * @param data The elements to make up the data set. * @return A DataSet representing the given list of elements. */ @SafeVarargs public final DataSource fromElements(X... data) { if (data == null) { throw new IllegalArgumentException("The data must not be null."); } if (data.length == 0) { throw new IllegalArgumentException("The number of elements must not be zero."); } TypeInformation typeInfo; try { typeInfo = TypeExtractor.getForObject(data[0]); } catch (Exception e) { throw new RuntimeException("Could not create TypeInformation for type " + data[0].getClass().getName() + "; please specify the TypeInformation manually via " + "ExecutionEnvironment#fromElements(Collection, TypeInformation)"); } return fromCollection(Arrays.asList(data), typeInfo, Utils.getCallLocationName()); } /** * Creates a new data set that contains the given elements. The framework will determine the type according to the * based type user supplied. The elements should be the same or be the subclass to the based type. * The sequence of elements must not be empty. * Note that this operation will result in a non-parallel data source, i.e. a data source with * a parallelism of one. * * @param type The base class type for every element in the collection. * @param data The elements to make up the data set. * @return A DataSet representing the given list of elements. */ @SafeVarargs public final DataSource fromElements(Class type, X... data) { if (data == null) { throw new IllegalArgumentException("The data must not be null."); } if (data.length == 0) { throw new IllegalArgumentException("The number of elements must not be zero."); } TypeInformation typeInfo; try { typeInfo = TypeExtractor.getForClass(type); } catch (Exception e) { throw new RuntimeException("Could not create TypeInformation for type " + type.getName() + "; please specify the TypeInformation manually via " + "ExecutionEnvironment#fromElements(Collection, TypeInformation)"); } return fromCollection(Arrays.asList(data), typeInfo, Utils.getCallLocationName()); } /** * Creates a new data set that contains elements in the iterator. The iterator is splittable, allowing the * framework to create a parallel data source that returns the elements in the iterator. *

* Because the iterator will remain unmodified until the actual execution happens, the type of data * returned by the iterator must be given explicitly in the form of the type class (this is due to the * fact that the Java compiler erases the generic type information). * * @param iterator The iterator that produces the elements of the data set. * @param type The class of the data produced by the iterator. Must not be a generic class. * @return A DataSet representing the elements in the iterator. * * @see #fromParallelCollection(SplittableIterator, TypeInformation) */ public DataSource fromParallelCollection(SplittableIterator iterator, Class type) { return fromParallelCollection(iterator, TypeExtractor.getForClass(type)); } /** * Creates a new data set that contains elements in the iterator. The iterator is splittable, allowing the * framework to create a parallel data source that returns the elements in the iterator. *

* Because the iterator will remain unmodified until the actual execution happens, the type of data * returned by the iterator must be given explicitly in the form of the type information. * This method is useful for cases where the type is generic. In that case, the type class * (as given in {@link #fromParallelCollection(SplittableIterator, Class)} does not supply all type information. * * @param iterator The iterator that produces the elements of the data set. * @param type The TypeInformation for the produced data set. * @return A DataSet representing the elements in the iterator. * * @see #fromParallelCollection(SplittableIterator, Class) */ public DataSource fromParallelCollection(SplittableIterator iterator, TypeInformation type) { return fromParallelCollection(iterator, type, Utils.getCallLocationName()); } // private helper for passing different call location names private DataSource fromParallelCollection(SplittableIterator iterator, TypeInformation type, String callLocationName) { return new DataSource<>(this, new ParallelIteratorInputFormat<>(iterator), type, callLocationName); } /** * Creates a new data set that contains a sequence of numbers. The data set will be created in parallel, * so there is no guarantee about the order of the elements. * * @param from The number to start at (inclusive). * @param to The number to stop at (inclusive). * @return A DataSet, containing all number in the {@code [from, to]} interval. */ public DataSource generateSequence(long from, long to) { return fromParallelCollection(new NumberSequenceIterator(from, to), BasicTypeInfo.LONG_TYPE_INFO, Utils.getCallLocationName()); } // -------------------------------------------------------------------------------------------- // Executing // -------------------------------------------------------------------------------------------- /** * Triggers the program execution. The environment will execute all parts of the program that have * resulted in a "sink" operation. Sink operations are for example printing results ({@link DataSet#print()}, * writing results (e.g. {@link DataSet#writeAsText(String)}, * {@link DataSet#write(org.apache.flink.api.common.io.FileOutputFormat, String)}, or other generic * data sinks created with {@link DataSet#output(org.apache.flink.api.common.io.OutputFormat)}. *

* The program execution will be logged and displayed with a generated default name. * * @return The result of the job execution, containing elapsed time and accumulators. * @throws Exception Thrown, if the program executions fails. */ public JobExecutionResult execute() throws Exception { return execute(getDefaultName()); } /** * Triggers the program execution. The environment will execute all parts of the program that have * resulted in a "sink" operation. Sink operations are for example printing results ({@link DataSet#print()}, * writing results (e.g. {@link DataSet#writeAsText(String)}, * {@link DataSet#write(org.apache.flink.api.common.io.FileOutputFormat, String)}, or other generic * data sinks created with {@link DataSet#output(org.apache.flink.api.common.io.OutputFormat)}. *

* The program execution will be logged and displayed with the given job name. * * @return The result of the job execution, containing elapsed time and accumulators. * @throws Exception Thrown, if the program executions fails. */ public abstract JobExecutionResult execute(String jobName) throws Exception; /** * Creates the plan with which the system will execute the program, and returns it as * a String using a JSON representation of the execution data flow graph. * Note that this needs to be called, before the plan is executed. * * @return The execution plan of the program, as a JSON String. * @throws Exception Thrown, if the compiler could not be instantiated, or the master could not * be contacted to retrieve information relevant to the execution planning. */ public abstract String getExecutionPlan() throws Exception; /** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (as long as all relevant workers have access to it), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. *

* The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. */ public void registerCachedFile(String filePath, String name){ registerCachedFile(filePath, name, false); } /** * Registers a file at the distributed cache under the given name. The file will be accessible * from any user-defined function in the (distributed) runtime under a local path. Files * may be local files (as long as all relevant workers have access to it), or files in a distributed file system. * The runtime will copy the files temporarily to a local cache, if needed. *

* The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via * {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access * {@link org.apache.flink.api.common.cache.DistributedCache} via * {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}. * * @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path") * @param name The name under which the file is registered. * @param executable flag indicating whether the file should be executable */ public void registerCachedFile(String filePath, String name, boolean executable){ this.cacheFile.add(new Tuple2<>(name, new DistributedCacheEntry(filePath, executable))); } /** * Registers all files that were registered at this execution environment's cache registry of the * given plan's cache registry. * * @param p The plan to register files at. * @throws IOException Thrown if checks for existence and sanity fail. */ protected void registerCachedFilesWithPlan(Plan p) throws IOException { for (Tuple2 entry : cacheFile) { p.registerCachedFile(entry.f0, entry.f1); } } /** * Creates the program's {@link Plan}. The plan is a description of all data sources, data sinks, * and operations and how they interact, as an isolated unit that can be executed with a * {@link org.apache.flink.api.common.PlanExecutor}. Obtaining a plan and starting it with an * executor is an alternative way to run a program and is only possible if the program consists * only of distributed operations. * This automatically starts a new stage of execution. * * @return The program's plan. */ @Internal public Plan createProgramPlan() { return createProgramPlan(null); } /** * Creates the program's {@link Plan}. The plan is a description of all data sources, data sinks, * and operations and how they interact, as an isolated unit that can be executed with a * {@link org.apache.flink.api.common.PlanExecutor}. Obtaining a plan and starting it with an * executor is an alternative way to run a program and is only possible if the program consists * only of distributed operations. * This automatically starts a new stage of execution. * * @param jobName The name attached to the plan (displayed in logs and monitoring). * @return The program's plan. */ @Internal public Plan createProgramPlan(String jobName) { return createProgramPlan(jobName, true); } /** * Creates the program's {@link Plan}. The plan is a description of all data sources, data sinks, * and operations and how they interact, as an isolated unit that can be executed with a * {@link org.apache.flink.api.common.PlanExecutor}. Obtaining a plan and starting it with an * executor is an alternative way to run a program and is only possible if the program consists * only of distributed operations. * * @param jobName The name attached to the plan (displayed in logs and monitoring). * @param clearSinks Whether or not to start a new stage of execution. * @return The program's plan. */ @Internal public Plan createProgramPlan(String jobName, boolean clearSinks) { if (this.sinks.isEmpty()) { if (wasExecuted) { throw new RuntimeException("No new data sinks have been defined since the " + "last execution. The last execution refers to the latest call to " + "'execute()', 'count()', 'collect()', or 'print()'."); } else { throw new RuntimeException("No data sinks have been created yet. " + "A program needs at least one sink that consumes data. 
" + "Examples are writing the data set or printing it."); } } if (jobName == null) { jobName = getDefaultName(); } OperatorTranslation translator = new OperatorTranslation(); Plan plan = translator.translateToPlan(this.sinks, jobName); if (getParallelism() > 0) { plan.setDefaultParallelism(getParallelism()); } plan.setExecutionConfig(getConfig()); // Check plan for GenericTypeInfo's and register the types at the serializers. <<<<<<< HEAD if (!config.isAutoTypeRegistrationDisabled()) { plan.accept(new Visitor>() { private final HashSet> deduplicator = new HashSet<>(); @Override public boolean preVisit(org.apache.flink.api.common.operators.Operator visitable) { OperatorInformation opInfo = visitable.getOperatorInfo(); Serializers.recursivelyRegisterType(opInfo.getOutputType(), config, deduplicator); return true; ======= plan.accept(new Visitor>() { @Override public boolean preVisit(org.apache.flink.api.common.operators.Operator visitable) { OperatorInformation opInfo = visitable.getOperatorInfo(); TypeInformation typeInfo = opInfo.getOutputType(); if(typeInfo instanceof GenericTypeInfo) { GenericTypeInfo genericTypeInfo = (GenericTypeInfo) typeInfo; if(!config.isAutoTypeRegistrationDisabled()) { Serializers.recursivelyRegisterType(genericTypeInfo.getTypeClass(), config); } } if(typeInfo instanceof CompositeType) { List> genericTypesInComposite = new ArrayList<>(); Utils.getContainedGenericTypes((CompositeType)typeInfo, genericTypesInComposite); for(GenericTypeInfo gt : genericTypesInComposite) { Serializers.recursivelyRegisterType(gt.getTypeClass(), config); } >>>>>>> 644c27504ad6fb89372e3b39123a4f896013e1ad } @Override public void postVisit(org.apache.flink.api.common.operators.Operator visitable) {} }); } try { registerCachedFilesWithPlan(plan); } catch (Exception e) { throw new RuntimeException("Error while registering cached files: " + e.getMessage(), e); } // clear all the sinks such that the next execution does not redo everything if (clearSinks) { this.sinks.clear(); wasExecuted = true; } // All types are registered now. Print information. int registeredTypes = config.getRegisteredKryoTypes().size() + config.getRegisteredPojoTypes().size() + config.getRegisteredTypesWithKryoSerializerClasses().size() + config.getRegisteredTypesWithKryoSerializers().size(); int defaultKryoSerializers = config.getDefaultKryoSerializers().size() + config.getDefaultKryoSerializerClasses().size(); LOG.info("The job has {} registered types and {} default Kryo serializers", registeredTypes, defaultKryoSerializers); if(config.isForceKryoEnabled() && config.isForceAvroEnabled()) { LOG.warn("In the ExecutionConfig, both Avro and Kryo are enforced. 
Using Kryo serializer"); } if(config.isForceKryoEnabled()) { LOG.info("Using KryoSerializer for serializing POJOs"); } if(config.isForceAvroEnabled()) { LOG.info("Using AvroSerializer for serializing POJOs"); } if(LOG.isDebugEnabled()) { LOG.debug("Registered Kryo types: {}", config.getRegisteredKryoTypes().toString()); LOG.debug("Registered Kryo with Serializers types: {}", config.getRegisteredTypesWithKryoSerializers().entrySet().toString()); LOG.debug("Registered Kryo with Serializer Classes types: {}", config.getRegisteredTypesWithKryoSerializerClasses().entrySet().toString()); LOG.debug("Registered Kryo default Serializers: {}", config.getDefaultKryoSerializers().entrySet().toString()); LOG.debug("Registered Kryo default Serializers Classes {}", config.getDefaultKryoSerializerClasses().entrySet().toString()); LOG.debug("Registered POJO types: {}", config.getRegisteredPojoTypes().toString()); // print information about static code analysis LOG.debug("Static code analysis mode: {}", config.getCodeAnalysisMode()); } return plan; } /** * Adds the given sink to this environment. Only sinks that have been added will be executed once * the {@link #execute()} or {@link #execute(String)} method is called. * * @param sink The sink to add for execution. */ @Internal void registerDataSink(DataSink sink) { this.sinks.add(sink); } /** * Gets a default job name, based on the timestamp when this method is invoked. * * @return A default job name. */ private static String getDefaultName() { return "Flink Java Job at " + Calendar.getInstance().getTime(); } // -------------------------------------------------------------------------------------------- // Instantiation of Execution Contexts // -------------------------------------------------------------------------------------------- /** * Creates an execution environment that represents the context in which the program is currently executed. * If the program is invoked standalone, this method returns a local execution environment, as returned by * {@link #createLocalEnvironment()}. If the program is invoked from within the command line client to be * submitted to a cluster, this method returns the execution environment of this cluster. * * @return The execution environment of the context in which the program is executed. */ public static ExecutionEnvironment getExecutionEnvironment() { return contextEnvironmentFactory == null ? createLocalEnvironment() : contextEnvironmentFactory.createExecutionEnvironment(); } /** * Creates a {@link CollectionEnvironment} that uses Java Collections underneath. This will execute in a * single thread in the current JVM. It is very fast but will fail if the data does not fit into * memory. parallelism will always be 1. This is useful during implementation and for debugging. * @return A Collection Environment */ @PublicEvolving public static CollectionEnvironment createCollectionsEnvironment(){ CollectionEnvironment ce = new CollectionEnvironment(); ce.setParallelism(1); return ce; } /** * Creates a {@link LocalEnvironment}. The local execution environment will run the program in a * multi-threaded fashion in the same JVM as the environment was created in. The default * parallelism of the local environment is the number of hardware contexts (CPU cores / threads), * unless it was specified differently by {@link #setDefaultLocalParallelism(int)}. * * @return A local execution environment. 
	 */
	public static LocalEnvironment createLocalEnvironment() {
		return createLocalEnvironment(defaultLocalDop);
	}

	/**
	 * Creates a {@link LocalEnvironment}. The local execution environment will run the program in a
	 * multi-threaded fashion in the same JVM as the environment was created in. It will use the
	 * parallelism specified in the parameter.
	 *
	 * @param parallelism The parallelism for the local environment.
	 * @return A local execution environment with the specified parallelism.
	 */
	public static LocalEnvironment createLocalEnvironment(int parallelism) {
		LocalEnvironment lee = new LocalEnvironment();
		lee.setParallelism(parallelism);
		return lee;
	}

	/**
	 * Creates a {@link LocalEnvironment}. The local execution environment will run the program in a
	 * multi-threaded fashion in the same JVM as the environment was created in. It will use the
	 * parallelism specified in the parameter.
	 *
	 * @param customConfiguration Pass a custom configuration to the LocalEnvironment.
	 * @return A local execution environment with the specified parallelism.
	 */
	public static LocalEnvironment createLocalEnvironment(Configuration customConfiguration) {
		return new LocalEnvironment(customConfiguration);
	}

	/**
	 * Creates a {@link RemoteEnvironment}. The remote environment sends (parts of) the program
	 * to a cluster for execution. Note that all file paths used in the program must be accessible from the
	 * cluster. The execution will use the cluster's default parallelism, unless the parallelism is
	 * set explicitly via {@link ExecutionEnvironment#setParallelism(int)}.
	 *
	 * @param host The host name or address of the master (JobManager), where the program should be executed.
	 * @param port The port of the master (JobManager), where the program should be executed.
	 * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses
	 *                 user-defined functions, user-defined input formats, or any libraries, those must be
	 *                 provided in the JAR files.
	 * @return A remote environment that executes the program on a cluster.
	 */
	public static ExecutionEnvironment createRemoteEnvironment(String host, int port, String... jarFiles) {
		return new RemoteEnvironment(host, port, jarFiles);
	}

	/**
	 * Creates a {@link RemoteEnvironment}. The remote environment sends (parts of) the program
	 * to a cluster for execution. Note that all file paths used in the program must be accessible from the
	 * cluster. The custom configuration file is used to configure Akka specific configuration parameters
	 * for the Client only; program parallelism can be set via {@link ExecutionEnvironment#setParallelism(int)}.
	 *
	 * Cluster configuration has to be done in the remotely running Flink instance.
	 *
	 * @param host The host name or address of the master (JobManager), where the program should be executed.
	 * @param port The port of the master (JobManager), where the program should be executed.
	 * @param clientConfiguration Configuration used by the client that connects to the cluster.
	 * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses
	 *                 user-defined functions, user-defined input formats, or any libraries, those must be
	 *                 provided in the JAR files.
	 * @return A remote environment that executes the program on a cluster.
	 */
	public static ExecutionEnvironment createRemoteEnvironment(
			String host, int port, Configuration clientConfiguration, String... jarFiles) {
		return new RemoteEnvironment(host, port, clientConfiguration, jarFiles, null);
	}

	/**
	 * Creates a {@link RemoteEnvironment}. The remote environment sends (parts of) the program
	 * to a cluster for execution. Note that all file paths used in the program must be accessible from the
	 * cluster. The execution will use the specified parallelism.
	 *
	 * @param host The host name or address of the master (JobManager), where the program should be executed.
	 * @param port The port of the master (JobManager), where the program should be executed.
	 * @param parallelism The parallelism to use during the execution.
	 * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses
	 *                 user-defined functions, user-defined input formats, or any libraries, those must be
	 *                 provided in the JAR files.
	 * @return A remote environment that executes the program on a cluster.
	 */
	public static ExecutionEnvironment createRemoteEnvironment(String host, int port, int parallelism, String... jarFiles) {
		RemoteEnvironment rec = new RemoteEnvironment(host, port, jarFiles);
		rec.setParallelism(parallelism);
		return rec;
	}

	/**
	 * Sets the default parallelism that will be used for the local execution environment created by
	 * {@link #createLocalEnvironment()}.
	 *
	 * @param parallelism The parallelism to use as the default local parallelism.
	 */
	public static void setDefaultLocalParallelism(int parallelism) {
		defaultLocalDop = parallelism;
	}

	// --------------------------------------------------------------------------------------------
	//  Methods to control the context environment and creation of explicit environments other
	//  than the context environment
	// --------------------------------------------------------------------------------------------

	/**
	 * Sets a context environment factory, that creates the context environment for running programs
	 * with pre-configured environments. Examples are running programs from the command line, and
	 * running programs in the Scala shell.
	 *
	 * <p>When the context environment factory is set, no other environments can be explicitly used.
	 *
	 * @param ctx The context environment factory.
	 */
	protected static void initializeContextEnvironment(ExecutionEnvironmentFactory ctx) {
		contextEnvironmentFactory = Preconditions.checkNotNull(ctx);
	}

	/**
	 * Un-sets the context environment factory. After this method is called, the call to
	 * {@link #getExecutionEnvironment()} will again return a default local execution environment, and
	 * it is possible to explicitly instantiate the LocalEnvironment and the RemoteEnvironment.
	 */
	protected static void resetContextEnvironment() {
		contextEnvironmentFactory = null;
	}

	/**
	 * Checks whether it is currently permitted to explicitly instantiate a LocalEnvironment
	 * or a RemoteEnvironment.
	 *
	 * @return True, if it is possible to explicitly instantiate a LocalEnvironment or a
	 *         RemoteEnvironment, false otherwise.
	 */
	@Internal
	public static boolean areExplicitEnvironmentsAllowed() {
		return contextEnvironmentFactory == null;
	}
}
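A quick usage sketch of these factory methods. The host name, port and jar path are placeholders, not values taken from the source:

import org.apache.flink.api.java.ExecutionEnvironment;

public class EnvFactorySketch {
	public static void main(String[] args) throws Exception {
		// Runs in the current JVM with an explicit parallelism of 4.
		ExecutionEnvironment local = ExecutionEnvironment.createLocalEnvironment(4);

		// Ships the listed jar to the JobManager at example-host:6123 (placeholder endpoint).
		ExecutionEnvironment remote = ExecutionEnvironment.createRemoteEnvironment(
				"example-host", 6123, "/path/to/job.jar");
		remote.setParallelism(8);

		// Both environments are used the same way afterwards: define sources and
		// transformations, then call execute() to trigger the actual run.
	}
}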




/**
	 * Triggers the program execution. The environment will execute all parts of
	 * the program that have resulted in a "sink" operation. Sink operations are
	 * for example printing results or forwarding them to a message queue.
	 *
	 * <p>The program execution will be logged and displayed with a generated
	 * default name.
	 *
	 * @return The result of the job execution, containing elapsed time and accumulators.
	 * @throws Exception which occurs during job execution.
	 */
	public JobExecutionResult execute() throws Exception {
		return execute(DEFAULT_JOB_NAME);
	}


/**
	 * Triggers the program execution. The environment will execute all parts of
	 * the program that have resulted in a "sink" operation. Sink operations are
	 * for example printing results or forwarding them to a message queue.
	 *
	 * <p>The program execution will be logged and displayed with the provided name.
	 *
	 * @param jobName Desired name of the job.
	 * @return The result of the job execution, containing elapsed time and accumulators.
	 * @throws Exception which occurs during job execution.
	 */
	public abstract JobExecutionResult execute(String jobName) throws Exception;
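Seen from the user side, a minimal sketch of these two methods (the job name is arbitrary): nothing runs while sources, transformations and sinks are declared; only the call to execute() builds and submits the job.

import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ExecuteSketch {
	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		// print() registers a sink transformation with the environment; nothing runs yet.
		env.fromElements(1, 2, 3).print();

		// execute() would use the default job name; execute(jobName) sets an explicit one.
		JobExecutionResult result = env.execute("execute-sketch");
		System.out.println("Job took " + result.getNetRuntime() + " ms");
	}
}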


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.environment;

import org.apache.flink.annotation.Public;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster;

import org.apache.flink.streaming.api.graph.StreamGraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The LocalStreamEnvironment is a StreamExecutionEnvironment that runs the program locally,
 * multi-threaded, in the JVM where the environment is instantiated. It spawns an embedded
 * Flink cluster in the background and executes the program on that cluster.
 *
 * <p>When this environment is instantiated, it uses a default parallelism of {@code 1}. The default
 * parallelism can be set via {@link #setParallelism(int)}.
 *
 * <p>Local environments can also be instantiated through {@link StreamExecutionEnvironment#createLocalEnvironment()}
 * and {@link StreamExecutionEnvironment#createLocalEnvironment(int)}. The former version will pick a
 * default parallelism equal to the number of hardware contexts in the local machine.
 */
@Public
public class LocalStreamEnvironment extends StreamExecutionEnvironment {

	private static final Logger LOG = LoggerFactory.getLogger(LocalStreamEnvironment.class);

	/** The configuration to use for the local cluster. */
	private final Configuration conf;

	/**
	 * Creates a new local stream environment that uses the default configuration.
	 */
	public LocalStreamEnvironment() {
		this(null);
	}

	/**
	 * Creates a new local stream environment that configures its local executor with the given configuration.
	 *
	 * @param config The configuration used to configure the local executor.
	 */
	public LocalStreamEnvironment(Configuration config) {
		if (!ExecutionEnvironment.areExplicitEnvironmentsAllowed()) {
			throw new InvalidProgramException(
					"The LocalStreamEnvironment cannot be used when submitting a program through a client, " +
							"or running in a TestEnvironment context.");
		}
		this.conf = config == null ? new Configuration() : config;
	}

	/**
	 * Executes the JobGraph of this program on a mini cluster with a user-specified name.
	 *
	 * @param jobName name of the job
	 * @return The result of the job execution, containing elapsed time and accumulators.
	 */
	@Override
	public JobExecutionResult execute(String jobName) throws Exception {
		// transform the streaming program into a JobGraph
		StreamGraph streamGraph = getStreamGraph();
		streamGraph.setJobName(jobName);

		JobGraph jobGraph = streamGraph.getJobGraph();

		Configuration configuration = new Configuration();
		configuration.addAll(jobGraph.getJobConfiguration());

		configuration.setLong(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, -1L);
		configuration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, jobGraph.getMaximumParallelism());

		// add (and override) the settings with what the user defined
		configuration.addAll(this.conf);

		if (LOG.isInfoEnabled()) {
			LOG.info("Running job on local embedded Flink mini cluster");
		}

		LocalFlinkMiniCluster exec = new LocalFlinkMiniCluster(configuration, true);
		try {
			exec.start();
			return exec.submitJobAndWait(jobGraph, getConfig().isSysoutLoggingEnabled());
		}
		finally {
			transformations.clear();
			exec.stop();
		}
	}
}
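A hedged sketch of how this class is typically reached from user code: when no context environment is set, createLocalEnvironment(int) hands back a LocalStreamEnvironment, and its execute(jobName) above performs the StreamGraph -> JobGraph -> mini-cluster submission.

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class LocalExecuteSketch {
	public static void main(String[] args) throws Exception {
		// Explicit local environment with parallelism 2; an embedded mini cluster
		// is started inside execute() and stopped when the job finishes.
		StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(2);

		env.fromElements("a", "b", "c").print();

		// Follows the path shown above: getStreamGraph() -> getJobGraph() -> submitJobAndWait().
		env.execute("local-execute-sketch");
	}
}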


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.graph;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.InputTypeConfigurable;
import org.apache.flink.api.java.typeutils.MissingTypeInfo;
import org.apache.flink.optimizer.plan.StreamingPlan;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.operators.OutputTypeConfigurable;
import org.apache.flink.streaming.api.operators.StoppableStreamSource;
import org.apache.flink.streaming.api.operators.StreamOperator;
import org.apache.flink.streaming.api.operators.StreamSource;
import org.apache.flink.streaming.api.operators.TwoInputStreamOperator;
import org.apache.flink.runtime.state.AbstractStateBackend;
import org.apache.flink.streaming.runtime.partitioner.ForwardPartitioner;
import org.apache.flink.streaming.runtime.partitioner.RebalancePartitioner;
import org.apache.flink.streaming.runtime.partitioner.StreamPartitioner;
import org.apache.flink.streaming.runtime.tasks.OneInputStreamTask;
import org.apache.flink.streaming.runtime.tasks.SourceStreamTask;
import org.apache.flink.streaming.runtime.tasks.StoppableSourceStreamTask;
import org.apache.flink.streaming.runtime.tasks.StreamIterationHead;
import org.apache.flink.streaming.runtime.tasks.StreamIterationTail;
import org.apache.flink.streaming.runtime.tasks.TwoInputStreamTask;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class representing the streaming topology. It contains all the information
 * necessary to build the jobgraph for the execution.
 * 
 */
@Internal
public class StreamGraph extends StreamingPlan {
	
	private static final Logger LOG = LoggerFactory.getLogger(StreamGraph.class);

	private String jobName = StreamExecutionEnvironment.DEFAULT_JOB_NAME;

	private final StreamExecutionEnvironment environment;
	private final ExecutionConfig executionConfig;
	private final CheckpointConfig checkpointConfig;
	
	private boolean chaining;

	private Map<Integer, StreamNode> streamNodes;
	private Set<Integer> sources;
	private Set<Integer> sinks;
	private Map<Integer, Tuple2<Integer, List<String>>> virtualSelectNodes;
	private Map<Integer, Tuple2<Integer, StreamPartitioner<?>>> virtuaPartitionNodes;

	protected Map<Integer, String> vertexIDtoBrokerID;
	protected Map<Integer, Long> vertexIDtoLoopTimeout;
	private AbstractStateBackend stateBackend;
	private Set<Tuple2<StreamNode, StreamNode>> iterationSourceSinkPairs;


	public StreamGraph(StreamExecutionEnvironment environment) {
		this.environment = environment;
		this.executionConfig = environment.getConfig();
		this.checkpointConfig = environment.getCheckpointConfig();

		// create an empty new stream graph.
		clear();
	}

	/**
	 * Remove all registered nodes etc.
	 */
	public void clear() {
		streamNodes = new HashMap<>();
		virtualSelectNodes = new HashMap<>();
		virtuaPartitionNodes = new HashMap<>();
		vertexIDtoBrokerID = new HashMap<>();
		vertexIDtoLoopTimeout  = new HashMap<>();
		iterationSourceSinkPairs = new HashSet<>();
		sources = new HashSet<>();
		sinks = new HashSet<>();
	}
	
	
	public StreamExecutionEnvironment getEnvironment() {
		return environment;
	}

	public ExecutionConfig getExecutionConfig() {
		return executionConfig;
	}
	
	public CheckpointConfig getCheckpointConfig() {
		return checkpointConfig;
	}

	public String getJobName() {
		return jobName;
	}

	public void setJobName(String jobName) {
		this.jobName = jobName;
	}

	public void setChaining(boolean chaining) {
		this.chaining = chaining;
	}

	public void setStateBackend(AbstractStateBackend backend) {
		this.stateBackend = backend;
	}

	public AbstractStateBackend getStateBackend() {
		return this.stateBackend;
	}

	// Checkpointing
	
	public boolean isChainingEnabled() {
		return chaining;
	}
	

	public boolean isIterative() {
		return !vertexIDtoLoopTimeout.isEmpty();
	}

	public  void addSource(Integer vertexID,
		String slotSharingGroup,
		StreamOperator operatorObject,
		TypeInformation inTypeInfo,
		TypeInformation outTypeInfo,
		String operatorName) {
		addOperator(vertexID, slotSharingGroup, operatorObject, inTypeInfo, outTypeInfo, operatorName);
		sources.add(vertexID);
	}

	public  void addSink(Integer vertexID,
		String slotSharingGroup,
		StreamOperator operatorObject,
		TypeInformation inTypeInfo,
		TypeInformation outTypeInfo,
		String operatorName) {
		addOperator(vertexID, slotSharingGroup, operatorObject, inTypeInfo, outTypeInfo, operatorName);
		sinks.add(vertexID);
	}

	public  void addOperator(
			Integer vertexID,
			String slotSharingGroup,
			StreamOperator operatorObject,
			TypeInformation inTypeInfo,
			TypeInformation outTypeInfo,
			String operatorName) {

		if (operatorObject instanceof StoppableStreamSource) {
			addNode(vertexID, slotSharingGroup, StoppableSourceStreamTask.class, operatorObject, operatorName);
		} else if (operatorObject instanceof StreamSource) {
			addNode(vertexID, slotSharingGroup, SourceStreamTask.class, operatorObject, operatorName);
		} else {
			addNode(vertexID, slotSharingGroup, OneInputStreamTask.class, operatorObject, operatorName);
		}

		TypeSerializer inSerializer = inTypeInfo != null && !(inTypeInfo instanceof MissingTypeInfo) ? inTypeInfo.createSerializer(executionConfig) : null;

		TypeSerializer outSerializer = outTypeInfo != null && !(outTypeInfo instanceof MissingTypeInfo) ? outTypeInfo.createSerializer(executionConfig) : null;

		setSerializers(vertexID, inSerializer, null, outSerializer);

		if (operatorObject instanceof OutputTypeConfigurable) {
			@SuppressWarnings("unchecked")
			OutputTypeConfigurable outputTypeConfigurable = (OutputTypeConfigurable) operatorObject;
			// sets the output type which must be known at StreamGraph creation time
			outputTypeConfigurable.setOutputType(outTypeInfo, executionConfig);
		}

		if (operatorObject instanceof InputTypeConfigurable) {
			InputTypeConfigurable inputTypeConfigurable = (InputTypeConfigurable) operatorObject;
			inputTypeConfigurable.setInputType(inTypeInfo, executionConfig);
		}

		if (LOG.isDebugEnabled()) {
			LOG.debug("Vertex: {}", vertexID);
		}
	}

	public  void addCoOperator(
			Integer vertexID,
			String slotSharingGroup,
			TwoInputStreamOperator taskOperatorObject,
			TypeInformation in1TypeInfo,
			TypeInformation in2TypeInfo,
			TypeInformation outTypeInfo,
			String operatorName) {

		addNode(vertexID, slotSharingGroup, TwoInputStreamTask.class, taskOperatorObject, operatorName);

		TypeSerializer outSerializer = (outTypeInfo != null) && !(outTypeInfo instanceof MissingTypeInfo) ?
				outTypeInfo.createSerializer(executionConfig) : null;

		setSerializers(vertexID, in1TypeInfo.createSerializer(executionConfig), in2TypeInfo.createSerializer(executionConfig), outSerializer);

		if (taskOperatorObject instanceof OutputTypeConfigurable) {
			@SuppressWarnings("unchecked")
			OutputTypeConfigurable outputTypeConfigurable = (OutputTypeConfigurable) taskOperatorObject;
			// sets the output type which must be known at StreamGraph creation time
			outputTypeConfigurable.setOutputType(outTypeInfo, executionConfig);
		}

		if (LOG.isDebugEnabled()) {
			LOG.debug("CO-TASK: {}", vertexID);
		}
	}

	protected StreamNode addNode(Integer vertexID,
		String slotSharingGroup,
		Class vertexClass,
		StreamOperator operatorObject,
		String operatorName) {

		if (streamNodes.containsKey(vertexID)) {
			throw new RuntimeException("Duplicate vertexID " + vertexID);
		}

		StreamNode vertex = new StreamNode(environment,
			vertexID,
			slotSharingGroup,
			operatorObject,
			operatorName,
			new ArrayList>(),
			vertexClass);

		streamNodes.put(vertexID, vertex);

		return vertex;
	}

	/**
	 * Adds a new virtual node that is used to connect a downstream vertex to only the outputs
	 * with the selected names.
	 *
	 * When adding an edge from the virtual node to a downstream node the connection will be made
	 * to the original node, only with the selected names given here.
	 *
	 * @param originalId ID of the node that should be connected to.
	 * @param virtualId ID of the virtual node.
	 * @param selectedNames The selected names.
	 */
	public void addVirtualSelectNode(Integer originalId, Integer virtualId, List selectedNames) {

		if (virtualSelectNodes.containsKey(virtualId)) {
			throw new IllegalStateException("Already has virtual select node with id " + virtualId);
		}

		virtualSelectNodes.put(virtualId,
				new Tuple2>(originalId, selectedNames));
	}

	/**
	 * Adds a new virtual node that is used to connect a downstream vertex to an input with a certain
	 * partitioning.
	 *
	 * When adding an edge from the virtual node to a downstream node the connection will be made
	 * to the original node, but with the partitioning given here.
	 *
	 * @param originalId ID of the node that should be connected to.
	 * @param virtualId ID of the virtual node.
	 * @param partitioner The partitioner
	 */
	public void addVirtualPartitionNode(Integer originalId, Integer virtualId, StreamPartitioner partitioner) {

		if (virtuaPartitionNodes.containsKey(virtualId)) {
			throw new IllegalStateException("Already has virtual partition node with id " + virtualId);
		}

		virtuaPartitionNodes.put(virtualId,
				new Tuple2>(originalId, partitioner));
	}

	/**
	 * Determines the slot sharing group of an operation across virtual nodes.
	 */
	public String getSlotSharingGroup(Integer id) {
		if (virtualSelectNodes.containsKey(id)) {
			Integer mappedId = virtualSelectNodes.get(id).f0;
			return getSlotSharingGroup(mappedId);
		} else if (virtuaPartitionNodes.containsKey(id)) {
			Integer mappedId = virtuaPartitionNodes.get(id).f0;
			return getSlotSharingGroup(mappedId);
		} else {
			StreamNode node = getStreamNode(id);
			return node.getSlotSharingGroup();
		}
	}

	public void addEdge(Integer upStreamVertexID, Integer downStreamVertexID, int typeNumber) {
		addEdgeInternal(upStreamVertexID,
				downStreamVertexID,
				typeNumber,
				null,
				new ArrayList());

	}

	private void addEdgeInternal(Integer upStreamVertexID,
			Integer downStreamVertexID,
			int typeNumber,
			StreamPartitioner partitioner,
			List outputNames) {


		if (virtualSelectNodes.containsKey(upStreamVertexID)) {
			int virtualId = upStreamVertexID;
			upStreamVertexID = virtualSelectNodes.get(virtualId).f0;
			if (outputNames.isEmpty()) {
				// selections that happen downstream override earlier selections
				outputNames = virtualSelectNodes.get(virtualId).f1;
			}
			addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, outputNames);
		} else if (virtuaPartitionNodes.containsKey(upStreamVertexID)) {
			int virtualId = upStreamVertexID;
			upStreamVertexID = virtuaPartitionNodes.get(virtualId).f0;
			if (partitioner == null) {
				partitioner = virtuaPartitionNodes.get(virtualId).f1;
			}
			addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, outputNames);
		} else {
			StreamNode upstreamNode = getStreamNode(upStreamVertexID);
			StreamNode downstreamNode = getStreamNode(downStreamVertexID);

			// If no partitioner was specified and the parallelism of upstream and downstream
			// operator matches use forward partitioning, use rebalance otherwise.
			if (partitioner == null && upstreamNode.getParallelism() == downstreamNode.getParallelism()) {
				partitioner = new ForwardPartitioner();
			} else if (partitioner == null) {
				partitioner = new RebalancePartitioner();
			}

			if (partitioner instanceof ForwardPartitioner) {
				if (upstreamNode.getParallelism() != downstreamNode.getParallelism()) {
					throw new UnsupportedOperationException("Forward partitioning does not allow " +
							"change of parallelism. Upstream operation: " + upstreamNode + " parallelism: " + upstreamNode.getParallelism() +
							", downstream operation: " + downstreamNode + " parallelism: " + downstreamNode.getParallelism() +
							" You must use another partitioning strategy, such as broadcast, rebalance, shuffle or global.");
				}
			}

			StreamEdge edge = new StreamEdge(upstreamNode, downstreamNode, typeNumber, outputNames, partitioner);

			getStreamNode(edge.getSourceId()).addOutEdge(edge);
			getStreamNode(edge.getTargetId()).addInEdge(edge);
		}
	}

	public  void addOutputSelector(Integer vertexID, OutputSelector outputSelector) {
		if (virtuaPartitionNodes.containsKey(vertexID)) {
			addOutputSelector(virtuaPartitionNodes.get(vertexID).f0, outputSelector);
		} else if (virtualSelectNodes.containsKey(vertexID)) {
			addOutputSelector(virtualSelectNodes.get(vertexID).f0, outputSelector);
		} else {
			getStreamNode(vertexID).addOutputSelector(outputSelector);

			if (LOG.isDebugEnabled()) {
				LOG.debug("Outputselector set for {}", vertexID);
			}
		}

	}

	public void setParallelism(Integer vertexID, int parallelism) {
		if (getStreamNode(vertexID) != null) {
			getStreamNode(vertexID).setParallelism(parallelism);
		}
	}

	public void setOneInputStateKey(Integer vertexID, KeySelector keySelector, TypeSerializer keySerializer) {
		StreamNode node = getStreamNode(vertexID);
		node.setStatePartitioner1(keySelector);
		node.setStateKeySerializer(keySerializer);
	}

	public void setTwoInputStateKey(Integer vertexID, KeySelector keySelector1, KeySelector keySelector2, TypeSerializer keySerializer) {
		StreamNode node = getStreamNode(vertexID);
		node.setStatePartitioner1(keySelector1);
		node.setStatePartitioner2(keySelector2);
		node.setStateKeySerializer(keySerializer);
	}

	public void setBufferTimeout(Integer vertexID, long bufferTimeout) {
		if (getStreamNode(vertexID) != null) {
			getStreamNode(vertexID).setBufferTimeout(bufferTimeout);
		}
	}

	public void setSerializers(Integer vertexID, TypeSerializer in1, TypeSerializer in2, TypeSerializer out) {
		StreamNode vertex = getStreamNode(vertexID);
		vertex.setSerializerIn1(in1);
		vertex.setSerializerIn2(in2);
		vertex.setSerializerOut(out);
	}

	public void setSerializersFrom(Integer from, Integer to) {
		StreamNode fromVertex = getStreamNode(from);
		StreamNode toVertex = getStreamNode(to);

		toVertex.setSerializerIn1(fromVertex.getTypeSerializerOut());
		toVertex.setSerializerOut(fromVertex.getTypeSerializerIn1());
	}

	public  void setOutType(Integer vertexID, TypeInformation outType) {
		getStreamNode(vertexID).setSerializerOut(outType.createSerializer(executionConfig));
	}

	public  void setOperator(Integer vertexID, StreamOperator operatorObject) {
		getStreamNode(vertexID).setOperator(operatorObject);
	}

	public void setInputFormat(Integer vertexID, InputFormat inputFormat) {
		getStreamNode(vertexID).setInputFormat(inputFormat);
	}

	void setTransformationId(Integer nodeId, String transformationId) {
		StreamNode node = streamNodes.get(nodeId);
		if (node != null) {
			node.setTransformationId(transformationId);
		}
	}

	public StreamNode getStreamNode(Integer vertexID) {
		return streamNodes.get(vertexID);
	}

	protected Collection getVertexIDs() {
		return streamNodes.keySet();
	}

	public List getStreamEdges(int sourceId, int targetId) {

		List result = new ArrayList<>();
		for (StreamEdge edge : getStreamNode(sourceId).getOutEdges()) {
			if (edge.getTargetId() == targetId) {
				result.add(edge);
			}
		}

		if (result.isEmpty()) {
			throw new RuntimeException("No such edge in stream graph: " + sourceId + " -> " + targetId);
		}

		return result;
	}

	public Collection getSourceIDs() {
		return sources;
	}


	public Collection getSinkIDs() {
		return sinks;
	}

	public Collection getStreamNodes() {
		return streamNodes.values();
	}

	public Set>> getOperators() {
		Set>> operatorSet = new HashSet<>();
		for (StreamNode vertex : streamNodes.values()) {
			operatorSet.add(new Tuple2>(vertex.getId(), vertex
					.getOperator()));
		}
		return operatorSet;
	}

	public String getBrokerID(Integer vertexID) {
		return vertexIDtoBrokerID.get(vertexID);
	}

	public long getLoopTimeout(Integer vertexID) {
		return vertexIDtoLoopTimeout.get(vertexID);
	}

	public Tuple2 createIterationSourceAndSink(int loopId, int sourceId, int sinkId, long timeout, int parallelism) {
		StreamNode source = this.addNode(sourceId,
			null,
			StreamIterationHead.class,
			null,
			"IterationSource-" + loopId);
		sources.add(source.getId());
		setParallelism(source.getId(), parallelism);

		StreamNode sink = this.addNode(sinkId,
			null,
			StreamIterationTail.class,
			null,
			"IterationSink-" + loopId);
		sinks.add(sink.getId());
		setParallelism(sink.getId(), parallelism);

		iterationSourceSinkPairs.add(new Tuple2<>(source, sink));

		this.vertexIDtoBrokerID.put(source.getId(), "broker-" + loopId);
		this.vertexIDtoBrokerID.put(sink.getId(), "broker-" + loopId);
		this.vertexIDtoLoopTimeout.put(source.getId(), timeout);
		this.vertexIDtoLoopTimeout.put(sink.getId(), timeout);

		return new Tuple2<>(source, sink);
	}

	public Set> getIterationSourceSinkPairs() {
		return iterationSourceSinkPairs;
	}

	private void removeEdge(StreamEdge edge) {
		edge.getSourceVertex().getOutEdges().remove(edge);
		edge.getTargetVertex().getInEdges().remove(edge);
	}

	private void removeVertex(StreamNode toRemove) {
		Set edgesToRemove = new HashSet<>();

		edgesToRemove.addAll(toRemove.getInEdges());
		edgesToRemove.addAll(toRemove.getOutEdges());

		for (StreamEdge edge : edgesToRemove) {
			removeEdge(edge);
		}
		streamNodes.remove(toRemove.getId());
	}

	/**
	 * Gets the assembled {@link JobGraph}.
	 */
	@SuppressWarnings("deprecation")
	public JobGraph getJobGraph() {
		// temporarily forbid checkpointing for iterative jobs
		if (isIterative() && checkpointConfig.isCheckpointingEnabled() && !checkpointConfig.isForceCheckpointing()) {
			throw new UnsupportedOperationException(
					"Checkpointing is currently not supported by default for iterative jobs, as we cannot guarantee exactly once semantics. "
							+ "State checkpoints happen normally, but records in-transit during the snapshot will be lost upon failure. "
							+ "\nThe user can force enable state checkpoints with the reduced guarantees by calling: env.enableCheckpointing(interval,true)");
		}

		StreamingJobGraphGenerator jobgraphGenerator = new StreamingJobGraphGenerator(this);

		return jobgraphGenerator.createJobGraph();
	}

	@Override
	public String getStreamingPlanAsJSON() {
		try {
			return new JSONGenerator(this).getJSON();
		}
		catch (Exception e) {
			throw new RuntimeException("JSON plan creation failed", e);
		}
	}

	@Override
	public void dumpStreamingPlanAsJSON(File file) throws IOException {
		PrintWriter pw = null;
		try {
			pw = new PrintWriter(new FileOutputStream(file), false);
			pw.write(getStreamingPlanAsJSON());
			pw.flush();

		} finally {
			if (pw != null) {
				pw.close();
			}
		}
	}

	public static enum ResourceStrategy {
		DEFAULT, ISOLATE, NEWGROUP
	}
}
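To make the relationship between the two graphs above concrete, here is a small sketch that builds a trivial pipeline and inspects both plans without submitting anything (assuming getStreamGraph() is accessible from user code, as it is in this code base):

import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.graph.StreamGraph;

public class PlanInspectionSketch {
	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(2);
		env.fromElements("a", "b", "c").print();

		// The same two steps LocalStreamEnvironment.execute() performs before submitting:
		StreamGraph streamGraph = env.getStreamGraph();   // logical plan built from the transformations
		JobGraph jobGraph = streamGraph.getJobGraph();    // physical plan the cluster accepts

		System.out.println(streamGraph.getStreamingPlanAsJSON());
		System.out.println("Job vertices: " + jobGraph.getNumberOfVertices());
	}
}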
 
  
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.jobgraph;

import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.blob.BlobClient;
import org.apache.flink.runtime.blob.BlobKey;
import org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings;

import java.io.IOException;
import java.io.Serializable;
import java.net.InetSocketAddress;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.flink.api.common.restartstrategy.RestartStrategies;
/**
 * The JobGraph represents a Flink dataflow program, at the low level that the JobManager accepts.
 * All programs from higher level APIs are transformed into JobGraphs.
 *
 * <p>The JobGraph is a graph of vertices and intermediate results that are connected together to
 * form a DAG. Note that iterations (feedback edges) are currently not encoded inside the JobGraph
 * but inside certain special vertices that establish the feedback channel amongst themselves.
 *
 * <p>The JobGraph defines the job-wide configuration settings, while each vertex and intermediate result
 * define the characteristics of the concrete operation and intermediate data.
*/ public class JobGraph implements Serializable { private static final long serialVersionUID = 1L; // -------------------------------------------------------------------------------------------- // Members that define the structure / topology of the graph // -------------------------------------------------------------------------------------------- /** List of task vertices included in this job graph. */ private final Map taskVertices = new LinkedHashMap(); /** The job configuration attached to this job. */ private final Configuration jobConfiguration = new Configuration(); /** Set of JAR files required to run this job. */ private final List userJars = new ArrayList(); /** Set of blob keys identifying the JAR files required to run this job. */ private final List userJarBlobKeys = new ArrayList(); /** ID of this job. May be set if specific job id is desired (e.g. session management) */ private final JobID jobID; /** Name of this job. */ private final String jobName; /** Configuration which defines which restart strategy to use for the job recovery */ private RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration; /** The number of seconds after which the corresponding ExecutionGraph is removed at the * job manager after it has been executed. */ private long sessionTimeout = 0; /** flag to enable queued scheduling */ private boolean allowQueuedScheduling; /** The mode in which the job is scheduled */ private ScheduleMode scheduleMode = ScheduleMode.FROM_SOURCES; /** The settings for asynchronous snapshots */ private JobSnapshottingSettings snapshotSettings; /** List of classpaths required to run this job. */ private List classpaths = Collections.emptyList(); // -------------------------------------------------------------------------------------------- /** * Constructs a new job graph with no name and a random job ID. */ public JobGraph() { this((String) null); } /** * Constructs a new job graph with the given name, a random job ID. * * @param jobName The name of the job */ public JobGraph(String jobName) { this(null, jobName); } /** * Constructs a new job graph with the given name and a random job ID if null supplied as an id. * * @param jobId The id of the job. A random ID is generated, if {@code null} is passed. * @param jobName The name of the job. */ public JobGraph(JobID jobId, String jobName) { this.jobID = jobId == null ? new JobID() : jobId; this.jobName = jobName == null ? "(unnamed job)" : jobName; } /** * Constructs a new job graph with no name and a random job ID if null supplied as an id. * * @param vertices The vertices to add to the graph. */ public JobGraph(JobVertex... vertices) { this(null, vertices); } /** * Constructs a new job graph with the given name and a random job ID. * * @param jobName The name of the job. * @param vertices The vertices to add to the graph. */ public JobGraph(String jobName, JobVertex... vertices) { this(null, jobName, vertices); } /** * Constructs a new job graph with the given name and a random job ID if null supplied as an id. * * @param jobId The id of the job. A random ID is generated, if {@code null} is passed. * @param jobName The name of the job. * @param vertices The vertices to add to the graph. */ public JobGraph(JobID jobId, String jobName, JobVertex... vertices) { this(jobId, jobName); for (JobVertex vertex : vertices) { addVertex(vertex); } } // -------------------------------------------------------------------------------------------- /** * Returns the ID of the job. 
* * @return the ID of the job */ public JobID getJobID() { return this.jobID; } /** * Returns the name assigned to the job graph. * * @return the name assigned to the job graph */ public String getName() { return this.jobName; } /** * Returns the configuration object for this job. Job-wide parameters should be set into that * configuration object. * * @return The configuration object for this job. */ public Configuration getJobConfiguration() { return this.jobConfiguration; } /** * Sets the restart strategy configuration. This configuration specifies the restart strategy * to be used by the ExecutionGraph in case of a restart. * * @param restartStrategyConfiguration Restart strategy configuration to be set */ public void setRestartStrategyConfiguration(RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration) { this.restartStrategyConfiguration = restartStrategyConfiguration; } /** * Gets the restart strategy configuration * * @return Restart strategy configuration to be used */ public RestartStrategies.RestartStrategyConfiguration getRestartStrategyConfiguration() { return restartStrategyConfiguration; } /** * Gets the timeout after which the corresponding ExecutionGraph is removed at the * job manager after it has been executed. * @return a timeout as a long in seconds. */ public long getSessionTimeout() { return sessionTimeout; } /** * Sets the timeout of the session in seconds. The timeout specifies how long a job will be kept * in the job manager after it finishes. * @param sessionTimeout The timeout in seconds */ public void setSessionTimeout(long sessionTimeout) { this.sessionTimeout = sessionTimeout; } public void setAllowQueuedScheduling(boolean allowQueuedScheduling) { this.allowQueuedScheduling = allowQueuedScheduling; } public boolean getAllowQueuedScheduling() { return allowQueuedScheduling; } public void setScheduleMode(ScheduleMode scheduleMode) { this.scheduleMode = scheduleMode; } public ScheduleMode getScheduleMode() { return scheduleMode; } /** * Adds a new task vertex to the job graph if it is not already included. * * @param vertex * the new task vertex to be added */ public void addVertex(JobVertex vertex) { final JobVertexID id = vertex.getID(); JobVertex previous = taskVertices.put(id, vertex); // if we had a prior association, restore and throw an exception if (previous != null) { taskVertices.put(id, previous); throw new IllegalArgumentException("The JobGraph already contains a vertex with that id."); } } /** * Returns an Iterable to iterate all vertices registered with the job graph. * * @return an Iterable to iterate all vertices registered with the job graph */ public Iterable getVertices() { return this.taskVertices.values(); } /** * Returns an array of all job vertices that are registered with the job graph. The order in which the vertices * appear in the list is not defined. * * @return an array of all job vertices that are registered with the job graph */ public JobVertex[] getVerticesAsArray() { return this.taskVertices.values().toArray(new JobVertex[this.taskVertices.size()]); } /** * Returns the number of all vertices. * * @return The number of all vertices. */ public int getNumberOfVertices() { return this.taskVertices.size(); } /** * Sets the settings for asynchronous snapshots. A value of {@code null} means that * snapshotting is not enabled. * * @param settings The snapshot settings, or null, to disable snapshotting. 
*/ public void setSnapshotSettings(JobSnapshottingSettings settings) { this.snapshotSettings = settings; } /** * Gets the settings for asynchronous snapshots. This method returns null, when * snapshotting is not enabled. * * @return The snapshot settings, or null, if snapshotting is not enabled. */ public JobSnapshottingSettings getSnapshotSettings() { return snapshotSettings; } /** * Searches for a vertex with a matching ID and returns it. * * @param id * the ID of the vertex to search for * @return the vertex with the matching ID or null if no vertex with such ID could be found */ public JobVertex findVertexByID(JobVertexID id) { return this.taskVertices.get(id); } /** * Sets the classpaths required to run the job on a task manager. * * @param paths paths of the directories/JAR files required to run the job on a task manager */ public void setClasspaths(List paths) { classpaths = paths; } public List getClasspaths() { return classpaths; } /** * Sets the savepoint path to rollback the deployment to. * * @param savepointPath The savepoint path */ public void setSavepointPath(String savepointPath) { if (savepointPath != null) { if (snapshotSettings == null) { throw new IllegalStateException("Checkpointing disabled"); } else { snapshotSettings.setSavepointPath(savepointPath); } } } // -------------------------------------------------------------------------------------------- public List getVerticesSortedTopologicallyFromSources() throws InvalidProgramException { // early out on empty lists if (this.taskVertices.isEmpty()) { return Collections.emptyList(); } List sorted = new ArrayList(this.taskVertices.size()); Set remaining = new LinkedHashSet(this.taskVertices.values()); // start by finding the vertices with no input edges // and the ones with disconnected inputs (that refer to some standalone data set) { Iterator iter = remaining.iterator(); while (iter.hasNext()) { JobVertex vertex = iter.next(); if (vertex.hasNoConnectedInputs()) { sorted.add(vertex); iter.remove(); } } } int startNodePos = 0; // traverse from the nodes that were added until we found all elements while (!remaining.isEmpty()) { // first check if we have more candidates to start traversing from. 
if not, then the // graph is cyclic, which is not permitted if (startNodePos >= sorted.size()) { throw new InvalidProgramException("The job graph is cyclic."); } JobVertex current = sorted.get(startNodePos++); addNodesThatHaveNoNewPredecessors(current, sorted, remaining); } return sorted; } private void addNodesThatHaveNoNewPredecessors(JobVertex start, List target, Set remaining) { // forward traverse over all produced data sets and all their consumers for (IntermediateDataSet dataSet : start.getProducedDataSets()) { for (JobEdge edge : dataSet.getConsumers()) { // a vertex can be added, if it has no predecessors that are still in the 'remaining' set JobVertex v = edge.getTarget(); if (!remaining.contains(v)) { continue; } boolean hasNewPredecessors = false; for (JobEdge e : v.getInputs()) { // skip the edge through which we came if (e == edge) { continue; } IntermediateDataSet source = e.getSource(); if (remaining.contains(source.getProducer())) { hasNewPredecessors = true; break; } } if (!hasNewPredecessors) { target.add(v); remaining.remove(v); addNodesThatHaveNoNewPredecessors(v, target, remaining); } } } } // -------------------------------------------------------------------------------------------- // Handling of attached JAR files // -------------------------------------------------------------------------------------------- /** * Adds the path of a JAR file required to run the job on a task manager. * * @param jar * path of the JAR file required to run the job on a task manager */ public void addJar(Path jar) { if (jar == null) { throw new IllegalArgumentException(); } if (!userJars.contains(jar)) { userJars.add(jar); } } /** * Adds the BLOB referenced by the key to the JobGraph's dependencies. * * @param key * path of the JAR file required to run the job on a task manager */ public void addBlob(BlobKey key) { if (key == null) { throw new IllegalArgumentException(); } if (!userJarBlobKeys.contains(key)) { userJarBlobKeys.add(key); } } /** * Checks whether the JobGraph has user code JAR files attached. * * @return True, if the JobGraph has user code JAR files attached, false otherwise. */ public boolean hasUsercodeJarFiles() { return this.userJars.size() > 0; } /** * Returns a set of BLOB keys referring to the JAR files required to run this job. * * @return set of BLOB keys referring to the JAR files required to run this job */ public List getUserJarBlobKeys() { return this.userJarBlobKeys; } /** * Uploads the previously added user jar file to the job manager through the job manager's BLOB server. * * @param serverAddress * the network address of the BLOB server * @throws IOException * thrown if an I/O error occurs during the upload */ public void uploadRequiredJarFiles(InetSocketAddress serverAddress) throws IOException { if (this.userJars.isEmpty()) { return; } BlobClient bc = null; try { bc = new BlobClient(serverAddress); for (final Path jar : this.userJars) { final FileSystem fs = jar.getFileSystem(); FSDataInputStream is = null; try { is = fs.open(jar); final BlobKey key = bc.put(is); this.userJarBlobKeys.add(key); } finally { if (is != null) { is.close(); } } } } finally { if (bc != null) { bc.close(); } } } /** * Gets the maximum parallelism of all operations in this job graph. 
* @return The maximum parallelism of this job graph */ public int getMaximumParallelism() { int maxParallelism = -1; for (JobVertex vertex : taskVertices.values()) { maxParallelism = Math.max(vertex.getParallelism(), maxParallelism); } return maxParallelism; } @Override public String toString() { return "JobGraph(jobId: " + jobID + ")"; } }
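For illustration, a small sketch of the JobGraph API shown above, assembled by hand rather than by StreamingJobGraphGenerator. The vertex names and jar path are made up, and the two-argument connectNewDataSetAsInput variant is assumed for this version:

import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.jobgraph.DistributionPattern;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.JobVertex;

public class JobGraphSketch {
	public static void main(String[] args) throws Exception {
		JobVertex source = new JobVertex("source");
		JobVertex sink = new JobVertex("sink");
		source.setParallelism(2);
		sink.setParallelism(2);

		// Wire the sink to consume an intermediate result produced by the source.
		sink.connectNewDataSetAsInput(source, DistributionPattern.ALL_TO_ALL);

		JobGraph jobGraph = new JobGraph("jobgraph-sketch", source, sink);
		jobGraph.addJar(new Path("file:///tmp/udf.jar"));   // placeholder jar path

		// Scheduling and deployment work on this topological order.
		for (JobVertex vertex : jobGraph.getVerticesSortedTopologicallyFromSources()) {
			System.out.println(vertex.getName() + " / parallelism " + vertex.getParallelism());
		}
		System.out.println("max parallelism: " + jobGraph.getMaximumParallelism());
	}
}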

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.streaming.api.graph;

import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.FileSourceFunction;
import org.apache.flink.streaming.api.transformations.CoFeedbackTransformation;
import org.apache.flink.streaming.api.transformations.FeedbackTransformation;
import org.apache.flink.streaming.api.transformations.OneInputTransformation;
import org.apache.flink.streaming.api.transformations.PartitionTransformation;
import org.apache.flink.streaming.api.transformations.SelectTransformation;
import org.apache.flink.streaming.api.transformations.SinkTransformation;
import org.apache.flink.streaming.api.transformations.SourceTransformation;
import org.apache.flink.streaming.api.transformations.SplitTransformation;
import org.apache.flink.streaming.api.transformations.StreamTransformation;
import org.apache.flink.streaming.api.transformations.TwoInputTransformation;
import org.apache.flink.streaming.api.transformations.UnionTransformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * A generator that generates a {@link StreamGraph} from a graph of
 * {@link StreamTransformation StreamTransformations}.
 *
 * <p>This traverses the tree of {@code StreamTransformations} starting from the sinks. At each
 * transformation we recursively transform the inputs, then create a node in the {@code StreamGraph}
 * and add edges from the input Nodes to our newly created node. The transformation methods
 * return the IDs of the nodes in the StreamGraph that represent the input transformation. Several
 * IDs can be returned to be able to deal with feedback transformations and unions.
 *
 * <p>Partitioning, split/select and union don't create actual nodes in the {@code StreamGraph}. For
 * these, we create a virtual node in the {@code StreamGraph} that holds the specific property, i.e.
 * partitioning, selector and so on. When an edge is created from a virtual node to a downstream
 * node the {@code StreamGraph} resolves the id of the original node and creates an edge
 * in the graph with the desired property. For example, if you have this graph:
 *
 * <pre>
 *     Map-1 -> HashPartition-2 -> Map-3
 * </pre>
* * where the numbers represent transformation IDs. We first recurse all the way down. {@code Map-1} * is transformed, i.e. we create a {@code StreamNode} with ID 1. Then we transform the * {@code HashPartition}, for this, we create virtual node of ID 4 that holds the property * {@code HashPartition}. This transformation returns the ID 4. Then we transform the {@code Map-3}. * We add the edge {@code 4 -> 3}. The {@code StreamGraph} resolved the actual node with ID 1 and * creates and edge {@code 1 -> 3} with the property HashPartition. */ @Internal public class StreamGraphGenerator { private static final Logger LOG = LoggerFactory.getLogger(StreamGraphGenerator.class); // The StreamGraph that is being built, this is initialized at the beginning. private StreamGraph streamGraph; private final StreamExecutionEnvironment env; // This is used to assign a unique ID to iteration source/sink protected static Integer iterationIdCounter = 0; public static int getNewIterationNodeId() { iterationIdCounter--; return iterationIdCounter; } // Keep track of which Transforms we have already transformed, this is necessary because // we have loops, i.e. feedback edges. private Map, Collection> alreadyTransformed; /** * Private constructor. The generator should only be invoked using {@link #generate}. */ private StreamGraphGenerator(StreamExecutionEnvironment env) { this.streamGraph = new StreamGraph(env); this.streamGraph.setChaining(env.isChainingEnabled()); this.streamGraph.setStateBackend(env.getStateBackend()); this.env = env; this.alreadyTransformed = new HashMap<>(); } /** * Generates a {@code StreamGraph} by traversing the graph of {@code StreamTransformations} * starting from the given transformations. * * @param env The {@code StreamExecutionEnvironment} that is used to set some parameters of the * job * @param transformations The transformations starting from which to transform the graph * * @return The generated {@code StreamGraph} */ public static StreamGraph generate(StreamExecutionEnvironment env, List> transformations) { return new StreamGraphGenerator(env).generateInternal(transformations); } /** * This starts the actual transformation, beginning from the sinks. */ private StreamGraph generateInternal(List> transformations) { for (StreamTransformation transformation: transformations) { transform(transformation); } return streamGraph; } /** * Transforms one {@code StreamTransformation}. * *

* This checks whether we already transformed it and exits early in that case. If not it * delegates to one of the transformation specific methods. */ private Collection transform(StreamTransformation transform) { if (alreadyTransformed.containsKey(transform)) { return alreadyTransformed.get(transform); } LOG.debug("Transforming " + transform); // call at least once to trigger exceptions about MissingTypeInfo transform.getOutputType(); Collection transformedIds; if (transform instanceof OneInputTransformation) { transformedIds = transformOnInputTransform((OneInputTransformation) transform); } else if (transform instanceof TwoInputTransformation) { transformedIds = transformTwoInputTransform((TwoInputTransformation) transform); } else if (transform instanceof SourceTransformation) { transformedIds = transformSource((SourceTransformation) transform); } else if (transform instanceof SinkTransformation) { transformedIds = transformSink((SinkTransformation) transform); } else if (transform instanceof UnionTransformation) { transformedIds = transformUnion((UnionTransformation) transform); } else if (transform instanceof SplitTransformation) { transformedIds = transformSplit((SplitTransformation) transform); } else if (transform instanceof SelectTransformation) { transformedIds = transformSelect((SelectTransformation) transform); } else if (transform instanceof FeedbackTransformation) { transformedIds = transformFeedback((FeedbackTransformation) transform); } else if (transform instanceof CoFeedbackTransformation) { transformedIds = transformCoFeedback((CoFeedbackTransformation) transform); } else if (transform instanceof PartitionTransformation) { transformedIds = transformPartition((PartitionTransformation) transform); } else { throw new IllegalStateException("Unknown transformation: " + transform); } // need this check because the iterate transformation adds itself before // transforming the feedback edges if (!alreadyTransformed.containsKey(transform)) { alreadyTransformed.put(transform, transformedIds); } if (transform.getBufferTimeout() > 0) { streamGraph.setBufferTimeout(transform.getId(), transform.getBufferTimeout()); } if (transform.getUid() != null) { streamGraph.setTransformationId(transform.getId(), transform.getUid()); } return transformedIds; } /** * Transforms a {@code UnionTransformation}. * *

* This is easy, we only have to transform the inputs and return all the IDs in a list so * that downstream operations can connect to all upstream nodes. */ private Collection transformUnion(UnionTransformation union) { List> inputs = union.getInputs(); List resultIds = new ArrayList<>(); for (StreamTransformation input: inputs) { resultIds.addAll(transform(input)); } return resultIds; } /** * Transforms a {@code PartitionTransformation}. * *

* For this we create a virtual node in the {@code StreamGraph} that holds the partition * property. @see StreamGraphGenerator */ private Collection transformPartition(PartitionTransformation partition) { StreamTransformation input = partition.getInput(); List resultIds = new ArrayList<>(); Collection transformedIds = transform(input); for (Integer transformedId: transformedIds) { int virtualId = StreamTransformation.getNewNodeId(); streamGraph.addVirtualPartitionNode(transformedId, virtualId, partition.getPartitioner()); resultIds.add(virtualId); } return resultIds; } /** * Transforms a {@code SplitTransformation}. * *

* We add the output selector to previously transformed nodes. */ private Collection transformSplit(SplitTransformation split) { StreamTransformation input = split.getInput(); Collection resultIds = transform(input); // the recursive transform call might have transformed this already if (alreadyTransformed.containsKey(split)) { return alreadyTransformed.get(split); } for (int inputId : resultIds) { streamGraph.addOutputSelector(inputId, split.getOutputSelector()); } return resultIds; } /** * Transforms a {@code SelectTransformation}. * *

* For this we create a virtual node in the {@code StreamGraph} holds the selected names. * @see org.apache.flink.streaming.api.graph.StreamGraphGenerator */ private Collection transformSelect(SelectTransformation select) { StreamTransformation input = select.getInput(); Collection resultIds = transform(input); // the recursive transform might have already transformed this if (alreadyTransformed.containsKey(select)) { return alreadyTransformed.get(select); } List virtualResultIds = new ArrayList<>(); for (int inputId : resultIds) { int virtualId = StreamTransformation.getNewNodeId(); streamGraph.addVirtualSelectNode(inputId, virtualId, select.getSelectedNames()); virtualResultIds.add(virtualId); } return virtualResultIds; } /** * Transforms a {@code FeedbackTransformation}. * *

* This will recursively transform the input and the feedback edges. We return the concatenation * of the input IDs and the feedback IDs so that downstream operations can be wired to both. * *

* This is responsible for creating the IterationSource and IterationSink which * are used to feed back the elements. */ private Collection transformFeedback(FeedbackTransformation iterate) { if (iterate.getFeedbackEdges().size() <= 0) { throw new IllegalStateException("Iteration " + iterate + " does not have any feedback edges."); } StreamTransformation input = iterate.getInput(); List resultIds = new ArrayList<>(); // first transform the input stream(s) and store the result IDs Collection inputIds = transform(input); resultIds.addAll(inputIds); // the recursive transform might have already transformed this if (alreadyTransformed.containsKey(iterate)) { return alreadyTransformed.get(iterate); } // create the fake iteration source/sink pair Tuple2 itSourceAndSink = streamGraph.createIterationSourceAndSink( iterate.getId(), getNewIterationNodeId(), getNewIterationNodeId(), iterate.getWaitTime(), iterate.getParallelism()); StreamNode itSource = itSourceAndSink.f0; StreamNode itSink = itSourceAndSink.f1; // We set the proper serializers for the sink/source streamGraph.setSerializers(itSource.getId(), null, null, iterate.getOutputType().createSerializer(env.getConfig())); streamGraph.setSerializers(itSink.getId(), iterate.getOutputType().createSerializer(env.getConfig()), null, null); // also add the feedback source ID to the result IDs, so that downstream operators will // add both as input resultIds.add(itSource.getId()); // at the iterate to the already-seen-set with the result IDs, so that we can transform // the feedback edges and let them stop when encountering the iterate node alreadyTransformed.put(iterate, resultIds); // so that we can determine the slot sharing group from all feedback edges List allFeedbackIds = new ArrayList<>(); for (StreamTransformation feedbackEdge : iterate.getFeedbackEdges()) { Collection feedbackIds = transform(feedbackEdge); allFeedbackIds.addAll(feedbackIds); for (Integer feedbackId: feedbackIds) { streamGraph.addEdge(feedbackId, itSink.getId(), 0 ); } } String slotSharingGroup = determineSlotSharingGroup(null, allFeedbackIds); itSink.setSlotSharingGroup(slotSharingGroup); itSource.setSlotSharingGroup(slotSharingGroup); return resultIds; } /** * Transforms a {@code CoFeedbackTransformation}. * *

* This will only transform feedback edges, the result of this transform will be wired * to the second input of a Co-Transform. The original input is wired directly to the first * input of the downstream Co-Transform. * *

	 * This is responsible for creating the IterationSource and IterationSink which
	 * are used to feed back the elements.
	 */
	private <F> Collection<Integer> transformCoFeedback(CoFeedbackTransformation<F> coIterate) {

		// For Co-Iteration we don't need to transform the input and wire the input to the
		// head operator by returning the input IDs, the input is directly wired to the left
		// input of the co-operation. This transform only needs to return the ids of the feedback
		// edges, since they need to be wired to the second input of the co-operation.

		// create the fake iteration source/sink pair
		Tuple2<StreamNode, StreamNode> itSourceAndSink = streamGraph.createIterationSourceAndSink(
				coIterate.getId(),
				getNewIterationNodeId(),
				getNewIterationNodeId(),
				coIterate.getWaitTime(),
				coIterate.getParallelism());

		StreamNode itSource = itSourceAndSink.f0;
		StreamNode itSink = itSourceAndSink.f1;

		// We set the proper serializers for the sink/source
		streamGraph.setSerializers(itSource.getId(), null, null, coIterate.getOutputType().createSerializer(env.getConfig()));
		streamGraph.setSerializers(itSink.getId(), coIterate.getOutputType().createSerializer(env.getConfig()), null, null);

		Collection<Integer> resultIds = Collections.singleton(itSource.getId());

		// add the iterate to the already-seen-set with the result IDs, so that we can transform
		// the feedback edges and let them stop when encountering the iterate node
		alreadyTransformed.put(coIterate, resultIds);

		// so that we can determine the slot sharing group from all feedback edges
		List<Integer> allFeedbackIds = new ArrayList<>();

		for (StreamTransformation<F> feedbackEdge : coIterate.getFeedbackEdges()) {
			Collection<Integer> feedbackIds = transform(feedbackEdge);
			allFeedbackIds.addAll(feedbackIds);
			for (Integer feedbackId : feedbackIds) {
				streamGraph.addEdge(feedbackId, itSink.getId(), 0);
			}
		}

		String slotSharingGroup = determineSlotSharingGroup(null, allFeedbackIds);

		itSink.setSlotSharingGroup(slotSharingGroup);
		itSource.setSlotSharingGroup(slotSharingGroup);

		return Collections.singleton(itSource.getId());
	}

	/**
	 * Transforms a {@code SourceTransformation}.
	 */
	private <T> Collection<Integer> transformSource(SourceTransformation<T> source) {
		String slotSharingGroup = determineSlotSharingGroup(source.getSlotSharingGroup(), new ArrayList<Integer>());

		streamGraph.addSource(source.getId(),
				slotSharingGroup,
				source.getOperator(),
				null,
				source.getOutputType(),
				"Source: " + source.getName());
		if (source.getOperator().getUserFunction() instanceof FileSourceFunction) {
			FileSourceFunction<T> fs = (FileSourceFunction<T>) source.getOperator().getUserFunction();
			streamGraph.setInputFormat(source.getId(), fs.getFormat());
		}
		streamGraph.setParallelism(source.getId(), source.getParallelism());
		return Collections.singleton(source.getId());
	}

	/**
	 * Transforms a {@code SinkTransformation}.
	 */
	private <T> Collection<Integer> transformSink(SinkTransformation<T> sink) {

		Collection<Integer> inputIds = transform(sink.getInput());

		String slotSharingGroup = determineSlotSharingGroup(sink.getSlotSharingGroup(), inputIds);

		streamGraph.addSink(sink.getId(),
				slotSharingGroup,
				sink.getOperator(),
				sink.getInput().getOutputType(),
				null,
				"Sink: " + sink.getName());

		streamGraph.setParallelism(sink.getId(), sink.getParallelism());

		for (Integer inputId : inputIds) {
			streamGraph.addEdge(inputId, sink.getId(), 0);
		}

		if (sink.getStateKeySelector() != null) {
			TypeSerializer<?> keySerializer = sink.getStateKeyType().createSerializer(env.getConfig());
			streamGraph.setOneInputStateKey(sink.getId(), sink.getStateKeySelector(), keySerializer);
		}

		return Collections.emptyList();
	}

	/**
	 * Transforms a {@code OneInputTransformation}.
	 *
	 * <p>
	 * This recursively transforms the inputs, creates a new {@code StreamNode} in the graph and
	 * wires the inputs to this new node.
	 */
	private <IN, OUT> Collection<Integer> transformOnInputTransform(OneInputTransformation<IN, OUT> transform) {

		Collection<Integer> inputIds = transform(transform.getInput());

		// the recursive call might have already transformed this
		if (alreadyTransformed.containsKey(transform)) {
			return alreadyTransformed.get(transform);
		}

		String slotSharingGroup = determineSlotSharingGroup(transform.getSlotSharingGroup(), inputIds);

		streamGraph.addOperator(transform.getId(),
				slotSharingGroup,
				transform.getOperator(),
				transform.getInputType(),
				transform.getOutputType(),
				transform.getName());

		if (transform.getStateKeySelector() != null) {
			TypeSerializer<?> keySerializer = transform.getStateKeyType().createSerializer(env.getConfig());
			streamGraph.setOneInputStateKey(transform.getId(), transform.getStateKeySelector(), keySerializer);
		}

		streamGraph.setParallelism(transform.getId(), transform.getParallelism());

		for (Integer inputId : inputIds) {
			streamGraph.addEdge(inputId, transform.getId(), 0);
		}

		return Collections.singleton(transform.getId());
	}

	/**
	 * Transforms a {@code TwoInputTransformation}.
	 *
	 * <p>
	 * This recursively transforms the inputs, creates a new {@code StreamNode} in the graph and
	 * wires the inputs to this new node.
	 */
	private <IN1, IN2, OUT> Collection<Integer> transformTwoInputTransform(TwoInputTransformation<IN1, IN2, OUT> transform) {

		Collection<Integer> inputIds1 = transform(transform.getInput1());
		Collection<Integer> inputIds2 = transform(transform.getInput2());

		// the recursive call might have already transformed this
		if (alreadyTransformed.containsKey(transform)) {
			return alreadyTransformed.get(transform);
		}

		List<Integer> allInputIds = new ArrayList<>();
		allInputIds.addAll(inputIds1);
		allInputIds.addAll(inputIds2);

		String slotSharingGroup = determineSlotSharingGroup(transform.getSlotSharingGroup(), allInputIds);

		streamGraph.addCoOperator(
				transform.getId(),
				slotSharingGroup,
				transform.getOperator(),
				transform.getInputType1(),
				transform.getInputType2(),
				transform.getOutputType(),
				transform.getName());

		if (transform.getStateKeySelector1() != null) {
			TypeSerializer<?> keySerializer = transform.getStateKeyType().createSerializer(env.getConfig());
			streamGraph.setTwoInputStateKey(transform.getId(), transform.getStateKeySelector1(), transform.getStateKeySelector2(), keySerializer);
		}

		streamGraph.setParallelism(transform.getId(), transform.getParallelism());

		for (Integer inputId : inputIds1) {
			streamGraph.addEdge(inputId, transform.getId(), 1);
		}

		for (Integer inputId : inputIds2) {
			streamGraph.addEdge(inputId, transform.getId(), 2);
		}

		return Collections.singleton(transform.getId());
	}

	/**
	 * Determines the slot sharing group for an operation based on the slot sharing group set by
	 * the user and the slot sharing groups of the inputs.
	 *
	 * <p>
	 * If the user specifies a group name, this is taken as is. If nothing is specified and
	 * the input operations all have the same group name then this name is taken. Otherwise the
	 * default group is chosen.
	 *
	 * @param specifiedGroup The group specified by the user.
	 * @param inputIds The IDs of the input operations.
	 */
	private String determineSlotSharingGroup(String specifiedGroup, Collection<Integer> inputIds) {
		if (specifiedGroup != null) {
			return specifiedGroup;
		} else {
			String inputGroup = null;
			for (int id : inputIds) {
				String inputGroupCandidate = streamGraph.getSlotSharingGroup(id);
				if (inputGroup == null) {
					inputGroup = inputGroupCandidate;
				} else if (!inputGroup.equals(inputGroupCandidate)) {
					return "default";
				}
			}
			return inputGroup == null ? "default" : inputGroup;
		}
	}
}
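To make the dispatch above concrete, here is a minimal, hedged sketch of how the user-facing DataStream API of this Flink 1.x generation feeds these methods: slotSharingGroup(...) is what determineSlotSharingGroup(...) sees as the user-specified group, and uid(...) is what later becomes the user-specified node hash in the JobGraph generator below. All names and values in the snippet are illustrative and not taken from the original post.

// Minimal sketch (Flink 1.x DataStream API); operator names, uid and group are example values.
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class SlotSharingExample {
	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		env.fromElements(1, 2, 3)
			.map(new MapFunction<Integer, Integer>() {
				@Override
				public Integer map(Integer value) {
					return value * 2;
				}
			})
			// explicit group: returned as-is by determineSlotSharingGroup()
			.slotSharingGroup("pipeline-a")
			// user-specified id: later hashed by generateUserSpecifiedHash()
			// instead of the deterministic traversal hash
			.uid("double-map")
			.print();

		env.execute("slot sharing sketch");
	}
}

Operators without an explicit group fall back to their inputs' group, which is why the whole pipeline above ends up in "pipeline-a" only from the map operator onwards, while the source stays in "default".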


/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.graph;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

import org.apache.commons.lang3.StringUtils;

import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.operators.util.UserCodeObjectWrapper;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.io.network.partition.ResultPartitionType;
import org.apache.flink.runtime.jobgraph.DistributionPattern;
import org.apache.flink.runtime.jobgraph.InputFormatVertex;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.ScheduleMode;
import org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable;
import org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationGroup;
import org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup;
import org.apache.flink.runtime.operators.util.TaskConfig;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.operators.AbstractUdfStreamOperator;
import org.apache.flink.streaming.api.operators.ChainingStrategy;
import org.apache.flink.streaming.api.operators.StreamOperator;
import org.apache.flink.streaming.api.transformations.StreamTransformation;
import org.apache.flink.streaming.runtime.partitioner.ForwardPartitioner;
import org.apache.flink.streaming.runtime.partitioner.RescalePartitioner;
import org.apache.flink.streaming.runtime.partitioner.StreamPartitioner;
import org.apache.flink.streaming.runtime.tasks.StreamIterationHead;
import org.apache.flink.streaming.runtime.tasks.StreamIterationTail;
import org.apache.flink.util.InstantiationUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;

import static org.apache.flink.util.StringUtils.byteToHexString;

@Internal
public class StreamingJobGraphGenerator {

	private static final Logger LOG = LoggerFactory.getLogger(StreamingJobGraphGenerator.class);

	/**
	 * Restart delay used for the FixedDelayRestartStrategy in case checkpointing was enabled but
	 * no restart strategy has been specified.
	 */
	private static final long DEFAULT_RESTART_DELAY = 10000L;

	private StreamGraph streamGraph;

	private Map<Integer, JobVertex> jobVertices;
	private JobGraph jobGraph;
	private Collection<Integer> builtVertices;

	private List<StreamEdge> physicalEdgesInOrder;

	private Map<Integer, Map<Integer, StreamConfig>> chainedConfigs;

	private Map<Integer, StreamConfig> vertexConfigs;
	private Map<Integer, String> chainedNames;

	public StreamingJobGraphGenerator(StreamGraph streamGraph) {
		this.streamGraph = streamGraph;
	}

	private void init() {
		this.jobVertices = new HashMap<>();
		this.builtVertices = new HashSet<>();
		this.chainedConfigs = new HashMap<>();
		this.vertexConfigs = new HashMap<>();
		this.chainedNames = new HashMap<>();
		this.physicalEdgesInOrder = new ArrayList<>();
	}

	public JobGraph createJobGraph() {
		jobGraph = new JobGraph(streamGraph.getJobName());

		// make sure that all vertices start immediately
		jobGraph.setScheduleMode(ScheduleMode.ALL);

		init();

		// Generate deterministic hashes for the nodes in order to identify them across
		// submission iff they didn't change.
		Map<Integer, byte[]> hashes = traverseStreamGraphAndGenerateHashes();

		setChaining(hashes);

		setPhysicalEdges();

		setSlotSharing();
		
		configureCheckpointing();

		configureRestartStrategy();

		try {
			InstantiationUtil.writeObjectToConfig(this.streamGraph.getExecutionConfig(), this.jobGraph.getJobConfiguration(), ExecutionConfig.CONFIG_KEY);
		} catch (IOException e) {
			throw new RuntimeException("Config object could not be written to Job Configuration: ", e);
		}
		
		return jobGraph;
	}

	private void setPhysicalEdges() {
		Map<Integer, List<StreamEdge>> physicalInEdgesInOrder = new HashMap<Integer, List<StreamEdge>>();

		for (StreamEdge edge : physicalEdgesInOrder) {
			int target = edge.getTargetId();

			List<StreamEdge> inEdges = physicalInEdgesInOrder.get(target);

			// create if not set
			if (inEdges == null) {
				inEdges = new ArrayList<>();
				physicalInEdgesInOrder.put(target, inEdges);
			}

			inEdges.add(edge);
		}

		for (Map.Entry<Integer, List<StreamEdge>> inEdges : physicalInEdgesInOrder.entrySet()) {
			int vertex = inEdges.getKey();
			List<StreamEdge> edgeList = inEdges.getValue();

			vertexConfigs.get(vertex).setInPhysicalEdges(edgeList);
		}
	}

	/**
	 * Sets up task chains from the source {@link StreamNode} instances.
	 *
	 * <p>
	 * This will recursively create all {@link JobVertex} instances.
	 */
	private void setChaining(Map<Integer, byte[]> hashes) {
		for (Integer sourceNodeId : streamGraph.getSourceIDs()) {
			createChain(sourceNodeId, sourceNodeId, hashes);
		}
	}

	private List<StreamEdge> createChain(
			Integer startNodeId,
			Integer currentNodeId,
			Map<Integer, byte[]> hashes) {

		if (!builtVertices.contains(startNodeId)) {

			List<StreamEdge> transitiveOutEdges = new ArrayList<StreamEdge>();

			List<StreamEdge> chainableOutputs = new ArrayList<StreamEdge>();
			List<StreamEdge> nonChainableOutputs = new ArrayList<StreamEdge>();

			for (StreamEdge outEdge : streamGraph.getStreamNode(currentNodeId).getOutEdges()) {
				if (isChainable(outEdge)) {
					chainableOutputs.add(outEdge);
				} else {
					nonChainableOutputs.add(outEdge);
				}
			}

			for (StreamEdge chainable : chainableOutputs) {
				transitiveOutEdges.addAll(createChain(startNodeId, chainable.getTargetId(), hashes));
			}

			for (StreamEdge nonChainable : nonChainableOutputs) {
				transitiveOutEdges.add(nonChainable);
				createChain(nonChainable.getTargetId(), nonChainable.getTargetId(), hashes);
			}

			chainedNames.put(currentNodeId, createChainedName(currentNodeId, chainableOutputs));

			StreamConfig config = currentNodeId.equals(startNodeId)
					? createJobVertex(startNodeId, hashes)
					: new StreamConfig(new Configuration());

			setVertexConfig(currentNodeId, config, chainableOutputs, nonChainableOutputs);

			if (currentNodeId.equals(startNodeId)) {

				config.setChainStart();
				config.setOutEdgesInOrder(transitiveOutEdges);
				config.setOutEdges(streamGraph.getStreamNode(currentNodeId).getOutEdges());

				for (StreamEdge edge : transitiveOutEdges) {
					connect(startNodeId, edge);
				}

				config.setTransitiveChainedTaskConfigs(chainedConfigs.get(startNodeId));

			} else {

				Map<Integer, StreamConfig> chainedConfs = chainedConfigs.get(startNodeId);

				if (chainedConfs == null) {
					chainedConfigs.put(startNodeId, new HashMap<Integer, StreamConfig>());
				}
				chainedConfigs.get(startNodeId).put(currentNodeId, config);
			}

			return transitiveOutEdges;

		} else {
			return new ArrayList<>();
		}
	}

	private String createChainedName(Integer vertexID, List<StreamEdge> chainedOutputs) {
		String operatorName = streamGraph.getStreamNode(vertexID).getOperatorName();
		if (chainedOutputs.size() > 1) {
			List<String> outputChainedNames = new ArrayList<>();
			for (StreamEdge chainable : chainedOutputs) {
				outputChainedNames.add(chainedNames.get(chainable.getTargetId()));
			}
			return operatorName + " -> (" + StringUtils.join(outputChainedNames, ", ") + ")";
		} else if (chainedOutputs.size() == 1) {
			return operatorName + " -> " + chainedNames.get(chainedOutputs.get(0).getTargetId());
		} else {
			return operatorName;
		}
	}

	private StreamConfig createJobVertex(
			Integer streamNodeId,
			Map<Integer, byte[]> hashes) {

		JobVertex jobVertex;
		StreamNode streamNode = streamGraph.getStreamNode(streamNodeId);

		byte[] hash = hashes.get(streamNodeId);

		if (hash == null) {
			throw new IllegalStateException("Cannot find node hash. " +
" + "Did you generate them before calling this method?"); } JobVertexID jobVertexId = new JobVertexID(hash); if (streamNode.getInputFormat() != null) { jobVertex = new InputFormatVertex( chainedNames.get(streamNodeId), jobVertexId); TaskConfig taskConfig = new TaskConfig(jobVertex.getConfiguration()); taskConfig.setStubWrapper(new UserCodeObjectWrapper(streamNode.getInputFormat())); } else { jobVertex = new JobVertex( chainedNames.get(streamNodeId), jobVertexId); } jobVertex.setInvokableClass(streamNode.getJobVertexClass()); int parallelism = streamNode.getParallelism(); if (parallelism > 0) { jobVertex.setParallelism(parallelism); } if (LOG.isDebugEnabled()) { LOG.debug("Parallelism set: {} for {}", parallelism, streamNodeId); } jobVertices.put(streamNodeId, jobVertex); builtVertices.add(streamNodeId); jobGraph.addVertex(jobVertex); return new StreamConfig(jobVertex.getConfiguration()); } @SuppressWarnings("unchecked") private void setVertexConfig(Integer vertexID, StreamConfig config, List chainableOutputs, List nonChainableOutputs) { StreamNode vertex = streamGraph.getStreamNode(vertexID); config.setVertexID(vertexID); config.setBufferTimeout(vertex.getBufferTimeout()); config.setTypeSerializerIn1(vertex.getTypeSerializerIn1()); config.setTypeSerializerIn2(vertex.getTypeSerializerIn2()); config.setTypeSerializerOut(vertex.getTypeSerializerOut()); config.setStreamOperator(vertex.getOperator()); config.setOutputSelectors(vertex.getOutputSelectors()); config.setNumberOfOutputs(nonChainableOutputs.size()); config.setNonChainedOutputs(nonChainableOutputs); config.setChainedOutputs(chainableOutputs); config.setTimeCharacteristic(streamGraph.getEnvironment().getStreamTimeCharacteristic()); final CheckpointConfig ceckpointCfg = streamGraph.getCheckpointConfig(); config.setStateBackend(streamGraph.getStateBackend()); config.setCheckpointingEnabled(ceckpointCfg.isCheckpointingEnabled()); if (ceckpointCfg.isCheckpointingEnabled()) { config.setCheckpointMode(ceckpointCfg.getCheckpointingMode()); } else { // the "at-least-once" input handler is slightly cheaper (in the absence of checkpoints), // so we use that one if checkpointing is not enabled config.setCheckpointMode(CheckpointingMode.AT_LEAST_ONCE); } config.setStatePartitioner(0, vertex.getStatePartitioner1()); config.setStatePartitioner(1, vertex.getStatePartitioner2()); config.setStateKeySerializer(vertex.getStateKeySerializer()); Class vertexClass = vertex.getJobVertexClass(); if (vertexClass.equals(StreamIterationHead.class) || vertexClass.equals(StreamIterationTail.class)) { config.setIterationId(streamGraph.getBrokerID(vertexID)); config.setIterationWaitTime(streamGraph.getLoopTimeout(vertexID)); } List allOutputs = new ArrayList(chainableOutputs); allOutputs.addAll(nonChainableOutputs); vertexConfigs.put(vertexID, config); } private void connect(Integer headOfChain, StreamEdge edge) { physicalEdgesInOrder.add(edge); Integer downStreamvertexID = edge.getTargetId(); JobVertex headVertex = jobVertices.get(headOfChain); JobVertex downStreamVertex = jobVertices.get(downStreamvertexID); StreamConfig downStreamConfig = new StreamConfig(downStreamVertex.getConfiguration()); downStreamConfig.setNumberOfInputs(downStreamConfig.getNumberOfInputs() + 1); StreamPartitioner partitioner = edge.getPartitioner(); if (partitioner instanceof ForwardPartitioner) { downStreamVertex.connectNewDataSetAsInput( headVertex, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED, true); } else if (partitioner instanceof RescalePartitioner){ 
			downStreamVertex.connectNewDataSetAsInput(
					headVertex,
					DistributionPattern.POINTWISE,
					ResultPartitionType.PIPELINED,
					true);
		} else {
			downStreamVertex.connectNewDataSetAsInput(
					headVertex,
					DistributionPattern.ALL_TO_ALL,
					ResultPartitionType.PIPELINED,
					true);
		}

		if (LOG.isDebugEnabled()) {
			LOG.debug("CONNECTED: {} - {} -> {}", partitioner.getClass().getSimpleName(),
					headOfChain, downStreamvertexID);
		}
	}

	private boolean isChainable(StreamEdge edge) {
		StreamNode upStreamVertex = edge.getSourceVertex();
		StreamNode downStreamVertex = edge.getTargetVertex();

		StreamOperator<?> headOperator = upStreamVertex.getOperator();
		StreamOperator<?> outOperator = downStreamVertex.getOperator();

		return downStreamVertex.getInEdges().size() == 1
				&& outOperator != null
				&& headOperator != null
				&& upStreamVertex.isSameSlotSharingGroup(downStreamVertex)
				&& outOperator.getChainingStrategy() == ChainingStrategy.ALWAYS
				&& (headOperator.getChainingStrategy() == ChainingStrategy.HEAD ||
					headOperator.getChainingStrategy() == ChainingStrategy.ALWAYS)
				&& (edge.getPartitioner() instanceof ForwardPartitioner)
				&& upStreamVertex.getParallelism() == downStreamVertex.getParallelism()
				&& streamGraph.isChainingEnabled();
	}

	private void setSlotSharing() {

		Map<String, SlotSharingGroup> slotSharingGroups = new HashMap<>();

		for (Entry<Integer, JobVertex> entry : jobVertices.entrySet()) {

			String slotSharingGroup = streamGraph.getStreamNode(entry.getKey()).getSlotSharingGroup();

			SlotSharingGroup group = slotSharingGroups.get(slotSharingGroup);
			if (group == null) {
				group = new SlotSharingGroup();
				slotSharingGroups.put(slotSharingGroup, group);
			}
			entry.getValue().setSlotSharingGroup(group);
		}

		for (Tuple2<StreamNode, StreamNode> pair : streamGraph.getIterationSourceSinkPairs()) {

			CoLocationGroup ccg = new CoLocationGroup();

			JobVertex source = jobVertices.get(pair.f0.getId());
			JobVertex sink = jobVertices.get(pair.f1.getId());

			ccg.addVertex(source);
			ccg.addVertex(sink);
			source.updateCoLocationGroup(ccg);
			sink.updateCoLocationGroup(ccg);
		}
	}

	private void configureCheckpointing() {
		CheckpointConfig cfg = streamGraph.getCheckpointConfig();

		if (cfg.isCheckpointingEnabled()) {
			long interval = cfg.getCheckpointInterval();
			if (interval < 1) {
				throw new IllegalArgumentException("The checkpoint interval must be positive");
			}

			// collect the vertices that receive "trigger checkpoint" messages.
			// currently, these are all the sources
			List<JobVertexID> triggerVertices = new ArrayList<>();

			// collect the vertices that need to acknowledge the checkpoint
			// currently, these are all vertices
			List<JobVertexID> ackVertices = new ArrayList<>(jobVertices.size());

			// collect the vertices that receive "commit checkpoint" messages
			// currently, these are all vertices
			List<JobVertexID> commitVertices = new ArrayList<>();

			for (JobVertex vertex : jobVertices.values()) {
				if (vertex.isInputVertex()) {
					triggerVertices.add(vertex.getID());
				}
				// TODO: add check whether the user function implements the checkpointing interface
				commitVertices.add(vertex.getID());
				ackVertices.add(vertex.getID());
			}

			JobSnapshottingSettings settings = new JobSnapshottingSettings(
					triggerVertices, ackVertices, commitVertices,
					interval, cfg.getCheckpointTimeout(),
					cfg.getMinPauseBetweenCheckpoints(),
					cfg.getMaxConcurrentCheckpoints());

			jobGraph.setSnapshotSettings(settings);

			// check if a restart strategy has been set, if not then set the FixedDelayRestartStrategy
			if (streamGraph.getExecutionConfig().getRestartStrategy() == null) {
				// if the user enabled checkpointing, the default number of exec retries is infinite.
				streamGraph.getExecutionConfig().setRestartStrategy(
					RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, DEFAULT_RESTART_DELAY));
			}
		}
	}

	private void configureRestartStrategy() {
		jobGraph.setRestartStrategyConfiguration(streamGraph.getExecutionConfig().getRestartStrategy());
	}

	// ------------------------------------------------------------------------

	/**
	 * Returns a map with a hash for each {@link StreamNode} of the {@link
	 * StreamGraph}. The hash is used as the {@link JobVertexID} in order to
	 * identify nodes across job submissions if they didn't change.
	 *
	 * <p>
	 * The complete {@link StreamGraph} is traversed. The hash is either
	 * computed from the transformation's user-specified id (see
	 * {@link StreamTransformation#getUid()}) or generated in a deterministic way.
	 *
	 * <p>
	 * The generated hash is deterministic with respect to:
	 * <ul>
	 *   <li>node-local properties (like parallelism, UDF, node ID),
	 *   <li>chained output nodes, and
	 *   <li>input nodes hashes
	 * </ul>
	 *
	 * @return A map from {@link StreamNode#id} to hash as 16-byte array.
	 */
	private Map<Integer, byte[]> traverseStreamGraphAndGenerateHashes() {
		// The hash function used to generate the hash
		final HashFunction hashFunction = Hashing.murmur3_128(0);
		final Map<Integer, byte[]> hashes = new HashMap<>();

		Set<Integer> visited = new HashSet<>();
		Queue<StreamNode> remaining = new ArrayDeque<>();

		// We need to make the source order deterministic. The source IDs are
		// not returned in the same order, which means that submitting the same
		// program twice might result in different traversal, which breaks the
		// deterministic hash assignment.
		List<Integer> sources = new ArrayList<>();
		for (Integer sourceNodeId : streamGraph.getSourceIDs()) {
			sources.add(sourceNodeId);
		}
		Collections.sort(sources);

		//
		// Traverse the graph in a breadth-first manner. Keep in mind that
		// the graph is not a tree and multiple paths to nodes can exist.
		//

		// Start with source nodes
		for (Integer sourceNodeId : sources) {
			remaining.add(streamGraph.getStreamNode(sourceNodeId));
			visited.add(sourceNodeId);
		}

		StreamNode currentNode;
		while ((currentNode = remaining.poll()) != null) {
			// Generate the hash code. Because multiple path exist to each
			// node, we might not have all required inputs available to
			// generate the hash code.
			if (generateNodeHash(currentNode, hashFunction, hashes)) {
				// Add the child nodes
				for (StreamEdge outEdge : currentNode.getOutEdges()) {
					StreamNode child = outEdge.getTargetVertex();

					if (!visited.contains(child.getId())) {
						remaining.add(child);
						visited.add(child.getId());
					}
				}
			} else {
				// We will revisit this later.
				visited.remove(currentNode.getId());
			}
		}

		return hashes;
	}

	/**
	 * Generates a hash for the node and returns whether the operation was
	 * successful.
	 *
	 * @param node The node to generate the hash for
	 * @param hashFunction The hash function to use
	 * @param hashes The current state of generated hashes
	 * @return true if the node hash has been generated.
	 * false, otherwise. If the operation is not successful, the
	 * hash needs be generated at a later point when all input is available.
	 * @throws IllegalStateException If node has user-specified hash and is
	 * intermediate node of a chain
	 */
	private boolean generateNodeHash(
			StreamNode node,
			HashFunction hashFunction,
			Map<Integer, byte[]> hashes) {

		// Check for user-specified ID
		String userSpecifiedHash = node.getTransformationId();

		if (userSpecifiedHash == null) {
			// Check that all input nodes have their hashes computed
			for (StreamEdge inEdge : node.getInEdges()) {
				// If the input node has not been visited yet, the current
				// node will be visited again at a later point when all input
				// nodes have been visited and their hashes set.
				if (!hashes.containsKey(inEdge.getSourceId())) {
					return false;
				}
			}

			Hasher hasher = hashFunction.newHasher();
			byte[] hash = generateDeterministicHash(node, hasher, hashes);

			if (hashes.put(node.getId(), hash) != null) {
				// Sanity check
				throw new IllegalStateException("Unexpected state. Tried to add node hash " +
						"twice. This is probably a bug in the JobGraph generator.");
			}

			return true;
		} else {
			// Check that this node is not part of a chain. This is currently
			// not supported, because the runtime takes the snapshots by the
			// operator ID of the first vertex in a chain. It's OK if the node
			// has chained outputs.
			for (StreamEdge inEdge : node.getInEdges()) {
				if (isChainable(inEdge)) {
					throw new UnsupportedOperationException("Cannot assign user-specified hash " +
							"to intermediate node in chain. This will be supported in future " +
							"versions of Flink. " +
							"As a work around start new chain at task " + node.getOperatorName() + ".");
				}
			}

			Hasher hasher = hashFunction.newHasher();
			byte[] hash = generateUserSpecifiedHash(node, hasher);

			for (byte[] previousHash : hashes.values()) {
				if (Arrays.equals(previousHash, hash)) {
					throw new IllegalArgumentException("Hash collision on user-specified ID. " +
							"Most likely cause is a non-unique ID. Please check that all IDs " +
							"specified via `uid(String)` are unique.");
				}
			}

			if (hashes.put(node.getId(), hash) != null) {
				// Sanity check
				throw new IllegalStateException("Unexpected state. Tried to add node hash " +
						"twice. This is probably a bug in the JobGraph generator.");
			}

			return true;
		}
	}

	/**
	 * Generates a hash from a user-specified ID.
	 */
	private byte[] generateUserSpecifiedHash(StreamNode node, Hasher hasher) {
		hasher.putString(node.getTransformationId(), Charset.forName("UTF-8"));

		return hasher.hash().asBytes();
	}

	/**
	 * Generates a deterministic hash from node-local properties and input and
	 * output edges.
	 */
	private byte[] generateDeterministicHash(
			StreamNode node,
			Hasher hasher,
			Map<Integer, byte[]> hashes) {

		// Include stream node to hash. We use the current size of the computed
		// hashes as the ID. We cannot use the node's ID, because it is
		// assigned from a static counter. This will result in two identical
		// programs having different hashes.
		generateNodeLocalHash(node, hasher, hashes.size());

		// Include chained nodes to hash
		for (StreamEdge outEdge : node.getOutEdges()) {
			if (isChainable(outEdge)) {
				StreamNode chainedNode = outEdge.getTargetVertex();

				// Use the hash size again, because the nodes are chained to
				// this node. This does not add a hash for the chained nodes.
				generateNodeLocalHash(chainedNode, hasher, hashes.size());
			}
		}

		byte[] hash = hasher.hash().asBytes();

		// Make sure that all input nodes have their hash set before entering
		// this loop (calling this method).
		for (StreamEdge inEdge : node.getInEdges()) {
			byte[] otherHash = hashes.get(inEdge.getSourceId());

			// Sanity check
			if (otherHash == null) {
				throw new IllegalStateException("Missing hash for input node "
						+ inEdge.getSourceVertex() + ". Cannot generate hash for "
						+ node + ".");
			}

			for (int j = 0; j < hash.length; j++) {
				hash[j] = (byte) (hash[j] * 37 ^ otherHash[j]);
			}
		}

		if (LOG.isDebugEnabled()) {
			String udfClassName = "";
			if (node.getOperator() instanceof AbstractUdfStreamOperator) {
				udfClassName = ((AbstractUdfStreamOperator<?, ?>) node.getOperator())
						.getUserFunction().getClass().getName();
			}

			LOG.debug("Generated hash '" + byteToHexString(hash) + "' for node " +
					"'" + node.toString() + "' {id: " + node.getId() + ", " +
					"parallelism: " + node.getParallelism() + ", " +
					"user function: " + udfClassName + "}");
		}

		return hash;
	}

	/**
	 * Applies the {@link Hasher} to the {@link StreamNode} (only node local
	 * attributes are taken into account). The hasher encapsulates the current
	 * state of the hash.
	 *
	 * <p>
	 * The specified ID is local to this node. We cannot use the
	 * {@link StreamNode#id}, because it is incremented in a static counter.
	 * Therefore, the IDs for identical jobs will otherwise be different.
	 */
	private void generateNodeLocalHash(StreamNode node, Hasher hasher, int id) {
		// This resolves conflicts for otherwise identical source nodes. BUT
		// the generated hash codes depend on the ordering of the nodes in the
		// stream graph.
		hasher.putInt(id);

		hasher.putInt(node.getParallelism());

		if (node.getOperator() instanceof AbstractUdfStreamOperator) {
			String udfClassName = ((AbstractUdfStreamOperator<?, ?>) node.getOperator())
					.getUserFunction().getClass().getName();

			hasher.putString(udfClassName, Charset.forName("UTF-8"));
		}
	}
}
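For orientation, here is a small hedged sketch of how the class above is driven during job submission: a StreamGraph is obtained from the environment and handed to StreamingJobGraphGenerator, whose createJobGraph() produces the JobGraph that is submitted to the cluster. The constructor and createJobGraph() calls match the listing above; env.getStreamGraph() is the 1.x-era accessor and is shown here as an assumption rather than quoted from this post, and the class itself is marked @Internal, so this is for study rather than production use.

// Hedged sketch: drive the generator shown above by hand (Flink 1.x-era APIs assumed).
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.graph.StreamGraph;
import org.apache.flink.streaming.api.graph.StreamingJobGraphGenerator;

public class JobGraphSketch {
	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		env.fromElements("a", "b", "c").print();

		// StreamGraph: the logical DAG built from the recorded transformations
		StreamGraph streamGraph = env.getStreamGraph();

		// JobGraph: the physical plan (chained vertices, slot sharing, checkpoint settings)
		JobGraph jobGraph = new StreamingJobGraphGenerator(streamGraph).createJobGraph();

		System.out.println("Job vertices: " + jobGraph.getNumberOfVertices());
	}
}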


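The checkpoint values read in configureCheckpointing() (interval, timeout, minimum pause, maximum concurrent checkpoints) and the default fixed-delay restart strategy all originate from user code such as the following hedged sketch; the numeric values are arbitrary examples, not defaults taken from the post.

// Hedged sketch of the user-facing configuration consumed by configureCheckpointing().
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class CheckpointConfigSketch {
	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		// becomes cfg.getCheckpointInterval() in configureCheckpointing()
		env.enableCheckpointing(60_000L);

		CheckpointConfig cfg = env.getCheckpointConfig();
		cfg.setCheckpointTimeout(30_000L);          // cfg.getCheckpointTimeout()
		cfg.setMinPauseBetweenCheckpoints(5_000L);  // cfg.getMinPauseBetweenCheckpoints()
		cfg.setMaxConcurrentCheckpoints(1);         // cfg.getMaxConcurrentCheckpoints()

		// if no restart strategy is set, configureCheckpointing() falls back to
		// RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, DEFAULT_RESTART_DELAY)
		env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 10_000L));

		env.fromElements(1, 2, 3).print();
		env.execute("checkpoint config sketch");
	}
}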
