/**
* Creates an execution environment that represents the context in which the
* program is currently executed. If the program is invoked standalone, this
* method returns a local execution environment, as returned by
* {@link #createLocalEnvironment()}.
*
* @return The execution environment of the context in which the program is
* executed.
*/
public static StreamExecutionEnvironment getExecutionEnvironment() {
if (contextEnvironmentFactory != null) {
return contextEnvironmentFactory.createExecutionEnvironment();
}
// because the streaming project depends on "flink-clients" (and not the other way around)
// we currently need to intercept the data set environment and create a dependent stream env.
// this should be fixed once we rework the project dependencies
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
if (env instanceof ContextEnvironment) {
return new StreamContextEnvironment((ContextEnvironment) env);
} else if (env instanceof OptimizerPlanEnvironment || env instanceof PreviewPlanEnvironment) {
return new StreamPlanEnvironment(env);
} else {
return createLocalEnvironment();
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.api.java;
import com.esotericsoftware.kryo.Serializer;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.Public;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.Plan;
import org.apache.flink.api.common.cache.DistributedCache.DistributedCacheEntry;
import org.apache.flink.api.common.io.FileInputFormat;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.api.common.operators.OperatorInformation;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.hadoop.mapred.HadoopInputFormat;
import org.apache.flink.api.java.io.CollectionInputFormat;
import org.apache.flink.api.java.io.CsvReader;
import org.apache.flink.api.java.io.IteratorInputFormat;
import org.apache.flink.api.java.io.ParallelIteratorInputFormat;
import org.apache.flink.api.java.io.PrimitiveInputFormat;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.api.java.io.TextValueInputFormat;
import org.apache.flink.api.java.operators.DataSink;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.Operator;
import org.apache.flink.api.java.operators.OperatorTranslation;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.PojoTypeInfo;
import org.apache.flink.api.java.typeutils.ResultTypeQueryable;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.api.java.typeutils.ValueTypeInfo;
import org.apache.flink.api.java.typeutils.runtime.kryo.Serializers;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.types.StringValue;
import org.apache.flink.util.NumberSequenceIterator;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.SplittableIterator;
import org.apache.flink.util.Visitor;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
/**
* The ExecutionEnvironment is the context in which a program is executed. A
* {@link LocalEnvironment} will cause execution in the current JVM, a
* {@link RemoteEnvironment} will cause execution on a remote setup.
*
* The environment provides methods to control the job execution (such as setting the parallelism)
* and to interact with the outside world (data access).
*
* Please note that the execution environment needs strong type information for the input and return types
* of all operations that are executed. This means that the environment needs to know that the return
* value of an operation is for example a Tuple of String and Integer.
* Because the Java compiler throws away much of the generic type information, most methods attempt to
* re-obtain that information using reflection. In certain cases, it may be necessary to manually supply that
* information to some of the methods.
*
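* A minimal usage sketch is shown below; the file paths and the job name are only
* illustrative placeholders:
*
* <pre>{@code
* ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
* DataSet<String> lines = env.readTextFile("file:///path/to/input");
* lines.filter(line -> !line.isEmpty())
*      .writeAsText("file:///path/to/output");
* env.execute("Example Job");
* }</pre>
*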
* @see LocalEnvironment
* @see RemoteEnvironment
*/
@Public
public abstract class ExecutionEnvironment {
/** The logger used by the environment and its subclasses */
protected static final Logger LOG = LoggerFactory.getLogger(ExecutionEnvironment.class);
/** The environment of the context (local by default, cluster if invoked through command line) */
private static ExecutionEnvironmentFactory contextEnvironmentFactory;
/** The default parallelism used by local environments */
private static int defaultLocalDop = Runtime.getRuntime().availableProcessors();
// --------------------------------------------------------------------------------------------
private final List<DataSink<?>> sinks = new ArrayList<>();
private final List<Tuple2<String, DistributedCacheEntry>> cacheFile = new ArrayList<>();
private final ExecutionConfig config = new ExecutionConfig();
/** Result from the latest execution, to make it retrievable when using eager execution methods */
protected JobExecutionResult lastJobExecutionResult;
/** The ID of the session, defined by this execution environment. Sessions and Jobs are the same in
* Flink, as Jobs can consist of multiple parts that are attached to the growing dataflow graph. */
protected JobID jobID;
/** The session timeout in seconds */
protected long sessionTimeout;
/** Flag to indicate whether sinks have been cleared in previous executions */
private boolean wasExecuted = false;
/**
* Creates a new Execution Environment.
*/
protected ExecutionEnvironment() {
jobID = JobID.generate();
}
// --------------------------------------------------------------------------------------------
// Properties
// --------------------------------------------------------------------------------------------
/**
* Gets the config object that defines execution parameters.
*
* @return The environment's execution configuration.
*/
public ExecutionConfig getConfig() {
return config;
}
/**
* Gets the parallelism with which operations are executed by default. Operations can
* individually override this value to use a specific parallelism via
* {@link Operator#setParallelism(int)}. Other operations may need to run with a different
* parallelism - for example calling
* {@link DataSet#reduce(org.apache.flink.api.common.functions.ReduceFunction)} over the entire
* set will eventually insert an operation that runs non-parallel (parallelism of one).
*
* @return The parallelism used by operations, unless they override that value. This method
* returns {@link ExecutionConfig#PARALLELISM_DEFAULT}, if the environment's default parallelism should be used.
*/
public int getParallelism() {
return config.getParallelism();
}
/**
* Sets the parallelism for operations executed through this environment.
* Setting a parallelism of x here will cause all operators (such as join, map, reduce) to run with
* x parallel instances.
*
* This method overrides the default parallelism for this environment.
* The {@link LocalEnvironment} uses by default a value equal to the number of hardware
* contexts (CPU cores / threads). When executing the program via the command line client
* from a JAR file, the default parallelism is the one configured for that setup.
*
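* For example (a sketch; the chosen value is arbitrary):
*
* <pre>{@code
* ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
* env.setParallelism(8); // operators on this environment now default to 8 parallel instances
* }</pre>
*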
* @param parallelism The parallelism
*/
public void setParallelism(int parallelism) {
config.setParallelism(parallelism);
}
/**
* Sets the restart strategy configuration. The configuration specifies which restart strategy
* will be used for the execution graph in case of a restart.
*
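* For example, a fixed-delay strategy could be set up roughly as follows (the concrete
* values are only illustrative):
*
* <pre>{@code
* env.setRestartStrategy(RestartStrategies.fixedDelayRestart(
*         3,       // number of restart attempts
*         10000)); // delay between attempts, in milliseconds
* }</pre>
*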
* @param restartStrategyConfiguration Restart strategy configuration to be set
*/
@PublicEvolving
public void setRestartStrategy(RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration) {
config.setRestartStrategy(restartStrategyConfiguration);
}
/**
* Returns the specified restart strategy configuration.
*
* @return The restart strategy configuration to be used
*/
@PublicEvolving
public RestartStrategies.RestartStrategyConfiguration getRestartStrategy() {
return config.getRestartStrategy();
}
/**
* Sets the number of times that failed tasks are re-executed. A value of zero
* effectively disables fault tolerance. A value of {@code -1} indicates that the system
* default value (as defined in the configuration) should be used.
*
* @param numberOfExecutionRetries The number of times the system will try to re-execute failed tasks.
*
* @deprecated This method will be replaced by {@link #setRestartStrategy}. The
* {@link RestartStrategies.FixedDelayRestartStrategyConfiguration} contains the number of
* execution retries.
*/
@Deprecated
@PublicEvolving
public void setNumberOfExecutionRetries(int numberOfExecutionRetries) {
config.setNumberOfExecutionRetries(numberOfExecutionRetries);
}
/**
* Gets the number of times the system will try to re-execute failed tasks. A value
* of {@code -1} indicates that the system default value (as defined in the configuration)
* should be used.
*
* @return The number of times the system will try to re-execute failed tasks.
*
* @deprecated This method will be replaced by {@link #getRestartStrategy}. The
* {@link RestartStrategies.FixedDelayRestartStrategyConfiguration} contains the number of
* execution retries.
*/
@Deprecated
@PublicEvolving
public int getNumberOfExecutionRetries() {
return config.getNumberOfExecutionRetries();
}
/**
* Returns the {@link org.apache.flink.api.common.JobExecutionResult} of the last executed job.
*
* @return The execution result from the latest job execution.
*/
public JobExecutionResult getLastJobExecutionResult(){
return this.lastJobExecutionResult;
}
// --------------------------------------------------------------------------------------------
// Session Management
// --------------------------------------------------------------------------------------------
/**
* Gets the JobID by which this environment is identified. The JobID sets the execution context
* in the cluster or local environment.
*
* @return The JobID of this environment.
* @see #getIdString()
*/
@PublicEvolving
public JobID getId() {
return this.jobID;
}
/**
* Gets the JobID by which this environment is identified, as a string.
*
* @return The JobID as a string.
* @see #getId()
*/
@PublicEvolving
public String getIdString() {
return this.jobID.toString();
}
/**
* Sets the session timeout to hold the intermediate results of a job. This only
* applies the updated timeout in future executions.
*
* @param timeout The timeout, in seconds.
*/
@PublicEvolving
public void setSessionTimeout(long timeout) {
throw new IllegalStateException("Support for sessions is currently disabled. " +
"It will be enabled in future Flink versions.");
// Session management is disabled, revert this commit to enable
//if (timeout < 0) {
// throw new IllegalArgumentException("The session timeout must not be less than zero.");
//}
//this.sessionTimeout = timeout;
}
/**
* Gets the session timeout for this environment. The session timeout defines for how long
* after an execution, the job and its intermediate results will be kept for future
* interactions.
*
* @return The session timeout, in seconds.
*/
@PublicEvolving
public long getSessionTimeout() {
return sessionTimeout;
}
/**
* Starts a new session, discarding the previous data flow and all of its intermediate results.
*/
@PublicEvolving
public abstract void startNewSession() throws Exception;
// --------------------------------------------------------------------------------------------
// Registry for types and serializers
// --------------------------------------------------------------------------------------------
/**
* Adds a new Kryo default serializer to the Runtime.
*
* Note that the serializer instance must be serializable (as defined by java.io.Serializable),
* because it may be distributed to the worker nodes by java serialization.
*
* @param type The class of the types serialized with the given serializer.
* @param serializer The serializer to use.
*/
public <T extends Serializer<?> & Serializable> void addDefaultKryoSerializer(Class<?> type, T serializer) {
config.addDefaultKryoSerializer(type, serializer);
}
/**
* Adds a new Kryo default serializer to the Runtime.
*
* @param type The class of the types serialized with the given serializer.
* @param serializerClass The class of the serializer to use.
*/
public void addDefaultKryoSerializer(Class<?> type, Class<? extends Serializer<?>> serializerClass) {
config.addDefaultKryoSerializer(type, serializerClass);
}
/**
* Registers the given type with a Kryo Serializer.
*
* Note that the serializer instance must be serializable (as defined by java.io.Serializable),
* because it may be distributed to the worker nodes by java serialization.
*
* @param type The class of the types serialized with the given serializer.
* @param serializer The serializer to use.
*/
public <T extends Serializer<?> & Serializable> void registerTypeWithKryoSerializer(Class<?> type, T serializer) {
config.registerTypeWithKryoSerializer(type, serializer);
}
/**
* Registers the given Serializer via its class as a serializer for the given type at the KryoSerializer.
*
* @param type The class of the types serialized with the given serializer.
* @param serializerClass The class of the serializer to use.
*/
public void registerTypeWithKryoSerializer(Class<?> type, Class<? extends Serializer<?>> serializerClass) {
config.registerTypeWithKryoSerializer(type, serializerClass);
}
/**
* Registers the given type with the serialization stack. If the type is eventually
* serialized as a POJO, then the type is registered with the POJO serializer. If the
* type ends up being serialized with Kryo, then it will be registered at Kryo to make
* sure that only tags are written.
*
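* For example, a custom user type could be registered up front (the {@code MyCustomType}
* class is a hypothetical placeholder):
*
* <pre>{@code
* env.registerType(MyCustomType.class);
* }</pre>
*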
* @param type The class of the type to register.
*/
public void registerType(Class<?> type) {
if (type == null) {
throw new NullPointerException("Cannot register null type class.");
}
TypeInformation<?> typeInfo = TypeExtractor.createTypeInfo(type);
if (typeInfo instanceof PojoTypeInfo) {
config.registerPojoType(type);
} else {
config.registerKryoType(type);
}
}
// --------------------------------------------------------------------------------------------
// Data set creations
// --------------------------------------------------------------------------------------------
// ---------------------------------- Text Input Format ---------------------------------------
/**
* Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise.
* The file will be read with the system's default character set.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @return A {@link DataSet} that represents the data read from the given file as text lines.
*/
public DataSource<String> readTextFile(String filePath) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
return new DataSource<>(this, new TextInputFormat(new Path(filePath)), BasicTypeInfo.STRING_TYPE_INFO, Utils.getCallLocationName());
}
/**
* Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise.
* The {@link java.nio.charset.Charset} with the given name will be used to read the files.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @param charsetName The name of the character set used to read the file.
* @return A {@link DataSet} that represents the data read from the given file as text lines.
*/
public DataSource<String> readTextFile(String filePath, String charsetName) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
TextInputFormat format = new TextInputFormat(new Path(filePath));
format.setCharsetName(charsetName);
return new DataSource<>(this, format, BasicTypeInfo.STRING_TYPE_INFO, Utils.getCallLocationName());
}
// -------------------------- Text Input Format With String Value------------------------------
/**
* Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise.
* This method is similar to {@link #readTextFile(String)}, but it produces a DataSet with mutable
* {@link StringValue} objects, rather than Java Strings. StringValues can be used to tune implementations
* to be less object and garbage collection heavy.
*
* The file will be read with the system's default character set.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @return A {@link DataSet} that represents the data read from the given file as text lines.
*/
public DataSource<StringValue> readTextFileWithValue(String filePath) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
return new DataSource<>(this, new TextValueInputFormat(new Path(filePath)), new ValueTypeInfo<>(StringValue.class), Utils.getCallLocationName());
}
/**
* Creates a {@link DataSet} that represents the Strings produced by reading the given file line wise.
* This method is similar to {@link #readTextFile(String, String)}, but it produces a DataSet with mutable
* {@link StringValue} objects, rather than Java Strings. StringValues can be used to tune implementations
* to be less object and garbage collection heavy.
*
* The {@link java.nio.charset.Charset} with the given name will be used to read the files.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @param charsetName The name of the character set used to read the file.
* @param skipInvalidLines A flag to indicate whether to skip lines that cannot be read with the given character set.
*
* @return A DataSet that represents the data read from the given file as text lines.
*/
public DataSource<StringValue> readTextFileWithValue(String filePath, String charsetName, boolean skipInvalidLines) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
TextValueInputFormat format = new TextValueInputFormat(new Path(filePath));
format.setCharsetName(charsetName);
format.setSkipInvalidLines(skipInvalidLines);
return new DataSource<>(this, format, new ValueTypeInfo<>(StringValue.class), Utils.getCallLocationName());
}
// ----------------------------------- Primitive Input Format ---------------------------------------
/**
* Creates a {@link DataSet} that represents the primitive type produced by reading the given file line wise.
* This method is similar to {@link #readCsvFile(String)} with a single field, but it produces a DataSet
* of the primitive type directly, rather than wrapped in a {@link org.apache.flink.api.java.tuple.Tuple1}.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @param typeClass The primitive type class to be read.
* @return A {@link DataSet} that represents the data read from the given file as primitive type.
*/
public <X> DataSource<X> readFileOfPrimitives(String filePath, Class<X> typeClass) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
return new DataSource<>(this, new PrimitiveInputFormat<>(new Path(filePath), typeClass), TypeExtractor.getForClass(typeClass), Utils.getCallLocationName());
}
/**
* Creates a {@link DataSet} that represents the primitive type produced by reading the given file using the given delimiter.
* This method is similar to {@link #readCsvFile(String)} with a single field, but it produces a DataSet
* of the primitive type directly, rather than wrapped in a {@link org.apache.flink.api.java.tuple.Tuple1}.
*
* @param filePath The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path").
* @param delimiter The delimiter of the given file.
* @param typeClass The primitive type class to be read.
* @return A {@link DataSet} that represents the data read from the given file as primitive type.
*/
public <X> DataSource<X> readFileOfPrimitives(String filePath, String delimiter, Class<X> typeClass) {
Preconditions.checkNotNull(filePath, "The file path may not be null.");
return new DataSource<>(this, new PrimitiveInputFormat<>(new Path(filePath), delimiter, typeClass), TypeExtractor.getForClass(typeClass), Utils.getCallLocationName());
}
// ----------------------------------- CSV Input Format ---------------------------------------
/**
* Creates a CSV reader to read a comma separated value (CSV) file. The reader has options to
* define parameters and field types and will eventually produce the DataSet that corresponds to
* the read and parsed CSV input.
*
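* A typical usage sketch (the path and the field types are only illustrative):
*
* <pre>{@code
* DataSet<Tuple2<String, Integer>> csvInput = env.readCsvFile("hdfs:///path/to/file.csv")
*         .types(String.class, Integer.class);
* }</pre>
*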
* @param filePath The path of the CSV file.
* @return A CsvReader that can be used to configure the CSV input.
*/
public CsvReader readCsvFile(String filePath) {
return new CsvReader(filePath, this);
}
// ------------------------------------ File Input Format -----------------------------------------
public <X> DataSource<X> readFile(FileInputFormat<X> inputFormat, String filePath) {
if (inputFormat == null) {
throw new IllegalArgumentException("InputFormat must not be null.");
}
if (filePath == null) {
throw new IllegalArgumentException("The file path must not be null.");
}
inputFormat.setFilePath(new Path(filePath));
try {
return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat));
}
catch (Exception e) {
throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " +
"Please specify the TypeInformation of the produced type explicitly by using the " +
"'createInput(InputFormat, TypeInformation)' method instead.", e);
}
}
// ----------------------------------- Generic Input Format ---------------------------------------
/**
* Generic method to create an input {@link DataSet} from the given {@link InputFormat}. The DataSet will not be
* immediately created - instead, this method returns a DataSet that will be lazily created from
* the input format once the program is executed.
*
* Since all data sets need specific information about their types, this method needs to determine
* the type of the data produced by the input format. It will attempt to determine the data type
* by reflection, unless the input format implements the {@link ResultTypeQueryable} interface.
* In the latter case, this method will invoke the {@link ResultTypeQueryable#getProducedType()}
* method to determine the data type produced by the input format.
*
* @param inputFormat The input format used to create the data set.
* @return A {@link DataSet} that represents the data created by the input format.
*
* @see #createInput(InputFormat, TypeInformation)
*/
public <X> DataSource<X> createInput(InputFormat<X, ?> inputFormat) {
if (inputFormat == null) {
throw new IllegalArgumentException("InputFormat must not be null.");
}
try {
return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat));
}
catch (Exception e) {
throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " +
"Please specify the TypeInformation of the produced type explicitly by using the " +
"'createInput(InputFormat, TypeInformation)' method instead.", e);
}
}
/**
* Generic method to create an input DataSet from the given {@link InputFormat}. The {@link DataSet} will not be
* immediately created - instead, this method returns a {@link DataSet} that will be lazily created from
* the input format once the program is executed.
*
* The {@link DataSet} is typed to the given TypeInformation. This method is intended for input formats
* where the return type cannot be determined by reflection analysis, and that do not implement the
* {@link ResultTypeQueryable} interface.
*
* @param inputFormat The input format used to create the data set.
* @param producedType The TypeInformation for the produced data set.
* @return A {@link DataSet} that represents the data created by the input format.
*
* @see #createInput(InputFormat)
*/
public <X> DataSource<X> createInput(InputFormat<X, ?> inputFormat, TypeInformation<X> producedType) {
if (inputFormat == null) {
throw new IllegalArgumentException("InputFormat must not be null.");
}
if (producedType == null) {
throw new IllegalArgumentException("Produced type information must not be null.");
}
return new DataSource<>(this, inputFormat, producedType, Utils.getCallLocationName());
}
// ----------------------------------- Hadoop Input Format ---------------------------------------
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapred.FileInputFormat}. The
given inputPath is set on the given job.
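*
* A usage sketch with a classic {@code mapred} text input format; the path is an
* illustrative placeholder:
*
* <pre>{@code
* DataSet<Tuple2<LongWritable, Text>> input = env.readHadoopFile(
*         new TextInputFormat(), LongWritable.class, Text.class,
*         "hdfs:///path/to/file", new JobConf());
* }</pre>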
*/
@PublicEvolving
public <K, V> DataSource<Tuple2<K, V>> readHadoopFile(org.apache.hadoop.mapred.FileInputFormat<K, V> mapredInputFormat, Class<K> key, Class<V> value, String inputPath, JobConf job) {
DataSource<Tuple2<K, V>> result = createHadoopInput(mapredInputFormat, key, value, job);
org.apache.hadoop.mapred.FileInputFormat.addInputPath(job, new org.apache.hadoop.fs.Path(inputPath));
return result;
}
/**
* Creates a {@link DataSet} from a {@link org.apache.hadoop.mapred.SequenceFileInputFormat}.
* A {@link org.apache.hadoop.mapred.JobConf} with the given inputPath is created.
*/
@PublicEvolving
public <K, V> DataSource<Tuple2<K, V>> readSequenceFile(Class<K> key, Class<V> value, String inputPath) throws IOException {
return readHadoopFile(new org.apache.hadoop.mapred.SequenceFileInputFormat<K, V>(), key, value, inputPath);
}
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapred.FileInputFormat}. A
* {@link org.apache.hadoop.mapred.JobConf} with the given inputPath is created.
*/
@PublicEvolving
public <K, V> DataSource<Tuple2<K, V>> readHadoopFile(org.apache.hadoop.mapred.FileInputFormat<K, V> mapredInputFormat, Class<K> key, Class<V> value, String inputPath) {
return readHadoopFile(mapredInputFormat, key, value, inputPath, new JobConf());
}
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapred.InputFormat}.
*/
@PublicEvolving
public <K, V> DataSource<Tuple2<K, V>> createHadoopInput(org.apache.hadoop.mapred.InputFormat<K, V> mapredInputFormat, Class<K> key, Class<V> value, JobConf job) {
HadoopInputFormat<K, V> hadoopInputFormat = new HadoopInputFormat<>(mapredInputFormat, key, value, job);
return this.createInput(hadoopInputFormat);
}
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}. The
given inputPath is set on the given job.
*/
@PublicEvolving
public <K, V> DataSource<Tuple2<K, V>> readHadoopFile(org.apache.hadoop.mapreduce.lib.input.FileInputFormat<K, V> mapreduceInputFormat, Class<K> key, Class<V> value, String inputPath, Job job) throws IOException {
DataSource<Tuple2<K, V>> result = createHadoopInput(mapreduceInputFormat, key, value, job);
org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new org.apache.hadoop.fs.Path(inputPath));
return result;
}
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}. A
* {@link org.apache.hadoop.mapreduce.Job} with the given inputPath is created.
*/
@PublicEvolving
public <K, V> DataSource<Tuple2<K, V>> readHadoopFile(org.apache.hadoop.mapreduce.lib.input.FileInputFormat<K, V> mapreduceInputFormat, Class<K> key, Class<V> value, String inputPath) throws IOException {
return readHadoopFile(mapreduceInputFormat, key, value, inputPath, Job.getInstance());
}
/**
* Creates a {@link DataSet} from the given {@link org.apache.hadoop.mapreduce.InputFormat}.
*/
@PublicEvolving
public <K, V> DataSource<Tuple2<K, V>> createHadoopInput(org.apache.hadoop.mapreduce.InputFormat<K, V> mapreduceInputFormat, Class<K> key, Class<V> value, Job job) {
org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat<K, V> hadoopInputFormat = new org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat<>(mapreduceInputFormat, key, value, job);
return this.createInput(hadoopInputFormat);
}
// ----------------------------------- Collection ---------------------------------------
/**
* Creates a DataSet from the given non-empty collection. The type of the data set is that
* of the elements in the collection.
*
* The framework will try to determine the exact type from the collection elements.
* In case of generic elements, it may be necessary to manually supply the type information
* via {@link #fromCollection(Collection, TypeInformation)}.
*
* Note that this operation will result in a non-parallel data source, i.e. a data source with
* a parallelism of one.
*
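* A short sketch (the concrete collection is only illustrative):
*
* <pre>{@code
* List<Integer> values = Arrays.asList(1, 2, 3, 4);
* DataSet<Integer> data = env.fromCollection(values);
* }</pre>
*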
* @param data The collection of elements to create the data set from.
* @return A DataSet representing the given collection.
*
* @see #fromCollection(Collection, TypeInformation)
*/
public <X> DataSource<X> fromCollection(Collection<X> data) {
if (data == null) {
throw new IllegalArgumentException("The data must not be null.");
}
if (data.size() == 0) {
throw new IllegalArgumentException("The collection must not be empty.");
}
X firstValue = data.iterator().next();
TypeInformation<X> type = TypeExtractor.getForObject(firstValue);
CollectionInputFormat.checkCollection(data, type.getTypeClass());
return new DataSource<>(this, new CollectionInputFormat<>(data, type.createSerializer(config)), type, Utils.getCallLocationName());
}
/**
* Creates a DataSet from the given non-empty collection. Note that this operation will result
* in a non-parallel data source, i.e. a data source with a parallelism of one.
*
* The returned DataSet is typed to the given TypeInformation.
*
* @param data The collection of elements to create the data set from.
* @param type The TypeInformation for the produced data set.
* @return A DataSet representing the given collection.
*
* @see #fromCollection(Collection)
*/
public <X> DataSource<X> fromCollection(Collection<X> data, TypeInformation<X> type) {
return fromCollection(data, type, Utils.getCallLocationName());
}
private <X> DataSource<X> fromCollection(Collection<X> data, TypeInformation<X> type, String callLocationName) {
CollectionInputFormat.checkCollection(data, type.getTypeClass());
return new DataSource<>(this, new CollectionInputFormat<>(data, type.createSerializer(config)), type, callLocationName);
}
/**
* Creates a DataSet from the given iterator. Because the iterator will remain unmodified until
* the actual execution happens, the type of data returned by the iterator must be given
* explicitly in the form of the type class (this is due to the fact that the Java compiler
* erases the generic type information).
*
* Note that this operation will result in a non-parallel data source, i.e. a data source with
* a parallelism of one.
*
* @param data The collection of elements to create the data set from.
* @param type The class of the data produced by the iterator. Must not be a generic class.
* @return A DataSet representing the elements in the iterator.
*
* @see #fromCollection(Iterator, TypeInformation)
*/
public <X> DataSource<X> fromCollection(Iterator<X> data, Class<X> type) {
return fromCollection(data, TypeExtractor.getForClass(type));
}
/**
* Creates a DataSet from the given iterator. Because the iterator will remain unmodified until
* the actual execution happens, the type of data returned by the iterator must be given
* explicitly in the form of the type information. This method is useful for cases where the type
* is generic. In that case, the type class (as given in {@link #fromCollection(Iterator, Class)})
* does not supply all type information.
*
* Note that this operation will result in a non-parallel data source, i.e. a data source with
* a parallelism of one.
*
* @param data The collection of elements to create the data set from.
* @param type The TypeInformation for the produced data set.
* @return A DataSet representing the elements in the iterator.
*
* @see #fromCollection(Iterator, Class)
*/
public <X> DataSource<X> fromCollection(Iterator<X> data, TypeInformation<X> type) {
return new DataSource<>(this, new IteratorInputFormat<>(data), type, Utils.getCallLocationName());
}
/**
* Creates a new data set that contains the given elements. The elements must all be of the same type,
* for example, all {@link String} or all {@link Integer}. The sequence of elements must not be empty.
*
* The framework will try to determine the exact type from the collection elements.
* In case of generic elements, it may be necessary to manually supply the type information
* via {@link #fromCollection(Collection, TypeInformation)}.
*
* Note that this operation will result in a non-parallel data source, i.e. a data source with
* a parallelism of one.
*
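* For example (the element values are only illustrative):
*
* <pre>{@code
* DataSet<String> names = env.fromElements("peter", "anna", "linda");
* }</pre>
*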
* @param data The elements to make up the data set.
* @return A DataSet representing the given list of elements.
*/
@SafeVarargs
public final <X> DataSource<X> fromElements(X... data) {
if (data == null) {
throw new IllegalArgumentException("The data must not be null.");
}
if (data.length == 0) {
throw new IllegalArgumentException("The number of elements must not be zero.");
}
TypeInformation<X> typeInfo;
try {
typeInfo = TypeExtractor.getForObject(data[0]);
}
catch (Exception e) {
throw new RuntimeException("Could not create TypeInformation for type " + data[0].getClass().getName()
+ "; please specify the TypeInformation manually via "
+ "ExecutionEnvironment#fromCollection(Collection, TypeInformation)");
}
return fromCollection(Arrays.asList(data), typeInfo, Utils.getCallLocationName());
}
/**
* Creates a new data set that contains the given elements. The framework will determine the type according to the
* base type supplied by the user. The elements must be of that base type or a subclass of it.
* The sequence of elements must not be empty.
* Note that this operation will result in a non-parallel data source, i.e. a data source with
* a parallelism of one.
*
* @param type The base class type for every element in the collection.
* @param data The elements to make up the data set.
* @return A DataSet representing the given list of elements.
*/
@SafeVarargs
public final <X> DataSource<X> fromElements(Class<X> type, X... data) {
if (data == null) {
throw new IllegalArgumentException("The data must not be null.");
}
if (data.length == 0) {
throw new IllegalArgumentException("The number of elements must not be zero.");
}
TypeInformation<X> typeInfo;
try {
typeInfo = TypeExtractor.getForClass(type);
}
catch (Exception e) {
throw new RuntimeException("Could not create TypeInformation for type " + type.getName()
+ "; please specify the TypeInformation manually via "
+ "ExecutionEnvironment#fromCollection(Collection, TypeInformation)");
}
return fromCollection(Arrays.asList(data), typeInfo, Utils.getCallLocationName());
}
/**
* Creates a new data set that contains elements in the iterator. The iterator is splittable, allowing the
* framework to create a parallel data source that returns the elements in the iterator.
*
* Because the iterator will remain unmodified until the actual execution happens, the type of data
* returned by the iterator must be given explicitly in the form of the type class (this is due to the
* fact that the Java compiler erases the generic type information).
*
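* A sketch using the {@link NumberSequenceIterator} shipped with Flink (the range is arbitrary):
*
* <pre>{@code
* DataSet<Long> numbers = env.fromParallelCollection(new NumberSequenceIterator(1L, 1000L), Long.class);
* }</pre>
*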
* @param iterator The iterator that produces the elements of the data set.
* @param type The class of the data produced by the iterator. Must not be a generic class.
* @return A DataSet representing the elements in the iterator.
*
* @see #fromParallelCollection(SplittableIterator, TypeInformation)
*/
public <X> DataSource<X> fromParallelCollection(SplittableIterator<X> iterator, Class<X> type) {
return fromParallelCollection(iterator, TypeExtractor.getForClass(type));
}
/**
* Creates a new data set that contains elements in the iterator. The iterator is splittable, allowing the
* framework to create a parallel data source that returns the elements in the iterator.
*
* Because the iterator will remain unmodified until the actual execution happens, the type of data
* returned by the iterator must be given explicitly in the form of the type information.
* This method is useful for cases where the type is generic. In that case, the type class
* (as given in {@link #fromParallelCollection(SplittableIterator, Class)}) does not supply all type information.
*
* @param iterator The iterator that produces the elements of the data set.
* @param type The TypeInformation for the produced data set.
* @return A DataSet representing the elements in the iterator.
*
* @see #fromParallelCollection(SplittableIterator, Class)
*/
public <X> DataSource<X> fromParallelCollection(SplittableIterator<X> iterator, TypeInformation<X> type) {
return fromParallelCollection(iterator, type, Utils.getCallLocationName());
}
// private helper for passing different call location names
private <X> DataSource<X> fromParallelCollection(SplittableIterator<X> iterator, TypeInformation<X> type, String callLocationName) {
return new DataSource<>(this, new ParallelIteratorInputFormat<>(iterator), type, callLocationName);
}
/**
* Creates a new data set that contains a sequence of numbers. The data set will be created in parallel,
* so there is no guarantee about the order of the elements.
*
* @param from The number to start at (inclusive).
* @param to The number to stop at (inclusive).
* @return A DataSet, containing all numbers in the {@code [from, to]} interval.
*/
public DataSource<Long> generateSequence(long from, long to) {
return fromParallelCollection(new NumberSequenceIterator(from, to), BasicTypeInfo.LONG_TYPE_INFO, Utils.getCallLocationName());
}
// --------------------------------------------------------------------------------------------
// Executing
// --------------------------------------------------------------------------------------------
/**
* Triggers the program execution. The environment will execute all parts of the program that have
* resulted in a "sink" operation. Sink operations are for example printing results ({@link DataSet#print()}),
* writing results (e.g. {@link DataSet#writeAsText(String)} or
* {@link DataSet#write(org.apache.flink.api.common.io.FileOutputFormat, String)}), or other generic
* data sinks created with {@link DataSet#output(org.apache.flink.api.common.io.OutputFormat)}.
*
* The program execution will be logged and displayed with a generated default name.
*
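* The sketch below illustrates the lazy nature of execution; the path is an illustrative
* placeholder:
*
* <pre>{@code
* DataSet<Long> data = env.generateSequence(1, 100);
* data.writeAsText("file:///path/to/result"); // only defines a sink, nothing runs yet
* env.execute();                              // translates the plan and runs the job
* }</pre>
*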
* @return The result of the job execution, containing elapsed time and accumulators.
* @throws Exception Thrown, if the program execution fails.
*/
public JobExecutionResult execute() throws Exception {
return execute(getDefaultName());
}
/**
* Triggers the program execution. The environment will execute all parts of the program that have
* resulted in a "sink" operation. Sink operations are for example printing results ({@link DataSet#print()}),
* writing results (e.g. {@link DataSet#writeAsText(String)} or
* {@link DataSet#write(org.apache.flink.api.common.io.FileOutputFormat, String)}), or other generic
* data sinks created with {@link DataSet#output(org.apache.flink.api.common.io.OutputFormat)}.
*
* The program execution will be logged and displayed with the given job name.
*
* @return The result of the job execution, containing elapsed time and accumulators.
* @throws Exception Thrown, if the program execution fails.
*/
public abstract JobExecutionResult execute(String jobName) throws Exception;
/**
* Creates the plan with which the system will execute the program, and returns it as
* a String using a JSON representation of the execution data flow graph.
* Note that this needs to be called before the plan is executed.
*
* @return The execution plan of the program, as a JSON String.
* @throws Exception Thrown, if the compiler could not be instantiated, or the master could not
* be contacted to retrieve information relevant to the execution planning.
*/
public abstract String getExecutionPlan() throws Exception;
/**
* Registers a file at the distributed cache under the given name. The file will be accessible
* from any user-defined function in the (distributed) runtime under a local path. Files
* may be local files (as long as all relevant workers have access to them), or files in a distributed file system.
* The runtime will copy the files temporarily to a local cache, if needed.
*
* The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via
* {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access to the
* {@link org.apache.flink.api.common.cache.DistributedCache} via
* {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}.
*
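* A sketch of registering a file and reading it from within a rich function (the path and
* the registration name are illustrative):
*
* <pre>{@code
* env.registerCachedFile("hdfs:///path/to/lookup.txt", "lookupFile");
*
* // later, inside a RichFunction:
* File file = getRuntimeContext().getDistributedCache().getFile("lookupFile");
* }</pre>
*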
* @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path")
* @param name The name under which the file is registered.
*/
public void registerCachedFile(String filePath, String name){
registerCachedFile(filePath, name, false);
}
/**
* Registers a file at the distributed cache under the given name. The file will be accessible
* from any user-defined function in the (distributed) runtime under a local path. Files
* may be local files (as long as all relevant workers have access to them), or files in a distributed file system.
* The runtime will copy the files temporarily to a local cache, if needed.
*
* The {@link org.apache.flink.api.common.functions.RuntimeContext} can be obtained inside UDFs via
* {@link org.apache.flink.api.common.functions.RichFunction#getRuntimeContext()} and provides access to the
* {@link org.apache.flink.api.common.cache.DistributedCache} via
* {@link org.apache.flink.api.common.functions.RuntimeContext#getDistributedCache()}.
*
* @param filePath The path of the file, as a URI (e.g. "file:///some/path" or "hdfs://host:port/and/path")
* @param name The name under which the file is registered.
* @param executable flag indicating whether the file should be executable
*/
public void registerCachedFile(String filePath, String name, boolean executable){
this.cacheFile.add(new Tuple2<>(name, new DistributedCacheEntry(filePath, executable)));
}
/**
* Registers all files that were registered at this execution environment's cache registry with the
* given plan's cache registry.
*
* @param p The plan to register files at.
* @throws IOException Thrown if checks for existence and sanity fail.
*/
protected void registerCachedFilesWithPlan(Plan p) throws IOException {
for (Tuple2<String, DistributedCacheEntry> entry : cacheFile) {
p.registerCachedFile(entry.f0, entry.f1);
}
}
/**
* Creates the program's {@link Plan}. The plan is a description of all data sources, data sinks,
* and operations and how they interact, as an isolated unit that can be executed with a
* {@link org.apache.flink.api.common.PlanExecutor}. Obtaining a plan and starting it with an
* executor is an alternative way to run a program and is only possible if the program consists
* only of distributed operations.
* This automatically starts a new stage of execution.
*
* @return The program's plan.
*/
@Internal
public Plan createProgramPlan() {
return createProgramPlan(null);
}
/**
* Creates the program's {@link Plan}. The plan is a description of all data sources, data sinks,
* and operations and how they interact, as an isolated unit that can be executed with a
* {@link org.apache.flink.api.common.PlanExecutor}. Obtaining a plan and starting it with an
* executor is an alternative way to run a program and is only possible if the program consists
* only of distributed operations.
* This automatically starts a new stage of execution.
*
* @param jobName The name attached to the plan (displayed in logs and monitoring).
* @return The program's plan.
*/
@Internal
public Plan createProgramPlan(String jobName) {
return createProgramPlan(jobName, true);
}
/**
* Creates the program's {@link Plan}. The plan is a description of all data sources, data sinks,
* and operations and how they interact, as an isolated unit that can be executed with a
* {@link org.apache.flink.api.common.PlanExecutor}. Obtaining a plan and starting it with an
* executor is an alternative way to run a program and is only possible if the program consists
* only of distributed operations.
*
* @param jobName The name attached to the plan (displayed in logs and monitoring).
* @param clearSinks Whether or not to start a new stage of execution.
* @return The program's plan.
*/
@Internal
public Plan createProgramPlan(String jobName, boolean clearSinks) {
if (this.sinks.isEmpty()) {
if (wasExecuted) {
throw new RuntimeException("No new data sinks have been defined since the " +
"last execution. The last execution refers to the latest call to " +
"'execute()', 'count()', 'collect()', or 'print()'.");
} else {
throw new RuntimeException("No data sinks have been created yet. " +
"A program needs at least one sink that consumes data. " +
"Examples are writing the data set or printing it.");
}
}
if (jobName == null) {
jobName = getDefaultName();
}
OperatorTranslation translator = new OperatorTranslation();
Plan plan = translator.translateToPlan(this.sinks, jobName);
if (getParallelism() > 0) {
plan.setDefaultParallelism(getParallelism());
}
plan.setExecutionConfig(getConfig());
// Check plan for GenericTypeInfo's and register the types at the serializers.
if (!config.isAutoTypeRegistrationDisabled()) {
plan.accept(new Visitor<org.apache.flink.api.common.operators.Operator<?>>() {
private final HashSet<Class<?>> deduplicator = new HashSet<>();
@Override
public boolean preVisit(org.apache.flink.api.common.operators.Operator<?> visitable) {
OperatorInformation<?> opInfo = visitable.getOperatorInfo();
Serializers.recursivelyRegisterType(opInfo.getOutputType(), config, deduplicator);
return true;
}
@Override
public void postVisit(org.apache.flink.api.common.operators.Operator<?> visitable) {}
});
}
try {
registerCachedFilesWithPlan(plan);
} catch (Exception e) {
throw new RuntimeException("Error while registering cached files: " + e.getMessage(), e);
}
// clear all the sinks such that the next execution does not redo everything
if (clearSinks) {
this.sinks.clear();
wasExecuted = true;
}
// All types are registered now. Print information.
int registeredTypes = config.getRegisteredKryoTypes().size() +
config.getRegisteredPojoTypes().size() +
config.getRegisteredTypesWithKryoSerializerClasses().size() +
config.getRegisteredTypesWithKryoSerializers().size();
int defaultKryoSerializers = config.getDefaultKryoSerializers().size() +
config.getDefaultKryoSerializerClasses().size();
LOG.info("The job has {} registered types and {} default Kryo serializers", registeredTypes, defaultKryoSerializers);
if(config.isForceKryoEnabled() && config.isForceAvroEnabled()) {
LOG.warn("In the ExecutionConfig, both Avro and Kryo are enforced. Using Kryo serializer");
}
if(config.isForceKryoEnabled()) {
LOG.info("Using KryoSerializer for serializing POJOs");
}
if(config.isForceAvroEnabled()) {
LOG.info("Using AvroSerializer for serializing POJOs");
}
if(LOG.isDebugEnabled()) {
LOG.debug("Registered Kryo types: {}", config.getRegisteredKryoTypes().toString());
LOG.debug("Registered Kryo with Serializers types: {}", config.getRegisteredTypesWithKryoSerializers().entrySet().toString());
LOG.debug("Registered Kryo with Serializer Classes types: {}", config.getRegisteredTypesWithKryoSerializerClasses().entrySet().toString());
LOG.debug("Registered Kryo default Serializers: {}", config.getDefaultKryoSerializers().entrySet().toString());
LOG.debug("Registered Kryo default Serializers Classes {}", config.getDefaultKryoSerializerClasses().entrySet().toString());
LOG.debug("Registered POJO types: {}", config.getRegisteredPojoTypes().toString());
// print information about static code analysis
LOG.debug("Static code analysis mode: {}", config.getCodeAnalysisMode());
}
return plan;
}
/**
* Adds the given sink to this environment. Only sinks that have been added will be executed once
* the {@link #execute()} or {@link #execute(String)} method is called.
*
* @param sink The sink to add for execution.
*/
@Internal
void registerDataSink(DataSink<?> sink) {
this.sinks.add(sink);
}
/**
* Gets a default job name, based on the timestamp when this method is invoked.
*
* @return A default job name.
*/
private static String getDefaultName() {
return "Flink Java Job at " + Calendar.getInstance().getTime();
}
// --------------------------------------------------------------------------------------------
// Instantiation of Execution Contexts
// --------------------------------------------------------------------------------------------
/**
* Creates an execution environment that represents the context in which the program is currently executed.
* If the program is invoked standalone, this method returns a local execution environment, as returned by
* {@link #createLocalEnvironment()}. If the program is invoked from within the command line client to be
* submitted to a cluster, this method returns the execution environment of this cluster.
*
* @return The execution environment of the context in which the program is executed.
*/
public static ExecutionEnvironment getExecutionEnvironment() {
return contextEnvironmentFactory == null ?
createLocalEnvironment() : contextEnvironmentFactory.createExecutionEnvironment();
}
/**
* Creates a {@link CollectionEnvironment} that uses Java Collections underneath. This will execute in a
* single thread in the current JVM. It is very fast but will fail if the data does not fit into
memory. The parallelism will always be one. This is useful during development and for debugging.
* @return A Collection Environment
*/
@PublicEvolving
public static CollectionEnvironment createCollectionsEnvironment(){
CollectionEnvironment ce = new CollectionEnvironment();
ce.setParallelism(1);
return ce;
}
/**
* Creates a {@link LocalEnvironment}. The local execution environment will run the program in a
* multi-threaded fashion in the same JVM as the environment was created in. The default
* parallelism of the local environment is the number of hardware contexts (CPU cores / threads),
* unless it was specified differently by {@link #setDefaultLocalParallelism(int)}.
*
* @return A local execution environment.
*/
public static LocalEnvironment createLocalEnvironment() {
return createLocalEnvironment(defaultLocalDop);
}
/**
* Creates a {@link LocalEnvironment}. The local execution environment will run the program in a
* multi-threaded fashion in the same JVM as the environment was created in. It will use the
* parallelism specified in the parameter.
*
* @param parallelism The parallelism for the local environment.
* @return A local execution environment with the specified parallelism.
*/
public static LocalEnvironment createLocalEnvironment(int parallelism) {
LocalEnvironment lee = new LocalEnvironment();
lee.setParallelism(parallelism);
return lee;
}
/**
* Creates a {@link LocalEnvironment}. The local execution environment will run the program in a
* multi-threaded fashion in the same JVM as the environment was created in. It will use the
* parallelism specified in the parameter.
*
* @param customConfiguration Pass a custom configuration to the LocalEnvironment.
* @return A local execution environment with the specified parallelism.
*/
public static LocalEnvironment createLocalEnvironment(Configuration customConfiguration) {
return new LocalEnvironment(customConfiguration);
}
/**
* Creates a {@link RemoteEnvironment}. The remote environment sends (parts of) the program
* to a cluster for execution. Note that all file paths used in the program must be accessible from the
* cluster. The execution will use the cluster's default parallelism, unless the parallelism is
* set explicitly via {@link ExecutionEnvironment#setParallelism(int)}.
*
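* For example (host, port, and JAR path are placeholders; 6123 is only the commonly used
* default JobManager port):
*
* <pre>{@code
* ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment(
*         "jobmanager-host", 6123, "/path/to/program.jar");
* }</pre>
*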
* @param host The host name or address of the master (JobManager), where the program should be executed.
* @param port The port of the master (JobManager), where the program should be executed.
* @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses
* user-defined functions, user-defined input formats, or any libraries, those must be
* provided in the JAR files.
* @return A remote environment that executes the program on a cluster.
*/
public static ExecutionEnvironment createRemoteEnvironment(String host, int port, String... jarFiles) {
return new RemoteEnvironment(host, port, jarFiles);
}
/**
* Creates a {@link RemoteEnvironment}. The remote environment sends (parts of) the program
* to a cluster for execution. Note that all file paths used in the program must be accessible from the
* cluster. The custom configuration is used to configure Akka-specific configuration parameters
* for the client only; program parallelism can be set via {@link ExecutionEnvironment#setParallelism(int)}.
*
* Cluster configuration has to be done in the remotely running Flink instance.
*
* @param host The host name or address of the master (JobManager), where the program should be executed.
* @param port The port of the master (JobManager), where the program should be executed.
* @param clientConfiguration Configuration used by the client that connects to the cluster.
* @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses
* user-defined functions, user-defined input formats, or any libraries, those must be
* provided in the JAR files.
* @return A remote environment that executes the program on a cluster.
*/
public static ExecutionEnvironment createRemoteEnvironment(
String host, int port, Configuration clientConfiguration, String... jarFiles) {
return new RemoteEnvironment(host, port, clientConfiguration, jarFiles, null);
}
/**
* Creates a {@link RemoteEnvironment}. The remote environment sends (parts of) the program
* to a cluster for execution. Note that all file paths used in the program must be accessible from the
* cluster. The execution will use the specified parallelism.
*
* @param host The host name or address of the master (JobManager), where the program should be executed.
* @param port The port of the master (JobManager), where the program should be executed.
* @param parallelism The parallelism to use during the execution.
* @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses
* user-defined functions, user-defined input formats, or any libraries, those must be
* provided in the JAR files.
* @return A remote environment that executes the program on a cluster.
*/
public static ExecutionEnvironment createRemoteEnvironment(String host, int port, int parallelism, String... jarFiles) {
RemoteEnvironment rec = new RemoteEnvironment(host, port, jarFiles);
rec.setParallelism(parallelism);
return rec;
}
/**
* Sets the default parallelism that will be used for the local execution environment created by
* {@link #createLocalEnvironment()}.
*
* @param parallelism The parallelism to use as the default local parallelism.
*/
public static void setDefaultLocalParallelism(int parallelism) {
defaultLocalDop = parallelism;
}
// --------------------------------------------------------------------------------------------
// Methods to control the context environment and creation of explicit environments other
// than the context environment
// --------------------------------------------------------------------------------------------
/**
* Sets a context environment factory, that creates the context environment for running programs
* with pre-configured environments. Examples are running programs from the command line, and
* running programs in the Scala shell.
*
* When the context environment factory is set, no other environments can be explicitly used.
*
* @param ctx The context environment factory.
*/
protected static void initializeContextEnvironment(ExecutionEnvironmentFactory ctx) {
contextEnvironmentFactory = Preconditions.checkNotNull(ctx);
}
/**
* Un-sets the context environment factory. After this method is called, the call to
* {@link #getExecutionEnvironment()} will again return a default local execution environment, and
* it is possible to explicitly instantiate the LocalEnvironment and the RemoteEnvironment.
*/
protected static void resetContextEnvironment() {
contextEnvironmentFactory = null;
}
/**
* Checks whether it is currently permitted to explicitly instantiate a LocalEnvironment
* or a RemoteEnvironment.
*
* @return True, if it is possible to explicitly instantiate a LocalEnvironment or a
* RemoteEnvironment, false otherwise.
*/
@Internal
public static boolean areExplicitEnvironmentsAllowed() {
return contextEnvironmentFactory == null;
}
}
/**
* Triggers the program execution. The environment will execute all parts of
* the program that have resulted in a "sink" operation. Sink operations are
* for example printing results or forwarding them to a message queue.
*
* The program execution will be logged and displayed with a generated
* default name.
*
* @return The result of the job execution, containing elapsed time and accumulators.
* @throws Exception which occurs during job execution.
*/
public JobExecutionResult execute() throws Exception {
return execute(DEFAULT_JOB_NAME);
}
/**
* Triggers the program execution. The environment will execute all parts of
* the program that have resulted in a "sink" operation. Sink operations are
* for example printing results or forwarding them to a message queue.
*
* The program execution will be logged and displayed with the provided name
*
* @param jobName
* Desired name of the job
* @return The result of the job execution, containing elapsed time and accumulators.
* @throws Exception which occurs during job execution.
*/
public abstract JobExecutionResult execute(String jobName) throws Exception;
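// Illustrative sketch only (not part of this class): what a caller typically does with the
// JobExecutionResult returned by execute(). The job and accumulator names are placeholders;
// the accumulator lookup yields null if nothing was registered under that name.
public static void executeUsageSketch(ExecutionEnvironment env) throws Exception {
    JobExecutionResult result = env.execute("Example Job");
    long elapsedMillis = result.getNetRuntime();            // net job runtime in milliseconds
    Object counter = result.getAccumulatorResult("my-counter");
    System.out.println("Ran for " + elapsedMillis + " ms, counter = " + counter);
}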
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.api.environment;
import org.apache.flink.annotation.Public;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster;
import org.apache.flink.streaming.api.graph.StreamGraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The LocalStreamEnvironment is a StreamExecutionEnvironment that runs the program locally,
* multi-threaded, in the JVM where the environment is instantiated. It spawns an embedded
* Flink cluster in the background and executes the program on that cluster.
*
*
* When this environment is instantiated, it uses a default parallelism of {@code 1}. The default
* parallelism can be set via {@link #setParallelism(int)}.
*
*
* Local environments can also be instantiated through {@link StreamExecutionEnvironment#createLocalEnvironment()}
* and {@link StreamExecutionEnvironment#createLocalEnvironment(int)}. The former version will pick a
* default parallelism equal to the number of hardware contexts in the local machine.
*/
@Public
public class LocalStreamEnvironment extends StreamExecutionEnvironment {
private static final Logger LOG = LoggerFactory.getLogger(LocalStreamEnvironment.class);
/** The configuration to use for the local cluster */
private final Configuration conf;
/**
* Creates a new local stream environment that uses the default configuration.
*/
public LocalStreamEnvironment() {
this(null);
}
/**
* Creates a new local stream environment that configures its local executor with the given configuration.
*
* @param config The configuration used to configure the local executor.
*/
public LocalStreamEnvironment(Configuration config) {
if (!ExecutionEnvironment.areExplicitEnvironmentsAllowed()) {
throw new InvalidProgramException(
"The LocalStreamEnvironment cannot be used when submitting a program through a client, " +
"or running in a TestEnvironment context.");
}
this.conf = config == null ? new Configuration() : config;
}
/**
* Executes the JobGraph on an embedded local Flink mini cluster, using the
* given job name.
*
* @param jobName
* name of the job
* @return The result of the job execution, containing elapsed time and accumulators.
*/
@Override
public JobExecutionResult execute(String jobName) throws Exception {
// transform the streaming program into a JobGraph
StreamGraph streamGraph = getStreamGraph();
streamGraph.setJobName(jobName);
JobGraph jobGraph = streamGraph.getJobGraph();
Configuration configuration = new Configuration();
configuration.addAll(jobGraph.getJobConfiguration());
configuration.setLong(ConfigConstants.TASK_MANAGER_MEMORY_SIZE_KEY, -1L);
configuration.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, jobGraph.getMaximumParallelism());
// add (and override) the settings with what the user defined
configuration.addAll(this.conf);
if (LOG.isInfoEnabled()) {
LOG.info("Running job on local embedded Flink mini cluster");
}
LocalFlinkMiniCluster exec = new LocalFlinkMiniCluster(configuration, true);
try {
exec.start();
return exec.submitJobAndWait(jobGraph, getConfig().isSysoutLoggingEnabled());
}
finally {
transformations.clear();
exec.stop();
}
}
}
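// Illustrative sketch only (not part of this file): running a small streaming program on the
// embedded mini cluster described above. The parallelism of 2 and the sample elements are
// arbitrary placeholder values.
class LocalStreamEnvironmentUsageSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(2);
        env.fromElements("a", "b", "c")
            .print();                       // print() only adds a sink; nothing runs yet
        env.execute("Local Example Job");   // spawns the embedded cluster and runs the job
    }
}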
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.api.graph;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.InputTypeConfigurable;
import org.apache.flink.api.java.typeutils.MissingTypeInfo;
import org.apache.flink.optimizer.plan.StreamingPlan;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.operators.OutputTypeConfigurable;
import org.apache.flink.streaming.api.operators.StoppableStreamSource;
import org.apache.flink.streaming.api.operators.StreamOperator;
import org.apache.flink.streaming.api.operators.StreamSource;
import org.apache.flink.streaming.api.operators.TwoInputStreamOperator;
import org.apache.flink.runtime.state.AbstractStateBackend;
import org.apache.flink.streaming.runtime.partitioner.ForwardPartitioner;
import org.apache.flink.streaming.runtime.partitioner.RebalancePartitioner;
import org.apache.flink.streaming.runtime.partitioner.StreamPartitioner;
import org.apache.flink.streaming.runtime.tasks.OneInputStreamTask;
import org.apache.flink.streaming.runtime.tasks.SourceStreamTask;
import org.apache.flink.streaming.runtime.tasks.StoppableSourceStreamTask;
import org.apache.flink.streaming.runtime.tasks.StreamIterationHead;
import org.apache.flink.streaming.runtime.tasks.StreamIterationTail;
import org.apache.flink.streaming.runtime.tasks.TwoInputStreamTask;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Class representing the streaming topology. It contains all the information
* necessary to build the jobgraph for the execution.
*
*/
@Internal
public class StreamGraph extends StreamingPlan {
private static final Logger LOG = LoggerFactory.getLogger(StreamGraph.class);
private String jobName = StreamExecutionEnvironment.DEFAULT_JOB_NAME;
private final StreamExecutionEnvironment environment;
private final ExecutionConfig executionConfig;
private final CheckpointConfig checkpointConfig;
private boolean chaining;
private Map<Integer, StreamNode> streamNodes;
private Set<Integer> sources;
private Set<Integer> sinks;
private Map<Integer, Tuple2<Integer, List<String>>> virtualSelectNodes;
private Map<Integer, Tuple2<Integer, StreamPartitioner<?>>> virtuaPartitionNodes;
protected Map<Integer, String> vertexIDtoBrokerID;
protected Map<Integer, Long> vertexIDtoLoopTimeout;
private AbstractStateBackend stateBackend;
private Set<Tuple2<StreamNode, StreamNode>> iterationSourceSinkPairs;
public StreamGraph(StreamExecutionEnvironment environment) {
this.environment = environment;
this.executionConfig = environment.getConfig();
this.checkpointConfig = environment.getCheckpointConfig();
// create an empty new stream graph.
clear();
}
/**
* Remove all registered nodes etc.
*/
public void clear() {
streamNodes = new HashMap<>();
virtualSelectNodes = new HashMap<>();
virtuaPartitionNodes = new HashMap<>();
vertexIDtoBrokerID = new HashMap<>();
vertexIDtoLoopTimeout = new HashMap<>();
iterationSourceSinkPairs = new HashSet<>();
sources = new HashSet<>();
sinks = new HashSet<>();
}
public StreamExecutionEnvironment getEnvironment() {
return environment;
}
public ExecutionConfig getExecutionConfig() {
return executionConfig;
}
public CheckpointConfig getCheckpointConfig() {
return checkpointConfig;
}
public String getJobName() {
return jobName;
}
public void setJobName(String jobName) {
this.jobName = jobName;
}
public void setChaining(boolean chaining) {
this.chaining = chaining;
}
public void setStateBackend(AbstractStateBackend backend) {
this.stateBackend = backend;
}
public AbstractStateBackend getStateBackend() {
return this.stateBackend;
}
// Checkpointing
public boolean isChainingEnabled() {
return chaining;
}
public boolean isIterative() {
return !vertexIDtoLoopTimeout.isEmpty();
}
public <IN, OUT> void addSource(Integer vertexID,
String slotSharingGroup,
StreamOperator<OUT> operatorObject,
TypeInformation<IN> inTypeInfo,
TypeInformation<OUT> outTypeInfo,
String operatorName) {
addOperator(vertexID, slotSharingGroup, operatorObject, inTypeInfo, outTypeInfo, operatorName);
sources.add(vertexID);
}
public <IN, OUT> void addSink(Integer vertexID,
String slotSharingGroup,
StreamOperator<OUT> operatorObject,
TypeInformation<IN> inTypeInfo,
TypeInformation<OUT> outTypeInfo,
String operatorName) {
addOperator(vertexID, slotSharingGroup, operatorObject, inTypeInfo, outTypeInfo, operatorName);
sinks.add(vertexID);
}
public <IN, OUT> void addOperator(
Integer vertexID,
String slotSharingGroup,
StreamOperator<OUT> operatorObject,
TypeInformation<IN> inTypeInfo,
TypeInformation<OUT> outTypeInfo,
String operatorName) {
if (operatorObject instanceof StoppableStreamSource) {
addNode(vertexID, slotSharingGroup, StoppableSourceStreamTask.class, operatorObject, operatorName);
} else if (operatorObject instanceof StreamSource) {
addNode(vertexID, slotSharingGroup, SourceStreamTask.class, operatorObject, operatorName);
} else {
addNode(vertexID, slotSharingGroup, OneInputStreamTask.class, operatorObject, operatorName);
}
TypeSerializer<IN> inSerializer = inTypeInfo != null && !(inTypeInfo instanceof MissingTypeInfo) ? inTypeInfo.createSerializer(executionConfig) : null;
TypeSerializer<OUT> outSerializer = outTypeInfo != null && !(outTypeInfo instanceof MissingTypeInfo) ? outTypeInfo.createSerializer(executionConfig) : null;
setSerializers(vertexID, inSerializer, null, outSerializer);
if (operatorObject instanceof OutputTypeConfigurable) {
@SuppressWarnings("unchecked")
OutputTypeConfigurable<OUT> outputTypeConfigurable = (OutputTypeConfigurable<OUT>) operatorObject;
// sets the output type which must be known at StreamGraph creation time
outputTypeConfigurable.setOutputType(outTypeInfo, executionConfig);
}
if (operatorObject instanceof InputTypeConfigurable) {
InputTypeConfigurable inputTypeConfigurable = (InputTypeConfigurable) operatorObject;
inputTypeConfigurable.setInputType(inTypeInfo, executionConfig);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Vertex: {}", vertexID);
}
}
public <IN1, IN2, OUT> void addCoOperator(
Integer vertexID,
String slotSharingGroup,
TwoInputStreamOperator<IN1, IN2, OUT> taskOperatorObject,
TypeInformation<IN1> in1TypeInfo,
TypeInformation<IN2> in2TypeInfo,
TypeInformation<OUT> outTypeInfo,
String operatorName) {
addNode(vertexID, slotSharingGroup, TwoInputStreamTask.class, taskOperatorObject, operatorName);
TypeSerializer<OUT> outSerializer = (outTypeInfo != null) && !(outTypeInfo instanceof MissingTypeInfo) ?
outTypeInfo.createSerializer(executionConfig) : null;
setSerializers(vertexID, in1TypeInfo.createSerializer(executionConfig), in2TypeInfo.createSerializer(executionConfig), outSerializer);
if (taskOperatorObject instanceof OutputTypeConfigurable) {
@SuppressWarnings("unchecked")
OutputTypeConfigurable<OUT> outputTypeConfigurable = (OutputTypeConfigurable<OUT>) taskOperatorObject;
// sets the output type which must be known at StreamGraph creation time
outputTypeConfigurable.setOutputType(outTypeInfo, executionConfig);
}
if (LOG.isDebugEnabled()) {
LOG.debug("CO-TASK: {}", vertexID);
}
}
protected StreamNode addNode(Integer vertexID,
String slotSharingGroup,
Class<? extends AbstractInvokable> vertexClass,
StreamOperator<?> operatorObject,
String operatorName) {
if (streamNodes.containsKey(vertexID)) {
throw new RuntimeException("Duplicate vertexID " + vertexID);
}
StreamNode vertex = new StreamNode(environment,
vertexID,
slotSharingGroup,
operatorObject,
operatorName,
new ArrayList<OutputSelector<?>>(),
vertexClass);
streamNodes.put(vertexID, vertex);
return vertex;
}
/**
* Adds a new virtual node that is used to connect a downstream vertex to only the outputs
* with the selected names.
*
* When adding an edge from the virtual node to a downstream node the connection will be made
* to the original node, only with the selected names given here.
*
* @param originalId ID of the node that should be connected to.
* @param virtualId ID of the virtual node.
* @param selectedNames The selected names.
*/
public void addVirtualSelectNode(Integer originalId, Integer virtualId, List<String> selectedNames) {
if (virtualSelectNodes.containsKey(virtualId)) {
throw new IllegalStateException("Already has virtual select node with id " + virtualId);
}
virtualSelectNodes.put(virtualId,
new Tuple2<Integer, List<String>>(originalId, selectedNames));
}
/**
* Adds a new virtual node that is used to connect a downstream vertex to an input with a certain
* partitioning.
*
* When adding an edge from the virtual node to a downstream node the connection will be made
* to the original node, but with the partitioning given here.
*
* @param originalId ID of the node that should be connected to.
* @param virtualId ID of the virtual node.
* @param partitioner The partitioner
*/
public void addVirtualPartitionNode(Integer originalId, Integer virtualId, StreamPartitioner<?> partitioner) {
if (virtuaPartitionNodes.containsKey(virtualId)) {
throw new IllegalStateException("Already has virtual partition node with id " + virtualId);
}
virtuaPartitionNodes.put(virtualId,
new Tuple2<Integer, StreamPartitioner<?>>(originalId, partitioner));
}
/**
* Determines the slot sharing group of an operation across virtual nodes.
*/
public String getSlotSharingGroup(Integer id) {
if (virtualSelectNodes.containsKey(id)) {
Integer mappedId = virtualSelectNodes.get(id).f0;
return getSlotSharingGroup(mappedId);
} else if (virtuaPartitionNodes.containsKey(id)) {
Integer mappedId = virtuaPartitionNodes.get(id).f0;
return getSlotSharingGroup(mappedId);
} else {
StreamNode node = getStreamNode(id);
return node.getSlotSharingGroup();
}
}
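// Minimal, self-contained sketch of the virtual-node indirection resolved above, using a plain
// map and hypothetical ids (4 and 5 stand in for generated virtual ids). Edges and properties
// requested against a virtual id are ultimately attached to the original node.
private static Integer resolveOriginalIdSketch() {
    Map<Integer, Integer> virtualToOriginal = new HashMap<>();
    virtualToOriginal.put(4, 1); // e.g. a virtual partition node created on top of node 1
    virtualToOriginal.put(5, 4); // e.g. a virtual select node stacked on the partition node
    Integer id = 5;
    while (virtualToOriginal.containsKey(id)) {
        id = virtualToOriginal.get(id);
    }
    return id; // resolves to 1, the original node
}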
public void addEdge(Integer upStreamVertexID, Integer downStreamVertexID, int typeNumber) {
addEdgeInternal(upStreamVertexID,
downStreamVertexID,
typeNumber,
null,
new ArrayList<String>());
}
private void addEdgeInternal(Integer upStreamVertexID,
Integer downStreamVertexID,
int typeNumber,
StreamPartitioner<?> partitioner,
List<String> outputNames) {
if (virtualSelectNodes.containsKey(upStreamVertexID)) {
int virtualId = upStreamVertexID;
upStreamVertexID = virtualSelectNodes.get(virtualId).f0;
if (outputNames.isEmpty()) {
// selections that happen downstream override earlier selections
outputNames = virtualSelectNodes.get(virtualId).f1;
}
addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, outputNames);
} else if (virtuaPartitionNodes.containsKey(upStreamVertexID)) {
int virtualId = upStreamVertexID;
upStreamVertexID = virtuaPartitionNodes.get(virtualId).f0;
if (partitioner == null) {
partitioner = virtuaPartitionNodes.get(virtualId).f1;
}
addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, outputNames);
} else {
StreamNode upstreamNode = getStreamNode(upStreamVertexID);
StreamNode downstreamNode = getStreamNode(downStreamVertexID);
// If no partitioner was specified and the parallelism of upstream and downstream
// operator matches use forward partitioning, use rebalance otherwise.
if (partitioner == null && upstreamNode.getParallelism() == downstreamNode.getParallelism()) {
partitioner = new ForwardPartitioner
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.jobgraph;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.blob.BlobClient;
import org.apache.flink.runtime.blob.BlobKey;
import org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings;
import java.io.IOException;
import java.io.Serializable;
import java.net.InetSocketAddress;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
/**
* The JobGraph represents a Flink dataflow program, at the low level that the JobManager accepts.
* All programs from higher level APIs are transformed into JobGraphs.
*
*
* The JobGraph is a graph of vertices and intermediate results that are connected together to
* form a DAG. Note that iterations (feedback edges) are currently not encoded inside the JobGraph
* but inside certain special vertices that establish the feedback channel amongst themselves.
*
*
* The JobGraph defines the job-wide configuration settings, while each vertex and intermediate result
* define the characteristics of the concrete operation and intermediate data.
*/
public class JobGraph implements Serializable {
private static final long serialVersionUID = 1L;
// --------------------------------------------------------------------------------------------
// Members that define the structure / topology of the graph
// --------------------------------------------------------------------------------------------
/** List of task vertices included in this job graph. */
private final Map<JobVertexID, JobVertex> taskVertices = new LinkedHashMap<JobVertexID, JobVertex>();
/** The job configuration attached to this job. */
private final Configuration jobConfiguration = new Configuration();
/** Set of JAR files required to run this job. */
private final List<Path> userJars = new ArrayList<Path>();
/** Set of blob keys identifying the JAR files required to run this job. */
private final List<BlobKey> userJarBlobKeys = new ArrayList<BlobKey>();
/** ID of this job. May be set if specific job id is desired (e.g. session management) */
private final JobID jobID;
/** Name of this job. */
private final String jobName;
/** Configuration which defines which restart strategy to use for the job recovery */
private RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration;
/** The number of seconds after which the corresponding ExecutionGraph is removed at the
* job manager after it has been executed. */
private long sessionTimeout = 0;
/** flag to enable queued scheduling */
private boolean allowQueuedScheduling;
/** The mode in which the job is scheduled */
private ScheduleMode scheduleMode = ScheduleMode.FROM_SOURCES;
/** The settings for asynchronous snapshots */
private JobSnapshottingSettings snapshotSettings;
/** List of classpaths required to run this job. */
private List<URL> classpaths = Collections.emptyList();
// --------------------------------------------------------------------------------------------
/**
* Constructs a new job graph with no name and a random job ID.
*/
public JobGraph() {
this((String) null);
}
/**
* Constructs a new job graph with the given name and a random job ID.
*
* @param jobName The name of the job
*/
public JobGraph(String jobName) {
this(null, jobName);
}
/**
* Constructs a new job graph with the given name and a random job ID if null supplied as an id.
*
* @param jobId The id of the job. A random ID is generated, if {@code null} is passed.
* @param jobName The name of the job.
*/
public JobGraph(JobID jobId, String jobName) {
this.jobID = jobId == null ? new JobID() : jobId;
this.jobName = jobName == null ? "(unnamed job)" : jobName;
}
/**
* Constructs a new job graph with no name, a random job ID, and the given vertices.
*
* @param vertices The vertices to add to the graph.
*/
public JobGraph(JobVertex... vertices) {
this(null, vertices);
}
/**
* Constructs a new job graph with the given name and a random job ID.
*
* @param jobName The name of the job.
* @param vertices The vertices to add to the graph.
*/
public JobGraph(String jobName, JobVertex... vertices) {
this(null, jobName, vertices);
}
/**
* Constructs a new job graph with the given name and a random job ID if null supplied as an id.
*
* @param jobId The id of the job. A random ID is generated, if {@code null} is passed.
* @param jobName The name of the job.
* @param vertices The vertices to add to the graph.
*/
public JobGraph(JobID jobId, String jobName, JobVertex... vertices) {
this(jobId, jobName);
for (JobVertex vertex : vertices) {
addVertex(vertex);
}
}
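// Illustrative sketch only (not part of the JobGraph API): building a small graph with the
// constructors above. The vertex names and the JAR path are placeholder values.
private static JobGraph constructionSketch() {
    JobVertex source = new JobVertex("example-source");
    JobVertex sink = new JobVertex("example-sink");
    JobGraph graph = new JobGraph("Example Job", source, sink);
    graph.addJar(new Path("/path/to/user-code.jar"));
    return graph;
}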
// --------------------------------------------------------------------------------------------
/**
* Returns the ID of the job.
*
* @return the ID of the job
*/
public JobID getJobID() {
return this.jobID;
}
/**
* Returns the name assigned to the job graph.
*
* @return the name assigned to the job graph
*/
public String getName() {
return this.jobName;
}
/**
* Returns the configuration object for this job. Job-wide parameters should be set into that
* configuration object.
*
* @return The configuration object for this job.
*/
public Configuration getJobConfiguration() {
return this.jobConfiguration;
}
/**
* Sets the restart strategy configuration. This configuration specifies the restart strategy
* to be used by the ExecutionGraph in case of a restart.
*
* @param restartStrategyConfiguration Restart strategy configuration to be set
*/
public void setRestartStrategyConfiguration(RestartStrategies.RestartStrategyConfiguration restartStrategyConfiguration) {
this.restartStrategyConfiguration = restartStrategyConfiguration;
}
/**
* Gets the restart strategy configuration
*
* @return Restart strategy configuration to be used
*/
public RestartStrategies.RestartStrategyConfiguration getRestartStrategyConfiguration() {
return restartStrategyConfiguration;
}
/**
* Gets the timeout after which the corresponding ExecutionGraph is removed at the
* job manager after it has been executed.
* @return a timeout as a long in seconds.
*/
public long getSessionTimeout() {
return sessionTimeout;
}
/**
* Sets the timeout of the session in seconds. The timeout specifies how long a job will be kept
* in the job manager after it finishes.
* @param sessionTimeout The timeout in seconds
*/
public void setSessionTimeout(long sessionTimeout) {
this.sessionTimeout = sessionTimeout;
}
public void setAllowQueuedScheduling(boolean allowQueuedScheduling) {
this.allowQueuedScheduling = allowQueuedScheduling;
}
public boolean getAllowQueuedScheduling() {
return allowQueuedScheduling;
}
public void setScheduleMode(ScheduleMode scheduleMode) {
this.scheduleMode = scheduleMode;
}
public ScheduleMode getScheduleMode() {
return scheduleMode;
}
/**
* Adds a new task vertex to the job graph if it is not already included.
*
* @param vertex
* the new task vertex to be added
*/
public void addVertex(JobVertex vertex) {
final JobVertexID id = vertex.getID();
JobVertex previous = taskVertices.put(id, vertex);
// if we had a prior association, restore and throw an exception
if (previous != null) {
taskVertices.put(id, previous);
throw new IllegalArgumentException("The JobGraph already contains a vertex with that id.");
}
}
/**
* Returns an Iterable to iterate all vertices registered with the job graph.
*
* @return an Iterable to iterate all vertices registered with the job graph
*/
public Iterable<JobVertex> getVertices() {
return this.taskVertices.values();
}
/**
* Returns an array of all job vertices that are registered with the job graph. The order in which the vertices
* appear in the list is not defined.
*
* @return an array of all job vertices that are registered with the job graph
*/
public JobVertex[] getVerticesAsArray() {
return this.taskVertices.values().toArray(new JobVertex[this.taskVertices.size()]);
}
/**
* Returns the number of all vertices.
*
* @return The number of all vertices.
*/
public int getNumberOfVertices() {
return this.taskVertices.size();
}
/**
* Sets the settings for asynchronous snapshots. A value of {@code null} means that
* snapshotting is not enabled.
*
* @param settings The snapshot settings, or null, to disable snapshotting.
*/
public void setSnapshotSettings(JobSnapshottingSettings settings) {
this.snapshotSettings = settings;
}
/**
* Gets the settings for asynchronous snapshots. This method returns null, when
* snapshotting is not enabled.
*
* @return The snapshot settings, or null, if snapshotting is not enabled.
*/
public JobSnapshottingSettings getSnapshotSettings() {
return snapshotSettings;
}
/**
* Searches for a vertex with a matching ID and returns it.
*
* @param id
* the ID of the vertex to search for
* @return the vertex with the matching ID or null if no vertex with such ID could be found
*/
public JobVertex findVertexByID(JobVertexID id) {
return this.taskVertices.get(id);
}
/**
* Sets the classpaths required to run the job on a task manager.
*
* @param paths paths of the directories/JAR files required to run the job on a task manager
*/
public void setClasspaths(List<URL> paths) {
classpaths = paths;
}
public List<URL> getClasspaths() {
return classpaths;
}
/**
* Sets the savepoint path to rollback the deployment to.
*
* @param savepointPath The savepoint path
*/
public void setSavepointPath(String savepointPath) {
if (savepointPath != null) {
if (snapshotSettings == null) {
throw new IllegalStateException("Checkpointing disabled");
}
else {
snapshotSettings.setSavepointPath(savepointPath);
}
}
}
// --------------------------------------------------------------------------------------------
public List<JobVertex> getVerticesSortedTopologicallyFromSources() throws InvalidProgramException {
// early out on empty lists
if (this.taskVertices.isEmpty()) {
return Collections.emptyList();
}
List<JobVertex> sorted = new ArrayList<JobVertex>(this.taskVertices.size());
Set<JobVertex> remaining = new LinkedHashSet<JobVertex>(this.taskVertices.values());
// start by finding the vertices with no input edges
// and the ones with disconnected inputs (that refer to some standalone data set)
{
Iterator<JobVertex> iter = remaining.iterator();
while (iter.hasNext()) {
JobVertex vertex = iter.next();
if (vertex.hasNoConnectedInputs()) {
sorted.add(vertex);
iter.remove();
}
}
}
int startNodePos = 0;
// traverse from the nodes that were added until we found all elements
while (!remaining.isEmpty()) {
// first check if we have more candidates to start traversing from. if not, then the
// graph is cyclic, which is not permitted
if (startNodePos >= sorted.size()) {
throw new InvalidProgramException("The job graph is cyclic.");
}
JobVertex current = sorted.get(startNodePos++);
addNodesThatHaveNoNewPredecessors(current, sorted, remaining);
}
return sorted;
}
private void addNodesThatHaveNoNewPredecessors(JobVertex start, List<JobVertex> target, Set<JobVertex> remaining) {
// forward traverse over all produced data sets and all their consumers
for (IntermediateDataSet dataSet : start.getProducedDataSets()) {
for (JobEdge edge : dataSet.getConsumers()) {
// a vertex can be added, if it has no predecessors that are still in the 'remaining' set
JobVertex v = edge.getTarget();
if (!remaining.contains(v)) {
continue;
}
boolean hasNewPredecessors = false;
for (JobEdge e : v.getInputs()) {
// skip the edge through which we came
if (e == edge) {
continue;
}
IntermediateDataSet source = e.getSource();
if (remaining.contains(source.getProducer())) {
hasNewPredecessors = true;
break;
}
}
if (!hasNewPredecessors) {
target.add(v);
remaining.remove(v);
addNodesThatHaveNoNewPredecessors(v, target, remaining);
}
}
}
}
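// Minimal, self-contained sketch of the same idea as the two methods above: a Kahn-style
// topological ordering over a generic successor map. The node type and method name are
// illustrative and not part of the JobGraph API.
private static <T> List<T> topologicalOrderSketch(Map<T, List<T>> successors) {
    // count the incoming edges of every node that appears in the map
    Map<T, Integer> inDegree = new LinkedHashMap<T, Integer>();
    for (Map.Entry<T, List<T>> entry : successors.entrySet()) {
        if (!inDegree.containsKey(entry.getKey())) {
            inDegree.put(entry.getKey(), 0);
        }
        for (T successor : entry.getValue()) {
            Integer d = inDegree.get(successor);
            inDegree.put(successor, d == null ? 1 : d + 1);
        }
    }
    // seed the order with the nodes that have no predecessors, then grow it
    List<T> sorted = new ArrayList<T>();
    for (Map.Entry<T, Integer> entry : inDegree.entrySet()) {
        if (entry.getValue() == 0) {
            sorted.add(entry.getKey());
        }
    }
    int pos = 0;
    while (pos < sorted.size()) {
        T node = sorted.get(pos++);
        List<T> nodeSuccessors = successors.get(node);
        if (nodeSuccessors == null) {
            continue;
        }
        for (T successor : nodeSuccessors) {
            int remaining = inDegree.get(successor) - 1;
            inDegree.put(successor, remaining);
            if (remaining == 0) {
                sorted.add(successor); // last predecessor emitted, successor is ready
            }
        }
    }
    if (sorted.size() != inDegree.size()) {
        throw new InvalidProgramException("The graph is cyclic.");
    }
    return sorted;
}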
// --------------------------------------------------------------------------------------------
// Handling of attached JAR files
// --------------------------------------------------------------------------------------------
/**
* Adds the path of a JAR file required to run the job on a task manager.
*
* @param jar
* path of the JAR file required to run the job on a task manager
*/
public void addJar(Path jar) {
if (jar == null) {
throw new IllegalArgumentException();
}
if (!userJars.contains(jar)) {
userJars.add(jar);
}
}
/**
* Adds the BLOB referenced by the key to the JobGraph's dependencies.
*
* @param key
* path of the JAR file required to run the job on a task manager
*/
public void addBlob(BlobKey key) {
if (key == null) {
throw new IllegalArgumentException();
}
if (!userJarBlobKeys.contains(key)) {
userJarBlobKeys.add(key);
}
}
/**
* Checks whether the JobGraph has user code JAR files attached.
*
* @return True, if the JobGraph has user code JAR files attached, false otherwise.
*/
public boolean hasUsercodeJarFiles() {
return this.userJars.size() > 0;
}
/**
* Returns a set of BLOB keys referring to the JAR files required to run this job.
*
* @return set of BLOB keys referring to the JAR files required to run this job
*/
public List<BlobKey> getUserJarBlobKeys() {
return this.userJarBlobKeys;
}
/**
* Uploads the previously added user JAR files to the job manager through the job manager's BLOB server.
*
* @param serverAddress
* the network address of the BLOB server
* @throws IOException
* thrown if an I/O error occurs during the upload
*/
public void uploadRequiredJarFiles(InetSocketAddress serverAddress) throws IOException {
if (this.userJars.isEmpty()) {
return;
}
BlobClient bc = null;
try {
bc = new BlobClient(serverAddress);
for (final Path jar : this.userJars) {
final FileSystem fs = jar.getFileSystem();
FSDataInputStream is = null;
try {
is = fs.open(jar);
final BlobKey key = bc.put(is);
this.userJarBlobKeys.add(key);
}
finally {
if (is != null) {
is.close();
}
}
}
}
finally {
if (bc != null) {
bc.close();
}
}
}
/**
* Gets the maximum parallelism of all operations in this job graph.
* @return The maximum parallelism of this job graph
*/
public int getMaximumParallelism() {
int maxParallelism = -1;
for (JobVertex vertex : taskVertices.values()) {
maxParallelism = Math.max(vertex.getParallelism(), maxParallelism);
}
return maxParallelism;
}
@Override
public String toString() {
return "JobGraph(jobId: " + jobID + ")";
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.api.graph;
import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.FileSourceFunction;
import org.apache.flink.streaming.api.transformations.CoFeedbackTransformation;
import org.apache.flink.streaming.api.transformations.FeedbackTransformation;
import org.apache.flink.streaming.api.transformations.OneInputTransformation;
import org.apache.flink.streaming.api.transformations.PartitionTransformation;
import org.apache.flink.streaming.api.transformations.SelectTransformation;
import org.apache.flink.streaming.api.transformations.SinkTransformation;
import org.apache.flink.streaming.api.transformations.SourceTransformation;
import org.apache.flink.streaming.api.transformations.SplitTransformation;
import org.apache.flink.streaming.api.transformations.StreamTransformation;
import org.apache.flink.streaming.api.transformations.TwoInputTransformation;
import org.apache.flink.streaming.api.transformations.UnionTransformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* A generator that generates a {@link StreamGraph} from a graph of
* {@link StreamTransformation StreamTransformations}.
*
*
* This traverses the tree of {@code StreamTransformations} starting from the sinks. At each
* transformation we recursively transform the inputs, then create a node in the {@code StreamGraph}
* and add edges from the input Nodes to our newly created node. The transformation methods
* return the IDs of the nodes in the StreamGraph that represent the input transformation. Several
* IDs can be returned to be able to deal with feedback transformations and unions.
*
*
* Partitioning, split/select and union don't create actual nodes in the {@code StreamGraph}. For
* these, we create a virtual node in the {@code StreamGraph} that holds the specific property, i.e.
* partitioning, selector and so on. When an edge is created from a virtual node to a downstream
* node, the {@code StreamGraph} resolves the id of the original node and creates an edge
* in the graph with the desired property. For example, if you have this graph:
*
*
* Map-1 -> HashPartition-2 -> Map-3
*
*
* where the numbers represent transformation IDs. We first recurse all the way down. {@code Map-1}
* is transformed, i.e. we create a {@code StreamNode} with ID 1. Then we transform the
* {@code HashPartition}; for this, we create a virtual node with ID 4 that holds the property
* {@code HashPartition}. This transformation returns the ID 4. Then we transform {@code Map-3}.
* We add the edge {@code 4 -> 3}. The {@code StreamGraph} resolves the actual node with ID 1 and
* creates an edge {@code 1 -> 3} with the property HashPartition.
*/
@Internal
public class StreamGraphGenerator {
private static final Logger LOG = LoggerFactory.getLogger(StreamGraphGenerator.class);
// The StreamGraph that is being built, this is initialized at the beginning.
private StreamGraph streamGraph;
private final StreamExecutionEnvironment env;
// This is used to assign a unique ID to iteration source/sink
protected static Integer iterationIdCounter = 0;
public static int getNewIterationNodeId() {
iterationIdCounter--;
return iterationIdCounter;
}
// Keep track of which Transforms we have already transformed, this is necessary because
// we have loops, i.e. feedback edges.
private Map<StreamTransformation<?>, Collection<Integer>> alreadyTransformed;
/**
* Private constructor. The generator should only be invoked using {@link #generate}.
*/
private StreamGraphGenerator(StreamExecutionEnvironment env) {
this.streamGraph = new StreamGraph(env);
this.streamGraph.setChaining(env.isChainingEnabled());
this.streamGraph.setStateBackend(env.getStateBackend());
this.env = env;
this.alreadyTransformed = new HashMap<>();
}
/**
* Generates a {@code StreamGraph} by traversing the graph of {@code StreamTransformations}
* starting from the given transformations.
*
* @param env The {@code StreamExecutionEnvironment} that is used to set some parameters of the
* job
* @param transformations The transformations starting from which to transform the graph
*
* @return The generated {@code StreamGraph}
*/
public static StreamGraph generate(StreamExecutionEnvironment env, List<StreamTransformation<?>> transformations) {
return new StreamGraphGenerator(env).generateInternal(transformations);
}
/**
* This starts the actual transformation, beginning from the sinks.
*/
private StreamGraph generateInternal(List<StreamTransformation<?>> transformations) {
for (StreamTransformation<?> transformation: transformations) {
transform(transformation);
}
return streamGraph;
}
/**
* Transforms one {@code StreamTransformation}.
*
*
* This checks whether we already transformed it and exits early in that case. If not it
* delegates to one of the transformation specific methods.
*/
private Collection<Integer> transform(StreamTransformation<?> transform) {
if (alreadyTransformed.containsKey(transform)) {
return alreadyTransformed.get(transform);
}
LOG.debug("Transforming " + transform);
// call at least once to trigger exceptions about MissingTypeInfo
transform.getOutputType();
Collection<Integer> transformedIds;
if (transform instanceof OneInputTransformation<?, ?>) {
transformedIds = transformOnInputTransform((OneInputTransformation<?, ?>) transform);
} else if (transform instanceof TwoInputTransformation<?, ?, ?>) {
transformedIds = transformTwoInputTransform((TwoInputTransformation<?, ?, ?>) transform);
} else if (transform instanceof SourceTransformation<?>) {
transformedIds = transformSource((SourceTransformation<?>) transform);
} else if (transform instanceof SinkTransformation<?>) {
transformedIds = transformSink((SinkTransformation<?>) transform);
} else if (transform instanceof UnionTransformation<?>) {
transformedIds = transformUnion((UnionTransformation<?>) transform);
} else if (transform instanceof SplitTransformation<?>) {
transformedIds = transformSplit((SplitTransformation<?>) transform);
} else if (transform instanceof SelectTransformation<?>) {
transformedIds = transformSelect((SelectTransformation<?>) transform);
} else if (transform instanceof FeedbackTransformation<?>) {
transformedIds = transformFeedback((FeedbackTransformation<?>) transform);
} else if (transform instanceof CoFeedbackTransformation<?>) {
transformedIds = transformCoFeedback((CoFeedbackTransformation<?>) transform);
} else if (transform instanceof PartitionTransformation<?>) {
transformedIds = transformPartition((PartitionTransformation<?>) transform);
} else {
throw new IllegalStateException("Unknown transformation: " + transform);
}
// need this check because the iterate transformation adds itself before
// transforming the feedback edges
if (!alreadyTransformed.containsKey(transform)) {
alreadyTransformed.put(transform, transformedIds);
}
if (transform.getBufferTimeout() > 0) {
streamGraph.setBufferTimeout(transform.getId(), transform.getBufferTimeout());
}
if (transform.getUid() != null) {
streamGraph.setTransformationId(transform.getId(), transform.getUid());
}
return transformedIds;
}
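// Minimal sketch of the memoization pattern used by transform() above, with hypothetical
// node and result types; the real method additionally dispatches on the concrete
// StreamTransformation subclass before caching.
private static <N, R> R transformMemoizedSketch(
        N node, Map<N, R> memo, java.util.function.Function<N, R> doTransform) {
    R cached = memo.get(node);
    if (cached != null) {
        return cached;              // already transformed, e.g. reached via a feedback edge
    }
    R result = doTransform.apply(node);
    if (!memo.containsKey(node)) {  // a recursive call may have registered the node meanwhile
        memo.put(node, result);
    }
    return result;
}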
/**
* Transforms a {@code UnionTransformation}.
*
*
* This is easy, we only have to transform the inputs and return all the IDs in a list so
* that downstream operations can connect to all upstream nodes.
*/
private <T> Collection<Integer> transformUnion(UnionTransformation<T> union) {
List<StreamTransformation<T>> inputs = union.getInputs();
List<Integer> resultIds = new ArrayList<>();
for (StreamTransformation<T> input: inputs) {
resultIds.addAll(transform(input));
}
return resultIds;
}
/**
* Transforms a {@code PartitionTransformation}.
*
*
* For this we create a virtual node in the {@code StreamGraph} that holds the partition
* property. @see StreamGraphGenerator
*/
private <T> Collection<Integer> transformPartition(PartitionTransformation<T> partition) {
StreamTransformation<T> input = partition.getInput();
List<Integer> resultIds = new ArrayList<>();
Collection<Integer> transformedIds = transform(input);
for (Integer transformedId: transformedIds) {
int virtualId = StreamTransformation.getNewNodeId();
streamGraph.addVirtualPartitionNode(transformedId, virtualId, partition.getPartitioner());
resultIds.add(virtualId);
}
return resultIds;
}
/**
* Transforms a {@code SplitTransformation}.
*
*
* We add the output selector to previously transformed nodes.
*/
private <T> Collection<Integer> transformSplit(SplitTransformation<T> split) {
StreamTransformation<T> input = split.getInput();
Collection<Integer> resultIds = transform(input);
// the recursive transform call might have transformed this already
if (alreadyTransformed.containsKey(split)) {
return alreadyTransformed.get(split);
}
for (int inputId : resultIds) {
streamGraph.addOutputSelector(inputId, split.getOutputSelector());
}
return resultIds;
}
/**
* Transforms a {@code SelectTransformation}.
*
*
* For this we create a virtual node in the {@code StreamGraph} that holds the selected names.
* @see org.apache.flink.streaming.api.graph.StreamGraphGenerator
*/
private <T> Collection<Integer> transformSelect(SelectTransformation<T> select) {
StreamTransformation<T> input = select.getInput();
Collection<Integer> resultIds = transform(input);
// the recursive transform might have already transformed this
if (alreadyTransformed.containsKey(select)) {
return alreadyTransformed.get(select);
}
List<Integer> virtualResultIds = new ArrayList<>();
for (int inputId : resultIds) {
int virtualId = StreamTransformation.getNewNodeId();
streamGraph.addVirtualSelectNode(inputId, virtualId, select.getSelectedNames());
virtualResultIds.add(virtualId);
}
return virtualResultIds;
}
/**
* Transforms a {@code FeedbackTransformation}.
*
*
* This will recursively transform the input and the feedback edges. We return the concatenation
* of the input IDs and the feedback IDs so that downstream operations can be wired to both.
*
*
* This is responsible for creating the IterationSource and IterationSink which
* are used to feed back the elements.
*/
private <T> Collection<Integer> transformFeedback(FeedbackTransformation<T> iterate) {
if (iterate.getFeedbackEdges().size() <= 0) {
throw new IllegalStateException("Iteration " + iterate + " does not have any feedback edges.");
}
StreamTransformation<T> input = iterate.getInput();
List<Integer> resultIds = new ArrayList<>();
// first transform the input stream(s) and store the result IDs
Collection<Integer> inputIds = transform(input);
resultIds.addAll(inputIds);
// the recursive transform might have already transformed this
if (alreadyTransformed.containsKey(iterate)) {
return alreadyTransformed.get(iterate);
}
// create the fake iteration source/sink pair
Tuple2<StreamNode, StreamNode> itSourceAndSink = streamGraph.createIterationSourceAndSink(
iterate.getId(),
getNewIterationNodeId(),
getNewIterationNodeId(),
iterate.getWaitTime(),
iterate.getParallelism());
StreamNode itSource = itSourceAndSink.f0;
StreamNode itSink = itSourceAndSink.f1;
// We set the proper serializers for the sink/source
streamGraph.setSerializers(itSource.getId(), null, null, iterate.getOutputType().createSerializer(env.getConfig()));
streamGraph.setSerializers(itSink.getId(), iterate.getOutputType().createSerializer(env.getConfig()), null, null);
// also add the feedback source ID to the result IDs, so that downstream operators will
// add both as input
resultIds.add(itSource.getId());
// add the iterate to the already-seen-set with the result IDs, so that we can transform
// the feedback edges and let them stop when encountering the iterate node
alreadyTransformed.put(iterate, resultIds);
// so that we can determine the slot sharing group from all feedback edges
List<Integer> allFeedbackIds = new ArrayList<>();
for (StreamTransformation<T> feedbackEdge : iterate.getFeedbackEdges()) {
Collection<Integer> feedbackIds = transform(feedbackEdge);
allFeedbackIds.addAll(feedbackIds);
for (Integer feedbackId: feedbackIds) {
streamGraph.addEdge(feedbackId,
itSink.getId(),
0
);
}
}
String slotSharingGroup = determineSlotSharingGroup(null, allFeedbackIds);
itSink.setSlotSharingGroup(slotSharingGroup);
itSource.setSlotSharingGroup(slotSharingGroup);
return resultIds;
}
/**
* Transforms a {@code CoFeedbackTransformation}.
*
*
* This will only transform feedback edges, the result of this transform will be wired
* to the second input of a Co-Transform. The original input is wired directly to the first
* input of the downstream Co-Transform.
*
*
* This is responsible for creating the IterationSource and IterationSink which
* are used to feed back the elements.
*/
private <F> Collection<Integer> transformCoFeedback(CoFeedbackTransformation<F> coIterate) {
// For Co-Iteration we don't need to transform the input and wire the input to the
// head operator by returning the input IDs, the input is directly wired to the left
// input of the co-operation. This transform only needs to return the ids of the feedback
// edges, since they need to be wired to the second input of the co-operation.
// create the fake iteration source/sink pair
Tuple2<StreamNode, StreamNode> itSourceAndSink = streamGraph.createIterationSourceAndSink(
coIterate.getId(),
getNewIterationNodeId(),
getNewIterationNodeId(),
coIterate.getWaitTime(),
coIterate.getParallelism());
StreamNode itSource = itSourceAndSink.f0;
StreamNode itSink = itSourceAndSink.f1;
// We set the proper serializers for the sink/source
streamGraph.setSerializers(itSource.getId(), null, null, coIterate.getOutputType().createSerializer(env.getConfig()));
streamGraph.setSerializers(itSink.getId(), coIterate.getOutputType().createSerializer(env.getConfig()), null, null);
Collection<Integer> resultIds = Collections.singleton(itSource.getId());
// add the iterate to the already-seen-set with the result IDs, so that we can transform
// the feedback edges and let them stop when encountering the iterate node
alreadyTransformed.put(coIterate, resultIds);
// so that we can determine the slot sharing group from all feedback edges
List<Integer> allFeedbackIds = new ArrayList<>();
for (StreamTransformation<F> feedbackEdge : coIterate.getFeedbackEdges()) {
Collection<Integer> feedbackIds = transform(feedbackEdge);
allFeedbackIds.addAll(feedbackIds);
for (Integer feedbackId: feedbackIds) {
streamGraph.addEdge(feedbackId,
itSink.getId(),
0
);
}
}
String slotSharingGroup = determineSlotSharingGroup(null, allFeedbackIds);
itSink.setSlotSharingGroup(slotSharingGroup);
itSource.setSlotSharingGroup(slotSharingGroup);
return Collections.singleton(itSource.getId());
}
/**
* Transforms a {@code SourceTransformation}.
*/
private <T> Collection<Integer> transformSource(SourceTransformation<T> source) {
String slotSharingGroup = determineSlotSharingGroup(source.getSlotSharingGroup(), new ArrayList<Integer>());
streamGraph.addSource(source.getId(),
slotSharingGroup,
source.getOperator(),
null,
source.getOutputType(),
"Source: " + source.getName());
if (source.getOperator().getUserFunction() instanceof FileSourceFunction) {
FileSourceFunction fs = (FileSourceFunction) source.getOperator().getUserFunction();
streamGraph.setInputFormat(source.getId(), fs.getFormat());
}
streamGraph.setParallelism(source.getId(), source.getParallelism());
return Collections.singleton(source.getId());
}
/**
* Transforms a {@code SinkTransformation}.
*/
private <T> Collection<Integer> transformSink(SinkTransformation<T> sink) {
Collection<Integer> inputIds = transform(sink.getInput());
String slotSharingGroup = determineSlotSharingGroup(sink.getSlotSharingGroup(), inputIds);
streamGraph.addSink(sink.getId(),
slotSharingGroup,
sink.getOperator(),
sink.getInput().getOutputType(),
null,
"Sink: " + sink.getName());
streamGraph.setParallelism(sink.getId(), sink.getParallelism());
for (Integer inputId: inputIds) {
streamGraph.addEdge(inputId,
sink.getId(),
0
);
}
if (sink.getStateKeySelector() != null) {
TypeSerializer<?> keySerializer = sink.getStateKeyType().createSerializer(env.getConfig());
streamGraph.setOneInputStateKey(sink.getId(), sink.getStateKeySelector(), keySerializer);
}
return Collections.emptyList();
}
/**
* Transforms a {@code OneInputTransformation}.
*
*
* This recursively transforms the input, creates a new {@code StreamNode} in the graph and
* wires the input to this new node.
*/
private <IN, OUT> Collection<Integer> transformOnInputTransform(OneInputTransformation<IN, OUT> transform) {
Collection<Integer> inputIds = transform(transform.getInput());
// the recursive call might have already transformed this
if (alreadyTransformed.containsKey(transform)) {
return alreadyTransformed.get(transform);
}
String slotSharingGroup = determineSlotSharingGroup(transform.getSlotSharingGroup(), inputIds);
streamGraph.addOperator(transform.getId(),
slotSharingGroup,
transform.getOperator(),
transform.getInputType(),
transform.getOutputType(),
transform.getName());
if (transform.getStateKeySelector() != null) {
TypeSerializer<?> keySerializer = transform.getStateKeyType().createSerializer(env.getConfig());
streamGraph.setOneInputStateKey(transform.getId(), transform.getStateKeySelector(), keySerializer);
}
streamGraph.setParallelism(transform.getId(), transform.getParallelism());
for (Integer inputId: inputIds) {
streamGraph.addEdge(inputId, transform.getId(), 0);
}
return Collections.singleton(transform.getId());
}
/**
* Transforms a {@code TwoInputTransformation}.
*
*
* This recursively transforms the inputs, creates a new {@code StreamNode} in the graph and
* wires the inputs to this new node.
*/
private <IN1, IN2, OUT> Collection<Integer> transformTwoInputTransform(TwoInputTransformation<IN1, IN2, OUT> transform) {
Collection<Integer> inputIds1 = transform(transform.getInput1());
Collection<Integer> inputIds2 = transform(transform.getInput2());
// the recursive call might have already transformed this
if (alreadyTransformed.containsKey(transform)) {
return alreadyTransformed.get(transform);
}
List allInputIds = new ArrayList<>();
allInputIds.addAll(inputIds1);
allInputIds.addAll(inputIds2);
String slotSharingGroup = determineSlotSharingGroup(transform.getSlotSharingGroup(), allInputIds);
streamGraph.addCoOperator(
transform.getId(),
slotSharingGroup,
transform.getOperator(),
transform.getInputType1(),
transform.getInputType2(),
transform.getOutputType(),
transform.getName());
if (transform.getStateKeySelector1() != null) {
TypeSerializer<?> keySerializer = transform.getStateKeyType().createSerializer(env.getConfig());
streamGraph.setTwoInputStateKey(transform.getId(), transform.getStateKeySelector1(), transform.getStateKeySelector2(), keySerializer);
}
streamGraph.setParallelism(transform.getId(), transform.getParallelism());
for (Integer inputId: inputIds1) {
streamGraph.addEdge(inputId,
transform.getId(),
1
);
}
for (Integer inputId: inputIds2) {
streamGraph.addEdge(inputId,
transform.getId(),
2
);
}
return Collections.singleton(transform.getId());
}
/**
* Determines the slot sharing group for an operation based on the slot sharing group set by
* the user and the slot sharing groups of the inputs.
*
*
* If the user specifies a group name, this is taken as is. If nothing is specified and
* the input operations all have the same group name then this name is taken. Otherwise the
* default group is chosen.
*
* @param specifiedGroup The group specified by the user.
* @param inputIds The IDs of the input operations.
*/
private String determineSlotSharingGroup(String specifiedGroup, Collection<Integer> inputIds) {
if (specifiedGroup != null) {
return specifiedGroup;
} else {
String inputGroup = null;
for (int id: inputIds) {
String inputGroupCandidate = streamGraph.getSlotSharingGroup(id);
if (inputGroup == null) {
inputGroup = inputGroupCandidate;
} else if (!inputGroup.equals(inputGroupCandidate)) {
return "default";
}
}
return inputGroup == null ? "default" : inputGroup;
}
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.api.graph;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;
import org.apache.commons.lang3.StringUtils;
import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.operators.util.UserCodeObjectWrapper;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.io.network.partition.ResultPartitionType;
import org.apache.flink.runtime.jobgraph.DistributionPattern;
import org.apache.flink.runtime.jobgraph.InputFormatVertex;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.ScheduleMode;
import org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable;
import org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationGroup;
import org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup;
import org.apache.flink.runtime.operators.util.TaskConfig;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.operators.AbstractUdfStreamOperator;
import org.apache.flink.streaming.api.operators.ChainingStrategy;
import org.apache.flink.streaming.api.operators.StreamOperator;
import org.apache.flink.streaming.api.transformations.StreamTransformation;
import org.apache.flink.streaming.runtime.partitioner.ForwardPartitioner;
import org.apache.flink.streaming.runtime.partitioner.RescalePartitioner;
import org.apache.flink.streaming.runtime.partitioner.StreamPartitioner;
import org.apache.flink.streaming.runtime.tasks.StreamIterationHead;
import org.apache.flink.streaming.runtime.tasks.StreamIterationTail;
import org.apache.flink.util.InstantiationUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;
import static org.apache.flink.util.StringUtils.byteToHexString;
@Internal
public class StreamingJobGraphGenerator {
private static final Logger LOG = LoggerFactory.getLogger(StreamingJobGraphGenerator.class);
/**
* Restart delay used for the FixedDelayRestartStrategy in case checkpointing was enabled but
* no restart strategy has been specified.
*/
private static final long DEFAULT_RESTART_DELAY = 10000L;
private StreamGraph streamGraph;
private Map<Integer, JobVertex> jobVertices;
private JobGraph jobGraph;
private Collection<Integer> builtVertices;
private List<StreamEdge> physicalEdgesInOrder;
private Map<Integer, Map<Integer, StreamConfig>> chainedConfigs;
private Map<Integer, StreamConfig> vertexConfigs;
private Map<Integer, String> chainedNames;
public StreamingJobGraphGenerator(StreamGraph streamGraph) {
this.streamGraph = streamGraph;
}
private void init() {
this.jobVertices = new HashMap<>();
this.builtVertices = new HashSet<>();
this.chainedConfigs = new HashMap<>();
this.vertexConfigs = new HashMap<>();
this.chainedNames = new HashMap<>();
this.physicalEdgesInOrder = new ArrayList<>();
}
public JobGraph createJobGraph() {
jobGraph = new JobGraph(streamGraph.getJobName());
// make sure that all vertices start immediately
jobGraph.setScheduleMode(ScheduleMode.ALL);
init();
// Generate deterministic hashes for the nodes in order to identify them across
// submissions if they didn't change.
Map<Integer, byte[]> hashes = traverseStreamGraphAndGenerateHashes();
setChaining(hashes);
setPhysicalEdges();
setSlotSharing();
configureCheckpointing();
configureRestartStrategy();
try {
InstantiationUtil.writeObjectToConfig(this.streamGraph.getExecutionConfig(), this.jobGraph.getJobConfiguration(), ExecutionConfig.CONFIG_KEY);
} catch (IOException e) {
throw new RuntimeException("Config object could not be written to Job Configuration: ", e);
}
return jobGraph;
}
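// Minimal usage sketch, assuming a fully built StreamGraph instance named streamGraph:
//
//   JobGraph jobGraph = new StreamingJobGraphGenerator(streamGraph).createJobGraph();
//
// createJobGraph() runs the phases above in order: hash generation, operator chaining,
// physical edge wiring, slot sharing, checkpoint configuration, and restart strategy setup;
// finally it serializes the ExecutionConfig into the job configuration.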
private void setPhysicalEdges() {
Map<Integer, List<StreamEdge>> physicalInEdgesInOrder = new HashMap<Integer, List<StreamEdge>>();
for (StreamEdge edge : physicalEdgesInOrder) {
int target = edge.getTargetId();
List<StreamEdge> inEdges = physicalInEdgesInOrder.get(target);
// create if not set
if (inEdges == null) {
inEdges = new ArrayList<>();
physicalInEdgesInOrder.put(target, inEdges);
}
inEdges.add(edge);
}
for (Map.Entry<Integer, List<StreamEdge>> inEdges : physicalInEdgesInOrder.entrySet()) {
int vertex = inEdges.getKey();
List<StreamEdge> edgeList = inEdges.getValue();
vertexConfigs.get(vertex).setInPhysicalEdges(edgeList);
}
}
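// For illustration: setPhysicalEdges() groups the non-chained edges by target vertex ID,
// so hypothetical edges (1 -> 3) and (2 -> 3) both end up in the in-edge list stored in the
// StreamConfig of vertex 3.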
/**
* Sets up task chains from the source {@link StreamNode} instances.
*
* <p>This will recursively create all {@link JobVertex} instances.
*/
private void setChaining(Map<Integer, byte[]> hashes) {
for (Integer sourceNodeId : streamGraph.getSourceIDs()) {
createChain(sourceNodeId, sourceNodeId, hashes);
}
}
private List<StreamEdge> createChain(
Integer startNodeId,
Integer currentNodeId,
Map<Integer, byte[]> hashes) {
if (!builtVertices.contains(startNodeId)) {
List<StreamEdge> transitiveOutEdges = new ArrayList<StreamEdge>();
List<StreamEdge> chainableOutputs = new ArrayList<StreamEdge>();
List<StreamEdge> nonChainableOutputs = new ArrayList<StreamEdge>();
for (StreamEdge outEdge : streamGraph.getStreamNode(currentNodeId).getOutEdges()) {
if (isChainable(outEdge)) {
chainableOutputs.add(outEdge);
} else {
nonChainableOutputs.add(outEdge);
}
}
for (StreamEdge chainable : chainableOutputs) {
transitiveOutEdges.addAll(createChain(startNodeId, chainable.getTargetId(), hashes));
}
for (StreamEdge nonChainable : nonChainableOutputs) {
transitiveOutEdges.add(nonChainable);
createChain(nonChainable.getTargetId(), nonChainable.getTargetId(), hashes);
}
chainedNames.put(currentNodeId, createChainedName(currentNodeId, chainableOutputs));
StreamConfig config = currentNodeId.equals(startNodeId)
? createJobVertex(startNodeId, hashes)
: new StreamConfig(new Configuration());
setVertexConfig(currentNodeId, config, chainableOutputs, nonChainableOutputs);
if (currentNodeId.equals(startNodeId)) {
config.setChainStart();
config.setOutEdgesInOrder(transitiveOutEdges);
config.setOutEdges(streamGraph.getStreamNode(currentNodeId).getOutEdges());
for (StreamEdge edge : transitiveOutEdges) {
connect(startNodeId, edge);
}
config.setTransitiveChainedTaskConfigs(chainedConfigs.get(startNodeId));
} else {
Map<Integer, StreamConfig> chainedConfs = chainedConfigs.get(startNodeId);
if (chainedConfs == null) {
chainedConfigs.put(startNodeId, new HashMap<Integer, StreamConfig>());
}
chainedConfigs.get(startNodeId).put(currentNodeId, config);
}
return transitiveOutEdges;
} else {
return new ArrayList<>();
}
}
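// For illustration, take a hypothetical topology source -> map -> rebalance -> sink:
//   chain 1: source -> map   (forward partitioner, equal parallelism, hence chainable)
//   chain 2: sink             (the rebalance edge is non-chainable and is returned as a
//                              transitive out edge that later connects the two job vertices)
// Only the chain head (startNodeId) gets a real JobVertex; chained members only get a
// StreamConfig registered in chainedConfigs under the head's ID.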
private String createChainedName(Integer vertexID, List<StreamEdge> chainedOutputs) {
String operatorName = streamGraph.getStreamNode(vertexID).getOperatorName();
if (chainedOutputs.size() > 1) {
List outputChainedNames = new ArrayList<>();
for (StreamEdge chainable : chainedOutputs) {
outputChainedNames.add(chainedNames.get(chainable.getTargetId()));
}
return operatorName + " -> (" + StringUtils.join(outputChainedNames, ", ") + ")";
} else if (chainedOutputs.size() == 1) {
return operatorName + " -> " + chainedNames.get(chainedOutputs.get(0).getTargetId());
} else {
return operatorName;
}
}
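// For illustration, with hypothetical operator names the chained display names look like:
//   "Source -> Map"               (one chained output)
//   "Source -> (Map, Filter)"     (several chained outputs)
//   "Sink"                        (no chained outputs)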
private StreamConfig createJobVertex(
Integer streamNodeId,
Map<Integer, byte[]> hashes) {
JobVertex jobVertex;
StreamNode streamNode = streamGraph.getStreamNode(streamNodeId);
byte[] hash = hashes.get(streamNodeId);
if (hash == null) {
throw new IllegalStateException("Cannot find node hash. " +
"Did you generate them before calling this method?");
}
JobVertexID jobVertexId = new JobVertexID(hash);
if (streamNode.getInputFormat() != null) {
jobVertex = new InputFormatVertex(
chainedNames.get(streamNodeId),
jobVertexId);
TaskConfig taskConfig = new TaskConfig(jobVertex.getConfiguration());
taskConfig.setStubWrapper(new UserCodeObjectWrapper<Object>(streamNode.getInputFormat()));
} else {
jobVertex = new JobVertex(
chainedNames.get(streamNodeId),
jobVertexId);
}
jobVertex.setInvokableClass(streamNode.getJobVertexClass());
int parallelism = streamNode.getParallelism();
if (parallelism > 0) {
jobVertex.setParallelism(parallelism);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Parallelism set: {} for {}", parallelism, streamNodeId);
}
jobVertices.put(streamNodeId, jobVertex);
builtVertices.add(streamNodeId);
jobGraph.addVertex(jobVertex);
return new StreamConfig(jobVertex.getConfiguration());
}
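// Note: the JobVertexID is derived from the deterministic 16-byte node hash (murmur3_128)
// rather than from a random ID, so resubmitting an unchanged program yields the same
// vertex IDs. Sketch, assuming the hash has already been generated for this node:
//   byte[] hash = hashes.get(streamNodeId);   // 16 bytes
//   JobVertexID id = new JobVertexID(hash);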
@SuppressWarnings("unchecked")
private void setVertexConfig(Integer vertexID, StreamConfig config,
List<StreamEdge> chainableOutputs, List<StreamEdge> nonChainableOutputs) {
StreamNode vertex = streamGraph.getStreamNode(vertexID);
config.setVertexID(vertexID);
config.setBufferTimeout(vertex.getBufferTimeout());
config.setTypeSerializerIn1(vertex.getTypeSerializerIn1());
config.setTypeSerializerIn2(vertex.getTypeSerializerIn2());
config.setTypeSerializerOut(vertex.getTypeSerializerOut());
config.setStreamOperator(vertex.getOperator());
config.setOutputSelectors(vertex.getOutputSelectors());
config.setNumberOfOutputs(nonChainableOutputs.size());
config.setNonChainedOutputs(nonChainableOutputs);
config.setChainedOutputs(chainableOutputs);
config.setTimeCharacteristic(streamGraph.getEnvironment().getStreamTimeCharacteristic());
final CheckpointConfig checkpointCfg = streamGraph.getCheckpointConfig();
config.setStateBackend(streamGraph.getStateBackend());
config.setCheckpointingEnabled(checkpointCfg.isCheckpointingEnabled());
if (checkpointCfg.isCheckpointingEnabled()) {
config.setCheckpointMode(checkpointCfg.getCheckpointingMode());
}
else {
// the "at-least-once" input handler is slightly cheaper (in the absence of checkpoints),
// so we use that one if checkpointing is not enabled
config.setCheckpointMode(CheckpointingMode.AT_LEAST_ONCE);
}
config.setStatePartitioner(0, vertex.getStatePartitioner1());
config.setStatePartitioner(1, vertex.getStatePartitioner2());
config.setStateKeySerializer(vertex.getStateKeySerializer());
Class<? extends AbstractInvokable> vertexClass = vertex.getJobVertexClass();
if (vertexClass.equals(StreamIterationHead.class)
|| vertexClass.equals(StreamIterationTail.class)) {
config.setIterationId(streamGraph.getBrokerID(vertexID));
config.setIterationWaitTime(streamGraph.getLoopTimeout(vertexID));
}
List<StreamEdge> allOutputs = new ArrayList<>(chainableOutputs);
allOutputs.addAll(nonChainableOutputs);
vertexConfigs.put(vertexID, config);
}
private void connect(Integer headOfChain, StreamEdge edge) {
physicalEdgesInOrder.add(edge);
Integer downStreamvertexID = edge.getTargetId();
JobVertex headVertex = jobVertices.get(headOfChain);
JobVertex downStreamVertex = jobVertices.get(downStreamvertexID);
StreamConfig downStreamConfig = new StreamConfig(downStreamVertex.getConfiguration());
downStreamConfig.setNumberOfInputs(downStreamConfig.getNumberOfInputs() + 1);
StreamPartitioner<?> partitioner = edge.getPartitioner();
if (partitioner instanceof ForwardPartitioner) {
downStreamVertex.connectNewDataSetAsInput(
headVertex,
DistributionPattern.POINTWISE,
ResultPartitionType.PIPELINED,
true);
} else if (partitioner instanceof RescalePartitioner) {
downStreamVertex.connectNewDataSetAsInput(
headVertex,
DistributionPattern.POINTWISE,
ResultPartitionType.PIPELINED,
true);
} else {
downStreamVertex.connectNewDataSetAsInput(
headVertex,
DistributionPattern.ALL_TO_ALL,
ResultPartitionType.PIPELINED,
true);
}
if (LOG.isDebugEnabled()) {
LOG.debug("CONNECTED: {} - {} -> {}", partitioner.getClass().getSimpleName(),
headOfChain, downStreamvertexID);
}
}
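// For illustration: connect() maps the stream partitioner to the runtime distribution pattern:
//   ForwardPartitioner  -> DistributionPattern.POINTWISE
//   RescalePartitioner  -> DistributionPattern.POINTWISE
//   anything else       -> DistributionPattern.ALL_TO_ALL
// All connections created here use ResultPartitionType.PIPELINED.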
private boolean isChainable(StreamEdge edge) {
StreamNode upStreamVertex = edge.getSourceVertex();
StreamNode downStreamVertex = edge.getTargetVertex();
StreamOperator<?> headOperator = upStreamVertex.getOperator();
StreamOperator<?> outOperator = downStreamVertex.getOperator();
return downStreamVertex.getInEdges().size() == 1
&& outOperator != null
&& headOperator != null
&& upStreamVertex.isSameSlotSharingGroup(downStreamVertex)
&& outOperator.getChainingStrategy() == ChainingStrategy.ALWAYS
&& (headOperator.getChainingStrategy() == ChainingStrategy.HEAD ||
headOperator.getChainingStrategy() == ChainingStrategy.ALWAYS)
&& (edge.getPartitioner() instanceof ForwardPartitioner)
&& upStreamVertex.getParallelism() == downStreamVertex.getParallelism()
&& streamGraph.isChainingEnabled();
}
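// For illustration: an edge is chainable only if the downstream node has exactly one input,
// both operators are non-null, both nodes share the same slot sharing group, the chaining
// strategies allow it (downstream ALWAYS, upstream HEAD or ALWAYS), the edge uses a
// ForwardPartitioner, the parallelism matches, and chaining is enabled on the StreamGraph.
// A non-forward partitioner (e.g. after a keyBy) or a parallelism change therefore breaks
// the chain.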
private void setSlotSharing() {
Map<String, SlotSharingGroup> slotSharingGroups = new HashMap<>();
for (Entry<Integer, JobVertex> entry : jobVertices.entrySet()) {
String slotSharingGroup = streamGraph.getStreamNode(entry.getKey()).getSlotSharingGroup();
SlotSharingGroup group = slotSharingGroups.get(slotSharingGroup);
if (group == null) {
group = new SlotSharingGroup();
slotSharingGroups.put(slotSharingGroup, group);
}
entry.getValue().setSlotSharingGroup(group);
}
for (Tuple2<StreamNode, StreamNode> pair : streamGraph.getIterationSourceSinkPairs()) {
CoLocationGroup ccg = new CoLocationGroup();
JobVertex source = jobVertices.get(pair.f0.getId());
JobVertex sink = jobVertices.get(pair.f1.getId());
ccg.addVertex(source);
ccg.addVertex(sink);
source.updateCoLocationGroup(ccg);
sink.updateCoLocationGroup(ccg);
}
}
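// For illustration: every job vertex whose stream node carries the same slot sharing group
// name is added to one shared SlotSharingGroup instance, and each iteration source/sink pair
// additionally gets a CoLocationGroup so head and tail are scheduled together.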
private void configureCheckpointing() {
CheckpointConfig cfg = streamGraph.getCheckpointConfig();
if (cfg.isCheckpointingEnabled()) {
long interval = cfg.getCheckpointInterval();
if (interval < 1) {
throw new IllegalArgumentException("The checkpoint interval must be positive");
}
// collect the vertices that receive "trigger checkpoint" messages.
// currently, these are all the sources
List<JobVertexID> triggerVertices = new ArrayList<>();
// collect the vertices that need to acknowledge the checkpoint
// currently, these are all vertices
List<JobVertexID> ackVertices = new ArrayList<>(jobVertices.size());
// collect the vertices that receive "commit checkpoint" messages
// currently, these are all vertices
List<JobVertexID> commitVertices = new ArrayList<>();
for (JobVertex vertex : jobVertices.values()) {
if (vertex.isInputVertex()) {
triggerVertices.add(vertex.getID());
}
// TODO: add check whether the user function implements the checkpointing interface
commitVertices.add(vertex.getID());
ackVertices.add(vertex.getID());
}
JobSnapshottingSettings settings = new JobSnapshottingSettings(
triggerVertices, ackVertices, commitVertices, interval,
cfg.getCheckpointTimeout(), cfg.getMinPauseBetweenCheckpoints(),
cfg.getMaxConcurrentCheckpoints());
jobGraph.setSnapshotSettings(settings);
// check if a restart strategy has been set, if not then set the FixedDelayRestartStrategy
if (streamGraph.getExecutionConfig().getRestartStrategy() == null) {
// if the user enabled checkpointing, the default number of execution retries is infinite.
streamGraph.getExecutionConfig().setRestartStrategy(
RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, DEFAULT_RESTART_DELAY));
}
}
}
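// For illustration: if checkpointing is enabled and no restart strategy was configured, the
// fallback above is roughly equivalent to the user calling (sketch, assuming env is the
// StreamExecutionEnvironment of the job):
//   env.getConfig().setRestartStrategy(
//       RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 10000L /* DEFAULT_RESTART_DELAY */));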
private void configureRestartStrategy() {
jobGraph.setRestartStrategyConfiguration(streamGraph.getExecutionConfig().getRestartStrategy());
}
// ------------------------------------------------------------------------
/**
* Returns a map with a hash for each {@link StreamNode} of the {@link
* StreamGraph}. The hash is used as the {@link JobVertexID} in order to
* identify nodes across job submissions if they didn't change.
*
* <p>The complete {@link StreamGraph} is traversed. The hash is either
* computed from the transformation's user-specified id (see
* {@link StreamTransformation#getUid()}) or generated in a deterministic way.
*
* <p>The generated hash is deterministic with respect to:
* <ul>
* <li>node-local properties (like parallelism and the user function class),</li>
* <li>chained output nodes, and</li>
* <li>input node hashes.</li>
* </ul>
*
* @return A map from {@link StreamNode#id} to hash as 16-byte array.
*/
private Map<Integer, byte[]> traverseStreamGraphAndGenerateHashes() {
// The hash function used to generate the hash
final HashFunction hashFunction = Hashing.murmur3_128(0);
final Map<Integer, byte[]> hashes = new HashMap<>();
Set<Integer> visited = new HashSet<>();
Queue<StreamNode> remaining = new ArrayDeque<>();
// We need to make the source order deterministic. The source IDs are
// not returned in the same order, which means that submitting the same
// program twice might result in different traversal, which breaks the
// deterministic hash assignment.
List<Integer> sources = new ArrayList<>();
for (Integer sourceNodeId : streamGraph.getSourceIDs()) {
sources.add(sourceNodeId);
}
Collections.sort(sources);
//
// Traverse the graph in a breadth-first manner. Keep in mind that
// the graph is not a tree and multiple paths to nodes can exist.
//
// Start with source nodes
for (Integer sourceNodeId : sources) {
remaining.add(streamGraph.getStreamNode(sourceNodeId));
visited.add(sourceNodeId);
}
StreamNode currentNode;
while ((currentNode = remaining.poll()) != null) {
// Generate the hash code. Because multiple paths can exist to each
// node, we might not have all required inputs available to
// generate the hash code.
if (generateNodeHash(currentNode, hashFunction, hashes)) {
// Add the child nodes
for (StreamEdge outEdge : currentNode.getOutEdges()) {
StreamNode child = outEdge.getTargetVertex();
if (!visited.contains(child.getId())) {
remaining.add(child);
visited.add(child.getId());
}
}
}
else {
// We will revisit this later.
visited.remove(currentNode.getId());
}
}
return hashes;
}
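// For illustration: the traversal is breadth-first from the sorted source IDs. If a node is
// dequeued before all of its inputs have hashes, it is removed from the visited set again so
// that a later out edge re-enqueues it; every node is therefore hashed only after all of its
// upstream hashes exist.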
/**
* Generates a hash for the node and returns whether the operation was
* successful.
*
* @param node The node to generate the hash for
* @param hashFunction The hash function to use
* @param hashes The current state of generated hashes
* @return true if the node hash has been generated.
* false otherwise. If the operation is not successful, the
* hash needs to be generated at a later point when all inputs are available.
* @throws IllegalStateException If node has user-specified hash and is
* intermediate node of a chain
*/
private boolean generateNodeHash(
StreamNode node,
HashFunction hashFunction,
Map<Integer, byte[]> hashes) {
// Check for user-specified ID
String userSpecifiedHash = node.getTransformationId();
if (userSpecifiedHash == null) {
// Check that all input nodes have their hashes computed
for (StreamEdge inEdge : node.getInEdges()) {
// If the input node has not been visited yet, the current
// node will be visited again at a later point when all input
// nodes have been visited and their hashes set.
if (!hashes.containsKey(inEdge.getSourceId())) {
return false;
}
}
Hasher hasher = hashFunction.newHasher();
byte[] hash = generateDeterministicHash(node, hasher, hashes);
if (hashes.put(node.getId(), hash) != null) {
// Sanity check
throw new IllegalStateException("Unexpected state. Tried to add node hash " +
"twice. This is probably a bug in the JobGraph generator.");
}
return true;
}
else {
// Check that this node is not part of a chain. This is currently
// not supported, because the runtime takes the snapshots by the
// operator ID of the first vertex in a chain. It's OK if the node
// has chained outputs.
for (StreamEdge inEdge : node.getInEdges()) {
if (isChainable(inEdge)) {
throw new UnsupportedOperationException("Cannot assign user-specified hash "
+ "to intermediate node in chain. This will be supported in future "
+ "versions of Flink. As a work around start new chain at task "
+ node.getOperatorName() + ".");
}
}
Hasher hasher = hashFunction.newHasher();
byte[] hash = generateUserSpecifiedHash(node, hasher);
for (byte[] previousHash : hashes.values()) {
if (Arrays.equals(previousHash, hash)) {
throw new IllegalArgumentException("Hash collision on user-specified ID. " +
"Most likely cause is a non-unique ID. Please check that all IDs " +
"specified via `uid(String)` are unique.");
}
}
if (hashes.put(node.getId(), hash) != null) {
// Sanity check
throw new IllegalStateException("Unexpected state. Tried to add node hash " +
"twice. This is probably a bug in the JobGraph generator.");
}
return true;
}
}
/**
* Generates a hash from a user-specified ID.
*/
private byte[] generateUserSpecifiedHash(StreamNode node, Hasher hasher) {
hasher.putString(node.getTransformationId(), Charset.forName("UTF-8"));
return hasher.hash().asBytes();
}
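// Self-contained sketch of the same Guava call, assuming a hypothetical uid "my-operator"
// set via uid(String):
//   byte[] bytes = Hashing.murmur3_128(0)
//       .newHasher()
//       .putString("my-operator", Charset.forName("UTF-8"))
//       .hash()
//       .asBytes();   // 16 bytes, later used as the JobVertexID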
/**
* Generates a deterministic hash from node-local properties and input and
* output edges.
*/
private byte[] generateDeterministicHash(
StreamNode node,
Hasher hasher,
Map<Integer, byte[]> hashes) {
// Include stream node to hash. We use the current size of the computed
// hashes as the ID. We cannot use the node's ID, because it is
// assigned from a static counter; using it would result in two identical
// programs having different hashes.
generateNodeLocalHash(node, hasher, hashes.size());
// Include chained nodes to hash
for (StreamEdge outEdge : node.getOutEdges()) {
if (isChainable(outEdge)) {
StreamNode chainedNode = outEdge.getTargetVertex();
// Use the hash size again, because the nodes are chained to
// this node. This does not add a hash for the chained nodes.
generateNodeLocalHash(chainedNode, hasher, hashes.size());
}
}
byte[] hash = hasher.hash().asBytes();
// Make sure that all input nodes have their hash set before entering
// this loop (calling this method).
for (StreamEdge inEdge : node.getInEdges()) {
byte[] otherHash = hashes.get(inEdge.getSourceId());
// Sanity check
if (otherHash == null) {
throw new IllegalStateException("Missing hash for input node "
+ inEdge.getSourceVertex() + ". Cannot generate hash for "
+ node + ".");
}
for (int j = 0; j < hash.length; j++) {
hash[j] = (byte) (hash[j] * 37 ^ otherHash[j]);
}
}
if (LOG.isDebugEnabled()) {
String udfClassName = "";
if (node.getOperator() instanceof AbstractUdfStreamOperator) {
udfClassName = ((AbstractUdfStreamOperator<?, ?>) node.getOperator())
.getUserFunction().getClass().getName();
}
LOG.debug("Generated hash '" + byteToHexString(hash) + "' for node " +
"'" + node.toString() + "' {id: " + node.getId() + ", " +
"parallelism: " + node.getParallelism() + ", " +
"user function: " + udfClassName + "}");
}
return hash;
}
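// For illustration: after hashing the node-local properties and the chained outputs, every
// input hash is folded in byte-wise via hash[j] = (byte) (hash[j] * 37 ^ otherHash[j]), so a
// node's hash changes whenever any of its upstream hashes change.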
/**
* Applies the {@link Hasher} to the {@link StreamNode} (only node local
* attributes are taken into account). The hasher encapsulates the current
* state of the hash.
*
* <p>The specified ID is local to this node. We cannot use the
* {@link StreamNode#id}, because it is assigned from a static counter;
* otherwise the IDs for identical jobs would differ.
*/
private void generateNodeLocalHash(StreamNode node, Hasher hasher, int id) {
// This resolves conflicts for otherwise identical source nodes. BUT
// the generated hash codes depend on the ordering of the nodes in the
// stream graph.
hasher.putInt(id);
hasher.putInt(node.getParallelism());
if (node.getOperator() instanceof AbstractUdfStreamOperator) {
String udfClassName = ((AbstractUdfStreamOperator<?, ?>) node.getOperator())
.getUserFunction().getClass().getName();
hasher.putString(udfClassName, Charset.forName("UTF-8"));
}
}
}