10_Flink Streaming jobSubmit

./bin/flink run ./examples/batch/WordCount.jar

After the job is submitted through the shell, Flink ships the JobGraph generated from the program, together with the user JAR, to the JobManager (JM). The JobManager (comparable to Storm's Nimbus) then splits the job into tasks and hands them to TaskManagers (one TaskManager per JVM) for execution.

The JM turns the JobGraph into an ExecutionGraph for scheduling. On the client side, Client.getOptimizedPlan produces the optimized plan, and the job is finally submitted and started via JobClient.submitJobAndWait. The launcher script (bin/flink), the CLI entry point (org.apache.flink.client.CliFrontend), and the client class (org.apache.flink.client.program.Client) are reproduced below.

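The whole client-side path can be condensed into a short sketch. This is not code taken from the sources below, only a minimal illustration of the same API calls (the configuration directory and the parallelism are placeholders):

import java.io.File;

import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.client.program.Client;
import org.apache.flink.client.program.PackagedProgram;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.GlobalConfiguration;

public class SubmitSketch {
	public static void main(String[] args) throws Exception {
		// Load flink-conf.yaml; it has to contain the JobManager address.
		GlobalConfiguration.loadConfiguration("/path/to/flink/conf");   // placeholder path
		Configuration config = GlobalConfiguration.getConfiguration();

		// Wrap the user JAR; the entry point class is read from the JAR manifest.
		PackagedProgram program = new PackagedProgram(new File("./examples/batch/WordCount.jar"));

		// The Client compiles the program into an OptimizedPlan, translates it into a
		// JobGraph (user JARs attached) and ships it to the JobManager via Akka.
		Client client = new Client(config);
		try {
			JobSubmissionResult result = client.runBlocking(program, 1);   // blocks until the job finishes
			System.out.println("Finished job " + result.getJobID());
		} finally {
			client.shutdown();
			program.deleteExtractedLibraries();
		}
	}
}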

#!/usr/bin/env bash
################################################################################
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
################################################################################

target="$0"
# For the case, the executable has been directly symlinked, figure out
# the correct bin path by following its symlink up to an upper bound.
# Note: we can't use the readlink utility here if we want to be POSIX
# compatible.
iteration=0
while [ -L "$target" ]; do
    if [ "$iteration" -gt 100 ]; then
        echo "Cannot resolve path: You have a cyclic symlink in $target."
        break
    fi
    ls=`ls -ld -- "$target"`
    target=`expr "$ls" : '.* -> \(.*\)$'`
    iteration=$((iteration + 1))
done

# Convert relative path to absolute path
bin=`dirname "$target"`

# get flink config
. "$bin"/config.sh

if [ "$FLINK_IDENT_STRING" = "" ]; then
        FLINK_IDENT_STRING="$USER"
fi

CC_CLASSPATH=`constructFlinkClassPath`

log=$FLINK_LOG_DIR/flink-$FLINK_IDENT_STRING-client-$HOSTNAME.log
log_setting=(-Dlog.file="$log" -Dlog4j.configuration=file:"$FLINK_CONF_DIR"/log4j-cli.properties -Dlogback.configurationFile=file:"$FLINK_CONF_DIR"/logback.xml)

export FLINK_ROOT_DIR
export FLINK_CONF_DIR

# Add HADOOP_CLASSPATH to allow the usage of Hadoop file systems
$JAVA_RUN $JVM_ARGS "${log_setting[@]}" -classpath "`manglePathList "$CC_CLASSPATH:$INTERNAL_HADOOP_CLASSPATHS"`" org.apache.flink.client.CliFrontend "$@"



/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.client;

import akka.actor.ActorSystem;

import org.apache.commons.cli.CommandLine;

import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.api.common.accumulators.AccumulatorHelper;
import org.apache.flink.client.cli.CancelOptions;
import org.apache.flink.client.cli.CliArgsException;
import org.apache.flink.client.cli.CliFrontendParser;
import org.apache.flink.client.cli.CommandLineOptions;
import org.apache.flink.client.cli.InfoOptions;
import org.apache.flink.client.cli.ListOptions;
import org.apache.flink.client.cli.ProgramOptions;
import org.apache.flink.client.cli.RunOptions;
import org.apache.flink.client.cli.SavepointOptions;
import org.apache.flink.client.cli.StopOptions;
import org.apache.flink.client.program.Client;
import org.apache.flink.client.program.PackagedProgram;
import org.apache.flink.client.program.ProgramInvocationException;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.GlobalConfiguration;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.optimizer.DataStatistics;
import org.apache.flink.optimizer.Optimizer;
import org.apache.flink.optimizer.costs.DefaultCostEstimator;
import org.apache.flink.optimizer.plan.FlinkPlan;
import org.apache.flink.optimizer.plan.OptimizedPlan;
import org.apache.flink.optimizer.plan.StreamingPlan;
import org.apache.flink.optimizer.plandump.PlanJSONDumpGenerator;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.client.JobStatusMessage;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.jobgraph.JobStatus;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.messages.JobManagerMessages;
import org.apache.flink.runtime.messages.JobManagerMessages.CancelJob;
import org.apache.flink.runtime.messages.JobManagerMessages.CancellationFailure;
import org.apache.flink.runtime.messages.JobManagerMessages.RunningJobsStatus;
import org.apache.flink.runtime.messages.JobManagerMessages.StopJob;
import org.apache.flink.runtime.messages.JobManagerMessages.StoppingFailure;
import org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint;
import org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepointSuccess;
import org.apache.flink.runtime.security.SecurityUtils;
import org.apache.flink.runtime.util.EnvironmentInformation;
import org.apache.flink.runtime.util.LeaderRetrievalUtils;
import org.apache.flink.runtime.yarn.AbstractFlinkYarnClient;
import org.apache.flink.runtime.yarn.AbstractFlinkYarnCluster;
import org.apache.flink.runtime.yarn.FlinkYarnClusterStatus;
import org.apache.flink.util.StringUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import scala.Some;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetSocketAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.TimeUnit;

import static org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint;
import static org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepointFailure;
import static org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepointFailure;

/**
 * Implementation of a simple command line frontend for executing programs.
 */
public class CliFrontend {

	// actions
	public static final String ACTION_RUN = "run";
	public static final String ACTION_INFO = "info";
	private static final String ACTION_LIST = "list";
	private static final String ACTION_CANCEL = "cancel";
	private static final String ACTION_STOP = "stop";
	private static final String ACTION_SAVEPOINT = "savepoint";

	// config dir parameters
	private static final String ENV_CONFIG_DIRECTORY = "FLINK_CONF_DIR";
	private static final String CONFIG_DIRECTORY_FALLBACK_1 = "../conf";
	private static final String CONFIG_DIRECTORY_FALLBACK_2 = "conf";

	// YARN-session related constants
	public static final String YARN_PROPERTIES_FILE = ".yarn-properties-";
	public static final String YARN_PROPERTIES_JOBMANAGER_KEY = "jobManager";
	public static final String YARN_PROPERTIES_PARALLELISM = "parallelism";
	public static final String YARN_PROPERTIES_DYNAMIC_PROPERTIES_STRING = "dynamicPropertiesString";

	public static final String YARN_DYNAMIC_PROPERTIES_SEPARATOR = "@@"; // this has to be a regex for String.split()

	/**
	 * A special host name used to run a job by deploying Flink into a YARN cluster,
	 * if this string is specified as the JobManager address
	 */
	public static final String YARN_DEPLOY_JOBMANAGER = "yarn-cluster";

	// --------------------------------------------------------------------------------------------
	// --------------------------------------------------------------------------------------------

	private static final Logger LOG = LoggerFactory.getLogger(CliFrontend.class);


	private final Configuration config;

	private final FiniteDuration clientTimeout;

	private final FiniteDuration lookupTimeout;

	private ActorSystem actorSystem;

	private AbstractFlinkYarnCluster yarnCluster;

	/**
	 *
	 * @throws Exception Thrown if the configuration directory was not found, the configuration could not
	 *                   be loaded, or the YARN properties could not be parsed.
	 */
	public CliFrontend() throws Exception {
		this(getConfigurationDirectoryFromEnv());
	}

	public CliFrontend(String configDir) throws Exception {

		// configure the config directory
		File configDirectory = new File(configDir);
		LOG.info("Using configuration directory " + configDirectory.getAbsolutePath());

		// load the configuration
		LOG.info("Trying to load configuration file");
		GlobalConfiguration.loadConfiguration(configDirectory.getAbsolutePath());
		this.config = GlobalConfiguration.getConfiguration();

		// load the YARN properties
		String defaultPropertiesFileLocation = System.getProperty("java.io.tmpdir");
		String currentUser = System.getProperty("user.name");
		String propertiesFileLocation = config.getString(ConfigConstants.YARN_PROPERTIES_FILE_LOCATION, defaultPropertiesFileLocation);

		File propertiesFile = new File(propertiesFileLocation, CliFrontend.YARN_PROPERTIES_FILE + currentUser);
		if (propertiesFile.exists()) {

			logAndSysout("Found YARN properties file " + propertiesFile.getAbsolutePath());

			Properties yarnProperties = new Properties();
			try {
				try (InputStream is = new FileInputStream(propertiesFile)) {
					yarnProperties.load(is);
				}
			}
			catch (IOException e) {
				throw new Exception("Cannot read the YARN properties file", e);
			}

			// configure the default parallelism from YARN
			String propParallelism = yarnProperties.getProperty(YARN_PROPERTIES_PARALLELISM);
			if (propParallelism != null) { // maybe the property is not set
				try {
					int parallelism = Integer.parseInt(propParallelism);
					this.config.setInteger(ConfigConstants.DEFAULT_PARALLELISM_KEY, parallelism);

					logAndSysout("YARN properties set default parallelism to " + parallelism);
				}
				catch (NumberFormatException e) {
					throw new Exception("Error while parsing the YARN properties: " +
							"Property " + YARN_PROPERTIES_PARALLELISM + " is not an integer.");
				}
			}

			// get the JobManager address from the YARN properties
			String address = yarnProperties.getProperty(YARN_PROPERTIES_JOBMANAGER_KEY);
			InetSocketAddress jobManagerAddress;
			if (address != null) {
				try {
					jobManagerAddress = parseHostPortAddress(address);
					// store address in config from where it is retrieved by the retrieval service
					writeJobManagerAddressToConfig(jobManagerAddress);
				}
				catch (Exception e) {
					throw new Exception("YARN properties contain an invalid entry for JobManager address.", e);
				}

				logAndSysout("Using JobManager address from YARN properties " + jobManagerAddress);
			}

			// handle the YARN client's dynamic properties
			String dynamicPropertiesEncoded = yarnProperties.getProperty(YARN_PROPERTIES_DYNAMIC_PROPERTIES_STRING);
			Map<String, String> dynamicProperties = getDynamicProperties(dynamicPropertiesEncoded);
			for (Map.Entry<String, String> dynamicProperty : dynamicProperties.entrySet()) {
				this.config.setString(dynamicProperty.getKey(), dynamicProperty.getValue());
			}
		}

		try {
			FileSystem.setDefaultScheme(config);
		} catch (IOException e) {
			throw new Exception("Error while setting the default " +
				"filesystem scheme from configuration.", e);
		}

		this.clientTimeout = AkkaUtils.getClientTimeout(config);
		this.lookupTimeout = AkkaUtils.getLookupTimeout(config);
	}


	// --------------------------------------------------------------------------------------------
	//  Getter & Setter
	// --------------------------------------------------------------------------------------------

	/**
	 * Getter which returns a copy of the associated configuration
	 *
	 * @return Copy of the associated configuration
	 */
	public Configuration getConfiguration() {
		Configuration copiedConfiguration = new Configuration();

		copiedConfiguration.addAll(config);

		return copiedConfiguration;
	}


	// --------------------------------------------------------------------------------------------
	//  Execute Actions
	// --------------------------------------------------------------------------------------------

	/**
	 * Executions the run action.
	 * 
	 * @param args Command line arguments for the run action.
	 */
	protected int run(String[] args) {
		LOG.info("Running 'run' command.");

		RunOptions options;
		try {
			options = CliFrontendParser.parseRunCommand(args);
		}
		catch (CliArgsException e) {
			return handleArgException(e);
		}
		catch (Throwable t) {
			return handleError(t);
		}

		// evaluate help flag
		if (options.isPrintHelp()) {
			CliFrontendParser.printHelpForRun();
			return 0;
		}

		if (options.getJarFilePath() == null) {
			return handleArgException(new CliArgsException("The program JAR file was not specified."));
		}

		PackagedProgram program;
		try {
			LOG.info("Building program from JAR file");
			program = buildProgram(options);
		}
		catch (FileNotFoundException e) {
			return handleArgException(e);
		}
		catch (Throwable t) {
			return handleError(t);
		}

		int exitCode = 1;
		try {
			int userParallelism = options.getParallelism();
			LOG.debug("User parallelism is set to {}", userParallelism);

			Client client = getClient(options, program.getMainClassName(), userParallelism, options.getDetachedMode());
			client.setPrintStatusDuringExecution(options.getStdoutLogging());
			LOG.debug("Client slots is set to {}", client.getMaxSlots());

			LOG.debug("Savepoint path is set to {}", options.getSavepointPath());

			try {
				if (client.getMaxSlots() != -1 && userParallelism == -1) {
					logAndSysout("Using the parallelism provided by the remote cluster ("+client.getMaxSlots()+"). " +
							"To use another parallelism, set it at the ./bin/flink client.");
					userParallelism = client.getMaxSlots();
				}

				// detached mode
				if (options.getDetachedMode() || (yarnCluster != null && yarnCluster.isDetached())) {
					exitCode = executeProgramDetached(program, client, userParallelism);
				}
				else {
					exitCode = executeProgramBlocking(program, client, userParallelism);
				}

				// show YARN cluster status if its not a detached YARN cluster.
				if (yarnCluster != null && !yarnCluster.isDetached()) {
					List<String> msgs = yarnCluster.getNewMessages();
					if (msgs != null && msgs.size() > 1) {

						logAndSysout("The following messages were created by the YARN cluster while running the Job:");
						for (String msg : msgs) {
							logAndSysout(msg);
						}
					}
					if (yarnCluster.hasFailed()) {
						logAndSysout("YARN cluster is in failed state!");
						logAndSysout("YARN Diagnostics: " + yarnCluster.getDiagnostics());
					}
				}

				return exitCode;
			}
			finally {
				client.shutdown();
			}
		}
		catch (Throwable t) {
			return handleError(t);
		}
		finally {
			if (yarnCluster != null && !yarnCluster.isDetached()) {
				logAndSysout("Shutting down YARN cluster");
				yarnCluster.shutdown(exitCode != 0);
			}
			if (program != null) {
				program.deleteExtractedLibraries();
			}
		}
	}

	/**
	 * Executes the info action.
	 * 
	 * @param args Command line arguments for the info action.
	 */
	protected int info(String[] args) {
		LOG.info("Running 'info' command.");

		// Parse command line options
		InfoOptions options;
		try {
			options = CliFrontendParser.parseInfoCommand(args);
		}
		catch (CliArgsException e) {
			return handleArgException(e);
		}
		catch (Throwable t) {
			return handleError(t);
		}

		// evaluate help flag
		if (options.isPrintHelp()) {
			CliFrontendParser.printHelpForInfo();
			return 0;
		}

		if (options.getJarFilePath() == null) {
			return handleArgException(new CliArgsException("The program JAR file was not specified."));
		}

		// -------- build the packaged program -------------

		PackagedProgram program;
		try {
			LOG.info("Building program from JAR file");
			program = buildProgram(options);
		}
		catch (Throwable t) {
			return handleError(t);
		}

		try {
			int parallelism = options.getParallelism();

			LOG.info("Creating program plan dump");

			Optimizer compiler = new Optimizer(new DataStatistics(), new DefaultCostEstimator(), config);
			FlinkPlan flinkPlan = Client.getOptimizedPlan(compiler, program, parallelism);
			
			String jsonPlan = null;
			if (flinkPlan instanceof OptimizedPlan) {
				jsonPlan = new PlanJSONDumpGenerator().getOptimizerPlanAsJSON((OptimizedPlan) flinkPlan);
			} else if (flinkPlan instanceof StreamingPlan) {
				jsonPlan = ((StreamingPlan) flinkPlan).getStreamingPlanAsJSON();
			}

			if (jsonPlan != null) {
				System.out.println("----------------------- Execution Plan -----------------------");
				System.out.println(jsonPlan);
				System.out.println("--------------------------------------------------------------");
			}
			else {
				System.out.println("JSON plan could not be generated.");
			}

			String description = program.getDescription();
			if (description != null) {
				System.out.println();
				System.out.println(description);
			}
			else {
				System.out.println();
				System.out.println("No description provided.");
			}
			return 0;
		}
		catch (Throwable t) {
			return handleError(t);
		}
		finally {
			program.deleteExtractedLibraries();
		}
	}

	/**
	 * Executes the list action.
	 * 
	 * @param args Command line arguments for the list action.
	 */
	protected int list(String[] args) {
		LOG.info("Running 'list' command.");

		ListOptions options;
		try {
			options = CliFrontendParser.parseListCommand(args);
		}
		catch (CliArgsException e) {
			return handleArgException(e);
		}
		catch (Throwable t) {
			return handleError(t);
		}

		// evaluate help flag
		if (options.isPrintHelp()) {
			CliFrontendParser.printHelpForList();
			return 0;
		}

		boolean running = options.getRunning();
		boolean scheduled = options.getScheduled();

		// print running and scheduled jobs if not option supplied
		if (!running && !scheduled) {
			running = true;
			scheduled = true;
		}

		try {
			ActorGateway jobManagerGateway = getJobManagerGateway(options);

			LOG.info("Connecting to JobManager to retrieve list of jobs");
			Future<Object> response = jobManagerGateway.ask(
				JobManagerMessages.getRequestRunningJobsStatus(),
				clientTimeout);

			Object result;
			try {
				result = Await.result(response, clientTimeout);
			}
			catch (Exception e) {
				throw new Exception("Could not retrieve running jobs from the JobManager.", e);
			}

			if (result instanceof RunningJobsStatus) {
				LOG.info("Successfully retrieved list of jobs");

				List<JobStatusMessage> jobs = ((RunningJobsStatus) result).getStatusMessages();

				ArrayList<JobStatusMessage> runningJobs = null;
				ArrayList<JobStatusMessage> scheduledJobs = null;
				if (running) {
					runningJobs = new ArrayList<JobStatusMessage>();
				}
				if (scheduled) {
					scheduledJobs = new ArrayList<JobStatusMessage>();
				}

				for (JobStatusMessage rj : jobs) {
					if (running && (rj.getJobState().equals(JobStatus.RUNNING)
							|| rj.getJobState().equals(JobStatus.RESTARTING))) {
						runningJobs.add(rj);
					}
					if (scheduled && rj.getJobState().equals(JobStatus.CREATED)) {
						scheduledJobs.add(rj);
					}
				}

				SimpleDateFormat df = new SimpleDateFormat("dd.MM.yyyy HH:mm:ss");
				Comparator<JobStatusMessage> njec = new Comparator<JobStatusMessage>(){
					@Override
					public int compare(JobStatusMessage o1, JobStatusMessage o2) {
						return (int)(o1.getStartTime()-o2.getStartTime());
					}
				};

				if (running) {
					if(runningJobs.size() == 0) {
						System.out.println("No running jobs.");
					}
					else {
						Collections.sort(runningJobs, njec);

						System.out.println("------------------ Running/Restarting Jobs -------------------");
						for (JobStatusMessage rj : runningJobs) {
							System.out.println(df.format(new Date(rj.getStartTime()))
									+ " : " + rj.getJobId() + " : " + rj.getJobName() + " (" + rj.getJobState() + ")");
						}
						System.out.println("--------------------------------------------------------------");
					}
				}
				if (scheduled) {
					if (scheduledJobs.size() == 0) {
						System.out.println("No scheduled jobs.");
					}
					else {
						Collections.sort(scheduledJobs, njec);

						System.out.println("----------------------- Scheduled Jobs -----------------------");
						for(JobStatusMessage rj : scheduledJobs) {
							System.out.println(df.format(new Date(rj.getStartTime()))
									+ " : " + rj.getJobId() + " : " + rj.getJobName());
						}
						System.out.println("--------------------------------------------------------------");
					}
				}
				return 0;
			}
			else {
				throw new Exception("ReqeustRunningJobs requires a response of type " +
						"RunningJobs. Instead the response is of type " + result.getClass() + ".");
			}
		}
		catch (Throwable t) {
			return handleError(t);
		}
	}

	/**
	 * Executes the STOP action.
	 * 
	 * @param args Command line arguments for the stop action.
	 */
	protected int stop(String[] args) {
		LOG.info("Running 'stop' command.");

		StopOptions options;
		try {
			options = CliFrontendParser.parseStopCommand(args);
		}
		catch (CliArgsException e) {
			return handleArgException(e);
		}
		catch (Throwable t) {
			return handleError(t);
		}

		// evaluate help flag
		if (options.isPrintHelp()) {
			CliFrontendParser.printHelpForStop();
			return 0;
		}

		String[] stopArgs = options.getArgs();
		JobID jobId;

		if (stopArgs.length > 0) {
			String jobIdString = stopArgs[0];
			try {
				jobId = new JobID(StringUtils.hexStringToByte(jobIdString));
			}
			catch (Exception e) {
				return handleError(e);
			}
		}
		else {
			return handleArgException(new CliArgsException("Missing JobID"));
		}

		try {
			ActorGateway jobManager = getJobManagerGateway(options);
			Future<Object> response = jobManager.ask(new StopJob(jobId), clientTimeout);

			final Object rc = Await.result(response, clientTimeout);

			if (rc instanceof StoppingFailure) {
				throw new Exception("Stopping the job with ID " + jobId + " failed.",
						((StoppingFailure) rc).cause());
			}

			return 0;
		}
		catch (Throwable t) {
			return handleError(t);
		}
	}

	/**
	 * Executes the CANCEL action.
	 * 
	 * @param args Command line arguments for the cancel action.
	 */
	protected int cancel(String[] args) {
		LOG.info("Running 'cancel' command.");

		CancelOptions options;
		try {
			options = CliFrontendParser.parseCancelCommand(args);
		}
		catch (CliArgsException e) {
			return handleArgException(e);
		}
		catch (Throwable t) {
			return handleError(t);
		}

		// evaluate help flag
		if (options.isPrintHelp()) {
			CliFrontendParser.printHelpForCancel();
			return 0;
		}

		String[] cleanedArgs = options.getArgs();
		JobID jobId;

		if (cleanedArgs.length > 0) {
			String jobIdString = cleanedArgs[0];
			try {
				jobId = new JobID(StringUtils.hexStringToByte(jobIdString));
			}
			catch (Exception e) {
				LOG.error("Error: The value for the Job ID is not a valid ID.");
				System.out.println("Error: The value for the Job ID is not a valid ID.");
				return 1;
			}
		}
		else {
			LOG.error("Missing JobID in the command line arguments.");
			System.out.println("Error: Specify a Job ID to cancel a job.");
			return 1;
		}

		try {
			ActorGateway jobManager = getJobManagerGateway(options);
			Future<Object> response = jobManager.ask(new CancelJob(jobId), clientTimeout);

			final Object rc = Await.result(response, clientTimeout);

			if (rc instanceof CancellationFailure) {
				throw new Exception("Canceling the job with ID " + jobId + " failed.",
						((CancellationFailure) rc).cause());
			}

			return 0;
		}
		catch (Throwable t) {
			return handleError(t);
		}
	}

	/**
	 * Executes the SAVEPOINT action.
	 *
	 * @param args Command line arguments for the cancel action.
	 */
	protected int savepoint(String[] args) {
		LOG.info("Running 'savepoint' command.");

		SavepointOptions options;
		try {
			options = CliFrontendParser.parseSavepointCommand(args);
		}
		catch (CliArgsException e) {
			return handleArgException(e);
		}
		catch (Throwable t) {
			return handleError(t);
		}

		// evaluate help flag
		if (options.isPrintHelp()) {
			CliFrontendParser.printHelpForCancel();
			return 0;
		}

		if (options.isDispose()) {
			// Discard
			return disposeSavepoint(options, options.getDisposeSavepointPath());
		}
		else {
			// Trigger
			String[] cleanedArgs = options.getArgs();
			JobID jobId;

			if (cleanedArgs.length > 0) {
				String jobIdString = cleanedArgs[0];
				try {
					jobId = new JobID(StringUtils.hexStringToByte(jobIdString));
				}
				catch (Exception e) {
					return handleError(new IllegalArgumentException(
							"Error: The value for the Job ID is not a valid ID."));
				}
			}
			else {
				return handleError(new IllegalArgumentException(
						"Error: The value for the Job ID is not a valid ID. " +
								"Specify a Job ID to trigger a savepoint."));
			}

			return triggerSavepoint(options, jobId);
		}
	}

	/**
	 * Sends a {@link org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint}
	 * message to the job manager.
	 */
	private int triggerSavepoint(SavepointOptions options, JobID jobId) {
		try {
			ActorGateway jobManager = getJobManagerGateway(options);

			logAndSysout("Triggering savepoint for job " + jobId + ".");
			Future<Object> response = jobManager.ask(new TriggerSavepoint(jobId),
					new FiniteDuration(1, TimeUnit.HOURS));

			Object result;
			try {
				logAndSysout("Waiting for response...");
				result = Await.result(response, FiniteDuration.Inf());
			}
			catch (Exception e) {
				throw new Exception("Triggering a savepoint for the job " + jobId + " failed.", e);
			}

			if (result instanceof TriggerSavepointSuccess) {
				TriggerSavepointSuccess success = (TriggerSavepointSuccess) result;
				logAndSysout("Savepoint completed. Path: " + success.savepointPath());
				logAndSysout("You can resume your program from this savepoint with the run command.");

				return 0;
			}
			else if (result instanceof TriggerSavepointFailure) {
				TriggerSavepointFailure failure = (TriggerSavepointFailure) result;
				throw failure.cause();
			}
			else {
				throw new IllegalStateException("Unknown JobManager response of type " +
						result.getClass());
			}
		}
		catch (Throwable t) {
			return handleError(t);
		}
	}

	/**
	 * Sends a {@link org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint}
	 * message to the job manager.
	 */
	private int disposeSavepoint(SavepointOptions options, String savepointPath) {
		try {
			ActorGateway jobManager = getJobManagerGateway(options);
			logAndSysout("Disposing savepoint '" + savepointPath + "'.");
			Future<Object> response = jobManager.ask(new DisposeSavepoint(savepointPath), clientTimeout);

			Object result;
			try {
				logAndSysout("Waiting for response...");
				result = Await.result(response, clientTimeout);
			}
			catch (Exception e) {
				throw new Exception("Disposing the savepoint with path" + savepointPath + " failed.", e);
			}

			if (result.getClass() == JobManagerMessages.getDisposeSavepointSuccess().getClass()) {
				logAndSysout("Savepoint '" + savepointPath + "' disposed.");
				return 0;
			}
			else if (result instanceof DisposeSavepointFailure) {
				DisposeSavepointFailure failure = (DisposeSavepointFailure) result;
				throw failure.cause();
			}
			else {
				throw new IllegalStateException("Unknown JobManager response of type " +
						result.getClass());
			}
		}
		catch (Throwable t) {
			return handleError(t);
		}
	}

	// --------------------------------------------------------------------------------------------
	//  Interaction with programs and JobManager
	// --------------------------------------------------------------------------------------------

	protected int executeProgramDetached(PackagedProgram program, Client client, int parallelism) {
		LOG.info("Starting execution of program");

		JobSubmissionResult result;
		try {
			result = client.runDetached(program, parallelism);
		} catch (ProgramInvocationException e) {
			return handleError(e);
		} finally {
			program.deleteExtractedLibraries();
		}

		if (yarnCluster != null) {
			yarnCluster.stopAfterJob(result.getJobID());
			yarnCluster.disconnect();
		}
		
		System.out.println("Job has been submitted with JobID " + result.getJobID());

		return 0;
	}

	protected int executeProgramBlocking(PackagedProgram program, Client client, int parallelism) {
		LOG.info("Starting execution of program");

		JobSubmissionResult result;
		try {
			result = client.runBlocking(program, parallelism);
		}
		catch (ProgramInvocationException e) {
			return handleError(e);
		}
		finally {
			program.deleteExtractedLibraries();
		}

		LOG.info("Program execution finished");

		if (result instanceof JobExecutionResult) {
			JobExecutionResult execResult = (JobExecutionResult) result;
			System.out.println("Job with JobID " + execResult.getJobID() + " has finished.");
			System.out.println("Job Runtime: " + execResult.getNetRuntime() + " ms");
			Map<String, Object> accumulatorsResult = execResult.getAllAccumulatorResults();
			if (accumulatorsResult.size() > 0) {
				System.out.println("Accumulator Results: ");
				System.out.println(AccumulatorHelper.getResultsFormated(accumulatorsResult));
			}
		}

		return 0;
	}

	/**
	 * Creates a Packaged program from the given command line options.
	 *
	 * @return A PackagedProgram (upon success)
	 * @throws java.io.FileNotFoundException
	 * @throws org.apache.flink.client.program.ProgramInvocationException
	 */
	protected PackagedProgram buildProgram(ProgramOptions options)
			throws FileNotFoundException, ProgramInvocationException
	{
		String[] programArgs = options.getProgramArgs();
		String jarFilePath = options.getJarFilePath();
		List<URL> classpaths = options.getClasspaths();

		if (jarFilePath == null) {
			throw new IllegalArgumentException("The program JAR file was not specified.");
		}

		File jarFile = new File(jarFilePath);

		// Check if JAR file exists
		if (!jarFile.exists()) {
			throw new FileNotFoundException("JAR file does not exist: " + jarFile);
		}
		else if (!jarFile.isFile()) {
			throw new FileNotFoundException("JAR file is not a file: " + jarFile);
		}

		// Get assembler class
		String entryPointClass = options.getEntryPointClassName();

		PackagedProgram program = entryPointClass == null ?
				new PackagedProgram(jarFile, classpaths, programArgs) :
				new PackagedProgram(jarFile, classpaths, entryPointClass, programArgs);

		program.setSavepointPath(options.getSavepointPath());

		return program;
	}

	/**
	 * Writes the given job manager address to the associated configuration object
	 *
	 * @param address Address to write to the configuration
	 */
	protected void writeJobManagerAddressToConfig(InetSocketAddress address) {
		config.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, address.getHostName());
		config.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, address.getPort());
	}

	/**
	 * Updates the associated configuration with the given command line options
	 *
	 * @param options Command line options
	 */
	protected void updateConfig(CommandLineOptions options) {
		if(options.getJobManagerAddress() != null){
			InetSocketAddress jobManagerAddress = parseHostPortAddress(options.getJobManagerAddress());
			writeJobManagerAddressToConfig(jobManagerAddress);
		}
	}

	/**
	 * Retrieves the {@link ActorGateway} for the JobManager. The JobManager address is retrieved
	 * from the provided {@link CommandLineOptions}.
	 *
	 * @param options CommandLineOptions specifying the JobManager URL
	 * @return Gateway to the JobManager
	 * @throws Exception
	 */
	protected ActorGateway getJobManagerGateway(CommandLineOptions options) throws Exception {
		// overwrite config values with given command line options
		updateConfig(options);

		// start an actor system if needed
		if (this.actorSystem == null) {
			LOG.info("Starting actor system to communicate with JobManager");
			try {
				scala.Tuple2<String, Object> systemEndpoint = new scala.Tuple2<String, Object>("", 0);
				this.actorSystem = AkkaUtils.createActorSystem(
						config,
						new Some<scala.Tuple2<String, Object>>(systemEndpoint));
			}
			catch (Exception e) {
				throw new IOException("Could not start actor system to communicate with JobManager", e);
			}

			LOG.info("Actor system successfully started");
		}

		LOG.info("Trying to lookup the JobManager gateway");
		// Retrieve the ActorGateway from the LeaderRetrievalService
		LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(config);

		return LeaderRetrievalUtils.retrieveLeaderGateway(lrs, actorSystem, lookupTimeout);
	}

	/**
	 * Retrieves a {@link Client} object from the given command line options and other parameters.
	 *
	 * @param options Command line options which contain JobManager address
	 * @param programName Program name
	 * @param userParallelism Given user parallelism
	 * @throws Exception
	 */
	protected Client getClient(
			CommandLineOptions options,
			String programName,
			int userParallelism,
			boolean detachedMode)
		throws Exception {
		InetSocketAddress jobManagerAddress;
		int maxSlots = -1;

		if (YARN_DEPLOY_JOBMANAGER.equals(options.getJobManagerAddress())) {
			logAndSysout("YARN cluster mode detected. Switching Log4j output to console");

			// Default yarn application name to use, if nothing is specified on the command line
			String applicationName = "Flink Application: " + programName;

			// user wants to run Flink in YARN cluster.
			CommandLine commandLine = options.getCommandLine();
			AbstractFlinkYarnClient flinkYarnClient = CliFrontendParser
														.getFlinkYarnSessionCli()
														.withDefaultApplicationName(applicationName)
														.createFlinkYarnClient(commandLine);

			if (flinkYarnClient == null) {
				throw new RuntimeException("Unable to create Flink YARN Client. Check previous log messages");
			}

			// in case the main detached mode wasn't set, we don't wanna overwrite the one loaded
			// from yarn options.
			if (detachedMode) {
				flinkYarnClient.setDetachedMode(true);
			}

			// the number of slots available from YARN:
			int yarnTmSlots = flinkYarnClient.getTaskManagerSlots();
			if (yarnTmSlots == -1) {
				yarnTmSlots = 1;
			}
			maxSlots = yarnTmSlots * flinkYarnClient.getTaskManagerCount();
			if (userParallelism != -1) {
				int slotsPerTM = userParallelism / flinkYarnClient.getTaskManagerCount();
				logAndSysout("The YARN cluster has " + maxSlots + " slots available, " +
						"but the user requested a parallelism of " + userParallelism + " on YARN. " +
						"Each of the " + flinkYarnClient.getTaskManagerCount() + " TaskManagers " +
						"will get "+slotsPerTM+" slots.");
				flinkYarnClient.setTaskManagerSlots(slotsPerTM);
			}

			try {
				yarnCluster = flinkYarnClient.deploy();
				yarnCluster.connectToCluster();
			}
			catch (Exception e) {
				throw new RuntimeException("Error deploying the YARN cluster", e);
			}

			jobManagerAddress = yarnCluster.getJobManagerAddress();
			writeJobManagerAddressToConfig(jobManagerAddress);
			
			// overwrite the yarn client config (because the client parses the dynamic properties)
			this.config.addAll(flinkYarnClient.getFlinkConfiguration());

			logAndSysout("YARN cluster started");
			logAndSysout("JobManager web interface address " + yarnCluster.getWebInterfaceURL());
			logAndSysout("Waiting until all TaskManagers have connected");

			while(true) {
				FlinkYarnClusterStatus status = yarnCluster.getClusterStatus();
				if (status != null) {
					if (status.getNumberOfTaskManagers() < flinkYarnClient.getTaskManagerCount()) {
						logAndSysout("TaskManager status (" + status.getNumberOfTaskManagers() + "/" + flinkYarnClient.getTaskManagerCount() + ")");
					} else {
						logAndSysout("All TaskManagers are connected");
						break;
					}
				} else {
					logAndSysout("No status updates from the YARN cluster received so far. Waiting ...");
				}

				try {
					Thread.sleep(500);
				}
				catch (InterruptedException e) {
					LOG.error("Interrupted while waiting for TaskManagers");
					System.err.println("Thread is interrupted");
					Thread.currentThread().interrupt();
				}
			}
		}
		else {
			if(options.getJobManagerAddress() != null) {
				jobManagerAddress = parseHostPortAddress(options.getJobManagerAddress());
				writeJobManagerAddressToConfig(jobManagerAddress);
			}
		}

		return new Client(config, maxSlots);
	}

	// --------------------------------------------------------------------------------------------
	//  Logging and Exception Handling
	// --------------------------------------------------------------------------------------------

	/**
	 * Displays an exception message for incorrect command line arguments.
	 *
	 * @param e The exception to display.
	 * @return The return code for the process.
	 */
	private int handleArgException(Exception e) {
		LOG.error("Invalid command line arguments." + (e.getMessage() == null ? "" : e.getMessage()));

		System.out.println(e.getMessage());
		System.out.println();
		System.out.println("Use the help option (-h or --help) to get help on the command.");
		return 1;
	}

	/**
	 * Displays an exception message.
	 * 
	 * @param t The exception to display.
	 * @return The return code for the process.
	 */
	private int handleError(Throwable t) {
		LOG.error("Error while running the command.", t);

		System.err.println();
		System.err.println("------------------------------------------------------------");
		System.err.println(" The program finished with the following exception:");
		System.err.println();

		if (t.getCause() instanceof InvalidProgramException) {
			System.err.println(t.getCause().getMessage());
			StackTraceElement[] trace = t.getCause().getStackTrace();
			for (StackTraceElement ele: trace) {
				System.err.println("\t" + ele.toString());
				if (ele.getMethodName().equals("main")) {
					break;
				}
			}
		} else {
			t.printStackTrace();
		}
		return 1;
	}

	private void logAndSysout(String message) {
		LOG.info(message);
		System.out.println(message);
	}

	// --------------------------------------------------------------------------------------------
	//  Entry point for executable
	// --------------------------------------------------------------------------------------------

	/**
	 * Parses the command line arguments and starts the requested action.
	 * 
	 * @param args command line arguments of the client.
	 * @return The return code of the program
	 */
	public int parseParameters(String[] args) {

		// check for action
		if (args.length < 1) {
			CliFrontendParser.printHelp();
			System.out.println("Please specify an action.");
			return 1;
		}

		// get action
		String action = args[0];

		// remove action from parameters
		final String[] params = Arrays.copyOfRange(args, 1, args.length);

		// do action
		switch (action) {
			case ACTION_RUN:
				// run() needs to run in a secured environment for the optimizer.
				if (SecurityUtils.isSecurityEnabled()) {
					String message = "Secure Hadoop environment setup detected. Running in secure context.";
					LOG.info(message);

					try {
						return SecurityUtils.runSecured(new SecurityUtils.FlinkSecuredRunner<Integer>() {
							@Override
							public Integer run() throws Exception {
								return CliFrontend.this.run(params);
							}
						});
					}
					catch (Exception e) {
						return handleError(e);
					}
				} else {
					return run(params);
				}
			case ACTION_LIST:
				return list(params);
			case ACTION_INFO:
				return info(params);
			case ACTION_CANCEL:
				return cancel(params);
			case ACTION_STOP:
				return stop(params);
			case ACTION_SAVEPOINT:
				return savepoint(params);
			case "-h":
			case "--help":
				CliFrontendParser.printHelp();
				return 0;
			case "-v":
			case "--version":
				String version = EnvironmentInformation.getVersion();
				String commitID = EnvironmentInformation.getRevisionInformation().commitId;
				System.out.print("Version: " + version);
				System.out.println(!commitID.equals(EnvironmentInformation.UNKNOWN) ? ", Commit ID: " + commitID : "");
				return 0;
			default:
				System.out.printf("\"%s\" is not a valid action.\n", action);
				System.out.println();
				System.out.println("Valid actions are \"run\", \"list\", \"info\", \"stop\", or \"cancel\".");
				System.out.println();
				System.out.println("Specify the version option (-v or --version) to print Flink version.");
				System.out.println();
				System.out.println("Specify the help option (-h or --help) to get help on the command.");
				return 1;
		}
	}

	public void shutdown() {
		ActorSystem sys = this.actorSystem;
		if (sys != null) {
			this.actorSystem = null;
			sys.shutdown();
		}
	}

	/**
	 * Submits the job based on the arguments
	 */
	public static void main(String[] args) {
		EnvironmentInformation.logEnvironmentInfo(LOG, "Command Line Client", args);

		try {
			CliFrontend cli = new CliFrontend();
			int retCode = cli.parseParameters(args);
			System.exit(retCode);
		}
		catch (Throwable t) {
			LOG.error("Fatal error while running command line interface.", t);
			t.printStackTrace();
			System.exit(31);
		}
	}

	// --------------------------------------------------------------------------------------------
	//  Miscellaneous Utilities
	// --------------------------------------------------------------------------------------------

	/**
	 * Parses a given host port address of the format URL:PORT and returns an {@link InetSocketAddress}
	 *
	 * @param hostAndPort host port string to be parsed
	 * @return InetSocketAddress object containing the parsed host port information
	 */
	private static InetSocketAddress parseHostPortAddress(String hostAndPort) {
		// code taken from http://stackoverflow.com/questions/2345063/java-common-way-to-validate-and-convert-hostport-to-inetsocketaddress
		URI uri;
		try {
			uri = new URI("my://" + hostAndPort);
		} catch (URISyntaxException e) {
			throw new RuntimeException("Malformed address " + hostAndPort, e);
		}
		String host = uri.getHost();
		int port = uri.getPort();
		if (host == null || port == -1) {
			throw new RuntimeException("Address is missing hostname or port " + hostAndPort);
		}
		return new InetSocketAddress(host, port);
	}

	public static String getConfigurationDirectoryFromEnv() {
		String location = System.getenv(ENV_CONFIG_DIRECTORY);

		if (location != null) {
			if (new File(location).exists()) {
				return location;
			}
			else {
				throw new RuntimeException("The config directory '" + location + "', specified in the '" +
						ENV_CONFIG_DIRECTORY + "' environment variable, does not exist.");
			}
		}
		else if (new File(CONFIG_DIRECTORY_FALLBACK_1).exists()) {
			location = CONFIG_DIRECTORY_FALLBACK_1;
		}
		else if (new File(CONFIG_DIRECTORY_FALLBACK_2).exists()) {
			location = CONFIG_DIRECTORY_FALLBACK_2;
		}
		else {
			throw new RuntimeException("The configuration directory was not specified. " +
					"Please specify the directory containing the configuration file through the '" +
					ENV_CONFIG_DIRECTORY + "' environment variable.");
		}
		return location;
	}

	public static Map<String, String> getDynamicProperties(String dynamicPropertiesEncoded) {
		if (dynamicPropertiesEncoded != null && dynamicPropertiesEncoded.length() > 0) {
			Map<String, String> properties = new HashMap<>();
			
			String[] propertyLines = dynamicPropertiesEncoded.split(CliFrontend.YARN_DYNAMIC_PROPERTIES_SEPARATOR);
			for (String propLine : propertyLines) {
				if (propLine == null) {
					continue;
				}
				
				String[] kv = propLine.split("=");
				if (kv.length >= 2 && kv[0] != null && kv[1] != null && kv[0].length() > 0) {
					properties.put(kv[0], kv[1]);
				}
			}
			return properties;
		}
		else {
			return Collections.emptyMap();
		}
	}
}



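CliFrontend only drives the process; the actual compilation and submission live in org.apache.flink.client.program.Client, reproduced next. As a bridge, here is a minimal sketch of how an OptimizedPlan plus the user JARs become the JobGraph that gets shipped to the JM. It mirrors what Client.getJobGraph does for batch plans (for streaming programs the StreamingPlan produces the JobGraph directly); the class and variable names here are placeholders:

import java.net.URL;
import java.util.List;

import org.apache.flink.core.fs.Path;
import org.apache.flink.optimizer.plan.OptimizedPlan;
import org.apache.flink.optimizer.plantranslate.JobGraphGenerator;
import org.apache.flink.runtime.jobgraph.JobGraph;

public class JobGraphSketch {
	public static JobGraph toJobGraph(OptimizedPlan plan, List<URL> jarFiles) throws Exception {
		// Translate the optimized batch plan into the runtime JobGraph.
		JobGraph jobGraph = new JobGraphGenerator().compileJobGraph(plan);

		// Attach the user JARs so the JobManager can distribute them to the TaskManagers.
		for (URL jar : jarFiles) {
			jobGraph.addJar(new Path(jar.toURI()));
		}
		return jobGraph;
	}
}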

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.client.program;

import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import com.google.common.base.Preconditions;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.api.common.accumulators.AccumulatorHelper;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.Plan;
import org.apache.flink.optimizer.CompilerException;
import org.apache.flink.optimizer.DataStatistics;
import org.apache.flink.optimizer.Optimizer;
import org.apache.flink.optimizer.costs.DefaultCostEstimator;
import org.apache.flink.optimizer.plan.FlinkPlan;
import org.apache.flink.optimizer.plan.OptimizedPlan;
import org.apache.flink.optimizer.plan.StreamingPlan;
import org.apache.flink.optimizer.plandump.PlanJSONDumpGenerator;
import org.apache.flink.optimizer.plantranslate.JobGraphGenerator;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.client.JobClient;
import org.apache.flink.runtime.client.JobExecutionException;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.messages.accumulators.AccumulatorResultsErroneous;
import org.apache.flink.runtime.messages.accumulators.AccumulatorResultsFound;
import org.apache.flink.runtime.messages.accumulators.RequestAccumulatorResults;
import org.apache.flink.runtime.messages.JobManagerMessages;
import org.apache.flink.runtime.util.LeaderRetrievalUtils;
import org.apache.flink.util.SerializedValue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;
import akka.actor.ActorSystem;

/**
 * Encapsulates the functionality necessary to submit a program to a remote cluster.
 */
public class Client {

	private static final Logger LOG = LoggerFactory.getLogger(Client.class);

	/** The optimizer used in the optimization of batch programs */
	final Optimizer compiler;

	/** The actor system used to communicate with the JobManager */
	private final ActorSystem actorSystem;

	/** Configuration of the client */
	private final Configuration config;

	/** Timeout for futures */
	private final FiniteDuration timeout;

	/** Lookup timeout for the job manager retrieval service */
	private final FiniteDuration lookupTimeout;

	/**
	 * If != -1, this field specifies the total number of available slots on the cluster
	 * connected to the client.
	 */
	private final int maxSlots;

	/** Flag indicating whether to sysout print execution updates */
	private boolean printStatusDuringExecution = true;

	/**
	 * For interactive invocations, the Job ID is only available after the ContextEnvironment has
	 * been run inside the user JAR. We pass the Client to every instance of the ContextEnvironment
	 * which lets us access the last JobID here.
	 */
	private JobID lastJobID;

	// ------------------------------------------------------------------------
	//                            Construction
	// ------------------------------------------------------------------------

	/**
	 * Creates a instance that submits the programs to the JobManager defined in the
	 * configuration. This method will try to resolve the JobManager hostname and throw an exception
	 * if that is not possible.
	 *
	 * @param config The config used to obtain the job-manager's address, and used to configure the optimizer.
	 *
	 * @throws java.io.IOException Thrown, if the client's actor system could not be started.
	 * @throws java.net.UnknownHostException Thrown, if the JobManager's hostname could not be resolved.
	 */
	public Client(Configuration config) throws IOException {
		this(config, -1);
	}

	/**
	 * Creates a new instance of the class that submits the jobs to a job-manager
	 * at the given address using the default port.
	 *
	 * @param config The configuration for the client-side processes, like the optimizer.
	 * @param maxSlots The maximum number of slots available on the cluster, or -1 if unknown.
	 *
	 * @throws java.io.IOException Thrown, if the client's actor system could not be started.
	 * @throws java.net.UnknownHostException Thrown, if the JobManager's hostname could not be resolved.
	 */
	public Client(Configuration config, int maxSlots) throws IOException {
		this.config = Preconditions.checkNotNull(config);
		this.compiler = new Optimizer(new DataStatistics(), new DefaultCostEstimator(), config);
		this.maxSlots = maxSlots;

		LOG.info("Starting client actor system");

		try {
			this.actorSystem = JobClient.startJobClientActorSystem(config);
		} catch (Exception e) {
			throw new IOException("Could start client actor system.", e);
		}

		timeout = AkkaUtils.getClientTimeout(config);
		lookupTimeout = AkkaUtils.getLookupTimeout(config);
	}

	// ------------------------------------------------------------------------
	//  Startup & Shutdown
	// ------------------------------------------------------------------------

	/**
	 * Shuts down the client. This stops the internal actor system and actors.
	 */
	public void shutdown() {
		if (!this.actorSystem.isTerminated()) {
			this.actorSystem.shutdown();
			this.actorSystem.awaitTermination();
		}
	}

	// ------------------------------------------------------------------------
	//  Configuration
	// ------------------------------------------------------------------------

	/**
	 * Configures whether the client should print progress updates during the execution to {@code System.out}.
	 * All updates are logged via the SLF4J loggers regardless of this setting.
	 *
	 * @param print True to print updates to standard out during execution, false to not print them.
	 */
	public void setPrintStatusDuringExecution(boolean print) {
		this.printStatusDuringExecution = print;
	}

	/**
	 * @return whether the client will print progress updates during the execution to {@code System.out}
	 */
	public boolean getPrintStatusDuringExecution() {
		return this.printStatusDuringExecution;
	}

	/**
	 * @return -1 if unknown. The maximum number of available processing slots at the Flink cluster
	 * connected to this client.
	 */
	public int getMaxSlots() {
		return this.maxSlots;
	}

	// ------------------------------------------------------------------------
	//  Access to the Program's Plan
	// ------------------------------------------------------------------------

	public static String getOptimizedPlanAsJson(Optimizer compiler, PackagedProgram prog, int parallelism)
			throws CompilerException, ProgramInvocationException
	{
		PlanJSONDumpGenerator jsonGen = new PlanJSONDumpGenerator();
		return jsonGen.getOptimizerPlanAsJSON((OptimizedPlan) getOptimizedPlan(compiler, prog, parallelism));
	}

	public static FlinkPlan getOptimizedPlan(Optimizer compiler, PackagedProgram prog, int parallelism)
			throws CompilerException, ProgramInvocationException
	{
		Thread.currentThread().setContextClassLoader(prog.getUserCodeClassLoader());
		if (prog.isUsingProgramEntryPoint()) {
			return getOptimizedPlan(compiler, prog.getPlanWithJars(), parallelism);
		} else if (prog.isUsingInteractiveMode()) {
			// temporary hack to support the optimizer plan preview
			OptimizerPlanEnvironment env = new OptimizerPlanEnvironment(compiler);
			if (parallelism > 0) {
				env.setParallelism(parallelism);
			}

			return env.getOptimizedPlan(prog);
		} else {
			throw new RuntimeException("Couldn't determine program mode.");
		}
	}

	public static OptimizedPlan getOptimizedPlan(Optimizer compiler, Plan p, int parallelism) throws CompilerException {
		if (parallelism > 0 && p.getDefaultParallelism() <= 0) {
			LOG.debug("Changing plan default parallelism from {} to {}", p.getDefaultParallelism(), parallelism);
			p.setDefaultParallelism(parallelism);
		}
		LOG.debug("Set parallelism {}, plan default parallelism {}", parallelism, p.getDefaultParallelism());

		return compiler.compile(p);
	}

	// ------------------------------------------------------------------------
	//  Program submission / execution
	// ------------------------------------------------------------------------

	public JobSubmissionResult runBlocking(PackagedProgram prog, int parallelism) throws ProgramInvocationException {
		Thread.currentThread().setContextClassLoader(prog.getUserCodeClassLoader());
		if (prog.isUsingProgramEntryPoint()) {
			return runBlocking(prog.getPlanWithJars(), parallelism, prog.getSavepointPath());
		}
		else if (prog.isUsingInteractiveMode()) {
			LOG.info("Starting program in interactive mode");
			ContextEnvironment.setAsContext(new ContextEnvironmentFactory(this, prog.getAllLibraries(),
					prog.getClasspaths(), prog.getUserCodeClassLoader(), parallelism, true,
					prog.getSavepointPath()));

			// invoke here
			try {
				prog.invokeInteractiveModeForExecution();
			}
			finally {
				ContextEnvironment.unsetContext();
			}

			return new JobSubmissionResult(lastJobID);
		}
		else {
			throw new RuntimeException();
		}
	}

	public JobSubmissionResult runDetached(PackagedProgram prog, int parallelism)
			throws ProgramInvocationException
	{
		Thread.currentThread().setContextClassLoader(prog.getUserCodeClassLoader());
		if (prog.isUsingProgramEntryPoint()) {
			return runDetached(prog.getPlanWithJars(), parallelism, prog.getSavepointPath());
		}
		else if (prog.isUsingInteractiveMode()) {
			LOG.info("Starting program in interactive mode");
			ContextEnvironmentFactory factory = new ContextEnvironmentFactory(this, prog.getAllLibraries(),
					prog.getClasspaths(), prog.getUserCodeClassLoader(), parallelism, false,
					prog.getSavepointPath());
			ContextEnvironment.setAsContext(factory);

			// invoke here
			try {
				prog.invokeInteractiveModeForExecution();
				return ((DetachedEnvironment) factory.getLastEnvCreated()).finalizeExecute();
			}
			finally {
				ContextEnvironment.unsetContext();
			}
		}
		else {
			throw new RuntimeException("PackagedProgram does not have a valid invocation mode.");
		}
	}

	public JobExecutionResult runBlocking(JobWithJars program, int parallelism) throws ProgramInvocationException {
		return runBlocking(program, parallelism, null);
	}

	/**
	 * Runs a program on the Flink cluster to which this client is connected. The call blocks until the
	 * execution is complete, and returns afterwards.
	 *
	 * @param program The program to be executed.
	 * @param parallelism The default parallelism to use when running the program. The default parallelism is used
	 *                    when the program does not set a parallelism by itself.
	 *
	 * @throws CompilerException Thrown, if the compiler encounters an illegal situation.
	 * @throws ProgramInvocationException Thrown, if the program could not be instantiated from its jar file,
	 *                                    or if the submission failed. That might be either due to an I/O problem,
	 *                                    i.e. the job-manager is unreachable, or due to the fact that the
	 *                                    parallel execution failed.
	 */
	public JobExecutionResult runBlocking(JobWithJars program, int parallelism, String savepointPath)
			throws CompilerException, ProgramInvocationException {
		ClassLoader classLoader = program.getUserCodeClassLoader();
		if (classLoader == null) {
			throw new IllegalArgumentException("The given JobWithJars does not provide a usercode class loader.");
		}

		OptimizedPlan optPlan = getOptimizedPlan(compiler, program, parallelism);
		return runBlocking(optPlan, program.getJarFiles(), program.getClasspaths(), classLoader, savepointPath);
	}

	public JobSubmissionResult runDetached(JobWithJars program, int parallelism) throws ProgramInvocationException {
		return runDetached(program, parallelism, null);
	}

	/**
	 * Submits a program to the Flink cluster to which this client is connected. The call returns after the
	 * program was submitted and does not wait for the program to complete.
	 *
	 * @param program The program to be executed.
	 * @param parallelism The default parallelism to use when running the program. The default parallelism is used
	 *                    when the program does not set a parallelism by itself.
	 *
	 * @throws CompilerException Thrown, if the compiler encounters an illegal situation.
	 * @throws ProgramInvocationException Thrown, if the program could not be instantiated from its jar file,
	 *                                    or if the submission failed. That might be either due to an I/O problem,
	 *                                    i.e. the job-manager is unreachable.
	 */
	public JobSubmissionResult runDetached(JobWithJars program, int parallelism, String savepointPath)
			throws CompilerException, ProgramInvocationException {
		ClassLoader classLoader = program.getUserCodeClassLoader();
		if (classLoader == null) {
			throw new IllegalArgumentException("The given JobWithJars does not provide a usercode class loader.");
		}

		OptimizedPlan optimizedPlan = getOptimizedPlan(compiler, program, parallelism);
		return runDetached(optimizedPlan, program.getJarFiles(), program.getClasspaths(), classLoader, savepointPath);
	}

	public JobExecutionResult runBlocking(
			FlinkPlan compiledPlan, List<URL> libraries, List<URL> classpaths, ClassLoader classLoader) throws ProgramInvocationException {
		return runBlocking(compiledPlan, libraries, classpaths, classLoader, null);
	}

	public JobExecutionResult runBlocking(FlinkPlan compiledPlan, List<URL> libraries, List<URL> classpaths,
			ClassLoader classLoader, String savepointPath) throws ProgramInvocationException
	{
		JobGraph job = getJobGraph(compiledPlan, libraries, classpaths, savepointPath);
		return runBlocking(job, classLoader);
	}

	public JobSubmissionResult runDetached(FlinkPlan compiledPlan, List<URL> libraries, List<URL> classpaths, ClassLoader classLoader) throws ProgramInvocationException {
		return runDetached(compiledPlan, libraries, classpaths, classLoader, null);
	}

	public JobSubmissionResult runDetached(FlinkPlan compiledPlan, List<URL> libraries, List<URL> classpaths,
			ClassLoader classLoader, String savepointPath) throws ProgramInvocationException
	{
		JobGraph job = getJobGraph(compiledPlan, libraries, classpaths, savepointPath);
		return runDetached(job, classLoader);
	}

	public JobExecutionResult runBlocking(JobGraph jobGraph, ClassLoader classLoader) throws ProgramInvocationException {
		LeaderRetrievalService leaderRetrievalService;
		try {
			leaderRetrievalService = LeaderRetrievalUtils.createLeaderRetrievalService(config);
		} catch (Exception e) {
			throw new ProgramInvocationException("Could not create the leader retrieval service.", e);
		}

		try {
			this.lastJobID = jobGraph.getJobID();
			return JobClient.submitJobAndWait(actorSystem, leaderRetrievalService, jobGraph, timeout, printStatusDuringExecution, classLoader);
		} catch (JobExecutionException e) {
			throw new ProgramInvocationException("The program execution failed: " + e.getMessage(), e);
		}
	}

	public JobSubmissionResult runDetached(JobGraph jobGraph, ClassLoader classLoader) throws ProgramInvocationException {
		ActorGateway jobManagerGateway;

		try {
			jobManagerGateway = getJobManagerGateway();
		} catch (Exception e) {
			throw new ProgramInvocationException("Failed to retrieve the JobManager gateway.", e);
		}

		LOG.info("Checking and uploading JAR files");
		try {
			JobClient.uploadJarFiles(jobGraph, jobManagerGateway, timeout);
		}
		catch (IOException e) {
			throw new ProgramInvocationException("Could not upload the program's JAR files to the JobManager.", e);
		}
		try {
			this.lastJobID = jobGraph.getJobID();
			JobClient.submitJobDetached(jobManagerGateway, jobGraph, timeout, classLoader);
			return new JobSubmissionResult(jobGraph.getJobID());
		} catch (JobExecutionException e) {
				throw new ProgramInvocationException("The program execution failed: " + e.getMessage(), e);
		}
	}

	/**
	 * Cancels a job identified by the job id.
	 * @param jobId the job id
	 * @throws Exception In case an error occurred.
	 */
	public void cancel(JobID jobId) throws Exception {
		final ActorGateway jobManagerGateway = getJobManagerGateway();

		final Future<Object> response;
		try {
			response = jobManagerGateway.ask(new JobManagerMessages.CancelJob(jobId), timeout);
		} catch (final Exception e) {
			throw new ProgramInvocationException("Failed to query the job manager gateway.", e);
		}

		final Object result = Await.result(response, timeout);

		if (result instanceof JobManagerMessages.CancellationSuccess) {
			LOG.info("Job cancellation with ID " + jobId + " succeeded.");
		} else if (result instanceof JobManagerMessages.CancellationFailure) {
			final Throwable t = ((JobManagerMessages.CancellationFailure) result).cause();
			LOG.info("Job cancellation with ID " + jobId + " failed.", t);
			throw new Exception("Failed to cancel the job because of \n" + t.getMessage());
		} else {
			throw new Exception("Unknown message received while cancelling: " + result.getClass().getName());
		}
	}

	/**
	 * Stops a program on the Flink cluster whose job-manager is configured in this client's configuration.
	 * Stopping works only for streaming programs. Be aware that the program might continue to run for
	 * a while after the stop command has been sent, because once the sources stop emitting data, all
	 * operators still need to finish processing the remaining records.
	 * 
	 * @param jobId
	 *            the job ID of the streaming program to stop
	 * @throws Exception
	 *             If the job ID is invalid (ie, is unknown or refers to a batch job) or if sending the stop signal
	 *             failed. That might be due to an I/O problem, ie, the job-manager is unreachable.
	 */
	public void stop(final JobID jobId) throws Exception {
		final ActorGateway jobManagerGateway = getJobManagerGateway();

		final Future<Object> response;
		try {
			response = jobManagerGateway.ask(new JobManagerMessages.StopJob(jobId), timeout);
		} catch (final Exception e) {
			throw new ProgramInvocationException("Failed to query the job manager gateway.", e);
		}

		final Object result = Await.result(response, timeout);

		if (result instanceof JobManagerMessages.StoppingSuccess) {
			LOG.info("Job stopping with ID " + jobId + " succeeded.");
		} else if (result instanceof JobManagerMessages.StoppingFailure) {
			final Throwable t = ((JobManagerMessages.StoppingFailure) result).cause();
			LOG.info("Job stopping with ID " + jobId + " failed.", t);
			throw new Exception("Failed to stop the job because of \n" + t.getMessage());
		} else {
			throw new Exception("Unknown message received while stopping: " + result.getClass().getName());
		}
	}

	/**
	 * Requests and returns the accumulators for the given job identifier. Accumulators can be
	 * requested while a job is running or after it has finished. The default class loader is used
	 * to deserialize the incoming accumulator results.
	 * @param jobID The job identifier of a job.
	 * @return A Map containing the accumulator's name and its value.
	 */
	public Map<String, Object> getAccumulators(JobID jobID) throws Exception {
		return getAccumulators(jobID, ClassLoader.getSystemClassLoader());
	}

	/**
	 * Requests and returns the accumulators for the given job identifier. Accumulators can be
	 * requested while a job is running or after it has finished.
	 * @param jobID The job identifier of a job.
	 * @param loader The class loader for deserializing the accumulator results.
	 * @return A Map containing the accumulator's name and its value.
	 */
	public Map<String, Object> getAccumulators(JobID jobID, ClassLoader loader) throws Exception {
		ActorGateway jobManagerGateway = getJobManagerGateway();

		Future<Object> response;
		try {
			response = jobManagerGateway.ask(new RequestAccumulatorResults(jobID), timeout);
		} catch (Exception e) {
			throw new Exception("Failed to query the job manager gateway for accumulators.", e);
		}

		Object result = Await.result(response, timeout);

		if (result instanceof AccumulatorResultsFound) {
			Map<String, SerializedValue<Object>> serializedAccumulators =
					((AccumulatorResultsFound) result).result();

			return AccumulatorHelper.deserializeAccumulators(serializedAccumulators, loader);

		} else if (result instanceof AccumulatorResultsErroneous) {
			throw ((AccumulatorResultsErroneous) result).cause();
		} else {
			throw new Exception("Failed to fetch accumulators for the job " + jobID + ".");
		}
	}


	// ------------------------------------------------------------------------
	//  Sessions
	// ------------------------------------------------------------------------

	/**
	 * Tells the JobManager to finish the session (job) defined by the given ID.
	 * 
	 * @param jobId The ID that identifies the session.
	 */
	public void endSession(JobID jobId) throws Exception {
		if (jobId == null) {
			throw new IllegalArgumentException("The JobID must not be null.");
		}
		endSessions(Collections.singletonList(jobId));
	}

	/**
	 * Tells the JobManager to finish the sessions (jobs) defined by the given IDs.
	 *
	 * @param jobIds The IDs that identify the sessions.
	 */
	public void endSessions(List<JobID> jobIds) throws Exception {
		if (jobIds == null) {
			throw new IllegalArgumentException("The JobIDs must not be null");
		}

		ActorGateway jobManagerGateway = getJobManagerGateway();
		
		for (JobID jid : jobIds) {
			if (jid != null) {
				LOG.info("Telling job manager to end the session {}.", jid);
				jobManagerGateway.tell(new JobManagerMessages.RemoveCachedJob(jid));
			}
		}
	}

	// ------------------------------------------------------------------------
	//  Internal translation methods
	// ------------------------------------------------------------------------

	/**
	 * Creates the optimized plan for a given program, using this client's compiler.
	 *
	 * @param prog The program to be compiled.
	 * @return The compiled and optimized plan, as returned by the compiler.
	 * @throws CompilerException Thrown, if the compiler encounters an illegal situation.
	 * @throws ProgramInvocationException Thrown, if the program could not be instantiated from its jar file.
	 */
	private static OptimizedPlan getOptimizedPlan(Optimizer compiler, JobWithJars prog, int parallelism)
			throws CompilerException, ProgramInvocationException {
		return getOptimizedPlan(compiler, prog.getPlan(), parallelism);
	}

	public JobGraph getJobGraph(PackagedProgram prog, FlinkPlan optPlan) throws ProgramInvocationException {
		return getJobGraph(optPlan, prog.getAllLibraries(), prog.getClasspaths(), null);
	}

	public JobGraph getJobGraph(PackagedProgram prog, FlinkPlan optPlan, String savepointPath) throws ProgramInvocationException {
		return getJobGraph(optPlan, prog.getAllLibraries(), prog.getClasspaths(), savepointPath);
	}

	private JobGraph getJobGraph(FlinkPlan optPlan, List<URL> jarFiles, List<URL> classpaths, String savepointPath) {
		JobGraph job;
		if (optPlan instanceof StreamingPlan) {
			job = ((StreamingPlan) optPlan).getJobGraph();
			job.setSavepointPath(savepointPath);
		} else {
			JobGraphGenerator gen = new JobGraphGenerator(this.config);
			job = gen.compileJobGraph((OptimizedPlan) optPlan);
		}

		for (URL jar : jarFiles) {
			try {
				job.addJar(new Path(jar.toURI()));
			} catch (URISyntaxException e) {
				throw new RuntimeException("URL is invalid. This should not happen.", e);
			}
		}
 
		job.setClasspaths(classpaths);

		return job;
	}

	// ------------------------------------------------------------------------
	//  Helper methods
	// ------------------------------------------------------------------------

	/**
	 * Returns the {@link ActorGateway} of the current job manager leader using
	 * the {@link LeaderRetrievalService}.
	 *
	 * @return ActorGateway of the current job manager leader
	 * @throws Exception
	 */
	private ActorGateway getJobManagerGateway() throws Exception {
		LOG.info("Looking up JobManager");
		LeaderRetrievalService leaderRetrievalService;

		leaderRetrievalService = LeaderRetrievalUtils.createLeaderRetrievalService(config);

		return LeaderRetrievalUtils.retrieveLeaderGateway(
			leaderRetrievalService,
			actorSystem,
			lookupTimeout);
	}

}
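
With the Client above in place, a driver program can submit a packaged jar in blocking mode roughly as follows. This is a minimal sketch against the 1.0.x-era API shown in this note, not Flink's own example: "jm-host", the jar path and the parallelism of 4 are placeholders, and the single-argument Client(Configuration) constructor plus Client.shutdown() are assumed to exist in this version.

import java.io.File;

import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.client.program.Client;
import org.apache.flink.client.program.PackagedProgram;
import org.apache.flink.configuration.Configuration;

public class SubmitBlockingExample {

	public static void main(String[] args) throws Exception {
		// point the client at the JobManager (placeholder host/port)
		Configuration config = new Configuration();
		config.setString("jobmanager.rpc.address", "jm-host");
		config.setInteger("jobmanager.rpc.port", 6123);

		// wrap the user jar; the manifest's program-class / Main-Class decides the entry point
		PackagedProgram program = new PackagedProgram(new File("/path/to/WordCount.jar"));

		Client client = new Client(config);
		try {
			// blocks until the job finishes; internally: getOptimizedPlan -> getJobGraph -> JobClient.submitJobAndWait
			JobSubmissionResult result = client.runBlocking(program, 4);
			System.out.println("Job finished: " + result.getJobID());
		}
		finally {
			client.shutdown();
			program.deleteExtractedLibraries();
		}
	}
}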



444

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.flink.client.program;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
import java.util.Random;
import java.util.jar.Attributes;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;
import java.util.jar.Manifest;

import org.apache.flink.api.common.Plan;
import org.apache.flink.api.common.Program;
import org.apache.flink.api.common.ProgramDescription;
import org.apache.flink.optimizer.Optimizer;
import org.apache.flink.optimizer.dag.DataSinkNode;
import org.apache.flink.optimizer.plandump.PlanJSONDumpGenerator;
import org.apache.flink.util.InstantiationUtil;

/**
 * This class encapsulates a program packaged in a jar file. It supplies
 * functionality to extract nested libraries, search for the program entry point, and extract
 * a program plan.
 */
public class PackagedProgram {

	/**
	 * Property name of the entry in JAR manifest file that describes the Flink specific entry point.
	 */
	public static final String MANIFEST_ATTRIBUTE_ASSEMBLER_CLASS = "program-class";
	
	/**
	 * Property name of the entry in JAR manifest file that describes the class with the main method.
	 */
	public static final String MANIFEST_ATTRIBUTE_MAIN_CLASS = "Main-Class";

	// --------------------------------------------------------------------------------------------

	private final URL jarFile;

	private final String[] args;
	
	private final Program program;
	
	private final Class<?> mainClass;
	
	private final List<File> extractedTempLibraries;

	private final List<URL> classpaths;
	
	private ClassLoader userCodeClassLoader;
	
	private Plan plan;

	private String savepointPath;

	/**
	 * Creates an instance that wraps the plan defined in the jar file using the given
	 * argument.
	 *
	 * @param jarFile
	 *        The jar file which contains the plan and a Manifest which defines
	 *        the program-class
	 * @param args
	 *        Optional. The arguments used to create the pact plan, depend on
	 *        implementation of the pact plan. See getDescription().
	 * @throws ProgramInvocationException
	 *         Thrown, if the Program can't be properly loaded. Causes
	 *         may be a missing / wrong class or manifest files.
	 */
	public PackagedProgram(File jarFile, String... args) throws ProgramInvocationException {
		this(jarFile, Collections.<URL>emptyList(), null, args);
	}

	/**
	 * Creates an instance that wraps the plan defined in the jar file using the given
	 * argument.
	 * 
	 * @param jarFile
	 *        The jar file which contains the plan and a Manifest which defines
	 *        the program-class
	 * @param classpaths
	 *        Additional classpath URLs needed by the Program.
	 * @param args
	 *        Optional. The arguments used to create the pact plan, depend on
	 *        implementation of the pact plan. See getDescription().
	 * @throws ProgramInvocationException
	 *         Thrown, if the Program can't be properly loaded. Causes
	 *         may be a missing / wrong class or manifest files.
	 */
	public PackagedProgram(File jarFile, List<URL> classpaths, String... args) throws ProgramInvocationException {
		this(jarFile, classpaths, null, args);
	}

	/**
	 * Creates an instance that wraps the plan defined in the jar file using the given
	 * arguments. For generating the plan the class defined in the className parameter
	 * is used.
	 * 
	 * @param jarFile
	 *        The jar file which contains the plan.
	 * @param entryPointClassName
	 *        Name of the class which generates the plan. Overrides the class defined
	 *        in the jar file manifest
	 * @param args
	 *        Optional. The arguments used to create the pact plan, depend on
	 *        implementation of the pact plan. See getDescription().
	 * @throws ProgramInvocationException
	 *         Thrown, if the Program can't be properly loaded. Causes
	 *         may be a missing / wrong class or manifest files.
	 */
	public PackagedProgram(File jarFile, String entryPointClassName, String... args) throws ProgramInvocationException {
		this(jarFile, Collections.<URL>emptyList(), entryPointClassName, args);
	}

	/**
	 * Creates an instance that wraps the plan defined in the jar file using the given
	 * arguments. For generating the plan the class defined in the className parameter
	 * is used.
	 * 
	 * @param jarFile
	 *        The jar file which contains the plan.
	 * @param classpaths
	 *        Additional classpath URLs needed by the Program.
	 * @param entryPointClassName
	 *        Name of the class which generates the plan. Overrides the class defined
	 *        in the jar file manifest
	 * @param args
	 *        Optional. The arguments used to create the pact plan, depend on
	 *        implementation of the pact plan. See getDescription().
	 * @throws ProgramInvocationException
	 *         Thrown, if the Program can't be properly loaded. Causes
	 *         may be a missing / wrong class or manifest files.
	 */
	public PackagedProgram(File jarFile, List<URL> classpaths, String entryPointClassName, String... args) throws ProgramInvocationException {
		if (jarFile == null) {
			throw new IllegalArgumentException("The jar file must not be null.");
		}
		
		URL jarFileUrl;
		try {
			jarFileUrl = jarFile.getAbsoluteFile().toURI().toURL();
		} catch (MalformedURLException e1) {
			throw new IllegalArgumentException("The jar file path is invalid.");
		}
		
		checkJarFile(jarFileUrl);
		
		this.jarFile = jarFileUrl;
		this.args = args == null ? new String[0] : args;
		
		// if no entryPointClassName name was given, we try and look one up through the manifest
		if (entryPointClassName == null) {
			entryPointClassName = getEntryPointClassNameFromJar(jarFileUrl);
		}
		
		// now that we have an entry point, we can extract the nested jar files (if any)
		this.extractedTempLibraries = extractContainedLibaries(jarFileUrl);
		this.classpaths = classpaths;
		this.userCodeClassLoader = JobWithJars.buildUserCodeClassLoader(getAllLibraries(), classpaths, getClass().getClassLoader());
		
		// load the entry point class
		this.mainClass = loadMainClass(entryPointClassName, userCodeClassLoader);
		
		// if the entry point is a program, instantiate the class and get the plan
		if (Program.class.isAssignableFrom(this.mainClass)) {
			Program prg = null;
			try {
				prg = InstantiationUtil.instantiate(this.mainClass.asSubclass(Program.class), Program.class);
			} catch (Exception e) {
				// validate that the class has a main method at least.
				// the main method possibly instantiates the program properly
				if (!hasMainMethod(mainClass)) {
					throw new ProgramInvocationException("The given program class implements the " + 
							Program.class.getName() + " interface, but cannot be instantiated. " +
							"It also declares no main(String[]) method as alternative entry point", e);
				}
			} catch (Throwable t) {
				throw new ProgramInvocationException("Error while trying to instantiate program class.", t);
			}
			this.program = prg;
		} else if (hasMainMethod(mainClass)) {
			this.program = null;
		} else {
			throw new ProgramInvocationException("The given program class neither has a main(String[]) method, nor does it implement the " + 
					Program.class.getName() + " interface.");
		}
	}
	
	PackagedProgram(Class<?> entryPointClass, String... args) throws ProgramInvocationException {
		this.jarFile = null;
		this.args = args == null ? new String[0] : args;
		
		this.extractedTempLibraries = Collections.emptyList();
		this.classpaths = Collections.emptyList();
		this.userCodeClassLoader = entryPointClass.getClassLoader();
		
		// load the entry point class
		this.mainClass = entryPointClass;
		
		// if the entry point is a program, instantiate the class and get the plan
		if (Program.class.isAssignableFrom(this.mainClass)) {
			Program prg = null;
			try {
				prg = InstantiationUtil.instantiate(this.mainClass.asSubclass(Program.class), Program.class);
			} catch (Exception e) {
				// validate that the class has a main method at least.
				// the main method possibly instantiates the program properly
				if (!hasMainMethod(mainClass)) {
					throw new ProgramInvocationException("The given program class implements the " + 
							Program.class.getName() + " interface, but cannot be instantiated. " +
							"It also declares no main(String[]) method as alternative entry point", e);
				}
			} catch (Throwable t) {
				throw new ProgramInvocationException("Error while trying to instantiate program class.", t);
			}
			this.program = prg;
		} else if (hasMainMethod(mainClass)) {
			this.program = null;
		} else {
			throw new ProgramInvocationException("The given program class neither has a main(String[]) method, nor does it implement the " + 
					Program.class.getName() + " interface.");
		}
	}

	public void setSavepointPath(String savepointPath) {
		this.savepointPath = savepointPath;
	}

	public String getSavepointPath() {
		return savepointPath;
	}

	public String[] getArguments() {
		return this.args;
	}
	
	public String getMainClassName() {
		return this.mainClass.getName();
	}
	
	public boolean isUsingInteractiveMode() {
		return this.program == null;
	}
	
	public boolean isUsingProgramEntryPoint() {
		return this.program != null;
	}

	/**
	 * Returns the plan with all required jars.
	 * 
	 * @return The plan with attached jar files.
	 * @throws ProgramInvocationException 
	 */
	public JobWithJars getPlanWithJars() throws ProgramInvocationException {
		if (isUsingProgramEntryPoint()) {
			return new JobWithJars(getPlan(), getAllLibraries(), classpaths, userCodeClassLoader);
		} else {
			throw new ProgramInvocationException("Cannot create a " + JobWithJars.class.getSimpleName() + 
					" for a program that is using the interactive mode.");
		}
	}

	/**
	 * Returns the analyzed plan without any optimizations.
	 * 
	 * @return
	 *         the analyzed plan without any optimizations.
	 * @throws ProgramInvocationException Thrown if an error occurred in the
	 *  user-provided pact assembler. This may indicate
	 *         missing parameters for generation.
	 */
	public String getPreviewPlan() throws ProgramInvocationException {
		Thread.currentThread().setContextClassLoader(this.getUserCodeClassLoader());
		List<DataSinkNode> previewPlan;
		
		if (isUsingProgramEntryPoint()) {
			previewPlan = Optimizer.createPreOptimizedPlan(getPlan());
		}
		else if (isUsingInteractiveMode()) {
			// temporary hack to support the web client
			PreviewPlanEnvironment env = new PreviewPlanEnvironment();
			env.setAsContext();
			try {
				invokeInteractiveModeForExecution();
			}
			catch (ProgramInvocationException e) {
				throw e;
			}
			catch (Throwable t) {
				// the invocation gets aborted with the preview plan
				if (env.previewPlan != null) {
					previewPlan = env.previewPlan;
				} else if (env.preview != null) {
					return env.preview;
				} else {
					throw new ProgramInvocationException("The program caused an error: ", t);
				}
			}
			finally {
				env.unsetAsContext();
			}
			
			if (env.previewPlan != null) {
				previewPlan =  env.previewPlan;
			} else {
				throw new ProgramInvocationException(
						"The program plan could not be fetched. The program silently swallowed the control flow exceptions.");
			}
		}
		else {
			throw new RuntimeException("Couldn't determine program mode.");
		}

		PlanJSONDumpGenerator jsonGen = new PlanJSONDumpGenerator();
		StringWriter string = new StringWriter(1024);
		try (PrintWriter pw = new PrintWriter(string)) {
			jsonGen.dumpPactPlanAsJSON(previewPlan, pw);
		}
		return string.toString();

	}

	/**
	 * Returns the description provided by the Program class. This
	 * may contain a description of the plan itself and its arguments.
	 * 
	 * @return The description of the PactProgram's input parameters.
	 * @throws ProgramInvocationException
	 *         Thrown, if the Program can't be properly loaded. Causes
	 *         may be a missing / wrong class or manifest files.
	 */
	public String getDescription() throws ProgramInvocationException {
		if (ProgramDescription.class.isAssignableFrom(this.mainClass)) {
			
			ProgramDescription descr;
			if (this.program != null) {
				descr = (ProgramDescription) this.program;
			} else {
				try {
					descr =  InstantiationUtil.instantiate(
						this.mainClass.asSubclass(ProgramDescription.class), ProgramDescription.class);
				} catch (Throwable t) {
					return null;
				}
			}
			
			try {
				return descr.getDescription();
			}
			catch (Throwable t) {
				throw new ProgramInvocationException("Error while getting the program description" + 
						(t.getMessage() == null ? "." : ": " + t.getMessage()), t);
			}
			
		} else {
			return null;
		}
	}
	
	/**
	 * 
	 * This method assumes that the context environment is prepared, or the execution
	 * will be a local execution by default.
	 */
	public void invokeInteractiveModeForExecution() throws ProgramInvocationException{
		if (isUsingInteractiveMode()) {
			callMainMethod(mainClass, args);
		} else {
			throw new ProgramInvocationException("Cannot invoke a plan-based program directly.");
		}
	}

	/**
	 * Returns the classpaths that are required by the program.
	 *
	 * @return List of {@link java.net.URL}s.
	 */
	public List<URL> getClasspaths() {
		return this.classpaths;
	}

	/**
	 * Gets the {@link java.lang.ClassLoader} that must be used to load user code classes.
	 * 
	 * @return The user code ClassLoader.
	 */
	public ClassLoader getUserCodeClassLoader() {
		return this.userCodeClassLoader;
	}

	public List<URL> getAllLibraries() {
		List<URL> libs = new ArrayList<URL>(this.extractedTempLibraries.size() + 1);

		if (jarFile != null) {
			libs.add(jarFile);
		}
		for (File tmpLib : this.extractedTempLibraries) {
			try {
				libs.add(tmpLib.getAbsoluteFile().toURI().toURL());
			}
			catch (MalformedURLException e) {
				throw new RuntimeException("URL is invalid. This should not happen.", e);
			}
		}

		return libs;
	}

	/**
	 * Deletes all temporary files created for contained packaged libraries.
	 */
	public void deleteExtractedLibraries() {
		deleteExtractedLibraries(this.extractedTempLibraries);
		this.extractedTempLibraries.clear();
	}
	
	
	/**
	 * Returns the plan as generated from the Pact Assembler.
	 * 
	 * @return The program's plan.
	 * @throws ProgramInvocationException Thrown, if an error occurred in the program while
	 *         creating the program's {@link Plan}.
	 */
	private Plan getPlan() throws ProgramInvocationException {
		if (this.plan == null) {
			Thread.currentThread().setContextClassLoader(this.userCodeClassLoader);
			this.plan = createPlanFromProgram(this.program, this.args);
		}
		
		return this.plan;
	}
	
	private static boolean hasMainMethod(Class<?> entryClass) {
		Method mainMethod;
		try {
			mainMethod = entryClass.getMethod("main", String[].class);
		} catch (NoSuchMethodException e) {
			return false;
		}
		catch (Throwable t) {
			throw new RuntimeException("Could not look up the main(String[]) method from the class " + 
					entryClass.getName() + ": " + t.getMessage(), t);
		}
		
		return Modifier.isStatic(mainMethod.getModifiers()) && Modifier.isPublic(mainMethod.getModifiers());
	}
	
	private static void callMainMethod(Class<?> entryClass, String[] args) throws ProgramInvocationException {
		Method mainMethod;
		try {
			mainMethod = entryClass.getMethod("main", String[].class);
		} catch (NoSuchMethodException e) {
			throw new ProgramInvocationException("The class " + entryClass.getName() + " has no main(String[]) method.");
		}
		catch (Throwable t) {
			throw new ProgramInvocationException("Could not look up the main(String[]) method from the class " + 
					entryClass.getName() + ": " + t.getMessage(), t);
		}
		
		if (!Modifier.isStatic(mainMethod.getModifiers())) {
			throw new ProgramInvocationException("The class " + entryClass.getName() + " declares a non-static main method.");
		}
		if (!Modifier.isPublic(mainMethod.getModifiers())) {
			throw new ProgramInvocationException("The class " + entryClass.getName() + " declares a non-public main method.");
		}
		
		try {
			mainMethod.invoke(null, (Object) args);
		}
		catch (IllegalArgumentException e) {
			throw new ProgramInvocationException("Could not invoke the main method, arguments are not matching.", e);
		}
		catch (IllegalAccessException e) {
			throw new ProgramInvocationException("Access to the main method was denied: " + e.getMessage(), e);
		}
		catch (InvocationTargetException e) {
			Throwable exceptionInMethod = e.getTargetException();
			if (exceptionInMethod instanceof Error) {
				throw (Error) exceptionInMethod;
			} else if (exceptionInMethod instanceof ProgramInvocationException) {
				throw (ProgramInvocationException) exceptionInMethod;
			} else {
				throw new ProgramInvocationException("The main method caused an error.", exceptionInMethod);
			}
		}
		catch (Throwable t) {
			throw new ProgramInvocationException("An error occurred while invoking the program's main method: " + t.getMessage(), t);
		}
	}

	private static String getEntryPointClassNameFromJar(URL jarFile) throws ProgramInvocationException {
		JarFile jar;
		Manifest manifest;
		String className;

		// Open jar file
		try {
			jar = new JarFile(new File(jarFile.toURI()));
		} catch (URISyntaxException use) {
			throw new ProgramInvocationException("Invalid file path '" + jarFile.getPath() + "'", use);
		} catch (IOException ioex) {
			throw new ProgramInvocationException("Error while opening jar file '" + jarFile.getPath() + "'. "
				+ ioex.getMessage(), ioex);
		}

		// jar file must be closed at the end
		try {
			// Read from jar manifest
			try {
				manifest = jar.getManifest();
			} catch (IOException ioex) {
				throw new ProgramInvocationException("The Manifest in the jar file could not be accessed '"
					+ jarFile.getPath() + "'. " + ioex.getMessage(), ioex);
			}
	
			if (manifest == null) {
				throw new ProgramInvocationException("No manifest found in jar file '" + jarFile.getPath() + "'. The manifest is need to point to the program's main class.");
			}
	
			Attributes attributes = manifest.getMainAttributes();
			
			// check for a "program-class" entry first
			className = attributes.getValue(PackagedProgram.MANIFEST_ATTRIBUTE_ASSEMBLER_CLASS);
			if (className != null) {
				return className;
			}
			
			
			// check for a main class
			className = attributes.getValue(PackagedProgram.MANIFEST_ATTRIBUTE_MAIN_CLASS);
			if (className != null) {
				return className;
			} else {
				throw new ProgramInvocationException("Neither a '" + MANIFEST_ATTRIBUTE_MAIN_CLASS + "', nor a '" +
						MANIFEST_ATTRIBUTE_ASSEMBLER_CLASS + "' entry was found in the jar file.");
			}
		}
		finally {
			try {
				jar.close();
			} catch (Throwable t) {
				throw new ProgramInvocationException("Could not close the JAR file: " + t.getMessage(), t);
			}
		}
	}
	
	private static Class<?> loadMainClass(String className, ClassLoader cl) throws ProgramInvocationException {
		ClassLoader contextCl = null;
		try {
			contextCl = Thread.currentThread().getContextClassLoader();
			Thread.currentThread().setContextClassLoader(cl);
			return Class.forName(className, false, cl);
		}
		catch (ClassNotFoundException e) {
			throw new ProgramInvocationException("The program's entry point class '" + className
				+ "' was not found in the jar file.", e);
		}
		catch (ExceptionInInitializerError e) {
			throw new ProgramInvocationException("The program's entry point class '" + className
				+ "' threw an error during initialization.", e);
		}
		catch (LinkageError e) {
			throw new ProgramInvocationException("The program's entry point class '" + className
				+ "' could not be loaded due to a linkage failure.", e);
		}
		catch (Throwable t) {
			throw new ProgramInvocationException("The program's entry point class '" + className
				+ "' caused an exception during initialization: "+ t.getMessage(), t);
		} finally {
			if (contextCl != null) {
				Thread.currentThread().setContextClassLoader(contextCl);
			}
		}
	}
	
	/**
	 * Takes the jar described by the given file and invokes its pact assembler class to
	 * assemble a plan. The assembler class name is either passed through a parameter,
	 * or it is read from the manifest of the jar. The assembler is handed the given options
	 * for its assembly.
	 * 
	 * @param program The program to create the plan for.
	 * @param options
	 *        The options for the assembler.
	 * @return The plan created by the program.
	 * @throws ProgramInvocationException
	 *         Thrown, if an error occurred in the user-provided pact assembler.
	 */
	private static Plan createPlanFromProgram(Program program, String[] options) throws ProgramInvocationException {
		try {
			return program.getPlan(options);
		} catch (Throwable t) {
			throw new ProgramInvocationException("Error while calling the program: " + t.getMessage(), t);
		}
	}
	
	/**
	 * Takes all JAR files that are contained in this program's JAR file and extracts them
	 * to the system's temp directory.
	 * 
	 * @return The file names of the extracted temporary files.
	 * @throws ProgramInvocationException Thrown, if the extraction process failed.
	 */
	private static List<File> extractContainedLibaries(URL jarFile) throws ProgramInvocationException {
		
		Random rnd = new Random();
		
		JarFile jar = null;
		try {
			jar = new JarFile(new File(jarFile.toURI()));
			final List<JarEntry> containedJarFileEntries = new ArrayList<JarEntry>();
			
			Enumeration<JarEntry> entries = jar.entries();
			while (entries.hasMoreElements()) {
				JarEntry entry = entries.nextElement();
				String name = entry.getName();
				
				if (name.length() > 8 && name.startsWith("lib/") && name.endsWith(".jar")) {
					containedJarFileEntries.add(entry);
				}
			}
			
			if (containedJarFileEntries.isEmpty()) {
				return Collections.emptyList();
			}
			else {
				// go over all contained jar files
				final List<File> extractedTempLibraries = new ArrayList<File>(containedJarFileEntries.size());
				final byte[] buffer = new byte[4096];
				
				boolean incomplete = true;
				
				try {
					for (int i = 0; i < containedJarFileEntries.size(); i++) {
						final JarEntry entry = containedJarFileEntries.get(i);
						String name = entry.getName();
						name = name.replace(File.separatorChar, '_');
					
						File tempFile;
						try {
							tempFile = File.createTempFile(String.valueOf(Math.abs(rnd.nextInt()) + "_"), name);
							tempFile.deleteOnExit();
						}
						catch (IOException e) {
							throw new ProgramInvocationException(
								"An I/O error occurred while creating temporary file to extract nested library '" + 
										entry.getName() + "'.", e);
						}
						
						extractedTempLibraries.add(tempFile);
						
						// copy the jar entry's contents to the temporary file
						OutputStream out = null;
						InputStream in = null; 
						try {
							
							
							out = new FileOutputStream(tempFile);
							in = new BufferedInputStream(jar.getInputStream(entry));
							
							int numRead = 0;
							while ((numRead = in.read(buffer)) != -1) {
								out.write(buffer, 0, numRead);
							}
						}
						catch (IOException e) {
							throw new ProgramInvocationException("An I/O error occurred while extracting nested library '"
									+ entry.getName() + "' to temporary file '" + tempFile.getAbsolutePath() + "'.");
						}
						finally {
							if (out != null) {
								out.close();
							}
							if (in != null) {
								in.close();
							}
						}
					}
					
					incomplete = false;
				}
				finally {
					if (incomplete) {
						deleteExtractedLibraries(extractedTempLibraries);
					}
				}
				
				return extractedTempLibraries;
			}
		}
		catch (Throwable t) {
			throw new ProgramInvocationException("Unknown I/O error while extracting contained jar files.", t);
		}
		finally {
			if (jar != null) {
				try {
					jar.close();
				} catch (Throwable t) {}
			}
		}
	}
	
	private static void deleteExtractedLibraries(List<File> tempLibraries) {
		for (File f : tempLibraries) {
			f.delete();
		}
	}
	
	private static void checkJarFile(URL jarfile) throws ProgramInvocationException {
		try {
			JobWithJars.checkJarFile(jarfile);
		}
		catch (IOException e) {
			throw new ProgramInvocationException(e.getMessage());
		}
		catch (Throwable t) {
			throw new ProgramInvocationException("Cannot access jar file" + (t.getMessage() == null ? "." : ": " + t.getMessage()), t);
		}
	}

}
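
Before anything reaches the Client, PackagedProgram is what turns a user jar into an inspectable program. Below is a small sketch of that step on its own; the jar path, the entry-point class name and the program arguments are placeholders, not values from this note.

import java.io.File;

import org.apache.flink.client.program.PackagedProgram;

public class InspectProgramExample {

	public static void main(String[] args) throws Exception {
		// the explicit entry-point class overrides whatever the jar manifest declares
		PackagedProgram program = new PackagedProgram(
				new File("/path/to/WordCount.jar"),
				"org.example.WordCount",
				"--input", "hdfs:///in", "--output", "hdfs:///out");
		try {
			System.out.println("entry point:      " + program.getMainClassName());
			System.out.println("plan-based?       " + program.isUsingProgramEntryPoint());
			System.out.println("interactive mode? " + program.isUsingInteractiveMode());
			// JSON dump of the pre-optimized plan (the same output the web client uses)
			System.out.println(program.getPreviewPlan());
		}
		finally {
			program.deleteExtractedLibraries();
		}
	}
}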


555

/*
 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
 * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 */

package java.util.jar;

import java.io.FilterInputStream;
import java.io.DataOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.IOException;
import java.util.Map;
import java.util.HashMap;
import java.util.Iterator;

/**
 * The Manifest class is used to maintain Manifest entry names and their
 * associated Attributes. There are main Manifest Attributes as well as
 * per-entry Attributes. For information on the Manifest format, please
 * see the Manifest format specification.
 *
 * @author  David Connelly
 * @see     Attributes
 * @since   1.2
 */
public class Manifest implements Cloneable {
    // manifest main attributes
    private Attributes attr = new Attributes();

    // manifest entries
    private Map<String, Attributes> entries = new HashMap<>();

    /**
     * Constructs a new, empty Manifest.
     */
    public Manifest() {
    }

    /**
     * Constructs a new Manifest from the specified input stream.
     *
     * @param is the input stream containing manifest data
     * @throws IOException if an I/O error has occurred
     */
    public Manifest(InputStream is) throws IOException {
        read(is);
    }

    /**
     * Constructs a new Manifest that is a copy of the specified Manifest.
     *
     * @param man the Manifest to copy
     */
    public Manifest(Manifest man) {
        attr.putAll(man.getMainAttributes());
        entries.putAll(man.getEntries());
    }

    /**
     * Returns the main Attributes for the Manifest.
     * @return the main Attributes for the Manifest
     */
    public Attributes getMainAttributes() {
        return attr;
    }

    /**
     * Returns a Map of the entries contained in this Manifest. Each entry
     * is represented by a String name (key) and associated Attributes (value).
     * The Map permits the {@code null} key, but no entry with a null key is
     * created by {@link #read}, nor is such an entry written by using {@link
     * #write}.
     *
     * @return a Map of the entries contained in this Manifest
     */
    public Map<String, Attributes> getEntries() {
        return entries;
    }

    /**
     * Returns the Attributes for the specified entry name.
     * This method is defined as:
     * 
     *      return (Attributes)getEntries().get(name)
     * 
     * Though {@code null} is a valid {@code name}, when
     * {@code getAttributes(null)} is invoked on a {@code Manifest}
     * obtained from a jar file, {@code null} will be returned. While jar
     * files themselves do not allow {@code null}-named attributes, it is
     * possible to invoke {@link #getEntries} on a {@code Manifest}, and
     * on that result, invoke {@code put} with a null key and an
     * arbitrary value. Subsequent invocations of
     * {@code getAttributes(null)} will return the just-{@code put}
     * value.

* Note that this method does not return the manifest's main attributes; * see {@link #getMainAttributes}. * * @param name entry name * @return the Attributes for the specified entry name */ public Attributes getAttributes(String name) { return getEntries().get(name); } /** * Clears the main Attributes as well as the entries in this Manifest. */ public void clear() { attr.clear(); entries.clear(); } /** * Writes the Manifest to the specified OutputStream. * Attributes.Name.MANIFEST_VERSION must be set in * MainAttributes prior to invoking this method. * * @param out the output stream * @exception IOException if an I/O error has occurred * @see #getMainAttributes */ public void write(OutputStream out) throws IOException { DataOutputStream dos = new DataOutputStream(out); // Write out the main attributes for the manifest attr.writeMain(dos); // Now write out the pre-entry attributes Iterator it = entries.entrySet().iterator(); while (it.hasNext()) { Map.Entry e = (Map.Entry)it.next(); StringBuffer buffer = new StringBuffer("Name: "); String value = (String)e.getKey(); if (value != null) { byte[] vb = value.getBytes("UTF8"); value = new String(vb, 0, 0, vb.length); } buffer.append(value); buffer.append("\r\n"); make72Safe(buffer); dos.writeBytes(buffer.toString()); ((Attributes)e.getValue()).write(dos); } dos.flush(); } /** * Adds line breaks to enforce a maximum 72 bytes per line. */ static void make72Safe(StringBuffer line) { int length = line.length(); if (length > 72) { int index = 70; while (index < length - 2) { line.insert(index, "\r\n "); index += 72; length += 3; } } return; } /** * Reads the Manifest from the specified InputStream. The entry * names and attributes read will be merged in with the current * manifest entries. * * @param is the input stream * @exception IOException if an I/O error has occurred */ public void read(InputStream is) throws IOException { // Buffered input stream for reading manifest data FastInputStream fis = new FastInputStream(is); // Line buffer byte[] lbuf = new byte[512]; // Read the main attributes for the manifest attr.read(fis, lbuf); // Total number of entries, attributes read int ecount = 0, acount = 0; // Average size of entry attributes int asize = 2; // Now parse the manifest entries int len; String name = null; boolean skipEmptyLines = true; byte[] lastline = null; while ((len = fis.readLine(lbuf)) != -1) { if (lbuf[--len] != '\n') { throw new IOException("manifest line too long"); } if (len > 0 && lbuf[len-1] == '\r') { --len; } if (len == 0 && skipEmptyLines) { continue; } skipEmptyLines = false; if (name == null) { name = parseName(lbuf, len); if (name == null) { throw new IOException("invalid manifest format"); } if (fis.peek() == ' ') { // name is wrapped lastline = new byte[len - 6]; System.arraycopy(lbuf, 6, lastline, 0, len - 6); continue; } } else { // continuation line byte[] buf = new byte[lastline.length + len - 1]; System.arraycopy(lastline, 0, buf, 0, lastline.length); System.arraycopy(lbuf, 1, buf, lastline.length, len - 1); if (fis.peek() == ' ') { // name is wrapped lastline = buf; continue; } name = new String(buf, 0, buf.length, "UTF8"); lastline = null; } Attributes attr = getAttributes(name); if (attr == null) { attr = new Attributes(asize); entries.put(name, attr); } attr.read(fis, lbuf); ecount++; acount += attr.size(); //XXX: Fix for when the average is 0. When it is 0, // you get an Attributes object with an initial // capacity of 0, which tickles a bug in HashMap. 
asize = Math.max(2, acount / ecount); name = null; skipEmptyLines = true; } } private String parseName(byte[] lbuf, int len) { if (toLower(lbuf[0]) == 'n' && toLower(lbuf[1]) == 'a' && toLower(lbuf[2]) == 'm' && toLower(lbuf[3]) == 'e' && lbuf[4] == ':' && lbuf[5] == ' ') { try { return new String(lbuf, 6, len - 6, "UTF8"); } catch (Exception e) { } } return null; } private int toLower(int c) { return (c >= 'A' && c <= 'Z') ? 'a' + (c - 'A') : c; } /** * Returns true if the specified Object is also a Manifest and has * the same main Attributes and entries. * * @param o the object to be compared * @return true if the specified Object is also a Manifest and has * the same main Attributes and entries */ public boolean equals(Object o) { if (o instanceof Manifest) { Manifest m = (Manifest)o; return attr.equals(m.getMainAttributes()) && entries.equals(m.getEntries()); } else { return false; } } /** * Returns the hash code for this Manifest. */ public int hashCode() { return attr.hashCode() + entries.hashCode(); } /** * Returns a shallow copy of this Manifest. The shallow copy is * implemented as follows: *

     *     public Object clone() { return new Manifest(this); }
     * 
* @return a shallow copy of this Manifest */ public Object clone() { return new Manifest(this); } /* * A fast buffered input stream for parsing manifest files. */ static class FastInputStream extends FilterInputStream { private byte buf[]; private int count = 0; private int pos = 0; FastInputStream(InputStream in) { this(in, 8192); } FastInputStream(InputStream in, int size) { super(in); buf = new byte[size]; } public int read() throws IOException { if (pos >= count) { fill(); if (pos >= count) { return -1; } } return buf[pos++] & 0xff; } public int read(byte[] b, int off, int len) throws IOException { int avail = count - pos; if (avail <= 0) { if (len >= buf.length) { return in.read(b, off, len); } fill(); avail = count - pos; if (avail <= 0) { return -1; } } if (len > avail) { len = avail; } System.arraycopy(buf, pos, b, off, len); pos += len; return len; } /* * Reads 'len' bytes from the input stream, or until an end-of-line * is reached. Returns the number of bytes read. */ public int readLine(byte[] b, int off, int len) throws IOException { byte[] tbuf = this.buf; int total = 0; while (total < len) { int avail = count - pos; if (avail <= 0) { fill(); avail = count - pos; if (avail <= 0) { return -1; } } int n = len - total; if (n > avail) { n = avail; } int tpos = pos; int maxpos = tpos + n; while (tpos < maxpos && tbuf[tpos++] != '\n') ; n = tpos - pos; System.arraycopy(tbuf, pos, b, off, n); off += n; total += n; pos = tpos; if (tbuf[tpos-1] == '\n') { break; } } return total; } public byte peek() throws IOException { if (pos == count) fill(); if (pos == count) return -1; // nothing left in buffer return buf[pos]; } public int readLine(byte[] b) throws IOException { return readLine(b, 0, b.length); } public long skip(long n) throws IOException { if (n <= 0) { return 0; } long avail = count - pos; if (avail <= 0) { return in.skip(n); } if (n > avail) { n = avail; } pos += n; return n; } public int available() throws IOException { return (count - pos) + in.available(); } public void close() throws IOException { if (in != null) { in.close(); in = null; buf = null; } } private void fill() throws IOException { count = pos = 0; int n = in.read(buf, 0, buf.length); if (n > 0) { count = n; } } } }
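
The Manifest class above is what getEntryPointClassNameFromJar() ultimately reads. The same lookup, reduced to plain JDK calls, is sketched below; the jar path is a placeholder.

import java.io.File;
import java.util.jar.Attributes;
import java.util.jar.JarFile;
import java.util.jar.Manifest;

public class ReadManifestExample {

    public static void main(String[] args) throws Exception {
        try (JarFile jar = new JarFile(new File("/path/to/WordCount.jar"))) {
            Manifest manifest = jar.getManifest();
            if (manifest == null) {
                System.out.println("jar has no META-INF/MANIFEST.MF");
                return;
            }
            Attributes main = manifest.getMainAttributes();
            // Flink checks "program-class" first, then falls back to the standard "Main-Class"
            System.out.println("program-class: " + main.getValue("program-class"));
            System.out.println("Main-Class:    " + main.getValue("Main-Class"));
        }
    }
}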

666

/*
 * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
 * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 *
 */

package java.util.jar;

import java.io.*;
import java.lang.ref.SoftReference;
import java.net.URL;
import java.util.*;
import java.util.zip.*;
import java.security.CodeSigner;
import java.security.cert.Certificate;
import java.security.AccessController;
import java.security.CodeSource;
import sun.misc.IOUtils;
import sun.security.action.GetPropertyAction;
import sun.security.util.ManifestEntryVerifier;
import sun.misc.SharedSecrets;
import sun.security.util.SignatureFileVerifier;

/**
 * The JarFile class is used to read the contents of a jar file
 * from any file that can be opened with java.io.RandomAccessFile.
 * It extends the class java.util.zip.ZipFile with support
 * for reading an optional Manifest entry. The
 * Manifest can be used to specify meta-information about the
 * jar file and its entries.
 *
 * 

Unless otherwise noted, passing a null argument to a constructor * or method in this class will cause a {@link NullPointerException} to be * thrown. * * @author David Connelly * @see Manifest * @see java.util.zip.ZipFile * @see java.util.jar.JarEntry * @since 1.2 */ public class JarFile extends ZipFile { private SoftReference manRef; private JarEntry manEntry; private JarVerifier jv; private boolean jvInitialized; private boolean verify; private boolean computedHasClassPathAttribute; private boolean hasClassPathAttribute; // Set up JavaUtilJarAccess in SharedSecrets static { SharedSecrets.setJavaUtilJarAccess(new JavaUtilJarAccessImpl()); } /** * The JAR manifest file name. */ public static final String MANIFEST_NAME = "META-INF/MANIFEST.MF"; /** * Creates a new JarFile to read from the specified * file name. The JarFile will be verified if * it is signed. * @param name the name of the jar file to be opened for reading * @throws IOException if an I/O error has occurred * @throws SecurityException if access to the file is denied * by the SecurityManager */ public JarFile(String name) throws IOException { this(new File(name), true, ZipFile.OPEN_READ); } /** * Creates a new JarFile to read from the specified * file name. * @param name the name of the jar file to be opened for reading * @param verify whether or not to verify the jar file if * it is signed. * @throws IOException if an I/O error has occurred * @throws SecurityException if access to the file is denied * by the SecurityManager */ public JarFile(String name, boolean verify) throws IOException { this(new File(name), verify, ZipFile.OPEN_READ); } /** * Creates a new JarFile to read from the specified * File object. The JarFile will be verified if * it is signed. * @param file the jar file to be opened for reading * @throws IOException if an I/O error has occurred * @throws SecurityException if access to the file is denied * by the SecurityManager */ public JarFile(File file) throws IOException { this(file, true, ZipFile.OPEN_READ); } /** * Creates a new JarFile to read from the specified * File object. * @param file the jar file to be opened for reading * @param verify whether or not to verify the jar file if * it is signed. * @throws IOException if an I/O error has occurred * @throws SecurityException if access to the file is denied * by the SecurityManager. */ public JarFile(File file, boolean verify) throws IOException { this(file, verify, ZipFile.OPEN_READ); } /** * Creates a new JarFile to read from the specified * File object in the specified mode. The mode argument * must be either OPEN_READ or OPEN_READ | OPEN_DELETE. * * @param file the jar file to be opened for reading * @param verify whether or not to verify the jar file if * it is signed. * @param mode the mode in which the file is to be opened * @throws IOException if an I/O error has occurred * @throws IllegalArgumentException * if the mode argument is invalid * @throws SecurityException if access to the file is denied * by the SecurityManager * @since 1.3 */ public JarFile(File file, boolean verify, int mode) throws IOException { super(file, mode); this.verify = verify; } /** * Returns the jar file manifest, or null if none. * * @return the jar file manifest, or null if none * * @throws IllegalStateException * may be thrown if the jar file has been closed */ public Manifest getManifest() throws IOException { return getManifestFromReference(); } private Manifest getManifestFromReference() throws IOException { Manifest man = manRef != null ? 
manRef.get() : null; if (man == null) { JarEntry manEntry = getManEntry(); // If found then load the manifest if (manEntry != null) { if (verify) { byte[] b = getBytes(manEntry); man = new Manifest(new ByteArrayInputStream(b)); if (!jvInitialized) { jv = new JarVerifier(b); } } else { man = new Manifest(super.getInputStream(manEntry)); } manRef = new SoftReference(man); } } return man; } private native String[] getMetaInfEntryNames(); /** * Returns the JarEntry for the given entry name or * null if not found. * * @param name the jar file entry name * @return the JarEntry for the given entry name or * null if not found. * * @throws IllegalStateException * may be thrown if the jar file has been closed * * @see java.util.jar.JarEntry */ public JarEntry getJarEntry(String name) { return (JarEntry)getEntry(name); } /** * Returns the ZipEntry for the given entry name or * null if not found. * * @param name the jar file entry name * @return the ZipEntry for the given entry name or * null if not found * * @throws IllegalStateException * may be thrown if the jar file has been closed * * @see java.util.zip.ZipEntry */ public ZipEntry getEntry(String name) { ZipEntry ze = super.getEntry(name); if (ze != null) { return new JarFileEntry(ze); } return null; } /** * Returns an enumeration of the zip file entries. */ public Enumeration entries() { final Enumeration enum_ = super.entries(); return new Enumeration() { public boolean hasMoreElements() { return enum_.hasMoreElements(); } public JarFileEntry nextElement() { ZipEntry ze = (ZipEntry)enum_.nextElement(); return new JarFileEntry(ze); } }; } private class JarFileEntry extends JarEntry { JarFileEntry(ZipEntry ze) { super(ze); } public Attributes getAttributes() throws IOException { Manifest man = JarFile.this.getManifest(); if (man != null) { return man.getAttributes(getName()); } else { return null; } } public Certificate[] getCertificates() { try { maybeInstantiateVerifier(); } catch (IOException e) { throw new RuntimeException(e); } if (certs == null && jv != null) { certs = jv.getCerts(JarFile.this, this); } return certs == null ? null : certs.clone(); } public CodeSigner[] getCodeSigners() { try { maybeInstantiateVerifier(); } catch (IOException e) { throw new RuntimeException(e); } if (signers == null && jv != null) { signers = jv.getCodeSigners(JarFile.this, this); } return signers == null ? null : signers.clone(); } } /* * Ensures that the JarVerifier has been created if one is * necessary (i.e., the jar appears to be signed.) This is done as * a quick check to avoid processing of the manifest for unsigned * jars. */ private void maybeInstantiateVerifier() throws IOException { if (jv != null) { return; } if (verify) { String[] names = getMetaInfEntryNames(); if (names != null) { for (int i = 0; i < names.length; i++) { String name = names[i].toUpperCase(Locale.ENGLISH); if (name.endsWith(".DSA") || name.endsWith(".RSA") || name.endsWith(".EC") || name.endsWith(".SF")) { // Assume since we found a signature-related file // that the jar is signed and that we therefore // need a JarVerifier and Manifest getManifest(); return; } } } // No signature-related files; don't instantiate a // verifier verify = false; } } /* * Initializes the verifier object by reading all the manifest * entries and passing them to the verifier. */ private void initializeVerifier() { ManifestEntryVerifier mev = null; // Verify "META-INF/" entries... 
try { String[] names = getMetaInfEntryNames(); if (names != null) { for (int i = 0; i < names.length; i++) { String uname = names[i].toUpperCase(Locale.ENGLISH); if (MANIFEST_NAME.equals(uname) || SignatureFileVerifier.isBlockOrSF(uname)) { JarEntry e = getJarEntry(names[i]); if (e == null) { throw new JarException("corrupted jar file"); } if (mev == null) { mev = new ManifestEntryVerifier (getManifestFromReference()); } byte[] b = getBytes(e); if (b != null && b.length > 0) { jv.beginEntry(e, mev); jv.update(b.length, b, 0, b.length, mev); jv.update(-1, null, 0, 0, mev); } } } } } catch (IOException ex) { // if we had an error parsing any blocks, just // treat the jar file as being unsigned jv = null; verify = false; if (JarVerifier.debug != null) { JarVerifier.debug.println("jarfile parsing error!"); ex.printStackTrace(); } } // if after initializing the verifier we have nothing // signed, we null it out. if (jv != null) { jv.doneWithMeta(); if (JarVerifier.debug != null) { JarVerifier.debug.println("done with meta!"); } if (jv.nothingToVerify()) { if (JarVerifier.debug != null) { JarVerifier.debug.println("nothing to verify!"); } jv = null; verify = false; } } } /* * Reads all the bytes for a given entry. Used to process the * META-INF files. */ private byte[] getBytes(ZipEntry ze) throws IOException { try (InputStream is = super.getInputStream(ze)) { return IOUtils.readFully(is, (int)ze.getSize(), true); } } /** * Returns an input stream for reading the contents of the specified * zip file entry. * @param ze the zip file entry * @return an input stream for reading the contents of the specified * zip file entry * @throws ZipException if a zip file format error has occurred * @throws IOException if an I/O error has occurred * @throws SecurityException if any of the jar file entries * are incorrectly signed. * @throws IllegalStateException * may be thrown if the jar file has been closed */ public synchronized InputStream getInputStream(ZipEntry ze) throws IOException { maybeInstantiateVerifier(); if (jv == null) { return super.getInputStream(ze); } if (!jvInitialized) { initializeVerifier(); jvInitialized = true; // could be set to null after a call to // initializeVerifier if we have nothing to // verify if (jv == null) return super.getInputStream(ze); } // wrap a verifier stream around the real stream return new JarVerifier.VerifierStream( getManifestFromReference(), ze instanceof JarFileEntry ? (JarEntry) ze : getJarEntry(ze.getName()), super.getInputStream(ze), jv); } // Statics for hand-coded Boyer-Moore search in hasClassPathAttribute() // The bad character shift for "class-path" private static int[] lastOcc; // The good suffix shift for "class-path" private static int[] optoSft; // Initialize the shift arrays to search for "class-path" private static char[] src = {'c','l','a','s','s','-','p','a','t','h'}; static { lastOcc = new int[128]; optoSft = new int[10]; lastOcc[(int)'c']=1; lastOcc[(int)'l']=2; lastOcc[(int)'s']=5; lastOcc[(int)'-']=6; lastOcc[(int)'p']=7; lastOcc[(int)'a']=8; lastOcc[(int)'t']=9; lastOcc[(int)'h']=10; for (int i=0; i<9; i++) optoSft[i]=10; optoSft[9]=1; } private JarEntry getManEntry() { if (manEntry == null) { // First look up manifest entry using standard name manEntry = getJarEntry(MANIFEST_NAME); if (manEntry == null) { // If not found, then iterate through all the "META-INF/" // entries to find a match. 
String[] names = getMetaInfEntryNames(); if (names != null) { for (int i = 0; i < names.length; i++) { if (MANIFEST_NAME.equals( names[i].toUpperCase(Locale.ENGLISH))) { manEntry = getJarEntry(names[i]); break; } } } } } return manEntry; } // Returns true iff this jar file has a manifest with a class path // attribute. Returns false if there is no manifest or the manifest // does not contain a "Class-Path" attribute. Currently exported to // core libraries via sun.misc.SharedSecrets. boolean hasClassPathAttribute() throws IOException { if (computedHasClassPathAttribute) { return hasClassPathAttribute; } hasClassPathAttribute = false; if (!isKnownToNotHaveClassPathAttribute()) { JarEntry manEntry = getManEntry(); if (manEntry != null) { byte[] b = getBytes(manEntry); int last = b.length - src.length; int i = 0; next: while (i<=last) { for (int j=9; j>=0; j--) { char c = (char) b[i+j]; c = (((c-'A')|('Z'-c)) >= 0) ? (char)(c + 32) : c; if (c != src[j]) { i += Math.max(j + 1 - lastOcc[c&0x7F], optoSft[j]); continue next; } } hasClassPathAttribute = true; break; } } } computedHasClassPathAttribute = true; return hasClassPathAttribute; } private static String javaHome; private static String[] jarNames; private boolean isKnownToNotHaveClassPathAttribute() { // Optimize away even scanning of manifest for jar files we // deliver which don't have a class-path attribute. If one of // these jars is changed to include such an attribute this code // must be changed. if (javaHome == null) { javaHome = AccessController.doPrivileged( new GetPropertyAction("java.home")); } if (jarNames == null) { String[] names = new String[10]; String fileSep = File.separator; int i = 0; names[i++] = fileSep + "rt.jar"; names[i++] = fileSep + "sunrsasign.jar"; names[i++] = fileSep + "jsse.jar"; names[i++] = fileSep + "jce.jar"; names[i++] = fileSep + "charsets.jar"; names[i++] = fileSep + "dnsns.jar"; names[i++] = fileSep + "ldapsec.jar"; names[i++] = fileSep + "localedata.jar"; names[i++] = fileSep + "sunjce_provider.jar"; names[i++] = fileSep + "sunpkcs11.jar"; jarNames = names; } String name = getName(); String localJavaHome = javaHome; if (name.startsWith(localJavaHome)) { String[] names = jarNames; for (int i = 0; i < names.length; i++) { if (name.endsWith(names[i])) { return true; } } } return false; } private synchronized void ensureInitialization() { try { maybeInstantiateVerifier(); } catch (IOException e) { throw new RuntimeException(e); } if (jv != null && !jvInitialized) { initializeVerifier(); jvInitialized = true; } } JarEntry newEntry(ZipEntry ze) { return new JarFileEntry(ze); } Enumeration entryNames(CodeSource[] cs) { ensureInitialization(); if (jv != null) { return jv.entryNames(this, cs); } /* * JAR file has no signed content. Is there a non-signing * code source? */ boolean includeUnsigned = false; for (int i = 0; i < cs.length; i++) { if (cs[i].getCodeSigners() == null) { includeUnsigned = true; break; } } if (includeUnsigned) { return unsignedEntryNames(); } else { return new Enumeration() { public boolean hasMoreElements() { return false; } public String nextElement() { throw new NoSuchElementException(); } }; } } /** * Returns an enumeration of the zip file entries * excluding internal JAR mechanism entries and including * signed entries missing from the ZIP directory. 
*/ Enumeration entries2() { ensureInitialization(); if (jv != null) { return jv.entries2(this, super.entries()); } // screen out entries which are never signed final Enumeration enum_ = super.entries(); return new Enumeration() { ZipEntry entry; public boolean hasMoreElements() { if (entry != null) { return true; } while (enum_.hasMoreElements()) { ZipEntry ze = (ZipEntry) enum_.nextElement(); if (JarVerifier.isSigningRelated(ze.getName())) { continue; } entry = ze; return true; } return false; } public JarFileEntry nextElement() { if (hasMoreElements()) { ZipEntry ze = entry; entry = null; return new JarFileEntry(ze); } throw new NoSuchElementException(); } }; } CodeSource[] getCodeSources(URL url) { ensureInitialization(); if (jv != null) { return jv.getCodeSources(this, url); } /* * JAR file has no signed content. Is there a non-signing * code source? */ Enumeration unsigned = unsignedEntryNames(); if (unsigned.hasMoreElements()) { return new CodeSource[]{JarVerifier.getUnsignedCS(url)}; } else { return null; } } private Enumeration unsignedEntryNames() { final Enumeration entries = entries(); return new Enumeration() { String name; /* * Grab entries from ZIP directory but screen out * metadata. */ public boolean hasMoreElements() { if (name != null) { return true; } while (entries.hasMoreElements()) { String value; ZipEntry e = (ZipEntry) entries.nextElement(); value = e.getName(); if (e.isDirectory() || JarVerifier.isSigningRelated(value)) { continue; } name = value; return true; } return false; } public String nextElement() { if (hasMoreElements()) { String value = name; name = null; return value; } throw new NoSuchElementException(); } }; } CodeSource getCodeSource(URL url, String name) { ensureInitialization(); if (jv != null) { if (jv.eagerValidation) { CodeSource cs = null; JarEntry je = getJarEntry(name); if (je != null) { cs = jv.getCodeSource(url, this, je); } else { cs = jv.getCodeSource(url, name); } return cs; } else { return jv.getCodeSource(url, name); } } return JarVerifier.getUnsignedCS(url); } void setEagerValidation(boolean eager) { try { maybeInstantiateVerifier(); } catch (IOException e) { throw new RuntimeException(e); } if (jv != null) { jv.setEagerValidation(eager); } } List getManifestDigests() { ensureInitialization(); if (jv != null) { return jv.getManifestDigests(); } return new ArrayList(); } }
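Side note (not part of the pasted JDK source): the verification path above only runs when entry bytes are actually read, because getInputStream() wraps the raw stream in a JarVerifier.VerifierStream. A minimal sketch that exercises it, assuming the jar path is passed on the command line:

import java.io.InputStream;
import java.security.CodeSigner;
import java.util.Enumeration;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;

public class JarSignerDump {
	public static void main(String[] args) throws Exception {
		// verify=true (also the default) wires up the JarVerifier path shown above.
		try (JarFile jar = new JarFile(args[0], true)) {
			Enumeration<JarEntry> entries = jar.entries();
			while (entries.hasMoreElements()) {
				JarEntry entry = entries.nextElement();
				// Signers only become available after the entry has been read completely,
				// because verification happens inside the VerifierStream wrapper.
				try (InputStream in = jar.getInputStream(entry)) {
					byte[] buf = new byte[8192];
					while (in.read(buf) != -1) { /* drain */ }
				}
				CodeSigner[] signers = entry.getCodeSigners();
				System.out.println(entry.getName() + " -> "
						+ (signers == null ? "unsigned" : signers.length + " signer(s)"));
			}
		}
	}
}

For an unsigned jar, maybeInstantiateVerifier() simply flips verify to false, so every entry is reported as unsigned.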


Manifest-Version: 1.0
Implementation-Title: flink-python
Implementation-Version: 1.0.3
Archiver-Version: Plexus Archiver
Built-By: hadoop
Specification-Vendor: The Apache Software Foundation
Specification-Title: flink-python
Implementation-Vendor-Id: org.apache.flink
Class-Path: flink-core-1.0.3.jar flink-annotations-1.0.3.jar kryo-2.24
 .0.jar minlog-1.2.jar objenesis-2.1.jar avro-1.7.6.jar jackson-core-a
 sl-1.9.13.jar jackson-mapper-asl-1.9.13.jar paranamer-2.3.jar flink-s
 haded-hadoop1_2.10-1.0.3.jar xmlenc-0.52.jar commons-io-2.4.jar commo
 ns-logging-1.0.3.jar commons-codec-1.4.jar commons-math-2.1.jar commo
 ns-configuration-1.7.jar commons-collections-3.2.1.jar commons-lang-2
 .6.jar commons-digester-1.8.1.jar commons-beanutils-1.8.3.jar commons
 -net-1.4.1.jar jetty-util-6.1.26.jar commons-el-1.0.jar hsqldb-1.8.0.
 10.jar oro-2.0.8.jar flink-java-1.0.3.jar commons-math3-3.5.jar flink
 -optimizer_2.10-1.0.3.jar flink-runtime_2.10-1.0.3.jar commons-cli-1.
 2.jar netty-all-4.0.27.Final.jar javassist-3.18.2-GA.jar scala-librar
 y-2.10.4.jar akka-actor_2.10-2.3.7.jar config-1.2.1.jar akka-remote_2
 .10-2.3.7.jar netty-3.8.0.Final.jar protobuf-java-2.5.0.jar uncommons
 -maths-1.2.2a.jar akka-slf4j_2.10-2.3.7.jar grizzled-slf4j_2.10-1.0.2
 .jar scopt_2.10-3.2.0.jar metrics-core-3.1.0.jar metrics-jvm-3.1.0.ja
 r metrics-json-3.1.0.jar jackson-databind-2.4.2.jar jackson-annotatio
 ns-2.4.2.jar jackson-core-2.4.2.jar zookeeper-3.4.6.jar jline-0.9.94.
 jar chill_2.10-0.7.4.jar chill-java-0.7.4.jar flink-clients_2.10-1.0.
 3.jar force-shading-1.0.3.jar commons-lang3-3.3.2.jar slf4j-api-1.7.7
 .jar slf4j-log4j12-1.7.7.jar log4j-1.2.17.jar
Implementation-Vendor: The Apache Software Foundation
Main-Class: org.apache.flink.python.api.PythonPlanBinder
Created-By: Apache Maven 3.0.5
Build-Jdk: 1.8.0_72-internal
Specification-Version: 1.0.3
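The wrapped Class-Path lines above (continuation lines start with a single space) are folded back into one long attribute value by java.util.jar, which is exactly what the JarFile/Manifest code earlier implements. A small sketch that dumps Main-Class and Class-Path, using a hypothetical local path to the flink-python jar:

import java.util.jar.Attributes;
import java.util.jar.JarFile;
import java.util.jar.Manifest;

public class ManifestClassPathDump {
	public static void main(String[] args) throws Exception {
		// Hypothetical path; point it at wherever flink-python-1.0.3.jar actually lives.
		try (JarFile jar = new JarFile("lib/flink-python-1.0.3.jar")) {
			Manifest manifest = jar.getManifest();            // parses META-INF/MANIFEST.MF
			Attributes main = manifest.getMainAttributes();
			System.out.println("Main-Class: " + main.getValue(Attributes.Name.MAIN_CLASS));
			// The wrapped manifest lines are unfolded into one space-separated value here.
			String classPath = main.getValue(Attributes.Name.CLASS_PATH);
			for (String dependency : classPath.split("\\s+")) {
				System.out.println("  depends on " + dependency);
			}
		}
	}
}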

888

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE
 * file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package org.apache.flink.python.api;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Random;

import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.LocalEnvironment;
import org.apache.flink.api.java.io.PrintingOutputFormat;
import org.apache.flink.api.java.io.TupleCsvInputFormat;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.CoGroupRawOperator;
import org.apache.flink.api.java.operators.CrossOperator.DefaultCross;
import org.apache.flink.api.java.operators.Grouping;
import org.apache.flink.api.common.operators.Keys;
import org.apache.flink.api.java.operators.SortedGrouping;
import org.apache.flink.api.java.operators.UdfOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.GlobalConfiguration;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.python.api.functions.util.NestedKeyDiscarder;
import org.apache.flink.python.api.functions.util.StringTupleDeserializerMap;
import org.apache.flink.python.api.PythonOperationInfo.DatasizeHint;
import static org.apache.flink.python.api.PythonOperationInfo.DatasizeHint.HUGE;
import static org.apache.flink.python.api.PythonOperationInfo.DatasizeHint.NONE;
import static org.apache.flink.python.api.PythonOperationInfo.DatasizeHint.TINY;
import org.apache.flink.python.api.functions.PythonCoGroup;
import org.apache.flink.python.api.functions.util.IdentityGroupReduce;
import org.apache.flink.python.api.functions.PythonMapPartition;
import org.apache.flink.python.api.functions.util.KeyDiscarder;
import org.apache.flink.python.api.functions.util.SerializerMap;
import org.apache.flink.python.api.functions.util.StringDeserializerMap;
import org.apache.flink.python.api.streaming.plan.PythonPlanStreamer;
import org.apache.flink.runtime.filecache.FileCache;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class allows the execution of a Flink plan written in python.
 */
public class PythonPlanBinder {
	static final Logger LOG = LoggerFactory.getLogger(PythonPlanBinder.class);

	public static final String ARGUMENT_PYTHON_2 = "2";
	public static final String ARGUMENT_PYTHON_3 = "3";

	public static final String FLINK_PYTHON_DC_ID = "flink";
	public static final String FLINK_PYTHON_PLAN_NAME = File.separator + "plan.py";

	public static final String FLINK_PYTHON2_BINARY_KEY = "python.binary.python2";
	public static final String FLINK_PYTHON3_BINARY_KEY = "python.binary.python3";
	public static final String PLANBINDER_CONFIG_BCVAR_COUNT = "PLANBINDER_BCVAR_COUNT";
	public static final String PLANBINDER_CONFIG_BCVAR_NAME_PREFIX = "PLANBINDER_BCVAR_";
	public static String FLINK_PYTHON2_BINARY_PATH = GlobalConfiguration.getString(FLINK_PYTHON2_BINARY_KEY, "python");
	public static String FLINK_PYTHON3_BINARY_PATH = GlobalConfiguration.getString(FLINK_PYTHON3_BINARY_KEY, "python3");

	private static final Random r = new Random();

	public static final String FLINK_PYTHON_FILE_PATH = System.getProperty("java.io.tmpdir") + File.separator + "flink_plan";
	private static final String FLINK_PYTHON_REL_LOCAL_PATH = File.separator + "resources" + File.separator + "python";
	private static final String FLINK_DIR = System.getenv("FLINK_ROOT_DIR");
	private static String FULL_PATH;

	public static StringBuilder arguments = new StringBuilder();

	public static boolean usePython3 = false;

	private static String FLINK_HDFS_PATH = "hdfs:/tmp";
	public static final String FLINK_TMP_DATA_DIR = System.getProperty("java.io.tmpdir") + File.separator + "flink_data";

	private HashMap sets = new HashMap<>();
	public ExecutionEnvironment env;
	private PythonPlanStreamer streamer;

	public static final int MAPPED_FILE_SIZE = 1024 * 1024 * 64;

	/**
	 * Entry point for the execution of a python plan.
	 *
	 * @param args planPath[ package1[ packageX[ - parameter1[ parameterX]]]]
	 * @throws Exception
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			System.out.println("Usage: ./bin/pyflink<2/3>.[sh/bat] [ [ [ ]]");
			return;
		}
		usePython3 = args[0].equals(ARGUMENT_PYTHON_3);
		PythonPlanBinder binder = new PythonPlanBinder();
		binder.runPlan(Arrays.copyOfRange(args, 1, args.length));
	}

	public PythonPlanBinder() throws IOException {
		FLINK_PYTHON2_BINARY_PATH = GlobalConfiguration.getString(FLINK_PYTHON2_BINARY_KEY, "python");
		FLINK_PYTHON3_BINARY_PATH = GlobalConfiguration.getString(FLINK_PYTHON3_BINARY_KEY, "python3");
		FULL_PATH = FLINK_DIR != null
				//command-line
				? FLINK_DIR + FLINK_PYTHON_REL_LOCAL_PATH
				//testing
				: new Path(FileSystem.getLocalFileSystem().getWorkingDirectory(), "src/main/python/org/apache/flink/python/api").toString();
	}

	private void runPlan(String[] args) throws Exception {
		env = ExecutionEnvironment.getExecutionEnvironment();

		int split = 0;
		for (int x = 0; x < args.length; x++) {
			if (args[x].compareTo("-") == 0) {
				split = x;
			}
		}

		try {
			String tmpPath = FLINK_PYTHON_FILE_PATH + r.nextInt();
			prepareFiles(tmpPath, Arrays.copyOfRange(args, 0, split == 0 ? 1 : split));
			startPython(tmpPath, Arrays.copyOfRange(args, split == 0 ? args.length : split + 1, args.length));
			receivePlan();

			if (env instanceof LocalEnvironment) {
				FLINK_HDFS_PATH = "file:" + System.getProperty("java.io.tmpdir") + File.separator + "flink";
			}

			distributeFiles(tmpPath, env);
			JobExecutionResult jer = env.execute();
			sendResult(jer);
			close();
		} catch (Exception e) {
			close();
			throw e;
		}
	}

	//=====Setup========================================================================================================
	/**
	 * Copies all files to a common directory (FLINK_PYTHON_FILE_PATH). This allows us to distribute it as one big
	 * package, and resolves PYTHONPATH issues.
	 *
	 * @param filePaths
	 * @throws IOException
	 * @throws URISyntaxException
	 */
	private void prepareFiles(String tempFilePath, String... filePaths) throws IOException, URISyntaxException {
		//Flink python package
		clearPath(tempFilePath);
		FileCache.copy(new Path(FULL_PATH), new Path(tempFilePath), false);

		//plan file		
		copyFile(filePaths[0], tempFilePath, FLINK_PYTHON_PLAN_NAME);

		//additional files/folders
		for (int x = 1; x < filePaths.length; x++) {
			copyFile(filePaths[x], tempFilePath, null);
		}
	}

	private static void clearPath(String path) throws IOException, URISyntaxException {
		FileSystem fs = FileSystem.get(new Path(path).toUri());
		if (fs.exists(new Path(path))) {
			fs.delete(new Path(path), true);
		}
	}

	private static void copyFile(String path, String target, String name) throws IOException, URISyntaxException {
		if (path.endsWith("/")) {
			path = path.substring(0, path.length() - 1);
		}
		String identifier = name == null ? path.substring(path.lastIndexOf("/")) : name;
		String tmpFilePath = target + "/" + identifier;
		clearPath(tmpFilePath);
		Path p = new Path(path);
		FileCache.copy(p.makeQualified(FileSystem.get(p.toUri())), new Path(tmpFilePath), true);
	}

	private static void distributeFiles(String tmpPath, ExecutionEnvironment env) throws IOException, URISyntaxException {
		clearPath(FLINK_HDFS_PATH);
		FileCache.copy(new Path(tmpPath), new Path(FLINK_HDFS_PATH), true);
		env.registerCachedFile(FLINK_HDFS_PATH, FLINK_PYTHON_DC_ID);
		clearPath(tmpPath);
	}

	private void startPython(String tempPath, String[] args) throws IOException {
		for (String arg : args) {
			arguments.append(" ").append(arg);
		}
		streamer = new PythonPlanStreamer();
		streamer.open(tempPath, arguments.toString());
	}

	private void sendResult(JobExecutionResult jer) throws IOException {
		long runtime = jer.getNetRuntime();
		streamer.sendRecord(runtime);
	}

	private void close() {
		try { //prevent throwing exception so that previous exceptions aren't hidden.
			FileSystem hdfs = FileSystem.get(new URI(FLINK_HDFS_PATH));
			hdfs.delete(new Path(FLINK_HDFS_PATH), true);

			FileSystem local = FileSystem.getLocalFileSystem();
			local.delete(new Path(FLINK_PYTHON_FILE_PATH), true);
			local.delete(new Path(FLINK_TMP_DATA_DIR), true);
			streamer.close();
		} catch (NullPointerException npe) {
		} catch (IOException ioe) {
			LOG.error("PythonAPI file cleanup failed. " + ioe.getMessage());
		} catch (URISyntaxException use) { // can't occur
		}
	}

	//====Plan==========================================================================================================
	private void receivePlan() throws IOException {
		receiveParameters();
		receiveOperations();
	}

	//====Environment===================================================================================================
	/**
	 * This enum contains the identifiers for all supported environment parameters.
	 */
	private enum Parameters {
		DOP,
		MODE,
		RETRY
	}

	private void receiveParameters() throws IOException {
		for (int x = 0; x < 3; x++) {
			Tuple value = (Tuple) streamer.getRecord(true);
			switch (Parameters.valueOf(((String) value.getField(0)).toUpperCase())) {
				case DOP:
					Integer dop = (Integer) value.getField(1);
					env.setParallelism(dop);
					break;
				case MODE:
					FLINK_HDFS_PATH = (Boolean) value.getField(1) ? "file:/tmp/flink" : "hdfs:/tmp/flink";
					break;
				case RETRY:
					int retry = (Integer) value.getField(1);
					env.setRestartStrategy(RestartStrategies.fixedDelayRestart(retry, 10000L));
					break;
			}
		}
		if (env.getParallelism() < 0) {
			env.setParallelism(1);
		}
	}

	//====Operations====================================================================================================
	/**
	 * This enum contains the identifiers for all supported DataSet operations.
	 */
	protected enum Operation {
		SOURCE_CSV, SOURCE_TEXT, SOURCE_VALUE, SOURCE_SEQ, SINK_CSV, SINK_TEXT, SINK_PRINT,
		SORT, UNION, FIRST, DISTINCT, GROUPBY, AGGREGATE,
		REBALANCE, PARTITION_HASH,
		BROADCAST,
		COGROUP, CROSS, CROSS_H, CROSS_T, FILTER, FLATMAP, GROUPREDUCE, JOIN, JOIN_H, JOIN_T, MAP, REDUCE, MAPPARTITION
	}

	private void receiveOperations() throws IOException {
		Integer operationCount = (Integer) streamer.getRecord(true);
		for (int x = 0; x < operationCount; x++) {
			PythonOperationInfo info = new PythonOperationInfo(streamer);
			Operation op;
			try {
				op = Operation.valueOf(info.identifier.toUpperCase());
			} catch (IllegalArgumentException iae) {
				throw new IllegalArgumentException("Invalid operation specified: " + info.identifier);
			}
			switch (op) {
				case SOURCE_CSV:
					createCsvSource(info);
					break;
				case SOURCE_TEXT:
					createTextSource(info);
					break;
				case SOURCE_VALUE:
					createValueSource(info);
					break;
				case SOURCE_SEQ:
					createSequenceSource(info);
					break;
				case SINK_CSV:
					createCsvSink(info);
					break;
				case SINK_TEXT:
					createTextSink(info);
					break;
				case SINK_PRINT:
					createPrintSink(info);
					break;
				case BROADCAST:
					createBroadcastVariable(info);
					break;
				case AGGREGATE:
					createAggregationOperation(info);
					break;
				case DISTINCT:
					createDistinctOperation(info);
					break;
				case FIRST:
					createFirstOperation(info);
					break;
				case PARTITION_HASH:
					createHashPartitionOperation(info);
					break;
				case REBALANCE:
					createRebalanceOperation(info);
					break;
				case GROUPBY:
					createGroupOperation(info);
					break;
				case SORT:
					createSortOperation(info);
					break;
				case UNION:
					createUnionOperation(info);
					break;
				case COGROUP:
					createCoGroupOperation(info);
					break;
				case CROSS:
					createCrossOperation(NONE, info);
					break;
				case CROSS_H:
					createCrossOperation(HUGE, info);
					break;
				case CROSS_T:
					createCrossOperation(TINY, info);
					break;
				case FILTER:
					createFilterOperation(info);
					break;
				case FLATMAP:
					createFlatMapOperation(info);
					break;
				case GROUPREDUCE:
					createGroupReduceOperation(info);
					break;
				case JOIN:
					createJoinOperation(NONE, info);
					break;
				case JOIN_H:
					createJoinOperation(HUGE, info);
					break;
				case JOIN_T:
					createJoinOperation(TINY, info);
					break;
				case MAP:
					createMapOperation(info);
					break;
				case MAPPARTITION:
					createMapPartitionOperation(info);
					break;
				case REDUCE:
					createReduceOperation(info);
					break;
			}
		}
	}

	private int getParallelism(PythonOperationInfo info) {
		return info.parallelism == -1 ? env.getParallelism() : info.parallelism;
	}

	@SuppressWarnings("unchecked")
	private void createCsvSource(PythonOperationInfo info) throws IOException {
		if (!(info.types instanceof TupleTypeInfo)) {
			throw new RuntimeException("The output type of a csv source has to be a tuple. The derived type is " + info);
		}
		Path path = new Path(info.path);
		String lineD = info.lineDelimiter;
		String fieldD = info.fieldDelimiter;
		TupleTypeInfo types = (TupleTypeInfo) info.types;
		sets.put(info.setID, env.createInput(new TupleCsvInputFormat(path, lineD, fieldD, types), info.types).setParallelism(getParallelism(info)).name("CsvSource")
				.map(new SerializerMap<>()).setParallelism(getParallelism(info)).name("CsvSourcePostStep"));
	}

	private void createTextSource(PythonOperationInfo info) throws IOException {
		sets.put(info.setID, env.readTextFile(info.path).setParallelism(getParallelism(info)).name("TextSource")
				.map(new SerializerMap()).setParallelism(getParallelism(info)).name("TextSourcePostStep"));
	}

	private void createValueSource(PythonOperationInfo info) throws IOException {
		sets.put(info.setID, env.fromElements(info.values).setParallelism(getParallelism(info)).name("ValueSource")
				.map(new SerializerMap<>()).setParallelism(getParallelism(info)).name("ValueSourcePostStep"));
	}

	private void createSequenceSource(PythonOperationInfo info) throws IOException {
		sets.put(info.setID, env.generateSequence(info.frm, info.to).setParallelism(getParallelism(info)).name("SequenceSource")
				.map(new SerializerMap()).setParallelism(getParallelism(info)).name("SequenceSourcePostStep"));
	}

	@SuppressWarnings("unchecked")
	private void createCsvSink(PythonOperationInfo info) throws IOException {
		DataSet parent = (DataSet) sets.get(info.parentID);
		parent.map(new StringTupleDeserializerMap()).setParallelism(getParallelism(info)).name("CsvSinkPreStep")
				.writeAsCsv(info.path, info.lineDelimiter, info.fieldDelimiter, info.writeMode).setParallelism(getParallelism(info)).name("CsvSink");
	}

	@SuppressWarnings("unchecked")
	private void createTextSink(PythonOperationInfo info) throws IOException {
		DataSet parent = (DataSet) sets.get(info.parentID);
		parent.map(new StringDeserializerMap()).setParallelism(getParallelism(info))
			.writeAsText(info.path, info.writeMode).setParallelism(getParallelism(info)).name("TextSink");
	}

	@SuppressWarnings("unchecked")
	private void createPrintSink(PythonOperationInfo info) throws IOException {
		DataSet parent = (DataSet) sets.get(info.parentID);
		parent.map(new StringDeserializerMap()).setParallelism(getParallelism(info)).name("PrintSinkPreStep")
			.output(new PrintingOutputFormat(info.toError)).setParallelism(getParallelism(info));
	}

	private void createBroadcastVariable(PythonOperationInfo info) throws IOException {
		UdfOperator op1 = (UdfOperator) sets.get(info.parentID);
		DataSet op2 = (DataSet) sets.get(info.otherID);

		op1.withBroadcastSet(op2, info.name);
		Configuration c = op1.getParameters();

		if (c == null) {
			c = new Configuration();
		}

		int count = c.getInteger(PLANBINDER_CONFIG_BCVAR_COUNT, 0);
		c.setInteger(PLANBINDER_CONFIG_BCVAR_COUNT, count + 1);
		c.setString(PLANBINDER_CONFIG_BCVAR_NAME_PREFIX + count, info.name);

		op1.withParameters(c);
	}

	private void createAggregationOperation(PythonOperationInfo info) throws IOException {
		DataSet op = (DataSet) sets.get(info.parentID);
		AggregateOperator ao = op.aggregate(info.aggregates[0].agg, info.aggregates[0].field);

		for (int x = 1; x < info.count; x++) {
			ao = ao.and(info.aggregates[x].agg, info.aggregates[x].field);
		}

		sets.put(info.setID, ao.setParallelism(getParallelism(info)).name("Aggregation"));
	}

	@SuppressWarnings("unchecked")
	private void createDistinctOperation(PythonOperationInfo info) throws IOException {
		DataSet op = (DataSet) sets.get(info.parentID);
		sets.put(info.setID, op.distinct(info.keys).setParallelism(getParallelism(info)).name("Distinct")
				.map(new KeyDiscarder()).setParallelism(getParallelism(info)).name("DistinctPostStep"));
	}

	private void createFirstOperation(PythonOperationInfo info) throws IOException {
		DataSet op = (DataSet) sets.get(info.parentID);
		sets.put(info.setID, op.first(info.count).setParallelism(getParallelism(info)).name("First"));
	}

	private void createGroupOperation(PythonOperationInfo info) throws IOException {
		DataSet op1 = (DataSet) sets.get(info.parentID);
		sets.put(info.setID, op1.groupBy(info.keys));
	}

	@SuppressWarnings("unchecked")
	private void createHashPartitionOperation(PythonOperationInfo info) throws IOException {
		DataSet op1 = (DataSet) sets.get(info.parentID);
		sets.put(info.setID, op1.partitionByHash(info.keys).setParallelism(getParallelism(info))
				.map(new KeyDiscarder()).setParallelism(getParallelism(info)).name("HashPartitionPostStep"));
	}

	private void createRebalanceOperation(PythonOperationInfo info) throws IOException {
		DataSet op = (DataSet) sets.get(info.parentID);
		sets.put(info.setID, op.rebalance().setParallelism(getParallelism(info)).name("Rebalance"));
	}

	private void createSortOperation(PythonOperationInfo info) throws IOException {
		Grouping op1 = (Grouping) sets.get(info.parentID);
		if (op1 instanceof UnsortedGrouping) {
			sets.put(info.setID, ((UnsortedGrouping) op1).sortGroup(info.field, info.order));
			return;
		}
		if (op1 instanceof SortedGrouping) {
			sets.put(info.setID, ((SortedGrouping) op1).sortGroup(info.field, info.order));
		}
	}

	@SuppressWarnings("unchecked")
	private void createUnionOperation(PythonOperationInfo info) throws IOException {
		DataSet op1 = (DataSet) sets.get(info.parentID);
		DataSet op2 = (DataSet) sets.get(info.otherID);
		sets.put(info.setID, op1.union(op2).setParallelism(getParallelism(info)).name("Union"));
	}

	@SuppressWarnings("unchecked")
	private void createCoGroupOperation(PythonOperationInfo info) {
		DataSet op1 = (DataSet) sets.get(info.parentID);
		DataSet op2 = (DataSet) sets.get(info.otherID);
		Keys.ExpressionKeys key1 = new Keys.ExpressionKeys(info.keys1, op1.getType());
		Keys.ExpressionKeys key2 = new Keys.ExpressionKeys(info.keys2, op2.getType());
		PythonCoGroup pcg = new PythonCoGroup(info.setID, info.types);
		sets.put(info.setID, new CoGroupRawOperator(op1, op2, key1, key2, pcg, info.types, info.name).setParallelism(getParallelism(info)));
	}

	@SuppressWarnings("unchecked")
	private void createCrossOperation(DatasizeHint mode, PythonOperationInfo info) {
		DataSet op1 = (DataSet) sets.get(info.parentID);
		DataSet op2 = (DataSet) sets.get(info.otherID);

		DefaultCross defaultResult;
		switch (mode) {
			case NONE:
				defaultResult = op1.cross(op2);
				break;
			case HUGE:
				defaultResult = op1.crossWithHuge(op2);
				break;
			case TINY:
				defaultResult = op1.crossWithTiny(op2);
				break;
			default:
				throw new IllegalArgumentException("Invalid Cross mode specified: " + mode);
		}

		defaultResult.setParallelism(getParallelism(info));
		if (info.usesUDF) {
			sets.put(info.setID, defaultResult.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name));
		} else {
			sets.put(info.setID, defaultResult.name("DefaultCross"));
		}
	}

	@SuppressWarnings("unchecked")
	private void createFilterOperation(PythonOperationInfo info) {
		DataSet op1 = (DataSet) sets.get(info.parentID);
		sets.put(info.setID, op1.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name));
	}

	@SuppressWarnings("unchecked")
	private void createFlatMapOperation(PythonOperationInfo info) {
		DataSet op1 = (DataSet) sets.get(info.parentID);
		sets.put(info.setID, op1.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name));
	}

	private void createGroupReduceOperation(PythonOperationInfo info) {
		Object op1 = sets.get(info.parentID);
		if (op1 instanceof DataSet) {
			sets.put(info.setID, applyGroupReduceOperation((DataSet) op1, info));
			return;
		}
		if (op1 instanceof UnsortedGrouping) {
			sets.put(info.setID, applyGroupReduceOperation((UnsortedGrouping) op1, info));
			return;
		}
		if (op1 instanceof SortedGrouping) {
			sets.put(info.setID, applyGroupReduceOperation((SortedGrouping) op1, info));
		}
	}

	@SuppressWarnings("unchecked")
	private DataSet applyGroupReduceOperation(DataSet op1, PythonOperationInfo info) {
		return op1.reduceGroup(new IdentityGroupReduce()).setCombinable(false).name("PythonGroupReducePreStep").setParallelism(getParallelism(info))
				.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name);
	}

	@SuppressWarnings("unchecked")
	private DataSet applyGroupReduceOperation(UnsortedGrouping op1, PythonOperationInfo info) {
		return op1.reduceGroup(new IdentityGroupReduce()).setCombinable(false).setParallelism(getParallelism(info)).name("PythonGroupReducePreStep")
				.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name);
	}

	@SuppressWarnings("unchecked")
	private DataSet applyGroupReduceOperation(SortedGrouping op1, PythonOperationInfo info) {
		return op1.reduceGroup(new IdentityGroupReduce()).setCombinable(false).setParallelism(getParallelism(info)).name("PythonGroupReducePreStep")
				.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name);
	}

	@SuppressWarnings("unchecked")
	private void createJoinOperation(DatasizeHint mode, PythonOperationInfo info) {
		DataSet op1 = (DataSet) sets.get(info.parentID);
		DataSet op2 = (DataSet) sets.get(info.otherID);

		if (info.usesUDF) {
			sets.put(info.setID, createDefaultJoin(op1, op2, info.keys1, info.keys2, mode, getParallelism(info))
					.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name));
		} else {
			sets.put(info.setID, createDefaultJoin(op1, op2, info.keys1, info.keys2, mode, getParallelism(info)));
		}
	}

	@SuppressWarnings("unchecked")
	private DataSet createDefaultJoin(DataSet op1, DataSet op2, String[] firstKeys, String[] secondKeys, DatasizeHint mode, int parallelism) {
		switch (mode) {
			case NONE:
				return op1.join(op2).where(firstKeys).equalTo(secondKeys).setParallelism(parallelism)
					.map(new NestedKeyDiscarder()).setParallelism(parallelism).name("DefaultJoinPostStep");
			case HUGE:
				return op1.joinWithHuge(op2).where(firstKeys).equalTo(secondKeys).setParallelism(parallelism)
					.map(new NestedKeyDiscarder()).setParallelism(parallelism).name("DefaultJoinPostStep");
			case TINY:
				return op1.joinWithTiny(op2).where(firstKeys).equalTo(secondKeys).setParallelism(parallelism)
					.map(new NestedKeyDiscarder()).setParallelism(parallelism).name("DefaultJoinPostStep");
			default:
				throw new IllegalArgumentException("Invalid join mode specified.");
		}
	}

	@SuppressWarnings("unchecked")
	private void createMapOperation(PythonOperationInfo info) {
		DataSet op1 = (DataSet) sets.get(info.parentID);
		sets.put(info.setID, op1.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name));
	}

	@SuppressWarnings("unchecked")
	private void createMapPartitionOperation(PythonOperationInfo info) {
		DataSet op1 = (DataSet) sets.get(info.parentID);
		sets.put(info.setID, op1.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name));
	}

	private void createReduceOperation(PythonOperationInfo info) {
		Object op1 = sets.get(info.parentID);
		if (op1 instanceof DataSet) {
			sets.put(info.setID, applyReduceOperation((DataSet) op1, info));
			return;
		}
		if (op1 instanceof UnsortedGrouping) {
			sets.put(info.setID, applyReduceOperation((UnsortedGrouping) op1, info));
		}
	}

	@SuppressWarnings("unchecked")
	private DataSet applyReduceOperation(DataSet op1, PythonOperationInfo info) {
		return op1.reduceGroup(new IdentityGroupReduce()).setCombinable(false).setParallelism(getParallelism(info)).name("PythonReducePreStep")
				.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name);
	}

	@SuppressWarnings("unchecked")
	private DataSet applyReduceOperation(UnsortedGrouping op1, PythonOperationInfo info) {
		return op1.reduceGroup(new IdentityGroupReduce()).setCombinable(false).setParallelism(getParallelism(info)).name("PythonReducePreStep")
				.mapPartition(new PythonMapPartition(info.setID, info.types)).setParallelism(getParallelism(info)).name(info.name);
	}
}
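The binder above only translates operation descriptions received from the Python process into regular DataSet transformations. As a reference point, a hand-written Java job corresponding to a SOURCE_TEXT → MAP → SINK_TEXT sequence might look like the sketch below; the paths are placeholders, and the MapFunction stands in for the Python UDF that would normally run inside PythonMapPartition:

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;

public class EquivalentJavaPlan {
	public static void main(String[] args) throws Exception {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(1); // mirrors the DOP parameter received in receiveParameters()

		DataSet<String> lines = env.readTextFile("file:///tmp/input.txt");    // placeholder path
		lines.map(new MapFunction<String, String>() {
				@Override
				public String map(String value) {
					return value.toUpperCase();
				}
			})
			.writeAsText("file:///tmp/output.txt");                           // placeholder path

		env.execute("equivalent-java-plan");
	}
}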

999

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.java;

import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.annotation.Public;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.Plan;
import org.apache.flink.api.common.PlanExecutor;
import org.apache.flink.configuration.Configuration;

import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * An {@link ExecutionEnvironment} that sends programs to a cluster for execution. The environment
 * needs to be created with the address and port of the JobManager of the Flink cluster that
 * should execute the programs.
 * 
 * 

Many programs executed via the remote environment depend on additional classes. Such classes * may be the classes of functions (transformation, aggregation, ...) or libraries. Those classes * must be attached to the remote environment as JAR files, to allow the environment to ship the * classes into the cluster for the distributed execution. */ @Public public class RemoteEnvironment extends ExecutionEnvironment { /** The hostname of the JobManager */ protected final String host; /** The port of the JobManager main actor system */ protected final int port; /** The jar files that need to be attached to each job */ protected final List jarFiles; /** The configuration used by the client that connects to the cluster */ protected Configuration clientConfiguration; /** The remote executor lazily created upon first use */ protected PlanExecutor executor; /** Optional shutdown hook, used in session mode to eagerly terminate the last session */ private Thread shutdownHook; /** The classpaths that need to be attached to each job */ protected final List globalClasspaths; /** * Creates a new RemoteEnvironment that points to the master (JobManager) described by the * given host name and port. * *

Each program execution will have all the given JAR files in its classpath. * * @param host The host name or address of the master (JobManager), where the program should be executed. * @param port The port of the master (JobManager), where the program should be executed. * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses * user-defined functions, user-defined input formats, or any libraries, those must be * provided in the JAR files. */ public RemoteEnvironment(String host, int port, String... jarFiles) { this(host, port, null, jarFiles, null); } /** * Creates a new RemoteEnvironment that points to the master (JobManager) described by the * given host name and port. * *

Each program execution will have all the given JAR files in its classpath. * * @param host The host name or address of the master (JobManager), where the program should be executed. * @param port The port of the master (JobManager), where the program should be executed. * @param clientConfig The configuration used by the client that connects to the cluster. * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses * user-defined functions, user-defined input formats, or any libraries, those must be * provided in the JAR files. */ public RemoteEnvironment(String host, int port, Configuration clientConfig, String[] jarFiles) { this(host, port, clientConfig, jarFiles, null); } /** * Creates a new RemoteEnvironment that points to the master (JobManager) described by the * given host name and port. * *

Each program execution will have all the given JAR files in its classpath. * * @param host The host name or address of the master (JobManager), where the program should be executed. * @param port The port of the master (JobManager), where the program should be executed. * @param clientConfig The configuration used by the client that connects to the cluster. * @param jarFiles The JAR files with code that needs to be shipped to the cluster. If the program uses * user-defined functions, user-defined input formats, or any libraries, those must be * provided in the JAR files. * @param globalClasspaths The paths of directories and JAR files that are added to each user code * classloader on all nodes in the cluster. Note that the paths must specify a * protocol (e.g. file://) and be accessible on all nodes (e.g. by means of a NFS share). * The protocol must be supported by the {@link java.net.URLClassLoader}. */ public RemoteEnvironment(String host, int port, Configuration clientConfig, String[] jarFiles, URL[] globalClasspaths) { if (!ExecutionEnvironment.areExplicitEnvironmentsAllowed()) { throw new InvalidProgramException( "The RemoteEnvironment cannot be instantiated when running in a pre-defined context " + "(such as Command Line Client, Scala Shell, or TestEnvironment)"); } if (host == null) { throw new NullPointerException("Host must not be null."); } if (port < 1 || port >= 0xffff) { throw new IllegalArgumentException("Port out of range"); } this.host = host; this.port = port; this.clientConfiguration = clientConfig == null ? new Configuration() : clientConfig; if (jarFiles != null) { this.jarFiles = new ArrayList<>(jarFiles.length); for (String jarFile : jarFiles) { try { this.jarFiles.add(new File(jarFile).getAbsoluteFile().toURI().toURL()); } catch (MalformedURLException e) { throw new IllegalArgumentException("JAR file path invalid", e); } } } else { this.jarFiles = Collections.emptyList(); } if (globalClasspaths == null) { this.globalClasspaths = Collections.emptyList(); } else { this.globalClasspaths = Arrays.asList(globalClasspaths); } } // ------------------------------------------------------------------------ @Override public JobExecutionResult execute(String jobName) throws Exception { PlanExecutor executor = getExecutor(); Plan p = createProgramPlan(jobName); // Session management is disabled, revert this commit to enable //p.setJobId(jobID); //p.setSessionTimeout(sessionTimeout); JobExecutionResult result = executor.executePlan(p); this.lastJobExecutionResult = result; return result; } @Override public String getExecutionPlan() throws Exception { Plan p = createProgramPlan("plan", false); // make sure that we do not start an new executor here // if one runs, fine, of not, we create a local executor (lightweight) and let it // generate the plan if (executor != null) { return executor.getOptimizerPlanAsJSON(p); } else { PlanExecutor le = PlanExecutor.createLocalExecutor(null); String plan = le.getOptimizerPlanAsJSON(p); le.stop(); return plan; } } @Override @PublicEvolving public void startNewSession() throws Exception { dispose(); jobID = JobID.generate(); installShutdownHook(); } protected PlanExecutor getExecutor() throws Exception { if (executor == null) { executor = PlanExecutor.createRemoteExecutor(host, port, clientConfiguration, jarFiles, globalClasspaths); executor.setPrintStatusDuringExecution(getConfig().isSysoutLoggingEnabled()); } // if we are using sessions, we keep the executor running if (getSessionTimeout() > 0 && !executor.isRunning()) { executor.start(); 
installShutdownHook(); } return executor; } // ------------------------------------------------------------------------ // Dispose // ------------------------------------------------------------------------ protected void dispose() { // Remove shutdown hook to prevent resource leaks, unless this is invoked by the // shutdown hook itself if (shutdownHook != null && shutdownHook != Thread.currentThread()) { try { Runtime.getRuntime().removeShutdownHook(shutdownHook); } catch (IllegalStateException e) { // race, JVM is in shutdown already, we can safely ignore this } catch (Throwable t) { LOG.warn("Exception while unregistering the cleanup shutdown hook."); } } try { PlanExecutor executor = this.executor; if (executor != null) { executor.endSession(jobID); executor.stop(); } } catch (Exception e) { throw new RuntimeException("Failed to dispose the session shutdown hook."); } } @Override public String toString() { return "Remote Environment (" + this.host + ":" + this.port + " - parallelism = " + (getParallelism() == -1 ? "default" : getParallelism()) + ") : " + getIdString(); } // ------------------------------------------------------------------------ // Shutdown hooks and reapers // ------------------------------------------------------------------------ private void installShutdownHook() { if (shutdownHook == null) { Thread shutdownHook = new Thread(new Runnable() { @Override public void run() { try { dispose(); } catch (Throwable t) { LOG.error("Error in cleanup of RemoteEnvironment during JVM shutdown: " + t.getMessage(), t); } } }); try { // Add JVM shutdown hook to call shutdown of service Runtime.getRuntime().addShutdownHook(shutdownHook); this.shutdownHook = shutdownHook; } catch (IllegalStateException e) { // JVM is already shutting down. no need or a shutdown hook } catch (Throwable t) { LOG.error("Cannot register shutdown hook that cleanly terminates the BLOB service."); } } } }
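In user code the RemoteEnvironment is normally obtained through the ExecutionEnvironment.createRemoteEnvironment factory rather than constructed directly. A minimal sketch with placeholder host, port (6123 is the default JobManager RPC port) and jar path:

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;

public class RemoteSubmitSketch {
	public static void main(String[] args) throws Exception {
		// Placeholder host, port and jar; the jar must contain the user-defined functions.
		ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment(
				"jobmanager-host", 6123, "/path/to/user-job.jar");

		DataSet<String> words = env.fromElements("to", "be", "or", "not", "to", "be");
		long distinctWords = words.distinct().count();  // count() triggers remote execution

		System.out.println("distinct words: " + distinctWords);
	}
}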



101010

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.common;

import org.apache.flink.annotation.Internal;
import org.apache.flink.configuration.Configuration;

import java.net.URL;
import java.util.Collections;
import java.util.List;

/**
 * A PlanExecutor executes a Flink program's dataflow plan. All Flink programs are translated to
 * dataflow plans prior to execution.
 *
 * <p>The specific implementation (such as the org.apache.flink.client.LocalExecutor
 * and org.apache.flink.client.RemoteExecutor) determines where and how to run the dataflow.
 * The concrete implementations of the executors are loaded dynamically, because they depend on
 * the full set of all runtime classes.
 *
 * <p>PlanExecutors can be started explicitly, in which case they keep running until stopped. If
 * a program is submitted to a plan executor that is not running, it will start up for that
 * program, and shut down afterwards.
*/ @Internal public abstract class PlanExecutor { private static final String LOCAL_EXECUTOR_CLASS = "org.apache.flink.client.LocalExecutor"; private static final String REMOTE_EXECUTOR_CLASS = "org.apache.flink.client.RemoteExecutor"; // ------------------------------------------------------------------------ // Config Options // ------------------------------------------------------------------------ /** If true, all execution progress updates are not only logged, but also printed to System.out */ private boolean printUpdatesToSysout = true; /** * Sets whether the executor should print progress results to "standard out" ({@link System#out}). * All progress messages are logged using the configured logging framework independent of the value * set here. * * @param printStatus True, to print progress updates to standard out, false to not do that. */ public void setPrintStatusDuringExecution(boolean printStatus) { this.printUpdatesToSysout = printStatus; } /** * Gets whether the executor prints progress results to "standard out" ({@link System#out}). * * @return True, if the executor prints progress messages to standard out, false if not. */ public boolean isPrintingStatusDuringExecution() { return this.printUpdatesToSysout; } // ------------------------------------------------------------------------ // Startup & Shutdown // ------------------------------------------------------------------------ /** * Starts the program executor. After the executor has been started, it will keep * running until {@link #stop()} is called. * * @throws Exception Thrown, if the executor startup failed. */ public abstract void start() throws Exception; /** * Shuts down the plan executor and releases all local resources. * *

 * <p>This method also ends all sessions created by this executor. Remote job executions
 * may complete, but the session is not kept alive after that.

* * @throws Exception Thrown, if the proper shutdown failed. */ public abstract void stop() throws Exception; /** * Checks if this executor is currently running. * * @return True is the executor is running, false otherwise. */ public abstract boolean isRunning(); // ------------------------------------------------------------------------ // Program Execution // ------------------------------------------------------------------------ /** * Execute the given program. * *

 * <p>If the executor has not been started before, then this method will start the
 * executor and stop it after the execution has completed. This implies that one needs
 * to explicitly start the executor for all programs where multiple dataflow parts
 * depend on each other. Otherwise, the previous parts will no longer
 * be available, because the executor immediately shut down after the execution.

* * @param plan The plan of the program to execute. * @return The execution result, containing for example the net runtime of the program, and the accumulators. * * @throws Exception Thrown, if job submission caused an exception. */ public abstract JobExecutionResult executePlan(Plan plan) throws Exception; /** * Gets the programs execution plan in a JSON format. * * @param plan The program to get the execution plan for. * @return The execution plan, as a JSON string. * * @throws Exception Thrown, if the executor could not connect to the compiler. */ public abstract String getOptimizerPlanAsJSON(Plan plan) throws Exception; /** * Ends the job session, identified by the given JobID. Jobs can be kept around as sessions, * if a session timeout is specified. Keeping Jobs as sessions allows users to incrementally * add new operations to their dataflow, that refer to previous intermediate results of the * dataflow. * * @param jobID The JobID identifying the job session. * @throws Exception Thrown, if the message to finish the session cannot be delivered. */ public abstract void endSession(JobID jobID) throws Exception; // ------------------------------------------------------------------------ // Executor Factories // ------------------------------------------------------------------------ /** * Creates an executor that runs the plan locally in a multi-threaded environment. * * @return A local executor. */ public static PlanExecutor createLocalExecutor(Configuration configuration) { Class leClass = loadExecutorClass(LOCAL_EXECUTOR_CLASS); try { return leClass.getConstructor(Configuration.class).newInstance(configuration); } catch (Throwable t) { throw new RuntimeException("An error occurred while loading the local executor (" + LOCAL_EXECUTOR_CLASS + ").", t); } } /** * Creates an executor that runs the plan on a remote environment. The remote executor is typically used * to send the program to a cluster for execution. * * @param hostname The address of the JobManager to send the program to. * @param port The port of the JobManager to send the program to. * @param clientConfiguration The configuration for the client (Akka, default.parallelism). * @param jarFiles A list of jar files that contain the user-defined function (UDF) classes and all classes used * from within the UDFs. * @param globalClasspaths A list of URLs that are added to the classpath of each user code classloader of the * program. Paths must specify a protocol (e.g. file://) and be accessible on all nodes. * @return A remote executor. */ public static PlanExecutor createRemoteExecutor(String hostname, int port, Configuration clientConfiguration, List jarFiles, List globalClasspaths) { if (hostname == null) { throw new IllegalArgumentException("The hostname must not be null."); } if (port <= 0 || port > 0xffff) { throw new IllegalArgumentException("The port value is out of range."); } Class reClass = loadExecutorClass(REMOTE_EXECUTOR_CLASS); List files = (jarFiles == null) ? Collections.emptyList() : jarFiles; List paths = (globalClasspaths == null) ? Collections.emptyList() : globalClasspaths; try { PlanExecutor executor = (clientConfiguration == null) ? 
reClass.getConstructor(String.class, int.class, List.class) .newInstance(hostname, port, files) : reClass.getConstructor(String.class, int.class, Configuration.class, List.class, List.class) .newInstance(hostname, port, clientConfiguration, files, paths); return executor; } catch (Throwable t) { throw new RuntimeException("An error occurred while loading the remote executor (" + REMOTE_EXECUTOR_CLASS + ").", t); } } private static Class loadExecutorClass(String className) { try { Class leClass = Class.forName(className); return leClass.asSubclass(PlanExecutor.class); } catch (ClassNotFoundException cnfe) { throw new RuntimeException("Could not load the executor class (" + className + "). Do you have the 'flink-clients' project in your dependencies?"); } catch (Throwable t) { throw new RuntimeException("An error occurred while loading the executor (" + className + ").", t); } } }
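loadExecutorClass is a plain reflective factory: resolve the class by name, narrow it with asSubclass, then instantiate it through a constructor lookup. A standalone toy version of the same pattern (all names here are made up for illustration):

public class ReflectiveFactorySketch {

	public interface Executor {
		void run();
	}

	public static class PrintingExecutor implements Executor {
		@Override
		public void run() {
			System.out.println("running");
		}
	}

	static Executor load(String className) {
		try {
			// Same shape as loadExecutorClass(): Class.forName + asSubclass, then instantiate.
			Class<? extends Executor> clazz = Class.forName(className).asSubclass(Executor.class);
			return clazz.getConstructor().newInstance();
		} catch (ClassNotFoundException e) {
			throw new RuntimeException("Could not load executor class " + className
					+ ". Is the implementing module on the classpath?", e);
		} catch (ReflectiveOperationException e) {
			throw new RuntimeException("Could not instantiate " + className, e);
		}
	}

	public static void main(String[] args) {
		load("ReflectiveFactorySketch$PrintingExecutor").run();
	}
}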


111111

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.client;

import java.net.InetSocketAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collections;
import java.util.List;

import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.Plan;
import org.apache.flink.api.common.PlanExecutor;
import org.apache.flink.client.program.Client;
import org.apache.flink.client.program.JobWithJars;
import org.apache.flink.optimizer.DataStatistics;
import org.apache.flink.optimizer.Optimizer;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.optimizer.costs.DefaultCostEstimator;
import org.apache.flink.optimizer.plan.OptimizedPlan;
import org.apache.flink.optimizer.plandump.PlanJSONDumpGenerator;
import org.apache.flink.configuration.Configuration;

/**
 * The RemoteExecutor is a {@link org.apache.flink.api.common.PlanExecutor} that takes the program
 * and ships it to a remote Flink cluster for execution.
 *
 * <p>The RemoteExecutor is pointed at the JobManager and gets the program and (if necessary) the
 * set of libraries that need to be shipped together with the program.
 *
 * <p>The RemoteExecutor is used in the {@link org.apache.flink.api.java.RemoteEnvironment} to
 * remotely execute program parts.
*/ public class RemoteExecutor extends PlanExecutor { private final Object lock = new Object(); private final List jarFiles; private final List globalClasspaths; private final Configuration clientConfiguration; private Client client; private int defaultParallelism = 1; public RemoteExecutor(String hostname, int port) { this(hostname, port, new Configuration(), Collections.emptyList(), Collections.emptyList()); } public RemoteExecutor(String hostname, int port, URL jarFile) { this(hostname, port, new Configuration(), Collections.singletonList(jarFile), Collections.emptyList()); } public RemoteExecutor(String hostport, URL jarFile) { this(getInetFromHostport(hostport), new Configuration(), Collections.singletonList(jarFile), Collections.emptyList()); } public RemoteExecutor(String hostname, int port, List jarFiles) { this(new InetSocketAddress(hostname, port), new Configuration(), jarFiles, Collections.emptyList()); } public RemoteExecutor(String hostname, int port, Configuration clientConfiguration) { this(hostname, port, clientConfiguration, Collections.emptyList(), Collections.emptyList()); } public RemoteExecutor(String hostname, int port, Configuration clientConfiguration, URL jarFile) { this(hostname, port, clientConfiguration, Collections.singletonList(jarFile), Collections.emptyList()); } public RemoteExecutor(String hostport, Configuration clientConfiguration, URL jarFile) { this(getInetFromHostport(hostport), clientConfiguration, Collections.singletonList(jarFile), Collections.emptyList()); } public RemoteExecutor(String hostname, int port, Configuration clientConfiguration, List jarFiles, List globalClasspaths) { this(new InetSocketAddress(hostname, port), clientConfiguration, jarFiles, globalClasspaths); } public RemoteExecutor(InetSocketAddress inet, Configuration clientConfiguration, List jarFiles, List globalClasspaths) { this.clientConfiguration = clientConfiguration; this.jarFiles = jarFiles; this.globalClasspaths = globalClasspaths; clientConfiguration.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, inet.getHostName()); clientConfiguration.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, inet.getPort()); } // ------------------------------------------------------------------------ // Properties // ------------------------------------------------------------------------ /** * Sets the parallelism that will be used when neither the program does not define * any parallelism at all. * * @param defaultParallelism The default parallelism for the executor. */ public void setDefaultParallelism(int defaultParallelism) { if (defaultParallelism < 1) { throw new IllegalArgumentException("The default parallelism must be at least one"); } this.defaultParallelism = defaultParallelism; } /** * Gets the parallelism that will be used when neither the program does not define * any parallelism at all. * * @return The default parallelism for the executor. 
*/ public int getDefaultParallelism() { return defaultParallelism; } // ------------------------------------------------------------------------ // Startup & Shutdown // ------------------------------------------------------------------------ @Override public void start() throws Exception { synchronized (lock) { if (client == null) { client = new Client(clientConfiguration); client.setPrintStatusDuringExecution(isPrintingStatusDuringExecution()); } else { throw new IllegalStateException("The remote executor was already started."); } } } @Override public void stop() throws Exception { synchronized (lock) { if (client != null) { client.shutdown(); client = null; } } } @Override public boolean isRunning() { return client != null; } // ------------------------------------------------------------------------ // Executing programs // ------------------------------------------------------------------------ @Override public JobExecutionResult executePlan(Plan plan) throws Exception { if (plan == null) { throw new IllegalArgumentException("The plan may not be null."); } JobWithJars p = new JobWithJars(plan, this.jarFiles, this.globalClasspaths); return executePlanWithJars(p); } public JobExecutionResult executePlanWithJars(JobWithJars program) throws Exception { if (program == null) { throw new IllegalArgumentException("The job may not be null."); } synchronized (this.lock) { // check if we start a session dedicated for this execution final boolean shutDownAtEnd; if (client == null) { shutDownAtEnd = true; // start the executor for us start(); } else { // we use the existing session shutDownAtEnd = false; } try { return client.runBlocking(program, defaultParallelism); } finally { if (shutDownAtEnd) { stop(); } } } } @Override public String getOptimizerPlanAsJSON(Plan plan) throws Exception { Optimizer opt = new Optimizer(new DataStatistics(), new DefaultCostEstimator(), new Configuration()); OptimizedPlan optPlan = opt.compile(plan); return new PlanJSONDumpGenerator().getOptimizerPlanAsJSON(optPlan); } @Override public void endSession(JobID jobID) throws Exception { if (jobID == null) { throw new NullPointerException("The supplied jobID must not be null."); } synchronized (this.lock) { // check if we start a session dedicated for this execution final boolean shutDownAtEnd; if (client == null) { shutDownAtEnd = true; // start the executor for us start(); } else { // we use the existing session shutDownAtEnd = false; } try { client.endSession(jobID); } finally { if (shutDownAtEnd) { stop(); } } } } // -------------------------------------------------------------------------------------------- // Utilities // -------------------------------------------------------------------------------------------- /** * Utility method that converts a string of the form "host:port" into an {@link InetSocketAddress}. * The returned InetSocketAddress may be unresolved! * * @param hostport The "host:port" string. * @return The converted InetSocketAddress. 
*/ private static InetSocketAddress getInetFromHostport(String hostport) { // from http://stackoverflow.com/questions/2345063/java-common-way-to-validate-and-convert-hostport-to-inetsocketaddress URI uri; try { uri = new URI("my://" + hostport); } catch (URISyntaxException e) { throw new RuntimeException("Could not identify hostname and port in '" + hostport + "'.", e); } String host = uri.getHost(); int port = uri.getPort(); if (host == null || port == -1) { throw new RuntimeException("Could not identify hostname and port in '" + hostport + "'."); } return new InetSocketAddress(host, port); } }
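
For reference, a minimal usage sketch (editor's example, not part of the Flink sources) of how a user program reaches the RemoteExecutor: ExecutionEnvironment.createRemoteEnvironment creates a RemoteEnvironment, which internally builds a RemoteExecutor pointed at the JobManager and ships the given jar together with the plan. The host name, port, jar path, and output path below are placeholder values.

// Sketch only: submits a trivial batch program to a remote JobManager through
// the RemoteEnvironment, which delegates to the RemoteExecutor shown above.
// "jobmanager-host", 6123 and the paths are placeholders, not real values.
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;

public class RemoteSubmitSketch {

    public static void main(String[] args) throws Exception {
        // Creates a RemoteEnvironment; executing it goes through the RemoteExecutor.
        ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment(
                "jobmanager-host", 6123, "/path/to/program.jar");

        DataSet<String> text = env.fromElements("to be", "or not to be");
        text.writeAsText("/tmp/remote-executor-sketch");

        // execute() builds the Plan, ships it (plus the jar) to the JobManager,
        // and blocks until the remote job finishes.
        env.execute("RemoteExecutor sketch");
    }
}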



/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.jobmanager

import java.io.{File, IOException}
import java.net.{BindException, ServerSocket, UnknownHostException, InetAddress, InetSocketAddress}
import java.util.UUID
import java.util.concurrent.{TimeUnit, ExecutorService}

import akka.actor.Status.Failure
import akka.actor._
import akka.pattern.ask

import grizzled.slf4j.Logger

import org.apache.flink.api.common.{ExecutionConfig, JobID}
import org.apache.flink.configuration.{ConfigConstants, Configuration, GlobalConfiguration}
import org.apache.flink.core.fs.FileSystem
import org.apache.flink.core.io.InputSplitAssigner
import org.apache.flink.runtime.accumulators.AccumulatorSnapshot
import org.apache.flink.runtime.akka.{AkkaUtils, ListeningBehaviour}
import org.apache.flink.runtime.blob.BlobServer
import org.apache.flink.runtime.checkpoint._
import org.apache.flink.runtime.checkpoint.stats.{SimpleCheckpointStatsTracker, DisabledCheckpointStatsTracker, CheckpointStatsTracker}
import org.apache.flink.runtime.client._
import org.apache.flink.runtime.execution.UnrecoverableException
import org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager
import org.apache.flink.runtime.executiongraph.restart.{RestartStrategy, RestartStrategyFactory}
import org.apache.flink.runtime.executiongraph.{ExecutionGraph, ExecutionJobVertex}
import org.apache.flink.runtime.instance.{AkkaActorGateway, InstanceManager}
import org.apache.flink.runtime.jobgraph.jsonplan.JsonPlanGenerator
import org.apache.flink.runtime.jobgraph.{JobGraph, JobStatus, JobVertexID}
import org.apache.flink.runtime.jobmanager.SubmittedJobGraphStore.SubmittedJobGraphListener
import org.apache.flink.runtime.jobmanager.scheduler.{Scheduler => FlinkScheduler}
import org.apache.flink.runtime.leaderelection.{LeaderContender, LeaderElectionService, StandaloneLeaderElectionService}
import org.apache.flink.runtime.messages.ArchiveMessages.ArchiveExecutionGraph
import org.apache.flink.runtime.messages.ExecutionGraphMessages.JobStatusChanged
import org.apache.flink.runtime.messages.JobManagerMessages._
import org.apache.flink.runtime.messages.Messages.{Acknowledge, Disconnect}
import org.apache.flink.runtime.messages.RegistrationMessages._
import org.apache.flink.runtime.messages.TaskManagerMessages.{Heartbeat, SendStackTrace}
import org.apache.flink.runtime.messages.TaskMessages.{PartitionState, UpdateTaskExecutionState}
import org.apache.flink.runtime.messages.accumulators.{AccumulatorMessage, AccumulatorResultStringsFound, AccumulatorResultsErroneous, AccumulatorResultsFound, RequestAccumulatorResults, RequestAccumulatorResultsStringified}
import org.apache.flink.runtime.messages.checkpoint.{DeclineCheckpoint, AbstractCheckpointMessage, AcknowledgeCheckpoint}
import org.apache.flink.runtime.messages.webmonitor._
import org.apache.flink.runtime.process.ProcessReaper
import org.apache.flink.runtime.security.SecurityUtils
import org.apache.flink.runtime.security.SecurityUtils.FlinkSecuredRunner
import org.apache.flink.runtime.taskmanager.TaskManager
import org.apache.flink.runtime.util._
import org.apache.flink.runtime.webmonitor.{WebMonitor, WebMonitorUtils}
import org.apache.flink.runtime.{FlinkActor, LeaderSessionMessageFilter, LogMessages}
import org.apache.flink.util.{ExceptionUtils, InstantiationUtil, NetUtils}

import org.jboss.netty.channel.ChannelException

import scala.annotation.tailrec
import scala.collection.JavaConverters._
import scala.concurrent._
import scala.concurrent.duration._
import scala.concurrent.forkjoin.ForkJoinPool
import scala.language.postfixOps

/**
 * The job manager is responsible for receiving Flink jobs, scheduling the tasks, gathering the
 * job status and managing the task managers. It is realized as an actor and receives amongst others
 * the following messages:
 *
 *  - [[RegisterTaskManager]] is sent by a TaskManager which wants to register at the job manager.
 *  A successful registration at the instance manager is acknowledged by [[AcknowledgeRegistration]]
 *
 *  - [[SubmitJob]] is sent by a client which wants to submit a job to the system. The submit
 *  message contains the job description in the form of the JobGraph. The JobGraph is appended to
 *  the ExecutionGraph and the corresponding ExecutionJobVertices are scheduled for execution on
 *  the TaskManagers.
 *
 *  - [[CancelJob]] requests to cancel the job with the specified jobID. A successful cancellation
 *  is indicated by [[CancellationSuccess]] and a failure by [[CancellationFailure]]
 *
 * - [[UpdateTaskExecutionState]] is sent by a TaskManager to update the state of an
 * ExecutionVertex contained in the [[ExecutionGraph]].
 * A successful update is acknowledged by true and otherwise false.
 *
 * - [[RequestNextInputSplit]] requests the next input split for a running task on a
 * [[TaskManager]]. The assigned input split or null is sent to the sender in the form of the
 * message [[NextInputSplit]].
 *
 * - [[JobStatusChanged]] indicates that the status of job (RUNNING, CANCELING, FINISHED, etc.) has
 * changed. This message is sent by the ExecutionGraph.
 */
class JobManager(
    protected val flinkConfiguration: Configuration,
    protected val executorService: ExecutorService,
    protected val instanceManager: InstanceManager,
    protected val scheduler: FlinkScheduler,
    protected val libraryCacheManager: BlobLibraryCacheManager,
    protected val archive: ActorRef,
    protected val defaultRestartStrategy: RestartStrategy,
    protected val timeout: FiniteDuration,
    protected val leaderElectionService: LeaderElectionService,
    protected val submittedJobGraphs : SubmittedJobGraphStore,
    protected val checkpointRecoveryFactory : CheckpointRecoveryFactory,
    protected val savepointStore: SavepointStore,
    protected val jobRecoveryTimeout: FiniteDuration)
  extends FlinkActor
  with LeaderSessionMessageFilter // mixin order is important, we want filtering after logging
  with LogMessages // mixin order is important, we want first logging
  with LeaderContender
  with SubmittedJobGraphListener {

  override val log = Logger(getClass)

  /** The extra execution context, for futures, with a custom logging reporter */
  protected val executionContext: ExecutionContext = ExecutionContext.fromExecutor(
    executorService,
    (t: Throwable) => {
      if (!context.system.isTerminated) {
        log.error("Executor could not execute task", t)
      }
    })

  /** Either running or not yet archived jobs (session hasn't been ended). */
  protected val currentJobs = scala.collection.mutable.HashMap[JobID, (ExecutionGraph, JobInfo)]()

  protected val recoveryMode = RecoveryMode.fromConfig(flinkConfiguration)

  var leaderSessionID: Option[UUID] = None

  /** Futures which have to be completed before terminating the job manager */
  var futuresToComplete: Option[Seq[Future[Unit]]] = None

  /**
   * The port of the web monitor as configured. Make sure that it is actually configured before
   * starting the JobManager. This tightly couples the web monitor with the job manager. It is a
   * temporary workaround until all execution graph components are properly serializable and all
   * web monitors can transparently interact with each job manager. Currently each web server has
   * to run in the actor system of the associated job manager.
   */
  val webMonitorPort : Int = flinkConfiguration.getInteger(
    ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, -1)

  /**
   * Run when the job manager is started. Simply logs an informational message.
   * The method also starts the leader election service.
   */
  override def preStart(): Unit = {
    log.info(s"Starting JobManager at $getAddress.")

    try {
      leaderElectionService.start(this)
    } catch {
      case e: Exception =>
        log.error("Could not start the JobManager because the leader election service did not " +
          "start.", e)
        throw new RuntimeException("Could not start the leader election service.", e)
    }

    try {
      submittedJobGraphs.start(this)
    } catch {
      case e: Exception =>
        log.error("Could not start the submitted job graphs service.", e)
        throw new RuntimeException("Could not start the submitted job graphs service.", e)
    }

    try {
      checkpointRecoveryFactory.start()
    } catch {
      case e: Exception =>
        log.error("Could not start the checkpoint recovery service.", e)
        throw new RuntimeException("Could not start the checkpoint recovery service.", e)
    }

    try {
      savepointStore.start()
    } catch {
      case e: Exception =>
        log.error("Could not start the savepoint store.", e)
        throw new RuntimeException("Could not start the savepoint store.", e)
    }
  }

  override def postStop(): Unit = {
    log.info(s"Stopping JobManager $getAddress.")

    val newFuturesToComplete = cancelAndClearEverything(
      new Exception("The JobManager is shutting down."),
      removeJobFromStateBackend = true)

    implicit val executionContext = context.dispatcher

    val futureToComplete = Future.sequence(
      futuresToComplete.getOrElse(Seq()) ++ newFuturesToComplete)

    Await.ready(futureToComplete, timeout)

    // disconnect the registered task managers
    instanceManager.getAllRegisteredInstances.asScala.foreach {
      _.getActorGateway().tell(
        Disconnect("JobManager is shutting down"),
        new AkkaActorGateway(self, leaderSessionID.orNull))
    }

    try {
      savepointStore.stop()
    } catch {
      case e: Exception =>
        log.error("Could not stop the savepoint store.", e)
        throw new RuntimeException("Could not stop the savepoint store.", e)
    }

    try {
      // revoke leadership and stop leader election service
      leaderElectionService.stop()
    } catch {
      case e: Exception => log.error("Could not properly shutdown the leader election service.")
    }

    try {
      submittedJobGraphs.stop()
    } catch {
      case e: Exception => log.error("Could not properly stop the submitted job graphs service.")
    }

    try {
      checkpointRecoveryFactory.stop()
    } catch {
      case e: Exception => log.error("Could not properly stop the checkpoint recovery service.")
    }

    if (archive != ActorRef.noSender) {
      archive ! decorateMessage(PoisonPill)
    }

    instanceManager.shutdown()
    scheduler.shutdown()

    try {
      libraryCacheManager.shutdown()
    } catch {
      case e: IOException => log.error("Could not properly shutdown the library cache manager.", e)
    }

    // shut down the extra thread pool for futures
    executorService.shutdown()

    log.debug(s"Job manager ${self.path} is completely stopped.")
  }

  /**
   * Central work method of the JobManager actor. Receives messages and reacts to them.
   *
   * @return
   */
  override def handleMessage: Receive = {

    case GrantLeadership(newLeaderSessionID) =>
      log.info(s"JobManager $getAddress was granted leadership with leader session ID " +
        s"$newLeaderSessionID.")

      leaderSessionID = newLeaderSessionID

      // confirming the leader session ID might be blocking, thus do it in a future
      future {
        leaderElectionService.confirmLeaderSessionID(newLeaderSessionID.orNull)

        // TODO (critical next step) This needs to be more flexible and robust (e.g. wait for task
        // managers etc.)
        if (recoveryMode != RecoveryMode.STANDALONE) {
          log.info(s"Delaying recovery of all jobs by $jobRecoveryTimeout.")

          context.system.scheduler.scheduleOnce(
            jobRecoveryTimeout,
            self,
            decorateMessage(RecoverAllJobs))(
            context.dispatcher)
        }
      }(context.dispatcher)

    case RevokeLeadership =>
      log.info(s"JobManager ${self.path.toSerializationFormat} was revoked leadership.")

      val newFuturesToComplete = cancelAndClearEverything(
        new Exception("JobManager is no longer the leader."),
        removeJobFromStateBackend = false)

      futuresToComplete = Some(futuresToComplete.getOrElse(Seq()) ++ newFuturesToComplete)

      // disconnect the registered task managers
      instanceManager.getAllRegisteredInstances.asScala.foreach {
        _.getActorGateway().tell(
          Disconnect("JobManager is no longer the leader"),
          new AkkaActorGateway(self, leaderSessionID.orNull))
      }

      instanceManager.unregisterAllTaskManagers()

      leaderSessionID = None

    case RegisterTaskManager(
      connectionInfo,
      hardwareInformation,
      numberOfSlots) =>

      val taskManager = sender()

      if (instanceManager.isRegistered(taskManager)) {
        val instanceID = instanceManager.getRegisteredInstance(taskManager).getId

        // IMPORTANT: Send the response to the "sender", which is not the
        //            TaskManager actor, but the ask future!
        sender() ! decorateMessage(
          AlreadyRegistered(
            instanceID,
            libraryCacheManager.getBlobServerPort)
        )
      }
      else {
        try {
          val instanceID = instanceManager.registerTaskManager(
            taskManager,
            connectionInfo,
            hardwareInformation,
            numberOfSlots,
            leaderSessionID.orNull)

          // IMPORTANT: Send the response to the "sender", which is not the
          //            TaskManager actor, but the ask future!
          sender() ! decorateMessage(
            AcknowledgeRegistration(
              instanceID,
              libraryCacheManager.getBlobServerPort)
          )

          // to be notified when the taskManager is no longer reachable
          context.watch(taskManager)
        }
        catch {
          // registerTaskManager throws an IllegalStateException if it is already shut down
          // let the actor crash and restart itself in this case
          case e: Exception =>
            log.error("Failed to register TaskManager at instance manager", e)

            // IMPORTANT: Send the response to the "sender", which is not the
            //            TaskManager actor, but the ask future!
            sender() ! decorateMessage(
              RefuseRegistration(
                ExceptionUtils.stringifyException(e))
            )
        }
      }

    case RequestNumberRegisteredTaskManager =>
      sender ! decorateMessage(instanceManager.getNumberOfRegisteredTaskManagers)

    case RequestTotalNumberOfSlots =>
      sender ! decorateMessage(instanceManager.getTotalNumberOfSlots)

    case SubmitJob(jobGraph, listeningBehaviour) =>
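      // The JobGraph shipped by the client arrives here. The sender and submission time are
      // captured in a JobInfo; submitJob() below builds the ExecutionGraph and schedules it.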
      val client = sender()

      val jobInfo = new JobInfo(client, listeningBehaviour, System.currentTimeMillis(),
        jobGraph.getSessionTimeout)

      submitJob(jobGraph, jobInfo)

    case RecoverSubmittedJob(submittedJobGraph) =>
      if (!currentJobs.contains(submittedJobGraph.getJobId)) {
        submitJob(
          submittedJobGraph.getJobGraph(),
          submittedJobGraph.getJobInfo(),
          isRecovery = true)
      }
      else {
        log.info(s"Ignoring job recovery for ${submittedJobGraph.getJobId}, " +
          s"because it is already submitted.")
      }

    case RecoverJob(jobId) =>
      future {
        try {
          // The ActorRef, which is part of the submitted job graph can only be
          // de-serialized in the scope of an actor system.
          akka.serialization.JavaSerializer.currentSystem.withValue(
            context.system.asInstanceOf[ExtendedActorSystem]) {

            log.info(s"Attempting to recover job $jobId.")
            val submittedJobGraphOption = submittedJobGraphs.recoverJobGraph(jobId)

            submittedJobGraphOption match {
              case Some(submittedJobGraph) =>
                if (!leaderElectionService.hasLeadership()) {
                  // we've lost leadership. mission: abort.
                  log.warn(s"Lost leadership during recovery. Aborting recovery of $jobId.")
                } else {
                  self ! decorateMessage(RecoverSubmittedJob(submittedJobGraph))
                }

              case None => log.info(s"Attempted to recover job $jobId, but no job graph found.")
            }
          }
        } catch {
          case t: Throwable => log.error(s"Failed to recover job $jobId.", t)
        }
      }(context.dispatcher)

    case RecoverAllJobs =>
      future {
        try {
          // The ActorRef, which is part of the submitted job graph can only be
          // de-serialized in the scope of an actor system.
          akka.serialization.JavaSerializer.currentSystem.withValue(
            context.system.asInstanceOf[ExtendedActorSystem]) {

            log.info(s"Attempting to recover all jobs.")

            val jobGraphs = submittedJobGraphs.recoverJobGraphs().asScala

            if (!leaderElectionService.hasLeadership()) {
              // we've lost leadership. mission: abort.
              log.warn(s"Lost leadership during recovery. Aborting recovery of ${jobGraphs.size} " +
                s"jobs.")
            } else {
              log.info(s"Re-submitting ${jobGraphs.size} job graphs.")

              jobGraphs.foreach{
                submittedJobGraph =>
                  self ! decorateMessage(RecoverSubmittedJob(submittedJobGraph))
              }
            }
          }
        } catch {
          case t: Throwable => log.error("Fatal error: Failed to recover jobs.", t)
        }
      }(context.dispatcher)

    case CancelJob(jobID) =>
      log.info(s"Trying to cancel job with ID $jobID.")

      currentJobs.get(jobID) match {
        case Some((executionGraph, _)) =>
          // execute the cancellation asynchronously
          Future {
            executionGraph.cancel()
          }(context.dispatcher)

          sender ! decorateMessage(CancellationSuccess(jobID))
        case None =>
          log.info(s"No job found with ID $jobID.")
          sender ! decorateMessage(
            CancellationFailure(
              jobID,
              new IllegalArgumentException(s"No job found with ID $jobID."))
          )
      }

    case StopJob(jobID) =>
      log.info(s"Trying to stop job with ID $jobID.")

      currentJobs.get(jobID) match {
        case Some((executionGraph, _)) =>
          try {
            if (!executionGraph.isStoppable()) {
              sender ! decorateMessage(
                StoppingFailure(
                  jobID,
                  new IllegalStateException(s"Job with ID $jobID is not stoppable."))
              )
            } else if (executionGraph.getState() != JobStatus.RUNNING) {
              sender ! decorateMessage(
                StoppingFailure(
                  jobID,
                  new IllegalStateException(s"Job with ID $jobID is in state " +
                    executionGraph.getState().name() + " but stopping is only allowed in state " +
                    "RUNNING."))
              )
            } else {
              executionGraph.stop()
              sender ! decorateMessage(StoppingSuccess(jobID))
            }
          } catch {
            case t: Throwable =>  sender ! decorateMessage(StoppingFailure(jobID, t))
          }
        case None =>
          log.info(s"No job found with ID $jobID.")
          sender ! decorateMessage(
            StoppingFailure(
              jobID,
              new IllegalArgumentException(s"No job found with ID $jobID."))
          )
      }

    case UpdateTaskExecutionState(taskExecutionState) =>
      if (taskExecutionState == null) {
        sender ! decorateMessage(false)
      } else {
        currentJobs.get(taskExecutionState.getJobID) match {
          case Some((executionGraph, _)) =>
            val originalSender = sender()

            Future {
              val result = executionGraph.updateState(taskExecutionState)
              originalSender ! decorateMessage(result)
            }(context.dispatcher)

          case None => log.error("Cannot find execution graph for ID " +
            s"${taskExecutionState.getJobID} to change state to " +
            s"${taskExecutionState.getExecutionState}.")
            sender ! decorateMessage(false)
        }
      }

    case RequestNextInputSplit(jobID, vertexID, executionAttempt) =>
      val serializedInputSplit = currentJobs.get(jobID) match {
        case Some((executionGraph,_)) =>
          val execution = executionGraph.getRegisteredExecutions.get(executionAttempt)

          if (execution == null) {
            log.error(s"Can not find Execution for attempt $executionAttempt.")
            null
          } else {
            val slot = execution.getAssignedResource
            val taskId = execution.getVertex.getParallelSubtaskIndex

            val host = if (slot != null) {
              slot.getInstance().getInstanceConnectionInfo.getHostname
            } else {
              null
            }

            executionGraph.getJobVertex(vertexID) match {
              case vertex: ExecutionJobVertex => vertex.getSplitAssigner match {
                case splitAssigner: InputSplitAssigner =>
                  val nextInputSplit = splitAssigner.getNextInputSplit(host, taskId)

                  log.debug(s"Send next input split $nextInputSplit.")

                  try {
                    InstantiationUtil.serializeObject(nextInputSplit)
                  } catch {
                    case ex: Exception =>
                      log.error(s"Could not serialize the next input split of " +
                        s"class ${nextInputSplit.getClass}.", ex)
                      vertex.fail(new RuntimeException("Could not serialize the next input split " +
                        "of class " + nextInputSplit.getClass + ".", ex))
                      null
                  }

                case _ =>
                  log.error(s"No InputSplitAssigner for vertex ID $vertexID.")
                  null
              }
              case _ =>
                log.error(s"Cannot find execution vertex for vertex ID $vertexID.")
                null
          }
        }
        case None =>
          log.error(s"Cannot find execution graph for job ID $jobID.")
          null
      }

      sender ! decorateMessage(NextInputSplit(serializedInputSplit))

    case checkpointMessage : AbstractCheckpointMessage =>
      handleCheckpointMessage(checkpointMessage)

    case TriggerSavepoint(jobId) =>
      currentJobs.get(jobId) match {
        case Some((graph, _)) =>
          val savepointCoordinator = graph.getSavepointCoordinator()

          if (savepointCoordinator != null) {
            // Immutable copy for the future
            val senderRef = sender()

            future {
              try {
                // Do this async, because checkpoint coordinator operations can
                // contain blocking calls to the state backend or ZooKeeper.
                val savepointFuture = savepointCoordinator.triggerSavepoint(
                  System.currentTimeMillis())

                savepointFuture.onComplete {
                  // Success, respond with the savepoint path
                  case scala.util.Success(savepointPath) =>
                    senderRef ! TriggerSavepointSuccess(jobId, savepointPath)

                  // Failure, respond with the cause
                  case scala.util.Failure(t) =>
                    senderRef ! TriggerSavepointFailure(
                      jobId,
                      new Exception("Failed to complete savepoint", t))
                }(context.dispatcher)
              } catch {
                case e: Exception =>
                  senderRef ! TriggerSavepointFailure(jobId, new Exception(
                    "Failed to trigger savepoint", e))
              }
            }(context.dispatcher)
          } else {
            sender() ! TriggerSavepointFailure(jobId, new IllegalStateException(
              "Checkpointing disabled. You can enable it via the execution environment of " +
                "your job."))
          }

        case None =>
          sender() ! TriggerSavepointFailure(jobId, new IllegalArgumentException("Unknown job."))
      }

    case DisposeSavepoint(savepointPath) =>
      val senderRef = sender()
      future {
        try {
          log.info(s"Disposing savepoint at '$savepointPath'.")

          val savepoint = savepointStore.getState(savepointPath)

          log.debug(s"$savepoint")

          // Discard the associated checkpoint
          savepoint.discard(getClass.getClassLoader)

          // Dispose the savepoint
          savepointStore.disposeState(savepointPath)

          senderRef ! DisposeSavepointSuccess
        } catch {
          case t: Throwable =>
            log.error(s"Failed to dispose savepoint at '$savepointPath'.", t)

            senderRef ! DisposeSavepointFailure(t)
        }
      }(context.dispatcher)

    case JobStatusChanged(jobID, newJobStatus, timeStamp, error) =>
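      // Sent by the ExecutionGraph on every job status change. Once the job reaches a terminal
      // state, the result (JobResultSuccess) or the failure cause is sent back to the waiting
      // client further below.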
      currentJobs.get(jobID) match {
        case Some((executionGraph, jobInfo)) => executionGraph.getJobName

          log.info(
            s"Status of job $jobID (${executionGraph.getJobName}) changed to $newJobStatus.",
            error)

          if (newJobStatus.isTerminalState()) {
            jobInfo.end = timeStamp

            future{
              // TODO If removing the JobGraph from the SubmittedJobGraphsStore fails, the job will
              // linger around and potentially be recovered at a later time. There is nothing we
              // can do about that, but it should be communicated with the Client.
              if (jobInfo.sessionAlive) {
                jobInfo.setLastActive()
                val lastActivity = jobInfo.lastActive
                context.system.scheduler.scheduleOnce(jobInfo.sessionTimeout seconds) {
                  // remove only if no activity occurred in the meantime
                  if (lastActivity == jobInfo.lastActive) {
                    self ! decorateMessage(RemoveJob(jobID, removeJobFromStateBackend = true))
                  }
                }(context.dispatcher)
              } else {
                self ! decorateMessage(RemoveJob(jobID, removeJobFromStateBackend = true))
              }

              // is the client waiting for the job result?
              if (jobInfo.listeningBehaviour != ListeningBehaviour.DETACHED) {
                newJobStatus match {
                  case JobStatus.FINISHED =>
                  try {
                    val accumulatorResults = executionGraph.getAccumulatorsSerialized()
                    val result = new SerializedJobExecutionResult(
                      jobID,
                      jobInfo.duration,
                      accumulatorResults)

                    jobInfo.client ! decorateMessage(JobResultSuccess(result))
                  } catch {
                    case e: Exception =>
                      log.error(s"Cannot fetch final accumulators for job $jobID", e)
                      val exception = new JobExecutionException(jobID,
                        "Failed to retrieve accumulator results.", e)

                      jobInfo.client ! decorateMessage(JobResultFailure(
                        new SerializedThrowable(exception)))
                  }

                  case JobStatus.CANCELED =>
                    // the error may be packed as a serialized throwable
                    val unpackedError = SerializedThrowable.get(
                      error, executionGraph.getUserClassLoader())

                    jobInfo.client ! decorateMessage(JobResultFailure(
                      new SerializedThrowable(
                        new JobCancellationException(jobID, "Job was cancelled.", unpackedError))))

                  case JobStatus.FAILED =>
                    val unpackedError = SerializedThrowable.get(
                      error, executionGraph.getUserClassLoader())

                    jobInfo.client ! decorateMessage(JobResultFailure(
                      new SerializedThrowable(
                        new JobExecutionException(jobID, "Job execution failed.", unpackedError))))

                  case x =>
                    val exception = new JobExecutionException(jobID, s"$x is not a terminal state.")
                    jobInfo.client ! decorateMessage(JobResultFailure(
                      new SerializedThrowable(exception)))
                    throw exception
                }
              }
            }(context.dispatcher)
          }
        case None =>
          self ! decorateMessage(RemoveJob(jobID, removeJobFromStateBackend = true))
      }

    case ScheduleOrUpdateConsumers(jobId, partitionId) =>
      currentJobs.get(jobId) match {
        case Some((executionGraph, _)) =>
          sender ! decorateMessage(Acknowledge)
          executionGraph.scheduleOrUpdateConsumers(partitionId)
        case None =>
          log.error(s"Cannot find execution graph for job ID $jobId to schedule or update " +
            s"consumers.")
          sender ! decorateMessage(
            Failure(
              new IllegalStateException("Cannot find execution graph for job ID " +
                s"$jobId to schedule or update consumers.")
            )
          )
      }

    case RequestPartitionState(jobId, partitionId, taskExecutionId, taskResultId) =>
      val state = currentJobs.get(jobId) match {
        case Some((executionGraph, _)) =>
          val execution = executionGraph.getRegisteredExecutions.get(partitionId.getProducerId)

          if (execution != null) execution.getState else null
        case None =>
          // Nothing to do. This is not an error, because the request is received when a sending
          // task fails during a remote partition request.
          log.debug(s"Cannot find execution graph for job $jobId.")

          null
      }

      sender ! decorateMessage(
        PartitionState(
          taskExecutionId,
          taskResultId,
          partitionId.getPartitionId,
          state)
      )

    case RequestJobStatus(jobID) =>
      currentJobs.get(jobID) match {
        case Some((executionGraph,_)) =>
          sender ! decorateMessage(CurrentJobStatus(jobID, executionGraph.getState))
        case None =>
          // check the archive
          archive forward decorateMessage(RequestJobStatus(jobID))
      }

    case RequestRunningJobs =>
      val executionGraphs = currentJobs map {
        case (_, (eg, jobInfo)) => eg
      }

      sender ! decorateMessage(RunningJobs(executionGraphs))

    case RequestRunningJobsStatus =>
      try {
        val jobs = currentJobs map {
          case (_, (eg, _)) =>
            new JobStatusMessage(
              eg.getJobID,
              eg.getJobName,
              eg.getState,
              eg.getStatusTimestamp(JobStatus.CREATED)
            )
        }

        sender ! decorateMessage(RunningJobsStatus(jobs))
      }
      catch {
        case t: Throwable => log.error("Exception while responding to RequestRunningJobsStatus", t)
      }

    case RequestJob(jobID) =>
      currentJobs.get(jobID) match {
        case Some((eg, _)) => sender ! decorateMessage(JobFound(jobID, eg))
        case None =>
          // check the archive
          archive forward decorateMessage(RequestJob(jobID))
      }

    case RequestBlobManagerPort =>
      sender ! decorateMessage(libraryCacheManager.getBlobServerPort)

    case RequestArchive =>
      sender ! decorateMessage(ResponseArchive(archive))

    case RequestRegisteredTaskManagers =>
      import scala.collection.JavaConverters._
      sender ! decorateMessage(
        RegisteredTaskManagers(
          instanceManager.getAllRegisteredInstances.asScala
        )
      )

    case RequestTaskManagerInstance(instanceID) =>
      sender ! decorateMessage(
        TaskManagerInstance(Option(instanceManager.getRegisteredInstanceById(instanceID)))
      )

    case Heartbeat(instanceID, metricsReport, accumulators) =>
      log.debug(s"Received heartbeat message from $instanceID.")

      updateAccumulators(accumulators)

      instanceManager.reportHeartBeat(instanceID, metricsReport)

    case message: AccumulatorMessage => handleAccumulatorMessage(message)

    case message: InfoMessage => handleInfoRequestMessage(message, sender())

    case RequestStackTrace(instanceID) =>
      val gateway = instanceManager.getRegisteredInstanceById(instanceID).getActorGateway
      gateway.forward(SendStackTrace, new AkkaActorGateway(sender, leaderSessionID.orNull))

    case Terminated(taskManager) =>
      if (instanceManager.isRegistered(taskManager)) {
        log.info(s"Task manager ${taskManager.path} terminated.")

        instanceManager.unregisterTaskManager(taskManager, true)
        context.unwatch(taskManager)
      }

    case RequestJobManagerStatus =>
      sender() ! decorateMessage(JobManagerStatusAlive)

    case RemoveJob(jobID, clearPersistedJob) =>
      currentJobs.get(jobID) match {
        case Some((graph, info)) =>
            removeJob(graph.getJobID, clearPersistedJob) match {
              case Some(futureToComplete) =>
                futuresToComplete = Some(futuresToComplete.getOrElse(Seq()) :+ futureToComplete)
              case None =>
            }
        case None =>
      }

    case RemoveCachedJob(jobID) =>
      currentJobs.get(jobID) match {
        case Some((graph, info)) =>
          if (graph.getState.isTerminalState) {
            removeJob(graph.getJobID, removeJobFromStateBackend = true) match {
              case Some(futureToComplete) =>
                futuresToComplete = Some(futuresToComplete.getOrElse(Seq()) :+ futureToComplete)
              case None =>
            }
          } else {
            // triggers removal upon completion of job
            info.sessionAlive = false
          }
        case None =>
      }

    case Disconnect(msg) =>
      val taskManager = sender()

      if (instanceManager.isRegistered(taskManager)) {
        log.info(s"Task manager ${taskManager.path} wants to disconnect, because $msg.")

        instanceManager.unregisterTaskManager(taskManager, false)
        context.unwatch(taskManager)
      }

    case RequestLeaderSessionID =>
      sender() ! ResponseLeaderSessionID(leaderSessionID.orNull)

    case RequestWebMonitorPort =>
      sender() ! ResponseWebMonitorPort(webMonitorPort)
  }

  /**
   * Submits a job to the job manager. The job is registered at the libraryCacheManager which
   * creates the job's class loader. The job graph is appended to the corresponding execution
   * graph and the execution vertices are queued for scheduling.
   *
   * @param jobGraph representing the Flink job
   * @param jobInfo the job info
   * @param isRecovery Flag indicating whether this is a recovery or initial submission
   */
  private def submitJob(jobGraph: JobGraph, jobInfo: JobInfo, isRecovery: Boolean = false): Unit = {

    if (jobGraph == null) {
      jobInfo.client ! decorateMessage(JobResultFailure(
        new SerializedThrowable(
          new JobSubmissionException(null, "JobGraph must not be null.")
        )
      ))
    }
    else {
      val jobId = jobGraph.getJobID
      val jobName = jobGraph.getName
      var executionGraph: ExecutionGraph = null

      log.info(s"Submitting job $jobId ($jobName)" + (if (isRecovery) " (Recovery)" else "") + ".")

      try {
        // Important: We need to make sure that the library registration is the first action,
        // because this makes sure that the uploaded jar files are removed in case of
        // an unsuccessful submission
        try {
          libraryCacheManager.registerJob(jobGraph.getJobID, jobGraph.getUserJarBlobKeys,
            jobGraph.getClasspaths)
        }
        catch {
          case t: Throwable =>
            throw new JobSubmissionException(jobId,
              "Cannot set up the user code libraries: " + t.getMessage, t)
        }

        val userCodeLoader = libraryCacheManager.getClassLoader(jobGraph.getJobID)
        if (userCodeLoader == null) {
          throw new JobSubmissionException(jobId,
            "The user code class loader could not be initialized.")
        }

        if (jobGraph.getNumberOfVertices == 0) {
          throw new JobSubmissionException(jobId, "The given job is empty")
        }

        val restartStrategy = Option(jobGraph.getRestartStrategyConfiguration())
          .map(RestartStrategyFactory.createRestartStrategy(_)) match {
            case Some(strategy) => strategy
            case None => defaultRestartStrategy
          }

        log.info(s"Using restart strategy $restartStrategy for $jobId.")

        // see if there already exists an ExecutionGraph for the corresponding job ID
        executionGraph = currentJobs.get(jobGraph.getJobID) match {
          case Some((graph, currentJobInfo)) =>
            currentJobInfo.setLastActive()
            graph
          case None =>
            val graph = new ExecutionGraph(
              executionContext,
              jobGraph.getJobID,
              jobGraph.getName,
              jobGraph.getJobConfiguration,
              timeout,
              restartStrategy,
              jobGraph.getUserJarBlobKeys,
              jobGraph.getClasspaths,
              userCodeLoader)

            currentJobs.put(jobGraph.getJobID, (graph, jobInfo))
            graph
        }

        executionGraph.setScheduleMode(jobGraph.getScheduleMode())
        executionGraph.setQueuedSchedulingAllowed(jobGraph.getAllowQueuedScheduling())

        try {
          executionGraph.setJsonPlan(JsonPlanGenerator.generatePlan(jobGraph))
        }
        catch {
          case t: Throwable =>
            log.warn("Cannot create JSON plan for job", t)
            executionGraph.setJsonPlan("{}")
        }

        // initialize the vertices that have a master initialization hook
        // file output formats create directories here, input formats create splits
        if (log.isDebugEnabled) {
          log.debug(s"Running initialization on master for job $jobId ($jobName).")
        }

        val numSlots = scheduler.getTotalNumberOfSlots()

        for (vertex <- jobGraph.getVertices.asScala) {
          val executableClass = vertex.getInvokableClassName
          if (executableClass == null || executableClass.length == 0) {
            throw new JobSubmissionException(jobId,
              s"The vertex ${vertex.getID} (${vertex.getName}) has no invokable class.")
          }

          if (vertex.getParallelism() == ExecutionConfig.PARALLELISM_AUTO_MAX) {
            vertex.setParallelism(numSlots)
          }

          try {
            vertex.initializeOnMaster(userCodeLoader)
          }
          catch {
            case t: Throwable =>
              throw new JobExecutionException(jobId,
                "Cannot initialize task '" + vertex.getName() + "': " + t.getMessage, t)
          }
        }

        // topologically sort the job vertices and attach the graph to the existing one
        val sortedTopology = jobGraph.getVerticesSortedTopologicallyFromSources()
        if (log.isDebugEnabled) {
          log.debug(s"Adding ${sortedTopology.size()} vertices from " +
            s"job graph $jobId ($jobName).")
        }
        executionGraph.attachJobGraph(sortedTopology)

        if (log.isDebugEnabled) {
          log.debug("Successfully created execution graph from job " +
            s"graph $jobId ($jobName).")
        }

        // configure the state checkpointing
        val snapshotSettings = jobGraph.getSnapshotSettings
        if (snapshotSettings != null) {
          val jobId = jobGraph.getJobID()

          val idToVertex: JobVertexID => ExecutionJobVertex = id => {
            val vertex = executionGraph.getJobVertex(id)
            if (vertex == null) {
              throw new JobSubmissionException(jobId,
                "The snapshot checkpointing settings refer to non-existent vertex " + id)
            }
            vertex
          }

          val triggerVertices: java.util.List[ExecutionJobVertex] =
            snapshotSettings.getVerticesToTrigger().asScala.map(idToVertex).asJava

          val ackVertices: java.util.List[ExecutionJobVertex] =
            snapshotSettings.getVerticesToAcknowledge().asScala.map(idToVertex).asJava

          val confirmVertices: java.util.List[ExecutionJobVertex] =
            snapshotSettings.getVerticesToConfirm().asScala.map(idToVertex).asJava

          val completedCheckpoints = checkpointRecoveryFactory
            .createCompletedCheckpoints(jobId, userCodeLoader)

          val checkpointIdCounter = checkpointRecoveryFactory.createCheckpointIDCounter(jobId)

          // Checkpoint stats tracker
          val isStatsDisabled: Boolean = flinkConfiguration.getBoolean(
            ConfigConstants.JOB_MANAGER_WEB_CHECKPOINTS_DISABLE,
            ConfigConstants.DEFAULT_JOB_MANAGER_WEB_CHECKPOINTS_DISABLE)

          val checkpointStatsTracker: CheckpointStatsTracker =
            if (isStatsDisabled) {
              new DisabledCheckpointStatsTracker()
            } else {
              val historySize: Int = flinkConfiguration.getInteger(
                ConfigConstants.JOB_MANAGER_WEB_CHECKPOINTS_HISTORY_SIZE,
                ConfigConstants.DEFAULT_JOB_MANAGER_WEB_CHECKPOINTS_HISTORY_SIZE)

              new SimpleCheckpointStatsTracker(historySize, ackVertices)
            }

          executionGraph.enableSnapshotCheckpointing(
            snapshotSettings.getCheckpointInterval,
            snapshotSettings.getCheckpointTimeout,
            snapshotSettings.getMinPauseBetweenCheckpoints,
            snapshotSettings.getMaxConcurrentCheckpoints,
            triggerVertices,
            ackVertices,
            confirmVertices,
            context.system,
            leaderSessionID.orNull,
            checkpointIdCounter,
            completedCheckpoints,
            recoveryMode,
            savepointStore,
            checkpointStatsTracker)
        }

        // get notified about job status changes
        executionGraph.registerJobStatusListener(
          new AkkaActorGateway(self, leaderSessionID.orNull))

        if (jobInfo.listeningBehaviour == ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES) {
          // the sender wants to be notified about state changes
          val gateway = new AkkaActorGateway(jobInfo.client, leaderSessionID.orNull)

          executionGraph.registerExecutionListener(gateway)
          executionGraph.registerJobStatusListener(gateway)
        }
      }
      catch {
        case t: Throwable =>
          log.error(s"Failed to submit job $jobId ($jobName)", t)

          libraryCacheManager.unregisterJob(jobId)
          currentJobs.remove(jobId)

          if (executionGraph != null) {
            executionGraph.fail(t)
          }

          val rt: Throwable = if (t.isInstanceOf[JobExecutionException]) {
            t
          } else {
            new JobExecutionException(jobId, s"Failed to submit job $jobId ($jobName)", t)
          }

          jobInfo.client ! decorateMessage(JobResultFailure(new SerializedThrowable(rt)))
          return
      }

      // execute the recovery/writing the jobGraph into the SubmittedJobGraphStore asynchronously
      // because it is a blocking operation
      future {
        try {
          if (isRecovery) {
            executionGraph.restoreLatestCheckpointedState()
          }
          else {
            val snapshotSettings = jobGraph.getSnapshotSettings
            if (snapshotSettings != null) {
              val savepointPath = snapshotSettings.getSavepointPath()

              // Reset state back to savepoint
              if (savepointPath != null) {
                try {
                  executionGraph.restoreSavepoint(savepointPath)
                } catch {
                  case e: Exception =>
                    throw new UnrecoverableException(e)
                }
              }
            }

            submittedJobGraphs.putJobGraph(new SubmittedJobGraph(jobGraph, jobInfo))
          }

          jobInfo.client ! decorateMessage(JobSubmitSuccess(jobGraph.getJobID))

          if (leaderElectionService.hasLeadership) {
            // There is a small chance that multiple job managers schedule the same job if
            // they try to recover at the same time. This will eventually be noticed, but can not be
            // ruled out from the beginning.

            // NOTE: Scheduling the job for execution is a separate action from the job submission.
            // The success of submitting the job must be independent from the success of scheduling
            // the job.
            log.info(s"Scheduling job $jobId ($jobName).")

            executionGraph.scheduleForExecution(scheduler)
          } else {
            // Remove the job graph. Otherwise it will be lingering around and possibly removed from
            // ZooKeeper by this JM.
            self ! decorateMessage(RemoveJob(jobId, removeJobFromStateBackend = false))

            log.warn(s"Submitted job $jobId, but not leader. The other leader needs to recover " +
              "this. I am not scheduling the job for execution.")
          }
        } catch {
          case t: Throwable => try {
            executionGraph.fail(t)
          }
          catch {
            case tt: Throwable =>
              log.error("Error while marking ExecutionGraph as failed.", tt)
          }
        }
      }(context.dispatcher)
    }
  }

  /**
   * Dedicated handler for checkpoint messages.
   *
   * @param actorMessage The checkpoint actor message.
   */
  private def handleCheckpointMessage(actorMessage: AbstractCheckpointMessage): Unit = {
    actorMessage match {
      case ackMessage: AcknowledgeCheckpoint =>
        val jid = ackMessage.getJob()
        currentJobs.get(jid) match {
          case Some((graph, _)) =>
            val checkpointCoordinator = graph.getCheckpointCoordinator()
            val savepointCoordinator = graph.getSavepointCoordinator()

            if (checkpointCoordinator != null && savepointCoordinator != null) {
              future {
                try {
                  if (checkpointCoordinator.receiveAcknowledgeMessage(ackMessage)) {
                    // OK, this is the common case
                  }
                  else {
                    // Try the savepoint coordinator if the message was not addressed
                    // to the periodic checkpoint coordinator.
                    if (!savepointCoordinator.receiveAcknowledgeMessage(ackMessage)) {
                      log.info("Received message for non-existing checkpoint " +
                        ackMessage.getCheckpointId)
                    }
                  }
                }
                catch {
                  case t: Throwable =>
                    log.error(s"Error in CheckpointCoordinator while processing $ackMessage", t)
                }
              }(context.dispatcher)
            }
            else {
              log.error(
                s"Received AcknowledgeCheckpoint message for job $jid with no " +
                  s"CheckpointCoordinator")
            }

          case None => log.error(s"Received AcknowledgeCheckpoint for unavailable job $jid")
        }

      case declineMessage: DeclineCheckpoint =>
        val jid = declineMessage.getJob()
        currentJobs.get(jid) match {
          case Some((graph, _)) =>
            val checkpointCoordinator = graph.getCheckpointCoordinator()
            val savepointCoordinator = graph.getSavepointCoordinator()

            if (checkpointCoordinator != null && savepointCoordinator != null) {
              future {
                try {
                  if (checkpointCoordinator.receiveDeclineMessage(declineMessage)) {
                    // OK, this is the common case
                  }
                  else {
                    // Try the savepoint coordinator if the message was not addressed
                    // to the periodic checkpoint coordinator.
                    if (!savepointCoordinator.receiveDeclineMessage(declineMessage)) {
                      log.info("Received message for non-existing checkpoint " +
                        declineMessage.getCheckpointId)
                    }
                  }
                }
                catch {
                  case t: Throwable =>
                    log.error(s"Error in CheckpointCoordinator while processing $declineMessage", t)
                }
              }(context.dispatcher)
            }
            else {
              log.error(
                s"Received DeclineCheckpoint message for job $jid with no CheckpointCoordinator")
            }

          case None => log.error(s"Received DeclineCheckpoint for unavailable job $jid")
        }


      // unknown checkpoint message
      case _ => unhandled(actorMessage)
    }
  }
  
  /**
   * Handle unmatched messages with an exception.
   */
  override def unhandled(message: Any): Unit = {
    // let the actor crash
    throw new RuntimeException("Received unknown message " + message)
  }

  /**
   * Handle messages that request or report accumulators.
   *
   * @param message The accumulator message.
   */
  private def handleAccumulatorMessage(message: AccumulatorMessage): Unit = {
    message match {
      case RequestAccumulatorResults(jobID) =>
        try {
          currentJobs.get(jobID) match {
            case Some((graph, jobInfo)) =>
              val accumulatorValues = graph.getAccumulatorsSerialized()
              sender() ! decorateMessage(AccumulatorResultsFound(jobID, accumulatorValues))
            case None =>
              archive.forward(message)
          }
        } catch {
        case e: Exception =>
          log.error("Cannot serialize accumulator result.", e)
          sender() ! decorateMessage(AccumulatorResultsErroneous(jobID, e))
        }

      case RequestAccumulatorResultsStringified(jobId) =>
        currentJobs.get(jobId) match {
          case Some((graph, jobInfo)) =>
            val stringifiedAccumulators = graph.getAccumulatorResultsStringified()
            sender() ! decorateMessage(
              AccumulatorResultStringsFound(jobId, stringifiedAccumulators)
            )
          case None =>
            archive.forward(message)
        }

      case unknown =>
        log.warn(s"Received unknown AccumulatorMessage: $unknown")
    }
  }

  /**
   * Dedicated handler for monitor info request messages.
   * 
   * Note that this handler does not fail. Errors while responding to info messages are logged,
   * but will not cause the actor to crash.
   *
   * @param actorMessage The info request message.
   */
  private def handleInfoRequestMessage(actorMessage: InfoMessage, theSender: ActorRef): Unit = {
    try {
      actorMessage match {

        case _ : RequestJobsOverview =>
          // get our own overview
          val ourJobs = createJobStatusOverview()

          // get the overview from the archive
          val future = (archive ? RequestJobsOverview.getInstance())(timeout)

          future.onSuccess {
            case archiveOverview: JobsOverview =>
              theSender ! new JobsOverview(ourJobs, archiveOverview)
          }(context.dispatcher)

        case _ : RequestJobsWithIDsOverview =>
          // get our own overview
          val ourJobs = createJobStatusWithIDsOverview()

          // get the overview from the archive
          val future = (archive ? RequestJobsWithIDsOverview.getInstance())(timeout)

          future.onSuccess {
            case archiveOverview: JobsWithIDsOverview =>
              theSender ! new JobsWithIDsOverview(ourJobs, archiveOverview)
          }(context.dispatcher)

        case _ : RequestStatusOverview =>

          val ourJobs = createJobStatusOverview()

          val numTMs = instanceManager.getNumberOfRegisteredTaskManagers()
          val numSlotsTotal = instanceManager.getTotalNumberOfSlots()
          val numSlotsAvailable = instanceManager.getNumberOfAvailableSlots()

          // add to that the jobs from the archive
          val future = (archive ? RequestJobsOverview.getInstance())(timeout)
          future.onSuccess {
            case archiveOverview: JobsOverview =>
              theSender ! new StatusOverview(numTMs, numSlotsTotal, numSlotsAvailable,
                ourJobs, archiveOverview)
          }(context.dispatcher)

        case msg : RequestJobDetails => 
          
          val ourDetails: Array[JobDetails] = if (msg.shouldIncludeRunning()) {
            currentJobs.values.map {
              v => WebMonitorUtils.createDetailsForJob(v._1)
            }.toArray[JobDetails]
          } else {
            null
          }
          
          if (msg.shouldIncludeFinished()) {
            val future = (archive ? msg)(timeout)
            future.onSuccess {
              case archiveDetails: MultipleJobsDetails =>
                theSender ! new MultipleJobsDetails(ourDetails, archiveDetails.getFinishedJobs())
            }(context.dispatcher)
          } else {
            theSender ! new MultipleJobsDetails(ourDetails, null)
          }
          
        case _ => log.error("Unrecognized info message " + actorMessage)
      }
    }
    catch {
      case e: Throwable => log.error(s"Error responding to message $actorMessage", e)
    }
  }

  private def createJobStatusOverview() : JobsOverview = {
    var runningOrPending = 0
    var finished = 0
    var canceled = 0
    var failed = 0

    currentJobs.values.foreach {
      _._1.getState() match {
        case JobStatus.FINISHED => finished += 1
        case JobStatus.CANCELED => canceled += 1
        case JobStatus.FAILED => failed += 1
        case _ => runningOrPending += 1
      }
    }

    new JobsOverview(runningOrPending, finished, canceled, failed)
  }

  private def createJobStatusWithIDsOverview() : JobsWithIDsOverview = {
    val runningOrPending = new java.util.ArrayList[JobID]()
    val finished = new java.util.ArrayList[JobID]()
    val canceled = new java.util.ArrayList[JobID]()
    val failed = new java.util.ArrayList[JobID]()
    
    currentJobs.values.foreach { case (graph, _) =>
      graph.getState() match {
        case JobStatus.FINISHED => finished.add(graph.getJobID)
        case JobStatus.CANCELED => canceled.add(graph.getJobID)
        case JobStatus.FAILED => failed.add(graph.getJobID)
        case _ => runningOrPending.add(graph.getJobID)
      }
    }

    new JobsWithIDsOverview(runningOrPending, finished, canceled, failed)
  }

  /**
   * Removes the job and sends it to the MemoryArchivist.
   *
   * This should be called asynchronously. Removing the job from the [[SubmittedJobGraphStore]]
   * might block. Therefore be careful not to block the actor thread.
   *
   * @param jobID ID of the job to remove and archive
   * @param removeJobFromStateBackend true if the job shall be archived and removed from the state
   *                            backend
   */
  private def removeJob(jobID: JobID, removeJobFromStateBackend: Boolean): Option[Future[Unit]] = {
    // Don't remove the job yet...
    val futureOption = currentJobs.get(jobID) match {
      case Some((eg, _)) =>
        val result = if (removeJobFromStateBackend) {
          val futureOption = Some(future {
            try {
              // ...otherwise, we can have lingering resources when there is a concurrent shutdown
              // and the ZooKeeper client is closed. Not removing the job immediately allows the
              // shutdown to release all resources.
              submittedJobGraphs.removeJobGraph(jobID)
            } catch {
              case t: Throwable => log.error(s"Could not remove submitted job graph $jobID.", t)
            }
          }(context.dispatcher))

          try {
            eg.prepareForArchiving()

            archive ! decorateMessage(ArchiveExecutionGraph(jobID, eg))
          } catch {
            case t: Throwable => log.error(s"Could not prepare the execution graph $eg for " +
              "archiving.", t)
          }

          futureOption
        } else {
          None
        }

        currentJobs.remove(jobID)

        result
      case None => None
    }

    try {
      libraryCacheManager.unregisterJob(jobID)
    } catch {
      case t: Throwable =>
        log.error(s"Could not properly unregister job $jobID form the library cache.", t)
    }

    futureOption
  }

  /** Fails all currently running jobs and empties the list of currently running jobs. If the
    * [[JobClientActor]] waits for a result, then a [[JobExecutionException]] is sent.
    *
    * @param cause Cause for the cancelling.
    */
  private def cancelAndClearEverything(
      cause: Throwable,
      removeJobFromStateBackend: Boolean)
    : Seq[Future[Unit]] = {
    val futures = for ((jobID, (eg, jobInfo)) <- currentJobs) yield {
      future {
        if (removeJobFromStateBackend) {
          try {
            submittedJobGraphs.removeJobGraph(jobID)
          }
          catch {
            case t: Throwable =>
              log.error("Error during submitted job graph clean up.", t)
          }
        }

        eg.fail(cause)

        if (jobInfo.listeningBehaviour != ListeningBehaviour.DETACHED) {
          jobInfo.client ! decorateMessage(
            Failure(new JobExecutionException(jobID, "All jobs are cancelled and cleared.", cause)))
        }
      }(context.dispatcher)
    }

    currentJobs.clear()

    futures.toSeq
  }

  override def grantLeadership(newLeaderSessionID: UUID): Unit = {
    self ! decorateMessage(GrantLeadership(Option(newLeaderSessionID)))
  }

  override def revokeLeadership(): Unit = {
    leaderSessionID = None
    self ! decorateMessage(RevokeLeadership)
  }

  override def onAddedJobGraph(jobId: JobID): Unit = {
    if (leaderSessionID.isDefined && !currentJobs.contains(jobId)) {
      self ! decorateMessage(RecoverJob(jobId))
    }
  }

  override def onRemovedJobGraph(jobId: JobID): Unit = {
    if (leaderSessionID.isDefined) {
      currentJobs.get(jobId).foreach(
        job =>
          future {
            // Fail the execution graph
            job._1.fail(new IllegalStateException("Another JobManager removed the job from " +
              "ZooKeeper."))
          }(context.dispatcher)
      )
    }
  }

  override def getAddress: String = {
    AkkaUtils.getAkkaURL(context.system, self)
  }

  /** Handles error occurring in the leader election service
    *
    * @param exception Exception being thrown in the leader election service
    */
  override def handleError(exception: Exception): Unit = {
    log.error("Received an error from the LeaderElectionService.", exception)

    // terminate JobManager in case of an error
    self ! decorateMessage(PoisonPill)
  }
  
  /**
   * Updates the accumulators reported from a task manager via the Heartbeat message.
   *
   * @param accumulators list of accumulator snapshots
   */
  private def updateAccumulators(accumulators : Seq[AccumulatorSnapshot]) = {
    accumulators foreach {
      case accumulatorEvent =>
        currentJobs.get(accumulatorEvent.getJobID) match {
          case Some((jobGraph, jobInfo)) =>
            future {
              jobGraph.updateAccumulators(accumulatorEvent)
            }(context.dispatcher)
          case None =>
          // ignore accumulator values for old job
        }
    }
  }
}

/**
 * Job Manager companion object. Contains the entry point (main method) to run the JobManager in a
 * standalone fashion. Also contains various utility methods to start the JobManager and to
 * look up the JobManager actor reference.
 */
object JobManager {

  val LOG = Logger(classOf[JobManager])

  val STARTUP_FAILURE_RETURN_CODE = 1
  val RUNTIME_FAILURE_RETURN_CODE = 2

  /** Name of the JobManager actor */
  val JOB_MANAGER_NAME = "jobmanager"

  /** Name of the archive actor */
  val ARCHIVE_NAME = "archive"


  /**
   * Entry point (main method) to run the JobManager in a standalone fashion.
   *
   * @param args The command line arguments.
   */
  def main(args: Array[String]): Unit = {
    // startup checks and logging
    EnvironmentInformation.logEnvironmentInfo(LOG.logger, "JobManager", args)
    SignalHandler.register(LOG.logger)

    // parsing the command line arguments
    val (configuration: Configuration,
         executionMode: JobManagerMode,
         listeningHost: String,
         listeningPortRange: java.util.Iterator[Integer]) =
    try {
      parseArgs(args)
    }
    catch {
      case t: Throwable =>
        LOG.error(t.getMessage(), t)
        System.exit(STARTUP_FAILURE_RETURN_CODE)
        null
    }

    // we want to check that the JobManager hostname is in the config
    // if it is not in there, the actor system will bind to the loopback interface's
    // address and will not be reachable from anyone remote
    if (listeningHost == null) {
      val message = "Config parameter '" + ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY +
        "' is missing (hostname/address to bind JobManager to)."
      LOG.error(message)
      System.exit(STARTUP_FAILURE_RETURN_CODE)
    }

    if (!listeningPortRange.hasNext) {
      if (ZooKeeperUtils.isZooKeeperRecoveryMode(configuration)) {
        val message = "Config parameter '" + ConfigConstants.RECOVERY_JOB_MANAGER_PORT +
          "' does not specify a valid port range."
        LOG.error(message)
        System.exit(STARTUP_FAILURE_RETURN_CODE)
      }
      else {
        val message = s"Config parameter '" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY +
          "' does not specify a valid port."
        LOG.error(message)
        System.exit(STARTUP_FAILURE_RETURN_CODE)
      }
    }

    // run the job manager
    try {
      if (SecurityUtils.isSecurityEnabled) {
        LOG.info("Security is enabled. Starting secure JobManager.")
        SecurityUtils.runSecured(new FlinkSecuredRunner[Unit] {
          override def run(): Unit = {
            runJobManager(
              configuration,
              executionMode,
              listeningHost,
              listeningPortRange)
          }
        })
      } else {
        LOG.info("Security is not enabled. Starting non-authenticated JobManager.")
        runJobManager(
          configuration,
          executionMode,
          listeningHost,
          listeningPortRange)
      }
    } catch {
      case t: Throwable =>
        LOG.error("Failed to run JobManager.", t)
        System.exit(STARTUP_FAILURE_RETURN_CODE)
    }
  }

  /**
   * Starts and runs the JobManager with all its components. First, this method starts a
   * dedicated actor system for the JobManager. Second, it starts all components of the
   * JobManager (including library cache, instance manager, scheduler). Finally, it starts
   * the JobManager actor itself.
   *
   * This method blocks indefinitely (or until the JobManager's actor system is shut down).
   *
   * @param configuration The configuration object for the JobManager.
   * @param executionMode The execution mode in which to run. Execution mode LOCAL will spawn
   *                      an additional TaskManager in the same process.
   * @param listeningAddress The hostname where the JobManager should listen for messages.
   * @param listeningPort The port where the JobManager should listen for messages.
   */
  def runJobManager(
      configuration: Configuration,
      executionMode: JobManagerMode,
      listeningAddress: String,
      listeningPort: Int)
    : Unit = {
    
    val (jobManagerSystem, _, _, _) = startActorSystemAndJobManagerActors(
      configuration,
      executionMode,
      listeningAddress,
      listeningPort,
      classOf[JobManager],
      classOf[MemoryArchivist]
    )

    // block until everything is shut down
    jobManagerSystem.awaitTermination()
  }

  /**
    * Starts and runs the JobManager with all its components trying to bind to
    * a port in the specified range.
    *
    * @param configuration The configuration object for the JobManager.
    * @param executionMode The execution mode in which to run. Execution mode LOCAL will spawn
    *                      an additional TaskManager in the same process.
    * @param listeningAddress The hostname where the JobManager should listen for messages.
    * @param listeningPortRange The port range where the JobManager should listen for messages.
    */
  def runJobManager(
      configuration: Configuration,
      executionMode: JobManagerMode,
      listeningAddress: String,
      listeningPortRange: java.util.Iterator[Integer])
    : Unit = {

    val result = retryOnBindException({
      // Try all ports in the range until successful
      val socket = NetUtils.createSocketFromPorts(
        listeningPortRange,
        new NetUtils.SocketFactory {
          override def createSocket(port: Int): ServerSocket = new ServerSocket(
            // Use the correct listening address, bound ports will only be
            // detected later by Akka.
            port, 0, InetAddress.getByName(listeningAddress))
        })

      val port =
        if (socket == null) {
          throw new BindException(s"Unable to allocate port for JobManager.")
        } else {
          try {
            socket.getLocalPort()
          } finally {
            socket.close()
          }
        }

      runJobManager(configuration, executionMode, listeningAddress, port)
    }, { !listeningPortRange.hasNext }, 5000)

    result match {
      case scala.util.Failure(f) => throw f
      case _ =>
    }
  }

  /**
    * Retries a function if it fails because of a [[java.net.BindException]].
    *
    * @param fn The function to retry
    * @param stopCond Flag to signal termination
    * @param maxSleepBetweenRetries Max random sleep time between retries
    * @tparam T Return type of the function to retry
    * @return Return value of the function to retry
    */
  @tailrec
  def retryOnBindException[T](
      fn: => T,
      stopCond: => Boolean,
      maxSleepBetweenRetries : Long = 0 )
    : scala.util.Try[T] = {

    def sleepBeforeRetry() : Unit = {
      if (maxSleepBetweenRetries > 0) {
        val sleepTime = (Math.random() * maxSleepBetweenRetries).asInstanceOf[Long]
        LOG.info(s"Retrying after bind exception. Sleeping for ${sleepTime} ms.")
        Thread.sleep(sleepTime)
      }
    }

    scala.util.Try {
      fn
    } match {
      case scala.util.Failure(x: BindException) =>
        if (stopCond) {
          scala.util.Failure(new RuntimeException(
            "Unable to do further retries starting the actor system"))
        } else {
          sleepBeforeRetry()
          retryOnBindException(fn, stopCond)
        }
      case scala.util.Failure(x: Exception) => x.getCause match {
        case c: ChannelException =>
          if (stopCond) {
            scala.util.Failure(new RuntimeException(
              "Unable to do further retries starting the actor system"))
          } else {
            sleepBeforeRetry()
            retryOnBindException(fn, stopCond)
          }
        case _ => scala.util.Failure(x)
      }
      case f => f
    }
  }
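
  // Illustrative usage, mirroring the runJobManager overload above (the first block stands in
  // for the bind-sensitive startup code):
  //   retryOnBindException({ startOnNextPort() }, { !listeningPortRange.hasNext }, 5000)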

  /** Starts an ActorSystem, the JobManager and all its components including the WebMonitor.
    *
    * @param configuration The configuration object for the JobManager
    * @param executionMode The execution mode in which to run. Execution mode LOCAL will spawn an
    *                      additional TaskManager in the same process.
    * @param listeningAddress The hostname where the JobManager should listen for messages.
    * @param listeningPort The port where the JobManager should listen for messages
    * @param jobManagerClass The class of the JobManager to be started
    * @param archiveClass The class of the Archivist to be started
    * @return A tuple containing the started ActorSystem, ActorRefs to the JobManager and the
    *         Archivist and an Option containing a possibly started WebMonitor
    */
  def startActorSystemAndJobManagerActors(
      configuration: Configuration,
      executionMode: JobManagerMode,
      listeningAddress: String,
      listeningPort: Int,
      jobManagerClass: Class[_ <: JobManager],
      archiveClass: Class[_ <: MemoryArchivist])
    : (ActorSystem, ActorRef, ActorRef, Option[WebMonitor]) = {

    LOG.info("Starting JobManager")

    // Bring up the job manager actor system first, bind it to the given address.
    val hostPortUrl = NetUtils.hostAndPortToUrlString(listeningAddress, listeningPort)
    LOG.info(s"Starting JobManager actor system at $hostPortUrl")

    val jobManagerSystem = try {
      val akkaConfig = AkkaUtils.getAkkaConfig(
        configuration,
        Some((listeningAddress, listeningPort))
      )
      if (LOG.isDebugEnabled) {
        LOG.debug("Using akka configuration\n " + akkaConfig)
      }
      AkkaUtils.createActorSystem(akkaConfig)
    }
    catch {
      case t: Throwable =>
        if (t.isInstanceOf[org.jboss.netty.channel.ChannelException]) {
          val cause = t.getCause()
          if (cause != null && t.getCause().isInstanceOf[java.net.BindException]) {
            val address = listeningAddress + ":" + listeningPort
            throw new Exception("Unable to create JobManager at address " + address +
              " - " + cause.getMessage(), t)
          }
        }
        throw new Exception("Could not create JobManager actor system", t)
    }

    val address = AkkaUtils.getAddress(jobManagerSystem)

    configuration.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, address.host.get)
    configuration.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, address.port.get)

    val webMonitor: Option[WebMonitor] =
      if (configuration.getInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, 0) >= 0) {
        LOG.info("Starting JobManger web frontend")
        val leaderRetrievalService = LeaderRetrievalUtils
          .createLeaderRetrievalService(configuration)

        // start the web frontend. we need to load this dynamically
        // because it is not in the same project/dependencies
        val webServer = WebMonitorUtils.startWebRuntimeMonitor(
          configuration,
          leaderRetrievalService,
          jobManagerSystem)

        Option(webServer)
      }
      else {
        None
      }

    // Reset the port (necessary in case of automatic port selection)
    webMonitor.foreach{ monitor => configuration.setInteger(
      ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, monitor.getServerPort) }

    try {
      // bring up the job manager actor
      LOG.info("Starting JobManager actor")
      val (jobManager, archive) = startJobManagerActors(
        configuration,
        jobManagerSystem,
        jobManagerClass,
        archiveClass)

      // start a process reaper that watches the JobManager. If the JobManager actor dies,
      // the process reaper will kill the JVM process (to ensure easy failure detection)
      LOG.debug("Starting JobManager process reaper")
      jobManagerSystem.actorOf(
        Props(
          classOf[ProcessReaper],
          jobManager,
          LOG.logger,
          RUNTIME_FAILURE_RETURN_CODE),
        "JobManager_Process_Reaper")

      // bring up a local task manager, if needed
      if (executionMode == JobManagerMode.LOCAL) {
        LOG.info("Starting embedded TaskManager for JobManager's LOCAL execution mode")

        val taskManagerActor = TaskManager.startTaskManagerComponentsAndActor(
          configuration,
          jobManagerSystem,
          listeningAddress,
          Some(TaskManager.TASK_MANAGER_NAME),
          None,
          localTaskManagerCommunication = true,
          classOf[TaskManager])

        LOG.debug("Starting TaskManager process reaper")
        jobManagerSystem.actorOf(
          Props(
            classOf[ProcessReaper],
            taskManagerActor,
            LOG.logger,
            RUNTIME_FAILURE_RETURN_CODE),
          "TaskManager_Process_Reaper")
      }

      webMonitor.foreach {
        monitor =>
          val jobManagerAkkaUrl = JobManager.getRemoteJobManagerAkkaURL(configuration)
          monitor.start(jobManagerAkkaUrl)
      }

      (jobManagerSystem, jobManager, archive, webMonitor)
    }
    catch {
      case t: Throwable =>
        LOG.error("Error while starting up JobManager", t)
        try {
          jobManagerSystem.shutdown()
        } catch {
          case tt: Throwable => LOG.warn("Could not cleanly shut down actor system", tt)
        }
        throw t
    }
  }
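
  // Startup order in the method above: (1) create the JobManager actor system, (2) optionally
  // start the web monitor, (3) start the JobManager and archive actors plus a process reaper,
  // (4) in LOCAL mode start an embedded TaskManager with its own reaper, and (5) point the web
  // monitor at the JobManager's Akka URL.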

  /**
   * Loads the configuration, execution mode and the listening address from the provided command
   * line arguments.
   *
   * @param args command line arguments
   * @return Quadruple of configuration, execution mode, listening host and the port range
   */
  def parseArgs(args: Array[String])
    : (Configuration, JobManagerMode, String, java.util.Iterator[Integer]) = {
    val parser = new scopt.OptionParser[JobManagerCliOptions]("JobManager") {
      head("Flink JobManager")

      opt[String]("configDir") action { (arg, conf) => 
        conf.setConfigDir(arg)
        conf
      } text {
        "The configuration directory."
      }

      opt[String]("executionMode") action { (arg, conf) =>
        conf.setJobManagerMode(arg)
        conf
      } text {
        "The execution mode of the JobManager (CLUSTER / LOCAL)"
      }

      opt[String]("host").optional().action { (arg, conf) =>
        conf.setHost(arg)
        conf
      } text {
        "Network address for communication with the job manager"
      }

      opt[Int]("webui-port").optional().action { (arg, conf) =>
        conf.setWebUIPort(arg)
        conf
      } text {
        "Port for the UI web server"
      }
    }

    val config = parser.parse(args, new JobManagerCliOptions()).getOrElse {
      throw new Exception(
        s"Invalid command line arguments: ${args.mkString(" ")}. Usage: ${parser.usage}")
    }
    
    val configDir = config.getConfigDir()
    
    if (configDir == null) {
      throw new Exception("Missing parameter '--configDir'")
    }
    if (config.getJobManagerMode() == null) {
      throw new Exception("Missing parameter '--executionMode'")
    }

    LOG.info("Loading configuration from " + configDir)
    GlobalConfiguration.loadConfiguration(configDir)
    val configuration = GlobalConfiguration.getConfiguration()

    try {
      FileSystem.setDefaultScheme(configuration)
    }
    catch {
      case e: IOException => {
        throw new Exception("Error while setting the default " +
          "filesystem scheme from configuration.", e)
      }
    }

    if (new File(configDir).isDirectory) {
      configuration.setString(ConfigConstants.FLINK_BASE_DIR_PATH_KEY, configDir + "/..")
    }

    if (config.getWebUIPort() >= 0) {
      configuration.setInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, config.getWebUIPort())
    }

    if (config.getHost() != null) {
      configuration.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, config.getHost())
    }

    val host = configuration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null)

    val portRange =
      // high availability mode
      if (ZooKeeperUtils.isZooKeeperRecoveryMode(configuration)) {
        LOG.info("Starting JobManager with high-availability")

        configuration.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, 0)

        // The port range of allowed job manager ports or 0 for random
        configuration.getString(
          ConfigConstants.RECOVERY_JOB_MANAGER_PORT,
          ConfigConstants.DEFAULT_RECOVERY_JOB_MANAGER_PORT)
      }
      else {
        LOG.info("Starting JobManager without high-availability")

        // In standalone mode, we don't allow port ranges
        val listeningPort = configuration.getInteger(
          ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
          ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT)

        if (listeningPort <= 0 || listeningPort >= 65536) {
          val message = "Config parameter '" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY +
            "' is invalid, it must be greater than 0 and less than 65536."
          LOG.error(message)
          System.exit(STARTUP_FAILURE_RETURN_CODE)
        }

        String.valueOf(listeningPort)
      }

    val executionMode = config.getJobManagerMode
    val hostUrl = NetUtils.ipAddressToUrlString(InetAddress.getByName(host))

    LOG.info(s"Starting JobManager on $hostUrl:$portRange with execution mode $executionMode")

    val portRangeIterator = NetUtils.getPortRangeFromString(portRange)

    (configuration, executionMode, host, portRangeIterator)
  }
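
  // Example invocation (illustrative): the start scripts effectively launch the entry point with
  //   --configDir /path/to/conf --executionMode CLUSTER
  // after which the host and port are resolved from flink-conf.yaml as shown above.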

  /**
   * Creates the JobManager components: executor service, instance manager, scheduler,
   * library cache manager, restart strategy, timeout, archive count, leader election service,
   * submitted job graph store, checkpoint recovery factory, savepoint store and the job
   * recovery timeout (matching the returned tuple below).
   *
   * @param configuration The configuration from which to parse the config values.
   * @param leaderElectionServiceOption LeaderElectionService which shall be returned if the option
   *                                    is defined
   * @return The members for a default JobManager.
   */
  def createJobManagerComponents(
      configuration: Configuration,
      leaderElectionServiceOption: Option[LeaderElectionService]) :
    (ExecutorService,
    InstanceManager,
    FlinkScheduler,
    BlobLibraryCacheManager,
    RestartStrategy,
    FiniteDuration, // timeout
    Int, // number of archived jobs
    LeaderElectionService,
    SubmittedJobGraphStore,
    CheckpointRecoveryFactory,
    SavepointStore,
    FiniteDuration // timeout for job recovery
   ) = {

    val timeout: FiniteDuration = AkkaUtils.getTimeout(configuration)

    val cleanupInterval = configuration.getLong(
      ConfigConstants.LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL,
      ConfigConstants.DEFAULT_LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL) * 1000

    val restartStrategy = RestartStrategyFactory
      .createFromConfig(configuration)

    val archiveCount = configuration.getInteger(ConfigConstants.JOB_MANAGER_WEB_ARCHIVE_COUNT,
      ConfigConstants.DEFAULT_JOB_MANAGER_WEB_ARCHIVE_COUNT)


    var blobServer: BlobServer = null
    var instanceManager: InstanceManager = null
    var scheduler: FlinkScheduler = null
    var libraryCacheManager: BlobLibraryCacheManager = null

    val executorService: ExecutorService = new ForkJoinPool()
    
    try {
      blobServer = new BlobServer(configuration)
      instanceManager = new InstanceManager()
      scheduler = new FlinkScheduler(ExecutionContext.fromExecutor(executorService))
      libraryCacheManager = new BlobLibraryCacheManager(blobServer, cleanupInterval)

      instanceManager.addInstanceListener(scheduler)
    }
    catch {
      case t: Throwable =>
        if (libraryCacheManager != null) {
          libraryCacheManager.shutdown()
        }
        if (scheduler != null) {
          scheduler.shutdown()
        }
        if (instanceManager != null) {
          instanceManager.shutdown()
        }
        if (blobServer != null) {
          blobServer.shutdown()
        }
        executorService.shutdownNow()
        
        throw t
    }

    // Create recovery related components
    val (leaderElectionService, submittedJobGraphs, checkpointRecoveryFactory) =
      RecoveryMode.fromConfig(configuration) match {
        case RecoveryMode.STANDALONE =>
          val leaderElectionService = leaderElectionServiceOption match {
            case Some(les) => les
            case None => new StandaloneLeaderElectionService()
          }

          (leaderElectionService,
            new StandaloneSubmittedJobGraphStore(),
            new StandaloneCheckpointRecoveryFactory())

        case RecoveryMode.ZOOKEEPER =>
          val client = ZooKeeperUtils.startCuratorFramework(configuration)

          val leaderElectionService = leaderElectionServiceOption match {
            case Some(les) => les
            case None => ZooKeeperUtils.createLeaderElectionService(client, configuration)
          }

          (leaderElectionService,
            ZooKeeperUtils.createSubmittedJobGraphs(client, configuration),
            new ZooKeeperCheckpointRecoveryFactory(client, configuration))
      }

    val savepointStore = SavepointStoreFactory.createFromConfig(configuration)

    val jobRecoveryTimeoutStr = configuration.getString(ConfigConstants.RECOVERY_JOB_DELAY, "")

    val jobRecoveryTimeout = if (jobRecoveryTimeoutStr == null || jobRecoveryTimeoutStr.isEmpty) {
      timeout
    } else {
      try {
        FiniteDuration(Duration(jobRecoveryTimeoutStr).toMillis, TimeUnit.MILLISECONDS)
      } catch {
        case n: NumberFormatException =>
          throw new Exception(
            s"Invalid config value for ${ConfigConstants.RECOVERY_JOB_DELAY}: " +
              s"$jobRecoveryTimeoutStr. Value must be a valid duration (such as '10 s' or '1 min')")
      }
    }

    (executorService,
      instanceManager,
      scheduler,
      libraryCacheManager,
      restartStrategy,
      timeout,
      archiveCount,
      leaderElectionService,
      submittedJobGraphs,
      checkpointRecoveryFactory,
      savepointStore,
      jobRecoveryTimeout)
  }

  /**
   * Starts the JobManager and job archiver based on the given configuration, in the
   * given actor system.
   *
   * @param configuration The configuration for the JobManager
   * @param actorSystem The actor system running the JobManager
   * @param jobManagerClass The class of the JobManager to be started
   * @param archiveClass The class of the MemoryArchivist to be started
   * @return A tuple of references (JobManager Ref, Archiver Ref)
   */
  def startJobManagerActors(
      configuration: Configuration,
      actorSystem: ActorSystem,
      jobManagerClass: Class[_ <: JobManager],
      archiveClass: Class[_ <: MemoryArchivist])
    : (ActorRef, ActorRef) = {

    startJobManagerActors(
      configuration,
      actorSystem,
      Some(JOB_MANAGER_NAME),
      Some(ARCHIVE_NAME),
      jobManagerClass,
      archiveClass)
  }

  /**
   * Starts the JobManager and job archiver based on the given configuration, in the
   * given actor system.
   *
   * @param configuration The configuration for the JobManager
   * @param actorSystem The actor system running the JobManager
   * @param jobMangerActorName Optionally the name of the JobManager actor. If none is given,
   *                          the actor will have the name generated by the actor system.
   * @param archiveActorName Optionally the name of the archive actor. If none is given,
   *                          the actor will have the name generated by the actor system.
   * @param jobManagerClass The class of the JobManager to be started
   * @param archiveClass The class of the MemoryArchivist to be started
   * @return A tuple of references (JobManager Ref, Archiver Ref)
   */
  def startJobManagerActors(
      configuration: Configuration,
      actorSystem: ActorSystem,
      jobMangerActorName: Option[String],
      archiveActorName: Option[String],
      jobManagerClass: Class[_ <: JobManager],
      archiveClass: Class[_ <: MemoryArchivist])
    : (ActorRef, ActorRef) = {

    val (executorService: ExecutorService,
    instanceManager,
    scheduler,
    libraryCacheManager,
    restartStrategy,
    timeout,
    archiveCount,
    leaderElectionService,
    submittedJobGraphs,
    checkpointRecoveryFactory,
    savepointStore,
    jobRecoveryTimeout) = createJobManagerComponents(
      configuration,
      None)

    val archiveProps = Props(archiveClass, archiveCount)

    // start the archiver with the given name, or without (avoid name conflicts)
    val archive: ActorRef = archiveActorName match {
      case Some(actorName) => actorSystem.actorOf(archiveProps, actorName)
      case None => actorSystem.actorOf(archiveProps)
    }

    val jobManagerProps = Props(
      jobManagerClass,
      configuration,
      executorService,
      instanceManager,
      scheduler,
      libraryCacheManager,
      archive,
      restartStrategy,
      timeout,
      leaderElectionService,
      submittedJobGraphs,
      checkpointRecoveryFactory,
      savepointStore,
      jobRecoveryTimeout)

    val jobManager: ActorRef = jobMangerActorName match {
      case Some(actorName) => actorSystem.actorOf(jobManagerProps, actorName)
      case None => actorSystem.actorOf(jobManagerProps)
    }

    (jobManager, archive)
  }

  def startActor(props: Props, actorSystem: ActorSystem): ActorRef = {
    actorSystem.actorOf(props, JOB_MANAGER_NAME)
  }

  // --------------------------------------------------------------------------
  //  Resolving the JobManager endpoint
  // --------------------------------------------------------------------------

  /**
   * Builds the akka actor path for the JobManager actor, given the socket address
   * where the JobManager's actor system runs.
   *
   * @param address The address of the JobManager's actor system.
   * @return The akka URL of the JobManager actor.
   */
  def getRemoteJobManagerAkkaURL(
      address: InetSocketAddress,
      name: Option[String] = None)
    : String = {
    val hostPort = NetUtils.socketAddressToUrlString(address)

    getJobManagerAkkaURLHelper(s"akka.tcp://flink@$hostPort", name)
  }

  /**
   * Returns the JobManager actor's remote Akka URL, given the configured hostname and port.
   *
   * @param config The configuration to parse
   * @return JobManager actor remote Akka URL
   */
  def getRemoteJobManagerAkkaURL(config: Configuration) : String = {
    val (hostname, port) = TaskManager.getAndCheckJobManagerAddress(config)

    var hostPort: InetSocketAddress = null

    try {
      val inetAddress: InetAddress = InetAddress.getByName(hostname)
      hostPort = new InetSocketAddress(inetAddress, port)
    }
    catch {
      case e: UnknownHostException =>
        throw new UnknownHostException(s"Cannot resolve the JobManager hostname '$hostname' " +
          s"specified in the configuration")
    }

    JobManager.getRemoteJobManagerAkkaURL(hostPort, Option.empty)
  }

  /**
   * Builds the akka actor path for the JobManager actor to address the actor within
   * its own actor system.
   *
   * @return The local akka URL of the JobManager actor.
   */
  def getLocalJobManagerAkkaURL(name: Option[String] = None): String = {
    getJobManagerAkkaURLHelper("akka://flink", name)
  }

  def getJobManagerAkkaURL(system: ActorSystem, name: Option[String] = None): String = {
    getJobManagerAkkaURLHelper(AkkaUtils.getAddress(system).toString, name)
  }

  private def getJobManagerAkkaURLHelper(address: String, name: Option[String]): String = {
    address + "/user/" + name.getOrElse(JOB_MANAGER_NAME)
  }
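
  // For example, for a JobManager actor system at host:port this yields
  // "akka.tcp://flink@host:port/user/jobmanager" (remote) or "akka://flink/user/jobmanager" (local).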

  def getJobManagerActorRefFuture(
      address: InetSocketAddress,
      system: ActorSystem,
      timeout: FiniteDuration)
    : Future[ActorRef] = {
    AkkaUtils.getActorRefFuture(getRemoteJobManagerAkkaURL(address), system, timeout)
  }

  /**
   * Resolves the JobManager actor reference in a blocking fashion.
   *
   * @param jobManagerUrl The akka URL of the JobManager.
   * @param system The local actor system that should perform the lookup.
   * @param timeout The maximum time to wait until the lookup fails.
   * @throws java.io.IOException Thrown, if the lookup fails.
   * @return The ActorRef to the JobManager
   */
  @throws(classOf[IOException])
  def getJobManagerActorRef(
      jobManagerUrl: String,
      system: ActorSystem,
      timeout: FiniteDuration)
    : ActorRef = {
    AkkaUtils.getActorRef(jobManagerUrl, system, timeout)
  }

  /**
   * Resolves the JobManager actor reference in a blocking fashion.
   *
   * @param address The socket address of the JobManager's actor system.
   * @param system The local actor system that should perform the lookup.
   * @param timeout The maximum time to wait until the lookup fails.
   * @throws java.io.IOException Thrown, if the lookup fails.
   * @return The ActorRef to the JobManager
   */
  @throws(classOf[IOException])
  def getJobManagerActorRef(
      address: InetSocketAddress,
      system: ActorSystem,
      timeout: FiniteDuration)
    : ActorRef = {

    val jmAddress = getRemoteJobManagerAkkaURL(address)
    getJobManagerActorRef(jmAddress, system, timeout)
  }

  /**
   * Resolves the JobManager actor reference in a blocking fashion.
   *
   * @param address The socket address of the JobManager's actor system.
   * @param system The local actor system that should perform the lookup.
   * @param config The config describing the maximum time to wait until the lookup fails.
   * @throws java.io.IOException Thrown, if the lookup fails.
   * @return The ActorRef to the JobManager
   */
  @throws(classOf[IOException])
  def getJobManagerActorRef(
      address: InetSocketAddress,
      system: ActorSystem,
      config: Configuration)
    : ActorRef = {

    val timeout = AkkaUtils.getLookupTimeout(config)
    getJobManagerActorRef(address, system, timeout)
  }
}
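
To see how the entry points above fit together end to end, here is a minimal, hypothetical sketch (not part of the Flink sources): it builds a Configuration by hand instead of going through parseArgs, then calls runJobManager in LOCAL mode, which internally runs startActorSystemAndJobManagerActors and blocks until the actor system shuts down. The host "localhost", port 6123 and the object name are arbitrary example values.

import org.apache.flink.configuration.{ConfigConstants, Configuration}
import org.apache.flink.runtime.jobmanager.{JobManager, JobManagerMode}

object LocalJobManagerSketch {
  def main(args: Array[String]): Unit = {
    val configuration = new Configuration()
    configuration.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, "localhost")
    configuration.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, 6123)

    // LOCAL mode also spawns an embedded TaskManager in the same JVM;
    // this call blocks until the JobManager actor system terminates.
    JobManager.runJobManager(configuration, JobManagerMode.LOCAL, "localhost", 6123)
  }
}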


/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.executiongraph;

import akka.actor.ActorSystem;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.api.common.accumulators.AccumulatorHelper;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.JobException;
import org.apache.flink.runtime.StoppingException;
import org.apache.flink.runtime.accumulators.AccumulatorRegistry;
import org.apache.flink.runtime.accumulators.AccumulatorSnapshot;
import org.apache.flink.runtime.accumulators.StringifiedAccumulatorResult;
import org.apache.flink.runtime.blob.BlobKey;
import org.apache.flink.runtime.checkpoint.CheckpointCoordinator;
import org.apache.flink.runtime.checkpoint.CheckpointIDCounter;
import org.apache.flink.runtime.checkpoint.CompletedCheckpoint;
import org.apache.flink.runtime.checkpoint.CompletedCheckpointStore;
import org.apache.flink.runtime.checkpoint.SavepointCoordinator;
import org.apache.flink.runtime.checkpoint.StateStore;
import org.apache.flink.runtime.checkpoint.stats.CheckpointStatsTracker;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.execution.UnrecoverableException;
import org.apache.flink.runtime.executiongraph.restart.RestartStrategy;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobgraph.JobStatus;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.ScheduleMode;
import org.apache.flink.runtime.jobmanager.RecoveryMode;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationGroup;
import org.apache.flink.runtime.jobmanager.scheduler.Scheduler;
import org.apache.flink.runtime.messages.ExecutionGraphMessages;
import org.apache.flink.runtime.taskmanager.TaskExecutionState;
import org.apache.flink.runtime.util.SerializableObject;
import org.apache.flink.runtime.util.SerializedThrowable;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.InstantiationUtil;
import org.apache.flink.util.SerializedValue;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.concurrent.ExecutionContext;
import scala.concurrent.duration.FiniteDuration;

import java.io.IOException;
import java.io.Serializable;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

import static com.google.common.base.Preconditions.checkNotNull;
/**
 * The execution graph is the central data structure that coordinates the distributed
 * execution of a data flow. It keeps representations of each parallel task, each
 * intermediate result, and the communication between them.
 *
 * The execution graph consists of the following constructs:
 * <ul>
 *     <li>The {@link ExecutionJobVertex} represents one vertex from the JobGraph (usually one operation like
 *         "map" or "join") during execution. It holds the aggregated state of all parallel subtasks.
 *         The ExecutionJobVertex is identified inside the graph by the {@link JobVertexID}, which it takes
 *         from the JobGraph's corresponding JobVertex.</li>
 *     <li>The {@link ExecutionVertex} represents one parallel subtask. For each ExecutionJobVertex, there are
 *         as many ExecutionVertices as the parallelism. The ExecutionVertex is identified by
 *         the ExecutionJobVertex and the number of the parallel subtask</li>
 *     <li>The {@link Execution} is one attempt to execute a ExecutionVertex. There may be multiple Executions
 *         for the ExecutionVertex, in case of a failure, or in the case where some data needs to be recomputed
 *         because it is no longer available when requested by later operations. An Execution is always
 *         identified by an {@link ExecutionAttemptID}. All messages between the JobManager and the TaskManager
 *         about deployment of tasks and updates in the task status always use the ExecutionAttemptID to
 *         address the message receiver.</li>
 * </ul>
 *
 * <p>The ExecutionGraph implements {@link java.io.Serializable}, because it can be archived by
 * sending it to an archive actor via an actor message. The execution graph does contain some
 * non-serializable fields. These fields are not required in the archived form and are cleared
 * in the {@link #prepareForArchiving()} method.
*/ public class ExecutionGraph implements Serializable { private static final long serialVersionUID = 42L; private static final AtomicReferenceFieldUpdater STATE_UPDATER = AtomicReferenceFieldUpdater.newUpdater(ExecutionGraph.class, JobStatus.class, "state"); /** The log object used for debugging. */ static final Logger LOG = LoggerFactory.getLogger(ExecutionGraph.class); // -------------------------------------------------------------------------------------------- /** The lock used to secure all access to mutable fields, especially the tracking of progress * within the job. */ private final SerializableObject progressLock = new SerializableObject(); /** The ID of the job this graph has been built for. */ private final JobID jobID; /** The name of the original job graph. */ private final String jobName; /** The job configuration that was originally attached to the JobGraph. */ private final Configuration jobConfiguration; /** {@code true} if all source tasks are stoppable. */ private boolean isStoppable = true; /** All job vertices that are part of this graph */ private final ConcurrentHashMap tasks; /** All vertices, in the order in which they were created **/ private final List verticesInCreationOrder; /** All intermediate results that are part of this graph */ private final ConcurrentHashMap intermediateResults; /** The currently executed tasks, for callbacks */ private final ConcurrentHashMap currentExecutions; /** A list of all libraries required during the job execution. Libraries have to be stored * inside the BlobService and are referenced via the BLOB keys. */ private final List requiredJarFiles; /** A list of all classpaths required during the job execution. Classpaths have to be * accessible on all nodes in the cluster. */ private final List requiredClasspaths; /** Listeners that receive messages when the entire job switches it status (such as from * RUNNING to FINISHED) */ private final List jobStatusListenerActors; /** Listeners that receive messages whenever a single task execution changes its status */ private final List executionListenerActors; /** Timestamps (in milliseconds as returned by {@code System.currentTimeMillis()} when * the execution graph transitioned into a certain state. The index into this array is the * ordinal of the enum value, i.e. the timestamp when the graph went into state "RUNNING" is * at {@code stateTimestamps[RUNNING.ordinal()]}. */ private final long[] stateTimestamps; /** The timeout for all messages that require a response/acknowledgement */ private final FiniteDuration timeout; // ------ Configuration of the Execution ------- /** Flag to indicate whether the scheduler may queue tasks for execution, or needs to be able * to deploy them immediately. */ private boolean allowQueuedScheduling = false; /** The mode of scheduling. Decides how to select the initial set of tasks to be deployed. * May indicate to deploy all sources, or to deploy everything, or to deploy via backtracking * from results than need to be materialized. */ private ScheduleMode scheduleMode = ScheduleMode.FROM_SOURCES; /** Flag to indicate whether the Graph has been archived */ private boolean isArchived = false; // ------ Execution status and progress. These values are volatile, and accessed under the lock ------- /** Current status of the job execution */ private volatile JobStatus state = JobStatus.CREATED; /** The exception that caused the job to fail. 
This is set to the first root exception * that was not recoverable and triggered job failure */ private volatile Throwable failureCause; /** The number of job vertices that have reached a terminal state */ private volatile int numFinishedJobVertices; // ------ Fields that are relevant to the execution and need to be cleared before archiving ------- /** The scheduler to use for scheduling new tasks as they are needed */ @SuppressWarnings("NonSerializableFieldInSerializableClass") private Scheduler scheduler; /** Strategy to use for restarts */ @SuppressWarnings("NonSerializableFieldInSerializableClass") private RestartStrategy restartStrategy; /** The classloader for the user code. Needed for calls into user code classes */ @SuppressWarnings("NonSerializableFieldInSerializableClass") private ClassLoader userClassLoader; /** The coordinator for checkpoints, if snapshot checkpoints are enabled */ @SuppressWarnings("NonSerializableFieldInSerializableClass") private CheckpointCoordinator checkpointCoordinator; /** The coordinator for savepoints, if snapshot checkpoints are enabled */ private transient SavepointCoordinator savepointCoordinator; /** Checkpoint stats tracker seperate from the coordinator in order to be * available after archiving. */ private CheckpointStatsTracker checkpointStatsTracker; /** The execution context which is used to execute futures. */ @SuppressWarnings("NonSerializableFieldInSerializableClass") private ExecutionContext executionContext; // ------ Fields that are only relevant for archived execution graphs ------------ private ExecutionConfig executionConfig; private String jsonPlan; // -------------------------------------------------------------------------------------------- // Constructors // -------------------------------------------------------------------------------------------- /** * This constructor is for tests only, because it does not include class loading information. 
*/ ExecutionGraph( ExecutionContext executionContext, JobID jobId, String jobName, Configuration jobConfig, FiniteDuration timeout, RestartStrategy restartStrategy) { this( executionContext, jobId, jobName, jobConfig, timeout, restartStrategy, new ArrayList(), new ArrayList(), ExecutionGraph.class.getClassLoader() ); } public ExecutionGraph( ExecutionContext executionContext, JobID jobId, String jobName, Configuration jobConfig, FiniteDuration timeout, RestartStrategy restartStrategy, List requiredJarFiles, List requiredClasspaths, ClassLoader userClassLoader) { checkNotNull(executionContext); checkNotNull(jobId); checkNotNull(jobName); checkNotNull(jobConfig); checkNotNull(userClassLoader); this.executionContext = executionContext; this.jobID = jobId; this.jobName = jobName; this.jobConfiguration = jobConfig; this.userClassLoader = userClassLoader; this.tasks = new ConcurrentHashMap(); this.intermediateResults = new ConcurrentHashMap(); this.verticesInCreationOrder = new ArrayList(); this.currentExecutions = new ConcurrentHashMap(); this.jobStatusListenerActors = new CopyOnWriteArrayList(); this.executionListenerActors = new CopyOnWriteArrayList(); this.stateTimestamps = new long[JobStatus.values().length]; this.stateTimestamps[JobStatus.CREATED.ordinal()] = System.currentTimeMillis(); this.requiredJarFiles = requiredJarFiles; this.requiredClasspaths = requiredClasspaths; this.timeout = timeout; this.restartStrategy = restartStrategy; } // -------------------------------------------------------------------------------------------- // Configuration of Data-flow wide execution settings // -------------------------------------------------------------------------------------------- /** * Gets the number of job vertices currently held by this execution graph. * @return The current number of job vertices. 
*/ public int getNumberOfExecutionJobVertices() { return this.verticesInCreationOrder.size(); } public boolean isQueuedSchedulingAllowed() { return this.allowQueuedScheduling; } public void setQueuedSchedulingAllowed(boolean allowed) { this.allowQueuedScheduling = allowed; } public void setScheduleMode(ScheduleMode scheduleMode) { this.scheduleMode = scheduleMode; } public ScheduleMode getScheduleMode() { return scheduleMode; } public boolean isArchived() { return isArchived; } public void enableSnapshotCheckpointing( long interval, long checkpointTimeout, long minPauseBetweenCheckpoints, int maxConcurrentCheckpoints, List verticesToTrigger, List verticesToWaitFor, List verticesToCommitTo, ActorSystem actorSystem, UUID leaderSessionID, CheckpointIDCounter checkpointIDCounter, CompletedCheckpointStore completedCheckpointStore, RecoveryMode recoveryMode, StateStore savepointStore, CheckpointStatsTracker statsTracker) throws Exception { // simple sanity checks if (interval < 10 || checkpointTimeout < 10) { throw new IllegalArgumentException(); } if (state != JobStatus.CREATED) { throw new IllegalStateException("Job must be in CREATED state"); } ExecutionVertex[] tasksToTrigger = collectExecutionVertices(verticesToTrigger); ExecutionVertex[] tasksToWaitFor = collectExecutionVertices(verticesToWaitFor); ExecutionVertex[] tasksToCommitTo = collectExecutionVertices(verticesToCommitTo); // disable to make sure existing checkpoint coordinators are cleared disableSnaphotCheckpointing(); checkpointStatsTracker = Objects.requireNonNull(statsTracker, "Checkpoint stats tracker"); // create the coordinator that triggers and commits checkpoints and holds the state checkpointCoordinator = new CheckpointCoordinator( jobID, interval, checkpointTimeout, minPauseBetweenCheckpoints, maxConcurrentCheckpoints, tasksToTrigger, tasksToWaitFor, tasksToCommitTo, userClassLoader, checkpointIDCounter, completedCheckpointStore, recoveryMode, checkpointStatsTracker); // the periodic checkpoint scheduler is activated and deactivated as a result of // job status changes (running -> on, all other states -> off) registerJobStatusListener( checkpointCoordinator.createActivatorDeactivator(actorSystem, leaderSessionID)); // Savepoint Coordinator savepointCoordinator = new SavepointCoordinator( jobID, interval, checkpointTimeout, tasksToTrigger, tasksToWaitFor, tasksToCommitTo, userClassLoader, // Important: this counter needs to be shared with the periodic // checkpoint coordinator. checkpointIDCounter, savepointStore, checkpointStatsTracker); registerJobStatusListener(savepointCoordinator .createActivatorDeactivator(actorSystem, leaderSessionID)); } /** * Disables checkpointing. * *

The shutdown of the checkpoint coordinator might block. Make sure that calls to this * method don't block the job manager actor and run asynchronously. */ public void disableSnaphotCheckpointing() throws Exception { if (state != JobStatus.CREATED) { throw new IllegalStateException("Job must be in CREATED state"); } if (checkpointCoordinator != null) { checkpointCoordinator.shutdown(); checkpointCoordinator = null; checkpointStatsTracker = null; } if (savepointCoordinator != null) { savepointCoordinator.shutdown(); savepointCoordinator = null; } } public CheckpointCoordinator getCheckpointCoordinator() { return checkpointCoordinator; } public SavepointCoordinator getSavepointCoordinator() { return savepointCoordinator; } public RestartStrategy getRestartStrategy() { return restartStrategy; } public CheckpointStatsTracker getCheckpointStatsTracker() { return checkpointStatsTracker; } private ExecutionVertex[] collectExecutionVertices(List jobVertices) { if (jobVertices.size() == 1) { ExecutionJobVertex jv = jobVertices.get(0); if (jv.getGraph() != this) { throw new IllegalArgumentException("Can only use ExecutionJobVertices of this ExecutionGraph"); } return jv.getTaskVertices(); } else { ArrayList all = new ArrayList(); for (ExecutionJobVertex jv : jobVertices) { if (jv.getGraph() != this) { throw new IllegalArgumentException("Can only use ExecutionJobVertices of this ExecutionGraph"); } all.addAll(Arrays.asList(jv.getTaskVertices())); } return all.toArray(new ExecutionVertex[all.size()]); } } // -------------------------------------------------------------------------------------------- // Properties and Status of the Execution Graph // -------------------------------------------------------------------------------------------- /** * Returns a list of BLOB keys referring to the JAR files required to run this job * @return list of BLOB keys referring to the JAR files required to run this job */ public List getRequiredJarFiles() { return this.requiredJarFiles; } /** * Returns a list of classpaths referring to the directories/JAR files required to run this job * @return list of classpaths referring to the directories/JAR files required to run this job */ public List getRequiredClasspaths() { return this.requiredClasspaths; } // -------------------------------------------------------------------------------------------- public void setJsonPlan(String jsonPlan) { this.jsonPlan = jsonPlan; } public String getJsonPlan() { return jsonPlan; } public Scheduler getScheduler() { return scheduler; } public JobID getJobID() { return jobID; } public String getJobName() { return jobName; } public boolean isStoppable() { return this.isStoppable; } public Configuration getJobConfiguration() { return jobConfiguration; } public ClassLoader getUserClassLoader() { return this.userClassLoader; } public JobStatus getState() { return state; } public Throwable getFailureCause() { return failureCause; } public ExecutionJobVertex getJobVertex(JobVertexID id) { return this.tasks.get(id); } public Map getAllVertices() { return Collections.unmodifiableMap(this.tasks); } public Iterable getVerticesTopologically() { // we return a specific iterator that does not fail with concurrent modifications // the list is append only, so it is safe for that final int numElements = this.verticesInCreationOrder.size(); return new Iterable() { @Override public Iterator iterator() { return new Iterator() { private int pos = 0; @Override public boolean hasNext() { return pos < numElements; } @Override public ExecutionJobVertex next() { 
if (hasNext()) { return verticesInCreationOrder.get(pos++); } else { throw new NoSuchElementException(); } } @Override public void remove() { throw new UnsupportedOperationException(); } }; } }; } public Map getAllIntermediateResults() { return Collections.unmodifiableMap(this.intermediateResults); } public Iterable getAllExecutionVertices() { return new Iterable() { @Override public Iterator iterator() { return new AllVerticesIterator(getVerticesTopologically().iterator()); } }; } public long getStatusTimestamp(JobStatus status) { return this.stateTimestamps[status.ordinal()]; } /** * Returns the ExecutionContext associated with this ExecutionGraph. * * @return ExecutionContext associated with this ExecutionGraph */ public ExecutionContext getExecutionContext() { return executionContext; } /** * Gets the internal flink accumulator map of maps which contains some metrics. * @return A map of accumulators for every executed task. */ public Map>> getFlinkAccumulators() { Map>> flinkAccumulators = new HashMap>>(); for (ExecutionVertex vertex : getAllExecutionVertices()) { Map> taskAccs = vertex.getCurrentExecutionAttempt().getFlinkAccumulators(); flinkAccumulators.put(vertex.getCurrentExecutionAttempt().getAttemptId(), taskAccs); } return flinkAccumulators; } /** * Merges all accumulator results from the tasks previously executed in the Executions. * @return The accumulator map */ public Map> aggregateUserAccumulators() { Map> userAccumulators = new HashMap>(); for (ExecutionVertex vertex : getAllExecutionVertices()) { Map> next = vertex.getCurrentExecutionAttempt().getUserAccumulators(); if (next != null) { AccumulatorHelper.mergeInto(userAccumulators, next); } } return userAccumulators; } /** * Gets a serialized accumulator map. * @return The accumulator map with serialized accumulator values. * @throws IOException */ public Map> getAccumulatorsSerialized() throws IOException { Map> accumulatorMap = aggregateUserAccumulators(); Map> result = new HashMap>(); for (Map.Entry> entry : accumulatorMap.entrySet()) { result.put(entry.getKey(), new SerializedValue(entry.getValue().getLocalValue())); } return result; } /** * Returns the a stringified version of the user-defined accumulators. 
     * @return an Array containing the StringifiedAccumulatorResult objects
     */
    public StringifiedAccumulatorResult[] getAccumulatorResultsStringified() {
        Map<String, Accumulator<?, ?>> accumulatorMap = aggregateUserAccumulators();
        return StringifiedAccumulatorResult.stringifyAccumulatorResults(accumulatorMap);
    }

    // --------------------------------------------------------------------------------------------
    //  Actions
    // --------------------------------------------------------------------------------------------

    public void attachJobGraph(List<JobVertex> topologiallySorted) throws JobException {
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("Attaching %d topologically sorted vertices to existing job graph with %d "
                    + "vertices and %d intermediate results.", topologiallySorted.size(), tasks.size(), intermediateResults.size()));
        }

        final long createTimestamp = System.currentTimeMillis();

        for (JobVertex jobVertex : topologiallySorted) {

            if (jobVertex.isInputVertex() && !jobVertex.isStoppable()) {
                this.isStoppable = false;
            }

            // create the execution job vertex and attach it to the graph
            ExecutionJobVertex ejv = new ExecutionJobVertex(this, jobVertex, 1, timeout, createTimestamp);
            ejv.connectToPredecessors(this.intermediateResults);

            ExecutionJobVertex previousTask = this.tasks.putIfAbsent(jobVertex.getID(), ejv);
            if (previousTask != null) {
                throw new JobException(String.format("Encountered two job vertices with ID %s : previous=[%s] / new=[%s]",
                        jobVertex.getID(), ejv, previousTask));
            }

            for (IntermediateResult res : ejv.getProducedDataSets()) {
                IntermediateResult previousDataSet = this.intermediateResults.putIfAbsent(res.getId(), res);
                if (previousDataSet != null) {
                    throw new JobException(String.format("Encountered two intermediate data set with ID %s : previous=[%s] / new=[%s]",
                            res.getId(), res, previousDataSet));
                }
            }

            this.verticesInCreationOrder.add(ejv);
        }
    }

    public void scheduleForExecution(Scheduler scheduler) throws JobException {
        if (scheduler == null) {
            throw new IllegalArgumentException("Scheduler must not be null.");
        }

        if (this.scheduler != null && this.scheduler != scheduler) {
            throw new IllegalArgumentException("Cannot use different schedulers for the same job");
        }

        if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) {
            this.scheduler = scheduler;

            switch (scheduleMode) {

                case FROM_SOURCES:
                    // simply take the vertices without inputs.
                    for (ExecutionJobVertex ejv : this.tasks.values()) {
                        if (ejv.getJobVertex().isInputVertex()) {
                            ejv.scheduleAll(scheduler, allowQueuedScheduling);
                        }
                    }
                    break;

                case ALL:
                    for (ExecutionJobVertex ejv : getVerticesTopologically()) {
                        ejv.scheduleAll(scheduler, allowQueuedScheduling);
                    }
                    break;

                case BACKTRACKING:
                    // go back from vertices that need computation to the ones we need to run
                    throw new JobException("BACKTRACKING is currently not supported as schedule mode.");

                default:
                    throw new JobException("Schedule mode is invalid.");
            }
        }
        else {
            throw new IllegalStateException("Job may only be scheduled from state " + JobStatus.CREATED);
        }
    }

    public void cancel() {
        while (true) {
            JobStatus current = state;

            if (current == JobStatus.RUNNING || current == JobStatus.CREATED) {
                if (transitionState(current, JobStatus.CANCELLING)) {
                    for (ExecutionJobVertex ejv : verticesInCreationOrder) {
                        ejv.cancel();
                    }
                    return;
                }
            }
            // Executions are being canceled. Go into cancelling and wait for
            // all vertices to be in their final state.
            else if (current == JobStatus.FAILING) {
                if (transitionState(current, JobStatus.CANCELLING)) {
                    return;
                }
            }
            // All vertices have been cancelled and it's safe to directly go
            // into the canceled state.
            else if (current == JobStatus.RESTARTING) {
                synchronized (progressLock) {
                    if (transitionState(current, JobStatus.CANCELED)) {
                        postRunCleanup();
                        progressLock.notifyAll();

                        LOG.info("Canceled during restart.");
                        return;
                    }
                }
            }
            else {
                // no need to treat other states
                return;
            }
        }
    }

    public void stop() throws StoppingException {
        if(this.isStoppable) {
            for(ExecutionVertex ev : this.getAllExecutionVertices()) {
                if(ev.getNumberOfInputs() == 0) { // send signal to sources only
                    ev.stop();
                }
            }
        } else {
            throw new StoppingException("This job is not stoppable.");
        }
    }

    public void fail(Throwable t) {
        while (true) {
            JobStatus current = state;
            if (current == JobStatus.FAILING || current.isTerminalState()) {
                return;
            }
            else if (transitionState(current, JobStatus.FAILING, t)) {
                this.failureCause = t;

                if (!verticesInCreationOrder.isEmpty()) {
                    // cancel all. what is failed will not cancel but stay failed
                    for (ExecutionJobVertex ejv : verticesInCreationOrder) {
                        ejv.cancel();
                    }
                }
                else {
                    // set the state of the job to failed
                    transitionState(JobStatus.FAILING, JobStatus.FAILED, t);
                }

                return;
            }

            // no need to treat other states
        }
    }

    public void restart() {
        try {
            synchronized (progressLock) {
                JobStatus current = state;

                if (current == JobStatus.CANCELED) {
                    LOG.info("Canceled job during restart. Aborting restart.");
                    return;
                } else if (current != JobStatus.RESTARTING) {
                    throw new IllegalStateException("Can only restart job from state restarting.");
                }

                if (scheduler == null) {
                    throw new IllegalStateException("The execution graph has not been scheduled before - scheduler is null.");
                }

                this.currentExecutions.clear();

                Collection<CoLocationGroup> colGroups = new HashSet<>();

                for (ExecutionJobVertex jv : this.verticesInCreationOrder) {

                    CoLocationGroup cgroup = jv.getCoLocationGroup();
                    if(cgroup != null && !colGroups.contains(cgroup)){
                        cgroup.resetConstraints();
                        colGroups.add(cgroup);
                    }

                    jv.resetForNewExecution();
                }

                for (int i = 0; i < stateTimestamps.length; i++) {
                    stateTimestamps[i] = 0;
                }
                numFinishedJobVertices = 0;
                transitionState(JobStatus.RESTARTING, JobStatus.CREATED);

                // if we have checkpointed state, reload it into the executions
                if (checkpointCoordinator != null) {
                    boolean restored = checkpointCoordinator
                            .restoreLatestCheckpointedState(getAllVertices(), false, false);

                    // TODO(uce) Temporary work around to restore initial state on
                    // failure during recovery. Will be superseded by FLINK-3397.
                    if (!restored && savepointCoordinator != null) {
                        String savepointPath = savepointCoordinator.getSavepointRestorePath();
                        if (savepointPath != null) {
                            savepointCoordinator.restoreSavepoint(getAllVertices(), savepointPath);
                        }
                    }
                }
            }

            scheduleForExecution(scheduler);
        }
        catch (Throwable t) {
            fail(t);
        }
    }

    /**
     * Restores the latest checkpointed state.
     *
     * The recovery of checkpoints might block. Make sure that calls to this method don't
     * block the job manager actor and run asynchronously.
     *
     */
    public void restoreLatestCheckpointedState() throws Exception {
        synchronized (progressLock) {
            if (checkpointCoordinator != null) {
                checkpointCoordinator.restoreLatestCheckpointedState(getAllVertices(), false, false);
            }
        }
    }

    /**
     * Restores the execution state back to a savepoint.
     *
     * The execution vertices need to be in state {@link ExecutionState#CREATED} when calling
     * this method. The operation might block. Make sure that calls don't block the job manager
     * actor.
     *
     * @param savepointPath The path of the savepoint to rollback to.
     * @throws IllegalStateException If checkpointing is disabled
     * @throws IllegalStateException If checkpoint coordinator is shut down
     * @throws Exception If failure during rollback
     */
    public void restoreSavepoint(String savepointPath) throws Exception {
        synchronized (progressLock) {
            if (savepointCoordinator != null) {
                LOG.info("Restoring savepoint: " + savepointPath + ".");

                savepointCoordinator.restoreSavepoint(
                        getAllVertices(), savepointPath);
            }
            else {
                // Sanity check
                throw new IllegalStateException("Checkpointing disabled.");
            }
        }
    }

    /**
     * This method cleans fields that are irrelevant for the archived execution attempt.
     */
    public void prepareForArchiving() {
        if (!state.isTerminalState()) {
            throw new IllegalStateException("Can only archive the job from a terminal state");
        }

        // "unpack" execution config before we throw away the usercode classloader.
        try {
            executionConfig = (ExecutionConfig) InstantiationUtil.readObjectFromConfig(jobConfiguration, ExecutionConfig.CONFIG_KEY,userClassLoader);
        } catch (Exception e) {
            LOG.warn("Error deserializing the execution config while archiving the execution graph", e);
        }

        // clear the non-serializable fields
        userClassLoader = null;
        scheduler = null;
        checkpointCoordinator = null;
        executionContext = null;

        for (ExecutionJobVertex vertex : verticesInCreationOrder) {
            vertex.prepareForArchiving();
        }

        intermediateResults.clear();
        currentExecutions.clear();
        requiredJarFiles.clear();
        jobStatusListenerActors.clear();
        executionListenerActors.clear();

        isArchived = true;
    }

    public ExecutionConfig getExecutionConfig() {
        return this.executionConfig;
    }

    /**
     * For testing: This waits until the job execution has finished.
     * @throws InterruptedException
     */
    public void waitUntilFinished() throws InterruptedException {
        synchronized (progressLock) {
            while (!state.isTerminalState()) {
                progressLock.wait();
            }
        }
    }

    private boolean transitionState(JobStatus current, JobStatus newState) {
        return transitionState(current, newState, null);
    }

    private boolean transitionState(JobStatus current, JobStatus newState, Throwable error) {
        if (STATE_UPDATER.compareAndSet(this, current, newState)) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("{} switched from {} to {}.", this.getJobName(), current, newState);
            }

            stateTimestamps[newState.ordinal()] = System.currentTimeMillis();
            notifyJobStatusChange(newState, error);
            return true;
        }
        else {
            return false;
        }
    }

    void jobVertexInFinalState() {
        synchronized (progressLock) {
            if (numFinishedJobVertices >= verticesInCreationOrder.size()) {
                throw new IllegalStateException("All vertices are already finished, cannot transition vertex to finished.");
            }

            numFinishedJobVertices++;

            if (numFinishedJobVertices == verticesInCreationOrder.size()) {

                // we are done, transition to the final state
                JobStatus current;
                while (true) {
                    current = this.state;

                    if (current == JobStatus.RUNNING) {
                        if (transitionState(current, JobStatus.FINISHED)) {
                            postRunCleanup();
                            break;
                        }
                    }
                    else if (current == JobStatus.CANCELLING) {
                        if (transitionState(current, JobStatus.CANCELED)) {
                            postRunCleanup();
                            break;
                        }
                    }
                    else if (current == JobStatus.FAILING) {
                        boolean isRecoverable = !(failureCause instanceof UnrecoverableException);

                        if (isRecoverable && restartStrategy.canRestart() &&
                                transitionState(current, JobStatus.RESTARTING)) {
                            restartStrategy.restart(this);
                            break;
                        } else if ((!isRecoverable || !restartStrategy.canRestart()) &&
                                transitionState(current, JobStatus.FAILED, failureCause)) {
                            postRunCleanup();
                            break;
                        }
                    }
                    else {
                        fail(new Exception("ExecutionGraph went into final state from state " + current));
                    }
                }
                // done transitioning the state

                // also, notify waiters
                progressLock.notifyAll();
            }
        }
    }

    private void postRunCleanup() {
        try {
            CheckpointCoordinator coord = this.checkpointCoordinator;
            this.checkpointCoordinator = null;
            if (coord != null) {
                coord.shutdown();
            }

            // We don't clean the checkpoint stats tracker, because we want
            // it to be available after the job has terminated.
        } catch (Exception e) {
            LOG.error("Error while cleaning up after execution", e);
        }

        try {
            CheckpointCoordinator coord = this.savepointCoordinator;
            this.savepointCoordinator = null;
            if (coord != null) {
                coord.shutdown();
            }
        } catch (Exception e) {
            LOG.error("Error while cleaning up after execution", e);
        }
    }

    // --------------------------------------------------------------------------------------------
    //  Callbacks and Callback Utilities
    // --------------------------------------------------------------------------------------------

    /**
     * Updates the state of one of the ExecutionVertex's Execution attempts.
     * If the new status if "FINISHED", this also updates the
     *
     * @param state The state update.
     * @return True, if the task update was properly applied, false, if the execution attempt was not found.
     */
    public boolean updateState(TaskExecutionState state) {
        Execution attempt = this.currentExecutions.get(state.getID());
        if (attempt != null) {

            switch (state.getExecutionState()) {
                case RUNNING:
                    return attempt.switchToRunning();
                case FINISHED:
                    try {
                        AccumulatorSnapshot accumulators = state.getAccumulators();
                        Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> flinkAccumulators =
                                accumulators.deserializeFlinkAccumulators();
                        Map<String, Accumulator<?, ?>> userAccumulators =
                                accumulators.deserializeUserAccumulators(userClassLoader);
                        attempt.markFinished(flinkAccumulators, userAccumulators);
                    }
                    catch (Exception e) {
                        LOG.error("Failed to deserialize final accumulator results.", e);
                        attempt.markFailed(e);
                    }
                    return true;
                case CANCELED:
                    attempt.cancelingComplete();
                    return true;
                case FAILED:
                    attempt.markFailed(state.getError(userClassLoader));
                    return true;
                default:
                    // we mark as failed and return false, which triggers the TaskManager
                    // to remove the task
                    attempt.fail(new Exception("TaskManager sent illegal state update: " + state.getExecutionState()));
                    return false;
            }
        }
        else {
            return false;
        }
    }

    public void scheduleOrUpdateConsumers(ResultPartitionID partitionId) {

        final Execution execution = currentExecutions.get(partitionId.getProducerId());

        if (execution == null) {
            fail(new IllegalStateException("Cannot find execution for execution ID " +
                    partitionId.getPartitionId()));
        }
        else if (execution.getVertex() == null){
            fail(new IllegalStateException("Execution with execution ID " +
                    partitionId.getPartitionId() + " has no vertex assigned."));
        } else {
            execution.getVertex().scheduleOrUpdateConsumers(partitionId);
        }
    }

    public Map<ExecutionAttemptID, Execution> getRegisteredExecutions() {
        return Collections.unmodifiableMap(currentExecutions);
    }

    void registerExecution(Execution exec) {
        Execution previous = currentExecutions.putIfAbsent(exec.getAttemptId(), exec);
        if (previous != null) {
            fail(new Exception("Trying to register execution " + exec + " for already used ID " + exec.getAttemptId()));
        }
    }

    void deregisterExecution(Execution exec) {
        Execution contained = currentExecutions.remove(exec.getAttemptId());

        if (contained != null && contained != exec) {
            fail(new Exception("De-registering execution " + exec + " failed. Found for same ID execution " + contained));
        }
    }

    /**
     * Updates the accumulators during the runtime of a job. Final accumulator results are transferred
     * through the UpdateTaskExecutionState message.
     * @param accumulatorSnapshot The serialized flink and user-defined accumulators
     */
    public void updateAccumulators(AccumulatorSnapshot accumulatorSnapshot) {
        Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> flinkAccumulators;
        Map<String, Accumulator<?, ?>> userAccumulators;
        try {
            flinkAccumulators = accumulatorSnapshot.deserializeFlinkAccumulators();
            userAccumulators = accumulatorSnapshot.deserializeUserAccumulators(userClassLoader);

            ExecutionAttemptID execID = accumulatorSnapshot.getExecutionAttemptID();
            Execution execution = currentExecutions.get(execID);
            if (execution != null) {
                execution.setAccumulators(flinkAccumulators, userAccumulators);
            } else {
                LOG.warn("Received accumulator result for unknown execution {}.", execID);
            }
        } catch (Exception e) {
            LOG.error("Cannot update accumulators for job {}.", jobID, e);
        }
    }

    // --------------------------------------------------------------------------------------------
    //  Listeners & Observers
    // --------------------------------------------------------------------------------------------

    public void registerJobStatusListener(ActorGateway listener) {
        if (listener != null) {
            this.jobStatusListenerActors.add(listener);
        }
    }

    public void registerExecutionListener(ActorGateway listener) {
        if (listener != null) {
            this.executionListenerActors.add(listener);
        }
    }

    private void notifyJobStatusChange(JobStatus newState, Throwable error) {
        if (jobStatusListenerActors.size() > 0) {
            ExecutionGraphMessages.JobStatusChanged message =
                    new ExecutionGraphMessages.JobStatusChanged(jobID, newState, System.currentTimeMillis(),
                            error == null ? null : new SerializedThrowable(error));

            for (ActorGateway listener: jobStatusListenerActors) {
                listener.tell(message);
            }
        }
    }

    void notifyExecutionChange(JobVertexID vertexId, int subtask, ExecutionAttemptID executionID,
                               ExecutionState newExecutionState, Throwable error) {

        ExecutionJobVertex vertex = getJobVertex(vertexId);

        if (executionListenerActors.size() > 0) {
            String message = error == null ? null : ExceptionUtils.stringifyException(error);
            ExecutionGraphMessages.ExecutionStateChanged actorMessage =
                    new ExecutionGraphMessages.ExecutionStateChanged(jobID, vertexId, vertex.getJobVertex().getName(),
                            vertex.getParallelism(), subtask, executionID, newExecutionState,
                            System.currentTimeMillis(), message);

            for (ActorGateway listener : executionListenerActors) {
                listener.tell(actorMessage);
            }
        }

        // see what this means for us. currently, the first FAILED state means -> FAILED
        if (newExecutionState == ExecutionState.FAILED) {
            fail(error);
        }
    }
}
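The transitionState method above drives the job-level state machine (CREATED -> RUNNING -> FINISHED/CANCELED/FAILED/RESTARTING) with a lock-free compare-and-swap on the volatile state field. Below is a minimal, self-contained sketch of that pattern; it is not Flink code, and the class and enum names are hypothetical.

import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

// Minimal sketch (not Flink source): the lock-free state machine pattern behind
// ExecutionGraph.transitionState(), using an AtomicReferenceFieldUpdater.
// All names here (StateMachineSketch, Status) are hypothetical.
public class StateMachineSketch {

    // Reduced status set, mirroring a few of the JobStatus values used above.
    enum Status { CREATED, RUNNING, FAILING, FAILED, FINISHED }

    private static final AtomicReferenceFieldUpdater<StateMachineSketch, Status> STATE_UPDATER =
            AtomicReferenceFieldUpdater.newUpdater(StateMachineSketch.class, Status.class, "state");

    private volatile Status state = Status.CREATED;

    // The transition only succeeds if the field still holds 'expected' (CAS),
    // so of several concurrent callers at most one wins a given transition.
    boolean transitionState(Status expected, Status newState) {
        if (STATE_UPDATER.compareAndSet(this, expected, newState)) {
            System.out.println("switched from " + expected + " to " + newState);
            return true;
        }
        return false;
    }

    public static void main(String[] args) {
        StateMachineSketch graph = new StateMachineSketch();
        System.out.println(graph.transitionState(Status.CREATED, Status.RUNNING));  // true
        System.out.println(graph.transitionState(Status.CREATED, Status.FAILING));  // false: no longer CREATED
        System.out.println(graph.transitionState(Status.RUNNING, Status.FINISHED)); // true
    }
}

Because the CAS only succeeds while the caller still observes the expected state, methods such as cancel() and fail() can simply retry in a while(true) loop without holding a lock, and only one of several concurrent transitions out of the same state takes effect.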

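getVerticesTopologically() hands out an iterator over the append-only verticesInCreationOrder list by freezing the list size at creation time, so iteration is not disturbed when new vertices are appended afterwards. A minimal sketch of the same idea follows; it is not Flink code and the names are hypothetical.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

// Minimal sketch (not Flink source): the snapshot-iterator pattern used by
// getVerticesTopologically(). Freezing the size up front yields an iterator over
// the elements that existed when the view was created; unlike ArrayList's own
// iterator it does not throw ConcurrentModificationException if elements are
// appended in between.
public class SnapshotIterableSketch {

    private final List<String> appendOnly = new ArrayList<>();

    void add(String vertexName) {
        appendOnly.add(vertexName);
    }

    Iterable<String> snapshotView() {
        final int numElements = appendOnly.size(); // freeze the visible range
        return () -> new Iterator<String>() {
            private int pos = 0;

            @Override
            public boolean hasNext() {
                return pos < numElements;
            }

            @Override
            public String next() {
                if (!hasNext()) {
                    throw new NoSuchElementException();
                }
                return appendOnly.get(pos++);
            }
        };
    }

    public static void main(String[] args) {
        SnapshotIterableSketch graph = new SnapshotIterableSketch();
        graph.add("source");
        graph.add("map");

        Iterable<String> view = graph.snapshotView();
        graph.add("sink"); // appended after the view was created

        for (String name : view) {
            System.out.println(name); // prints "source" and "map" only
        }
    }
}

This only stays safe because the backing list is append-only, as the comment in getVerticesTopologically() notes; removals or in-place replacements would invalidate the frozen index range.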




