The main steps of submitJob, summarized up front:

1. Validate the JobGraph, register its user JARs and classpaths with the libraryCacheManager, and obtain the user-code classloader.
2. Resolve the restart strategy: a strategy configured in the job's ExecutionConfig takes precedence over the JobManager's default factory.
3. Build the ExecutionGraph via ExecutionGraphBuilder.buildGraph (reusing a cached graph if the job is already known) and cache it in currentJobs.
4. Create the GraphManager (GraphManagerPlugin plus OperationLogManager) and attach it to the ExecutionGraph.
5. Register job status and execution listeners, both for the JobManager itself and for clients that asked for state-change notifications.
6. Restore state if this is a master-failure recovery or if the job requests a savepoint restore.
7. Acknowledge the submission to the client and, if this JobManager still holds leadership, kick off scheduling via executionGraph.scheduleForExecution().

Now let's go through it step by step:
case SubmitJob(jobGraph, listeningBehaviour) =>
  val client = sender()

  val jobInfo = new JobInfo(client, listeningBehaviour, System.currentTimeMillis(),
    jobGraph.getSessionTimeout)

  submitJob(jobGraph, jobInfo)

// inside the private submitJob(jobGraph, jobInfo): a null JobGraph is rejected right away
if (jobGraph == null) {
  jobInfo.notifyClients(
    decorateMessage(JobResultFailure(
      new SerializedThrowable(
        new JobSubmissionException(null, "JobGraph must not be null.")))))
}
// register the job's user JARs and classpaths, then obtain the user-code classloader
libraryCacheManager.registerJob(
  jobGraph.getJobID, jobGraph.getUserJarBlobKeys, jobGraph.getClasspaths)
val userCodeLoader = libraryCacheManager.getClassLoader(jobGraph.getJobID)
...
// prefer the restart strategy from the job's ExecutionConfig; fall back to the
// JobManager's default factory if the job does not set one
val restartStrategy =
  Option(jobGraph.getSerializedExecutionConfig()
    .deserializeValue(userCodeLoader)
    .getRestartStrategy())
    .map(RestartStrategyFactory.createRestartStrategy)
    .filter(p => p != null) match {
    case Some(strategy) => strategy
    case None => restartStrategyFactory.createRestartStrategy()
  }
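The job-level strategy that takes precedence here is whatever the user configured on the ExecutionConfig when building the job; it reaches the JobManager inside the serialized execution config carried by the JobGraph. For reference, with the standard Flink API that looks like this (the fixed-delay strategy is just an example):

import java.util.concurrent.TimeUnit;

import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class RestartStrategyExample {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // ends up in the serialized ExecutionConfig carried by the JobGraph, so the
        // JobManager picks it over its own default restart strategy factory
        env.setRestartStrategy(
            RestartStrategies.fixedDelayRestart(3, Time.of(10, TimeUnit.SECONDS)));
    }
}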
// if the job is already known, reuse the cached ExecutionGraph instead of registering a new one
val registerNewGraph = currentJobs.get(jobGraph.getJobID) match {
  case Some((graph, currentJobInfo)) =>
    executionGraph = graph
    currentJobInfo.setLastActive()
    false
  case None =>
    true
}
val allocationTimeout: Long = flinkConfiguration.getLong(
JobManagerOptions.SLOT_REQUEST_TIMEOUT)
val resultPartitionLocationTrackerProxy: ResultPartitionLocationTrackerProxy =
new ResultPartitionLocationTrackerProxy(flinkConfiguration)
executionGraph = ExecutionGraphBuilder.buildGraph(
executionGraph,
jobGraph,
flinkConfiguration,
futureExecutor,
ioExecutor,
scheduler,
userCodeLoader,
checkpointRecoveryFactory,
Time.of(timeout.length, timeout.unit),
restartStrategy,
jobMetrics,
numSlots,
blobServer,
resultPartitionLocationTrackerProxy,
Time.milliseconds(allocationTimeout),
log.logger)
...
// put the newly built graph into the currentJobs cache
if (registerNewGraph) {
currentJobs.put(jobGraph.getJobID, (executionGraph, jobInfo))
}
val conf = new Configuration(jobGraph.getJobConfiguration)
conf.addAll(jobGraph.getSchedulingConfiguration)
// create the GraphManager that will drive scheduling through the configured plugin
val graphManagerPlugin = GraphManagerPluginFactory.createGraphManagerPlugin(
  jobGraph.getSchedulingConfiguration, userCodeLoader)
val operationLogManager = new OperationLogManager(
OperationLogStoreLoader.loadOperationLogStore(jobGraph.getJobID(), conf))
val graphManager =
new GraphManager(graphManagerPlugin, null, operationLogManager, executionGraph)
graphManager.open(jobGraph, new SchedulingConfig(conf, userCodeLoader))
executionGraph.setGraphManager(graphManager)
operationLogManager.start()
executionGraph.registerJobStatusListener(
new StatusListenerMessenger(self, leaderSessionID.orNull))
jobInfo.clients foreach {
// the sender wants to be notified about state changes
case (client, ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES) =>
val listener = new StatusListenerMessenger(client, leaderSessionID.orNull)
executionGraph.registerExecutionListener(listener)
executionGraph.registerJobStatusListener(listener)
case _ => // do nothing
}
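The match above keys off the ListeningBehaviour the client chose at submission time. In the legacy JobManager this is an enum roughly along the following lines (shown only for context):

// how much feedback the submitting client wants to receive
public enum ListeningBehaviour {
    DETACHED,                            // only the submission acknowledgement
    EXECUTION_RESULT,                    // additionally the final job execution result
    EXECUTION_RESULT_AND_STATE_CHANGES   // additionally job/execution state change messages
}

Only clients that asked for EXECUTION_RESULT_AND_STATE_CHANGES get the extra StatusListenerMessenger registered on their behalf.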
if (isRecovery) {
// this is a recovery of a master failure (this master takes over)
executionGraph.restoreLatestCheckpointedState(false, false)
}
val savepointSettings = jobGraph.getSavepointRestoreSettings
if (savepointSettings.restoreSavepoint()) {
try {
val savepointPath = savepointSettings.getRestorePath()
val allowNonRestored = savepointSettings.allowNonRestoredState()
val resumeFromLatestCheckpoint = savepointSettings.resumeFromLatestCheckpoint()
executionGraph.getCheckpointCoordinator.restoreSavepoint(
savepointPath,
allowNonRestored,
resumeFromLatestCheckpoint,
executionGraph.getAllVertices,
executionGraph.getUserClassLoader
)
} catch {
...
}
}
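The SavepointRestoreSettings checked here were attached to the JobGraph on the client side. With the standard open-source Flink API that looks roughly like the sketch below (resumeFromLatestCheckpoint is an extension of this code base and has no counterpart in the sketch); on the command line the equivalent is flink run -s <savepointPath> [-n]:

import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.SavepointRestoreSettings;

public class SavepointRestoreExample {

    public static void attachRestoreSettings(JobGraph jobGraph, String savepointPath) {
        // restore from the given savepoint; allow state that no longer maps to any
        // operator to be skipped instead of failing the restore
        jobGraph.setSavepointRestoreSettings(
            SavepointRestoreSettings.forPath(savepointPath, true));
    }
}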
jobInfo.notifyClients(
decorateMessage(JobSubmitSuccess(jobGraph.getJobID)))
if (leaderSessionID.isDefined &&
leaderElectionService.hasLeadership(leaderSessionID.get)) {
executionGraph.scheduleForExecution()
} else {
self ! decorateMessage(RemoveJob(jobId, removeJobFromStateBackend = false))
log.warn(s"Submitted job $jobId, but not leader. The other leader needs to recover " +
"this. I am not scheduling the job for execution.")
}
Inside executionGraph.scheduleForExecution(), the job is first transitioned from CREATED to RUNNING and scheduling is handed off to the GraphManager:

if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) {
    graphManager.startScheduling();
}

GraphManager.startScheduling() simply logs the plugin in use and delegates to it:
public void startScheduling() {
LOG.info("Start scheduling execution graph with graph manager plugin: {}",
graphManagerPlugin.getClass().getName());
graphManagerPlugin.onSchedulingStarted();
}
Flink provides three GraphManagerPlugin implementations: EagerSchedulingPlugin, RunningUnitGraphManagerPlugin and StepwiseSchedulingPlugin; their onSchedulingStarted() implementations are shown below in that order.
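Before looking at the three implementations, here is a simplified, self-contained sketch of the contract they share, reconstructed only from the calls visible in this walkthrough (creation via GraphManagerPluginFactory, open through the GraphManager, onSchedulingStarted, and the scheduler.scheduleExecutionVertices calls inside the plugins). All names in the sketch are placeholders, not the real interfaces, and the real plugin has further callbacks that are omitted here:

import java.util.Collection;
import java.util.List;

public class GraphManagerPluginSketch {

    // stand-in for the VertexScheduler the plugins call into
    interface VertexScheduler {
        void scheduleExecutionVertices(Collection<String> executionVertexIds);
    }

    // the shape shared by the three plugins: receive a scheduler on open(),
    // then decide what to schedule when onSchedulingStarted() fires
    interface GraphManagerPlugin {
        void open(VertexScheduler scheduler, List<String> allVertexIds);
        void onSchedulingStarted();
    }

    // "eager" behaviour in the spirit of EagerSchedulingPlugin: schedule everything at once
    static class EagerPlugin implements GraphManagerPlugin {
        private VertexScheduler scheduler;
        private List<String> allVertexIds;

        @Override
        public void open(VertexScheduler scheduler, List<String> allVertexIds) {
            this.scheduler = scheduler;
            this.allVertexIds = allVertexIds;
        }

        @Override
        public void onSchedulingStarted() {
            scheduler.scheduleExecutionVertices(allVertexIds);
        }
    }
}

The real implementations follow, in the order listed above: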
// EagerSchedulingPlugin: schedule every execution vertex of the job at once
public void onSchedulingStarted() {
    final List<ExecutionVertexID> verticesToSchedule = new ArrayList<>();
    for (JobVertex vertex : jobGraph.getVerticesSortedTopologicallyFromSources()) {
        for (int i = 0; i < vertex.getParallelism(); i++) {
            verticesToSchedule.add(new ExecutionVertexID(vertex.getID(), i));
        }
    }
    scheduler.scheduleExecutionVertices(verticesToSchedule);
}
// RunningUnitGraphManagerPlugin: schedule by "running unit", starting with the
// units whose dependencies are already satisfied
public void onSchedulingStarted() {
    runningUnitMap.values().stream()
        .filter(LogicalJobVertexRunningUnit::allDependReady)
        .forEach(this::addToScheduleQueue);
    checkScheduleNewRunningUnit();
}
// StepwiseSchedulingPlugin: schedule only the input (source) vertices up front;
// downstream vertices are scheduled later as their inputs produce data
public void onSchedulingStarted() {
    final List<ExecutionVertexID> verticesToSchedule = new ArrayList<>();
    for (JobVertex vertex : jobGraph.getVerticesSortedTopologicallyFromSources()) {
        if (vertex.isInputVertex()) {
            for (int i = 0; i < vertex.getParallelism(); i++) {
                verticesToSchedule.add(new ExecutionVertexID(vertex.getID(), i));
            }
        }
    }
    scheduleOneByOne(verticesToSchedule);
}
The three plugins differ only in the order in which they schedule vertices; the scheduling mechanism itself is identical: all of them ultimately call scheduleExecutionVertices on ExecutionGraphVertexScheduler.
public class ExecutionGraphVertexScheduler implements VertexScheduler {

    public void scheduleExecutionVertices(Collection<ExecutionVertexID> verticesToSchedule) {
        synchronized (executionVerticesToBeScheduled) {
            // while the master is still reconciling with the task managers, only
            // buffer the request instead of scheduling it right away
            if (isReconciling) {
                executionVerticesToBeScheduled.add(verticesToSchedule);
                return;
            }
        }
        executionGraph.scheduleVertices(verticesToSchedule);
    }
}
public void scheduleVertices(Collection<ExecutionVertexID> verticesToSchedule) {
    try {
        ...
        final CompletableFuture<Void> schedulingFuture = schedule(vertices);

        // track the in-flight scheduling future while the job is still RUNNING and no
        // failover has bumped the global mod version; otherwise cancel it right away
        if (state == JobStatus.RUNNING && currentGlobalModVersion == globalModVersion) {
            schedulingFutures.put(schedulingFuture, schedulingFuture);

            schedulingFuture.whenCompleteAsync(
                (Void ignored, Throwable throwable) -> {
                    schedulingFutures.remove(schedulingFuture);
                },
                futureExecutor);
        } else {
            schedulingFuture.cancel(false);
        }
    } catch (Throwable t) {
        ...
    }
}
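The bookkeeping around schedulingFutures is a common pattern: keep every in-flight future in a concurrent map so that a later cancellation path can reach it, and let each future remove itself on completion. A minimal standalone version of the pattern, using only java.util.concurrent with illustrative names:

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

public class InFlightFutureTracking {

    // in-flight futures, keyed by themselves so completion can remove the exact entry
    private final ConcurrentMap<CompletableFuture<Void>, CompletableFuture<Void>> inFlight =
        new ConcurrentHashMap<>();

    public void track(CompletableFuture<Void> future) {
        inFlight.put(future, future);
        // self-removal keeps the map from growing without bound
        future.whenComplete((ignored, throwable) -> inFlight.remove(future));
    }

    // e.g. called when the job leaves RUNNING: cancel whatever is still pending
    public void cancelAll() {
        inFlight.keySet().forEach(f -> f.cancel(false));
    }

    public static void main(String[] args) {
        InFlightFutureTracking tracker = new InFlightFutureTracking();
        CompletableFuture<Void> pending = new CompletableFuture<>();
        tracker.track(pending);
        tracker.cancelAll();
        System.out.println(pending.isCancelled()); // true
    }
}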
Now let's dig into schedule(vertices), the method that actually schedules the vertices, and see what it does.
checkState(state == JobStatus.RUNNING, "job is not running currently");
final boolean queued = allowQueuedScheduling;
List<SlotRequestId> slotRequestIds = new ArrayList<>(vertices.size());
List<ScheduledUnit> scheduledUnits = new ArrayList<>(vertices.size());
List<SlotProfile> slotProfiles = new ArrayList<>(vertices.size());
List<Execution> scheduledExecutions = new ArrayList<>(vertices.size());
// prepare the scheduling resources for each vertex
for (ExecutionVertex ev : vertices) {
    final Execution exec = ev.getCurrentExecutionAttempt();
    try {
        Tuple2<ScheduledUnit, SlotProfile> scheduleUnitAndSlotProfile =
            exec.enterScheduledAndPrepareSchedulingResources();
        slotRequestIds.add(new SlotRequestId());
        scheduledUnits.add(scheduleUnitAndSlotProfile.f0);
        slotProfiles.add(scheduleUnitAndSlotProfile.f1);
        scheduledExecutions.add(exec);
    } catch (IllegalExecutionStateException e) {
        LOG.info("The execution {} may be already scheduled by other thread.",
            ev.getTaskNameWithSubtaskIndex(), e);
    }
}
List<CompletableFuture<LogicalSlot>> allocationFutures =
    slotProvider.allocateSlots(slotRequestIds, scheduledUnits, queued, slotProfiles, allocationTimeout);

List<CompletableFuture<Void>> assignFutures = new ArrayList<>(slotRequestIds.size());
for (int i = 0; i < allocationFutures.size(); i++) {
final int index = i;
allocationFutures.get(i).whenComplete(
(ignore, throwable) -> {
if (throwable != null) {
slotProvider.cancelSlotRequest(
slotRequestIds.get(index),
scheduledUnits.get(index).getSlotSharingGroupId(),
scheduledUnits.get(index).getCoLocationConstraint(),
throwable);
}
}
);
assignFutures.add(allocationFutures.get(i).thenAccept(
    (LogicalSlot logicalSlot) -> {
        // try to hand the allocated slot to the execution; if that fails (e.g. the
        // execution was cancelled in the meantime), release the slot again
        if (!scheduledExecutions.get(index).tryAssignResource(logicalSlot)) {
            // release the slot
            Exception e = new FlinkException("Could not assign logical slot to execution "
                + scheduledExecutions.get(index) + '.');
            logicalSlot.releaseSlot(e);
            throw new CompletionException(e);
        }
    })
);
}
// allAssignFutures (its construction is not shown in this excerpt) combines all assignFutures
CompletableFuture<Void> currentSchedulingFuture = allAssignFutures
    // exception handling: deploy only once every slot assignment has completed;
    // propagate any assignment failure instead
    .handleAsync(
        (Collection ignored, Throwable throwable) -> {
if (throwable != null) {
throw new CompletionException(throwable);
} else {
boolean hasFailure = false;
for (int i = 0; i < scheduledExecutions.size(); i++) {
try {
scheduledExecutions.get(i).deploy();
} catch (Exception e) {
hasFailure = true;
scheduledExecutions.get(i).markFailed(e);
}
}
if (hasFailure) {
throw new CompletionException(
new FlinkException("Fail to deploy some executions."));
}
}
return null;
}, futureExecutor);
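To summarize the asynchronous shape of schedule(): fan out one slot-allocation future per execution, attach a per-future assignment step with cleanup on failure, and deploy only after every assignment has completed. The same control flow, stripped of all Flink types and using only java.util.concurrent (all names are illustrative):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;

public class SchedulingFuturePattern {

    public static void main(String[] args) {
        // fan out: one "slot allocation" per execution (plain strings stand in for slots)
        List<CompletableFuture<String>> allocationFutures = new ArrayList<>();
        for (int i = 0; i < 3; i++) {
            final int index = i;
            allocationFutures.add(CompletableFuture.supplyAsync(() -> "slot-" + index));
        }

        // per-future step: "assign" the slot; a failed assignment surfaces as an exception
        List<CompletableFuture<Void>> assignFutures = new ArrayList<>();
        for (CompletableFuture<String> allocation : allocationFutures) {
            assignFutures.add(allocation.thenAccept(slot -> {
                if (slot == null) {
                    throw new CompletionException(new IllegalStateException("could not assign slot"));
                }
            }));
        }

        // barrier: "deploy" only after every assignment completed; any failure aborts the attempt
        CompletableFuture<Void> schedulingFuture = CompletableFuture
            .allOf(assignFutures.toArray(new CompletableFuture[0]))
            .handleAsync((Void ignored, Throwable throwable) -> {
                if (throwable != null) {
                    throw new CompletionException(throwable);
                }
                allocationFutures.forEach(f -> System.out.println("deploying on " + f.join()));
                return null;
            });

        schedulingFuture.join();
    }
}

In the real code the per-allocation callback additionally cancels the outstanding slot request when an allocation fails, and the deployment loop marks individual executions as failed instead of aborting the whole batch immediately.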
Now let's dive into the deploy method. deploy is responsible for deploying the Execution onto the previously allocated slot: it builds a TaskDeploymentDescriptor and submits the task to the taskManagerGateway, which forwards it to the TaskManager. How the TaskManager handles the SubmitTask message will be analyzed separately.
public void deploy() throws JobException {
... a series of checks ensuring the assigned slot is still available
executor.execute(
() -> {
try {
final TaskDeploymentDescriptor deployment = vertex.createDeploymentDescriptor(
attemptId,
slot,
taskRestore,
attemptNumber);
// null taskRestore to let it be GC'ed
taskRestore = null;
final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
// submit the task to the taskManagerGateway, which forwards it to the TaskManager
final CompletableFuture<Acknowledge> submitResultFuture = taskManagerGateway.submitTask(deployment, rpcTimeout);
...
} catch (Throwable t) {
markFailed(t);
}
}
);
}