// Call-chain excerpt (two separate call sites, not one method body).
// Step 1: the JobGraph is handed to the Dispatcher through its gateway, with an RPC timeout.
dispatcherGateway.submitJob(jobGraph, rpcTimeout)
// Step 2: a JobManagerRunner is created for that JobGraph, stamped with the initialization time.
JobManagerRunner jobManagerRunner = createJobManagerRunner(jobGraph, initializationTimestamp);
Dispatcher负责对集群中的作业进行接收和分发处理。客户端可以通过ClusterClient与Dispatcher建立RPC连接,并将作业提交到集群的Dispatcher服务中。
Dispatcher通过JobGraph对象启动JobManagerRunner服务。
DispatcherRunner负责启动和管理Dispatcher组件,并支持对Dispatcher组件的Leader选举。
当Dispatcher集群组件出现异常并停止时,会通过DispatcherRunner重新选择和启动新的Dispatcher服务,保证Dispatcher高可用。
DispatcherRunner有DefaultDispatcherRunner和DispatcherRunnerLeaderElectionLifecycleManager两种实现。
前者是DispatcherRunner接口的主要实现;
后者实现了DispatcherRunner的LeaderElection生命周期管理,包括使用LeaderElectionService启动和停止DispatcherRunner线程。
DefaultDispatcherRunnerFactory->[SessionDispatcherLeaderProcessFactoryFactory\JobDispatcherLeaderProcessFactoryFactory]
DispatcherLeaderProcess负责管理Dispatcher的生命周期,并提供了对JobGraph的任务恢复功能。
如果基于ZooKeeper实现了集群高可用,DispatcherLeaderProcess会将提交的JobGraph存储在ZooKeeper中,当集群停止或者出现异常时,会通过DispatcherLeaderProcess对集群中的JobGraph进行恢复,这些JobGraph都会被存储在JobGraphStore的实现类中。
在DispatcherLeaderProcess接口中定义了start()方法,用于启动DispatcherLeaderProcess服务,同时提供了获取DispatcherGateway、ShutDownFuture等对象的方法。
在AbstractDispatcherLeaderProcess基本实现类中,主要实现了DispatcherLeaderProcess中的接口方法,并提供了onStart()和onClose()两个抽象方法,交由子类进行具体实现。
在AbstractDispatcherLeaderProcess类中,通过内部类定义了DispatcherGatewayService接口以及获取DispatcherGatewayService的工厂接口。
在SessionDispatcherLeaderProcess实现类中主要实现了与Session集群相关的Dispatcher处理逻辑,主要用于对JobGraphStore中存储的JobGraph进行恢复。
在非高可用集群下,JobGraphStore的实现类为StandaloneJobGraphStore,也就是不对JobGraph进行存储和管理。
在高可用集群中,JobGraphStore基于ZooKeeper存储集群中的JobGraph。
在JobDispatcherLeaderProcess实现类中包含了对单个JobGraph进行创建和提交的方法,因此JobDispatcherLeaderProcess主要涵盖了对单个JobGraph的提交逻辑,不存在JobGraphStore的概念。
JobDispatcherLeaderProcess伴随作业的结束,其生命周期也会同步终止。
DispatcherGatewayService是主要基于Dispatcher实现的GatewayService,用于获取DispatcherGateway。
// Bundle the services handed to the dispatcher runner factory below into a single
// PartialDispatcherServices object. Arguments are positional, so their order matters.
final PartialDispatcherServices partialDispatcherServices =
new PartialDispatcherServices(
configuration,
highAvailabilityServices,
resourceManagerGatewayRetriever,
blobServer,
heartbeatServices,
// Passed as a lambda so the JobManager metric group is only instantiated when invoked.
() ->
MetricUtils.instantiateJobManagerMetricGroup(
metricRegistry, hostname),
executionGraphInfoStore,
fatalErrorHandler,
historyServerArchivist,
metricRegistry.getMetricQueryServiceGatewayRpcAddress(),
ioExecutor);
// Create the DispatcherRunner through the factory, using the leader election service
// obtained from the HA services and a JobGraphStore factory backed by the same HA services.
dispatcherRunner =
dispatcherRunnerFactory.createDispatcherRunner(
highAvailabilityServices.getDispatcherLeaderElectionService(),
fatalErrorHandler,
new HaServicesJobGraphStoreFactory(highAvailabilityServices),
ioExecutor,
rpcService,
partialDispatcherServices);
/**
 * Stores the runner and the leader election service, then starts the election service with
 * the runner as part of construction.
 *
 * @param dispatcherRunner the runner handed to the leader election service
 * @param leaderElectionService the service that is started with the runner
 * @throws Exception if starting the leader election service fails
 */
private DispatcherRunnerLeaderElectionLifecycleManager(
        T dispatcherRunner, LeaderElectionService leaderElectionService) throws Exception {
    this.leaderElectionService = leaderElectionService;
    this.dispatcherRunner = dispatcherRunner;

    // Starting the election is the last construction step; a failure propagates to the caller.
    this.leaderElectionService.start(this.dispatcherRunner);
}
/**
 * Obtains the {@code JobGraphStore} from the high availability services.
 *
 * @return the store returned by {@code highAvailabilityServices.getJobGraphStore()}
 * @throws FlinkRuntimeException wrapping any exception raised while obtaining the store
 */
@Override
public JobGraphStore create() {
    final JobGraphStore store;
    try {
        store = highAvailabilityServices.getJobGraphStore();
    } catch (Exception e) {
        // Name both the requested type and the concrete HA services class in the message.
        final String message =
                String.format(
                        "Could not create %s from %s.",
                        JobGraphStore.class.getSimpleName(),
                        highAvailabilityServices.getClass().getSimpleName());
        throw new FlinkRuntimeException(message, e);
    }
    return store;
}
/**
 * Starts the job graph store, passing this object to {@code jobGraphStore.start}.
 *
 * @throws FlinkRuntimeException wrapping any exception raised while starting the store
 */
private void startServices() {
    try {
        jobGraphStore.start(this);
    } catch (Exception e) {
        // Report the concrete store class and the concrete class of this object.
        final String storeName = jobGraphStore.getClass().getSimpleName();
        final String ownName = getClass().getSimpleName();
        final String message =
                String.format(
                        "Could not start %s when trying to start the %s.", storeName, ownName);
        throw new FlinkRuntimeException(message, e);
    }
}
// Excerpt: recovery of a single job is delegated to the JobGraphStore, keyed by job id.
return jobGraphStore.recoverJobGraph(jobId);
// Excerpt: build a StandaloneDispatcher from the RPC service, fencing token, recovered jobs
// and bootstrap factory. The full DispatcherServices are derived from the partial services,
// with JobMasterServiceLeadershipRunnerFactory.INSTANCE supplied as the second argument.
return new StandaloneDispatcher(
rpcService,
fencingToken,
recoveredJobs,
dispatcherBootstrapFactory,
DispatcherServices.from(
partialDispatcherServicesWithJobGraphStore,
JobMasterServiceLeadershipRunnerFactory.INSTANCE));
// Excerpt: assemble the complete DispatcherServices by copying each service out of the
// partial services object and adding the supplied jobManagerRunnerFactory.
// Arguments are positional, so their order matters.
return new DispatcherServices(
partialDispatcherServicesWithJobGraphStore.getConfiguration(),
partialDispatcherServicesWithJobGraphStore.getHighAvailabilityServices(),
partialDispatcherServicesWithJobGraphStore.getResourceManagerGatewayRetriever(),
partialDispatcherServicesWithJobGraphStore.getBlobServer(),
partialDispatcherServicesWithJobGraphStore.getHeartbeatServices(),
partialDispatcherServicesWithJobGraphStore.getArchivedExecutionGraphStore(),
partialDispatcherServicesWithJobGraphStore.getFatalErrorHandler(),
partialDispatcherServicesWithJobGraphStore.getHistoryServerArchivist(),
partialDispatcherServicesWithJobGraphStore.getMetricQueryServiceAddress(),
// The metric group factory is invoked here, so the created group (not the factory) is passed on.
partialDispatcherServicesWithJobGraphStore
.getJobManagerMetricGroupFactory()
.create(),
partialDispatcherServicesWithJobGraphStore.getJobGraphWriter(),
jobManagerRunnerFactory,
partialDispatcherServicesWithJobGraphStore.getIoExecutor());
/**
 * Initializes the endpoint and starts its RPC server.
 *
 * @param rpcService service used to start the RPC server for this endpoint
 * @param endpointId identifier of this endpoint
 */
protected RpcEndpoint(final RpcService rpcService, final String endpointId) {
// Both arguments go through checkNotNull (presumably rejecting null; confirm against the helper).
this.rpcService = checkNotNull(rpcService, "rpcService");
this.endpointId = checkNotNull(endpointId, "endpointId");
// `this` is handed to startServer before construction completes, so the two fields above
// must already be assigned at this point; keep this statement order.
this.rpcServer = rpcService.startServer(this);
// The main-thread executor is built from the started server and the validateRunsInMainThread callback.
this.mainThreadExecutor = new MainThreadExecutor(rpcServer, this::validateRunsInMainThread);
}
dispatcher.start()->Dispatcher#onStart
/**
 * Start hook of the Dispatcher (reached via dispatcher.start() according to the call chain
 * noted alongside this excerpt). Starts the dispatcher services, starts the recovered jobs,
 * and finally creates the dispatcher bootstrap. The error-handling branch is elided in this
 * excerpt.
 */
@Override
public void onStart() throws Exception {
try {
// Start the dispatcher services (the original note describes this step as registering
// Dispatcher monitoring/metrics; confirm against startDispatcherServices()).
startDispatcherServices();
} catch (Throwable t) {
... ...
}
// Run the jobs belonging to the recovered JobGraphs.
startRecoveredJobs();
// Create the DispatcherBootstrap, giving it this endpoint's own DispatcherGateway,
// the RPC service's scheduled executor and the fatal-error callback.
this.dispatcherBootstrap =
this.dispatcherBootstrapFactory.create(
getSelfGateway(DispatcherGateway.class),
this.getRpcService().getScheduledExecutor(),
this::onFatalError);
}