flink 容错机制的核心是对数据流做连续的分布式快照(snapshots)。在系统失败时,各个算子可以从这些快照构成的检查点(checkpoint)恢复到故障之前的状态,保证即使遇到故障,作业的最终结果只被数据流中的每一条消息影响一次(exactly-once) (这里可以通过配置退化成 at least once)。生成分布式快照的机制在 “Lightweight Asynchronous Snapshots for Distributed Dataflows” 这篇文章中有详细描述。它的灵感来自于 Chandy-Lamport algorithm, 并且在 flink 的实现模型中做了调整。
快照状态的保存机制我们在 Flink 如何保存状态数据 这篇文章中介绍过。本文介绍 flink 是如何进行分布式快照的。
本文代码基于 flink-1.10。
flink-1.9 及之前版本只能使用 api 来进行 checkpoint 配置。flink-1.10 可以在 conf/flink-conf.yaml 或通过 -yD/-D 方式配置。
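下面给出一个通过 api 配置 checkpoint 的典型写法作为参考(参数取值仅作演示,具体以业务需求为准):
// 通过 StreamExecutionEnvironment / CheckpointConfig 配置 checkpoint(取值仅为演示)
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// 每 60s 触发一次 checkpoint,语义为 exactly-once
env.enableCheckpointing(60_000L, CheckpointingMode.EXACTLY_ONCE);
CheckpointConfig checkpointConfig = env.getCheckpointConfig();
checkpointConfig.setCheckpointTimeout(600_000L);         // checkpoint 超时时间
checkpointConfig.setMinPauseBetweenCheckpoints(10_000L); // 两次 checkpoint 之间的最小间隔
checkpointConfig.setMaxConcurrentCheckpoints(1);         // 最大并发 checkpoint 数
// 作业取消后保留 externalized checkpoint
checkpointConfig.enableExternalizedCheckpoints(
    CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
这些选项与下文 CheckpointConfig#configure() 中读取的 ExecutionCheckpointingOptions 配置项一一对应。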
CheckpointConfig 增加了 configure(ReadableConfig configuration) 方法,并在 StreamExecutionEnvironment#configure() 中被调用。
// StreamExecutionEnvironment.java
public void configure(ReadableConfig configuration, ClassLoader classLoader) {
...
checkpointCfg.configure(configuration);
}
// CheckpointConfig.java
public void configure(ReadableConfig configuration) {
configuration.getOptional(ExecutionCheckpointingOptions.CHECKPOINTING_MODE)
.ifPresent(this::setCheckpointingMode);
configuration.getOptional(ExecutionCheckpointingOptions.CHECKPOINTING_INTERVAL)
.ifPresent(i -> this.setCheckpointInterval(i.toMillis()));
configuration.getOptional(ExecutionCheckpointingOptions.CHECKPOINTING_TIMEOUT)
.ifPresent(t -> this.setCheckpointTimeout(t.toMillis()));
configuration.getOptional(ExecutionCheckpointingOptions.MAX_CONCURRENT_CHECKPOINTS)
.ifPresent(this::setMaxConcurrentCheckpoints);
configuration.getOptional(ExecutionCheckpointingOptions.MIN_PAUSE_BETWEEN_CHECKPOINTS)
.ifPresent(m -> this.setMinPauseBetweenCheckpoints(m.toMillis()));
configuration.getOptional(ExecutionCheckpointingOptions.PREFER_CHECKPOINT_FOR_RECOVERY)
.ifPresent(this::setPreferCheckpointForRecovery);
configuration.getOptional(ExecutionCheckpointingOptions.TOLERABLE_FAILURE_NUMBER)
.ifPresent(this::setTolerableCheckpointFailureNumber);
configuration.getOptional(ExecutionCheckpointingOptions.EXTERNALIZED_CHECKPOINT)
.ifPresent(this::enableExternalizedCheckpoints);
}
在 StreamGraphGenerator 生成 StreamGraph 时,CheckpointConfig 直接传递给 StreamGraph。
// StreamGraphGenerator.java
public StreamGraph generate() {
streamGraph = new StreamGraph(executionConfig, checkpointConfig, savepointRestoreSettings);
...
return builtStreamGraph;
}
StreamGraph 转换成 JobGraph 时,首先确定了参与 checkpoint 的三类顶点集合:triggerVertices(接收 trigger checkpoint 消息的顶点,目前是所有 source 顶点)、ackVertices(需要对 checkpoint 进行 ack 的顶点,目前是所有顶点)和 commitVertices(接收 commit checkpoint 消息的顶点,目前也是所有顶点)。
其次,还会生成 CheckpointCoordinatorConfiguration,CheckpointCoordinator 初始化时会用到。
// StreamingJobGraphGenerator.java
private void configureCheckpointing() {
CheckpointConfig cfg = streamGraph.getCheckpointConfig();
long interval = cfg.getCheckpointInterval();
if (interval < MINIMAL_CHECKPOINT_TIME) {
// interval of max value means disable periodic checkpoint
interval = Long.MAX_VALUE;
}
// --- configure the participating vertices ---
// collect the vertices that receive "trigger checkpoint" messages.
// currently, these are all the sources
List<JobVertexID> triggerVertices = new ArrayList<>();
// collect the vertices that need to acknowledge the checkpoint
// currently, these are all vertices
List<JobVertexID> ackVertices = new ArrayList<>(jobVertices.size());
// collect the vertices that receive "commit checkpoint" messages
// currently, these are all vertices
List<JobVertexID> commitVertices = new ArrayList<>(jobVertices.size());
for (JobVertex vertex : jobVertices.values()) {
if (vertex.isInputVertex()) {
triggerVertices.add(vertex.getID());
}
commitVertices.add(vertex.getID());
ackVertices.add(vertex.getID());
}
// --- configure options ---
CheckpointRetentionPolicy retentionAfterTermination;
if (cfg.isExternalizedCheckpointsEnabled()) {
CheckpointConfig.ExternalizedCheckpointCleanup cleanup = cfg.getExternalizedCheckpointCleanup();
// Sanity check
if (cleanup == null) {
throw new IllegalStateException("Externalized checkpoints enabled, but no cleanup mode configured.");
}
retentionAfterTermination = cleanup.deleteOnCancellation() ?
CheckpointRetentionPolicy.RETAIN_ON_FAILURE :
CheckpointRetentionPolicy.RETAIN_ON_CANCELLATION;
} else {
retentionAfterTermination = CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION;
}
CheckpointingMode mode = cfg.getCheckpointingMode();
boolean isExactlyOnce;
if (mode == CheckpointingMode.EXACTLY_ONCE) {
isExactlyOnce = cfg.isCheckpointingEnabled();
} else if (mode == CheckpointingMode.AT_LEAST_ONCE) {
isExactlyOnce = false;
} else {
throw new IllegalStateException("Unexpected checkpointing mode. " +
"Did not expect there to be another checkpointing mode besides " +
"exactly-once or at-least-once.");
}
// --- configure the master-side checkpoint hooks ---
final ArrayList<MasterTriggerRestoreHook.Factory> hooks = new ArrayList<>();
for (StreamNode node : streamGraph.getStreamNodes()) {
if (node.getOperatorFactory() instanceof UdfStreamOperatorFactory) {
Function f = ((UdfStreamOperatorFactory) node.getOperatorFactory()).getUserFunction();
if (f instanceof WithMasterCheckpointHook) {
hooks.add(new FunctionMasterCheckpointHookFactory((WithMasterCheckpointHook<?>) f));
}
}
}
// because the hooks can have user-defined code, they need to be stored as
// eagerly serialized values
final SerializedValue<MasterTriggerRestoreHook.Factory[]> serializedHooks;
if (hooks.isEmpty()) {
serializedHooks = null;
} else {
try {
MasterTriggerRestoreHook.Factory[] asArray =
hooks.toArray(new MasterTriggerRestoreHook.Factory[hooks.size()]);
serializedHooks = new SerializedValue<>(asArray);
}
catch (IOException e) {
throw new FlinkRuntimeException("Trigger/restore hook is not serializable", e);
}
}
// because the state backend can have user-defined code, it needs to be stored as
// eagerly serialized value
final SerializedValue<StateBackend> serializedStateBackend;
if (streamGraph.getStateBackend() == null) {
serializedStateBackend = null;
} else {
try {
serializedStateBackend =
new SerializedValue<StateBackend>(streamGraph.getStateBackend());
}
catch (IOException e) {
throw new FlinkRuntimeException("State backend is not serializable", e);
}
}
// --- done, put it all together ---
JobCheckpointingSettings settings = new JobCheckpointingSettings(
triggerVertices,
ackVertices,
commitVertices,
new CheckpointCoordinatorConfiguration(
interval,
cfg.getCheckpointTimeout(),
cfg.getMinPauseBetweenCheckpoints(),
cfg.getMaxConcurrentCheckpoints(),
retentionAfterTermination,
isExactlyOnce,
cfg.isPreferCheckpointForRecovery(),
cfg.getTolerableCheckpointFailureNumber()),
serializedStateBackend,
serializedHooks);
jobGraph.setSnapshotSettings(settings);
}
JobGraph 中的 JobCheckpointingSettings 会在构建 ExecutionGraph 时用到,ExecutionGraphBuilder 据此调用 enableCheckpointing():
// ExecutionGraph.java
public void enableCheckpointing(
CheckpointCoordinatorConfiguration chkConfig,
List<ExecutionJobVertex> verticesToTrigger,
List<ExecutionJobVertex> verticesToWaitFor,
List<ExecutionJobVertex> verticesToCommitTo,
List<MasterTriggerRestoreHook<?>> masterHooks,
CheckpointIDCounter checkpointIDCounter,
CompletedCheckpointStore checkpointStore,
StateBackend checkpointStateBackend,
CheckpointStatsTracker statsTracker) {
checkState(state == JobStatus.CREATED, "Job must be in CREATED state");
checkState(checkpointCoordinator == null, "checkpointing already enabled");
ExecutionVertex[] tasksToTrigger = collectExecutionVertices(verticesToTrigger);
ExecutionVertex[] tasksToWaitFor = collectExecutionVertices(verticesToWaitFor);
ExecutionVertex[] tasksToCommitTo = collectExecutionVertices(verticesToCommitTo);
checkpointStatsTracker = checkNotNull(statsTracker, "CheckpointStatsTracker");
CheckpointFailureManager failureManager = new CheckpointFailureManager(
chkConfig.getTolerableCheckpointFailureNumber(),
new CheckpointFailureManager.FailJobCallback() {
@Override
public void failJob(Throwable cause) {
getJobMasterMainThreadExecutor().execute(() -> failGlobal(cause));
}
@Override
public void failJobDueToTaskFailure(Throwable cause, ExecutionAttemptID failingTask) {
getJobMasterMainThreadExecutor().execute(() -> failGlobalIfExecutionIsStillRunning(cause, failingTask));
}
}
);
checkState(checkpointCoordinatorTimer == null);
checkpointCoordinatorTimer = Executors.newSingleThreadScheduledExecutor(
new DispatcherThreadFactory(
Thread.currentThread().getThreadGroup(), "Checkpoint Timer"));
// create the coordinator that triggers and commits checkpoints and holds the state
checkpointCoordinator = new CheckpointCoordinator(
jobInformation.getJobId(),
chkConfig,
tasksToTrigger,
tasksToWaitFor,
tasksToCommitTo,
checkpointIDCounter,
checkpointStore,
checkpointStateBackend,
ioExecutor,
new ScheduledExecutorServiceAdapter(checkpointCoordinatorTimer),
SharedStateRegistry.DEFAULT_FACTORY,
failureManager);
// register the master hooks on the checkpoint coordinator
for (MasterTriggerRestoreHook<?> hook : masterHooks) {
if (!checkpointCoordinator.addMasterHook(hook)) {
LOG.warn("Trying to register multiple checkpoint hooks with the name: {}", hook.getIdentifier());
}
}
checkpointCoordinator.setCheckpointStatsTracker(checkpointStatsTracker);
// interval of max long value indicates disable periodic checkpoint,
// the CheckpointActivatorDeactivator should be created only if the interval is not max value
if (chkConfig.getCheckpointInterval() != Long.MAX_VALUE) {
// the periodic checkpoint scheduler is activated and deactivated as a result of
// job status changes (running -> on, all other states -> off)
registerJobStatusListener(checkpointCoordinator.createActivatorDeactivator());
}
this.stateBackendName = checkpointStateBackend.getClass().getSimpleName();
}
总结一下 ExecutionGraph 生成过程中有关 checkpoint 的操作:创建 CheckpointFailureManager 处理 checkpoint 失败;创建名为 Checkpoint Timer 的单线程定时器;创建 CheckpointCoordinator 并注册 master hooks;如果开启了周期性 checkpoint(interval 不是 Long.MAX_VALUE),还会注册 CheckpointCoordinatorDeActivator 监听作业状态变化。
当由于各种原因 checkpoint 失败时,CheckpointFailureManager 负责进行处理,其中有两个重要的参数:tolerableCpFailureNumber(可容忍的 checkpoint 连续失败次数,对应 CheckpointConfig#setTolerableCheckpointFailureNumber)和 continuousFailureCounter(记录当前连续失败的次数,checkpoint 成功时会被清零):
public void handleJobLevelCheckpointException(CheckpointException exception, long checkpointId) {
checkFailureCounter(exception, checkpointId);
if (continuousFailureCounter.get() > tolerableCpFailureNumber) {
clearCount();
failureCallback.failJob(new FlinkRuntimeException("Exceeded checkpoint tolerable failure threshold."));
}
}
...
public void handleTaskLevelCheckpointException(
CheckpointException exception,
long checkpointId,
ExecutionAttemptID executionAttemptID) {
checkFailureCounter(exception, checkpointId);
if (continuousFailureCounter.get() > tolerableCpFailureNumber) {
clearCount();
failureCallback.failJobDueToTaskFailure(new FlinkRuntimeException("Exceeded checkpoint tolerable failure threshold."), executionAttemptID);
}
}
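tolerableCpFailureNumber 即可容忍的 checkpoint 连续失败次数,可以通过 CheckpointConfig 来设置,例如(env 为 StreamExecutionEnvironment 实例,数值仅作演示):
// 允许 checkpoint 连续失败 3 次,超过该次数 CheckpointFailureManager 会让作业失败(数值仅为演示)
env.getCheckpointConfig().setTolerableCheckpointFailureNumber(3);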
CheckpointCoordinator 是 flink 的一个核心组件,位于 JobManager 进程,用于协调分布式快照和状态的触发与存储。CheckpointCoordinator 向相关算子(全部 source 算子)发送触发 checkpoint 的消息,并收集每个算子上报的快照完成的 ack 消息,这些 ack 消息包含算子进行快照后的状态句柄,CheckpointCoordinator 则对这些状态句柄进行维护;待所有算子都上报 ack 消息后,CheckpointCoordinator 将这些元数据信息进行保存(根据选择的 StateBackend 保存在不同的位置)。
CheckpointCoordinatorDeActivator 实际上是一个 JobStatusListener 监听器:当作业状态变为 JobStatus.RUNNING 时,启动 CheckpointCoordinator 中的 checkpoint 调度器;作业转为其他状态时则停止调度。
// CheckpointCoordinator.java
public JobStatusListener createActivatorDeactivator() {
synchronized (lock) {
if (shutdown) {
throw new IllegalArgumentException("Checkpoint coordinator is shut down");
}
if (jobStatusListener == null) {
jobStatusListener = new CheckpointCoordinatorDeActivator(this);
}
return jobStatusListener;
}
}
// CheckpointCoordinatorDeActivator.java
public void jobStatusChanges(JobID jobId, JobStatus newJobStatus, long timestamp, Throwable error) {
if (newJobStatus == JobStatus.RUNNING) {
// start the checkpoint scheduler
coordinator.startCheckpointScheduler();
} else {
// anything else should stop the trigger for now
coordinator.stopCheckpointScheduler();
}
}
CheckpointCoordinator 的定时器启动后,会先在 [minPauseBetweenCheckpoints, baseInterval + 1) 区间内随机等待一段时间,之后以 baseInterval 为周期定时执行 ScheduledTrigger,这里终于执行到了 triggerCheckpoint() 方法。
// CheckpointCoordinator.java
public void startCheckpointScheduler() {
synchronized (lock) {
if (shutdown) {
throw new IllegalArgumentException("Checkpoint coordinator is shut down");
}
// make sure all prior timers are cancelled
stopCheckpointScheduler();
periodicScheduling = true;
currentPeriodicTrigger = scheduleTriggerWithDelay(getRandomInitDelay());
}
}
...
private long getRandomInitDelay() {
return ThreadLocalRandom.current().nextLong(minPauseBetweenCheckpoints, baseInterval + 1L);
}
...
private ScheduledFuture<?> scheduleTriggerWithDelay(long initDelay) {
return timer.scheduleAtFixedRate(
new ScheduledTrigger(),
initDelay, baseInterval, TimeUnit.MILLISECONDS);
}
...
private final class ScheduledTrigger implements Runnable {
@Override
public void run() {
try {
triggerCheckpoint(System.currentTimeMillis(), true);
}
catch (Exception e) {
LOG.error("Exception while triggering checkpoint for job {}.", job, e);
}
}
}
triggerCheckpoint() 包含了 checkpoint 和 savepoint 的逻辑,这里我们只关注 checkpoint。
1.首先我们看 triggerCheckpoint() 的方法签名。timestamp 表示当前 checkpoint 的触发时间;isPeriodic 表示 checkpoint 是否是周期性触发的,对于 savepoint 来说 isPeriodic=false;props 描述了 checkpoint 何时被清理以及是否强制进行 checkpoint;externalSavepointLocation 和 advanceToEndOfTime 是进行 savepoint 时的配置,externalSavepointLocation 描述了 savepoint 的持久化地址,而 advanceToEndOfTime=true 时,SourceTask 会向下游发送 MAX_WATERMARK,触发所有 event-time 定时器,把尚未处理完的数据处理完再做快照。
public CompletableFuture<CompletedCheckpoint> triggerCheckpoint(
long timestamp,
CheckpointProperties props,
@Nullable String externalSavepointLocation,
boolean isPeriodic,
boolean advanceToEndOfTime) throws CheckpointException {...}
2.接下来进行 pre-checks,检查是否满足进行 checkpoint 的条件。
if (advanceToEndOfTime && !(props.isSynchronous() && props.isSavepoint())) {
throw new IllegalArgumentException("Only synchronous savepoints are allowed to advance the watermark to MAX.");
}
// make some eager pre-checks
synchronized (lock) {
preCheckBeforeTriggeringCheckpoint(isPeriodic, props.forceCheckpoint());
}
当出现以下情况时,checkpoint 会终止:CheckpointCoordinator 已经 shutdown;周期性 checkpoint 的调度已经被关闭(仅针对周期性触发的 checkpoint);已经有 checkpoint 触发请求在排队(triggerRequestQueued);正在进行的 checkpoint 数量达到 maxConcurrentCheckpoints 上限;距离上一次 checkpoint 的间隔小于 minPauseBetweenCheckpoints。
需要注意的是,如果当前 checkpoint 是强制执行的(forceCheckpoint=true),上面后三项检查会被跳过。同时这几项检查是在持有锁的情况下进行的。对应的代码如下:
private void preCheckBeforeTriggeringCheckpoint(boolean isPeriodic, boolean forceCheckpoint) throws CheckpointException {
// abort if the coordinator has been shutdown in the meantime
if (shutdown) {
throw new CheckpointException(CheckpointFailureReason.CHECKPOINT_COORDINATOR_SHUTDOWN);
}
// Don't allow periodic checkpoint if scheduling has been disabled
if (isPeriodic && !periodicScheduling) {
throw new CheckpointException(CheckpointFailureReason.PERIODIC_SCHEDULER_SHUTDOWN);
}
if (!forceCheckpoint) {
if (triggerRequestQueued) {
LOG.warn("Trying to trigger another checkpoint for job {} while one was queued already.", job);
throw new CheckpointException(CheckpointFailureReason.ALREADY_QUEUED);
}
checkConcurrentCheckpoints();
checkMinPauseBetweenCheckpoints();
}
}
// check if all tasks that we need to trigger are running.
// if not, abort the checkpoint
Execution[] executions = new Execution[tasksToTrigger.length];
for (int i = 0; i < tasksToTrigger.length; i++) {
Execution ee = tasksToTrigger[i].getCurrentExecutionAttempt();
if (ee == null) {
LOG.info("Checkpoint triggering task {} of job {} is not being executed at the moment. Aborting checkpoint.",
tasksToTrigger[i].getTaskNameWithSubtaskIndex(),
job);
throw new CheckpointException(CheckpointFailureReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
} else if (ee.getState() == ExecutionState.RUNNING) {
executions[i] = ee;
} else {
LOG.info("Checkpoint triggering task {} of job {} is not in state {} but {} instead. Aborting checkpoint.",
tasksToTrigger[i].getTaskNameWithSubtaskIndex(),
job,
ExecutionState.RUNNING,
ee.getState());
throw new CheckpointException(CheckpointFailureReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
}
}
// next, check if all tasks that need to acknowledge the checkpoint are running.
// if not, abort the checkpoint
Map<ExecutionAttemptID, ExecutionVertex> ackTasks = new HashMap<>(tasksToWaitFor.length);
for (ExecutionVertex ev : tasksToWaitFor) {
Execution ee = ev.getCurrentExecutionAttempt();
if (ee != null) {
ackTasks.put(ee.getAttemptId(), ev);
} else {
LOG.info("Checkpoint acknowledging task {} of job {} is not being executed at the moment. Aborting checkpoint.",
ev.getTaskNameWithSubtaskIndex(),
job);
throw new CheckpointException(CheckpointFailureReason.NOT_ALL_REQUIRED_TASKS_RUNNING);
}
}
如果满足上述触发 checkpoint 条件,就进入了真正触发 checkpoint 的环节。
3.确定 checkpointID 和 CheckpointStorageLocation
checkpointID 用于标识一次 checkpoint,由 CheckpointIDCounter 生成。根据是否开启 HA 模式,有以下两种实现类:StandaloneCheckpointIDCounter(非 HA 模式,进程内计数)和 ZooKeeperCheckpointIDCounter(HA 模式,基于 ZooKeeper 维护全局递增的 id)。
CheckpointStorageLocation 即 checkpoint 持久化时的保存位置。根据选择的 StateBackend,会有以下两种:使用 MemoryStateBackend 时通常为 NonPersistentMetadataCheckpointStorageLocation(元数据保存在 JobManager 内存中),使用 FsStateBackend / RocksDBStateBackend 时为 FsCheckpointStorageLocation(元数据持久化到文件系统)。
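换句话说,checkpoint 数据写到哪里取决于作业配置的 StateBackend,下面是一个简单的示意(hdfs 路径仅为演示):
// 使用 FsStateBackend,checkpoint 的元数据和状态数据会写入文件系统(路径仅为演示)
env.setStateBackend(new FsStateBackend("hdfs://namenode:8020/flink/checkpoints"));
// 如果使用默认的 MemoryStateBackend,checkpoint 元数据则保存在 JobManager 内存中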
需要注意的是,这一步没有获取同步锁,因为获取 checkpointID 和初始化 CheckpointStorageLocation 是一个比较耗时的操作。
// we will actually trigger this checkpoint!
final CheckpointStorageLocation checkpointStorageLocation;
final long checkpointID;
try {
// this must happen outside the coordinator-wide lock, because it communicates
// with external services (in HA mode) and may block for a while.
checkpointID = checkpointIdCounter.getAndIncrement();
checkpointStorageLocation = props.isSavepoint() ?
checkpointStorage.initializeLocationForSavepoint(checkpointID, externalSavepointLocation) :
checkpointStorage.initializeLocationForCheckpoint(checkpointID);
}
catch (Throwable t) {
int numUnsuccessful = numUnsuccessfulCheckpointsTriggers.incrementAndGet();
LOG.warn("Failed to trigger checkpoint for job {} ({} consecutive failed attempts so far).",
job,
numUnsuccessful,
t);
throw new CheckpointException(CheckpointFailureReason.EXCEPTION, t);
}
4.接下来创建 PendingCheckpoint
PendingCheckpoint 代表当前已经开始的 checkpoint,当 CheckpointCoordinator 收到所有 task 对该 checkpoint 的 ack 消息后,PendingCheckpoint 成为 CompletedCheckpoint。
final PendingCheckpoint checkpoint = new PendingCheckpoint(
job,
checkpointID,
timestamp,
ackTasks,
masterHooks.keySet(),
props,
checkpointStorageLocation,
executor);
5.创建对当前 checkpoint 超时清理的取消器
创建的 canceller 是一个 Runnable 对象,将在接下来的代码中添加到 timer 定时器中。
canceller 对超时的 checkpoint 相关资源进行清理,并立即触发下一个 checkpoint(如果有的话)。
// schedule the timer that will clean up the expired checkpoints
final Runnable canceller = () -> {
synchronized (lock) {
// only do the work if the checkpoint is not discarded anyways
// note that checkpoint completion discards the pending checkpoint object
if (!checkpoint.isDiscarded()) {
LOG.info("Checkpoint {} of job {} expired before completing.", checkpointID, job);
failPendingCheckpoint(checkpoint, CheckpointFailureReason.CHECKPOINT_EXPIRED);
pendingCheckpoints.remove(checkpointID);
rememberRecentCheckpointId(checkpointID);
triggerQueuedRequests();
}
}
};
6.再次获取同步锁,检查条件
由于第3步没有在同步代码块中执行,所以这里仍然需要再次调用 preCheckBeforeTriggeringCheckpoint() 检查触发 checkpoint 的条件。
检查通过后,会将这个 PendingCheckpoint 放入 pendingCheckpoints 集合中;同时把第5步生成的 canceller 注册到 timer,开始超时定时调度。
// re-acquire the coordinator-wide lock
synchronized (lock) {
preCheckBeforeTriggeringCheckpoint(isPeriodic, props.forceCheckpoint());
LOG.info("Triggering checkpoint {} @ {} for job {}.", checkpointID, timestamp, job);
pendingCheckpoints.put(checkpointID, checkpoint);
ScheduledFuture<?> cancellerHandle = timer.schedule(
canceller,
checkpointTimeout, TimeUnit.MILLISECONDS);
if (!checkpoint.setCancellerHandle(cancellerHandle)) {
// checkpoint is already disposed!
cancellerHandle.cancel(false);
}
// TODO, asynchronously snapshots master hook without waiting here
for (MasterTriggerRestoreHook<?> masterHook : masterHooks.values()) {
final MasterState masterState =
MasterHooks.triggerHook(masterHook, checkpointID, timestamp, executor)
.get(checkpointTimeout, TimeUnit.MILLISECONDS);
checkpoint.acknowledgeMasterState(masterHook.getIdentifier(), masterState);
}
Preconditions.checkState(checkpoint.areMasterStatesFullyAcknowledged());
}
7.发送 trigger checkpoint 消息给 Task
此时,checkpoint 流程进入到 TaskManager。executions 对应的是所有 source 节点当前的 Execution(执行尝试)。
// send the messages to the tasks that trigger their checkpoint
for (Execution execution: executions) {
if (props.isSynchronous()) {
execution.triggerSynchronousSavepoint(checkpointID, timestamp, checkpointOptions, advanceToEndOfTime);
} else {
execution.triggerCheckpoint(checkpointID, timestamp, checkpointOptions);
}
}
最终通过 TaskManagerGateway (基于 akka 消息系统)发送 trigger checkpoint 消息给对应的 TaskExecutor,执行 TaskExecutor#triggerCheckpoint().
// Execution.java
private void triggerCheckpointHelper(long checkpointId, long timestamp, CheckpointOptions checkpointOptions, boolean advanceToEndOfEventTime) {
final CheckpointType checkpointType = checkpointOptions.getCheckpointType();
if (advanceToEndOfEventTime && !(checkpointType.isSynchronous() && checkpointType.isSavepoint())) {
throw new IllegalArgumentException("Only synchronous savepoints are allowed to advance the watermark to MAX.");
}
final LogicalSlot slot = assignedResource;
if (slot != null) {
final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
taskManagerGateway.triggerCheckpoint(attemptId, getVertex().getJobId(), checkpointId, timestamp, checkpointOptions, advanceToEndOfEventTime);
} else {
LOG.debug("The execution has no slot assigned. This indicates that the execution is no longer running.");
}
}
// RpcTaskManagerGateway.java
public void triggerCheckpoint(ExecutionAttemptID executionAttemptID, JobID jobId, long checkpointId, long timestamp, CheckpointOptions checkpointOptions, boolean advanceToEndOfEventTime) {
taskExecutorGateway.triggerCheckpoint(
executionAttemptID,
checkpointId,
timestamp,
checkpointOptions,
advanceToEndOfEventTime);
}
TaskManager 的所有 Task 执行完 checkpoint 后,会向 CheckpointCoordinator 发送本次 checkpoint 的 ack 消息。
JobMaster 接收 ack 消息并传递给 CheckpointCoordinator 的调用链为:JobMaster#acknowledgeCheckpoint() -> SchedulerBase#acknowledgeCheckpoint() -> CheckpointCoordinator#receiveAcknowledgeMessage()
当 CheckpointCoordinator 收到所有 task 的 ack 消息后,由 PendingCheckpoint 生成 CompletedCheckpoint。过程中会将所有 Task 上报的 meta 信息进行持久化。
// PendingCheckpoint.java
public CompletedCheckpoint finalizeCheckpoint() throws IOException {
synchronized (lock) {
checkState(areMasterStatesFullyAcknowledged(),
"Pending checkpoint has not been fully acknowledged by master states yet.");
checkState(areTasksFullyAcknowledged(),
"Pending checkpoint has not been fully acknowledged by tasks yet.");
// make sure we fulfill the promise with an exception if something fails
try {
// write out the metadata
final Savepoint savepoint = new SavepointV2(checkpointId, operatorStates.values(), masterStates);
final CompletedCheckpointStorageLocation finalizedLocation;
try (CheckpointMetadataOutputStream out = targetLocation.createMetadataOutputStream()) {
Checkpoints.storeCheckpointMetadata(savepoint, out);
finalizedLocation = out.closeAndFinalizeCheckpoint();
}
CompletedCheckpoint completed = new CompletedCheckpoint(
jobId,
checkpointId,
checkpointTimestamp,
System.currentTimeMillis(),
operatorStates,
masterStates,
props,
finalizedLocation);
onCompletionPromise.complete(completed);
...
}
}
生成 CompletedCheckpoint 后,将 pendingCheckpoint 移除,并通知所有 task 本次 checkpoint 已完成(StreamTask#notifyCheckpointComplete()),有些 operator 会在此时做最后的收尾操作,比如开启了两阶段提交的 kafka producer 会在这里 commit 事务。
// CheckpointCoordinator.java
private void completePendingCheckpoint(PendingCheckpoint pendingCheckpoint) throws CheckpointException {
final long checkpointId = pendingCheckpoint.getCheckpointId();
final CompletedCheckpoint completedCheckpoint;
// As a first step to complete the checkpoint, we register its state with the registry
Map<OperatorID, OperatorState> operatorStates = pendingCheckpoint.getOperatorStates();
sharedStateRegistry.registerAll(operatorStates.values());
try {
try {
// PendingCheckpoint.finalizeCheckpoint 生成 CompletedCheckpoint
completedCheckpoint = pendingCheckpoint.finalizeCheckpoint();
failureManager.handleCheckpointSuccess(pendingCheckpoint.getCheckpointId());
}
...
// send the "notify complete" call to all vertices
final long timestamp = completedCheckpoint.getTimestamp();
for (ExecutionVertex ev : tasksToCommitTo) {
Execution ee = ev.getCurrentExecutionAttempt();
if (ee != null) {
ee.notifyCheckpointComplete(checkpointId, timestamp);
}
}
}
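notifyCheckpointComplete 在用户代码中的典型用法是实现 CheckpointListener 接口,在 checkpoint 真正完成后再提交外部事务。下面是一个简化的示意(TransactionalPrintSink 及其内部逻辑均为演示用的假设实现,并非 flink 自带代码):
import org.apache.flink.runtime.state.CheckpointListener;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import java.util.ArrayList;
import java.util.List;
// 仅为示意:在 snapshotState() 中做预提交,在 notifyCheckpointComplete() 中做最终提交
public class TransactionalPrintSink extends RichSinkFunction<String>
        implements CheckpointedFunction, CheckpointListener {
    // 用内存 list 模拟"未提交的事务数据",实际场景下通常是外部系统的事务句柄
    private final List<String> pendingRecords = new ArrayList<>();
    @Override
    public void invoke(String value, Context context) {
        pendingRecords.add(value);
    }
    @Override
    public void snapshotState(FunctionSnapshotContext context) {
        // 快照同步阶段:预提交,例如 flush 外部系统的写缓冲
    }
    @Override
    public void initializeState(FunctionInitializationContext context) {
        // 故障恢复时处理上一次遗留的事务,这里省略具体逻辑
    }
    @Override
    public void notifyCheckpointComplete(long checkpointId) {
        // 收到 checkpoint 完成通知后才真正"提交"
        System.out.println("commit " + pendingRecords.size() + " records for checkpoint " + checkpointId);
        pendingRecords.clear();
    }
}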
TaskManager 端的 checkpoint 流程又可以细分为 SourceStreamTask 和其他 Task。
Source task 所在的 TaskExecutor 收到 trigger checkpoint 消息,继续进行 checkpoint。
// TaskExecutor.java
public CompletableFuture<Acknowledge> triggerCheckpoint(
ExecutionAttemptID executionAttemptID,
long checkpointId,
long checkpointTimestamp,
CheckpointOptions checkpointOptions,
boolean advanceToEndOfEventTime) {
...
// 这里是 SourceStreamTask
final Task task = taskSlotTable.getTask(executionAttemptID);
if (task != null) {
task.triggerCheckpointBarrier(checkpointId, checkpointTimestamp, checkpointOptions, advanceToEndOfEventTime);
return CompletableFuture.completedFuture(Acknowledge.get());
}
...
}
这一步仍然会判断当前 task 的执行状态,如果不是 RUNNING 状态,会通过 checkpointResponder 通知 CheckpointCoordinator 拒绝(decline)这次 checkpoint。
这里的 invokable 实际是 SourceStreamTask。
// Task.java
public void triggerCheckpointBarrier(
final long checkpointID,
final long checkpointTimestamp,
final CheckpointOptions checkpointOptions,
final boolean advanceToEndOfEventTime) {
if (executionState == ExecutionState.RUNNING && invokable != null) {
try {
invokable.triggerCheckpointAsync(checkpointMetaData, checkpointOptions, advanceToEndOfEventTime);
}
...
}
else {
LOG.debug("Declining checkpoint request for non-running task {} ({}).", taskNameWithSubtask, executionId);
// send back a message that we did not do the checkpoint
checkpointResponder.declineCheckpoint(jobId, executionId, checkpointID,
new CheckpointException("Task name with subtask : " + taskNameWithSubtask, CheckpointFailureReason.CHECKPOINT_DECLINED_TASK_NOT_READY));
}
}
SourceStreamTask 和 StreamTask 的 triggerCheckpointAsync() 都返回一个 Future 对象,并不会阻塞调用方。
// SourceStreamTask.java
public Future<Boolean> triggerCheckpointAsync(CheckpointMetaData checkpointMetaData, CheckpointOptions checkpointOptions, boolean advanceToEndOfEventTime) {
if (!externallyInducedCheckpoints) {
return super.triggerCheckpointAsync(checkpointMetaData, checkpointOptions, advanceToEndOfEventTime);
}
else {
// we do not trigger checkpoints here, we simply state whether we can trigger them
synchronized (getCheckpointLock()) {
return CompletableFuture.completedFuture(isRunning());
}
}
}
flink-1.10 重构了 StreamTask 的线程模型(点击这里了解更多),新的线程模型参考了 Actor 模型的 MailBox:新的 checkpoint 消息会加入一个阻塞队列,然后由 MailBox 主线程从队列中拉取并执行。下面的 lambda 表达式就是 MailBox 线程最终要执行的逻辑。
// StreamTask.java
public Future<Boolean> triggerCheckpointAsync(
CheckpointMetaData checkpointMetaData,
CheckpointOptions checkpointOptions,
boolean advanceToEndOfEventTime) {
return mailboxProcessor.getMainMailboxExecutor().submit(
() -> triggerCheckpoint(checkpointMetaData, checkpointOptions, advanceToEndOfEventTime),
"checkpoint %s with %s",
checkpointMetaData,
checkpointOptions);
}
这里开始进入了 MailBox 主线程处理 checkpoint 的流程。
按照处理顺序,核心处理流程如下:先调用 operatorChain.prepareSnapshotPreBarrier() 让算子做 barrier 发出前的准备工作;然后调用 operatorChain.broadcastCheckpointBarrier() 向下游广播 CheckpointBarrier;最后调用 checkpointState() 对算子状态做快照,这一步的大部分工作会异步执行,以尽量不影响数据处理。
// StreamTask.java
private boolean performCheckpoint(
CheckpointMetaData checkpointMetaData,
CheckpointOptions checkpointOptions,
CheckpointMetrics checkpointMetrics,
boolean advanceToEndOfTime) throws Exception {
LOG.debug("Starting checkpoint ({}) {} on task {}",
checkpointMetaData.getCheckpointId(), checkpointOptions.getCheckpointType(), getName());
final long checkpointId = checkpointMetaData.getCheckpointId();
// 判断当前 task 是否是 RUNNING 状态
if (isRunning) {
actionExecutor.runThrowing(() -> {
final CheckpointMetaData tmpCheckpointMetaData = new CheckpointMetaData(checkpointMetaData.getCheckpointId(),
checkpointMetaData.getTimestamp());
if (checkpointOptions.getCheckpointType().isSynchronous()) {
setSynchronousSavepointId(checkpointId);
// 当作业处于 TERMINATED 状态时,SourceStreamTask 会向下游发送 MAX_WATERMARK,触发所有的 timer,使得相关的 state 数据(例如 window state)能够刷盘
if (advanceToEndOfTime) {
advanceToEndOfEventTime();
}
}
// All of the following steps happen as an atomic step from the perspective of barriers and
// records/watermarks/timers/callbacks.
// We generally try to emit the checkpoint barrier as soon as possible to not affect downstream
// checkpoint alignments
// Step (1): Prepare the checkpoint, allow operators to do some pre-barrier work.
// The pre-barrier work should be nothing or minimal in the common case.
operatorChain.prepareSnapshotPreBarrier(checkpointId);
// Step (2): Send the checkpoint barrier downstream
operatorChain.broadcastCheckpointBarrier(
checkpointId,
tmpCheckpointMetaData.getTimestamp(),
checkpointOptions);
// Step (3): Take the state snapshot. This should be largely asynchronous, to not
// impact progress of the streaming topology
checkpointState(tmpCheckpointMetaData, checkpointOptions, checkpointMetrics);
});
return true;
} else {
// Task 已经不是 RUNNING 状态了,向下游 Task 广播 CancelCheckpointMarker,下游算子会执行 abort。然后 CheckpointCoordinator 发送 declineCheckpoint 消息
actionExecutor.runThrowing(() -> {
// we cannot perform our checkpoint - let the downstream operators know that they
// should not wait for any input from this operator
// we cannot broadcast the cancellation markers on the 'operator chain', because it may not
// yet be created
final CancelCheckpointMarker message = new CancelCheckpointMarker(checkpointMetaData.getCheckpointId());
recordWriter.broadcastEvent(message);
});
return false;
}
}
checkpointState() 内部会创建 CheckpointingOperation 并调用它的 executeCheckpointing() 方法。executeCheckpointing() 方法首先对当前所有 operator 执行 checkpointStreamOperator():
// StreamTask.java
private void checkpointStreamOperator(StreamOperator<?> op) throws Exception {
if (null != op) {
OperatorSnapshotFutures snapshotInProgress = op.snapshotState(
checkpointMetaData.getCheckpointId(),
checkpointMetaData.getTimestamp(),
checkpointOptions,
storageLocation);
operatorSnapshotsInProgress.put(op.getOperatorID(), snapshotInProgress);
}
}
StreamOperator 的 snapshotState(long checkpointId, long timestamp, CheckpointOptions checkpointOptions, CheckpointStreamFactory storageLocation) 方法最终由它的子类 AbstractStreamOperator 给出了一个 final 实现。注意下面的 snapshotState(snapshotContext) 方法。
// AbstractStreamOperator.java
public final OperatorSnapshotFutures snapshotState(long checkpointId, long timestamp, CheckpointOptions checkpointOptions,
CheckpointStreamFactory factory) throws Exception {
KeyGroupRange keyGroupRange = null != keyedStateBackend ?
keyedStateBackend.getKeyGroupRange() : KeyGroupRange.EMPTY_KEY_GROUP_RANGE;
OperatorSnapshotFutures snapshotInProgress = new OperatorSnapshotFutures();
StateSnapshotContextSynchronousImpl snapshotContext = new StateSnapshotContextSynchronousImpl(
checkpointId,
timestamp,
factory,
keyGroupRange,
getContainingTask().getCancelables());
try {
// 子类会各自实现 snapshotState(StateSnapshotContext context)
snapshotState(snapshotContext);
snapshotInProgress.setKeyedStateRawFuture(snapshotContext.getKeyedStateStreamFuture());
snapshotInProgress.setOperatorStateRawFuture(snapshotContext.getOperatorStateStreamFuture());
if (null != operatorStateBackend) {
snapshotInProgress.setOperatorStateManagedFuture(
operatorStateBackend.snapshot(checkpointId, timestamp, factory, checkpointOptions));
}
if (null != keyedStateBackend) {
snapshotInProgress.setKeyedStateManagedFuture(
keyedStateBackend.snapshot(checkpointId, timestamp, factory, checkpointOptions));
}
} catch (Exception snapshotException) {
try {
snapshotInProgress.cancel();
} catch (Exception e) {
snapshotException.addSuppressed(e);
}
String snapshotFailMessage = "Could not complete snapshot " + checkpointId + " for operator " +
getOperatorName() + ".";
if (!getContainingTask().isCanceled()) {
LOG.info(snapshotFailMessage, snapshotException);
}
try {
snapshotContext.closeExceptionally();
} catch (IOException e) {
snapshotException.addSuppressed(e);
}
throw new CheckpointException(snapshotFailMessage, CheckpointFailureReason.CHECKPOINT_DECLINED, snapshotException);
}
return snapshotInProgress;
}
另外需要注意,这一步是同步阶段,获得的 Futures 会放入 operatorSnapshotsInProgress,传递给 AsyncCheckpointRunnable,在异步阶段执行。
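为了更直观地说明子类如何实现 snapshotState(StateSnapshotContext),下面给出一个简化的自定义算子示意(BufferingOperator 为演示用的假设名称,省略了 serialVersionUID 等细节):
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.runtime.state.StateInitializationContext;
import org.apache.flink.runtime.state.StateSnapshotContext;
import org.apache.flink.streaming.api.operators.AbstractStreamOperator;
import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
import java.util.ArrayList;
import java.util.List;
// 仅为示意:一个假设的自定义算子,在快照同步阶段把内存缓冲写入 managed operator state
public class BufferingOperator extends AbstractStreamOperator<String>
        implements OneInputStreamOperator<String, String> {
    private transient ListState<String> checkpointedBuffer;
    private final List<String> buffer = new ArrayList<>();
    @Override
    public void initializeState(StateInitializationContext context) throws Exception {
        super.initializeState(context);
        checkpointedBuffer = context.getOperatorStateStore().getListState(
                new ListStateDescriptor<>("buffer", Types.STRING));
        if (context.isRestored()) {
            // 从上一次成功的 checkpoint 恢复缓冲数据
            for (String element : checkpointedBuffer.get()) {
                buffer.add(element);
            }
        }
    }
    @Override
    public void processElement(StreamRecord<String> element) throws Exception {
        buffer.add(element.getValue());
        output.collect(element);
    }
    @Override
    public void snapshotState(StateSnapshotContext context) throws Exception {
        // 同步阶段被 AbstractStreamOperator#snapshotState(checkpointId, ...) 回调,
        // 真正的持久化由异步阶段的 StateBackend snapshot 完成
        super.snapshotState(context);
        checkpointedBuffer.clear();
        checkpointedBuffer.addAll(buffer);
    }
}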
executeCheckpointing() 下一步触发 checkpoint 的异步执行阶段。上面获取的 operatorSnapshotsInProgress 连同 checkpoint 配置等信息被包装在 AsyncCheckpointRunnable 中,放入预先启动的名为 AsyncOperations 的线程池中执行:
// StreamTask.java
asyncOperationsThreadPool = Executors.newCachedThreadPool(new ExecutorThreadFactory("AsyncOperations", uncaughtExceptionHandler));
AsyncCheckpointRunnable 实现了 Runnable 接口,它的 run() 方法中,首先由具体的 StateBackend 完成状态快照(该流程本篇不再展开,会有另一篇文章介绍),并返回完成快照的句柄,这里既包括本地状态备份,也包括远程状态(共享存储中的状态,比如 HDFS、S3、NFS 等)。
// StreamTask.java
...
for (Map.Entry<OperatorID, OperatorSnapshotFutures> entry : operatorSnapshotsInProgress.entrySet()) {
OperatorID operatorID = entry.getKey();
OperatorSnapshotFutures snapshotInProgress = entry.getValue();
// finalize the async part of all by executing all snapshot runnables
OperatorSnapshotFinalizer finalizedSnapshots =
new OperatorSnapshotFinalizer(snapshotInProgress);
...
}
所有 operator 快照完成后,会给 CheckpointCoordinator 发送本次 checkpoint 的 ack 消息,消息中包含 checkpoint 的 metadata、metrics,以及远端和本地的状态快照句柄信息。
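// TaskStateManagerImpl.java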
public void reportTaskStateSnapshots(
@Nonnull CheckpointMetaData checkpointMetaData,
@Nonnull CheckpointMetrics checkpointMetrics,
@Nullable TaskStateSnapshot acknowledgedState,
@Nullable TaskStateSnapshot localState) {
long checkpointId = checkpointMetaData.getCheckpointId();
localStateStore.storeLocalState(checkpointId, localState);
checkpointResponder.acknowledgeCheckpoint(
jobId,
executionAttemptID,
checkpointId,
checkpointMetrics,
acknowledgedState);
}
6.4 节我们介绍了 performCheckpoint 方法执行时,会向下游 task 广播 CheckpointBarrier:
// StreamTask.java
operatorChain.broadcastCheckpointBarrier(
checkpointId,
tmpCheckpointMetaData.getTimestamp(),
checkpointOptions);
// OperatorChain.java
public void broadcastCheckpointBarrier(long id, long timestamp, CheckpointOptions checkpointOptions) throws IOException {
CheckpointBarrier barrier = new CheckpointBarrier(id, timestamp, checkpointOptions);
for (RecordWriterOutput<?> streamOutput : streamOutputs) {
streamOutput.broadcastEvent(barrier);
}
}
CheckpointBarrier 作为消息事件(AbstractEvent)被插入到数据流中(数据元素则被包装成 Buffer 发送到下游)。我们知道下游算子通过 InputGate 拉取上游 ResultSubpartition 的数据,而 CheckpointedInputGate 负责处理 CheckpointBarrier 消息(还会处理 CancelCheckpointMarker、EndOfPartitionEvent)。我们常说的 Exactly-once 语义需要进行的 Barrier 对齐也是在这里进行的。
// CheckpointedInputGate.java
public Optional<BufferOrEvent> pollNext() throws Exception {
while (true) {
// 从缓冲区或者 InputGate 中拉取数据
// process buffered BufferOrEvents before grabbing new ones
Optional<BufferOrEvent> next;
// 如果当前缓冲区为空,则从 InputGate 获取数据
if (bufferStorage.isEmpty()) {
next = inputGate.pollNext();
}
// 缓冲区不为空,直接从缓冲区获取
else {
// TODO: FLINK-12536 for non credit-based flow control, getNext method is blocking
next = bufferStorage.pollNext();
// 缓冲区中没有数据了
if (!next.isPresent()) {
return pollNext();
}
}
if (!next.isPresent()) {
return handleEmptyBuffer();
}
BufferOrEvent bufferOrEvent = next.get();
// 如果一个 channel 阻塞了,说明还有其他 channel barrier 没有到来,把阻塞的 channel 元素保存在 bufferStorage
if (barrierHandler.isBlocked(offsetChannelIndex(bufferOrEvent.getChannelIndex()))) {
// if the channel is blocked, we just store the BufferOrEvent
bufferStorage.add(bufferOrEvent);
if (bufferStorage.isFull()) {
barrierHandler.checkpointSizeLimitExceeded(bufferStorage.getMaxBufferedBytes());
bufferStorage.rollOver();
}
}
// 如果是 Buffer,直接返回,交给 operator 处理
else if (bufferOrEvent.isBuffer()) {
return next;
}
// CheckpointBarrier 交给 barrierHandler 处理
else if (bufferOrEvent.getEvent().getClass() == CheckpointBarrier.class) {
CheckpointBarrier checkpointBarrier = (CheckpointBarrier) bufferOrEvent.getEvent();
if (!endOfInputGate) {
// process barriers only if there is a chance of the checkpoint completing
if (barrierHandler.processBarrier(checkpointBarrier, offsetChannelIndex(bufferOrEvent.getChannelIndex()), bufferStorage.getPendingBytes())) {
bufferStorage.rollOver();
}
}
}
else if (bufferOrEvent.getEvent().getClass() == CancelCheckpointMarker.class) {
if (barrierHandler.processCancellationBarrier((CancelCheckpointMarker) bufferOrEvent.getEvent())) {
bufferStorage.rollOver();
}
}
else {
if (bufferOrEvent.getEvent().getClass() == EndOfPartitionEvent.class) {
if (barrierHandler.processEndOfPartition()) {
bufferStorage.rollOver();
}
}
return next;
}
}
}
上述代码中 bufferStorage 是 barrier 对齐时需要的缓冲区,CheckpointedInputGate 仍然会从未阻塞的 channel 接收数据,然后写入缓冲区中。bufferStorage 有三种实现类:CachedBufferStorage(EXACTLY_ONCE 模式下缓存对齐期间收到的数据)、EmptyBufferStorage(AT_LEAST_ONCE 模式下不做缓存)和 LinkedBufferStorage(把多个 BufferStorage 组合在一起,用于多输入的 Task)。
生成过程如下:
// InputProcessorUtil.java
private static BufferStorage createBufferStorage(
CheckpointingMode checkpointMode,
IOManager ioManager,
int pageSize,
Configuration taskManagerConfig,
String taskName) {
switch (checkpointMode) {
case EXACTLY_ONCE: {
long maxAlign = taskManagerConfig.getLong(TaskManagerOptions.TASK_CHECKPOINT_ALIGNMENT_BYTES_LIMIT);
if (!(maxAlign == -1 || maxAlign > 0)) {
throw new IllegalConfigurationException(
TaskManagerOptions.TASK_CHECKPOINT_ALIGNMENT_BYTES_LIMIT.key()
+ " must be positive or -1 (infinite)");
}
return new CachedBufferStorage(pageSize, maxAlign, taskName);
}
case AT_LEAST_ONCE:
return new EmptyBufferStorage();
default:
throw new UnsupportedOperationException("Unrecognized Checkpointing Mode: " + checkpointMode);
}
}
获得的 CheckpointBarrier 消息最终由 CheckpointBarrierHandler 处理,CheckpointBarrierHandler 有两个子类,CheckpointBarrierAligner 和 CheckpointBarrierTracker,从类名可以看出,一个进行 Barrier 对齐(exactly once),另一个只是跟踪 Barrier(at least once)。
CheckpointBarrierAligner#processBarrier() 逻辑如下:
// CheckpointBarrierAligner.java
public boolean processBarrier(CheckpointBarrier receivedBarrier, int channelIndex, long bufferedBytes) throws Exception {
final long barrierId = receivedBarrier.getId();
// fast path for single channel cases
// 如果只有一个 inputchannel,就不需要对齐操作了,barrierId 有效就直接触发 checkpoint
if (totalNumberOfInputChannels == 1) {
if (barrierId > currentCheckpointId) {
// new checkpoint
currentCheckpointId = barrierId;
notifyCheckpoint(receivedBarrier, bufferedBytes, latestAlignmentDurationNanos);
}
return false;
}
// 下面都是处理对齐 barrier 的过程,barrier 对齐后才能触发 checkpoint
boolean checkpointAborted = false;
// -- general code path for multiple input channels --
if (numBarriersReceived > 0) {
// this is only true if some alignment is already progress and was not canceled
if (barrierId == currentCheckpointId) {
// regular case
// 阻塞对应的 channel
onBarrier(channelIndex);
}
else if (barrierId > currentCheckpointId) {
// we did not complete the current checkpoint, another started before
LOG.warn("{}: Received checkpoint barrier for checkpoint {} before completing current checkpoint {}. " +
"Skipping current checkpoint.",
taskName,
barrierId,
currentCheckpointId);
// let the task know we are not completing this
notifyAbort(currentCheckpointId,
new CheckpointException(
"Barrier id: " + barrierId,
CheckpointFailureReason.CHECKPOINT_DECLINED_SUBSUMED));
// abort the current checkpoint
releaseBlocksAndResetBarriers();
checkpointAborted = true;
// begin a new checkpoint
beginNewAlignment(barrierId, channelIndex);
}
else {
// ignore trailing barrier from an earlier checkpoint (obsolete now)
return false;
}
}
else if (barrierId > currentCheckpointId) {
// first barrier of a new checkpoint
beginNewAlignment(barrierId, channelIndex);
}
else {
// either the current checkpoint was canceled (numBarriers == 0) or
// this barrier is from an old subsumed checkpoint
return false;
}
// check if we have all barriers - since canceled checkpoints always have zero barriers
// this can only happen on a non canceled checkpoint
// 收到了所有 inputchannel 的 barrier,可以触发 checkpoint 了,并且要移除 channel 的阻塞标记
if (numBarriersReceived + numClosedChannels == totalNumberOfInputChannels) {
// actually trigger checkpoint
if (LOG.isDebugEnabled()) {
LOG.debug("{}: Received all barriers, triggering checkpoint {} at {}.",
taskName,
receivedBarrier.getId(),
receivedBarrier.getTimestamp());
}
releaseBlocksAndResetBarriers();
notifyCheckpoint(receivedBarrier, bufferedBytes, latestAlignmentDurationNanos);
return true;
}
return checkpointAborted;
}
notifyCheckpoint() 会调用 StreamTask#triggerCheckpointOnBarrier() -> StreamTask#performCheckpoint(), 之后的执行流程就和 Source Task 一样了。
// CheckpointBarrierHandler.java
protected void notifyCheckpoint(CheckpointBarrier checkpointBarrier, long bufferedBytes, long alignmentDurationNanos) throws Exception {
if (toNotifyOnCheckpoint != null) {
CheckpointMetaData checkpointMetaData =
new CheckpointMetaData(checkpointBarrier.getId(), checkpointBarrier.getTimestamp());
CheckpointMetrics checkpointMetrics = new CheckpointMetrics()
.setBytesBufferedInAlignment(bufferedBytes)
.setAlignmentDurationNanos(alignmentDurationNanos);
toNotifyOnCheckpoint.triggerCheckpointOnBarrier(
checkpointMetaData,
checkpointBarrier.getCheckpointOptions(),
checkpointMetrics);
}
}