// Manages multiple TaskSetManagers. It is normally used by TaskSchedulerImpl, so most of its
// methods are called from TaskSchedulerImpl.
private[spark] class Pool(
val poolName: String, // pool name
val schedulingMode: SchedulingMode, // scheduling mode: FAIR or FIFO
initMinShare: Int,// initial minShare (the original note labelled this "weight", which belongs to initWeight)
initWeight: Int)// initial weight
extends Schedulable with Logging {
val schedulableQueue = new ConcurrentLinkedQueue[Schedulable] // queue of child schedulables
val schedulableNameToSchedulable = new ConcurrentHashMap[String, Schedulable] // child name -> child schedulable
val weight = initWeight
val minShare = initMinShare
var runningTasks = 0 // count of running tasks; changed by increaseRunningTasks / decreaseRunningTasks
val priority = 0
// A pool's stage id is used to break the tie in scheduling.
var stageId = -1
val name = poolName
var parent: Pool = null
// Ordering strategy chosen from schedulingMode; its comparator is what
// getSortedTaskSetQueue sorts the child schedulables with. Any mode other than
// FAIR or FIFO is rejected with an IllegalArgumentException.
private val taskSetSchedulingAlgorithm: SchedulingAlgorithm = schedulingMode match {
  case SchedulingMode.FIFO => new FIFOSchedulingAlgorithm()
  case SchedulingMode.FAIR => new FairSchedulingAlgorithm()
  case _ =>
    throw new IllegalArgumentException(
      s"Unsupported scheduling mode: $schedulingMode. Use FAIR or FIFO instead.")
}
/**
 * Registers a child schedulable (TaskSetManager is one Schedulable subclass) with this pool.
 *
 * The child is appended to the queue, indexed by its name, and its `parent` is pointed at
 * this pool. Per the original notes this is reached through FIFOSchedulableBuilder, i.e.
 * ultimately from TaskSchedulerImpl.submitTasks.
 *
 * @param schedulable the child to register; must not be null
 */
override def addSchedulable(schedulable: Schedulable): Unit = {
  require(schedulable != null)
  schedulableQueue.add(schedulable)
  schedulableNameToSchedulable.put(schedulable.name, schedulable)
  schedulable.parent = this
}
// Removes a child schedulable (e.g. a TaskSetManager) from this pool.
// TaskSchedulerImpl.taskSetFinished calls this as manager.parent.removeSchedulable(manager),
// which works because addSchedulable sets schedulable.parent = this.
// NOTE(review): the method body is not visible in this excerpt.
override def removeSchedulable(schedulable: Schedulable) {
// Looks up a schedulable by name: the name map is consulted first, otherwise every queued
// child is asked through its own getSchedulableByName and the first non-null hit is returned.
// NOTE(review): the closing braces and the fall-through result are not visible in this excerpt.
override def getSchedulableByName(schedulableName: String): Schedulable = {
if (schedulableNameToSchedulable.containsKey(schedulableName)) {
return schedulableNameToSchedulable.get(schedulableName)
for (schedulable <- schedulableQueue.asScala) {
val sched = schedulable.getSchedulableByName(schedulableName)
if (sched != null) {
return sched
/**
 * Propagates the loss of an executor to every child schedulable in the queue, in queue order.
 * TaskSchedulerImpl.removeExecutor invokes this on the root pool.
 */
override def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit = {
  schedulableQueue.asScala.foreach(_.executorLost(executorId, host, reason))
}
// Checks for speculatable tasks: the flag becomes true as soon as any queued child reports one.
// Called from TaskSchedulerImpl.checkSpeculatableTasks, which the original notes describe as a
// periodically scheduled job.
// NOTE(review): the closing braces and the final result expression are not visible in this excerpt.
override def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean = {
var shouldRevive = false
for (schedulable <- schedulableQueue.asScala) {
shouldRevive |= schedulable.checkSpeculatableTasks(minTimeToSpeculation) // per the original note the children here are TaskSetManagers
// Collects the TaskSetManagers of all children, visiting the children in the order given by
// the scheduling algorithm's comparator. Called from TaskSchedulerImpl.resourceOffers.
// NOTE(review): the closing braces and the final result expression are not visible in this excerpt.
override def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] = {
val sortedTaskSetQueue = new ArrayBuffer[TaskSetManager]
val sortedSchedulableQueue =
schedulableQueue.asScala.toSeq.sortWith(taskSetSchedulingAlgorithm.comparator) // children sorted by the scheduling algorithm
for (schedulable <- sortedSchedulableQueue) {
sortedTaskSetQueue ++= schedulable.getSortedTaskSetQueue
// Adds taskNum to this pool's running-task count.
// NOTE(review): the statement guarded by the parent check is not visible in this excerpt;
// presumably the same change is forwarded to the parent pool — TODO confirm.
def increaseRunningTasks(taskNum: Int) {
runningTasks += taskNum
if (parent != null) {
// Subtracts taskNum from this pool's running-task count.
// NOTE(review): the statement guarded by the parent check is not visible in this excerpt;
// presumably the same change is forwarded to the parent pool — TODO confirm.
def decreaseRunningTasks(taskNum: Int) {
runningTasks -= taskNum
if (parent != null) {
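// Mainly manages task scheduling, so it is used by DAGScheduler. While a task runs, the
// scheduler has to talk to the executors, which is why initialize() receives a
// SchedulerBackend: that backend is the channel to the executors.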
private[spark] class TaskSchedulerImpl(
val sc: SparkContext,
val maxTaskFailures: Int, // per the original note the default is 4 (the one-argument constructor reads config.MAX_TASK_FAILURES)
isLocal: Boolean = false)
extends TaskScheduler with Logging {
import TaskSchedulerImpl._
// Convenience constructor: takes maxTaskFailures from config.MAX_TASK_FAILURES
// (4 by default according to the original note) and leaves isLocal at its default.
def this(sc: SparkContext) = this(sc, sc.conf.get(config.MAX_TASK_FAILURES))
// Lazily initializing blacklistTrackerOpt to avoid getting empty ExecutorAllocationClient,
// because ExecutorAllocationClient is created after this TaskSchedulerImpl.
private[scheduler] lazy val blacklistTrackerOpt = maybeCreateBlacklistTracker(sc) // per the original note: None when blacklisting is off (the default)
val conf = sc.conf
// How often to check for speculative tasks
// (speculation check interval; falls back to 100ms when the key is unset)
val SPECULATION_INTERVAL_MS = conf.getTimeAsMs("spark.speculation.interval", "100ms")
// Duplicate copies of a task will only be launched if the original copy has been running for
// at least this amount of time. This is to avoid the overhead of launching speculative copies
// of tasks that are very short.
// NOTE(review): the value this comment describes is not visible in this excerpt.
private val speculationScheduler = // background scheduler for speculation checks; NOTE(review): right-hand side not visible in this excerpt
// Threshold above which we warn user initial TaskSet may be starved
val STARVATION_TIMEOUT_MS = conf.getTimeAsMs("spark.starvation.timeout", "15s")
// CPUs to request per task
val CPUS_PER_TASK = conf.getInt("spark.task.cpus", 1)
// TaskSetManagers are not thread safe, so any access to one should be synchronized
// on this class.
// A stage may be attempted several times; every attempt gets its own TaskSetManager.
private val taskSetsByStageIdAndAttempt = new HashMap[Int, HashMap[Int, TaskSetManager]] // stage id -> (stage attempt id -> TaskSetManager)
// Protected by `this`
private[scheduler] val taskIdToTaskSetManager = new ConcurrentHashMap[Long, TaskSetManager] // task id -> owning TaskSetManager
val taskIdToExecutorId = new HashMap[Long, String] // task id -> executor id
@volatile private var hasReceivedTask = false
@volatile private var hasLaunchedTask = false
private val starvationTimer = new Timer(true)
// Incrementing task IDs
val nextTaskId = new AtomicLong(0) // source of new task ids
// IDs of the tasks running on each executor
private val executorIdToRunningTaskIds = new HashMap[String, HashSet[Long]] // executor id -> ids of the tasks running on it
// Returns a map from executor to the number of tasks running on it (evaluated under this
// object's lock).
// NOTE(review): the method body is not visible in this excerpt.
def runningTasksByExecutors: Map[String, Int] = synchronized {
// The set of executors we have on each host; this is used to compute hostsAlive, which
// in turn is used to decide when we can attain data locality on a given host
protected val hostToExecutors = new HashMap[String, HashSet[String]] // host -> executor ids on that host (a host may run several executors)
protected val hostsByRack = new HashMap[String, HashSet[String]]
protected val executorIdToHost = new HashMap[String, String] // executor id -> host
// Listener object to pass upcalls into
var dagScheduler: DAGScheduler = null
var backend: SchedulerBackend = null // per the original note: the channel between the driver and the executors
val mapOutputTracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] // per the original note: keeps the map-output locations of stages
private var schedulableBuilder: SchedulableBuilder = null
// default scheduler is FIFO
private val schedulingModeConf = conf.get(SCHEDULER_MODE_PROPERTY, SchedulingMode.FIFO.toString) // configured scheduling mode, FIFO when unset
// Effective scheduling mode, presumably parsed from schedulingModeConf; a
// NoSuchElementException raised while resolving it is rethrown as a SparkException.
// NOTE(review): the body of the try block is not visible in this excerpt.
val schedulingMode: SchedulingMode =
try {
} catch {
case e: java.util.NoSuchElementException =>
throw new SparkException(s"Unrecognized $SCHEDULER_MODE_PROPERTY: $schedulingModeConf")
// Root of the pool hierarchy: unnamed, with minShare 0 and weight 0.
val rootPool: Pool = new Pool("", schedulingMode, 0, 0)
// This is a var so that we can reset it for testing purposes.
// Task result getter.
private[spark] var taskResultGetter = new TaskResultGetter(sc.env, this) // used to fetch task results
/**
 * Stores the DAGScheduler that receives this scheduler's upcalls.
 * Per the original notes this is called by DAGScheduler itself.
 */
override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {
  this.dagScheduler = dagScheduler
}
// Initialization: stores the backend and picks the SchedulableBuilder that matches
// schedulingMode (FIFO or FAIR); any other mode raises IllegalArgumentException.
// Per the original notes SparkContext calls this before start().
// NOTE(review): the end of the error message and the rest of the method are not visible here.
def initialize(backend: SchedulerBackend) {
this.backend = backend
schedulableBuilder = {
schedulingMode match {
case SchedulingMode.FIFO =>
new FIFOSchedulableBuilder(rootPool)
case SchedulingMode.FAIR =>
new FairSchedulableBuilder(rootPool, conf)
case _ =>
throw new IllegalArgumentException(s"Unsupported $SCHEDULER_MODE_PROPERTY: " +
def newTaskId(): Long = nextTaskId.getAndIncrement() // hands out the current id and advances the counter
// Starts the backend and, when not local and spark.speculation is enabled, schedules a
// periodic background job that calls checkSpeculatableTasks().
// Per the original notes SparkContext calls this after initialize (the note cites line 508).
// NOTE(review): the scheduling delay/period arguments are not visible in this excerpt.
override def start() {
backend.start() // start the scheduler backend
if (!isLocal && conf.getBoolean("spark.speculation", false)) { // only when speculation is turned on
logInfo("Starting speculative execution thread")
speculationScheduler.scheduleWithFixedDelay(new Runnable { // periodic speculation check on the background scheduler
override def run(): Unit = Utils.tryOrStopSparkContext(sc) {
checkSpeculatableTasks() // look for speculatable tasks
/** Hook run after start: waits until the backend is ready (see waitBackendReady). */
override def postStartHook(): Unit = {
  waitBackendReady()
}
// Submits the task set of one stage: creates its TaskSetManager, records it in
// taskSetsByStageIdAndAttempt, hands it to the schedulable builder and asks the backend to
// revive offers. Per the original notes this is called from DAGScheduler.submitMissingTasks.
// NOTE(review): closing braces and some statements (e.g. the timer arguments) are not
// visible in this excerpt.
override def submitTasks(taskSet: TaskSet) {// the TaskSet holds all tasks of one stage
val tasks: Array[Task[_]] = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
val manager: TaskSetManager = createTaskSetManager(taskSet, maxTaskFailures) // new TaskSetManager for this stage's TaskSet
val stage: Int = taskSet.stageId // id of this stage
val stageTaskSets: mutable.Map[Int, TaskSetManager] =
taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager]) // attempts of this stage; each attempt has its own TaskSetManager
// Mark all the existing TaskSetManagers of this stage as zombie, as we are adding a new one.
// This is necessary to handle a corner case. Let's say a stage has 10 partitions and has 2
// TaskSetManagers: TSM1(zombie) and TSM2(active). TSM1 has a running task for partition 10
// and it completes. TSM2 finishes tasks for partition 1-9, and thinks he is still active
// because partition 10 is not completed yet. However, DAGScheduler gets task completion
// events for all the 10 partitions and thinks the stage is finished. If it's a shuffle stage
// and somehow it has missing map outputs, then DAGScheduler will resubmit it and create a
// TSM3 for it. As a stage can't have more than one active task set managers, we must mark
// TSM2 as zombie (it actually is).
stageTaskSets.foreach { case (_, ts) => // mark the managers already registered for this stage as zombie; as the example above shows, a zombie may still have running tasks
ts.isZombie = true
stageTaskSets(taskSet.stageAttemptId) = manager // register the manager under this stage attempt id
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties) // the builder adds the manager to a Pool
if (!isLocal && !hasReceivedTask) {// hasReceivedTask is false until the first submission; per the original note isLocal is false in yarn-cluster mode
starvationTimer.scheduleAtFixedRate(new TimerTask() {// periodic starvation warning
override def run() {
if (!hasLaunchedTask) {// nothing has been launched yet
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {// a task has been launched; per the original note the timer task stops itself here (that call is not visible in this excerpt)
hasReceivedTask = true// remember that a task set has been received
backend.reviveOffers() // ask the SchedulerBackend to revive offers so tasks can be dispatched to executors
// Label as private[scheduler] to allow tests to swap in different task set managers if necessary
// Builds the TaskSetManager for a task set, passing along this scheduler and the lazily
// created blacklistTrackerOpt (None when blacklisting is off, per the original note).
private[scheduler] def createTaskSetManager(
    taskSet: TaskSet,
    maxTaskFailures: Int): TaskSetManager = {
  val manager = new TaskSetManager(this, taskSet, maxTaskFailures, blacklistTrackerOpt)
  manager
}
// Cancels the tasks of a stage: for every attempt's TaskSetManager, asks the backend to kill
// each running task on its executor and aborts the manager.
// Per the original notes this is called from DAGScheduler.failJobAndIndependentStages.
// NOTE(review): closing braces are not visible in this excerpt, so nesting is inferred.
override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = synchronized {
logInfo("Cancelling stage " + stageId)
taskSetsByStageIdAndAttempt.get(stageId).foreach { attempts => // the attempts recorded for this stage, if any
attempts.foreach { case (_, tsm) => // tsm is a TaskSetManager
// There are two possible cases here:
// 1. The task set manager has been created and some tasks have been scheduled.
// In this case, send a kill signal to the executors to kill the task and then abort
// the stage.
// 2. The task set manager has been created but no tasks have been scheduled. In this case,
// simply abort the stage.
tsm.runningTasksSet.foreach { tid => // every running task of this manager
taskIdToExecutorId.get(tid).foreach(execId => // executor running the task, if known
backend.killTask(tid, execId, interruptThread, reason = "Stage cancelled")) // kill the task on that executor
tsm.abort("Stage %s cancelled".format(stageId)) // abort the task set
logInfo("Stage %d was cancelled".format(stageId))
// Kills a single task attempt through the backend when its executor is known; otherwise
// logs a warning. Per the original notes this is called from DAGScheduler.killTaskAttempt.
// NOTE(review): the Boolean results of both branches are not visible in this excerpt.
override def killTaskAttempt(taskId: Long, interruptThread: Boolean, reason: String): Boolean = {
logInfo(s"Killing task $taskId: $reason")
val execId = taskIdToExecutorId.get(taskId) // executor running this task, if any
if (execId.isDefined) {
backend.killTask(taskId, execId.get, interruptThread, reason) // ask the backend to kill the task on that executor
} else {
logWarning(s"Could not kill task $taskId because no task with that ID was found.")
/**
 * Called to indicate that all task attempts (including speculated tasks) associated with the
 * given TaskSetManager have completed, so state associated with the TaskSetManager should be
 * cleaned up.
 */
// Cleans up after a finished TaskSetManager: drops its stage attempt from
// taskSetsByStageIdAndAttempt (and the stage entry once it is empty) and removes the
// manager from its parent pool.
// Per the original notes TaskSetManager.maybeFinishTaskSet calls this, because the
// TaskSetManager is what tracks whether all tasks of a stage have finished.
// NOTE(review): closing braces are not visible in this excerpt, so nesting is inferred.
def taskSetFinished(manager: TaskSetManager): Unit = synchronized {
taskSetsByStageIdAndAttempt.get(manager.taskSet.stageId).foreach { taskSetsForStage => // taskSetsForStage is the HashMap[Int, TaskSetManager] of this stage
taskSetsForStage -= manager.taskSet.stageAttemptId // drop this stage attempt
if (taskSetsForStage.isEmpty) { // no attempts left for the stage
taskSetsByStageIdAndAttempt -= manager.taskSet.stageId
manager.parent.removeSchedulable(manager) // manager.parent is the Pool that addSchedulable assigned
logInfo(s"Removed TaskSet ${manager.taskSet.id}, whose tasks have all completed, from pool" +
s" ${manager.parent.name}")
// Internal helper of resourceOffers: offers every executor with enough free cores to one
// TaskSetManager at the given locality level, appends the returned TaskDescriptions to
// `tasks` and updates the bookkeeping maps and the free-core counts.
// Returns whether at least one task was launched.
// NOTE(review): closing braces are not visible in this excerpt, so nesting is inferred.
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality, // locality level for this pass; the caller iterates taskSet.myLocalityLevels
shuffledOffers: Seq[WorkerOffer],// one offer per executor
availableCpus: Array[Int], // free cores per offer, same indexing as shuffledOffers
tasks: IndexedSeq[ArrayBuffer[TaskDescription]]) : Boolean = { // per-offer output buffers; created empty (not null) by resourceOffers
var launchedTask = false
// nodes and executors that are blacklisted for the entire application have already been
// filtered out by this point
for (i <- 0 until shuffledOffers.size) { // one iteration per executor offer
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
if (availableCpus(i) >= CPUS_PER_TASK) {// enough free cores for one more task
try {// per the original note resourceOffer yields at most one TaskDescription (an Option)
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { // ask the task set for a task on this executor/host at this locality
tasks(i) += task // append to this offer's buffer
val tid = task.taskId
taskIdToTaskSetManager.put(tid, taskSet)
taskIdToExecutorId(tid) = execId
availableCpus(i) -= CPUS_PER_TASK // reserve the cores for the launched task
assert(availableCpus(i) >= 0)
launchedTask = true
} catch {
case e: TaskNotSerializableException =>
logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
// Do not offer resources for this task, but don't throw an error to allow other
// task sets to be submitted.
return launchedTask
return launchedTask
/**
 * Called by cluster manager to offer resources on slaves. We respond by asking our active task
 * sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so
 * that tasks are balanced across the cluster.
 */
// A WorkerOffer describes the free cores of one executor; `offers` covers the live executors.
// Per the original notes CoarseGrainedSchedulerBackend.makeOffers calls this (the backend
// holds a reference to this TaskScheduler).
// NOTE(review): closing braces and several statements are not visible in this excerpt.
def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
// Mark each slave as alive and remember its hostname
// Also track if new executor is added
var newExecAvail = false
for (o <- offers) {// one WorkerOffer per live executor
if (!hostToExecutors.contains(o.host)) {
hostToExecutors(o.host) = new HashSet[String]()
if (!executorIdToRunningTaskIds.contains(o.executorId)) {
hostToExecutors(o.host) += o.executorId
executorAdded(o.executorId, o.host)
executorIdToHost(o.executorId) = o.host
executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
newExecAvail = true
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
// Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do
// this here to avoid a separate thread and added synchronization overhead, and also because
// updating the blacklist is only relevant when task offers are being made.
// Per the original note: with blacklisting disabled, filteredOffers is simply offers.
// NOTE(review): the rest of the filter and the fallback are not visible in this excerpt.
val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>
offers.filter { offer =>
!blacklistTracker.isNodeBlacklisted(offer.host) &&
// Shuffle the offers.
val shuffledOffers: IndexedSeq[WorkerOffer] = shuffleOffers(filteredOffers)
// Build a list of tasks to assign to each worker.
val tasks: IndexedSeq[ArrayBuffer[TaskDescription]] = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK)) // CPUS_PER_TASK is the number of cores requested per task (default 1), so cores / CPUS_PER_TASK is the buffer's initial capacity
val availableCpus: Array[Int] = shuffledOffers.map(o => o.cores).toArray // free cores per offer
val sortedTaskSets: mutable.Seq[TaskSetManager] = rootPool.getSortedTaskSetQueue // task sets in scheduling order; per the original note usually just one stage's TaskSetManager
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks)) // taskSet.parent.name is the Pool name; the root pool here is created with ""
// Per the original note taskSet.name is derived from the task set id supplied by DAGScheduler.
if (newExecAvail) {
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
for (taskSet <- sortedTaskSets) {
var launchedAnyTask = false
var launchedTaskAtCurrentMaxLocality = false
for (currentMaxLocality <- taskSet.myLocalityLevels) {
do {
launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet( // fills `tasks` via resourceOfferSingleTaskSet
taskSet, currentMaxLocality, shuffledOffers, availableCpus, tasks)
launchedAnyTask |= launchedTaskAtCurrentMaxLocality
} while (launchedTaskAtCurrentMaxLocality)
if (!launchedAnyTask) {
if (tasks.size > 0) {
hasLaunchedTask = true
return tasks // the per-offer buffers of assigned TaskDescriptions
/**
 * Shuffle offers around to avoid always placing tasks on the same workers. Exposed to allow
 * overriding in tests, so it can be deterministic.
 */
// Returns the worker offers in shuffled order.
// NOTE(review): the method body is not visible in this excerpt.
protected def shuffleOffers(offers: IndexedSeq[WorkerOffer]): IndexedSeq[WorkerOffer] = {
// Updates the execution state of one task.
// Per the original notes: when a task finishes on an executor, the executor notifies
// CoarseGrainedSchedulerBackend, whose StatusUpdate handling calls this method.
// Finished states clean up the task's bookkeeping and enqueue the result (or failure) on
// taskResultGetter; a LOST task additionally removes its executor.
// NOTE(review): closing braces and some statements are not visible in this excerpt.
def statusUpdate(tid: Long, state: TaskState, serializedData: ByteBuffer) {
var failedExecutor: Option[String] = None
var reason: Option[ExecutorLossReason] = None
synchronized {
try {
Option(taskIdToTaskSetManager.get(tid)) match {// look up the owning TaskSetManager
case Some(taskSet) =>
if (state == TaskState.LOST) {// task lost
// TaskState.LOST is only used by the deprecated Mesos fine-grained scheduling mode,
// where each executor corresponds to a single task, so mark the executor as failed.
val execId = taskIdToExecutorId.getOrElse(tid, throw new IllegalStateException( // executor of this task
"taskIdToTaskSetManager.contains(tid) <=> taskIdToExecutorId.contains(tid)"))
if (executorIdToRunningTaskIds.contains(execId)) {// the executor is still tracked
reason = Some(
SlaveLost(s"Task $tid was lost, so marking the executor as lost as well."))
removeExecutor(execId, reason.get) // drops the executor and its tasks and updates hostToExecutors, hostsByRack, executorIdToHost
failedExecutor = Some(execId)
if (TaskState.isFinished(state)) {// task reached a finished state
cleanupTaskState(tid) // clears taskIdToTaskSetManager, taskIdToExecutorId, executorIdToRunningTaskIds entries
taskSet.removeRunningTask(tid) // per the original note: removes the task id from runningTasksSet and the Pool
if (state == TaskState.FINISHED) {
taskResultGetter.enqueueSuccessfulTask(taskSet, tid, serializedData) // per the original note this leads back to handleSuccessfulTask in this class
// Per the original note: handleSuccessfulTask here delegates to
// TaskSetManager.handleSuccessfulTask, which calls TaskSetManager.maybeFinishTaskSet,
// since all tasks of the stage may now be complete.
// The FAILED / KILLED / LOST path below follows the same call pattern.
} else if (Set(TaskState.FAILED, TaskState.KILLED, TaskState.LOST).contains(state)) {
taskResultGetter.enqueueFailedTask(taskSet, tid, state, serializedData)
case None =>
("Ignoring update with state %s for TID %s because its task set is gone (this is " +
"likely the result of receiving duplicate task finished status updates) or its " +
"executor has been marked as failed.")
.format(state, tid))
} catch {
case e: Exception => logError("Exception in statusUpdate", e)
// Update the DAGScheduler without holding a lock on this, since that can deadlock
if (failedExecutor.isDefined) {// an executor failed: tell dagScheduler (the original note also mentions the backend; that call is not visible here)
dagScheduler.executorLost(failedExecutor.get, reason.get)
/**
 * Update metrics for in-progress tasks and let the master know that the BlockManager is still
 * alive. Return true if the driver knows about the given block manager. Otherwise, return false,
 * indicating that the block manager should re-register.
 */
// The result says whether the driver's blockManagerMasterEndpoint already knows this
// blockManagerId (per the original note).
// Per the original notes this is called from HeartbeatReceiver.receiveAndReply in the
// `case heartbeat @ Heartbeat(executorId, accumUpdates, blockManagerId) =>` branch.
// Mainly used to forward accumulator updates.
// NOTE(review): closing braces are not visible in this excerpt.
override def executorHeartbeatReceived(
execId: String,
accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
blockManagerId: BlockManagerId): Boolean = {
// (taskId, stageId, stageAttemptId, accumUpdates)
val accumUpdatesWithTaskIds: Array[(Long, Int, Int, Seq[AccumulableInfo])] = {
accumUpdates.flatMap { case (id, updates) =>
val accInfos = updates.map(acc => acc.toInfo(Some(acc.value), None))
Option(taskIdToTaskSetManager.get(id)).map { taskSetMgr =>
(id, taskSetMgr.stageId, taskSetMgr.taskSet.stageAttemptId, accInfos)
// Delegate to the DAGScheduler, whose result is returned to the caller.
dagScheduler.executorHeartbeatReceived(execId, accumUpdatesWithTaskIds, blockManagerId)
// Per the original note this is used from TaskResultGetter.enqueueSuccessfulTask.
// NOTE(review): the method body is not visible in this excerpt.
def handleTaskGettingResult(taskSetManager: TaskSetManager, tid: Long): Unit = synchronized {
// Forwards a successful task result to its TaskSetManager while holding this scheduler's
// lock. Per the original note this is used from TaskResultGetter.enqueueSuccessfulTask.
def handleSuccessfulTask(
    taskSetManager: TaskSetManager,
    tid: Long,
    taskResult: DirectTaskResult[_]): Unit = {
  synchronized {
    // delegate to the TaskSetManager's own handleSuccessfulTask
    taskSetManager.handleSuccessfulTask(tid, taskResult)
  }
}
// Forwards a failed task to its TaskSetManager.
// NOTE(review): presumably invoked from TaskResultGetter.enqueueFailedTask; the original
// note said enqueueSuccessfulTask, which looks like a copy-paste slip — confirm.
// NOTE(review): the statement guarded by the final check is not visible in this excerpt.
def handleFailedTask(
taskSetManager: TaskSetManager,
tid: Long,
taskState: TaskState,
reason: TaskFailedReason): Unit = synchronized {
taskSetManager.handleFailedTask(tid, taskState, reason)
if (!taskSetManager.isZombie && !taskSetManager.someAttemptSucceeded(tid)) {
// Need to revive offers again now that the task set manager state has been updated to
// reflect failed tasks that need to be re-run.
// Reports a scheduler-level error: when task sets are registered, each TaskSetManager is
// aborted with the message (exceptions from abort are logged); otherwise a SparkException
// is thrown.
// NOTE(review): closing braces are not visible in this excerpt.
def error(message: String) {
synchronized {
if (taskSetsByStageIdAndAttempt.nonEmpty) {
// Have each task set throw a SparkException with the error
for {
attempts <- taskSetsByStageIdAndAttempt.values
manager <- attempts.values
} {
try {
manager.abort(message) // abort every stage attempt's TaskSetManager with the message
} catch {
case e: Exception => logError("Exception in error callback", e)
} else {
// No task sets are active but we still got an error. Just exit since this
// must mean the error is during registration.
// It might be good to do something smarter here in the future.
throw new SparkException(s"Exiting due to error from cluster scheduler: $message")
// Per the original note this is called from DAGScheduler.stop.
// NOTE(review): the statements guarded by the two null checks are not visible in this excerpt.
override def stop() {
if (backend != null) {
if (taskResultGetter != null) {
// Default parallelism as reported by the backend; per the original note used by DAGScheduler.
override def defaultParallelism(): Int = backend.defaultParallelism()
// Check for speculatable tasks in all our active jobs.
// This is the periodic speculation check scheduled by start().
// NOTE(review): closing braces are not visible in this excerpt, so nesting is inferred.
def checkSpeculatableTasks() {
var shouldRevive = false
synchronized {
shouldRevive = rootPool.checkSpeculatableTasks(MIN_TIME_TO_SPECULATION) // delegate to the root Pool
if (shouldRevive) {// some task can be speculated
backend.reviveOffers() // revive offers so the speculative copies can be scheduled
// Handles the loss of an executor. Per the original notes this runs after HeartbeatReceiver
// detects an executor timeout and is also used from the backend's removeExecutor.
// NOTE(review): closing braces are not visible in this excerpt.
override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {
var failedExecutor: Option[String] = None
synchronized {
if (executorIdToRunningTaskIds.contains(executorId)) {
val hostPort = executorIdToHost(executorId)
logExecutorLoss(executorId, hostPort, reason)
removeExecutor(executorId, reason)
failedExecutor = Some(executorId)
} else {
executorIdToHost.get(executorId) match {
case Some(hostPort) =>
// If the host mapping still exists, it means we don't know the loss reason for the
// executor. So call removeExecutor() to update tasks running on that executor when
// the real loss reason is finally known.
logExecutorLoss(executorId, hostPort, reason)
removeExecutor(executorId, reason)
case None =>
// We may get multiple executorLost() calls with different loss reasons. For example,
// one may be triggered by a dropped connection from the slave while another may be a
// report of executor termination from Mesos. We produce log messages for both so we
// eventually report the termination reason.
logError(s"Lost an executor $executorId (already removed): $reason")
// Call dagScheduler.executorLost without holding the lock on this to prevent deadlock
if (failedExecutor.isDefined) {
dagScheduler.executorLost(failedExecutor.get, reason)
// Logs the removal of a worker and passes it on to the DAGScheduler.
// Per the original note this is used from the backend's removeExecutor.
override def workerRemoved(workerId: String, host: String, message: String): Unit = {
  logInfo(s"Handle removed worker $workerId: $message")
  val scheduler = dagScheduler
  scheduler.workerRemoved(workerId, host, message)
}
// Logs an executor loss at a level chosen by the reason: debug while the reason is still
// pending, info when the driver killed the executor, error for anything else.
private def logExecutorLoss(
    executorId: String,
    hostPort: String,
    reason: ExecutorLossReason): Unit = {
  reason match {
    case LossReasonPending =>
      logDebug(s"Executor $executorId on $hostPort lost, but reason not yet known.")
    case ExecutorKilled =>
      logInfo(s"Executor $executorId on $hostPort killed by driver.")
    case _ =>
      logError(s"Lost executor $executorId on $hostPort: $reason")
  }
}
/**
 * Cleans up the TaskScheduler's state for tracking the given task.
 */
// Drops one task from the scheduler's bookkeeping (used only inside this class): its
// TaskSetManager mapping, its executor mapping and, if that executor is still tracked,
// its entry in the executor's running-task set.
private def cleanupTaskState(tid: Long): Unit = {
  taskIdToTaskSetManager.remove(tid)
  for {
    executorId <- taskIdToExecutorId.remove(tid)
    runningTaskIds <- executorIdToRunningTaskIds.get(executorId)
  } {
    runningTaskIds.remove(tid)
  }
}
/**
 * Remove an executor from all our data structures and mark it as lost. If the executor's loss
 * reason is not yet known, do not yet remove its association with its host nor update the status
 * of any running tasks, since the loss reason defines whether we'll fail those tasks.
 */
// Removes an executor: cleans up the state of its tasks and updates hostToExecutors,
// hostsByRack and executorIdToHost. Called from statusUpdate and executorLost in this class.
// NOTE(review): closing braces are not visible in this excerpt, so nesting is inferred.
private def removeExecutor(executorId: String, reason: ExecutorLossReason) {
// The tasks on the lost executor may not send any more status updates (because the executor
// has been lost), so they should be cleaned up here.
executorIdToRunningTaskIds.remove(executorId).foreach { taskIds => // run cleanupTaskState for each task of this executor
logDebug("Cleaning up TaskScheduler state for tasks " +
s"${taskIds.mkString("[", ",", "]")} on failed executor $executorId")
// We do not notify the TaskSetManager of the task failures because that will
// happen below in the rootPool.executorLost() call.
taskIds.foreach(cleanupTaskState) // clear the scheduler's per-task state
val host = executorIdToHost(executorId) // host of this executor
val execs: mutable.Set[String] = hostToExecutors.getOrElse(host, new HashSet) // executors on that host; the host/rack maps are updated below
execs -= executorId
if (execs.isEmpty) {
hostToExecutors -= host
for (rack <- getRackForHost(host); hosts <- hostsByRack.get(rack)) {
hosts -= host
if (hosts.isEmpty) {
hostsByRack -= rack
if (reason != LossReasonPending) {
executorIdToHost -= executorId
rootPool.executorLost(executorId, host, reason) // notify the pool hierarchy
/** Tells the DAGScheduler that an executor was added on the given host. */
def executorAdded(execId: String, host: String): Unit = {
  dagScheduler.executorAdded(execId, host)
}
// Immutable snapshot of the executor ids tracked for a host, or None when the host has no
// entry in hostToExecutors. Evaluated under this object's lock.
def getExecutorsAliveOnHost(host: String): Option[Set[String]] = synchronized {
  for (executors <- hostToExecutors.get(host)) yield executors.toSet
}
// True when hostToExecutors has an entry for the host. Evaluated under this object's lock.
def hasExecutorsAliveOnHost(host: String): Boolean = synchronized {
  hostToExecutors.get(host).isDefined
}
// Whether the given rack has any host.
// NOTE(review): the method body is not visible in this excerpt.
def hasHostAliveOnRack(rack: String): Boolean = synchronized {
// An executor counts as alive while it has an entry in executorIdToRunningTaskIds.
// Evaluated under this object's lock.
def isExecutorAlive(execId: String): Boolean = synchronized {
  executorIdToRunningTaskIds.get(execId).isDefined
}
// NOTE(review): the method body is not visible in this excerpt.
def isExecutorBusy(execId: String): Boolean = synchronized {
/**
 * Get a snapshot of the currently blacklisted nodes for the entire application. This is
 * thread-safe -- it can be called without a lock on the TaskScheduler.
 */
// NOTE(review): the method body is not visible in this excerpt.
def nodeBlacklist(): scala.collection.immutable.Set[String] = {
// By default, rack is unknown
// (this implementation always answers None).
def getRackForHost(value: String): Option[String] = None
// Waits for the backend to become ready; fails with IllegalStateException if the
// SparkContext is stopped while waiting.
// NOTE(review): closing braces and some statements (the early exit and the wait inside the
// synchronized block) are not visible in this excerpt.
private def waitBackendReady(): Unit = {
if (backend.isReady) { // per the original note: already ready means return immediately, otherwise wait below
while (!backend.isReady) {
// Might take a while for backend to be ready if it is waiting on resources.
if (sc.stopped.get) {
// For example: the master removes the application for some reason
throw new IllegalStateException("Spark context stopped while waiting for backend")
synchronized {
// Application id as reported by the backend.
override def applicationId(): String = backend.applicationId()
// Application attempt id as reported by the backend.
override def applicationAttemptId(): Option[String] = backend.applicationAttemptId()
// Looks up the TaskSetManager registered for the given stage id and stage attempt id.
// NOTE(review): the yield body is not visible in this excerpt.
private[scheduler] def taskSetManagerForAttempt(
stageId: Int,
stageAttemptId: Int): Option[TaskSetManager] = {
for {
attempts <- taskSetsByStageIdAndAttempt.get(stageId)
manager <- attempts.get(stageAttemptId)
} yield {
/**
 * Marks the task has completed in all TaskSetManagers for the given stage.
 *
 * After stage failure and retry, there may be multiple TaskSetManagers for the stage.
 * If an earlier attempt of a stage completes a task, we should ensure that the later attempts
 * do not also submit those same tasks. That also means that a task completion from an earlier
 * attempt can lead to the entire stage getting marked as successful.
 */
// Marks the task for partitionId as completed in every TaskSetManager registered for the
// stage; a stage with no registered attempts is a no-op.
// Per the original note TaskSetManager.handleSuccessfulTask calls this.
private[scheduler] def markPartitionCompletedInAllTaskSets(
    stageId: Int,
    partitionId: Int,
    taskInfo: TaskInfo) = {
  for {
    attempts <- taskSetsByStageIdAndAttempt.get(stageId)
    tsm <- attempts.values // every TaskSetManager of the stage
  } {
    tsm.markPartitionCompleted(partitionId, taskInfo)
  }
}
private[spark] object TaskSchedulerImpl {
// Configuration key that selects the scheduling mode.
val SCHEDULER_MODE_PROPERTY = "spark.scheduler.mode"
/**
 * Used to balance containers across hosts.
 *
 * Accepts a map of hosts to resource offers for that host, and returns a prioritized list of
 * resource offers representing the order in which the offers should be used. The resource
 * offers are ordered such that we'll allocate one container on each host before allocating a
 * second container on any host, and so on, in order to reduce the damage if a host fails.
 *
 * For example, given {@literal <h1, [o1, o2, o3]>}, {@literal <h2, [o4]>} and
 * {@literal <h3, [o5, o6]>}, returns {@literal [o1, o5, o4, o2, o6, o3]}.
 */
// Interleaves the per-key lists round-robin: keys are visited from the largest list to the
// smallest, and on each pass the index'th element of every list that still has one is
// appended to the result buffer.
// NOTE(review): closing braces/parentheses and the final result expression are not visible
// in this excerpt.
def prioritizeContainers[K, T] (map: HashMap[K, ArrayBuffer[T]]): List[T] = {
val _keyList = new ArrayBuffer[K](map.size)
_keyList ++= map.keys
// order keyList based on population of value in map
val keyList = _keyList.sortWith(
(left, right) => map(left).size > map(right).size
val retval = new ArrayBuffer[T](keyList.size * 2)
var index = 0
var found = true
while (found) {
found = false
for (key <- keyList) {
val containerList: ArrayBuffer[T] = map.getOrElse(key, null)
assert(containerList != null)
// Get the index'th entry for this host - if present
if (index < containerList.size) {
retval += containerList.apply(index)
found = true
index += 1
/**
 * Creates the application-level BlacklistTracker when blacklisting is enabled in the
 * SparkContext's configuration, and returns None otherwise.
 *
 * The scheduler backend is handed to the tracker as its allocation client only when it
 * implements ExecutorAllocationClient.
 */
private def maybeCreateBlacklistTracker(sc: SparkContext): Option[BlacklistTracker] = {
  if (BlacklistTracker.isBlacklistEnabled(sc.conf)) {
    val executorAllocClient: Option[ExecutorAllocationClient] = sc.schedulerBackend match {
      case b: ExecutorAllocationClient => Some(b)
      case _ => None
    }
    Some(new BlacklistTracker(sc, executorAllocClient))
  } else {
    None
  }
}
/*
 * TaskSetManager manages the tasks of a single Stage.
 * An instance is created in TaskSchedulerImpl.createTaskSetManager, so the methods of this
 * class are mostly invoked from TaskSchedulerImpl.
 */
// Tracks and schedules the tasks of one TaskSet on behalf of the given TaskSchedulerImpl.
private[spark] class TaskSetManager(
sched: TaskSchedulerImpl,
val taskSet: TaskSet, // the tasks of one stage (original annotation: produced by the DAGScheduler - confirm against caller)
val maxTaskFailures: Int,
blacklistTracker: Option[BlacklistTracker] = None, // application-level blacklist tracker; None by default
clock: Clock = new SystemClock()) extends Schedulable with Logging {
private val conf = sched.sc.conf
// SPARK-21563 make a copy of the jars/files so they are consistent across the TaskSet
private val addedJars = HashMap[String, Long](sched.sc.addedJars.toSeq: _*)
private val addedFiles = HashMap[String, Long](sched.sc.addedFiles.toSeq: _*)
// Quantile of tasks at which to start speculation
val SPECULATION_QUANTILE = conf.getDouble("spark.speculation.quantile", 0.75)
val SPECULATION_MULTIPLIER = conf.getDouble("spark.speculation.multiplier", 1.5)
// Limit of bytes for total size of results (default is 1GB)
val maxResultSize = Utils.getMaxResultSize(conf) // the limit reported as spark.driver.maxResultSize in canFetchMoreResults
val speculationEnabled = conf.getBoolean("spark.speculation", false) // whether speculative execution is enabled (off unless configured)
// Serializer for closures and tasks.
val env = SparkEnv.get
val ser = env.closureSerializer.newInstance() // serializer instance (original annotation: Java serialization by default - confirm)
val tasks: Array[Task[_]] = taskSet.tasks
private[scheduler] val partitionToIndex: Map[Int, Int] = tasks.zipWithIndex
.map { case (t, idx) => t.partitionId -> idx }.toMap // partition id -> position of the task inside `tasks`
val numTasks = tasks.length // number of tasks in this task set
val copiesRunning = new Array[Int](numTasks) // per task index: count of attempts launched and not yet reported failed
// For each task, tracks whether a copy of the task has succeeded. A task will also be
// marked as "succeeded" if it failed with a fetch failure, in which case it should not
// be re-run because the missing map data needs to be regenerated first.
val successful = new Array[Boolean](numTasks) // per task index: success flag
private val numFailures = new Array[Int](numTasks)// per task index: failure counter
// Add the tid of task into this HashSet when the task is killed by other attempt tasks.
// This happened while we set the `spark.speculation` to true. The task killed by others
// should not resubmit while executor lost.
private val killedByOtherAttempt = new HashSet[Long] // tids killed because another attempt succeeded
// Per task index: the TaskInfo of every attempt, newest first (resourceOffer prepends).
val taskAttempts: Array[List[TaskInfo]] = Array.fill[List[TaskInfo]](numTasks)(Nil)
private[scheduler] var tasksSuccessful = 0
val weight = 1
val minShare = 0
var priority = taskSet.priority
var stageId = taskSet.stageId
val name = "TaskSet_" + taskSet.id
var parent: Pool = null
// Running totals maintained by canFetchMoreResults.
private var totalResultSize = 0L
private var calculatedTasks = 0
// Per-task-set blacklist helper. It exists only when an application-level BlacklistTracker
// was passed to the constructor; otherwise it is None and every blacklist check is skipped.
private[scheduler] val taskSetBlacklistHelperOpt: Option[TaskSetBlacklist] =
  blacklistTracker.map(_ => new TaskSetBlacklist(conf, stageId, clock))
private[scheduler] val runningTasksSet = new HashSet[Long] // ids of the task attempts currently running
// Original annotation: this accessor is used by TaskSchedulerImpl.
override def runningTasks: Int = runningTasksSet.size // number of running task attempts
/**
 * Returns whether the task that attempt `tid` belongs to is already marked successful.
 *
 * `tid` must be a key of `taskInfos`; the map lookup throws for an unknown id.
 */
def someAttemptSucceeded(tid: Long): Boolean = successful(taskInfos(tid).index)
// True once no more tasks should be launched for this task set manager. TaskSetManagers enter
// the zombie state once at least one attempt of each task has completed successfully, or if the
// task set is aborted (for example, because it was killed). TaskSetManagers remain in the zombie
// state until all tasks have finished running; we keep TaskSetManagers that are in the zombie
// state in order to continue to track and account for the running tasks.
// TODO: We should kill any running task attempts when the task set manager becomes a zombie.
private[scheduler] var isZombie = false
// Set of pending tasks for each executor. These collections are actually
// treated as stacks, in which new tasks are added to the end of the
// ArrayBuffer and removed from the end. This makes it faster to detect
// tasks that repeatedly fail because whenever a task failed, it is put
// back at the head of the stack. These collections may contain duplicates
// for two reasons:
// (1): Tasks are only removed lazily; when a task is launched, it remains
// in all the pending lists except the one that it was launched from.
// (2): Tasks may be re-added to these lists multiple times as a result
// of failures.
// Duplicates are handled in dequeueTaskFromList, which ensures that a
// task hasn't already started running before launching it.
private val pendingTasksForExecutor = new HashMap[String, ArrayBuffer[Int]] // executor id -> pending task indices
// Set of pending tasks for each host. Similar to pendingTasksForExecutor,
// but at host level.
private val pendingTasksForHost = new HashMap[String, ArrayBuffer[Int]] // host -> pending task indices
// Set of pending tasks for each rack -- similar to the above.
private val pendingTasksForRack = new HashMap[String, ArrayBuffer[Int]] // rack -> pending task indices
// Set containing pending tasks with no locality preferences.
private[scheduler] var pendingTasksWithNoPrefs = new ArrayBuffer[Int] // pending task indices without preferred locations
// Set containing all pending tasks (also used as a stack, as above).
private val allPendingTasks = new ArrayBuffer[Int] // every pending task index
// Tasks that can be speculated. Since these will be a small fraction of total
// tasks, we'll just hold them in a HashSet.
private[scheduler] val speculatableTasks = new HashSet[Int] // task indices eligible for a speculative copy
// Task index, start and finish time for each task attempt (indexed by task ID)
private[scheduler] val taskInfos = new HashMap[Long, TaskInfo] // task id (tid) -> TaskInfo
// Use a MedianHeap to record durations of successful tasks so we know when to launch
// speculative tasks. This is only used when speculation is enabled, to avoid the overhead
// of inserting into the heap when the heap won't be used.
val successfulTaskDurations = new MedianHeap() // durations of successful tasks, consulted for speculation
// How frequently to reprint duplicate exceptions in full, in milliseconds.
// The value was previously read and discarded; it is now bound to a name so the
// failure-handling code can consult it.
val EXCEPTION_PRINT_INTERVAL =
  conf.getLong("spark.logging.exceptionPrintInterval", 10000)

// Map of recent exceptions (identified by string representation and top stack frame) to
// duplicate count (how many times the same exception has appeared) and time the full exception
// was printed. This should ideally be an LRU map that can drop old exceptions automatically.
private val recentExceptions = HashMap[String, (Int, Long)]()
// Figure out the current map output tracker epoch and set it on all tasks
val epoch = sched.mapOutputTracker.getEpoch
logDebug("Epoch for " + taskSet + ": " + epoch)
for (t <- tasks) {
  t.epoch = epoch
}

// Add all our tasks to the pending lists. We do this in reverse order
// of task index so that tasks with low indices get launched first.
// (The pending lists are consumed from their tail, see dequeueTaskFromList.)
for (i <- (0 until numTasks).reverse) {
  addPendingTask(i)
}
/**
 * Track the set of locality levels which are valid given the tasks' locality preferences and
 * the set of currently available executors. This is updated as executors are added and removed.
 * This allows a performance optimization, of skipping levels that aren't relevant (e.g., skip
 * PROCESS_LOCAL if no tasks could be run PROCESS_LOCAL for the current set of executors).
 */
// Compute the valid locality levels (original annotation: usually every level except
// RACK_LOCAL is present - confirm against computeValidLocalityLevels).
private[scheduler] var myLocalityLevels = computeValidLocalityLevels()
// Time to wait at each level
// One wait time per entry of myLocalityLevels, obtained through getLocalityWait
// (original annotation: spark.locality.wait, 3s by default - confirm).
private[scheduler] var localityWaits: Array[Long] = myLocalityLevels.map(getLocalityWait)
// Delay scheduling variables: we keep track of our current locality level and the time we
// last launched a task at that level, and move up a level when localityWaits[curLevel] expires.
// We then move down if we manage to launch a "more local" task.
private var currentLocalityIndex = 0 // Index of our current locality level in validLocalityLevels
private var lastLaunchTime = clock.getTimeMillis() // Time we last launched a task at this level
// A TaskSetManager has no child queue and no scheduling mode of its own.
override def schedulableQueue: ConcurrentLinkedQueue[Schedulable] = null
override def schedulingMode: SchedulingMode = SchedulingMode.NONE
// Set by resourceOffer after the large-task-size warning is logged, so it is logged only once.
private[scheduler] var emittedTaskSizeWarning = false
/** Add a task to all the pending-task lists that it should be on. */
// Files the task under the executor, host and rack queues implied by its preferred locations,
// under the no-preference queue when it has none, and always under allPendingTasks.
// Original annotation: already invoked for every task while this object is initialised.
private[spark] def addPendingTask(index: Int) {
for (loc <- tasks(index).preferredLocations) { // every preferred location of the task
loc match {
case e: ExecutorCacheTaskLocation => // preference names a specific executor
pendingTasksForExecutor.getOrElseUpdate(e.executorId, new ArrayBuffer) += index // queue under that executor
case e: HDFSCacheTaskLocation => // preference is a host holding an HDFS-cached copy
val exe = sched.getExecutorsAliveOnHost(loc.host) // executors alive on that host, if any
exe match {
case Some(set) =>
for (e <- set) {// every executor alive on the host
pendingTasksForExecutor.getOrElseUpdate(e, new ArrayBuffer) += index // queue under each of them
logInfo(s"Pending task $index has a cached location at ${e.host} " +
", where there are executors " + set.mkString(","))
case None => logDebug(s"Pending task $index has a cached location at ${e.host} " +
", but there are no executors alive there.")
case _ =>
pendingTasksForHost.getOrElseUpdate(loc.host, new ArrayBuffer) += index // queue under the preferred host
for (rack <- sched.getRackForHost(loc.host)) { // rack of the preferred host, when known
pendingTasksForRack.getOrElseUpdate(rack, new ArrayBuffer) += index // queue under that rack
if (tasks(index).preferredLocations == Nil) {// the task has no preferred location
pendingTasksWithNoPrefs += index // queue under "no preference"
// every task is also recorded in the catch-all list
allPendingTasks += index // No point scanning this whole list to find the old task there
/**
 * Return the pending tasks list for a given executor ID, or an empty list if
 * there is no map entry for that executor.
 *
 * The fallback buffer is freshly created and is not stored in the map.
 */
private def getPendingTasksForExecutor(executorId: String): ArrayBuffer[Int] =
  pendingTasksForExecutor.getOrElse(executorId, ArrayBuffer())
/**
 * Return the pending tasks list for a given host, or an empty list if
 * there is no map entry for that host.
 *
 * The fallback buffer is freshly created and is not stored in the map.
 */
private def getPendingTasksForHost(host: String): ArrayBuffer[Int] =
  pendingTasksForHost.getOrElse(host, ArrayBuffer())
/**
 * Return the pending rack-local task list for a given rack, or an empty list if
 * there is no map entry for that rack.
 *
 * The fallback buffer is freshly created and is not stored in the map.
 */
private def getPendingTasksForRack(rack: String): ArrayBuffer[Int] =
  pendingTasksForRack.getOrElse(rack, ArrayBuffer())
/**
 * Dequeue a pending task from the given list and return its index.
 * Return None if the list is empty.
 * This method also cleans up any tasks in the list that have already
 * been launched, since we want that to happen lazily.
 */
// Walks `list` from its tail and returns the first task index that is not blacklisted for
// this executor/host, has no copy running and has not succeeded yet.
private def dequeueTaskFromList(
execId: String,
host: String,
list: ArrayBuffer[Int]): Option[Int] = {
var indexOffset = list.size
while (indexOffset > 0) {
indexOffset -= 1
val index = list(indexOffset) // candidate, starting from the tail element
if (!isTaskBlacklistedOnExecOrNode(index, execId, host)) { // always passes when no blacklist helper exists
// This should almost always be list.trimEnd(1) to remove tail
list.remove(indexOffset) // drop the entry, whether or not it is launched
if (copiesRunning(index) == 0 && !successful(index)) {// no copy running and not yet successful
// entries for tasks that are already running or done were removed above and are skipped
return Some(index) // hand out this task index
/**
 * Check whether any attempt of the task, running or already finished, was launched on the
 * given host. `taskAttempts` keeps every attempt, so a past attempt also counts.
 */
private def hasAttemptOnHost(taskIndex: Int, host: String): Boolean =
  taskAttempts(taskIndex).exists(_.host == host)
/**
 * Whether the task-set blacklist forbids running task `index` on the given node or executor.
 * Always false when there is no blacklist helper (taskSetBlacklistHelperOpt is None).
 */
private def isTaskBlacklistedOnExecOrNode(index: Int, execId: String, host: String): Boolean =
  taskSetBlacklistHelperOpt.exists { blacklist =>
    blacklist.isNodeBlacklistedForTask(host, index) ||
      blacklist.isExecutorBlacklistedForTask(execId, index)
  }
/**
 * Return a speculative task for a given executor if any are available. The task should not have
 * an attempt running on this host, in case the host is slow. In addition, the task should meet
 * the given locality constraint.
 */
// Labeled as protected to allow tests to override providing speculative tasks if necessary
// Picks a speculatable task that may run on the offered executor/host, trying the locality
// levels in the order process-local, node-local, no-preference, rack-local, any.
protected def dequeueSpeculativeTask(execId: String, host: String, locality: TaskLocality.Value)
: Option[(Int, TaskLocality.Value)] =
{ // first drop tasks that have succeeded in the meantime
speculatableTasks.retain(index => !successful(index)) // Remove finished tasks from set
def canRunOnHost(index: Int): Boolean = {
!hasAttemptOnHost(index, host) &&
!isTaskBlacklistedOnExecOrNode(index, execId, host) // no attempt on this host yet and not blacklisted here
if (!speculatableTasks.isEmpty) {// there are still candidates
// Check for process-local tasks; note that tasks can be process-local
// on multiple nodes when we replicate cached blocks, as in Spark Streaming
for (index <- speculatableTasks if canRunOnHost(index)) {// candidates allowed on this host
val prefs: Seq[TaskLocation] = tasks(index).preferredLocations // preferred locations of the task
val executors: Seq[String] = prefs.flatMap(_ match { // executor ids named by executor-level preferences
case e: ExecutorCacheTaskLocation => Some(e.executorId)
case _ => None
if (executors.contains(execId)) { // the offered executor is one of the preferred executors
speculatableTasks -= index
return Some((index, TaskLocality.PROCESS_LOCAL))
// Check for node-local tasks
if (TaskLocality.isAllowed(locality, TaskLocality.NODE_LOCAL)) {
for (index <- speculatableTasks if canRunOnHost(index)) {
val locations: Seq[String] = tasks(index).preferredLocations.map(_.host)
if (locations.contains(host)) {
speculatableTasks -= index
return Some((index, TaskLocality.NODE_LOCAL)) // the offered host is one of the preferred hosts
// Check for no-preference tasks
if (TaskLocality.isAllowed(locality, TaskLocality.NO_PREF)) {
for (index <- speculatableTasks if canRunOnHost(index)) {
val locations = tasks(index).preferredLocations
if (locations.size == 0) {
speculatableTasks -= index
return Some((index, TaskLocality.PROCESS_LOCAL)) // a task without preferences is reported as PROCESS_LOCAL
// Check for rack-local tasks
if (TaskLocality.isAllowed(locality, TaskLocality.RACK_LOCAL)) {
for (rack <- sched.getRackForHost(host)) {
for (index <- speculatableTasks if canRunOnHost(index)) {
val racks = tasks(index).preferredLocations.map(_.host).flatMap(sched.getRackForHost)
if (racks.contains(rack)) {
speculatableTasks -= index
return Some((index, TaskLocality.RACK_LOCAL)) // a preferred host sits on the same rack
// Check for non-local tasks
if (TaskLocality.isAllowed(locality, TaskLocality.ANY)) {
for (index <- speculatableTasks if canRunOnHost(index)) {
speculatableTasks -= index
return Some((index, TaskLocality.ANY)) // any location is acceptable
/**
 * Dequeue a pending task for a given node and return its index and locality level.
 * Only search for tasks matching the given locality constraint.
 *
 * @return An option containing (task index within the task set, locality, is speculative?)
 */
// Pending tasks are served first (executor, host, no-preference, rack, any); speculative
// tasks are considered only when none of the pending queues yields a task.
private def dequeueTask(execId: String, host: String, maxLocality: TaskLocality.Value)
: Option[(Int, TaskLocality.Value, Boolean)] =
for (index <- dequeueTaskFromList(execId, host, getPendingTasksForExecutor(execId))) {
// getPendingTasksForExecutor returns the tasks pending for this executor;
// dequeueTaskFromList takes, from the tail, one that is neither running nor successful.
// Such a task is launched at PROCESS_LOCAL level.
return Some((index, TaskLocality.PROCESS_LOCAL, false))
if (TaskLocality.isAllowed(maxLocality, TaskLocality.NODE_LOCAL)) { // NODE_LOCAL permitted
for (index <- dequeueTaskFromList(execId, host, getPendingTasksForHost(host))) {
// getPendingTasksForHost returns the tasks pending for this host;
// dequeueTaskFromList takes, from the tail, one that is neither running nor successful.
// Such a task is launched at NODE_LOCAL level.
return Some((index, TaskLocality.NODE_LOCAL, false))
if (TaskLocality.isAllowed(maxLocality, TaskLocality.NO_PREF)) {// NO_PREF permitted
// Look for noPref tasks after NODE_LOCAL for minimize cross-rack traffic
for (index <- dequeueTaskFromList(execId, host, pendingTasksWithNoPrefs)) { // tasks without preferred locations
// dequeueTaskFromList takes, from the tail, one that is neither running nor successful.
// Such a task is reported as PROCESS_LOCAL.
return Some((index, TaskLocality.PROCESS_LOCAL, false))
if (TaskLocality.isAllowed(maxLocality, TaskLocality.RACK_LOCAL)) {// RACK_LOCAL permitted
for {
rack <- sched.getRackForHost(host) // rack of the offered host; original annotation: None by default - confirm
index <- dequeueTaskFromList(execId, host, getPendingTasksForRack(rack))
} {
return Some((index, TaskLocality.RACK_LOCAL, false))
if (TaskLocality.isAllowed(maxLocality, TaskLocality.ANY)) {// ANY permitted
for (index <- dequeueTaskFromList(execId, host, allPendingTasks)) {
// allPendingTasks holds every pending task;
// dequeueTaskFromList takes, from the tail, one that is neither running nor successful.
// Such a task is launched at ANY level.
return Some((index, TaskLocality.ANY, false))
// find a speculative task if all others tasks have been scheduled
// Reaching this point means no pending queue produced a task for this offer, so a
// speculative copy is tried; the result is tagged with speculative = true.
dequeueSpeculativeTask(execId, host, maxLocality).map {
case (taskIndex, allowedLocality) => (taskIndex, allowedLocality, true)}
/**
 * Respond to an offer of a single executor from the scheduler by finding a task.
 *
 * NOTE: this function is either called with a maxLocality which
 * would be adjusted by delay scheduling algorithm or it will be with a special
 * NO_PREF locality which will be not modified
 *
 * @param execId the executor Id of the offered resource
 * @param host the host Id of the offered resource
 * @param maxLocality the maximum locality we want to schedule the tasks at
 */
// Tries to schedule one task on executor `execId` of `host` within `maxLocality` and returns
// its TaskDescription.
// Original annotation: called from TaskSchedulerImpl.resourceOfferSingleTaskSet.
// NOTE(review): several statements of this method are missing from this copy of the file.
def resourceOffer(
execId: String,
host: String,
maxLocality: TaskLocality.TaskLocality)
: Option[TaskDescription] =
val offerBlacklisted = taskSetBlacklistHelperOpt.exists { blacklist => // false when no blacklist helper exists
blacklist.isNodeBlacklistedForTaskSet(host) ||
if (!isZombie && !offerBlacklisted) {// only a live task set with a non-blacklisted offer proceeds
val curTime = clock.getTimeMillis()
var allowedLocality = maxLocality
if (maxLocality != TaskLocality.NO_PREF) {
allowedLocality = getAllowedLocalityLevel(curTime)// level currently permitted by delay scheduling
if (allowedLocality > maxLocality) {// delay scheduling would allow a less local level than requested
// We're not allowed to search for farther-away tasks
allowedLocality = maxLocality // cap at the caller's limit
// dequeueTask serves pending tasks first and speculative tasks last
dequeueTask(execId, host, allowedLocality).map { case ((index, taskLocality, speculative)) =>
// Found a task; do some bookkeeping and return a task description
val task: Task[_] = tasks(index) // the task at this index of the task set
val taskId = sched.newTaskId() // fresh task id for this attempt
// Do various bookkeeping
copiesRunning(index) += 1 // one more copy of this task launched
val attemptNum = taskAttempts(index).size // attempt number = count of earlier attempts of this task
val info = new TaskInfo(taskId, index, attemptNum, curTime,
execId, host, taskLocality, speculative)
taskInfos(taskId) = info // registered so handleSuccessfulTask / handleFailedTask can look it up
taskAttempts(index) = info :: taskAttempts(index)
// Update our locality level for delay scheduling
// NO_PREF will not affect the variables related to delay scheduling
if (maxLocality != TaskLocality.NO_PREF) {
currentLocalityIndex = getLocalityIndex(taskLocality)
lastLaunchTime = curTime
// Serialize and return the task
val serializedTask: ByteBuffer = try {
} catch {
// If the task cannot be serialized, then there's no point to re-attempt the task,
// as it will always fail. So just abort the whole task-set.
case NonFatal(e) =>
val msg = s"Failed to serialize task $taskId, not attempting to retry it."
logError(msg, e)
abort(s"$msg Exception during serialization: $e")
throw new TaskNotSerializableException(e)
if (serializedTask.limit() > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 &&
!emittedTaskSizeWarning) {
emittedTaskSizeWarning = true
logWarning(s"Stage ${task.stageId} contains a task of very large size " +
s"(${serializedTask.limit() / 1024} KB). The maximum recommended task size is " +
s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.") // warned once per task set (original annotation: threshold is 100 KB - confirm)
addRunningTask(taskId) // original annotation: updates runningTasksSet and the Pool's running-task count
// We used to log the time it takes to serialize the task, but task size is already
// a good proxy to task serialization time.
// val timeTaken = clock.getTime() - startTime
val taskName = s"task ${info.id} in stage ${taskSet.id}" // label used in the log line below
logInfo(s"Starting $taskName (TID $taskId, $host, executor ${info.executorId}, " +
s"partition ${task.partitionId}, $taskLocality, ${serializedTask.limit()} bytes)")
sched.dagScheduler.taskStarted(task, info)
new TaskDescription( // the description returned to the caller
} else {
// Reports the task set as finished once it is a zombie with nothing left running.
// Invoked only from inside this class.
private def maybeFinishTaskSet() {
if (isZombie && runningTasks == 0) {// zombie and no running attempts: nothing more will happen for this task set
sched.taskSetFinished(this)// notify TaskSchedulerImpl
// NOTE(review): the body of the following check is missing from this copy of the file.
if (tasksSuccessful == numTasks) {
/**
 * Get the level we can launch tasks according to delay scheduling, based on current wait time.
 */
// Returns the locality level allowed right now. currentLocalityIndex moves forward through
// myLocalityLevels when a level has no schedulable pending task or its wait has expired.
private def getAllowedLocalityLevel(curTime: Long): TaskLocality.TaskLocality = {
// Remove the scheduled or finished tasks lazily
// Scans from the tail: returns true at the first task that is neither running nor
// successful, and removes running/successful entries met before it.
def tasksNeedToBeScheduledFrom(pendingTaskIds: ArrayBuffer[Int]): Boolean = {
var indexOffset = pendingTaskIds.size
while (indexOffset > 0) {
indexOffset -= 1
val index = pendingTaskIds(indexOffset)
if (copiesRunning(index) == 0 && !successful(index)) { // still needs to be scheduled
return true
} else {
pendingTaskIds.remove(indexOffset) // already running or successful: drop the stale entry
// Walk through the list of tasks that can be scheduled at each location and returns true
// if there are any tasks that still need to be scheduled. Lazily cleans up tasks that have
// already been scheduled.
// Also removes the keys whose lists turned out to hold nothing schedulable.
def moreTasksToRunIn(pendingTasks: HashMap[String, ArrayBuffer[Int]]): Boolean = {
val emptyKeys = new ArrayBuffer[String]
val hasTasks: Boolean = pendingTasks.exists {
case (id: String, tasks: ArrayBuffer[Int]) =>
if (tasksNeedToBeScheduledFrom(tasks)) { // this key still has a schedulable task
} else {
emptyKeys += id // nothing schedulable under this key: remember it for removal
// The key could be executorId, host or rackId
emptyKeys.foreach(id => pendingTasks.remove(id)) // purge the exhausted keys
hasTasks // true means some task is still pending at this level
while (currentLocalityIndex < myLocalityLevels.length - 1) {// never advances past the last level
val moreTasks: Boolean = myLocalityLevels(currentLocalityIndex) match {
case TaskLocality.PROCESS_LOCAL => moreTasksToRunIn(pendingTasksForExecutor)// executor-level queues
case TaskLocality.NODE_LOCAL => moreTasksToRunIn(pendingTasksForHost)// host-level queues
case TaskLocality.NO_PREF => pendingTasksWithNoPrefs.nonEmpty // tasks without preferences
case TaskLocality.RACK_LOCAL => moreTasksToRunIn(pendingTasksForRack)
if (!moreTasks) {// nothing pending at the current level
// This is a performance optimization: if there are no more tasks that can
// be scheduled at a particular locality level, there is no point in waiting
// for the locality wait timeout (SPARK-4939).
lastLaunchTime = curTime // restart the wait clock for the next level
logDebug(s"No tasks for locality level ${myLocalityLevels(currentLocalityIndex)}, " +
s"so moving to locality level ${myLocalityLevels(currentLocalityIndex + 1)}")
currentLocalityIndex += 1
} else if (curTime - lastLaunchTime >= localityWaits(currentLocalityIndex)) { // the wait configured for this level has expired
// Jump to the next locality level, and reset lastLaunchTime so that the next locality
// wait timer doesn't immediately expire
lastLaunchTime += localityWaits(currentLocalityIndex) // advance by exactly one wait period
logDebug(s"Moving to ${myLocalityLevels(currentLocalityIndex + 1)} after waiting for " +
// NOTE(review): the continuation of the log message above is missing from this copy of the file.
currentLocalityIndex += 1
} else {
return myLocalityLevels(currentLocalityIndex) // stay at this level: tasks pending and wait not expired
myLocalityLevels(currentLocalityIndex)// the level reached when the loop ends
/**
 * Find the index in myLocalityLevels for a given locality. This is also designed to work with
 * localities that are not in myLocalityLevels (in case we somehow get those) by returning the
 * next-biggest level we have. Uses the fact that the last value in myLocalityLevels is ANY.
 *
 * @param locality the locality level to look up
 * @return index of the first entry of myLocalityLevels that is not smaller than `locality`
 */
def getLocalityIndex(locality: TaskLocality.TaskLocality): Int = {
  var index = 0
  // Skip every level that is strictly smaller than the requested one.
  while (locality > myLocalityLevels(index)) {
    index += 1
  }
  index
}
/**
 * Check whether the given task set has been blacklisted to the point that it can't run anywhere.
 *
 * It is possible that this taskset has become impossible to schedule *anywhere* due to the
 * blacklist. The most common scenario would be if there are fewer executors than
 * spark.task.maxFailures. We need to detect this so we can fail the task set, otherwise the job
 * will hang.
 *
 * There's a tradeoff here: we could make sure all tasks in the task set are schedulable, but that
 * would add extra time to each iteration of the scheduling loop. Here, we take the approach of
 * making sure at least one of the unscheduled tasks is schedulable. This means we may not detect
 * the hang as quickly as we could have, but we'll always detect the hang eventually, and the
 * method is faster in the typical case. In the worst case, this method can take
 * O(maxTaskFailures + numTasks) time, but it will be faster when there haven't been any task
 * failures (this is because the method picks one unscheduled task, and then iterates through each
 * executor until it finds one that the task isn't blacklisted on).
 */
// Does nothing when taskSetBlacklistHelperOpt is None, i.e. when no BlacklistTracker was supplied.
// NOTE(review): several statements of this method are missing from this copy of the file.
private[scheduler] def abortIfCompletelyBlacklisted(// hostToExecutors: host -> ids of the executors on it
hostToExecutors: HashMap[String, HashSet[String]]): Unit = {
taskSetBlacklistHelperOpt.foreach { taskSetBlacklist =>
// The helper is built from blacklistTracker.map(...), so the tracker is defined here.
val appBlacklist = blacklistTracker.get
// Only look for unschedulable tasks when at least one executor has registered. Otherwise,
// task sets will be (unnecessarily) aborted in cases when no executors have registered yet.
if (hostToExecutors.nonEmpty) {
// find any task that needs to be scheduled
val pendingTask: Option[Int] = {
// usually this will just take the last pending task, but because of the lazy removal
// from each list, we may need to go deeper in the list. We poll from the end because
// failed tasks are put back at the end of allPendingTasks, so we're more likely to find
// an unschedulable task this way.
val indexOffset = allPendingTasks.lastIndexWhere { indexInTaskSet =>
copiesRunning(indexInTaskSet) == 0 && !successful(indexInTaskSet)
if (indexOffset == -1) {
} else {
pendingTask.foreach { indexInTaskSet =>
// try to find some executor this task can run on. Its possible that some *other*
// task isn't schedulable anywhere, but we will discover that in some later call,
// when that unschedulable task is the last task remaining.
val blacklistedEverywhere = hostToExecutors.forall { case (host, execsOnHost) =>
// Check if the task can run on the node
val nodeBlacklisted =
appBlacklist.isNodeBlacklisted(host) ||
taskSetBlacklist.isNodeBlacklistedForTaskSet(host) ||
taskSetBlacklist.isNodeBlacklistedForTask(host, indexInTaskSet)
if (nodeBlacklisted) {
} else {
// Check if the task can run on any of the executors
execsOnHost.forall { exec =>
appBlacklist.isExecutorBlacklisted(exec) ||
taskSetBlacklist.isExecutorBlacklistedForTaskSet(exec) ||
taskSetBlacklist.isExecutorBlacklistedForTask(exec, indexInTaskSet)
if (blacklistedEverywhere) {
val partition = tasks(indexInTaskSet).partitionId
|Aborting $taskSet because task $indexInTaskSet (partition $partition)
|cannot run anywhere due to node and executor blacklist.
|Most recent failure:
|Blacklisting behavior can be configured via spark.blacklist.*.
/**
 * Marks the task as getting result and notifies the DAG Scheduler
 */
// Looks up the TaskInfo registered for attempt `tid`.
// NOTE(review): the statements that use `info` are missing from this copy of the file.
def handleTaskGettingResult(tid: Long): Unit = {
val info = taskInfos(tid)
/**
 * Check whether there is enough quota left to fetch a result of `size` bytes
 */
// Adds `size` to the running total of result bytes and compares the total with maxResultSize
// (spark.driver.maxResultSize); a limit that is not positive disables the check.
// Runs while holding the TaskSchedulerImpl lock (sched.synchronized).
// Original annotation: returns false once the limit is exceeded, i.e. no more results may be
// fetched. NOTE(review): both branch results are missing from this copy of the file.
def canFetchMoreResults(size: Long): Boolean = sched.synchronized {
totalResultSize += size
calculatedTasks += 1
if (maxResultSize > 0 && totalResultSize > maxResultSize) {
val msg = s"Total size of serialized results of ${calculatedTasks} tasks " +
s"(${Utils.bytesToString(totalResultSize)}) is bigger than spark.driver.maxResultSize " +
} else {
/**
 * Marks a task as successful and notifies the DAGScheduler that the task has ended.
 */
// Handles a successfully finished task attempt.
// Original annotation: invoked through TaskSchedulerImpl.handleSuccessfulTask. After an executor
// finishes a task it sends a StatusUpdate to the driver's CoarseGrainedSchedulerBackend, which
// calls TaskSchedulerImpl.statusUpdate; depending on the task state, TaskResultGetter and
// TaskSchedulerImpl then call handleSuccessfulTask or handleFailedTask of this class.
def handleSuccessfulTask(tid: Long, result: DirectTaskResult[_]): Unit = {
val info = taskInfos(tid) // TaskInfo of this attempt
val index = info.index
info.markFinished(TaskState.FINISHED, clock.getTimeMillis()) // mark the attempt as finished
if (speculationEnabled) { // NOTE(review): the body of this check is missing from this copy of the file
removeRunningTask(tid)// original annotation: removes the tid from runningTasksSet and from the Pool
// Kill any other attempts for the same task (since those are unnecessary now that one
// attempt completed successfully).
for (attemptInfo <- taskAttempts(index) if attemptInfo.running) {
logInfo(s"Killing attempt ${attemptInfo.attemptNumber} for task ${attemptInfo.id} " +
s"in stage ${taskSet.id} (TID ${attemptInfo.taskId}) on ${attemptInfo.host} " +
s"as the attempt ${info.attemptNumber} succeeded on ${info.host}")
killedByOtherAttempt += attemptInfo.taskId
interruptThread = true,
reason = "another attempt succeeded")
if (!successful(index)) {
tasksSuccessful += 1
logInfo(s"Finished task ${info.id} in stage ${taskSet.id} (TID ${info.taskId}) in" +
s" ${info.duration} ms on ${info.host} (executor ${info.executorId})" +
s" ($tasksSuccessful/$numTasks)")
// Mark successful and stop if all the tasks have succeeded.
successful(index) = true
if (tasksSuccessful == numTasks) {
isZombie = true
} else {
logInfo("Ignoring task-finished event for " + info.id + " in stage " + taskSet.id +
" because task " + index + " has already completed successfully")
// There may be multiple tasksets for this stage -- we let all of them know that the partition
// was completed. This may result in some of the tasksets getting completed.
sched.markPartitionCompletedInAllTaskSets(stageId, tasks(index).partitionId, info)// propagate to the stage's task sets
// This method is called by "TaskSchedulerImpl.handleSuccessfulTask" which holds the
// "TaskSchedulerImpl" lock until exiting. To avoid the SPARK-7655 issue, we should not
// "deserialize" the value when holding a lock to avoid blocking other threads. So we call
// "result.value()" in "TaskResultGetter.enqueueSuccessfulTask" before reaching here.
// Note: "result.value()" only deserializes the value when it's called at the first time, so
// here "result.value()" just returns the value and won't block other threads.
sched.dagScheduler.taskEnded(tasks(index), Success, result.value(), result.accumUpdates, info)
maybeFinishTaskSet() // this may have been the last outstanding task, so check whether the task set is done
/**
 * Records that the task for the given partition has completed, if this task set has not
 * already marked it successful. Partition ids with no entry in partitionToIndex are ignored.
 *
 * Per the original note, this is invoked from
 * TaskSchedulerImpl.markPartitionCompletedInAllTaskSets, and the partition-to-index mapping
 * comes from tasks.zipWithIndex (neither is visible in this excerpt).
 *
 * @param partitionId partition whose task completed
 * @param taskInfo    info of the attempt that completed the partition (not read by the lines
 *                    visible in this excerpt)
 */
private[scheduler] def markPartitionCompleted(partitionId: Int, taskInfo: TaskInfo): Unit = {
partitionToIndex.get(partitionId).foreach { index =>
if (!successful(index)) { // only act when this index has not been marked successful yet
if (speculationEnabled && !isZombie) {
tasksSuccessful += 1 // count this task as successful
successful(index) = true
if (tasksSuccessful == numTasks) { // every task of this task set has now succeeded
isZombie = true
maybeFinishTaskSet() // this may have been the stage's last task, so check whether the task set is now complete
/**
 * Marks the task as failed, re-adds it to the list of pending tasks, and notifies the
 * DAG Scheduler.
 */
// Handles a failed task attempt; in some cases the task is put back on the pending queue.
// Per the original note, this is reached through TaskSchedulerImpl.handleFailedTask.
// Call chain per the original note: after an executor finishes running a task it sends a
// StatusUpdate message to the driver's CoarseGrainedSchedulerBackend, which calls
// TaskSchedulerImpl.statusUpdate; depending on the task's final state, statusUpdate goes
// through TaskResultGetter and TaskSchedulerImpl to reach this class's handleSuccessfulTask
// or handleFailedTask.
// NOTE(review): this excerpt has lost closing braces and a few statements (see the dangling
// argument lists below), so the comments describe individual lines only.
def handleFailedTask(tid: Long, state: TaskState, reason: TaskFailedReason) {
val info = taskInfos(tid) // look up the TaskInfo recorded for this attempt
if (info.failed || info.killed) { // per the original note: return immediately if the attempt is already failed or killed
removeRunningTask(tid) // remove this task id from runningTasksSet (see removeRunningTask)
info.markFinished(state, clock.getTimeMillis()) // record the final state and the finish time
val index = info.index
copiesRunning(index) -= 1 // one fewer running copy of this task index
var accumUpdates: Seq[AccumulatorV2[_, _]] = Seq.empty
val failureReason = s"Lost task ${info.id} in stage ${taskSet.id} (TID $tid, ${info.host}," +
s" executor ${info.executorId}): ${reason.toErrorString}"
val failureException: Option[Throwable] = reason match {
case fetchFailed: FetchFailed => // shuffle data fetch failure
if (!successful(index)) { // mark as successful so the task is not re-queued by the check further down
successful(index) = true
tasksSuccessful += 1
isZombie = true
if (fetchFailed.bmAddress != null) {
// NOTE(review): the call that receives the arguments below is not visible here.
fetchFailed.bmAddress.host, fetchFailed.bmAddress.executorId))
case ef: ExceptionFailure =>
// ExceptionFailure's might have accumulator updates
accumUpdates = ef.accums
if (ef.className == classOf[NotSerializableException].getName) {
// If the task result wasn't serializable, there's no point in trying to re-execute it.
logError("Task %s in stage %s (TID %d) had a not serializable result: %s; not retrying"
.format(info.id, taskSet.id, tid, ef.description))
abort("Task %s in stage %s (TID %d) had a not serializable result: %s".format(
info.id, taskSet.id, tid, ef.description))
// De-duplicate repeated exceptions: recentExceptions maps the exception description to a
// (duplicate count, last full-print time) pair.
val key = ef.description
val now = clock.getTimeMillis()
val (printFull, dupCount) = {
if (recentExceptions.contains(key)) {
val (dupCount, printTime) = recentExceptions(key)
if (now - printTime > EXCEPTION_PRINT_INTERVAL) {
recentExceptions(key) = (0, now)
(true, 0)
} else {
recentExceptions(key) = (dupCount + 1, printTime)
(false, dupCount + 1)
} else {
recentExceptions(key) = (0, now)
(true, 0)
// NOTE(review): the logging calls for both branches below are not visible here.
if (printFull) {
} else {
s"Lost task ${info.id} in stage ${taskSet.id} (TID $tid) on ${info.host}, executor" +
s" ${info.executorId}: ${ef.className} (${ef.description}) [duplicate $dupCount]")
case e: ExecutorLostFailure if !e.exitCausedByApp =>
logInfo(s"Task $tid failed because while it was being computed, its executor " +
"exited for a reason unrelated to the task. Not counting this failure towards the " +
"maximum number of failures for the task.")
case e: TaskFailedReason => // TaskResultLost, TaskKilled, and others
sched.dagScheduler.taskEnded(tasks(index), reason, null, accumUpdates, info)
if (!isZombie && reason.countTowardsTaskFailures) {
assert (null != failureReason)
// NOTE(review): the call that receives the arguments below is not visible here.
info.host, info.executorId, index, failureReason))
numFailures(index) += 1
if (numFailures(index) >= maxTaskFailures) {
logError("Task %d in stage %s failed %d times; aborting job".format(
index, taskSet.id, maxTaskFailures))
abort("Task %d in stage %s failed %d times, most recent failure: %s\nDriver stacktrace:"
.format(index, taskSet.id, maxTaskFailures, failureReason), failureException)
if (successful(index)) { // already marked successful: do not put it back on the pending queue
logInfo(s"Task ${info.id} in stage ${taskSet.id} (TID $tid) failed, but the task will not" +
s" be re-executed (either because the task failed with a shuffle data fetch failure," +
s" so the previous stage needs to be re-run, or because a different copy of the task" +
s" has already succeeded).")
} else {
addPendingTask(index) // put the task back on the pending queue so it is run again
maybeFinishTaskSet() // this may have been the stage's last task, so check whether the task set is now complete
/**
 * Fails this task set explicitly. The body runs while holding the monitor of `sched`.
 *
 * @param message   failure description passed on to the DAGScheduler
 * @param exception optional cause passed on to the DAGScheduler; defaults to None
 */
def abort(message: String, exception: Option[Throwable] = None): Unit = sched.synchronized {
// TODO: Kill running tasks if we were not terminated due to a Mesos error
sched.dagScheduler.taskSetFailed(taskSet, message, exception) // tell the DAGScheduler this task set failed; per the original note, the DAGScheduler then uses TaskSchedulerImpl to cancel all tasks of the stage
isZombie = true // mark the task set as zombie; per the original note, this stops further tasks from running
maybeFinishTaskSet() // check whether the task set can now be finished
/**
 * If the given task ID is not in the set of running tasks, adds it.
 * Used to keep track of the number of running tasks, for enforcing scheduling policies.
 */
// Adds tid to runningTasksSet and, when it was newly added and a parent exists, increases the
// parent's running-task count by one.
// Per the original note, used by resourceOffer in this class.
def addRunningTask(tid: Long) {
if (runningTasksSet.add(tid) && parent != null) { // runningTasksSet holds the ids of the tasks currently running
parent.increaseRunningTasks(1) // update the parent's count of running tasks
/** If the given task ID is in the set of running tasks, removes it. */
// Called by handleSuccessfulTask and handleFailedTask in this class.
// NOTE(review): the statement guarded by the condition below is not visible in this excerpt;
// per the original note it also removes the task from the parent Pool's bookkeeping.
def removeRunningTask(tid: Long) {
if (runningTasksSet.remove(tid) && parent != null) { // true only when tid was actually in the set and a parent exists
// Schedulable lookup by name. NOTE(review): the body is not visible in this excerpt.
override def getSchedulableByName(name: String): Schedulable = {
/** Intentionally a no-op: the body is empty, so the given schedulable is ignored. */
override def addSchedulable(schedulable: Schedulable): Unit = {}
/** Intentionally a no-op: the body is empty, so the given schedulable is ignored. */
override def removeSchedulable(schedulable: Schedulable): Unit = {}
// Builds a fresh buffer and appends this TaskSetManager to it.
// NOTE(review): the return expression is not visible in this excerpt.
override def getSortedTaskSetQueue(): ArrayBuffer[TaskSetManager] = {
val sortedTaskSetQueue = new ArrayBuffer[TaskSetManager]()
sortedTaskSetQueue += this
/** Called by TaskScheduler when an executor is lost so we can re-enqueue our tasks */
// Per the original note: TaskSchedulerImpl.removeExecutor calls Pool.executorLost, which in
// turn calls this method.
// NOTE(review): this excerpt has lost closing braces and parts of two calls (see below).
override def executorLost(execId: String, host: String, reason: ExecutorLossReason) {
// Re-enqueue any tasks that ran on the failed executor if this is a shuffle map stage,
// and we are not using an external shuffle server which could serve the shuffle outputs.
// The reason is the next stage wouldn't be able to fetch the data from this dead executor
// so we would need to rerun these tasks on other executors.
if (tasks(0).isInstanceOf[ShuffleMapTask] && !env.blockManager.externalShuffleServiceEnabled
&& !isZombie) {
for ((tid, info) <- taskInfos if info.executorId == execId) {
val index = taskInfos(tid).index
if (successful(index) && !killedByOtherAttempt.contains(tid)) {
// Undo the success bookkeeping for this task index.
successful(index) = false
copiesRunning(index) -= 1
tasksSuccessful -= 1
// Tell the DAGScheduler that this task was resubmitted so that it doesn't think our
// stage finishes when a total of tasks.size tasks finish.
// NOTE(review): the call that receives the arguments below is not visible here.
tasks(index), Resubmitted, null, Seq.empty, info)
for ((tid, info) <- taskInfos if info.running && info.executorId == execId) {
// ExecutorExited carries its own flag, ExecutorKilled is never blamed on the application,
// and any other loss reason is treated as caused by the application.
val exitCausedByApp: Boolean = reason match {
case exited: ExecutorExited => exited.exitCausedByApp
case ExecutorKilled => false
case _ => true
// NOTE(review): the remaining arguments of this call are not visible here.
handleFailedTask(tid, TaskState.FAILED, ExecutorLostFailure(info.executorId, exitCausedByApp,
// recalculate valid locality levels and waits when executor is lost
/**
 * Check for tasks to be speculated and return true if there are any. This is called periodically
 * by the TaskScheduler.
 */
// Per the original note, this is called from Pool.checkSpeculatableTasks.
// NOTE(review): this excerpt has lost closing braces and the start of one logging call.
override def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean = {
// Can't speculate if we only have one task, and no need to speculate if the task set is a
// zombie.
if (isZombie || numTasks == 1) { // nothing to speculate for a zombie task set or a single task
return false
var foundTasks = false
val minFinishedForSpeculation = (SPECULATION_QUANTILE * numTasks).floor.toInt // per the original note SPECULATION_QUANTILE is 0.75 (its definition is not visible here)
logDebug("Checking for speculative tasks: minFinished = " + minFinishedForSpeculation)
if (tasksSuccessful >= minFinishedForSpeculation && tasksSuccessful > 0) { // only start speculating once enough tasks have succeeded
val time = clock.getTimeMillis()
val medianDuration: Double = successfulTaskDurations.median
val threshold = max(SPECULATION_MULTIPLIER * medianDuration, minTimeToSpeculation)
// TODO: Threshold should also look at standard deviation of task durations and have a lower
// bound based on that.
logDebug("Task length threshold for speculation: " + threshold)
for (tid <- runningTasksSet) {
val info = taskInfos(tid)
val index = info.index
if (!successful(index) && copiesRunning(index) == 1 && info.timeRunning(time) > threshold && // the task has been running longer than the threshold
!speculatableTasks.contains(index)) { // and it is not already marked as speculatable
// NOTE(review): the logging call that receives the message below is not visible here.
"Marking task %d in stage %s (on %s) as speculatable because it ran more than %.0f ms"
.format(index, taskSet.id, info.host, threshold))
speculatableTasks += index // add the task index to the set of speculatable tasks
sched.dagScheduler.speculativeTaskSubmitted(tasks(index)) // notify the DAGScheduler about the speculative task
foundTasks = true
foundTasks // true when at least one task was marked as speculatable
// Returns the wait time for the given locality level. PROCESS_LOCAL, NODE_LOCAL and RACK_LOCAL
// each have their own configuration key, read via getTimeAsMs with the general locality wait
// as the default.
// NOTE(review): the result for levels without a key (the final else branch) is not visible
// in this excerpt.
private def getLocalityWait(level: TaskLocality.TaskLocality): Long = {
val defaultWait = conf.get(config.LOCALITY_WAIT) // per the original note: spark.locality.wait, 3s by default
val localityWaitKey = level match {
case TaskLocality.PROCESS_LOCAL => "spark.locality.wait.process"
case TaskLocality.NODE_LOCAL => "spark.locality.wait.node"
case TaskLocality.RACK_LOCAL => "spark.locality.wait.rack"
case _ => null
if (localityWaitKey != null) {
conf.getTimeAsMs(localityWaitKey, defaultWait.toString)
} else {
/**
 * Compute the locality levels used in this TaskSet. Assumes that all tasks have already been
 * added to queues using addPendingTask.
 */
// Computes the valid task locality levels; per the original note, every level except
// RACK_LOCAL is usually present.
// NOTE(review): this excerpt has lost closing braces, the statement guarded by the first
// condition, and the return expression.
private def computeValidLocalityLevels(): Array[TaskLocality.TaskLocality] = {
val levels = new ArrayBuffer[TaskLocality.TaskLocality]
if (!pendingTasksForExecutor.isEmpty &&
pendingTasksForExecutor.keySet.exists(sched.isExecutorAlive(_))) { // the per-executor pending map is non-empty and at least one of its executors is alive
if (!pendingTasksForHost.isEmpty &&
pendingTasksForHost.keySet.exists(sched.hasExecutorsAliveOnHost(_))) {
levels += NODE_LOCAL // add NODE_LOCAL to the levels
if (!pendingTasksWithNoPrefs.isEmpty) {
levels += NO_PREF // add NO_PREF to the levels
if (!pendingTasksForRack.isEmpty &&
pendingTasksForRack.keySet.exists(sched.hasHostAliveOnRack(_))) {
levels += RACK_LOCAL
levels += ANY // add ANY to the levels
logDebug("Valid locality levels for " + taskSet + ": " + levels.mkString(", "))
// Recomputes the valid locality levels and their wait times, then sets currentLocalityIndex to
// whatever getLocalityIndex returns for the level that was current before the recomputation.
def recomputeLocality() {
val previousLocalityLevel = myLocalityLevels(currentLocalityIndex) // remember the current level before the arrays change
myLocalityLevels = computeValidLocalityLevels()
localityWaits = myLocalityLevels.map(getLocalityWait) // one wait value per valid level, in the same order
currentLocalityIndex = getLocalityIndex(previousLocalityLevel)
// NOTE(review): the body of this method is not visible in this excerpt.
def executorAdded() {
// Companion object of TaskSetManager.
// NOTE(review): the constant described by the comment below is not visible in this excerpt.
private[spark] object TaskSetManager {
// The user will be warned if any stages contain a task that has a serialized size greater than
// this.
TaskResultGetter 的主要作用是:在 executor 完成一个 task 之后,根据 TaskState 的状态分别处理——状态为 TaskState.FINISHED 时调用 taskResultGetter.enqueueSuccessfulTask 方法;状态为 TaskState.FAILED、TaskState.KILLED 或 TaskState.LOST 时调用 enqueueFailedTask 方法。
enqueueSuccessfulTask 主要负责反序列化,拿到 executor 上 task 的执行结果;
enqueueFailedTask 主要负责反序列化,拿到任务失败的原因(reason)。
// Runs task-result handling on a fixed-size daemon thread pool.
// Per the original note, an instance is created in TaskSchedulerImpl (around line 142 of that
// file).
// NOTE(review): the bodies of the two initialValue overrides below are not visible in this
// excerpt.
private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedulerImpl)
extends Logging {
// Pool size, read from "spark.resultGetter.threads" with a default of 4.
private val THREADS = sparkEnv.conf.getInt("spark.resultGetter.threads", 4)
// Exposed for testing.
protected val getTaskResultExecutor: ExecutorService =
ThreadUtils.newDaemonFixedThreadPool(THREADS, "task-result-getter")
// Exposed for testing.
// One SerializerInstance per thread.
protected val serializer = new ThreadLocal[SerializerInstance] {
override def initialValue(): SerializerInstance = {
// One SerializerInstance per thread; passed to DirectTaskResult.value in enqueueSuccessfulTask.
protected val taskResultSerializer = new ThreadLocal[SerializerInstance] {
override def initialValue(): SerializerInstance = {
// Enqueues a successfully finished task for result handling on the result-getter thread pool.
// Per the original note, called from TaskSchedulerImpl.statusUpdate when the task's state is
// TaskState.FINISHED.
// Its main job is to deserialize the executor's task result and hand it to the scheduler.
// NOTE(review): this excerpt has lost closing braces and several statements (for example the
// early returns that the NonFatal comment near the end refers to).
def enqueueSuccessfulTask(
taskSetManager: TaskSetManager,
tid: Long,
serializedData: ByteBuffer): Unit = {
getTaskResultExecutor.execute(new Runnable {
override def run(): Unit = Utils.logUncaughtExceptions {
try {
val (result, size) = serializer.get().deserialize[TaskResult[_]](serializedData) match {
case directResult: DirectTaskResult[_] =>
if (!taskSetManager.canFetchMoreResults(serializedData.limit())) { // the result-size limit would be exceeded
// deserialize "value" without holding any lock so that it won't block other threads.
// We should call it here, so that when it's called again in
// "TaskSetManager.handleSuccessfulTask", it does not need to deserialize the value.
directResult.value(taskResultSerializer.get()) // value() performs the deserialization of the executor's task result
(directResult, serializedData.limit())
case IndirectTaskResult(blockId, size) =>
if (!taskSetManager.canFetchMoreResults(size)) {
// dropped by executor if size is larger than maxResultSize
logDebug("Fetching indirect task result for TID %s".format(tid))
scheduler.handleTaskGettingResult(taskSetManager, tid)
val serializedTaskResult = sparkEnv.blockManager.getRemoteBytes(blockId)
if (!serializedTaskResult.isDefined) {
/* We won't be able to get the task result if the machine that ran the task failed
* between when the task ended and when we tried to fetch the result, or if the
* block manager had to flush the result. */
// NOTE(review): the call that receives the arguments below is not visible here.
taskSetManager, tid, TaskState.FINISHED, TaskResultLost)
val deserializedResult = serializer.get().deserialize[DirectTaskResult[_]](
// force deserialization of referenced value
(deserializedResult, size)
// Set the task result size in the accumulator updates received from the executors.
// We need to do this here on the driver because if we did this on the executors then
// we would have to serialize the result again after updating the size.
result.accumUpdates = result.accumUpdates.map { a =>
if (a.name == Some(InternalAccumulator.RESULT_SIZE)) {
val acc = a.asInstanceOf[LongAccumulator]
assert(acc.sum == 0L, "task result size should not have been set on the executors")
} else {
scheduler.handleSuccessfulTask(taskSetManager, tid, result)
} catch {
case cnf: ClassNotFoundException =>
val loader = Thread.currentThread.getContextClassLoader
taskSetManager.abort("ClassNotFound with classloader: " + loader)
// Matching NonFatal so we don't catch the ControlThrowable from the "return" above.
case NonFatal(ex) =>
logError("Exception while getting task result", ex)
taskSetManager.abort("Exception while getting task result: %s".format(ex))
// Enqueues a failed task for handling on the result-getter thread pool.
// Per the original note, called from TaskSchedulerImpl.statusUpdate when the task's state is
// TaskState.FAILED, TaskState.KILLED or TaskState.LOST.
// Its main job is to obtain the failure reason and report it to the scheduler.
// NOTE(review): this excerpt has lost closing braces and the start of one logging call.
def enqueueFailedTask(taskSetManager: TaskSetManager, tid: Long, taskState: TaskState,
serializedData: ByteBuffer) {
var reason : TaskFailedReason = UnknownReason // stays UnknownReason unless deserialization below succeeds
try {
getTaskResultExecutor.execute(new Runnable {
override def run(): Unit = Utils.logUncaughtExceptions {
val loader = Utils.getContextOrSparkClassLoader
try {
if (serializedData != null && serializedData.limit() > 0) { // only deserialize when there is payload
reason = serializer.get().deserialize[TaskFailedReason](
serializedData, loader)
} catch {
case cnd: ClassNotFoundException =>
// Log an error but keep going here -- the task failed, so not catastrophic
// if we can't deserialize the reason.
// NOTE(review): the logging call that receives the message below is not visible here.
"Could not deserialize TaskEndReason: ClassNotFound with classloader " + loader)
case ex: Exception => // No-op
} finally {
// If there's an error while deserializing the TaskEndReason, this Runnable
// will die. Still tell the scheduler about the task failure, to avoid a hang
// where the scheduler thinks the task is still running.
scheduler.handleFailedTask(taskSetManager, tid, taskState, reason)
} catch {
case e: RejectedExecutionException if sparkEnv.isStopped =>
// ignore it
def stop() {