Spark 2.4.2 Source Code: A Detailed Walkthrough of the Task Submission Flow

Once a job is submitted, a DAG is built and split into stages according to the dependencies between RDDs. Stages come in two types, ShuffleMapStage and ResultStage, and each produces the corresponding task type: ShuffleMapTask and ResultTask, respectively. Let's start the analysis from the submission of a stage.
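
As a concrete reference point, here is a minimal, hypothetical job (not taken from the source analyzed here) that is split into exactly two stages: everything before the reduceByKey shuffle forms a ShuffleMapStage, and the final collect forms a ResultStage.

// Hypothetical two-stage job: reduceByKey introduces a ShuffleDependency,
// so the DAG is split into one ShuffleMapStage and one ResultStage.
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("stage-demo").setMaster("local[2]"))
val counts = sc.parallelize(Seq("a", "b", "a"))
  .map(word => (word, 1))   // narrow dependency, stays inside the ShuffleMapStage
  .reduceByKey(_ + _)       // wide dependency, marks the stage boundary
counts.collect()            // triggers the job; the final stage is a ResultStage
sc.stop()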

1. First, check whether the stage's parent stages have already been submitted for execution. If not, recursively submit the parent stages; otherwise call submitMissingTasks to submit the current stage.

Note:
The comment on the first line of each code block indicates the file the code comes from.
The analysis below assumes standalone deploy mode.

// DAGScheduler.scala
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id) // 1 Get the parent stages that have not yet been submitted
      logDebug("missing: " + missing)
      if (missing.isEmpty) { // If there are no unsubmitted parent stages
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get) // 2 Submit the current stage
      } else {
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}

2. Generate the corresponding task objects according to the stage type, then wrap the whole collection of tasks in a TaskSet and submit it to the TaskScheduler.

//DAGScheduler.scala
private def submitMissingTasks(stage: Stage, jobId: Int) {
    ...
    // Generate the tasks
     val tasks: Seq[Task[_]] = try {
        val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
        stage match {
          case stage: ShuffleMapStage =>
            stage.pendingPartitions.clear()
            partitionsToCompute.map { id =>
              val locs = taskIdToLocations(id)
              val part = partitions(id)
              stage.pendingPartitions += id
              new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
                taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
                Option(sc.applicationId), sc.applicationAttemptId, stage.rdd.isBarrier())
            }
    
          case stage: ResultStage =>
            partitionsToCompute.map { id =>
              val p: Int = stage.partitions(id)
              val part = partitions(p)
              val locs = taskIdToLocations(id)
              new ResultTask(stage.id, stage.latestInfo.attemptNumber,
                taskBinary, part, locs, id, properties, serializedTaskMetrics,
                Option(jobId), Option(sc.applicationId), sc.applicationAttemptId,
                stage.rdd.isBarrier())
            }
        }
    }
  
    if (tasks.size > 0) {
      logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
        s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
        // taskScheduler:TaskSchedulerImpl
      taskScheduler.submitTasks(new TaskSet( // 1 Submit the tasks
        tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
    }    
}

The TaskSet class is defined as follows:

// TaskSet.scala
private[spark] class TaskSet(
    val tasks: Array[Task[_]],
    val stageId: Int,
    val stageAttemptId: Int,
    val priority: Int, // this is the jobId
    val properties: Properties) {
  val id: String = stageId + "." + stageAttemptId

  override def toString: String = "TaskSet " + id
}

3. The TaskScheduler first creates a TaskSetManager for the TaskSet; the TaskSetManager handles the scheduling logic inside that TaskSet. The manager is then added to the scheduling pool, after which resources are requested for scheduling.

// TaskSchedulerImpl.scala
override def submitTasks(taskSet: TaskSet) {
  val tasks = taskSet.tasks
  logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
  this.synchronized {
    // Create the TaskSetManager, which schedules the tasks inside this TaskSet,
    // including tracking running tasks and retrying failed ones
    val manager = createTaskSetManager(taskSet, maxTaskFailures) // 1 
    val stage = taskSet.stageId
    val stageTaskSets =
      taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
    stageTaskSets(taskSet.stageAttemptId) = manager
    val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
      ts.taskSet != taskSet && !ts.isZombie
    }
    if (conflictingTaskSet) {
      throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
        s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
    }
    // Add the TaskSetManager to the schedulableBuilder, which schedules all TaskSets,
    // i.e. the TaskSets of every stage in the DAG produced by the whole Spark application
    schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties) // 2

    if (!isLocal && !hasReceivedTask) {
      // Set up a timer that checks whether TaskSchedulerImpl is starving for resources
      starvationTimer.scheduleAtFixedRate(new TimerTask() {
        override def run() {
            // Note: this checks whether any task has been launched yet
          if (!hasLaunchedTask) {
            logWarning("Initial job has not accepted any resources; " +
              "check your cluster UI to ensure that workers are registered " +
              "and have sufficient resources")
          } else {
            this.cancel() // Cancel this timer once TaskSchedulerImpl has launched a task
          }
        }
      }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
    }
    // This flag becomes true once a TaskSetManager has been added to the scheduling pool.
    hasReceivedTask = true
  } 
  // Send a ReviveOffers message to the driver to allocate resources for the tasks.
  // backend is a StandaloneSchedulerBackend in standalone mode, but reviveOffers is
  // the method inherited from its parent class CoarseGrainedSchedulerBackend
  backend.reviveOffers() // 3
}

The code above uses schedulableBuilder, so it is worth seeing when that object is created.
Recall that DAGScheduler and TaskScheduler are created while SparkContext is initialized, as shown below:

// SparkContext.scala
// Create and start the scheduler
// Create the SchedulerBackend and TaskScheduler that match the cluster manager type
val (sched, ts) = SparkContext.createTaskScheduler(this, master, deployMode)
_schedulerBackend = sched
_taskScheduler = ts
_dagScheduler = new DAGScheduler(this)
_heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)

// start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's
// constructor
// Start the TaskScheduler and SchedulerBackend
_taskScheduler.start()

When createTaskScheduler builds the TaskScheduler, it also calls its initialize() method.

// SparkContext.scala
private def createTaskScheduler(
    sc: SparkContext,
    master: String,
    deployMode: String): (SchedulerBackend, TaskScheduler) = {
  import SparkMasterRegex._

  // When running locally, don't try to re-execute tasks on failure.
  val MAX_LOCAL_TASK_FAILURES = 1

  master match {
    case SPARK_REGEX(sparkUrl) =>
      // TaskSchedulerImpl is the implementation class of TaskScheduler
      val scheduler = new TaskSchedulerImpl(sc)
      val masterUrls = sparkUrl.split(",").map("spark://" + _)
      // A different SchedulerBackend is created depending on Spark's deploy mode.
      val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
      scheduler.initialize(backend)
      (backend, scheduler)
    ...      
 }
}

The scheduling pool is created inside TaskScheduler's initialize() method, as shown below:

// TaskSchedulerImpl.scala
def initialize(backend: SchedulerBackend) {
  this.backend = backend
  schedulableBuilder = {
    schedulingMode match {
      case SchedulingMode.FIFO =>
        new FIFOSchedulableBuilder(rootPool)
      case SchedulingMode.FAIR =>
        new FairSchedulableBuilder(rootPool, conf)
      case _ =>
        throw new IllegalArgumentException(s"Unsupported $SCHEDULER_MODE_PROPERTY: " +
        s"$schedulingMode")
    }
  }
  schedulableBuilder.buildPools()
}

4. CoarseGrainedSchedulerBackend's reviveOffers method is called to request resources.

4.1 This method simply sends a ReviveOffers message to the driver.
// CoarseGrainedSchedulerBackend.scala
override def reviveOffers() {
  // Send the ReviveOffers message to the driver
  driverEndpoint.send(ReviveOffers)
}

driverEndpoint is the reference to the driver-side endpoint. Spark uses Netty as its communication framework; every component has a corresponding endpoint, and to talk to a component you first obtain a reference to its endpoint and then send messages to it.
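
The sketch below illustrates this endpoint / endpoint-reference pattern in isolation. It is not the real Spark RPC API (RpcEndpoint, RpcEnv and RpcEndpointRef are private[spark]); the class names and the in-process delivery are purely illustrative.

// Simplified stand-in for the send/receive pattern used by CoarseGrainedSchedulerBackend.
// None of the names below are real Spark classes.
case object ReviveOffers

// Plays the role of DriverEndpoint: it owns a receive handler.
class DriverLikeEndpoint {
  def receive: PartialFunction[Any, Unit] = {
    case ReviveOffers => println("makeOffers() would run here")
  }
}

// Plays the role of an RpcEndpointRef: the handle other components use to send messages.
class EndpointRef(endpoint: DriverLikeEndpoint) {
  def send(msg: Any): Unit = endpoint.receive.applyOrElse(msg, (_: Any) => ())
}

val driverEndpointRef = new EndpointRef(new DriverLikeEndpoint)
driverEndpointRef.send(ReviveOffers)   // mirrors what backend.reviveOffers() triggers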

4.2 The driver then receives the message and calls makeOffers().
// CoarseGrainedSchedulerBackend.scala / DriverEndpoint
case ReviveOffers =>
  makeOffers()

5. Allocate resources to the tasks and schedule them.

// CoarseGrainedSchedulerBackend.scala / DriverEndpoint
// Make fake resource offers on all executors
private def makeOffers() {
  // Make sure no executor is killed while some task is launching on it
  val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {
    // Filter out executors under killing
    // All executors that are currently alive
    val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
    // A WorkerOffer describes the resources available on an executor
    val workOffers = activeExecutors.map {
      case (id, executorData) =>
        new WorkerOffer(id, executorData.executorHost, executorData.freeCores,
          Some(executorData.executorAddress.hostPort))
    }.toIndexedSeq
    // Call TaskSchedulerImpl.resourceOffers to assign tasks to the offered resources; other cluster managers do the same
    scheduler.resourceOffers(workOffers) // 1 
  }
  if (!taskDescs.isEmpty) {
      // TaskSchedulerImpl.resourceOffers (called above) allocates resources to the tasks;
      // CoarseGrainedSchedulerBackend.launchTasks then starts them,
      // and the tasks are ultimately submitted to executors on the worker nodes
    launchTasks(taskDescs) // 2
  }
}

To make the source easier to read, here is the type of executorDataMap used above: its key is the executorId and its value holds that executor's metadata.

private val executorDataMap = new HashMap[String, ExecutorData]

private[cluster] class ExecutorData(
   val executorEndpoint: RpcEndpointRef,
   val executorAddress: RpcAddress,
   override val executorHost: String,
   var freeCores: Int,
   override val totalCores: Int,
   override val logUrlMap: Map[String, String]
) extends ExecutorInfo(executorHost, totalCores, logUrlMap)

5.1 makeOffers() first gathers the resources currently available in the cluster and calls resourceOffers to distribute them among the tasks.
// TaskSchedulerImpl.scala
// A WorkerOffer describes the resources available on an executor
def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
  // Mark each slave as alive and remember its hostname
  // Also track if new executor is added
  var newExecAvail = false
  // This loop updates hostToExecutors, executorIdToHost, hostsByRack and related collections
  for (o <- offers) {
    if (!hostToExecutors.contains(o.host)) {
      // hostToExecutors tracks all active executors on each host
      hostToExecutors(o.host) = new HashSet[String]()
    }
    // executorIdToRunningTaskIds tracks the running tasks on each executor
    if (!executorIdToRunningTaskIds.contains(o.executorId)) {
      hostToExecutors(o.host) += o.executorId
      // Send an ExecutorAdded message to the DAGScheduler
      executorAdded(o.executorId, o.host)
      executorIdToHost(o.executorId) = o.host
      executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
      newExecAvail = true // Mark that a new executor has been added
    }
    for (rack <- getRackForHost(o.host)) {
      hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
    }
    // hostToExecutors and hostsByRack are used later to compute task locality during resource allocation
  }

  // Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do
  // this here to avoid a separate thread and added synchronization overhead, and also because
  // updating the blacklist is only relevant when task offers are being made.
  // Expired entries are removed from the blacklist here
  blacklistTrackerOpt.foreach(_.applyBlacklistTimeout())

  val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>
    offers.filter { offer =>
      !blacklistTracker.isNodeBlacklisted(offer.host) &&
        !blacklistTracker.isExecutorBlacklisted(offer.executorId)
    }
  }.getOrElse(offers)
  // Shuffle the offers randomly so tasks are spread evenly across worker nodes for load balancing,
  // instead of always going to the same set of workers
  val shuffledOffers = shuffleOffers(filteredOffers)
  // Build a list of tasks to assign to each worker.
  // For each WorkerOffer, create a TaskDescription buffer sized by its available CPU cores
  val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
  val availableCpus = shuffledOffers.map(o => o.cores).toArray
  val availableSlots = shuffledOffers.map(o => o.cores / CPUS_PER_TASK).sum
  val sortedTaskSets = rootPool.getSortedTaskSetQueue // 1 Get the scheduling order of the TaskSetManagers
  for (taskSet <- sortedTaskSets) {
    logDebug("parentName: %s, name: %s, runningTasks: %s".format(
      taskSet.parent.name, taskSet.name, taskSet.runningTasks))
    if (newExecAvail) {
      // This recomputes the TaskSetManager's locality levels,
      // assigning a locality level to each task,
      // because a new executor has been added
      taskSet.executorAdded() 
    }
  }

  // Take each TaskSet in our scheduling order, and then offer it each node in increasing order
  // of locality levels so that it gets a chance to launch local tasks on all of them.
  // NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
  // Schedule tasks according to locality preference
  for (taskSet <- sortedTaskSets) {
    // Skip the barrier taskSet if the available slots are less than the number of pending tasks.
    // If the taskSet is a barrier one, all of its tasks must launch simultaneously, so the number of
    // available slots must be no less than the number of tasks in the taskSet. isBarrier defaults to false.
    if (taskSet.isBarrier && availableSlots < taskSet.numTasks) {
      // Skip the launch process.
      // TODO SPARK-24819 If the job requires more slots than available (both busy and free
      // slots), fail the job on submit.
      logInfo(s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +
        s"because the barrier taskSet requires ${taskSet.numTasks} slots, while the total " +
        s"number of available slots is $availableSlots.")
    } else {
      var launchedAnyTask = false
      // Record all the executor IDs assigned barrier tasks on.
      val addressesWithDescs = ArrayBuffer[(String, TaskDescription)]()
      // Iterate over all locality levels of the current taskSet
      for (currentMaxLocality <- taskSet.myLocalityLevels) {
        var launchedTaskAtCurrentMaxLocality = false
        do {
          // 2 Allocate resources at the current locality level
          launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(taskSet,
            currentMaxLocality, shuffledOffers, availableCpus, tasks, addressesWithDescs)
          launchedAnyTask |= launchedTaskAtCurrentMaxLocality
        } while (launchedTaskAtCurrentMaxLocality)
        // Keep offering until this locality level fails to launch anything, then move to the next level.
        // In practice the fallback only happens after the wait time (3 seconds by default) has elapsed,
        // as can be seen in the resourceOffer call made inside resourceOfferSingleTaskSet
      }
      if (!launchedAnyTask) {
        taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
      }
      if (launchedAnyTask && taskSet.isBarrier) {
        // Check whether the barrier tasks are partially launched.
        // TODO SPARK-24818 handle the assert failure case (that can happen when some locality
        // requirements are not fulfilled, and we should revert the launched tasks).
        require(addressesWithDescs.size == taskSet.numTasks,
          s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +
            s"because only ${addressesWithDescs.size} out of a total number of " +
            s"${taskSet.numTasks} tasks got resource offers. The resource offers may have " +
            "been blacklisted or cannot fulfill task locality requirements.")

        // materialize the barrier coordinator.
        maybeInitBarrierCoordinator()

        // Update the taskInfos into all the barrier task properties.
        val addressesStr = addressesWithDescs
          // Addresses ordered by partitionId
          .sortBy(_._2.partitionId)
          .map(_._1)
          .mkString(",")
        addressesWithDescs.foreach(_._2.properties.setProperty("addresses", addressesStr))

        logInfo(s"Successfully scheduled all the ${addressesWithDescs.size} tasks for barrier " +
          s"stage ${taskSet.stageId}.")
      }
    }
  }

  // TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get
  // launched within a configured time.
  if (tasks.size > 0) {
    hasLaunchedTask = true
  }
  return tasks // Return the tasks that have been assigned resources
}

Key points in the code above:

  • Since the scheduling pool may contain multiple TaskSetManagers, their scheduling order has to be determined first; that is what val sortedTaskSets = rootPool.getSortedTaskSetQueue does.
// Pool.scala
override def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] = {
  val sortedTaskSetQueue = new ArrayBuffer[TaskSetManager]
  val sortedSchedulableQueue =
    schedulableQueue.asScala.toSeq.sortWith(taskSetSchedulingAlgorithm.comparator) // 1
  for (schedulable <- sortedSchedulableQueue) {
    sortedTaskSetQueue ++= schedulable.getSortedTaskSetQueue
  }
  sortedTaskSetQueue
}

The core of this method is the sort, and the resulting order depends on the comparator used. Spark supports two stage scheduling modes, FIFO and FAIR, so a different scheduling algorithm is instantiated depending on the mode configured at runtime; the default is FIFO (a configuration sketch follows the code below).

// Pool.scala
private val taskSetSchedulingAlgorithm: SchedulingAlgorithm = {
  schedulingMode match {
    case SchedulingMode.FAIR =>
      new FairSchedulingAlgorithm()
    case SchedulingMode.FIFO =>
      new FIFOSchedulingAlgorithm()
    case _ =>
      val msg = s"Unsupported scheduling mode: $schedulingMode. Use FAIR or FIFO instead."
      throw new IllegalArgumentException(msg)
  }
}
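
The scheduling mode is read from the spark.scheduler.mode configuration. A minimal sketch of switching to FAIR scheduling and routing jobs to a named pool (the pool name is a made-up example; pools can be defined in the file pointed to by spark.scheduler.allocation.file):

// Enable FAIR scheduling and assign subsequent jobs to an example pool.
import org.apache.spark.{SparkConf, SparkContext}

val fairConf = new SparkConf()
  .setAppName("fair-demo")
  .setMaster("local[4]")
  .set("spark.scheduler.mode", "FAIR")
val fairSc = new SparkContext(fairConf)
fairSc.setLocalProperty("spark.scheduler.pool", "production")   // "production" is an example pool name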

The comparators of the two algorithms are shown below.
FAIR

// SchedulingAlgorithm.scala
// A pool whose running task count is below its minShare has higher priority than one at or above it.
// If both are below their minShare, compare their minShare usage ratios: the lower the ratio, the higher the priority; ties are broken by name.
// If both are at or above their minShare, compare their weight usage ratios: the lower the ratio, the higher the priority; ties are broken by name.
private[spark] class FairSchedulingAlgorithm extends SchedulingAlgorithm {
  override def comparator(s1: Schedulable, s2: Schedulable): Boolean = {
    val minShare1 = s1.minShare
    val minShare2 = s2.minShare
    // runningTasks is simply the size of the runningTasksSet collection
    val runningTasks1 = s1.runningTasks
    val runningTasks2 = s2.runningTasks
    val s1Needy = runningTasks1 < minShare1
    val s2Needy = runningTasks2 < minShare2
    val minShareRatio1 = runningTasks1.toDouble / math.max(minShare1, 1.0)
    val minShareRatio2 = runningTasks2.toDouble / math.max(minShare2, 1.0)
    val taskToWeightRatio1 = runningTasks1.toDouble / s1.weight.toDouble
    val taskToWeightRatio2 = runningTasks2.toDouble / s2.weight.toDouble

    var compare = 0
    if (s1Needy && !s2Needy) {
      return true
    } else if (!s1Needy && s2Needy) {
      return false
    } else if (s1Needy && s2Needy) {
      compare = minShareRatio1.compareTo(minShareRatio2)
    } else {
      compare = taskToWeightRatio1.compareTo(taskToWeightRatio2)
    }
    if (compare < 0) {
      true
    } else if (compare > 0) {
      false
    } else {
      s1.name < s2.name
    }
  }
}
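
To make these rules concrete, here is a small, self-contained re-implementation of the same comparison (a simplified case class stands in for the real Schedulable trait, and the numbers are made up):

// Simplified stand-in for Schedulable, keeping only the fields the FAIR comparator reads.
case class PoolInfo(name: String, runningTasks: Int, minShare: Int, weight: Int)

def fairBefore(s1: PoolInfo, s2: PoolInfo): Boolean = {
  val s1Needy = s1.runningTasks < s1.minShare
  val s2Needy = s2.runningTasks < s2.minShare
  val minShareRatio1 = s1.runningTasks.toDouble / math.max(s1.minShare, 1.0)
  val minShareRatio2 = s2.runningTasks.toDouble / math.max(s2.minShare, 1.0)
  val weightRatio1 = s1.runningTasks.toDouble / s1.weight
  val weightRatio2 = s2.runningTasks.toDouble / s2.weight
  if (s1Needy && !s2Needy) true                       // a needy pool beats a non-needy one
  else if (!s1Needy && s2Needy) false
  else if (s1Needy && s2Needy) {
    if (minShareRatio1 != minShareRatio2) minShareRatio1 < minShareRatio2
    else s1.name < s2.name                            // tie-break by name
  } else {
    if (weightRatio1 != weightRatio2) weightRatio1 < weightRatio2
    else s1.name < s2.name
  }
}

// poolA has not reached its minShare, so it is scheduled before poolB.
val poolA = PoolInfo("a", runningTasks = 1, minShare = 2, weight = 1)
val poolB = PoolInfo("b", runningTasks = 5, minShare = 2, weight = 1)
assert(fairBefore(poolA, poolB))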

FIFO

// SchedulingAlgorithm.scala
private[spark] class FIFOSchedulingAlgorithm extends SchedulingAlgorithm {
  override def comparator(s1: Schedulable, s2: Schedulable): Boolean = {
    val priority1 = s1.priority
    val priority2 = s2.priority
    var res = math.signum(priority1 - priority2)
    if (res == 0) {
      // Two independent stages of the same job may be submitted to the pool at the same time
      val stageId1 = s1.stageId
      val stageId2 = s2.stageId
      res = math.signum(stageId1 - stageId2)
    }
    res < 0
  }
}

  • The code above uses myLocalityLevels, so let's look at how it is computed. myLocalityLevels lists the data locality levels present among all tasks managed by this TaskSetManager.
    The following code in TaskSetManager shows that computeValidLocalityLevels is called to initialize myLocalityLevels when the TaskSetManager is created.
// TaskSetManager.scala
private[scheduler] var myLocalityLevels = computeValidLocalityLevels()
private def computeValidLocalityLevels(): Array[TaskLocality.TaskLocality] = {
  import TaskLocality.{PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY}
  val levels = new ArrayBuffer[TaskLocality.TaskLocality]
  if (!pendingTasksForExecutor.isEmpty &&
      pendingTasksForExecutor.keySet.exists(sched.isExecutorAlive(_))) {
    levels += PROCESS_LOCAL
  }
  if (!pendingTasksForHost.isEmpty &&
      pendingTasksForHost.keySet.exists(sched.hasExecutorsAliveOnHost(_))) {
    levels += NODE_LOCAL
  }
  if (!pendingTasksWithNoPrefs.isEmpty) {
    levels += NO_PREF
  }
  if (!pendingTasksForRack.isEmpty &&
      pendingTasksForRack.keySet.exists(sched.hasHostAliveOnRack(_))) {
    levels += RACK_LOCAL
  }
  levels += ANY
  logDebug("Valid locality levels for " + taskSet + ": " + levels.mkString(", "))
  levels.toArray
}

The pendingTasksForExecutor and related collections are populated by the following code:

// TaskSetManager.scala
// The following loop runs when a TaskSetManager is constructed.
// It populates pendingTasksForExecutor, pendingTasksForHost and the other pending-task collections.
// Indices are added in reverse order, so task selection later defaults to taking the last element,
// because removing the last element of an ArrayBuffer is cheaper.
for (i <- (0 until numTasks).reverse) {
  addPendingTask(i)
}

addPendingTask() fills pendingTasksForExecutor, pendingTasksForHost and the other pending-task collections, which in turn determine each task's locality level.

// TaskSetManager.scala
private[spark] def addPendingTask(index: Int) {
  for (loc <- tasks(index).preferredLocations) {
    loc match {
      case e: ExecutorCacheTaskLocation =>
        pendingTasksForExecutor.getOrElseUpdate(e.executorId, new ArrayBuffer) += index
      case e: HDFSCacheTaskLocation =>
        val exe = sched.getExecutorsAliveOnHost(loc.host)
        exe match {
          case Some(set) =>
            for (e <- set) {
              pendingTasksForExecutor.getOrElseUpdate(e, new ArrayBuffer) += index
            }
            logInfo(s"Pending task $index has a cached location at ${e.host} " +
              ", where there are executors " + set.mkString(","))
          case None => logDebug(s"Pending task $index has a cached location at ${e.host} " +
              ", but there are no executors alive there.")
        }
      case _ =>
    }
    pendingTasksForHost.getOrElseUpdate(loc.host, new ArrayBuffer) += index
    for (rack <- sched.getRackForHost(loc.host)) {
      pendingTasksForRack.getOrElseUpdate(rack, new ArrayBuffer) += index
    }
  }
  // If the task has no preferred locations, it is treated as NO_PREF
  if (tasks(index).preferredLocations == Nil) {
    pendingTasksWithNoPrefs += index
  }

  allPendingTasks += index  // No point scanning this whole list to find the old task there
}

preferredLocations holds the preferred locations of the task.
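
For a quick way to see preferred locations from the user side, SparkContext.makeRDD accepts a location preference for each element (this assumes an existing SparkContext named sc; the hostnames are placeholders):

// makeRDD attaches preferred hostnames to each element's partition.
val locatedRdd = sc.makeRDD(Seq(
  (1, Seq("host1")),
  (2, Seq("host2"))))
locatedRdd.partitions.foreach { p =>
  println(s"partition ${p.index} prefers ${locatedRdd.preferredLocations(p)}")
}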

  • Allocating resources at the current locality level amounts to finding tasks that match the idle resources; this is what resourceOfferSingleTaskSet does.
// TaskSchedulerImpl.scala
private def resourceOfferSingleTaskSet(
    taskSet: TaskSetManager,
    maxLocality: TaskLocality,
    shuffledOffers: Seq[WorkerOffer],
    availableCpus: Array[Int],
    tasks: IndexedSeq[ArrayBuffer[TaskDescription]],
    addressesWithDescs: ArrayBuffer[(String, TaskDescription)]) : Boolean = {
  var launchedTask = false
  // nodes and executors that are blacklisted for the entire application have already been
  // filtered out by this point
  // Iterate over every executor (i.e. every offer)
  for (i <- 0 until shuffledOffers.size) {
    val execId = shuffledOffers(i).executorId
    val host = shuffledOffers(i).host
    // Check whether the remaining CPUs are at least the number each task needs (1 by default)
    if (availableCpus(i) >= CPUS_PER_TASK) {
      try {
        // resourceOffer does the bookkeeping for each task and returns its TaskDescription
        // task: TaskDescription
        // resourceOffer returns a task whose locality level can be satisfied on the current executor
        for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { // 1
          tasks(i) += task
          val tid = task.taskId
          taskIdToTaskSetManager.put(tid, taskSet)
          taskIdToExecutorId(tid) = execId
          executorIdToRunningTaskIds(execId).add(tid)
          availableCpus(i) -= CPUS_PER_TASK
          assert(availableCpus(i) >= 0)
          // Only update hosts for a barrier task.
          if (taskSet.isBarrier) {
            // The executor address is expected to be non empty.
            addressesWithDescs += (shuffledOffers(i).address.get -> task)
          }
          launchedTask = true
        }
      } catch {
        case e: TaskNotSerializableException =>
          logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
          // Do not offer resources for this task, but don't throw an error to allow other
          // task sets to be submitted.
          return launchedTask
      }
    }
  }
  return launchedTask
}

The process above calls TaskSetManager's resourceOffer() method, which serializes the task and wraps it into a TaskDescription that the final launchTasks call will submit.

// TaskSetManager.scala
def resourceOffer(
    execId: String,
    host: String,
    maxLocality: TaskLocality.TaskLocality)
  : Option[TaskDescription] =
{
  val offerBlacklisted = taskSetBlacklistHelperOpt.exists { blacklist =>
    blacklist.isNodeBlacklistedForTaskSet(host) ||
      blacklist.isExecutorBlacklistedForTaskSet(execId)
  }
  if (!isZombie && !offerBlacklisted) {
    // Get the current time
    val curTime = clock.getTimeMillis()
    // The best locality level currently allowed for the task
    var allowedLocality = maxLocality
    // If the level is not NO_PREF
    if (maxLocality != TaskLocality.NO_PREF) {
      // Fetch the locality level currently allowed for this task set.
      // Even if a level failed to launch anything in the outer loop (in resourceOffers), tasks may still be
      // selected at that level because the wait time has not yet elapsed.
      allowedLocality = getAllowedLocalityLevel(curTime)
      // For example, even if the maxLocality passed in is NODE_LOCAL, scheduling may still happen at
      // PROCESS_LOCAL because the wait time (3 seconds by default) has not elapsed. This is Spark's delay
      // scheduling mechanism: it bets that waiting is cheaper than a network transfer.
      // The five levels PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY map to 0 through 4.
      if (allowedLocality > maxLocality) {
        // We're not allowed to search for farther-away tasks
        allowedLocality = maxLocality
      }
    }
    // Build a TaskDescription for a task that can achieve the allowedLocality level on execId
    dequeueTask(execId, host, allowedLocality).map { case ((index, taskLocality, speculative)) =>
      // Found a task; do some bookkeeping and return a task description
      // Fetch the task
      val task = tasks(index)
      val taskId = sched.newTaskId()
      // Do various bookkeeping
      copiesRunning(index) += 1
      val attemptNum = taskAttempts(index).size
      // Create a TaskInfo carrying all the metadata of this task attempt
      val info = new TaskInfo(taskId, index, attemptNum, curTime,
        execId, host, taskLocality, speculative)
      taskInfos(taskId) = info
      taskAttempts(index) = info :: taskAttempts(index)
      // Update our locality level for delay scheduling
      // NO_PREF will not affect the variables related to delay scheduling
      if (maxLocality != TaskLocality.NO_PREF) {
        currentLocalityIndex = getLocalityIndex(taskLocality)
        lastLaunchTime = curTime
      }
      // Serialize and return the task
      // Serialize the task
      val serializedTask: ByteBuffer = try {
        ser.serialize(task)
      } catch {
        // If the task cannot be serialized, then there's no point to re-attempt the task,
        // as it will always fail. So just abort the whole task-set.
        case NonFatal(e) =>
          val msg = s"Failed to serialize task $taskId, not attempting to retry it."
          logError(msg, e)
          abort(s"$msg Exception during serialization: $e")
          throw new TaskNotSerializableException(e)
      }
      if (serializedTask.limit() > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 &&
        !emittedTaskSizeWarning) {
        emittedTaskSizeWarning = true
        logWarning(s"Stage ${task.stageId} contains a task of very large size " +
          s"(${serializedTask.limit() / 1024} KB). The maximum recommended task size is " +
          s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.")
      }
      addRunningTask(taskId)

      // We used to log the time it takes to serialize the task, but task size is already
      // a good proxy to task serialization time.
      // val timeTaken = clock.getTime() - startTime
      val taskName = s"task ${info.id} in stage ${taskSet.id}"
      logInfo(s"Starting $taskName (TID $taskId, $host, executor ${info.executorId}, " +
        s"partition ${task.partitionId}, $taskLocality, ${serializedTask.limit()} bytes)")

      sched.dagScheduler.taskStarted(task, info)
      // Build a TaskDescription, which records the host and executor the task will run on,
      // as well as all the jars and files that need to be added to the executor's classpath
      new TaskDescription(
        taskId,
        attemptNum,
        execId,
        taskName,
        index,
        task.partitionId,
        addedFiles,
        addedJars,
        task.localProperties,
        serializedTask)
    }
  } else {
    None
  }
}
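
The wait that drives this delay-scheduling fallback is governed by spark.locality.wait and its per-level variants. A configuration sketch (the values shown are the defaults, not tuning advice):

// Delay-scheduling knobs: how long to wait at each locality level before falling back.
import org.apache.spark.SparkConf

val localityConf = new SparkConf()
  .set("spark.locality.wait", "3s")          // base wait shared by all levels
  .set("spark.locality.wait.process", "3s")  // PROCESS_LOCAL -> NODE_LOCAL fallback
  .set("spark.locality.wait.node", "3s")     // NODE_LOCAL -> RACK_LOCAL fallback
  .set("spark.locality.wait.rack", "3s")     // RACK_LOCAL -> ANY fallback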

5.2 Launching the tasks
// CoarseGrainedSchedulerBackend.scala
// Launch tasks returned by a set of resource offers
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
  for (task <- tasks.flatten) {
    // First serialize the task so it can be sent over the network
    val serializedTask = TaskDescription.encode(task)
    if (serializedTask.limit() >= maxRpcMessageSize) {
      Option(scheduler.taskIdToTaskSetManager.get(task.taskId)).foreach { taskSetMgr =>
        try {
          var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
            "spark.rpc.message.maxSize (%d bytes). Consider increasing " +
            "spark.rpc.message.maxSize or using broadcast variables for large values."
          msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize)
          taskSetMgr.abort(msg)
        } catch {
          case e: Exception => logError("Exception in error callback", e)
        }
      }
    }
    else {
      // Look up the executor identified by the task's executorId
      val executorData = executorDataMap(task.executorId)
      // Subtract the cores this task needs from the executor's free cores
      executorData.freeCores -= scheduler.CPUS_PER_TASK

      logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
        s"${executorData.executorHost}.")
      // Send a LaunchTask message to the executor to start the task there
      executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
    }
  }
}

The serialized tasks are then dispatched to their corresponding executors for execution.
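
Note the size guard in launchTasks above: a serialized TaskDescription larger than spark.rpc.message.maxSize (128 MB by default) aborts the whole TaskSetManager. As the error message suggests, the usual remedy is to ship large values via broadcast variables rather than inside the task closure; raising the limit is the fallback. A sketch (values are illustrative):

// Keep task closures small: broadcast large lookup data once per executor,
// and only raise the RPC limit (in MB) as a last resort.
import org.apache.spark.{SparkConf, SparkContext}

val bigTaskConf = new SparkConf()
  .setAppName("big-task-demo")
  .setMaster("local[2]")
  .set("spark.rpc.message.maxSize", "256")     // default is 128 (MB)
val bigTaskSc = new SparkContext(bigTaskConf)
val lookup = bigTaskSc.broadcast(Map("key" -> "value"))   // executors read lookup.value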

If anything above is incorrect or unclear, feel free to leave a comment and discuss!
