Once a job is submitted, a DAG is built and stages are carved out of it according to the RDD dependencies. There are two kinds of stages, ShuffleMapStage and ResultStage, and each generates the corresponding task type: ShuffleMapTask and ResultTask. We start the analysis from the submission of a stage.
Note:
The comment on the first line of each code block names the file the code comes from.
The analysis assumes the standalone deploy mode.
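Before diving into the source, a quick illustration of where a stage boundary falls: in the hypothetical word-count job below, reduceByKey introduces a shuffle dependency, so the DAG is split into one ShuffleMapStage and one ResultStage (example code, not from the Spark source).
// Example: a two-stage word-count job (hypothetical, for illustration)
import org.apache.spark.{SparkConf, SparkContext}

object StageSplitExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("stage-split-example").setMaster("spark://master:7077")
    val sc = new SparkContext(conf)
    val counts = sc.textFile("hdfs:///input/words.txt") // narrow dependencies stay in the same stage
      .flatMap(_.split("\\s+"))
      .map(word => (word, 1))
      .reduceByKey(_ + _) // shuffle dependency: boundary between ShuffleMapStage and ResultStage
    counts.collect()      // the action submits the job; ResultTasks compute the final partitions
    sc.stop()
  }
}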
// DAGScheduler.scala
private def submitStage(stage: Stage) {
val jobId = activeJobForStage(stage)
if (jobId.isDefined) {
logDebug("submitStage(" + stage + ")")
if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
val missing = getMissingParentStages(stage).sortBy(_.id) // 1. Get the parent stages that have not yet been submitted
logDebug("missing: " + missing)
if (missing.isEmpty) { // If there is no unsubmitted parent stage
logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
submitMissingTasks(stage, jobId.get) // 2. Submit the current stage
} else {
for (parent <- missing) {
submitStage(parent)
}
waitingStages += stage
}
}
} else {
abortStage(stage, "No active job for stage " + stage.id, None)
}
}
//DAGScheduler.scala
private def submitMissingTasks(stage: Stage, jobId: Int) {
...
// Build the tasks
val tasks: Seq[Task[_]] = try {
val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
stage match {
case stage: ShuffleMapStage =>
stage.pendingPartitions.clear()
partitionsToCompute.map { id =>
val locs = taskIdToLocations(id)
val part = partitions(id)
stage.pendingPartitions += id
new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
Option(sc.applicationId), sc.applicationAttemptId, stage.rdd.isBarrier())
}
case stage: ResultStage =>
partitionsToCompute.map { id =>
val p: Int = stage.partitions(id)
val part = partitions(p)
val locs = taskIdToLocations(id)
new ResultTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, id, properties, serializedTaskMetrics,
Option(jobId), Option(sc.applicationId), sc.applicationAttemptId,
stage.rdd.isBarrier())
}
}
}
if (tasks.size > 0) {
logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
// taskScheduler:TaskSchedulerImpl
taskScheduler.submitTasks(new TaskSet( // 1. Submit the tasks
tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
}
}
The TaskSet class is defined as follows:
// TaskSet.scala
private[spark] class TaskSet(
val tasks: Array[Task[_]],
val stageId: Int,
val stageAttemptId: Int,
val priority: Int, // this is the jobId
val properties: Properties) {
val id: String = stageId + "." + stageAttemptId
override def toString: String = "TaskSet " + id
}
// TaskSchedulerImpl.scala
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
// Create a TaskSetManager, which schedules the tasks of the TaskSet: tracking their execution, retrying failed tasks, and so on
val manager = createTaskSetManager(taskSet, maxTaskFailures) // 1
val stage = taskSet.stageId
val stageTaskSets =
taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
stageTaskSets(taskSet.stageAttemptId) = manager
val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
ts.taskSet != taskSet && !ts.isZombie
}
if (conflictingTaskSet) {
throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
}
// Add the TaskSetManager to the schedulableBuilder, which schedules all TaskSets,
// i.e. the TaskSets of every stage in the DAG of the whole Spark application
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties) // 2
if (!isLocal && !hasReceivedTask) {
// Set up a timer that checks whether TaskSchedulerImpl is starving
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
// Note: this if checks whether any task has been launched yet
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
this.cancel() // Once TaskSchedulerImpl has launched a task, cancel this timer
}
}
}, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
}
// Set to true as soon as a TaskSetManager has been added to the scheduling pool.
hasReceivedTask = true
}
// Send a ReviveOffers message to the driver to allocate resources for the tasks.
// backend is a StandaloneSchedulerBackend in standalone mode, but reviveOffers comes from its parent class CoarseGrainedSchedulerBackend
backend.reviveOffers() // 3
}
This code uses schedulableBuilder, so let us see when that object is created.
Recall that the DAGScheduler and the TaskScheduler are created while the SparkContext is initialized, as the following code shows:
// SparkContext.scala
// Create and start the scheduler
// Create the SchedulerBackend and TaskScheduler that match the cluster manager type
val (sched, ts) = SparkContext.createTaskScheduler(this, master, deployMode)
_schedulerBackend = sched
_taskScheduler = ts
_dagScheduler = new DAGScheduler(this)
_heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)
// start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's
// constructor
// Start the TaskScheduler and the SchedulerBackend
_taskScheduler.start()
While creating the TaskScheduler, createTaskScheduler also calls its initialize() method.
// SparkContext.scala
private def createTaskScheduler(
sc: SparkContext,
master: String,
deployMode: String): (SchedulerBackend, TaskScheduler) = {
import SparkMasterRegex._
// When running locally, don't try to re-execute tasks on failure.
val MAX_LOCAL_TASK_FAILURES = 1
master match {
case SPARK_REGEX(sparkUrl) =>
// TaskSchedulerImpl is the implementation class of TaskScheduler
val scheduler = new TaskSchedulerImpl(sc)
val masterUrls = sparkUrl.split(",").map("spark://" + _)
// A different SchedulerBackend is created depending on the Spark deploy mode.
val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
scheduler.initialize(backend)
(backend, scheduler)
...
}
}
The scheduling pool is created when the TaskScheduler's initialize() method is called, as shown below:
// TaskSchedulerImpl.scala
def initialize(backend: SchedulerBackend) {
this.backend = backend
schedulableBuilder = {
schedulingMode match {
case SchedulingMode.FIFO =>
new FIFOSchedulableBuilder(rootPool)
case SchedulingMode.FAIR =>
new FairSchedulableBuilder(rootPool, conf)
case _ =>
throw new IllegalArgumentException(s"Unsupported $SCHEDULER_MODE_PROPERTY: " +
s"$schedulingMode")
}
}
schedulableBuilder.buildPools()
}
// CoarseGrainedSchedulerBackend.scala
override def reviveOffers() {
// Send a ReviveOffers message to the driver
driverEndpoint.send(ReviveOffers)
}
driverEndpoint is the address of the driver-side endpoint. Spark uses Netty as its communication framework; every component has its own endpoint, and to talk to a component you first obtain a reference to that component's endpoint and then send messages to it.
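The pattern can be pictured with the toy sketch below; the types here are made up purely for illustration and are not Spark's actual RpcEnv/RpcEndpoint API.
// Toy sketch of the endpoint/message pattern (hypothetical types, not Spark's RPC API)
object EndpointSketch {
  sealed trait SchedulerMessage
  case object ReviveOffers extends SchedulerMessage

  // Stands in for an endpoint reference: fire-and-forget message delivery.
  trait EndpointRef {
    def send(msg: SchedulerMessage): Unit
  }

  // Stands in for the driver-side endpoint that handles scheduler messages.
  class DriverEndpoint extends EndpointRef {
    override def send(msg: SchedulerMessage): Unit = msg match {
      case ReviveOffers => println("makeOffers() would run here")
    }
  }

  def main(args: Array[String]): Unit = {
    val driverEndpoint: EndpointRef = new DriverEndpoint
    driverEndpoint.send(ReviveOffers) // this is essentially all that backend.reviveOffers() does
  }
}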
// CoarseGrainedSchedulerBackend.scala / DriverEndpoint
case ReviveOffers =>
makeOffers()
// CoarseGrainedSchedulerBackend.scala / DriverEndpoint
// Make fake resource offers on all executors
private def makeOffers() {
// Make sure no executor is killed while some task is launching on it
val taskDescs = CoarseGrainedSchedulerBackend.this.synchronized {
// Filter out executors under killing
// All executors that are currently alive
val activeExecutors = executorDataMap.filterKeys(executorIsAlive)
// A WorkerOffer describes the resources available on an executor
val workOffers = activeExecutors.map {
case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores,
Some(executorData.executorAddress.hostPort))
}.toIndexedSeq
// Call TaskSchedulerImpl.resourceOffers to allocate resources to tasks; other cluster managers go through the same path
scheduler.resourceOffers(workOffers) // 1
}
if (!taskDescs.isEmpty) {
// TaskSchedulerImpl.resourceOffers (called above) allocated the resources the tasks will run on;
// launchTasks in CoarseGrainedSchedulerBackend now launches them,
// and the tasks are finally submitted to executors on the worker nodes
launchTasks(taskDescs) // 2
}
}
To make the code easier to read, here is the type of executorDataMap: its key is the executorId and its value holds that executor's information.
private val executorDataMap = new HashMap[String, ExecutorData]
private[cluster] class ExecutorData(
val executorEndpoint: RpcEndpointRef,
val executorAddress: RpcAddress,
override val executorHost: String,
var freeCores: Int,
override val totalCores: Int,
override val logUrlMap: Map[String, String]
) extends ExecutorInfo(executorHost, totalCores, logUrlMap)
// TaskSchedulerImpl.scala
// A WorkerOffer describes the resources available on an executor
def resourceOffers(offers: IndexedSeq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
// Mark each slave as alive and remember its hostname
// Also track if new executor is added
var newExecAvail = false
// This loop updates hostToExecutors, executorIdToHost, hostsByRack, and related collections
for (o <- offers) {
if (!hostToExecutors.contains(o.host)) {
// hostToExecutors tracks every active executor on each host
hostToExecutors(o.host) = new HashSet[String]()
}
// executorIdToRunningTaskIds tracks the tasks currently running on each executor
if (!executorIdToRunningTaskIds.contains(o.executorId)) {
hostToExecutors(o.host) += o.executorId
// Send an ExecutorAdded message to the DAGScheduler
executorAdded(o.executorId, o.host)
executorIdToHost(o.executorId) = o.host
executorIdToRunningTaskIds(o.executorId) = HashSet[Long]()
newExecAvail = true // Mark that a new executor has been added
}
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
// hostToExecutors and hostsByRack are used later to compute task locality when allocating resources
}
// Before making any offers, remove any nodes from the blacklist whose blacklist has expired. Do
// this here to avoid a separate thread and added synchronization overhead, and also because
// updating the blacklist is only relevant when task offers are being made.
// Expired nodes are removed from the blacklist here
blacklistTrackerOpt.foreach(_.applyBlacklistTimeout())
val filteredOffers = blacklistTrackerOpt.map { blacklistTracker =>
offers.filter { offer =>
!blacklistTracker.isNodeBlacklisted(offer.host) &&
!blacklistTracker.isExecutorBlacklisted(offer.executorId)
}
}.getOrElse(offers)
// Shuffle the offers so tasks are spread evenly across the worker nodes for load balancing,
// instead of always landing on the same set of workers
val shuffledOffers = shuffleOffers(filteredOffers)
// Build a list of tasks to assign to each worker.
// For each WorkerOffer, create a TaskDescription buffer sized by its number of available CPU cores
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
val availableCpus = shuffledOffers.map(o => o.cores).toArray
val availableSlots = shuffledOffers.map(o => o.cores / CPUS_PER_TASK).sum
val sortedTaskSets = rootPool.getSortedTaskSetQueue // 1. Get the scheduling order of the TaskSetManagers
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
// executorAdded() re-runs the TaskSetManager's locality computation,
// assigning each task its locality level,
// because a new executor has become available and the TaskSet's locality may change
taskSet.executorAdded()
}
}
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
// Schedule the tasks according to their locality preferences
for (taskSet <- sortedTaskSets) {
// Skip the barrier taskSet if the available slots are less than the number of pending tasks.
// A barrier TaskSet requires all of its tasks to start at the same time, so it can only be scheduled
// when the number of available cores (slots) is at least its number of tasks. isBarrier defaults to false.
if (taskSet.isBarrier && availableSlots < taskSet.numTasks) {
// Skip the launch process.
// TODO SPARK-24819 If the job requires more slots than available (both busy and free
// slots), fail the job on submit.
logInfo(s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +
s"because the barrier taskSet requires ${taskSet.numTasks} slots, while the total " +
s"number of available slots is $availableSlots.")
} else {
var launchedAnyTask = false
// Record all the executor IDs assigned barrier tasks on.
val addressesWithDescs = ArrayBuffer[(String, TaskDescription)]()
// Iterate over all locality levels of the current TaskSet
for (currentMaxLocality <- taskSet.myLocalityLevels) {
var launchedTaskAtCurrentMaxLocality = false
do {
// 2. Allocate resources at the current locality level
launchedTaskAtCurrentMaxLocality = resourceOfferSingleTaskSet(taskSet,
currentMaxLocality, shuffledOffers, availableCpus, tasks, addressesWithDescs)
launchedAnyTask |= launchedTaskAtCurrentMaxLocality
} while (launchedTaskAtCurrentMaxLocality)
// Move to the next locality level only after allocation at the current level fails.
// In practice a level is only given up after its wait time (3s by default) has elapsed;
// this can be seen in resourceOffer, which is called from resourceOfferSingleTaskSet
}
if (!launchedAnyTask) {
taskSet.abortIfCompletelyBlacklisted(hostToExecutors)
}
if (launchedAnyTask && taskSet.isBarrier) {
// Check whether the barrier tasks are partially launched.
// TODO SPARK-24818 handle the assert failure case (that can happen when some locality
// requirements are not fulfilled, and we should revert the launched tasks).
require(addressesWithDescs.size == taskSet.numTasks,
s"Skip current round of resource offers for barrier stage ${taskSet.stageId} " +
s"because only ${addressesWithDescs.size} out of a total number of " +
s"${taskSet.numTasks} tasks got resource offers. The resource offers may have " +
"been blacklisted or cannot fulfill task locality requirements.")
// materialize the barrier coordinator.
maybeInitBarrierCoordinator()
// Update the taskInfos into all the barrier task properties.
val addressesStr = addressesWithDescs
// Addresses ordered by partitionId
.sortBy(_._2.partitionId)
.map(_._1)
.mkString(",")
addressesWithDescs.foreach(_._2.properties.setProperty("addresses", addressesStr))
logInfo(s"Successfully scheduled all the ${addressesWithDescs.size} tasks for barrier " +
s"stage ${taskSet.stageId}.")
}
}
}
// TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get
// launched within a configured time.
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks // Return the tasks that have been assigned resources
}
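As an aside, the barrier branch above only applies to barrier TaskSets, which come from RDD.barrier(); here is a minimal sketch of how such a stage is produced (hypothetical usage, assumes an existing SparkContext sc, Spark 2.4+):
// Example: producing a barrier stage (hypothetical, assumes an existing SparkContext `sc`)
import org.apache.spark.BarrierTaskContext

val doubled = sc.parallelize(1 to 100, numSlices = 4)
  .barrier()                       // all 4 tasks of this stage must be scheduled together
  .mapPartitions { iter =>
    val ctx = BarrierTaskContext.get()
    ctx.barrier()                  // global barrier: waits until every task in the stage reaches this point
    iter.map(_ * 2)
  }
doubled.collect()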
Key pieces of the code above, explained:
// Pool.scala
override def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] = {
val sortedTaskSetQueue = new ArrayBuffer[TaskSetManager]
val sortedSchedulableQueue =
schedulableQueue.asScala.toSeq.sortWith(taskSetSchedulingAlgorithm.comparator) // 1
for (schedulable <- sortedSchedulableQueue) {
sortedTaskSetQueue ++= schedulable.getSortedTaskSetQueue
}
sortedTaskSetQueue
}
The core of this method is the sort, and the resulting order depends on the comparator. Spark has two stage scheduling modes, FIFO and FAIR, so the scheduling algorithm is chosen according to the mode configured at runtime; the default is FIFO.
// Pool.scala
private val taskSetSchedulingAlgorithm: SchedulingAlgorithm = {
schedulingMode match {
case SchedulingMode.FAIR =>
new FairSchedulingAlgorithm()
case SchedulingMode.FIFO =>
new FIFOSchedulingAlgorithm()
case _ =>
val msg = s"Unsupported scheduling mode: $schedulingMode. Use FAIR or FIFO instead."
throw new IllegalArgumentException(msg)
}
}
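The schedulingMode itself comes from the spark.scheduler.mode configuration (FIFO by default). A minimal sketch of switching an application to FAIR scheduling, e.g. in spark-shell or an application's main (the pool name and file path are hypothetical):
// Example: enabling FAIR scheduling (hypothetical application setup)
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("fair-scheduling-example")
  .set("spark.scheduler.mode", "FAIR") // read by TaskSchedulerImpl when it builds the root pool
  // .set("spark.scheduler.allocation.file", "/path/to/fairscheduler.xml") // optional pool definitions
val sc = new SparkContext(conf)
sc.setLocalProperty("spark.scheduler.pool", "myPool") // jobs submitted from this thread go to "myPool"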
The comparators of the two algorithms are shown below.
FAIR
// SchedulingAlgorithm.scala
// A pool running fewer tasks than its minShare has higher priority than one that is not.
// If both run fewer tasks than their minShare, compare the minShare usage ratios: the lower the ratio,
// the higher the priority; ties are broken by name.
// If both run at least minShare tasks, compare the weight usage ratios: the lower the ratio,
// the higher the priority; ties are broken by name.
private[spark] class FairSchedulingAlgorithm extends SchedulingAlgorithm {
override def comparator(s1: Schedulable, s2: Schedulable): Boolean = {
val minShare1 = s1.minShare
val minShare2 = s2.minShare
// runningTasks is simply the size of the runningTasksSet collection
val runningTasks1 = s1.runningTasks
val runningTasks2 = s2.runningTasks
val s1Needy = runningTasks1 < minShare1
val s2Needy = runningTasks2 < minShare2
val minShareRatio1 = runningTasks1.toDouble / math.max(minShare1, 1.0)
val minShareRatio2 = runningTasks2.toDouble / math.max(minShare2, 1.0)
val taskToWeightRatio1 = runningTasks1.toDouble / s1.weight.toDouble
val taskToWeightRatio2 = runningTasks2.toDouble / s2.weight.toDouble
var compare = 0
if (s1Needy && !s2Needy) {
return true
} else if (!s1Needy && s2Needy) {
return false
} else if (s1Needy && s2Needy) {
compare = minShareRatio1.compareTo(minShareRatio2)
} else {
compare = taskToWeightRatio1.compareTo(taskToWeightRatio2)
}
if (compare < 0) {
true
} else if (compare > 0) {
false
} else {
s1.name < s2.name
}
}
}
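To make the ordering concrete, here is a REPL-pastable re-implementation of the same comparison over a simplified data type (hypothetical PoolStats case class, not Spark's Schedulable trait):
// Simplified re-implementation of the FAIR ordering for illustration (hypothetical types)
case class PoolStats(name: String, runningTasks: Int, minShare: Int, weight: Int)

def fairLessThan(s1: PoolStats, s2: PoolStats): Boolean = {
  val s1Needy = s1.runningTasks < s1.minShare
  val s2Needy = s2.runningTasks < s2.minShare
  val minShareRatio1 = s1.runningTasks.toDouble / math.max(s1.minShare, 1.0)
  val minShareRatio2 = s2.runningTasks.toDouble / math.max(s2.minShare, 1.0)
  val taskToWeightRatio1 = s1.runningTasks.toDouble / s1.weight
  val taskToWeightRatio2 = s2.runningTasks.toDouble / s2.weight
  if (s1Needy && !s2Needy) true
  else if (!s1Needy && s2Needy) false
  else if (s1Needy && s2Needy) {
    if (minShareRatio1 != minShareRatio2) minShareRatio1 < minShareRatio2 else s1.name < s2.name
  } else {
    if (taskToWeightRatio1 != taskToWeightRatio2) taskToWeightRatio1 < taskToWeightRatio2 else s1.name < s2.name
  }
}

// poolA is below its minShare, so it is scheduled before poolB even though poolB runs fewer tasks.
val poolA = PoolStats("poolA", runningTasks = 2, minShare = 4, weight = 1)
val poolB = PoolStats("poolB", runningTasks = 1, minShare = 0, weight = 1)
assert(fairLessThan(poolA, poolB)) // true: poolA is "needy"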
FIFO
// SchedulingAlgorithm.scala
private[spark] class FIFOSchedulingAlgorithm extends SchedulingAlgorithm {
override def comparator(s1: Schedulable, s2: Schedulable): Boolean = {
val priority1 = s1.priority
val priority2 = s2.priority
var res = math.signum(priority1 - priority2)
if (res == 0) {
// Two stages of the same job with no dependency between them can be in the pool at the same time
val stageId1 = s1.stageId
val stageId2 = s2.stageId
res = math.signum(stageId1 - stageId2)
}
res < 0
}
}
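In other words, under FIFO the TaskSets are effectively ordered by (jobId, stageId); the same ordering can be reproduced with a plain tuple sort (illustrative, hypothetical data):
// Illustrative FIFO ordering: lower jobId (priority) first, then lower stageId
case class TaskSetKey(jobId: Int, stageId: Int)

val submitted = Seq(TaskSetKey(2, 5), TaskSetKey(1, 3), TaskSetKey(1, 2))
val fifoOrder = submitted.sortBy(ts => (ts.jobId, ts.stageId))
// fifoOrder == Seq(TaskSetKey(1, 2), TaskSetKey(1, 3), TaskSetKey(2, 5))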
Back in resourceOffers, the locality levels of a TaskSet come from myLocalityLevels, which is computed as follows:
// TaskSetManager.scala
private[scheduler] var myLocalityLevels = computeValidLocalityLevels()
private def computeValidLocalityLevels(): Array[TaskLocality.TaskLocality] = {
import TaskLocality.{PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY}
val levels = new ArrayBuffer[TaskLocality.TaskLocality]
if (!pendingTasksForExecutor.isEmpty &&
pendingTasksForExecutor.keySet.exists(sched.isExecutorAlive(_))) {
levels += PROCESS_LOCAL
}
if (!pendingTasksForHost.isEmpty &&
pendingTasksForHost.keySet.exists(sched.hasExecutorsAliveOnHost(_))) {
levels += NODE_LOCAL
}
if (!pendingTasksWithNoPrefs.isEmpty) {
levels += NO_PREF
}
if (!pendingTasksForRack.isEmpty &&
pendingTasksForRack.keySet.exists(sched.hasHostAliveOnRack(_))) {
levels += RACK_LOCAL
}
levels += ANY
logDebug("Valid locality levels for " + taskSet + ": " + levels.mkString(", "))
levels.toArray
}
The pendingTasksForExecutor and related collections are initialized by the following code:
// TaskSetManager.scala
// The following code runs when the TaskSetManager is constructed.
// It initializes pendingTasksForExecutor, pendingTasksForHost, and the other pending-task collections.
// Indices are added in reverse order, so later on the task chosen is by default the last one,
// because removing the last element of an ArrayBuffer is cheaper
for (i <- (0 until numTasks).reverse) {
addPendingTask(i)
}
addPendingTask() fills pendingTasksForExecutor, pendingTasksForHost, and the other pending-task collections, which determine the locality level of each task.
// TaskSetManager.scala
private[spark] def addPendingTask(index: Int) {
for (loc <- tasks(index).preferredLocations) {
loc match {
case e: ExecutorCacheTaskLocation =>
pendingTasksForExecutor.getOrElseUpdate(e.executorId, new ArrayBuffer) += index
case e: HDFSCacheTaskLocation =>
val exe = sched.getExecutorsAliveOnHost(loc.host)
exe match {
case Some(set) =>
for (e <- set) {
pendingTasksForExecutor.getOrElseUpdate(e, new ArrayBuffer) += index
}
logInfo(s"Pending task $index has a cached location at ${e.host} " +
", where there are executors " + set.mkString(","))
case None => logDebug(s"Pending task $index has a cached location at ${e.host} " +
", but there are no executors alive there.")
}
case _ =>
}
pendingTasksForHost.getOrElseUpdate(loc.host, new ArrayBuffer) += index
for (rack <- sched.getRackForHost(loc.host)) {
pendingTasksForRack.getOrElseUpdate(rack, new ArrayBuffer) += index
}
}
// A task with no preferred locations is treated as NO_PREF
if (tasks(index).preferredLocations == Nil) {
pendingTasksWithNoPrefs += index
}
allPendingTasks += index // No point scanning this whole list to find the old task there
}
preferredLocations holds the preferred locations of the task.
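Preferred locations usually come from the data source (HDFS block locations, cached partitions, and so on), but they can also be supplied explicitly; a small sketch (hypothetical hostnames, assumes an existing SparkContext sc):
// Example: an RDD with explicit preferred locations (hypothetical hostnames)
val rddWithPrefs = sc.makeRDD(Seq(
  ("partition-0-data", Seq("host1")),
  ("partition-1-data", Seq("host2", "host3"))
))
// rddWithPrefs.preferredLocations(rddWithPrefs.partitions(0)) == Seq("host1")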
// TaskSchedulerImpl.scala
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
shuffledOffers: Seq[WorkerOffer],
availableCpus: Array[Int],
tasks: IndexedSeq[ArrayBuffer[TaskDescription]],
addressesWithDescs: ArrayBuffer[(String, TaskDescription)]) : Boolean = {
var launchedTask = false
// nodes and executors that are blacklisted for the entire application have already been
// filtered out by this point
// Iterate over every executor (WorkerOffer)
for (i <- 0 until shuffledOffers.size) {
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
// Check that the executor still has at least CPUS_PER_TASK cores available (1 by default)
if (availableCpus(i) >= CPUS_PER_TASK) {
try {
// resourceOffer does the bookkeeping for each selected task and returns its TaskDescription
// task: TaskDescription
// resourceOffer returns a task whose locality level can be satisfied on this executor
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { // 1
tasks(i) += task
val tid = task.taskId
taskIdToTaskSetManager.put(tid, taskSet)
taskIdToExecutorId(tid) = execId
executorIdToRunningTaskIds(execId).add(tid)
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
// Only update hosts for a barrier task.
if (taskSet.isBarrier) {
// The executor address is expected to be non empty.
addressesWithDescs += (shuffledOffers(i).address.get -> task)
}
launchedTask = true
}
} catch {
case e: TaskNotSerializableException =>
logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
// Do not offer resources for this task, but don't throw an error to allow other
// task sets to be submitted.
return launchedTask
}
}
}
return launchedTask
}
This calls TaskSetManager's resourceOffer() method, which serializes the task and wraps it into a TaskDescription; that TaskDescription is what the launchTasks method later submits.
// TaskSetManager.scala
def resourceOffer(
execId: String,
host: String,
maxLocality: TaskLocality.TaskLocality)
: Option[TaskDescription] =
{
val offerBlacklisted = taskSetBlacklistHelperOpt.exists { blacklist =>
blacklist.isNodeBlacklistedForTaskSet(host) ||
blacklist.isExecutorBlacklistedForTaskSet(execId)
}
if (!isZombie && !offerBlacklisted) {
// Get the current time
val curTime = clock.getTimeMillis()
// The best locality level currently allowed for this task set
var allowedLocality = maxLocality
// If the level is not NO_PREF
if (maxLocality != TaskLocality.NO_PREF) {
// getAllowedLocalityLevel returns the locality level the task set is currently allowed to use.
// Even if allocation at some level failed in the outer loop (in resourceOffers), as long as the
// wait time has not elapsed, tasks keep being selected at that level.
allowedLocality = getAllowedLocalityLevel(curTime)
// For example, even if the maxLocality argument is NODE_LOCAL, scheduling still happens at the
// PROCESS_LOCAL level until the wait time (3s by default) has expired. This is Spark's delay
// scheduling: it bets that waiting is cheaper than a network transfer.
// The five levels PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY map to 0,1,2,3,4
if (allowedLocality > maxLocality) {
// We're not allowed to search for farther-away tasks
allowedLocality = maxLocality
}
}
// Dequeue a task that can run on execId at the allowedLocality level
dequeueTask(execId, host, allowedLocality).map { case ((index, taskLocality, speculative)) =>
// Found a task; do some bookkeeping and return a task description
// Get the task
val task = tasks(index)
val taskId = sched.newTaskId()
// Do various bookkeeping
copiesRunning(index) += 1
val attemptNum = taskAttempts(index).size
// Create a TaskInfo that carries all the metadata of this task attempt
val info = new TaskInfo(taskId, index, attemptNum, curTime,
execId, host, taskLocality, speculative)
taskInfos(taskId) = info
taskAttempts(index) = info :: taskAttempts(index)
// Update our locality level for delay scheduling
// NO_PREF will not affect the variables related to delay scheduling
if (maxLocality != TaskLocality.NO_PREF) {
currentLocalityIndex = getLocalityIndex(taskLocality)
lastLaunchTime = curTime
}
// Serialize and return the task
// Serialize the task
val serializedTask: ByteBuffer = try {
ser.serialize(task)
} catch {
// If the task cannot be serialized, then there's no point to re-attempt the task,
// as it will always fail. So just abort the whole task-set.
case NonFatal(e) =>
val msg = s"Failed to serialize task $taskId, not attempting to retry it."
logError(msg, e)
abort(s"$msg Exception during serialization: $e")
throw new TaskNotSerializableException(e)
}
if (serializedTask.limit() > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 &&
!emittedTaskSizeWarning) {
emittedTaskSizeWarning = true
logWarning(s"Stage ${task.stageId} contains a task of very large size " +
s"(${serializedTask.limit() / 1024} KB). The maximum recommended task size is " +
s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.")
}
addRunningTask(taskId)
// We used to log the time it takes to serialize the task, but task size is already
// a good proxy to task serialization time.
// val timeTaken = clock.getTime() - startTime
val taskName = s"task ${info.id} in stage ${taskSet.id}"
logInfo(s"Starting $taskName (TID $taskId, $host, executor ${info.executorId}, " +
s"partition ${task.partitionId}, $taskLocality, ${serializedTask.limit()} bytes)")
sched.dagScheduler.taskStarted(task, info)
// Build a TaskDescription, which records on which executor of which host the task runs,
// together with all the jars and files that must be added to the executor's classpath
new TaskDescription(
taskId,
attemptNum,
execId,
taskName,
index,
task.partitionId,
addedFiles,
addedJars,
task.localProperties,
serializedTask)
}
} else {
None
}
}
// CoarseGrainedSchedulerBackend.scala
// Launch tasks returned by a set of resource offers
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
// Serialize the task description so it can be sent over the network to the executor
val serializedTask = TaskDescription.encode(task)
if (serializedTask.limit() >= maxRpcMessageSize) {
Option(scheduler.taskIdToTaskSetManager.get(task.taskId)).foreach { taskSetMgr =>
try {
var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
"spark.rpc.message.maxSize (%d bytes). Consider increasing " +
"spark.rpc.message.maxSize or using broadcast variables for large values."
msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize)
taskSetMgr.abort(msg)
} catch {
case e: Exception => logError("Exception in error callback", e)
}
}
}
else {
// Look up the executor using the executorId recorded in the task description
val executorData = executorDataMap(task.executorId)
// Subtract the cores this task needs from the executor's free cores
executorData.freeCores -= scheduler.CPUS_PER_TASK
logDebug(s"Launching task ${task.taskId} on executor id: ${task.executorId} hostname: " +
s"${executorData.executorHost}.")
// Send a LaunchTask message to the executor to start the task on it
executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
}
}
}
The serialized tasks are then dispatched to their executors for execution.
If anything above is wrong, or if you have questions, feel free to leave a comment and discuss!