Let's start with a diagram showing how the whole stage division algorithm comes about:
/**
* The core entry point of the DAGScheduler's job scheduling
*/
private[scheduler] def handleJobSubmitted(jobId: Int,
finalRDD: RDD[_],
func: (TaskContext, Iterator[_]) => _,
partitions: Array[Int],
allowLocal: Boolean,
callSite: CallSite,
listener: JobListener,
properties: Properties = null)
{
// Create finalStage from the final RDD that triggered the job
var finalStage: Stage = null
try {
// New stage creation may throw an exception if, for example, jobs are run on a
// HadoopRDD whose underlying HDFS files have been deleted.
// Step 1: create a Stage object and register it in the DAGScheduler's in-memory cache
finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
} catch {
case e: Exception =>
logWarning("Creating new stage failed due to exception - job: " + jobId, e)
listener.jobFailed(e)
return
}
if (finalStage != null) {
// Step 2: create a job with finalStage; in other words, finalStage is the job's last stage
val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
clearCacheLocs()
logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
job.jobId, callSite.shortForm, partitions.length, allowLocal))
logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
logInfo("Parents of final stage: " + finalStage.parents)
logInfo("Missing parents: " + getMissingParentStages(finalStage))
val shouldRunLocally =
localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
val jobSubmissionTime = clock.getTimeMillis()
if (shouldRunLocally) {
// Compute very short actions like first() or take() with no parent stages locally.
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, Seq.empty, properties))
runLocally(job)
} else {
// Step 3: put the job into the in-memory caches
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.resultOfJob = Some(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
// Step 4: submit finalStage with submitStage()
// This call ends up submitting the first stage and putting all of the other stages into the waitingStages queue
submitStage(finalStage)
// The stage division algorithm matters a great deal: anyone tuning Spark seriously needs a clear picture of
// how many jobs their Spark application is split into,
// how many stages each job is split into,
// and which code each stage covers.
// Only then can you tell which stage is running slowly or failing, troubleshoot it, and tune its performance.
// Stage division algorithm in a nutshell (see the word-count sketch after this method for a concrete example):
// 1. Work backwards from finalStage.
// 2. Cut a new stage at every wide (shuffle) dependency.
// 3. Recursively submit parent stages first.
}
}
// Submit any waiting stages
submitWaitingStages()
}
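To make that summary concrete, here is a minimal word-count sketch (hypothetical driver code, not part of the DAGScheduler source) showing how user code maps onto jobs and stages: collect() triggers exactly one job, and the single reduceByKey shuffle splits that job into two stages.
import org.apache.spark.{SparkConf, SparkContext}

object StageDivisionExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("StageDivisionExample").setMaster("local[2]"))
    // Narrow dependencies (flatMap, map) stay inside one stage.
    val pairs = sc.textFile("README.md").flatMap(_.split(" ")).map(word => (word, 1))
    // reduceByKey introduces a ShuffleDependency, so the DAG is cut here:
    // stage 0 is the shuffle map stage (textFile -> flatMap -> map), stage 1 is the final stage (reduceByKey -> collect).
    val counts = pairs.reduceByKey(_ + _)
    // collect() is the action: it triggers one job, which arrives at handleJobSubmitted above.
    counts.collect().take(10).foreach(println)
    sc.stop()
  }
}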
/**
* Submits a stage.
* This is effectively the entry point of the stage division algorithm,
* although the algorithm itself is really the combination of submitStage() and getMissingParentStages().
*/
private def submitStage(stage: Stage) {
val jobId = activeJobForStage(stage)
if (jobId.isDefined) {
logDebug("submitStage(" + stage + ")")
if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
// Call getMissingParentStages to get this stage's missing parent stages
val missing = getMissingParentStages(stage).sortBy(_.id)
logDebug("missing: " + missing)
// The recursion keeps going until it reaches a stage with no missing parents; that earliest stage is submitted first,
// while all the other stages end up in waitingStages (a toy illustration follows this method)
if (missing == Nil) {
logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
submitMissingTasks(stage, jobId.get)
} else {
// Recursively call submitStage() on each parent stage; this recursion is the engine of the stage division algorithm
for (parent <- missing) {
submitStage(parent)
}
// and add the current stage to waitingStages, the queue of stages waiting to run
waitingStages += stage
}
}
} else {
abortStage(stage, "No active job for stage " + stage.id)
}
}
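As a rough illustration of the recursion (a toy model with made-up ToyStage objects, not Spark code): submitting the final stage of a three-stage chain submits the root stage first and parks the other two in a waiting set, just as submitStage() does with waitingStages.
import scala.collection.mutable

// Hypothetical, simplified model of the parent-first recursion in submitStage().
case class ToyStage(id: Int, parents: List[ToyStage], var available: Boolean = false)

object SubmitOrderSketch {
  val waiting = mutable.LinkedHashSet[ToyStage]()

  def submit(stage: ToyStage): Unit = {
    val missing = stage.parents.filter(!_.available)     // plays the role of getMissingParentStages
    if (missing.isEmpty) {
      println(s"submitting tasks for stage ${stage.id}") // corresponds to submitMissingTasks
    } else {
      missing.foreach(submit)                            // recurse into parents first
      waiting += stage                                   // the current stage waits for its parents
    }
  }

  def main(args: Array[String]): Unit = {
    val s0 = ToyStage(0, Nil)
    val s1 = ToyStage(1, List(s0))
    val s2 = ToyStage(2, List(s1))
    submit(s2)                                               // prints: submitting tasks for stage 0
    println("waiting: " + waiting.map(_.id).mkString(", "))  // prints: waiting: 1, 2
  }
}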
/**
* The core of the stage division algorithm is here:
* it returns the missing parent stages of a given stage.
* Walking back from the stage's last RDD, as long as every dependency is a narrow dependency, no new stage is created;
* as soon as the walk hits a wide (shuffle) dependency on some RDD, a new stage is created from that RDD
* and collected into the list of missing parents that is returned (see the driver-side sketch after this method).
*/
private def getMissingParentStages(stage: Stage): List[Stage] = {
val missing = new HashSet[Stage]
val visited = new HashSet[RDD[_]]
// We are manually maintaining a stack here to prevent StackOverflowError
// caused by recursively visiting
val waitingForVisit = new Stack[RDD[_]]
// Locally defined visit function
def visit(rdd: RDD[_]) {
if (!visited(rdd)) {
visited += rdd
if (getCacheLocs(rdd).contains(Nil)) {
// Iterate over the RDD's dependencies.
// Referring back to the earlier diagram: every shuffle operation (groupByKey, reduceByKey, countByKey, ...)
// corresponds to three RDDs under the hood: MapPartitionsRDD, ShuffledRDD, MapPartitionsRDD
for (dep <- rdd.dependencies) {
dep match {
// For a wide (shuffle) dependency, create a stage from that RDD, with isShuffleMap set to true.
// The last stage is not a shuffle map stage, but every stage before finalStage is a shuffle map stage.
case shufDep: ShuffleDependency[_, _, _] =>
val mapStage = getShuffleMapStage(shufDep, stage.jobId)
if (!mapStage.isAvailable) {
missing += mapStage
}
// For a narrow dependency, push the parent RDD onto the stack
case narrowDep: NarrowDependency[_] =>
waitingForVisit.push(narrowDep.rdd)
}
}
}
}
}
// Start by pushing the stage's last RDD onto the stack
waitingForVisit.push(stage.rdd)
// Then loop until the stack is empty,
while (!waitingForVisit.isEmpty) {
// popping RDDs off the stack and calling the visit function defined above on each one
visit(waitingForVisit.pop())
}
missing.toList
}
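You can observe wide vs. narrow dependencies from the driver side by inspecting rdd.dependencies with the public RDD API; a small sketch (run for example in spark-shell; the exact RDD names printed by toDebugString vary by Spark version):
import org.apache.spark.{NarrowDependency, ShuffleDependency, SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("DepInspect").setMaster("local[2]"))
val pairs = sc.parallelize(1 to 100).map(i => (i % 10, i))   // narrow dependency: stays in the same stage
val reduced = pairs.reduceByKey(_ + _)                       // ShuffledRDD with a ShuffleDependency

reduced.dependencies.foreach {
  case _: ShuffleDependency[_, _, _] => println("wide dependency -> stage boundary here")
  case _: NarrowDependency[_]        => println("narrow dependency -> same stage")
}
// toDebugString also prints the lineage, indenting at every shuffle (i.e. stage) boundary.
println(reduced.toDebugString)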
/**
* Submits a stage by creating a batch of tasks for it; the number of tasks equals the number of partitions to compute (a short sketch follows this method).
*/
private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")
// Get our pending tasks and remember them in our pendingTasks entry
stage.pendingTasks.clear()
// First figure out the indexes of partition ids to compute.
// Work out which partitions need computing, and hence how many tasks to create
val partitionsToCompute: Seq[Int] = {
if (stage.isShuffleMap) {
(0 until stage.numPartitions).filter(id => stage.outputLocs(id) == Nil)
} else {
val job = stage.resultOfJob.get
(0 until job.numPartitions).filter(id => !job.finished(id))
}
}
val properties = if (jobIdToActiveJob.contains(jobId)) {
jobIdToActiveJob(stage.jobId).properties
} else {
// this stage will be assigned to "default" pool
null
}
// Add the stage to the runningStages set
runningStages += stage
// SparkListenerStageSubmitted should be posted before testing whether tasks are
// serializable. If tasks are not serializable, a SparkListenerStageCompleted event
// will be posted, which should always come after a corresponding SparkListenerStageSubmitted
// event.
stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size))
outputCommitCoordinator.stageStart(stage.id)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
// TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
val taskBinaryBytes: Array[Byte] =
if (stage.isShuffleMap) {
closureSerializer.serialize((stage.rdd, stage.shuffleDep.get) : AnyRef).array()
} else {
closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func) : AnyRef).array()
}
taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// In the case of a failure during serialization, abort the stage.
case e: NotSerializableException =>
abortStage(stage, "Task not serializable: " + e.toString)
runningStages -= stage
return
case NonFatal(e) =>
abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
runningStages -= stage
return
}
// Create the required number of tasks for the stage.
// A key point here is how each task's preferred locations are computed.
val tasks: Seq[Task[_]] = if (stage.isShuffleMap) {
partitionsToCompute.map { id =>
// Create one task per partition,
// computing each task's preferred locations
val locs = getPreferredLocs(stage.rdd, id)
val part = stage.rdd.partitions(id)
// For every stage other than finalStage, isShuffleMap is true, so a ShuffleMapTask is created
new ShuffleMapTask(stage.id, taskBinary, part, locs)
}
} else {
// If it is not a shuffle map stage, it is the finalStage, which creates ResultTasks
val job = stage.resultOfJob.get
partitionsToCompute.map { id =>
val p: Int = job.partitions(id)
val part = stage.rdd.partitions(p)
val locs = getPreferredLocs(stage.rdd, p)
new ResultTask(stage.id, taskBinary, part, locs, id)
}
}
if (tasks.size > 0) {
logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
stage.pendingTasks ++= tasks
logDebug("New pending tasks: " + stage.pendingTasks)
// Finally, wrap the stage's tasks in a TaskSet and submit it with TaskScheduler.submitTasks().
// In standalone mode the implementation used is TaskSchedulerImpl; TaskScheduler is just the trait that TaskSchedulerImpl implements.
taskScheduler.submitTasks(
new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
// Because we posted SparkListenerStageSubmitted earlier, we should post
// SparkListenerStageCompleted here in case there are no tasks to run.
outputCommitCoordinator.stageEnd(stage.id)
listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
logDebug("Stage " + stage + " is actually done; %b %d %d".format(
stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))
runningStages -= stage
}
}
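Since one task is created per partition to compute, a stage's parallelism is decided by the partitioning of its last RDD. A quick driver-side sketch (hypothetical path and numbers, assuming the sc available in spark-shell):
val data = sc.textFile("hdfs:///some/input", minPartitions = 8)   // hypothetical path
println(data.partitions.length)       // the shuffle map stage built on this RDD gets at least ~8 tasks

val grouped = data.map(line => (line.length, line)).groupByKey(numPartitions = 4)
println(grouped.partitions.length)    // the stage that computes this RDD gets 4 tasks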
// Following that taskScheduler.submitTasks() call takes us into TaskSchedulerImpl:
taskScheduler.submitTasks(
new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
/**
* The entry point through which the TaskScheduler accepts tasks.
*/
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
// A TaskSetManager is created for every TaskSet.
// The TaskSetManager is what later monitors and manages the execution of the tasks in its TaskSet (a configuration sketch follows this method).
val manager = createTaskSetManager(taskSet, maxTaskFailures)
// Then add it to the in-memory cache
activeTaskSets(taskSet.id) = manager
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
this.cancel()
}
}
}, STARVATION_TIMEOUT, STARVATION_TIMEOUT)
}
hasReceivedTask = true
}
// As covered in the SparkContext internals analysis, when the TaskScheduler is created a SparkDeploySchedulerBackend is created for the TaskSchedulerImpl; backend here refers to that previously created SparkDeploySchedulerBackend,
// which is also responsible for creating the AppClient that registers the Application with the master.
backend.reviveOffers()
}
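The TaskSetManager created above is handed to the scheduling pool built by schedulableBuilder; whether that pool serves TaskSets FIFO or fair-share is controlled by spark.scheduler.mode. A configuration sketch (the pool name is hypothetical):
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("SchedulingPoolExample")
  // FIFO is the default: TaskSets are served roughly in job/stage-id order.
  // FAIR builds a fair-scheduling pool hierarchy instead (optionally configured via an XML allocation file).
  .set("spark.scheduler.mode", "FAIR")
val sc = new SparkContext(conf)
// A job can be routed to a specific pool before its action is called:
sc.setLocalProperty("spark.scheduler.pool", "production")   // "production" is a hypothetical pool name
backend.reviveOffers() then hands control to the scheduler backend; in standalone mode that is SparkDeploySchedulerBackend (a CoarseGrainedSchedulerBackend), whose reviveOffers() simply sends a ReviveOffers message to the driver actor: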
override def reviveOffers() {
driverActor ! ReviveOffers
}
case ReviveOffers =>
makeOffers()
// Make fake resource offers on all executors
def makeOffers() {
// Step 1: call TaskSchedulerImpl.resourceOffers(), which runs the task allocation algorithm to assign tasks to executors.
// Step 2: once tasks are assigned to executors, call launchTasks(), which sends a LaunchTask message to each target executor so that the executor starts and runs the task.
// resourceOffers() is passed all of this Application's available executors, each wrapped as a WorkerOffer describing the number of CPU cores that executor has free (see the sketch after this method).
launchTasks(scheduler.resourceOffers(executorDataMap.map { case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
}.toSeq))
}
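A WorkerOffer is just a small value describing one executor's spare capacity. It is internal to Spark (private[spark]), so the following is only a sketch of the shape of the data flowing into resourceOffers(), with made-up executor ids and hosts:
// Roughly what the backend builds from executorDataMap: one offer per registered executor.
val offers = Seq(
  new WorkerOffer("executor-1", "worker-node-1", 4),   // 4 free cores
  new WorkerOffer("executor-2", "worker-node-2", 2)    // 2 free cores
)
// scheduler.resourceOffers(offers) then returns, per offer, the TaskDescriptions
// chosen to run on that executor, which launchTasks() ships out.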
/**
* Called by cluster manager to offer resources on slaves. We respond by asking our active task
* sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so
* that tasks are balanced across the cluster.
*/
def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
// Mark each slave as alive and remember its hostname
// Also track if new executor is added
var newExecAvail = false
for (o <- offers) {
executorIdToHost(o.executorId) = o.host
activeExecutorIds += o.executorId
if (!executorsByHost.contains(o.host)) {
executorsByHost(o.host) = new HashSet[String]()
executorAdded(o.executorId, o.host)
newExecAvail = true
}
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
}
// Randomly shuffle offers to avoid always placing tasks on the same set of workers.
// First shuffle the available executors, i.e. randomize their order to spread load as evenly as possible
val shuffledOffers = Random.shuffle(offers)
// Build a list of tasks to assign to each worker.
// Then build the per-WorkerOffer structures that the algorithm needs.
// The important one is tasks, effectively a two-dimensional structure: an ArrayBuffer whose elements are ArrayBuffers,
// where each inner ArrayBuffer is sized to the number of CPU cores available on that executor.
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
val availableCpus = shuffledOffers.map(o => o.cores).toArray
// Take the sorted TaskSets out of rootPool. As mentioned earlier, during TaskScheduler initialization,
// after TaskSchedulerImpl and SparkDeploySchedulerBackend are created, initialize() is called, and it creates a scheduling pool.
// Every submitted TaskSet first goes into that pool; when the task allocation algorithm runs, the TaskSets are taken out of the pool in sorted order.
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
taskSet.executorAdded()
}
}
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
// Here is the heart of the task allocation algorithm:
// a nested loop over every TaskSet and every locality level.
// Locality levels, from best to worst performance:
// PROCESS_LOCAL: the task runs in the same executor (same process) that holds the RDD partition, which is of course the fastest.
// NODE_LOCAL: the partition and the task are not in the same executor or process, but they are on the same worker node.
// NO_PREF: no locality preference.
// RACK_LOCAL: the partition and the task are at least on the same rack.
// ANY: any placement is acceptable.
// Each TaskSet is tried starting from the best locality level it supports (see the configuration sketch after this method).
var launchedTask = false
for (taskSet <- sortedTaskSets; maxLocality <- taskSet.myLocalityLevels) {
do {
// For the current TaskSet, try to launch its tasks on executors at the current locality level.
// If nothing can be launched, break out of this do-while loop and move on to the next, more relaxed locality level,
// and so on, until the TaskSet's tasks have all been launched on executors at some locality level.
launchedTask = resourceOfferSingleTaskSet(
taskSet, maxLocality, shuffledOffers, availableCpus, tasks)
} while (launchedTask)
}
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
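How long the scheduler waits at one locality level before relaxing to the next is governed by the spark.locality.wait settings; a configuration sketch (values in milliseconds for this Akka-era version of Spark; later versions also accept suffixes such as "3s"):
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("LocalityTuning")
  // Global wait before downgrading the locality level (default 3000 ms).
  .set("spark.locality.wait", "3000")
  // Per-level overrides, from PROCESS_LOCAL down to RACK_LOCAL:
  .set("spark.locality.wait.process", "3000")
  .set("spark.locality.wait.node", "3000")
  .set("spark.locality.wait.rack", "3000")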
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
shuffledOffers: Seq[WorkerOffer],
availableCpus: Array[Int],
tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = {
var launchedTask = false
// Iterate over all executors
for (i <- 0 until shuffledOffers.size) {
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
// If this executor has at least as many free CPU cores as one task needs (CPUS_PER_TASK, default 1; see the sketch after this method)
if (availableCpus(i) >= CPUS_PER_TASK) {
try {
// Call TaskSetManager.resourceOffer() to find which of this TaskSet's tasks can be launched
// on this executor at the current locality level,
// then iterate over those launchable tasks
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
// Add the task to the tasks two-dimensional structure, i.e. assign it to this executor
tasks(i) += task
// This is the essence of the task allocation algorithm:
// use locality levels to optimize task placement, preferring to launch each task at its best locality, and assign it to an executor.
// Record the assignment in the in-memory caches.
val tid = task.taskId
taskIdToTaskSetId(tid) = taskSet.taskSet.id
taskIdToExecutorId(tid) = execId
executorsByHost(host) += execId
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
launchedTask = true
}
} catch {
case e: TaskNotSerializableException =>
logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
// Do not offer resources for this task, but don't throw an error to allow other
// task sets to be submitted.
return launchedTask
}
}
}
return launchedTask
}
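CPUS_PER_TASK comes from spark.task.cpus (default 1); raising it lowers how many tasks one executor can run at the same time. A configuration sketch (whether spark.executor.cores applies depends on the cluster manager and version):
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("CpusPerTask")
  .set("spark.executor.cores", "4")   // each executor offers 4 cores
  .set("spark.task.cpus", "2")        // each task claims 2 cores -> at most 2 concurrent tasks per executor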
// Launch the assigned tasks on their executors
def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
// First serialize each task that an executor is going to run (a sketch of the broadcast fix for oversized tasks follows this method)
val ser = SparkEnv.get.closureSerializer.newInstance()
val serializedTask = ser.serialize(task)
if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
val taskSetId = scheduler.taskIdToTaskSetId(task.taskId)
scheduler.activeTaskSets.get(taskSetId).foreach { taskSet =>
try {
var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
"spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
"spark.akka.frameSize or using broadcast variables for large values."
msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
AkkaUtils.reservedSizeBytes)
taskSet.abort(msg)
} catch {
case e: Exception => logError("Exception in error callback", e)
}
}
}
else {
// Look up the metadata of the target executor
val executorData = executorDataMap(task.executorId)
// Deduct the CPU cores this task will use from the executor's free cores
executorData.freeCores -= scheduler.CPUS_PER_TASK
// Send a LaunchTask message to the executor so the task is started there
executorData.executorActor ! LaunchTask(new SerializableBuffer(serializedTask))
}
}
}
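The frame-size check above is why the usual fix for a "Serialized task ... exceeds max allowed" error is to broadcast large objects rather than capture them in the task closure. A sketch, assuming an existing SparkContext sc and an RDD[String] rdd (loadLookupTable() is a hypothetical helper that returns a large map):
// Anti-pattern: a large map captured in the closure gets serialized into every single task.
val bigLookup: Map[String, Int] = loadLookupTable()          // hypothetical helper
// rdd.map(x => bigLookup.getOrElse(x, 0))                   // every serialized task would carry the whole map

// Preferred: broadcast it once; each task then only carries the small broadcast handle.
val bcLookup = sc.broadcast(bigLookup)
val result = rdd.map(x => bcLookup.value.getOrElse(x, 0))
// If tasks are still too large, spark.akka.frameSize (in MB, for this Akka-based version) can be raised as a last resort.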