Job trigger flow:
// WordCount program
val lines = sc.textFile()
val words = lines.flatMap(line => line.split(" "))
val pairs = words.map(word => (word, 1))
// RDD.scala itself does not define reduceByKey, so calling reduceByKey() on an RDD triggers a Scala implicit conversion.
// Scala searches the enclosing scope for an implicit and finds rddToPairRDDFunctions() in RDD.scala,
// which wraps the RDD in a PairRDDFunctions (reduceByKey, groupByKey, etc. all live there).
// The reduceByKey() that actually runs is the one defined in PairRDDFunctions (a simplified sketch of that implicit follows this snippet).
val counts = pairs.reduceByKey(_ + _)
counts.foreach(count => println(count._1 + ": " + count._2))
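For reference, a simplified sketch of what that implicit conversion roughly looks like; this is not the verbatim Spark source (the real definition lives in RDD.scala and is wired up automatically), but it shows the shape of the wrapper:
import scala.reflect.ClassTag
import org.apache.spark.rdd.{PairRDDFunctions, RDD}

object PairRDDImplicitSketch {
  // Simplified sketch: wrap an RDD[(K, V)] in PairRDDFunctions, which is where
  // reduceByKey/groupByKey are defined; the compiler applies this implicitly.
  implicit def rddToPairRDDFunctions[K, V](rdd: RDD[(K, V)])
      (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K] = null): PairRDDFunctions[K, V] =
    new PairRDDFunctions(rdd)
}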
Source: SparkContext.scala
//First, the hadoopFile() call creates a HadoopRDD whose elements are (key, value) pairs:
//the key is the offset of each line in the HDFS/text file (LongWritable), the value is the text line (Text).
//map() is then called on the HadoopRDD to drop the key and keep only the value,
//yielding a MapPartitionsRDD whose elements are the individual text lines.
def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = {
assertNotStopped()
hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
minPartitions).map(pair => pair._2.toString).setName(path)
}
------------------------------------------------------------------------------------------
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
allowLocal: Boolean,
resultHandler: (Int, U) => Unit) {
if (stopped) {
throw new IllegalStateException("SparkContext has been shutdown")
}
val callSite = getCallSite
val cleanedFunc = clean(func)
logInfo("Starting job: " + callSite.shortForm)
if (conf.getBoolean("spark.logLineage", false)) {
logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
}
//Call runJob() on the DAGScheduler that was created when SparkContext was initialized
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal,
resultHandler, localProperties.get)
progressBar.foreach(_.finishAll())
rdd.doCheckpoint()
}
RDD.scala
def foreach(f: T => Unit) {
val cleanF = sc.clean(f)
sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
}
DAGScheduler stage division algorithm:
When you inspect the application's stages in the UI on port 4040, you will see that both stage 0 and stage 1 are attributed to reduceByKey. That is because reduceByKey involves three RDDs: the first belongs to stage 0 and the remaining two belong to stage 1.
Summary of the DAGScheduler stage division algorithm: it works backwards from the RDD that triggered the action (8). First it creates a stage (stage 1) for the final RDD (7). While walking backwards, whenever it finds a wide dependency on some RDD (5), it creates a new stage for that RDD, and that RDD (5) becomes the last RDD of the new stage. It keeps walking backwards like this, splitting stages according to narrow vs. wide dependencies, until every RDD has been visited.
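A quick way to check this yourself, assuming the wordcount RDDs defined above: print the final RDD's lineage; the indentation break introduced by the ShuffledRDD of reduceByKey is exactly where stage 0 ends and stage 1 begins.
// Illustrative: inspect the lineage of the wordcount job; each indented block
// in the printout corresponds to one stage, split at the shuffle boundary.
println(counts.toDebugString)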
DAGScheduler source analysis:
Entry point: from org.apache.spark.SparkContext#runJob (dagScheduler.runJob) —>
org.apache.spark.scheduler.DAGScheduler#runJob(submitJob) —>
org.apache.spark.scheduler.JobSubmitted$#onReceive(JobSubmitted) —> org.apache.spark.scheduler.DAGScheduler#handleJobSubmitted
private[scheduler] def handleJobSubmitted(jobId: Int,
finalRDD: RDD[_],
func: (TaskContext, Iterator[_]) => _,
partitions: Array[Int],
allowLocal: Boolean,
callSite: CallSite,
listener: JobListener,
properties: Properties = null)
{
//Step 1: create the finalStage from the last RDD of the job (the one the action was called on)
var finalStage: Stage = null
try {
// New stage creation may throw an exception if, for example, jobs are run on a
// HadoopRDD whose underlying HDFS files have been deleted.
//Create a Stage object and register it in DAGScheduler's internal in-memory caches
finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
} catch {
case e: Exception =>
logWarning("Creating new stage failed due to exception - job: " + jobId, e)
listener.jobFailed(e)
return
}
//Step 2: create a job from the finalStage; in other words, the job's last stage is our finalStage
if (finalStage != null) {
val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
clearCacheLocs()
logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
job.jobId, callSite.shortForm, partitions.length, allowLocal))
logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
logInfo("Parents of final stage: " + finalStage.parents)
logInfo("Missing parents: " + getMissingParentStages(finalStage))
val shouldRunLocally =
localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
val jobSubmissionTime = clock.getTimeMillis()
if (shouldRunLocally) {
// Compute very short actions like first() or take() with no parent stages locally.
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, Seq.empty, properties))
runLocally(job)
} else {
//Step 3: put the job into the in-memory caches
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.resultOfJob = Some(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
//Step 4: submit the finalStage via submitStage().
//This call actually ends up submitting the first stage,
//and leaves all of the other stages queued in waitingStages.
submitStage(finalStage)
}
}
//Submit any waiting stages
submitWaitingStages()
}
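As the shouldRunLocally branch above suggests, only actions whose final stage has no parents and touches a single partition take the runLocally path; a couple of illustrative examples, reusing the lines RDD from the wordcount program:
// Illustrative: short actions like these can be computed directly on the driver
// when the final stage has no parent stages and only one partition is needed.
val firstLine = lines.first()
val firstTen  = lines.take(10)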
-----------------------------------------------------------------------------------------------
//Method that submits a stage; it is also the entry point of the stage division algorithm.
//Note, however, that the stage division algorithm is made up of submitStage() together with getMissingParentStages() (an abridged sketch of getMissingParentStages follows this method).
/** Submits stage, but first recursively submits any missing parents. */
private def submitStage(stage: Stage) {
val jobId = activeJobForStage(stage)
if (jobId.isDefined) {
logDebug("submitStage(" + stage + ")")
if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
//Call getMissingParentStages() to get this stage's missing parent stages
val missing = getMissingParentStages(stage).sortBy(_.id)
logDebug("missing: " + missing)
//This ends up recursing repeatedly until it reaches the very first stage, which has no parent stages.
//At that point that first stage, stage 0, is submitted first,
//while all of the other stages are sitting in waitingStages.
if (missing == Nil) {
logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
submitMissingTasks(stage, jobId.get)
} else {
//Recursively call submitStage() to submit the parent stages first.
//This recursion is what drives the stage division algorithm; it is its essence.
for (parent <- missing) {
submitStage(parent)
}
//and put the current stage into waitingStages, the queue of stages waiting to run
waitingStages += stage
}
}
} else {
abortStage(stage, "No active job for stage " + stage.id)
}
}
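The submitStage() above leans on getMissingParentStages(), whose source is not reproduced in these notes. An abridged sketch of its shape (simplified, reusing the DAGScheduler-internal names that appear in the surrounding excerpts; consult the real DAGScheduler.scala for details):
//Abridged sketch, not the verbatim source: walk the stage's RDD lineage; for every
//ShuffleDependency whose map stage has not yet produced all of its outputs, record
//that map stage as a missing parent; narrow dependencies are simply followed backwards.
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missing = new HashSet[Stage]
  val visited = new HashSet[RDD[_]]
  val waitingForVisit = new Stack[RDD[_]]
  def visit(rdd: RDD[_]) {
    if (!visited(rdd)) {
      visited += rdd
      for (dep <- rdd.dependencies) {
        dep match {
          case shufDep: ShuffleDependency[_, _, _] =>
            //Wide dependency: the parent becomes (or already is) a separate shuffle map stage
            val mapStage = getShuffleMapStage(shufDep, stage.jobId)
            if (!mapStage.isAvailable) {
              missing += mapStage
            }
          case narrowDep: NarrowDependency[_] =>
            //Narrow dependency: stay in the same stage and keep walking backwards
            waitingForVisit.push(narrowDep.rdd)
        }
      }
    }
  }
  waitingForVisit.push(stage.rdd)
  while (!waitingForVisit.isEmpty) {
    visit(waitingForVisit.pop())
  }
  missing.toList
}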
=======================================================
/**
* Return true if one of stage's ancestors is target.
*/
private def stageDependsOn(stage: Stage, target: Stage): Boolean = {
if (stage == target) {
return true
}
val visitedRdds = new HashSet[RDD[_]]
val visitedStages = new HashSet[Stage]
// We are manually maintaining a stack here to prevent StackOverflowError
// caused by recursively visiting
val waitingForVisit = new Stack[RDD[_]]
def visit(rdd: RDD[_]) {
if (!visitedRdds(rdd)) {
visitedRdds += rdd
//Iterate over the RDD's dependencies.
//Each shuffle operation (e.g. groupByKey, reduceByKey, countByKey)
//actually corresponds to three RDDs under the hood: MapPartitionsRDD, ShuffledRDD, MapPartitionsRDD.
for (dep <- rdd.dependencies) {
dep match {
//If it is a wide (shuffle) dependency
case shufDep: ShuffleDependency[_, _, _] =>
//then get (or create) a stage for the RDD on the wide-dependency side, with isShuffleMap set to true.
//By default the last stage is not a shuffle map stage,
//but every stage before the finalStage is a shuffle map stage.
val mapStage = getShuffleMapStage(shufDep, stage.jobId)
if (!mapStage.isAvailable) {
visitedStages += mapStage
waitingForVisit.push(mapStage.rdd)
} // Otherwise there's no need to follow the dependency back
//If it is a narrow dependency, push the dependent RDD onto the stack
case narrowDep: NarrowDependency[_] =>
waitingForVisit.push(narrowDep.rdd)
}
}
}
}
//First push the stage's last RDD onto the stack
waitingForVisit.push(stage.rdd)
//Then loop while the stack is not empty
while (!waitingForVisit.isEmpty) {
//Pop the next RDD (starting from the stage's last RDD) and apply the locally defined visit() method to it
visit(waitingForVisit.pop())
}
visitedRdds.contains(target.rdd)
}
=======================================================
//Get the parent (shuffle map) stage for a wide dependency. The idea is:
//for a stage, as long as its last RDD's dependencies are all narrow, no new stage is created;
//but as soon as one of the stage's RDDs is found to depend widely on some RDD,
//a new stage is created with that wide-dependency RDD as its last RDD, and the stage is returned immediately.
private def getShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _], jobId: Int): Stage = {
shuffleToMapStage.get(shuffleDep.shuffleId) match {
case Some(stage) => stage
case None =>
// We are going to register ancestor shuffle dependencies
registerShuffleDependencies(shuffleDep, jobId)
// Then register current shuffleDep
val stage =
newOrUsedStage(
shuffleDep.rdd, shuffleDep.rdd.partitions.size, shuffleDep, jobId,
shuffleDep.rdd.creationSite)
shuffleToMapStage(shuffleDep.shuffleId) = stage
stage
}
}
=======================================================
Stage division algorithm summary:
1. Work backwards from the finalStage.
2. Split off a new stage at every wide (shuffle) dependency.
3. Use recursion so that parent stages are submitted first.
The stage division algorithm matters a great deal. You must understand it clearly enough to know how many jobs your own Spark application is split into, how many stages each job is split into, and which of your code each stage covers.
Only when you know which of your code falls into each stage can you, in production, go after the code behind a stage that runs particularly slowly or keeps failing,
and troubleshoot it or tune its performance. A small aid for mapping stages back to code is sketched below.
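One illustrative aid, reusing sc and counts from the wordcount example above: give jobs and RDDs explicit names, so a slow stage in the port-4040 UI is easier to trace back to the code that produced it.
// Illustrative only: named job groups and RDD names show up in the web UI.
sc.setJobGroup("wordcount", "count words in the input file")
counts.setName("word counts")
counts.foreach(count => println(count._1 + ": " + count._2))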
=======================================================
//Submit a stage: create a batch of tasks for it, one task per partition that needs computing
/** Called when stage's parents are available and we can now do its task. */
private def submitMissingTasks(stage: Stage, jobId: Int) {
logDebug("submitMissingTasks(" + stage + ")")
// Get our pending tasks and remember them in our pendingTasks entry
stage.pendingTasks.clear()
//Figure out how many tasks need to be created (i.e. which partitions still need to be computed)
// First figure out the indexes of partition ids to compute.
val partitionsToCompute: Seq[Int] = {
if (stage.isShuffleMap) {
(0 until stage.numPartitions).filter(id => stage.outputLocs(id) == Nil)
} else {
val job = stage.resultOfJob.get
(0 until job.numPartitions).filter(id => !job.finished(id))
}
}
val properties = if (jobIdToActiveJob.contains(jobId)) {
jobIdToActiveJob(stage.jobId).properties
} else {
// this stage will be assigned to "default" pool
null
}
//Add the stage to the runningStages set
runningStages += stage
// SparkListenerStageSubmitted should be posted before testing whether tasks are
// serializable. If tasks are not serializable, a SparkListenerStageCompleted event
// will be posted, which should always come after a corresponding SparkListenerStageSubmitted
// event.
stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size))
outputCommitCoordinator.stageStart(stage.id)
listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
// TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
// Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
// the serialized copy of the RDD and for each task we will deserialize it, which means each
// task gets a different copy of the RDD. This provides stronger isolation between tasks that
// might modify state of objects referenced in their closures. This is necessary in Hadoop
// where the JobConf/Configuration object is not thread-safe.
var taskBinary: Broadcast[Array[Byte]] = null
try {
// For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
// For ResultTask, serialize and broadcast (rdd, func).
val taskBinaryBytes: Array[Byte] =
if (stage.isShuffleMap) {
closureSerializer.serialize((stage.rdd, stage.shuffleDep.get) : AnyRef).array()
} else {
closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func) : AnyRef).array()
}
taskBinary = sc.broadcast(taskBinaryBytes)
} catch {
// In the case of a failure during serialization, abort the stage.
case e: NotSerializableException =>
abortStage(stage, "Task not serializable: " + e.toString)
runningStages -= stage
return
case NonFatal(e) =>
abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
runningStages -= stage
return
}
//Create the required number of tasks for the stage.
//A key point here is the algorithm that computes each task's preferred locations.
val tasks: Seq[Task[_]] = if (stage.isShuffleMap) {
partitionsToCompute.map { id =>
val locs = getPreferredLocs(stage.rdd, id)
val part = stage.rdd.partitions(id)
//For every stage other than the finalStage, isShuffleMap is true,
//so ShuffleMapTasks are created.
new ShuffleMapTask(stage.id, taskBinary, part, locs)
}
} else {
//If it is not a shuffle map stage, it is the finalStage,
//and the finalStage creates ResultTasks.
val job = stage.resultOfJob.get
partitionsToCompute.map { id =>
val p: Int = job.partitions(id)
val part = stage.rdd.partitions(p)
val locs = getPreferredLocs(stage.rdd, p)
new ResultTask(stage.id, taskBinary, part, locs, id)
}
}
if (tasks.size > 0) {
logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
stage.pendingTasks ++= tasks
logDebug("New pending tasks: " + stage.pendingTasks)
//Finally, wrap the stage's tasks in a TaskSet and submit it via TaskScheduler.submitTasks().
//In standalone mode the implementation is TaskSchedulerImpl; TaskScheduler itself is just a trait (similar to a Java interface).
taskScheduler.submitTasks(
new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
// Because we posted SparkListenerStageSubmitted earlier, we should post
// SparkListenerStageCompleted here in case there are no tasks to run.
outputCommitCoordinator.stageEnd(stage.id)
listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
logDebug("Stage " + stage + " is actually done; %b %d %d".format(
stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))
runningStages -= stage
}
}
=======================================================
//Compute the preferred locations for the partition each task will work on.
//Starting from the stage's last RDD, look for an RDD whose partition has been cached or checkpointed;
//the task's preferred locations are then the locations of that cached/checkpointed partition,
//because running the task on that node means the upstream RDDs do not have to be recomputed.
private def getPreferredLocsInternal(
rdd: RDD[_],
partition: Int,
visited: HashSet[(RDD[_],Int)])
: Seq[TaskLocation] =
{
// If the partition has already been visited, no need to re-visit.
// This avoids exponential path exploration. SPARK-695
if (!visited.add((rdd,partition))) {
// Nil has already been returned for previously visited partitions.
return Nil
}
//Check whether the current RDD's partition is cached
// If the partition is cached, return the cache locations
val cached = getCacheLocs(rdd)(partition)
if (!cached.isEmpty) {
return cached
}
//Check whether the RDD has placement preferences for this partition (e.g. a checkpointed RDD, or an input RDD's HDFS block locations)
// If the RDD has some placement preferences (as is the case for input RDDs), get those
val rddPrefs = rdd.preferredLocations(rdd.partitions(partition)).toList
if (!rddPrefs.isEmpty) {
return rddPrefs.map(TaskLocation(_))
}
// If the RDD has narrow dependencies, pick the first partition of the first narrow dep
// that has any placement preferences. Ideally we would choose based on transfer sizes,
// but this will do for now.
//Finally, recurse into the RDD's parents (through narrow dependencies) to see whether the corresponding parent partitions are cached or checkpointed
rdd.dependencies.foreach {
case n: NarrowDependency[_] =>
for (inPart <- n.getParents(partition)) {
val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
if (locs != Nil) {
return locs
}
}
case _ =>
}
//If, from the stage's last RDD all the way back to the first, no partition is cached or checkpointed,
//then the task has no preferred locations and Nil is returned.
Nil
}
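To illustrate the effect, an assumption-laden sketch that reuses the pairs RDD from the wordcount example: persisting an intermediate RDD gives its partitions cache locations, which this method picks up, so downstream tasks prefer the executors already holding those blocks.
import org.apache.spark.storage.StorageLevel

// Illustrative: once pairs is persisted and materialized, getCacheLocs(rdd)(partition)
// returns the executors holding its blocks, and tasks over it prefer those executors.
val cachedPairs = pairs.persist(StorageLevel.MEMORY_ONLY)
cachedPairs.count()   // materializes the cache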
TaskScheduler source analysis:
The entry point is org.apache.spark.scheduler.TaskSchedulerImpl#submitTasks, which is called from submitMissingTasks above;
submitTasks -> backend.reviveOffers() —> org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.ReviveOffers$ -> makeOffers() -> launchTasks
override def submitTasks(taskSet: TaskSet) {
val tasks = taskSet.tasks
logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
this.synchronized {
//A TaskSetManager is created for every TaskSet.
//The TaskSetManager is responsible for monitoring and managing the execution of the tasks in its TaskSet.
val manager = createTaskSetManager(taskSet, maxTaskFailures)
//Then register it in the in-memory caches
activeTaskSets(taskSet.id) = manager
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
if (!isLocal && !hasReceivedTask) {
starvationTimer.scheduleAtFixedRate(new TimerTask() {
override def run() {
if (!hasLaunchedTask) {
logWarning("Initial job has not accepted any resources; " +
"check your cluster UI to ensure that workers are registered " +
"and have sufficient resources")
} else {
this.cancel()
}
}
}, STARVATION_TIMEOUT, STARVATION_TIMEOUT)
}
hasReceivedTask = true
}
//When SparkContext is initialized and the TaskScheduler is created, one very important step is
//creating a SparkDeploySchedulerBackend for the TaskSchedulerImpl.
//That backend is responsible for creating the AppClient and registering the Application with the Master.
backend.reviveOffers()
}
org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.DriverActor#makeOffers and #launchTasks
def makeOffers() {
//Step 1: call TaskSchedulerImpl's resourceOffers() method to run the task assignment algorithm and assign the tasks to executors.
//Step 2: once tasks have been assigned to executors, call this class's own launchTasks() method,
//which sends a LaunchTask message for each assigned task to the corresponding executor, so the executor starts and runs it.
//What is passed into resourceOffers()?
//All of the executors currently available to this Application, each wrapped in a WorkerOffer;
//every WorkerOffer represents the number of free CPU cores on one executor.
launchTasks(scheduler.resourceOffers(executorDataMap.map { case (id, executorData) =>
new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
}.toSeq))
}
Launch the corresponding tasks on the executors according to the assignments made above
def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
for (task <- tasks.flatten) {
//First serialize the task description that each executor is going to run
val ser = SparkEnv.get.closureSerializer.newInstance()
val serializedTask = ser.serialize(task)
if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
val taskSetId = scheduler.taskIdToTaskSetId(task.taskId)
scheduler.activeTaskSets.get(taskSetId).foreach { taskSet =>
try {
var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
"spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
"spark.akka.frameSize or using broadcast variables for large values."
msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
AkkaUtils.reservedSizeBytes)
taskSet.abort(msg)
} catch {
case e: Exception => logError("Exception in error callback", e)
}
}
}
else {
//Look up the corresponding executor
val executorData = executorDataMap(task.executorId)
//Subtract the CPU cores this task will use from the executor's free cores
executorData.freeCores -= scheduler.CPUS_PER_TASK
//Send a LaunchTask message to the executor to start the task there
executorData.executorActor ! LaunchTask(new SerializableBuffer(serializedTask))
}
}
}
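The oversized-task branch above points at two remedies. An illustrative sketch (the config key is a standard Spark 1.x setting, the value and lookup table are made up, and an existing SparkContext sc is assumed):
import org.apache.spark.SparkConf

// Illustrative: either raise the Akka frame size (value in MB) ...
val conf = new SparkConf().set("spark.akka.frameSize", "128")
// ... or, preferably, ship large values through a broadcast variable instead of
// capturing them in the task closure, so the serialized task stays small.
val bigLookupTable = Map("spark" -> 1, "scala" -> 2)   // stand-in for a large value
val lookup = sc.broadcast(bigLookupTable)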
org.apache.spark.scheduler.TaskSchedulerImpl#resourceOffers
def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
// Mark each slave as alive and remember its hostname
// Also track if new executor is added
var newExecAvail = false
for (o <- offers) {
executorIdToHost(o.executorId) = o.host
activeExecutorIds += o.executorId
if (!executorsByHost.contains(o.host)) {
executorsByHost(o.host) = new HashSet[String]()
executorAdded(o.executorId, o.host)
newExecAvail = true
}
for (rack <- getRackForHost(o.host)) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
}
}
// Randomly shuffle offers to avoid always placing tasks on the same set of workers.
//First shuffle the available executors, i.e. randomize their order, so that task placement is spread out for better load balancing
val shuffledOffers = Random.shuffle(offers)
// Build a list of tasks to assign to each worker.
//Then build the bookkeeping structures needed for the WorkerOffers,
//e.g. tasks, which can be thought of as a two-dimensional array: an ArrayBuffer whose elements are themselves ArrayBuffers,
//where each inner ArrayBuffer is sized to the number of CPU cores available on that executor.
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
val availableCpus = shuffledOffers.map(o => o.cores).toArray
//This is important: take the sorted TaskSets out of the rootPool.
//Back when the TaskScheduler was initialized, after TaskSchedulerImpl and SparkDeploySchedulerBackend were created,
//an initialize() method was executed, and that method created a scheduling pool.
//In effect, every submitted TaskSet is first placed into this pool,
//and when the task assignment algorithm runs, the queued TaskSets are taken back out of it in sorted order.
val sortedTaskSets = rootPool.getSortedTaskSetQueue
for (taskSet <- sortedTaskSets) {
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
taskSet.parent.name, taskSet.name, taskSet.runningTasks))
if (newExecAvail) {
taskSet.executorAdded()
}
}
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
// of locality levels so that it gets a chance to launch local tasks on all of them.
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
//This is the core of the task assignment algorithm:
//a double for loop over all TaskSets and over every locality level.
//Locality levels: PROCESS_LOCAL (process-local: the RDD partition and the task are in the same executor/JVM, the fastest case),
//NODE_LOCAL (the RDD partition and the task are not in the same executor/process, but are on the same worker node),
//NO_PREF (no locality preference),
//RACK_LOCAL (rack-local: the RDD partition and the task are at least in the same rack),
//ANY (any locality level).
//For each TaskSet, iterate starting from the best locality level.
var launchedTask = false
for (taskSet <- sortedTaskSets; maxLocality <- taskSet.myLocalityLevels) {
do {
//For the current TaskSet, first try to launch its tasks on executors at the tightest (best) locality level.
//If nothing can be launched, break out of this do-while loop and move on to the next, looser locality level,
//and so on, until the TaskSet's tasks have all been launched on executors at some locality level.
launchedTask = resourceOfferSingleTaskSet(
taskSet, maxLocality, shuffledOffers, availableCpus, tasks)
} while (launchedTask)
}
if (tasks.size > 0) {
hasLaunchedTask = true
}
return tasks
}
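The locality levels iterated above are relaxed over time, governed by wait settings. An illustrative configuration sketch (the keys are standard Spark settings; the millisecond values are just examples):
import org.apache.spark.SparkConf

// Illustrative: how long the scheduler waits at each locality level (values in ms)
// before falling back to the next, looser level.
val conf = new SparkConf()
  .set("spark.locality.wait", "3000")          // shared default for the levels below
  .set("spark.locality.wait.process", "3000")  // PROCESS_LOCAL
  .set("spark.locality.wait.node", "3000")     // NODE_LOCAL
  .set("spark.locality.wait.rack", "3000")     // RACK_LOCAL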
private def resourceOfferSingleTaskSet(
taskSet: TaskSetManager,
maxLocality: TaskLocality,
shuffledOffers: Seq[WorkerOffer],
availableCpus: Array[Int],
tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = {
var launchedTask = false
//Iterate over all executors (the shuffled offers)
for (i <- 0 until shuffledOffers.size) {
val execId = shuffledOffers(i).executorId
val host = shuffledOffers(i).host
//If this executor still has at least as many free CPU cores as one task needs (CPUS_PER_TASK, 1 by default)
if (availableCpus(i) >= CPUS_PER_TASK) {
try {
//Call TaskSetManager.resourceOffer() to find out which task of this TaskSet, if any,
//can be launched on this executor at this locality level,
//and iterate over the task(s) that can indeed be launched on this executor at the current locality level.
for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
//Put the task into the tasks two-dimensional structure, i.e. assign it to this executor
tasks(i) += task
//Up to this point, this is the implementation of the task assignment algorithm:
//we use the locality-level model to optimize where tasks are assigned and launched, preferring the best locality,
//and then hand the task to an executor.
//Record the assignment in the in-memory bookkeeping maps.
val tid = task.taskId
taskIdToTaskSetId(tid) = taskSet.taskSet.id
taskIdToExecutorId(tid) = execId
executorsByHost(host) += execId
availableCpus(i) -= CPUS_PER_TASK
assert(availableCpus(i) >= 0)
//Mark that at least one task was launched
launchedTask = true
}
} catch {
case e: TaskNotSerializableException =>
logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
// Do not offer resources for this task, but don't throw an error to allow other
// task sets to be submitted.
return launchedTask
}
}
}
return launchedTask
}
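For reference, the CPUS_PER_TASK used above is read from a configuration setting; an illustrative sketch (the default is 1):
import org.apache.spark.SparkConf

// Illustrative: with spark.task.cpus = 1 (the default), an executor offering
// freeCores = 4 can be assigned up to 4 tasks in one scheduling round.
val conf = new SparkConf().set("spark.task.cpus", "1")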
TaskSetManager: schedules the tasks of a single TaskSet within TaskSchedulerImpl. This class tracks every task, retries failed tasks until the retry limit is exceeded, and handles locality-aware scheduling for its TaskSet via delay scheduling. Its main interface is resourceOffer: through it the TaskSet asks to run a task on a given node, and by receiving task status-change messages it learns when the state of one of its tasks has changed.
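The retry limit mentioned above is configurable; an illustrative sketch (key and default as commonly documented for this Spark line):
import org.apache.spark.SparkConf

// Illustrative: maxTaskFailures passed to createTaskSetManager comes from this setting;
// a task that fails more than this many times causes the whole job to be aborted
// (default is 4 in cluster mode).
val conf = new SparkConf().set("spark.task.maxFailures", "4")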