Pay attention to the lines marked with "// <-- execution flow" comments in the code below: they trace the execution flow. Not every communication point is marked, simply because there is far too much communication going on.
DAGScheduler:
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    allowLocal: Boolean,
    callSite: CallSite,
    listener: JobListener,
    properties: Properties = null)
{
  var finalStage: Stage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }
  if (finalStage != null) {
    val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)  // <-- execution flow
    clearCacheLocs()
    logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
      job.jobId, callSite.shortForm, partitions.length, allowLocal))
    logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
    logInfo("Parents of final stage: " + finalStage.parents)
    logInfo("Missing parents: " + getMissingParentStages(finalStage))
    val shouldRunLocally =
      localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
    if (shouldRunLocally) {
      // Compute very short actions like first() or take() with no parent stages locally.
      listenerBus.post(SparkListenerJobStart(job.jobId, Array[Int](), properties))
      runLocally(job)
    } else {
      jobIdToActiveJob(jobId) = job
      activeJobs += job
      finalStage.resultOfJob = Some(job)
      listenerBus.post(SparkListenerJobStart(job.jobId, jobIdToStageIds(jobId).toArray, properties))
      submitStage(finalStage)  // <-- execution flow
    }
  }
  submitWaitingStages()
}

Next:
/** Submits stage, but first recursively submits any missing parents. */
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id)  // worth comparing this method with submitStage
      logDebug("missing: " + missing)
      if (missing == Nil) {  // if this condition holds, the stage has no missing parents, i.e. it is the first stage that can run
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)  // <-- execution flow
      } else {
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id)
  }
}

Next: this method turns a stage into tasks. The tasks run in parallel, and the number of tasks is determined by the number of the stage's partitions (a small illustration follows the method below).
private def submitMissingTasks(stage: Stage, jobId: Int) {
  logDebug("submitMissingTasks(" + stage + ")")
  // Get our pending tasks and remember them in our pendingTasks entry
  stage.pendingTasks.clear()  // why clear the pending tasks here? this submission may have been triggered because
                              // only some partitions failed, so only the affected partitions need to be run again
  // First figure out the indexes of partition ids to compute.
  val partitionsToCompute: Seq[Int] = {  // <-- execution flow; shuffle output is written to separate files
    if (stage.isShuffleMap) {
      (0 until stage.numPartitions).filter(id => stage.outputLocs(id) == Nil)  // the partitions that have not been processed yet
    } else {
      val job = stage.resultOfJob.get
      (0 until job.numPartitions).filter(id => !job.finished(id))
    }
  }

  val properties = if (jobIdToActiveJob.contains(jobId)) {
    jobIdToActiveJob(stage.jobId).properties
  } else {
    // this stage will be assigned to "default" pool
    null
  }

  runningStages += stage
  // SparkListenerStageSubmitted should be posted before testing whether tasks are
  // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
  // will be posted, which should always come after a corresponding SparkListenerStageSubmitted
  // event.
  stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size))
  listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

  // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
  // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
  // the serialized copy of the RDD and for each task we will deserialize it, which means each
  // task gets a different copy of the RDD. This provides stronger isolation between tasks that
  // might modify state of objects referenced in their closures. This is necessary in Hadoop
  // where the JobConf/Configuration object is not thread-safe.
  var taskBinary: Broadcast[Array[Byte]] = null  // <-- execution flow; serialize the stage
  try {
    // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
    // For ResultTask, serialize and broadcast (rdd, func).
    val taskBinaryBytes: Array[Byte] =
      if (stage.isShuffleMap) {
        closureSerializer.serialize((stage.rdd, stage.shuffleDep.get) : AnyRef).array()  // a broadcast happens for every shuffle, but this is not the task that actually runs
      } else {
        closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func) : AnyRef).array()
      }
    taskBinary = sc.broadcast(taskBinaryBytes)
  } catch {
    // In the case of a failure during serialization, abort the stage.
    case e: NotSerializableException =>
      abortStage(stage, "Task not serializable: " + e.toString)
      runningStages -= stage
      return
    case NonFatal(e) =>
      abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
      runningStages -= stage
      return
  }

  val tasks: Seq[Task[_]] = if (stage.isShuffleMap) {  // <-- execution flow
    partitionsToCompute.map { id =>
      val locs = getPreferredLocs(stage.rdd, id)
      val part = stage.rdd.partitions(id)
      new ShuffleMapTask(stage.id, taskBinary, part, locs)  // one ShuffleMapTask per partition, each with its own location info;
                                                            // there are only two kinds of task: ResultTask and ShuffleMapTask
    }
  } else {
    val job = stage.resultOfJob.get
    partitionsToCompute.map { id =>
      val p: Int = job.partitions(id)
      val part = stage.rdd.partitions(p)
      val locs = getPreferredLocs(stage.rdd, p)
      new ResultTask(stage.id, taskBinary, part, locs, id)
    }
  }

  if (tasks.size > 0) {
    // Preemptively serialize a task to make sure it can be serialized. We are catching this
    // exception here because it would be fairly hard to catch the non-serializable exception
    // down the road, where we have several different implementations for local scheduler and
    // cluster schedulers.
    //
    // We've already serialized RDDs and closures in taskBinary, but here we check for all other
    // objects such as Partition.
    try {
      closureSerializer.serialize(tasks.head)
    } catch {
      case e: NotSerializableException =>
        abortStage(stage, "Task not serializable: " + e.toString)
        runningStages -= stage
        return
      case NonFatal(e) =>
        // Other exceptions, such as IllegalArgumentException from Kryo.
        abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
        runningStages -= stage
        return
    }

    logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
    stage.pendingTasks ++= tasks  // <-- execution flow; these are the tasks that will actually run, in parallel
    logDebug("New pending tasks: " + stage.pendingTasks)
    taskScheduler.submitTasks(
      new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))  // <-- execution flow; control now moves into TaskSchedulerImpl
    stage.latestInfo.submissionTime = Some(clock.getTime())
  } else {
    // Because we posted SparkListenerStageSubmitted earlier, we should post
    // SparkListenerStageCompleted here in case there are no tasks to run.
    listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
    logDebug("Stage " + stage + " is actually done; %b %d %d".format(
      stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))
    runningStages -= stage
  }
}
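To make the partition-to-task relationship concrete, here is a small, hypothetical driver-side program (the SparkConf settings and the numSlices value are purely illustrative): an RDD with 4 partitions and no parent stages yields exactly 4 tasks in submitMissingTasks when count() is run.

import org.apache.spark.{SparkConf, SparkContext}

object PartitionTaskDemo {
  def main(args: Array[String]): Unit = {
    // Hypothetical local run, just to illustrate the partition-to-task mapping.
    val sc = new SparkContext(
      new SparkConf().setAppName("partition-task-demo").setMaster("local[4]"))
    // 4 partitions -> the single result stage of count() is split into 4 ResultTasks.
    val rdd = sc.parallelize(1 to 100, numSlices = 4)
    println(rdd.count())   // one task per partition, all running in parallel
    sc.stop()
  }
}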
Next:

override def submitTasks(taskSet: TaskSet) {
  val tasks = taskSet.tasks  // an array of tasks; every task in it runs the same logic, only the data location differs
  logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
  this.synchronized {
    val manager = new TaskSetManager(this, taskSet, maxTaskFailures)
    activeTaskSets(taskSet.id) = manager
    schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
  }
  backend.reviveOffers()  // <-- execution flow
}

Next:
// Make fake resource offers on all executors
def makeOffers() {
  launchTasks(scheduler.resourceOffers(executorDataMap.map { case (id, executorData) =>  // `scheduler` here is the TaskSchedulerImpl
    new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
  }.toSeq))
}
/**
 * Called by cluster manager to offer resources on slaves. We respond by asking our active task
 * sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so
 * that tasks are balanced across the cluster.
 */
def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {  // <-- execution flow; it returns TaskDescription objects; what those are is shown below
  // Mark each slave as alive and remember its hostname
  // Also track if new executor is added
  var newExecAvail = false
  for (o <- offers) {
    executorIdToHost(o.executorId) = o.host
    if (!executorsByHost.contains(o.host)) {
      executorsByHost(o.host) = new HashSet[String]()
      executorAdded(o.executorId, o.host)
      newExecAvail = true
    }
    for (rack <- getRackForHost(o.host)) {
      hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
    }
  }
  // ... (the rest of resourceOffers, which builds and returns the per-executor lists of TaskDescriptions, is omitted here)
}
Next:
private[spark] class TaskDescription(
    val taskId: Long,
    val executorId: String,
    val name: String,
    val index: Int,    // Index within this task's TaskSet
    _serializedTask: ByteBuffer)  // <-- execution flow; a ByteBuffer, i.e. an already-serialized object
  extends Serializable {

  // Because ByteBuffers are not serializable, wrap the task in a SerializableBuffer
  private val buffer = new SerializableBuffer(_serializedTask)

  def serializedTask: ByteBuffer = buffer.value

  override def toString: String = "TaskDescription(TID=%d, index=%d)".format(taskId, index)
}
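The comment above is the whole reason SerializableBuffer exists: java.nio.ByteBuffer does not implement java.io.Serializable. A tiny, self-contained check (plain Java serialization, not Spark's serializer; the object name is made up) makes the problem visible:

import java.io.{ByteArrayOutputStream, ObjectOutputStream}
import java.nio.ByteBuffer

object ByteBufferNotSerializable {
  def main(args: Array[String]): Unit = {
    val buf = ByteBuffer.wrap(Array[Byte](1, 2, 3))
    val out = new ObjectOutputStream(new ByteArrayOutputStream())
    // Trying to serialize the ByteBuffer directly fails, which is why Spark
    // wraps it in SerializableBuffer (that class writes the raw bytes itself).
    try out.writeObject(buf)
    catch { case e: java.io.NotSerializableException => println("as expected: " + e) }
  }
}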
// Launch tasks returned by a set of resource offers
def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
  for (task <- tasks.flatten) {
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val serializedTask = ser.serialize(task)
    if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
      val taskSetId = scheduler.taskIdToTaskSetId(task.taskId)
      scheduler.activeTaskSets.get(taskSetId).foreach { taskSet =>
        try {
          var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
            "spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
            "spark.akka.frameSize or using broadcast variables for large values."
          msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
            AkkaUtils.reservedSizeBytes)
          taskSet.abort(msg)
        } catch {
          case e: Exception => logError("Exception in error callback", e)
        }
      }
    } else {
      val executorData = executorDataMap(task.executorId)
      executorData.freeCores -= scheduler.CPUS_PER_TASK
      executorData.executorActor ! LaunchTask(new SerializableBuffer(serializedTask))  // <-- execution flow; dispatch the task to the executor
    }
  }
}

As for how the task runs inside the executor, in one sentence: deserialize the RDD and compute (a rough sketch follows).
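To make that one sentence a little more tangible, here is a much-simplified sketch of the executor-side idea: plain Java serialization stands in for Spark's closure serializer, and a local Seq stands in for a real partition iterator (ExecutorSideSketch and its helpers are invented names, not Spark code).

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

object ExecutorSideSketch {
  def serialize(obj: AnyRef): Array[Byte] = {
    val bytes = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(bytes)
    out.writeObject(obj)
    out.close()
    bytes.toByteArray
  }

  def deserialize[T](bytes: Array[Byte]): T = {
    val in = new ObjectInputStream(new ByteArrayInputStream(bytes))
    in.readObject().asInstanceOf[T]
  }

  def main(args: Array[String]): Unit = {
    // "Driver" side: serialize the function once (this mirrors the broadcast taskBinary).
    val taskBinary = serialize((it: Iterator[Int]) => it.sum)
    // "Executor" side: deserialize it and run it against this task's own partition data.
    val func = deserialize[Iterator[Int] => Int](taskBinary)
    println(func(Seq(1, 2, 3, 4).iterator))   // prints 10
  }
}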
When a task finishes running:
A task is executed by a TaskRunner, which communicates with the driver through the ExecutorBackend; the message used for this is StatusUpdate (a simplified sketch of the handshake follows the list):
1. Before the task runs, it tells the driver that the task's state is TaskState.RUNNING.
2. After the task has run, it tells the driver that the task's state is TaskState.FINISHED and returns the computed result.
3. If an error occurs while the task is running, it tells the driver that the task's state is TaskState.FAILED and returns the reason for the failure.
4. If the task is killed partway through, it tells the driver that the task's state is TaskState.KILLED.
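The sketch below models the handshake described above in a few lines; TaskRunnerSketch, ExecutorBackendLike and the body() callback are invented names for illustration, not the real Executor/TaskRunner classes.

object TaskState extends Enumeration {
  val RUNNING, FINISHED, FAILED, KILLED = Value
}

trait ExecutorBackendLike {
  // Stand-in for the StatusUpdate message the real ExecutorBackend sends to the driver.
  def statusUpdate(taskId: Long, state: TaskState.Value, data: Array[Byte]): Unit
}

class TaskRunnerSketch(backend: ExecutorBackendLike, taskId: Long, body: () => Array[Byte])
    extends Runnable {
  override def run(): Unit = {
    backend.statusUpdate(taskId, TaskState.RUNNING, Array.empty[Byte])   // 1. before running
    try {
      val result = body()                                                // run the task itself
      backend.statusUpdate(taskId, TaskState.FINISHED, result)           // 2. success + result
    } catch {
      case e: Throwable =>
        backend.statusUpdate(taskId, TaskState.FAILED,                   // 3. failure + reason
          e.toString.getBytes("UTF-8"))
    }
  }
}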
What follows describes the successful case; the diagram is too large, so it has been placed at the very end.
1. When the task finishes, the ExecutorBackend's statusUpdate method is called to hand the result back. If the result is larger than 10 MB, it is stored in the BlockManager instead and only a blockId is returned; whoever needs the result later fetches it from the BlockManager by that blockId (a rough sketch of this decision follows the list).
2. The ExecutorBackend sends a StatusUpdate message straight to the driver with the task's status.
3. When the driver (concretely, the SchedulerBackend) receives the StatusUpdate message, it calls the TaskScheduler's statusUpdate method and then prepares to send the next batch of tasks to the ExecutorBackend.
4. The TaskScheduler uses the taskId to find the TaskSetManager that manages this task (the class responsible for managing a batch of tasks), removes the task from that TaskSetManager, and puts the task into the success queue of the TaskResultGetter (the class responsible for fetching task results).
5. Once the TaskResultGetter has obtained the result, it calls the TaskScheduler's handleSuccessfulTask method to hand the result over.
6. The TaskScheduler calls the TaskSetManager's handleSuccessfulTask method to process the successful task.
7. The TaskSetManager calls the DAGScheduler's taskEnded method to tell the DAGScheduler that this task has finished; if by this point all of the tasks have succeeded, the TaskSetManager is wound up as well.
8. Inside taskEnded the DAGScheduler triggers a CompletionEvent, which is handled separately for ResultTask and ShuffleMapTask.
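Step 1 above hides a small but important decision: ship the serialized result directly, or park it in the BlockManager and send back only a blockId. Here is a rough sketch of that decision; the 10 MB threshold, the blockId format and the putBlock callback are illustrative assumptions, not Spark's exact API.

sealed trait SerializedResult
case class DirectResult(bytes: Array[Byte]) extends SerializedResult
case class IndirectResult(blockId: String, size: Int) extends SerializedResult

object ResultPackagingSketch {
  // putBlock stands in for "store these bytes in the BlockManager under this id".
  def packageResult(taskId: Long,
                    bytes: Array[Byte],
                    putBlock: (String, Array[Byte]) => Unit,
                    maxDirectResultBytes: Int = 10 * 1024 * 1024): SerializedResult = {
    if (bytes.length <= maxDirectResultBytes) {
      DirectResult(bytes)                    // small result: ship it back inside the StatusUpdate
    } else {
      val blockId = s"taskresult_$taskId"    // large result: park it in the BlockManager
      putBlock(blockId, bytes)               // the driver later fetches it by this blockId
      IndirectResult(blockId, bytes.length)
    }
  }
}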
Next:

private[scheduler] def handleTaskCompletion(event: CompletionEvent) {
  val task = event.task
  val stageId = task.stageId
  val taskType = Utils.getFormattedClassName(task)

  // The success case is dealt with separately below, since we need to compute accumulator
  // updates before posting.
  if (event.reason != Success) {
    val attemptId = stageIdToStage.get(task.stageId).map(_.latestInfo.attemptId).getOrElse(-1)
    listenerBus.post(SparkListenerTaskEnd(stageId, attemptId, taskType, event.reason,
      event.taskInfo, event.taskMetrics))
  }

  if (!stageIdToStage.contains(task.stageId)) {
    // Skip all the actions if the stage has been cancelled.
    return
  }

  val stage = stageIdToStage(task.stageId)

  def markStageAsFinished(stage: Stage, errorMessage: Option[String] = None) = {
    val serviceTime = stage.latestInfo.submissionTime match {
      case Some(t) => "%.03f".format((clock.getTime() - t) / 1000.0)
      case _ => "Unknown"
    }
    if (errorMessage.isEmpty) {
      logInfo("%s (%s) finished in %s s".format(stage, stage.name, serviceTime))
      stage.latestInfo.completionTime = Some(clock.getTime())
    } else {
      stage.latestInfo.stageFailed(errorMessage.get)
      logInfo("%s (%s) failed in %s s".format(stage, stage.name, serviceTime))
    }
    listenerBus.post(SparkListenerStageCompleted(stage.latestInfo))
    runningStages -= stage
  }

  event.reason match {
    case Success =>
      if (event.accumUpdates != null) {
        try {
          Accumulators.add(event.accumUpdates)
          event.accumUpdates.foreach { case (id, partialValue) =>
            val acc = Accumulators.originals(id).asInstanceOf[Accumulable[Any, Any]]
            // To avoid UI cruft, ignore cases where value wasn't updated
            if (acc.name.isDefined && partialValue != acc.zero) {
              val name = acc.name.get
              val stringPartialValue = Accumulators.stringifyPartialValue(partialValue)
              val stringValue = Accumulators.stringifyValue(acc.value)
              stage.latestInfo.accumulables(id) = AccumulableInfo(id, name, stringValue)
              event.taskInfo.accumulables +=
                AccumulableInfo(id, name, Some(stringPartialValue), stringValue)
            }
          }
        } catch {
          // If we see an exception during accumulator update, just log the error and move on.
          case e: Exception =>
            logError(s"Failed to update accumulators for $task", e)
        }
      }
      listenerBus.post(SparkListenerTaskEnd(stageId, stage.latestInfo.attemptId, taskType,
        event.reason, event.taskInfo, event.taskMetrics))
      stage.pendingTasks -= task
      task match {
        case rt: ResultTask[_, _] =>
          stage.resultOfJob match {
            case Some(job) =>
              if (!job.finished(rt.outputId)) {
                job.finished(rt.outputId) = true
                job.numFinished += 1
                // If the whole job has finished, remove it
                if (job.numFinished == job.numPartitions) {
                  markStageAsFinished(stage)
                  cleanupStateForJobAndIndependentStages(job)
                  listenerBus.post(SparkListenerJobEnd(job.jobId, JobSucceeded))
                }

                // taskSucceeded runs some user code that might throw an exception. Make sure
                // we are resilient against that.
                try {
                  job.listener.taskSucceeded(rt.outputId, event.result)
                } catch {
                  case e: Exception =>
                    // TODO: Perhaps we want to mark the stage as failed?
                    job.listener.jobFailed(new SparkDriverExecutionException(e))
                }
              }
            case None =>
              logInfo("Ignoring result from " + rt + " because its job has finished")
          }

        case smt: ShuffleMapTask =>
          val status = event.result.asInstanceOf[MapStatus]
          val execId = status.location.executorId
          logDebug("ShuffleMapTask finished on " + execId)
          if (failedEpoch.contains(execId) && smt.epoch <= failedEpoch(execId)) {
            logInfo("Ignoring possibly bogus ShuffleMapTask completion from " + execId)
          } else {
            stage.addOutputLoc(smt.partitionId, status)
          }
          if (runningStages.contains(stage) && stage.pendingTasks.isEmpty) {
            markStageAsFinished(stage)
            logInfo("looking for newly runnable stages")
            logInfo("running: " + runningStages)
            logInfo("waiting: " + waitingStages)
            logInfo("failed: " + failedStages)
            if (stage.shuffleDep.isDefined) {
              // We supply true to increment the epoch number here in case this is a
              // recomputation of the map outputs. In that case, some nodes may have cached
              // locations with holes (from when we detected the error) and will need the
              // epoch incremented to refetch them.
              // TODO: Only increment the epoch number if this is not the first time
              //       we registered these map outputs.
              mapOutputTracker.registerMapOutputs(
                stage.shuffleDep.get.shuffleId,
                stage.outputLocs.map(list => if (list.isEmpty) null else list.head).toArray,
                changeEpoch = true)
            }
            clearCacheLocs()
            if (stage.outputLocs.exists(_ == Nil)) {
              // Some tasks had failed; let's resubmit this stage
              // TODO: Lower-level scheduler should also deal with this
              logInfo("Resubmitting " + stage + " (" + stage.name +
                ") because some of its tasks had failed: " +
                stage.outputLocs.zipWithIndex.filter(_._1 == Nil).map(_._2).mkString(", "))
              submitStage(stage)
            } else {
              val newlyRunnable = new ArrayBuffer[Stage]
              for (stage <- waitingStages) {
                logInfo("Missing parents for " + stage + ": " + getMissingParentStages(stage))
              }
              for (stage <- waitingStages if getMissingParentStages(stage) == Nil) {
                newlyRunnable += stage
              }
              waitingStages --= newlyRunnable
              runningStages ++= newlyRunnable
              for {
                stage <- newlyRunnable.sortBy(_.id)
                jobId <- activeJobForStage(stage)
              } {
                logInfo("Submitting " + stage + " (" + stage.rdd + "), which is now runnable")
                submitMissingTasks(stage, jobId)  // <-- execution flow
              }
            }
          }
      }
    // ... (handling of the non-Success completion reasons is omitted here)