《Apache Spark源码剖析》学习笔记之Spark作业提交





/**
 * Applies a function f to all elements of this RDD.
 *
 * @param f the function to run on every element; it is cleaned with `sc.clean` first
 */
def foreach(f: T => Unit): Unit = withScope {
  val cleanF = sc.clean(f)
  // Submit a job on this RDD whose per-partition function drains the iterator through cleanF.
  sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
}



步骤1:指定了Final RDD和作用于RDD上的Function。



/**
 * Run a job on all partitions in an RDD and return the results in an array.
 *
 * @param rdd the RDD to run the job on
 * @param func the function applied to each partition's iterator
 * @return one result per partition
 */
def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U): Array[U] = {
  // Delegates with every partition index of rdd and allowLocal = false.
  runJob(rdd, func, 0 until rdd.partitions.size, false)
}


步骤2:读取Final RDD的分区数,并指定是否允许本地执行。



/**
 * Run a job on a given set of partitions of an RDD, but take a function of type
 * `Iterator[T] => U` instead of `(TaskContext, Iterator[T]) => U`.
 *
 * @param rdd the RDD to run the job on
 * @param func the function applied to each selected partition's iterator
 * @param partitions the partition indices to run on
 * @param allowLocal forwarded unchanged to the overload this method delegates to
 * @return the array produced by the delegated overload
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: Iterator[T] => U,
    partitions: Seq[Int],
    allowLocal: Boolean
    ): Array[U] = {
  // Adapt func to the two-argument shape; the TaskContext argument is ignored.
  runJob(rdd, (context: TaskContext, iter: Iterator[T]) => func(iter), partitions, allowLocal)
}



/**
 * Run a function on a given set of partitions in an RDD and return the results as an array. The
 * allowLocal flag specifies whether the scheduler can run the computation on the driver rather
 * than shipping it out to the cluster, for short actions like first().
 *
 * @param rdd the RDD to run the job on
 * @param func the function applied to each selected partition
 * @param partitions the partition indices to run on
 * @param allowLocal whether the scheduler may run the computation on the driver
 * @return an array with one slot per entry of `partitions`, filled by the result handler
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    allowLocal: Boolean
    ): Array[U] = {
  val results = new Array[U](partitions.size)
  // The handler stores each result at the index it is reported for.
  runJob[T, U](rdd, func, partitions, allowLocal, (index, res) => results(index) = res)
  // The handler-based overload yields no value, so the filled array is returned explicitly.
  results
}



/**
 * Run a function on a given set of partitions in an RDD and pass the results to the given
 * handler function. This is the main entry point for all actions in Spark. The allowLocal
 * flag specifies whether the scheduler can run the computation on the driver rather than
 * shipping it out to the cluster, for short actions like first().
 *
 * @param rdd the RDD to run the job on
 * @param func the function applied to each selected partition; cleaned before submission
 * @param partitions the partition indices to run on
 * @param allowLocal whether the scheduler may run the computation on the driver
 * @param resultHandler callback receiving an index and the corresponding result
 * @throws IllegalStateException if this context has already been stopped
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    allowLocal: Boolean,
    resultHandler: (Int, U) => Unit): Unit = {
  if (stopped) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  // Lineage logging is opt-in through the spark.logLineage setting (default false).
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, allowLocal,
    resultHandler, localProperties.get)
}


/**
 * Clean a closure to make it ready to be serialized and sent to tasks
 * (removes unreferenced variables in $outer's, updates REPL variables).
 * If <tt>checkSerializable</tt> is set, <tt>clean</tt> will also proactively
 * check to see if <tt>f</tt> is serializable and throw a <tt>SparkException</tt>
 * if not.
 *
 * @param f the closure to clean
 * @param checkSerializable whether or not to immediately check <tt>f</tt> for serializability
 * @throws SparkException if <tt>checkSerializable</tt> is set but <tt>f</tt> is not
 *   serializable
 * @return the closure that was passed in
 */
private[spark] def clean[F <: AnyRef](f: F, checkSerializable: Boolean = true): F = {
  ClosureCleaner.clean(f, checkSerializable)
  // Hand the same closure object back so the declared result type F is satisfied.
  f
}


当Scala在创建一个闭包时,需要先判断哪些变量会被闭包所使用,并将这些需要使用的变量存储在闭包之内。这一特性使得闭包在创建它的作用范围之外也能得以正确地执行。



理解了ClosureCleaner存在的原因,也就会明白为什么在写Spark Application的时候,经常会遇到的"Task Not Serializable"是在什么地方报错的了。产生无法序列化的原因就是在RDD的操作中引用了无法序列化的变量。



(2)根据RDD DAG将Job分割成多个Stage。

2.1 依赖性分析及Stage划分

private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    allowLocal: Boolean,
    callSite: CallSite,
    listener: JobListener,
    properties: Properties = null)
  var finalStage: Stage = null  try {
    // New stage creation may throw an exception if, for example, jobs are run on a  // HadoopRDD whose underlying HDFS files have been deleted.  finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      return  }
  if (finalStage != null) {
    val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
    logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
      job.jobId, callSite.shortForm, partitions.length, allowLocal))
    logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
    logInfo("Parents of final stage: " + finalStage.parents)
    logInfo("Missing parents: " + getMissingParentStages(finalStage))
    val shouldRunLocally =
      localExecutionEnabled && allowLocal && finalStage.parents.isEmpty && partitions.length == 1
    val jobSubmissionTime = clock.getTimeMillis()
    if (shouldRunLocally) {
      // Compute very short actions like first() or take() with no parent stages locally.  listenerBus.post(
        SparkListenerJobStart(job.jobId, jobSubmissionTime, Seq.empty, properties))
    } else {
      jobIdToActiveJob(jobId) = job
      activeJobs += job
      finalStage.resultOfJob = Some(job)
      val stageIds = jobIdToStageIds(jobId).toArray
      val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
        SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
finalStage = newStage(finalRDD, partitions.size, None, jobId, callSite)
/**
 * Create a Stage -- either directly for use as a result stage, or as part of the (re)-creation
 * of a shuffle map stage in newOrUsedStage. The stage will be associated with the provided
 * jobId. Production of shuffle map stages should always use newOrUsedStage, not newStage
 * directly.
 *
 * @param rdd the RDD the new stage is built on
 * @param numTasks passed through to the Stage constructor
 * @param shuffleDep passed through to the Stage constructor
 * @param jobId the job the stage is associated with
 * @param callSite passed through to the Stage constructor
 * @return the newly created stage, already registered in stageIdToStage
 */
private def newStage(
    rdd: RDD[_],
    numTasks: Int,
    shuffleDep: Option[ShuffleDependency[_, _, _]],
    jobId: Int,
    callSite: CallSite)
  : Stage =
{
  // Parent stages are resolved before this stage's id is drawn from the counter.
  val parentStages = getParentStages(rdd, jobId)
  val id = nextStageId.getAndIncrement()
  val stage = new Stage(id, rdd, numTasks, shuffleDep, parentStages, jobId, callSite)
  stageIdToStage(id) = stage
  updateJobIdStageIdMaps(jobId, stage)
  // Return the stage itself; the bookkeeping call above is not the result.
  stage
}
private[spark] class Stage(
    val id: Int,// Stage的序号,数值越大,越优先执行。如3,2,1.  val rdd: RDD[_],// 归属于本Stage的最后一个RDD  val numTasks: Int,// 创建的Task数目,等于父rdd的输出Partition数目  val shuffleDep: Option[ShuffleDependency[_, _, _]],  // Output shuffle if stage is a map stage  // 是否存在ShuffleDependency  val parents: List[Stage],//父Stage列表  val jobId: Int,// 作业Id  val callSite: CallSite)
  extends Logging {





/**
 * Tracks information about an active job in the DAGScheduler.
 *
 * @param jobId a unique id assigned to each job
 * @param finalStage the final stage of the job
 * @param func the function applied on the final stage
 * @param partitions the list of partitions; note that this indicates which partitions data is
 *                   read from and processed
 * @param callSite the call site associated with the job
 * @param listener the listener associated with the job
 * @param properties the properties associated with the job
 */
private[spark] class ActiveJob(
    val jobId: Int,
    val finalStage: Stage,
    val func: (TaskContext, Iterator[_]) => _,
    val partitions: Array[Int],
    val callSite: CallSite,
    val listener: JobListener,
    val properties: Properties) {

  // Number of partitions this job covers.
  val numPartitions = partitions.length
  // One flag per partition, all initially false.
  val finished = Array.fill[Boolean](numPartitions)(false)
  // Counter that starts at zero; it is mutated from outside this class.
  var numFinished = 0
}
  • 所依赖的Stage是否都已经完成,如果没有则先执行所依赖的Stage。
  • 如果所有的依赖已经完成,则提交自身所处的Stage。
/**
 * Submits stage, but first recursively submits any missing parents.
 *
 * If no active job is found for the stage, the stage is aborted instead.
 *
 * @param stage the stage to submit
 */
private def submitStage(stage: Stage): Unit = {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    // Nothing to do when the stage is already waiting, running or failed.
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      if (missing == Nil) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)
      } else {
        // Submit every missing parent first, then park this stage in the waiting set.
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id)
  }
}
val missing = getMissingParentStages(stage).sortBy(_.id)
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missing = new HashSet[Stage]
  val visited = new HashSet[RDD[_]]
  // We are manually maintaining a stack here to prevent StackOverflowError  // caused by recursively visiting  val waitingForVisit = new Stack[RDD[_]]
  def visit(rdd: RDD[_]) {
    if (!visited(rdd)) {
      visited += rdd
      if (getCacheLocs(rdd).contains(Nil)) {
        for (dep <- rdd.dependencies) {
          dep match {
            case shufDep: ShuffleDependency[_, _, _] =>
              val mapStage = getShuffleMapStage(shufDep, stage.jobId)
              if (!mapStage.isAvailable) {
                missing += mapStage
            case narrowDep: NarrowDependency[_] =>
  while (!waitingForVisit.isEmpty) {
  • ShuffledRDD
  • CoGroupedRDD
  • SubtractedRDD
/**
 * Returns the dependencies of this RDD: a single ShuffleDependency on `prev`, built from the
 * partitioner, serializer, key ordering, aggregator and map-side-combine setting.
 */
override def getDependencies: Seq[Dependency[_]] = {
  List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
}


Spark将由Executor执行的Task分为ShuffleMapTask和ResultTask两种,可以简单地将其对应于Hadoop中的Map和Reduce。
/** Called when stage's parents are available and we can now do its task. */ private def submitMissingTasks(stage: Stage, jobId: Int) {
  logDebug("submitMissingTasks(" + stage + ")")
  // Get our pending tasks and remember them in our pendingTasks entry  stage.pendingTasks.clear()

  // First figure out the indexes of partition ids to compute.  val partitionsToCompute: Seq[Int] = {
    if (stage.isShuffleMap) {
      (0 until stage.numPartitions).filter(id => stage.outputLocs(id) == Nil)
    } else {
      val job = stage.resultOfJob.get
      (0 until job.numPartitions).filter(id => !job.finished(id))

  val properties = if (jobIdToActiveJob.contains(jobId)) {
  } else {
    // this stage will be assigned to "default" pool  null  }

  runningStages += stage
  // SparkListenerStageSubmitted should be posted before testing whether tasks are  // serializable. If tasks are not serializable, a SparkListenerStageCompleted event  // will be posted, which should always come after a corresponding SparkListenerStageSubmitted  // event.  stage.latestInfo = StageInfo.fromStage(stage, Some(partitionsToCompute.size))
  listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

  // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.  // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast  // the serialized copy of the RDD and for each task we will deserialize it, which means each  // task gets a different copy of the RDD. This provides stronger isolation between tasks that  // might modify state of objects referenced in their closures. This is necessary in Hadoop  // where the JobConf/Configuration object is not thread-safe.  var taskBinary: Broadcast[Array[Byte]] = null  try {
    // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).  // For ResultTask, serialize and broadcast (rdd, func).  val taskBinaryBytes: Array[Byte] =
      if (stage.isShuffleMap) {
        closureSerializer.serialize((stage.rdd, stage.shuffleDep.get) : AnyRef).array()
      } else {
        closureSerializer.serialize((stage.rdd, stage.resultOfJob.get.func) : AnyRef).array()
    taskBinary = sc.broadcast(taskBinaryBytes)
  } catch {
    // In the case of a failure during serialization, abort the stage.  case e: NotSerializableException =>
      abortStage(stage, "Task not serializable: " + e.toString)
      runningStages -= stage
      return  case NonFatal(e) =>
      abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}")
      runningStages -= stage
      return  }

  val tasks: Seq[Task[_]] = if (stage.isShuffleMap) {
    partitionsToCompute.map { id =>
      val locs = getPreferredLocs(stage.rdd, id)
      val part = stage.rdd.partitions(id)
      new ShuffleMapTask(stage.id, taskBinary, part, locs)
  } else {
    val job = stage.resultOfJob.get
    partitionsToCompute.map { id =>
      val p: Int = job.partitions(id)
      val part = stage.rdd.partitions(p)
      val locs = getPreferredLocs(stage.rdd, p)
      new ResultTask(stage.id, taskBinary, part, locs, id)

  if (tasks.size > 0) {
    logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
    stage.pendingTasks ++= tasks
    logDebug("New pending tasks: " + stage.pendingTasks)
      new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
    stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
  } else {
    // Because we posted SparkListenerStageSubmitted earlier, we should post  // SparkListenerStageCompleted here in case there are no tasks to run.  outputCommitCoordinator.stageEnd(stage.id)
    logDebug("Stage " + stage + " is actually done; %b %d %d".format(
      stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))
    runningStages -= stage
// Make fake resource offers on all executors.
// Builds one WorkerOffer per entry of executorDataMap (executor id, host, free cores), asks the
// scheduler which tasks to run on those offers, and launches the tasks it returns.
def makeOffers(): Unit = {
  launchTasks(scheduler.resourceOffers(executorDataMap.map { case (id, executorData) =>
    new WorkerOffer(id, executorData.executorHost, executorData.freeCores)
  }.toSeq)) // resourceOffers takes a Seq[WorkerOffer], hence the conversion
}
/**  * Called by cluster manager to offer resources on slaves. We respond by asking our active task  * sets for tasks in order of priority. We fill each node with tasks in a round-robin manner so  * that tasks are balanced across the cluster.  */ def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
  // Mark each slave as alive and remember its hostname  // Also track if new executor is added  var newExecAvail = false  for (o <- offers) {
    executorIdToHost(o.executorId) = o.host
    activeExecutorIds += o.executorId
    if (!executorsByHost.contains(o.host)) {
      executorsByHost(o.host) = new HashSet[String]()
      executorAdded(o.executorId, o.host)
      newExecAvail = true  }
    for (rack <- getRackForHost(o.host)) {
      hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host

  // Randomly shuffle offers to avoid always placing tasks on the same set of workers.  val shuffledOffers = Random.shuffle(offers)
  // Build a list of tasks to assign to each worker.  val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
  val availableCpus = shuffledOffers.map(o => o.cores).toArray
  val sortedTaskSets = rootPool.getSortedTaskSetQueue
  for (taskSet <- sortedTaskSets) {
    logDebug("parentName: %s, name: %s, runningTasks: %s".format(
      taskSet.parent.name, taskSet.name, taskSet.runningTasks))
    if (newExecAvail) {


def launchTask(
    context: ExecutorBackend,
    taskId: Long,
    attemptNumber: Int,
    taskName: String,
    serializedTask: ByteBuffer) {
  val tr = new TaskRunner(context, taskId = taskId, attemptNumber = attemptNumber, taskName,
  runningTasks.put(taskId, tr)
val tr = new TaskRunner(context, taskId = taskId, attemptNumber = attemptNumber, taskName,
override def run() {
  val deserializeStartTime = System.currentTimeMillis()
  val ser = env.closureSerializer.newInstance()
  logInfo(s"Running $taskName (TID $taskId)")
  execBackend.statusUpdate(taskId, TaskState.RUNNING, EMPTY_BYTE_BUFFER)
  var taskStart: Long = 0
  startGCTime = gcTime

  try {
    val (taskFiles, taskJars, taskBytes) = Task.deserializeWithDependencies(serializedTask)
    updateDependencies(taskFiles, taskJars)
    task = ser.deserialize[Task[Any]](taskBytes, Thread.currentThread.getContextClassLoader)
 updateDependencies(taskFiles, taskJars)
/**  * Download any missing dependencies if we receive a new set of files and JARs from the  * SparkContext. Also adds any new JARs we fetched to the class loader.  */ private def updateDependencies(newFiles: HashMap[String, Long], newJars: HashMap[String, Long]) {
  lazy val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
  synchronized {
    // Fetch missing dependencies  for ((name, timestamp) <- newFiles if currentFiles.getOrElse(name, -1L) < timestamp) {
      logInfo("Fetching " + name + " with timestamp " + timestamp)
      // Fetch file with useCache mode, close cache for local mode.  Utils.fetchFile(name, new File(SparkFiles.getRootDirectory), conf,
        env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
      currentFiles(name) = timestamp
    for ((name, timestamp) <- newJars) {
      val localName = name.split("/").last
      val currentTimeStamp = currentJars.get(name)
      if (currentTimeStamp < timestamp) {
        logInfo("Fetching " + name + " with timestamp " + timestamp)
        // Fetch file with useCache mode, close cache for local mode.  Utils.fetchFile(name, new File(SparkFiles.getRootDirectory), conf,
          env.securityManager, hadoopConf, timestamp, useCache = !isLocal)
        currentJars(name) = timestamp
        // Add it to our class loader  val url = new File(SparkFiles.getRootDirectory, localName).toURI.toURL
        if (!urlClassLoader.getURLs.contains(url)) {
          logInfo("Adding " + url + " to class loader")
  • HttpFileServer
  • HDFS
  • 本地文件

2.4 Shuffle Task


/**
 * Deserializes the (RDD, ShuffleDependency) pair from the broadcast task binary, writes this
 * task's partition through a shuffle writer and returns the status from stopping that writer.
 *
 * On any Exception the writer (if one was obtained) is stopped with `success = false` and the
 * original exception is rethrown.
 *
 * @param context the task context; also the source of the task metrics recorded here
 * @return the status obtained from stopping the writer with `success = true`
 */
override def runTask(context: TaskContext): MapStatus = {
  // Deserialize the RDD using the broadcast variable.
  val ser = SparkEnv.get.closureSerializer.newInstance()
  val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
    ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)

  metrics = Some(context.taskMetrics)
  // Stays null until the shuffle manager hands out a writer, so the handler below can tell.
  var writer: ShuffleWriter[Any, Any] = null
  try {
    val manager = SparkEnv.get.shuffleManager
    writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
    writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
    writer.stop(success = true).get
  } catch {
    case e: Exception =>
      try {
        if (writer != null) {
          writer.stop(success = false)
        }
      } catch {
        // A failure while stopping is only logged so that the original error is the one rethrown.
        case stopFailure: Exception =>
          log.debug("Could not stop writer", stopFailure)
      }
      throw e
  }
}
/**
 * Returns an iterator over one partition of this RDD. When a storage level other than
 * StorageLevel.NONE is set the cache manager's getOrCompute is used; otherwise the partition is
 * obtained through computeOrReadCheckpoint.
 *
 * @param split the partition to iterate over
 * @param context the context of the running task
 */
final def iterator(split: Partition, context: TaskContext): Iterator[T] = {
  if (storageLevel != StorageLevel.NONE) {
    SparkEnv.get.cacheManager.getOrCompute(this, split, context, storageLevel)
  } else {
    computeOrReadCheckpoint(split, context)
  }
}
/**
 * Produces the iterator for `split`: when this RDD is checkpointed the first parent's iterator
 * is used, otherwise `compute` is called directly.
 *
 * @param split the partition to produce
 * @param context the context of the running task
 */
private[spark] def computeOrReadCheckpoint(
    split: Partition,
    context: TaskContext): Iterator[T] = {
  if (!isCheckpointed) {
    compute(split, context)
  } else {
    firstParent[T].iterator(split, context)
  }
}
/**
 * A task whose `runTask` deserializes an (RDD, function) pair from the broadcast task binary
 * and applies the function to one partition of that RDD.
 *
 * @param stageId passed through to the Task superclass constructor
 * @param taskBinary broadcast bytes holding the serialized (RDD, function) pair
 * @param partition the partition this task works on; its index is passed to Task
 * @param locs preferred locations for the task; may be null
 * @param outputId NOTE(review): presumably the index of this task's output within the job --
 *                 confirm against the caller
 */
private[spark] class ResultTask[T, U](
    stageId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient locs: Seq[TaskLocation],
    val outputId: Int)
  extends Task[U](stageId, partition.index) with Serializable {

  // De-duplicated preferred locations; a null locs yields Nil.
  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  /**
   * Deserializes the RDD and the function, records the task metrics from the context, and
   * returns the function's result for this task's partition.
   */
  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)

    metrics = Some(context.taskMetrics)
    func(context, rdd.iterator(partition, context))
  }
}

2.5 结果返回

2.6 WebUI

2.7 Metrics

2.8 存储机制

