Spark源码解析之Stage划分

        这里以count操作为例,一步步解析Spark在执行一个Job时如何进行DAG图的解析。Spark在遇到Action类型算子时,会通过SparkContext进行一系列的runJob方法调用,最终调用DAGScheduler的runJob方法提交Job,由DAGScheduler在处理提交事件时完成DAG图的Stage划分。

一、runJob方法调用

// Counts the elements in this RDD by triggering a job on the SparkContext:
// each partition contributes its iterator size as one task result, and the
// per-partition counts are summed on the driver.
def count(): Long = {
  val perPartitionCounts = sc.runJob(this, Utils.getIteratorSize _)
  perPartitionCounts.sum
}

/**
 * Runs a job on every partition of the given RDD and collects the results.
 *
 * @param rdd  the RDD to run the job on
 * @param func function applied to each partition's iterator
 * @return one result per partition, in partition order
 */
def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U): Array[U] = {
  val allPartitions = 0 until rdd.partitions.length
  runJob(rdd, func, allPartitions)
}

/**
 * Runs a job on a subset of partitions. The per-iterator function is closure-
 * cleaned for serialization and adapted to the (TaskContext, Iterator)
 * signature expected by the next overload.
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: Iterator[T] => U,
    partitions: Seq[Int]): Array[U] = {
  val serializableFunc = clean(func)
  val withContext = (_: TaskContext, iter: Iterator[T]) => serializableFunc(iter)
  runJob(rdd, withContext, partitions)
}

/**
 * Runs a job on the given partitions and gathers each partition's result
 * into an array, indexed by the position of the partition in `partitions`.
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int]): Array[U] = {
  val collected = new Array[U](partitions.size)
  runJob[T, U](rdd, func, partitions, (idx, value) => collected(idx) = value)
  collected
}

/**
 * Core runJob overload: validates the context is alive, then hands the job
 * to the DAGScheduler, which performs the logical scheduling — splitting the
 * job into dependency-linked task sets (stages).
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    resultHandler: (Int, U) => Unit): Unit = {
  // Refuse to run anything once the context has been stopped.
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val site = getCallSite
  val serializableFunc = clean(func)
  logInfo("Starting job: " + site.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  // Delegate to the DAGScheduler for logical scheduling of the job.
  dagScheduler.runJob(rdd, serializableFunc, partitions, site, resultHandler, localProperties.get)
  progressBar.foreach(_.finishAll())
  // Checkpoint the RDD if requested; this is executed as a separate job.
  rdd.doCheckpoint()
}



二、DAGScheduler中进行DAG划分

1、runJob方法,该方法通过DAGScheduler对Job进行逻辑上的提交执行,并且会阻塞当前线程、同步等待Job的执行结果。

/**
 * Submits the job via submitJob and blocks the calling thread until it
 * completes, then logs the outcome. On failure the remote exception is
 * rethrown with the caller's stack trace appended so it points back to the
 * user code that triggered the job.
 */
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val startTime = System.nanoTime
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  // Block here until the job's completion future is resolved.
  val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait]
  waiter.completionFuture.ready(Duration.Inf)(awaitPermission)
  val elapsedSeconds = (System.nanoTime - startTime) / 1e9
  waiter.completionFuture.value.get match {
    case scala.util.Success(_) =>
      logInfo("Job %d finished: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, elapsedSeconds))
    case scala.util.Failure(exception) =>
      logInfo("Job %d failed: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, elapsedSeconds))
      // Splice the caller's stack trace onto the exception before rethrowing.
      val callerStackTrace = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
      throw exception
  }
}

2、submitJob方法,使用事件驱动设计。将Job封装成一个JobSubmitted事件,然后利用DAGSchedulerEventProcessLoop提交该事件。

/**
 * Submits a job asynchronously: wraps it in a JobSubmitted event, posts the
 * event on the DAGSchedulerEventProcessLoop, and returns a JobWaiter the
 * caller can block on.
 */
def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Reject partition indices outside [0, numPartitions); at execution time
  // each partition maps to exactly one task.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
      "Total number of partitions: " + maxPartitions)
  }

  // Allocate a fresh job id.
  val jobId = nextJobId.getAndIncrement()
  if (partitions.isEmpty) {
    // Zero tasks to run: hand back an already-satisfied waiter immediately.
    new JobWaiter[U](this, jobId, 0, resultHandler)
  } else {
    assert(partitions.nonEmpty)
    val untypedFunc = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
    val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
    // Post the JobSubmitted event; all further processing is asynchronous.
    eventProcessLoop.post(JobSubmitted(
      jobId, rdd, untypedFunc, partitions.toArray, callSite, waiter,
      SerializationUtils.clone(properties)))
    waiter
  }
}

3、DAGSchedulerEventProcessLoop中对提交的JobSubmitted事件进行处理。此时在处理方法中会生成ResultStage以及所有的ShuffleMapStage。

/**
 * DAGSchedulerEventProcessLoop's override of EventLoop.onReceive. Delegates
 * to doOnReceive (which handles e.g. the JobSubmitted event posted when a
 * job is submitted) and times how long the event takes to process.
 */
override def onReceive(event: DAGSchedulerEvent): Unit = {
  val processingTimer = timer.time()
  try doOnReceive(event)
  finally processingTimer.stop()
}

/**
 * Dispatches a scheduler event to its handler. A JobSubmitted event is
 * forwarded to DAGScheduler.handleJobSubmitted with its fields unpacked.
 */
private def doOnReceive(event: DAGSchedulerEvent): Unit = {
  event match {
    case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
      dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
  }
}

/**
 * Handles a JobSubmitted event: builds the full stage graph for the job
 * (the ResultStage plus every ShuffleMapStage it depends on), registers an
 * ActiveJob, notifies the listener bus, and kicks off execution of the
 * first runnable stage(s).
 */
private[scheduler] def handleJobSubmitted(jobId: Int,
                                          finalRDD: RDD[_],
                                          func: (TaskContext, Iterator[_]) => _,
                                          partitions: Array[Int],
                                          callSite: CallSite,
                                          listener: JobListener,
                                          properties: Properties) {
  var finalStage: ResultStage = null
  try {
    // Build the ResultStage from the last RDD of the DAG. createResultStage
    // also creates every ShuffleMapStage the result depends on and registers
    // them in the scheduler's internal maps.
    finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    case e: Exception =>
      // Stage creation failed; report the failure and give up on this job.
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }

  // Wrap the final stage in an ActiveJob.
  val newJob = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    newJob.jobId, callSite.shortForm, partitions.length))
  logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
  logInfo("Parents of final stage: " + finalStage.parents)
  logInfo("Missing parents: " + getMissingParentStages(finalStage))

  // Register the job in the scheduler's internal bookkeeping structures.
  val jobSubmissionTime = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = newJob
  activeJobs += newJob
  finalStage.setActiveJob(newJob)
  val stageIds = jobIdToStageIds(jobId).toArray
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  listenerBus.post(
    SparkListenerJobStart(newJob.jobId, jobSubmissionTime, stageInfos, properties))
  // Submit the final stage; submitStage recursively submits any missing
  // parent stages first, so only runnable stages actually start now while
  // the rest are parked in the waiting set.
  submitStage(finalStage)
}

4、生成ResultStage以及所有的ShuffleMapStage。实际上在createResultStage方法中会生成所有的Stage。下面将分析该方法的执行逻辑。


/**
 * Creates the ResultStage for a job, after first creating (or looking up)
 * every ShuffleMapStage it depends on.
 */
private def createResultStage(rdd: RDD[_],
                              func: (TaskContext, Iterator[_]) => _,
                              partitions: Array[Int],
                              jobId: Int,
                              callSite: CallSite): ResultStage = {
  // Resolve all parent stages of the final RDD before allocating this
  // stage's id, so parent stages receive smaller ids than their child.
  val parentStages = getOrCreateParentStages(rdd, jobId)
  val stageId = nextStageId.getAndIncrement()
  // Build the ResultStage from the final RDD and its parent stages, then
  // register it in the scheduler's maps.
  val resultStage = new ResultStage(stageId, rdd, func, partitions, parentStages, jobId, callSite)
  stageIdToStage(stageId) = resultStage
  updateJobIdStageIdMaps(jobId, resultStage)
  resultStage
}

/**
 * Returns the stages the given RDD directly depends on: one ShuffleMapStage
 * per direct shuffle dependency, creating any stage that does not exist yet.
 */
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
  // Direct shuffle dependencies mark this RDD's stage boundaries.
  val directShuffleDeps = getShuffleDependencies(rdd)
  directShuffleDeps.map(getOrCreateShuffleMapStage(_, firstJobId)).toList
}

/**
 * Returns the shuffle (wide) dependencies that are direct parents of the
 * given RDD: the first ShuffleDependency reachable along each lineage path,
 * without crossing another shuffle boundary.
 */
private[scheduler] def getShuffleDependencies(rdd: RDD[_]):
                                     HashSet[ShuffleDependency[_, _, _]] = {
  // Direct shuffle dependencies found so far.
  val directShuffleDeps = new HashSet[ShuffleDependency[_, _, _]]
  // RDDs already examined, to avoid revisiting shared lineage.
  val seen = new HashSet[RDD[_]]
  // Depth-first traversal over the narrow-dependency lineage.
  val pending = new Stack[RDD[_]]
  pending.push(rdd)
  while (pending.nonEmpty) {
    val current = pending.pop()
    if (!seen(current)) {
      seen += current
      current.dependencies.foreach {
        // A shuffle dependency is a stage boundary: record it and stop
        // descending along this path.
        case shuffleDep: ShuffleDependency[_, _, _] => directShuffleDeps += shuffleDep
        // Narrow dependency: keep walking toward the ancestors.
        case narrowDep => pending.push(narrowDep.rdd)
      }
    }
  }
  directShuffleDeps
}

/**
 * Returns the ShuffleMapStage registered for the given shuffle dependency.
 * If none exists yet, first creates stages for every unregistered ancestor
 * shuffle dependency, then creates and returns the stage for this one.
 */
private def getOrCreateShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _],
                                         firstJobId: Int): ShuffleMapStage = {
  shuffleIdToMapStage.get(shuffleDep.shuffleId) match {
    // Already registered for this shuffle id: reuse it.
    case Some(existing) =>
      existing
    case None =>
      // Create stages for missing ancestor shuffle dependencies first, so
      // parents exist before the stage for shuffleDep itself.
      val missingAncestors = getMissingAncestorShuffleDependencies(shuffleDep.rdd)
      missingAncestors.foreach { ancestorDep =>
        if (!shuffleIdToMapStage.contains(ancestorDep.shuffleId)) {
          createShuffleMapStage(ancestorDep, firstJobId)
        }
      }
      // Finally create the stage for the requested dependency itself.
      createShuffleMapStage(shuffleDep, firstJobId)
  }
}

/**
 * Returns every ancestor ShuffleDependency of the given RDD that has no
 * ShuffleMapStage registered for it yet, discovered by walking the lineage
 * backwards one shuffle boundary at a time.
 */
private def getMissingAncestorShuffleDependencies(rdd: RDD[_]):
                                 Stack[ShuffleDependency[_, _, _]] = {
  // Unregistered ancestor shuffle dependencies collected so far.
  val unregistered = new Stack[ShuffleDependency[_, _, _]]
  val seen = new HashSet[RDD[_]]
  val pending = new Stack[RDD[_]]
  pending.push(rdd)
  while (pending.nonEmpty) {
    val current = pending.pop()
    if (!seen(current)) {
      seen += current
      // getShuffleDependencies yields this RDD's direct wide dependencies.
      getShuffleDependencies(current).foreach { shuffleDep =>
        // Only unregistered shuffles need new stages; continue searching
        // beyond each one for further unregistered ancestors.
        if (!shuffleIdToMapStage.contains(shuffleDep.shuffleId)) {
          unregistered.push(shuffleDep)
          pending.push(shuffleDep.rdd)
        }
      }
    }
  }
  unregistered
}

5、提交每个Stage。实际上在调用createResultStage之后,一个DAG图中的所有的Stage都已经被创建出来,即ResultStage和ShuffleMapStage,此时需要将Stage进行提交执行。即调用submitStage方法,下面将分析该方法的执行逻辑:

/**
 * Submits a stage for execution, but first recursively submits any parent
 * stages whose output is not yet available. A stage with missing parents is
 * parked in waitingStages until those parents finish.
 */
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isEmpty) {
    abortStage(stage, "No active job for stage " + stage.id, None)
  } else {
    logDebug("submitStage(" + stage + ")")
    // Skip stages that are already queued, running, or failed.
    val notQueued = !waitingStages(stage) && !runningStages(stage) && !failedStages(stage)
    if (notQueued) {
      // Parent stages still needing computation, ordered by stage id.
      val missingParents = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missingParents)
      if (missingParents.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), " +
          "which has no missing parents")
        // Every parent is done: submit this stage's task set right away.
        submitMissingTasks(stage, jobId.get)
      } else {
        // Submit each missing parent first, then wait for them to finish.
        missingParents.foreach(submitStage)
        waitingStages += stage
      }
    }
  }
}

/**
 * Returns the parent stages of the given stage whose output is not yet
 * available and which therefore must be computed before it can run.
 */
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missingParents = new HashSet[Stage]
  val seen = new HashSet[RDD[_]]
  // Explicit stack instead of recursion, walking the lineage backwards.
  val pending = new Stack[RDD[_]]

  // Examines one RDD, recording unavailable parent map stages and queueing
  // narrow-dependency ancestors for later inspection.
  def inspect(rdd: RDD[_]) {
    if (!seen(rdd)) {
      seen += rdd
      // A fully cached RDD needs no recomputation, so its ancestry can be
      // skipped entirely.
      val hasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
      if (hasUncachedPartitions) {
        rdd.dependencies.foreach {
          // Shuffle dependency: look up (or create) the parent map stage;
          // if its output is unavailable it must run before this stage.
          case shufDep: ShuffleDependency[_, _, _] =>
            val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)
            if (!mapStage.isAvailable) {
              missingParents += mapStage
            }
          // Narrow dependency: stay inside the current stage and keep
          // walking the lineage.
          case narrowDep: NarrowDependency[_] =>
            pending.push(narrowDep.rdd)
        }
      }
    }
  }

  // Start from the stage's final RDD and drain the traversal stack.
  pending.push(stage.rdd)
  while (pending.nonEmpty) {
    inspect(pending.pop())
  }
  missingParents.toList
}

 

你可能感兴趣的:(Spark)