Understanding the Spark 2.x Source Code: Stage Division

In the Spark internals, the two most important pieces are the stage-division and task-assignment algorithms. I studied the 1.6 source code before, and while reading 2.2 I found that quite a few methods have changed, so I am sharing my analysis of the 2.2 source here in the hope that we can learn from each other.

A job starts from an action operator, which ends up calling runJob(); take the foreach operator as an example:

  def foreach(f: T => Unit): Unit = withScope {
    val cleanF = sc.clean(f)
    sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
  }
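
For context, here is a minimal, hypothetical driver program (the app name and numbers are just placeholders) showing how an action triggers the whole path traced in this post: reduceByKey introduces a ShuffleDependency that will become a ShuffleMapStage, and the foreach action is what calls sc.runJob().

import org.apache.spark.{SparkConf, SparkContext}

object StageDivisionDemo {
  def main(args: Array[String]): Unit = {
    // Local-mode context, purely for illustration
    val sc = new SparkContext(new SparkConf().setAppName("stage-demo").setMaster("local[*]"))

    val counts = sc.parallelize(1 to 100)
      .map(x => (x % 10, 1))
      .reduceByKey(_ + _)   // ShuffleDependency -> will become a ShuffleMapStage

    counts.foreach(println) // action -> SparkContext.runJob -> DAGScheduler -> ResultStage

    sc.stop()
  }
}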

Then keep stepping into runJob() until you reach the overload below in SparkContext, which hands the job over to the DAGScheduler:

def runJob[T, U: ClassTag](
      rdd: RDD[T],
      func: (TaskContext, Iterator[T]) => U,
      partitions: Seq[Int],
      resultHandler: (Int, U) => Unit): Unit = {
    if (stopped.get()) {
      throw new IllegalStateException("SparkContext has been shutdown")
    }
    val callSite = getCallSite
    val cleanedFunc = clean(func)
    logInfo("Starting job: " + callSite.shortForm)
    if (conf.getBoolean("spark.logLineage", false)) {
      logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
    }
    // Enter the DAGScheduler, the core of Spark's scheduling
    dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
    progressBar.foreach(_.finishAll())
    rdd.doCheckpoint()
  }

Stepping into dagScheduler.runJob brings us to the job-submission code in DAGScheduler:

def runJob[T, U](
      rdd: RDD[T],
      func: (TaskContext, Iterator[T]) => U,
      partitions: Seq[Int],
      callSite: CallSite,
      resultHandler: (Int, U) => Unit,
      properties: Properties): Unit = {
    val start = System.nanoTime
    //A job is submitted here; submitJob returns a JobWaiter that lets us block until the job completes
    val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
    ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)
    //Below, different log messages are printed depending on how the job finished
    waiter.completionFuture.value.get match {
      case scala.util.Success(_) =>
        logInfo("Job %d finished: %s, took %f s".format
          (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      case scala.util.Failure(exception) =>
        logInfo("Job %d failed: %s, took %f s".format
          (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
        // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
        val callerStackTrace = Thread.currentThread().getStackTrace.tail
        exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
        throw exception
    }
  }
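
The waiter above is the JobWaiter returned by submitJob; runJob simply blocks on its completionFuture until the job finishes. As a rough illustration of that idea, here is a stripped-down sketch (my own simplification, not Spark's actual JobWaiter class): a promise that is completed once every task of the job has reported a result.

import java.util.concurrent.atomic.AtomicInteger
import scala.concurrent.{Future, Promise}

// Simplified stand-in for the waiter idea: the submitter blocks on completionFuture,
// while the scheduler calls taskSucceeded/jobFailed as tasks finish.
class SimpleJobWaiter[T](totalTasks: Int, resultHandler: (Int, T) => Unit) {
  private val finishedTasks = new AtomicInteger(0)
  private val promise = Promise[Unit]()
  if (totalTasks == 0) promise.success(())  // a job with zero tasks is finished immediately

  def completionFuture: Future[Unit] = promise.future

  def taskSucceeded(index: Int, result: T): Unit = {
    resultHandler(index, result)            // hand the partition result back to the caller
    if (finishedTasks.incrementAndGet() == totalTasks) promise.trySuccess(())
  }

  def jobFailed(exception: Exception): Unit = promise.tryFailure(exception)
}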

Click into submitJob(), the method that actually submits the job:

def submitJob[T, U](
      rdd: RDD[T],
      func: (TaskContext, Iterator[T]) => U,
      partitions: Seq[Int],
      callSite: CallSite,
      resultHandler: (Int, U) => Unit,
      properties: Properties): JobWaiter[U] = {
    // Check to make sure we are not launching a task on a partition that does not exist.
    //Check that the requested partitions exist so the tasks can run properly
    val maxPartitions = rdd.partitions.length
    partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
      throw new IllegalArgumentException(
        "Attempting to access a non-existent partition: " + p + ". " +
          "Total number of partitions: " + maxPartitions)
    }
    //  Get a new jobId as the identifier of this job (atomically incremented)
    val jobId = nextJobId.getAndIncrement()
    if (partitions.size == 0) {
      // Return immediately if the job is running 0 tasks
      //  If the job runs zero tasks, return a JobWaiter immediately
      return new JobWaiter[U](this, jobId, 0, resultHandler)
    }
    //  Defensive check: make sure there is at least one partition
    assert(partitions.size > 0)
    val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
    // Construct a JobWaiter that blocks until the job completes and then hands the results to resultHandler
    val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
    // eventProcessLoop is the DAGScheduler's event queue.
    // The cluster may be running several jobs at once, and the DAGScheduler schedules FIFO by default.
    // The event posted here is a JobSubmitted; eventProcessLoop will call doOnReceive to match the
    // event type and run the corresponding handler, which ends up in dagScheduler.handleJobSubmitted(...)
    eventProcessLoop.post(JobSubmitted(
      jobId, rdd, func2, partitions.toArray, callSite, waiter,
      SerializationUtils.clone(properties)))
    waiter
  }

submitJob posts the event to eventProcessLoop, which is an instance of DAGSchedulerEventProcessLoop. DAGSchedulerEventProcessLoop extends EventLoop, and EventLoop maintains an event queue (eventQueue): every posted event is put into the queue, and a dedicated thread takes events off eventQueue and handles them with onReceive. DAGSchedulerEventProcessLoop's onReceive delegates to doOnReceive, which pattern-matches the received event; for our JobSubmitted event it dispatches to handleJobSubmitted. (A minimal sketch of this queue-plus-thread pattern follows the code below.)

private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
    case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
      //Note: this is where the essence of stage division begins
      dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)

    case MapStageSubmitted(jobId, dependency, callSite, listener, properties) =>
      dagScheduler.handleMapStageSubmitted(jobId, dependency, callSite, listener, properties)

    case StageCancelled(stageId, reason) =>
      dagScheduler.handleStageCancellation(stageId, reason)

    case JobCancelled(jobId, reason) =>
      dagScheduler.handleJobCancellation(jobId, reason)

    case JobGroupCancelled(groupId) =>
      dagScheduler.handleJobGroupCancelled(groupId)

    case AllJobsCancelled =>
      dagScheduler.doCancelAllJobs()

    case ExecutorAdded(execId, host) =>
      dagScheduler.handleExecutorAdded(execId, host)

    case ExecutorLost(execId, reason) =>
      val filesLost = reason match {
        case SlaveLost(_, true) => true
        case _ => false
      }
      dagScheduler.handleExecutorLost(execId, filesLost)

    // ... (remaining cases omitted)
  }
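
For readers who have not opened EventLoop before: the mechanism described above is essentially a blocking queue plus a consumer thread. Below is a minimal sketch of that pattern (an illustration of the idea only, not the actual org.apache.spark.util.EventLoop source):

import java.util.concurrent.LinkedBlockingDeque

// Minimal queue-plus-consumer-thread pattern: post() enqueues an event, and a
// daemon thread dequeues events and dispatches them to onReceive().
abstract class MiniEventLoop[E](name: String) {
  private val eventQueue = new LinkedBlockingDeque[E]()
  @volatile private var stopped = false

  private val eventThread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = {
      while (!stopped) {
        try {
          val event = eventQueue.take()   // blocks until an event is available
          onReceive(event)                // DAGSchedulerEventProcessLoop delegates this to doOnReceive
        } catch {
          case _: InterruptedException => // stop() interrupts the thread
          case e: Exception => onError(e)
        }
      }
    }
  }

  def start(): Unit = eventThread.start()
  def stop(): Unit = { stopped = true; eventThread.interrupt() }
  def post(event: E): Unit = eventQueue.put(event)

  protected def onReceive(event: E): Unit
  protected def onError(e: Exception): Unit = e.printStackTrace()
}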

handleJobSubmitted is where the ResultStage, i.e. the final stage of the job, is created:

 private[scheduler] def handleJobSubmitted(jobId: Int,
      finalRDD: RDD[_],
      func: (TaskContext, Iterator[_]) => _,
      partitions: Array[Int],
      callSite: CallSite,
      listener: JobListener,
      properties: Properties) {
    // Create the ResultStage: this is where the submitted job really starts being divided into stages
    var finalStage: ResultStage = null
    try {
      // New stage creation may throw an exception if, for example, jobs are run on a
      // HadoopRDD whose underlying HDFS files have been deleted.

      // Starting from the final RDD, it recursively walks back through the parent RDDs,
      // reusing persisted data where possible and recomputing otherwise.
      // Note: stages come in two kinds, ShuffleMapStage and ResultStage;
      // every job consists of exactly 1 ResultStage and 0 or more ShuffleMapStages.
      finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
    } catch {
      case e: Exception =>
        logWarning("Creating new stage failed due to exception - job: " + jobId, e)
        listener.jobFailed(e)
        return
    }

createResultStage in turn calls getOrCreateParentStages to obtain the parent stages of the final stage:

private def createResultStage(
      rdd: RDD[_],
      func: (TaskContext, Iterator[_]) => _,
      partitions: Array[Int],
      jobId: Int,
      callSite: CallSite): ResultStage = {
    // Start by creating the parent stages of the ResultStage.
    // Inside, shuffle dependencies are extracted and ShuffleMapStages are created in a loop;
    // if there is no shuffle at all, an empty List is returned.
    val parents = getOrCreateParentStages(rdd, jobId)
    // Get a new stage id (atomically incremented)
    val id = nextStageId.getAndIncrement()
    // Build the ResultStage from the parent stages just obtained and the other core parameters
    val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
    // Register the ResultStage and its id in stageIdToStage
    stageIdToStage(id) = stage
    // Update jobIds and jobIdToStageIds
    updateJobIdStageIdMaps(jobId, stage)
    // Return the ResultStage
    stage
  }

getOrCreateParentStages in turn calls getOrCreateShuffleMapStage; since every stage except the last one is a ShuffleMapStage, that method is what creates (or looks up) the parent stages:

private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
    // Start from getShuffleDependencies: it only extracts the shuffle dependencies of
    // the current RDD (stages are split at shuffle boundaries, so one job produces
    // 0 or more ShuffleMapStages plus exactly 1 ResultStage). If a dependency is not a
    // ShuffleDependency, its parent RDD is traversed in turn, iterating until shuffle
    // dependencies are found or the lineage is exhausted.
    getShuffleDependencies(rdd).map { shuffleDep =>
      getOrCreateShuffleMapStage(shuffleDep, firstJobId)
    }.toList
  }
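
getShuffleDependencies itself is not shown in this post. Roughly speaking (the sketch below paraphrases the idea and is not a verbatim copy of the 2.2 source), it walks the dependency graph with an explicit stack and collects only the nearest ShuffleDependencies of the given RDD, without crossing them:

import scala.collection.mutable
import org.apache.spark.ShuffleDependency
import org.apache.spark.rdd.RDD

// Paraphrased traversal: collect the *immediate* shuffle dependencies of an RDD.
// Narrow dependencies are walked through; shuffle dependencies are recorded but their
// parents are not explored further (that happens when their own stage is built).
def immediateShuffleDeps(rdd: RDD[_]): Set[ShuffleDependency[_, _, _]] = {
  val parents = mutable.HashSet[ShuffleDependency[_, _, _]]()
  val visited = mutable.HashSet[RDD[_]]()
  val waitingForVisit = mutable.Stack[RDD[_]](rdd)
  while (waitingForVisit.nonEmpty) {
    val toVisit = waitingForVisit.pop()
    if (!visited(toVisit)) {
      visited += toVisit
      toVisit.dependencies.foreach {
        case shuffleDep: ShuffleDependency[_, _, _] => parents += shuffleDep
        case dep => waitingForVisit.push(dep.rdd)   // narrow dependency: keep walking up
      }
    }
  }
  parents.toSet
}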

getOrCreateShuffleMapStage shows how a ShuffleMapStage is created or reused: if a stage has already been registered for the given shuffle dependency, it is returned directly; otherwise all missing ancestor shuffle dependencies are found and a ShuffleMapStage is created for each of them, and finally for the given (wide) dependency itself.

private def getOrCreateShuffleMapStage(
      shuffleDep: ShuffleDependency[_, _, _],
      firstJobId: Int): ShuffleMapStage = {
    // Look up shuffleIdToMapStage by the shuffleId taken from the ShuffleDependency
    shuffleIdToMapStage.get(shuffleDep.shuffleId) match {
      //  If a ShuffleMapStage is found, return it directly
      case Some(stage) =>
        stage
      // Otherwise, find all missing ancestor ShuffleDependencies and build their ShuffleMapStages
      case None =>
        // Create stages for all missing ancestor shuffle dependencies.
        getMissingAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep =>
          // Even though getMissingAncestorShuffleDependencies only returns shuffle dependencies
          // that were not already in shuffleIdToMapStage, it's possible that by the time we
          // get to a particular dependency in the foreach loop, it's been added to
          // shuffleIdToMapStage by the stage creation process for an earlier dependency. See
          // SPARK-13902 for more information.

          // Create a ShuffleMapStage for every missing ancestor ShuffleDependency found above,
          // but first check that one has not already been registered for this shuffleId
          if (!shuffleIdToMapStage.contains(dep.shuffleId)) {
            createShuffleMapStage(dep, firstJobId)
          }
        }
        // Finally, create a ShuffleMapStage for the given ShuffleDependency itself.
        createShuffleMapStage(shuffleDep, firstJobId)
    }
  }

At this point we have the parent stages of finalStage, so finalStage itself can be created. As the earlier code showed, this step may throw an exception, for example when the underlying HDFS files have been modified or deleted. Once finalStage has been created, some log messages are printed:

logInfo("Got job %s (%s) with %d output partitions".format(
   job.jobId, callSite.shortForm, partitions.length))
logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
logInfo("Parents of final stage: " + finalStage.parents)
    //TODO Next comes the most important operation: the stage-division algorithm itself
logInfo("Missing parents: " + getMissingParentStages(finalStage))

Next, a series of bookkeeping operations on the job prepares for submitting the stages:

    val jobSubmissionTime = clock.getTimeMillis()
    // HashMap keeping the mapping from jobId to ActiveJob
    jobIdToActiveJob(jobId) = job
    // HashSet holding all ActiveJobs
    activeJobs += job
    // As soon as finalStage exists, the ActiveJob wrapping it is registered on its _activeJob field
    finalStage.setActiveJob(job)
    // Extract all stage ids belonging to this jobId as an array
    val stageIds = jobIdToStageIds(jobId).toArray
    // Extract the latest attempt info of each stage; SparkListeners are notified when the job starts
    val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
    listenerBus.post(
      SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
    // Now the stages are submitted
    submitStage(finalStage)

In submitStage we can see the recursion: the active job for the stage is looked up first, and then the stage is checked. If it still has missing (not yet submitted) parent stages, submitStage is called on each parent and the current stage is parked in waitingStages; it will only be submitted once all of its parents have been submitted. If a stage has no missing parent stages, submitMissingTasks is called to submit its tasks, and once its tasks have been submitted the stage itself counts as submitted. Note: later, after a stage has submitted its tasks, submitWaitingChildStages is called to submit the waiting child stages; it sorts them and calls submitStage on each one (a rough sketch of it follows the submitStage code below), so stage submission is effectively a recursive process.

 private def submitStage(stage: Stage) {
    val jobId = activeJobForStage(stage)
    if (jobId.isDefined) {
      logDebug("submitStage(" + stage + ")")
      if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
        val missing = getMissingParentStages(stage).sortBy(_.id)
        logDebug("missing: " + missing)
        if (missing.isEmpty) {
          logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
          //Once all parent stages have been divided, the tasks of this stage are submitted
          submitMissingTasks(stage, jobId.get)
        } else {
          for (parent <- missing) {
            submitStage(parent)
          }
          waitingStages += stage
        }
      }
    } else {
      abortStage(stage, "No active job for stage " + stage.id, None)
    }
  }
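
The submitWaitingChildStages method mentioned above is not shown in this post; a rough sketch of what it does (paraphrased, not the exact source) is: once a parent stage has finished, pull its children out of waitingStages and feed them back into submitStage.

// Paraphrased sketch of submitWaitingChildStages
private def submitWaitingChildStagesSketch(parent: Stage): Unit = {
  // children of the finished parent that were parked in waitingStages
  val childStages = waitingStages.filter(_.parents.contains(parent)).toArray
  waitingStages --= childStages
  // submit them in a deterministic order (stages of earlier jobs first)
  for (stage <- childStages.sortBy(_.firstJobId)) {
    submitStage(stage)
  }
}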

submitMissingTasks first works out which partitions of the stage need to be computed; since each partition corresponds to one task, a sequence of tasks is then generated according to the type of the stage:

 val tasks: Seq[Task[_]] = try {
      val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
      stage match {
        // If the stage is a ShuffleMapStage, ShuffleMapTasks are created
        case stage: ShuffleMapStage =>
          // First make sure pendingPartitions is empty.
          // pendingPartitions holds the partitions whose tasks have not finished yet;
          // a partition is removed once its task completes, and the DAGScheduler
          // uses this set to decide whether the stage is done.
          stage.pendingPartitions.clear()
          // Iterate over every partition that needs to be computed
          partitionsToCompute.map { id =>
            // Get the preferred locations of this partition
            val locs = taskIdToLocations(id)
            // Get the corresponding partition of this stage's RDD
            val part = stage.rdd.partitions(id)
            // Mark the partition as pending
            stage.pendingPartitions += id
            // Build a ShuffleMapTask; runTask will later be invoked on it
            new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
              Option(sc.applicationId), sc.applicationAttemptId)
          }
        // If the stage is a ResultStage, ResultTasks are created instead
        case stage: ResultStage =>
          partitionsToCompute.map { id =>
            val p: Int = stage.partitions(id)
            val part = stage.rdd.partitions(p)
            val locs = taskIdToLocations(id)
            // It likewise packages up the parameters and creates a ResultTask
            new ResultTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, id, properties, serializedTaskMetrics,
              Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
          }
      }
    } catch {
      case NonFatal(e) =>
        abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
        runningStages -= stage
        return
    }

The tasks are then wrapped into a TaskSet, and the TaskSet is submitted. With that, the DAGScheduler's work is done and the TaskScheduler's task-assignment algorithm takes over.

if (tasks.size > 0) {
  logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
    s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
 //Submit all the tasks as a TaskSet through the TaskScheduler
  taskScheduler.submitTasks(new TaskSet(
    tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
  stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
  // Because we posted SparkListenerStageSubmitted earlier, we should mark
  // the stage as completed here in case there are no tasks to run
  markStageAsFinished(stage, None)
}

In the next post I will walk through the source code of the task-assignment algorithm. This is my personal understanding; if I got anything wrong, please point it out. Click here for the follow-up => Task assignment source code analysis.

							compiled up by JiaMingcan
							Please credit JiaMingcan when reposting.
