spark RDD action job的提交过程

spark RDD action job的提交过程

以 RDD 的 count() 方法为例,来演示 DAG 的生成和提交过程。

/** Runs a job over this RDD with `Utils.getIteratorSize` and returns the sum of the values it produces. */
def count(): Long = {
  val sizes = sc.runJob(this, Utils.getIteratorSize _)
  sizes.sum
}

上面直接调用了 SparkContext 的 runJob 方法,经过几个重载版本之后,最终会走到下面这个 runJob:

  /**
   * Runs `func` on the given `partitions` of `rdd`, forwarding `resultHandler` to the scheduler.
   * After `dagScheduler.runJob` returns, it calls `finishAll()` via `progressBar.foreach` and
   * then `rdd.doCheckpoint()`.
   *
   * NOTE(review): `cleanedFunc` and `callSite` are used below but are not defined in this
   * excerpt; presumably their definitions were elided -- confirm against the full method.
   */
  def runJob[T, U: ClassTag](
  rdd: RDD[T],
  func: (TaskContext, Iterator[T]) => U,
  partitions: Seq[Int],
  resultHandler: (Int, U) => Unit): Unit = {

// Hand the job over to the DAGScheduler.
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
progressBar.foreach(_.finishAll())
rdd.doCheckpoint()
  }

dagScheduler.runJob 内部会调用 submitJob,然后在 dagScheduler 中进行任务的提交

 /**
  * Posts a `JobSubmitted` event for `rdd` to `eventProcessLoop` and returns the `JobWaiter`
  * created for it. Asserts that `partitions` is non-empty.
  *
  * NOTE(review): `jobId` is used below but is not defined in this excerpt; presumably its
  * definition was elided -- confirm against the full method.
  */
 def submitJob[T, U](
  rdd: RDD[T],
  func: (TaskContext, Iterator[T]) => U,
  partitions: Seq[Int],
  callSite: CallSite,
  resultHandler: (Int, U) => Unit,
  properties: Properties): JobWaiter[U] = {


assert(partitions.size > 0)
// Erase the element/result types of func so it can be carried by the event.
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
// Submit the job: post the event, passing the partitions as an array and a clone of the properties.
eventProcessLoop.post(JobSubmitted(
  jobId, rdd, func2, partitions.toArray, callSite, waiter,
  SerializationUtils.clone(properties)))
waiter
  }

然后还是在 dagScheduler 类中进行消息的接收,进行任务的启动

 /**
  * Dispatches a scheduler event to the matching handler method on `dagScheduler`.
  *
  * @param event the event to handle
  */
 private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
   // A job was submitted.
   case JobSubmitted(id, finalRdd, fn, parts, site, jobListener, props) =>
     dagScheduler.handleJobSubmitted(id, finalRdd, fn, parts, site, jobListener, props)

   case MapStageSubmitted(id, shuffleDep, site, jobListener, props) =>
     dagScheduler.handleMapStageSubmitted(id, shuffleDep, site, jobListener, props)

   case StageCancelled(id) =>
     dagScheduler.handleStageCancellation(id)

   case JobCancelled(id) =>
     dagScheduler.handleJobCancellation(id)

   case JobGroupCancelled(group) =>
     dagScheduler.handleJobGroupCancelled(group)

   case AllJobsCancelled =>
     dagScheduler.doCancelAllJobs()

   case ExecutorAdded(executorId, hostName) =>
     dagScheduler.handleExecutorAdded(executorId, hostName)

   case ExecutorLost(executorId) =>
     dagScheduler.handleExecutorLost(executorId, fetchFailed = false)

   // A task has started.
   case BeginEvent(startedTask, info) =>
     dagScheduler.handleBeginEvent(startedTask, info)

   case GettingResultEvent(info) =>
     dagScheduler.handleGetTaskResult(info)

   // The whole event object is forwarded; none of its fields are needed here.
   case completion: CompletionEvent =>
     dagScheduler.handleTaskCompletion(completion)

   case TaskSetFailed(failedSet, why, cause) =>
     dagScheduler.handleTaskSetFailed(failedSet, why, cause)

   case ResubmitFailedStages =>
     dagScheduler.resubmitFailedStages()
 }

然后在 handleJobSubmitted 中创建最终的 stage(finalStage)和对应的 ActiveJob,并提交该 stage

/**
 * Handles a submitted job: builds the final `ResultStage`, registers an `ActiveJob` for it,
 * posts a `SparkListenerJobStart` event and submits the final stage.
 *
 * If creating the final stage throws, the failure is logged, reported through
 * `listener.jobFailed`, and the method returns without registering anything.
 */
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties): Unit = {
  val finalStage: ResultStage =
    try {
      // New stage creation may throw an exception if, for example, jobs are run on a
      // HadoopRDD whose underlying HDFS files have been deleted.
      newResultStage(finalRDD, func, partitions, jobId, callSite)
    } catch {
      case e: Exception =>
        logWarning(s"Creating new stage failed due to exception - job: $jobId", e)
        listener.jobFailed(e)
        return
    }

  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    job.jobId, callSite.shortForm, partitions.length))
  logInfo(s"Final stage: $finalStage (${finalStage.name})")
  logInfo(s"Parents of final stage: ${finalStage.parents}")
  logInfo(s"Missing parents: ${getMissingParentStages(finalStage)}")

  val submittedAt = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = job
  activeJobs += job
  // Record which job the final stage belongs to.
  finalStage.setActiveJob(job)

  // Collect the latest info of every stage registered for this job that is still known.
  val infos =
    for {
      stageId <- jobIdToStageIds(jobId).toArray
      knownStage <- stageIdToStage.get(stageId)
    } yield knownStage.latestInfo
  listenerBus.post(SparkListenerJobStart(job.jobId, submittedAt, infos, properties))

  // Submit the final stage, then any stages that were waiting.
  submitStage(finalStage)
  submitWaitingStages()
}

然后进行 ResultStage 对象的创建

  /**
   * Creates a `ResultStage` for `rdd`, registers it in `stageIdToStage` and records it
   * against `jobId` via `updateJobIdStageIdMaps`.
   *
   * @return the newly created stage
   */
  private def newResultStage(
      rdd: RDD[_],
      func: (TaskContext, Iterator[_]) => _,
      partitions: Array[Int],
      jobId: Int,
      callSite: CallSite): ResultStage = {
    // Obtain the parent stages this stage depends on, together with the id to use for it.
    val (parents: List[Stage], stageId: Int) = getParentStagesAndId(rdd, jobId)
    val resultStage = new ResultStage(stageId, rdd, func, partitions, parents, jobId, callSite)
    stageIdToStage.update(stageId, resultStage)
    updateJobIdStageIdMaps(jobId, resultStage)
    resultStage
  }

可以看到上面会通过 getParentStagesAndId 去拿到 Stage 所依赖的父 stage(parentStages),其内部调用的是下面的 getParentStages

  /**
   * Walks the dependency graph of `rdd` and collects the shuffle map stages it depends on.
   *
   * A `ShuffleDependency` contributes the stage returned by `getShuffleMapStage` and is not
   * traversed further; for any other dependency the walk continues into the dependency's RDD.
   */
  private def getParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
    val parentStages = new HashSet[Stage]
    val seen = new HashSet[RDD[_]]
    // An explicit stack is used instead of recursion to prevent StackOverflowError
    // on long dependency chains.
    val pending = new Stack[RDD[_]]
    pending.push(rdd)
    while (pending.nonEmpty) {
      val current = pending.pop()
      if (!seen.contains(current)) {
        seen += current
        // Trace back step by step through the parent dependencies.
        current.dependencies.foreach {
          case shuffleDep: ShuffleDependency[_, _, _] =>
            parentStages += getShuffleMapStage(shuffleDep, firstJobId)
          case other =>
            pending.push(other.rdd)
        }
      }
    }
    parentStages.toList
  }

可以看到上面 Stage 的父依赖网络,其实就是通过 RDD 的 dependencies 依赖推导出来的:沿着非 shuffle 的依赖会一直向上回溯(这些 RDD 属于同一个 stage),
只有遇到 ShuffleDependency 才会生成一个新的(父)stage。

  /**
   * Returns the shuffle map stage registered in `shuffleToMapStage` for `shuffleDep`.
   *
   * When none is registered yet, a stage is first created and registered for every dependency
   * returned by `getAncestorShuffleDependencies(shuffleDep.rdd)`, and then for `shuffleDep` itself.
   */
  private def getShuffleMapStage(
      shuffleDep: ShuffleDependency[_, _, _],
      firstJobId: Int): ShuffleMapStage = {
    shuffleToMapStage.get(shuffleDep.shuffleId).getOrElse {
      // Nothing registered so far: register the ancestor shuffle dependencies first.
      for (ancestor <- getAncestorShuffleDependencies(shuffleDep.rdd)) {
        shuffleToMapStage(ancestor.shuffleId) = newOrUsedShuffleStage(ancestor, firstJobId)
      }
      // Then create and register the stage for the current shuffle dependency.
      val created = newOrUsedShuffleStage(shuffleDep, firstJobId)
      shuffleToMapStage(shuffleDep.shuffleId) = created
      created
    }
  }

可以看到上面的 getShuffleMapStage 中,也是沿着 ShuffleDependency 所属 RDD 的依赖向上回溯,为祖先的 shuffle 依赖生成 stage;其中 newOrUsedShuffleStage 最终会调用下面的 newShuffleMapStage 来创建 stage。

  /**
   * Creates a `ShuffleMapStage` for `rdd`, registers it in `stageIdToStage` and records it
   * against `firstJobId` via `updateJobIdStageIdMaps`.
   *
   * @return the newly created stage
   */
  private def newShuffleMapStage(
      rdd: RDD[_],
      numTasks: Int,
      shuffleDep: ShuffleDependency[_, _, _],
      firstJobId: Int,
      callSite: CallSite): ShuffleMapStage = {
    // The parent stages are resolved first, so the new stage is linked to its parents on creation.
    val (parents: List[Stage], stageId: Int) = getParentStagesAndId(rdd, firstJobId)
    val mapStage: ShuffleMapStage =
      new ShuffleMapStage(stageId, rdd, numTasks, parents, firstJobId, callSite, shuffleDep)

    stageIdToStage.update(stageId, mapStage)
    updateJobIdStageIdMaps(firstJobId, mapStage)
    mapStage
  }

可以看到上面 stage 的创建,也很明确地依赖于父 stage(parentStages),所以 stage 之间的有向无环依赖图(DAG)就这样形成了。
现在相关的依赖 stage 已经创建好了,接下来就是提交的过程了

  /**
   * Submits stage, but first recursively submits any missing parents.
   *
   * A stage with no active job is aborted. A stage already present in `waitingStages`,
   * `runningStages` or `failedStages` is left untouched.
   */
  private def submitStage(stage: Stage): Unit = {
    activeJobForStage(stage) match {
      case None =>
        abortStage(stage, "No active job for stage " + stage.id, None)

      case Some(jobId) =>
        logDebug(s"submitStage($stage)")
        val alreadyTracked = waitingStages(stage) || runningStages(stage) || failedStages(stage)
        if (!alreadyTracked) {
          val missingParents = getMissingParentStages(stage).sortBy(_.id)
          logDebug(s"missing: $missingParents")
          if (missingParents.nonEmpty) {
            // Some parent stages are not available yet: submit them first,
            // then park this stage in the waiting set.
            missingParents.foreach(submitStage)
            waitingStages += stage
          } else {
            // All parent stages are ready, so this stage's tasks can be submitted directly.
            logInfo(s"Submitting $stage (${stage.rdd}), which has no missing parents")
            submitMissingTasks(stage, jobId)
          }
        }
    }
  }

可以看到,上面会先找出当前 stage 还缺失的父 stage;如果有,就先递归提交这些父 stage,并把当前 stage 放入 waitingStages 中等待;如果没有,就直接提交当前 stage 的任务。

/**
 * Called when stage's parents are available and we can now do its task.
 *
 * Steps performed, in order:
 *  1. clear the stage's pending partitions and ask the stage which partitions are still missing;
 *  2. mark the stage as running and notify `outputCommitCoordinator` that it starts;
 *  3. look up the preferred locations for every partition to compute;
 *  4. start a new stage attempt and post `SparkListenerStageSubmitted`;
 *  5. serialize and broadcast the task binary;
 *  6. build one task per missing partition and hand them to `taskScheduler` as a `TaskSet`.
 *
 * If step 3, 5 or 6 fails with a non-fatal exception, the stage is aborted, removed from
 * `runningStages`, and the method returns early.
 *
 * @param stage the stage whose missing tasks should be submitted
 * @param jobId id of the job on whose behalf the tasks are submitted
 */
  private def submitMissingTasks(stage: Stage, jobId: Int): Unit = {
    logDebug("submitMissingTasks(" + stage + ")")
    // Get our pending tasks and remember them in our pendingTasks entry
    stage.pendingPartitions.clear()

    // First figure out the indexes of partition ids to compute: the partitions of this
    // ShuffleMapStage or ResultStage that the stage reports as still missing.
    val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()

    val properties = jobIdToActiveJob(jobId).properties
    // About to run: record the stage as running first.
    runningStages += stage

    // Only two kinds of stage are handled here: ShuffleMapStage and ResultStage.
    stage match {
      case s: ShuffleMapStage =>
        outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
      case s: ResultStage =>
        outputCommitCoordinator.stageStart(
          stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
    }

    // For every partition to compute, work out where it would prefer to run.
    val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
      stage match {
        case s: ShuffleMapStage =>
          partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
        case s: ResultStage =>
          // NOTE(review): `job` is unused, but `.get` is kept because it presumably throws
          // when the stage has no active job, which the catch below turns into an abort.
          val job = s.activeJob.get
          partitionsToCompute.map { id =>
            // `id` indexes into the stage's own partition list; `p` is the RDD partition index.
            val p = s.partitions(id)
            (id, getPreferredLocs(stage.rdd, p))
          }.toMap
      }
    } catch {
      case NonFatal(e) =>
        stage.makeNewStageAttempt(partitionsToCompute.size)
        listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
        abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
        // The stage is no longer running.
        runningStages -= stage
        return
    }

    stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
    // Let listeners know the stage has been submitted.
    listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

    var taskBinary: Broadcast[Array[Byte]] = null
    try {
      // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
      // For ResultTask, serialize and broadcast (rdd, func).
      val taskBinaryBytes: Array[Byte] = stage match {
        case stage: ShuffleMapStage =>
          closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
        case stage: ResultStage =>
          closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array()
      }
      // Broadcast the serialized bytes.
      taskBinary = sc.broadcast(taskBinaryBytes)
    } catch {
      // Abort the stage and stop tracking it as running, like the other failure paths in
      // this method, instead of returning silently and leaving it in `runningStages`.
      case NonFatal(e) =>
        abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}", Some(e))
        runningStages -= stage
        return
    }

    // Build the task objects, one per partition to compute.
    val tasks: Seq[Task[_]] = try {
      stage match {
        case stage: ShuffleMapStage =>
          partitionsToCompute.map { id =>
            val locs = taskIdToLocations(id)
            val part = stage.rdd.partitions(id)
            new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, stage.internalAccumulators)
          }

        case stage: ResultStage =>
          // NOTE(review): unused, kept for the same reason as in the location lookup above.
          val job = stage.activeJob.get
          partitionsToCompute.map { id =>
            val p: Int = stage.partitions(id)
            val part = stage.rdd.partitions(p)
            val locs = taskIdToLocations(id)
            new ResultTask(stage.id, stage.latestInfo.attemptId,
              taskBinary, part, locs, id, stage.internalAccumulators)
          }
      }
    } catch {
      case NonFatal(e) =>
        abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
        runningStages -= stage
        return
    }

    if (tasks.size > 0) {
      logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
      stage.pendingPartitions ++= tasks.map(_.partitionId)
      logDebug("New pending partitions: " + stage.pendingPartitions)
      // The key step: hand the tasks over to the task scheduler as one TaskSet.
      taskScheduler.submitTasks(new TaskSet(
        tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
      stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
    } else {
      // Because we posted SparkListenerStageSubmitted earlier, we should mark
      // the stage as completed here in case there are no tasks to run
      markStageAsFinished(stage, None)
    }
  }

可以看到,首先通过 stage.findMissingPartitions() 拿到还没有执行的 RDD 分片(partition),
然后通过 partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap 拿到每个分片期望执行的位置(进程或者节点地址)。
接着创建 ShuffleMapTask 或者 ResultTask 任务,
最后把创建的任务通过 taskScheduler.submitTasks 提交上去。

你可能感兴趣的:(spark)