spark stage的划分和task分配

1 spark中的宽依赖算子

spark的rdd基本操作包括transformation和action,rdd都是懒加载(惰性求值)的:transformation只会记录rdd之间的生成关系,这些关系构成一个有向无环图(DAG),只有遇到action以后才会真正去执行。

在执行过程中会根据宽/窄依赖进行stage划分,常见的宽依赖包括groupByKey/reduceByKey/partitionBy……

以reduceByKey为例,调用reduceByKey时,会使用默认分区器(通常是HashPartitioner)去调用combineByKeyWithClassTag,其实现如下:

/**
 * Combines the values of each key using the three supplied functions.
 *
 * The functions are cleaned and wrapped in an Aggregator. When this RDD is not already
 * partitioned by `partitioner`, the result is a new ShuffledRDD configured with the given
 * serializer, aggregator and map-side-combine flag.
 *
 * @param createCombiner turns a value V into an initial combiner C
 * @param mergeValue merges a value V into an existing combiner C
 * @param mergeCombiners merges two combiners C
 * @param partitioner partitioner for the resulting RDD
 * @param mapSideCombine flag forwarded to ShuffledRDD.setMapSideCombine (defaults to true)
 * @param serializer serializer forwarded to ShuffledRDD.setSerializer (defaults to null)
 */
def combineByKeyWithClassTag[C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  // NOTE(review): `code` below looks like a placeholder for statements elided from this excerpt.
  code
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  if (self.partitioner == Some(partitioner)) {
    // This RDD already uses the requested partitioner; branch body elided in this excerpt.
  } else {
    // Otherwise build a ShuffledRDD with the requested partitioner.
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}

核心在于根据传入的序列化方式/聚合器等参数生成新的ShuffledRDD.

2 宽依赖划分stage

action会触发流程真正开始执行

以count为例,

// Action: runs a job on this RDD with Utils.getIteratorSize as the task function and sums the results.
def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum

调用count的时候,SparkContext会开始调用自身的runJob方法,并进一步调用DAGScheduler的runJob方法

/**
 * Runs `func` on the given partitions of `rdd` and passes each result to `resultHandler`.
 *
 * @param rdd RDD to run the function on
 * @param func function applied to each requested partition
 * @param partitions ids of the partitions to run on
 * @param resultHandler callback that receives each result
 * @throws IllegalStateException if the SparkContext has already been stopped
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    resultHandler: (Int, U) => Unit): Unit = {
  // Refuse to start a job on a stopped context.
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  // Optionally log the RDD lineage when spark.logLineage is enabled (off by default).
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get) // delegate to DAGScheduler.runJob
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}

在dagScheduler里面,调用情况如下:

// DAGScheduler-side runJob: submits the job through submitJob and keeps the returned waiter.
// The rest of the body is elided in this excerpt.
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val start = System.nanoTime
  // The core step is submitJob.
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  ……
}

/**
 * Submits an action job to the scheduler.
 *
 * @param rdd target RDD
 * @param func function to run on each partition
 * @param partitions set of partitions of the target RDD to run on
 * @param callSite where in the user program this job was called
 * @param resultHandler callback that receives each result
 * @param properties configuration properties attached to the job
 * @return a JobWaiter that can be used to wait for the job to finish or to cancel it
 * @throws IllegalArgumentException if a requested partition id is negative or not below the
 *         RDD's partition count
 */
def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  // Reject the first partition id that falls outside [0, maxPartitions).
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
        "Total number of partitions: " + maxPartitions)
  }
  // Allocate a new job id.
  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    // Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  // Post a JobSubmitted event (carrying a clone of the properties) to the event loop.
  eventProcessLoop.post(JobSubmitted(
    jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
  waiter
}

eventProcessLoop继承自EventLoop,里面维护着一个事件队列(JobSubmitted等事件都会被放入其中)

当队列里面不为空的时候,取出其中的JobSubmitted事件,通过handleJobSubmitted来处理这个job

// Dispatches a scheduler event by type; only the JobSubmitted case is shown in this excerpt.
private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
  // A submitted job is handed to DAGScheduler.handleJobSubmitted with all of its fields.
  case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
    dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
  ……
}


// Turns a submitted job into stages. Every job has one final stage of type ResultStage,
// which is created here, wrapped in an ActiveJob and then submitted.
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  var finalStage: ResultStage = null
  try {
    // Create the result stage; this may throw.
    finalStage = newResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    // Exception handling (elided in this excerpt).
  }
  // Build the ActiveJob from the final stage and the job id.
  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  val jobSubmissionTime = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = job   // register the job under its id
  activeJobs += job    // track it among the active jobs
  finalStage.setActiveJob(job) // link the final stage back to the job
  val stageIds = jobIdToStageIds(jobId).toArray // ids of all stages registered for this job
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  // Tell listeners that the job has started, with the info of its stages.
  listenerBus.post(
    SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
  submitStage(finalStage) // submit starting from the final stage

  submitWaitingStages()
}

/**
 * Submits a stage, but first recursively submits any missing parent stages.
 * A stage with no active job is aborted instead.
 */
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    // Only act on stages that are not already waiting, running or failed.
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id) // missing parent stages, ordered by id
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get) // no missing parents: submit this stage's tasks
      } else {
        for (parent <- missing) {
          submitStage(parent) // submit each missing parent first (recursively)
        }
        waitingStages += stage // park this stage in waitingStages after its parents were submitted
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
//stage划分依据
 
  
// Walks the RDD dependencies of `stage` and returns the parent ShuffleMapStages that are not
// yet available. Shuffle dependencies mark stage boundaries; narrow dependencies are followed
// within the same traversal.
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missing = new HashSet[Stage]
  val visited = new HashSet[RDD[_]]
  // Explicit stack of RDDs still waiting to be visited (the traversal is iterative, not recursive).
  val waitingForVisit = new Stack[RDD[_]]
  def visit(rdd: RDD[_]) {
    if (!visited(rdd)) {
      visited += rdd
      val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil) // true if at least one entry of the cache locations is Nil
      if (rddHasUncachedPartitions) {
        for (dep <- rdd.dependencies) { // inspect every dependency of this RDD
          dep match {
            case shufDep: ShuffleDependency[_, _, _] =>
              val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId) // shuffle (wide) dependency: get or create its ShuffleMapStage
              if (!mapStage.isAvailable) {
                missing += mapStage
              }
            case narrowDep: NarrowDependency[_] =>
              waitingForVisit.push(narrowDep.rdd) // narrow dependency: push the parent RDD to be visited later
          }
        }
      }
    }
  }
  // Start from the stage's own RDD and drain the stack.
  waitingForVisit.push(stage.rdd)
  while (waitingForVisit.nonEmpty) {
    visit(waitingForVisit.pop())
  }
  missing.toList
}

ps:通过getCacheLocs来获取rdd各个分区的缓存位置(未缓存的话每个分区的缓存位置是Nil)

3 task分配 DAGScheduler.submitMissingTasks

// Builds and submits the tasks for the partitions of `stage` that still need computing:
// looks up preferred locations, serializes and broadcasts the task binary, creates one task
// per partition and hands the resulting TaskSet to the task scheduler.
private def submitMissingTasks(stage: Stage, jobId: Int) {
  stage.pendingPartitions.clear()
  // Ids of all partitions that still have to be computed.
  val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()
  val properties = jobIdToActiveJob(jobId).properties
  runningStages += stage // the stage is now tracked as running
  // Notify the output commit coordinator that the stage starts, passing its highest partition id.
  stage match {
    case s: ShuffleMapStage =>
      outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
    case s: ResultStage =>
      outputCommitCoordinator.stageStart(
        stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
  }
  // Map each partition id to compute to its preferred locations.
  val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
    stage match {
      case s: ShuffleMapStage =>
        partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
      case s: ResultStage =>
        val job = s.activeJob.get
        partitionsToCompute.map { id =>
          // For a result stage the id indexes s.partitions, which yields the RDD partition index.
          val p = s.partitions(id)
          (id, getPreferredLocs(stage.rdd, p))
        }.toMap
    }
  } catch {
    // Exception handling (elided in this excerpt).
  }

  stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
  listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

  var taskBinary: Broadcast[Array[Byte]] = null
  try {
    // Serialize the task payload: (rdd, shuffleDep) for a ShuffleMapStage, (rdd, func) for a ResultStage.
    val taskBinaryBytes: Array[Byte] = stage match {
      case stage: ShuffleMapStage =>
        closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
      case stage: ResultStage =>
        closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array()
    }

    // Broadcast the serialized bytes; every task below receives the same broadcast handle.
    taskBinary = sc.broadcast(taskBinaryBytes)
  } catch {
    // Exception handling (elided in this excerpt).
  }
  // Create one task per partition to compute; the task class depends on the stage type.
  val tasks: Seq[Task[_]] = try {
    stage match {
      case stage: ShuffleMapStage =>
        partitionsToCompute.map { id =>
          val locs = taskIdToLocations(id)
          val part = stage.rdd.partitions(id)
          new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, stage.internalAccumulators)
        }

      case stage: ResultStage =>
        val job = stage.activeJob.get
        partitionsToCompute.map { id =>
          val p: Int = stage.partitions(id)
          val part = stage.rdd.partitions(p)
          val locs = taskIdToLocations(id)
          new ResultTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, id, stage.internalAccumulators)
        }
    }
  } catch {
    // Exception handling (elided in this excerpt).
  }

  if (tasks.size > 0) {
    // There are tasks to run: wrap them in a TaskSet, submit it to the task scheduler
    // and record the submission time.
    taskScheduler.submitTasks(new TaskSet(
      tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
    stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
  } else {
    // No tasks to submit; the original note only says "mark" here (details elided in this excerpt).
  }
}


总结:1 spark懒加载模式,只有在count/show/reduce等action被调用的时候才会真正开始执行计算,每个action会生成一个job,每个job有一个ResultStage;

           2 在执行过程中根据rdd之间的生成关系构造DAG,DAGScheduler再根据宽窄依赖关系划分stage(遇到宽依赖即ShuffleDependency时切分出新的stage)。

           3 DAGScheduler根据partition情况分配task,并转移给taskScheduler进行维护;



你可能感兴趣的:(大数据开发)