1 Wide-dependency operators in Spark
Spark's basic RDD operations fall into transformations and actions. RDDs are evaluated lazily: the lineage describing how each RDD is derived from its parents is recorded as a directed acyclic graph (DAG), and nothing is actually computed until an action is encountered.
During execution, the DAG is divided into stages according to wide/narrow dependencies. Common wide-dependency operators include groupByKey, reduceByKey, partitionBy, and so on.
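As a quick illustration (not from the Spark source; just a sketch that assumes a SparkContext named sc is already available), the dependency type can be inspected directly on an RDD: a narrow operator such as mapValues yields a OneToOneDependency, while reduceByKey yields a ShuffleDependency:

import org.apache.spark.ShuffleDependency

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))
val mapped  = pairs.mapValues(_ + 1)     // narrow dependency
val reduced = pairs.reduceByKey(_ + _)   // wide (shuffle) dependency

println(mapped.dependencies.head.getClass.getSimpleName)                     // OneToOneDependency
println(reduced.dependencies.head.isInstanceOf[ShuffleDependency[_, _, _]])  // true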
Take reduceByKey as an example. When reduceByKey is called without an explicit partitioner, a HashPartitioner is used and the call is delegated to combineByKeyWithClassTag, implemented as follows:
def combineByKeyWithClassTag[C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  // ...
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  if (self.partitioner == Some(partitioner)) {
    // ...
  } else {
    // create a ShuffledRDD
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}
The key point is that a new ShuffledRDD is created from the serializer, the aggregator, and the other parameters that were passed in.
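For intuition (again a sketch assuming a SparkContext named sc, not Spark source), reduceByKey(_ + _) is simply the combiner API shown above with the identity function as createCombiner and the reduce function used for both merge steps; both forms end up building a ShuffledRDD:

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

val viaReduce  = pairs.reduceByKey(_ + _)
val viaCombine = pairs.combineByKey(
  (v: Int) => v,                   // createCombiner: lift a value into a combiner
  (c: Int, v: Int) => c + v,       // mergeValue: map-side combine
  (c1: Int, c2: Int) => c1 + c2)   // mergeCombiners: reduce-side combine

// viaReduce and viaCombine produce the same (key, sum) pairs.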
2 Stage division by wide dependencies
An action is what triggers the actual execution.
Take count as an example:
def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum
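In other words, count just runs a size function over every partition via runJob and sums the per-partition results on the driver. The same pattern can be used for a hand-rolled action (a sketch, assuming a SparkContext named sc):

val rdd = sc.parallelize(1 to 1000, numSlices = 4)
// one task per partition; the per-partition sizes come back to the driver as an Array[Long]
val sizes: Array[Long] = sc.runJob(rdd, (it: Iterator[Int]) => it.size.toLong)
println(sizes.sum)   // same value as rdd.count()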
When count is called, SparkContext's runJob method is invoked, which in turn calls DAGScheduler's runJob:
/**
 * Run `func` on the given partitions of `rdd` and pass the results to `resultHandler`.
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    resultHandler: (Int, U) => Unit): Unit = {
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  // hand the job over to DAGScheduler.runJob
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}
Inside DAGScheduler, the call looks like this:
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val start = System.nanoTime
  // the core of this method is submitJob
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  ……
}
/**
 * Submit an action job to the scheduler.
 * @param rdd the target RDD
 * @param func the function to run
 * @param partitions the set of partitions of the target RDD to run on
 * @param callSite where in the user program this job was called
 * @param resultHandler callback that receives the results
 * @param properties scheduler properties
 * @return a JobWaiter that can be used to block until the job finishes or to cancel it
 */
def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  // validate the requested partition ids
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
      "Total number of partitions: " + maxPartitions)
  }
  // allocate a new job id
  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    // Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  // post the newly created JobSubmitted event to the event queue
  eventProcessLoop.post(JobSubmitted(
    jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
  waiter
}
eventProcessLoop extends EventLoop, which internally maintains an event queue.
When the queue is not empty, the JobSubmitted event is dispatched to handleJobSubmitted:
private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
  case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
    dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
  ……
}
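Spark's EventLoop class is internal, but the pattern it implements is straightforward. The following is only a minimal sketch of that pattern (a daemon thread draining a blocking queue and dispatching each event to onReceive), not the actual Spark implementation:

import java.util.concurrent.LinkedBlockingDeque

abstract class SimpleEventLoop[E](name: String) {
  private val eventQueue = new LinkedBlockingDeque[E]()

  private val eventThread = new Thread(name) {
    override def run(): Unit = {
      try {
        while (!Thread.currentThread().isInterrupted) {
          // blocks until an event (e.g. JobSubmitted) is available, then dispatches it
          onReceive(eventQueue.take())
        }
      } catch {
        case _: InterruptedException => // stop() interrupts the thread; just exit
      }
    }
  }
  eventThread.setDaemon(true)

  def start(): Unit = eventThread.start()
  def stop(): Unit = eventThread.interrupt()

  // analogous to eventProcessLoop.post(JobSubmitted(...)) in submitJob
  def post(event: E): Unit = eventQueue.put(event)

  // the DAGScheduler's version dispatches to handleJobSubmitted via doOnReceive
  protected def onReceive(event: E): Unit
}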
// Converts a job into stages; every job has a finalStage, which is a ResultStage
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  var finalStage: ResultStage = null
  try {
    // create the result stage; this may throw
    finalStage = newResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    // exception handling
  }
  // create a new ActiveJob from the finalStage and jobId
  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  val jobSubmissionTime = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = job    // bind the job to its id
  activeJobs += job
  finalStage.setActiveJob(job)     // bind the job to the final stage as well
  val stageIds = jobIdToStageIds(jobId).toArray   // all stage ids belonging to this job
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  listenerBus.post(
    SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
  submitStage(finalStage)   // submit stages starting from the final stage
  submitWaitingStages()
}
/** Submits a stage, but first recursively submits any missing ancestor stages. */
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      // find missing parent stages; this dependency walk is what divides the stages
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        // all ancestor stages are available, so submit this stage's tasks
        submitMissingTasks(stage, jobId.get)
      } else {
        for (parent <- missing) {
          // submit the missing ancestor stages first
          submitStage(parent)
        }
        // while the ancestors are being computed, mark this stage as waiting
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
// The basis for stage division
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missing = new HashSet[Stage]
  val visited = new HashSet[RDD[_]]
  // a stack of RDDs that still need to be visited
  val waitingForVisit = new Stack[RDD[_]]
  def visit(rdd: RDD[_]) {
    if (!visited(rdd)) {
      visited += rdd
      // does the RDD have any partition that is not cached?
      val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
      if (rddHasUncachedPartitions) {
        // walk over the dependencies of this RDD
        for (dep <- rdd.dependencies) {
          dep match {
            case shufDep: ShuffleDependency[_, _, _] =>
              // a wide (shuffle) dependency yields a new ShuffleMapStage
              val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)
              if (!mapStage.isAvailable) {
                missing += mapStage
              }
            case narrowDep: NarrowDependency[_] =>
              // a narrow dependency: keep traversing within the same stage
              waitingForVisit.push(narrowDep.rdd)
          }
        }
      }
    }
  }
  waitingForVisit.push(stage.rdd)
  while (waitingForVisit.nonEmpty) {
    visit(waitingForVisit.pop())
  }
  missing.toList
}
PS: getCacheLocs returns the cache locations of each partition of the RDD (Nil for any partition that has not been cached).
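Putting the division rule together: a job with one shuffle ends up with two stages, and toDebugString makes the boundary visible in the lineage. A sketch (assuming a SparkContext named sc):

val words  = sc.parallelize(Seq("a", "b", "a", "c"), numSlices = 2)
val counts = words.map(w => (w, 1)).reduceByKey(_ + _)   // map is narrow, reduceByKey is wide

println(counts.toDebugString)
// The ShuffledRDD sits on its own indentation level: parallelize + map form a
// ShuffleMapStage, and everything after the shuffle belongs to the ResultStage.

counts.collect()   // the action: one job, two stages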
3 Task allocation: DAGScheduler.submitMissingTasks
private def submitMissingTasks(stage: Stage, jobId: Int) {
  stage.pendingPartitions.clear()
  // figure out which partitions still need to be computed
  val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()
  val properties = jobIdToActiveJob(jobId).properties
  runningStages += stage   // the stage switches to the running state
  // register the starting stage with the output commit coordinator
  stage match {
    case s: ShuffleMapStage =>
      outputCommitCoordinator.stageStart(stage = s.id, maxPartitionId = s.numPartitions - 1)
    case s: ResultStage =>
      outputCommitCoordinator.stageStart(
        stage = s.id, maxPartitionId = s.rdd.partitions.length - 1)
  }

  // compute the preferred locations for each partition id
  val taskIdToLocations: Map[Int, Seq[TaskLocation]] = try {
    stage match {
      case s: ShuffleMapStage =>
        partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id)) }.toMap
      case s: ResultStage =>
        val job = s.activeJob.get
        partitionsToCompute.map { id =>
          val p = s.partitions(id)
          (id, getPreferredLocs(stage.rdd, p))
        }.toMap
    }
  } catch {
    // exception handling
  }

  stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
  listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

  var taskBinary: Broadcast[Array[Byte]] = null
  try {
    // serialize the task closure and broadcast it
    val taskBinaryBytes: Array[Byte] = stage match {
      case stage: ShuffleMapStage =>
        closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
      case stage: ResultStage =>
        closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array()
    }
    taskBinary = sc.broadcast(taskBinaryBytes)
  } catch {
    // exception handling
  }

  // create one task per partition; the task type depends on the stage type
  val tasks: Seq[Task[_]] = try {
    stage match {
      case stage: ShuffleMapStage =>
        partitionsToCompute.map { id =>
          val locs = taskIdToLocations(id)
          val part = stage.rdd.partitions(id)
          new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, stage.internalAccumulators)
        }
      case stage: ResultStage =>
        val job = stage.activeJob.get
        partitionsToCompute.map { id =>
          val p: Int = stage.partitions(id)
          val part = stage.rdd.partitions(p)
          val locs = taskIdToLocations(id)
          new ResultTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, id, stage.internalAccumulators)
        }
    }
  } catch {
    // exception handling
  }

  if (tasks.size > 0) {
    // a non-empty TaskSet is handed over to the TaskScheduler
    taskScheduler.submitTasks(new TaskSet(
      tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
    stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
  } else {
    // mark the stage as finished
  }
}
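From the user's side, the effect of this method is that a stage produces exactly one task per partition, and getPreferredLocs is fed by each partition's preferred locations (for example HDFS block locations). A driver-side sketch (assuming a SparkContext named sc; the path is only an illustration):

val data = sc.textFile("hdfs:///tmp/input", minPartitions = 4)
println(data.getNumPartitions)   // the number of tasks the corresponding stage will contain
data.partitions.foreach { p =>
  println(s"partition ${p.index} prefers ${data.preferredLocations(p).mkString(", ")}")
}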
Summary:
1. Spark uses lazy evaluation: computation only starts when an action such as count/show/reduce is called. Each action produces a job, and every job has exactly one ResultStage.
2. During execution, the DAG is built from the RDD lineage, and the DAGScheduler splits it into stages at wide-dependency (shuffle) boundaries.
3. For each stage, the DAGScheduler creates one task per partition and hands the resulting TaskSet over to the TaskScheduler.