In the Spark internals, the two most important pieces of source code are the Stage division and Task allocation algorithms. I had studied the 1.6 source before, and while reading 2.2 I found that many methods have changed, so I am sharing my analysis of the 2.2 source here in the hope that we can learn from each other.
Everything starts with an action operator calling the runJob() method; take foreach as an example:
def foreach(f: T => Unit): Unit = withScope {
  val cleanF = sc.clean(f)
  sc.runJob(this, (iter: Iterator[T]) => iter.foreach(cleanF))
}
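For context, here is a minimal driver program (the names and the local master setting are just for illustration) in which the final foreach is the action that triggers sc.runJob, while reduceByKey introduces the shuffle that will later split the job into one ShuffleMapStage plus one ResultStage:

import org.apache.spark.{SparkConf, SparkContext}

object ForeachDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("foreach-demo").setMaster("local[2]"))
    val counts = sc.parallelize(Seq("a", "b", "a"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)     // wide (shuffle) dependency => one ShuffleMapStage
    counts.foreach(println)   // action => sc.runJob => DAGScheduler => one ResultStage
    sc.stop()
  }
}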
Keep stepping into runJob() until you reach the SparkContext.runJob() overload below, which finally hands the job off to the DAGScheduler:
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    resultHandler: (Int, U) => Unit): Unit = {
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  // Enter the heart of Spark: the DAGScheduler
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
  progressBar.foreach(_.finishAll())
}
Next, step into DAGScheduler.runJob(), which submits the job:
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val start = System.nanoTime
  // submitJob submits the job and returns a JobWaiter; the calling thread then
  // blocks on its completionFuture until the job finishes
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)
  // Log a different message depending on whether the job succeeded or failed
  waiter.completionFuture.value.get match {
    case scala.util.Success(_) =>
      logInfo("Job %d finished: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
    case scala.util.Failure(exception) =>
      logInfo("Job %d failed: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
      val callerStackTrace = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
      throw exception
  }
}
Click into submitJob() to reach the method that actually submits the job:
def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
      "Total number of partitions: " + maxPartitions)
  }
  // Allocate a new jobId (incremented by 1) to identify this job
  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    // Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }
  // Sanity check: at this point there must be at least one partition
  assert(partitions.size > 0)
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  // Construct a JobWaiter that the calling thread blocks on until the job completes;
  // finished task results are forwarded to resultHandler
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  // eventProcessLoop is the DAGScheduler's event queue.
  // Several jobs may be running in the cluster at the same time, and the DAGScheduler
  // processes their events in FIFO order.
  // The event posted here is a JobSubmitted; eventProcessLoop's doOnReceive will
  // pattern-match on the event type and eventually dispatch to
  // dagScheduler.handleJobSubmitted(...)
  eventProcessLoop.post(JobSubmitted(
    jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
  waiter
}
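Before following the posted event onwards, it is worth noting what the returned JobWaiter is: essentially a job listener backed by a promise. Each successfully finished partition reports back to it, and completionFuture completes once all partitions have succeeded (or fails on the first error), which is what ThreadUtils.awaitReady blocks on in runJob. Below is a simplified, self-contained sketch of that pattern; it is not the real class (which lives in org.apache.spark.scheduler), just the idea:

import java.util.concurrent.atomic.AtomicInteger
import scala.concurrent.{Future, Promise}

// Simplified stand-in for the idea behind org.apache.spark.scheduler.JobWaiter
class SimpleJobWaiter[U](totalTasks: Int, resultHandler: (Int, U) => Unit) {
  private val finishedTasks = new AtomicInteger(0)
  private val promise = Promise[Unit]()
  if (totalTasks == 0) promise.success(())      // a 0-task job is finished immediately

  // DAGScheduler.runJob blocks on this future via ThreadUtils.awaitReady
  def completionFuture: Future[Unit] = promise.future

  // Called once per successfully finished partition
  def taskSucceeded(index: Int, result: U): Unit = {
    resultHandler(index, result)
    if (finishedTasks.incrementAndGet() == totalTasks) promise.success(())
  }

  // Any failure fails the whole job future, which makes runJob rethrow the exception
  def jobFailed(exception: Exception): Unit = promise.tryFailure(exception)
}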
The post call above goes to eventProcessLoop, an instance of DAGSchedulerEventProcessLoop. DAGSchedulerEventProcessLoop extends EventLoop, and EventLoop holds an event queue (eventQueue): every posted event is appended to the queue, and a dedicated thread keeps taking events off the queue and handling them with onReceive. DAGSchedulerEventProcessLoop's onReceive delegates to doOnReceive, which pattern-matches on the event type; a JobSubmitted event matches handleJobSubmitted.
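EventLoop itself is small: a blocking queue plus one daemon thread that drains it and passes each event to onReceive. Here is a stripped-down sketch of that pattern (illustrative only; the real org.apache.spark.util.EventLoop also handles errors and shutdown more carefully):

import java.util.concurrent.LinkedBlockingDeque

// Stripped-down sketch of the EventLoop pattern behind DAGSchedulerEventProcessLoop
abstract class SimpleEventLoop[E](name: String) {
  private val eventQueue = new LinkedBlockingDeque[E]()
  @volatile private var stopped = false

  private val eventThread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = {
      try {
        while (!stopped) {
          val event = eventQueue.take()   // blocks until someone posts an event
          onReceive(event)                // the subclass pattern-matches here (doOnReceive)
        }
      } catch {
        case _: InterruptedException =>   // stop() was called, exit quietly
      }
    }
  }

  def start(): Unit = eventThread.start()
  def post(event: E): Unit = eventQueue.put(event)
  def stop(): Unit = { stopped = true; eventThread.interrupt() }

  protected def onReceive(event: E): Unit
}

Back in the real code, doOnReceive dispatches the received event as follows: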
private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
  case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
    // Note: this is where the essence of stage division begins
    dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
  case MapStageSubmitted(jobId, dependency, callSite, listener, properties) =>
    dagScheduler.handleMapStageSubmitted(jobId, dependency, callSite, listener, properties)
  case StageCancelled(stageId, reason) =>
    dagScheduler.handleStageCancellation(stageId, reason)
  case JobCancelled(jobId, reason) =>
    dagScheduler.handleJobCancellation(jobId, reason)
  case JobGroupCancelled(groupId) =>
    dagScheduler.handleJobGroupCancelled(groupId)
  case AllJobsCancelled =>
    dagScheduler.doCancelAllJobs()
  case ExecutorAdded(execId, host) =>
    dagScheduler.handleExecutorAdded(execId, host)
  case ExecutorLost(execId, reason) =>
    val filesLost = reason match {
      case SlaveLost(_, true) => true
      case _ => false
    }
    dagScheduler.handleExecutorLost(execId, filesLost)
  // ... remaining cases omitted
}
handleJobSubmitted creates the ResultStage, i.e. the final stage of the job:
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  // Create the ResultStage; this is where the submitted job actually starts being
  // divided into stages
  var finalStage: ResultStage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    // It walks backwards from the final RDD, recursively visiting each parent RDD,
    // reusing persisted data where available and recomputing otherwise.
    // Side note: stages come in two kinds, ShuffleMapStage and ResultStage;
    // every job consists of exactly one ResultStage and zero or more ShuffleMapStages.
    finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }
createResultStage in turn calls getOrCreateParentStages to obtain the parent stages of the final stage:
private def createResultStage(
    rdd: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    jobId: Int,
    callSite: CallSite): ResultStage = {
  // First create the parent stages of the ResultStage.
  // Internally this extracts the shuffle dependencies and creates the corresponding
  // ShuffleMapStages in a loop; if there is no shuffle at all it returns an empty List.
  val parents = getOrCreateParentStages(rdd, jobId)
  // Allocate the next stage id (incremented by 1)
  val id = nextStageId.getAndIncrement()
  // Build the ResultStage from the freshly created parent stages and the other core parameters
  val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite)
  // Register the ResultStage and its id in stageIdToStage
  stageIdToStage(id) = stage
  // Update jobIds and jobIdToStageIds
  updateJobIdStageIdMaps(jobId, stage)
  // Return the ResultStage
  stage
}
getOrCreateParentStages in turn calls getOrCreateShuffleMapStage: since every stage except the last one is a ShuffleMapStage, this is the method that creates the parent stages.
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
  // Start with getShuffleDependencies, which only extracts the *direct* shuffle
  // dependencies of the current RDD (a job's stages are divided at shuffle
  // boundaries: one job produces zero or more ShuffleMapStages plus one ResultStage).
  // If a dependency is not a ShuffleDependency, the traversal continues into the
  // parent RDD, and so on, until a shuffle dependency is found or none is left.
  getShuffleDependencies(rdd).map { shuffleDep =>
    getOrCreateShuffleMapStage(shuffleDep, firstJobId)
  }.toList
}
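getShuffleDependencies itself is not shown above, but its job is a traversal of the dependency graph that stops at the first shuffle boundary on each path: narrow dependencies are walked through, shuffle dependencies are collected and not walked past. The following is a self-contained toy model of that traversal; the Node and Dep types are made up for illustration, and only the stopping rule mirrors the real method:

import scala.collection.mutable

object ShuffleDepDemo {
  // Toy dependency graph: a node depends on its parents either narrowly or through a shuffle
  sealed trait Dep { def parent: Node }
  case class Narrow(parent: Node) extends Dep
  case class Shuffle(parent: Node) extends Dep
  case class Node(name: String, deps: Seq[Dep] = Nil)

  // Collect only the *nearest* shuffle dependencies, which is what
  // getShuffleDependencies does for real RDDs
  def nearestShuffleDeps(root: Node): Set[Shuffle] = {
    val found = mutable.Set[Shuffle]()
    val visited = mutable.Set[Node]()
    val toVisit = mutable.Stack(root)
    while (toVisit.nonEmpty) {
      val node = toVisit.pop()
      if (visited.add(node)) {
        node.deps.foreach {
          case s: Shuffle => found += s       // a stage boundary: record it, stop walking this path
          case Narrow(p)  => toVisit.push(p)  // a narrow dependency: keep walking upwards
        }
      }
    }
    found.toSet
  }

  def main(args: Array[String]): Unit = {
    // Example lineage: textFile -> map (narrow) -> reduceByKey (shuffle) -> map (narrow)
    val textFile = Node("textFile")
    val mapped   = Node("map", Seq(Narrow(textFile)))
    val reduced  = Node("reduceByKey", Seq(Shuffle(mapped)))
    val finalRdd = Node("finalMap", Seq(Narrow(reduced)))
    println(nearestShuffleDeps(finalRdd))     // prints only the reduceByKey shuffle
  }
}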
getOrCreateShuffleMapStage is where ShuffleMapStages actually get created: if a ShuffleMapStage already exists for the given shuffle dependency it is simply returned; otherwise the method first creates any missing ancestor ShuffleMapStages and then creates the ShuffleMapStage for this (wide) dependency.
private def getOrCreateShuffleMapStage(
    shuffleDep: ShuffleDependency[_, _, _],
    firstJobId: Int): ShuffleMapStage = {
  // Look up shuffleIdToMapStage using the shuffleId taken from the ShuffleDependency
  shuffleIdToMapStage.get(shuffleDep.shuffleId) match {
    // If a ShuffleMapStage already exists, return it directly
    case Some(stage) =>
      stage
    // Otherwise find all missing ancestor ShuffleDependencies and build their ShuffleMapStages
    case None =>
      // Create stages for all missing ancestor shuffle dependencies.
      getMissingAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep =>
        // Even though getMissingAncestorShuffleDependencies only returns shuffle dependencies
        // that were not already in shuffleIdToMapStage, it's possible that by the time we
        // get to a particular dependency in the foreach loop, it's been added to
        // shuffleIdToMapStage by the stage creation process for an earlier dependency. See
        // SPARK-13902 for more information.
        // Create a ShuffleMapStage for each ancestor ShuffleDependency found,
        // skipping any that has been registered in the meantime
        if (!shuffleIdToMapStage.contains(dep.shuffleId)) {
          createShuffleMapStage(dep, firstJobId)
        }
      }
      // Finally, create a stage for the given shuffle dependency.
      createShuffleMapStage(shuffleDep, firstJobId)
  }
}
At this point we have the parent stages of finalStage, so finalStage itself can be created. As the earlier code showed, this step may throw: if the HDFS data the job reads has been modified or deleted, stage creation fails. Once finalStage has been created, handleJobSubmitted logs some information:
logInfo("Got job %s (%s) with %d output partitions".format(
job.jobId, callSite.shortForm, partitions.length))
logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
logInfo("Parents of final stage: " + finalStage.parents)
//TODO 接下来开始进行最重要的操作,就是执行Stage划分算法
logInfo("Missing parents: " + getMissingParentStages(finalStage))
The job then goes through a series of bookkeeping steps that set the scene for submitting stages:
val jobSubmissionTime = clock.getTimeMillis()
// jobIdToActiveJob is a HashMap that maps each jobId to its ActiveJob
jobIdToActiveJob(jobId) = job
// activeJobs is a HashSet holding all ActiveJobs
activeJobs += job
// As soon as finalStage exists, the ActiveJob wrapping it is registered on its _activeJob field
finalStage.setActiveJob(job)
// Extract all stage ids belonging to this job and turn them into an array
val stageIds = jobIdToStageIds(jobId).toArray
// Extract the latest attempt info of each stage; it is sent to SparkListeners when the job starts
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
  SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
// Now the stages are submitted, starting from the final stage
submitStage(finalStage)
submitStage first checks whether the given stage still has unsubmitted parent stages. If it does, it calls itself on each missing parent and parks the current stage in waitingStages; the stage is only submitted once all of its parents have been. If a stage has no missing parents, submitMissingTasks is called to submit its tasks, and the stage counts as submitted once its tasks are. Note that after a stage's tasks have been submitted and completed, submitWaitingChildStages is called to submit its child stages: it picks the children out of waitingStages, sorts them by id, and calls submitStage on each of them. In other words, stage submission is a recursive, parents-first process.
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        // Once all parent stages have been divided and submitted, submit this stage's tasks
        submitMissingTasks(stage, jobId.get)
      } else {
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
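To make the recursion concrete, here is a self-contained toy model of the parents-first submission described above. ToyStage and the synchronous "finish" are simplifications: in the real scheduler a stage only finishes when its tasks report back asynchronously, and submitWaitingChildStages is then invoked from the task-completion path.

import scala.collection.mutable

// Toy model of the DAGScheduler's parents-first stage submission
case class ToyStage(id: Int, parents: Seq[ToyStage]) {
  var finished: Boolean = false
}

object StageSubmissionDemo {
  private val waiting = mutable.Set[ToyStage]()

  def submitStage(stage: ToyStage): Unit = {
    if (stage.finished || waiting(stage)) return    // mirrors the waiting/running guards
    val missing = stage.parents.filterNot(_.finished).sortBy(_.id)
    if (missing.isEmpty) {
      println(s"running tasks of stage ${stage.id}")
      stage.finished = true                         // pretend its tasks finished right away
      submitWaitingChildStages(stage)               // wake up children parked on this stage
    } else {
      waiting += stage                              // park this stage until its parents finish
      missing.foreach(submitStage)                  // submit the unfinished parents first
    }
  }

  private def submitWaitingChildStages(parent: ToyStage): Unit = {
    val children = waiting.filter(_.parents.contains(parent)).toSeq.sortBy(_.id)
    waiting --= children
    children.foreach(submitStage)
  }

  def main(args: Array[String]): Unit = {
    val shuffleMap0 = ToyStage(0, Nil)
    val shuffleMap1 = ToyStage(1, Nil)
    val resultStage = ToyStage(2, Seq(shuffleMap0, shuffleMap1))
    submitStage(resultStage)    // prints stage 0, stage 1, then stage 2
  }
}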
Inside submitMissingTasks, the number of partitions of the stage determines the number of tasks, because each partition is computed by exactly one task; a task is then generated for every partition that needs computing:
val tasks: Seq[Task[_]] = try {
  val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
  stage match {
    // When the stage is a ShuffleMapStage, ShuffleMapTasks are created
    case stage: ShuffleMapStage =>
      // First make sure pendingPartitions is empty.
      // pendingPartitions holds the partitions whose tasks have not finished yet;
      // entries are removed as tasks complete, and the DAGScheduler uses it to decide
      // whether this stage is done.
      stage.pendingPartitions.clear()
      // Iterate over every partition that needs to be computed
      partitionsToCompute.map { id =>
        // Get the preferred locations of this partition
        val locs = taskIdToLocations(id)
        // Get the corresponding partition of the RDD this stage is based on
        val part = stage.rdd.partitions(id)
        // Mark the partition as pending
        stage.pendingPartitions += id
        // Build a ShuffleMapTask; its runTask will later be invoked through this object
        new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
          taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
          Option(sc.applicationId), sc.applicationAttemptId)
      }
    // When the stage is a ResultStage, ResultTasks are created instead
    case stage: ResultStage =>
      partitionsToCompute.map { id =>
        val p: Int = stage.partitions(id)
        val part = stage.rdd.partitions(p)
        val locs = taskIdToLocations(id)
        // Likewise wrap up the parameters and create a ResultTask
        new ResultTask(stage.id, stage.latestInfo.attemptId,
          taskBinary, part, locs, id, properties, serializedTaskMetrics,
          Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
      }
  }
} catch {
  case NonFatal(e) =>
    abortStage(stage, s"Task creation failed: $e\n${Utils.exceptionString(e)}", Some(e))
    runningStages -= stage
    return
}
The tasks are then wrapped into a TaskSet and submitted. At this point the DAGScheduler's work is done, and the TaskScheduler's task-allocation algorithm takes over:
if (tasks.size > 0) {
  logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " +
    s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})")
  // Hand all tasks of this stage to the TaskScheduler as one TaskSet
  taskScheduler.submitTasks(new TaskSet(
    tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties))
  stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
} else {
  // Because we posted SparkListenerStageSubmitted earlier, we should mark
  // the stage as completed here in case there are no tasks to run
  markStageAsFinished(stage, None)
  // ... after which submitWaitingChildStages(stage) submits any child stages
  // that were waiting on this one
}
In the next post I will walk through the source of the task-allocation algorithm. This is my personal understanding; if anything here is wrong, I would appreciate a correction. Click here to continue => Task allocation source code analysis.
Written by JiaMingcan.
Please credit JiaMingcan when reposting.