这里以count操作为例,一步步解析Spark在执行一个Job时如何进行DAG图的解析。Spark在遇到Action类型算子时,会使用SparkContext进行一系列的runJob方法调用,最终会调用DAGScheduler的runJob方法来划分DAG图。
// Returns the number of elements in this RDD.
// As an action, calling it submits a job through the SparkContext: each
// task measures the size of its partition's iterator, and the resulting
// per-partition sizes are summed on the driver.
def count(): Long = {
  val partitionSizes = sc.runJob(this, Utils.getIteratorSize _)
  partitionSizes.sum
}
/**
 * Runs a job on every partition of the given RDD and returns the
 * per-partition results as an array.
 *
 * @param rdd  the target RDD
 * @param func the function applied to each partition's iterator
 */
def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U): Array[U] = {
  // `indices` is exactly the range 0 until rdd.partitions.length.
  runJob(rdd, func, rdd.partitions.indices)
}
/**
 * Runs a job over the specified partitions of an RDD, applying a function
 * that does not need access to the TaskContext.
 *
 * @param rdd        the target RDD
 * @param func       the function applied to each partition's iterator
 * @param partitions the partition indices to compute
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: Iterator[T] => U,
    partitions: Seq[Int]): Array[U] = {
  // Clean the closure so it can be serialized and shipped to executors,
  // then adapt it to the (TaskContext, Iterator) shape expected below.
  val processPartition = clean(func)
  runJob(rdd, (_: TaskContext, iter: Iterator[T]) => processPartition(iter), partitions)
}
/**
 * Runs a job over the specified partitions and collects each partition's
 * result into an array indexed by the partition's position in `partitions`.
 *
 * @param rdd        the target RDD
 * @param func       the function applied to each partition, given its TaskContext
 * @param partitions the partition indices to compute
 */
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int]): Array[U] = {
  // Pre-allocate one slot per requested partition; the result handler
  // fills in the slot that corresponds to each finished partition.
  val partitionResults = new Array[U](partitions.size)
  runJob[T, U](rdd, func, partitions, (idx, value) => partitionResults(idx) = value)
  partitionResults
}
/**
 * Runs a job over a set of partitions of an RDD and passes each partition's
 * result to a handler. This is the most general runJob overload; the other
 * overloads eventually delegate to it.
 *
 * @param rdd           the target RDD
 * @param func          the function applied to each partition, given its TaskContext
 * @param partitions    the partition indices to compute
 * @param resultHandler callback invoked with (partition index, result)
 */
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
resultHandler: (Int, U) => Unit): Unit = {
// Refuse to run jobs on a SparkContext that has been stopped.
if (stopped.get()) {
throw new IllegalStateException("SparkContext has been shutdown")
}
val callSite = getCallSite
// Clean the closure so it can be serialized to executors.
val cleanedFunc = clean(func)
logInfo("Starting job: " + callSite.shortForm)
if (conf.getBoolean("spark.logLineage", false)) {
logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
}
// Hand the job to the DAGScheduler, which performs the logical scheduling:
// it splits the job into task sets for stages linked by their dependencies.
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
progressBar.foreach(_.finishAll())
// Checkpoint the RDD's data if requested; this runs as an additional job.
rdd.doCheckpoint()
}
1、runJob方法,该方法主要使用DAGScheduler来对Job进行逻辑提交执行。
/**
 * Runs a job through the DAGScheduler and blocks the calling thread until
 * the job finishes, rethrowing the job's exception on failure.
 */
def runJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): Unit = {
val start = System.nanoTime
// Submit the job and then wait for its outcome. The submitting thread
// blocks here until the job's result is available, and then branches on
// success or failure for the follow-up handling.
val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
val awaitPermission = null.asInstanceOf[scala.concurrent.CanAwait]
waiter.completionFuture.ready(Duration.Inf)(awaitPermission)
waiter.completionFuture.value.get match {
case scala.util.Success(_) =>
logInfo("Job %d finished: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
case scala.util.Failure(exception) =>
logInfo("Job %d failed: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
// Append the caller's stack trace so the rethrown exception points at
// the user code that triggered the job, not only the scheduler thread.
val callerStackTrace = Thread.currentThread().getStackTrace.tail
exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
throw exception
}
}
2、submitJob方法,使用事件驱动设计。将Job封装成一个JobSubmitted事件,然后利用DAGSchedulerEventProcessLoop提交该事件。
/**
 * Submits a job asynchronously: validates the requested partitions, wraps
 * the job in a JobSubmitted event, posts it to the scheduler's event loop,
 * and returns a JobWaiter the caller can block on for the result.
 */
def submitJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): JobWaiter[U] = {
// Validate the requested partition indices against the RDD's partition
// count; at execution time each partition maps to one task.
val maxPartitions = rdd.partitions.length
partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
throw new IllegalArgumentException(
"Attempting to access a non-existent partition: " + p + ". " +
"Total number of partitions: " + maxPartitions)
}
// Allocate a unique job id.
val jobId = nextJobId.getAndIncrement()
if (partitions.size == 0) {
// Return immediately if the job is running 0 tasks
return new JobWaiter[U](this, jobId, 0, resultHandler)
}
assert(partitions.size > 0)
val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
// Wrap the submission in a JobSubmitted event and post it to the event
// processing loop; all further handling happens on the event loop thread.
eventProcessLoop.post(JobSubmitted(
jobId, rdd, func2, partitions.toArray, callSite, waiter,
SerializationUtils.clone(properties)))
waiter
}
3、DAGSchedulerEventProcessLoop中对提交的JobSubmitted事件进行处理。此时在处理方法中会生成ResultStage以及所有的ShuffleMapStage。
/**
 * EventLoop callback invoked for every DAGSchedulerEvent posted to the
 * queue. DAGSchedulerEventProcessLoop overrides EventLoop's onReceive to
 * time the handling of each event before delegating to doOnReceive.
 */
override def onReceive(event: DAGSchedulerEvent): Unit = {
  // Start the timer before dispatching, and stop it even if handling throws.
  val timingCtx = timer.time()
  try doOnReceive(event)
  finally timingCtx.stop()
}
/**
 * Dispatches a DAGSchedulerEvent to the matching DAGScheduler handler.
 * Only the JobSubmitted case is shown in this excerpt.
 */
private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
// A job submission event: forward all event fields to handleJobSubmitted,
// which builds the stage graph and starts scheduling the job.
case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
}
/**
 * Handles a JobSubmitted event: builds the stage graph for the job (the
 * ResultStage plus every ShuffleMapStage it depends on), registers the job
 * in the scheduler's bookkeeping, and kicks off stage submission.
 */
private[scheduler] def handleJobSubmitted(jobId: Int,
finalRDD: RDD[_],
func: (TaskContext, Iterator[_]) => _,
partitions: Array[Int],
callSite: CallSite,
listener: JobListener,
properties: Properties) {
var finalStage: ResultStage = null
try {
// Create the final ResultStage from the last RDD of the DAG (finalRDD)
// and register it in the DAGScheduler's internal caches. Inside
// createResultStage all required ShuffleMapStages are created as well.
finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
} catch {
// Stage creation can fail; report the failure to the job listener and
// abandon the job instead of propagating the exception.
case e: Exception =>
logWarning("Creating new stage failed due to exception - job: " + jobId, e)
listener.jobFailed(e)
return
}
// Wrap the final stage in an ActiveJob representing this submission.
val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
clearCacheLocs()
logInfo("Got job %s (%s) with %d output partitions".format(
job.jobId, callSite.shortForm, partitions.length))
logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
logInfo("Parents of final stage: " + finalStage.parents)
logInfo("Missing parents: " + getMissingParentStages(finalStage))
// Register the job in the scheduler's internal caches.
val jobSubmissionTime = clock.getTimeMillis()
jobIdToActiveJob(jobId) = job
activeJobs += job
finalStage.setActiveJob(job)
val stageIds = jobIdToStageIds(jobId).toArray
val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
listenerBus.post(
SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
// Submit the final stage; submitStage recursively submits any missing
// parent stages first, so the first un-executed stages start running while
// the remaining stages wait in the waiting queue.
submitStage(finalStage)
}
4、生成ResultStage以及所有的ShuffleMapStage。实际上在createResultStage方法中会生成所有的Stage。下面将分析该方法的执行逻辑。
/**
 * Builds the final ResultStage for a job. As a side effect this also
 * materializes every ShuffleMapStage the final RDD transitively depends on.
 *
 * @param rdd        the final RDD of the job's DAG
 * @param func       the function to run on each partition
 * @param partitions the partition indices the job computes
 * @param jobId      id of the job the stage belongs to
 * @param callSite   user code location that triggered the job
 */
private def createResultStage(rdd: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    jobId: Int,
    callSite: CallSite): ResultStage = {
  // Resolve (or create) all parent shuffle stages of the final RDD first.
  val parentStages = getOrCreateParentStages(rdd, jobId)
  val stageId = nextStageId.getAndIncrement()
  // The ResultStage wraps the final RDD together with its parent stages.
  val resultStage = new ResultStage(stageId, rdd, func, partitions, parentStages, jobId, callSite)
  stageIdToStage(stageId) = resultStage
  updateJobIdStageIdMaps(jobId, resultStage)
  resultStage
}
/**
 * Returns the list of parent stages of the given RDD: one ShuffleMapStage
 * per direct shuffle dependency, created on demand if not yet registered.
 */
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
// getShuffleDependencies yields only the shuffle dependencies immediately
// reachable from this RDD (without crossing another shuffle boundary).
getShuffleDependencies(rdd).map { shuffleDep =>
// Look up the ShuffleMapStage for this dependency, creating it (together
// with any missing ancestor shuffle stages) if necessary.
getOrCreateShuffleMapStage(shuffleDep, firstJobId)
}.toList
}
/**
 * Returns the shuffle dependencies that are immediate parents of the given
 * RDD: the first ShuffleDependency found on each lineage path, without
 * crossing any other shuffle boundary.
 */
private[scheduler] def getShuffleDependencies(rdd: RDD[_]):
HashSet[ShuffleDependency[_, _, _]] = {
  // Shuffle dependencies discovered so far.
  val found = new HashSet[ShuffleDependency[_, _, _]]
  // RDDs already examined, to avoid revisiting shared lineage.
  val seen = new HashSet[RDD[_]]
  // Work list of RDDs still to examine (depth-first, via an explicit stack).
  val pending = new Stack[RDD[_]]
  pending.push(rdd)
  while (pending.nonEmpty) {
    val current = pending.pop()
    if (!seen(current)) {
      seen += current
      current.dependencies.foreach {
        // A shuffle dependency marks a stage boundary: record it and stop
        // walking further up this particular path.
        case shuffleDep: ShuffleDependency[_, _, _] => found += shuffleDep
        // A narrow dependency stays inside the current stage: keep walking
        // its parent RDD.
        case narrowDep => pending.push(narrowDep.rdd)
      }
    }
  }
  found
}
/**
 * Returns the ShuffleMapStage registered for the given ShuffleDependency,
 * creating it if absent. When a stage has to be created, any missing
 * ancestor shuffle stages are created first, so parents always exist
 * before their children.
 */
private def getOrCreateShuffleMapStage(shuffleDep: ShuffleDependency[_, _, _],
firstJobId: Int): ShuffleMapStage = {
shuffleIdToMapStage.get(shuffleDep.shuffleId) match {
// A stage is already registered for this shuffle: reuse it.
case Some(stage) =>
stage
// No stage yet: create missing ancestor stages first, then this one.
case None =>
// Create stages for all unregistered ancestor shuffle dependencies.
getMissingAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep =>
if (!shuffleIdToMapStage.contains(dep.shuffleId)) {
createShuffleMapStage(dep, firstJobId)
}
}
// Finally, create a ShuffleMapStage for the requested dependency itself.
createShuffleMapStage(shuffleDep, firstJobId)
}
}
/**
 * Finds all ancestor shuffle dependencies of the given RDD that do not yet
 * have a ShuffleMapStage registered in shuffleIdToMapStage.
 */
private def getMissingAncestorShuffleDependencies(rdd: RDD[_]):
Stack[ShuffleDependency[_, _, _]] = {
// Unregistered ancestor shuffle dependencies collected so far.
val ancestors = new Stack[ShuffleDependency[_, _, _]]
val visited = new HashSet[RDD[_]]
val waitingForVisit = new Stack[RDD[_]]
waitingForVisit.push(rdd)
// Walk the lineage backwards, from the given RDD towards its sources.
while (waitingForVisit.nonEmpty) {
val toVisit = waitingForVisit.pop()
if (!visited(toVisit)) {
visited += toVisit
// getShuffleDependencies returns the RDD's direct shuffle dependencies.
getShuffleDependencies(toVisit).foreach { shuffleDep =>
// If the dependency has no registered stage yet, record it and keep
// searching above it for further unregistered shuffle dependencies.
if (!shuffleIdToMapStage.contains(shuffleDep.shuffleId)) {
ancestors.push(shuffleDep)
waitingForVisit.push(shuffleDep.rdd)
}
}
}
}
ancestors
}
5、提交每个Stage。实际上在调用createResultStage之后,一个DAG图中的所有的Stage都已经被创建出来,即ResultStage和ShuffleMapStage,此时需要将Stage进行提交执行。即调用submitStage方法,下面将分析该方法的执行逻辑:
/**
 * Submits stage, but first recursively submits any missing parents.
 * If the stage still has uncomputed parent stages, those are submitted
 * first and this stage is parked in waitingStages until they finish.
 */
private def submitStage(stage: Stage) {
val jobId = activeJobForStage(stage)
if (jobId.isDefined) {
logDebug("submitStage(" + stage + ")")
// Only act on stages that are not already queued, running, or failed.
if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
// Collect the stage's uncomputed parent stages, ordered by stage id.
val missing = getMissingParentStages(stage).sortBy(_.id)
logDebug("missing: " + missing)
// No missing parents: this stage is ready to run now.
if (missing.isEmpty) {
logInfo("Submitting " + stage + " (" + stage.rdd + "), " +
"which has no missing parents")
// Submit the stage's uncomputed tasks as a task set.
submitMissingTasks(stage, jobId.get)
} else {
// Some parent stages are not computed yet: submit them recursively...
for (parent <- missing) {
submitStage(parent)
}
// ...and park this stage until its parents complete.
waitingStages += stage
}
}
} else {
// No active job references this stage, so it cannot be run.
abortStage(stage, "No active job for stage " + stage.id, None)
}
}
/**
 * Returns the parent stages of the given stage whose results are not yet
 * available and therefore must be computed before this stage can run.
 */
private def getMissingParentStages(stage: Stage): List[Stage] = {
val missing = new HashSet[Stage]
val visited = new HashSet[RDD[_]]
// Stack of RDDs still to examine; the manual stack avoids deep recursion
// on long lineage chains.
val waitingForVisit = new Stack[RDD[_]]
// Examines one RDD of the stage's lineage.
def visit(rdd: RDD[_]) {
if (!visited(rdd)) {
visited += rdd
// If every partition of this RDD is cached, its lineage below need not
// be recomputed, so the walk stops here.
val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil)
if (rddHasUncachedPartitions) {
// Examine each of the RDD's dependencies in turn.
for (dep <- rdd.dependencies) {
dep match {
// A shuffle dependency marks a parent-stage boundary: resolve (or
// create) the corresponding ShuffleMapStage and its ancestors via
// getOrCreateShuffleMapStage.
case shufDep: ShuffleDependency[_, _, _] =>
val mapStage = getOrCreateShuffleMapStage(shufDep, stage.firstJobId)
// The parent is missing if its map outputs are not all available.
if (!mapStage.isAvailable) {
missing += mapStage
}
// A narrow dependency stays within this stage: keep walking its RDD.
case narrowDep: NarrowDependency[_] =>
waitingForVisit.push(narrowDep.rdd)
}
}
}
}
}
// Start the traversal from the stage's final RDD.
waitingForVisit.push(stage.rdd)
// Process the stack until every reachable RDD has been examined.
while (waitingForVisit.nonEmpty) {
visit(waitingForVisit.pop())
}
// Return the set of missing parent stages as a list.
missing.toList
}