import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    // WordCount developed in IDEA, running in local mode
    // create the SparkConf object
    val conf = new SparkConf().setMaster("local[3]").setAppName("Word Count")
    val sc = new SparkContext(conf)
    // read the file contents
    val lines = sc.textFile("F:\\MyLearning\\SparkStudy\\A_Spark_Start\\src\\main\\resources\\word.txt")
    // flatten each line into words
    val word: RDD[String] = lines.flatMap(_.split(" "))
    // pair each word with a count of 1
    val words: RDD[(String, Int)] = word.map((_, 1))
    // aggregate the counts by key
    val unit: RDD[(String, Int)] = words.reduceByKey(_ + _)
    // collect to the driver and print
    val result: Array[(String, Int)] = unit.collect()
    result.foreach(println)
    sc.stop()
  }
}
The code above is a simple word count program. Now let's walk through the source code behind each operator:
def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U] = withScope {
val cleanF = sc.clean(f)
new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.flatMap(cleanF))
}
def map[U: ClassTag](f: T => U): RDD[U] = withScope {
val cleanF = sc.clean(f)
new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
}
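Both flatMap and map simply wrap the parent RDD in a MapPartitionsRDD whose compute function applies the cleaned closure to each partition's iterator; nothing runs until an action is triggered. A roughly equivalent formulation through mapPartitions, sketched against the WordCount RDDs above (wordViaPartitions is an illustrative name):
// same effect as lines.flatMap(_.split(" ")): the function is applied to each partition's iterator
val wordViaPartitions: RDD[String] = lines.mapPartitions(iter => iter.flatMap(_.split(" ")))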
//org.apache.spark.rdd.PairRDDFunctions#combineByKeyWithClassTag (reduceByKey delegates here)
//when the RDD is not already partitioned by the target partitioner, this else branch creates a ShuffledRDD
} else {
new ShuffledRDD[K, V, C](self, partitioner)
.setSerializer(serializer)
.setAggregator(aggregator)
.setMapSideCombine(mapSideCombine)
}
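reduceByKey(_ + _) is a thin wrapper over this combine logic. A roughly equivalent call on the WordCount pair RDD, sketched here for illustration (viaCombine is a made-up name):
// same aggregation and same shuffle as words.reduceByKey(_ + _)
val viaCombine: RDD[(String, Int)] =
  words.combineByKey((v: Int) => v, (c: Int, v: Int) => c + v, (c1: Int, c2: Int) => c1 + c2)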
Note: the RDD returned by map and similar operators carries a OneToOneDependency on its parent, i.e. a narrow dependency, whereas the ShuffledRDD above introduces a ShuffleDependency, a wide dependency.
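A quick way to check this from the driver, sketched against the WordCount RDDs above (the class names in the comments are what I would expect, not captured output):
word.dependencies.foreach(d => println(d.getClass.getSimpleName))  // OneToOneDependency
unit.dependencies.foreach(d => println(d.getClass.getSimpleName))  // ShuffleDependency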
//org.apache.spark.rdd.RDD#collect
def collect(): Array[T] = withScope {
val results = sc.runJob(this, (iter: Iterator[T]) => iter.toArray)
Array.concat(results: _*)
}
//org.apache.spark.SparkContext#runJob (one of several overloads)
def runJob[T, U: ClassTag](rdd: RDD[T], func: Iterator[T] => U): Array[U] = {
runJob(rdd, func, 0 until rdd.partitions.length)
}
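collect is only one caller of this overload; it can also be invoked directly to run a function over every partition. A sketch on the WordCount result RDD (perPartitionCounts is just an illustrative name):
// returns one element per partition: here, the number of records in each partition
val perPartitionCounts: Array[Int] = sc.runJob(unit, (it: Iterator[(String, Int)]) => it.size)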
//org.apache.spark.SparkContext#runJob
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: Iterator[T] => U,
partitions: Seq[Int]): Array[U] = {
val cleanedFunc = clean(func)
runJob(rdd, (ctx: TaskContext, it: Iterator[T]) => cleanedFunc(it), partitions)
}
//org.apache.spark.SparkContext#runJob
def runJob[T, U: ClassTag](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
resultHandler: (Int, U) => Unit): Unit = {
if (stopped.get()) {
throw new IllegalStateException("SparkContext has been shutdown")
}
val callSite = getCallSite
val cleanedFunc = clean(func)
logInfo("Starting job: " + callSite.shortForm)
if (conf.getBoolean("spark.logLineage", false)) {
logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
}
//the DAG (directed acyclic graph) scheduler takes over from here
dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
progressBar.foreach(_.finishAll())
rdd.doCheckpoint()
}
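Note the rdd.doCheckpoint() at the very end: any RDD that was marked with checkpoint() before the action is materialized once the job finishes. A sketch (the checkpoint directory is just an example path):
sc.setCheckpointDir("/tmp/spark-checkpoints")  // example path, required before checkpoint()
unit.checkpoint()                              // only marks the RDD for checkpointing
unit.collect()                                 // runJob finishes, then rdd.doCheckpoint() writes the data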
//org.apache.spark.scheduler.DAGScheduler#runJob
def runJob[T, U](
rdd: RDD[T],
func: (TaskContext, Iterator[T]) => U,
partitions: Seq[Int],
callSite: CallSite,
resultHandler: (Int, U) => Unit,
properties: Properties): Unit = {
val start = System.nanoTime
//submit the job and get back a JobWaiter
val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
ThreadUtils.awaitReady(waiter.completionFuture, Duration.Inf)
//awaitReady blocks until the job completes, then the outcome is inspected
waiter.completionFuture.value.get match {
case scala.util.Success(_) =>
logInfo("Job %d finished: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
case scala.util.Failure(exception) =>
logInfo("Job %d failed: %s, took %f s".format
(waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
// SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
val callerStackTrace = Thread.currentThread().getStackTrace.tail
exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
throw exception
}
}
//org.apache.spark.scheduler.DAGScheduler#submitJob
// submitJob posts a JobSubmitted event onto eventProcessLoop
eventProcessLoop.post(JobSubmitted(
jobId, rdd, func2, partitions.toArray, callSite, waiter,
SerializationUtils.clone(properties)))
//eventProcessLoop is in fact a DAGSchedulerEventProcessLoop instance
//to find out what DAGSchedulerEventProcessLoop does, we have to follow the source
private[spark] val eventProcessLoop = new DAGSchedulerEventProcessLoop(this)
//org.apache.spark.scheduler.DAGSchedulerEventProcessLoop
//DAGSchedulerEventProcessLoop extends EventLoop, so keep following the source
private[scheduler] class DAGSchedulerEventProcessLoop(dagScheduler: DAGScheduler)
extends EventLoop[DAGSchedulerEvent]("dag-scheduler-event-loop") with Logging {
//org.apache.spark.util.EventLoop
//EventLoop holds a BlockingQueue from the java.util.concurrent package
//the concrete implementation is LinkedBlockingDeque, a blocking double-ended queue
//a BlockingQueue blocks producers when it is full and consumers when it is empty (a minimal sketch of this pattern follows after the snippet below)
private val eventQueue: BlockingQueue[E] = new LinkedBlockingDeque[E]()
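The pattern is easy to reproduce outside Spark. A minimal sketch (not Spark code; MiniEventLoop and its members are made up for illustration) of a daemon thread draining a blocking queue and dispatching each event to onReceive:
import java.util.concurrent.{BlockingQueue, LinkedBlockingDeque}
import java.util.concurrent.atomic.AtomicBoolean

abstract class MiniEventLoop[E](name: String) {
  private val eventQueue: BlockingQueue[E] = new LinkedBlockingDeque[E]()
  private val stopped = new AtomicBoolean(false)

  private val eventThread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = {
      while (!stopped.get) {
        val event = eventQueue.take()  // blocks while the queue is empty
        onReceive(event)               // dispatch to the subclass, as DAGSchedulerEventProcessLoop does
      }
    }
  }

  def start(): Unit = eventThread.start()
  def stop(): Unit = stopped.set(true)              // a real implementation would also interrupt the thread
  def post(event: E): Unit = eventQueue.put(event)  // blocks if the queue ever fills up
  protected def onReceive(event: E): Unit
}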
//org.apache.spark.scheduler.DAGScheduler#submitJob
//back in submitJob: post() simply puts the JobSubmitted event onto that blocking queue
eventProcessLoop.post(JobSubmitted(
jobId, rdd, func2, partitions.toArray, callSite, waiter,
SerializationUtils.clone(properties)))
//org.apache.spark.util.EventLoop
// the EventLoop's event thread executes run(), which keeps taking events and calling onReceive(event)
override def run(): Unit = {
try {
while (!stopped.get) {
val event = eventQueue.take()
try {
// onReceive is one of the EventLoop's callback hooks
onReceive(event)
......
}
//EventLoop is an abstract class, so the handler lives in its concrete subclass
//org.apache.spark.scheduler.DAGSchedulerEventProcessLoop#doOnReceive (called from onReceive)
private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
//this case matches the JobSubmitted event posted earlier via eventProcessLoop.post(...)
case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
//so the next step is handleJobSubmitted()
dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)
}
//org.apache.spark.scheduler.DAGScheduler#handleJobSubmitted
//create the final stage (the ResultStage)
finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
//org.apache.spark.scheduler.DAGScheduler#createResultStage
//get or create the parent stages
val parents = getOrCreateParentStages(rdd, jobId)
//org.apache.spark.scheduler.DAGScheduler#getOrCreateParentStages
private def getOrCreateParentStages(rdd: RDD[_], firstJobId: Int): List[Stage] = {
//getShuffleDependencies() walks the dependency chain of the given RDD:
//every ShuffleDependency it encounters is added to a HashSet (the result)
//for narrow dependencies it keeps walking up to the parent RDD and checks again
//(see the conceptual sketch after this method)
getShuffleDependencies(rdd).map { shuffleDep =>
getOrCreateShuffleMapStage(shuffleDep, firstJobId)
}.toList
}
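A conceptual sketch of that traversal (not the actual Spark implementation, which lives in DAGScheduler#getShuffleDependencies; shuffleDeps is a made-up helper): collect only the nearest shuffle dependencies, stopping at each shuffle boundary.
import org.apache.spark.ShuffleDependency
import org.apache.spark.rdd.RDD
import scala.collection.mutable

def shuffleDeps(rdd: RDD[_]): Set[ShuffleDependency[_, _, _]] = {
  val found = mutable.HashSet[ShuffleDependency[_, _, _]]()
  val visited = mutable.HashSet[RDD[_]]()
  var toVisit: List[RDD[_]] = List(rdd)
  while (toVisit.nonEmpty) {
    val current = toVisit.head
    toVisit = toVisit.tail
    if (visited.add(current)) {
      current.dependencies.foreach {
        case shuffle: ShuffleDependency[_, _, _] => found += shuffle  // stop at the shuffle boundary
        case narrow => toVisit = narrow.rdd :: toVisit                // keep walking through narrow dependencies
      }
    }
  }
  found.toSet
}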
//org.apache.spark.scheduler.DAGScheduler#getOrCreateShuffleMapStage
private def getOrCreateShuffleMapStage(
shuffleDep: ShuffleDependency[_, _, _],
firstJobId: Int): ShuffleMapStage = {
shuffleIdToMapStage.get(shuffleDep.shuffleId) match {
case Some(stage) =>
stage
case None =>
// Create stages for all missing ancestor shuffle dependencies.
getMissingAncestorShuffleDependencies(shuffleDep.rdd).foreach { dep =>
if (!shuffleIdToMapStage.contains(dep.shuffleId)) {
// create a ShuffleMapStage for every missing ancestor shuffle dependency
createShuffleMapStage(dep, firstJobId)
}
}
createShuffleMapStage(shuffleDep, firstJobId)
}
}
//So in the end, this simple word count program has exactly two stages:
//1. ResultStage
//2. ShuffleMapStage
//Stage boundaries are therefore determined by whether a shuffle occurs between RDD transformations,
//and stages are built backwards: starting from the action, the dependency chain is walked in reverse until all stages are created
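A quick way to see this boundary from the driver is rdd.toDebugString, which prints the lineage with an extra indentation level at every shuffle. A sketch against the WordCount RDDs above (the output shape is indicative, not captured verbatim):
println(unit.toDebugString)
// (3) ShuffledRDD[...] at reduceByKey ...        <- read by the ResultStage
//  +-(3) MapPartitionsRDD[...] at map ...        <- computed by the ShuffleMapStage
//     |  MapPartitionsRDD[...] at flatMap ...
//     |  ... at textFile ...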
//back to the source
//org.apache.spark.scheduler.DAGScheduler#handleJobSubmitted
//in the end the ShuffleMapStage becomes a parent of the ResultStage, i.e. the ResultStage references the ShuffleMapStage
finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite)
//finalStage is then wrapped in an ActiveJob, which is why we say a job contains stages
//strictly speaking, a job holds only its final stage directly; that stage in turn references
//its parent stages, so a job transitively contains multiple stages
val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
...
...
//submit the stage
submitStage(finalStage)
//org.apache.spark.scheduler.DAGScheduler#submitStage
// getMissingParentStages() checks whether any parent stages still need to be computed
val missing = getMissingParentStages(stage).sortBy(_.id)
//submitStage recurses into its parents until it finds a stage with no missing parents, and only then calls submitMissingTasks()
if (missing.isEmpty) {
//for WordCount the first stage to reach this branch is the ShuffleMapStage, so its ShuffleMapTasks are submitted first
submitMissingTasks(stage, jobId.get)
} else {
for (parent <- missing) {
submitStage(parent)
}
waitingStages += stage
}
//org.apache.spark.scheduler.DAGScheduler#submitMissingTasks
case stage: ShuffleMapStage =>
stage.pendingPartitions.clear()
partitionsToCompute.map { id =>
val locs = taskIdToLocations(id)
val part = partitions(id)
stage.pendingPartitions += id
//create the tasks: the number of tasks equals the number of partitions (see the sketch after this snippet)
new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
Option(sc.applicationId), sc.applicationAttemptId, stage.rdd.isBarrier())
}
case stage: ResultStage =>
partitionsToCompute.map { id =>
val p: Int = stage.partitions(id)
val part = partitions(p)
val locs = taskIdToLocations(id)
new ResultTask(stage.id, stage.latestInfo.attemptNumber,
taskBinary, part, locs, id, properties, serializedTaskMetrics,
Option(jobId), Option(sc.applicationId), sc.applicationAttemptId,
stage.rdd.isBarrier())
}
}
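Because one task is created per partition, the task counts for WordCount can be read straight off the RDDs defined at the top (a sketch):
println(lines.getNumPartitions)  // partitions of the input RDD  -> number of ShuffleMapTasks
println(unit.getNumPartitions)   // partitions after the shuffle -> number of ResultTasks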
//submit the tasks of the next (child) stages
// Spark divides stages from back to front, but submits them from front to back
// e.g. stage division:  collect --> reduceByKey --> map --> flatMap
//      task submission: flatMap --> map --> reduceByKey --> collect
submitWaitingChildStages(stage)
if (tasks.size > 0) {
//submit the task set
taskScheduler.submitTasks(new TaskSet(
tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties))
}
//org.apache.spark.scheduler.TaskSchedulerImpl#submitTasks
val manager = createTaskSetManager(taskSet, maxTaskFailures)
schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)
//org.apache.spark.scheduler.FIFOSchedulableBuilder#addTaskSetManager
override def addTaskSetManager(manager: Schedulable, properties: Properties) {
//the manager is first added to the task pool; if there are no free resources, the tasks wait in the pool until resources become available
rootPool.addSchedulable(manager)
}
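Which pool behaviour is used is controlled by spark.scheduler.mode (FIFO by default, matching the FIFOSchedulableBuilder shown here; FAIR is the other option). A sketch of switching it when building the conf (fairConf is an illustrative name):
val fairConf = new SparkConf()
  .setMaster("local[3]")
  .setAppName("Word Count")
  .set("spark.scheduler.mode", "FAIR")  // default is FIFO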
//org.apache.spark.scheduler.TaskSchedulerImpl#submitTasks
backend.reviveOffers()
//org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
case ReviveOffers =>
makeOffers()
//org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.DriverEndpoint#makeOffers
//fetch runnable tasks from the pool; if there are any, launch them
if (!taskDescs.isEmpty) {
launchTasks(taskDescs)
}
//org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend.DriverEndpoint#launchTasks
// since tasks are shipped from the driver to the executors, they must be serialized first
val serializedTask = TaskDescription.encode(task)
//the driver sends the serialized task to the executor's RPC endpoint
executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
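Everything a task's closure captures travels over this wire as well, so it must be serializable. A small sketch of the classic pitfall (Helper is a made-up class for illustration):
class Helper extends Serializable { def tag(w: String): String = s"<$w>" }  // without Serializable, the job
                                                                            // fails with "Task not serializable"
val helper = new Helper()
word.map(helper.tag)  // helper is captured by the closure and shipped with every task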
//now switch to the executor side (the executor's RPC endpoint)
//org.apache.spark.executor.CoarseGrainedExecutorBackend#receive
case LaunchTask(data) =>
if (executor == null) {
exitExecutor(1, "Received LaunchTask command but executor was null")
} else {
// deserialize the task description
val taskDesc = TaskDescription.decode(data.value)
logInfo("Got assigned task " + taskDesc.taskId)
// hand the task to the executor, which runs it on its task thread pool
executor.launchTask(this, taskDesc)
}