具体执行计划是如何生成的,涉及到Spark SQL中的实现细节,这里不做详细描述,之后单独对Spark SQL原理进行解读。
在执行执行计划中的具体操作时,都会调用SparkContext中的runJob方法,具体是如何调用的,在之后的Spark SQL原理解读中再进行详细描述。
/**
 * Cleans the given closure in place.
 *
 * The enclosing closure objects of `func` are cloned with only the fields listed in
 * `accessedFields` copied over, and the `$outer` pointer of `func` is then rewired to the
 * cloned chain (or nulled out when `func` does not access `$outer` at all).
 *
 * @param func the closure to clean; a null value or a non-closure object is left untouched
 * @param checkSerializable whether `ensureSerializable(func)` is invoked after cleaning
 * @param cleanTransitively whether cloned enclosing closures are themselves cleaned
 *                          recursively; also forwarded to the `FieldAccessFinder`
 * @param accessedFields map from a class to the names of its fields that are accessed;
 *                       when empty, `func` is treated as the starting closure and the map
 *                       is populated here
 */
private def clean(
    func: AnyRef,
    checkSerializable: Boolean,
    cleanTransitively: Boolean,
    accessedFields: Map[Class[_], Set[String]]): Unit = {

  // The null guard has to run before anything dereferences `func`; checking it after
  // `func.getClass` would make the guard unreachable and surface a NullPointerException.
  if (func == null) {
    return
  }

  if (!isClosure(func.getClass)) {
    logWarning("Expected a closure; got " + func.getClass.getName)
    return
  }

  // TODO: clean all inner closures first. This requires us to find the inner objects.
  // TODO: cache outerClasses / innerClasses / accessedFields

  logDebug(s"+++ Cleaning closure $func (${func.getClass.getName}) +++")

  // A list of classes that represents closures enclosed in the given one
  val innerClasses = getInnerClosureClasses(func)

  // A list of enclosing objects and their respective classes, from innermost to outermost
  // An outer object at a given index is of type outer class at the same index
  val (outerClasses, outerObjects) = getOuterClassesAndObjects(func)

  // For logging purposes only
  val declaredFields = func.getClass.getDeclaredFields
  val declaredMethods = func.getClass.getDeclaredMethods

  logDebug(" + declared fields: " + declaredFields.size)
  declaredFields.foreach { f => logDebug(" " + f) }
  logDebug(" + declared methods: " + declaredMethods.size)
  declaredMethods.foreach { m => logDebug(" " + m) }
  logDebug(" + inner classes: " + innerClasses.size)
  innerClasses.foreach { c => logDebug(" " + c.getName) }
  logDebug(" + outer classes: " + outerClasses.size)
  outerClasses.foreach { c => logDebug(" " + c.getName) }
  logDebug(" + outer objects: " + outerObjects.size)
  outerObjects.foreach { o => logDebug(" " + o) }

  // Fail fast if we detect return statements in closures
  getClassReader(func.getClass).accept(new ReturnStatementFinder(), 0)

  // If accessed fields is not populated yet, we assume that
  // the closure we are trying to clean is the starting one
  if (accessedFields.isEmpty) {
    logDebug(s" + populating accessed fields because this is the starting closure")
    // Initialize accessed fields with the outer classes first
    // This step is needed to associate the fields to the correct classes later
    for (cls <- outerClasses) {
      accessedFields(cls) = Set[String]()
    }
    // Populate accessed fields by visiting all fields and methods accessed by this and
    // all of its inner closures. If transitive cleaning is enabled, this may recursively
    // visits methods that belong to other classes in search of transitively referenced fields.
    for (cls <- func.getClass :: innerClasses) {
      getClassReader(cls).accept(new FieldAccessFinder(accessedFields, cleanTransitively), 0)
    }
  }

  logDebug(s" + fields accessed by starting closure: " + accessedFields.size)
  accessedFields.foreach { f => logDebug(" " + f) }

  // List of outer (class, object) pairs, ordered from outermost to innermost
  // Note that all outer objects but the outermost one (first one in this list) must be closures
  var outerPairs: List[(Class[_], AnyRef)] = (outerClasses zip outerObjects).reverse
  // `parent` stays null until an enclosing object is chosen; null is meaningful here because
  // it is handed to reflection-based calls below.
  var parent: AnyRef = null
  if (outerPairs.nonEmpty) {
    val (outermostClass, outermostObject) = outerPairs.head
    if (isClosure(outermostClass)) {
      logDebug(s" + outermost object is a closure, so we clone it: ${outerPairs.head}")
    } else if (outermostClass.getName.startsWith("$line")) {
      // SPARK-14558: if the outermost object is a REPL line object, we should clone and clean it
      // as it may carry a lot of unnecessary information, e.g. hadoop conf, spark conf, etc.
      logDebug(s" + outermost object is a REPL line object, so we clone it: ${outerPairs.head}")
    } else {
      // The closure is ultimately nested inside a class; keep the object of that
      // class without cloning it since we don't want to clone the user's objects.
      // Note that we still need to keep around the outermost object itself because
      // we need it to clone its child closure later (see below).
      logDebug(" + outermost object is not a closure or REPL line object, so do not clone it: " +
        outerPairs.head)
      parent = outermostObject // e.g. SparkContext
      outerPairs = outerPairs.tail
    }
  } else {
    logDebug(" + there are no enclosing objects!")
  }

  // Clone the closure objects themselves, nulling out any fields that are not
  // used in the closure we're working on or any of its inner closures.
  for ((cls, obj) <- outerPairs) {
    logDebug(s" + cloning the object $obj of class ${cls.getName}")
    // We null out these unused references by cloning each object and then filling in all
    // required fields from the original object. We need the parent here because the Java
    // language specification requires the first constructor parameter of any closure to be
    // its enclosing object.
    val clone = instantiateClass(cls, parent)
    for (fieldName <- accessedFields(cls)) {
      val field = cls.getDeclaredField(fieldName)
      field.setAccessible(true)
      val value = field.get(obj)
      field.set(clone, value)
    }

    // If transitive cleaning is enabled, we recursively clean any enclosing closure using
    // the already populated accessed fields map of the starting closure
    if (cleanTransitively && isClosure(clone.getClass)) {
      logDebug(s" + cleaning cloned closure $clone recursively (${cls.getName})")
      // No need to check serializable here for the outer closures because we're
      // only interested in the serializability of the starting closure
      clean(clone, checkSerializable = false, cleanTransitively, accessedFields)
    }

    parent = clone
  }

  // Update the parent pointer ($outer) of this closure
  if (parent != null) {
    val field = func.getClass.getDeclaredField("$outer")
    field.setAccessible(true)
    // If the starting closure doesn't actually need our enclosing object, then just null it out
    if (accessedFields.contains(func.getClass) &&
        !accessedFields(func.getClass).contains("$outer")) {
      logDebug(s" + the starting closure doesn't actually need $parent, so we null it out")
      field.set(func, null)
    } else {
      // Update this closure's parent pointer to point to our enclosing object,
      // which could either be a cloned closure or the original user object
      field.set(func, parent)
    }
  }

  logDebug(s" +++ closure $func (${func.getClass.getName}) is now cleaned +++")

  if (checkSerializable) {
    ensureSerializable(func)
  }
}
在DAGScheduler实际对JobSubmitted事件进行处理时,func函数的类型已经从(TaskContext, Iterator[_]) => U 转换成了(TaskContext, Iterator[_]) => _
完成RDD和func/shuffleDep的序列化之后,DAGScheduler会根据分区的id、数据本地性结合上一步序列化后的二进制码结果,构造出一系列的Task,这些Task的运行逻辑一致,但分区id和数据本地性信息是根据要计算的数据的分区信息来进行包装的。
第二步 操作中对task进行真正的反序列化,使用更新后的当前线程类加载器
// Deserialize the serialized task carried by taskDescription into a Task[Any],
// passing the current thread's context class loader to the deserializer.
task = ser.deserialize[Task[Any]](
taskDescription.serializedTask, Thread.currentThread.getContextClassLoader)