Spark: source-code analysis of the job launch flow through spark-submit

Based on Spark 2.4.4, where a job is submitted via the spark-submit operation:

./bin/spark-submit \
  --class <main-class> \
  --master <master-url> \
  --deploy-mode <deploy-mode> \
  --conf <key>=<value> \
  ... # other options
  <application-jar> \
  [application-arguments]

The script above hands our application off to the chosen cluster manager to run. So how does Spark actually submit and launch it internally?

  1. Start by cat-ing the bin/spark-submit launch script
// The last line of the script is the giveaway: it launches the class below, so that class must contain a main method.
// The "$@" in the script simply forwards all of the arguments passed at launch time.
The program is started by the org.apache.spark.deploy.SparkSubmit class.

1.1 Into the Spark source code

 // Clearly the main method does nothing more than call doSubmit, passing the arguments along
  def main(args: Array[String]): Unit = {
    val submit = new SparkSubmit()
    submit.doSubmit(args)
  }

  2. Step into the doSubmit() method
  def doSubmit(args: Array[String]): Unit = {
    // Initialize logging if necessary, and remember whether it must be reset
    // before the user application starts (this is the uninitLog passed to submit below)
    val uninitLog = initializeLogIfNecessary(true, silent = true)

    // Parse the command-line arguments
    val appArgs = parseArguments(args)
    // If --verbose was specified at launch, log the parsed arguments
    if (appArgs.verbose) {
      logInfo(appArgs.toString)
    }
    // Dispatch to the matching method according to the requested action
    appArgs.action match {
      case SparkSubmitAction.SUBMIT => submit(appArgs, uninitLog)
      case SparkSubmitAction.KILL => kill(appArgs)
      case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
      case SparkSubmitAction.PRINT_VERSION => printVersion()
    }
  }
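
For reference, appArgs.action is a value of the SparkSubmitAction enumeration defined in the same source file. Paraphrased from Spark 2.4.x (a sketch, not copied verbatim):

// The set of actions spark-submit can perform
private[deploy] object SparkSubmitAction extends Enumeration {
  type SparkSubmitAction = Value
  val SUBMIT, KILL, REQUEST_STATUS, PRINT_VERSION = Value
}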

2.1 A quick walk through the argument-parsing method above:

 protected def parseArguments(args: Array[String]): SparkSubmitArguments = {
    new SparkSubmitArguments(args)
  }

/**
 * Parses and encapsulates arguments from the spark-submit script.
 * The env argument is used for testing.
 */
private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, String] = sys.env)
  extends SparkSubmitArgumentsParser with Logging {
  // The pile of fields below should look familiar -- they map one-to-one to the spark-submit options
  var master: String = null
  var deployMode: String = null
  var executorMemory: String = null
  var executorCores: String = null
  var totalExecutorCores: String = null
  var propertiesFile: String = null
  var driverMemory: String = null
  var driverExtraClassPath: String = null
  var driverExtraLibraryPath: String = null
  var driverExtraJavaOptions: String = null
  var queue: String = null
  var numExecutors: String = null
  var files: String = null
  var mainClass: String = null
  var name: String = null
 .......
 // Validate the arguments (submit arguments, kill arguments, etc.). Step into these and they are all short and easy to follow.
 validateArguments()
...
}
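
Where does appArgs.action get the SUBMIT value that doSubmit() matches on? At the end of the SparkSubmitArguments constructor the parsed options are merged with defaults and then validated. Paraphrased from Spark 2.4.x (a sketch of the relevant calls, not copied verbatim):

  // Invoked from the SparkSubmitArguments constructor
  parse(args.asJava)              // parse the command-line options
  mergeDefaultSparkProperties()   // merge in spark-defaults.conf / --properties-file
  ignoreNonSparkProperties()      // drop keys that do not start with "spark."
  loadEnvironmentArguments()      // fill in missing values from env vars and Spark properties
  validateArguments()

  // Inside loadEnvironmentArguments(): unless --kill or --status was given,
  // the action defaults to SUBMIT
  action = Option(action).getOrElse(SUBMIT)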

2.2 Back to the doSubmit() method

// Since we are submitting an application, the SUBMIT branch is taken and submit() is invoked
case SparkSubmitAction.SUBMIT => submit(appArgs, uninitLog)

// Submit the application using the provided parameters; when a proxy user is specified, make sure the call is wrapped in doAs() first
private def submit(args: SparkSubmitArguments, uninitLog: Boolean): Unit = {
    def doRunMain(): Unit = {
    // The proxy user is usually not set
      if (args.proxyUser != null) {
        val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser,
          UserGroupInformation.getCurrentUser())
        try {
          proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
            override def run(): Unit = {
              runMain(args, uninitLog)
            }
          })
        } catch {
        // .....
        }
      } else {
      // In the normal case, runMain is simply called directly
        runMain(args, uninitLog)
      }
    }
    // ...
  }
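
The part of submit() elided above mostly decides how doRunMain() is invoked. Roughly (a paraphrase of Spark 2.4.x, not the exact code): in standalone cluster mode it first tries the REST submission gateway and falls back to the legacy RPC gateway if the master turns out not to be a REST server; in every other mode it simply calls doRunMain() directly.

    // Sketch of the tail of submit(), paraphrased:
    if (args.isStandaloneCluster && args.useRest) {
      try {
        logInfo("Running Spark using the REST application submission protocol.")
        doRunMain()
      } catch {
        // Fall back to the legacy submission gateway
        case e: SubmitRestConnectionException =>
          logWarning(s"Master endpoint ${args.master} was not a REST server. " +
            "Falling back to legacy submission gateway instead.")
          args.useRest = false
          submit(args, false)
      }
    } else {
      doRunMain()
    }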

2.2.1 runMain, the key method

 private def runMain(args: SparkSubmitArguments, uninitLog: Boolean): Unit = {
     // Prepare the submission environment: child args, classpath, Spark config and the main class to launch
    val (childArgs, childClasspath, sparkConf, childMainClass) = prepareSubmitEnvironment(args)    
    
    val loader =
      if (sparkConf.get(DRIVER_USER_CLASS_PATH_FIRST)) {
        new ChildFirstURLClassLoader(new Array[URL](0),
          Thread.currentThread.getContextClassLoader)
      } else {
        new MutableURLClassLoader(new Array[URL](0),
          Thread.currentThread.getContextClassLoader)
      }
    Thread.currentThread.setContextClassLoader(loader)
    // Add the user-supplied jars/files to the classpath
    for (jar <- childClasspath) {
      addJarToClasspath(jar, loader)
    }

    var mainClass: Class[_] = null

    try {
      // The core step -- step into classForName and it turns out to be plain old reflection:
      //   def classForName(className: String): Class[_] = {
      //     Class.forName(className, true, getContextOrSparkClassLoader)
      //   }
      mainClass = Utils.classForName(childMainClass)
    } catch {
      // ... exception handling
    }

    // If the main class implements SparkApplication, instantiate it via newInstance();
    // otherwise wrap the plain main-method class in a JavaMainApplication
    val app: SparkApplication = if (classOf[SparkApplication].isAssignableFrom(mainClass)) {
      mainClass.newInstance().asInstanceOf[SparkApplication]
    } else {
      // SPARK-4170
      if (classOf[scala.App].isAssignableFrom(mainClass)) {
        logWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.")
      }
      new JavaMainApplication(mainClass)
    }

    // Start the application; for a JavaMainApplication this looks up and invokes main()
    app.start(childArgs.toArray, sparkConf)
    // ... (exception handling elided)
  }
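
One detail worth noting (paraphrased from Spark 2.4.x): the childMainClass returned by prepareSubmitEnvironment is not always the --class we supplied. In client mode it is our own main class, but in cluster mode it is a cluster-manager-specific wrapper application, picked from constants defined in SparkSubmit, roughly:

  // Wrapper classes used as childMainClass when deploying in cluster mode (sketch)
  private[deploy] val YARN_CLUSTER_SUBMIT_CLASS =
    "org.apache.spark.deploy.yarn.YarnClusterApplication"
  private[deploy] val STANDALONE_CLUSTER_SUBMIT_CLASS = classOf[ClientApp].getName()
  private[deploy] val KUBERNETES_CLUSTER_SUBMIT_CLASS =
    "org.apache.spark.deploy.k8s.submit.KubernetesClientApplication"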

The app.start() method referenced above is JavaMainApplication.start:

  override def start(args: Array[String], conf: SparkConf): Unit = {
    // Look up the main method of the user class via reflection
    val mainMethod = klass.getMethod("main", new Array[String](0).getClass)
    if (!Modifier.isStatic(mainMethod.getModifiers)) {
      throw new IllegalStateException("The main method in the given main class must be static")
    }

    val sysProps = conf.getAll.toMap
    sysProps.foreach { case (k, v) =>
      sys.props(k) = v
    }
   // Invoke the main method (it is static, so the receiver is null)
    mainMethod.invoke(null, args)
  }
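
To make the "no reflection, no framework" idea concrete, here is a tiny self-contained sketch (not Spark source; HelloApp and ReflectionDemo are made-up names for illustration) that does exactly what runMain plus JavaMainApplication.start do: load a class by name, look up its static main method, and invoke it.

import java.lang.reflect.Modifier

// A stand-in "user application"; in Spark this would be the class passed via --class
object HelloApp {
  def main(args: Array[String]): Unit =
    println(s"Hello from main, args = ${args.mkString(", ")}")
}

object ReflectionDemo {
  def main(args: Array[String]): Unit = {
    // 1. Load the class by name, like Utils.classForName / Class.forName in runMain
    val klass = Class.forName("HelloApp")

    // 2. Look up main(Array[String]) and check that it is static, like JavaMainApplication.start
    val mainMethod = klass.getMethod("main", classOf[Array[String]])
    require(Modifier.isStatic(mainMethod.getModifiers),
      "The main method in the given main class must be static")

    // 3. Invoke it; the receiver is null because the method is static
    mainMethod.invoke(null, Array("spark", "submit"))
  }
}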

Summary of the walk-through above ("no reflection, no framework"):

// Drill in layer by layer and it is actually quite simple
SparkSubmit.main{
    submit.doSubmit(args)  { // args: the pile of arguments passed at launch
        super.doSubmit(args) {
            val appArgs = parseArguments(args)
            submit(appArgs, uninitLog) {
                doRunMain() {
                    runMain(args, uninitLog) {
                        val app = mainClass.newInstance()
                        app.start(childArgs.toArray, sparkConf) {
                            val mainMethod = klass.getMethod("main"...)
                            mainMethod.invoke(null, args)
                        }
                    }
                }
            }
        }
    }
}

If you are curious, follow the same approach and check whether Flink starts up in a similar way. The answer is yes!
