This article starts directly from SparkSubmit; how the spark-submit shell script gets this far was covered in the earlier article《spark-submit脚本执行过程》. The overall flow is:
1. The main method of org.apache.spark.deploy.SparkSubmit is executed to submit the application.
2. The run method of the YARN Client is invoked.
3. The client submits the application to the ResourceManager, requesting a container in which to run the ApplicationMaster.
4. The ApplicationMaster's main method runs, starting the driver program and registering the AM.
5. The user program runs; when an action is reached, job scheduling begins. (A sample submit command that follows this path is sketched right below.)
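For reference, a submission that takes the yarn-cluster path traced in this article looks roughly like the following; the class name, jar and resource settings are placeholders, not taken from any example in the text:
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --class com.example.MyApp \
  --num-executors 4 \
  --executor-memory 2g \
  my-app.jar <app args>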
First, the SparkSubmit entry point:
override def main(args: Array[String]): Unit = {
val submit = new SparkSubmit() {
self =>
override protected def parseArguments(args: Array[String]): SparkSubmitArguments = {
new SparkSubmitArguments(args) {
override protected def logInfo(msg: => String): Unit = self.logInfo(msg)
override protected def logWarning(msg: => String): Unit = self.logWarning(msg)
}
}
override protected def logInfo(msg: => String): Unit = printMessage(msg)
override protected def logWarning(msg: => String): Unit = printMessage(s"Warning: $msg")
override def doSubmit(args: Array[String]): Unit = {
try {
super.doSubmit(args)
} catch {
case e: SparkUserAppException =>
exitFn(e.exitCode)
}
}
}
submit.doSubmit(args)
}
Then doSubmit:
def doSubmit(args: Array[String]): Unit = {
// Initialize logging if it hasn't been done yet. Keep track of whether logging needs to
// be reset before the application starts.
val uninitLog = initializeLogIfNecessary(true, silent = true)
val appArgs = parseArguments(args)
if (appArgs.verbose) {
logInfo(appArgs.toString)
}
appArgs.action match {
// A normal submission matches here
case SparkSubmitAction.SUBMIT => submit(appArgs, uninitLog)
case SparkSubmitAction.KILL => kill(appArgs)
case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
case SparkSubmitAction.PRINT_VERSION => printVersion()
}
}
The actual submit happens in two steps, as the code below shows: prepareSubmitEnvironment builds the child launch environment (arguments, classpath, conf and childMainClass), and doRunMain then runs the main method of that childMainClass.
private def submit(args: SparkSubmitArguments, uninitLog: Boolean): Unit = {
// Prepare the launch environment; childMainClass is worth paying attention to
val (childArgs, childClasspath, sparkConf, childMainClass) = prepareSubmitEnvironment(args)
// Run the main method of childMainClass
def doRunMain(): Unit = {
if (args.proxyUser != null) {
val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser,
UserGroupInformation.getCurrentUser())
try {
proxyUser.doAs(new PrivilegedExceptionAction[Unit]() {
override def run(): Unit = {
runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose)
}
})
} catch {
......
}
} else {
runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose)
}
}
......
// Eventually doRunMain() is invoked to launch the child main class
doRunMain()
}
Next, the main steps inside runMain:
private def runMain(
childArgs: Seq[String],
childClasspath: Seq[String],
sparkConf: SparkConf,
childMainClass: String,
verbose: Boolean): Unit = {
......
var mainClass: Class[_] = null
try {
// Load childMainClass via reflection. As prepareSubmitEnvironment(args) shows, in yarn-cluster mode
// this class is org.apache.spark.deploy.yarn.YarnClusterApplication
mainClass = Utils.classForName(childMainClass)
} catch {
......
}
// Instantiate the SparkApplication
val app: SparkApplication = if (classOf[SparkApplication].isAssignableFrom(mainClass)) {
mainClass.newInstance().asInstanceOf[SparkApplication]
} else {
// SPARK-4170
if (classOf[scala.App].isAssignableFrom(mainClass)) {
logWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.")
}
new JavaMainApplication(mainClass)
}
......
try {
// Call app.start(), which invokes the start method of YarnClusterApplication (the childMainClass above)
app.start(childArgs.toArray, sparkConf)
} catch {
case t: Throwable =>
throw findCause(t)
}
}
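The childMainClass mentioned in the comments is chosen inside prepareSubmitEnvironment. A simplified sketch of the relevant branches, abbreviated from the Spark 2.x source (surrounding logic elided):
private[deploy] val YARN_CLUSTER_SUBMIT_CLASS =
  "org.apache.spark.deploy.yarn.YarnClusterApplication"
......
if (deployMode == CLIENT) {
  // In client mode the user's own main class runs directly in the spark-submit JVM
  childMainClass = args.mainClass
}
......
if (isYarnCluster) {
  // In yarn-cluster mode runMain launches YarnClusterApplication instead,
  // and the user's main class is passed along as a child argument
  childMainClass = YARN_CLUSTER_SUBMIT_CLASS
  ......
}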
Next, YarnClusterApplication in the org.apache.spark.deploy.yarn package:
private[spark] class YarnClusterApplication extends SparkApplication {
override def start(args: Array[String], conf: SparkConf): Unit = {
// SparkSubmit would use yarn cache to distribute files & jars in yarn mode,
// so remove them from sparkConf here for yarn mode.
conf.remove("spark.jars")
conf.remove("spark.files")
// A YARN Client is created and run. Note that landing here is decided jointly by the cluster manager (yarn) and the deploy mode (cluster)
new Client(new ClientArguments(args), conf).run()
}
}
Below is the run method. What does it mainly do? It submits the application to the ResourceManager:
def run(): Unit = {
this.appId = submitApplication()
// Note: spark.yarn.submit.waitAppCompletion decides whether this process stays alive, continuously reporting the
// application's status until the application exits. That costs resources, so consider setting it to false when submitting.
if (!launcherBackend.isConnected() && fireAndForget) {
......
} else {
......
}
}
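If you do not need the submitting process to keep polling the application's state, the flag can be switched off at submit time, for example (class and jar names are placeholders as before):
spark-submit --master yarn --deploy-mode cluster \
  --conf spark.yarn.submit.waitAppCompletion=false \
  --class com.example.MyApp my-app.jar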
Now the submitApplication function:
def submitApplication(): ApplicationId = {
var appId: ApplicationId = null
try {
launcherBackend.connect()
// Initialize the YARN client
yarnClient.init(hadoopConf)
yarnClient.start()
// Obtain an applicationId from the ResourceManager
val newApp = yarnClient.createApplication()
val newAppResponse = newApp.getNewApplicationResponse()
appId = newAppResponse.getApplicationId()
new CallerContext("CLIENT", sparkConf.get(APP_CALLER_CONTEXT),
Option(appId.toString)).setCurrentContext()
// Check whether the cluster has enough resources to launch the ApplicationMaster
verifyClusterResources(newAppResponse)
// Set up the contexts used to launch the ApplicationMaster
val containerContext = createContainerLaunchContext(newAppResponse)
val appContext = createApplicationSubmissionContext(newApp, containerContext)
// Submit the application to the ResourceManager and monitor it
yarnClient.submitApplication(appContext)
launcherBackend.setAppId(appId.toString)
reportLauncherState(SparkAppHandle.State.SUBMITTED)
// Return the application id
appId
} catch {
......
}
}
Here we mainly look at createContainerLaunchContext, whose job is to set up the launch environment and JVM options and to assemble the command that starts the AM.
private def createContainerLaunchContext(newAppResponse: GetNewApplicationResponse)
: ContainerLaunchContext = {
......
val amContainer = Records.newRecord(classOf[ContainerLaunchContext])
amContainer.setLocalResources(localResources.asJava)
amContainer.setEnvironment(launchEnv.asJava)
val javaOpts = ListBuffer[String]()
......
// A difference shows up here: cluster mode uses the ApplicationMaster class to run the AM, while client mode uses
// the ExecutorLauncher class. ExecutorLauncher and ApplicationMaster are functionally identical; the separate name
// exists only to tell cluster mode and client mode apart. Either way, ApplicationMaster.run() ultimately runs the AM.
val amClass =
if (isClusterMode) {
Utils.classForName("org.apache.spark.deploy.yarn.ApplicationMaster").getName
} else {
Utils.classForName("org.apache.spark.deploy.yarn.ExecutorLauncher").getName
}
......
// send the acl settings into YARN to control who has access via YARN interfaces
val securityManager = new SecurityManager(sparkConf)
amContainer.setApplicationACLs(
YarnSparkHadoopUtil.getApplicationAclsForYarn(securityManager).asJava)
setupSecurityToken(amContainer)
amContainer
}
Once the application has been submitted to the ResourceManager successfully, the ResourceManager launches the ApplicationMaster and runs its main(), which calls ApplicationMaster.run() and finally runImpl(). Let's look at what it mainly does.
private def runImpl(): Unit = {
......
// yarn-cluster and yarn-client mode differ here. In cluster mode the ApplicationMaster runs the driver program;
// in client mode it runs the ExecutorLauncher path instead.
if (isClusterMode) {
runDriver()
} else {
runExecutorLauncher()
}
......
}
Two points worth noting:
1. As the official docs also describe, when a Spark application runs in yarn-cluster mode the driver runs inside the AM process managed by YARN, i.e. in a container on one of the NodeManagers. In this mode, once the Spark application has finished initializing, the client can exit without affecting the running program; the node that submitted the application no longer needs any connection to it.
2. When a Spark application runs in yarn-client mode, ExecutorLauncher runs instead, and the AM is only used to request resources from the ResourceManager. Since the driver must stay alive and keep interacting with the executors, the client cannot exit before the job finishes. I had a question here: what exactly is the relationship between ExecutorLauncher, the driver and the ApplicationMaster? (As the comment in createContainerLaunchContext above notes, ExecutorLauncher is essentially ApplicationMaster under a different name; in client mode the driver stays in the submitting process while the AM only negotiates executors.)
private def runDriver(): Unit = {
addAmIpFilter(None)
// Start the user program in a separate thread. The user code contains the Spark driver, which I think of as a kind of
// abstraction layer: the commander of the Spark program.
userClassThread = startUserApplication()
// This a bit hacky, but we need to wait until the spark.driver.port property has
// been set by the Thread executing the user class.
logInfo("Waiting for spark context initialization...")
val totalWaitTime = sparkConf.get(AM_MAX_WAIT_TIME)
try {
val sc = ThreadUtils.awaitResult(sparkContextPromise.future,
Duration(totalWaitTime, TimeUnit.MILLISECONDS))
if (sc != null) {
rpcEnv = sc.env.rpcEnv
val userConf = sc.getConf
val host = userConf.get("spark.driver.host")
val port = userConf.get("spark.driver.port").toInt
// Register the ApplicationMaster with the ResourceManager
registerAM(host, port, userConf, sc.ui.map(_.webUrl))
val driverRef = rpcEnv.setupEndpointRef(
RpcAddress(host, port),
YarnSchedulerBackend.ENDPOINT_NAME)
createAllocator(driverRef, userConf)
} else {
// Sanity check; should never happen in normal operation, since sc should only be null
// if the user app did not create a SparkContext.
throw new IllegalStateException("User did not initialize spark context!")
}
resumeDriver()
// The user program is already running in its own thread; wait here for it to finish
userClassThread.join()
} catch {
......
} finally {
resumeDriver()
}
}
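For context, the user class started by startUserApplication above is just an ordinary program whose main method creates a SparkContext; that creation is what completes sparkContextPromise and lets runDriver go on to registerAM and createAllocator. A minimal, hypothetical user program (names are illustrative):
import org.apache.spark.{SparkConf, SparkContext}

object MyApp {
  def main(args: Array[String]): Unit = {
    // Creating the SparkContext inside the user-class thread is the step
    // runDriver waits on before registering the AM
    val sc = new SparkContext(new SparkConf().setAppName("my-app"))
    // ... build RDDs, run transformations and actions ...
    sc.stop()
  }
}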
The runExecutorLauncher source:
private def runExecutorLauncher(): Unit = {
val hostname = Utils.localHostName
val amCores = sparkConf.get(AM_CORES)
rpcEnv = RpcEnv.create("sparkYarnAM", hostname, hostname, -1, sparkConf, securityMgr,
amCores, true)
// The client-mode AM doesn't listen for incoming connections, so report an invalid port.
registerAM(hostname, -1, sparkConf, sparkConf.getOption("spark.driver.appUIAddress"))
// The driver should be up and listening, so unlike cluster mode, just try to connect to it
// with no waiting or retrying.
val (driverHost, driverPort) = Utils.parseHostPort(args.userArgs(0))
val driverRef = rpcEnv.setupEndpointRef(
RpcAddress(driverHost, driverPort),
YarnSchedulerBackend.ENDPOINT_NAME)
addAmIpFilter(Some(driverRef))
createAllocator(driverRef, sparkConf)
// In client mode the actor will stop the reporter thread.
reporterThread.join()
}
Once the ApplicationMaster has registered, the user code can run. Because of Spark's lazy evaluation, not every statement in the user code triggers execution; only when code containing an action is reached does job scheduling start. The action ends up calling SparkContext.runJob, which triggers stage splitting, task distribution, scheduling and execution. This article aims to describe the application submission flow, so the details of task scheduling are not covered here.
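A tiny sketch of that lazy-evaluation behaviour, assuming the sc from the hypothetical user program above (the input path is a placeholder):
// flatMap and filter only build the RDD lineage; nothing runs on the cluster yet
val words = sc.textFile("hdfs:///tmp/input.txt")
  .flatMap(_.split(" "))
  .filter(_.nonEmpty)

// count is an action: it calls SparkContext.runJob, which is where stage splitting,
// task distribution and scheduling actually begin
val total = words.count()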
Note: this write-up is part of my own learning process and my knowledge is limited; if you see any mistakes, please point them out. Thanks.