# 1. Linux shell command execution
bin/spark-submit \
--master yarn \
--deploy-mode client \
--class org.apache.spark.examples.SparkPi \   # [Driver class, e.g. SparkPi or your own WordCount]
./examples/jars/spark-examples_2.11-2.1.1.jar 100
...
exec "${SPARK_HOME}"/bin/spark-class \
org.apache.spark.deploy.SparkSubmit "$@"
# spark-submit hands off execution to the bin/spark-class script
# $@: all arguments passed to this shell command
...
1.Load JAVA_HOME
2.Load SPARK_JARS_DIR
3.Build the shell command
# Final result: JAVA_HOME/bin/java + JVM options + SPARK_JARS_DIR + org.apache.spark.launcher.Main(
# org.apache.spark.deploy.SparkSubmit + the arguments passed to spark-submit)
4.Execute: org.apache.spark.deploy.SparkSubmit is started by way of the spark.launcher.Main class
spark-submit script:
calls the spark-class script, passing in: the SparkSubmit class + the SparkSubmit arguments
spark-class script:
loads JAVA_HOME, the JVM options and SPARK_JARS_DIR, then runs the spark.launcher.Main Java process, passing in: the SparkSubmit class + the SparkSubmit arguments. In other words, SparkSubmit is invoked through the spark.launcher.Main process.
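Aside: the same submission can also be driven programmatically through Spark's public launcher API (org.apache.spark.launcher.SparkLauncher); this is the "Launcher" process that the LauncherBackend inside the YARN Client later reports back to. A minimal sketch, with placeholder paths:

```scala
import org.apache.spark.launcher.{SparkAppHandle, SparkLauncher}

object ProgrammaticSubmit {
  def main(args: Array[String]): Unit = {
    // Builds and forks a spark-submit under the hood, then tracks it via a handle.
    val handle: SparkAppHandle = new SparkLauncher()
      .setSparkHome("/opt/spark")                                        // placeholder path
      .setAppResource("./examples/jars/spark-examples_2.11-2.1.1.jar")
      .setMainClass("org.apache.spark.examples.SparkPi")
      .setMaster("yarn")
      .setDeployMode("client")
      .addAppArgs("100")
      .startApplication()

    while (!handle.getState.isFinal) Thread.sleep(1000)                  // poll submission state
    println(s"appId=${handle.getAppId}, final state=${handle.getState}")
  }
}
```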
SparkSubmit.main(){
1.Wrap the incoming command-line argument strings into an argument object (the launch arguments)
/*
protected SparkSubmitOptionParser.handle() (SparkSubmitArguments implements this method)
[The protected method throws by default, so a subclass that does not override it fails immediately -- the template-method pattern; see the sketch after this main() block]
*/
val appArgs = new SparkSubmitArguments(args) extends SparkSubmitArgumentsParser extends SparkSubmitOptionParser{
parse(args.asJava){
if (!handle(name, value)){
name match {
case MASTER =>
master = value
case CLASS =>
mainClass = value // [the Driver class]
...
}
break;
}
}
}
2.Pattern-match on the SparkSubmitArguments.action property (loaded as SUBMIT by default):
case SUBMIT => submit(appArgs) //[SUBMIT/KILL/REQUEST_STATUS]
...
private def SparkSubmitArguments.loadEnvironmentArguments(){
...
action = Option(action).getOrElse(SUBMIT)
...
}
}?SparkSubmit.main()
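The template-method pattern mentioned in step 1, reduced to a self-contained sketch; class and option names are illustrative, not Spark source. The base class owns the parse loop and delegates each option to handle(), which throws unless a subclass overrides it:

```scala
// Illustrative sketch of the SparkSubmitOptionParser / SparkSubmitArguments relationship.
abstract class OptionParser {
  // Template method: the parsing loop is fixed, the handling step is overridable.
  def parse(args: Seq[String]): Unit = {
    val it = args.grouped(2)
    var continue = true
    while (continue && it.hasNext) {
      it.next() match {
        case Seq(name, value) => continue = handle(name, value)
        case _                => continue = false
      }
    }
  }
  // Throws by default: a subclass that forgets to override it fails immediately.
  protected def handle(name: String, value: String): Boolean =
    throw new UnsupportedOperationException("handle() must be overridden")
}

class SubmitArguments(args: Seq[String]) extends OptionParser {
  var master: String = _
  var mainClass: String = _        // the Driver class, e.g. WordCount / SparkPi
  parse(args)

  override protected def handle(name: String, value: String): Boolean = name match {
    case "--master" => master = value; true
    case "--class"  => mainClass = value; true
    case _          => false       // stop on the first unknown option in this sketch
  }
}

// e.g. new SubmitArguments(Seq("--master", "yarn", "--class", "WordCount"))
```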
2.1.SparkSubmit.submit(appArgs){
1.Prepare the submission environment:
val (childArgs,childClasspath,sysProps,childMainClass) = prepareSubmitEnvironment(args){
...
if(deployMode == Client || isYarnCluster)
childMainClass = args.mainClass
==> case CLASS(--class) => mainClass = value
--class WordCount/SparkPi (the Driver class)
[The only difference between Cluster mode and Client mode is where the Driver lives:
in Client mode the Driver runs inside the IDEA / SparkSubmit process;
in Cluster mode the Driver runs inside the ApplicationMaster process on the YARN cluster]
...
if(isYarnCluster)
childMainClass = "org.apache.spark.deploy.yarn.Client"
...
}
2.Decide how to run
//Standalone cluster or not, doRunMain() runs in both branches and always ends up calling runMain()
if(args.isStandaloneCluster && args.useRest) doRunMain(...)
else doRunMain(...){
if(args.proxyUser != null) runMain(...)
else runMain(...)
}
//only the arguments passed to runMain() differ between execution environments
3.runMain(...){
Set the context class loader of the current thread
Thread.currentThread.setContextClassLoader(loader)
Add the required jars to the classpath (the Spark jar resources to ship)
for (jar <- childClasspath) addJarToClasspath(jar, loader)
Apply the sysProps entries as system properties
for ((key, value) <- sysProps) System.setProperty(key, value)
Load the childMainClass class via reflection
[Note: no "java <class>" command and no new Thread() here: no new JVM process and no new thread is created]
mainClass = Utils.classForName(childMainClass)
Get the static main method object of childMainClass (childMainClass comes from prepareSubmitEnvironment() above)
val mainMethod = mainClass.getMethod("main", new Array[String](0).getClass)
Invoke childMainClass.main() with null as the receiver [a static method needs no instance, hence null]
//Yarn-Cluster mode ==> org.apache.spark.deploy.yarn.Client.main()
//Yarn-Client mode ==> the (WordCount) Driver's main()
mainMethod.invoke(null, childArgs.toArray) [the reflective invoke is just an ordinary method call]
}
}?SparkSubmit.submit()
}?SparkSubmit.main()
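What runMain() does at the end, as a standalone sketch: load childMainClass by name, look up its static main(Array[String]) and invoke it in the current thread, with no new process and no new thread. The class names in the comments are the ones from the walkthrough:

```scala
object ReflectiveMainLauncher {
  def launch(childMainClass: String, childArgs: Array[String]): Unit = {
    // Yarn-Cluster: childMainClass = "org.apache.spark.deploy.yarn.Client"
    // Yarn-Client : childMainClass = the user's Driver class (e.g. WordCount)
    val mainClass  = Class.forName(childMainClass)
    val mainMethod = mainClass.getMethod("main", classOf[Array[String]])
    // Static method, so the receiver is null; this is an ordinary in-process call.
    mainMethod.invoke(null, childArgs)
  }
}
```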
**1. ------ Yarn-Client mode: the connection to the YARN RM is established while the SparkContext is created ------**
//The Driver process [the only difference between Yarn-Client and Yarn-Cluster is whether the Driver is a thread inside the AM on some NM of the YARN cluster, or a process outside the cluster]
Driver(SparkPi/WordCount).main(){
val conf: SparkConf = new SparkConf().setAppName("WordCount")
//SparkContext is the job's runtime environment; it is arguably the most important and central object and code path, so much so that the sc is often simply called the Driver object
val sc = new SparkContext(conf){
// Create and start the scheduler
val (sched, ts) = SparkContext.createTaskScheduler(this, master, deployMode){
master match {
case masterUrl =>
{
val cm = getClusterManager(masterUrl) match {
case Some(clusterMgr) => clusterMgr
case None => throw new SparkException("Could not parse Master URL: '" + master + "'")
}
try {
val scheduler = cm.createTaskScheduler(sc, masterUrl)
val backend = cm.createSchedulerBackend(sc, masterUrl, scheduler){
...
sc.deployMode match {
case "cluster" =>
new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc)
case "client" =>
new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc){
override def start() {
...
client = new Client(args, conf).run(){
...
createContainerLaunchContext(newAppResponse){
...
//Yarn-Cluster: java ApplicationMaster
// Yarn-Client: java ExecutorLauncher
val amClass =
if (isClusterMode) {
Utils.classForName("yarn.ApplicationMaster").getName
} else {
Utils.classForName("yarn.ExecutorLauncher").getName
}
...
※ The ExecutorLauncher process IS the AM process; it exists only to give the client-mode AM a different name from the cluster-mode AM
object ExecutorLauncher {
def main(args: Array[String]): Unit = {
ApplicationMaster.main(args)
}
}
※ However:
ApplicationMaster.run(){
if(isClusterMode){
runDriver(securityMgr){
//start the Driver as a child thread
userClassThread = startUserApplication()
//register with the RM that the AM itself has started successfully
registerAM(...)
}
}else{
runExecutorLauncher(securityMgr){
//registerAM directly: no Driver thread is started here, because in client mode the Driver already runs in the submitting process itself (which invoked yarn.Client in-process), not inside this AM
registerAM(...)
}
}
}
}
}
...
}
}
}
...
}
}
cm.initialize(scheduler, backend)
(backend, scheduler)
}
}
}
}
//the WordCount logic...
sc.makeRDD(strs).flatMap(_.split(" ")).map((_, 1))
.reduceByKey(_ + _).collect().foreach(println)
sc.stop()
}
}√SparkSubmit.submit()
}√SparkSubmit.main()
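The getClusterManager(masterUrl) call above resolves YarnClusterManager through Java's ServiceLoader mechanism (the YARN module registers the implementation under META-INF/services). A self-contained sketch of that lookup pattern, with a hypothetical plugin trait standing in for ExternalClusterManager:

```scala
import java.util.ServiceLoader
import scala.collection.JavaConverters._

// Hypothetical stand-in for org.apache.spark.scheduler.ExternalClusterManager.
trait ClusterManagerPlugin {
  def canCreate(masterUrl: String): Boolean
}

object ClusterManagerLookup {
  def find(masterUrl: String): Option[ClusterManagerPlugin] = {
    // Loads every implementation registered in META-INF/services and keeps the ones
    // that claim this master URL; SparkContext throws "Could not parse Master URL"
    // when nothing matches.
    val candidates = ServiceLoader.load(classOf[ClusterManagerPlugin]).asScala
      .filter(_.canCreate(masterUrl)).toList
    candidates match {
      case Nil      => None
      case p :: Nil => Some(p)
      case _        => throw new IllegalStateException(s"Multiple cluster managers for $masterUrl")
    }
  }
}
```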
**2. ------ Yarn-Cluster mode ------**
The org.apache.spark.deploy.yarn.Client.main method
Client.main(){
1.Wrap the arguments (args)
val args = new ClientArguments(argStrings){
userClass ==> --class ,
userJar ==> --jar
}
2.Create the Client object
new Client(args,sparkConf){
1.Create the connection object to the YARN cluster's ResourceManager
private val yarnClient = YarnClient.createYarnClient(){
protected ApplicationClientProtocol rmClient;
protected InetSocketAddress rmAddress;
}
2.Initialize the AM and Executor configurations
// AM related configurations
// Executor related configurations
3.Create the LauncherBackend object (communicates with a SparkLauncher that started this app as a child process)
private val launcherBackend = new LauncherBackend(){
//class used to communicate with the Launcher Server
//when the Connection is created, a new thread is spawned through a thread factory
clientThread = LauncherBackend.threadFactory.newThread(connection)
private object LauncherBackend {
val threadFactory = ThreadUtils.namedThreadFactory("LauncherBackend")
}
}
}.run(){
//1.Submit the application to the YARN RM and request creation of an ApplicationMaster
this.appId = submitApplication(){
//1.1 Connect to the RM and to the Launcher
launcherBackend.connect() //create the launcherBackend child thread and connect to the Spark Launcher [e.g. IDEA]
setupCredentials() //set up Kerberos authentication
yarnClient.init(yarnConf) //initialize the configuration for connecting to the YARN RM
yarnClient.start() //start the yarnClient service
//1.2 Ask the RM for a new application
val newApp = yarnClient.createApplication() //ask the RM to create an Application
val newAppResponse = newApp.getNewApplicationResponse() //get the RM's reply
appId = newAppResponse.getApplicationId() //obtain the ApplicationId
reportLauncherState(SparkAppHandle.State.SUBMITTED) //report the state to the Java Launcher [e.g. IDEA]
launcherBackend.setAppId(appId.toString) //pass the ApplicationId to the Java Launcher [e.g. IDEA] through launcherBackend
//1.3 Verify that the cluster has enough resources for the AM; if so, build the appropriate contexts and submit them so the RM can create the AM
verifyClusterResources(newAppResponse)
※ Key step ※ explained in detail below
//1.4 Set up the appropriate contexts for launching the AM
val containerContext = createContainerLaunchContext(newAppResponse)
val appContext = createApplicationSubmissionContext(newApp, containerContext)
※ Key step ※ explained in detail below
//1.5 Submit the AM and monitor the AM process
yarnClient.submitApplication(appContext)
}
//2.Monitor the application, depending on configuration
/*The application has been submitted to the YARN RM. If the property spark.yarn.submit.waitAppCompletion is set to true,
*the SparkSubmit process (SparkSubmit.main() -> submit() -> yarn.Client.main()) stays alive and keeps reporting status until the application exits.
*If it is false, the client exits normally right after submission as long as yarnApplicationState / finalApplicationStatus is not FAILED, KILLED or UNDEFINED; otherwise it throws an exception.
*/
if (!launcherBackend.isConnected() && fireAndForget) {
val report = getApplicationReport(appId)
val state = report.getYarnApplicationState
}else{
val (yarnApplicationState, finalApplicationStatus) = monitorApplication(appId)
if (yarnApplicationState == YarnApplicationState.FAILED ||
finalApplicationStatus == FinalApplicationStatus.FAILED)
throw new SparkException("")
}
}
}
}?SparkSubmit.submit()
}?SparkSubmit.main()
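The submitApplication() sequence above maps almost one-to-one onto the public Hadoop YARN client API; a hedged sketch of the same handshake outside Spark (commands and resource sizes are placeholders):

```scala
import scala.collection.JavaConverters._
import org.apache.hadoop.yarn.api.records.{ContainerLaunchContext, Resource}
import org.apache.hadoop.yarn.client.api.YarnClient
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.hadoop.yarn.util.Records

object MinimalYarnSubmit {
  def main(args: Array[String]): Unit = {
    val yarnClient = YarnClient.createYarnClient()
    yarnClient.init(new YarnConfiguration())               // 1.1 configure the connection to the RM
    yarnClient.start()

    val newApp     = yarnClient.createApplication()        // 1.2 ask the RM for a new application
    val appContext = newApp.getApplicationSubmissionContext
    val appId      = appContext.getApplicationId

    val amContainer = Records.newRecord(classOf[ContainerLaunchContext])
    amContainer.setCommands(                               // 1.4 what the AM container will run
      List("echo placeholder-for-the-real-java-ApplicationMaster-command").asJava)

    val capability = Records.newRecord(classOf[Resource])
    capability.setMemory(1024)                             // AM memory (MB), placeholder
    capability.setVirtualCores(1)

    appContext.setApplicationName("minimal-yarn-submit-demo")
    appContext.setApplicationType("SPARK")
    appContext.setAMContainerSpec(amContainer)
    appContext.setResource(capability)

    yarnClient.submitApplication(appContext)               // 1.5 the RM now schedules the AM container
    println(s"submitted $appId")
  }
}
```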
createContainerLaunchContext(newAppResponse)
//1.4.1 createContainerLaunchContext
//① Set up the container launch context for our ApplicationMaster: ContainerLaunchContext
//YARN AM launch context: user_class,env,resources,command
createContainerLaunchContext(newAppResponse){
//0.create the AM container record
val amContainer = Records.newRecord(classOf[ContainerLaunchContext])
//1.prefixEnv: environment-variable prefix, e.g. the library path
var prefixEnv: Option[String] = None
//2.JVM options
val javaOpts = ListBuffer[String]()
...
javaOpts += "-XX:+UseConcMarkSweepGC" //CMS GC
...
//3.driver-specific Java options
javaOpts ++= Utils.splitCommandString(opts).map(YarnSparkHadoopUtil.escapeForShell)
prefixEnv = Some(getClusterPath(sparkConf, Utils.libraryPathEnvPrefix(libraryPaths)))
//4.load the driver arguments
val userClass = "--class var"
val userJar = "--jar var"
...
val amClass =
//AM class: Yarn-Cluster mode uses ApplicationMaster; Yarn-Client mode uses ExecutorLauncher
if (isClusterMode) {
Utils.classForName("org.apache.spark.deploy.yarn.ApplicationMaster").getName
} else {
Utils.classForName("org.apache.spark.deploy.yarn.ExecutorLauncher").getName
}
...
//amArgs: the complete AM argument sequence (the AM class followed by its options), concatenated into one Seq
val amArgs = Seq(amClass) ++ userClass ++ userJar ++ primaryPyFile ++ primaryRFile ++
userArgs ++ Seq(
"--properties-file", buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD),
LOCALIZED_CONF_DIR, SPARK_CONF_FILE))
//5.assemble the shell command to execute
/* Shell Command for the ApplicationMaster
prefixEnv ${JAVA_HOME}/bin/java -jvm参数 --amArgs(yarn.ApplicationMaster / yarn.ExecutorLauncher)
*/
val commands = prefixEnv ++ Seq(
YarnSparkHadoopUtil.expandEnvironment(Environment.JAVA_HOME) + "/bin/java", "-server"
) ++
javaOpts ++ amArgs ++
Seq(
"1>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout",
"2>", ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr")
//null-check the commands
val printableCommands = commands.map(s => if (s == null) "null" else s).toList
//put the java shell command into the amContainer
amContainer.setCommands(printableCommands.asJava)
//finally, set up security management for the AM
}
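Put together, the commands built above amount to a single shell line that the NodeManager runs to start the AM container. A hedged reconstruction of its shape; every concrete value is illustrative:

```scala
object AmCommandShape {
  def main(args: Array[String]): Unit = {
    val amClass = "org.apache.spark.deploy.yarn.ApplicationMaster"  // ExecutorLauncher in client mode
    val command = Seq(
      "{{JAVA_HOME}}/bin/java", "-server",                          // expanded by YARN on the NM
      "-Xmx1024m", "-XX:+UseConcMarkSweepGC",                       // javaOpts (illustrative)
      amClass,
      "--class", "WordCount",                                       // userClass
      "--jar", "hdfs:///user/demo/wordcount.jar",                   // userJar (illustrative path)
      "--properties-file", "<localized spark conf file>",
      "1>", "<LOG_DIR>/stdout",
      "2>", "<LOG_DIR>/stderr"
    ).mkString(" ")
    println(command)
  }
}
```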
createApplicationSubmissionContext(newApp, containerContext)
//1.4.2 createApplicationSubmissionContext
//② Set up the submission context for our ApplicationMaster: ApplicationSubmissionContext
createApplicationSubmissionContext(newApp, containerContext){
val appContext = newApp.getApplicationSubmissionContext
//set the applicationContext properties
appContext.setApplicationName().setQueue(the submission queue).setAMContainerSpec(① the container context).setApplicationType("SPARK")
//iterate over the SparkConf settings and configure the corresponding context fields for each case
sparkConf.get(APPLICATION_TAGS)/.get(AM_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS)
/.get(ROLLED_LOG_INCLUDE_PATTERN).foreach{
}
sparkConf.get(MAX_APP_ATTEMPTS)/.get(AM_NODE_LABEL_EXPRESSION) match {
}
...
//set the AM container capability: memory and CPU cores
val capability = Records.newRecord(classOf[Resource])
capability.setMemory(amMemory + amMemoryOverhead)
capability.setVirtualCores(amCores)
...
appContext
}
Client.yarnClient = YarnClient.createYarnClient(){ the client that connects to the RM }
yarnClient.submitApplication(appContext) --> an ApplicationMaster process is created on some NM node
[launched by the commands assembled above]
ApplicationMaster: separates resource scheduling from task scheduling, decoupling the two [Driver --AM--> NM]
// [one NM can host multiple Executors (Containers)]
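Before stepping into ApplicationMaster.main() below, here is a hedged sketch of the RM-side handshake it performs (registerAM() plus allocateResources()), written against the public AMRMClient API; hosts, sizes and counts are placeholders:

```scala
import scala.collection.JavaConverters._
import org.apache.hadoop.yarn.api.records.{Priority, Resource}
import org.apache.hadoop.yarn.client.api.AMRMClient
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
import org.apache.hadoop.yarn.conf.YarnConfiguration

object MinimalAmAllocation {
  def main(args: Array[String]): Unit = {
    val amClient = AMRMClient.createAMRMClient[ContainerRequest]()
    amClient.init(new YarnConfiguration())
    amClient.start()

    // 1. The AM registers itself with the ResourceManager.
    amClient.registerApplicationMaster("am-host", 0, "")

    // 2. It asks for executor containers (memory / cores per container).
    val capability = Resource.newInstance(2048, 2)
    (1 to 3).foreach { _ =>
      amClient.addContainerRequest(
        new ContainerRequest(capability, null, null, Priority.newInstance(1)))
    }

    // 3. allocate() doubles as the heartbeat; allocated containers arrive over successive calls.
    val allocated = amClient.allocate(0.1f).getAllocatedContainers.asScala
    allocated.foreach(c => println(s"got container ${c.getId} on ${c.getNodeId}"))
    // ... each Container would then be handed to something like runAllocatedContainers() ...
  }
}
```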
ApplicationMaster.main(){
//1.wrap the arguments passed in by the java shell command
val amArgs = new ApplicationMasterArguments(args)
//2.load the properties file with the Spark configuration and set its entries as system properties, so user code running in the AM can also access them
...
//uses Hadoop UserGroupInformation to run the given function as the Spark user (propagated to child threads via thread-locals)
SparkHadoopUtil.get.runAsSparkUser {
() =>
//initialization
master = new ApplicationMaster(amArgs, new YarnRMClient{
//the AM <--> RM communication client
private var amClient: AMRMClient[ContainerRequest] = _
}){
3.Load the various fields
sparkConf,yarnConf,isClusterMode,maxNumExecutorFailures,heartbeatInterval
rpcEnv: RpcEnv = the remote-communication environment between two Java processes (two JVMs)
amEndpoint: RpcEndpointRef = the communication endpoint (the target to talk to)
}
//
System.exit(master.run(){
...
if(isClientMode)
runExecutorLauncher()
if(isClusterMode)
runDriver(){
//1.start the user application thread -- the Driver
userClassThread = startUserApplication(){
...
//create the Driver thread [the Driver is a thread of the AM; the Driver is simply the user class that creates the sc]
/*
args:ApplicationMasterArguments{
case ("--class") :: value :: tail =>
userClass = value
args = tail
}
*/
val mainMethod = userClassLoader.loadClass(args.userClass).getMethod("main",classOf[Array[String]])
val userThread = new Thread {
override def run() {
mainMethod.invoke(null, userArgs.toArray)
...
}
}
userThread.setContextClassLoader(userClassLoader)
userThread.setName("Driver")
userThread.start()
userThread
}
//2.wait for the Driver child thread to finish building its sc object
val sc = ThreadUtils.awaitResult(sparkContextPromise.future,
Duration(totalWaitTime,
TimeUnit.MILLISECONDS))
//3.once the Driver's sc object is ready [stages are split first, then tasks are assigned]
if (sc != null) {
rpcEnv = sc.env.rpcEnv
//the AM establishes communication with the Driver (in cluster mode the AM and the Driver live in the same process, so the driver's lifecycle does not need to be monitored separately here)
val driverRef = runAMEndpoint(sc.getConf.get("spark.driver.host"),
sc.getConf.get("spark.driver.port"),
isClusterMode = true)
//the AM requests resources (Containers) for the Executors from the RM
registerAM(){
1.The AM registers itself with the RM
2.Request available resources from the RM; if YARN can satisfy every request there will be up to maxExecutors Containers, although not every container necessarily ends up running an Executor
allocator.allocateResources(){
//update the list of resources already requested and the list of still-unmet requests
updateResourceRequests()
//request additional / still-unmet containers and receive new allocations; this call also serves as the heartbeat that checks RM/NM health, so it must run periodically
val allocateResponse = amClient.allocate(progressIndicator=0.1f)
//get the latest list of allocated containers from the ResourceManager
val allocatedContainers = allocateResponse.getAllocatedContainers()
//process the containers the AM has obtained
if (allocatedContainers.size > 0) {
//pick the containers needed to start Executors and release the rest
//locality preference [moving computation is cheaper than moving data: node-local / rack-local]
handleAllocatedContainers(container){
// 1.match node-local requests: the container sits on a host that was explicitly requested
// 2.match rack-local requests: the container sits on the same rack as a requested host
// 3.assign containers that are neither node-local nor rack-local (any host will do)
// 4.release containers that are not needed
internalReleaseContainer(container)
// 5.run Executors on the containers that were kept
runAllocatedContainers(container){
if (numExecutorsRunning < targetNumExecutors) {
if (launchContainers) {
launcherPool.execute(new Runnable{
override def run(): Unit = {
new ExecutorRunnable(...).run()
updateInternalState() //update Executor bookkeeping and the container state
}
}
new ExecutorRunnable(...).run(){
//NodeManager <--> AM
nmClient = NMClient.createNMClient()
nmClient.init(conf)
nmClient.start()
startContainer(){
//1.set up the ContainerLaunchContext
val ctx:ContainerLaunchContext
val env = prepareEnvironment()
val commands = prepareCommand(){
//assemble the JAVA_HOME environment variable and the JVM options
//the command launches a CoarseGrainedExecutorBackend process
commands = java ... org.apache.spark.executor.CoarseGrainedExecutorBackend
}
ctx.setLocalResources(localResources).setEnvironment(env)
.setTokens(...).setCommands(commands).setApplicationACLs()
//2.send the launch command to the container's NM (start the Executor container on that NM according to ctx)
nmClient.startContainer(container.get, ctx)
}
}
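A hedged sketch of what ExecutorRunnable.startContainer() above boils down to, using the public NMClient API; the Container comes from the AM's allocation, and the command string is a placeholder for the real java ... CoarseGrainedExecutorBackend line:

```scala
import scala.collection.JavaConverters._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.yarn.api.records.{Container, ContainerLaunchContext}
import org.apache.hadoop.yarn.client.api.NMClient
import org.apache.hadoop.yarn.util.Records

object MinimalExecutorLaunch {
  def launch(conf: Configuration, container: Container): Unit = {
    val nmClient = NMClient.createNMClient()
    nmClient.init(conf)
    nmClient.start()

    val ctx = Records.newRecord(classOf[ContainerLaunchContext])
    ctx.setCommands(List(
      "{{JAVA_HOME}}/bin/java -Xmx1024m " +
        "org.apache.spark.executor.CoarseGrainedExecutorBackend <executor args> " +
        "1><LOG_DIR>/stdout 2><LOG_DIR>/stderr").asJava)

    // Tell the chosen NodeManager to start this container with the context above.
    nmClient.startContainer(container, ctx)
  }
}
```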
[this starts the coarse-grained ExecutorBackend daemon process on some NM]
CoarseGrainedExecutorBackend.main(){
...
//unpack the Executor arguments from the args array into individual variables
var argv = args.toList
while (!argv.isEmpty) {
argv match {
case ... }
}
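The option loop above as a small self-contained sketch; the flag names are the real CoarseGrainedExecutorBackend flags, while the parser itself is a simplified stand-in:

```scala
object ExecutorArgsSketch {
  def parse(args: Array[String]): Map[String, String] = {
    var parsed = Map.empty[String, String]
    var argv = args.toList
    while (argv.nonEmpty) {
      argv match {
        case "--driver-url" :: value :: tail  => parsed += ("driverUrl" -> value);  argv = tail
        case "--executor-id" :: value :: tail => parsed += ("executorId" -> value); argv = tail
        case "--hostname" :: value :: tail    => parsed += ("hostname" -> value);   argv = tail
        case "--cores" :: value :: tail       => parsed += ("cores" -> value);      argv = tail
        case "--app-id" :: value :: tail      => parsed += ("appId" -> value);      argv = tail
        case _ :: tail                        => argv = tail  // anything else is ignored here
        case Nil                              =>              // unreachable: loop guard
      }
    }
    parsed
  }
}
```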
...
run(driverUrl, executorId, hostname, cores, appId, workerUrl,
userClassPath){
...
//the ExecutorBackend process establishes communication with the Driver child thread inside the ApplicationMaster process
val fetcher = RpcEnv.create("driverPropsFetcher", hostname, port,
executorConf,
new SecurityManager(executorConf),
clientMode = true)
//get a reference to the driver
val driver = fetcher.setupEndpointRefByURI(driverUrl)
...
//create the SparkEnv for the Executor (the Executor environment object)
val env = SparkEnv.createExecutorEnv(driverConf, executorId,
hostname, port, cores, cfg.ioEncryptionKey, isLocal = false)
//create the "Executor" communication endpoint [not the actual computing Executor yet]
env.rpcEnv.setupEndpoint("Executor",
new CoarseGrainedExecutorBackend(env.rpcEnv, driverUrl,
executorId,hostname, cores, userClassPath, env){
※ E sends: [registers with the Driver to signal it is fully initialized and ready to receive tasks]
CGEBackend.onStart(DriverRef.ask(RegisterExecutor))
※ D receives: [the DriverEndpoint's inbox receives the ask(RegisterExecutor)]
CoarseGrainedSchedulerBackend.DriverEndpoint
.receiveAndReply(RpcCallContext){
case RegisterExecutor(...) => {
//record key information such as the Executor's core count
...
//reply
executorRef.send(RegisteredExecutor)
}
}
※ E receives: [on receiving the Driver's reply, creates the Executor computation object]
CGEBackend.receive(){
case RegisteredExecutor => {
//the real computation object, Executor [contains the task thread pool internally]
executor = new Executor(executorId, hostname,
env, userClassPath, isLocal = false)
}
}
※ This ends the registration exchange that the ExecutorBackend initiated with the Driver
※ Task launching is initiated by the DriverEndpoint, which actively send()s
//the LaunchTask message wrapper (containing the already-serialized task data)
executorData.executorEndpoint.send(LaunchTask(new
SerializableBuffer(serializedTask)))
※ The task is received by CoarseGrainedExecutorBackend
CGExecutorBackend.receive(){
case LaunchTask(data) => {
//deserialize the received task data
val taskDesc = ser.deserialize[TaskDescription](
data.value)
//the Executor's thread pool runs the task in a child thread (of the CGEBackend process)
executor.launchTask(...){
val tr = new TaskRunner(...){
override def run():Unit ={
...}
}
runningTasks.put(taskId,tr)
threadPool.execute(tr)
}
}
}
}
)
}
}
}
}
}
}
}
}
}
}
}
}√master.run()
})√SparkHadoopUtil.get.runAsSparkUser()
}√ApplicationMaster.main()
}√SparkSubmit.submit()
}√SparkSubmit.main()
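Finally, a self-contained model (not Spark's private RpcEnv) of the ExecutorBackend and Driver exchange traced above: the backend registers, the driver acknowledges, and LaunchTask messages are later pushed and executed on a thread pool, mirroring RegisterExecutor / RegisteredExecutor / LaunchTask:

```scala
import java.util.concurrent.Executors

sealed trait Message
case class RegisterExecutor(executorId: String, cores: Int) extends Message
case object RegisteredExecutor extends Message
case class LaunchTask(serializedTask: Array[Byte]) extends Message

// Models CoarseGrainedExecutorBackend; the "driver" is just a reply function here.
class ModelExecutorBackend(driver: Message => Message) {
  private val threadPool = Executors.newCachedThreadPool()  // stands in for Executor's task pool
  private var registered = false

  // onStart: ask the driver to register (CGEBackend.onStart -> driverRef.ask(RegisterExecutor))
  def start(): Unit = receive(driver(RegisterExecutor("exec-1", cores = 2)))

  // receive: handle the driver's reply and later pushed messages
  def receive(msg: Message): Unit = msg match {
    case RegisteredExecutor =>
      registered = true                                     // real code builds new Executor(...) here
    case LaunchTask(bytes) if registered =>
      threadPool.execute(new Runnable {                     // real code wraps this in a TaskRunner
        override def run(): Unit = println(s"running a task of ${bytes.length} bytes")
      })
    case other =>
      println(s"unexpected message: $other")
  }
}

object RpcModelDemo {
  def main(args: Array[String]): Unit = {
    val backend = new ModelExecutorBackend({
      case RegisterExecutor(_, _) => RegisteredExecutor     // the driver records the executor, then replies
      case m                      => m
    })
    backend.start()
    backend.receive(LaunchTask(Array[Byte](1, 2, 3)))
  }
}
```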
Reference: https://blog.csdn.net/lovehuangjiaju/article/details/49123975