Spark Worker内部工作原理

Worker


Worker 是一个基于 Akka 的 Actor,它和 Master、Driver 进行通信的时候,
都是在 receive 方法中对样例类(case class)消息进行模式匹配。此外,Worker 同时混入了 ActorLogReceive 这个 trait:ActorLogReceive 里面复写了 receive 方法,并对子类暴露出 receiveWithLogging 方法,Worker 只需要复写这个方法就可以了。这里面涉及到的设计模式就是模板方法模式。

Driver 开启过程

  • 启动 Driver 时,Master 需要先把要启动的 Driver 添加到相应的 Worker 上面,
    同时向该 Worker 发送启动 Driver 的消息 LaunchDriver(driver.id, driver.desc),并把 driver 状态更改为 RUNNING

Master.scala
/**
 * Assigns `driver` to `worker`: records the pairing on both objects, sends the
 * worker a LaunchDriver message, and then marks the driver as RUNNING.
 */
def launchDriver(worker: WorkerInfo, driver: DriverInfo): Unit = {
  logInfo(s"Launching driver ${driver.id} on worker ${worker.id}")

  // Book-keeping on both sides of the worker <-> driver relationship.
  worker.addDriver(driver)
  driver.worker = Some(worker)

  // Send the launch request to the worker's actor, then flip the driver state.
  val launchRequest = LaunchDriver(driver.id, driver.desc)
  worker.actor ! launchRequest
  driver.state = DriverState.RUNNING
}

  • 来看一下 Worker 中对 LaunchDriver 消息的处理:

  Worker.scala
// Handles the master's request to launch a driver on this worker.
case LaunchDriver(driverId, driverDesc) => {
  logInfo(s"Asked to launch driver $driverId")
  val driver = new DriverRunner(
    conf,      // this worker's SparkConf
    driverId,  // id of the driver to launch
    workDir,   // the worker's work directory; the runner creates the driver's own directory under it
    sparkHome, // Spark home directory
    // Driver description, with its command passed through maybeUpdateSSLSettings.
    driverDesc.copy(command = Worker.maybeUpdateSSLSettings(driverDesc.command, conf)),
    self,      // reference to this worker's actor
    // NOTE(review): presumably the worker's own URL, which DriverRunner substitutes for
    // the {{WORKER_URL}} placeholder in the driver command - confirm against DriverRunner.
    akkaUrl)
  // Track the runner in the `drivers` map, keyed by driver id, before starting it.
  drivers(driverId) = driver
  driver.start()
  // Account for the cores and memory the driver description asks for.
  coresUsed += driverDesc.cores
  memoryUsed += driverDesc.mem
}

来找找 driver.start()


DriverRunner.scala
/**
 * Manages the execution of one driver, including automatically restarting the driver on failure.
 * This is currently only used in standalone cluster deploy mode.
 */
/**
 * Starts a thread to run and manage the driver.
 *
 * The thread prepares the driver's working directory, fetches the user jar, launches the
 * driver process, and finally reports the resulting DriverState to the worker.
 */
def start(): Unit = {
  new Thread("DriverRunner for " + driverId) {
    override def run(): Unit = {
      try {
        // Create the working directory for this driver.
        val driverDir = createWorkingDirectory()
        // Copy the user's application jar (driverDesc.jarUrl) into that directory and
        // remember its local path. The jar is read through the Hadoop FileSystem API.
        val localJarFilename = downloadUserJar(driverDir)

        // Replaces the placeholders in the driver command with their actual values.
        def substituteVariables(argument: String): String = argument match {
          case "{{WORKER_URL}}" => workerUrl
          case "{{USER_JAR}}" => localJarFilename
          case other => other
        }

        // TODO: If we add ability to submit multiple jars they should also be added here

        // Build the ProcessBuilder used to start the driver process.
        val builder = CommandUtils.buildProcessBuilder(driverDesc.command, driverDesc.mem,
          sparkHome.getAbsolutePath, substituteVariables)
        launchDriver(builder, driverDir, driverDesc.supervise)
      }
      catch {
        // Remember the failure; it is turned into DriverState.ERROR below.
        case e: Exception => finalException = Some(e)
      }

      // Derive the final state: an explicit kill wins over an exception, which wins over
      // the exit code of the driver process.
      val state =
        if (killed) {
          DriverState.KILLED
        } else if (finalException.isDefined) {
          DriverState.ERROR
        } else {
          finalExitCode match {
            case Some(0) => DriverState.FINISHED
            case _ => DriverState.FAILED
          }
        }

      finalState = Some(state)

      // Tell the worker that this driver has reached its final state.
      worker ! DriverStateChanged(driverId, state, finalException)
    }
  }.start()
}

    createWorkingDirectory()方法
   /**
    * Returns the working directory for this driver (`workDir/driverId`), creating it
    * when it does not exist yet.
    * Throws an IOException if the directory is missing and cannot be created.
    */
    private def createWorkingDirectory(): File = {
      val driverDir = new File(workDir, driverId)
      // Either the directory is already there, or creating it must succeed.
      val available = driverDir.exists() || driverDir.mkdirs()
      if (!available) {
        throw new IOException(s"Failed to create directory $driverDir")
      }
      driverDir
    }

/**
 * Copies the user jar named by `driverDesc.jarUrl` into `driverDir`, unless a file with
 * that name is already there.
 *
 * @param driverDir the driver's working directory
 * @return the absolute path of the local copy of the jar
 * @throws Exception if the jar is not present in `driverDir` after the copy step
 */
private def downloadUserJar(driverDir: File): String = {

  val jarPath = new Path(driverDesc.jarUrl)

  // Resolve the file system that hosts the jar from the Hadoop configuration.
  val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
  val jarFileSystem = jarPath.getFileSystem(hadoopConf)

  val destPath = new File(driverDir.getAbsolutePath, jarPath.getName)
  val jarFileName = jarPath.getName
  val localJarFile = new File(driverDir, jarFileName)
  val localJarFilename = localJarFile.getAbsolutePath

  // Copy only when the local file is missing.
  if (!localJarFile.exists()) { // May already exist if running multiple workers on one node
    logInfo(s"Copying user jar $jarPath to $destPath")
    FileUtil.copy(jarFileSystem, jarPath, destPath, false, hadoopConf)
  }

  // Check that the jar is really there; fail otherwise.
  if (!localJarFile.exists()) { // Verify copy succeeded
    throw new Exception(s"Did not see expected jar $jarFileName in $driverDir")
  }

  // Hand back the path of the jar on the local file system.
  localJarFilename
}

上面 DriverRunner 中 createWorkingDirectory() 所使用的 workDir,是由 Worker 中的 createWorkDir() 事先创建好的

Worker.scala
/**
 * Sets `workDir` (to `workDirPath` when it is non-null, otherwise to `sparkHome/work`)
 * and makes sure it exists as a directory. Logs an error and exits the JVM with
 * status 1 when the directory cannot be created.
 */
def createWorkDir(): Unit = {
  workDir = Option(workDirPath).fold(new File(sparkHome, "work"))(new File(_))
  try {
    // This sporadically fails - not sure why ... !workDir.exists() && !workDir.mkdirs()
    // So attempting to create and then check if directory was created or not.
    workDir.mkdirs()
    val usable = workDir.exists() && workDir.isDirectory
    if (!usable) {
      logError(s"Failed to create work directory $workDir")
      System.exit(1)
    }
    assert(workDir.isDirectory)
  } catch {
    case e: Exception =>
      logError(s"Failed to create work directory $workDir", e)
      System.exit(1)
  }
}

DriverRunner.scala 

// NOTE(review): this is an excerpt from the middle of a DriverRunner method; the enclosing
// signature (which supplies `command`, `supervise`, `waitSeconds`, ...) is not shown here.
// Runs the driver command, re-launching it with a growing delay while it keeps failing
// and supervision is requested.
var keepTrying = !killed

while (keepTrying) {
  logInfo("Launch Command: " + command.command.mkString("\"", "\" \"", "\""))

  // Start the process under the lock, bailing out if the driver was killed meanwhile.
  synchronized {
    if (killed) { return }
    process = Some(command.start())
    initialize(process.get)
  }

  val processStart = clock.getTimeMillis()
  // Block until the driver process terminates.
  val exitCode = process.get.waitFor()
  // A run longer than successfulRunDuration seconds resets the retry delay to one second.
  if (clock.getTimeMillis() - processStart > successfulRunDuration * 1000) {
    waitSeconds = 1
  }

  // Supervised drivers that exit with a non-zero code are retried after a pause.
  if (supervise && exitCode != 0 && !killed) {
    logInfo(s"Command exited with status $exitCode, re-launching after $waitSeconds s.")
    sleeper.sleep(waitSeconds)
    waitSeconds = waitSeconds * 2 // exponential back-off
  }

  // Loop again only under the same condition; always record the latest exit code.
  keepTrying = supervise && exitCode != 0 && !killed
  finalExitCode = Some(exitCode)
}
 }

- 当 Driver 进程运行结束(正常退出、失败退出、被杀死,或者启动过程中出现异常)之后,DriverRunner 会执行:

 worker ! DriverStateChanged(driverId, state, finalException)

通知worker, Driver状态已经改变,Worker收到之后,再将DriverState 改变的消息,通过Actor发送给Master

    Worker.scala
    // Handles the final-state notification sent by a DriverRunner when its driver run ends.
    case DriverStateChanged(driverId, state, exception) => {
      // Log at a level matching the outcome.
      state match {
        case DriverState.ERROR =>
          logWarning(s"Driver $driverId failed with unrecoverable exception: ${exception.get}")
        case DriverState.FAILED =>
          logWarning(s"Driver $driverId exited with failure")
        case DriverState.FINISHED =>
          logInfo(s"Driver $driverId exited successfully")
        case DriverState.KILLED =>
          logInfo(s"Driver $driverId was killed by user")
        case _ =>
          logDebug(s"Driver $driverId changed state to $state")
      }
      // Relay the state change to the master.
      master ! DriverStateChanged(driverId, state, exception)
      // The driver is no longer active: move its runner from `drivers` to `finishedDrivers`.
      // `.get` throws if no runner is registered under this id.
      val driver = drivers.remove(driverId).get
      finishedDrivers(driverId) = driver
      // Release the memory and cores that were accounted for when the driver was launched.
      memoryUsed -= driver.driverDesc.mem
      coresUsed -= driver.driverDesc.cores
    }

Executor 开启过程

首先当然是看

Master.scala 中的launchExecutor()方法
/**
 * Registers `exec` on `worker`, asks the worker's actor to launch it, and informs the
 * application's driver that the executor has been added.
 */
def launchExecutor(worker: WorkerInfo, exec: ExecutorDesc): Unit = {
  logInfo(s"Launching executor ${exec.fullId} on worker ${worker.id}")

  // Record the executor on the given worker.
  worker.addExecutor(exec)

  // Ask the worker, via its actor, to launch the executor.
  val launchRequest = LaunchExecutor(
    masterUrl, exec.application.id, exec.id, exec.application.desc, exec.cores, exec.memory)
  worker.actor ! launchRequest

  // Let the application's driver know about the new executor.
  val addedNotice = ExecutorAdded(exec.id, worker.id, worker.hostPort, exec.cores, exec.memory)
  exec.application.driver ! addedNotice
}

来看看 Worker 是如何处理 Master 发来的 LaunchExecutor 消息的

 // Handles a request to launch an executor for application `appId` on this worker.
 case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
  // Requests from any master other than the currently active one are rejected.
  if (masterUrl != activeMasterUrl) {
    logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
  } else {
    try {
      logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

      // Create the executor's working directory
      val executorDir = new File(workDir, appId + "/" + execId)
      if (!executorDir.mkdirs()) {
        throw new IOException("Failed to create directory " + executorDir)
      }

      // Create local dirs for the executor. These are passed to the executor via the
      // SPARK_LOCAL_DIRS environment variable, and deleted by the Worker when the
      // application finishes.
      val appLocalDirs = appDirectories.get(appId).getOrElse {
        Utils.getOrCreateLocalRootDirs(conf).map { dir =>
          Utils.createDirectory(dir).getAbsolutePath()
        }.toSeq
      }
      appDirectories(appId) = appLocalDirs

      // Create the ExecutorRunner that will manage the executor process.
      val manager = new ExecutorRunner(
        appId,
        execId,
        appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
        cores_,
        memory_,
        self,
        workerId,
        host,
        webUi.boundPort,
        publicAddress,
        sparkHome,
        executorDir,
        akkaUrl,
        conf,
        appLocalDirs, ExecutorState.LOADING)
      // Track the runner under "appId/execId", start it, and account for its resources.
      executors(appId + "/" + execId) = manager
      manager.start()
      coresUsed += cores_
      memoryUsed += memory_
      // Report the executor's current state to the master.
      master ! ExecutorStateChanged(appId, execId, manager.state, None, None)
    } catch {
      case e: Exception => {
        logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
        // If the runner was already registered, kill it and stop tracking it.
        if (executors.contains(appId + "/" + execId)) {
          executors(appId + "/" + execId).kill()
          executors -= appId + "/" + execId
        }
        // Notify the master that the executor failed to launch.
        master ! ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
          Some(e.toString), None)
      }
    }
  }

来看看ExecutorRunner.scala

 /**
  * Starts the thread that runs the executor (see fetchAndRunExecutor) and registers a
  * JVM shutdown hook that kills the executor process.
  */
 def start(): Unit = {
   val runner = new Thread(s"ExecutorRunner for $fullId") {
     override def run(): Unit = fetchAndRunExecutor()
   }
   workerThread = runner
   runner.start()

   // Shutdown hook that kills actors on shutdown.
   val hook = new Thread() {
     override def run(): Unit = killProcess(Some("Worker shutting down"))
   }
   shutdownHook = hook
   Runtime.getRuntime.addShutdownHook(hook)
 }

ExecutorRunner.scala

  /**
   * Download and run the executor described in our ApplicationDescription.
   *
   * Builds and starts the executor process, redirects its stdout/stderr to files in
   * `executorDir`, waits for it to exit and reports the exit to the worker. If the thread
   * is interrupted or any other exception occurs, the state becomes KILLED or FAILED
   * respectively and the process is killed.
   */
  def fetchAndRunExecutor() {
    try {
      // Launch the process
      val builder = CommandUtils.buildProcessBuilder(appDesc.command, memory,
        sparkHome.getAbsolutePath, substituteVariables)
      val command = builder.command()
      logInfo("Launch command: " + command.mkString("\"", "\" \"", "\""))

      builder.directory(executorDir)
      builder.environment.put("SPARK_LOCAL_DIRS", appLocalDirs.mkString(","))
      // In case we are running this from within the Spark Shell, avoid creating a "scala"
      // parent process for the executor command
      builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

      // Add webUI log urls
      val baseUrl =
        s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
      builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
      builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

      process = builder.start()
      // Header line written at the top of the stderr file: the full launch command.
      val header = "Spark Executor Command: %s\n%s\n\n".format(
        command.mkString("\"", "\" \"", "\""), "=" * 40)

      // Redirect its stdout and stderr to files
      val stdout = new File(executorDir, "stdout")
      stdoutAppender = FileAppender(process.getInputStream, stdout, conf)

      val stderr = new File(executorDir, "stderr")
      Files.write(header, stderr, UTF_8)
      stderrAppender = FileAppender(process.getErrorStream, stderr, conf)

      // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
      // or with nonzero exit code
      val exitCode = process.waitFor()
      state = ExecutorState.EXITED
      val message = "Command exited with code " + exitCode

      // Report the exit to the worker.
      // NOTE(review): the worker is expected to relay this state change to the master;
      // that handler is not shown in this excerpt.
      worker ! ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode))
    } catch {
      case interrupted: InterruptedException => {
        logInfo("Runner thread for executor " + fullId + " interrupted")
        state = ExecutorState.KILLED
        killProcess(None)
      }
      case e: Exception => {
        logError("Error running executor", e)
        state = ExecutorState.FAILED
        killProcess(Some(e.toString))
      }
    }
  }
}

然后当worker节点上的executor进程启动之后,会向Driver节点上相应的Driver进程反向注册,说明executor进程已经启动成功,等待执行task

你可能感兴趣的:(Spark Worker内部工作原理)