SparkContext

1.简介:SparkContext是Spark的驱动器,它的初始化过程包含任务调度器、资源管理器、事件管理器和状态跟踪器等组件的创建。

 

2.构造方法

主构造方法

// Primary constructor: takes the SparkConf holding the configuration parameters
class SparkContext(config: SparkConf) extends Logging 

辅助构造方法

def this() = this(new SparkConf())  // No-arg constructor: passes a freshly created SparkConf to the primary constructor

// master: the runtime environment, e.g. "local", "yarn"; appName: the application name; conf: the configuration
def this(master: String, appName: String, conf: SparkConf) =   
    // Delegates to the primary constructor via updatedConf (defined below);
    // there are several other similar auxiliary constructors
    this(SparkContext.updatedConf(conf, master, appName))      

/**
 * Creates a modified version of a SparkConf with the passed-in parameters applied.
 * All setters are invoked on `conf.clone()`, and that clone is what gets returned.
 *
 * @param conf        configuration to start from
 * @param master      master URL, always set on the result
 * @param appName     application name, always set on the result
 * @param sparkHome   Spark home; only set when it is not null
 * @param jars        jars to register; only set when not null and not empty
 * @param environment executor environment entries, always passed to setExecutorEnv
 * @return the cloned configuration with the settings above applied
 */
private[spark] def updatedConf(
      conf: SparkConf,
      master: String,
      appName: String,
      sparkHome: String = null,
      jars: Seq[String] = Nil,
      environment: Map[String, String] = Map()): SparkConf =
  {
    val updated = conf.clone()

    // Mandatory settings
    updated.setMaster(master)
    updated.setAppName(appName)

    // Optional settings: skipped when the caller left them unset
    Option(sparkHome).foreach { home =>
      updated.setSparkHome(home)
    }
    val hasJars = jars != null && jars.nonEmpty
    if (hasJars) {
      updated.setJars(jars)
    }

    val executorEnv = environment.toSeq
    updated.setExecutorEnv(executorEnv)
    updated
  }

 

3.主要成员

// Private mutable state of SparkContext. Most of these fields are assigned in the
// initialization block, where some of them are explained in more detail.
private var _conf: SparkConf = _  // clone of the constructor's config
private var _eventLogDir: Option[URI] = None  // resolved event log directory; set only when event logging is enabled
private var _eventLogCodec: Option[String] = None  // short codec name; set only when event log compression is on
private var _listenerBus: LiveListenerBus = _  // listener bus created from _conf
private var _env: SparkEnv = _  // Spark execution environment
private var _statusTracker: SparkStatusTracker = _  // built from this context and _statusStore
private var _progressBar: Option[ConsoleProgressBar] = None  // present only when UI_SHOW_CONSOLE_PROGRESS is true
private var _ui: Option[SparkUI] = None  // present only when UI_ENABLED is true
private var _hadoopConfiguration: Configuration = _  // Hadoop configuration derived from _conf
private var _executorMemory: Int = _  // executor memory in MB (result of Utils.memoryStringToMb, else 1024)
private var _schedulerBackend: SchedulerBackend = _  // first element returned by createTaskScheduler
private var _taskScheduler: TaskScheduler = _  // second element returned by createTaskScheduler
private var _heartbeatReceiver: RpcEndpointRef = _  // endpoint ref registered under HeartbeatReceiver.ENDPOINT_NAME
@volatile private var _dagScheduler: DAGScheduler = _  // DAG scheduler constructed with this context
private var _applicationId: String = _  // obtained from the task scheduler after it starts
private var _applicationAttemptId: Option[String] = None  // obtained from the task scheduler after it starts
private var _eventLogger: Option[EventLoggingListener] = None  // present only when event logging is enabled
private var _driverLogger: Option[DriverLogger] = None  // result of DriverLogger(_conf)
private var _executorAllocationManager: Option[ExecutorAllocationManager] = None  // present only with dynamic allocation and a supporting backend
private var _cleaner: Option[ContextCleaner] = None  // present only when CLEANER_REFERENCE_TRACKING is true
private var _listenerBusStarted: Boolean = false
private var _jars: Seq[String] = _  // result of Utils.getUserJars(_conf)
private var _files: Seq[String] = _  // non-empty comma-separated entries of the FILES setting
private var _shutdownHookRef: AnyRef = _  // handle returned by ShutdownHookManager.addShutdownHook
private var _statusStore: AppStatusStore = _  // live application status store
private var _heartbeater: Heartbeater = _  // driver heartbeater ("driver-heartbeater")
private var _resources: scala.collection.immutable.Map[String, ResourceInformation] = _  // result of getOrDiscoverAllResources

 

4.初始化

// SparkContext initialization. Every step runs inside a single try block: on any
// non-fatal failure the error is logged, stop() is attempted, and the original
// error is rethrown (see the catch clause at the end).
try {
    _conf = config.clone()  // Clone the configuration passed to the constructor
    _conf.validateSettings()  // Check for illegal or deprecated configuration settings

    if (!_conf.contains("spark.master")) {   // Throw if no master URL has been set
      throw new SparkException("A master URL must be set in your configuration")
    }
    if (!_conf.contains("spark.app.name")) {  // Throw if no application name has been set
      throw new SparkException("An application name must be set in your configuration")
    }

    _driverLogger = DriverLogger(_conf)   // Driver logger (its implementation is not shown here)

    val resourcesFileOpt = conf.get(DRIVER_RESOURCES_FILE)    // Driver resources file
    // Get or discover all resources
    _resources = getOrDiscoverAllResources(_conf, SPARK_DRIVER_PREFIX, resourcesFileOpt) 
    logInfo(s"Submitted application: $appName")

    // If the user code is run by the AM on a YARN cluster, the system property spark.yarn.app.id must be set
    if (master == "yarn" && deployMode == "cluster" && !_conf.contains("spark.yarn.app.id")) {
      throw new SparkException("Detected yarn cluster mode, but isn't running on a cluster. " +
        "Deployment to YARN is not supported directly by SparkContext. Please use spark-submit.")
    }

    if (_conf.getBoolean("spark.logConf", false)) {
      logInfo("Spark configuration:\n" + _conf.toDebugString)
    }

    // Set the Spark driver host and port system properties. This explicitly sets the
    // configuration instead of relying on the default values of the config constants.
    _conf.set(DRIVER_HOST_ADDRESS, _conf.get(DRIVER_HOST_ADDRESS))
    _conf.setIfMissing(DRIVER_PORT, 0)

    _conf.set(EXECUTOR_ID, SparkContext.DRIVER_IDENTIFIER)

    _jars = Utils.getUserJars(_conf)  // Returns the user jar files
    // Split the FILES setting on commas and drop empty entries; empty Seq when the key is absent
    _files = _conf.getOption(FILES.key).map(_.split(",")).map(_.filter(_.nonEmpty))
      .toSeq.flatten

    _eventLogDir =
      if (isEventLogEnabled) {
        val unresolvedDir = conf.get(EVENT_LOG_DIR).stripSuffix("/")
        // Returns a well-formed URI for the file described by a user input string
        Some(Utils.resolveURI(unresolvedDir))  
      } else {
        None
      }

    _eventLogCodec = {
      val compress = _conf.get(EVENT_LOG_COMPRESS)
      if (compress && isEventLogEnabled) {
        // Returns the short version of the given codec name
        Some(CompressionCodec.getCodecName(_conf)).map(CompressionCodec.getShortName)  
      } else {
        None
      }
    }
    // Asynchronously passes SparkListenerEvents to the registered SparkListeners (_listenerBus)
    _listenerBus = new LiveListenerBus(_conf)  

    // Initialize the app status store and listener before SparkEnv is created so that it gets all events
    val appStatusSource = AppStatusSource.createSource(conf)
    _statusStore = AppStatusStore.createLiveStore(conf, appStatusSource)
    listenerBus.addToStatusQueue(_statusStore.listener.get)

    // Create the Spark execution environment (cache, map output tracker, etc.)
    _env = createSparkEnv(_conf, isLocal, listenerBus)
    SparkEnv.set(_env)

    // If running the REPL, register the repl's output directory with the file server
    _conf.getOption("spark.repl.class.outputDir").foreach { path =>
      val replUri = _env.rpcEnv.fileServer.addDirectory("/classes", new File(path))
      _conf.set("spark.repl.class.uri", replUri)
    }
    // Low-level status reporting API for monitoring job and stage progress
    _statusTracker = new SparkStatusTracker(this, _statusStore) 

    _progressBar =
      if (_conf.get(UI_SHOW_CONSOLE_PROGRESS)) {
        // ConsoleProgressBar shows the progress of stages in the next line of the console
        Some(new ConsoleProgressBar(this))  
      } else {
        None
      }

    _ui =
      if (conf.get(UI_ENABLED)) {
        // SparkUI: top-level user interface for a Spark application
        Some(SparkUI.create(Some(this), _statusStore, 
             _conf, _env.securityManager, appName, "", startTime))
      } else {
        // For tests, do not enable the UI
        None
      }
    // Bind the UI before starting the task scheduler so that the bound port is
    // communicated properly to the cluster manager
    _ui.foreach(_.bind())
    // Creating the configuration may initialize some Hadoop subsystems
    _hadoopConfiguration = SparkHadoopUtil.get.newConfiguration(_conf) 
    // Performance optimization (the return value of size() is intentionally discarded)
    _hadoopConfiguration.size()

    // Add each JAR given through the constructor
    if (jars != null) {
      jars.foreach(addJar)
    }

    if (files != null) {
      files.foreach(addFile)
    }

    // Executor memory: config value first, then SPARK_EXECUTOR_MEMORY, then SPARK_MEM
    // (passed through warnSparkMem); converted to MB, falling back to 1024
    _executorMemory = _conf.getOption(EXECUTOR_MEMORY.key)
      .orElse(Option(System.getenv("SPARK_EXECUTOR_MEMORY")))
      .orElse(Option(System.getenv("SPARK_MEM"))
      .map(warnSparkMem))
      .map(Utils.memoryStringToMb)
      .getOrElse(1024)

    // Convert java options to env vars as a workaround, since we cannot set env vars directly in sbt
    for { (envKey, propKey) <- Seq(("SPARK_TESTING", IS_TESTING.key))
      value <- Option(System.getenv(envKey)).orElse(Option(System.getProperty(propKey)))} {
      executorEnvs(envKey) = value
    }
    Option(System.getenv("SPARK_PREPEND_CLASSES")).foreach { v =>
      executorEnvs("SPARK_PREPEND_CLASSES") = v
    }
    // The Mesos scheduler backend relies on this environment variable to set executor memory
    executorEnvs("SPARK_EXECUTOR_MEMORY") = executorMemory + "m"
    executorEnvs ++= _conf.getExecutorEnv
    executorEnvs("SPARK_USER") = sparkUser

    // We need to register "HeartbeatReceiver" before "createTaskScheduler",
    // because the Executor will retrieve "HeartbeatReceiver" in its constructor
    _heartbeatReceiver = env.rpcEnv.setupEndpoint(
      HeartbeatReceiver.ENDPOINT_NAME, new HeartbeatReceiver(this))

    // Create and start the scheduler
    val (sched, ts) = SparkContext.createTaskScheduler(this, master, deployMode)
    _schedulerBackend = sched  // Scheduler backend
    _taskScheduler = ts  // Task scheduler
    _dagScheduler = new DAGScheduler(this) // DAG (directed acyclic graph) scheduler
    _heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)

    // Create and start the heartbeater for collecting memory metrics
    _heartbeater = new Heartbeater(env.memoryManager,
      () => SparkContext.this.reportHeartBeat(),
      "driver-heartbeater",
      conf.get(EXECUTOR_HEARTBEAT_INTERVAL))
    _heartbeater.start()

    // Start the TaskScheduler after it has had the DAGScheduler reference set in
    // DAGScheduler's constructor
    _taskScheduler.start()

    _applicationId = _taskScheduler.applicationId()  // Get the application id
    _applicationAttemptId = _taskScheduler.applicationAttemptId()
    _conf.set("spark.app.id", _applicationId)
    if (_conf.get(UI_REVERSE_PROXY)) {
      // Set the system property indicated by the specified key
      System.setProperty("spark.ui.proxyBase", "/proxy/" + _applicationId) 
    }
    _ui.foreach(_.setAppId(_applicationId))
    _env.blockManager.initialize(_applicationId)  // Initialize the BlockManager with the given appId

    // The driver's metrics system needs spark.app.id to be set to the app ID,
    // so it should start after we get the app ID from the task scheduler and set spark.app.id
    _env.metricsSystem.start()
    // After the metrics system has started, attach the driver metrics servlet handlers to the web UI
    _env.metricsSystem.getServletHandlers.foreach(handler => ui.foreach(_.attachHandler(handler)))

    _eventLogger =
      if (isEventLogEnabled) {
        val logger =
          // A SparkListener that logs events to persistent storage
          new EventLoggingListener(_applicationId, _applicationAttemptId, _eventLogDir.get, 
            _conf, _hadoopConfiguration)
        logger.start()
        listenerBus.addToEventLogQueue(logger) // Add the listener to the event log queue
        Some(logger)  // Keep the event logger
      } else {
        None
      }

    _cleaner =
      if (_conf.get(CLEANER_REFERENCE_TRACKING)) {
        Some(new ContextCleaner(this))    // Asynchronous cleaner for RDD, shuffle, and broadcast state
      } else {
        None
      }
    _cleaner.foreach(_.start())   // Start the cleaner

    // Whether dynamic allocation is enabled in the given conf
    val dynamicAllocationEnabled = Utils.isDynamicAllocationEnabled(_conf)   
    // Executor allocation manager: only created when dynamic allocation is enabled and
    // the scheduler backend is an ExecutorAllocationClient
    _executorAllocationManager =
      if (dynamicAllocationEnabled) {
        schedulerBackend match {
          case b: ExecutorAllocationClient =>
            Some(new ExecutorAllocationManager(
              schedulerBackend.asInstanceOf[ExecutorAllocationClient], listenerBus, _conf,
              cleaner = cleaner))
          case _ =>
            None
        }
      } else {
        None
      }
    // start(): registers scheduler callbacks to decide when to add and remove executors,
    // and starts the scheduling task
    _executorAllocationManager.foreach(_.start())   

    // Register the listeners specified in spark.extraListeners, then start the listener bus
    setupAndStartListenerBus()   
    postEnvironmentUpdate()   // Post the environment update event once the task scheduler is ready
    postApplicationStart()   // Post the application start event

    // Post init
    // YARN uses this hook to bootstrap resource allocation based on preferred locations,
    // wait for worker registration, etc.
    _taskScheduler.postStartHook()
    // Register the DAG scheduler metrics source
    _env.metricsSystem.registerSource(_dagScheduler.metricsSource)   
    // Register the BlockManager source
    _env.metricsSystem.registerSource(new BlockManagerSource(_env.blockManager))  
    _env.metricsSystem.registerSource(new JVMCPUSource())   // Register the JVM CPU source
    _executorAllocationManager.foreach { e =>
      // Register the executor allocation manager source
      _env.metricsSystem.registerSource(e.executorAllocationManagerSource)   
    }
    appStatusSource.foreach(_env.metricsSystem.registerSource(_))   
    // Make sure the context is stopped if the user forgets about it. This avoids leaving
    // unfinished event logs around after the JVM exits cleanly. It does not help if the
    // JVM is killed, though.
    logDebug("Adding shutdown hook") // force eager creation of the logger
    // Add a shutdown hook with the given priority. Hooks with higher priority values run first.
    _shutdownHookRef = ShutdownHookManager.addShutdownHook(  
      ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY) { () =>
      logInfo("Invoking stop() from shutdown hook")
      try {
        stop()
      } catch {
        case e: Throwable =>
          logWarning("Ignoring Exception while stopping SparkContext from shutdown hook", e)
      }
    }
  } catch {
    case NonFatal(e) =>
      logError("Error initializing SparkContext.", e)
      try {
        stop()
      } catch {
        case NonFatal(inner) =>
          logError("Error stopping SparkContext after init error.", inner)
      } finally {
        // Always rethrow the original initialization error, even if stop() failed as well
        throw e
      }
  }

 

你可能感兴趣的:(Spark源码系列)