Spark源码分析:SparkContext初始化

Spark源码分析:SparkContext初始化

1.Spark怎么运行?

spark一般从spark-shell开始,

具体调用层次关系:

spark-shell -> spark-submit -> repl.Main(main方法) -> SparkILoop -> createSparkContext

Spark源码分析:SparkContext初始化_第1张图片

2.Spark初始化综述

Spark初始化主要涉及以下内容:

SparkEnv、TaskScheduler、DAGScheduler、WebUI

  • No1

    通过SparkConf来构建SparkEnv。SparkEnv主要包含BlockManager、MapOutputTracker、ShuffleFetcher、ConnectionManager等组件(注意:ShuffleFetcher和ConnectionManager并未出现在下面所列的构造函数参数中,应是早期版本的组件)。

    SparkEnv构造函数如下:

    /**
     * Container for the per-process runtime components listed in the constructor below.
     * Every component is exposed as a public `val`, except `rpcEnv`, which is `private[spark]`.
     * The class also mixes in `Logging`.
     */
    class SparkEnv (
        val executorId: String,
        private[spark] val rpcEnv: RpcEnv,
        val serializer: Serializer,
        val closureSerializer: Serializer,
        val serializerManager: SerializerManager,
        val mapOutputTracker: MapOutputTracker,// used to cache MapStatus information
        val shuffleManager: ShuffleManager,// original note: "routing maintenance table" -- NOTE(review): looks inaccurate for a shuffle manager; confirm
        val broadcastManager: BroadcastManager,// broadcast
        val blockManager: BlockManager,// block management
        val securityManager: SecurityManager,// security management
        val metricsSystem: MetricsSystem,// metrics
        val memoryManager: MemoryManager,
        val outputCommitCoordinator: OutputCommitCoordinator,
        val conf: SparkConf) extends Logging 
    
  • No2

    创建TaskScheduler。根据master URL(即运行模式)选择对应的SchedulerBackend,同时用该backend完成TaskScheduler的初始化(initialize)。

    /**
     * Create a task scheduler based on a given master URL.
     * Return a 2-tuple of the scheduler backend and the task scheduler.
     *
     * The `master` string is matched, in order, against: the literal "local",
     * LOCAL_N_REGEX, LOCAL_N_FAILURES_REGEX, SPARK_REGEX, LOCAL_CLUSTER_REGEX, and finally
     * a catch-all that delegates to an external cluster manager. In every branch the
     * scheduler is initialized with its backend before the pair is returned.
     *
     * @param sc         the SparkContext handed to the schedulers and backends created here
     * @param master     the master URL that selects which branch runs
     * @param deployMode not read anywhere in this method body
     * @return the (backend, scheduler) pair for the matched master URL
     * @throws SparkException if a non-positive thread count is requested for local[N], if the
     *                        executor memory exceeds the per-worker memory of a local cluster,
     *                        if no cluster manager is found for the URL, or if the external
     *                        scheduler cannot be instantiated
     */
    private def createTaskScheduler(
        sc: SparkContext,
        master: String,
        deployMode: String): (SchedulerBackend, TaskScheduler) = {
      // Brings the *_REGEX extractors used in the match below into scope.
      import SparkMasterRegex._
    
      // When running locally, don't try to re-execute tasks on failure.
      val MAX_LOCAL_TASK_FAILURES = 1
    
      master match {
        // Plain "local": local scheduler, backend constructed with the literal 1.
        case "local" =>
          val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
          val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1)
          scheduler.initialize(backend)
          (backend, scheduler)
    
        case LOCAL_N_REGEX(threads) =>
          def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
          // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads.
          val threadCount = if (threads == "*") localCpuCount else threads.toInt
          // Reject zero or negative thread counts up front.
          if (threadCount <= 0) {
            throw new SparkException(s"Asked to run locally with $threadCount threads")
          }
          val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
          val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
          scheduler.initialize(backend)
          (backend, scheduler)
    
        case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
          def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
          // local[*, M] means the number of cores on the computer with M failures
          // local[N, M] means exactly N threads with M failures
          // NOTE(review): unlike the local[N] branch above, threadCount is not checked
          // for <= 0 here -- confirm whether that is intentional.
          val threadCount = if (threads == "*") localCpuCount else threads.toInt
          val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
          val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
          scheduler.initialize(backend)
          (backend, scheduler)
    
        case SPARK_REGEX(sparkUrl) =>
          val scheduler = new TaskSchedulerImpl(sc)
          // The captured string may hold several comma-separated entries; each one is
          // turned back into a full "spark://..." URL.
          val masterUrls = sparkUrl.split(",").map("spark://" + _)
          val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
          scheduler.initialize(backend)
          (backend, scheduler)
    
        case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
          // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
          val memoryPerSlaveInt = memoryPerSlave.toInt
          if (sc.executorMemory > memoryPerSlaveInt) {
            throw new SparkException(
              "Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format(
                memoryPerSlaveInt, sc.executorMemory))
          }
    
          val scheduler = new TaskSchedulerImpl(sc)
          // Start a local cluster; start() returns the master URLs handed to the backend.
          val localCluster = new LocalSparkCluster(
            numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt, sc.conf)
          val masterUrls = localCluster.start()
          val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
          scheduler.initialize(backend)
          // Register a callback that stops the local cluster
          // (presumably invoked when the backend shuts down -- confirm in the backend).
          backend.shutdownCallback = (backend: StandaloneSchedulerBackend) => {
            localCluster.stop()
          }
          (backend, scheduler)
    
        // Anything else: look up an external cluster manager for this URL.
        case masterUrl =>
          val cm = getClusterManager(masterUrl) match {
            case Some(clusterMgr) => clusterMgr
            case None => throw new SparkException("Could not parse Master URL: '" + master + "'")
          }
          try {
            val scheduler = cm.createTaskScheduler(sc, masterUrl)
            val backend = cm.createSchedulerBackend(sc, masterUrl, scheduler)
            cm.initialize(scheduler, backend)
            (backend, scheduler)
          } catch {
            // SparkExceptions pass through unchanged; any other non-fatal error is wrapped.
            case se: SparkException => throw se
            case NonFatal(e) =>
              throw new SparkException("External scheduler cannot be instantiated", e)
          }
      }
    }
    
    /**
     * Attaches the given scheduler backend and builds the scheduling pools.
     *
     * Stores `backend` on this scheduler, picks a schedulable builder according to
     * `schedulingMode` (FIFO or FAIR), and then calls `buildPools()` on that builder.
     *
     * @param backend the scheduler backend to store on this scheduler
     * @throws IllegalArgumentException if `schedulingMode` is neither FIFO nor FAIR
     */
    def initialize(backend: SchedulerBackend): Unit = {
      this.backend = backend
      // `match` is an expression, so its result is assigned directly.
      schedulableBuilder = schedulingMode match {
        case SchedulingMode.FIFO =>
          new FIFOSchedulableBuilder(rootPool)
        case SchedulingMode.FAIR =>
          new FairSchedulableBuilder(rootPool, conf)
        case _ =>
          throw new IllegalArgumentException(s"Unsupported $SCHEDULER_MODE_PROPERTY: " +
            s"$schedulingMode")
      }
      schedulableBuilder.buildPools()
    }
    
    
  • No3

    根据taskscheduler实例创建dagscheduler。

    class DAGScheduler(
        private[scheduler] val sc: SparkContext,
        private[scheduler] val taskScheduler: TaskScheduler,
        listenerBus: LiveListenerBus,
        mapOutputTracker: MapOutputTrackerMaster,
        blockManagerMaster: BlockManagerMaster,
        env: SparkEnv,
        clock: Clock = new SystemClock())
      extends Logging {
    
      /**
       * Auxiliary constructor that derives the remaining primary-constructor arguments
       * from the given SparkContext: its listener bus, its environment's map output
       * tracker (cast to MapOutputTrackerMaster), its block manager's master, and the
       * environment itself. `clock` is not passed, so the primary constructor's default
       * (`new SystemClock()`) applies.
       *
       * @param sc            the SparkContext whose components are forwarded
       * @param taskScheduler the task scheduler forwarded unchanged
       */
      def this(sc: SparkContext, taskScheduler: TaskScheduler) = {
        this(
          sc,
          taskScheduler,
          sc.listenerBus,
          sc.env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster],
          sc.env.blockManager.master,
          sc.env)
      }
    
  • No4

    启动webUI。

你可能感兴趣的:(Spark源码分析:SparkContext初始化)