val rdd = sc.parallelize(List(1,2,3,4,5,6,7),3)生成RDD的时候,RDD中的partition是如何决定的
def parallelize[T: ClassTag]( seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] |
numSlices如果指定则按照指定的partition数量,如果不指定则按照默认值defaultParallelism
def defaultParallelism: Int = { assertNotStopped() taskScheduler.defaultParallelism } |
要根据createTaskScheduler方法中指定的集群运行模式来确定taskScheduler类型
private def createTaskScheduler( sc: SparkContext, master: String, deployMode: String): (SchedulerBackend, TaskScheduler) = { import SparkMasterRegex._
// When running locally, don't try to re-execute tasks on failure. val MAX_LOCAL_TASK_FAILURES = 1
master match { case "local" => val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1) scheduler.initialize(backend) (backend, scheduler)
case LOCAL_N_REGEX(threads) => def localCpuCount: Int = Runtime.getRuntime.availableProcessors() // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads. val threadCount = if (threads == "*") localCpuCount else threads.toInt if (threadCount <= 0) { throw new SparkException(s"Asked to run locally with $threadCount threads") } val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount) scheduler.initialize(backend) (backend, scheduler)
case LOCAL_N_FAILURES_REGEX(threads, maxFailures) => def localCpuCount: Int = Runtime.getRuntime.availableProcessors() // local[*, M] means the number of cores on the computer with M failures // local[N, M] means exactly N threads with M failures val threadCount = if (threads == "*") localCpuCount else threads.toInt val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true) val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount) scheduler.initialize(backend) (backend, scheduler)
case SPARK_REGEX(sparkUrl) => val scheduler = new TaskSchedulerImpl(sc) val masterUrls = sparkUrl.split(",").map("spark://" + _) val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls) scheduler.initialize(backend) (backend, scheduler)
case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) => // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang. val memoryPerSlaveInt = memoryPerSlave.toInt if (sc.executorMemory > memoryPerSlaveInt) { throw new SparkException( "Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format( memoryPerSlaveInt, sc.executorMemory)) }
val scheduler = new TaskSchedulerImpl(sc) val localCluster = new LocalSparkCluster( numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt, sc.conf) val masterUrls = localCluster.start() val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls) scheduler.initialize(backend) backend.shutdownCallback = (backend: StandaloneSchedulerBackend) => { localCluster.stop() } (backend, scheduler)
case MESOS_REGEX(mesosUrl) => MesosNativeLibrary.load() val scheduler = new TaskSchedulerImpl(sc) val coarseGrained = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true) val backend = if (coarseGrained) { new MesosCoarseGrainedSchedulerBackend(scheduler, sc, mesosUrl, sc.env.securityManager) } else { new MesosFineGrainedSchedulerBackend(scheduler, sc, mesosUrl) } scheduler.initialize(backend) (backend, scheduler)
case masterUrl => val cm = getClusterManager(masterUrl) match { case Some(clusterMgr) => clusterMgr case None => throw new SparkException("Could not parse Master URL: '" + master + "'") } try { val scheduler = cm.createTaskScheduler(sc, masterUrl) val backend = cm.createSchedulerBackend(sc, masterUrl, scheduler) cm.initialize(scheduler, backend) (backend, scheduler) } catch { case se: SparkException => throw se case NonFatal(e) => throw new SparkException("External scheduler cannot be instantiated", e) } } } |
以TaskSchedulerImpl为例
override def defaultParallelism(): Int = backend.defaultParallelism() |
以LocalSchedulerBackend为例
override def defaultParallelism(): Int = scheduler.conf.getInt("spark.default.parallelism", totalCores) |
从源码可以看到,首先获取conf中的spark.default.parallelism配置参数,如果没有配置,则默认使用totalCores,找到totalCores是创建LocalSchedulerBackend时传进来的
private[spark] class LocalSchedulerBackend( conf: SparkConf, scheduler: TaskSchedulerImpl, val totalCores: Int) |
那么回到createTaskScheduler方法
val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1) val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount) |
自此parallelize方法如果没有指定partition数量,默认有几个partition取决于local,如果是local则默认是1,如果local中有参数则partition数量为参数
conf.setMaster("local[2]") |
也可以在代码中设置
conf.set("spark.default.parallelism","2") |
partition默认数量
parallelize中的partition数量优先级sc.parallelize(List(1,2,3,4,5,6,7),3)> conf.set("spark.default.parallelism","2") > conf.setMaster("local")
最后的partition数量为3