sc.textFile("./data/input")
.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
.mapToPair(x -> new Tuple2<>(x, 1))
.reduceByKey((x1, x2) -> x1 + x2)
.saveAsTextFile("./data/output/output4");
def textFile(path: String): JavaRDD[String] = sc.textFile(path)
You can see that this textFile method returns a JavaRDD, built from the RDD that sc.textFile(path) returns. Let's step into sc.textFile(path).
def textFile(
    path: String,
    minPartitions: Int = defaultMinPartitions): RDD[String] = withScope {
  assertNotStopped()
  hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
    minPartitions).map(pair => pair._2.toString).setName(path)
}
So the RDD returned by sc.textFile(path) is the RDD produced by hadoopFile, transformed once more by map.
def hadoopFile[K, V](
    path: String,
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V],
    minPartitions: Int = defaultMinPartitions): RDD[(K, V)] = withScope {
  assertNotStopped()
  // This is a hack to enforce loading hdfs-site.xml.
  // See SPARK-11227 for details.
  FileSystem.getLocal(hadoopConfiguration)
  // A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it.
  val confBroadcast = broadcast(new SerializableConfiguration(hadoopConfiguration))
  val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
  new HadoopRDD(
    this,
    confBroadcast,
    Some(setInputPathsFunc),
    inputFormatClass,
    keyClass,
    valueClass,
    minPartitions).setName(path)
}
Here the first RDD is born: the HadoopRDD. Before going back into the map call of sc.textFile(path), the sketch below shows what hadoopFile hands back when called directly through the Java API.
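A minimal sketch, assuming the JavaSparkContext sc and the input path from the WordCount above (variable names are illustrative): calling hadoopFile directly yields the raw (LongWritable, Text) pairs whose values textFile's map(pair => pair._2.toString) converts to Strings.

// Assumed imports: org.apache.hadoop.io.LongWritable, org.apache.hadoop.io.Text,
// org.apache.hadoop.mapred.TextInputFormat
JavaPairRDD<LongWritable, Text> raw = sc.hadoopFile(
    "./data/input",
    TextInputFormat.class,
    LongWritable.class,
    Text.class);
// Dropping the byte-offset keys reproduces what textFile does internally.
JavaRDD<String> rebuiltLines = raw.map(pair -> pair._2().toString());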
def map[U: ClassTag](f: T => U): RDD[U] = withScope {
  val cleanF = sc.clean(f)
  new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
}
You can see that it news up and returns a MapPartitionsRDD, so the second RDD is born: a MapPartitionsRDD.
Inside the textFile method, then, a total of 2 RDDs are produced: a HadoopRDD and a MapPartitionsRDD (a quick way to verify this from the driver is sketched below).
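A minimal verification sketch, again assuming the sc from the WordCount above (the exact toDebugString output varies by Spark version): the JavaRDD returned by the Java-API textFile wraps the MapPartitionsRDD, and that RDD's lineage contains the HadoopRDD parent.

JavaRDD<String> lines = sc.textFile("./data/input");
// The wrapped Scala RDD is the MapPartitionsRDD created by textFile's map(...)
System.out.println(lines.rdd().getClass().getSimpleName());  // MapPartitionsRDD
// Its lineage also shows the HadoopRDD created by hadoopFile(...)
System.out.println(lines.rdd().toDebugString());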
def flatMap[U](f: FlatMapFunction[T, U]): JavaRDD[U] = {
  def fn: (T) => Iterator[U] = (x: T) => f.call(x).asScala
  JavaRDD.fromRDD(rdd.flatMap(fn)(fakeClassTag[U]))(fakeClassTag[U])
}
You can see that flatMap returns a JavaRDD, obtained by first calling rdd.flatMap(fn) and then passing the result through JavaRDD.fromRDD(). Let's keep analyzing from the inside out.
def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U] = withScope {
  val cleanF = sc.clean(f)
  new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.flatMap(cleanF))
}
Inside this flatMap, what is ultimately returned is a MapPartitionsRDD. So the third RDD is born: a MapPartitionsRDD.
implicit def fromRDD[T: ClassTag](rdd: RDD[T]): JavaRDD[T] = new JavaRDD[T](rdd)
In fromRDD, the rdd passed in is wrapped in a newly constructed JavaRDD and returned, so the fourth RDD is born: a JavaRDD.
The flatMap call therefore produces 2 RDDs in total: a MapPartitionsRDD and a JavaRDD.
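Note that the JavaRDD counted here is only the Java-friendly wrapper around the Scala RDD; it never appears in the Scala lineage. A small sketch (same sc and imports as the WordCount above, variable name illustrative) makes the wrapper/wrapped distinction visible:

JavaRDD<String> words = sc.textFile("./data/input")
    .flatMap(x -> Arrays.asList(x.split(" ")).iterator());
// 'words' itself is the JavaRDD wrapper produced by JavaRDD.fromRDD ...
System.out.println(words.getClass().getSimpleName());        // JavaRDD
// ... while the RDD it wraps is the MapPartitionsRDD produced by rdd.flatMap(fn)
System.out.println(words.rdd().getClass().getSimpleName());  // MapPartitionsRDD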
def mapToPair[K2, V2](f: PairFunction[T, K2, V2]): JavaPairRDD[K2, V2] = {
  def cm: ClassTag[(K2, V2)] = implicitly[ClassTag[(K2, V2)]]
  new JavaPairRDD(rdd.map[(K2, V2)](f)(cm))(fakeClassTag[K2], fakeClassTag[V2])
}
You can see that the return value is a JavaPairRDD, and this JavaPairRDD is built from the result of rdd.map().
def map[U: ClassTag](f: T => U): RDD[U] = withScope {
  val cleanF = sc.clean(f)
  new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
}
map returns a MapPartitionsRDD, so the fifth RDD is born: a MapPartitionsRDD.
In mapToPair, the JavaPairRDD that is finally returned is constructed (new JavaPairRDD) around the MapPartitionsRDD that map returned, so the sixth RDD is a JavaPairRDD.
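The same check works here (a sketch continuing from the words RDD in the previous snippet; the variable pairs is illustrative):

JavaPairRDD<String, Integer> pairs = words.mapToPair(x -> new Tuple2<>(x, 1));
System.out.println(pairs.getClass().getSimpleName());        // JavaPairRDD (the wrapper)
System.out.println(pairs.rdd().getClass().getSimpleName());  // MapPartitionsRDD (from rdd.map)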
def reduceByKey(func: JFunction2[V, V, V]): JavaPairRDD[K, V] = {
  fromRDD(reduceByKey(defaultPartitioner(rdd), func))
}
You can see that an RDD is first obtained from reduceByKey(defaultPartitioner(rdd), func), and then fromRDD turns it into a new RDD.
def reduceByKey(partitioner: Partitioner, func: JFunction2[V, V, V]): JavaPairRDD[K, V] =
  fromRDD(rdd.reduceByKey(partitioner, func))
Inside reduceByKey(defaultPartitioner(rdd), func), in turn, the final RDD comes from rdd.reduceByKey(partitioner, func) followed by fromRDD.
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)] = self.withScope {
  combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner)
}
That RDD is obtained directly from combineByKeyWithClassTag[V]((v: V) => v, func, func, partitioner).
def combineByKeyWithClassTag[C](
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiners: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true,
    serializer: Serializer = null)(implicit ct: ClassTag[C]): RDD[(K, C)] = self.withScope {
  require(mergeCombiners != null, "mergeCombiners must be defined") // required as of Spark 0.9.0
  if (keyClass.isArray) {
    if (mapSideCombine) {
      throw new SparkException("Cannot use map-side combining with array keys.")
    }
    if (partitioner.isInstanceOf[HashPartitioner]) {
      throw new SparkException("HashPartitioner cannot partition array keys.")
    }
  }
  val aggregator = new Aggregator[K, V, C](
    self.context.clean(createCombiner),
    self.context.clean(mergeValue),
    self.context.clean(mergeCombiners))
  if (self.partitioner == Some(partitioner)) {
    self.mapPartitions(iter => {
      val context = TaskContext.get()
      new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context))
    }, preservesPartitioning = true)
  } else {
    new ShuffledRDD[K, V, C](self, partitioner)
      .setSerializer(serializer)
      .setAggregator(aggregator)
      .setMapSideCombine(mapSideCombine)
  }
}
The upstream MapPartitionsRDD has no partitioner, so self.partitioner == Some(partitioner) is false and the else branch runs: what is finally returned is a ShuffledRDD. Thus the seventh RDD is born: a ShuffledRDD.
def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = {
  new JavaPairRDD[K, V](rdd)
}
The inner fromRDD (the one in reduceByKey(partitioner, func)) then wraps it, and the eighth RDD is born: a JavaPairRDD.
def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): JavaPairRDD[K, V] = {
  new JavaPairRDD[K, V](rdd)
}
The outer fromRDD (in the single-argument reduceByKey) wraps it once more, and the ninth RDD is born: also a JavaPairRDD.
The reduceByKey call therefore produces three RDDs in total: a ShuffledRDD and two JavaPairRDDs.
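The shuffle that the ShuffledRDD introduces is also visible from the driver: toDebugString prints a new indentation level at the stage boundary. A hedged sketch continuing from pairs above (the output format depends on the Spark version):

JavaPairRDD<String, Integer> counts = pairs.reduceByKey((x1, x2) -> x1 + x2);
System.out.println(counts.rdd().getClass().getSimpleName());  // ShuffledRDD
// Typical lineage: ShuffledRDD at the top, then (indented, before the shuffle)
// the MapPartitionsRDDs and the HadoopRDD seen earlier.
System.out.println(counts.rdd().toDebugString());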
def saveAsTextFile(path: String): Unit = {
  rdd.saveAsTextFile(path)
}
You can see that the return type is Unit, yet RDDs are still created inside rdd.saveAsTextFile(path).
def saveAsTextFile(path: String): Unit = withScope {
  // https://issues.apache.org/jira/browse/SPARK-2075
  //
  // NullWritable is a `Comparable` in Hadoop 1.+, so the compiler cannot find an implicit
  // Ordering for it and will use the default `null`. However, it's a `Comparable[NullWritable]`
  // in Hadoop 2.+, so the compiler will call the implicit `Ordering.ordered` method to create an
  // Ordering for `NullWritable`. That's why the compiler will generate different anonymous
  // classes for `saveAsTextFile` in Hadoop 1.+ and Hadoop 2.+.
  //
  // Therefore, here we provide an explicit Ordering `null` to make sure the compiler generate
  // same bytecodes for `saveAsTextFile`.
  val nullWritableClassTag = implicitly[ClassTag[NullWritable]]
  val textClassTag = implicitly[ClassTag[Text]]
  val r = this.mapPartitions { iter =>
    val text = new Text()
    iter.map { x =>
      text.set(x.toString)
      (NullWritable.get(), text)
    }
  }
  RDD.rddToPairRDDFunctions(r)(nullWritableClassTag, textClassTag, null)
    .saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path)
}
Let's step into the mapPartitions function.
def mapPartitions[U: ClassTag](
    f: Iterator[T] => Iterator[U],
    preservesPartitioning: Boolean = false): RDD[U] = withScope {
  val cleanedF = sc.clean(f)
  new MapPartitionsRDD(
    this,
    (context: TaskContext, index: Int, iter: Iterator[T]) => cleanedF(iter),
    preservesPartitioning)
}
You can see that it returns a MapPartitionsRDD. So the tenth RDD is born: a MapPartitionsRDD.
The saveAsTextFile call thus produces one RDD in total: a MapPartitionsRDD.
In this Spark WordCount code, then, ten RDDs are produced altogether. In order, they are: the HadoopRDD and MapPartitionsRDD from textFile, the MapPartitionsRDD and JavaRDD from flatMap, the MapPartitionsRDD and JavaPairRDD from mapToPair, the ShuffledRDD and two JavaPairRDDs from reduceByKey, and the MapPartitionsRDD from saveAsTextFile.
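To close, here is a self-contained sketch (class name, master setting, and paths are illustrative, not from the original code) that re-runs the WordCount step by step and prints the wrapper class plus the wrapped Scala RDD class at each stage, so the count above can be checked against a live SparkContext. The RDDs built only inside a call (the HadoopRDD in textFile, the inner JavaPairRDD in reduceByKey, the MapPartitionsRDD in saveAsTextFile) are not reachable as named variables, though the HadoopRDD still shows up in the lineage printout.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class WordCountRDDCount {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("WordCountRDDCount").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaRDD<String> lines = sc.textFile("./data/input");
      JavaRDD<String> words = lines.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
      JavaPairRDD<String, Integer> pairs = words.mapToPair(x -> new Tuple2<>(x, 1));
      JavaPairRDD<String, Integer> counts = pairs.reduceByKey((x1, x2) -> x1 + x2);

      // Wrapper class vs. wrapped Scala RDD class for every step visible from the driver.
      System.out.println("textFile   : " + lines.getClass().getSimpleName()
          + " wrapping " + lines.rdd().getClass().getSimpleName());   // JavaRDD / MapPartitionsRDD
      System.out.println("flatMap    : " + words.getClass().getSimpleName()
          + " wrapping " + words.rdd().getClass().getSimpleName());   // JavaRDD / MapPartitionsRDD
      System.out.println("mapToPair  : " + pairs.getClass().getSimpleName()
          + " wrapping " + pairs.rdd().getClass().getSimpleName());   // JavaPairRDD / MapPartitionsRDD
      System.out.println("reduceByKey: " + counts.getClass().getSimpleName()
          + " wrapping " + counts.rdd().getClass().getSimpleName());  // JavaPairRDD / ShuffledRDD

      // The HadoopRDD from textFile and the shuffle boundary in front of the ShuffledRDD
      // are visible in the lineage; the MapPartitionsRDD built inside saveAsTextFile is not,
      // because it only exists while the action runs.
      System.out.println(counts.rdd().toDebugString());

      counts.saveAsTextFile("./data/output/output4");
    }
  }
}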