/**
 * :: DeveloperApi ::
 * Base class for dependencies where each partition of the child RDD depends on a small number
 * of partitions of the parent RDD. Narrow dependencies allow for pipelined execution.
 *
 * @param _rdd the RDD this dependency points at; exposed unchanged through `rdd`
 */
abstract class NarrowDependency[T](_rdd: RDD[T]) extends Dependency[T] {
  /**
   * Get the parent partitions for a child partition.
   * @param partitionId a partition of the child RDD
   * @return the partitions of the parent RDD that the child partition depends upon
   */
  def getParents(partitionId: Int): Seq[Int]

  override def rdd: RDD[T] = _rdd
}
/**
 * :: DeveloperApi ::
 * Represents a dependency on the output of a shuffle stage. Note that in the case of shuffle,
 * the RDD is transient since we don't need it on the executor side.
 *
 * @param _rdd the parent RDD
 * @param partitioner partitioner used to partition the shuffle output
 * @param serializer [[org.apache.spark.serializer.Serializer Serializer]] to use. If not set
 *                   explicitly then the default serializer, as specified by `spark.serializer`
 *                   config option, will be used.
 * @param keyOrdering key ordering for RDD's shuffles
 * @param aggregator map/reduce-side aggregator for RDD's shuffle
 * @param mapSideCombine whether to perform partial aggregation (also known as map-side combine)
 */
class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
    @transient private val _rdd: RDD[_ <: Product2[K, V]],
    val partitioner: Partitioner,
    val serializer: Serializer = SparkEnv.get.serializer,
    val keyOrdering: Option[Ordering[K]] = None,
    val aggregator: Option[Aggregator[K, V, C]] = None,
    val mapSideCombine: Boolean = false)
  extends Dependency[Product2[K, V]] {

  // Map-side combine relies on the aggregator, so reject the inconsistent configuration
  // as soon as the dependency is constructed.
  if (mapSideCombine) {
    require(aggregator.isDefined, "Map-side combine without Aggregator specified!")
  }

  // The constructor accepts any RDD of Product2 subtypes; expose it under the exact
  // element type required by Dependency[Product2[K, V]].
  override def rdd: RDD[Product2[K, V]] = _rdd.asInstanceOf[RDD[Product2[K, V]]]

  private[spark] val keyClassName: String = reflect.classTag[K].runtimeClass.getName
  private[spark] val valueClassName: String = reflect.classTag[V].runtimeClass.getName
  // Note: It's possible that the combiner class tag is null, if the combineByKey
  // methods in PairRDDFunctions are used instead of combineByKeyWithClassTag.
  // NOTE(review): the right-hand side was missing from this excerpt; it is reconstructed as a
  // null-safe lookup to match the note above -- confirm against the upstream source.
  private[spark] val combinerClassName: Option[String] =
    Option(reflect.classTag[C]).map(_.runtimeClass.getName)

  // Ask the owning context for a fresh shuffle id for this dependency.
  val shuffleId: Int = _rdd.context.newShuffleId()

  // Register this shuffle with the shuffle manager, passing the parent's partition count.
  val shuffleHandle: ShuffleHandle = _rdd.context.env.shuffleManager.registerShuffle(
    shuffleId, _rdd.partitions.length, this)
}
窄依赖: 子RDD的每个分区只依赖父RDD的少量分区,因此可以流水线(pipeline)方式并行执行;分区丢失时只需重算其依赖的父分区,容错代价小
宽依赖: 进行阶段划分(shuffle后的阶段需要等待shuffle前的阶段计算完才能执行)
// Demo 1: both pair RDDs are partitioned with HashPartitioner(2) before they are joined.
SparkConf conf = new SparkConf().setAppName("Java-Test-WordCount").setMaster("local[*]");
JavaSparkContext jsc = new JavaSparkContext(conf);

// Sample data: (name, age) pairs and (name, gender, city) triples.
List<Tuple2<String, Integer>> tuple2List1 = Arrays.asList(new Tuple2<>("Alice", 15), new Tuple2<>("Bob", 18), new Tuple2<>("Thomas", 20), new Tuple2<>("Catalina", 25));
List<Tuple3<String, String, String>> tuple3List = Arrays.asList(new Tuple3<>("Alice", "Female", "NanJ"), new Tuple3<>("Thomas", "Male", "ShangH"), new Tuple3<>("Tom", "Male", "BeiJ"));

JavaRDD<Tuple2<String, Integer>> javaRDD1 = jsc.parallelize(tuple2List1);
JavaRDD<Tuple3<String, String, String>> javaRDD2 = jsc.parallelize(tuple3List);

// Turn the (name, age) tuples into a pair RDD keyed by name.
JavaPairRDD<String, Integer> javaRDD3 = javaRDD1.mapToPair(new PairFunction<Tuple2<String, Integer>, String, Integer>() {
    public Tuple2<String, Integer> call(Tuple2<String, Integer> tuple2) {
        return tuple2;
    }
});
JavaPairRDD<String, Integer> javaRDD31 = javaRDD3.partitionBy(new HashPartitioner(2));

// Re-key the triples by name, keeping (gender, city) as the value.
JavaPairRDD<String, Tuple2<String, String>> javaRDD4 = javaRDD2.mapToPair(new PairFunction<Tuple3<String, String, String>, String, Tuple2<String, String>>() {
    public Tuple2<String, Tuple2<String, String>> call(Tuple3<String, String, String> tuple3) {
        return new Tuple2<>(tuple3._1(), new Tuple2<>(tuple3._2(), tuple3._3()));
    }
});
JavaPairRDD<String, Tuple2<String, String>> javaRDD41 = javaRDD4.partitionBy(new HashPartitioner(2));

// Build the seventh RDD by joining the fifth and sixth RDDs (javaRDD31 and javaRDD41).
JavaPairRDD<String, Tuple2<Integer, Tuple2<String, String>>> javaRDD6 = javaRDD31.join(javaRDD41);

// foreach is the action that actually runs the job.
javaRDD6.foreach(new VoidFunction<Tuple2<String, Tuple2<Integer, Tuple2<String, String>>>>() {
    public void call(Tuple2<String, Tuple2<Integer, Tuple2<String, String>>> stringTuple2Tuple2) throws Exception {
        // NOTE(review): the per-record body is not shown in the original snippet; left empty.
    }
});
查看Spark Web UI
Note: 从上面的 UI 可以看出,在 Stage5 中 partitionBy 和 join 处于同一个 Stage,而 join 是作用在 partitionBy 结果之上的子RDD算子。由于两个父RDD已经用相同的分区器 HashPartitioner(2) 预先分区,join 时不需要再次 shuffle,故而可以得出结论:在此 Stage 中,join 是一个窄依赖
// Demo 2: the two pair RDDs are joined directly, with no partitionBy beforehand.
SparkConf conf = new SparkConf().setAppName("Java-Test-WordCount").setMaster("local[*]");
JavaSparkContext jsc = new JavaSparkContext(conf);

// Sample data: (name, age) pairs and (name, gender, city) triples.
List<Tuple2<String, Integer>> tuple2List1 = Arrays.asList(new Tuple2<>("Alice", 15), new Tuple2<>("Bob", 18), new Tuple2<>("Thomas", 20), new Tuple2<>("Catalina", 25));
List<Tuple3<String, String, String>> tuple3List = Arrays.asList(new Tuple3<>("Alice", "Female", "NanJ"), new Tuple3<>("Thomas", "Male", "ShangH"), new Tuple3<>("Tom", "Male", "BeiJ"));

JavaRDD<Tuple2<String, Integer>> javaRDD1 = jsc.parallelize(tuple2List1);
JavaRDD<Tuple3<String, String, String>> javaRDD2 = jsc.parallelize(tuple3List);

// Turn the (name, age) tuples into a pair RDD keyed by name.
JavaPairRDD<String, Integer> javaRDD3 = javaRDD1.mapToPair(new PairFunction<Tuple2<String, Integer>, String, Integer>() {
    public Tuple2<String, Integer> call(Tuple2<String, Integer> tuple2) {
        return tuple2;
    }
});

// Re-key the triples by name, keeping (gender, city) as the value.
JavaPairRDD<String, Tuple2<String, String>> javaRDD4 = javaRDD2.mapToPair(new PairFunction<Tuple3<String, String, String>, String, Tuple2<String, String>>() {
    public Tuple2<String, Tuple2<String, String>> call(Tuple3<String, String, String> tuple3) {
        return new Tuple2<>(tuple3._1(), new Tuple2<>(tuple3._2(), tuple3._3()));
    }
});

// Build the fifth RDD by joining the third and fourth RDDs.
JavaPairRDD<String, Tuple2<Integer, Tuple2<String, String>>> javaRDD5 = javaRDD3.join(javaRDD4);

// foreach is the action that actually runs the job.
javaRDD5.foreach(new VoidFunction<Tuple2<String, Tuple2<Integer, Tuple2<String, String>>>>() {
    public void call(Tuple2<String, Tuple2<Integer, Tuple2<String, String>>> stringTuple2Tuple2) throws Exception {
        // NOTE(review): the per-record body is not shown in the original snippet; left empty.
    }
});
查看Spark Web UI