sparkcore的join

先准备两个rdd数据

scala> val a = sc.parallelize(Array(("A","a1"),("B","b1"),("C","c1"),("E","e1")))
a: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[5] at parallelize at <console>:24

scala> val b = sc.parallelize(Array(("A","a2"),("B","b2"),("D","d2"),("F","f2")))
b: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[6] at parallelize at <console>:24

1.a join b得到的结果

scala> a.join(b).collect()
res9: Array[(String, (String, String))] = Array((B,(b1,b2)), (A,(a1,a2)))

2.a leftOuterJoin b得到的结果

scala> a.leftOuterJoin(b).collect
res10: Array[(String, (String, Option[String]))] = Array((B,(b1,Some(b2))), (A,(a1,Some(a2))), (C,(c1,None)), (E,(e1,None)))

3.a rightOuterJoin b得到的结果

scala> a.rightOuterJoin(b).collect
res11: Array[(String, (Option[String], String))] = Array((B,(Some(b1),b2)), (F,(None,f2)), (D,(None,d2)), (A,(Some(a1),a2)))

4.a fullOuterJoin b得到的结果

scala> a.fullOuterJoin(b).collect
res12: Array[(String, (Option[String], Option[String]))] = Array((B,(Some(b1),Some(b2))), (F,(None,Some(f2))), (D,(None,Some(d2))), (A,(Some(a1),Some(a2))), (C,(Some(c1),None)), (E,(Some(e1),None)))

你可能感兴趣的:(sparkcore的join)