import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object JoinRDD {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sparkjson").setMaster("local")
    val sc = new SparkContext(conf)

    // Each input file holds whitespace-separated words, e.g. "tom jack"
    val file1 = sc.textFile("C:\\Users\\Think\\Desktop\\json.txt")
    val file2 = sc.textFile("C:\\Users\\Think\\Desktop\\json2.txt")

    // Split each line into words and attach two constant columns:
    // file1 -> (tom,1,2), (jack,1,2)
    val words1: RDD[(String, Int, Int)] = file1.flatMap(_.split(" ")).map((_, 1, 2))
    // file2 -> (tom,3,4), (jack,3,4)
    val words2: RDD[(String, Int, Int)] = file2.flatMap(_.split(" ")).map((_, 3, 4))

    // Re-key both RDDs as (word, (col1, col2)) so they can be joined on the word
    val words1Map = words1.map(x => (x._1, (x._2, x._3)))
    val words2Map = words2.map(x => (x._1, (x._2, x._3)))

    // leftOuterJoin keeps every key from the left side; the right-side value is an
    // Option, which is None for words that appear only in file1
    val result: RDD[(String, ((Int, Int), Option[(Int, Int)]))] = words1Map.leftOuterJoin(words2Map)

    // Flatten the nested tuples into (word, left1, left2, right1, right2).
    // getOrElse avoids the NoSuchElementException that .get would throw for a word
    // with no match in file2; (0, 0) is an arbitrary placeholder default.
    val finalResult = result.map { case (key, ((left1, left2), rightOpt)) =>
      val (right1, right2) = rightOpt.getOrElse((0, 0))
      (key, left1, left2, right1, right2)
    }

    // println(words1.collect().toBuffer)
    // println(words2.collect().toBuffer)
    println(finalResult.collect().toBuffer)

    sc.stop()
  }
}
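
For reference, assuming both json.txt and json2.txt contain the line "tom jack" (the scenario the inline comments describe), the intermediate and final values would look roughly like the trace below; this is an illustration, not output from the original run:

// words1Map   -> (tom,(1,2)), (jack,(1,2))
// words2Map   -> (tom,(3,4)), (jack,(3,4))
// result      -> (tom,((1,2),Some((3,4)))), (jack,((1,2),Some((3,4))))
// finalResult -> ArrayBuffer((tom,1,2,3,4), (jack,1,2,3,4))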