The leftOuterJoin operation

For pair RDDs, leftOuterJoin keeps every key from the left RDD and attaches the matching value from the right RDD as an Option, so keys with no match on the right come back as None. The program below builds two word RDDs from text files, reshapes them into key-value pairs, joins them on the word, and flattens the joined result.


import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object JoinRDD {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sparkjson").setMaster("local")
    val sc = new SparkContext(conf)
    val file1 = sc.textFile("C:\\Users\\Think\\Desktop\\json.txt")
    val file2 = sc.textFile("C:\\Users\\Think\\Desktop\\json2.txt")

    // Each word in file1 becomes a triple, e.g. (tom,1,2), (jack,1,2)
    val words1: RDD[(String, Int, Int)] = file1.flatMap(_.split(" ")).map((_, 1, 2))
    // Each word in file2 becomes a triple, e.g. (tom,3,4), (jack,3,4)
    val words2: RDD[(String, Int, Int)] = file2.flatMap(_.split(" ")).map((_, 3, 4))

    // leftOuterJoin works on key-value pairs, so reshape the triples into (word, (v1, v2))
    val words1Map = words1.map(x => (x._1, (x._2, x._3)))
    val words2Map = words2.map(x => (x._1, (x._2, x._3)))

    // Every key of words1Map is kept; the right-hand value is an Option,
    // which is None for words that do not appear in words2Map
    val result: RDD[(String, ((Int, Int), Option[(Int, Int)]))] = words1Map.leftOuterJoin(words2Map)

    val finalResult = result.map { case (key, (left, rightOpt)) =>
      // Use getOrElse rather than Option.get: .get would throw for keys
      // that have no match on the right; (0, 0) is an arbitrary placeholder
      val right = rightOpt.getOrElse((0, 0))
      (key, left._1, left._2, right._1, right._2)
    }

//    println(words1.collect().toBuffer)
//    println(words2.collect().toBuffer)
    println(finalResult.collect().toBuffer)
    sc.stop()
  }
}
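
To see the None case that leftOuterJoin introduces without depending on the two text files, here is a minimal, self-contained sketch using in-memory data; the object name and sample pairs are illustrative, not part of the original program.

import org.apache.spark.{SparkConf, SparkContext}

object LeftOuterJoinDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("leftOuterJoinDemo").setMaster("local"))

    // "rose" exists only in the left RDD, so its right side will be None
    val left  = sc.parallelize(Seq(("tom", (1, 2)), ("jack", (1, 2)), ("rose", (1, 2))))
    val right = sc.parallelize(Seq(("tom", (3, 4)), ("jack", (3, 4))))

    val joined = left.leftOuterJoin(right)
    joined.collect().foreach(println)
    // Expected output (order may vary):
    //   (tom,((1,2),Some((3,4))))
    //   (jack,((1,2),Some((3,4))))
    //   (rose,((1,2),None))

    sc.stop()
  }
}

Pattern matching on the Option (Some / None) or getOrElse are the usual ways to consume the right-hand side; calling .get directly only works when every left key is guaranteed to have a match.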

