package RDD

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Walk-through of common pair-RDD (key/value RDD) transformations and actions.
 *
 * Every step prints its result to stdout. The sample outputs quoted in the comments were
 * produced with a local master and two partitions; for operations that shuffle data the
 * ordering of the printed elements may differ between runs.
 *
 * Created by legotime on 2016/5/5.
 */
object pairRDD {

  /**
   * Renders every element of a partition as "[partID:<index>, val: <element>]".
   *
   * The iterator is mapped lazily, so the partition is never materialised in memory.
   */
  private def tagWithPartition[T](index: Int, iter: Iterator[T]): Iterator[String] =
    iter.map(x => s"[partID:$index, val: $x]")

  /**
   * Labels each String element of a partition with the partition index.
   *
   * @param index index of the partition being processed
   * @param iter  elements of that partition
   * @return one formatted line per element
   */
  def myfunc1(index: Int, iter: Iterator[(String)]): Iterator[String] =
    tagWithPartition(index, iter)

  /**
   * Labels each (Int, String) pair of a partition with the partition index.
   *
   * @param index index of the partition being processed
   * @param iter  key/value pairs of that partition
   * @return one formatted line per pair
   */
  def myfunc2(index: Int, iter: Iterator[(Int, String)]): Iterator[String] =
    tagWithPartition(index, iter)

  /**
   * Runs the demo on a local Spark master.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("pair RDD").setMaster("local")
    val sc = new SparkContext(conf)

    // Always release the context, even when one of the demo steps throws.
    try {
      val words = sc.parallelize(List("scala", "python", "java", "Spark", "hadoop"), 2)
      words.mapPartitionsWithIndex(myfunc1).collect.foreach(println)
      //[partID:0, val: scala]
      //[partID:0, val: python]
      //[partID:1, val: java]
      //[partID:1, val: Spark]
      //[partID:1, val: hadoop]

      //----------------------------- a single pair RDD -----------------------------
      // Key every word by its length.
      val pairs = words.map(x => (x.length, x))
      pairs.mapPartitionsWithIndex(myfunc2).collect.foreach(println)
      //[partID:0, val: (5,scala)]
      //[partID:0, val: (6,python)]
      //[partID:1, val: (4,java)]
      //[partID:1, val: (5,Spark)]
      //[partID:1, val: (6,hadoop)]

      // Merge the values that share a key. Values with the same key are first combined
      // inside each partition, and the partial results are then combined across partitions.
      pairs.reduceByKey(_ + _).collect().foreach(println)
      //(4,java)
      //(6,pythonhadoop)
      //(5,scalaSpark)

      // Group the values that share a key.
      pairs.groupByKey().collect.foreach(println)
      //(4,CompactBuffer(java))
      //(6,CompactBuffer(python, hadoop))
      //(5,CompactBuffer(scala, Spark))

      // Apply a function to every value, leaving the keys untouched.
      pairs.mapValues(x => "I am " + x).collect.foreach(println)
      //(5,I am scala)
      //(6,I am python)
      //(4,I am java)
      //(5,I am Spark)
      //(6,I am hadoop)

      // flatMapValues expects a collection per value; the String result is treated as a
      // sequence of Chars, so every character becomes its own (key, char) pair.
      pairs.flatMapValues(x => "I am " + x).collect.foreach(print)
      //(5,I)(5, )(5,a)(5,m)(5, )(5,s)(5,c)(5,a)(5,l)(5,a)(6,I)(6, )(6,a)(6,m)(6, )(6,p)...
      // `print` emits no line break; terminate the line so later output starts on a fresh one.
      println()

      pairs.keys.collect.foreach(println)
      pairs.values.collect.foreach(println)

      pairs.sortByKey().collect.foreach(println)
      //(4,java)
      //(5,scala)
      //(5,Spark)
      //(6,python)
      //(6,hadoop)

      //----------------------------- two pair RDDs -----------------------------
      val tempPairRDD = sc.parallelize(List((5, "flink")))
      tempPairRDD.collect.foreach(println)

      // subtract removes elements that are equal as whole (key, value) pairs.
      pairs.subtract(tempPairRDD).mapPartitionsWithIndex(myfunc2).collect.foreach(println)

      // Inner join: only keys present in both RDDs survive.
      pairs.join(tempPairRDD).collect.foreach(println)

      // Left outer join: every key of the left RDD (pairs) is kept; the right side is an Option.
      pairs.leftOuterJoin(tempPairRDD).collect.foreach(println)
      //(4,(java,None))
      //(6,(python,None))
      //(6,(hadoop,None))
      //(5,(scala,Some(flink)))
      //(5,(Spark,Some(flink)))

      // Right outer join: every key of the right RDD (tempPairRDD) is kept; the left side is
      // an Option.
      pairs.rightOuterJoin(tempPairRDD).collect.foreach(println)
      //(5,(Some(scala),flink))
      //(5,(Some(Spark),flink))

      // cogroup: for every key found in either RDD, the values from both sides, grouped.
      pairs.cogroup(tempPairRDD).collect.foreach(println)
      //(4,(CompactBuffer(java),CompactBuffer()))
      //(6,(CompactBuffer(python, hadoop),CompactBuffer()))
      //(5,(CompactBuffer(scala, Spark),CompactBuffer(flink)))

      //============================== pair RDD actions ==============================
      val actionPairs = sc.parallelize(List((1, 2), (3, 4), (5, 6), (1, 6)), 2)

      // Counts how often each whole (key, value) element occurs.
      actionPairs.countByValue.foreach(println)
      //((1,2),1)
      //((5,6),1)
      //((1,6),1)
      //((3,4),1)

      // Counts the number of elements per key.
      actionPairs.countByKey.foreach(println)
      //(1,2)
      //(3,1)
      //(5,1)

      // Collects into a Map; for a duplicated key only one of its values is retained.
      actionPairs.collectAsMap().foreach(println)
      //(5,6)
      //(1,6)
      //(3,4)

      // Returns every value stored under the given key.
      actionPairs.lookup(1).foreach(println)
      //2
      //6
    } finally {
      sc.stop()
    }
  }
}
// For more operations, see: http://homepage.cs.latrobe.edu.au/zhe/ZhenHeSparkRDDAPIExamples.html