RDD中的转换和action(二)PairRDD操作

package RDD

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Walkthrough of common pair-RDD transformations and actions.
 *
 * Created by legotime on 2016/5/5.
 *
 * The sample output quoted in the comments was produced with master "local".
 * Rows that come out of a shuffle (reduceByKey, groupByKey, joins, ...) may be
 * printed in a different order on another run.
 */
object pairRDD {

  /**
   * Tags every element of a partition with the index of that partition.
   *
   * The iterator is mapped lazily, so the partition is never copied into an
   * intermediate collection.
   *
   * @param index index of the partition being processed
   * @param iter  elements of that partition
   * @return one string "[partID:<index>, val: <element>]" per element
   */
  def myfunc1(index: Int, iter: Iterator[(String)]): Iterator[String] =
    iter.map(x => s"[partID:$index, val: $x]")

  /**
   * Same as [[myfunc1]], but for partitions holding (Int, String) pairs.
   *
   * @param index index of the partition being processed
   * @param iter  key/value pairs of that partition
   * @return one string "[partID:<index>, val: (<key>,<value>)]" per pair
   */
  def myfunc2(index: Int, iter: Iterator[(Int, String)]): Iterator[String] =
    iter.map(x => s"[partID:$index, val: $x]")

  /**
   * Runs every demo against a local SparkContext and prints the results.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("pair RDD").setMaster("local")
    val sc = new SparkContext(conf)

    // Stop the context even when one of the jobs below throws.
    try {
      val singleRdd = sc.parallelize(List("scala", "python", "java", "Spark", "hadoop"), 2)
      singleRdd.mapPartitionsWithIndex(myfunc1).collect.foreach(println)
      //[partID:0, val: scala]
      //[partID:0, val: python]
      //[partID:1, val: java]
      //[partID:1, val: Spark]
      //[partID:1, val: hadoop]

      //----------------------------- a single pair RDD -----------------------------
      // Key every word by its length.
      val pairRdd = singleRdd.map(x => (x.length, x))
      pairRdd.mapPartitionsWithIndex(myfunc2).collect.foreach(println)
      //[partID:0, val: (5,scala)]
      //[partID:0, val: (6,python)]
      //[partID:1, val: (4,java)]
      //[partID:1, val: (5,Spark)]
      //[partID:1, val: (6,hadoop)]

      // Merge the values that share a key.
      // Equal keys are first combined inside each partition, then the
      // per-partition results are merged across partitions.
      pairRdd.reduceByKey(_ + _).collect().foreach(println)
      //(4,java)
      //(6,pythonhadoop)
      //(5,scalaSpark)

      // Group the values that share a key.
      pairRdd.groupByKey().collect.foreach(println)
      //(4,CompactBuffer(java))
      //(6,CompactBuffer(python, hadoop))
      //(5,CompactBuffer(scala, Spark))

      // Apply a function to every value, leaving the keys untouched.
      pairRdd.mapValues(x => "I am " + x).collect.foreach(println)
      //(5,I am scala)
      //(6,I am python)
      //(4,I am java)
      //(5,I am Spark)
      //(6,I am hadoop)

      // The returned String is traversed character by character, so every
      // character becomes its own (key, Char) pair.
      pairRdd.flatMapValues(x => "I am " + x).collect.foreach(print)
      // print emits no line break; finish the line so later output starts cleanly.
      println()
      //(5,I)(5, )(5,a)(5,m)(5, )(5,s)(5,c)(5,a)(5,l)(5,a)(6,I)(6, )(6,a)(6,m)(6, )(6,p)(6,y)(6,t)(6,h)(6,o)(6,n)(4,I)(4, )(4,a)(4,m)(4, )(4,j)(4,a)(4,v)(4,a)(5,I)(5, )(5,a)(5,m)(5, )(5,S)(5,p)(5,a)(5,r)(5,k)(6,I)(6, )(6,a)(6,m)(6, )(6,h)(6,a)(6,d)(6,o)(6,o)(6,p)

      // Keys only, then values only, in partition order.
      pairRdd.keys.collect.foreach(println)
      pairRdd.values.collect.foreach(println)

      // Sort by key (ascending by default).
      pairRdd.sortByKey().collect.foreach(println)
      //(4,java)
      //(5,scala)
      //(5,Spark)
      //(6,python)
      //(6,hadoop)

      //----------------------------- two pair RDDs -----------------------------
      val tempPairRdd = sc.parallelize(List((5, "flink")))
      tempPairRdd.collect.foreach(println)
      //(5,flink)

      // subtract compares whole (key, value) elements, not just keys. No
      // element of pairRdd equals (5,flink), so all five pairs survive.
      pairRdd.subtract(tempPairRdd).mapPartitionsWithIndex(myfunc2).collect.foreach(println)

      // Inner join: only keys present in both RDDs are kept.
      pairRdd.join(tempPairRdd).collect.foreach(println)
      //(5,(scala,flink))
      //(5,(Spark,flink))

      // Left outer join: every key of pairRdd (the left side) is kept; the
      // right-hand value is None when tempPairRdd has no such key.
      pairRdd.leftOuterJoin(tempPairRdd).collect.foreach(println)
      //(4,(java,None))
      //(6,(python,None))
      //(6,(hadoop,None))
      //(5,(scala,Some(flink)))
      //(5,(Spark,Some(flink)))

      // Right outer join: every key of tempPairRdd (the right side) is kept;
      // the left-hand value is wrapped in an Option.
      pairRdd.rightOuterJoin(tempPairRdd).collect.foreach(println)
      //(5,(Some(scala),flink))
      //(5,(Some(Spark),flink))

      // cogroup: for every key found in either RDD, the values from both sides.
      pairRdd.cogroup(tempPairRdd).collect.foreach(println)
      //(4,(CompactBuffer(java),CompactBuffer()))
      //(6,(CompactBuffer(python, hadoop),CompactBuffer()))
      //(5,(CompactBuffer(scala, Spark),CompactBuffer(flink)))

      //============================== actions on a pair RDD ==============================
      val actionRdd = sc.parallelize(List((1, 2), (3, 4), (5, 6), (1, 6)), 2)

      // Number of occurrences of each distinct (key, value) element.
      actionRdd.countByValue.foreach(println)
      //((1,2),1)
      //((5,6),1)
      //((1,6),1)
      //((3,4),1)

      // Number of elements per key.
      actionRdd.countByKey.foreach(println)
      //(1,2)
      //(3,1)
      //(5,1)

      // Collect into a Map; with duplicate keys only one value per key is kept
      // (here (1,6) is what remains for key 1).
      actionRdd.collectAsMap().foreach(println)
      //(5,6)
      //(1,6)
      //(3,4)

      // All values stored under key 1.
      actionRdd.lookup(1).foreach(println)
      //2
      //6
    } finally {
      sc.stop()
    }
  }

}

//更多操作参考:http://homepage.cs.latrobe.edu.au/zhe/ZhenHeSparkRDDAPIExamples.html

你可能感兴趣的:(源码,RDD)