Spark Operators: RDD Key-Value Transformations (3) – groupByKey, reduceByKey, reduceByKeyLocally
def groupByKey(): RDD[(K, Iterable[V])]
def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])]
def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])]
This function groups the V values for each key K in an RDD[K,V] into a single collection Iterable[V].
scala> var rdd1 = sc.makeRDD(Array(("A",0),("A",2),("B",1),("B",2),("C",1)))
rdd1: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at makeRDD at <console>:21

scala> rdd1.groupByKey().collect
res81: Array[(String, Iterable[Int])] = Array((A,CompactBuffer(0, 2)), (B,CompactBuffer(2, 1)), (C,CompactBuffer(1)))
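The grouped values arrive as an ordinary Iterable[V], so standard collection operations apply. As a minimal follow-up sketch (the per-key average is my own illustration, not part of the original example):

scala> rdd1.groupByKey().mapValues(vs => vs.sum.toDouble / vs.size).collect
// Array[(String, Double)] = Array((A,1.0), (B,1.5), (C,1.0))

For pure aggregations such as sums, reduceByKey below is usually preferred, because it combines values on the map side before shuffling.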
def reduceByKey(func: (V, V) => V): RDD[(K, V)]
def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)]
def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)]
This function reduces the V values for each key K in an RDD[K,V] using the given binary function.
var rdd1 = sc.makeRDD(Array(("A",0),("A",2),("B",1),("B",2),("C",1)))
rdd1.partitions.size //Int = 15
var rdd2 = rdd1.reduceByKey((x,y) => x + y)
rdd2.collect          // Array[(String, Int)] = Array((A,2),(B,3),(C,1))
rdd2.partitions.size  // Int = 15

var rdd2 = rdd1.reduceByKey(new org.apache.spark.HashPartitioner(2), (x,y) => x + y)
rdd2.collect          // Array[(String, Int)] = Array((B,3),(A,2),(C,1))
rdd2.partitions.size  // Int = 2
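Because reduceByKey combines values map-side before the shuffle, it is the idiomatic choice for aggregations such as word count. A minimal sketch (the input lines here are invented for illustration):

val lines = sc.makeRDD(Seq("a b a", "b c"))
val counts = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
counts.collect  // e.g. Array[(String, Int)] = Array((a,2), (b,2), (c,1))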
def reduceByKeyLocally(func: (V, V) => V): Map[K, V]
This function reduces the V values for each key K in an RDD[K,V] using the given function, but returns the results as a Map[K,V] on the driver rather than as an RDD[K,V].
var rdd1 = sc.makeRDD(Array(("A",0),("A",2),("B",1),("B",2),("C",1)))
rdd1.reduceByKeyLocally((x,y) => x + y)  // scala.collection.Map[String,Int] = Map(B -> 3, A -> 2, C -> 1)
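Since reduceByKeyLocally is an action that materializes its result on the driver, it is only appropriate when the number of distinct keys is small. A roughly equivalent formulation through RDD operations (my own comparison, as a sketch):

val m = rdd1.reduceByKey(_ + _).collectAsMap()
// m: scala.collection.Map[String,Int] = Map(B -> 3, A -> 2, C -> 1)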