scala> val c = sc.parallelize(List("aaa","b","b","c"))
c: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[8] at parallelize at <console>:21
// map each element to an (element, length) pair
scala> c.map(x => (x,x.length))
res7: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[9] at map at <console>:24
scala> res7.first
res8: (String, Int) = (aaa,3)
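As a side note (not part of the session above), Spark can also build a pair RDD with keyBy, which computes the key from each element; note the key/value order is reversed compared with the map above:
// sketch: keyBy(f) yields (f(x), x), so the length becomes the key
c.keyBy(_.length)   // RDD[(Int, String)]: (3,aaa), (1,b), (1,b), (1,c)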
// map over the values only, turning each value into a tuple
scala> res7.mapValues(y=>(y,10))
res11: org.apache.spark.rdd.RDD[(String, (Int, Int))] = MapPartitionsRDD[10] at mapValues at <console>:26
scala> res11.collect
res12: Array[(String, (Int, Int))] = Array((aaa,(3,10)), (b,(1,10)), (b,(1,10)), (c,(1,10)))
scala> res7.collect
res13: Array[(String, Int)] = Array((aaa,3), (b,1), (b,1), (c,1))
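For comparison (a sketch using the res7 RDD above, not typed in the original session): mapValues is equivalent to a plain map over the pairs, except that mapValues preserves the RDD's partitioner, which can avoid a shuffle in later key-based operations:
// same result as res7.mapValues(y => (y, 10)), but map drops the partitioner
res7.map { case (k, v) => (k, (v, 10)) }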
----------------------------------------------------------------------------------------------
Final exercise of the morning: use reduceByKey and mapValues
val key = sc.parallelize(List(("panda",0),("pink",3),("pirate",3),("panda",1),("pink",4)))
Putting it together: key.mapValues(y=>(y,1)).reduceByKey((x,y) => (x._1+y._1 , x._2+y._2))
scala> val key = sc.parallelize(List(("panda",0),("pink",3),("pirate",3),("panda",1),("pink",4)))
key: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[12] at parallelize at <console>:21
scala> key.collect
res15: Array[(String, Int)] = Array((panda,0), (pink,3), (pirate,3), (panda,1), (pink,4))
scala> val key1 = key.mapValues(y=>(y,1))
key1: org.apache.spark.rdd.RDD[(String, (Int, Int))] = MapPartitionsRDD[13] at mapValues at <console>:23
scala> key1.collect
res16: Array[(String, (Int, Int))] = Array((panda,(0,1)), (pink,(3,1)), (pirate,(3,1)), (panda,(1,1)), (pink,(4,1)))
scala> val key2 = key1.reduceByKey((x,y) => (x._1+y._1 , x._2+y._2))
key2: org.apache.spark.rdd.RDD[(String, (Int, Int))] = ShuffledRDD[14] at reduceByKey at <console>:25
scala> key2.collect
res17: Array[(String, (Int, Int))] = Array((pirate,(3,1)), (panda,(1,2)), (pink,(7,2)))
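key2 now holds a (sum, count) pair per key, so the per-key average is one more mapValues away. A sketch continuing the session (the val name avg is ours; element order may differ):
val avg = key2.mapValues { case (sum, cnt) => sum.toDouble / cnt }
avg.collect   // e.g. Array((pirate,3.0), (panda,0.5), (pink,3.5))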
----------------------------------------------------------------------------------------------
join pairs up the values of matching keys; it behaves like an inner join
scala> val aa = sc.parallelize(List(("K1","V1"),("K2","V2"),("K3","V3")))
aa: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[15] at parallelize at <console>:21
scala> val bb = sc.parallelize(List(("K1","V1"),("K2","V2"),("K4","V4")))
bb: org.apache.spark.rdd.RDD[(String, String)] = ParallelCollectionRDD[16] at parallelize at <console>:21
scala> aa.join(bb)
res18: org.apache.spark.rdd.RDD[(String, (String, String))] = MapPartitionsRDD[19] at join at <console>:26
scala> res18.collect
res19: Array[(String, (String, String))] = Array((K1,(V1,V1)), (K2,(V2,V2)))
// bb.join(aa).collect also works
scala> bb.join(aa) collect
warning: there were 1 feature warning(s); re-run with -feature for details
res20: Array[(String, (String, String))] = Array((K1,(V1,V1)), (K2,(V2,V2)))
// joining an RDD with itself: each key is paired with two copies of the same value
scala> bb.join(bb) collect
warning: there were 1 feature warning(s); re-run with -feature for details
res21: Array[(String, (String, String))] = Array((K1,(V1,V1)), (K4,(V4,V4)), (K2,(V2,V2)))
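For contrast with the inner join above, Spark also provides outer joins on pair RDDs; a minimal sketch (output order may vary):
// leftOuterJoin keeps every key from aa; missing right-side values become None
aa.leftOuterJoin(bb).collect
// e.g. Array((K1,(V1,Some(V1))), (K2,(V2,Some(V2))), (K3,(V3,None)))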
// saveAsTextFile writes the RDD out to HDFS as text
bb.saveAsTextFile("/root/bbbbbbb00")
Three part files appear in HDFS (part-00000, part-00001, part-00002) because the RDD has three partitions, the default parallelism on this three-node cluster.
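Since each partition becomes one part file, the output layout can be controlled by repartitioning first. A sketch with a hypothetical output path:
// coalesce(1) shrinks the RDD to a single partition, giving a single part file
bb.coalesce(1).saveAsTextFile("/root/bb_single_file")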
// take(n) returns the first n elements
scala> bb.take(2)
res23: Array[(String, String)] = Array((K1,V1), (K2,V2))
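Unlike collect, take(n) only fetches enough data to produce n elements, so it is safe on large RDDs. A few related actions (a sketch, not typed in the original session):
bb.first           // first element, same as take(1)(0)
bb.count           // number of elements: 3
bb.takeOrdered(2)  // smallest 2 under the implicit tuple ordering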