import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

/**
 * Walk-through of common Spark RDD transformations and actions.
 *
 * Each `demo*` helper builds one or more small RDDs, applies a single operator
 * and prints the result, followed by a separator line. The helpers are invoked
 * from [[RDDTest.main]] in a fixed order, so the console output follows the same
 * sequence of sections on every run (element order inside a section is decided
 * by Spark).
 */
object RDDTest {

  /** Line printed after each demo section to visually separate the output. */
  private val Separator = "-----------------------------------------------------"

  /**
   * Entry point: creates a local SparkContext, runs every demo in order and
   * always stops the context afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("matrix T").setMaster("local"))
    try {
      demoMap(sc)
      demoFlatMap(sc)
      demoFilter(sc)
      demoMapPartitions(sc)
      demoGlom(sc)
      demoDistinct(sc)
      demoCartesianAndUnion(sc)
      demoMapValues(sc)
      demoSubtract(sc)
      demoSample(sc)
      demoTakeSample(sc)
      demoGroupBy(sc)
      demoPartitionBy(sc)
      demoCogroup(sc)
      demoCombineByKey(sc)
      demoReduceByKey(sc)
      demoJoin(sc)
    } finally {
      // Release the context even when one of the demos fails (e.g. the HDFS file is missing).
      sc.stop()
    }
  }

  /** Prints the section separator. */
  private def printSeparator(): Unit = println(Separator)

  /**
   * map[U: ClassTag](f: T => U): RDD[U]
   *
   * Converts every element of type T into an element of type U using `f`.
   * T and U may be the same type.
   */
  private def demoMap(sc: SparkContext): Unit = {
    val numbers = sc.parallelize(1 to 10)

    numbers.map(_ * 2).foreach(println)
    printSeparator()

    numbers.map(_ + 3).foreach(println)
    printSeparator()

    numbers.map(n => "No." + n).foreach(println)
    printSeparator()
  }

  /**
   * flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U]
   *
   * Like map, but `f` returns a collection for every element and the
   * collections are flattened into a single RDD.
   *
   * According to the original author's notes the input file contains:
   * {{{
   * hello my name is chenjie
   * i come from shanghai university
   * thank you
   * }}}
   * so the first demo yields
   * "hello","my","name","is","chenjie","i","come","from","shanghai","university","thank","you".
   */
  private def demoFlatMap(sc: SparkContext): Unit = {
    val textLines = sc.textFile("hdfs://pc1:9000/input/chenjie.txt")

    val words = textLines.flatMap { line =>
      // Split once and print the tokens themselves; concatenating the array
      // directly would only print its identity ([Ljava.lang.String;@...).
      val tokens = line.split(" ")
      println("执行一次flatMap,line=" + line + ",拆分成:" + tokens.mkString("[", ",", "]"))
      tokens
    }
    words.foreach(println)
    printSeparator()

    // First and last character of every line. Empty lines contribute nothing
    // instead of failing with StringIndexOutOfBoundsException.
    val edgeChars = textLines.flatMap { line =>
      if (line.isEmpty) ArrayBuffer.empty[Char]
      else ArrayBuffer(line.charAt(0), line.charAt(line.length - 1))
    }
    edgeChars.foreach(println)
    printSeparator()
  }

  /**
   * filter(f: T => Boolean): RDD[T]
   *
   * Keeps only the elements for which the predicate returns true.
   */
  private def demoFilter(sc: SparkContext): Unit = {
    val numbers = sc.parallelize(1 to 10)

    numbers.filter(_ % 2 == 0).foreach(println)
    printSeparator()

    numbers.filter(n => n > 3 && n < 7).foreach(println)
    printSeparator()
  }

  /**
   * mapPartitions[U: ClassTag](f: Iterator[T] => Iterator[U],
   *                            preservesPartitioning: Boolean = false): RDD[U]
   *
   * Applies `f` once per partition, receiving an iterator over that
   * partition's elements.
   */
  private def demoMapPartitions(sc: SparkContext): Unit = {
    val numbers = sc.parallelize(1 to 10)

    numbers.mapPartitions(_.filter(_ > 3)).foreach(println)
    printSeparator()

    // Transformation only: no action is invoked on this RDD, so it is never
    // evaluated and prints nothing.
    val belowFive = numbers.mapPartitions(_.filter(_ < 5))
  }

  /**
   * glom(): RDD[Array[T]]
   *
   * Turns every partition into one array. How the ten elements are grouped
   * depends on the number of partitions of the source RDD.
   */
  private def demoGlom(sc: SparkContext): Unit = {
    val partitionsAsArrays = sc.parallelize(1 to 10).glom()
    partitionsAsArrays.foreach(_.foreach(println))
    printSeparator()
  }

  /**
   * distinct(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T]
   *
   * Returns the distinct elements of the RDD; `numPartitions` is the number
   * of partitions of the result.
   */
  private def demoDistinct(sc: SparkContext): Unit = {
    val withDuplicates = sc.parallelize(Array(1, 2, 2, 2, 2, 2, 4, 5, 6, 7, 7, 7, 7, 7, 7, 8, 8, 9))
    withDuplicates.distinct(2).foreach(println)
    printSeparator()
  }

  /**
   * cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)]
   * Cartesian product of two RDDs, returned as an RDD of tuples.
   *
   * union(other: RDD[T]): RDD[T]
   * Concatenation of two RDDs.
   */
  private def demoCartesianAndUnion(sc: SparkContext): Unit = {
    val as = sc.parallelize(Array("A1", "A2", "A3"))
    val bs = sc.parallelize(Array("B1", "B2", "B3"))

    as.cartesian(bs).foreach(println)
    printSeparator()

    as.union(bs).foreach(println)
    printSeparator()
  }

  /** mapValues: transforms only the value of every key-value pair, keeping the key. */
  private def demoMapValues(sc: SparkContext): Unit = {
    val pairs = sc.parallelize(Array(("A", 1), ("B", 2), ("C", 3)))
    pairs.mapValues(_ * 2).foreach(println)
    printSeparator()
  }

  /**
   * subtract(other: RDD[T]): RDD[T]
   *
   * Set difference: elements of this RDD that are not present in `other`.
   */
  private def demoSubtract(sc: SparkContext): Unit = {
    val left = sc.parallelize(Array("A", "B", "C", "D"))
    val right = sc.parallelize(Array("C", "D", "E", "F"))
    left.subtract(right).foreach(println)
    printSeparator()
  }

  /**
   * sample(withReplacement: Boolean, fraction: Double,
   *        seed: Long = Utils.random.nextLong): RDD[T]
   *
   * Samples the RDD with the given expected fraction and random seed.
   */
  private def demoSample(sc: SparkContext): Unit = {
    val letters = sc.parallelize(Array("A", "B", "C", "D"))
    val sampleRDD = letters.sample(withReplacement = true, fraction = 0.5, seed = 3L)
    sampleRDD.foreach(println)
    printSeparator()
  }

  /**
   * takeSample(withReplacement: Boolean, num: Int,
   *            seed: Long = Utils.random.nextLong): Array[T]
   *
   * Returns a fixed-size sample as a local array (an action, not a transformation).
   */
  private def demoTakeSample(sc: SparkContext): Unit = {
    val letters = sc.parallelize(Array("A", "B", "C", "D"))
    val picked = letters.takeSample(withReplacement = true, num = 3, seed = 3L)
    picked.foreach(println)
    printSeparator()
  }

  /**
   * groupBy[K](f: T => K, p: Partitioner)
   *           (implicit kt: ClassTag[K], ord: Ordering[K] = null): RDD[(K, Iterable[T])]
   *
   * Derives a key for every element with `f`, then groups the elements that
   * share the same key.
   */
  private def demoGroupBy(sc: SparkContext): Unit = {
    val labels = sc.parallelize(Array("V1", "V2", "U1", "W2", "U2", "V2", "W1"))
    labels.groupBy(_.substring(0, 1)).foreach(println)
    printSeparator()
  }

  /**
   * partitionBy(partitioner: Partitioner): RDD[(K, V)]
   *
   * Only available on key-value RDDs. Redistributes the pairs according to
   * the given partitioner.
   */
  private def demoPartitionBy(sc: SparkContext): Unit = {
    val pairs = sc.parallelize(
      Array(("V1", 2), ("V1", 1), ("V2", 2), ("W3", 1), ("U1", 2), ("U1", 1), ("U1", 3)))
    pairs.partitionBy(new HashPartitioner(3)).foreach(println)
    printSeparator()
  }

  /**
   * cogroup[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Iterable[V], Iterable[W]))]
   *
   * Only available on key-value RDDs. For every key found in either RDD,
   * yields the key together with the values from this RDD and the values
   * from `other`.
   */
  private def demoCogroup(sc: SparkContext): Unit = {
    val left = sc.parallelize(
      Array(("V1", 1), ("V2", 2), ("V2", 2), ("U1", 2), ("U2", 1), ("U5", 4)))
    val right = sc.parallelize(Array(("V1", 1), ("V8", 2), ("U1", 2), ("U5", 1)))

    left.cogroup(right).foreach { case (key, (leftValues, rightValues)) =>
      println("key=" + key)
      leftValues.foreach(println)
      rightValues.foreach(println)
    }
    printSeparator()
  }

  /**
   * combineByKey[C](createCombiner: V => C, mergeValue: (C, V) => C,
   *                 mergeCombiners: (C, C) => C): RDD[(K, C)]
   *
   * Only available on key-value RDDs. Here every key's values are collected
   * into a List[Int]: a value starts a list, further values are prepended,
   * and partial lists are concatenated.
   */
  private def demoCombineByKey(sc: SparkContext): Unit = {
    val pairs = sc.parallelize(
      Array(("V1", 1), ("V1", 2), ("V2", 2), ("U3", 1), ("U1", 1), ("U2", 2)))
    val valuesPerKey = pairs.combineByKey(
      (v: Int) => List(v),
      (acc: List[Int], v: Int) => v :: acc,
      (a: List[Int], b: List[Int]) => a ::: b)
    valuesPerKey.foreach(println)
    printSeparator()
  }

  /**
   * reduceByKey(func: (V, V) => V): RDD[(K, V)]
   *
   * Similar to combineByKey, but the value type of the result is the same as
   * the value type of the input.
   */
  private def demoReduceByKey(sc: SparkContext): Unit = {
    val pairs = sc.parallelize(
      Array(("V1", 1), ("V1", 2), ("V2", 2), ("U3", 1), ("U1", 1), ("U2", 2)))
    pairs.reduceByKey(_ + _).foreach(println)
    printSeparator()
  }

  /**
   * join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
   *
   * Only available on key-value RDDs. For every key present in both RDDs,
   * pairs up the matching values as (key, (v, w)).
   */
  private def demoJoin(sc: SparkContext): Unit = {
    val left = sc.parallelize(Array(("V1", 1), ("U5", 4)))
    val right = sc.parallelize(Array(("V1", 4), ("U5", 4)))
    left.join(right).foreach(println)
    printSeparator()
  }
}