一、基本RDD
1、转化操作
(1)map()、flatMap()、filter()
scala> val line = sc.textFile("/Users/qanfuhong/Desktop/log.txt")
line: org.apache.spark.rdd.RDD[String] = /Users/qanfuhong/Desktop/log.txt MapPartitionsRDD[1] at textFile at :24
scala> val erroRDD = line.filter(line => line.contains("error"))
erroRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at filter at :26
scala> val errorRDD = line.filter(line => line.contains("error"))
errorRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[3] at filter at :26
scala> val input =sc.parallelize(List(1,2,3,4))
input: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[4] at parallelize at :24
scala> val result = input.map(x => x*x)
result: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[5] at map at :26
scala> println(result.collect().mkString(","))
1,4,9,16
scala> println(result.collect().mkString(":"))
1:4:9:16
scala> val lines = sc.parallelize(List("hello jjames","hi"))
lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[6] at parallelize at :24
scala> val words = lines.flatMap(x => x.split(" "))
words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[7] at flatMap at :26
scala> words.first()
res11: String = hello
collect()在生产环境中,慎用。
2、伪集合操作:union(rdd)--并集,distinct(rdd)--去重,intersection(rdd)--交集,rdd1.subtract(rdd2)--返回只存在于rdd1中的元素集合,cartesian(rdd)--笛卡尔积
scala> val lines = sc.parallelize(List(1,1,1,3,2,4,2))
lines: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[8] at parallelize at :24
scala> val linesDistinct = lines.distinct()
linesDistinct: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[11] at distinct at :26
scala> println(linesDistinct.collect().mkString(","))
1,2,3,4
scala> val rdd1 = sc.parallelize(List("coffee","coffee","pandas","monkey","tea"))
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[12] at parallelize at :24
scala> val rdd2 = sc.makeRDD(List("coffee","money","kitty"))
rdd2: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[13] at makeRDD at :24
scala> rdd1.distinct()
res13: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[16] at distinct at :27
scala> println(rdd1.distinct().collect().mkString(","))
tea,pandas,monkey,coffee
scala> val rddUnion = rdd1.union(rdd2)
rddUnion: org.apache.spark.rdd.RDD[String] = UnionRDD[20] at union at :28
scala> print(rddUnion.collect().mkString(","))
coffee,coffee,pandas,monkey,tea,coffee,money,kitty
scala> val rddInter = rdd1.intersection(rdd2)
rddInter: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[26] at intersection at :28
scala> print(rddInter.collect().mkString(","))
coffee
scala> val rddSub = rdd1.subtract(rdd2)
rddSub: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[30] at subtract at :28
scala> print(rddSub.collect().mkString(" "))
tea pandas monkey
scala> val rddCar = rdd1.cartesian(rdd2)
rddCar: org.apache.spark.rdd.RDD[(String, String)] = CartesianRDD[31] at cartesian at :28
scala> print(rddCar.collect().mkString(","))
(coffee,coffee),(coffee,money),(coffee,kitty),(coffee,coffee),(coffee,money),(coffee,kitty),(pandas,coffee),(pandas,money),(pandas,kitty),(monkey,coffee),(monkey,money),(monkey,kitty),(tea,coffee),(tea,money),(tea,kitty)
scala>
3、行动操作