Spark provides a large number of operators, but only a subset sees regular use, so the common ones are recorded here. If you would like more operators covered, please leave a comment.
scala> val nums=sc.parallelize(List(1,2,3))
nums: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[64] at parallelize at <console>:24
scala> nums.collect()
res43: Array[Int] = Array(1, 2, 3)
scala> nums.map
def map[U](f: Int => U)(implicit evidence$3: scala.reflect.ClassTag[U]): org.apache.spark.rdd.RDD[U]
scala> nums.map(x=>x+1).collect()
res45: Array[Int] = Array(2, 3, 4)
scala> val ff=sc.parallelize(Array((1,2),(3,4),(5,6)))
ff: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[71] at parallelize at <console>:24
scala> ff.flatMap(x=>Seq(x._1)).collect()
res67: Array[Int] = Array(1, 3, 5)
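Unlike map, flatMap flattens the collection returned for each element into a single RDD. A quick sketch (not from the original session) that emits both members of every pair:
ff.flatMap(x => Seq(x._1, x._2)).collect()
// returns Array(1, 2, 3, 4, 5, 6): each input pair contributes two output elements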
scala> val a = sc.parallelize(1 to 10, 3)
a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[99] at parallelize at <console>:24
scala> a.collect()
res84: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
scala> def myfuncPerElement(e: Int): Int = {
     |   println("e=" + e)
     |   e * 2
     | }
myfuncPerElement: (e: Int)Int
scala> def myfuncPerPartition(iter: Iterator[Int]): Iterator[Int] = {
     |   println("run in partition")
     |   val res = for (e <- iter) yield e * 2
     |   res
     | }
myfuncPerPartition: (iter: Iterator[Int])Iterator[Int]
scala> a.map(myfuncPerElement).collect()
e=1
e=2
e=3
e=4
e=5
e=6
e=7
e=8
e=9
e=10
res85: Array[Int] = Array(2, 4, 6, 8, 10, 12, 14, 16, 18, 20)
scala> a.mapPartitions(myfuncPerPartition).collect()
run in partition
run in partition
run in partition
res86: Array[Int] = Array(2, 4, 6, 8, 10, 12, 14, 16, 18, 20)
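The function runs once per partition, so "run in partition" is printed 3 times, matching the 3 partitions of a. A sketch (not from the original session) using mapPartitionsWithIndex to show how the elements are distributed:
a.mapPartitionsWithIndex((idx, iter) => iter.map(x => (idx, x))).collect()
// e.g. Array((0,1), (0,2), (0,3), (1,4), (1,5), (1,6), (2,7), (2,8), (2,9), (2,10))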
scala> a.mapPartitions(iter=>iter.filter(_>3)).collect()
res88: Array[Int] = Array(4, 5, 6, 7, 8, 9, 10)
scala> a.map(iter=>iter.filter(_>3)).collect()
<console>:27: error: value filter is not a member of Int
       a.map(iter=>iter.filter(_>3)).collect()
The error is expected: map passes the function a single Int at a time, so there is no filter method to call, whereas mapPartitions passes an Iterator[Int] for each partition, which is why the previous call works.
scala> val a=sc.parallelize(Array(("c",8),("r",2),("e",3)))
a: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:24
scala> a
res0: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:24
scala> val b=sc.parallelize(Array(("c",2),("r",20),("e",39),("d",3)))
b: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[1] at parallelize at <console>:24
scala> val df = a++b
df: org.apache.spark.rdd.RDD[(String, Int)] = UnionRDD[3] at $plus$plus at <console>:28
scala> df.collect()
res2: Array[(String, Int)] = Array((c,8), (r,2), (e,3), (c,2), (r,20), (e,39), (d,3))
scala> a.collect()
res96: Array[(String, Int)] = Array((c,8), (r,2), (e,3))
scala> b.collect()
res99: Array[(String, Int)] = Array((c,2), (r,20), (e,39), (d,3))
scala> a.cartesian(b).collect()
res100: Array[((String, Int), (String, Int))] = Array(((c,8),(c,2)), ((c,8),(r,20)), ((c,8),(e,39)), ((c,8),(d,3)), ((r,2),(c,2)), ((r,2),(r,20)), ((r,2),(e,39)), ((r,2),(d,3)), ((e,3),(c,2)), ((e,3),(r,20)), ((e,3),(e,39)), ((e,3),(d,3)))
scala> a.groupBy(x=>{if(x._1 >= "e") 1 else 0}).collect()
res103: Array[(Int, Iterable[(String, Int)])] = Array((0,CompactBuffer((c,8))), (1,CompactBuffer((r,2), (e,3))))
scala> val nn=sc.parallelize(List(4,23,6,2,35,8,2,4,1))
nn: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[113] at parallelize at <console>:24
scala> nn.sortBy(x=>x,true).collect()
res114: Array[Int] = Array(1, 2, 2, 4, 4, 6, 8, 23, 35)
scala> nn.sortBy(x=>x,false).collect()
res115: Array[Int] = Array(35, 23, 8, 6, 4, 4, 2, 2, 1)
scala> a.sortBy(_._2).collect()
res117: Array[(String, Int)] = Array((r,2), (e,3), (c,8))
scala> val repl=sc.parallelize(1 to 1000,4)
repl: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[132] at parallelize at <console>:24
scala> repl.count()
res118: Long = 1000
scala> repl.sample(false,0.1)
res119: org.apache.spark.rdd.RDD[Int] = PartitionwiseSampledRDD[133] at sample at <console>:27
scala> repl.sample(false,0.1).collect()
res120: Array[Int] = Array(3, 6, 18, 41, 47, 54, 75, 76, 80, 95, 96, 116, 153, 158, 166, 171, 176, 186, 190, 206, 251, 252, 254, 279, 282, 296, 299, 305, 308, 345, 349, 358, 363, 392, 406, 413, 422, 423, 434, 451, 456, 473, 477, 478, 489, 494, 498, 502, 513, 532, 538, 546, 547, 570, 572, 591, 597, 619, 629, 646, 649, 650, 661, 671, 681, 682, 709, 713, 716, 724, 730, 743, 747, 751, 753, 757, 781, 788, 798, 811, 816, 818, 833, 845, 863, 888, 889, 900, 937, 950, 951, 965, 968, 971, 975, 993)
scala> repl.sample(false,0.1).getNumPartitions
res121: Int = 4
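sample is random, so the numbers above will differ between runs, while the sampled RDD keeps the parent's partitioning (hence 4 partitions). If a repeatable sample is needed, sample also accepts a seed as its third argument; a sketch (not from the original session):
repl.sample(false, 0.1, 42L).collect()
// withReplacement = false, fraction = 0.1, seed = 42: the same elements come back on every run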
cache
Caches the RDD in memory; it is equivalent to persist(MEMORY_ONLY).
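A minimal usage sketch (assumed, not from the original session): the first action materializes and caches the partitions, and later actions read them from memory.
repl.cache()   // mark the RDD as MEMORY_ONLY; nothing is computed yet
repl.count()   // first action computes the RDD and stores its partitions in memory
repl.count()   // subsequent actions reuse the cached partitions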
filter
Filters the data; only the elements that satisfy the predicate are kept.
scala> repl.filter(_>3)
res122: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[136] at filter at <console>:27
scala> repl.filter(_>980).count()
res123: Long = 20
The operators below work on key-value (pair) RDDs.
join combines two pair RDDs by key; internally it is built on cogroup, roughly like this:
this.cogroup(other, partitioner).flatMapValues { case (vs, ws) =>
  for (v <- vs; w <- ws) yield (v, w)
}
The examples below also cover leftOuterJoin and rightOuterJoin.
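Since join is built on cogroup, it can help to look at what cogroup itself returns; a sketch (not from the original session), with the exact key ordering depending on the run:
a.cogroup(b).collect()
// e.g. Array((d,(CompactBuffer(),CompactBuffer(3))), (e,(CompactBuffer(3),CompactBuffer(39))),
//            (r,(CompactBuffer(2),CompactBuffer(20))), (c,(CompactBuffer(8),CompactBuffer(2))))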
scala> a.collect()
res125: Array[(String, Int)] = Array((c,8), (r,2), (e,3))
scala> b.collect()
res126: Array[(String, Int)] = Array((c,2), (r,20), (e,39), (d,3))
scala> b.join(a)
res127: org.apache.spark.rdd.RDD[(String, (Int, Int))] = MapPartitionsRDD[140] at join at <console>:29
scala> b.join(a).collect()
res128: Array[(String, (Int, Int))] = Array((e,(39,3)), (r,(20,2)), (c,(2,8)))
scala> b.leftOuterJoin(a).collect()
res129: Array[(String, (Int, Option[Int]))] = Array((d,(3,None)), (e,(39,Some(3))), (r,(20,Some(2))), (c,(2,Some(8))))
scala> b.rightOuterJoin(a).collect()
res130: Array[(String, (Option[Int], Int))] = Array((e,(Some(39),3)), (r,(Some(20),2)), (c,(Some(2),8)))
scala> val c=sc.parallelize(Array(("c",8),("r",2),("e",8),("c",4)))
scala> c.collect()
res134: Array[(String, Int)] = Array((c,8), (r,2), (e,8), (c,4))
scala> c.groupByKey().collect()
res136: Array[(String, Iterable[Int])] = Array((e,CompactBuffer(8)), (r,CompactBuffer(2)), (c,CompactBuffer(8, 4)))
scala> c.reduceByKey((a,b)=>a*b).collect()
res140: Array[(String, Int)] = Array((e,8), (r,2), (c,32))
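reduceByKey merges all values of a key with the given function, and unlike groupByKey it combines values inside each partition before shuffling. Another sketch (not from the original session), this time summing per key:
c.reduceByKey(_ + _).collect()
// e.g. Array((e,8), (r,2), (c,12)); key order may vary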
scala> c.collect()
res134: Array[(String, Int)] = Array((c,8), (r,2), (e,8), (c,4))
scala> c.mapValues(a=>a+1).collect()
res137: Array[(String, Int)] = Array((c,9), (r,3), (e,9), (c,5))