import org.apache.spark.{HashPartitioner, SparkConf, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

def mapPartitions(): Unit = {
  val conf = new SparkConf().setAppName("mapPartitions").setMaster("local[*]")
  val sc = new SparkContext(conf)
  val a = sc.parallelize(1 to 9, 3)
  // Double every element of a partition and return a new iterator for that partition
  def doubleFunc(iter: Iterator[Int]): Iterator[Int] = {
    var res = List[Int]()
    while (iter.hasNext) {
      val cur = iter.next()
      res = (cur * 2) :: res
    }
    res.iterator
  }
  val result = a.mapPartitions(doubleFunc)
  result.collect().foreach(println)
}
def mapPartitionsWithIndex(): Unit = {
  val conf = new SparkConf().setAppName("mapPartitionsWithIndex").setMaster("local[*]")
  val sc = new SparkContext(conf)
  val rdd1: RDD[Int] = sc.makeRDD(1 to 10, 2)
  // For each partition, emit one string of the form "<partition index>|<elements>"
  val rdd2: RDD[String] = rdd1.mapPartitionsWithIndex((index, iter) => {
    var result = List[String]()
    result = s"$index|${iter.toList}" :: result
    result.iterator
  })
  val res: Array[String] = rdd2.collect()
  for (x <- res) {
    println(x)
  }
}
def filter(): Unit = {
  val conf = new SparkConf().setAppName("filter").setMaster("local[*]")
  val sc = new SparkContext(conf)
  // Create the RDD
  val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5))
  // Keep only the even numbers
  val num: RDD[Int] = rdd.filter(n => n % 2 == 0)
  num.foreach(line => println(line))
}
def distinct(): Unit = {
  val conf = new SparkConf().setAppName("distinct").setMaster("local[*]")
  val sc = new SparkContext(conf)
  val distinctRdd = sc.parallelize(List(1, 2, 1, 5, 2, 9, 6, 1))
  // The numPartitions argument (2 here) sets how many partitions the de-duplicated result has.
  // Elements are routed to partitions by hash (for Ints effectively value % 2),
  // so the order of the overall output is not guaranteed.
  val res: RDD[Int] = distinctRdd.distinct(2)
  res.foreach(println)
}
def partitionByVsRepartition(): Unit = {
  val conf = new SparkConf().setAppName("localTest").setMaster("local[4]")
  val sc = new SparkContext(conf)
  // Create the RDD with 4 partitions
  val rdd = sc.parallelize(List("hello", "jason", "what", "are", "you", "doing", "hi", "jason", "do", "you", "eat", "dinner", "hello", "jason", "do", "you", "have", "some", "time", "hello", "jason", "time", "do", "you", "jason", "jason"), 4)
  val word_count = rdd.flatMap(_.split(",")).map((_, 1))
  // repartition into 10 partitions
  val rep = word_count.repartition(10)
  rep.foreachPartition(pair => {
    println("partition id-------------" + TaskContext.get.partitionId)
    pair.foreach(p => {
      println(p)
    })
  })
  println("************************************************************************************************")
  // partitionBy with a HashPartitioner of 10 partitions
  val parby = word_count.partitionBy(new HashPartitioner(10))
  parby.foreachPartition(pair => {
    println("partition id-------------" + TaskContext.get.partitionId)
    pair.foreach(p => {
      println(p)
    })
  })
}
coalesce(numPartitions)
Repartitions the RDD. The first argument is the target number of partitions; the second is whether to shuffle, which defaults to false (for reducing the number of partitions) and must be set to true to increase the number of partitions.
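A minimal sketch of coalesce (assuming sc is an existing SparkContext; the RDD below is made up for illustration):
val nums = sc.parallelize(1 to 16, 4)
// Shrink from 4 partitions to 2; no shuffle is needed, so shuffle defaults to false
val fewer = nums.coalesce(2)
println(fewer.partitions.length) // 2
// Growing the number of partitions only takes effect with shuffle = true
val more = nums.coalesce(8, shuffle = true)
println(more.partitions.length) // 8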
repartitionAndSortWithinPartitions(partitioner)
Repartitions and sorts within each partition in one step, which is more efficient than repartitioning first and sorting afterwards; it operates on K/V RDDs.
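A minimal sketch on a K/V RDD (assuming sc is an existing SparkContext and HashPartitioner is imported as above; the pairs are made up for illustration):
val pairs = sc.parallelize(List((3, "c"), (1, "a"), (4, "d"), (2, "b")), 2)
// Repartition by key and sort by key inside each partition in a single shuffle
val sorted = pairs.repartitionAndSortWithinPartitions(new HashPartitioner(2))
sorted.foreachPartition(iter => println(iter.toList))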
sortBy(func,[ascending],[numTasks])
First applies func to the data, then sorts by comparing the transformed values. The first argument decides what to sort by, the second decides the direction (false for descending), and the third sets the number of partitions after sorting, which defaults to the same as the original RDD.
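A minimal sketch of sortBy (assuming sc is an existing SparkContext):
val nums = sc.parallelize(List(3, 1, 9, 5, 2, 6))
// Sort ascending by the value itself (ascending defaults to true)
println(nums.sortBy(x => x).collect().mkString(",")) // 1,2,3,5,6,9
// Sort descending by x % 3 and keep the result in 2 partitions
println(nums.sortBy(x => x % 3, ascending = false, numPartitions = 2).collect().mkString(","))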
pipe(command,[envVars])
Pipe (invoke an external program): for each partition, runs a Perl or shell script and returns an RDD of the script's output.
Shell script pipe.sh:
#!/bin/sh
echo "AA"
while read LINE; do
echo ">>>"${LINE}
done
----------------------------------------------------
scala> val rdd = sc.parallelize(List("hi","Hello","how","are","you"),1)
rdd: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[50] at parallelize at <console>:24
scala> rdd.pipe("/home/bigdata/pipe.sh").collect()
res18: Array[String] = Array(AA, >>>hi, >>>Hello, >>>how, >>>are, >>>you)
scala> val rdd = sc.parallelize(List("hi","Hello","how","are","you"),2)
rdd: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[52] at parallelize at <console>:24
scala> rdd.pipe("/home/bigdata/pipe.sh").collect()
res19: Array[String] = Array(AA, >>>hi, >>>Hello, AA, >>>how, >>>are, >>>you)
def join(): Unit = {
  val conf = new SparkConf()
    .setAppName("join")
    .setMaster("local")
  val sc = new SparkContext(conf)
  val studentList = Array(
    (1, "leo"),
    (2, "jack"),
    (3, "tom"))
  val scoreList = Array(
    (1, 100),
    (2, 90),
    (3, 60))
  val students = sc.parallelize(studentList)
  val scores = sc.parallelize(scoreList)
  // Inner join on the student id key
  val studentScores = students.join(scores)
  studentScores.foreach(studentScore => {
    println("student id: " + studentScore._1)
    println("student name: " + studentScore._2._1)
    println("student score: " + studentScore._2._2)
    println("=======================================")
  })
}
def join2(): Unit = {
  val conf = new SparkConf()
    .setAppName("join2")
    .setMaster("local[*]")
  val sc = new SparkContext(conf)
  val idName = sc.parallelize(Array((1, "zhangsan"), (2, "lisi"), (3, "wangwu")))
  val idAge = sc.parallelize(Array((1, 30), (2, 29), (4, 21)))
  // Inner join: idName.join(idAge).collect().foreach(println)
  // val result: RDD[(Int, (String, Int))] = idName.join(idAge)
  // Left outer join: idName.leftOuterJoin(idAge).collect().foreach(println)
  // Right outer join: idName.rightOuterJoin(idAge).collect().foreach(println)
  // Full outer join
  idName.fullOuterJoin(idAge).collect().foreach(println)
  // result.collect().foreach(res => {
  //   println("student id: " + res._1)
  //   println("student name: " + res._2._1)
  //   println("student score: " + res._2._2)
  //   println("=============================================")
  // })
}
def cogroup(): Unit = {
  val conf = new SparkConf()
    .setAppName("cogroup")
    .setMaster("local[*]")
  val sc = new SparkContext(conf)
  val idName = sc.parallelize(Array((1, "zhangsan"), (2, "lisi")))
  val idScore = sc.parallelize(Array((1, 100), (2, 90), (2, 95)))
  idName.cogroup(idScore).foreach(
    res => {
      println("id\t" + res._1)
      println("name\t" + res._2._1)
      println("score\t" + res._2._2)
      println("=============================")
    }
  )
}
def reduceByKey(): Unit = {
  val conf = new SparkConf().setAppName("reduceByKey").setMaster("local[*]")
  val sc = new SparkContext(conf)
  val rdd = sc.parallelize(List(("jack", 1), ("tom", 5), ("jack", 5), ("tom", 6), ("jack", 7)))
  // Sum the values for each key
  val result: Array[(String, Int)] = rdd.reduceByKey((x, y) => x + y).collect()
  for ((x, y) <- result) {
    println(x + " " + y)
  }
}
def groupByKey(): Unit = {
  val conf = new SparkConf().setAppName("groupByKey").setMaster("local[*]")
  val sc = new SparkContext(conf)
  val rdd = sc.parallelize(List(("jack", 1), ("tom", 5), ("jack", 5), ("tom", 6), ("jack", 7)))
  // Group all scores under each name
  rdd.groupByKey().collect().foreach(x => {
    println("name: " + x._1 + " scores: " + x._2)
  })
  // Sum the grouped scores per name
  rdd.groupByKey().map(x => {
    val name = x._1
    var sum = 0
    for (y <- x._2) {
      sum += y
    }
    (name, sum)
  }).collect().foreach(println)
}
def combineByKey(): Unit = {
  val conf = new SparkConf().setAppName("combineByKey").setMaster("local[*]")
  val sc = new SparkContext(conf)
  // Compute the average value for each key.
  /*
    createCombiner: V => C      builds the initial combiner for the first value of a key, e.g. "a" -> (90, 1)
    mergeValue: (C, V) => C     merges another value into the combiner within a partition, e.g. "a" -> (170, 2)
    mergeCombiners: (C, C) => C merges the combiners of different partitions, e.g. "a" -> (76, 1) + (170, 2) = (246, 3)
  */
  val rdd1: RDD[(String, Int)] = sc.makeRDD(Array(("a", 90), ("a", 80), ("b", 46), ("b", 58), ("b", 29), ("c", 58), ("c", 90), ("d", 91), ("a", 76)))
  val rdd3: RDD[(String, (Int, Int))] = rdd1.combineByKey(
    v => (v, 1), // e.g. (90, 1)
    (c: (Int, Int), v) => (c._1 + v, c._2 + 1),
    (c1: (Int, Int), c2: (Int, Int)) => (c1._1 + c2._1, c1._2 + c2._2)
  )
  // rdd3.map(x => (x._1 + " average " + x._2._1.toDouble / x._2._2)).collect().foreach(println)
  rdd3.map { case (x, (y, z)) => (x, y.toDouble / z) }.collect().foreach(println)
}
This method should only be used when the expected result array is small, because all of the data is loaded into the driver's memory.
7. takeOrdered(n,[ordering])
Returns the first n elements of the RDD, sorted in the default (ascending) order or by a custom comparator.
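A minimal sketch (assuming sc is an existing SparkContext):
val nums = sc.parallelize(List(10, 4, 2, 12, 3))
// The 3 smallest elements, in ascending order
println(nums.takeOrdered(3).mkString(",")) // 2,3,4
// The 3 largest elements, via a reversed Ordering as the custom comparator
println(nums.takeOrdered(3)(Ordering[Int].reverse).mkString(",")) // 12,10,4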
8. aggregate (zeroValue: U)(seqOp: (U, T) ⇒ U, combOp: (U, U) ⇒ U)
The aggregate function first folds the elements of each partition together with the initial value (zeroValue) using seqOp, and then combines the per-partition results with the initial value using combOp. The final return type does not have to match the RDD's element type.
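A minimal sketch that computes the average in one pass (assuming sc is an existing SparkContext); note the accumulator type (Int, Int) differs from the element type Int:
val nums = sc.parallelize(1 to 6, 2)
// zeroValue (0, 0); seqOp folds each element into the per-partition (sum, count);
// combOp merges the per-partition accumulators
val (sum, count) = nums.aggregate((0, 0))(
  (acc, x) => (acc._1 + x, acc._2 + 1),
  (a, b) => (a._1 + b._1, a._2 + b._2)
)
println(sum.toDouble / count) // 3.5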
9. fold(num)(func)
Fold operation: a simplified form of aggregate in which seqOp and combOp are the same function.
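A minimal sketch (assuming sc is an existing SparkContext); fold(zero)(op) behaves like aggregate with the same function used for seqOp and combOp:
val nums = sc.parallelize(1 to 4, 2)
println(nums.fold(0)(_ + _)) // 10
// Same result expressed with aggregate:
println(nums.aggregate(0)(_ + _, _ + _)) // 10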
10. saveAsTextFile(path)
Writes the elements of the dataset as a text file to the local file system, HDFS, or another supported file system. Spark calls toString on each element and writes it as one line of the text file.
If the file is saved to the local file system, each executor only writes to a local directory on its own machine.
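A minimal sketch (assuming sc is an existing SparkContext; both output paths below are hypothetical):
val lines = sc.parallelize(List("spark", "rdd", "action"), 2)
// Each partition becomes one part-xxxxx file under the output directory
lines.saveAsTextFile("file:///tmp/textOutput")
// To collect the result in one place on a cluster, write to a shared file system such as HDFS:
// lines.saveAsTextFile("hdfs://namenode:8020/user/demo/textOutput")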
11. saveAsSequenceFile(path)
Writes the elements of the dataset as a Hadoop SequenceFile to the local file system, HDFS, etc. (only works on pair RDDs).
Usage is the same as saveAsTextFile(path).
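A minimal sketch on a pair RDD (assuming sc is an existing SparkContext; the output path is hypothetical):
val pairs = sc.parallelize(List(("a", 1), ("b", 2), ("c", 3)), 1)
// Keys and values are converted to Hadoop Writable types (Text / IntWritable here)
pairs.saveAsSequenceFile("file:///tmp/seqOutput")
// Reading it back yields a pair RDD again:
// sc.sequenceFile[String, Int]("file:///tmp/seqOutput").collect().foreach(println)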