Learning Spark starts with the RDD (Resilient Distributed Dataset). RDDs support a rich set of API operations, and the most basic task is getting data into an RDD. The demo below creates one from an in-memory collection with parallelize() and squares each element:
package day01

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object Example3_27 {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    // Build an RDD from a local collection, then square each element.
    val input = sc.parallelize(List(1, 2, 3, 4))
    val result = input.map { x => x * x }
    println(result.collect().mkString(","))
  }
}
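Besides parallelize(), which is mostly handy for tests, data is usually read from external storage with sc.textFile(). A minimal sketch, assuming an existing SparkContext sc as above and a hypothetical file path input.txt:

// "input.txt" is a hypothetical path; any local or HDFS path works here.
val lines = sc.textFile("input.txt")
println(lines.count())  // number of lines in the file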
package day01

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object Example3_29 {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val lines = sc.parallelize(List("hello world", "hi"))
    // map yields an RDD[Array[String]]: one array of words per input line.
    val words = lines.map { line => line.split(" ") }
    // first() returns the array for the first line; print each word in it.
    words.first().foreach(println)
  }
}
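Note that map here produces an RDD[Array[String]], one array per line. When a single flat RDD of words is wanted (as in word count), flatMap is the usual choice. A minimal sketch of the difference, assuming the same sc as above:

val lines = sc.parallelize(List("hello world", "hi"))
// flatMap flattens the per-line arrays into one RDD[String] of words.
val words = lines.flatMap { line => line.split(" ") }
println(words.collect().mkString(","))  // hello,world,hi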
RDDs also support set-style operations such as intersection (∩), union (∪), and difference:
package day01

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object Example3_30 {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    // persist() caches both RDDs because each is reused by several actions below.
    val rdd1 = sc.parallelize(List("coffee", "coffee", "panda", "monkey", "tea")).persist()
    val rdd2 = sc.parallelize(List("coffee", "monkey", "kitty")).persist()
    rdd1.distinct().collect().foreach { x => print(x + " ") }         // remove duplicates
    rdd1.union(rdd2).collect().foreach { x => print(x + " ") }        // union (keeps duplicates)
    rdd1.intersection(rdd2).collect().foreach { x => print(x + " ") } // intersection
    rdd1.subtract(rdd2).collect().foreach { x => print(x + " ") }     // difference: rdd1 - rdd2
  }
}
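Once a cached RDD is no longer needed, its storage can be released explicitly. A small follow-up, assuming the rdd1 and rdd2 from Example3_30:

// Release the cached partitions when the RDDs are done being reused.
rdd1.unpersist()
rdd2.unpersist()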
In some situations we need a Cartesian product, which RDDs support as well:
package day01

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object Example3_31 {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val rdd1 = sc.parallelize(List("User(1)", "User(2)", "User(3)"))
    val rdd2 = sc.parallelize(List("Venue(Betabrand)", "Venue(Asha Tea House)", "Venue(Ritual)"))
    // cartesian() pairs every element of rdd1 with every element of rdd2.
    rdd1.cartesian(rdd2).collect().foreach(println)
  }
}
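Be aware that cartesian() is expensive on large inputs: the result holds every pairing, so its size is |rdd1| × |rdd2|. A quick check, assuming the two RDDs from Example3_31:

val pairs = rdd1.cartesian(rdd2)
println(pairs.count())  // 3 * 3 = 9 pairs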
Sometimes we need to compute a mean, which requires both a sum and a count; aggregate() can produce both in a single pass:
package day01

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object Example3_36 {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val input = sc.parallelize(List(1, 2, 3, 4, 5, 6))
    // The accumulator is a (sum, count) pair; the zero value is (0, 0).
    val result = input.aggregate((0, 0))(
      // seqOp: fold one element into the partition-local accumulator
      (acc, value) => (acc._1 + value, acc._2 + 1),
      // combOp: merge the accumulators from different partitions
      (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2))
    println(result._1 / result._2.toDouble)  // prints 3.5
  }
}
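For numeric RDDs there are also built-in helpers (made available on RDD[Int] through the implicit conversions imported via org.apache.spark.SparkContext._), so the same average can be computed more directly. A minimal sketch, assuming the input RDD from Example3_36:

println(input.sum() / input.count())  // same result: 3.5
println(input.mean())                 // built-in mean over the RDD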
To summarize: RDDs support two kinds of operations, transformations and actions. The difference is that a transformation returns a new RDD, while an action returns a concrete value to the driver program.
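A related point worth remembering: transformations are evaluated lazily, and only calling an action triggers the actual computation. A minimal sketch, assuming an existing SparkContext sc:

val nums = sc.parallelize(List(1, 2, 3, 4))
// map is a transformation: nothing runs yet; Spark only records the lineage.
val squares = nums.map(x => x * x)
// collect() is an action: it triggers the computation and returns the results.
println(squares.collect().mkString(","))  // 1,4,9,16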