Learning Spark (Part 2)

Learning Spark has to start with the RDD (Resilient Distributed Dataset). RDDs support a rich API, and the most basic thing is getting data into an RDD in the first place. See the demos below:

package day01
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object Example3_27 {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    // create an RDD from an in-memory collection
    val input = sc.parallelize(List(1, 2, 3, 4))
    // map is a transformation: square each element and get a new RDD back
    val result = input.map { x => x * x }
    // collect is an action: it brings the results back to the driver
    println(result.collect().mkString(","))
  }
}
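
Run with the local master, this prints: 1,4,9,16
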
package day01
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object Example3_29 {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val lines = sc.parallelize(List("hello world", "hi"))
    // map keeps one output per input line, so words is an RDD of Array[String]
    val words = lines.map { line => line.split(" ") }
    // first() returns the array for the first line; print each word on its own line
    words.first().foreach(println)
  }
}
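
The demos above build RDDs from in-memory collections with parallelize. Since the point here is getting data into an RDD, below is a minimal sketch of reading a text file instead; the path input.txt is just a placeholder, and flatMap is used so the result is one flat RDD of words rather than an RDD of arrays:

package day01
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object ReadTextFileSketch {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    // "input.txt" is a placeholder path; textFile gives an RDD with one element per line
    val lines = sc.textFile("input.txt")
    // flatMap flattens the per-line word arrays into a single RDD of words
    val words = lines.flatMap { line => line.split(" ") }
    println(words.count())
  }
}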

RDDs also support set operations such as intersection (∩), union (∪), and difference (subtract):

package day01
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object Example3_30 {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    // persist both inputs because each one is reused by several operations below
    val rdd1 = sc.parallelize(List("coffee", "coffee", "panda", "monkey", "tea")).persist()
    val rdd2 = sc.parallelize(List("coffee", "monkey", "kitty")).persist()
    // distinct removes duplicates
    println(rdd1.distinct().collect().mkString(" "))
    // union concatenates the two RDDs and keeps duplicates
    println(rdd1.union(rdd2).collect().mkString(" "))
    // intersection keeps only elements present in both RDDs
    println(rdd1.intersection(rdd2).collect().mkString(" "))
    // subtract removes from rdd1 every element that also appears in rdd2
    println(rdd1.subtract(rdd2).collect().mkString(" "))
  }
}
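
Note that the element order in the distinct, intersection, and subtract results is not guaranteed, since each of those operations shuffles data across partitions; union simply concatenates the two RDDs without a shuffle, which is why it keeps the duplicate "coffee" entries. Persisting rdd1 and rdd2 avoids recomputing them for each of the four actions.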

In some situations we need a Cartesian product, and RDDs support that as well:

package day01
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object Example3_31 {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val rdd1 = sc.parallelize(List("User(1)", "User(2)", "User(3)"))
    val rdd2 = sc.parallelize(List("Venue(Betabrand)", "Venue(Asha Tea House)", "Venue(Ritual)"))
    // cartesian pairs every user with every venue
    rdd1.cartesian(rdd2).collect().foreach(println)
  }
}
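
cartesian() pairs every element of the first RDD with every element of the second, so this example produces 3 × 3 = 9 (user, venue) pairs. The output grows multiplicatively with the input sizes, so this operation is expensive on large RDDs.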

Sometimes we need to compute a mean, which requires both a sum and a count; the aggregate() call below does both in a single pass over the data:

package day01
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object Example3_36 {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val input = sc.parallelize(List(1, 2, 3, 4, 5, 6))
    // the accumulator is a (sum, count) pair, starting from (0, 0)
    val result = input.aggregate((0, 0))(
      // fold one element into the accumulator of its partition
      (acc, value) => (acc._1 + value, acc._2 + 1),
      // merge the accumulators of two partitions
      (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2))
    // mean = sum / count
    println(result._1 / result._2.toDouble)
  }
}
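
For comparison, here is a shorter sketch of the same mean using the numeric helpers (sum, count, mean) that Spark exposes on RDDs of numbers through the implicit conversions pulled in by the SparkContext._ import (on newer Spark versions these implicits are in scope automatically):

package day01
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

object MeanSketch {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val input = sc.parallelize(List(1, 2, 3, 4, 5, 6))
    // mean() comes from the numeric-RDD conversions on RDDs of numbers
    println(input.mean())
    // the same value computed from one action per statistic
    println(input.sum() / input.count())
  }
}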

Summary: RDDs support two kinds of operations, transformations and actions. A transformation returns another RDD (the work stays in the RDD world and is only evaluated lazily), while an action returns a concrete value to the driver.
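
A quick way to see the difference: a transformation like map only records how the new RDD should be computed, and nothing actually runs until an action such as collect is called. A minimal sketch (the println inside map is only there to make the laziness visible):

package day01
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object LazySketch {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("SparkCount").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val input = sc.parallelize(List(1, 2, 3, 4))
    // transformation: nothing is printed yet, Spark only records the lineage
    val squared = input.map { x => println("computing " + x); x * x }
    println("no work has happened yet")
    // action: triggers the actual computation, so the "computing ..." lines appear now
    println(squared.collect().mkString(","))
  }
}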
