Spark MLlib Machine Learning Series, Part 1: Common Spark RDD Operations

package mlib

import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession

object UsefulRddOpts {
    def main(args: Array[String]): Unit = {
        val spark = SparkSession
            .builder
            .master("local[*]")
            .appName("UseFulRddOpts")
            .getOrCreate()
        val sc = spark.sparkContext
        showDiffReduceByKeyAndGroupByKey(sc)
        spark.stop()
    }

    /**
      * Prefer reduceByKey over groupByKey: reduceByKey pre-aggregates within each partition
      * before the shuffle, which cuts down the amount of data transferred. Think of it as a
      * combiner running on the map side.
      * @param sc SparkContext
      */
    def showDiffReduceByKeyAndGroupByKey(sc : SparkContext) = {
        val data = sc.parallelize(Array((1, "susun"), (2, "sunsun"),
            (3, "susum"), (4, "nina"), (1, "hh"), (1, "susun")), 1)
        val result01 = data.reduceByKey(_ + ":" + _)
        val result02 = data.groupByKey()
        result01.foreach(println)
        // output:
        // (4,nina)
        // (1,susun:hh:susun)
        // (3,susum)
        // (2,sunsun)
        result02.foreach(println)
        // output:
        // (4,CompactBuffer(nina))
        // (1,CompactBuffer(susun, hh, susun))
        // (3,CompactBuffer(susum))
        // (2,CompactBuffer(sunsun))
    }
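
    /**
      * A minimal sketch (not part of the original post) of the same aggregation done two ways:
      * reduceByKey combines values inside each partition before the shuffle, while
      * groupByKey ships every value across the network and sums afterwards.
      * @param sc SparkContext
      */
    def sketchReduceVsGroupForSums(sc : SparkContext) = {
        val pairs = sc.parallelize(Seq(("a", 1), ("b", 1), ("a", 1), ("a", 1)), 2)
        // map-side combine, then shuffle only the partial sums
        val viaReduce = pairs.reduceByKey(_ + _)
        // shuffle every value, then sum on the reduce side
        val viaGroup = pairs.groupByKey().mapValues(_.sum)
        viaReduce.collect().foreach(println)  // (a,3), (b,1)
        viaGroup.collect().foreach(println)   // same result, more data shuffled
    }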

    /**
      * Usage of reduce.
      * @param sc SparkContext
      */
    def showUseOfReduce(sc : SparkContext) = {
        val data = sc.parallelize(Array("123", "456", "789"), 1)
        val reduce01 = data.reduce((str01, str02) => str01 + str02)
        val reduce02 = data.reduce(_ + _)
        println(reduce01 + ":" + reduce02)  // 123456789:123456789
    }
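
    /**
      * A small sketch (added for illustration, not in the original post): reduce over numbers.
      * The reduce function must be commutative and associative, because partial results from
      * different partitions are combined in no particular order.
      * @param sc SparkContext
      */
    def sketchReduceOnNumbers(sc : SparkContext) = {
        val nums = sc.parallelize(1 to 10)
        println(nums.reduce(_ + _))             // 55
        println(nums.reduce((a, b) => a max b)) // 10
    }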

    /**
      * Usage of keyBy: derive a key for each element.
      * @param sc SparkContext
      */
    def showUseOfKeyBy(sc : SparkContext) = {
        val data = sc.parallelize(Array("123dlalda", "hello,world", "hi, man."))
        val result = data.keyBy(str => str.length)
        result.foreach(println)
        // output:
        //(9,123dlalda)
        //(8,hi, man.)
        //(11,hello,world)
    }
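
    /**
      * An illustrative follow-up (not in the original post): once keyBy has produced a pair
      * RDD, the usual byKey operations apply. Here the strings are grouped by their length.
      * @param sc SparkContext
      */
    def sketchKeyByThenGroup(sc : SparkContext) = {
        val data = sc.parallelize(Array("spark", "scala", "rdd", "ml"))
        val byLength = data.keyBy(_.length).groupByKey()
        byLength.collect().foreach(println)  // e.g. (5,CompactBuffer(spark, scala)), (3,CompactBuffer(rdd)), (2,CompactBuffer(ml))
    }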

    /**
      * Usage of groupBy and groupByKey.
      * @param sc SparkContext
      */
    def showUseOfGroupByAndGroupByKey(sc : SparkContext) = {
        val data = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8), 1)
        val result01 = data.groupBy(x => if (x > 3) "gt3" else "lte3")
        result01.foreach(println)
        // output:
        // (gt3,CompactBuffer(4, 5, 6, 7, 8))
        // (lte3,CompactBuffer(1, 2, 3))

        val data02 = sc.parallelize(1 to 9)
        val result02 = data02.groupBy(x => if (x % 2 == 0) "even" else "odd")
        result02.foreach(println)
        // output:
        // (even,CompactBuffer(2, 4, 6, 8))
        // (odd,CompactBuffer(1, 3, 5, 7, 9))

        val data03 = sc.parallelize(Array((1, "susun"), (2, "sunsun"),
            (3, "susum"), (4, "nina"), (1, "hh"), (1, "susun")), 1)
        val result03 = data03.groupByKey()
        result03.foreach(println)
        // output:
        // (4,CompactBuffer(nina))
        // (1,CompactBuffer(susun, hh, susun))
        // (3,CompactBuffer(susum))
        // (2,CompactBuffer(sunsun))
    }
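
    /**
      * A hypothetical extra step (not in the original post): the CompactBuffer returned by
      * groupByKey is an Iterable, so mapValues can reduce each group to something smaller,
      * such as its size.
      * @param sc SparkContext
      */
    def sketchGroupThenCount(sc : SparkContext) = {
        val data = sc.parallelize(Array((1, "susun"), (2, "sunsun"), (1, "hh")), 1)
        val sizes = data.groupByKey().mapValues(_.size)
        sizes.collect().foreach(println)  // (1,2), (2,1)
    }
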
    /**
      * Demonstrates map and flatMap.
      * @param sc SparkContext
      */
    def showUseOfMapAndFlatMap(sc : SparkContext) = {
        val data = sc.parallelize(Array("hello,world", "bilibili,acfun", "hehe,intresting"), 1)
        // 把每一行的数据进行按照逗号切分成数组
        val map = data.map(x => x.split(","))
        map.foreach(x => {println(x(0) +" " + x(1))})
        // 类似如此{["hello", "world"], ...,["hehe", "intresting"]}

        //  把每一行的数据进行按照逗号切分,然后进行扁平化操作,
        val flatMap = data.flatMap(x => x.split(","))
        flatMap.foreach(println)   // 结果每行一个单词

        // 例子二
        val data01 = sc.parallelize(Array(1, 2, 3, 4, 5))
        val result01 = data01.map(x => List(x+1)).collect // List 里面套了Array(List(2), List(3),...,List(6))
        val result02 = data01.flatMap(x => List(x+1)).collect //Array(2,3,4,5,6)
    }
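
    /**
      * A classic word-count sketch (added for illustration) that ties flatMap, map and
      * reduceByKey together: flatten lines into words, pair each word with 1, then sum.
      * @param sc SparkContext
      */
    def sketchWordCount(sc : SparkContext) = {
        val lines = sc.parallelize(Array("hello world", "hello spark"), 1)
        val counts = lines.flatMap(_.split(" "))
            .map(word => (word, 1))
            .reduceByKey(_ + _)
        counts.collect().foreach(println)  // (hello,2), (world,1), (spark,1)
    }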

    /**
      * Usage of filter; note that elements matching the predicate are kept, not dropped.
      * @param sc SparkContext
      */
    def showUseOfFilter(sc : SparkContext) = {
        val data = sc.parallelize(Array((1, "susun"), (2, "sunsun"),
            (3, "susum"), (4, "nina"), (1, "hh"), (1, "susun")), 1)
        val result = data.filter(item => "hh".equals(item._2))
        result.foreach(println)  // (1,hh)
    }
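
    /**
      * A tiny sketch (not in the original post) to stress that filter *keeps* the elements
      * for which the predicate returns true.
      * @param sc SparkContext
      */
    def sketchFilterKeepsMatches(sc : SparkContext) = {
        val nums = sc.parallelize(1 to 10)
        val evens = nums.filter(_ % 2 == 0)
        println(evens.collect().mkString(", "))  // 2, 4, 6, 8, 10
    }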

    /**
      * Deduplication with distinct.
      * @param sc SparkContext
      */
    def showUseOfDistinct(sc : SparkContext) = {
        val data = sc.parallelize(Array((1, "susun"), (2, "sunsun"),
        (3, "susum"), (4, "nina"), (1, "hh"), (1, "susun")), 1)
        val result = data.distinct()
        result.foreach(println)  // duplicates removed
    }

    /**
      * Usage of countByKey.
      * @param sc SparkContext
      */
    def showUseOfCountByKey(sc : SparkContext) = {
        val data = sc.parallelize(Array((1, "susun"), (2, "sunsun"),
        (3, "susum"), (4, "nina"), (1, "hh")))
        val result = data.countByKey()
        result.foreach(println)  // counts per key, returned to the driver as a Map
    }
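
    /**
      * A hedged note in code form (not in the original post): countByKey is an action that
      * collects a Map[K, Long] to the driver, so for a very large key space a distributed
      * alternative is to reduceByKey and keep the counts as an RDD.
      * @param sc SparkContext
      */
    def sketchCountByKeyAlternative(sc : SparkContext) = {
        val data = sc.parallelize(Array((1, "susun"), (2, "sunsun"), (1, "hh")))
        val countsRdd = data.mapValues(_ => 1L).reduceByKey(_ + _)  // stays distributed
        countsRdd.collect().foreach(println)  // (1,2), (2,1)
    }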

    /**
      * countByValue: count the occurrences of each value.
      * @param sc SparkContext
      */
    def showUseOfCountByValue(sc : SparkContext) = {
        val data = sc.parallelize(List(0, 1, 2, 3, 0, 3, 4, 5, 6, 7), 1)
        val result = data.countByValue()
        result.foreach(println)  // counts per value
    }

    /**
      * Usage of repartition; repartition is a transformation, so the repartitioned data
      * comes back as a new RDD and must be assigned to a new variable.
      * @param sc SparkContext
      */
    def showUseOfRepartition(sc : SparkContext) = {
        val data = sc.parallelize(Array(1, 2, 3, 4, 5, 6))
        println(data.partitions.length)
        val data01 = data.repartition(5)
        println(data01.partitions.length)
        val data02 = data.repartition(3)
        println(data02.partitions.length)  // number of partitions after repartitioning
    }
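
    /**
      * An inspection sketch (not in the original post): glom() turns each partition into an
      * array, which makes it easy to see how repartition redistributed the data.
      * @param sc SparkContext
      */
    def sketchInspectPartitions(sc : SparkContext) = {
        val data = sc.parallelize(1 to 8, 2).repartition(4)
        data.glom().collect().zipWithIndex.foreach { case (part, idx) =>
            println(s"partition $idx: ${part.mkString(", ")}")
        }
    }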

    /**
      * def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope {
      *     coalesce(numPartitions, shuffle = true)
      * }
      * Re-partitioning with coalesce; note the special case above: repartition is simply
      * coalesce with shuffle = true.
      * @param spark SparkSession, the entry point introduced in Spark 2.x
      * @param sc SparkContext
      */
    def showUseOfCoalesce(spark : SparkSession, sc : SparkContext) = {
        val data = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
//        val data = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 1) // with the parallelism fixed to 1 up front, the aggregate results below are all the same
        data.cache()

        val result01 = data.aggregate(0)(math.max(_, _), _ + _)
        println(result01)

        val data02 = data.coalesce(2)
        val result02 = data02.aggregate(0)(math.max(_, _), _ + _)
        println(result02)

        val data03 = data.repartition(2)
        val result03 = data03.aggregate(0)(math.max(_, _), _ + _ )
        println(result03)
    }
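
    /**
      * A quick check (added for illustration): without shuffle, coalesce can only reduce the
      * number of partitions, never increase it; repartition shuffles and therefore can.
      * @param sc SparkContext
      */
    def sketchCoalesceCannotGrow(sc : SparkContext) = {
        val data = sc.parallelize(1 to 10, 4)
        println(data.coalesce(2).partitions.length)    // 2
        println(data.coalesce(8).partitions.length)    // still 4: no shuffle, cannot grow
        println(data.repartition(8).partitions.length) // 8: shuffle = true
    }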

    /**
      * Usage of cartesian (Cartesian product).
      * @param sc SparkContext, the entry point of the Spark application
      */
    def showUseOfCartesian(sc : SparkContext) = {
        // force a single partition; by default there would be several
        val data01 = sc.parallelize(Array(1, 2, 3, 4), 1)
        val data02 = sc.parallelize(Array(5, 6), 1)
        val result = data01.cartesian(data02)
        result.foreach(println)
        val result02 = data02.cartesian(data01)
        result02.foreach(println)
    }

    /**
      * Usage of cache (a shorthand for persist with the default storage level).
      * @param spark SparkSession, the entry point introduced in Spark 2.0
      * @param sc SparkContext
      */
    def showUseOfCache(spark : SparkSession,sc : SparkContext) = {
        import spark.implicits._
        val data = sc.parallelize(List(1, 2 ,3, 4, 5, 6)).toDF()
        data.cache()
        val count = data.count()  // count() is an action, so it materializes the cached data
        println("---------------------------------------------------")
        data.show()
    }
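
    /**
      * A sketch of the more general persist (not in the original post): cache() is just
      * persist() with the default storage level, while persist accepts an explicit
      * StorageLevel such as MEMORY_AND_DISK; unpersist releases the cached blocks.
      * @param sc SparkContext
      */
    def sketchPersistWithStorageLevel(sc : SparkContext) = {
        import org.apache.spark.storage.StorageLevel
        val data = sc.parallelize(1 to 100)
        data.persist(StorageLevel.MEMORY_AND_DISK)  // spill to disk if memory is tight
        println(data.count())  // first action materializes the persisted blocks
        data.unpersist()
    }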

    /**
      * Demonstrates the aggregate function.
      * @param sc SparkContext
      */
    def showUseOfAggregate(sc : SparkContext) = {
        // aggregate signature:
        // aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U
        // U is the result type; zeroValue is the initial value fed to both seqOp and combOp.
        // seqOp is applied first, within each partition, producing one result of type U per
        // partition; combOp then merges the per-partition results.
        // In the example below, data01 uses the default parallelism (several partitions on
        // this machine), and the per-partition maxima sum to 20; data02 has a single
        // partition, so the result is just max(1..8) = 8.
        val data01 = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8))
        val result01 = data01.aggregate(0)(math.max(_, _), _ + _)
        println(result01)  // 20

        // data02 computation (single partition):
        // math.max(0, 1) => 1
        // math.max(1, 2) => 2
        // ...
        // math.max(7, 8) => 8
        val data02 = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8), 1)
        val result02 = data02.aggregate(0)(math.max(_, _), _ + _)
        println(result02) // 8
    }
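
    /**
      * A worked sketch (not in the original post): aggregate can carry a richer accumulator
      * than the element type. Here the zero value (0, 0) accumulates (sum, count) per
      * partition via seqOp, and combOp merges the partial pairs, giving the mean in one pass.
      * @param sc SparkContext
      */
    def sketchAggregateMean(sc : SparkContext) = {
        val data = sc.parallelize(1 to 8, 4)
        val (sum, count) = data.aggregate((0, 0))(
            (acc, v) => (acc._1 + v, acc._2 + 1),  // seqOp: fold one element into the accumulator
            (a, b) => (a._1 + b._1, a._2 + b._2)   // combOp: merge two partition accumulators
        )
        println(sum.toDouble / count)  // 4.5
    }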

}
