Spark Top N per Group Example (Scala)

Main steps:

  1. Load the source data into an RDD (textFile)
  2. Transform the RDD, splitting out the grouping key and the attribute to sort by, producing a new RDD of key/value pairs (map)
  3. Group by key, sort each group on the attribute separated out in step 2, and take the top N of each sorted group (groupByKey, map); a minimal runnable sketch of these steps follows
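Before the full program, here is a minimal, self-contained sketch of the three steps. It runs on an in-memory collection via sc.parallelize instead of HDFS; the sample records, the object name TopNSketch, and the top-N size of 2 are all illustrative assumptions, not part of the original program.

import org.apache.spark.{SparkConf, SparkContext}

object TopNSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TopNSketch").setMaster("local[2]"))

    // Step 1: build an RDD (an in-memory stand-in for textFile)
    val lines = sc.parallelize(Seq(
      "anhui hefei 20",
      "anhui wuhu 35",
      "jiangsu nanjing 90",
      "jiangsu suzhou 80",
      "shandong jinan 100"
    ))

    // Step 2: split out the grouping key (province) and the sort attribute (sale count)
    val keyed = lines.map { line =>
      val fields = line.split("\\s+")
      (fields(0), (fields(2).toInt, line))
    }

    // Step 3: group by province, sort each group descending by sale count, keep the top 2
    val top2 = keyed.groupByKey().mapValues(_.toList.sortBy(-_._1).take(2))

    top2.collect().foreach(println)
    sc.stop()
  }
}

The full program below does the same thing against a file on HDFS: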
package rddDemo.examples

import org.apache.spark.{SparkConf, SparkContext}

/**
  * Sample data:
    anhui hefei 20
    jiangsu nanjing 90
    shandong jinan 100
  * Created by asus on 2018/7/29.
  */
object TopNWithGroupDemo {
  def main(args : Array[String]) : Unit = {
    val conf = new SparkConf()
    conf.setAppName("TopNDemo")
    conf.setMaster("local[2]")

    // Windows-only workaround: point Spark at a local Hadoop installation (winutils)
    System.setProperty("hadoop.home.dir" , "E:\\hadoop-2.6.0")

    val sc = new SparkContext(conf)

    val filepath = "hdfs://192.168.204.130:9000/log_file/sale_data.txt"

    // Load the HDFS file into an RDD and drop blank lines with filter
    val saleRDD = sc.textFile(filepath , 2).filter(line => line.trim().length > 0)
    // Transform the RDD: the grouping key (province) becomes the pair's key,
    // paired with the attribute to sort by (sale count) and the original line
    val saleWithProvinceRDD = saleRDD.map { line =>
      val lineInfo = line.split("\\s+")
      val province = lineInfo(0)
      val saleCount = lineInfo(2).toInt
      (province , (saleCount , line))
    }
//    saleWithProvinceRDD.foreach(println)

    // Group by key (province)
    val saleGroupByProvinceRDD = saleWithProvinceRDD.groupByKey()
    // Sort within each group and take that group's top N
    val saleGroupByProvinceTopNRDD = saleGroupByProvinceRDD.map { tuple =>
      // the key (province)
      val province = tuple._1
      // all values sharing this key
      val saleInfo = tuple._2
      var saleInfoList = List[(Int , String)]()
      for (s <- saleInfo) {
        val saleCount = s._1
        val info = s._2
        saleInfoList = saleInfoList.::((saleCount , info))
      }

      // sort the group's values with a custom comparator (descending by sale count)
      saleInfoList = saleInfoList.sortWith { (s1 , s2) =>
        s1._1 > s2._1
      }
      // keep only the top 3
      saleInfoList = saleInfoList.take(3)
      (province , saleInfoList)
    }

    saleGroupByProvinceTopNRDD.foreach(println)
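    // With the three sample lines in the header comment, every province has a
    // single record, so the output (partition order not guaranteed) looks like:
    //   (anhui,List((20,anhui hefei 20)))
    //   (jiangsu,List((90,jiangsu nanjing 90)))
    //   (shandong,List((100,shandong jinan 100)))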

    sc.stop()
  }

}
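A caveat on groupByKey: it ships every record of a key to one executor before the sort, so a large or skewed province can overwhelm a single task. A more scalable pattern keeps only a bounded candidate list per key while aggregating. The sketch below is one way to do that with aggregateByKey, assuming the same (province, (saleCount, line)) pair layout built above; it is an illustration, not a tested drop-in replacement.

    // Keep at most N candidates per key during aggregation, so no executor
    // ever has to hold an entire group in memory.
    val N = 3
    val topNPerProvince = saleWithProvinceRDD.aggregateByKey(List.empty[(Int , String)])(
      // merge one value into a partition-local top-N list
      (acc , v) => (v :: acc).sortBy(-_._1).take(N) ,
      // merge two partition-local top-N lists
      (a , b) => (a ++ b).sortBy(-_._1).take(N)
    )
    topNPerProvince.foreach(println)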
