Spark DataFrame 数据集:SQL 风格与 DSL API 风格需求练习

需求:求每个城市中成绩最高的两个人的信息

数据:

CSV 格式(首行为表头)

id,name,age,sex,city,score
1,张飞,21,M,北京,80
2,关羽,23,M,北京,82
7,周瑜,24,M,北京,85
3,赵云,20,F,上海,88
4,刘备,26,M,上海,83
8,孙权,26,M,上海,78
5,曹操,30,F,深圳,90.8
6,孔明,35,F,深圳,77.8
9,吕布,28,M,深圳,98

代码:

package createdf

import org.apache.log4j.{
     Level, Logger}
import org.apache.spark.sql.expressions.{
     Window, WindowSpec}
import org.apache.spark.sql.{
     DataFrame, SparkSession}

/**
  * Demo: for every city, find the two people with the highest score,
  * once with a Spark SQL query and once with the DataFrame DSL.
  *
  * @date :2021/4/2 18:38
  * @author :xiaotao
  */
object DataFrameDemo {

  // Runs when the object is initialised: only WARN and above for loggers under "org".
  Logger.getLogger("org").setLevel(Level.WARN)

  /** Location of the student CSV (header row: id,name,age,sex,city,score). */
  private val StudentCsvPath: String = "D:\\doc\\stu2.csv"

  /** Number of top-scoring people kept per city. */
  private val TopN: Int = 2

  /**
    * Entry point: builds a local session, runs both variants of the query and
    * stops the session afterwards.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val ss: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Stop the session even when one of the queries throws.
    try {
      sqlTest(ss)
      dslApiTest(ss)
    } finally {
      ss.stop()
    }
  }

  /**
    * Reads the student CSV (first line is the header) and converts `score`
    * to a number.
    *
    * Without a schema every CSV column is read as a string, and ordering
    * strings would rank "98" above "100"; casting to double makes the
    * ranking numeric.
    *
    * @param ss active Spark session
    * @return the student rows with `score` as a double column
    */
  private def loadStudents(ss: SparkSession): DataFrame = {
    import org.apache.spark.sql.functions.col

    ss.read
      .option("header", true)
      .csv(StudentCsvPath)
      .withColumn("score", col("score").cast("double"))
  }

  /**
    * SQL variant: registers the data as temp view `tmp`, ranks rows per city
    * by descending score and prints city/name of the top rows.
    *
    * @param ss active Spark session
    */
  def sqlTest(ss: SparkSession): Unit = {
    // createTempView would throw if "tmp" were already registered in this session.
    loadStudents(ss).createOrReplaceTempView("tmp")

    ss.sql(
      s"""
        |select
        |  city,
        |  name
        |from
        |(
        |  select
        |    city,
        |    name,
        |    row_number() over(partition by city order by score desc) as rns
        |  from tmp
        |) o
        |where rns <= $TopN
      """.stripMargin).show()
  }

  /**
    * DSL variant of the same query: ranks rows per city with a window,
    * keeps the top rows and prints city/name.
    *
    * @param ss active Spark session
    */
  def dslApiTest(ss: SparkSession): Unit = {
    import org.apache.spark.sql.functions.{col, row_number}

    // One partition per city, highest score first.
    val window: WindowSpec = Window.partitionBy(col("city")).orderBy(col("score").desc)

    loadStudents(ss)
      .select(col("city"), col("name"), row_number().over(window).as("rns"))
      .where(col("rns") <= TopN)
      .drop("rns") // the rank is only needed for filtering
      .show()
  }

}

控制台:

E:\develop\Java\jdk1.8.0_171\bin\java.exe ...
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
+----+----+
|city|name|
+----+----+
|  深圳|  吕布|
|  深圳|  曹操|
|  上海|  赵云|
|  上海|  刘备|
|  北京|  周瑜|
|  北京|  关羽|
+----+----+

+----+----+
|city|name|
+----+----+
|  深圳|  吕布|
|  深圳|  曹操|
|  上海|  赵云|
|  上海|  刘备|
|  北京|  周瑜|
|  北京|  关羽|
+----+----+


Process finished with exit code 0

你可能感兴趣的:(spark,spark)