CSV format (input file stu2.csv):
id,name,age,sex,city,score
1,张飞,21,M,北京,80
2,关羽,23,M,北京,82
7,周瑜,24,M,北京,85
3,赵云,20,F,上海,88
4,刘备,26,M,上海,83
8,孙权,26,M,上海,78
5,曹操,30,F,深圳,90.8
6,孔明,35,F,深圳,77.8
9,吕布,28,M,深圳,98
package createdf
import org.apache.log4j.{
Level, Logger}
import org.apache.spark.sql.expressions.{
Window, WindowSpec}
import org.apache.spark.sql.{
DataFrame, SparkSession}
/**
* @date :2021/4/2 18:38
* @author :xiaotao
* @description :求每个城市中成绩最高的两个人的信息
*/
/**
 * Finds the two highest-scoring people in each city, solved twice:
 * once with raw Spark SQL and once with the DataFrame DSL API.
 * Both variants rank rows with a `row_number()` window partitioned by
 * city and ordered by score descending, then keep ranks 1 and 2.
 *
 * @author xiaotao
 * @since  2021/4/2 18:38
 */
object DataFrameDemo {

  // Silence Spark's verbose INFO logging before the session starts.
  Logger.getLogger("org").setLevel(Level.WARN)

  // Single source of truth for the input path. The DSL variant previously
  // read the malformed path "D:doc\\stu2.csv" (missing "\\" after "D:").
  private val InputPath = "D:\\doc\\stu2.csv"

  def main(args: Array[String]): Unit = {
    val ss: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    sqlTest(ss)
    dslApiTest(ss)
    ss.stop()
  }

  /**
   * Reads the student CSV into a DataFrame. With no explicit schema every
   * column (including score) arrives as StringType, so callers must cast
   * before doing numeric comparisons.
   */
  private def readStudents(ss: SparkSession): DataFrame =
    ss.read
      .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .option("header", true)
      .csv(InputPath)

  /** Top-2 per city via Spark SQL over a temp view. */
  def sqlTest(ss: SparkSession): Unit = {
    val df: DataFrame = readStudents(ss)
    df.createTempView("tmp")
    // Cast score to double: the CSV column is a string, and a plain
    // "order by score desc" would sort lexicographically ("100" < "80").
    ss.sql(
      """
        |select
        |  city,
        |  name
        |from (
        |  select
        |    city,
        |    name,
        |    row_number() over(partition by city order by cast(score as double) desc) as rns
        |  from tmp
        |) o
        |where rns <= 2
      """.stripMargin).show()
  }

  /** Top-2 per city via the DataFrame DSL, mirroring sqlTest's query. */
  def dslApiTest(ss: SparkSession): Unit = {
    val df: DataFrame = readStudents(ss)
    import ss.implicits._
    import org.apache.spark.sql.functions._
    // Same cast-to-double rationale as in sqlTest: score is a string column.
    val window: WindowSpec =
      Window.partitionBy('city).orderBy('score.cast("double").desc)
    df.select('city, 'name, row_number().over(window) as "rns")
      .where('rns <= 2)
      .drop('rns)
      .show()
  }
}
E:\develop\Java\jdk1.8.0_171\bin\java.exe ...
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
+----+----+
|city|name|
+----+----+
| 深圳| 吕布|
| 深圳| 曹操|
| 上海| 赵云|
| 上海| 刘备|
| 北京| 周瑜|
| 北京| 关羽|
+----+----+
+----+----+
|city|name|
+----+----+
| 深圳| 吕布|
| 深圳| 曹操|
| 上海| 赵云|
| 上海| 刘备|
| 北京| 周瑜|
| 北京| 关羽|
+----+----+
Process finished with exit code 0