I. Common operations: ranking and running totals
1. Top three per group:
- Core SQL:
select * from
  (select id, name, clazz, score,
          row_number() over (partition by clazz order by score desc) as rank
   from person) res
where res.rank <= 3
- Result:
+---+----+------+-----+----+
| id|name| clazz|score|rank|
+---+----+------+-----+----+
|  5|   e| spark|   60|   1|
|  9|   i| spark|   50|   2|
|  2|   b| spark|   30|   3|
| 11|   k|hadoop|   70|   1|
| 10|   j|hadoop|   60|   2|
|  4|   d|hadoop|   50|   3|
| 12|   l|  hive|   90|   1|
| 13|   m|  hive|   80|   2|
|  6|   f|  hive|   70|   3|
+---+----+------+-----+----+
- Full code:
package com.lifecycle.demo01

import org.apache.spark.sql.{DataFrame, SparkSession}

object Demo03 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .config("spark.eventLog.enabled", "false")
      .config("spark.driver.memory", "2g")
      .config("spark.executor.memory", "2g")
      .appName("SparkDemoFromS3")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    // Read the CSV, name the columns, and expose the data as a temp view for SQL
    val df01: DataFrame = spark.read.option("delimiter", ",").csv("person.csv")
    val df02: DataFrame = df01.toDF("id", "name", "clazz", "score")
    df02.createOrReplaceTempView("person")

    // Top three scores per class: number rows within each clazz partition
    // (highest score first), then keep ranks 1..3
    spark.sql("select * from " +
      "(select id,name,clazz,score,row_number() over(partition by clazz order by score desc) rank from person) res " +
      "where res.rank <= 3").show(100)

    spark.close()
  }
}
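- The same top-three query can also be written with the DataFrame API instead of SQL. A minimal sketch, assuming the df02 DataFrame from the code above (this variant is not part of the original demo):
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number}

// One partition per class, highest score first
val byClazz = Window.partitionBy("clazz").orderBy(col("score").desc)

// row_number() numbers rows 1, 2, 3, ... inside each partition;
// keeping rank <= 3 gives the top three per class
df02.withColumn("rank", row_number().over(byClazz))
  .where(col("rank") <= 3)
  .show()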
2. Group aggregation:
- SQL:
select clazz, sum(score) as sumscore from person group by clazz
- Result:
+------+--------+
| clazz|sumscore|
+------+--------+
| spark| 170.0|
|hadoop| 220.0|
| hive| 280.0|
+------+--------+
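- For comparison, the same aggregation with the DataFrame API; a minimal sketch assuming the df02 DataFrame defined in the full code above:
import org.apache.spark.sql.functions.sum

// groupBy + agg is the DataFrame equivalent of GROUP BY.
// score was read from CSV as a string, so sum() implicitly casts it
// to double, which is why the output shows 170.0 rather than 170.
df02.groupBy("clazz")
  .agg(sum("score").as("sumscore"))
  .show()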
3. Grouped running total (sum over)
- Source data:
1438,2016-05-13,165
1438,2016-05-14,595
1438,2016-05-15,105
1629,2016-05-13,12340
1629,2016-05-14,13850
1629,2016-05-15,227
- SQL:
select pcode, event_date, duration,
       sum(duration) over (partition by pcode order by event_date asc) as sum_duration
from userlogs_date
- Result:
+-----+----------+--------+------------+
|pcode|event_date|duration|sum_duration|
+-----+----------+--------+------------+
| 1438|2016-05-13|     165|       165.0|
| 1438|2016-05-14|     595|       760.0|
| 1438|2016-05-15|     105|       865.0|
| 1629|2016-05-13|   12340|     12340.0|
| 1629|2016-05-14|   13850|     26190.0|
| 1629|2016-05-15|     227|     26417.0|
+-----+----------+--------+------------+
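- Why this is cumulative: with an order by and no explicit frame, Spark defaults the window frame to "range between unbounded preceding and current row". Spelling the frame out gives the same totals; a sketch (here "rows" coincides with the default "range" because event_date is unique within each pcode):
spark.sql(
  """select pcode, event_date, duration,
    |       sum(duration) over (partition by pcode order by event_date asc
    |                           rows between unbounded preceding and current row) as sum_duration
    |from userlogs_date""".stripMargin).show(100)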
4. Ungrouped running total: an order by is required
- SQL:
select pcode, event_date, duration,
       sum(duration) over (order by pcode, event_date asc) as sum_duration
from userlogs_date
- Result:
+-----+----------+--------+------------+
|pcode|event_date|duration|sum_duration|
+-----+----------+--------+------------+
| 1438|2016-05-13|     165|       165.0|
| 1438|2016-05-14|     595|       760.0|
| 1438|2016-05-15|     105|       865.0|
| 1629|2016-05-13|   12340|     13205.0|
| 1629|2016-05-14|   13850|     27055.0|
| 1629|2016-05-15|     227|     27282.0|
+-----+----------+--------+------------+
- Full code:
package com.lifecycle.demo01

import org.apache.spark.sql.{DataFrame, SparkSession}

object Demo05 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .config("spark.eventLog.enabled", "false")
      .config("spark.driver.memory", "2g")
      .config("spark.executor.memory", "2g")
      .appName("SparkDemoFromS3")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    // Columns come in as strings from CSV; sum() casts duration to double,
    // which is why the output shows 165.0, 760.0, ...
    val df01: DataFrame = spark.read.option("delimiter", ",").csv("ppt")
    val df02: DataFrame = df01.toDF("pcode", "event_date", "duration")
    df02.createOrReplaceTempView("userlogs_date")

    // Running total over the whole table, ordered by (pcode, event_date)
    spark.sql("select pcode,event_date,duration," +
      "sum(duration) over (order by pcode,event_date asc) as sum_duration " +
      "from userlogs_date").show(100)

    spark.close()
  }
}
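- The same running total in the DataFrame API; a minimal sketch assuming the df02 DataFrame from the code above:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, sum}

// No partitionBy: a single window over the whole table, ordered by
// (pcode, event_date). Spark pulls all rows into one partition for
// such a window, so this is only appropriate for small data.
val ordered = Window.orderBy("pcode", "event_date")

df02.withColumn("sum_duration", sum(col("duration")).over(ordered))
  .show()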
II. Window function reference:
1. Categories:
- ranking functions (e.g. row_number, rank)
- analytic functions (e.g. lag, lead)
- aggregate functions (e.g. sum, avg); one of each appears in the sketch below
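- One function from each category in a single query, as a quick illustration; a minimal sketch against the userlogs_date view defined earlier (the choice of rank, lag, and sum is illustrative, not from the original):
spark.sql(
  """select pcode, event_date, duration,
    |       rank()        over (order by duration desc)                  as rnk,      -- ranking
    |       lag(duration) over (partition by pcode order by event_date)  as prev_dur, -- analytic
    |       sum(duration) over (partition by pcode)                      as total_dur -- aggregate
    |from userlogs_date""".stripMargin).show(100)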
2. rank: ordering without grouping
- Source data:
1438,2016-05-13,165
1438,2016-05-14,595
1438,2016-05-15,105
1629,2016-05-13,12340
1629,2016-05-14,13850
1629,2016-05-15,227
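- A minimal sketch of rank() over the whole table (no partition by), assuming the same userlogs_date view; the exact query here is an illustration:
spark.sql(
  """select pcode, event_date, duration,
    |       rank() over (order by duration desc) as rank
    |from userlogs_date""".stripMargin).show(100)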