/**
* Created by Administrator on 2018/9/4.
*
* Summary:
* preceding: accumulate over the N rows before the current row (within the partition). Use "unbounded" to start from the first row of the partition. N is the offset counted backwards from the current row.
* following: the opposite of preceding; accumulate over the N rows after the current row (within the partition). Use "unbounded" to run to the end of the partition. N is the offset counted forwards from the current row.
* current row: as the name says, the current row itself (offset 0).
* Note: the "N preceding", "M following" and "current row" bounds all include the row at that offset.
* (See the frameClauseExamples sketch below for the same frames written with the DataFrame API.)
*
* 1. Running total over all history: everything in the partition up to and including the current day
* Variant 1: select pcode,event_date,
* sum(duration) over (partition by pcode order by event_date asc) as sum_duration from userlogs_date
* Variant 2: select pcode,event_date,
* sum(duration) over (partition by pcode order by event_date asc rows between unbounded preceding and current row) as sum_duration from userlogs_date
* 2. Current day plus the previous day:
* select pcode,event_date,
* sum(duration) over (partition by pcode order by event_date asc rows between 1 preceding and current row) as sum_duration from userlogs_date
* 3. Previous day, current day and next day:
* select pcode,event_date,
* sum(duration) over (partition by pcode order by event_date asc rows between 1 preceding and 1 following) as sum_duration from userlogs_date
* 4. Sum over the whole partition (all rows before and after the current day; every row in the same partition gets the same sum):
* select pcode,event_date,
* sum(duration) over (partition by pcode order by event_date asc rows between unbounded preceding and unbounded following) as sum_duration from userlogs_date
*
*
*/
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SparkSession}

object 开窗函数 {
def main(args: Array[String]): Unit = {
val spark = SparkUtil.getSpark("func","local")
f3(spark)
spark.stop()
}
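/**
 * Hedged sketch added for illustration (the method name frameClauseExamples is mine, not from the
 * original file): the frame bounds described in the header comment, written with the DataFrame API.
 * Window.unboundedPreceding / Window.currentRow / Window.unboundedFollowing (available in Spark 2.1+)
 * correspond to "unbounded preceding" / "current row" / "unbounded following"; plain Long offsets
 * such as -1 and 1 correspond to "1 preceding" and "1 following". Any of these specs can be applied
 * exactly like the one in f1, e.g. sum("duration").over(spec).
 */
def frameClauseExamples(): Unit = {
  val base = Window.partitionBy("pcode").orderBy("event_date")
  // 1. unbounded preceding .. current row: running total over all history
  val history = base.rowsBetween(Window.unboundedPreceding, Window.currentRow)
  // 2. 1 preceding .. current row: previous row plus the current row
  val lastTwo = base.rowsBetween(-1, Window.currentRow)
  // 3. 1 preceding .. 1 following: previous, current and next row
  val around = base.rowsBetween(-1, 1)
  // 4. unbounded preceding .. unbounded following: the whole partition
  val whole = base.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
}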
/**
* Builds the sample data set (pcode, event_date, duration) used by the examples below:
* +-----+----------+--------+
|pcode|event_date|duration|
+-----+----------+--------+
| 1|2018-09-02| 3|
| 1|2018-09-03| 2|
| 1|2018-09-04| 1|
| 2|2018-09-01| 4|
| 2|2018-09-02| 3|
+-----+----------+--------+
* @param spark
* @return
*/
def f(spark: SparkSession): DataFrame = {
spark.sql(
"""
|select '1' pcode,'2018-09-04' event_date,1 duration
|union
|select '1' pcode,'2018-09-03' event_date,2 duration
|union
|select '1' pcode,'2018-09-02' event_date,3 duration
|union
|select '2' pcode,'2018-09-02' event_date,3 duration
|union
|select '2' pcode,'2018-09-01' event_date,4 duration
""".stripMargin)
}
/**
* Running total over all history using a window function:
* sum(duration) over (partition by pcode order by event_date asc) as sum_duration
* @param spark
*/
def f1(spark: SparkSession): Unit = {
/*+-----+----------+------------+
|pcode|event_date|sum_duration|
+-----+----------+------------+
| 1|2018-09-02| 3|
| 1|2018-09-03| 5|
| 1|2018-09-04| 6|
| 2|2018-09-01| 4|
| 2|2018-09-02| 7|
+-----+----------+------------+*/
// Spark SQL implementation
/*f(spark).createOrReplaceTempView("tmp")
spark.sql(
"""
|select pcode,event_date,sum(duration) over (partition by pcode order by event_date asc) as sum_duration
|from
|tmp
""".stripMargin).show()*/
// DataFrame implementation
val df = f(spark)
val windowSpec = Window.partitionBy("pcode").orderBy(df("event_date").asc)
import org.apache.spark.sql.functions._
df.select(
df("pcode"),
df("event_date"),
sum("duration").over(windowSpec).as("sum_duration")
).show()
}
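/**
 * Hedged follow-up sketch (the method name f1Explicit is mine): the same running total as f1 with
 * the frame spelled out as rows between unbounded preceding and current row (Variant 2 in the
 * header comment). Since event_date is distinct within each pcode in the sample data, this yields
 * the same sum_duration as the default frame used in f1.
 */
def f1Explicit(spark: SparkSession): Unit = {
  import org.apache.spark.sql.functions._
  val df = f(spark)
  val windowSpec = Window.partitionBy("pcode").orderBy(df("event_date").asc)
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
  df.select(
    df("pcode"),
    df("event_date"),
    sum("duration").over(windowSpec).as("sum_duration")
  ).show()
}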
/**
* Extension: running totals over a bounded time range,
* e.g. the previous N days up to today, or from N days before to N days after.
* @param spark
*/
def f2(spark: SparkSession): Unit = {
//1. Running total over all history (default frame)
f(spark).createOrReplaceTempView("tmp")
/*spark.sql(
"""
|select pcode,event_date,
|sum(duration) over (partition by pcode order by event_date asc) as sum_duration
|from tmp
""".stripMargin)
.show()*/
//1. Running total over all history, with the frame written out explicitly
//rows between unbounded preceding and current row
/* spark.sql(
"""
|select pcode,event_date,
|sum(duration) over (partition by pcode order by event_date asc rows between unbounded preceding and current row) as sum_duration
|from tmp
""".stripMargin)
.show()*/
//2. Accumulate from N days before through the current day, assuming N=1
//rows between 1 preceding and current row
/*+-----+----------+------------+
|pcode|event_date|sum_duration|
+-----+----------+------------+
| 1|2018-09-02| 3|
| 1|2018-09-03| 5|
| 1|2018-09-04| 3|
| 2|2018-09-01| 4|
| 2|2018-09-02| 7|
+-----+----------+------------+*/
/*spark.sql(
"""
|select pcode,event_date,
|sum(duration) over (partition by pcode order by event_date asc rows between 1 preceding and current row) as sum_duration
|from tmp
""".stripMargin)
.show()*/
//2. DataFrame implementation of case 2
/*val df = f(spark)
import org.apache.spark.sql.functions._
val windowSpec = Window.partitionBy("pcode").orderBy("event_date").rowsBetween(-1,0)
df.select(
df("pcode"),
df("event_date"),
sum(df("duration")).over(windowSpec).as("sum_duration")
).show()*/
}
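/**
 * Hedged sketch (the method name f2b is mine): DataFrame-API versions of cases 3 and 4 from the
 * header comment, which f2 only shows as SQL. Case 3 sums the previous, current and next row via
 * rowsBetween(-1, 1); case 4 sums the whole partition, so every row of a pcode gets the same value.
 */
def f2b(spark: SparkSession): Unit = {
  import org.apache.spark.sql.functions._
  val df = f(spark)
  // 3. previous day + current day + next day
  val around = Window.partitionBy("pcode").orderBy("event_date").rowsBetween(-1, 1)
  // 4. the whole partition: unbounded preceding .. unbounded following
  val whole = Window.partitionBy("pcode").orderBy("event_date")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
  df.select(
    df("pcode"),
    df("event_date"),
    sum("duration").over(around).as("sum_around"),
    sum("duration").over(whole).as("sum_partition")
  ).show()
}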
case class TT(id: String, b: String, m: String, k: String, n: Int)
def f3(spark: SparkSession): Unit = {
import spark.implicits._
val df = spark.sql(
"""
|SELECT '1' as id ,'a' b, '201808' as m,'a' as k,0 as n
|union
|SELECT '1' as id ,'b' b, '201809' as m,'a' as k,0 as n
|union
|SELECT '1' as id ,'c' b, '201808' as m,'b' as k,0 as n
|union
|SELECT '2' as id ,'d' b, '201809' as m,'b' as k,0 as n
""".stripMargin).as[TT]
Window.partitionBy("").orderBy("").rowsBetween(Long.MinValue,0L)
import org.apache.spark.sql.functions._
// Alternative sketch, left commented out: for each key k, collect the rows, sort them by month m
// and re-emit them with n set to 1. Note that the `cache` map below is populated but never read.
/*df.groupByKey(_.k)
.flatMapGroups[TT]((k:String,its:Iterator[TT])=>{
val res = ArrayBuffer[TT]()
val ts = mutable.TreeSet[TT]()(new Ordering[TT] {
override def compare(x: TT, y: TT): Int = {
x.m.toInt-y.m.toInt
}
})
val cache = mutable.HashMap[String,(String,Int)]()
for (elem <- its) {
ts.add(elem)
cache.put(elem.m,cache.getOrElse(elem.m,(elem.id,1)))
}
//id :String,b:String,m:String,k:String,n:Int
for(t<-ts){
res.append(TT(t.id,t.b,t.m,t.k,1))
}
res.iterator
})
.show()*/
}
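/**
 * Hedged sketch (the method name f3Window is mine): applies the rowsBetween(Long.MinValue, 0L)
 * frame built in f3 (start of the partition up to the current row) to the same sample data,
 * computing a running row count per key k ordered by month m. It is not a drop-in replacement for
 * the commented-out flatMapGroups sketch above, just an illustration of that window spec.
 */
def f3Window(spark: SparkSession): Unit = {
  import org.apache.spark.sql.functions._
  val df = spark.sql(
    """
      |SELECT '1' as id ,'a' b, '201808' as m,'a' as k,0 as n
      |union
      |SELECT '1' as id ,'b' b, '201809' as m,'a' as k,0 as n
      |union
      |SELECT '1' as id ,'c' b, '201808' as m,'b' as k,0 as n
      |union
      |SELECT '2' as id ,'d' b, '201809' as m,'b' as k,0 as n
    """.stripMargin)
  val windowSpec = Window.partitionBy("k").orderBy("m").rowsBetween(Long.MinValue, 0L)
  df.withColumn("running_rows", count(lit(1)).over(windowSpec)).show()
}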
}