Spark SQL 开窗函数(窗口函数)用法总结及代码实现

/**
  * Created by Administrator on 2018/9/4.
  *
  * Demonstrates window-function frames with Spark SQL and the DataFrame API.
  *
  * Frame boundaries (all offsets are relative to the current row and never
  * leave the current partition):
  *  - `N preceding`: the boundary lies N rows before the current row;
  *    `unbounded preceding` means the first row of the partition.
  *  - `N following`: the boundary lies N rows after the current row;
  *    `unbounded following` means the last row of the partition.
  *  - `current row`: offset 0.
  *  Both boundary rows are included in the frame.
  *
  * Note that a `rows` frame counts physical rows, not calendar days: with
  * gaps in `event_date`, `1 preceding` is the previous row of the partition,
  * which is not necessarily "yesterday".
  *
  * 1. Running total: the current row and everything before it in the partition.
  *     Form 1: select pcode,event_date,
  *             sum(duration) over (partition by pcode order by event_date asc) as sum_duration from userlogs_date
  *     Form 2: select pcode,event_date,
  *             sum(duration) over (partition by pcode order by event_date asc rows between unbounded preceding and current row) as sum_duration from userlogs_date
  *     Form 1 uses the default frame, which is
  *     `range between unbounded preceding and current row`. It yields the same
  *     result as form 2 only while the `order by` key is unique within each
  *     partition; rows that tie on the key are summed together by form 1.
  *     DataFrame equivalent of form 2:
  *     `Window.partitionBy(...).orderBy(...).rowsBetween(Window.unboundedPreceding, Window.currentRow)`
  * 2. Current row plus the previous row:
  *     select pcode,event_date,
  *         sum(duration) over (partition by pcode order by event_date asc rows between 1 preceding and current row) as sum_duration from userlogs_date
  * 3. Previous row, current row and next row:
  *     select pcode,event_date,
  *         sum(duration) over (partition by pcode order by event_date asc rows between 1 preceding and 1 following ) as sum_duration from userlogs_date
  * 4. The whole partition (every row of a partition gets the same sum):
  *     select pcode,event_date,
  *         sum(duration) over (partition by pcode order by event_date asc rows between unbounded preceding and unbounded following ) as sum_duration from userlogs_date
  */
object 开窗函数 {

    /**
      * Entry point: obtains a session from the project helper `SparkUtil`,
      * runs the [[f3]] demo and stops the session.
      *
      * @param args command-line arguments (unused)
      */
    def main(args: Array[String]): Unit = {
        // NOTE(review): SparkUtil is defined elsewhere; the arguments are
        // presumably the application name and the master URL - confirm.
        val spark = SparkUtil.getSpark("func","local")
        // Stop the session even when the demo throws, so it is not leaked.
        try f3(spark)
        finally spark.stop()
    }

    /**
      * Builds the sample data set shared by the examples:
      * {{{
      * +-----+----------+--------+
      * |pcode|event_date|duration|
      * +-----+----------+--------+
      * |    1|2018-09-02|       3|
      * |    1|2018-09-03|       2|
      * |    1|2018-09-04|       1|
      * |    2|2018-09-01|       4|
      * |    2|2018-09-02|       3|
      * +-----+----------+--------+
      * }}}
      * The rows are combined with SQL `union`, which removes duplicates; the
      * five literal rows are all distinct, so none is dropped. The table above
      * is shown sorted for readability - the query has no `order by`.
      *
      * @param spark session used to run the query
      * @return a DataFrame with columns `pcode`, `event_date` and `duration`
      */
    def f(spark:SparkSession): DataFrame ={
        spark.sql(
            """
              |select '1' pcode,'2018-09-04' event_date,1 duration
              |union
              |select '1' pcode,'2018-09-03' event_date,2 duration
              |union
              |select '1' pcode,'2018-09-02' event_date,3 duration
              |union
              |select '2' pcode,'2018-09-02' event_date,3 duration
              |union
              |select '2' pcode,'2018-09-01' event_date,4 duration
            """.stripMargin)
    }

    /**
      * Running total per `pcode` using a window function, i.e.
      * `sum(duration) over (partition by pcode order by event_date asc) as sum_duration`,
      * implemented with the DataFrame API and printed with `show()`.
      *
      * Expected content (sorted here for readability):
      * {{{
      * +-----+----------+------------+
      * |pcode|event_date|sum_duration|
      * +-----+----------+------------+
      * |    1|2018-09-02|           3|
      * |    1|2018-09-03|           5|
      * |    1|2018-09-04|           6|
      * |    2|2018-09-01|           4|
      * |    2|2018-09-02|           7|
      * +-----+----------+------------+
      * }}}
      *
      * @param spark session used to build the sample data
      */
    def f1(spark:SparkSession): Unit ={
        import org.apache.spark.sql.functions._

        // Equivalent Spark SQL implementation, kept for reference:
        /*f(spark).createOrReplaceTempView("tmp")
            spark.sql(
            """
              |select pcode,event_date,sum(duration) over (partition by pcode order by event_date asc) as sum_duration
              |from
              |tmp
            """.stripMargin).show()*/

        // DataFrame implementation.
        val df = f(spark)
        val windowSpec = Window.partitionBy("pcode").orderBy(df("event_date").asc)
        df.select(
            df("pcode"),
            df("event_date"),
            // Aliased sum_duration so the result matches the SQL variant and
            // does not reuse the name of the input column.
            sum("duration").over(windowSpec).as("sum_duration")
        ).show()
    }

    /**
      * Extended examples: sums over a bounded range of rows, e.g. the previous
      * N rows, or from N rows before to N rows after the current row.
      *
      * Only the registration of the `tmp` temp view is active; the queries are
      * kept as commented-out reference snippets.
      *
      * @param spark session used to build the sample data and register the view
      */
    def f2(spark:SparkSession): Unit ={

        // Expose the sample data to the SQL snippets below as view "tmp".
        f(spark).createOrReplaceTempView("tmp")

        // 1a. Running total over all history, relying on the default frame.
        /*spark.sql(
            """
              |select pcode,event_date,
              |sum(duration) over (partition by pcode order by event_date asc) as sum_duration
              |from tmp
            """.stripMargin)
            .show()*/

        // 1b. Running total over all history with an explicit frame:
        //     rows between unbounded preceding and current row
        /*spark.sql(
            """
              |select pcode,event_date,
              |sum(duration) over (partition by pcode order by event_date asc rows between unbounded preceding and current row) as sum_duration
              |from tmp
            """.stripMargin)
            .show()*/

        // 2. Sum of the current row and the N rows before it, here N = 1:
        //    rows between 1 preceding and current row
        //
        //    +-----+----------+------------+
        //    |pcode|event_date|sum_duration|
        //    +-----+----------+------------+
        //    |    1|2018-09-02|           3|
        //    |    1|2018-09-03|           5|
        //    |    1|2018-09-04|           3|
        //    |    2|2018-09-01|           4|
        //    |    2|2018-09-02|           7|
        //    +-----+----------+------------+
        /*spark.sql(
            """
              |select pcode,event_date,
              |sum(duration) over (partition by pcode order by event_date asc rows between 1 preceding and current row) as sum_duration
              |from tmp
            """.stripMargin)
            .show()*/

        // 2. The same frame with the DataFrame API: rowsBetween(-1, 0) spans
        //    one row before the current row up to the current row.
        /*val df = f(spark)
        import org.apache.spark.sql.functions._
        val windowSpec = Window.partitionBy("pcode").orderBy("event_date").rowsBetween(-1,0)
        df.select(
            df("pcode"),
            df("event_date"),
            sum(df("duration")).over(windowSpec).as("sum_duration")
        ).show()*/
    }

    /**
      * Row type of the [[f3]] sample data.
      * NOTE(review): the field meanings are not documented; in the sample rows
      * `m` holds values such as '201808' (presumably a yyyyMM month) - confirm.
      */
    case class TT(id :String,b:String,m:String,k:String,n:Int)

    /**
      * Work-in-progress example: builds a typed `Dataset[TT]` from four literal
      * rows. No action is invoked on the dataset, so nothing is printed; the
      * grouping logic below is still a commented-out draft.
      *
      * @param spark session used to run the query and to provide the encoders
      */
    def f3(spark:SparkSession): Unit ={
        import spark.implicits._
        val df = spark.sql(
            """
              |SELECT '1' as id ,'a' b, '201808' as m,'a' as k,0 as n
              |union
              |SELECT '1' as id ,'b' b, '201809' as m,'a' as k,0 as n
              |union
              |SELECT '1' as id ,'c' b, '201808' as m,'b' as k,0 as n
              |union
              |SELECT '2' as id ,'d' b, '201809' as m,'b' as k,0 as n
            """.stripMargin).as[TT]

        // Draft: group by `k` and re-emit each group's rows ordered by `m`.
        // NOTE(review): the TreeSet ordering compares `m` only, so rows of a
        // group that share the same `m` would collapse into one element, and
        // `cache` is filled but never read - confirm the intent before enabling.
        /*df.groupByKey(_.k)
            .flatMapGroups[TT]((k:String,its:Iterator[TT])=>{
                val res = ArrayBuffer[TT]()
                val ts = mutable.TreeSet[TT]()(new Ordering[TT] {
                    override def compare(x: TT, y: TT): Int = {
                        x.m.toInt-y.m.toInt
                    }
                })
                val cache = mutable.HashMap[String,(String,Int)]()
                for (elem <- its) {
                    ts.add(elem)
                    cache.put(elem.m,cache.getOrElse(elem.m,(elem.id,1)))
                }
            //id :String,b:String,m:String,k:String,n:Int
                for(t<-ts){
                    res.append(TT(t.id,t.b,t.m,t.k,1))
                }
                res.iterator
            })
            .show()*/
    }
}

 

你可能感兴趣的:(spark)