Stateful operators in Structured Streaming: flatMapGroupsWithState and mapGroupsWithState

Preface

I have been looking into Spark Structured Streaming recently, and a work task happened to require something like grouped sorting in a streaming computation, so here is an implementation with Structured Streaming.

Code


import java.sql.Timestamp

import com.xxx.source.Source
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode, Trigger}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.mutable
import scala.collection.mutable.ListBuffer


// State kept per (platform, f_sid, ds) group: the latest record_time seen for each user_id
case class State(key: (String, String, String), latest: mutable.Map[String, Long])

// One output row: the newest event of a user within its group
case class Result(platform: String, fSid: String, ds: String, userId: String, guide_sid: String, recordTime: Timestamp)


object Top {
  def main(args: Array[String]): Unit = {
    val spark = session()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._
    val dataFrame = Source.kafkaSource(spark)
    // JSON schema of the Kafka message value
    val schema = "user_id String,lv String,guide_sid String,my_colum_ String,vip String,record_time Long,f_sid String"
    val guide = dataFrame
      .selectExpr(s"from_json(cast (value as string),'$schema') as bi", "topic")
      .selectExpr("bi.user_id as user_id", "bi.lv as lv", "bi.guide_sid as guide_sid", "bi.vip as vip",
        "bi.record_time as record_time", "bi.f_sid as f_sid", "bi.my_colum_ as my_colum_",
        "from_unixtime(bi.record_time,'yyyyMMdd') as ds", "cast (topic as string) as platform")
      .filter(x => rowFilter(x)) // drop rows missing f_sid or ds
      .groupByKey(r => (r.getAs[String]("platform"), r.getAs[String]("f_sid"), r.getAs[String]("ds")))
      .flatMapGroupsWithState(OutputMode.Update(), GroupStateTimeout.NoTimeout())(top)
    val query = guide.writeStream
      .outputMode(OutputMode.Update())
      .format("console")
      .option("checkpointLocation", "/ck")
      .option("truncate", "false")
      .trigger(Trigger.ProcessingTime("10 seconds"))
      .start()
    query.awaitTermination()
  }



  def session(): SparkSession = {
    // run locally during development on Windows, on YARN otherwise
    // (SparkSession rejects the old "yarn-cluster" master string; use "yarn")
    val os = System.getProperty("os.name")
    val master = if (os.startsWith("Windows")) "local[*]" else "yarn"
    val appName = this.getClass.getSimpleName
    SparkSession.builder()
      .appName(appName)
      .master(master)
      .getOrCreate()
  }


  // keep only rows that carry both f_sid and ds
  def rowFilter(row: Row): Boolean =
    row.getAs[String]("f_sid") != null && row.getAs[String]("ds") != null


  def top(key: (String, String, String), value: Iterator[Row], state: GroupState[State]): Iterator[Result] = {
    // restore the per-group state, or start with an empty map on the first trigger
    val oldState = if (state.exists) state.get else State(key, mutable.Map[String, Long]())
    val latest = oldState.latest
    val results: ListBuffer[Result] = ListBuffer[Result]()
    value.toList
      .groupBy(_.getAs[String]("user_id"))
      .foreach { case (userID, groupRows) =>
        // newest event for this user within the current micro-batch
        val sortedRows = groupRows.sortWith(_.getAs[Long]("record_time") > _.getAs[Long]("record_time"))
        val top = sortedRows.head
        val guideSid = top.getAs[String]("guide_sid")
        val cur = top.getAs[Long]("record_time")
        val pre = latest.getOrElse[Long](userID, 0L)
        // emit only if it is strictly newer than anything seen for this user so far
        if (pre < cur) {
          latest.update(userID, cur)
          println("pre:" + pre + " cur:" + cur)
          results += Result(key._1, key._2, key._3, userID, guideSid, new Timestamp(cur * 1000))
        }
      }
    state.update(State(key, latest))
    results.toIterator
  }

}



This is the Kafka source reader:



import java.util.Properties

import com.youkia.utils.ServiceConf
import org.apache.spark.sql.{DataFrame, SparkSession}


object Source {
  private val KAFKA_PROPERTIES: Properties = ServiceConf.getProperties("/conf/kafka.properties")

  /**
   * Kafka data source: builds a streaming DataFrame from the options in kafka.properties.
   *
   * @param sparkSession the active SparkSession
   * @return a streaming DataFrame backed by the Kafka source
   */
  def kafkaSource(sparkSession: SparkSession): DataFrame = {
    // copy the java.util.Properties into a Scala map so it can be passed to options()
    val map = new scala.collection.mutable.HashMap[String, String]
    val ite = KAFKA_PROPERTIES.entrySet().iterator()
    while (ite.hasNext) {
      val next = ite.next()
      map += (next.getKey.toString -> next.getValue.toString)
    }
    sparkSession.readStream
      .format("kafka")
      .options(map)
      .load()
  }

}
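
The Spark Kafka source needs at least kafka.bootstrap.servers plus one subscription option (subscribe, subscribePattern, or assign). A hypothetical /conf/kafka.properties that would satisfy kafkaSource could look like this; the broker addresses and topic name are placeholders:

kafka.bootstrap.servers=broker1:9092,broker2:9092
subscribe=my_topic
startingOffsets=latest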

Notes

The overall function: consume data from Kafka in real time (the messages are JSON strings), parse them, group by platform, f_sid, ds and user_id, and keep only each user's record with the latest record_time for that day. Because a real-time stream cannot guarantee event-time ordering, we must order the data globally across batches before taking the top 1, which is what the state is for.
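
To make the latest-wins rule concrete, here is a tiny self-contained Scala sketch of the same comparison done in the top function above; the user ids and timestamps are made up:

import scala.collection.mutable

// latest record_time seen per user, playing the role of State.latest
val latest = mutable.Map[String, Long]()

// (user_id, record_time) events arriving out of event-time order
val events = Seq(("u1", 1600000300L), ("u1", 1600000100L), ("u2", 1600000200L))

for ((user, cur) <- events) {
  val pre = latest.getOrElse(user, 0L)
  if (pre < cur) { // only a strictly newer event is emitted
    latest.update(user, cur)
    println(s"emit $user at $cur") // u1@...300 and u2@...200 pass; u1@...100 is dropped
  }
}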

flatMapGroupsWithState and mapGroupsWithState are the full-state operators in Structured Streaming. Like updateStateByKey and mapWithState in Spark Streaming, they keep the previous aggregation results in state and update that state each time new data arrives.
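
A minimal sketch of that pattern, using a hypothetical running word count (the classic updateStateByKey example, rewritten with mapGroupsWithState):

import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout}

case class WordCount(word: String, count: Long)

// mapGroupsWithState returns exactly one record per group per trigger;
// the running total survives between triggers inside GroupState
def countWord(word: String, values: Iterator[String], state: GroupState[WordCount]): WordCount = {
  val updated = WordCount(word, state.getOption.map(_.count).getOrElse(0L) + values.size)
  state.update(updated)
  updated
}

// words: Dataset[String] from any streaming source (needs import spark.implicits._)
// words.groupByKey(identity)
//   .mapGroupsWithState(GroupStateTimeout.NoTimeout())(countWord)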

The main difference between mapGroupsWithState and flatMapGroupsWithState is that the former requires the function to return exactly one record per group, while the latter lets it return any number of records, including none. In addition, flatMapGroupsWithState takes one of two output modes, Append or Update. Both operators accept a GroupStateTimeout to configure how state times out.
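
The contrast shows up directly in the two signatures on KeyValueGroupedDataset (simplified here, encoder bounds elided):

// returns exactly one record per group:
def mapGroupsWithState[S, U](timeoutConf: GroupStateTimeout)(
    func: (K, Iterator[V], GroupState[S]) => U): Dataset[U]

// returns zero or more records per group, and takes an explicit output mode (Append or Update):
def flatMapGroupsWithState[S, U](outputMode: OutputMode, timeoutConf: GroupStateTimeout)(
    func: (K, Iterator[V], GroupState[S]) => Iterator[U]): Dataset[U]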

Notes on using GroupStateTimeout (a timeout-handling sketch follows the list):

  • The timeout type is a global parameter across all groups: it is fixed when calling [map|flatMap]GroupsWithState, while the exact timeout duration/timestamp can be configured per group by calling state.setTimeout...().
  • Timeouts can be based on processing time (GroupStateTimeout.ProcessingTimeTimeout) or event time (GroupStateTimeout.EventTimeTimeout).
  • With EventTimeTimeout, the user must also set an event-time watermark via Dataset.withWatermark().
  • When a group times out, the function is invoked for that group with no values and with GroupState.hasTimedOut() set to true.
  • The timeout is reset every time the function is invoked on a group, i.e. whenever the group receives new data or the group has timed out.
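
A minimal sketch of how these pieces fit together, using ProcessingTimeTimeout; the session state/update types, the (user_id, payload) input shape, and the 30-minute duration are all hypothetical:

import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}

case class SessionState(count: Long)
case class SessionUpdate(userId: String, count: Long, expired: Boolean)

def trackSession(userId: String, events: Iterator[(String, String)],
                 state: GroupState[SessionState]): Iterator[SessionUpdate] = {
  if (state.hasTimedOut) {
    // invoked with no values, only because the timeout fired
    val expired = state.get
    state.remove()
    Iterator(SessionUpdate(userId, expired.count, expired = true))
  } else {
    val updated = SessionState(state.getOption.map(_.count).getOrElse(0L) + events.size)
    state.update(updated)
    state.setTimeoutDuration("30 minutes") // must be re-armed on every invocation
    Iterator(SessionUpdate(userId, updated.count, expired = false))
  }
}

// pairs: Dataset[(String, String)] keyed by user id, from a streaming source
// pairs.groupByKey(_._1).flatMapGroupsWithState(
//   OutputMode.Update(), GroupStateTimeout.ProcessingTimeTimeout())(trackSession)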
