Flink_aggregate_process_state 案例

先说下模拟场景吧:统计日志数据中每个URL最近一分钟的访问量,取出访问量最大的五个,每五秒更新一次
解决思路:
1、将日志中的时间转换为时间戳,作为EventTime
2、日志数据是乱序的,所以调用assignTimestampsAndWatermarks提取EventTime,并将watermark的延迟设置为60秒
3、构建滑动窗口(窗口长度1分钟,滑动步长5秒)
4、自定义预聚合函数aggregate将数据转换为UrlViewCount格式
5、自定义转换算子process将数据转换输出

数据如下,需要代码的朋友留邮件即可

83.149.9.216 - - 17/05/2015:10:05:03 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-search.png
83.149.9.216 - - 17/05/2015:10:05:43 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png
83.149.9.216 - - 17/05/2015:10:05:47 +0000 GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js
83.149.9.216 - - 17/05/2015:10:05:12 +0000 GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js
83.149.9.216 - - 17/05/2015:10:05:07 +0000 GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js
83.149.9.216 - - 17/05/2015:10:05:34 +0000 GET /presentations/logstash-monitorama-2013/images/sad-medic.png
83.149.9.216 - - 17/05/2015:10:05:57 +0000 GET /presentations/logstash-monitorama-2013/css/fonts/Roboto-Bold.ttf
83.149.9.216 - - 17/05/2015:10:05:50 +0000 GET /presentations/logstash-monitorama-2013/css/fonts/Roboto-Regular.ttf
83.149.9.216 - - 17/05/2015:10:05:24 +0000 GET /presentations/logstash-monitorama-2013/images/frontend-response-codes.png
83.149.9.216 - - 17/05/2015:10:05:50 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-dashboard.png
83.149.9.216 - - 17/05/2015:10:05:46 +0000 GET /presentations/logstash-monitorama-2013/images/Dreamhost_logo.svg
83.149.9.216 - - 17/05/2015:10:05:11 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-dashboard2.png
83.149.9.216 - - 17/05/2015:10:05:19 +0000 GET /presentations/logstash-monitorama-2013/images/apache-icon.gif

代码如下,详情见注释

    package com.thoughtworks.log
    
    import java.sql.Timestamp
    import java.text.SimpleDateFormat
    import java.util
    
    import org.apache.flink.api.common.functions.AggregateFunction
    import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
    import org.apache.flink.configuration.Configuration
    import org.apache.flink.contrib.streaming.state.{RocksDBOptions, RocksDBStateBackend}
    import org.apache.flink.core.fs.Path
    import org.apache.flink.runtime.state.StateBackend
    import org.apache.flink.runtime.state.filesystem.FsStateBackend
    import org.apache.flink.streaming.api.TimeCharacteristic
    import org.apache.flink.streaming.api.functions.KeyedProcessFunction
    import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
    import org.apache.flink.streaming.api.scala._
    import org.apache.flink.streaming.api.scala.function.WindowFunction
    import org.apache.flink.streaming.api.windowing.time.Time
    import org.apache.flink.streaming.api.windowing.windows.TimeWindow
    import org.apache.flink.util.{Collector, TernaryBoolean}
    
    import scala.collection.mutable.ListBuffer
    
    /**
      * One parsed record of the input access log.
      *
      * @param ip        client IP field of the log line
      * @param userName  user-name field of the log line
      * @param eventTime time of the request as epoch milliseconds
      * @param method    HTTP method field of the log line
      * @param url       requested URL field of the log line
      */
    case class LogEvent(
      ip: String,
      userName: String,
      eventTime: Long,
      method: String,
      url: String
    )
    
    /**
      * Intermediate result: the number of hits counted for one URL in one window.
      *
      * @param url       the URL that was counted
      * @param windowEnd end timestamp of the window the count belongs to
      * @param count     number of events counted for the URL in that window
      */
    case class UrlViewCount(
      url: String,
      windowEnd: Long,
      count: Long
    )
    
    /**
      * Entry point of the job: reads an access log, counts hits per URL over a sliding
      * event-time window and prints the top five URLs of every window.
      */
    object NetworkLog {
      /**
        * Builds the streaming pipeline and executes it.
        *
        * @param args command-line arguments (not used)
        */
      def main(args: Array[String]): Unit = {
        val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
        // Use event time, i.e. the time recorded in each log line.
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

        // TODO  FSImage
        val checkPointPath = new Path("file:///fs/checkpoints")
        val fsStateBackend: StateBackend = new FsStateBackend(checkPointPath)
        env.setStateBackend(fsStateBackend)

        // TODO  RocksDB has some problems and is unusable for now
        //    val checkpointDataUri = "file:///fs/checkpoints"
        //    val tmpDir = "file:///rocksDB/checkpoints"
        //    val fsStateBackend: StateBackend = new FsStateBackend(checkpointDataUri)
        //    val rocksDBBackend: RocksDBStateBackend = new RocksDBStateBackend(fsStateBackend, TernaryBoolean.TRUE)
        //    val config = new Configuration()
        //    // Timers are kept either on HEAP (default, better performance) or in RocksDB (scales better)
        //    config.setString(RocksDBOptions.TIMER_SERVICE_FACTORY, RocksDBStateBackend.PriorityQueueStateType.ROCKSDB.toString)
        //    rocksDBBackend.configure(config)
        //    rocksDBBackend.setDbStoragePath(tmpDir)
        //    env.setStateBackend(rocksDBBackend.asInstanceOf[StateBackend])

        env.readTextFile("data/apache.log")
          .map(line => parseLogLine(line))
          // Watermark lags 60 seconds behind the largest timestamp seen, to tolerate out-of-order lines.
          .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[LogEvent](Time.seconds(60)) {
          override def extractTimestamp(element: LogEvent): Long = element.eventTime
        })
          .keyBy(_.url) // group by URL
          .timeWindow(Time.minutes(1), Time.seconds(5)) // sliding window: 60 s long, sliding every 5 s
          .aggregate(new CountAgg(), new WindowResultFun()) // incremental count, then wrap as UrlViewCount
          .keyBy(_.windowEnd) // group the per-URL counts by the window they belong to
          .process(new TopURL(5)) // rank the counts of each window and format the top five
          .print("Log test")
          .setParallelism(1)

        env.execute("haha")
      }

      /**
        * Parses one space-separated log line into a [[LogEvent]].
        *
        * The date field (index 3) is parsed together with the UTC-offset field (index 4),
        * so the resulting epoch milliseconds do not depend on the JVM's default time zone.
        *
        * @param line raw log line, e.g. `83.149.9.216 - - 17/05/2015:10:05:03 +0000 GET /path`
        * @return the parsed event; `eventTime` is already in milliseconds
        */
      private def parseLogLine(line: String): LogEvent = {
        val fields: Array[String] = line.split(" ")
        // SimpleDateFormat is not thread-safe, so a fresh instance is used for every record.
        val logTimeFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss Z")
        val timestamp = logTimeFormat.parse(s"${fields(3)} ${fields(4)}").getTime
        LogEvent(fields(0), fields(2), timestamp, fields(5), fields(6))
      }
    }
    
    /**
      * Incremental pre-aggregation: every incoming record increases a counter by one.
      *
      * Type parameters of the aggregate function:
      *  - LogEvent: input type
      *  - Long: accumulator type (the running count)
      *  - Long: result type (the final count)
      */
    class CountAgg() extends AggregateFunction[LogEvent, Long, Long] {
      // A new accumulator starts counting from zero.
      override def createAccumulator(): Long = {
        0L
      }

      // Two partial counts are combined by adding them.
      override def merge(a: Long, b: Long): Long = {
        a + b
      }

      // Each record adds one to the count; the record's content is not inspected.
      override def add(value: LogEvent, accumulator: Long): Long = {
        accumulator + 1
      }

      // The result is the accumulated count itself.
      override def getResult(accumulator: Long): Long = {
        accumulator
      }
    }
        
    /**
      * Runs when a window closes and wraps the pre-aggregated count as a UrlViewCount.
      *
      * Type parameters of the window function:
      *  - Long: input (the count)
      *  - UrlViewCount: output
      *  - String: key, i.e. the URL
      *  - TimeWindow: window type
      */
    class WindowResultFun() extends WindowFunction[Long, UrlViewCount, String, TimeWindow] {
      override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[UrlViewCount]): Unit = {
        // Only the first element of the input is read.
        val total: Long = input.head
        val summary = UrlViewCount(url = key, windowEnd = window.getEnd, count = total)
        out.collect(summary)
      }
    }
    
    /**
      * Custom process operator that ranks the URL counts of one window and emits the
      * top entries as a formatted text report.
      *
      * Type parameters of the keyed process function:
      *  - Long: the key, i.e. the window-end timestamp the stream is keyed by
      *  - UrlViewCount: input
      *  - String: output (the formatted report)
      *
      * @param size maximum number of URLs listed in each report
      */
    class TopURL(size: Int) extends KeyedProcessFunction[Long, UrlViewCount, String] {
      // Offset in milliseconds added to the window end when registering the timer,
      // and subtracted again when the window end is printed.
      private val timerDelayMs: Long = 100L

      // State is defined lazily, so it is only obtained from the runtime context on first use.
      lazy val urlState: ListState[UrlViewCount] = getRuntimeContext.getListState(new ListStateDescriptor[UrlViewCount]("url_state", classOf[UrlViewCount]))

      /**
        * Buffers every incoming count in state and registers an event-time timer
        * shortly after the end of the window the count belongs to.
        */
      override def processElement(value: UrlViewCount, ctx: KeyedProcessFunction[Long, UrlViewCount, String]#Context, out: Collector[String]): Unit = {
        urlState.add(value)

        // By the time this timer fires, all counts of the window are expected to have arrived.
        ctx.timerService().registerEventTimeTimer(value.windowEnd + timerDelayMs)
      }

      /**
        * Called when the timer fires: drains the buffered counts, sorts them by count in
        * descending order, keeps the first `size` entries and emits them as one report string.
        *
        * @param timestamp the timestamp the timer was registered for (window end + timerDelayMs)
        */
      override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, UrlViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
        import scala.collection.JavaConverters._

        // Copy the state into a strict list before clearing it; guard against a null
        // result from the state in case nothing is stored.
        val buffered: List[UrlViewCount] = Option(urlState.get()).map(_.asScala.toList).getOrElse(Nil)

        // Clear the state so the same counts are not ranked twice.
        urlState.clear()

        // Highest count first; the sort is stable, so ties keep their arrival order.
        val topUrls = buffered.sortBy(_.count)(Ordering[Long].reverse).take(size)

        // e.g.  No1:  URL=/blog/tags/firefox?flav=rss20  流量=55
        val rankingLines = topUrls.zipWithIndex.map { case (view, index) =>
          s"No${index + 1}:  URL=${view.url}  流量=${view.count}\n"
        }

        val separator = "===================================="
        val report = new StringBuilder
        report.append(separator).append("\n")
        report.append("时间: ").append(new Timestamp(timestamp - timerDelayMs)).append("\n")
        rankingLines.foreach(line => report.append(line))
        report.append(separator).append("\n\n")

        // No Thread.sleep here: sleeping inside onTimer would block the operator's thread.
        out.collect(report.toString())
      }
    }

你可能感兴趣的:(flink)