Counting one day's UV (DAU) with Flink

Real-time DAU counting with Flink 1.9.
Difficulty: the window is very long (24 hours) and DAU cannot be computed incrementally, so the whole window has to be deduplicated.
Implementation: a ProcessAllWindowFunction caches the full window contents in MapState backed by RocksDB; a processing-time trigger fires every 20 s and recomputes the deduplicated count from midnight up to now, which is the real-time DAU.
Optimization: an evictor removes elements that have already been cached in the MapState, so far fewer elements reach process() on each firing, cutting the performance cost.
Limitation: because this is an all-window computation and DAU needs no keyBy, the computation itself always runs at a parallelism of 1, so allocate somewhat more resources when submitting the job. The code follows:

package com.ops


import java.util.{Date, Properties}

import com.fasterxml.jackson.databind.ObjectMapper
import com.xsyx.ops.DateUtil
import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{MapState, MapStateDescriptor}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.api.scala._
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks
import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.evictors.TimeEvictor
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers._
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import scala.collection.JavaConversions._
import scala.util.{Failure, Success, Try}

object RealTimeDau2 {
  def main(args: Array[String]): Unit = {
    // 1-day tumbling window, shifted by -8 hours so it aligns with local (UTC+8) midnight
    val windowTime = TumblingEventTimeWindows.of(Time.days(1), Time.hours(-8))
    // fire the window every 20 s (processing time)
    val triggerInterval = 20
    var backendFilePath = ""
    val parallelism = 1
    // evict elements older than 20 s before each firing
    val evictorTime = 20

    val jobName = "FlinkFrontLogDau"
    val topicNames = List("per_log")
    val groupName = "ops-dau2"
    val properties = new Properties()
    var kafkaBrokers: String = null

    // determine the running OS (local Windows dev vs. server)
    val ifOSIsWin = System.getProperty("os.name").startsWith("Windows")
    ifOSIsWin match {
      case true => {
        kafkaBrokers = "192.168.1.225:9092" //"172.16.7.207:9092"
        backendFilePath = "file:///D:/filebackend"
      }
      case _ => {
        kafkaBrokers = "x.x.x.x:9092"
        backendFilePath = "file:///opt/ops/flinkbackend"
      }
    }
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(parallelism)
    env.setStateBackend(new RocksDBStateBackend(backendFilePath, true)) // RocksDB state backend with incremental checkpoints
    env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    env.enableCheckpointing(20000)
    env.getCheckpointConfig.setFailOnCheckpointingErrors(false)
    env.getCheckpointConfig.setMaxConcurrentCheckpoints(1)
    env.getCheckpointConfig.setCheckpointTimeout(10000)
    env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500)
    env.getConfig.setRestartStrategy(RestartStrategies.fixedDelayRestart(4, 1000))
    env.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    properties.setProperty("bootstrap.servers", kafkaBrokers)
    properties.setProperty("group.id", groupName)

    val myConsumer = new FlinkKafkaConsumer[String](topicNames, new SimpleStringSchema(), properties)
    if (ifOSIsWin) {
      myConsumer.setStartFromLatest()
    }
    val mapper = new ObjectMapper
    val dauStream = env.addSource(myConsumer).filter(_.nonEmpty).map(line => {
      // use the current processing time as the record timestamp
      val timeStamp = new Date().getTime
      Try {
        // each record is JSON; the user guid sits under message.g
        val rootNode = mapper.readTree(line)
        val messageNode = mapper.readTree(rootNode.get("message").toString)
        val guid = messageNode.get("g").asText
        // deduplicate on the guid's hashCode so the MapState key is a compact Int
        ("", guid.hashCode, timeStamp)
      } match {
        case Success(value) => (value._1, value._2, value._3)
        case Failure(exception) => {
          println(exception.toString)
          // unparsable records all fall back to key 0 and are counted at most once
          ("", 0, timeStamp)
        }
      }
    })
      .assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[(String, Int, Long)] {
        var currentMaxTimestamp = 0L
        val maxOutOfOrderness = 2000L
        var lastEmittedWatermark: Long = Long.MinValue

        override def extractTimestamp(element: (String, Int, Long), previousElementTimestamp: Long): Long = {
          //timestamp
          val timestamp = element._3
          if (timestamp > currentMaxTimestamp) {
            currentMaxTimestamp = timestamp
          }
          timestamp
        }

        // Returns the current watermark
        override def getCurrentWatermark: Watermark = {
          // allow events to be up to 2 seconds late
          val potentialWM = currentMaxTimestamp - maxOutOfOrderness
          // make sure the emitted watermark never goes backwards
          if (potentialWM >= lastEmittedWatermark) {
            lastEmittedWatermark = potentialWM
          }
          new Watermark(lastEmittedWatermark)
        }
      })
      .windowAll(windowTime)
      // triggerInterval: fire the window every 20 s. Without an evictor, every firing would feed the
      // full window contents into MyReduceWindowAllFunction's MapState, which quickly becomes huge.
      // evictorTime is therefore also 20 s: elements older than 20 s are evicted, so each firing only
      // receives roughly the last 20 s of raw data; everything older was already cached in the MapState
      // by earlier firings. As long as each firing finishes within the 20 s triggerInterval, the final
      // UV result stays correct. If a firing takes longer than 20 s (say 25 s), the next firing starts
      // before this one has finished writing its 20 s of data into the MapState; the evictor then drops
      // those elements and the computed UV can come out lower than the real value.
      .trigger(ContinuousProcessingTimeTrigger.of(Time.seconds(triggerInterval)))
      .evictor(TimeEvictor.of(Time.seconds(evictorTime), true))
      //      .aggregate(new MyAggregateFunction)
      .process(new MyReduceWindowAllFunction)

    ifOSIsWin match {
      case true => dauStream.print()
      case _ => dauStream.addSink(new MysqlSinkPool)
    }
    env.execute(jobName)
  }



  class MyReduceWindowAllFunction extends ProcessAllWindowFunction[(String, Int, Long), (String, Long), TimeWindow] {
    private var userState: MapState[Int, String] = _
    val userStateDesc = new MapStateDescriptor[Int, String]("dau-state", classOf[Int], classOf[String])

    override def process(context: Context, elements: Iterable[(String, Int, Long)], out: Collector[(String, Long)]): Unit = {
      // get the per-window MapState (kept in the RocksDB state backend)
      userState = context.windowState.getMapState(userStateDesc)

      val currentProcessTime = DateUtil.getCurrentDateTimeJdk8(true)
      // put every element of this firing into the MapState keyed by guid hash, which deduplicates
      for (e <- elements) {
        val key = e._2
        userState.put(key, null) // the value is never read
      }

      // the number of distinct keys in the MapState is the current DAU
      val currentDau = userState.keys().size.toLong
      out.collect((currentProcessTime, currentDau))
    }
  }

}
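The listing references two internal helpers that are not shown: DateUtil.getCurrentDateTimeJdk8, which formats the current time as a string, and MysqlSinkPool, which writes the (time, dau) tuples to MySQL. Minimal sketches of what they might look like follow; the date patterns, JDBC URL, table and column names are assumptions rather than the actual internal implementations.

package com.xsyx.ops

import java.time.LocalDateTime
import java.time.format.DateTimeFormatter

object DateUtil {
  // Returns the current time as a string using java.time (JDK 8); the meaning of the boolean
  // flag (include seconds or not) and the patterns are assumptions.
  def getCurrentDateTimeJdk8(withSeconds: Boolean): String = {
    val pattern = if (withSeconds) "yyyy-MM-dd HH:mm:ss" else "yyyy-MM-dd HH:mm"
    LocalDateTime.now().format(DateTimeFormatter.ofPattern(pattern))
  }
}

And a plain JDBC sink matching the record shape the job emits, (String, Long); despite the name, this sketch uses a single connection instead of a pool:

package com.ops

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction

class MysqlSinkPool extends RichSinkFunction[(String, Long)] {
  private var conn: Connection = _
  private var stmt: PreparedStatement = _

  override def open(parameters: Configuration): Unit = {
    // placeholder connection details and table schema
    conn = DriverManager.getConnection("jdbc:mysql://x.x.x.x:3306/ops", "user", "password")
    stmt = conn.prepareStatement("INSERT INTO dau (stat_time, dau) VALUES (?, ?)")
  }

  override def invoke(value: (String, Long)): Unit = {
    stmt.setString(1, value._1)
    stmt.setLong(2, value._2)
    stmt.executeUpdate()
  }

  override def close(): Unit = {
    if (stmt != null) stmt.close()
    if (conn != null) conn.close()
  }
}

Because the result stream fires only every 20 seconds and carries a single tuple per firing, a simple per-record insert like this is usually enough; batching or a real connection pool only matters if the trigger interval is much shorter.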
