Real-time DAU statistics with Flink 1.9
Difficulty: the window is very long (24 hours) and DAU cannot be computed incrementally, so deduplication has to run over the whole window.
Implementation: a ProcessAllWindowFunction uses MapState backed by RocksDB as a full-window cache; a processing-time trigger fires every 20 s and recomputes the distinct count from midnight up to now, which is the real-time DAU.
Optimization: a TimeEvictor removes data that has already been cached in the MapState, so far fewer records reach process() on each firing and the per-trigger cost drops.
Shortcoming: because this is an all-window computation and DAU needs no keyBy, the windowed part always runs at parallelism 1, so resources have to be increased moderately when submitting the job (a keyed two-phase variant is sketched after the code). The code follows:
package com.ops
import java.text.SimpleDateFormat
import java.util
import java.util.{Date, Properties}
import com.fasterxml.jackson.databind.ObjectMapper
import com.xsyx.ops.DateUtil
import org.apache.flink.api.common.functions.{AggregateFunction, RichFlatMapFunction}
import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{ListStateDescriptor, MapState, MapStateDescriptor}
import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
import org.apache.flink.runtime.state.StateBackend
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
import org.apache.flink.streaming.api.functions.{AssignerWithPeriodicWatermarks, KeyedProcessFunction}
import org.apache.flink.streaming.api.scala.function.{ProcessAllWindowFunction, ProcessWindowFunction, WindowFunction}
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.streaming.api.windowing.assigners.{TumblingEventTimeWindows, TumblingProcessingTimeWindows}
import org.apache.flink.streaming.api.windowing.evictors.TimeEvictor
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers._
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import scala.collection.JavaConversions._
import scala.util.{Failure, Success, Try}
object RealTimeDau2 {
def main(args: Array[String]): Unit = {
// One-day window; the -8 hour offset aligns the window boundary to local midnight (UTC+8)
val windowTime = TumblingEventTimeWindows.of(Time.days(1), Time.hours(-8))
val triggerInterval = 20
var backendFilePath = ""
val parallelism = 1
val evictorTime = 20
val jobName = "FlinkFrontLogDau"
val topicNames = List("per_log")
val groupName = "ops-dau2"
val properties = new Properties()
var kafkaBrokers: String = null
// Detect the OS we are running on (local Windows dev vs. server)
val ifOSIsWin = System.getProperty("os.name").startsWith("Windows")
ifOSIsWin match {
case true => {
kafkaBrokers = "192.168.1.225:9092" //"172.16.7.207:9092"
backendFilePath = "file:///D:/filebackend"
}
case _ => {
kafkaBrokers = "x.x.x.x:9092"
backendFilePath = "file:///opt/ops/flinkbackend"
}
}
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
env.setParallelism(parallelism)
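// The second constructor argument enables incremental checkpoints, which keeps checkpoint sizes manageable as the 24-hour MapState grows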
env.setStateBackend(new RocksDBStateBackend(backendFilePath, true))
env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
env.enableCheckpointing(20000)
env.getCheckpointConfig.setFailOnCheckpointingErrors(false)
env.getCheckpointConfig.setMaxConcurrentCheckpoints(1)
env.getCheckpointConfig.setCheckpointTimeout(10000)
env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500)
env.getConfig.setRestartStrategy(RestartStrategies.fixedDelayRestart(4, 1000))
env.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
properties.setProperty("bootstrap.servers", kafkaBrokers)
properties.setProperty("group.id", groupName)
val myConsumer = new FlinkKafkaConsumer[String](topicNames, new SimpleStringSchema(), properties)
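// During local (Windows) testing, start from the latest offsets instead of replaying the whole topic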
if (ifOSIsWin) {
myConsumer.setStartFromLatest()
}
val mapper = new ObjectMapper
val dauStream = env.addSource(myConsumer).filter(_.nonEmpty).map(line => {
val timeStamp = new Date().getTime
Try {
val rootNode = mapper.readTree(line)
val messageNode = mapper.readTree(rootNode.get("message").toString)
val guid = messageNode.get("g").asText
("", guid.hashCode, timeStamp)
} match {
case Success(value) => (value._1, value._2, value._3)
case Failure(exception) => {
println(exception.toString)
("", 0, timeStamp)
}
}
})
.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[(String, Int, Long)] {
var currentMaxTimestamp = 0L
val maxOutOfOrderness = 2000L
var lastEmittedWatermark: Long = Long.MinValue
override def extractTimestamp(element: (String, Int, Long), previousElementTimestamp: Long): Long = {
//timestamp
val timestamp = element._3
if (timestamp > currentMaxTimestamp) {
currentMaxTimestamp = timestamp
}
timestamp
}
// Returns the current watermark
override def getCurrentWatermark: Watermark = {
// Allow 2 seconds of out-of-orderness
val potentialWM = currentMaxTimestamp - maxOutOfOrderness
// Ensure the emitted watermark is monotonically increasing
if (potentialWM >= lastEmittedWatermark) {
lastEmittedWatermark = potentialWM
}
new Watermark(lastEmittedWatermark)
}
})
.windowAll(windowTime)
// triggerInterval: fire a computation every 20 s. Without an evictor, each firing would push the full
// window contents into MyReduceWindowAllFunction's MapState, which is a huge amount of data.
// So evictorTime is set to 20 s, evicting elements older than 20 s:
// only the most recent 20 s of source data is handed to MyReduceWindowAllFunction,
// because anything older has already been cached in the MapState by an earlier firing.
// As long as each firing finishes within the 20 s triggerInterval, the resulting UV/DAU count stays correct.
// If a firing takes longer than 20 s (say 25 s), its 20 s batch may not be fully written into the MapState
// before the next firing starts and the evictor drops that batch, so the computed UV can be lower than the real UV.
.trigger(ContinuousProcessingTimeTrigger.of(Time.seconds(triggerInterval)))
.evictor(TimeEvictor.of(Time.seconds(evictorTime), true))
// .aggregate(new MyAggregateFunction)
.process(new MyReduceWindowAllFunction)
ifOSIsWin match {
case true => dauStream.print()
case _ => dauStream.addSink(new MysqlSinkPool)
}
env.execute(jobName)
}
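// Full-window dedup: each firing writes the elements that survived the evictor into the window's MapState
// (keyed by guid hashCode) and emits the current number of distinct keys as the DAU so far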
class MyReduceWindowAllFunction extends ProcessAllWindowFunction[(String, Int, Long), (String, Long), TimeWindow] {
private var userState: MapState[Int, String] = null
val userStateDesc = new MapStateDescriptor[Int, String]("dau-state", classOf[Int], classOf[String])
override def process(context: Context, elements: Iterable[(String, Int, Long)], out: Collector[(String, Long)]): Unit = {
// Get the MapState scoped to this window (backed by RocksDB)
userState = context.windowState.getMapState(userStateDesc)
val currentProcessTime = DateUtil.getCurrentDateTimeJdk8(true)
// Write every element of the window into the MapState; duplicate guid hashes collapse onto the same key
for (e <- elements) {
val key = e._2
userState.put(key, null)
}
// The number of keys in the MapState is the current DAU
val currentDau = userState.keys().size
out.collect((currentProcessTime, currentDau.toLong))
}
}
}
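As noted in the shortcoming above, the windowAll path always runs at parallelism 1. One possible mitigation is a two-phase version: keyBy a bucket of the guid hash so the dedup state and the heavy MapState traffic are spread across parallel subtasks, and let a single downstream task only combine one small partial count per bucket. This is a minimal sketch, assuming it replaces the existing .windowAll(...)...process(...) chain inside main and reuses the same imports and vals (windowTime, triggerInterval, evictorTime); the bucket count of 64 and the name dauStreamSource for the stream coming out of assignTimestampsAndWatermarks are illustrative, not part of the job above.

// Sketch: two-phase DAU with keyed dedup, so only the cheap final sum runs at parallelism 1
dauStreamSource
  .keyBy(e => math.abs(e._2 % 64)) // partition the dedup work into 64 buckets by guid hash
  .window(windowTime)
  .trigger(ContinuousProcessingTimeTrigger.of(Time.seconds(triggerInterval)))
  .evictor(TimeEvictor.of(Time.seconds(evictorTime), true))
  .process(new ProcessWindowFunction[(String, Int, Long), (Int, Long), Int, TimeWindow] {
    private val bucketStateDesc = new MapStateDescriptor[Int, String]("dau-bucket-state", classOf[Int], classOf[String])
    override def process(bucket: Int, context: Context,
                         elements: Iterable[(String, Int, Long)], out: Collector[(Int, Long)]): Unit = {
      val seen = context.windowState.getMapState(bucketStateDesc)
      for (e <- elements) seen.put(e._2, null)
      // emit the partial distinct count for this bucket
      out.collect((bucket, seen.keys().size.toLong))
    }
  })
  // only one small (bucket, count) record per bucket per firing reaches this operator,
  // so parallelism 1 is no longer a bottleneck here
  .windowAll(windowTime)
  .trigger(ContinuousProcessingTimeTrigger.of(Time.seconds(triggerInterval)))
  .process(new ProcessAllWindowFunction[(Int, Long), (String, Long), TimeWindow] {
    override def process(context: Context, elements: Iterable[(Int, Long)], out: Collector[(String, Long)]): Unit = {
      // per-bucket counts only grow during the day, so the max per bucket is its latest partial count
      val dau = elements.groupBy(_._1).values.map(_.map(_._2).max).sum
      out.collect((DateUtil.getCurrentDateTimeJdk8(true), dau))
    }
  })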