为了统计每个窗口下最热门的商品,我们需要再次按窗口进行分组,这里根据ItemViewCount中的windowEnd进行keyBy()操作。然后使用ProcessFunction实现一个自定义的TopN函数TopNHotItems来计算点击量排名前3名的商品,并将排名结果格式化成字符串,便于后续输出。(下面的片段以N=3为例;后文的完整代码中传入的N为5。)
.keyBy("windowEnd")
.process(new TopNHotItems(3))
ProcessFunction是Flink提供的一个low-level API,用于实现更高级的功能(本例代码中使用的是它面向KeyedStream的版本KeyedProcessFunction)。它主要提供了定时器timer的功能(支持EventTime或ProcessingTime)。本案例中我们将利用timer来判断何时收齐了某个window下所有商品的点击量数据。由于Watermark的进度是全局的,在processElement方法中,每当收到一条数据ItemViewCount,我们就注册一个windowEnd+1的定时器(Flink框架会自动忽略同一时间的重复注册)。windowEnd+1的定时器被触发时,意味着收到了windowEnd+1的Watermark,即收齐了该windowEnd下的所有商品窗口统计值。我们在onTimer()中将收集到的所有商品及其点击量进行排序,选出TopN,并将排名信息格式化成字符串后输出。
这里我们还使用了ListState[ItemViewCount]来保存收到的每一条ItemViewCount数据;定时器触发并取出全部数据后,会清空该状态以释放空间。完整代码如下:
package analysis
import java.sql.Timestamp
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.api.java.tuple.{Tuple, Tuple1}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import scala.collection.mutable.ListBuffer
/**
* @author https://blog.csdn.net/qq_38704184
* @package analysis
* @date 2019/11/11 17:45
* @version 1.0
*/
/**
 * Input record: one user-behavior event, built from one comma-separated line of the input file.
 *
 * @param userId     first CSV column, parsed as Long
 * @param itemId     second CSV column, parsed as Long; the job keys the stream by this field
 * @param categoryId third CSV column, parsed as Int
 * @param behavior   fourth CSV column; the job keeps only records where this equals "pv"
 * @param timestamp  fifth CSV column; the job multiplies it by 1000 to obtain the event-time
 *                   timestamp (presumably epoch seconds -- confirm against the data set)
 */
case class UserBehavior(userId: Long, itemId: Long, categoryId: Int, behavior: String, timestamp: Long)
/**
 * Output record of the windowed aggregation: the count of one item in one window.
 *
 * @param itemId    item the count belongs to
 * @param windowEnd end timestamp of the window (filled from `TimeWindow.getEnd`); the job keys
 *                  the aggregated stream by this field before ranking
 * @param count     number of records counted for the item in that window
 */
case class ItemViewCount(itemId: Long, windowEnd: Long, count: Long)
/**
 * Streaming job that prints, for every sliding window, a ranking of the most-viewed items.
 *
 * Pipeline: read text lines -> parse each line into a [[UserBehavior]] -> assign event-time
 * timestamps -> keep only "pv" records -> count per item over a 1-hour window sliding every
 * minute -> key by window end and rank with [[TopNHotItems]] -> print.
 */
object HotItems {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultInputPath: String = "E:\\bigdata\\037_Flink项目\\037_Flink项目\\UserBehavior.csv"

  /** Number of items listed in each window's ranking. */
  private val TopSize: Int = 5

  /**
   * Builds and runs the job.
   *
   * @param args optional; when present, `args(0)` is used as the input file path instead of
   *             the built-in default
   */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Fall back to the original hard-coded location so running without arguments is unchanged.
    val inputPath: String = args.headOption.getOrElse(DefaultInputPath)

    env.readTextFile(inputPath)
      .map(line => {
        val fields: Array[String] = line.split(",")
        UserBehavior(fields(0).toLong, fields(1).toLong, fields(2).toInt, fields(3), fields(4).toLong)
      })
      // The record timestamp is scaled by 1000 to get the millisecond value used as event time.
      .assignAscendingTimestamps(_.timestamp * 1000)
      .filter(_.behavior == "pv")
      // String-based keyBy yields a Tuple key; WindowResultFunction relies on that key shape.
      .keyBy("itemId")
      .timeWindow(Time.hours(1), Time.minutes(1))
      .aggregate(new CountAGG(), new WindowResultFunction())
      .keyBy("windowEnd")
      .process(new TopNHotItems(TopSize))
      .print()

    env.execute("Hot Items Job")
  }
}
/**
 * Incremental aggregate that counts the records falling into a window.
 * The accumulator is the running count; the content of each record is not inspected.
 */
class CountAGG extends AggregateFunction[UserBehavior, Long, Long] {

  /** A fresh accumulator starts at zero. */
  override def createAccumulator(): Long = {
    0L
  }

  /** Every incoming record raises the running count by one. */
  override def add(value: UserBehavior, accumulator: Long): Long = {
    val incremented: Long = accumulator + 1
    incremented
  }

  /** The final result is the accumulated count itself. */
  override def getResult(accumulator: Long): Long = {
    accumulator
  }

  /** Two partial counts combine by addition. */
  override def merge(a: Long, b: Long): Long = {
    val combined: Long = a + b
    combined
  }
}
/**
 * Window function that wraps the pre-aggregated count of a window into an [[ItemViewCount]],
 * attaching the item id taken from the key and the window's end timestamp.
 */
class WindowResultFunction extends WindowFunction[Long, ItemViewCount, Tuple, TimeWindow] {

  /**
   * @param key    window key; cast to a one-field tuple whose field is the item id
   * @param window window being emitted; its end timestamp goes into the result
   * @param input  aggregated values for the window; only the first element is read
   * @param out    collector receiving the single resulting record
   */
  override def apply(key: Tuple,
                     window: TimeWindow,
                     input: Iterable[Long],
                     out: Collector[ItemViewCount]): Unit = {
    val keyedItem: Tuple1[Long] = key.asInstanceOf[Tuple1[Long]]
    val viewCount: Long = input.iterator.next()
    val record = ItemViewCount(keyedItem.f0, window.getEnd, viewCount)
    out.collect(record)
  }
}
/**
 * Custom process function that emits, for each window end, a formatted ranking of the items
 * with the highest view count.
 *
 * Every incoming [[ItemViewCount]] is appended to keyed list state and an event-time timer is
 * registered at `windowEnd + 1`. When that timer fires, the buffered counts are read, the state
 * is cleared, the counts are sorted in descending order and the first `topSize` are emitted as
 * one string.
 *
 * @param topSize       maximum number of items listed in each ranking
 * @param outputDelayMs pause in milliseconds before each ranking is emitted, used to throttle
 *                      the output rate; defaults to 1000 (the previously hard-coded value),
 *                      a value of 0 or less disables the pause
 */
class TopNHotItems(topSize: Int, outputDelayMs: Long = 1000L) extends KeyedProcessFunction[Tuple, ItemViewCount, String] {

  // Keyed list state buffering the counts received for the current key.
  private var itemState: ListState[ItemViewCount] = _

  /** Registers the list state under the name "itemState". */
  override def open(parameters: Configuration): Unit = {
    super.open(parameters)
    // Declare the state's name and element type.
    val itemStateDesc = new ListStateDescriptor[ItemViewCount]("itemState", classOf[ItemViewCount])
    itemState = getRuntimeContext.getListState(itemStateDesc)
  }

  /** Buffers the record and registers the timer that triggers the ranking. */
  override def processElement(value: ItemViewCount,
                              ctx: KeyedProcessFunction[Tuple, ItemViewCount, String]#Context,
                              out: Collector[String]): Unit = {
    itemState.add(value)
    // Timer at windowEnd + 1: its firing signals that the counts for this window are complete.
    ctx.timerService().registerEventTimeTimer(value.windowEnd + 1)
  }

  /**
   * Timer callback: reads all buffered counts, clears the state, ranks the counts and emits the
   * formatted top-N string.
   *
   * @param timestamp firing time of the timer, i.e. `windowEnd + 1`
   */
  override def onTimer(timestamp: Long,
                       ctx: KeyedProcessFunction[Tuple, ItemViewCount, String]#OnTimerContext,
                       out: Collector[String]): Unit = {
    super.onTimer(timestamp, ctx, out)

    // Explicit converters instead of the deprecated implicit JavaConversions.
    import scala.collection.JavaConverters._

    // Option(...) guards against a null iterable being returned for empty state.
    val allItems: List[ItemViewCount] =
      Option(itemState.get()).map(_.asScala.toList).getOrElse(Nil)

    // Clear the state to free space; the data has been copied out above.
    itemState.clear()

    // Sort by count, highest first, and keep the first topSize entries.
    val sortedItems: List[ItemViewCount] =
      allItems.sortBy(_.count)(Ordering.Long.reverse).take(topSize)

    // The timer fires at windowEnd + 1, so subtract 1 to display the window end itself.
    val ranking: String = formatRanking(timestamp - 1, sortedItems)

    // Throttle the output rate. NOTE(review): this blocks the calling thread for the whole pause.
    if (outputDelayMs > 0) {
      Thread.sleep(outputDelayMs)
    }
    out.collect(ranking)
  }

  /** Renders one ranking block: a header line with the window end, then one line per item. */
  private def formatRanking(windowEnd: Long, ranked: Seq[ItemViewCount]): String = {
    val result = new StringBuilder()
    result.append("====================================\n")
    result.append("时间:")
    result.append(new Timestamp(windowEnd)).append("\n")
    ranked.zipWithIndex.foreach { case (currentItem, idx) =>
      // Line format, e.g. No1: 商品ID=12224 浏览量=2413
      result.append("No").append(idx + 1).append(":")
        .append(" 商品ID=").append(currentItem.itemId)
        .append(" 浏览量=").append(currentItem.count).append("\n")
    }
    result.append("====================================\n\n")
    result.toString
  }
}