第一步:将dataStream转换成windowedStream
//kafka中的数据示例
452437,3255022,5099474,pv,1511658000
813974,1332724,2520771,buy,1511658000
524395,3887779,2366905,pv,1511658000
470572,3760258,1299190,pv,1511658001
543789,3110556,4558987,cart,1511658001
354759,2191348,4756105,pv,1511658001
382009,2123538,4801426,pv,1511658001
677046,1598945,4145813,pv,1511658001
946161,3021357,1506018,pv,1511658001
// Read raw comma-separated records from the Kafka topic "hotitems" and parse
// each one into a UserBehavior event.
val inputStream = env
  .addSource(new FlinkKafkaConsumer[String]("hotitems", new SimpleStringSchema(), properties))
  .map { line =>
    // Field order: userId, itemId, categoryId, behavior, timestamp
    val fields = line.split(",")
    UserBehavior(
      fields(0).toLong,
      fields(1).toLong,
      fields(2).toInt,
      fields(3),
      fields(4).toLong
    )
  }
  // The record timestamp is scaled by 1000 before being used as the event time
  // (presumably seconds -> milliseconds, judging by the sample data; confirm).
  .assignAscendingTimestamps(event => event.timestamp * 1000L)
// Windowed aggregation: count "pv" events per item over a sliding time window.
val aggStream: DataStream[ItemViewCount] = {
  // Keep only page-view events.
  val pageViews = inputStream.filter(event => event.behavior == "pv")
  // Partition the stream by item so each item is counted independently.
  val viewsByItem = pageViews.keyBy(event => event.itemId)
  // Window of size 1 hour, evaluated every 5 minutes.
  val windowedViews = viewsByItem.timeWindow(Time.hours(1), Time.minutes(5))
  // CountAgg counts incrementally; WindowCountResult wraps the count with the
  // item key and the window end into an ItemViewCount.
  windowedViews.aggregate(new CountAgg(), new WindowCountResult())
}
/** One user-behavior record parsed from a comma-separated input line.
  *
  * @param userId     identifier of the acting user
  * @param itemId     identifier of the item acted upon
  * @param categoryId identifier of the item's category
  * @param behavior   behavior label as it appears in the input, e.g. "pv", "buy", "cart"
  * @param timestamp  event time as carried in the input record
  */
case class UserBehavior(
  userId: Long,
  itemId: Long,
  categoryId: Int,
  behavior: String,
  timestamp: Long
)
第二步:自定义聚合函数
/** Incremental pre-aggregation function that counts events.
  *
  * The accumulator is a running `Long` count: it starts at zero, grows by one
  * for every incoming [[UserBehavior]], and is emitted unchanged as the result.
  */
class CountAgg extends AggregateFunction[UserBehavior, Long, Long] {

  /** Initial accumulator value: nothing counted yet. */
  override def createAccumulator(): Long = 0L

  /** Counts one more event; the content of `value` is not inspected. */
  override def add(value: UserBehavior, accumulator: Long): Long = {
    val incremented = accumulator + 1
    incremented
  }

  /** Combines two partial counts by summing them. */
  override def merge(a: Long, b: Long): Long = {
    val combined = a + b
    combined
  }

  /** The final result is the accumulated count itself. */
  override def getResult(accumulator: Long): Long = accumulator
}
第三步:自定义窗口函数
/** Window function that turns a pre-aggregated count into an [[ItemViewCount]].
  *
  * Type parameters of `WindowFunction`, in order:
  *  - `Long`          – input element type (the count passed in via `input`)
  *  - `ItemViewCount` – output element type
  *  - `Long`          – key type (used here as the item id of the result)
  *  - `TimeWindow`    – window type
  */
class WindowCountResult extends WindowFunction[Long, ItemViewCount, Long, TimeWindow] {

  /** Emits one [[ItemViewCount]] for the given key and window.
    *
    * @param key    key of the window; stored as the result's item id
    * @param window the window being evaluated; its end timestamp is stored in the result
    * @param input  counts for this window; only the first element is read
    *               (presumably exactly one element when used after a pre-aggregator — confirm)
    * @param out    collector that receives the emitted record
    */
  override def apply(key: Long, window: TimeWindow, input: Iterable[Long], out: Collector[ItemViewCount]): Unit = {
    val windowEnd = window.getEnd
    val viewCount = input.iterator.next()
    val record = ItemViewCount(key, windowEnd, viewCount)
    out.collect(record)
  }
}
/** Number of views counted for one item in one window.
  *
  * @param itemId    identifier of the item
  * @param windowEnd end timestamp of the window the count belongs to
  * @param count     number of views counted
  */
case class ItemViewCount(
  itemId: Long,
  windowEnd: Long,
  count: Long
)