字段名 |
数据类型 |
说明 |
userId |
Long |
加密后的用户ID |
itemId |
Long |
加密后的商品ID |
categoryId |
Int |
加密后的商品所属类别ID |
behavior |
String |
用户行为类型,包括('pv', 'buy', 'cart', 'fav')
timestamp |
Long |
行为发生的时间戳,单位秒 |
82170,3588374,2465336,pv,1511658004
587599,2067643,4818107,cart,1511658004
367451,15775,4756105,pv,1511658004
428316,2478780,4284875,pv,1511658004
284910,3680091,3829657,pv,1511658004
345119,737662,4357323,pv,1511658004
551442,1762997,1879194,pv,1511658004
550384,3908776,1029459,pv,1511658004
677500,4534693,2640118,pv,1511658004
398626,2791489,1467750,pv,1511658004
118053,3545571,2433095,pv,1511658005
457401,4063698,4801426,pv,1511658005
45105,3234847,3141941,fav,1511658005
604760,2661651,3738615,pv,1511658005
905383,2064903,2939262,cart,1511658005
740788,3657484,4936889,pv,1511658005
456838,1242724,4756105,fav,1511658005
585217,215764,2640118,pv,1511658006
658185,4025021,4048584,fav,1511658006
210431,2035568,2328673,pv,1511658006
数据从kafka到Flink
import java.sql.Timestamp
import java.util.Properties
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.api.java.tuple.{Tuple, Tuple1}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.util.Collector
import scala.collection.mutable.ListBuffer
/**
* @author fczheng
* 2019-11-08 15:39
*/
object HotItems {
  def main(args: Array[String]): Unit = {
    // Event-time processing with parallelism 1 so the printed rankings appear in order.
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // During development the data was read from a local file instead of Kafka:
    // val data = env.readTextFile("E:\\develop\\workspace\\myidea\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv")

    // Kafka consumer configuration for the "hotitems" topic.
    val kafkaProps = new Properties()
    kafkaProps.setProperty("bootstrap.servers", "hadoop102:9092,hadoop103:9092")
    kafkaProps.setProperty("group.id", "consumer-group")
    kafkaProps.setProperty("key.deserializer",
      "org.apache.kafka.common.serialization.StringDeserializer")
    kafkaProps.setProperty("value.deserializer",
      "org.apache.kafka.common.serialization.StringDeserializer")
    kafkaProps.setProperty("auto.offset.reset", "latest")

    val rawStream = env.addSource(
      new FlinkKafkaConsumer[String]("hotitems", new SimpleStringSchema(), kafkaProps))

    // Parse CSV lines, keep only page-view ("pv") events, count clicks per item
    // in a 60-minute window sliding every 5 minutes, then rank the top 3 items
    // per window end.
    val topItems = rawStream
      .map { line =>
        val fields = line.split(",")
        UserBehavior(fields(0).toLong, fields(1).toLong, fields(2).toInt, fields(3), fields(4).toLong)
      }
      // Source timestamps are in seconds; Flink event time expects milliseconds.
      .assignAscendingTimestamps(_.timeStamp * 1000)
      .filter(_.behavior == "pv")
      .keyBy(_.itemId)
      .timeWindow(Time.minutes(60), Time.minutes(5))
      .aggregate(new CountAgg(), new WindowResultFunction())
      .keyBy(_.windowEnd)
      .process(new TopNHostItems(3))

    topItems.print()
    env.execute()
  }
}
/**
 * Emits the top-N items by click count for each window.
 *
 * Keyed by windowEnd: every ItemViewCount of one window is buffered in list
 * state; an event-time timer at windowEnd + 1 fires once the watermark has
 * passed the window end, i.e. when the whole group has arrived, and the
 * buffered counts are sorted and formatted into one result string.
 *
 * @param topSize number of top-ranked items to include per window
 */
class TopNHostItems(topSize: Int)
  extends KeyedProcessFunction[Long, ItemViewCount, String] {

  // List state holding every ItemViewCount seen for the current windowEnd key.
  private var itemState: ListState[ItemViewCount] = _

  override def open(parameters: Configuration): Unit = {
    super.open(parameters)
    // Describe the state by name and element type, then obtain the handle.
    val itemsStateDesc =
      new ListStateDescriptor[ItemViewCount]("itemState-state", classOf[ItemViewCount])
    itemState = getRuntimeContext.getListState(itemsStateDesc)
  }

  override def processElement(input: ItemViewCount,
                              context: KeyedProcessFunction[Long, ItemViewCount, String]#Context,
                              collector: Collector[String]): Unit = {
    // Buffer each element; all elements of one window share the same windowEnd key.
    itemState.add(input)
    // Registering the same timer repeatedly is a no-op; it fires once the
    // watermark passes windowEnd, at which point the window's data is complete.
    context.timerService().registerEventTimeTimer(input.windowEnd + 1)
  }

  override def onTimer(timestamp: Long,
                       ctx: KeyedProcessFunction[Long, ItemViewCount, String]#OnTimerContext,
                       out: Collector[String]): Unit = {
    // All data for this window has arrived; drain the state into a local list.
    // JavaConverters (explicit .asScala) replaces the deprecated implicit
    // scala.collection.JavaConversions used previously.
    import scala.collection.JavaConverters._
    val allItems: List[ItemViewCount] = itemState.get().asScala.toList
    // Clear the state early to release the space.
    itemState.clear()

    // Sort by click count, descending, and keep only the top N.
    val sortedItems: List[ItemViewCount] =
      allItems.sortBy(_.count)(Ordering.Long.reverse).take(topSize)

    // Format the ranking for printing, e.g. "No1: 商品ID=12224 浏览量=2413".
    val result: StringBuilder = new StringBuilder
    result.append("====================================\n")
    // timestamp is windowEnd + 1, so subtract 1 to show the actual window end.
    result.append("时间: ").append(new Timestamp(timestamp - 1)).append("\n")
    for (i <- sortedItems.indices) {
      val currentItem: ItemViewCount = sortedItems(i)
      result.append("No").append(i + 1).append(":")
        .append(" 商品ID=").append(currentItem.itemId)
        .append(" 浏览量=").append(currentItem.count).append("\n")
    }
    result.append("====================================\n\n")
    // Throttle output so the rolling results are readable when demoing.
    // NOTE(review): sleeping in onTimer blocks the task thread and stalls the
    // pipeline — acceptable only in a demo, never in production.
    Thread.sleep(1000)
    out.collect(result.toString())
  }
}
/**
 * Incremental pre-aggregator: counts records, one click per UserBehavior.
 * The accumulator and the result are both a plain Long counter.
 */
class CountAgg extends AggregateFunction[UserBehavior, Long, Long] {

  // Start every window's counter at zero.
  override def createAccumulator(): Long = 0L

  // Each incoming record contributes exactly one click, regardless of content.
  override def add(in: UserBehavior, acc: Long): Long = 1L + acc

  // The counter itself is the window result.
  override def getResult(acc: Long): Long = acc

  // Merging two partial counts is plain addition.
  override def merge(acc1: Long, acc2: Long): Long = acc2 + acc1
}
/**
 * Wraps the pre-aggregated click count into an ItemViewCount, attaching the
 * item id (the stream key) and the end timestamp of the window.
 *
 * Removed the dead commented-out Tuple1 cast left over from the earlier
 * string-based keyBy("itemId") variant, plus the redundant local aliases.
 */
class WindowResultFunction
  extends WindowFunction[Long, ItemViewCount, Long, TimeWindow] {

  override def apply(key: Long,
                     window: TimeWindow,
                     aggregateResult: Iterable[Long],
                     collector: Collector[ItemViewCount]): Unit = {
    // CountAgg emits exactly one value per window, so head is safe here.
    collector.collect(ItemViewCount(key, window.getEnd, aggregateResult.head))
  }
}
// Case class for one input record parsed from the CSV source:
// userId/itemId/categoryId are anonymized ids, behavior is one of
// pv/buy/cart/fav, and timeStamp is in seconds (scaled to ms before use
// as the event-time timestamp).
case class UserBehavior(userId: Long,
itemId: Long,
categoryId: Int,
behavior: String,
timeStamp: Long)
// Intermediate result: click count of one item within one window,
// identified by the window's end timestamp (ms).
case class ItemViewCount(itemId: Long,
windowEnd: Long,
count: Long)
====================================
时间: 2017-11-26 10:20:00.0
No1: 商品ID=2218996 浏览量=1
No2: 商品ID=1994111 浏览量=1
No3: 商品ID=1469184 浏览量=1
====================================
====================================
时间: 2017-11-26 10:25:00.0
No1: 商品ID=2338453 浏览量=5
No2: 商品ID=3784551 浏览量=4
No3: 商品ID=3919770 浏览量=3
====================================
====================================
时间: 2017-11-26 10:30:00.0
No1: 商品ID=2338453 浏览量=7
No2: 商品ID=3034696 浏览量=6
No3: 商品ID=3784551 浏览量=5
====================================
====================================
时间: 2017-11-26 10:35:00.0
No1: 商品ID=2338453 浏览量=12
No2: 商品ID=3034696 浏览量=7
No3: 商品ID=291932 浏览量=6
====================================
====================================
时间: 2017-11-26 10:40:00.0
No1: 商品ID=2338453 浏览量=13
No2: 商品ID=291932 浏览量=9
No3: 商品ID=279675 浏览量=8
====================================
====================================
时间: 2017-11-26 10:45:00.0
No1: 商品ID=2338453 浏览量=14
No2: 商品ID=279675 浏览量=11
No3: 商品ID=291932 浏览量=11
====================================
1.7.2
2.11
2.2.0
org.apache.flink
flink-scala_${scala.binary.version}
${flink.version}
org.apache.flink
flink-streaming-scala_${scala.binary.version}
${flink.version}
org.apache.kafka
kafka_${scala.binary.version}
${kafka.version}
org.apache.flink
flink-connector-kafka_${scala.binary.version}
${flink.version}
net.alchim31.maven
scala-maven-plugin
3.4.6
testCompile
org.apache.maven.plugins
maven-assembly-plugin
3.0.0
jar-with-dependencies
make-assembly
package
single