Data file used
Link: https://pan.baidu.com/s/1uCk-IF4wWVfUkuuTAKaD0w
Extraction code: 2hmu
User behavior data is continuously written to Kafka; the program keeps reading from Kafka and, every five minutes,
computes the top 5 items with the most page views (pv) over the last hour.
Input data: UserBehavior
Field:    userId    itemId    categoryId    behavior    timestamp
Meaning:  user id   item id   category id   behavior    event timestamp (seconds)
Example:  lily      1715      1464116       pv          1511658000
Type:     Long      Long      Integer       String      Long
Output data: ItemViewCount
Field:    itemId    count_pv                  windowEnd
Meaning:  item id   total pv count of item    window end timestamp (milliseconds; the input timestamp is in seconds and is multiplied by 1000 in the job)
Example:  1715      17                        1511658000000
Type:     Long      Long                      Long
package com.atguigu.hotitems_analysis.beans;
/**
* Input POJO: one user behavior record read from Kafka.
*/
public class UserBehavior {
public Long userId;
public Long itemId;
public Integer categoryId;
public String behavior;
public Long timestamp;
public UserBehavior() {
}
public UserBehavior(Long userId, Long itemId, Integer categoryId, String behavior, Long timestamp) {
this.userId = userId;
this.itemId = itemId;
this.categoryId = categoryId;
this.behavior = behavior;
this.timestamp = timestamp;
}
public Long getUserId() {
return userId;
}
public void setUserId(Long userId) {
this.userId = userId;
}
public Long getItemId() {
return itemId;
}
public void setItemId(Long itemId) {
this.itemId = itemId;
}
public Integer getCategoryId() {
return categoryId;
}
public void setCategoryId(Integer categoryId) {
this.categoryId = categoryId;
}
public String getBehavior() {
return behavior;
}
public void setBehavior(String behavior) {
this.behavior = behavior;
}
public Long getTimestamp() {
return timestamp;
}
public void setTimestamp(Long timestamp) {
this.timestamp = timestamp;
}
@Override
public String toString() {
return "UserBehavior{" +
"userId=" + userId +
", itemId=" + itemId +
", categoryId=" + categoryId +
", behavior='" + behavior + '\'' +
", timestamp=" + timestamp +
'}';
}
}
package com.atguigu.hotitems_analysis.beans;
/**
* Output POJO: the pv count of one item in one window.
*/
public class ItemViewCount {
public Long itemId;
public Long windowEnd;
public Long count;
public ItemViewCount() {
}
public ItemViewCount(Long itemId, Long windowEnd, Long count) {
this.itemId = itemId;
this.windowEnd = windowEnd;
this.count = count;
}
public Long getItemId() {
return itemId;
}
public void setItemId(Long itemId) {
this.itemId = itemId;
}
public Long getWindowEnd() {
return windowEnd;
}
public void setWindowEnd(Long windowEnd) {
this.windowEnd = windowEnd;
}
public Long getCount() {
return count;
}
public void setCount(Long count) {
this.count = count;
}
@Override
public String toString() {
return "ItemViewCount{" +
"itemId=" + itemId +
", windowEnd=" + windowEnd +
", count=" + count +
'}';
}
}
package com.atguigu.hotitems_analysis;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Properties;
public class KafkaProducerUtil {
public static void main(String[] args) throws Exception{
writeToKafka("hotitems_test");
}
public static void writeToKafka(String topic) throws Exception{
Properties ps = new Properties();
ps.setProperty("bootstrap.servers","192.168.149.131:9092");//集群地址
ps.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");//key序列化方式
ps.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");//value序列化方式
KafkaProducer kafkaProducer = new KafkaProducer<>(ps);//
BufferedReader bufferedReader = new BufferedReader(new FileReader("G:\\SoftwareInstall\\idea\\project\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv"));
String line;
while((line= bufferedReader.readLine()) !=null ){
ProducerRecord<String, String> record = new ProducerRecord<>(topic, line);
kafkaProducer.send(record);
Thread.sleep(2);
}
kafkaProducer.close();
}
}
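To check that the producer above actually wrote the CSV lines into the topic, a throwaway consumer can be run against the same broker. The class below is only a sketch and is not part of the original project; it reuses the broker address and topic name assumed above, and the timestamp-based group.id is an arbitrary choice so that auto.offset.reset=earliest takes effect and the topic is read from the beginning.
package com.atguigu.hotitems_analysis;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;
public class KafkaTopicSmokeTest {
    public static void main(String[] args) {
        Properties ps = new Properties();
        ps.setProperty("bootstrap.servers", "192.168.149.131:9092"); // same broker as the producer
        ps.setProperty("group.id", "smoke_test_" + System.currentTimeMillis()); // fresh group, so earliest applies
        ps.setProperty("auto.offset.reset", "earliest");
        ps.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        ps.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(ps)) {
            consumer.subscribe(Collections.singletonList("hotitems_test"));
            // poll a few times and print whatever comes back
            for (int i = 0; i < 5; i++) {
                ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(2));
                records.forEach(record -> System.out.println(record.value()));
            }
        }
    }
}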
package com.atguigu.hotitems_analysis.Ahotitems;
import com.atguigu.hotitems_analysis.beans.ItemViewCount;
import com.atguigu.hotitems_analysis.beans.UserBehavior;
import org.apache.commons.compress.utils.Lists;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Properties;
/**
* This class reads user behavior records from Kafka and, every five minutes, computes the top 5
* hot items of the last hour.
* Record fields: userId itemId categoryId behavior timestamp
* The more pv (page view) events an item has, the hotter it is.
*
* Approach:
* Step 1: key, window and aggregate to get the pv count of every item in every window.
* First filter out the records with behavior == "pv", then key the stream by the item id (itemId).
* Apply a sliding window of length one hour with a slide of 5 minutes.
* The pv events of each itemId have to be aggregated and the output type differs from the input type,
* so the aggregate() method is used. Its first argument is an incremental aggregate function that keeps
* a running count in an accumulator and finally emits it.
* Because the result has to be reported per window, the window metadata is needed as well, so aggregate()
* gets a second argument, a full window function, which wraps itemId, windowEnd and the pv count
* into an ItemViewCount and emits it.
*
* Step 2: collect the counts of all items of the same window, sort them and emit the top 5.
* The top 5 is computed per window, so the stream is first keyed by windowEnd.
* When has all the data of a window arrived? When the watermark passes the window end time, all of
* its data has arrived and the computation can be triggered.
* Elements that have arrived but have not been processed yet are buffered in a ListState; once everything
* has arrived, a timer triggers the sorting logic and the result is emitted.
* Because timers and state are needed, the ProcessFunction API has to be used.
* Timer:
* For every incoming element, add it to the ListState and register an event-time timer at the element's
* windowEnd timestamp; timers with the same timestamp are the same timer, so there is one per window.
* In onTimer(), read the elements from the ListState and sort them.
* Note:
* After the timer has fired, clear() should be called in onTimer() to release the state.
* clear() should also be called in close().
*/
public class HotItems {
public static void main(String[] args) throws Exception {
//create the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
/**
* Read the data and map it to a POJO; to process by event time, timestamps and watermarks must be assigned first.
* To make Kafka consume the topic from the beginning, the group.id must be brand new and auto.offset.reset must be earliest.
*/
Properties ps = new Properties();
ps.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG,"192.168.149.131:9092");//broker address
ps.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "consumer_group");//consumer group
ps.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");//key deserializer
ps.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");//value deserializer
ps.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,"earliest");//offset strategy
//the consumer's second constructor argument already provides the deserialization schema, so the key/value deserializer properties above are not strictly required
DataStream<String> inputStream=env.addSource(new FlinkKafkaConsumer<String>("hotitems_test",new SimpleStringSchema(),ps));
DataStream<UserBehavior> dataStream=inputStream.map(
line ->{
String [] words=line.split(",");
return new UserBehavior(Long.valueOf(words[0]),Long.valueOf(words[1]),Integer.valueOf(words[2]),words[3],Long.valueOf(words[4]));
})
.assignTimestampsAndWatermarks(
new AscendingTimestampExtractor<UserBehavior>() { //timestamps are ascending
@Override
public long extractAscendingTimestamp(UserBehavior userBehavior) {//extract the event timestamp, seconds -> milliseconds
return userBehavior.getTimestamp()*1000L;
}
});
//key, window and aggregate: pv count of each item per window
DataStream<ItemViewCount> aggStream=dataStream
.filter(data -> "pv".equals(data.getBehavior())) //keep only "pv" events
.keyBy(UserBehavior::getItemId)
.timeWindow(Time.minutes(60),Time.minutes(5)) //sliding window: 1 hour long, updated every 5 minutes
.aggregate(new ItemCountAgg(),new WindowItemCountResult());
//collect the counts of all items of the same window and emit the top 5
DataStream<String> resultDs=aggStream
.keyBy("windowEnd")
.process(new TopNItems(5));
//print and execute
resultDs.print("top 5 hot items of the last hour, updated every five minutes");
env.execute("hot items analysis");
}
//generic params: 1) input type 2) accumulator (aggregation state) type 3) output type
public static class ItemCountAgg implements AggregateFunction<UserBehavior, Long, Long>{
@Override
public Long createAccumulator() {//create the accumulator with its initial value
return 0L;
}
@Override
public Long add(UserBehavior userBehavior,Long accumulator) {//increment the accumulator for every element and return the new value
return accumulator+1;
}
@Override
public Long getResult(Long accumulator) {//the value finally returned for the window
return accumulator;
}
@Override
public Long merge(Long a, Long b) { //merge two accumulators; only needed for merging (e.g. session) windows, not used here
return a+b;
}
}
//generic params: 1) input type, i.e. the output type of ItemCountAgg 2) output type 3) key type (the itemId) 4) window type
public static class WindowItemCountResult implements WindowFunction<Long, ItemViewCount, Long, TimeWindow>{
//the iterable holds the pre-aggregated input values (here exactly one count per key and window)
@Override
public void apply(Long key, TimeWindow window, Iterable<Long> iterable, Collector<ItemViewCount> collector) throws Exception {
//wrap everything into an ItemViewCount object and emit it
collector.collect(new ItemViewCount(key,window.getEnd(),iterable.iterator().next()));
}
}
//generic params: 1) key type (Tuple, because keyBy uses a field name) 2) input type 3) output type
public static class TopNItems extends KeyedProcessFunction<Tuple, ItemViewCount, String>{
private Integer topSize;
private ListState<ItemViewCount> listState; //list state holding all ItemViewCounts of the current window
public TopNItems(Integer topSize) {
this.topSize = topSize;
}
@Override
public void open(Configuration parameters) throws Exception {
listState = getRuntimeContext().getListState(new ListStateDescriptor<>("item-view-count-list",ItemViewCount.class));
}
//how every incoming element is handled
@Override
public void processElement(ItemViewCount value, Context context, Collector<String> collector) throws Exception {
//buffer the element in the list state and register an event-time timer (timers with the same timestamp are the same timer, so there is one per window)
listState.add(value);
context.timerService().registerEventTimeTimer(value.getWindowEnd());//register the timer at the window end time
}
//logic executed when the timer fires
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
//copy the state into an ArrayList and sort it
ArrayList<ItemViewCount> itemViewCounts = Lists.newArrayList(listState.get().iterator());
itemViewCounts.sort(new Comparator<ItemViewCount>() {
@Override
public int compare(ItemViewCount o1, ItemViewCount o2) {//descending order by count
//compare the Long values; "==" on Long objects would only compare references
return o2.getCount().compareTo(o1.getCount());
}
});
//build the output string
StringBuilder resultBuilder=new StringBuilder();
resultBuilder.append("===================\n");
resultBuilder.append("window end time: ").append(new Timestamp(timestamp)).append("\n"); //the windowEnd of this group
//iterate and append the top N entries
for (int i = 0; i < Math.min(topSize,itemViewCounts.size()); i++) {
ItemViewCount currentItemViewCount = itemViewCounts.get(i);
resultBuilder.append("Number").append(i+1).append(":")
.append("商品ID:").append(currentItemViewCount.getItemId())
.append("浏览量:").append(currentItemViewCount.getCount())
.append("\n");
}
resultBuilder.append("===================\n\n");
Thread.sleep(1000L);//slow down the output for readability
out.collect(resultBuilder.toString());
listState.clear();//clear the state
}
@Override
public void close() throws Exception {
listState.clear();//clear the state
}
}
}
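For a quick check of the whole DataStream pipeline without Kafka, the same operators can be driven from a small bounded stream. The class below is only a sketch with made-up records (numeric user ids, two items); it reuses ItemCountAgg, WindowItemCountResult and TopNItems from HotItems above. When a bounded source is exhausted, Flink emits a final watermark, which fires the sliding windows and the registered event-time timers, so the top-N output appears at the end of the run.
package com.atguigu.hotitems_analysis.Ahotitems;
import com.atguigu.hotitems_analysis.beans.UserBehavior;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;
public class HotItemsLocalTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        env.fromElements(
                new UserBehavior(1L, 1715L, 1464116, "pv", 1511658000L),
                new UserBehavior(2L, 1715L, 1464116, "pv", 1511658060L),
                new UserBehavior(3L, 2244L, 1464116, "pv", 1511658120L),
                new UserBehavior(4L, 1715L, 1464116, "buy", 1511658180L)) // filtered out: not a "pv" event
                .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<UserBehavior>() {
                    @Override
                    public long extractAscendingTimestamp(UserBehavior userBehavior) {
                        return userBehavior.getTimestamp() * 1000L; // seconds -> milliseconds
                    }
                })
                .filter(data -> "pv".equals(data.getBehavior()))
                .keyBy(UserBehavior::getItemId)
                .timeWindow(Time.minutes(60), Time.minutes(5))
                .aggregate(new HotItems.ItemCountAgg(), new HotItems.WindowItemCountResult())
                .keyBy("windowEnd")
                .process(new HotItems.TopNItems(5))
                .print("local test");
        env.execute("hot items local test");
    }
}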
package com.atguigu.hotitems_analysis.Ahotitems;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
/**
* This class reads the user behavior data from Kafka and, every five minutes, computes the top 5
* hot items of the last hour with the Table API & Flink SQL.
* The planner (Blink here) only has to be chosen explicitly when the Table API / Flink SQL is used.
*/
public class HotItemsFlinkSQL {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
EnvironmentSettings settings = EnvironmentSettings.newInstance() //the planner only has to be chosen when using the Table API / Flink SQL
.useBlinkPlanner()
.inStreamingMode()
.build();
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
//create the Kafka source table
tableEnv.sqlUpdate(
"create table inputTable(" +
" userId BIGINT, " +
" itemId BIGINT, " +
" categoryId INT, " +
" behavior STRING," +
" ts BIGINT," +
" rt AS TO_TIMESTAMP(FROM_UNIXTIME(ts))," + //基于ts新建一个事件时间字段
" WATERMARK FOR rt AS rt - INTERVAL '1' SECOND " +
") WITH ( " +
" 'connector.type' = 'kafka'," +
" 'connector.version' = 'universal'," +
" 'connector.topic' = 'hotitems_test'," +
" 'connector.properties.group.id' = 'flink_hot1iqm11saq12311'," +
" 'connector.startup-mode' = 'earliest-offset'," + //消费模式
" 'connector.properties.zookeeper.connect' = '192.168.149.131:2181'," +
" 'connector.properties.bootstrap.servers' = '192.168.149.131:9092'," +
" 'format.type' = 'csv' )"); //如果是json就写json
// tableEnv.registerFunction("long2Ts",new LongToTimestamp());//how a UDF would be registered (not needed here)
//Flink SQL processing: the Table API has no built-in function for computing a top N inside a window, so Flink SQL (ROW_NUMBER() OVER ...) is used instead.
//Registering a Table directly as a view goes through a TableEnvironment method rather than a StreamTableEnvironment one, which can cause problems;
//as a workaround, the Table can first be converted to a DataStream and the DataStream then registered as a view.
// tableEnv.createTemporaryView("sensor",tableEnv.toAppendStream(table1, Row.class),"itemId,windowEnd,cnt");
Table resultTable = tableEnv.sqlQuery(" select * from " +
"(select *,row_number() over(partition by windowEnd order by cnt desc ) as top_rank from " +
" ( " +
" select itemId,hop_end(rt,interval '5' minute,interval '1' hour) as windowEnd,count(itemId) cnt " +
" from inputTable where behavior='pv'" +
" group by itemId,hop(rt,interval '5' minute,interval '1' hour)" +
" )t1 " +
")t where top_rank<=5" );
//output: the top-N query produces an updating (non-append-only) result, so toRetractStream must be used
tableEnv.toRetractStream(resultTable,Row.class).print("table api & Flink sql demo");
//execute
env.execute("hot items analysis");
}
}
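toRetractStream wraps every result row in a Tuple2<Boolean, Row>: the flag is true for an insertion and false for the retraction of a row that was emitted earlier and has since been replaced in the ranking. If only the insertions are of interest, they can be filtered on that flag. A small sketch of an alternative to the print line inside main above (same tableEnv and resultTable assumed):
// keep only the "add" messages from the retract stream; f1 is the Row (itemId, windowEnd, cnt, top_rank)
tableEnv.toRetractStream(resultTable, Row.class)
        .filter(change -> change.f0)
        .print("top 5 (insert messages only)");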
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu</groupId>
    <artifactId>UserBehaviorAnalysis</artifactId>
    <packaging>pom</packaging>
    <version>1.0-SNAPSHOT</version>

    <modules>
        <module>HotItemsAnalysis</module>
        <module>BasicKnowledge</module>
    </modules>

    <properties>
        <flink.version>1.10.1</flink.version>
        <scala.binary.version>2.11</scala.binary.version>
        <kafka.version>2.2.0</kafka.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_${scala.binary.version}</artifactId>
            <version>${kafka.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-csv</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-redis_${scala.binary.version}</artifactId>
            <version>1.1.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-elasticsearch6_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.44</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>