Flink Process Functions
- 1. Preparation
- 1.1 The Event class
- 1.2 The EventWithWatermarkSource class
- 2. Types of process functions
- 2.1 ProcessFunction
- 2.2 KeyedProcessFunction
- 2.3 ProcessWindowFunction
- 2.4 ProcessAllWindowFunction
- 2.5 CoProcessFunction
- 2.6 ProcessJoinFunction
- 2.7 BroadcastProcessFunction
- 2.8 KeyedBroadcastProcessFunction
- 3. Case study: Top-N hot URLs
- 3.1 Requirement
- 3.2 Implementation
- 3.2.1 The Event class
- 3.2.2 The UrlViewCount class
- 3.2.3 The TopNExample class
- 3.3 Improving the code
- 3.3.1 Shortcomings of TopNExample
- 3.3.2 Improved code
1. Preparation
1.1 The Event class
package com.hpsk.flink.beans;

import java.sql.Timestamp;

// Simple POJO describing one click event: who visited which url and when.
public class Event {
    public String user;
    public String url;
    public Long timestamp;

    public Event() {
    }

    public Event(String user, String url, Long timestamp) {
        this.user = user;
        this.url = url;
        this.timestamp = timestamp;
    }

    @Override
    public String toString() {
        return "Event{" +
                "user='" + user + '\'' +
                ", url='" + url + '\'' +
                ", timestamp=" + new Timestamp(timestamp) +
                '}';
    }
}
1.2 The EventWithWatermarkSource class
package com.hpsk.flink.source;

import com.hpsk.flink.beans.Event;
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
import org.apache.flink.streaming.api.watermark.Watermark;

import java.util.Calendar;
import java.util.Random;

// Test source that emits one random Event per second and generates
// event-time timestamps and watermarks itself.
public class EventWithWatermarkSource implements ParallelSourceFunction<Event> {
    // volatile because cancel() is called from a different thread than run().
    private volatile boolean isRunning = true;
    private final String[] users = new String[]{"Alice", "Bob", "Mary", "Tom"};
    private final String[] urls = new String[]{"./home", "./cart", "./prod?id=1", "./prod?id=10"};

    @Override
    public void run(SourceContext<Event> ctx) throws Exception {
        Random random = new Random();
        while (isRunning) {
            String user = users[random.nextInt(users.length)];
            String url = urls[random.nextInt(urls.length)];
            long currTs = Calendar.getInstance().getTimeInMillis();
            Event event = new Event(user, url, currTs);
            // Attach the event-time timestamp to the record...
            ctx.collectWithTimestamp(event, currTs);
            // ...and emit a watermark trailing it by 1 ms.
            ctx.emitWatermark(new Watermark(event.timestamp - 1L));
            Thread.sleep(1000L);
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }
}
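The source above produces timestamps and watermarks itself via collectWithTimestamp and emitWatermark. The same effect can be had outside the source with the WatermarkStrategy API, which the later examples also use; a minimal sketch of that alternative for this Event stream:
// Alternative (sketch): let a WatermarkStrategy assign timestamps/watermarks
// instead of calling collectWithTimestamp/emitWatermark inside run().
// Requires org.apache.flink.api.common.eventtime.{WatermarkStrategy, SerializableTimestampAssigner}.
DataStream<Event> withWatermarks = env
        .addSource(new EventWithWatermarkSource())
        .assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                    @Override
                    public long extractTimestamp(Event element, long recordTimestamp) {
                        return element.timestamp;
                    }
                }));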
2. Types of process functions
2.1 ProcessFunction
2.1.1 Function signature
public abstract class ProcessFunction<I, O> extends AbstractRichFunction {
    public abstract void processElement(I value, Context ctx, Collector<O> out) throws Exception;
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<O> out) throws Exception {}
    public abstract class Context { /* ... */ }
}
2.1.2 Basic usage
package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

import java.sql.Timestamp;

public class ProcessFunctionDS {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        DataStream<Event> inputDS = env.addSource(new EventWithWatermarkSource());
        inputDS.print("input ");
        // Anonymous subclass so the generic type survives erasure.
        OutputTag<Event> outputTag = new OutputTag<Event>("event"){};
        SingleOutputStreamOperator<Event> result = inputDS.process(new ProcessFunction<Event, Event>() {
            @Override
            public void processElement(Event value, Context ctx, Collector<Event> out) throws Exception {
                long currTs = ctx.timerService().currentProcessingTime();
                if (!value.user.equals("Bob")) {
                    out.collect(value);           // main output
                } else {
                    ctx.output(outputTag, value); // Bob's events go to the side output
                }
                System.out.println(new Timestamp(currTs) + " element arrived: " + value);
            }
        });
        result.print("output ");
        result.getSideOutput(outputTag).print("outputTag ");
        env.execute();
    }
}
2.2 KeyedProcessFunction
2.2.1 Function signature
public abstract class KeyedProcessFunction<K, I, O> extends AbstractRichFunction {
    public abstract void processElement(I value, Context ctx, Collector<O> out) throws Exception;
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<O> out) throws Exception {}
    public abstract class Context { /* ... */ }
}
2.2.2 Basic usage
package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;

public class KeyedProcessFunctionDS {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        DataStream<Event> inputDS = env.addSource(new EventWithWatermarkSource());
        inputDS.print("input ");
        SingleOutputStreamOperator<String> result = inputDS
                .keyBy(t -> t.user)
                .process(new KeyedProcessFunction<String, Event, String>() {
                    @Override
                    public void processElement(Event value, Context ctx, Collector<String> out) throws Exception {
                        out.collect("Element arrived, timestamp: " + new Timestamp(ctx.timestamp()));
                        out.collect("Element arrived, watermark: " + new Timestamp(ctx.timerService().currentWatermark()) + "\n ------- separator -------");
                        // Event-time timer 10 s after this element's timestamp.
                        ctx.timerService().registerEventTimeTimer(ctx.timestamp() + 10 * 1000L);
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                        out.collect("Timer fired at: " + new Timestamp(timestamp));
                    }
                });
        result.print("output ");
        env.execute();
    }
}
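TimerService is not limited to registering event-time timers: processing-time timers and timer deletion are also available. A small sketch of those calls inside processElement (the 10-second offsets are illustrative):
// Processing-time twin of the event-time timer above:
long ts = ctx.timerService().currentProcessingTime();
ctx.timerService().registerProcessingTimeTimer(ts + 10 * 1000L);
// A timer that is no longer needed can be deleted by its exact timestamp:
ctx.timerService().deleteProcessingTimeTimer(ts + 10 * 1000L);
ctx.timerService().deleteEventTimeTimer(ctx.timestamp() + 10 * 1000L);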
2.3 ProcessWindowFunction
2.3.1 Function signature
public abstract class ProcessWindowFunction<IN, OUT, KEY, W extends Window>
        extends AbstractRichFunction {
    public abstract void process(
            KEY key, Context context, Iterable<IN> elements, Collector<OUT> out) throws Exception;
    // ...
}
2.3.2 Basic usage
package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;
import java.util.HashSet;

public class ProcessWindowFunctionDS {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(2);
        DataStream<Event> inputDS = env.addSource(new EventWithWatermarkSource());
        inputDS.print("input ");
        SingleOutputStreamOperator<String> result = inputDS
                .keyBy(t -> t.user)
                .window(TumblingEventTimeWindows.of(Time.seconds(10)))
                .process(new ProcessWindowFunction<Event, String, String, TimeWindow>() {
                    @Override
                    public void process(String s, Context ctx, Iterable<Event> elements, Collector<String> out) throws Exception {
                        // Count distinct urls this user visited within the window.
                        HashSet<String> urls = new HashSet<>();
                        for (Event element : elements) {
                            urls.add(element.url);
                        }
                        Timestamp start = new Timestamp(ctx.window().getStart());
                        Timestamp end = new Timestamp(ctx.window().getEnd());
                        out.collect("Window [ " + start + " ~ " + end + " ] -> user " + s + " visited " + urls.size() + " distinct pages.");
                    }
                });
        result.print("output ");
        env.execute();
    }
}
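A ProcessWindowFunction on its own buffers every element of the window until it fires. When only an aggregate of the elements is needed, it can be combined with an incremental AggregateFunction via aggregate(), so the window stores just one accumulator; section 3.3.2 uses exactly this pattern. A minimal sketch, where CountAgg is a hypothetical AggregateFunction<Event, Long, Long> that adds 1 per element:
// Sketch: incremental pre-aggregation; the window function only sees the final count.
inputDS.keyBy(t -> t.user)
        .window(TumblingEventTimeWindows.of(Time.seconds(10)))
        .aggregate(new CountAgg(), new ProcessWindowFunction<Long, String, String, TimeWindow>() {
            @Override
            public void process(String key, Context ctx, Iterable<Long> elements, Collector<String> out) {
                out.collect(key + " -> " + elements.iterator().next()); // exactly one element: the count
            }
        });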
2.4 ProcessAllWindowFunction
2.4.1 Function signature
public abstract class ProcessAllWindowFunction<IN, OUT, W extends Window>
        extends AbstractRichFunction {
    public abstract void process(Context context, Iterable<IN> elements, Collector<OUT> out)
            throws Exception;
    // ...
}
2.4.2 Basic usage
package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;
import java.util.HashSet;

public class ProcessAllWindowFunctionDS {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        DataStream<Event> inputDS = env.addSource(new EventWithWatermarkSource());
        inputDS.print("input ");
        SingleOutputStreamOperator<String> result = inputDS
                // No keyBy: windowAll puts the whole stream into one window.
                .windowAll(TumblingEventTimeWindows.of(Time.seconds(10)))
                .process(new ProcessAllWindowFunction<Event, String, TimeWindow>() {
                    @Override
                    public void process(Context ctx, Iterable<Event> elements, Collector<String> out) throws Exception {
                        HashSet<String> users = new HashSet<>();
                        HashSet<String> urls = new HashSet<>();
                        for (Event element : elements) {
                            urls.add(element.url);
                            users.add(element.user);
                        }
                        Timestamp start = new Timestamp(ctx.window().getStart());
                        Timestamp end = new Timestamp(ctx.window().getEnd());
                        out.collect("Window [ " + start + " ~ " + end + " ] -> " + users.size() + " users visited " + urls.size() + " distinct pages.");
                    }
                });
        result.print("output ");
        env.execute();
    }
}
2.5 CoProcessFunction
2.5.1 Function signature
public abstract class CoProcessFunction<IN1, IN2, OUT> extends AbstractRichFunction {
    public abstract void processElement1(IN1 value, Context ctx, Collector<OUT> out)
            throws Exception;
    public abstract void processElement2(IN2 value, Context ctx, Collector<OUT> out)
            throws Exception;
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<OUT> out) throws Exception {}
    // ...
}
2.5.2 Basic usage
package com.hpsk.flink.function;

import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
import org.apache.flink.util.Collector;

// Order reconciliation: match each in-app payment event against the
// corresponding third-party payment event, keyed by order id.
public class CoProcessFunctionDS {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        SingleOutputStreamOperator<Tuple3<String, String, Long>> appStream = env.fromElements(
                Tuple3.of("order-1", "app", 1000L),
                Tuple3.of("order-2", "app", 2000L)
        ).assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple3<String, String, Long>>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> element, long recordTimestamp) {
                        return element.f2;
                    }
                })
        );
        SingleOutputStreamOperator<Tuple4<String, String, String, Long>> thirdpartStream = env.fromElements(
                Tuple4.of("order-1", "third-party", "success", 3000L),
                Tuple4.of("order-3", "third-party", "success", 4000L)
        ).assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple4<String, String, String, Long>>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple4<String, String, String, Long>>() {
                    @Override
                    public long extractTimestamp(Tuple4<String, String, String, Long> element, long recordTimestamp) {
                        return element.f3;
                    }
                })
        );
        SingleOutputStreamOperator<String> result = appStream
                .connect(thirdpartStream)
                .keyBy(t -> t.f0, t -> t.f0) // key both streams by order id
                .process(new CoProcessFunction<Tuple3<String, String, Long>, Tuple4<String, String, String, Long>, String>() {
                    // One slot per side, holding the event still waiting for its partner.
                    private ValueState<Tuple3<String, String, Long>> appEventState;
                    private ValueState<Tuple4<String, String, String, Long>> thirdPartyEventState;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        appEventState = getRuntimeContext().getState(new ValueStateDescriptor<Tuple3<String, String, Long>>("appEventState", Types.TUPLE(Types.STRING, Types.STRING, Types.LONG)));
                        thirdPartyEventState = getRuntimeContext().getState(new ValueStateDescriptor<Tuple4<String, String, String, Long>>("thirdPartyEventState", Types.TUPLE(Types.STRING, Types.STRING, Types.STRING, Types.LONG)));
                    }

                    @Override
                    public void processElement1(Tuple3<String, String, Long> value, Context ctx, Collector<String> out) throws Exception {
                        if (thirdPartyEventState.value() != null) {
                            out.collect("Reconciled: " + value + " " + thirdPartyEventState.value());
                            thirdPartyEventState.clear();
                        } else {
                            appEventState.update(value);
                            // Wait up to 5 s (event time) for the third-party record.
                            ctx.timerService().registerEventTimeTimer(value.f2 + 5000L);
                        }
                    }

                    @Override
                    public void processElement2(Tuple4<String, String, String, Long> value, Context ctx, Collector<String> out) throws Exception {
                        if (appEventState.value() != null) {
                            out.collect("Reconciled: " + appEventState.value() + " " + value);
                            appEventState.clear();
                        } else {
                            thirdPartyEventState.update(value);
                            ctx.timerService().registerEventTimeTimer(value.f3 + 5000L);
                        }
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                        if (appEventState.value() != null) {
                            out.collect("Reconciliation failed: " + appEventState.value() + " third-party record never arrived");
                        }
                        if (thirdPartyEventState.value() != null) {
                            out.collect("Reconciliation failed: " + thirdPartyEventState.value() + " app record never arrived");
                        }
                        appEventState.clear();
                        thirdPartyEventState.clear();
                    }
                });
        result.print("output ");
        env.execute();
    }
}
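One detail worth noting: when a pair is matched, the timer registered by the earlier event still fires later; it just finds cleared state and outputs nothing. To cancel it outright, the registered timestamp can be remembered and deleted on a successful match. A hedged sketch for processElement1, assuming an additional, hypothetical ValueState<Long> named timerTsState:
// Inside processElement1, on a successful match (sketch; timerTsState is hypothetical):
if (thirdPartyEventState.value() != null) {
    out.collect("Reconciled: " + value + " " + thirdPartyEventState.value());
    thirdPartyEventState.clear();
    if (timerTsState.value() != null) {
        ctx.timerService().deleteEventTimeTimer(timerTsState.value()); // cancel the pending timer
        timerTsState.clear();
    }
} else {
    appEventState.update(value);
    ctx.timerService().registerEventTimeTimer(value.f2 + 5000L);
    timerTsState.update(value.f2 + 5000L); // remember it so it can be deleted later
}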
2.6 ProcessJoinFunction
2.6.1 Function signature
public abstract class ProcessJoinFunction<IN1, IN2, OUT> extends AbstractRichFunction {
    public abstract void processElement(IN1 left, IN2 right, Context ctx, Collector<OUT> out)
            throws Exception;
    // ...
}
2.6.2 Basic usage
package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;

public class ProcessJoinFunctionDS {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        SingleOutputStreamOperator<Tuple3<String, String, Long>> orderStream = env.fromElements(
                Tuple3.of("Mary", "order-1", 5000L),
                Tuple3.of("Alice", "order-2", 5000L),
                Tuple3.of("Bob", "order-3", 20000L),
                Tuple3.of("Alice", "order-4", 20000L),
                Tuple3.of("Cary", "order-5", 51000L)
        ).assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple3<String, String, Long>>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> element, long recordTimestamp) {
                        return element.f2;
                    }
                })
        );
        SingleOutputStreamOperator<Event> clickStream = env.fromElements(
                new Event("Bob", "./cart", 2000L),
                new Event("Alice", "./prod?id=100", 3000L),
                new Event("Alice", "./prod?id=200", 3500L),
                new Event("Bob", "./prod?id=2", 2500L),
                new Event("Alice", "./prod?id=300", 36000L),
                new Event("Bob", "./home", 30000L),
                new Event("Bob", "./prod?id=1", 23000L),
                new Event("Bob", "./prod?id=3", 33000L)
        ).assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                    @Override
                    public long extractTimestamp(Event element, long recordTimestamp) {
                        return element.timestamp;
                    }
                })
        );
        SingleOutputStreamOperator<String> result = orderStream
                .keyBy(t -> t.f0)
                .intervalJoin(clickStream.keyBy(t -> t.user))
                // Match clicks from 5 s before to 10 s after each order.
                .between(Time.seconds(-5), Time.seconds(10))
                .process(new ProcessJoinFunction<Tuple3<String, String, Long>, Event, String>() {
                    @Override
                    public void processElement(Tuple3<String, String, Long> left, Event right, Context ctx, Collector<String> out) throws Exception {
                        out.collect(right + " => {" + left.f0 + ", " + left.f1 + ", " + new Timestamp(left.f2) + "}");
                    }
                });
        result.print("output ");
        env.execute();
    }
}
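For reference, the between bounds are inclusive and are measured relative to the left (order) stream's timestamps; a short worked example over the data above:
// between(Time.seconds(-5), Time.seconds(10)) keeps pairs satisfying
//   order.f2 - 5000 <= click.timestamp <= order.f2 + 10000
// e.g. ("Mary", "order-1", 5000L) joins any click by Mary stamped within [0, 15000].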
2.7 BroadcastProcessFunction
2.7.1 Function signature
public abstract class BroadcastProcessFunction<IN1, IN2, OUT> extends BaseBroadcastProcessFunction {
    public abstract void processElement(
            final IN1 value, final ReadOnlyContext ctx, final Collector<OUT> out) throws Exception;
    public abstract void processBroadcastElement(
            final IN2 value, final Context ctx, final Collector<OUT> out) throws Exception;
    // ...
}
2.7.2 Basic usage
package com.hpsk.flink.function;

import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.util.Collector;

public class BroadcastProcessFunctionDS {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Config stream: which tables are configuration tables.
        SingleOutputStreamOperator<String> tableConfigStream = env.fromElements(
                "table1,createTable",
                "table2,createTable",
                "table3,createTable");
        // Data stream: (table name, payload) records.
        SingleOutputStreamOperator<Tuple2<String, String>> MySqlTableStream = env.fromElements(
                Tuple2.of("table1", "data"),
                Tuple2.of("table2", "data"),
                Tuple2.of("table4", "data")
        );
        MapStateDescriptor<String, String> mapStateDescriptor = new MapStateDescriptor<>("map-state", String.class, String.class);
        BroadcastStream<String> broadcast = tableConfigStream.broadcast(mapStateDescriptor);
        SingleOutputStreamOperator<String> result = MySqlTableStream
                .connect(broadcast)
                .process(new MyBroadcastProcessFunction(mapStateDescriptor));
        result.print("output ");
        env.execute();
    }

    public static class MyBroadcastProcessFunction extends BroadcastProcessFunction<Tuple2<String, String>, String, String> {
        private final MapStateDescriptor<String, String> mapStateDescriptor;

        public MyBroadcastProcessFunction(MapStateDescriptor<String, String> mapStateDescriptor) {
            this.mapStateDescriptor = mapStateDescriptor;
        }

        @Override
        public void processBroadcastElement(String value, Context ctx, Collector<String> out) throws Exception {
            // Broadcast side: read-write access to the broadcast state.
            BroadcastState<String, String> configBroadcast = ctx.getBroadcastState(mapStateDescriptor);
            String[] split = value.split(",");
            configBroadcast.put(split[0].trim(), split[1].trim());
        }

        @Override
        public void processElement(Tuple2<String, String> value, ReadOnlyContext ctx, Collector<String> out) throws Exception {
            // Data side: read-only access to the broadcast state.
            ReadOnlyBroadcastState<String, String> broadcastState = ctx.getBroadcastState(mapStateDescriptor);
            String table = value.f0;
            String create = broadcastState.get(table);
            if (create != null) {
                out.collect(value.f0 + " is a configuration table, create it in Phoenix -> create statement: " + create + ", data: " + value.f1);
            } else {
                out.collect(value.f0 + " is a business table, skip table creation");
            }
        }
    }
}
2.8 KeyedBroadcastProcessFunction
2.8.1 Function signature
public abstract class KeyedBroadcastProcessFunction<KS, IN1, IN2, OUT>
        extends BaseBroadcastProcessFunction {
    public abstract void processElement(
            final IN1 value, final ReadOnlyContext ctx, final Collector<OUT> out) throws Exception;
    public abstract void processBroadcastElement(
            final IN2 value, final Context ctx, final Collector<OUT> out) throws Exception;
    public void onTimer(final long timestamp, final OnTimerContext ctx, final Collector<OUT> out)
            throws Exception {}
    // ...
}
2.8.2 Basic usage
package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction;
import org.apache.flink.util.Collector;

public class KeyedBroadcastProcessFunctionDS {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Broadcast stream: the users whose visits should be reported.
        SingleOutputStreamOperator<String> tableConfigStream = env.fromElements("Tom");
        SingleOutputStreamOperator<Event> MySqlTableStream = env.addSource(new EventWithWatermarkSource());
        MapStateDescriptor<String, String> mapStateDescriptor = new MapStateDescriptor<>("map-state", String.class, String.class);
        BroadcastStream<String> broadcast = tableConfigStream.broadcast(mapStateDescriptor);
        SingleOutputStreamOperator<String> result = MySqlTableStream
                .keyBy(t -> t.user)
                .connect(broadcast)
                .process(new MyBroadcastProcessFunction(mapStateDescriptor));
        result.print("output ");
        env.execute();
    }

    public static class MyBroadcastProcessFunction extends KeyedBroadcastProcessFunction<String, Event, String, String> {
        private final MapStateDescriptor<String, String> mapStateDescriptor;
        // Keyed state: visit count per user (each key sees only its own entry).
        private MapState<String, Long> eventMapState;

        public MyBroadcastProcessFunction(MapStateDescriptor<String, String> mapStateDescriptor) {
            this.mapStateDescriptor = mapStateDescriptor;
        }

        @Override
        public void open(Configuration parameters) throws Exception {
            eventMapState = getRuntimeContext().getMapState(new MapStateDescriptor<String, Long>("event-map-state", String.class, Long.class));
        }

        @Override
        public void processElement(Event value, ReadOnlyContext ctx, Collector<String> out) throws Exception {
            if (eventMapState.contains(value.user)) {
                Long num = eventMapState.get(value.user);
                eventMapState.put(value.user, num + 1);
            } else {
                eventMapState.put(value.user, 1L);
            }
            ReadOnlyBroadcastState<String, String> broadcastState = ctx.getBroadcastState(mapStateDescriptor);
            String user = value.user;
            String userConfig = broadcastState.get(user);
            // Only report users listed in the broadcast configuration.
            if (userConfig != null) {
                Long aLong = eventMapState.get(value.user);
                out.collect("User " + value.user + " visit count -> " + aLong);
            }
        }

        @Override
        public void processBroadcastElement(String value, Context ctx, Collector<String> out) throws Exception {
            BroadcastState<String, String> configBroadcast = ctx.getBroadcastState(mapStateDescriptor);
            configBroadcast.put(value, value);
        }
    }
}
3. Case study: Top-N hot URLs
3.1 Requirement
Count the most popular URLs over the last 10 seconds and refresh the ranking every 5 seconds (a sliding event-time window of size 10 s with a 5 s slide).
3.2 Implementation
3.2.1 The Event class
The Event class is the same POJO already shown in section 1.1 and is reused here unchanged.
3.2.2 The UrlViewCount class
package com.hpsk.flink.beans;

import java.sql.Timestamp;

// Result POJO: how often one url was seen in one window.
public class UrlViewCount {
    public Long windowStart;
    public Long windowEnd;
    public String url;
    public Long count;

    public UrlViewCount() {
    }

    public UrlViewCount(Long windowStart, Long windowEnd, String url, Long count) {
        this.windowStart = windowStart;
        this.windowEnd = windowEnd;
        this.url = url;
        this.count = count;
    }

    @Override
    public String toString() {
        return "UrlViewCount{" +
                "windowStart=" + new Timestamp(windowStart) +
                ", windowEnd=" + new Timestamp(windowEnd) +
                ", url='" + url + '\'' +
                ", count=" + count +
                '}';
    }
}
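UrlViewCount follows Flink's POJO rules (public no-argument constructor, public fields), so Flink can handle it with its POJO serializer. The improved version in 3.3.2 relies on this when it declares its list state; a one-line sketch of that descriptor, taken from the code below:
// ListState descriptor backed by Flink's POJO type information for UrlViewCount.
ListStateDescriptor<UrlViewCount> descriptor =
        new ListStateDescriptor<>("url-list-state", Types.POJO(UrlViewCount.class));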
3.2.3 The TopNExample class
package com.hpsk.flink.demand;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.beans.UrlViewCount;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;

public class TopNExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        DataStream<Event> inputDS = env.addSource(new EventWithWatermarkSource());
        inputDS.print("input ");
        SingleOutputStreamOperator<UrlViewCount> result = inputDS
                // windowAll gathers the whole stream into one (parallelism-1) window,
                // so no keyBy is needed (or honored) here.
                .windowAll(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
                .aggregate(new UrlHashViewCountAgg(), new UrlHashViewCountResult());
        result.print("output ");
        env.execute();
    }

    // Counts views per url in a HashMap accumulator and emits the entries
    // sorted by count, descending.
    public static class UrlHashViewCountAgg implements AggregateFunction<Event, HashMap<String, Long>, ArrayList<Tuple2<String, Long>>> {
        @Override
        public HashMap<String, Long> createAccumulator() {
            return new HashMap<>();
        }

        @Override
        public HashMap<String, Long> add(Event value, HashMap<String, Long> accumulator) {
            if (accumulator.containsKey(value.url)) {
                Long count = accumulator.get(value.url);
                accumulator.put(value.url, count + 1);
            } else {
                accumulator.put(value.url, 1L);
            }
            return accumulator;
        }

        @Override
        public ArrayList<Tuple2<String, Long>> getResult(HashMap<String, Long> accumulator) {
            ArrayList<Tuple2<String, Long>> result = new ArrayList<>();
            for (String key : accumulator.keySet()) {
                result.add(Tuple2.of(key, accumulator.get(key)));
            }
            result.sort(new Comparator<Tuple2<String, Long>>() {
                @Override
                public int compare(Tuple2<String, Long> o1, Tuple2<String, Long> o2) {
                    return o2.f1.compareTo(o1.f1);
                }
            });
            return result;
        }

        @Override
        public HashMap<String, Long> merge(HashMap<String, Long> a, HashMap<String, Long> b) {
            // Only needed for session windows; unused with sliding windows.
            return null;
        }
    }

    // Attaches the window's start/end to the sorted list and emits the top 2 urls.
    private static class UrlHashViewCountResult extends ProcessAllWindowFunction<ArrayList<Tuple2<String, Long>>, UrlViewCount, TimeWindow> {
        @Override
        public void process(Context ctx, Iterable<ArrayList<Tuple2<String, Long>>> elements, Collector<UrlViewCount> out) throws Exception {
            long start = ctx.window().getStart();
            long end = ctx.window().getEnd();
            ArrayList<Tuple2<String, Long>> list = elements.iterator().next();
            for (int i = 0; i < Math.min(list.size(), 2); i++) {
                out.collect(new UrlViewCount(start, end, list.get(i).f0, list.get(i).f1));
            }
        }
    }
}
3.3 Improving the code
3.3.1 Shortcomings of TopNExample
TopNExample uses windowAll to funnel the entire stream into a single window for counting, so the job loses the benefit of parallel computation. The version below instead counts per url in parallel keyed windows and only ranks the small per-window results afterwards.
3.3.2 Improved code
package com.hpsk.flink.demand;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.beans.UrlViewCount;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.Comparator;

public class UrlTopNViewCount {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        DataStream<Event> inputDS = env.addSource(new EventWithWatermarkSource());
        inputDS.print("input ");
        // Step 1: count views per url in parallel, keyed sliding windows.
        SingleOutputStreamOperator<UrlViewCount> result = inputDS
                .keyBy(t -> t.url)
                .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
                .aggregate(new UrlViewCountAgg(), new UrlViewCountResult());
        // Step 2: gather all counts belonging to the same window and rank them.
        SingleOutputStreamOperator<UrlViewCount> topNResult = result
                .keyBy(t -> t.windowEnd)
                .process(new TopNProcessResult(2));
        topNResult.print("output ");
        env.execute();
    }

    // Incremental count: the window state is a single Long per url.
    public static class UrlViewCountAgg implements AggregateFunction<Event, Long, Long> {
        @Override
        public Long createAccumulator() {
            return 0L;
        }

        @Override
        public Long add(Event value, Long accumulator) {
            return accumulator + 1;
        }

        @Override
        public Long getResult(Long accumulator) {
            return accumulator;
        }

        @Override
        public Long merge(Long a, Long b) {
            // Only needed for session windows; unused with sliding windows.
            return null;
        }
    }

    // Attaches the window metadata to the count produced by UrlViewCountAgg.
    private static class UrlViewCountResult extends ProcessWindowFunction<Long, UrlViewCount, String, TimeWindow> {
        @Override
        public void process(String url, Context ctx, Iterable<Long> elements, Collector<UrlViewCount> out) throws Exception {
            long start = ctx.window().getStart();
            long end = ctx.window().getEnd();
            long count = elements.iterator().next();
            out.collect(new UrlViewCount(start, end, url, count));
        }
    }

    private static class TopNProcessResult extends KeyedProcessFunction<Long, UrlViewCount, UrlViewCount> {
        private final Integer n;
        private ListState<UrlViewCount> urlViewCountListState;

        public TopNProcessResult(Integer n) {
            this.n = n;
        }

        @Override
        public void open(Configuration parameters) throws Exception {
            urlViewCountListState = getRuntimeContext().getListState(new ListStateDescriptor<UrlViewCount>("url-list-state", Types.POJO(UrlViewCount.class)));
        }

        @Override
        public void processElement(UrlViewCount value, Context ctx, Collector<UrlViewCount> out) throws Exception {
            urlViewCountListState.add(value);
            // Fire just after the window end, once all counts for it have arrived.
            ctx.timerService().registerEventTimeTimer(value.windowEnd + 1);
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<UrlViewCount> out) throws Exception {
            ArrayList<UrlViewCount> urlViewCountArrayList = new ArrayList<>();
            for (UrlViewCount urlViewCount : urlViewCountListState.get()) {
                urlViewCountArrayList.add(urlViewCount);
            }
            urlViewCountArrayList.sort(new Comparator<UrlViewCount>() {
                @Override
                public int compare(UrlViewCount o1, UrlViewCount o2) {
                    return o2.count.compareTo(o1.count);
                }
            });
            // Guard against windows that saw fewer than n distinct urls.
            for (int i = 0; i < Math.min(n, urlViewCountArrayList.size()); i++) {
                out.collect(urlViewCountArrayList.get(i));
            }
            urlViewCountListState.clear();
        }
    }
}