Reference: the official Flink documentation
Reference blog: Alienware^
To improve latency, we can push the streaming idea one step further: just as with DataStream's simple aggregations, compute on every record as it arrives and keep only a small aggregation state in between. The only difference is that we do not emit a result immediately; we wait for the window's end time. When the window closes and a result has to be produced, we simply emit the state accumulated so far, which greatly improves both efficiency and responsiveness.
DataStream<Tuple2<String, Long>> input = ...;
input
.keyBy(<key selector>)
.window(<window assigner>)
.reduce(new ReduceFunction<Tuple2<String, Long>>() {
public Tuple2<String, Long> reduce(Tuple2<String, Long> v1, Tuple2<String, Long> v2) {
return new Tuple2<>(v1.f0, v1.f1 + v2.f1);
}
});
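The same aggregation can also be written as a lambda. A minimal sketch, assuming the same input stream and a concrete tumbling window assigner; note that with tuple results Flink generally needs an explicit type hint via returns(...), because generic type information is erased from lambdas:
import org.apache.flink.api.common.typeinfo.Types;

input
    .keyBy(value -> value.f0)
    .window(TumblingEventTimeWindows.of(Time.seconds(20)))
    // incrementally sum the counts, as the anonymous class above does
    .reduce((v1, v2) -> Tuple2.of(v1.f0, v1.f1 + v2.f1))
    // type hint: lambda generics are erased, so declare the tuple type explicitly
    .returns(Types.TUPLE(Types.STRING, Types.LONG));
Below is a complete demo that applies this incremental reduce to a generated click stream: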
package com.ali.flink.demo.driver;
import com.ali.flink.demo.utils.DataGeneratorImpl002;
import com.ali.flink.demo.utils.FlinkEnv;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Duration;
import java.util.Date;
/**
* Count how many times each user clicks each url, using a ReduceFunction
*/
public class FlinkReduceFunctionDemo01 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
env.setParallelism(1);
DataGeneratorSource<String> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl002());
DataStream<String> dataGeneratorStream = env.addSource(dataGeneratorSource).returns(String.class);
// dataGeneratorStream.print("source");
SingleOutputStreamOperator<Event> mapStream = dataGeneratorStream
.map(new MapFunction<String, Event>() {
@Override
public Event map(String s) throws Exception {
JSONObject jsonObject = JSON.parseObject(s);
String username = jsonObject.getString("username");
String eventtime = jsonObject.getString("eventtime");
String click_url = jsonObject.getString("click_url");
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date event_time = null;
try {
event_time = simpleDateFormat.parse(eventtime);
} catch (ParseException e) {
e.printStackTrace();
}
return new Event(username, click_url, event_time.getTime(), eventtime);
}
});
mapStream.print("map source");
mapStream.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event s, long l) {
return s.eventTime;
}
}))
.map(new MapFunction<Event, Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> map(Event event) throws Exception {
return Tuple2.of(event.userName + "--" + event.clickUrl, 1L);
}
})
.keyBy(new KeySelector<Tuple2<String, Long>, String>() {
@Override
public String getKey(Tuple2<String, Long> s) throws Exception {
return s.f0;
}
})
.window(TumblingEventTimeWindows.of(Time.seconds(20)))
.reduce(new ReduceFunction<Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> reduce(Tuple2<String, Long> value1, Tuple2<String, Long> value2) throws Exception {
return Tuple2.of(value1.f0, value1.f1 + value2.f1);
}
}).print("count");
env.execute("tumble window test");
}
public static class Event{
private String userName;
private String clickUrl;
private long eventTime;
private String time;
public Event(String userName, String clickUrl, long eventTime, String time) {
this.userName = userName;
this.clickUrl = clickUrl;
this.eventTime = eventTime;
this.time = time;
}
@Override
public String toString() {
return "Event{" +
"userName='" + userName + '\'' +
", clickUrl='" + clickUrl + '\'' +
", eventTime=" + eventTime +
", time='" + time + '\'' +
'}';
}
}
}
map source> Event{userName='aaa', clickUrl='url1', eventTime=1657094061000, time='2022-07-06 15:54:21'}
map source> Event{userName='bbb', clickUrl='url1', eventTime=1657094061000, time='2022-07-06 15:54:21'}
map source> Event{userName='aaa', clickUrl='url2', eventTime=1657094063000, time='2022-07-06 15:54:23'}
map source> Event{userName='bbb', clickUrl='url1', eventTime=1657094065000, time='2022-07-06 15:54:25'}
map source> Event{userName='aaa', clickUrl='url1', eventTime=1657094065000, time='2022-07-06 15:54:25'}
map source> Event{userName='bbb', clickUrl='url1', eventTime=1657094070000, time='2022-07-06 15:54:30'}
map source> Event{userName='aaa', clickUrl='url1', eventTime=1657094070000, time='2022-07-06 15:54:30'}
map source> Event{userName='aaa', clickUrl='url1', eventTime=1657094077000, time='2022-07-06 15:54:37'}
map source> Event{userName='ccc', clickUrl='url2', eventTime=1657094077000, time='2022-07-06 15:54:37'}
map source> Event{userName='ccc', clickUrl='url1', eventTime=1657094084000, time='2022-07-06 15:54:44'}
count> (aaa--url2,1)
count> (aaa--url1,4)
count> (ccc--url2,1)
count> (bbb--url1,3)
map source> Event{userName='ccc', clickUrl='url1', eventTime=1657094091000, time='2022-07-06 15:54:51'}
map source> Event{userName='bbb', clickUrl='url1', eventTime=1657094100000, time='2022-07-06 15:55:00'}
map source> Event{userName='ccc', clickUrl='url2', eventTime=1657094104000, time='2022-07-06 15:55:04'}
count> (ccc--url1,2)
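Note when the first window fires: its range is [15:54:20, 15:54:40), yet its counts only appear after the 15:54:44 event. With forBoundedOutOfOrderness(Duration.ofSeconds(2)) the watermark trails the largest seen timestamp by 2 seconds, so it passes the window end of 15:54:40 only once an event with timestamp >= 15:54:42 arrives.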
Summary:
The output type of a window aggregation built on a ReduceFunction is identical to its input type (Tuple2 in the example above).
To access window metadata, pair it with a WindowFunction or ProcessWindowFunction, as the later sections show.
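The demos in this post rely on two project-local helpers, FlinkEnv and DataGeneratorImpl002, whose code is not shown. For local experiments, a hypothetical stand-in along these lines should behave equivalently (with a bounded source, the final watermark emitted at end-of-input fires any remaining event-time windows):
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// replace the generator with a handful of literal JSON events
DataStream<String> dataGeneratorStream = env.fromElements(
        "{\"username\":\"aaa\",\"eventtime\":\"2022-07-06 15:54:21\",\"click_url\":\"url1\"}",
        "{\"username\":\"bbb\",\"eventtime\":\"2022-07-06 15:54:25\",\"click_url\":\"url1\"}",
        "{\"username\":\"aaa\",\"eventtime\":\"2022-07-06 15:54:44\",\"click_url\":\"url2\"}");
Next up is AggregateFunction. The official documentation's AverageAggregate below keeps a running sum and count in its accumulator and computes the average in getResult():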
private static class AverageAggregate
implements AggregateFunction<Tuple2<String, Long>, Tuple2<Long, Long>, Double> {
@Override
public Tuple2<Long, Long> createAccumulator() {
return new Tuple2<>(0L, 0L);
}
@Override
public Tuple2<Long, Long> add(Tuple2<String, Long> value, Tuple2<Long, Long> accumulator) {
return new Tuple2<>(accumulator.f0 + value.f1, accumulator.f1 + 1L);
}
@Override
public Double getResult(Tuple2<Long, Long> accumulator) {
return ((double) accumulator.f0) / accumulator.f1;
}
@Override
public Tuple2<Long, Long> merge(Tuple2<Long, Long> a, Tuple2<Long, Long> b) {
return new Tuple2<>(a.f0 + b.f0, a.f1 + b.f1);
}
}
DataStream<Tuple2<String, Long>> input = ...;
input
.keyBy(<key selector>)
.window(<window assigner>)
.aggregate(new AverageAggregate());
package com.ali.flink.demo.driver;
import com.ali.flink.demo.utils.DataGeneratorImpl002;
import com.ali.flink.demo.utils.FlinkEnv;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Duration;
import java.util.Date;
import java.util.HashSet;
/**
* Compute PV/UV statistics for url visits, using an AggregateFunction
*/
public class FlinkAggregateFunctionDemo01 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
env.setParallelism(1);
DataGeneratorSource<String> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl002());
DataStream<String> dataGeneratorStream = env.addSource(dataGeneratorSource).returns(String.class);
// dataGeneratorStream.print("source");
SingleOutputStreamOperator<Event> mapStream = dataGeneratorStream
.map(new MapFunction<String, Event>() {
@Override
public Event map(String s) throws Exception {
JSONObject jsonObject = JSON.parseObject(s);
String username = jsonObject.getString("username");
String eventtime = jsonObject.getString("eventtime");
String click_url = jsonObject.getString("click_url");
return new Event(username, click_url, eventtime);
}
});
mapStream.print("map source");
mapStream.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event s, long l) {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date event_time = null;
try {
event_time = simpleDateFormat.parse(s.eventTime);
} catch (ParseException e) {
e.printStackTrace();
}
return event_time.getTime();
}
}))
.keyBy(new KeySelector<Event, String>() {
@Override
public String getKey(Event event) throws Exception {
return event.clickUrl;
}
})
.window(TumblingEventTimeWindows.of(Time.seconds(20)))
.aggregate(new AggregateFunction<Event, Tuple2<Long, HashSet<String>>, Double>() {
@Override
public Tuple2<Long, HashSet<String>> createAccumulator() {
return Tuple2.of(0L, new HashSet<>());
}
@Override
public Tuple2<Long, HashSet<String>> add(Event event, Tuple2<Long, HashSet<String>> value) {
value.f1.add(event.userName);
return Tuple2.of(value.f0 + 1L, value.f1);
}
@Override
public Double getResult(Tuple2<Long, HashSet<String>> value) {
                        System.out.println("PV: " + value.f0 + ", UV: " + value.f1.size());
                        // cast before dividing: value.f0 / size would truncate as integer division
                        return value.f0 / (double) value.f1.size();
}
@Override
public Tuple2<Long, HashSet<String>> merge(Tuple2<Long, HashSet<String>> value, Tuple2<Long, HashSet<String>> acc1) {
value.f0 = value.f0 + acc1.f0;
for (String s : acc1.f1) {
value.f1.add(s);
}
return value;
}
})
.print("count");
env.execute("tumble window test");
}
public static class Event{
private String userName;
private String clickUrl;
private String eventTime;
public Event(String userName, String clickUrl, String eventTime) {
this.userName = userName;
this.clickUrl = clickUrl;
this.eventTime = eventTime;
}
@Override
public String toString() {
return "Event{" +
"userName='" + userName + '\'' +
", clickUrl='" + clickUrl + '\'' +
", eventTime='" + eventTime + '\'' +
'}';
}
}
}
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 16:34:41'}
map source> Event{userName='bbb', clickUrl='url2', eventTime='2022-07-06 16:34:44'}
map source> Event{userName='aaa', clickUrl='url2', eventTime='2022-07-06 16:34:44'}
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 16:34:44'}
map source> Event{userName='ccc', clickUrl='url2', eventTime='2022-07-06 16:34:52'}
map source> Event{userName='aaa', clickUrl='url2', eventTime='2022-07-06 16:35:00'}
map source> Event{userName='ccc', clickUrl='url1', eventTime='2022-07-06 16:35:08'}
PV: 2, UV: 1
PV: 3, UV: 3
count> 2.0
count> 1.0
map source> Event{userName='bbb', clickUrl='url2', eventTime='2022-07-06 16:35:15'}
map source> Event{userName='bbb', clickUrl='url1', eventTime='2022-07-06 16:35:22'}
PV: 2, UV: 2
PV: 1, UV: 1
count> 1.0
count> 1.0
Summary:
An AggregateFunction involves three types: the input type (IN), the accumulator type (ACC), and the output type (OUT). IN is the element type of the input stream, ACC is the intermediate state the aggregation maintains, and OUT is the type of the final result.
The interface has four methods: createAccumulator() creates an initial accumulator, add() folds one input element into the accumulator, getResult() derives the final output from the accumulator, and merge() combines two accumulators.
Used on its own, an AggregateFunction cannot access window metadata (e.g., the url key in the example above); pair it with a WindowFunction or ProcessWindowFunction for that.
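One detail: merge() is only invoked by merging window assigners, such as session windows; tumbling and sliding windows never call it. A sketch reusing AverageAggregate from above, assuming an EventTimeSessionWindows import:
input
    .keyBy(t -> t.f0)
    // session windows merge as gaps close, combining accumulators via merge()
    .window(EventTimeSessionWindows.withGap(Time.seconds(10)))
    .aggregate(new AverageAggregate());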
DataStream<Tuple2<String, Long>> input = ...;
input
.keyBy(t -> t.f0)
.window(TumblingEventTimeWindows.of(Time.minutes(5)))
.process(new MyProcessWindowFunction());
/* ... */
public class MyProcessWindowFunction
extends ProcessWindowFunction<Tuple2<String, Long>, String, String, TimeWindow> {
@Override
public void process(String key, Context context, Iterable<Tuple2<String, Long>> input, Collector<String> out) {
long count = 0;
for (Tuple2<String, Long> in: input) {
count++;
}
out.collect("Window: " + context.window() + "count: " + count);
}
}
package com.ali.flink.demo.driver;
import com.ali.flink.demo.utils.DataGeneratorImpl002;
import com.ali.flink.demo.utils.FlinkEnv;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Duration;
import java.util.Date;
import java.util.HashSet;
/**
* Compute the UV (unique visitors) of each url, using a ProcessWindowFunction
*/
public class FlinkProcessFunctionDemo01 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
env.setParallelism(1);
DataGeneratorSource<String> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl002());
DataStream<String> dataGeneratorStream = env.addSource(dataGeneratorSource).returns(String.class);
// dataGeneratorStream.print("source");
SingleOutputStreamOperator<Event> mapStream = dataGeneratorStream
.map(new MapFunction<String, Event>() {
@Override
public Event map(String s) throws Exception {
JSONObject jsonObject = JSON.parseObject(s);
String username = jsonObject.getString("username");
String eventtime = jsonObject.getString("eventtime");
String click_url = jsonObject.getString("click_url");
return new Event(username, click_url, eventtime);
}
});
mapStream.print("map source");
mapStream.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event s, long l) {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date event_time = null;
try {
event_time = simpleDateFormat.parse(s.eventTime);
} catch (ParseException e) {
e.printStackTrace();
}
return event_time.getTime();
}
}))
.keyBy(new KeySelector<Event, String>() {
@Override
public String getKey(Event event) throws Exception {
return event.clickUrl;
}
})
.window(TumblingEventTimeWindows.of(Time.seconds(20)))
.process(new ProcessWindowFunction<Event, String, String, TimeWindow>() {
@Override
public void process(String value, Context context, Iterable<Event> iterable, Collector<String> collector) throws Exception {
HashSet<String> hashSet = new HashSet<>();
for (Event event : iterable){
hashSet.add(event.userName);
}
int size = hashSet.size();
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
String start_window = simpleDateFormat.format(context.window().getStart());
String end_window = simpleDateFormat.format(context.window().getEnd());
collector.collect(start_window + "--" + end_window + "--" + value + "--" + size);
}
})
.print("count");
env.execute("tumble window test");
}
public static class Event{
private String userName;
private String clickUrl;
private String eventTime;
public Event(String userName, String clickUrl, String eventTime) {
this.userName = userName;
this.clickUrl = clickUrl;
this.eventTime = eventTime;
}
@Override
public String toString() {
return "Event{" +
"userName='" + userName + '\'' +
", clickUrl='" + clickUrl + '\'' +
", eventTime='" + eventTime + '\'' +
'}';
}
}
}
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 17:15:31'}
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 17:15:32'}
map source> Event{userName='ccc', clickUrl='url1', eventTime='2022-07-06 17:15:35'}
map source> Event{userName='ccc', clickUrl='url2', eventTime='2022-07-06 17:15:37'}
map source> Event{userName='bbb', clickUrl='url1', eventTime='2022-07-06 17:15:45'}
count> 2022-07-06 17:15:20--2022-07-06 17:15:40--url1--2
count> 2022-07-06 17:15:20--2022-07-06 17:15:40--url2--1
map source> Event{userName='ccc', clickUrl='url2', eventTime='2022-07-06 17:15:50'}
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 17:15:57'}
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 17:16:02'}
count> 2022-07-06 17:15:40--2022-07-06 17:16:00--url1--2
count> 2022-07-06 17:15:40--2022-07-06 17:16:00--url2--1
map source> Event{userName='bbb', clickUrl='url2', eventTime='2022-07-06 17:16:07'}
map source> Event{userName='aaa', clickUrl='url2', eventTime='2022-07-06 17:16:14'}
map source> Event{userName='aaa', clickUrl='url2', eventTime='2022-07-06 17:16:23'}
count> 2022-07-06 17:16:00--2022-07-06 17:16:20--url1--1
count> 2022-07-06 17:16:00--2022-07-06 17:16:20--url2--2
Summary:
A full window function first buffers all the data belonging to a window and evaluates it in one batch when the window fires, unlike an incremental window function, which processes each record as it arrives.
In practice a full window function is usually combined with an incremental aggregation function: the incremental function handles every incoming record, and when the window fires, the value produced by getResult() is not emitted directly but handed to the full window function, whose process() method receives it through the iterable (which then holds exactly that one pre-aggregated value). This pairs the efficiency of incremental aggregation with the window metadata available to the full window function, giving a general and powerful pattern.
DataStream<SensorReading> input = ...;
input
.keyBy(<key selector>)
.window(<window assigner>)
.reduce(new MyReduceFunction(), new MyProcessWindowFunction());
// Function definitions
private static class MyReduceFunction implements ReduceFunction<SensorReading> {
public SensorReading reduce(SensorReading r1, SensorReading r2) {
return r1.value() > r2.value() ? r2 : r1;
}
}
private static class MyProcessWindowFunction
extends ProcessWindowFunction<SensorReading, Tuple2<Long, SensorReading>, String, TimeWindow> {
public void process(String key,
Context context,
Iterable<SensorReading> minReadings,
Collector<Tuple2<Long, SensorReading>> out) {
SensorReading min = minReadings.iterator().next();
out.collect(new Tuple2<Long, SensorReading>(context.window().getStart(), min));
}
}
package com.ali.flink.demo.driver;
import com.ali.flink.demo.utils.DataGeneratorImpl002;
import com.ali.flink.demo.utils.FlinkEnv;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Duration;
import java.util.Date;
import java.util.HashSet;
/**
* Compute the UV of each url, using a ReduceFunction + ProcessWindowFunction
*/
public class FlinkReduceAndProcessFunctionDemo01 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
env.setParallelism(1);
DataGeneratorSource<String> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl002());
DataStream<String> dataGeneratorStream = env.addSource(dataGeneratorSource).returns(String.class);
// dataGeneratorStream.print("source");
SingleOutputStreamOperator<Tuple4<String, String, String, HashSet<String>>> mapStream = dataGeneratorStream
.map(new MapFunction<String, Tuple4<String, String, String, HashSet<String>>>() {
@Override
public Tuple4<String, String, String, HashSet<String>> map(String s) throws Exception {
JSONObject jsonObject = JSON.parseObject(s);
String username = jsonObject.getString("username");
String eventtime = jsonObject.getString("eventtime");
String click_url = jsonObject.getString("click_url");
HashSet<String> hashSet = new HashSet<>();
hashSet.add(username);
return Tuple4.of(username, click_url, eventtime, hashSet);
}
});
mapStream.print("map source");
mapStream.assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple4<String, String, String, HashSet<String>>>forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner<Tuple4<String, String, String, HashSet<String>>>() {
@Override
public long extractTimestamp(Tuple4<String, String, String, HashSet<String>> s, long l) {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date event_time = null;
try {
event_time = simpleDateFormat.parse(s.f2);
} catch (ParseException e) {
e.printStackTrace();
}
return event_time.getTime();
}
}))
.keyBy(new KeySelector<Tuple4<String, String, String, HashSet<String>>, String>() {
@Override
public String getKey(Tuple4<String, String, String, HashSet<String>> event) throws Exception {
return event.f1;
}
})
.window(TumblingEventTimeWindows.of(Time.seconds(20)))
.reduce(new ReduceFunction<Tuple4<String, String, String, HashSet<String>>>() {
@Override
public Tuple4<String, String, String, HashSet<String>> reduce(Tuple4<String, String, String, HashSet<String>> v1, Tuple4<String, String, String, HashSet<String>> v2) throws Exception {
for (String s : v2.f3) {
v1.f3.add(s);
}
return v1;
}
}, new ProcessWindowFunction<Tuple4<String, String, String, HashSet<String>>, String, String, TimeWindow>() {
@Override
public void process(String value, Context context, Iterable<Tuple4<String, String, String, HashSet<String>>> iterable, Collector<String> collector) throws Exception {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
String start_window = simpleDateFormat.format(context.window().getStart());
String end_window = simpleDateFormat.format(context.window().getEnd());
int size = iterable.iterator().next().f3.size();
collector.collect(start_window + "--" + end_window + "--" + value + "--" + size);
}
}).print("count");
env.execute("tumble window test");
}
    // note: this Event POJO is unused in this demo (the pipeline works on Tuple4); kept over from the earlier examples
    public static class Event{
private String userName;
private String clickUrl;
private String eventTime;
public Event(String userName, String clickUrl, String eventTime) {
this.userName = userName;
this.clickUrl = clickUrl;
this.eventTime = eventTime;
}
@Override
public String toString() {
return "Event{" +
"userName='" + userName + '\'' +
", clickUrl='" + clickUrl + '\'' +
", eventTime='" + eventTime + '\'' +
'}';
}
}
}
map source> (aaa,url2,2022-07-06 17:57:07,[aaa])
map source> (bbb,url2,2022-07-06 17:57:16,[bbb])
map source> (bbb,url1,2022-07-06 17:57:16,[bbb])
map source> (bbb,url1,2022-07-06 17:57:21,[bbb])
map source> (ccc,url2,2022-07-06 17:57:28,[ccc])
count> 2022-07-06 17:57:00--2022-07-06 17:57:20--url2--2
count> 2022-07-06 17:57:00--2022-07-06 17:57:20--url1--1
map source> (aaa,url2,2022-07-06 17:57:32,[aaa])
map source> (aaa,url2,2022-07-06 17:57:41,[aaa])
map source> (aaa,url1,2022-07-06 17:57:45,[aaa])
count> 2022-07-06 17:57:20--2022-07-06 17:57:40--url1--1
count> 2022-07-06 17:57:20--2022-07-06 17:57:40--url2--2
map source> (bbb,url2,2022-07-06 17:57:53,[bbb])
map source> (aaa,url1,2022-07-06 17:58:02,[aaa])
count> 2022-07-06 17:57:40--2022-07-06 17:58:00--url2--2
count> 2022-07-06 17:57:40--2022-07-06 17:58:00--url1--1
map source> (ccc,url2,2022-07-06 17:58:06,[ccc])
map source> (bbb,url2,2022-07-06 17:58:07,[bbb])
map source> (bbb,url2,2022-07-06 17:58:13,[bbb])
map source> (aaa,url2,2022-07-06 17:58:14,[aaa])
map source> (aaa,url1,2022-07-06 17:58:16,[aaa])
map source> (aaa,url1,2022-07-06 17:58:18,[aaa])
map source> (bbb,url1,2022-07-06 17:58:27,[bbb])
count> 2022-07-06 17:58:00--2022-07-06 17:58:20--url1--1
count> 2022-07-06 17:58:00--2022-07-06 17:58:20--url2--3
map source> (bbb,url1,2022-07-06 17:58:30,[bbb])
map source> (ccc,url1,2022-07-06 17:58:31,[ccc])
map source> (bbb,url1,2022-07-06 17:58:34,[bbb])
map source> (bbb,url2,2022-07-06 17:58:36,[bbb])
map source> (ccc,url2,2022-07-06 17:58:40,[ccc])
map source> (bbb,url2,2022-07-06 17:58:48,[bbb])
count> 2022-07-06 17:58:20--2022-07-06 17:58:40--url1--2
count> 2022-07-06 17:58:20--2022-07-06 17:58:40--url2--1
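Carrying a HashSet inside every Tuple4 works, but it allocates a set per input record just so reduce() can union them. The AggregateFunction variant below is the cleaner fit: the set lives only in the accumulator, and the ProcessWindowFunction again receives the single pre-aggregated UV count through its iterable: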
DataStream<Tuple2<String, Long>> input = ...;
input
.keyBy(<key selector>)
.window(<window assigner>)
.aggregate(new AverageAggregate(), new MyProcessWindowFunction());
// Function definitions
/**
* The accumulator is used to keep a running sum and a count. The {@code getResult} method
* computes the average.
*/
private static class AverageAggregate
implements AggregateFunction<Tuple2<String, Long>, Tuple2<Long, Long>, Double> {
@Override
public Tuple2<Long, Long> createAccumulator() {
return new Tuple2<>(0L, 0L);
}
@Override
public Tuple2<Long, Long> add(Tuple2<String, Long> value, Tuple2<Long, Long> accumulator) {
return new Tuple2<>(accumulator.f0 + value.f1, accumulator.f1 + 1L);
}
@Override
public Double getResult(Tuple2<Long, Long> accumulator) {
return ((double) accumulator.f0) / accumulator.f1;
}
@Override
public Tuple2<Long, Long> merge(Tuple2<Long, Long> a, Tuple2<Long, Long> b) {
return new Tuple2<>(a.f0 + b.f0, a.f1 + b.f1);
}
}
private static class MyProcessWindowFunction
extends ProcessWindowFunction<Double, Tuple2<String, Double>, String, TimeWindow> {
public void process(String key,
Context context,
Iterable<Double> averages,
Collector<Tuple2<String, Double>> out) {
Double average = averages.iterator().next();
out.collect(new Tuple2<>(key, average));
}
}
package com.ali.flink.demo.driver;
import com.ali.flink.demo.utils.DataGeneratorImpl002;
import com.ali.flink.demo.utils.FlinkEnv;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Duration;
import java.util.Date;
import java.util.HashSet;
/**
* Compute the UV of each url, using an AggregateFunction + ProcessWindowFunction
*/
public class FlinkAggregateAndProcessFunctionDemo01 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
env.setParallelism(1);
DataGeneratorSource<String> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl002());
DataStream<String> dataGeneratorStream = env.addSource(dataGeneratorSource).returns(String.class);
// dataGeneratorStream.print("source");
SingleOutputStreamOperator<Event> mapStream = dataGeneratorStream
.map(new MapFunction<String, Event>() {
@Override
public Event map(String s) throws Exception {
JSONObject jsonObject = JSON.parseObject(s);
String username = jsonObject.getString("username");
String eventtime = jsonObject.getString("eventtime");
String click_url = jsonObject.getString("click_url");
return new Event(username, click_url, eventtime);
}
});
mapStream.print("map source");
mapStream.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event s, long l) {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
Date event_time = null;
try {
event_time = simpleDateFormat.parse(s.eventTime);
} catch (ParseException e) {
e.printStackTrace();
}
return event_time.getTime();
}
}))
.keyBy(new KeySelector<Event, String>() {
@Override
public String getKey(Event event) throws Exception {
return event.clickUrl;
}
})
.window(TumblingEventTimeWindows.of(Time.seconds(20)))
.aggregate(new AggregateFunction<Event, HashSet<String>, Long>() {
@Override
public HashSet<String> createAccumulator() {
return new HashSet<String>();
}
@Override
public HashSet<String> add(Event event, HashSet<String> set) {
set.add(event.userName);
return set;
}
@Override
public Long getResult(HashSet<String> set) {
return Long.valueOf(set.size());
}
@Override
public HashSet<String> merge(HashSet<String> set, HashSet<String> acc1) {
for (String s : acc1) {
set.add(s);
}
return set;
}
}, new ProcessWindowFunction<Long, String, String, TimeWindow>() {
@Override
public void process(String value, Context context, Iterable<Long> iterable, Collector<String> collector) throws Exception {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
String start_window = simpleDateFormat.format(context.window().getStart());
String end_window = simpleDateFormat.format(context.window().getEnd());
Long size = iterable.iterator().next();
collector.collect(start_window + "--" + end_window + "--" + value + "--" + size);
}
})
.print("count");
env.execute("tumble window test");
}
public static class Event{
private String userName;
private String clickUrl;
private String eventTime;
public Event(String userName, String clickUrl, String eventTime) {
this.userName = userName;
this.clickUrl = clickUrl;
this.eventTime = eventTime;
}
@Override
public String toString() {
return "Event{" +
"userName='" + userName + '\'' +
", clickUrl='" + clickUrl + '\'' +
", eventTime='" + eventTime + '\'' +
'}';
}
}
}
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 18:10:08'}
map source> Event{userName='ccc', clickUrl='url1', eventTime='2022-07-06 18:10:12'}
map source> Event{userName='ccc', clickUrl='url1', eventTime='2022-07-06 18:10:15'}
map source> Event{userName='ccc', clickUrl='url2', eventTime='2022-07-06 18:10:22'}
count> 2022-07-06 18:10:00--2022-07-06 18:10:20--url1--2
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 18:10:27'}
map source> Event{userName='aaa', clickUrl='url2', eventTime='2022-07-06 18:10:33'}
map source> Event{userName='ccc', clickUrl='url1', eventTime='2022-07-06 18:10:35'}
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 18:10:37'}
map source> Event{userName='ccc', clickUrl='url2', eventTime='2022-07-06 18:10:42'}
count> 2022-07-06 18:10:20--2022-07-06 18:10:40--url2--2
count> 2022-07-06 18:10:20--2022-07-06 18:10:40--url1--2
map source> Event{userName='ccc', clickUrl='url2', eventTime='2022-07-06 18:10:48'}
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 18:10:49'}
map source> Event{userName='ccc', clickUrl='url2', eventTime='2022-07-06 18:10:57'}
map source> Event{userName='ccc', clickUrl='url1', eventTime='2022-07-06 18:11:03'}
map source> Event{userName='aaa', clickUrl='url1', eventTime='2022-07-06 18:11:03'}
count> 2022-07-06 18:10:40--2022-07-06 18:11:00--url2--1
count> 2022-07-06 18:10:40--2022-07-06 18:11:00--url1--1