目录
1. 增量聚合函数(incremental aggregation functions)
(1)归约函数(ReduceFunction)
(2)聚合函数(AggregateFunction)
2. 全窗口函数(full window functions)
(1)窗口函数(WindowFunction)
(2)处理窗口函数(ProcessWindowFunction)
3. 增量聚合和全窗口函数的结合使用
4. 窗口的生命周期
1. 窗口的创建
2. 窗口计算的触发
3. 窗口的销毁
4. 窗口 API 调用总结
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

import java.time.Duration;
public class WindowReduceExample {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env =
StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 从自定义数据源读取数据,并提取时间戳、生成水位线
SingleOutputStreamOperator stream = env.addSource(new
ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.forBoun
dedOutOfOrderness(Duration.ZERO)
.withTimestampAssigner(new SerializableTimestampAssigner()
{
@Override
public long extractTimestamp(Event element, long recordTimestamp)
{
return element.timestamp;
}
})); stream.map(new MapFunction>() {
@Override
public Tuple2 map(Event value) throws Exception {
// 将数据转换成二元组,方便计算
return Tuple2.of(value.user, 1L);
}
})
.keyBy(r -> r.f0)
// 设置滚动事件时间窗口
.window(TumblingEventTimeWindows.of(Time.seconds(5)))
.reduce(new ReduceFunction>() {
@Override
public Tuple2 reduce(Tuple2 value1,
Tuple2 value2) throws Exception {
// 定义累加规则,窗口闭合时,向下游发送累加结果
return Tuple2.of(value1.f0, value1.f1 + value2.f1);
}
})
.print();
env.execute();
}
}
public interface AggregateFunction extends Function, Serializable
{
ACC createAccumulator();
ACC add(IN value, ACC accumulator);
OUT getResult(ACC accumulator);
155
ACC merge(ACC a, ACC b);
}
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import
org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import java.util.HashSet;
public class WindowAggregateFunctionExample {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env =
StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator stream = env.addSource(new
ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.forMono
tonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner()
{
@Override
public long extractTimestamp(Event element, long recordTimestamp)
{
return element.timestamp;
}
}));
// 所有数据设置相同的 key,发送到同一个分区统计 PV 和 UV,再相除
stream.keyBy(data -> true)
.window(SlidingEventTimeWindows.of(Time.seconds(10),
Time.seconds(2)))
.aggregate(new AvgPv())
.print();
env.execute();
}
public static class AvgPv implements AggregateFunction, Long>, Double> {
@Override
public Tuple2, Long> createAccumulator() {
// 创建累加器
return Tuple2.of(new HashSet(), 0L);
}
@Override
public Tuple2, Long> add(Event value,
Tuple2, Long> accumulator) {
// 属于本窗口的数据来一条累加一次,并返回累加器
accumulator.f0.add(value.user);
return Tuple2.of(accumulator.f0, accumulator.f1 + 1L);
}
@Override
public Double getResult(Tuple2, Long> accumulator) {
// 窗口闭合时,增量聚合结束,将计算结果发送到下游
return (double) accumulator.f1 / accumulator.f0.size();
}
@Override
public Tuple2, Long> merge(Tuple2, Long>
a, Tuple2, Long> b) {
return null;
}
}
}
// Usage skeleton only (not compilable as-is): real code passes a key
// selector to keyBy(), a window assigner to window(), and MyWindowFunction
// is a user class implementing WindowFunction.
stream
.keyBy()
.window()
.apply(new MyWindowFunction());
/**
 * Flink's full-window function interface (reproduced here for reference).
 *
 * @param <IN>  type of the buffered input elements
 * @param <OUT> type of the emitted results
 * @param <KEY> type of the key
 * @param <W>   type of the window
 */
public interface WindowFunction<IN, OUT, KEY, W extends Window> extends Function, Serializable {
    // Invoked once per key and window with ALL buffered elements when the window fires.
    void apply(KEY key, W window, Iterable<IN> input, Collector<OUT> out) throws Exception;
}
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import
org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import
org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.HashSet;
public class UvCountByWindowExample {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env =
StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator stream = env.addSource(new
ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.forBound
edOutOfOrderness(Duration.ZERO)
.withTimestampAssigner(new
SerializableTimestampAssigner() {
@Override
public long extractTimestamp(Event element, long
recordTimestamp) {
return element.timestamp;
}
}));
// 将数据全部发往同一分区,按窗口统计 UV
stream.keyBy(data -> true)
.window(TumblingEventTimeWindows.of(Time.seconds(10)))
.process(new UvCountByWindow())
.print();
env.execute();
}
// 自定义窗口处理函数
public static class UvCountByWindow extends ProcessWindowFunction{
@Override
public void process(Boolean aBoolean, Context context, Iterable
elements, Collector out) throws Exception {
HashSet userSet = new HashSet<>();
// 遍历所有数据,放到 Set 里去重
for (Event event: elements){
userSet.add(event.user);
}
// 结合窗口信息,包装输出内容
Long start = context.window().getStart();
Long end = context.window().getEnd();
out.collect(" 窗 口 : " + new Timestamp(start) + " ~ " + new
Timestamp(end)
+ " 的独立访客数量是:" + userSet.size());
}
}
}
// Signatures of WindowedStream<T, K, W> methods that combine an incremental
// aggregate with a full-window function (generic parameters restored).

// ReduceFunction combined with WindowFunction
public <R> SingleOutputStreamOperator<R> reduce(
        ReduceFunction<T> reduceFunction, WindowFunction<T, R, K, W> function)

// ReduceFunction combined with ProcessWindowFunction
public <R> SingleOutputStreamOperator<R> reduce(
        ReduceFunction<T> reduceFunction, ProcessWindowFunction<T, R, K, W> function)

// AggregateFunction combined with WindowFunction
public <ACC, V, R> SingleOutputStreamOperator<R> aggregate(
        AggregateFunction<T, ACC, V> aggFunction, WindowFunction<V, R, K, W> windowFunction)

// AggregateFunction combined with ProcessWindowFunction
public <ACC, V, R> SingleOutputStreamOperator<R> aggregate(
        AggregateFunction<T, ACC, V> aggFunction,
        ProcessWindowFunction<V, R, K, W> windowFunction)
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import
org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import
org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
public class UrlViewCountExample {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env =
StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator stream = env.addSource(new
ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.forMonot
onousTimestamps()
.withTimestampAssigner(new
SerializableTimestampAssigner() {
@Override
public long extractTimestamp(Event element, long
recordTimestamp) {
return element.timestamp;
}
}));
// 需要按照 url 分组,开滑动窗口统计
stream.keyBy(data -> data.url)
.window(SlidingEventTimeWindows.of(Time.seconds(10),
Time.seconds(5)))
// 同时传入增量聚合函数和全窗口函数
.aggregate(new UrlViewCountAgg(), new UrlViewCountResult())
.print();
env.execute();
}
// 自定义增量聚合函数,来一条数据就加一
public static class UrlViewCountAgg implements AggregateFunction {
@Override
public Long createAccumulator() {
return 0L;
}
@Override
public Long add(Event value, Long accumulator) {
return accumulator + 1;
}
@Override
public Long getResult(Long accumulator) {
return accumulator;
}
@Override
public Long merge(Long a, Long b) {
return null;
}
}
// 自定义窗口处理函数,只需要包装窗口信息
public static class UrlViewCountResult extends ProcessWindowFunction {
@Override
public void process(String url, Context context, Iterable elements,
Collector out) throws Exception {
// 结合窗口信息,包装输出内容
Long start = context.window().getStart();
Long end = context.window().getEnd();
// 迭代器中只有一个元素,就是增量聚合函数的计算结果
out.collect(new UrlViewCount(url, elements.iterator().next(), start,
end));
}
}
}
import java.sql.Timestamp;
/**
 * Result POJO carrying a url's view count together with the window bounds.
 * Public fields and the no-arg constructor keep it a valid Flink POJO type.
 */
public class UrlViewCount {
    public String url;
    public Long count;
    public Long windowStart;
    public Long windowEnd;

    /** No-arg constructor required for Flink POJO serialization. */
    public UrlViewCount() {
    }

    /** Convenience constructor populating all fields. */
    public UrlViewCount(String url, Long count, Long windowStart, Long windowEnd) {
        this.url = url;
        this.count = count;
        this.windowStart = windowStart;
        this.windowEnd = windowEnd;
    }

    @Override
    public String toString() {
        // Window bounds are rendered as human-readable timestamps.
        StringBuilder sb = new StringBuilder("UrlViewCount{");
        sb.append("url='").append(url).append('\'');
        sb.append(", count=").append(count);
        sb.append(", windowStart=").append(new Timestamp(windowStart));
        sb.append(", windowEnd=").append(new Timestamp(windowEnd));
        sb.append('}');
        return sb.toString();
    }
}