flink版本:1.13.1
scala版本:2.12
<properties>
<flink.version>1.13.1</flink.version>
<scala.version>2.12</scala.version>
</properties>
<dependencies>
<!-- Blink planner: translates and optimizes Table API & SQL programs (default planner in 1.13) -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Bridge between the Java DataStream API and the Table API -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- To implement a custom data format for (de)serialization, add the dependency below -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
</dependency>
</dependencies>
CREATE TABLE EventTable(
    -- `user` is a reserved keyword in Flink SQL (Calcite); it must be escaped with backticks
    `user` STRING,
    url STRING,
    -- event-time column; NOTE: a BIGINT epoch value is NOT converted automatically —
    -- the source must deliver TIMESTAMP(3), or a computed column (e.g. TO_TIMESTAMP_LTZ) is needed
    ts TIMESTAMP(3),
    -- watermark on ts, tolerating 5 seconds of out-of-order data
    WATERMARK FOR ts AS ts - INTERVAL '5' SECOND
) WITH (
    ...
);
说明:
CREATE TABLE events (
    -- `user` is a reserved keyword in Flink SQL (Calcite); it must be escaped with backticks
    `user` STRING,
    url STRING,
    -- raw event timestamp in epoch milliseconds
    ts BIGINT,
    -- computed column: convert epoch millis to TIMESTAMP_LTZ(3) for use as event time
    ts_ltz AS TO_TIMESTAMP_LTZ(ts, 3),
    -- watermark with 5 seconds out-of-orderness
    -- (fixed: the original referenced an undefined column `time_ltz`)
    WATERMARK FOR ts_ltz AS ts_ltz - INTERVAL '5' SECOND
) WITH (
    ...
);
说明:
// Set up the streaming environment (parallelism 1) and the bridging table environment.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
// Source stream: take each record's timestamp from the Event and emit
// bounded-out-of-orderness watermarks with zero tolerance.
SingleOutputStreamOperator<Event> dataStream = env.addSource(new ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.getTimestamp();
}
}));
// Convert the stream into a Table: rename "timestamp" to "ts", append "et" as an
// event-time (rowtime) attribute and "ps" as a processing-time attribute.
Table table = tableEnv.fromDataStream(dataStream, $("user"), $("url"), $("timestamp").as("ts"),
$("et").rowtime(), $("ps").proctime());
说明
CREATE TABLE EventTable(
    -- `user` is a reserved keyword in Flink SQL (Calcite); it must be escaped with backticks
    `user` STRING,
    url STRING,
    -- computed column: declare a processing-time attribute
    ts AS PROCTIME()
) WITH (
    ...
);
DataStream<Tuple2<String, String>> stream = ...;
// Declare an extra field as the processing-time attribute: $("ts").proctime() uses the system processing time
Table table = tEnv.fromDataStream(stream, $("user"), $("url"), $("ts").proctime());
在 Flink 1.12 之前的版本中,Table API 和 SQL 提供了一组“分组窗口”(Group Window)函数,常用的时间窗口如滚动窗口、滑动窗口、会话窗口都有对应的实现;具体在 SQL 中就是调用 TUMBLE()、HOP()、SESSION(),传入时间属性字段、窗口大小等参数就可以了。
// Legacy (pre-1.13) group-window aggregation: count clicks per user per 1-hour tumbling window.
// NOTE(review): "user" is a reserved keyword in Flink SQL (Calcite); it likely needs
// backtick escaping (`user`) — verify against the target Flink version.
Table result = tableEnv.sqlQuery(
"SELECT " +
"user, " +
"TUMBLE_END(ts, INTERVAL '1' HOUR) as endT, " +
"COUNT(url) AS cnt " +
"FROM EventTable " +
"GROUP BY " + // group by both the window and the user name
"user, " +
"TUMBLE(ts, INTERVAL '1' HOUR)" // define a 1-hour tumbling window
);
这里定义了 1 小时的滚动窗口,将窗口和用户 user 一起作为分组的字段。用聚合函数COUNT()对分组数据的个数进行了聚合统计,并将结果字段重命名为cnt;用TUMBLE_END()函数获取滚动窗口的结束时间,重命名为 endT 提取出来。
从 1.13 版本开始,Flink 开始使用窗口表值函数(Windowing table-valued functions,Windowing TVFs)来定义窗口。窗口表值函数是 Flink 定义的多态表函数(PTF),可以将表进行扩展后返回。表函数(table function)可以看作是返回一个表的函数
// Set up the streaming environment (parallelism 1) and the table environment.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
TableConfig config = tableEnv.getConfig();
// NOTE(review): 60 *milliseconds* of idle-state retention looks far too short to be useful —
// presumably Duration.ofMinutes(60) was intended; confirm.
config.setIdleStateRetention(Duration.ofMillis(60));
// Source stream with per-record event timestamps and zero-tolerance out-of-orderness watermarks.
SingleOutputStreamOperator<Event> dataStream = env.addSource(new ClickSource()).assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.getTimestamp();
}
}));
// 1. Register a temporary view; "ts" is declared as the rowtime (event-time) attribute.
tableEnv.createTemporaryView("table_click", dataStream, $("user"), $("ts").rowtime());
// NOTE(review): "user" is a reserved keyword in Flink SQL (Calcite); the queries below
// may need it escaped as `user` — verify against the target Flink version.
// 2. Window aggregation, legacy group-window syntax — tumbling window
Table agg = tableEnv.sqlQuery("select " +
" user," +
" count(1) AS ct," +
" TUMBLE_END(ts,INTERVAL '10' SECOND) AS endTime" +
" from table_click " +
" group by user, TUMBLE(ts, INTERVAL '10' SECOND)"); // tumbling window, 10-second size
// 3. Window aggregation, windowing TVF — tumbling window
Table tvfTumbleAgg = tableEnv.sqlQuery("select " +
" user," +
" count(1) AS ct," +
" window_start," +
" window_end" +
" from TABLE(" + // tumbling window TVF: arg1 = source table, arg2 = time-attribute descriptor, arg3 = window size
" TUMBLE(table table_click, DESCRIPTOR(ts), INTERVAL '10' SECOND))" +
" GROUP BY user, window_start, window_end"); // GROUP BY window_start, window_end is the required fixed pattern
// 4. Window aggregation, windowing TVF — hop (sliding) window
Table tvfHopAgg = tableEnv.sqlQuery("select " +
" user," +
" count(1) AS ct," +
" window_start," +
" window_end" +
" from TABLE(" + // hop window TVF: arg1 = source table, arg2 = time-attribute descriptor, arg3 = slide, arg4 = window size
" HOP(table table_click, DESCRIPTOR(ts), INTERVAL '5' SECOND,INTERVAL '10' SECOND))" +
" GROUP BY user, window_start, window_end"); // GROUP BY window_start, window_end is the required fixed pattern
// 5. Cumulate window
Table tvfCumulateAgg = tableEnv.sqlQuery("select " +
" CURRENT_TIME as cutTime," +
" user," +
" count(1) AS ct," +
" window_start," +
" window_end" +
" from TABLE(" + // cumulate window TVF: arg1 = source table, arg2 = time-attribute descriptor, arg3 = step (emit interval), arg4 = max window size
" CUMULATE(table table_click, DESCRIPTOR(ts), INTERVAL '5' SECOND,INTERVAL '10' SECOND))" +
" GROUP BY user, window_start, window_end"); // GROUP BY window_start, window_end is the required fixed pattern
dataStream.print("source:");
//tableEnv.toChangelogStream(agg).print("agg:");
//tableEnv.toChangelogStream(tvfTumbleAgg).print("tvfTumbleAgg:");
//tableEnv.toChangelogStream(tvfHopAgg).print("tvfHotAgg:");
tableEnv.toChangelogStream(tvfCumulateAgg).print("tvfCumulateAgg:");
在持续查询的过程中,由于用于分组的 key 可能会不断增加,因此计算结果所需要
维护的状态也会持续增长。为了防止状态无限增长耗尽资源
// Option 1: set idle-state retention via TableConfig
// (fixed: '#' is not a valid Java comment delimiter)
TableEnvironment tableEnv = ...
// Obtain the table environment's configuration
TableConfig tableConfig = tableEnv.getConfig();
// Keep idle state for 60 minutes before it becomes eligible for cleanup
tableConfig.setIdleStateRetention(Duration.ofMinutes(60));
// Option 2: set the equivalent TTL through the low-level Configuration key
TableEnvironment tableEnv = ...
Configuration configuration = tableEnv.getConfig().getConfiguration();
configuration.setString("table.exec.state.ttl", "60 min");
// Windowing-TVF (Flink 1.13+) version of the 1-hour tumbling-window aggregation.
// NOTE(review): "user" is a reserved keyword in Flink SQL (Calcite); it likely needs
// backtick escaping (`user`) — verify against the target Flink version.
Table result = tableEnv.sqlQuery(
"SELECT " +
"user, " +
"window_end AS endT, " +
"COUNT(url) AS cnt " +
"FROM TABLE( " +
"TUMBLE( TABLE EventTable, " +
"DESCRIPTOR(ts), " +
"INTERVAL '1' HOUR)) " +
"GROUP BY user, window_start, window_end "
);
注意:GROUP BY window_start, window_end 是固定写法
Flink SQL 中的开窗函数也是通过 OVER 子句来实现的
<聚合函数> OVER (
[PARTITION BY <字段 1>[, <字段 2>, ...]]
ORDER BY <时间属性字段>
<开窗范围>),
...
FROM ...
-- PRECEDING 指前面几行/前一段时间;CURRENT ROW:指到当前最新的行
BETWEEN ... PRECEDING AND CURRENT ROW
RANGE BETWEEN INTERVAL '1' HOUR PRECEDING AND CURRENT ROW
ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
-- Count each user's clicks in the preceding hour (event-time OVER window).
-- `user` is a reserved keyword in Flink SQL (Calcite) and must be escaped with backticks.
SELECT `user`,
    COUNT(url) OVER (
        PARTITION BY `user`
        ORDER BY ts
        RANGE BETWEEN INTERVAL '1' HOUR PRECEDING AND CURRENT ROW
    ) AS cnt
FROM EventTable
import com.flink.dto.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Window Top-N example: per 1-hour tumbling window, output the 2 users with the
 * most clicks, using a windowing TVF sub-query plus ROW_NUMBER() over the window bounds.
 */
public class WindowTopNExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env =
                StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Read the source data and assign timestamps / generate watermarks
        // (monotonous timestamps: the fixture below is emitted in order).
        SingleOutputStreamOperator<Event> eventStream = env
                .fromElements(
                        new Event("Alice", "./home", 1000L),
                        new Event("Bob", "./cart", 1000L),
                        new Event("Alice", "./prod?id=1", 25 * 60 * 1000L),
                        new Event("Alice", "./prod?id=4", 55 * 60 * 1000L),
                        new Event("Bob", "./prod?id=5", 3600 * 1000L + 60 * 1000L),
                        new Event("Cary", "./home", 3600 * 1000L + 30 * 60 * 1000L),
                        new Event("Cary", "./prod?id=7", 3600 * 1000L + 59 * 60 * 1000L)
                ).assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forMonotonousTimestamps()
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event element, long
                                    recordTimestamp) {
                                return element.getTimestamp();
                            }
                        }));
        // Create the table environment.
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        // Convert the stream to a table, declaring the event-time attribute.
        Table eventTable = tableEnv.fromDataStream(
                eventStream,
                $("user"),
                $("url"),
                $("timestamp").rowtime().as("ts") // use "timestamp" as event time, renamed to "ts"
        );
        // Register the table so it can be referenced from SQL.
        tableEnv.createTemporaryView("EventTable", eventTable);
        // Sub-query: per-window, per-user click counts via a 1-hour tumbling window TVF.
        // Fixed: `user` is a reserved keyword in Flink SQL (Calcite) and must be
        // backtick-escaped, otherwise the query fails to parse / resolves to the USER function.
        String subQuery =
                "SELECT window_start, window_end, `user`, COUNT(url) as cnt " +
                        "FROM TABLE ( " +
                        "TUMBLE( TABLE EventTable, DESCRIPTOR(ts), INTERVAL '1' HOUR )) " + // tumbling window, 1-hour size
                        "GROUP BY window_start, window_end, `user` ";
        // Outer query: Top-N (N = 2) per window via ROW_NUMBER() partitioned by the window bounds.
        String topNQuery =
                "SELECT * " +
                        "FROM (" +
                        "SELECT *, " +
                        "ROW_NUMBER() OVER ( " +
                        "PARTITION BY window_start, window_end " +
                        "ORDER BY cnt desc " +
                        ") AS row_num " +
                        "FROM (" + subQuery + ")) " +
                        "WHERE row_num <= 2";
        // Execute the SQL and print the (append-only) result stream.
        Table result = tableEnv.sqlQuery(topNQuery);
        tableEnv.toDataStream(result).print();
        env.execute();
    }
}