Traditional SQL is a query language for static data, whereas modern real-time businesses need to analyze dynamic data streams as they arrive. Flink SQL was created to fill this gap: its core strength is that developers can build real-time ETL, complex event processing (CEP), real-time reporting, and similar workloads without hand-writing any of the complex state-management code.
Flink SQL's core abstraction is the dynamic table, a table that changes over time. Unlike a traditional database table, a dynamic table is continuously updated through INSERT, UPDATE, and DELETE operations. For example:
```sql
-- Map a Kafka stream to a dynamic table
CREATE TABLE user_behavior (
    user_id BIGINT,
    item_id BIGINT,
    action STRING,
    ts TIMESTAMP(3),
    WATERMARK FOR ts AS ts - INTERVAL '5' SECOND
) WITH (
    'connector' = 'kafka',
    'topic' = 'user_behavior',
    'properties.bootstrap.servers' = 'localhost:9092',  -- assumed broker address
    'scan.startup.mode' = 'earliest-offset',
    'format' = 'json'  -- assumed serialization format
);
```
The WATERMARK clause declares ts as the event-time attribute and tolerates up to 5 seconds of out-of-order data. With the table in place, a plain GROUP BY over a tumbling window computes per-window aggregates:

```sql
-- Count clicks per item in 5-minute tumbling windows (event time)
SELECT
    item_id,
    TUMBLE_START(ts, INTERVAL '5' MINUTE) AS window_start,
    COUNT(*) AS clicks
FROM user_behavior
WHERE action = 'click'
GROUP BY
    item_id,
    TUMBLE(ts, INTERVAL '5' MINUTE);
```
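Since Flink 1.13 the same query can also be written with a windowing table-valued function, which is the recommended syntax going forward; a sketch over the same table:

```sql
-- Equivalent aggregation using the TUMBLE window TVF
SELECT
    item_id,
    window_start,
    COUNT(*) AS clicks
FROM TABLE(
    TUMBLE(TABLE user_behavior, DESCRIPTOR(ts), INTERVAL '5' MINUTES))
WHERE action = 'click'
GROUP BY item_id, window_start, window_end;
```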
Queries can be tried out interactively: start the client with sql-client.sh and submit jobs straight from the SQL client. For an end-to-end pipeline, define a result table and write into it:

```sql
-- Create a MySQL result table
CREATE TABLE item_clicks (
    item_id BIGINT,
    window_start TIMESTAMP(3),
    clicks BIGINT,
    PRIMARY KEY (item_id, window_start) NOT ENFORCED  -- the key must include the window
) WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://localhost:3306/flink',
    'table-name' = 'item_clicks'
);
```
```sql
-- Write the aggregated results to MySQL
INSERT INTO item_clicks
SELECT
    item_id,
    TUMBLE_START(ts, INTERVAL '5' MINUTE) AS window_start,
    COUNT(*) AS clicks
FROM user_behavior
WHERE action = 'click'
GROUP BY
    item_id,
    TUMBLE(ts, INTERVAL '5' MINUTE);
```
Flink SQL ships with a rich set of built-in functions: string functions (SUBSTRING), time functions (DATE_FORMAT), aggregate functions (SUM), and many more. When the built-ins are not enough, you can implement a user-defined function (UDF):

```java
import org.apache.flink.table.functions.ScalarFunction;

// UDF: extract the domain part of a URL, e.g. "https://example.com/a" -> "example.com"
public class ExtractDomain extends ScalarFunction {
    public String eval(String url) {
        if (url == null || !url.contains("//")) {
            return null;  // guard against malformed input
        }
        return url.split("//")[1].split("/")[0];
    }
}
```

Register it on the table environment so it can be called from SQL:

```java
// Register the function under the name "extract_domain"
tEnv.createTemporarySystemFunction("extract_domain", ExtractDomain.class);
```
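Once registered, the UDF can be invoked like any built-in function; the page_views table and its url column here are hypothetical:

```sql
-- Page views per domain
SELECT extract_domain(url) AS domain, COUNT(*) AS pv
FROM page_views
GROUP BY extract_domain(url);
```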
Fault tolerance comes from Flink's checkpointing mechanism, which can be configured from the SQL client:

```sql
-- Checkpoint settings (must take effect in the Flink configuration)
SET 'execution.checkpointing.interval' = '1min';
SET 'execution.checkpointing.timeout' = '3min';
```
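Two related settings often accompany checkpointing; choosing the RocksDB backend and exactly-once mode here is an assumption, not something the section prescribes:

```sql
-- Keep large state on disk and checkpoint with exactly-once semantics
SET 'state.backend' = 'rocksdb';
SET 'execution.checkpointing.mode' = 'EXACTLY_ONCE';
```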
When a real-time stream must be enriched against an external dimension table (e.g., one in MySQL), use a Lookup Join or a Temporal Table join.
```sql
-- Define the exchange-rate dimension table (versioned by update_time)
CREATE TABLE currency_rates (
    currency STRING,
    rate DECIMAL(10, 4),
    update_time TIMESTAMP(3),
    WATERMARK FOR update_time AS update_time - INTERVAL '30' SECOND
) WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://localhost:3306/finance',
    'table-name' = 'currency_rates'
);

-- Declare a versioned view over the dimension table: deduplicating on the
-- key (currency) ordered by update_time yields a temporal table
CREATE TEMPORARY VIEW rates AS
SELECT currency, rate, update_time
FROM (
    SELECT *,
        ROW_NUMBER() OVER (PARTITION BY currency ORDER BY update_time DESC) AS row_num
    FROM currency_rates
)
WHERE row_num = 1;
```
```sql
-- Join the stream against the versioned dimension table
SELECT
    o.order_id,
    o.amount * r.rate AS amount_usd
FROM orders AS o
JOIN rates FOR SYSTEM_TIME AS OF o.order_time AS r
ON o.currency = r.currency;
```
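The Lookup Join alternative mentioned above instead queries the external table per record using a processing-time attribute; this sketch assumes orders declares a column proc_time AS PROCTIME():

```sql
-- Lookup join: fetch the current rate from MySQL at processing time
SELECT
    o.order_id,
    o.amount * r.rate AS amount_usd
FROM orders AS o
JOIN currency_rates FOR SYSTEM_TIME AS OF o.proc_time AS r
ON o.currency = r.currency;
```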
Pattern matching over streams is expressed with MATCH_RECOGNIZE, for example to detect three consecutive failed logins:
```sql
SELECT *
FROM user_login_events
MATCH_RECOGNIZE (
    PARTITION BY user_id
    ORDER BY event_time
    MEASURES
        START_ROW.event_time AS start_time,
        LAST(FAIL.event_time) AS end_time,
        COUNT(FAIL.*) AS failures
    ONE ROW PER MATCH
    AFTER MATCH SKIP TO LAST FAIL
    PATTERN (START_ROW FAIL{3})
    DEFINE
        FAIL AS FAIL.action = 'login_failed'
);
```
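The query assumes a user_login_events stream; a hypothetical DDL that would back it:

```sql
CREATE TABLE user_login_events (
    user_id BIGINT,
    action STRING,
    event_time TIMESTAMP(3),
    WATERMARK FOR event_time AS event_time - INTERVAL '5' SECOND
) WITH (
    'connector' = 'kafka',
    'topic' = 'user_login_events',
    'properties.bootstrap.servers' = 'localhost:9092',
    'format' = 'json'
);
```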
Change data capture: a Debezium-based connector captures MySQL changelogs (CDC) and syncs them to a Hudi data lake in real time.
```sql
-- Create a MySQL CDC source table
CREATE TABLE orders_cdc (
    id BIGINT,
    amount DECIMAL(10, 2),
    status STRING,
    update_time TIMESTAMP(3),
    PRIMARY KEY (id) NOT ENFORCED
) WITH (
    'connector' = 'mysql-cdc',
    'hostname' = 'localhost',
    'port' = '3306',
    'username' = 'flink',   -- placeholder credentials; the connector requires them
    'password' = '******',
    'database-name' = 'mydb',
    'table-name' = 'orders'
);
```
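The target hudi_orders table is not defined in the source; a minimal sketch using the Hudi Flink connector, with the storage path and table type as assumptions:

```sql
CREATE TABLE hudi_orders (
    id BIGINT,
    amount DECIMAL(10, 2),
    status STRING,
    update_time TIMESTAMP(3),
    PRIMARY KEY (id) NOT ENFORCED
) WITH (
    'connector' = 'hudi',
    'path' = 'hdfs:///data/hudi/orders',  -- assumed storage path
    'table.type' = 'MERGE_ON_READ'        -- upsert-friendly table type
);
```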
```sql
-- Stream the changelog into the Hudi table
INSERT INTO hudi_orders
SELECT
    id,
    amount,
    status,
    update_time
FROM orders_cdc;
```
For throughput tuning, start with the job's parallelism:

```sql
-- Set the default job parallelism
SET 'parallelism.default' = '8';
```
Unbounded aggregations keep per-key state forever unless a state TTL is configured to clean up expired entries (shown in the sketch below). Mini-batch aggregation trades a little latency for far fewer state-backend accesses:

```sql
-- Enable mini-batch aggregation (set in the job configuration)
SET 'table.exec.mini-batch.enabled' = 'true';
SET 'table.exec.mini-batch.allow-latency' = '5s';
SET 'table.exec.mini-batch.size' = '1000';
```
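A minimal sketch of the state-TTL setting mentioned above; the one-hour value is an assumption to adapt per job:

```sql
-- Expire idle per-key state after one hour
SET 'table.exec.state.ttl' = '1h';
```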
Common pitfalls and their mitigations:

- Out-of-order data: tune the WATERMARK delay and allowedLateness; a CUMULATE window can often replace a TUMBLE window to soften the impact of late events.
- Dimension-table join performance: enable the lookup cache (lookup.cache.max-rows, lookup.cache.ttl); see the sketch after this list.
- State bloat: configure a state TTL and tune RocksDB compaction (the state.backend.rocksdb.compaction.level.* options).
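A sketch of a JDBC dimension table with the lookup cache enabled; the table name and cache sizes are assumptions:

```sql
CREATE TABLE dim_rates (
    currency STRING,
    rate DECIMAL(10, 4),
    PRIMARY KEY (currency) NOT ENFORCED
) WITH (
    'connector' = 'jdbc',
    'url' = 'jdbc:mysql://localhost:3306/finance',
    'table-name' = 'currency_rates',
    'lookup.cache.max-rows' = '10000',  -- cache at most 10k rows
    'lookup.cache.ttl' = '10min'        -- refresh cached entries every 10 minutes
);
```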