基于时间的操作(比如 Table API 和 SQL 中的窗口操作),需要定义相关的时间语义和时间数据来源的信息。所以,Table 可以提供一个逻辑上的时间字段,用于在表处理程序中指示时间、访问相应的时间戳。
// Build a DataStream of sensor case-class records from the text source.
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
val inputStream: DataStream[String] = env.readTextFile("sensor.txt")
val ds: DataStream[_02_SensorReading] = inputStream.map { line =>
  val fields = line.split(",")
  _02_SensorReading(fields(0), fields(1).toLong, fields(2).toDouble)
}
// Set up the table execution environment (Blink planner, streaming mode).
val tabEnvSettings: EnvironmentSettings =
  EnvironmentSettings.newInstance()
    .useBlinkPlanner()
    .inStreamingMode()
    .build()
val tabEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, tabEnvSettings)
// 1) Convert the DataStream into a Table, appending 'pt as a processing-time attribute.
val table: Table = tabEnv.fromDataStream(ds, 'id, 'timestamp, 'temperature, 'pt.proctime)
// 2) Declare the processing-time attribute while defining the Table schema.
tabEnv
  .connect(new FileSystem().path("sensor.txt"))
  .withFormat(new Csv())
  .withSchema(
    new Schema()
      .field("id", DataTypes.STRING())
      .field("timestamp", DataTypes.BIGINT())
      .field("temperature", DataTypes.DOUBLE())
      // Mark "pt" as the table's processing-time attribute.
      .field("pt", DataTypes.TIMESTAMP(3)).proctime()
  ) // table schema
  .createTemporaryTable("inputTable") // register as a temporary table
// 3) Declare the processing-time attribute in a CREATE TABLE DDL.
// NOTE: running this DDL requires the Blink planner.
// Fix: the original DDL was missing the closing ')' of the WITH clause,
// so the statement could never be parsed.
val sinkDDL: String =
  """
    |create table dataTable (
    |  id varchar(20) not null,
    |  ts bigint,
    |  temperature double,
    |  pt AS PROCTIME()
    |) with (
    |  'connector.type' = 'filesystem',
    |  'connector.path' = 'file:///sensor.txt',
    |  'format.type' = 'csv'
    |)
    |""".stripMargin
tabEnv.sqlUpdate(sinkDDL) // execute the DDL
为了处理无序事件,并区分流中的准时和迟到事件,Flink 需要从事件数据中提取时间戳,并用来推进事件时间的进展(watermark)。
// 1) Declare the event-time attribute when converting a DataStream to a Table.
// NOTE: timestamps and watermarks must already be assigned on the DataStream.
// There are two cases, depending on whether the .rowtime field name already exists.
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
val inputStream: DataStream[String] = env.readTextFile("sensor.txt")
val ds: DataStream[_02_SensorReading] = inputStream
  .map { line =>
    val fields = line.split(",")
    _02_SensorReading(fields(0), fields(1).toLong, fields(2).toDouble)
  }
  .assignAscendingTimestamps(_.timestamp * 1000L) // seconds -> milliseconds
val tabSettings: EnvironmentSettings =
  EnvironmentSettings.newInstance()
    .useBlinkPlanner()
    .inStreamingMode()
    .build()
val tabEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, tabSettings)
// Case 1: replace the existing 'timestamp field with a rowtime attribute.
val table1: Table = tabEnv.fromDataStream(ds, 'id, 'timestamp.rowtime, 'temperature)
// Case 2: append a brand-new rowtime field 'rt.
val table2: Table = tabEnv.fromDataStream(ds, 'id, 'timestamp, 'temperature, 'rt.rowtime)
// 2) Declare the event-time attribute while defining the Table schema.
tabEnv
  .connect(new FileSystem().path(""))
  .withFormat(new Csv())
  .withSchema(
    new Schema()
      .field("id", DataTypes.STRING())
      .field("timestamp", DataTypes.BIGINT())
      .field("temperature", DataTypes.DOUBLE())
      .rowtime(
        new Rowtime()
          .timestampsFromField("timestamp") // extract the event timestamp from this field
          .watermarksPeriodicBounded(1000)  // watermark lags behind by 1 second
      )
  ) // table schema
  .createTemporaryTable("inputTable")
// 3) Declare the event-time attribute in a CREATE TABLE DDL:
// a computed column "rt" derived from the epoch-seconds field "ts",
// plus a WATERMARK clause with a 1-second delay.
val rowtimeDDL: String =
  """
    |create table dataTable (
    | id varchar(20) not null,
    | ts bigint,
    | temperature double,
    | rt AS TO_TIMESTAMP( FROM_UNIXTIME(ts) ),
    | watermark for rt as rt - interval '1' second
    |) with ( 'connector.type' = 'filesystem',
    | 'connector.path' = 'file:///D:\\..\\sensor.txt',
    | 'format.type' = 'csv'
    |)
  """.stripMargin
tabEnv.sqlUpdate(rowtimeDDL) // execute the DDL
/*
这里 FROM_UNIXTIME 是系统内置的时间函数,用来将一个整数(秒数)转换成“YYYY-MM-DD hh:mm:ss” 格式(默认,也可以作为第二个 String 参数传入)的日期时间
字符串(date time string);然后再用 TO_TIMESTAMP 将其转换成 Timestamp。
*/
CREATE TABLE user_actions (
user_name STRING,
data STRING,
user_action_time TIMESTAMP(3),
-- Declare user_action_time as the event-time column, and define the watermark
-- generation rule: the watermark trails user_action_time by 5 seconds.
-- An event-time column must be of type TIMESTAMP or TIMESTAMP_LTZ.
WATERMARK FOR user_action_time AS user_action_time - INTERVAL '5' SECOND
) WITH (
...
);
SELECT
TUMBLE_START(user_action_time, INTERVAL '10' MINUTE),
COUNT(DISTINCT user_name)
FROM user_actions
-- user_action_time can now be used inside window operators
GROUP BY TUMBLE(user_action_time, INTERVAL '10' MINUTE);
如果想使用事件时间,那么我们的时间戳类型必须是 TIMESTAMP 或者 TIMESTAMP_LTZ 类型。很多小伙伴会想到,我们的时间戳一般不都是秒或者是毫秒(BIGINT 类型)嘛,那这种情况怎么办?
解决方案如下:
CREATE TABLE user_actions (
user_name STRING,
data STRING,
-- 1. ts is an ordinary epoch timestamp in milliseconds (BIGINT)
ts BIGINT,
-- 2. Convert the millisecond timestamp into a TIMESTAMP_LTZ(3) computed column
time_ltz AS TO_TIMESTAMP_LTZ(ts, 3),
-- 3. Declare time_ltz as the event-time column, with the watermark
--    trailing it by 5 seconds.
-- An event-time column must be of type TIMESTAMP or TIMESTAMP_LTZ.
WATERMARK FOR time_ltz AS time_ltz - INTERVAL '5' SECOND
) WITH (
...
);
SELECT
TUMBLE_START(time_ltz, INTERVAL '10' MINUTE),
COUNT(DISTINCT user_name)
FROM user_actions
GROUP BY
TUMBLE(time_ltz, INTERVAL '10' MINUTE);
Table 和 DataStream 可以互转,那么 Flink 也提供了一个能力,就是在 DataStream 转为 Table 时,指定时间戳字段。
package com.yyds.flink_sql;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import java.time.Duration;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Declares the event-time attribute when converting a DataStream to a Table.
 */
public class _10_EventTime_DF {
    public static void main(String[] args) throws Exception {
        // Local environment with the web UI served on port 9091.
        Configuration conf = new Configuration();
        conf.setString("rest.port","9091");
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        env.setParallelism(1);
        // Table environment in streaming mode.
        EnvironmentSettings settings = EnvironmentSettings
                .newInstance()
                .inStreamingMode()
                .build();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, settings);
        // Read rows from the custom source.
        DataStreamSource<Row> source = env.addSource(new _10_MySourceFunction());
        // New-style watermark assignment: field f2 carries the event timestamp,
        // with zero tolerated out-of-orderness.
        SingleOutputStreamOperator<Row> withTimestamps = source.assignTimestampsAndWatermarks(
                WatermarkStrategy.<Row>forBoundedOutOfOrderness(Duration.ZERO)
                        .withTimestampAssigner(new SerializableTimestampAssigner<Row>() {
                            @Override
                            public long extractTimestamp(Row row, long previousTimestamp) {
                                return (long) row.getField("f2");
                            }
                        })
        );
        // Expose f2 as the table's rowtime (event-time) attribute.
        Table sourceTable = tEnv.fromDataStream(withTimestamps, $("f0"), $("f1"), $("f2").rowtime());
        tEnv.createTemporaryView("source_table", sourceTable);
        // Use f2 inside a 5-second tumbling window.
        String tumbleWindowSql =
                "select\n" +
                " tumble_start(f2,interval '5' second ),\n" +
                " count(distinct f0)\n" +
                "from source_table\n" +
                "group by tumble(f2,interval '5' second )";
        Table resTable = tEnv.sqlQuery(tumbleWindowSql);
        tEnv.toDataStream(resTable, Row.class).print();
        env.execute("_10_EventTime_DF");
    }
}
CREATE TABLE user_actions (
user_name STRING,
data STRING,
-- Declare user_action_time as a processing-time column via PROCTIME()
user_action_time AS PROCTIME()
) WITH (
...
);
SELECT
TUMBLE_START(user_action_time, INTERVAL '10' MINUTE),
COUNT(DISTINCT user_name)
FROM user_actions
-- user_action_time can now be used inside window operators
GROUP BY TUMBLE(user_action_time, INTERVAL '10' MINUTE);
/**
 * Declares the processing-time attribute when converting a DataStream to a Table.
 */
public class _10_ProcTime_DF {
    public static void main(String[] args) throws Exception {
        // Local environment with the web UI served on port 9091.
        Configuration conf = new Configuration();
        conf.setString("rest.port","9091");
        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        env.setParallelism(1);
        // Table environment in streaming mode.
        EnvironmentSettings settings = EnvironmentSettings
                .newInstance()
                .inStreamingMode()
                .build();
        StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, settings);
        // Read rows from the custom source.
        DataStreamSource<Row> source = env.addSource(new _10_MySourceFunction());
        // Old API: tEnv.fromDataStream(source, "f0,f1,f2,proctime.proctime");
        // New API: append "proctime" as the table's processing-time attribute.
        Table sourceTable = tEnv.fromDataStream(source, $("f0"), $("f1"), $("f2"), $("proctime").proctime());
        tEnv.createTemporaryView("source_table", sourceTable);
        // Use proctime inside a 5-second tumbling window.
        String tumbleWindowSql =
                "select\n" +
                " tumble_start(proctime,interval '5' second ),\n" +
                " count(distinct f0)\n" +
                "from source_table\n" +
                "group by tumble(proctime,interval '5' second )";
        Table resTable = tEnv.sqlQuery(tumbleWindowSql);
        tEnv.toDataStream(resTable, Row.class).print();
        env.execute("_10_ProcTime_DF");
    }
}