watermark = 进入Flink窗口的最大事件时间(maxEventTime) - 一定的延迟时间(t)
//这个延迟时间t是在程序当中配置的
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.api.java.tuple._
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.table.api.{TableEnvironment, Types}
import org.apache.flink.table.sinks.CsvTableSink
import org.apache.flink.table.sources.CsvTableSource
import org.apache.flink.types.Row
/**
 * Sample record used by the watermark examples.
 *
 * @param id        user identifier (used as the grouping key in the SQL query)
 * @param name      user name
 * @param age       user age (the value summed per window)
 * @param timestamp event time of the record; the surrounding example multiplies it
 *                  by 1000 before handing it to Flink, so it presumably holds epoch seconds
 */
case class User(
  id: Int,
  name: String,
  age: Int,
  timestamp: Long
)
/**
 * Event-time window demo: registers a small in-memory stream of [[User]] records as a
 * table and sums `age` per `id` over 5-minute tumbling windows with Flink SQL.
 */
object SqlTest {

  /** Builds the pipeline, prints the retract stream of the query result and runs the job. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val tableEnv = TableEnvironment.getTableEnvironment(env)

    // Windows are driven by the timestamps carried in the records (event time).
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val users = env.fromElements(
      User(1, "nie", 22, 1511658000),
      User(2, "hu", 20, 1511658000),
      User(2, "xiao", 19, 1511658000)
    )

    // Assign timestamps and watermarks for ascending input; the record value is
    // scaled by 1000 before being used as the event timestamp.
    val timedUsers = users.assignAscendingTimestamps(user => user.timestamp * 1000L)

    // Expose the stream as a table; `event_time` is the rowtime attribute.
    tableEnv.registerDataStream("testTable", timedUsers, 'id, 'name, 'age, 'event_time.rowtime)

    val ageSumPerWindowSql =
      "select id,sum(age) from testTable group by TUMBLE(event_time,INTERVAL '5' MINUTE),id"
    val ageSumPerWindow = tableEnv.sqlQuery(ageSumPerWindowSql)

    ageSumPerWindow.toRetractStream[Row].print()
    env.execute("windowTest")
  }
}
// Variant of the sample stream whose timestamp literals are 1000x larger than in the
// example above, so they are used as event timestamps without any scaling.
val stream = streamEnv
  .fromElements(
    User(1, "nie", 22, 1511658000000L),
    User(2, "hu", 20, 1511658000000L),
    User(2, "xiao", 19, 1511658300000L)
  )
  .assignAscendingTimestamps(user => user.timestamp) // assign timestamps and watermarks
我们知道可以利用水位线处理延迟情况。上面的assignAscendingTimestamps方法针对的是数据有序的场景,无法设定允许延迟时间,也就无法处理数据延迟的情况。下面介绍几种生成水位线的方式
生成水位线分为两步:
自定义生成分为两种:
1、Periodic Watermarks
1)、升序模式
eg:
// Use event time as the stream time characteristic.
streamEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

// Ascending mode: the sample records are listed in non-decreasing timestamp order.
val stream = streamEnv
  .fromElements(
    User(1, "nie", 22, 1511658000),
    User(2, "hu", 20, 1511658000),
    User(2, "xiao", 19, 1511658300)
  )
  // The record value is scaled by 1000 before being used as the event timestamp.
  .assignAscendingTimestamps(user => user.timestamp * 1000L)
2)、使用固定时延间隔的Timestamp Assigner
如下代码所示,通过创建BoundedOutOfOrdernessTimestampExtractor实现类来定义Timestamp Assigner,其中第一个参数Time.seconds(10)代表了最长的时延为10s,第二个为extractTimestamp抽取逻辑,选择样例类User的第四个元素(timestamp字段)作为Timestamps
eg:
/**
 * Sample record for the fixed-delay watermark example.
 *
 * @param id        user identifier
 * @param name      user name
 * @param age       user age
 * @param timestamp event time of the record, in epoch seconds in this example
 */
case class User(id: Int, name: String, age: Int, timestamp: Long)

// Use event time as the stream time characteristic.
streamEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

val stream = streamEnv.fromElements(
  User(1, "nie", 22, 1511658000),
  User(2, "hu", 20, 1511658000),
  User(2, "xiao", 19, 1511658000)
).assignTimestampsAndWatermarks(
  // Periodic watermark assigner that tolerates records arriving up to 10 seconds out of order.
  new BoundedOutOfOrdernessTimestampExtractor[User](Time.seconds(10)) {
    // Flink event timestamps are epoch milliseconds, while the sample values above are
    // epoch seconds; convert here, as the ascending-timestamp examples do with `* 1000L`.
    override def extractTimestamp(t: User): Long = t.timestamp * 1000L
  }
)
eg:
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala.createTypeInformation
import org.apache.flink.api.java.tuple._
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.table.api.scala._
import org.apache.flink.table.api.{TableEnvironment, Types}
import org.apache.flink.table.sinks.CsvTableSink
import org.apache.flink.table.sources.CsvTableSource
import org.apache.flink.types.Row
/**
 * Sample record for the out-of-order window example.
 *
 * @param id        user identifier (grouping key of the SQL query)
 * @param name      user name
 * @param age       user age (summed per window)
 * @param timestamp event time of the record; the example passes it to Flink unchanged
 *                  as the event timestamp
 */
case class User(
  id: Int,
  name: String,
  age: Int,
  timestamp: Long
)
/**
 * Out-of-order demo: sums `age` per `id` over 2-second tumbling event-time windows,
 * with watermarks that lag the largest seen timestamp by 1 second.
 */
object SqlTest {

  /** Builds the pipeline, prints the retract stream of the query result and runs the job. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val tableEnv = TableEnvironment.getTableEnvironment(env)

    // Windows are driven by the timestamps carried in the records (event time).
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // The fourth record carries a smaller timestamp than the third, i.e. it is out of order.
    val users = env.fromElements(
      User(1, "nie", 22, 1511658000000L),
      User(2, "hu", 20, 1511658000000L),
      User(2, "xiao", 19, 1511658003000L),
      User(2, "feng", 31, 1511658002000L)
    )

    // Watermark assigner configured with a 1-second out-of-orderness bound; the record's
    // `timestamp` field is used unchanged as the event timestamp.
    val boundedLatenessAssigner =
      new BoundedOutOfOrdernessTimestampExtractor[User](Time.seconds(1)) {
        override def extractTimestamp(user: User): Long = user.timestamp
      }
    val timedUsers = users.assignTimestampsAndWatermarks(boundedLatenessAssigner)

    // Expose the stream as a table; `event_time` is the rowtime attribute.
    tableEnv.registerDataStream("testTable", timedUsers, 'id, 'name, 'age, 'event_time.rowtime)

    val ageSumPerWindowSql =
      "select id,sum(age) from testTable group by TUMBLE(event_time,INTERVAL '2' SECOND),id"
    val ageSumPerWindow = tableEnv.sqlQuery(ageSumPerWindowSql)

    ageSumPerWindow.toRetractStream[Row].print()
    env.execute("windowTest")
  }
}
如上述代码所示,设置滚动窗口,窗口大小为2s,允许延迟时间为1s,四条数据的日期分别为2017-11-26 9:0:0、2017-11-26 9:0:0、2017-11-26 9:0:3、2017-11-26 9:0:2,可以看到第1、2、4条数据应该属于同一个窗口,只不过第四条数据延迟了,当第三条数据到达后,水位线应该为1511658003000L - 1000 = 1511658002000L,没有超过窗口结束时间1511658002000L,所以不触发窗口计算,第1、2、4条数据应该还是在一个窗口中计算的
这时候我们修改下代码,如下所示,修改了第三条数据的时间戳
// Same sample as the full example, except the third record's timestamp is one second later.
val users = streamEnv.fromElements(
  User(1, "nie", 22, 1511658000000L),
  User(2, "hu", 20, 1511658000000L),
  User(2, "xiao", 19, 1511658004000L),
  User(2, "feng", 31, 1511658002000L)
)

// Watermark assigner configured with a 1-second out-of-orderness bound; the record's
// `timestamp` field is used unchanged as the event timestamp.
val boundedLatenessAssigner =
  new BoundedOutOfOrdernessTimestampExtractor[User](Time.seconds(1)) {
    override def extractTimestamp(user: User): Long = user.timestamp
  }

val stream = users.assignTimestampsAndWatermarks(boundedLatenessAssigner)
这时候当第三条数据到达的时候,水位线为1511658004000L - 1000 = 1511658003000L,超过了窗口结束时间1511658002000L,前两条数据触发计算,这时候第四条数据就没有加入计算
2、Punctuated Watermarks
eg:判断某个元素的当前状态,如果状态为0则触发生成Watermarks,如果状态不为0,则不触发生成Watermarks。
/**
 * Punctuated watermark assigner for `(String, Long, Int)` records.
 *
 * The second tuple field is used as the event timestamp, and the third field acts as a
 * status flag: a watermark is produced only for records whose flag is 0.
 */
class PunctuatedAssigner extends AssignerWithPunctuatedWatermarks[(String, Long, Int)] {

  /** Extracts the event timestamp, i.e. the second field of the tuple. */
  override def extractTimestamp(
      element: (String, Long, Int),
      previousElementTimestamp: Long): Long = {
    val (_, eventTime, _) = element
    eventTime
  }

  /**
   * Decides per record whether to emit a watermark.
   *
   * @return a watermark at `extractedTimestamp` when the record's third field is 0,
   *         otherwise `null` (no watermark is produced for this record)
   */
  override def checkAndGetNextWatermark(
      lastElement: (String, Long, Int),
      extractedTimestamp: Long): Watermark =
    lastElement match {
      case (_, _, 0) => new Watermark(extractedTimestamp)
      case _         => null
    }
}