Flink's Table API and SQL provide several interfaces for user-defined functions, each defined as an abstract class. The main UDF categories are: scalar functions (ScalarFunction), table functions (TableFunction), aggregate functions (AggregateFunction), and table aggregate functions (TableAggregateFunction).
A scalar UDF must extend the ScalarFunction class and implement an eval() method.
package com.ali.flink.demo.driver.flink_udf;
import com.ali.flink.demo.bean.UserLoginEventBean;
import com.ali.flink.demo.utils.DataGeneratorImpl005;
import com.ali.flink.demo.utils.FlinkEnv;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.ScalarFunction;
import org.apache.flink.types.Row;
import java.time.Duration;
import static org.apache.flink.table.api.Expressions.$;
public class FlinkUDFScalarFunctions {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
env.setParallelism(1);
StreamTableEnvironment tableEnv = FlinkEnv.getStreamTableEnv(env);
// Build the data generator source
DataGeneratorSource<UserLoginEventBean> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl005());
SingleOutputStreamOperator<UserLoginEventBean> sourceStream = env.addSource(dataGeneratorSource).returns(UserLoginEventBean.class)
.assignTimestampsAndWatermarks(WatermarkStrategy.<UserLoginEventBean>forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner<UserLoginEventBean>() {
@Override
public long extractTimestamp(UserLoginEventBean userLoginEventBean, long l) {
return userLoginEventBean.getTimestamp();
}
}));
sourceStream.print("source");
// Convert the DataStream to a Table and declare the table fields
Table sourceTable = tableEnv.fromDataStream(sourceStream, $("userId").as("user_id"), $("loginAddress"), $("loginType"), $("loginTime"));
// Create a temporary view
tableEnv.createTemporaryView("loginTable", sourceTable);
// Register the UDF
// createTemporarySystemFunction: registers a temporary system function, treated like a built-in function and visible in every catalog and database
// createTemporaryFunction: registers a temporary catalog function, visible only in the current catalog and database
tableEnv.createTemporarySystemFunction("Myhash", MyHashFunction.class);
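// (Note, not in the original code: a catalog-scoped alternative would be
//  tableEnv.createTemporaryFunction("Myhash", MyHashFunction.class);
//  the Table API can also call the class directly via call(MyHashFunction.class, ...) without registering it.)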
// Run a SQL query that calls the UDF
Table resultTable = tableEnv.sqlQuery("select user_id, Myhash(user_id) as hash_id from loginTable");
// Convert the Table back into a DataStream
DataStream<Row> resultStream = tableEnv.toDataStream(resultTable);
resultStream.print("result");
env.execute("flink udf start");
}
// Custom scalar function that hashes its input string
public static class MyHashFunction extends ScalarFunction {
public int eval(String str){
return str.hashCode();
}
}
}
-------------------------------Result---------------------------------
source> UserLoginEventBean{userId='u4', loginAddress='杭州', loginType='fail', loginTime='2022-07-28 14:15:28', timestamp=1658988928626}
result> +I[u4, 3679]
source> UserLoginEventBean{userId='u2', loginAddress='南京', loginType='fail', loginTime='2022-07-28 14:15:29', timestamp=1658988929650}
result> +I[u2, 3677]
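For reference, a minimal sketch (not part of the original code) of invoking the same scalar UDF through the Table API instead of SQL; it assumes the tableEnv, sourceTable and MyHashFunction from the example above, plus import static org.apache.flink.table.api.Expressions.call:
// Call the registered function by name ...
Table apiResultTable = sourceTable.select($("user_id"), call("Myhash", $("user_id")).as("hash_id"));
// ... or call the function class directly without registering it first
Table inlineResultTable = sourceTable.select($("user_id"), call(MyHashFunction.class, $("user_id")).as("hash_id"));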
A table UDF must extend the TableFunction&lt;T&gt; class (T is the type of the rows it emits) and implement an eval() method.
package com.ali.flink.demo.driver.flink_udf;
import cn.hutool.core.util.RandomUtil;
import com.ali.flink.demo.bean.UserLoginEventBean;
import com.ali.flink.demo.utils.DataGeneratorImpl005;
import com.ali.flink.demo.utils.FlinkEnv;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;
import java.time.Duration;
import static org.apache.flink.table.api.Expressions.$;
public class FlinkUDFTableFunctions {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
env.setParallelism(1);
StreamTableEnvironment tableEnv = FlinkEnv.getStreamTableEnv(env);
DataGeneratorSource<UserLoginEventBean> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl005());
SingleOutputStreamOperator<UserLoginEventBean> sourceStream = env.addSource(dataGeneratorSource).returns(UserLoginEventBean.class)
.assignTimestampsAndWatermarks(WatermarkStrategy.<UserLoginEventBean>forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner<UserLoginEventBean>() {
@Override
public long extractTimestamp(UserLoginEventBean userLoginEventBean, long l) {
return userLoginEventBean.getTimestamp();
}
}));
// sourceStream.print("source");
// Construct test data in map(): append comma-separated random suffixes to the user id
SingleOutputStreamOperator<UserLoginEventBean> mapStream = sourceStream.map(new MapFunction<UserLoginEventBean, UserLoginEventBean>() {
@Override
public UserLoginEventBean map(UserLoginEventBean userLoginEventBean) throws Exception {
String s = "123456789";
userLoginEventBean.setUserId(userLoginEventBean.getUserId() + s.substring(RandomUtil.randomInt(9)) + "," + s.substring(RandomUtil.randomInt(9)));
return userLoginEventBean;
}
});
mapStream.print("map source");
Table sourceTable = tableEnv.fromDataStream(mapStream, $("userId").as("user_id"), $("loginAddress"), $("loginType"), $("loginTime"));
tableEnv.createTemporaryView("loginTable", sourceTable);
// Register the UDF
tableEnv.createTemporarySystemFunction("MySplit", MySplitFunction.class);
Table resultTable = tableEnv.sqlQuery("select user_id, u1, u2 from loginTable, lateral table(MySplit(user_id)) as T(u1, u2)");
DataStream<Row> resultStream = tableEnv.toDataStream(resultTable);
resultStream.print("result");
env.execute("flink udf start");
}
// Custom table function: splits the input on ',' and emits a (substring, length) row per part
public static class MySplitFunction extends TableFunction<Tuple2<String, Integer>> {
public void eval(String str){
String[] split = str.split(",");
for (String s : split) {
collect(Tuple2.of(s, s.length()));
}
}
}
}
--------------------------Result------------------------------------
map source> UserLoginEventBean{userId='u3456789,789', loginAddress='杭州', loginType='fail', loginTime='2022-07-28 14:49:53', timestamp=1658990993346}
result> +I[u3456789,789, u3456789, 8]
result> +I[u3456789,789, 789, 3]
map source> UserLoginEventBean{userId='u356789,456789', loginAddress='北京', loginType='success', loginTime='2022-07-28 14:49:57', timestamp=1658990997357}
result> +I[u356789,456789, u356789, 7]
result> +I[u356789,456789, 456789, 6]
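A sketch of an alternative join style (an assumption, not in the original code), using the loginTable view and the MySplit registration from the example above: LEFT JOIN LATERAL keeps input rows even when the table function emits nothing (u1 and u2 are then NULL).
Table leftJoinTable = tableEnv.sqlQuery(
        "SELECT user_id, u1, u2 " +
        "FROM loginTable " +
        "LEFT JOIN LATERAL TABLE(MySplit(user_id)) AS T(u1, u2) ON TRUE");
// Table API equivalent (needs import static org.apache.flink.table.api.Expressions.call):
// sourceTable.leftOuterJoinLateral(call("MySplit", $("user_id")).as("u1", "u2"))
//            .select($("user_id"), $("u1"), $("u2"));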
An aggregate UDF must extend the AggregateFunction&lt;T, ACC&gt; class (T: result type, ACC: accumulator type) and implement createAccumulator(), getValue(), and a public accumulate() method.
package com.ali.flink.demo.driver.flink_udf;
import cn.hutool.core.util.RandomUtil;
import com.ali.flink.demo.bean.UserLoginEventBean;
import com.ali.flink.demo.utils.DataGeneratorImpl005;
import com.ali.flink.demo.utils.FlinkEnv;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.AggregateFunction;
import org.apache.flink.types.Row;
import java.time.Duration;
import static org.apache.flink.table.api.Expressions.$;
public class FlinkUDFAggregateFunctions {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
env.setParallelism(1);
StreamTableEnvironment tableEnv = FlinkEnv.getStreamTableEnv(env);
DataGeneratorSource<UserLoginEventBean> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl005());
SingleOutputStreamOperator<UserLoginEventBean> sourceStream = env.addSource(dataGeneratorSource).returns(UserLoginEventBean.class)
.assignTimestampsAndWatermarks(WatermarkStrategy.<UserLoginEventBean>forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner<UserLoginEventBean>() {
@Override
public long extractTimestamp(UserLoginEventBean userLoginEventBean, long l) {
return userLoginEventBean.getTimestamp();
}
}));
// sourceStream.print("source");
// Construct test data in map(): assign a random cnt value
SingleOutputStreamOperator<UserLoginEventBean> mapStream = sourceStream.map(new MapFunction<UserLoginEventBean, UserLoginEventBean>() {
@Override
public UserLoginEventBean map(UserLoginEventBean userLoginEventBean) throws Exception {
userLoginEventBean.setCnt(RandomUtil.randomInt(5));
return userLoginEventBean;
}
});
mapStream.print("map source");
Table sourceTable = tableEnv.fromDataStream(mapStream, $("userId").as("user_id"), $("cnt"));
tableEnv.createTemporaryView("loginTable", sourceTable);
// Register the UDF
tableEnv.createTemporarySystemFunction("avgFunction", MyAggregateFunction.class);
Table resultTable = tableEnv.sqlQuery("select user_id, avgFunction(cnt, 1) from loginTable group by user_id");
// The result of a GROUP BY query is an updating table, so it must be converted with toChangelogStream()
DataStream<Row> resultStream = tableEnv.toChangelogStream(resultTable);
resultStream.print("result");
env.execute("flink udf start");
}
public static class MyAccumulator{
public long sum = 0;
public int count = 0;
}
// Custom aggregate function: weighted average of (value, weight) pairs
public static class MyAggregateFunction extends AggregateFunction<Long, MyAccumulator> {
@Override
public Long getValue(MyAccumulator myAccumulator) {
if (myAccumulator.count == 0){
return null;
}else {
return myAccumulator.sum / myAccumulator.count;
}
}
@Override
public MyAccumulator createAccumulator() {
// Initialize the accumulator
return new MyAccumulator();
}
// This method must be named accumulate() and must be public
public void accumulate(MyAccumulator acc, Long sumValue, Integer countValue){
acc.sum += sumValue * countValue;
acc.count += countValue;
}
}
}
-------------------------------Result---------------------------------
map source> UserLoginEventBean{userId='u4', loginAddress='杭州', loginType='success', loginTime='2022-07-28 15:35:35', timestamp=1658993735844, cnt=4}
result> +I[u4, 4]
map source> UserLoginEventBean{userId='u3', loginAddress='上海', loginType='fail', loginTime='2022-07-28 15:35:44', timestamp=1658993744860, cnt=4}
result> +I[u3, 4]
15:35:45,627 INFO org.apache.flink.runtime.checkpoint.CheckpointCoordinator [] - Triggering checkpoint 1 (type=CHECKPOINT) @ 1658993745619 for job a3ee41ca5ad7bd5a3dbd070ba3de5a43.
map source> UserLoginEventBean{userId='u4', loginAddress='南京', loginType='fail', loginTime='2022-07-28 15:35:52', timestamp=1658993752874, cnt=0}
result> -U[u4, 4] -- a retraction (update-before) is emitted
result> +U[u4, 2] -- followed by the recomputed result (update-after)
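A minimal sketch (an assumption, not in the original code): if this aggregate function is used in a query whose input itself produces updates or retractions (for example an aggregation over a non-windowed, updating table), Flink additionally requires a public retract() method with the same argument types as accumulate(); inside MyAggregateFunction it could look like this:
public void retract(MyAccumulator acc, Long sumValue, Integer countValue) {
    // undo the effect of a previous accumulate() call
    acc.sum -= sumValue * countValue;
    acc.count -= countValue;
}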
A table aggregate UDF must extend the TableAggregateFunction&lt;T, ACC&gt; class (T: output type, ACC: accumulator type), implement createAccumulator() and accumulate(), and emit rows through emitValue().
package com.ali.flink.demo.driver.flink_udf;
import com.ali.flink.demo.bean.UserLoginEventBean;
import com.ali.flink.demo.utils.DataGeneratorImpl005;
import com.ali.flink.demo.utils.FlinkEnv;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.table.api.Schema;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableAggregateFunction;
import org.apache.flink.types.Row;
import org.apache.flink.util.Collector;
import java.time.Duration;
import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.call;
public class FlinkUDFTableAggregateFunctions {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
env.setParallelism(1);
StreamTableEnvironment tableEnv = FlinkEnv.getStreamTableEnv(env);
DataGeneratorSource<UserLoginEventBean> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl005());
DataStream<UserLoginEventBean> sourceStream = env.addSource(dataGeneratorSource).returns(UserLoginEventBean.class)
.assignTimestampsAndWatermarks(WatermarkStrategy.<UserLoginEventBean>forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner<UserLoginEventBean>() {
@Override
public long extractTimestamp(UserLoginEventBean userLoginEventBean, long l) {
return userLoginEventBean.getTimestamp();
}
}));
sourceStream.print("source");
// Convert sourceStream to a Table, declaring the fields and an event-time attribute
Table sourceTable = tableEnv.fromDataStream(sourceStream, Schema.newBuilder()
.column("userId", "string")
.column("timestamp", "bigint")
.columnByExpression("rowtime", "CAST(TO_TIMESTAMP(FROM_UNIXTIME(`timestamp`/1000)) AS TIMESTAMP(3))")
.watermark("rowtime", "rowtime - interval '10' second ")
.build());
// Create a temporary view
tableEnv.createTemporaryView("loginTable", sourceTable);
// Register the UDF
tableEnv.createTemporarySystemFunction("Top2Function", Top2Function.class);
// Windowed GROUP BY: count the records per user inside each tumbling window
String query = "SELECT userId ,COUNT(1) AS cnt,window_start,window_end " +
"FROM TABLE(" +
" TUMBLE(TABLE loginTable, DESCRIPTOR(rowtime),INTERVAL '10' SECOND)" +
")" +
"GROUP BY userId,window_start,window_end";
// Run the SQL query
Table resultTable = tableEnv.sqlQuery(query);
// Convert to a changelog (updating) stream
DataStream<Row> resultStream = tableEnv.toChangelogStream(resultTable);
resultStream.print("resultStream");
// A table aggregate function can only be invoked through the Table API (via call()), so compute the per-window top-2 counts here with the Table API
Table outTable = resultTable.groupBy($("window_end"))
.flatAggregate(call("Top2Function", $("cnt")).as("value", "rank"))
.select($("window_end"), $("value"), $("rank"));
// Convert to a changelog (updating) stream
DataStream<Row> outStream = tableEnv.toChangelogStream(outTable);
outStream.print("outStream");
env.execute("flink udf start");
}
public static class MyTop2Accumulator{
public Long firstValue;
public Long secondValue;
}
// Extends TableAggregateFunction<T, ACC>; T: output type, ACC: accumulator type
public static class Top2Function extends TableAggregateFunction<Tuple2<Long, Integer>, MyTop2Accumulator> {
@Override
public MyTop2Accumulator createAccumulator() {
MyTop2Accumulator accumulator = new MyTop2Accumulator();
accumulator.firstValue = Long.MIN_VALUE;
accumulator.secondValue = Long.MIN_VALUE;
return accumulator;
}
public void accumulate(MyTop2Accumulator accumulator, Long value) {
if (value > accumulator.firstValue){
accumulator.secondValue = accumulator.firstValue;
accumulator.firstValue = value;
}else if (value > accumulator.secondValue){
accumulator.secondValue = value;
}
}
public void emitValue(MyTop2Accumulator accumulator, Collector<Tuple2<Long, Integer>> out) {
if (accumulator.firstValue != Long.MIN_VALUE){
out.collect(Tuple2.of(accumulator.firstValue, 1));
}
if (accumulator.secondValue != Long.MIN_VALUE){
out.collect(Tuple2.of(accumulator.secondValue, 2));
}
}
}
}
-------------------------------Result---------------------------------
source> UserLoginEventBean{userId='u1', loginAddress='南京', loginType='fail', loginTime='2022-07-28 18:47:43', timestamp=1659005263897, cnt=0}
source> UserLoginEventBean{userId='u2', loginAddress='南京', loginType='fail', loginTime='2022-07-28 18:47:43', timestamp=1659005263900, cnt=0}
source> UserLoginEventBean{userId='u1', loginAddress='北京', loginType='success', loginTime='2022-07-28 18:47:43', timestamp=1659005263900, cnt=0}
source> UserLoginEventBean{userId='u4', loginAddress='杭州', loginType='fail', loginTime='2022-07-28 18:47:43', timestamp=1659005263900, cnt=0}
source> UserLoginEventBean{userId='u1', loginAddress='北京', loginType='fail', loginTime='2022-07-28 18:47:47', timestamp=1659005267902, cnt=0}
source> UserLoginEventBean{userId='u2', loginAddress='上海', loginType='success', loginTime='2022-07-28 18:47:51', timestamp=1659005271915, cnt=0}
source> UserLoginEventBean{userId='u1', loginAddress='北京', loginType='fail', loginTime='2022-07-28 18:47:51', timestamp=1659005271915, cnt=0}
18:47:52,685 INFO org.apache.flink.runtime.checkpoint.CheckpointCoordinator [] - Triggering checkpoint 1 (type=CHECKPOINT) @ 1659005272678 for job d0c69d024fbbcfb4c5372c297094438b.
source> UserLoginEventBean{userId='u3', loginAddress='南京', loginType='fail', loginTime='2022-07-28 18:47:53', timestamp=1659005273925, cnt=0}
source> UserLoginEventBean{userId='u2', loginAddress='南京', loginType='success', loginTime='2022-07-28 18:47:56', timestamp=1659005276929, cnt=0}
source> UserLoginEventBean{userId='u4', loginAddress='南京', loginType='fail', loginTime='2022-07-28 18:47:59', timestamp=1659005279943, cnt=0}
source> UserLoginEventBean{userId='u4', loginAddress='南京', loginType='fail', loginTime='2022-07-28 18:48:01', timestamp=1659005281948, cnt=0}
resultStream> +I[u1, 3, 2022-07-28T18:47:40, 2022-07-28T18:47:50]
resultStream> +I[u2, 1, 2022-07-28T18:47:40, 2022-07-28T18:47:50]
resultStream> +I[u4, 1, 2022-07-28T18:47:40, 2022-07-28T18:47:50]
outStream> +I[2022-07-28T18:47:50, 3, 1]
outStream> -D[2022-07-28T18:47:50, 3, 1]
outStream> +I[2022-07-28T18:47:50, 3, 1]
outStream> +I[2022-07-28T18:47:50, 1, 2]
outStream> -D[2022-07-28T18:47:50, 3, 1]
outStream> -D[2022-07-28T18:47:50, 1, 2]
outStream> +I[2022-07-28T18:47:50, 3, 1]
outStream> +I[2022-07-28T18:47:50, 1, 2]
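The outStream output above retracts and re-emits the whole top-2 list on every update. A minimal sketch (an assumption, not part of the original class) of how emitUpdateWithRetract() could reduce that churn by retracting only the rows that actually changed; it assumes extra accumulator fields for the previously emitted values and uses TableAggregateFunction's RetractableCollector:
public static class MyTop2Accumulator {
    public Long firstValue = Long.MIN_VALUE;
    public Long secondValue = Long.MIN_VALUE;
    public Long emittedFirst;   // last emitted top-1 value, null before the first emit
    public Long emittedSecond;  // last emitted top-2 value, null before the first emit
}
// implemented instead of emitValue(); the planner can then update the result incrementally
public void emitUpdateWithRetract(MyTop2Accumulator acc,
                                  RetractableCollector<Tuple2<Long, Integer>> out) {
    if (acc.firstValue != Long.MIN_VALUE && !acc.firstValue.equals(acc.emittedFirst)) {
        if (acc.emittedFirst != null) {
            out.retract(Tuple2.of(acc.emittedFirst, 1)); // withdraw the stale top-1 row
        }
        out.collect(Tuple2.of(acc.firstValue, 1));
        acc.emittedFirst = acc.firstValue;
    }
    if (acc.secondValue != Long.MIN_VALUE && !acc.secondValue.equals(acc.emittedSecond)) {
        if (acc.emittedSecond != null) {
            out.retract(Tuple2.of(acc.emittedSecond, 2)); // withdraw the stale top-2 row
        }
        out.collect(Tuple2.of(acc.secondValue, 2));
        acc.emittedSecond = acc.secondValue;
    }
}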