Flink UDF Functions Overview

Table of Contents

  • Flink UDF Functions Overview
    • Scalar Functions (one-to-one)
    • Table Functions (one-to-many)
    • Aggregate Functions (many-to-one)
    • Table Aggregate Functions (many-to-many)

Flink UDF Functions Overview

Flink's Table API and SQL expose several interfaces for user-defined functions (UDFs), each defined as an abstract class. The main UDF categories are:

  • Scalar Functions: map an input scalar value to a new scalar value;
  • Table Functions: map a scalar value to one or more new rows, i.e. expand it into a table;
  • Aggregate Functions: map the scalar values of multiple rows to a single new scalar value;
  • Table Aggregate Functions: map the scalar values of multiple rows to one or more new rows.
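
All four categories extend the common UserDefinedFunction base class, which also offers optional open()/close() lifecycle hooks for one-time setup and cleanup per task. A minimal sketch (PrefixFunction is a hypothetical example, not part of the demos below):

import org.apache.flink.table.functions.FunctionContext;
import org.apache.flink.table.functions.ScalarFunction;

public class PrefixFunction extends ScalarFunction {

    private String prefix;

    @Override
    public void open(FunctionContext context) throws Exception {
        // Runs once per task: read a job parameter here instead of once per record
        prefix = context.getJobParameter("prefix", "user_");
    }

    public String eval(String id) {
        return prefix + id;
    }
}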

Scalar Functions (one-to-one)

A custom scalar function extends the ScalarFunction class and implements an eval() method.

package com.ali.flink.demo.driver.flink_udf;

import com.ali.flink.demo.bean.UserLoginEventBean;
import com.ali.flink.demo.utils.DataGeneratorImpl005;
import com.ali.flink.demo.utils.FlinkEnv;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.ScalarFunction;
import org.apache.flink.types.Row;

import java.time.Duration;

import static org.apache.flink.table.api.Expressions.$;

public class FlinkUDFScalarFunctions {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
        env.setParallelism(1);

        StreamTableEnvironment tableEnv = FlinkEnv.getStreamTableEnv(env);
        // Create the data source
        DataGeneratorSource<UserLoginEventBean> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl005());
        SingleOutputStreamOperator<UserLoginEventBean> sourceStream = env.addSource(dataGeneratorSource).returns(UserLoginEventBean.class)
                .assignTimestampsAndWatermarks(WatermarkStrategy.<UserLoginEventBean>forBoundedOutOfOrderness(Duration.ofSeconds(2))
                        .withTimestampAssigner(new SerializableTimestampAssigner<UserLoginEventBean>() {
                            @Override
                            public long extractTimestamp(UserLoginEventBean userLoginEventBean, long l) {
                                return userLoginEventBean.getTimestamp();
                            }
                        }));

        sourceStream.print("source");
        // Convert the DataStream to a Table and define the field names
        Table sourceTable = tableEnv.fromDataStream(sourceStream, $("userId").as("user_id"), $("loginAddress"), $("loginType"), $("loginTime"));
        // Register a temporary view
        tableEnv.createTemporaryView("loginTable", sourceTable);
        // Register the UDF
        // createTemporarySystemFunction: a temporary system function, resolved like a built-in function and visible in every catalog and database
        // createTemporaryFunction: a temporary function, visible only in the current catalog and database
        tableEnv.createTemporarySystemFunction("Myhash", MyHashFunction.class);
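        // Alternative (sketch): register a catalog/database-scoped temporary function instead of a system one
        // tableEnv.createTemporaryFunction("Myhash", MyHashFunction.class);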
        // Run SQL that calls the UDF
        Table resultTable = tableEnv.sqlQuery("select user_id, Myhash(user_id) as hash_id from loginTable");
        // Convert the Table back to a DataStream
        DataStream<Row> resultStream = tableEnv.toDataStream(resultTable);
        resultStream.print("result");

        env.execute("flink udf start");
    }
    
    // Custom scalar function: hash the input string
    public static class MyHashFunction extends ScalarFunction{
        public int eval(String str){
            return str.hashCode();
        }
    }
}


-------------------------------Result---------------------------------
source> UserLoginEventBean{userId='u4', loginAddress='杭州', loginType='fail', loginTime='2022-07-28 14:15:28', timestamp=1658988928626}
result> +I[u4, 3679]
source> UserLoginEventBean{userId='u2', loginAddress='南京', loginType='fail', loginTime='2022-07-28 14:15:29', timestamp=1658988929650}
result> +I[u2, 3677]
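
Besides SQL, a registered function can also be invoked from the Table API with call() (requires the static import of Expressions.call); a minimal sketch against the same sourceTable, assuming the Myhash registration above:

        // Sketch: Table API call of the registered scalar function
        Table apiResultTable = sourceTable.select($("user_id"), call("Myhash", $("user_id")).as("hash_id"));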

Table Functions (one-to-many)

A custom table function extends the TableFunction<T> class, where T is the output row type, and implements an eval() method; each output row is emitted by calling collect().

package com.ali.flink.demo.driver.flink_udf;

import cn.hutool.core.util.RandomUtil;
import com.ali.flink.demo.bean.UserLoginEventBean;
import com.ali.flink.demo.utils.DataGeneratorImpl005;
import com.ali.flink.demo.utils.FlinkEnv;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;

import java.time.Duration;

import static org.apache.flink.table.api.Expressions.$;

public class FlinkUDFTableFunctions {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
        env.setParallelism(1);

        StreamTableEnvironment tableEnv = FlinkEnv.getStreamTableEnv(env);

        DataGeneratorSource<UserLoginEventBean> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl005());
        SingleOutputStreamOperator<UserLoginEventBean> sourceStream = env.addSource(dataGeneratorSource).returns(UserLoginEventBean.class)
                .assignTimestampsAndWatermarks(WatermarkStrategy.<UserLoginEventBean>forBoundedOutOfOrderness(Duration.ofSeconds(2))
                        .withTimestampAssigner(new SerializableTimestampAssigner<UserLoginEventBean>() {
                            @Override
                            public long extractTimestamp(UserLoginEventBean userLoginEventBean, long l) {
                                return userLoginEventBean.getTimestamp();
                            }
                        }));

//        sourceStream.print("source");
        // Build test data in map(): append two random comma-separated suffixes to the userId
        SingleOutputStreamOperator<UserLoginEventBean> mapStream = sourceStream.map(new MapFunction<UserLoginEventBean, UserLoginEventBean>() {
            @Override
            public UserLoginEventBean map(UserLoginEventBean userLoginEventBean) throws Exception {
                String s = "123456789";
                userLoginEventBean.setUserId(userLoginEventBean.getUserId() + s.substring(RandomUtil.randomInt(9)) + "," + s.substring(RandomUtil.randomInt(9)));
                return userLoginEventBean;
            }
        });

        mapStream.print("map source");
        
        Table sourceTable = tableEnv.fromDataStream(mapStream, $("userId").as("user_id"), $("loginAddress"), $("loginType"), $("loginTime"));
        tableEnv.createTemporaryView("loginTable", sourceTable);

        // Register the UDF
        tableEnv.createTemporarySystemFunction("MySplit", MySplitFunction.class);

        Table resultTable = tableEnv.sqlQuery("select user_id, u1, u2 from loginTable, lateral table(MySplit(user_id)) as T(u1, u2)");

        DataStream<Row> resultStream = tableEnv.toDataStream(resultTable);
        resultStream.print("result");

        env.execute("flink udf start");
    }

    public static class MySplitFunction extends TableFunction<Tuple2<String, Integer>>{
        public void eval(String str){
            String[] split = str.split(",");
            for (String s : split) {
                collect(Tuple2.of(s, s.length()));
            }
        }
    }
}

--------------------------Result------------------------------------
map source> UserLoginEventBean{userId='u3456789,789', loginAddress='杭州', loginType='fail', loginTime='2022-07-28 14:49:53', timestamp=1658990993346}
result> +I[u3456789,789, u3456789, 8]
result> +I[u3456789,789, 789, 3]
map source> UserLoginEventBean{userId='u356789,456789', loginAddress='北京', loginType='success', loginTime='2022-07-28 14:49:57', timestamp=1658990997357}
result> +I[u356789,456789, u356789, 7]
result> +I[u356789,456789, 456789, 6]
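
The cross join above drops rows for which the table function emits nothing. To keep them, a LEFT JOIN LATERAL can be used instead; a minimal sketch against the same loginTable and MySplit registration:

        // Sketch: keep rows even when MySplit produces no output for them (u1/u2 become NULL)
        Table leftJoinTable = tableEnv.sqlQuery(
                "SELECT user_id, u1, u2 " +
                "FROM loginTable " +
                "LEFT JOIN LATERAL TABLE(MySplit(user_id)) AS T(u1, u2) ON TRUE");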

Aggregate Functions (many-to-one)

A custom aggregate function extends the AggregateFunction<T, ACC> class, where T is the output type and ACC is the accumulator type, and implements createAccumulator(), getValue(), and accumulate().

  • createAccumulator(): creates and initializes the accumulator
  • getValue(): returns the final result from the accumulator
  • accumulate(): the core aggregation method, called once per input row. Its first parameter is always the current accumulator of type ACC, which holds the intermediate state of the aggregation; the remaining parameters are the arguments passed in the function call, and there can be several of them with different types. The method only updates the aggregation state, so it has no return value.

package com.ali.flink.demo.driver.flink_udf;

import cn.hutool.core.util.RandomUtil;
import com.ali.flink.demo.bean.UserLoginEventBean;
import com.ali.flink.demo.utils.DataGeneratorImpl005;
import com.ali.flink.demo.utils.FlinkEnv;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.AggregateFunction;
import org.apache.flink.types.Row;

import java.time.Duration;

import static org.apache.flink.table.api.Expressions.$;

public class FlinkUDFAggregateFunctions {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
        env.setParallelism(1);

        StreamTableEnvironment tableEnv = FlinkEnv.getStreamTableEnv(env);

        DataGeneratorSource<UserLoginEventBean> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl005());
        SingleOutputStreamOperator<UserLoginEventBean> sourceStream = env.addSource(dataGeneratorSource).returns(UserLoginEventBean.class)
                .assignTimestampsAndWatermarks(WatermarkStrategy.<UserLoginEventBean>forBoundedOutOfOrderness(Duration.ofSeconds(2))
                        .withTimestampAssigner(new SerializableTimestampAssigner<UserLoginEventBean>() {
                            @Override
                            public long extractTimestamp(UserLoginEventBean userLoginEventBean, long l) {
                                return userLoginEventBean.getTimestamp();
                            }
                        }));

//        sourceStream.print("source");

        // Build test data in map(): assign a random cnt value
        SingleOutputStreamOperator<UserLoginEventBean> mapStream = sourceStream.map(new MapFunction<UserLoginEventBean, UserLoginEventBean>() {
            @Override
            public UserLoginEventBean map(UserLoginEventBean userLoginEventBean) throws Exception {
                userLoginEventBean.setCnt(RandomUtil.randomInt(5));
                return userLoginEventBean;
            }
        });

        mapStream.print("map source");

        Table sourceTable = tableEnv.fromDataStream(mapStream, $("userId").as("user_id"), $("cnt"));
        tableEnv.createTemporaryView("loginTable", sourceTable);

        // Register the UDF
        tableEnv.createTemporarySystemFunction("avgFunction", MyAggregateFunction.class);

        Table resultTable = tableEnv.sqlQuery("select user_id, avgFunction(cnt, 1)  from loginTable group by user_id");
        // The grouped result is an updating table, so convert it with toChangelogStream()
        DataStream<Row> resultStream = tableEnv.toChangelogStream(resultTable);
        resultStream.print("result");

        env.execute("flink udf start");
    }

    public static class MyAccumulator{
        public long sum = 0;
        public int count = 0;
    }

    public static class MyAggregateFunction extends AggregateFunction<Long, MyAccumulator>{

        @Override
        public Long getValue(MyAccumulator myAccumulator) {
            if (myAccumulator.count == 0){
                return null;
            }else {
                return myAccumulator.sum / myAccumulator.count;
            }
        }

        @Override
        public MyAccumulator createAccumulator() {
            // Initialize the accumulator
            return new MyAccumulator();
        }

        // This method must be named accumulate() and must be public
        public void accumulate(MyAccumulator acc, Long sumValue, Integer countValue){
            acc.sum += sumValue * countValue;
            acc.count += countValue;
        }
    }
}

--------------------------------Result--------------------------------
map source> UserLoginEventBean{userId='u4', loginAddress='杭州', loginType='success', loginTime='2022-07-28 15:35:35', timestamp=1658993735844, cnt=4}
result> +I[u4, 4]
map source> UserLoginEventBean{userId='u3', loginAddress='上海', loginType='fail', loginTime='2022-07-28 15:35:44', timestamp=1658993744860, cnt=4}
result> +I[u3, 4]
15:35:45,627 INFO  org.apache.flink.runtime.checkpoint.CheckpointCoordinator    [] - Triggering checkpoint 1 (type=CHECKPOINT) @ 1658993745619 for job a3ee41ca5ad7bd5a3dbd070ba3de5a43.
map source> UserLoginEventBean{userId='u4', loginAddress='南京', loginType='fail', loginTime='2022-07-28 15:35:52', timestamp=1658993752874, cnt=0}
result> -U[u4, 4]				-- retraction of the previous result
result> +U[u4, 2]				-- the recomputed result
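
For this append-only input, accumulate() alone is sufficient. If the aggregate consumed an updating (retract) stream, the function would also need a retract() method; a minimal sketch of what it could look like for MyAggregateFunction above (assumption: it simply undoes what accumulate() added):

        // Sketch: undo a previously accumulated row (needed when input rows can be retracted)
        public void retract(MyAccumulator acc, Long sumValue, Integer countValue) {
            acc.sum -= sumValue * countValue;
            acc.count -= countValue;
        }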

Table Aggregate Functions (many-to-many)

A custom table aggregate function extends the TableAggregateFunction<T, ACC> class, where T is the output type and ACC is the accumulator type, and implements createAccumulator(), accumulate(ACC, value), and emitValue(ACC, Collector).

  • createAccumulator(): creates the accumulator
  • accumulate(ACC, value): the core aggregation method, called once per input row. Its first parameter is always the current accumulator of type ACC, which holds the intermediate state of the aggregation; the remaining parameters are the arguments passed in the function call
  • emitValue(ACC, Collector): emits the final result; the first parameter is the accumulator of type ACC, and the second is the output collector out of type Collector<T>

package com.ali.flink.demo.driver.flink_udf;

import com.ali.flink.demo.bean.UserLoginEventBean;
import com.ali.flink.demo.utils.DataGeneratorImpl005;
import com.ali.flink.demo.utils.FlinkEnv;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.datagen.DataGeneratorSource;
import org.apache.flink.table.api.Schema;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableAggregateFunction;
import org.apache.flink.types.Row;
import org.apache.flink.util.Collector;

import java.time.Duration;

import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.call;

public class FlinkUDFTableAggregateFunctions {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = FlinkEnv.FlinkDataStreamRunEnv();
        env.setParallelism(1);

        StreamTableEnvironment tableEnv = FlinkEnv.getStreamTableEnv(env);

        DataGeneratorSource<UserLoginEventBean> dataGeneratorSource = new DataGeneratorSource<>(new DataGeneratorImpl005());
        DataStream<UserLoginEventBean> sourceStream = env.addSource(dataGeneratorSource).returns(UserLoginEventBean.class)
                .assignTimestampsAndWatermarks(WatermarkStrategy.<UserLoginEventBean>forBoundedOutOfOrderness(Duration.ofSeconds(2))
                        .withTimestampAssigner(new SerializableTimestampAssigner<UserLoginEventBean>() {
                            @Override
                            public long extractTimestamp(UserLoginEventBean userLoginEventBean, long l) {
                                return userLoginEventBean.getTimestamp();
                            }
                        }));

        sourceStream.print("source");

        // Convert sourceStream to a Table, declaring the payload fields and an event-time attribute
        Table sourceTable = tableEnv.fromDataStream(sourceStream, Schema.newBuilder()
                .column("userId", "string")
                .column("timestamp", "bigint")
                .columnByExpression("rowtime", "CAST(TO_TIMESTAMP(FROM_UNIXTIME(`timestamp`/1000)) AS TIMESTAMP(3))")
                .watermark("rowtime", "rowtime - interval '10' second ")
                .build());
        // Register a temporary view
        tableEnv.createTemporaryView("loginTable", sourceTable);

        // Register the UDF
        tableEnv.createTemporarySystemFunction("Top2Function", Top2Function.class);

        // Windowed SQL: count records per user within each 10-second tumbling window
        String query = "SELECT userId ,COUNT(1) AS cnt,window_start,window_end " +
                "FROM TABLE(" +
                " TUMBLE(TABLE loginTable, DESCRIPTOR(rowtime),INTERVAL '10' SECOND)" +
                ")" +
                "GROUP BY userId,window_start,window_end";
        // Run the SQL query
        Table resultTable = tableEnv.sqlQuery(query);
        
        // Convert to a changelog stream
        DataStream<Row> resultStream = tableEnv.toChangelogStream(resultTable);
        resultStream.print("resultStream");
        
        // Table aggregate functions are called through the Table API (call() + flatAggregate()), so switch to the Table API here: per window, compute the Top 2 counts
        Table outTable = resultTable.groupBy($("window_end"))
                .flatAggregate(call("Top2Function", $("cnt")).as("value", "rank"))
                .select($("window_end"), $("value"), $("rank"));
        
        // Convert to a changelog stream
        DataStream<Row> outStream = tableEnv.toChangelogStream(outTable);
        outStream.print("outStream");

        env.execute("flink udf start");
    }

    public static class MyTop2Accumulator{
        public Long firstValue;
        public Long secondValue;
    }

    // Extends TableAggregateFunction<T, ACC>: T is the output type, ACC the accumulator type
    public static class Top2Function extends TableAggregateFunction<Tuple2<Long, Integer>, MyTop2Accumulator>{

        @Override
        public MyTop2Accumulator createAccumulator() {
            MyTop2Accumulator accumulator = new MyTop2Accumulator();
            accumulator.firstValue = Long.MIN_VALUE;
            accumulator.secondValue = Long.MIN_VALUE;
            return accumulator;
        }

        public void accumulate(MyTop2Accumulator accumulator, Long value) {
            if (value > accumulator.firstValue){
                accumulator.secondValue = accumulator.firstValue;
                accumulator.firstValue = value;
            }else if (value > accumulator.secondValue){
                accumulator.secondValue = value;
            }

        }

        public void emitValue(MyTop2Accumulator accumulator, Collector<Tuple2<Long, Integer>> out) {
            if (accumulator.firstValue != Long.MIN_VALUE){
                out.collect(Tuple2.of(accumulator.firstValue, 1));
            }
            if (accumulator.secondValue != Long.MIN_VALUE){
                out.collect(Tuple2.of(accumulator.secondValue, 2));
            }
        }
    }
}

-------------------------------Result---------------------------------
source> UserLoginEventBean{userId='u1', loginAddress='南京', loginType='fail', loginTime='2022-07-28 18:47:43', timestamp=1659005263897, cnt=0}
source> UserLoginEventBean{userId='u2', loginAddress='南京', loginType='fail', loginTime='2022-07-28 18:47:43', timestamp=1659005263900, cnt=0}
source> UserLoginEventBean{userId='u1', loginAddress='北京', loginType='success', loginTime='2022-07-28 18:47:43', timestamp=1659005263900, cnt=0}
source> UserLoginEventBean{userId='u4', loginAddress='杭州', loginType='fail', loginTime='2022-07-28 18:47:43', timestamp=1659005263900, cnt=0}
source> UserLoginEventBean{userId='u1', loginAddress='北京', loginType='fail', loginTime='2022-07-28 18:47:47', timestamp=1659005267902, cnt=0}
source> UserLoginEventBean{userId='u2', loginAddress='上海', loginType='success', loginTime='2022-07-28 18:47:51', timestamp=1659005271915, cnt=0}
source> UserLoginEventBean{userId='u1', loginAddress='北京', loginType='fail', loginTime='2022-07-28 18:47:51', timestamp=1659005271915, cnt=0}
18:47:52,685 INFO  org.apache.flink.runtime.checkpoint.CheckpointCoordinator    [] - Triggering checkpoint 1 (type=CHECKPOINT) @ 1659005272678 for job d0c69d024fbbcfb4c5372c297094438b.
source> UserLoginEventBean{userId='u3', loginAddress='南京', loginType='fail', loginTime='2022-07-28 18:47:53', timestamp=1659005273925, cnt=0}
source> UserLoginEventBean{userId='u2', loginAddress='南京', loginType='success', loginTime='2022-07-28 18:47:56', timestamp=1659005276929, cnt=0}
source> UserLoginEventBean{userId='u4', loginAddress='南京', loginType='fail', loginTime='2022-07-28 18:47:59', timestamp=1659005279943, cnt=0}
source> UserLoginEventBean{userId='u4', loginAddress='南京', loginType='fail', loginTime='2022-07-28 18:48:01', timestamp=1659005281948, cnt=0}
resultStream> +I[u1, 3, 2022-07-28T18:47:40, 2022-07-28T18:47:50]
resultStream> +I[u2, 1, 2022-07-28T18:47:40, 2022-07-28T18:47:50]
resultStream> +I[u4, 1, 2022-07-28T18:47:40, 2022-07-28T18:47:50]
outStream> +I[2022-07-28T18:47:50, 3, 1]
outStream> -D[2022-07-28T18:47:50, 3, 1]
outStream> +I[2022-07-28T18:47:50, 3, 1]
outStream> +I[2022-07-28T18:47:50, 1, 2]
outStream> -D[2022-07-28T18:47:50, 3, 1]
outStream> -D[2022-07-28T18:47:50, 1, 2]
outStream> +I[2022-07-28T18:47:50, 3, 1]
outStream> +I[2022-07-28T18:47:50, 1, 2]
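
The -D/+I pairs on outStream come from emitValue() re-emitting the full Top 2 on every update. TableAggregateFunction also offers an optional emitUpdateWithRetract(ACC, RetractableCollector) hook that only retracts and re-emits values that actually changed; a minimal sketch, assuming MyTop2Accumulator is extended with hypothetical oldFirstValue/oldSecondValue fields (initialized to Long.MIN_VALUE in createAccumulator()) to remember what was last emitted:

    // Sketch: incremental emission; RetractableCollector is TableAggregateFunction.RetractableCollector
    public void emitUpdateWithRetract(MyTop2Accumulator acc,
                                      RetractableCollector<Tuple2<Long, Integer>> out) {
        if (!acc.firstValue.equals(acc.oldFirstValue)) {
            if (acc.oldFirstValue != Long.MIN_VALUE) {
                out.retract(Tuple2.of(acc.oldFirstValue, 1));   // withdraw the old rank-1 value
            }
            out.collect(Tuple2.of(acc.firstValue, 1));
            acc.oldFirstValue = acc.firstValue;
        }
        if (!acc.secondValue.equals(acc.oldSecondValue)) {
            if (acc.oldSecondValue != Long.MIN_VALUE) {
                out.retract(Tuple2.of(acc.oldSecondValue, 2));  // withdraw the old rank-2 value
            }
            out.collect(Tuple2.of(acc.secondValue, 2));
            acc.oldSecondValue = acc.secondValue;
        }
    }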
