Flink Table & SQL Dimension Table Joins

Building on earlier write-ups, this post consolidates the ways to implement dimension table (lookup) joins in Flink Table & SQL, as well as in the DataStream API.

  1. Periodically loading dimension data

  2. Distributed Cache

  3. Async I/O

  4. Broadcast State

  5. UDTF + LATERAL TABLE syntax

  6. LookupableTableSource

Periodically Loading Dimension Data

Implementation

  1. Implement a RichFlatMapFunction and, in open(), start a timer task that periodically reads the dimension data and loads it into memory.

  2. Perform the dimension lookup in flatMap().

Code Example

package com.bigdata.flink.dimJoin;

import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

import java.sql.*;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Author: Wang Pei
 * Summary:
 * Periodically loads dimension data into memory.
 */
@Slf4j
public class DimRichFlatMapFunction extends RichFlatMapFunction<UserBrowseLog, Tuple2<UserBrowseLog, UserInfo>> {

    private final String url;
    private final String user;
    private final String passwd;
    private final Integer reloadInterval;

    private Connection connection;
    private Timer timer;
    private final String JDBC_DRIVER = "com.mysql.cj.jdbc.Driver";
    // ConcurrentHashMap: written by the timer thread, read by the task thread.
    private final Map<String, UserInfo> dimInfo = new ConcurrentHashMap<>();

    public DimRichFlatMapFunction(String url, String user, String passwd, Integer reloadInterval) {
        this.url = url;
        this.user = user;
        this.passwd = passwd;
        this.reloadInterval = reloadInterval;
    }

    /**
     * Open the connection and schedule periodic reloading of the dimension data.
     *
     * @param parameters
     * @throws Exception
     */
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        Class.forName(JDBC_DRIVER);

        TimerTask timerTask = new TimerTask() {
            @Override
            public void run() {
                try {
                    if (connection == null || connection.isClosed()) {
                        log.warn("No connection. Trying to reconnect...");
                        connection = DriverManager.getConnection(url, user, passwd);
                    }
                    String sql = "select uid,name,age,address from t_user_info";
                    // Use try-with-resources so the statement and result set are closed after every reload.
                    try (PreparedStatement preparedStatement = connection.prepareStatement(sql);
                         ResultSet resultSet = preparedStatement.executeQuery()) {
                        while (resultSet.next()) {
                            UserInfo userInfo = new UserInfo();
                            userInfo.setUid(resultSet.getString("uid"));
                            userInfo.setName(resultSet.getString("name"));
                            userInfo.setAge(resultSet.getInt("age"));
                            userInfo.setAddress(resultSet.getString("address"));

                            dimInfo.put(userInfo.getUid(), userInfo);
                        }
                    }
                } catch (SQLException e) {
                    log.error("Get dimension data exception...", e);
                }
            }
        };

        // Daemon timer, kept as a field so close() can cancel it.
        timer = new Timer(true);
        timer.scheduleAtFixedRate(timerTask, 0, reloadInterval * 1000L);

    }

    /**
     * Cancel the reload timer and close the connection.
     *
     * @throws Exception
     */
    @Override
    public void close() throws Exception {
        super.close();
        if (timer != null) {
            timer.cancel();
        }
        if (connection != null) {
            connection.close();
        }
    }

    /**
     * Dimension lookup.
     *
     * @param value
     * @param out
     * @throws Exception
     */
    @Override
    public void flatMap(UserBrowseLog value, Collector<Tuple2<UserBrowseLog, UserInfo>> out) throws Exception {
        String userID = value.getUserID();
        UserInfo dim = dimInfo.get(userID);
        if (dim != null) {
            out.collect(new Tuple2<>(value, dim));
        }
    }
}
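A minimal wiring sketch for reference (illustrative only: the JDBC URL and credentials are placeholders, and browseStream is assumed to be an existing DataStream<UserBrowseLog>):

// Reload the dimension table every 60 seconds and enrich each UserBrowseLog with its UserInfo.
DataStream<Tuple2<UserBrowseLog, UserInfo>> enriched = browseStream
        .flatMap(new DimRichFlatMapFunction(
                "jdbc:mysql://localhost:3306/bigdata", "user", "passwd", 60));
enriched.print();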

Notes

  1. The data is held in memory, so only small dimension tables are supported.

  2. Periodic reloading only suits dimension tables that change infrequently.

Distributed Cache

Implementation

  1. Register a local or HDFS file as a cached file via env.registerCachedFile(cachedFilePath, cachedFileName).

  2. When the job starts, Flink automatically distributes the file to each TaskManager's local file system.

  3. Implement a RichFlatMapFunction; in open(), fetch the cached file through the RuntimeContext and parse it.

  4. The parsed data then sits in memory, so the dimension lookup can be done in flatMap().

Code Example

package com.bigdata.flink.dimJoin;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

import java.io.File;
import java.util.HashMap;
import java.util.List;

/**
 * Author: Wang Pei
 * Summary:
 * Dimension join via Distributed Cache.
 */
@Slf4j
public class DistributedCacheJoinDim {
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Register a cached file, e.g. file:///some/path or hdfs://host:port/and/path
        String cachedFilePath = "./user_info.txt";
        String cachedFileName = "user_info";
        env.registerCachedFile(cachedFilePath, cachedFileName);

        // Real-time stream
        DataStreamSource<Tuple2<String, String>> stream = env.fromElements(
                Tuple2.of("1", "click"),
                Tuple2.of("2", "click"),
                Tuple2.of("3", "browse"));

        // Dimension join
        SingleOutputStreamOperator<String> dimedStream = stream.flatMap(new RichFlatMapFunction<Tuple2<String, String>, String>() {

            final HashMap<String, Integer> dimInfo = new HashMap<>();

            // Parse the cached file
            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                File cachedFile = getRuntimeContext().getDistributedCache().getFile(cachedFileName);
                List<String> lines = FileUtils.readLines(cachedFile, "UTF-8");
                for (String line : lines) {
                    String[] split = line.split(",");
                    dimInfo.put(split[0], Integer.valueOf(split[1]));
                }
            }

            // Dimension lookup
            @Override
            public void flatMap(Tuple2<String, String> value, Collector<String> out) throws Exception {
                if (dimInfo.containsKey(value.f0)) {
                    Integer age = dimInfo.get(value.f0);
                    out.collect(value.f0 + "," + value.f1 + "," + age);
                }
            }
        });

        dimedStream.print();

        env.execute();
    }
}

Notes

  1. The data is held in memory, so only small dimension tables are supported.

  2. The file is loaded at startup; if the dimension table changes, the job must be restarted.

Async I/O

Implementation

  1. The dimension data lives in external storage such as Elasticsearch, Redis, or HBase.

  2. Query the dimension data via asynchronous I/O.

  3. Combine it with a local cache such as Guava Cache to reduce the number of external lookups.

Code Example

I covered this in detail before, so no full example here; see the earlier post:

Flink DataStream Stream-to-Dimension-Table Join (Async I/O)
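For reference, a minimal sketch of the pattern. The class name AsyncDimLookupFunction is illustrative, UserInfo is the same POJO as above, and queryExternalStore() is a placeholder for a real client call (Redis/HBase/Elasticsearch):

package com.bigdata.flink.dimJoin;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;

import java.util.Collections;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Sketch: asynchronous dimension lookup with a Guava cache in front of an external store.
 */
public class AsyncDimLookupFunction extends RichAsyncFunction<String, Tuple2<String, UserInfo>> {

    private transient Cache<String, UserInfo> cache;
    private transient ExecutorService executor;

    @Override
    public void open(Configuration parameters) throws Exception {
        // Local cache to absorb repeated lookups of hot keys.
        cache = CacheBuilder.newBuilder()
                .maximumSize(10_000)
                .expireAfterWrite(10, TimeUnit.MINUTES)
                .build();
        executor = Executors.newFixedThreadPool(4);
    }

    @Override
    public void asyncInvoke(String userId, ResultFuture<Tuple2<String, UserInfo>> resultFuture) {
        UserInfo cached = cache.getIfPresent(userId);
        if (cached != null) {
            resultFuture.complete(Collections.singleton(Tuple2.of(userId, cached)));
            return;
        }
        // Run the external lookup off the main thread and complete the future when done.
        CompletableFuture
                .supplyAsync(() -> queryExternalStore(userId), executor)
                .thenAccept(userInfo -> {
                    if (userInfo != null) {
                        cache.put(userId, userInfo);
                    }
                    resultFuture.complete(Collections.singleton(Tuple2.of(userId, userInfo)));
                });
    }

    @Override
    public void close() throws Exception {
        if (executor != null) {
            executor.shutdown();
        }
    }

    // Placeholder: replace with a real (ideally natively asynchronous) client call.
    private UserInfo queryExternalStore(String userId) {
        return null;
    }
}

It would then be attached to a stream of user IDs with something like AsyncDataStream.unorderedWait(userIdStream, new AsyncDimLookupFunction(), 1000, TimeUnit.MILLISECONDS, 100).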

Notes

  1. This approach is not bounded by memory, so it can handle fairly large dimension datasets.

  2. It requires an external store.

  3. Access to the external store should be kept to a minimum (hence the local cache).

Broadcast State

Implementation

  1. Publish the dimension data to Kafka as stream S1; the fact data is stream S2.

  2. Define a state descriptor (MapStateDescriptor), e.g. descriptor.

  3. Broadcast S1 with that descriptor, e.g. S1.broadcast(descriptor), producing a BroadcastStream B1.

  4. Connect the fact stream S2 with the broadcast stream B1 to get a BroadcastConnectedStream BC.

  5. On BC, implement the join logic in a KeyedBroadcastProcessFunction/BroadcastProcessFunction.

Code Example

I covered this in detail before, so no full example here; see the earlier post:

Flink DataStream: Dynamically Updating Configuration via Broadcast State to Filter Data and Add Fields in Real Time
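A rough sketch of steps 2-5 (names are illustrative: dimStream is assumed to be a DataStream<Tuple2<String, UserInfo>> of dimension updates read from Kafka, and factStream a DataStream<UserBrowseLog> of fact events):

// 2. State descriptor for the broadcast state: userID -> UserInfo.
MapStateDescriptor<String, UserInfo> descriptor =
        new MapStateDescriptor<>("dim-state", Types.STRING, Types.POJO(UserInfo.class));

// 3. Broadcast the dimension stream.
BroadcastStream<Tuple2<String, UserInfo>> broadcastDim = dimStream.broadcast(descriptor);

// 4 + 5. Connect the fact stream with the broadcast stream and join in a BroadcastProcessFunction.
DataStream<Tuple2<UserBrowseLog, UserInfo>> joined = factStream
        .connect(broadcastDim)
        .process(new BroadcastProcessFunction<UserBrowseLog, Tuple2<String, UserInfo>, Tuple2<UserBrowseLog, UserInfo>>() {

            @Override
            public void processElement(UserBrowseLog value, ReadOnlyContext ctx,
                                       Collector<Tuple2<UserBrowseLog, UserInfo>> out) throws Exception {
                UserInfo dim = ctx.getBroadcastState(descriptor).get(value.getUserID());
                if (dim != null) {
                    out.collect(Tuple2.of(value, dim));
                }
            }

            @Override
            public void processBroadcastElement(Tuple2<String, UserInfo> value, Context ctx,
                                                Collector<Tuple2<UserBrowseLog, UserInfo>> out) throws Exception {
                // Every dimension update refreshes the broadcast state on all parallel instances.
                ctx.getBroadcastState(descriptor).put(value.f0, value.f1);
            }
        });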

Notes

  1. Changes to the dimension data must be turned into a Kafka stream.

  2. Dimension changes are picked up in real time.

  3. The dimension data is kept in memory (broadcast state), so only relatively small volumes are supported.

UDTF + LATERAL TABLE Syntax

Implementation

  1. Assuming you are using Flink SQL: first define a UDTF by extending the TableFunction abstract class and implementing its open(), close(), and eval() methods.

  2. Register the TableFunction.

  3. In SQL, join against the UDTF output using the LATERAL TABLE syntax.

Code Example: Flink Table API

I covered this in detail before, so no full example here; see the earlier post:

Flink Table & SQL Temporal Table
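For reference, registering and applying the UDTF from the Table API would look roughly like this (a sketch; browse is assumed to be a Table with userID and eventType fields, e.g. tableEnv.from("source_kafka") from the SQL example below):

// Register the table function, then apply it with a lateral left outer join.
tableEnv.registerFunction("udtfRedis", new UDTFRedis());
Table result = browse
        .leftOuterJoinLateral("udtfRedis(userID) as (userName, userAge)")
        .select("userID, eventType, userName, userAge");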

Code Example: Flink SQL

Define the UDTF

package com.bigdata.flink.dimJoin;

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.table.functions.FunctionContext;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;
import redis.clients.jedis.Jedis;

/**
 * Author: Wang Pei
 * Summary:
 * UDTF
 */
public class UDTFRedis extends TableFunction<Row> {

    private Jedis jedis;

    /**
     * Open the Redis connection.
     * @param context
     * @throws Exception
     */
    @Override
    public void open(FunctionContext context) throws Exception {
        jedis = new Jedis("localhost", 6379);
        jedis.select(0);
    }

    /**
     * Close the connection.
     * @throws Exception
     */
    @Override
    public void close() throws Exception {
        if (jedis != null) {
            jedis.close();
        }
    }

    /**
     * Look up the dimension data in Redis.
     * @param key
     */
    public void eval(String key) {
        String value = jedis.get(key);
        if (value != null) {
            String[] valueSplit = value.split(",");
            Row row = new Row(2);
            row.setField(0, valueSplit[0]);
            row.setField(1, Integer.valueOf(valueSplit[1]));
            collect(row);
        }
    }

    /**
     * Declare the result type: the returned fields are userName and userAge, hence String and Int.
     * @return
     */
    @Override
    public TypeInformation<Row> getResultType() {
        return new RowTypeInfo(Types.STRING, Types.INT);
    }
}

Kafka Join Redis-Dim

package com.bigdata.flink.dimJoin;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

/**
 * Author: Wang Pei
 * Summary:
 * Kafka Join Redis-Dim
 */
public class KafkaJoinRedisDimWithUDTF {
    public static void main(String[] args) throws Exception {

        EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build();
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings);

        // Source DDL
        // Kafka data: {"userID":"user_1","eventType":"click","eventTime":"2015-01-01 00:00:00"}
        String sourceDDL = ""
                + "create table source_kafka "
                + "( "
                + "    userID String, "
                + "    eventType String, "
                + "    eventTime String "
                + ") with ( "
                + "    'connector.type' = 'kafka', "
                + "    'connector.version' = '0.10', "
                + "    'connector.properties.bootstrap.servers' = 'kafka01:9092', "
                + "    'connector.properties.zookeeper.connect' = 'kafka01:2181', "
                + "    'connector.topic' = 'test_1', "
                + "    'connector.properties.group.id' = 'c1_test_1', "
                + "    'connector.startup-mode' = 'latest-offset', "
                + "    'format.type' = 'json' "
                + ")";
        tableEnv.sqlUpdate(sourceDDL);
        tableEnv.toAppendStream(tableEnv.from("source_kafka"), Row.class).print();

        // UDTF DDL
        // Data in Redis: key = userID, value = userName,userAge
        // 127.0.0.1:6379> get user_1
        // "name1,10"
        String udtfDDL = ""
                + "CREATE TEMPORARY FUNCTION "
                + "  IF NOT EXISTS UDTFRedis "
                + "  AS 'com.bigdata.flink.dimJoin.UDTFRedis'";
        tableEnv.sqlUpdate(udtfDDL);

        // Query
        // Left Join
        String execSQL = ""
                + "select "
                + " source_kafka.*,dim.* "
                + "from source_kafka "
                + "LEFT JOIN LATERAL TABLE(UDTFRedis(userID)) as dim (userName,userAge) ON TRUE";
        Table table = tableEnv.sqlQuery(execSQL);
        tableEnv.toAppendStream(table, Row.class).print();

        tableEnv.execute(KafkaJoinRedisDimWithUDTF.class.getSimpleName());
    }
}

Notes

  1. You have to define a UDTF and use the LATERAL TABLE syntax.

  2. Not very generic: it is hard to cover every "fetch dimension data from Redis" scenario with a single UDTF.

  3. It relies on an external store, so changes to the data are picked up promptly.

LookupableTableSource

Implementation

  1. The data source implements the LookupableTableSource interface.

  2. In Flink SQL you can simply register the lookup table; in the Flink Table API you register a LookupFunction.

  3. Under the hood, the dimension data is still fetched through a TableFunction.

Code Example: Flink Table API

I covered this in detail before, so no full example here; see the earlier post:

Flink Table & SQL LookupableTableSource Join HBase
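As a rough skeleton of step 1 above, based on the Flink 1.10-era interfaces (sketch only: the class name is illustrative, it reuses the UDTFRedis TableFunction from the previous section, and a production-ready source would additionally need a TableSourceFactory or programmatic registration):

package com.bigdata.flink.dimJoin;

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.functions.AsyncTableFunction;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.table.sources.LookupableTableSource;
import org.apache.flink.types.Row;

/**
 * Skeleton of a lookup table source backed by Redis (sketch only).
 */
public class RedisLookupableTableSource implements LookupableTableSource<Row> {

    @Override
    public TableFunction<Row> getLookupFunction(String[] lookupKeys) {
        // Synchronous lookup: eval() is called once per probe-side row.
        return new UDTFRedis();
    }

    @Override
    public AsyncTableFunction<Row> getAsyncLookupFunction(String[] lookupKeys) {
        throw new UnsupportedOperationException("Async lookup is not implemented in this sketch.");
    }

    @Override
    public boolean isAsyncEnabled() {
        return false;
    }

    @Override
    public TypeInformation<Row> getReturnType() {
        return new RowTypeInfo(Types.STRING, Types.INT);
    }

    @Override
    public TableSchema getTableSchema() {
        return TableSchema.builder()
                .field("userName", DataTypes.STRING())
                .field("userAge", DataTypes.INT())
                .build();
    }
}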

Code Example: Flink SQL

package com.bigdata.flink.dimJoin;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

/**
 * Author: Wang Pei
 * Summary:
 *  Kafka Join Mysql-Dim
 */
public class KafkaJoinMysqlDim {
    public static void main(String[] args) throws Exception {

        EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build();
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings);

        // Source DDL
        // Kafka data: {"userID":"user_1","eventType":"click","eventTime":"2015-01-01 00:00:00"}
        String sourceDDL = ""
                + "create table source_kafka "
                + "( "
                + "    userID STRING, "
                + "    eventType STRING, "
                + "    eventTime STRING, "
                + "    proctime AS PROCTIME() "
                + ") with ( "
                + "    'connector.type' = 'kafka', "
                + "    'connector.version' = '0.10', "
                + "    'connector.properties.bootstrap.servers' = 'kafka01:9092', "
                + "    'connector.properties.zookeeper.connect' = 'kafka01:2181', "
                + "    'connector.topic' = 'test_1', "
                + "    'connector.properties.group.id' = 'c1_test_1', "
                + "    'connector.startup-mode' = 'latest-offset', "
                + "    'format.type' = 'json' "
                + ")";
        tableEnv.sqlUpdate(sourceDDL);
        //tableEnv.toAppendStream(tableEnv.from("source_kafka"), Row.class).print();

        // Dim DDL
        // MySQL dimension data
        // mysql> select * from t_user_info limit 1;
        // +--------+----------+---------+
        // | userID | userName | userAge |
        // +--------+----------+---------+
        // | user_1 | name1    |      10 |
        // +--------+----------+---------+
        String dimDDL = ""
                + "CREATE TABLE dim_mysql ( "
                + "    userID STRING, "
                + "    userName STRING, "
                + "    userAge INT "
                + ") WITH ( "
                + "    'connector.type' = 'jdbc', "
                + "    'connector.url' = 'jdbc:mysql://localhost:3306/bigdata', "
                + "    'connector.table' = 't_user_info', "
                + "    'connector.driver' = 'com.mysql.jdbc.Driver', "
                + "    'connector.username' = '****', "
                + "    'connector.password' = '******' "
                + ")";
        tableEnv.sqlUpdate(dimDDL);

        // Query
        // Left Join
        String execSQL = ""
                + "SELECT "
                + "  kafka.*,mysql.userName,mysql.userAge "
                + "FROM "
                + "  source_kafka as kafka"
                + "  LEFT JOIN dim_mysql FOR SYSTEM_TIME AS OF kafka.proctime AS mysql "
                + "  ON kafka.userID = mysql.userID";
        Table table = tableEnv.sqlQuery(execSQL);
        tableEnv.toAppendStream(table, Row.class).print();

        tableEnv.execute(KafkaJoinMysqlDim.class.getSimpleName());

    }
}

Notes

  1. Requires implementing the LookupableTableSource interface.

  2. Fairly generic.

  3. It relies on an external store, so changes to the data are picked up promptly.

  4. Currently only supported by the Blink planner.
