Building on earlier write-ups, this post once again summarizes the ways to implement dimension-table (lookup) joins in Flink Table & SQL, including dimension-table joins in the DataStream API:
Periodically loading dimension data
Distributed Cache
Async I/O
Broadcast State
UDTF + LATERAL TABLE syntax
LookupableTableSource
Implement a RichFlatMapFunction: in the open() method, start a timer that periodically reads the dimension data and loads it into memory.
The dimension join itself is done in the flatMap() method.
package com.bigdata.flink.dimJoin;
import lombok.extern.slf4j.Slf4j;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;
import java.sql.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.Timer;
import java.util.TimerTask;
/**
* Author: Wang Pei
* Summary:
* Periodically load dimension data into memory
*/
@Slf4j
public class DimRichFlatMapFunction extends RichFlatMapFunction<UserBrowseLog, Tuple2<UserBrowseLog, UserInfo>> {
private final String url;
private final String user;
private final String passwd;
private final Integer reloadInterval;
private Connection connection;
private Timer timer;
private final String JDBC_DRIVER = "com.mysql.cj.jdbc.Driver";
// Shared between the reload timer thread and flatMap(), hence a concurrent map.
private final ConcurrentHashMap<String, UserInfo> dimInfo = new ConcurrentHashMap<>();
public DimRichFlatMapFunction(String url, String user, String passwd, Integer reloadInterval) {
this.url = url;
this.user = user;
this.passwd = passwd;
this.reloadInterval = reloadInterval;
}
/**
* Open the connection and
* schedule the periodic reload of the dimension data
*
* @param parameters
* @throws Exception
*/
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
Class.forName(JDBC_DRIVER);
TimerTask timerTask = new TimerTask() {
@Override
public void run() {
try {
if (connection == null || connection.isClosed()) {
log.warn("No connection. Trying to reconnect...");
connection = DriverManager.getConnection(url, user, passwd);
}
String sql = "select uid,name,age,address from t_user_info";
try (PreparedStatement preparedStatement = connection.prepareStatement(sql);
ResultSet resultSet = preparedStatement.executeQuery()) {
while (resultSet.next()) {
UserInfo userInfo = new UserInfo();
userInfo.setUid(resultSet.getString("uid"));
userInfo.setName(resultSet.getString("name"));
userInfo.setAge(resultSet.getInt("age"));
userInfo.setAddress(resultSet.getString("address"));
dimInfo.put(userInfo.getUid(), userInfo);
}
}
} catch (SQLException e) {
log.error("Get dimension data exception...", e);
}
}
};
// Daemon timer so it does not block JVM shutdown; cancelled in close().
timer = new Timer(true);
timer.scheduleAtFixedRate(timerTask, 0, reloadInterval * 1000L);
}
/**
* Cancel the reload timer and close the connection
*
* @throws Exception
*/
@Override
public void close() throws Exception {
super.close();
if (timer != null) {
timer.cancel();
}
if (connection != null) {
connection.close();
}
}
/**
* Dimension join
*
* @param value
* @param out
* @throws Exception
*/
@Override
public void flatMap(UserBrowseLog value, Collector<Tuple2<UserBrowseLog, UserInfo>> out) throws Exception {
String userID = value.getUserID();
UserInfo dim = dimInfo.get(userID);
if (dim != null) {
out.collect(new Tuple2<>(value, dim));
}
}
}
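For reference, a hypothetical way to wire the function above into a job might look like the snippet below; UserBrowseLogSource, the JDBC URL, and the credentials are placeholders, and the last constructor argument is the reload interval in seconds.

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Placeholder source producing UserBrowseLog events (e.g. a Kafka consumer in a real job)
DataStream<UserBrowseLog> browseStream = env.addSource(new UserBrowseLogSource());
browseStream
        .flatMap(new DimRichFlatMapFunction("jdbc:mysql://localhost:3306/bigdata", "user", "passwd", 60))
        .print();
env.execute("TimedReloadDimJoin");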
Because the data is held in memory, only small dimension tables are supported.
Periodic reloading is only suitable for dimension tables that are not updated frequently.
Register a local or HDFS file as a cached file via env.registerCachedFile(cachedFilePath, cachedFileName).
When the job starts, Flink automatically distributes the file to each TaskManager's file system.
Implement a RichFlatMapFunction; in the open() method, fetch the cached file through the RuntimeContext and parse it.
The parsed data is held in memory, so the dimension join can then be done in the flatMap() method.
package com.bigdata.flink.dimJoin;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
import java.io.File;
import java.util.HashMap;
import java.util.List;
/**
* Author: Wang Pei
* Summary:
* Dimension join via Distributed Cache
*/
@Slf4j
public class DistributedCacheJoinDim {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Register a cached file, e.g. file:///some/path or hdfs://host:port/and/path
String cachedFilePath = "./user_info.txt";
String cachedFileName = "user_info";
env.registerCachedFile(cachedFilePath, cachedFileName);
// The real-time (fact) stream
DataStreamSource<Tuple2<String, String>> stream = env.fromElements(
Tuple2.of("1", "click"),
Tuple2.of("2", "click"),
Tuple2.of("3", "browse"));
// Join with the dimension data
SingleOutputStreamOperator<String> dimedStream = stream.flatMap(new RichFlatMapFunction<Tuple2<String, String>, String>() {
HashMap<String, Integer> dimInfo = new HashMap<>();
// Read and parse the cached file
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
File cachedFile = getRuntimeContext().getDistributedCache().getFile(cachedFileName);
List<String> lines = FileUtils.readLines(cachedFile);
for (String line : lines) {
String[] split = line.split(",");
dimInfo.put(split[0], Integer.valueOf(split[1]));
}
}
// Dimension join
@Override
public void flatMap(Tuple2<String, String> value, Collector<String> out) throws Exception {
if (dimInfo.containsKey(value.f0)) {
Integer age = dimInfo.get(value.f0);
out.collect(value.f0 + "," + value.f1 + "," + age);
}
}
});
dimedStream.print();
env.execute();
}
}
Because the data is held in memory, only small dimension tables are supported.
The file is loaded only at startup, so the job has to be restarted whenever the dimension table changes.
The dimension data lives in an external store such as Elasticsearch, Redis, or HBase.
The dimension data is queried via asynchronous I/O.
Combined with a local cache such as Guava Cache, this reduces the number of requests to the external store.
A full example is in the earlier post linked below; only a minimal sketch follows here.
Flink DataStream: Joining a Stream with a Dimension Table (Async I/O)
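The sketch below is for illustration only and is not taken from the linked post: it uses a RichAsyncFunction together with a Guava cache, and queryExternalStore() as well as the pool size are placeholder assumptions standing in for a real Redis/HBase/ES client call.

package com.bigdata.flink.dimJoin;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;

import java.util.Collections;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Sketch only: asynchronous dimension lookup with a local Guava cache.
 * queryExternalStore() is a placeholder for a real Redis/HBase/ES client call.
 */
public class AsyncDimLookupFunction extends RichAsyncFunction<UserBrowseLog, Tuple2<UserBrowseLog, UserInfo>> {

    private transient ExecutorService executor;
    private transient Cache<String, UserInfo> localCache;

    @Override
    public void open(Configuration parameters) throws Exception {
        // Thread pool used to turn a blocking client call into an async one.
        executor = Executors.newFixedThreadPool(10);
        // Local cache to cut down on external lookups.
        localCache = CacheBuilder.newBuilder()
                .maximumSize(10_000)
                .expireAfterWrite(10, TimeUnit.MINUTES)
                .build();
    }

    @Override
    public void asyncInvoke(UserBrowseLog input, ResultFuture<Tuple2<UserBrowseLog, UserInfo>> resultFuture) {
        String userID = input.getUserID();
        UserInfo cached = localCache.getIfPresent(userID);
        if (cached != null) {
            resultFuture.complete(Collections.singleton(Tuple2.of(input, cached)));
            return;
        }
        CompletableFuture
                .supplyAsync(() -> queryExternalStore(userID), executor)
                .thenAccept(userInfo -> {
                    if (userInfo == null) {
                        // No dimension record found: emit nothing (inner-join semantics).
                        resultFuture.complete(Collections.emptyList());
                    } else {
                        localCache.put(userID, userInfo);
                        resultFuture.complete(Collections.singleton(Tuple2.of(input, userInfo)));
                    }
                })
                .exceptionally(throwable -> {
                    resultFuture.completeExceptionally(throwable);
                    return null;
                });
    }

    /** Placeholder for the actual external lookup (e.g. a Redis GET or an HBase Get). */
    private UserInfo queryExternalStore(String userID) {
        return null;
    }

    @Override
    public void close() throws Exception {
        if (executor != null) {
            executor.shutdown();
        }
    }
}

It would then be attached to the fact stream with something like AsyncDataStream.unorderedWait(browseStream, new AsyncDimLookupFunction(), 5, TimeUnit.SECONDS, 100).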
This approach is not limited by memory, so it can handle fairly large dimension data sets.
It requires support from an external storage system.
Access to the external store should still be minimized.
Send the dimension data to Kafka as a stream S1; the fact data is the stream S2.
Define a state descriptor, a MapStateDescriptor, e.g. descriptor.
Broadcast S1 with that descriptor, e.g. S1.broadcast(descriptor), which produces a BroadcastStream B1.
Connect the fact stream S2 with the broadcast stream B1, which produces a BroadcastConnectedStream BC.
On BC, implement the join logic in a KeyedBroadcastProcessFunction / BroadcastProcessFunction.
A full example is in the earlier post linked below; only a minimal sketch follows here.
Flink DataStream: Dynamically Updating Configuration via Broadcast State to Filter Data and Add Fields in Real Time
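For illustration only (again, not from the linked post), here is a sketch of the wiring. It assumes dimStream is a DataStream<UserInfo> built from the Kafka dimension topic and factStream is a DataStream<UserBrowseLog>; both names are placeholders. The classes involved are MapStateDescriptor, BroadcastStream, and BroadcastProcessFunction.

// Placeholders: dimStream = DataStream<UserInfo> (S1 from Kafka), factStream = DataStream<UserBrowseLog> (S2).
MapStateDescriptor<String, UserInfo> descriptor = new MapStateDescriptor<>(
        "dim-user-info", Types.STRING, Types.POJO(UserInfo.class));

// S1 -> broadcast stream B1
BroadcastStream<UserInfo> b1 = dimStream.broadcast(descriptor);

// S2 connect B1 -> BroadcastConnectedStream BC, then the join logic
DataStream<Tuple2<UserBrowseLog, UserInfo>> joined = factStream
        .connect(b1)
        .process(new BroadcastProcessFunction<UserBrowseLog, UserInfo, Tuple2<UserBrowseLog, UserInfo>>() {

            @Override
            public void processElement(UserBrowseLog value, ReadOnlyContext ctx,
                                       Collector<Tuple2<UserBrowseLog, UserInfo>> out) throws Exception {
                // Fact side: look up the latest dimension record in the broadcast state.
                UserInfo dim = ctx.getBroadcastState(descriptor).get(value.getUserID());
                if (dim != null) {
                    out.collect(Tuple2.of(value, dim));
                }
            }

            @Override
            public void processBroadcastElement(UserInfo value, Context ctx,
                                                Collector<Tuple2<UserBrowseLog, UserInfo>> out) throws Exception {
                // Dimension side: every update overwrites the entry in the broadcast state.
                ctx.getBroadcastState(descriptor).put(value.getUid(), value);
            }
        });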
Changes to the dimension data have to be turned into a stream in Kafka.
Changes to the dimension data are picked up in real time.
The dimension data is kept in memory (broadcast state), so the supported data volume is relatively small.
Assuming Flink SQL is used: first, write a custom UDTF by extending the TableFunction abstract class and implementing its open(), close(), and eval() methods.
Register the TableFunction.
In SQL, join against the UDTF's output using the LATERAL TABLE syntax.
See also this earlier post:
Flink Table & SQL Temporal Table
package com.bigdata.flink.dimJoin;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.table.functions.FunctionContext;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;
import redis.clients.jedis.Jedis;
/**
* Author: Wang Pei
* Summary:
* UDTF
*/
public class UDTFRedis extends TableFunction<Row> {
private Jedis jedis;
/**
* Open the Redis connection
* @param context
* @throws Exception
*/
@Override
public void open(FunctionContext context) throws Exception {
jedis = new Jedis("localhost", 6379);
jedis.select(0);
}
/**
* Close the Redis connection
* @throws Exception
*/
@Override
public void close() throws Exception {
if (jedis != null) {
jedis.close();
}
}
/**
* Look up dimension data in Redis by key
* @param key
*/
public void eval(String key) {
String value = jedis.get(key);
if (value != null) {
String[] valueSplit = value.split(",");
Row row = new Row(2);
row.setField(0, valueSplit[0]);
row.setField(1, Integer.valueOf(valueSplit[1]));
collector.collect(row);
}
}
/**
* Declare the produced row type. The returned fields are userName and userAge, hence String and Int.
* @return
*/
@Override
public TypeInformation<Row> getResultType() {
return new RowTypeInfo(Types.STRING, Types.INT);
}
}
package com.bigdata.flink.dimJoin;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
/**
* Author: Wang Pei
* Summary:
* Kafka Join Redis-Dim
*/
public class KafkaJoinRedisDimWithUDTF {
public static void main(String[] args) throws Exception {
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build();
StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings);
// Source DDL
// Sample Kafka record: {"userID":"user_1","eventType":"click","eventTime":"2015-01-01 00:00:00"}
String sourceDDL = ""
+ "create table source_kafka "
+ "( "
+ " userID String, "
+ " eventType String, "
+ " eventTime String "
+ ") with ( "
+ " 'connector.type' = 'kafka', "
+ " 'connector.version' = '0.10', "
+ " 'connector.properties.bootstrap.servers' = 'kafka01:9092', "
+ " 'connector.properties.zookeeper.connect' = 'kafka01:2181', "
+ " 'connector.topic' = 'test_1', "
+ " 'connector.properties.group.id' = 'c1_test_1', "
+ " 'connector.startup-mode' = 'latest-offset', "
+ " 'format.type' = 'json' "
+ ")";
tableEnv.sqlUpdate(sourceDDL);
tableEnv.toAppendStream(tableEnv.from("source_kafka"), Row.class).print();
// UDTF DDL
// Redis data: key = userID, value = userName,userAge
// 127.0.0.1:6379> get user_1
// "name1,10"
String udtfDDL = ""
+ "CREATE TEMPORARY FUNCTION "
+ " IF NOT EXISTS UDTFRedis "
+ " AS 'com.bigdata.flink.dimJoin.UDTFRedis'";
tableEnv.sqlUpdate(udtfDDL);
// Query
// Left Join
String execSQL = ""
+ "select "
+ " source_kafka.*,dim.* "
+ "from source_kafka "
+ "LEFT JOIN LATERAL TABLE(UDTFRedis(userID)) as dim (userName,userAge) ON TRUE";
Table table = tableEnv.sqlQuery(execSQL);
tableEnv.toAppendStream(table, Row.class).print();
tableEnv.execute(KafkaJoinRedisDimWithUDTF.class.getSimpleName());
}
}
Requires defining a UDTF and using the LATERAL TABLE syntax.
Not very generic: it is hard to cover every "fetch dimension data from Redis" scenario with a single UDTF.
Relies on external storage; when the dimension data changes, the latest values are picked up promptly.
The data source implements the LookupableTableSource interface.
In Flink SQL the lookup table can simply be registered and used; in the Flink Table API the LookupFunction has to be registered.
Under the hood, the dimension data is still fetched through a TableFunction.
See also this earlier post: Flink Table & SQL LookupableTableSource Join HBase
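The complete example below uses Flink's built-in JDBC connector, which already implements this interface. For a custom store, a skeleton might look roughly like the following sketch; it targets the legacy LookupableTableSource interface of the Flink 1.9/1.10 era, reuses the UDTFRedis function shown earlier, and the class name and schema are hypothetical.

package com.bigdata.flink.dimJoin;

import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.functions.AsyncTableFunction;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.table.sources.LookupableTableSource;
import org.apache.flink.table.types.DataType;
import org.apache.flink.types.Row;

/**
 * Sketch only: a synchronous lookup source backed by the UDTFRedis TableFunction above.
 * A production source would normally also expose the key column (userID) and have the
 * lookup function return rows matching the full table schema.
 */
public class RedisLookupTableSource implements LookupableTableSource<Row> {

    @Override
    public TableFunction<Row> getLookupFunction(String[] lookupKeys) {
        // Called by the planner for a lookup join; returns the TableFunction that does the point lookups.
        return new UDTFRedis();
    }

    @Override
    public AsyncTableFunction<Row> getAsyncLookupFunction(String[] lookupKeys) {
        throw new UnsupportedOperationException("Async lookup not implemented in this sketch");
    }

    @Override
    public boolean isAsyncEnabled() {
        return false; // use the synchronous TableFunction above
    }

    @Override
    public TableSchema getTableSchema() {
        return TableSchema.builder()
                .field("userName", DataTypes.STRING())
                .field("userAge", DataTypes.INT())
                .build();
    }

    @Override
    public DataType getProducedDataType() {
        return DataTypes.ROW(
                DataTypes.FIELD("userName", DataTypes.STRING()),
                DataTypes.FIELD("userAge", DataTypes.INT()));
    }
}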
package com.bigdata.flink.dimJoin;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
/**
* Author: Wang Pei
* Summary:
* Kafka Join Mysql-Dim
*/
public class KafkaJoinMysqlDim {
public static void main(String[] args) throws Exception {
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build();
StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings);
// Source DDL
// Sample Kafka record: {"userID":"user_1","eventType":"click","eventTime":"2015-01-01 00:00:00"}
String sourceDDL = ""
+ "create table source_kafka "
+ "( "
+ " userID STRING, "
+ " eventType STRING, "
+ " eventTime STRING, "
+ " proctime AS PROCTIME() "
+ ") with ( "
+ " 'connector.type' = 'kafka', "
+ " 'connector.version' = '0.10', "
+ " 'connector.properties.bootstrap.servers' = 'kafka01:9092', "
+ " 'connector.properties.zookeeper.connect' = 'kafka01:2181', "
+ " 'connector.topic' = 'test_1', "
+ " 'connector.properties.group.id' = 'c1_test_1', "
+ " 'connector.startup-mode' = 'latest-offset', "
+ " 'format.type' = 'json' "
+ ")";
tableEnv.sqlUpdate(sourceDDL);
//tableEnv.toAppendStream(tableEnv.from("source_kafka"), Row.class).print();
// Dim DDL
// MySQL dimension data
// mysql> select * from t_user_info limit 1;
// +--------+----------+---------+
// | userID | userName | userAge |
// +--------+----------+---------+
// | user_1 | name1 | 10 |
// +--------+----------+---------+
String dimDDL = ""
+ "CREATE TABLE dim_mysql ( "
+ " userID STRING, "
+ " userName STRING, "
+ " userAge INT "
+ ") WITH ( "
+ " 'connector.type' = 'jdbc', "
+ " 'connector.url' = 'jdbc:mysql://localhost:3306/bigdata', "
+ " 'connector.table' = 't_user_info', "
+ " 'connector.driver' = 'com.mysql.jdbc.Driver', "
+ " 'connector.username' = '****', "
+ " 'connector.password' = '******' "
+ ")";
tableEnv.sqlUpdate(dimDDL);
// Query
// Left Join
String execSQL = ""
+ "SELECT "
+ " kafka.*,mysql.userName,mysql.userAge "
+ "FROM "
+ " source_kafka as kafka"
+ " LEFT JOIN dim_mysql FOR SYSTEM_TIME AS OF kafka.proctime AS mysql "
+ " ON kafka.userID = mysql.userID";
Table table = tableEnv.sqlQuery(execSQL);
tableEnv.toAppendStream(table, Row.class).print();
tableEnv.execute(KafkaJoinMysqlDim.class.getSimpleName());
}
}
Requires implementing the LookupableTableSource interface.
Fairly generic.
Relies on external storage; when the dimension data changes, the latest values are picked up promptly.
Currently only supported by the Blink planner.