Every sink operator is essentially a wrapper around the client library of the corresponding storage system:
the Kafka sink wraps KafkaProducer;
the JDBC sink wraps a plain JDBC connection;
the Redis sink wraps Jedis.
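All of the examples in this article rely on a custom EventLog POJO and a custom source MySourceFunction that the article itself does not show. Purely as a hypothetical sketch (field names inferred from the getters used in the listings below; the generated values are arbitrary), they might look roughly like this:
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.UUID;

// hypothetical EventLog POJO, reconstructed from the getters used in the listings below
public class EventLog {
    private long guid;
    private String sessionId;
    private String eventId;
    private long timeStamp;

    public EventLog() { }
    public EventLog(long guid, String sessionId, String eventId, long timeStamp) {
        this.guid = guid;
        this.sessionId = sessionId;
        this.eventId = eventId;
        this.timeStamp = timeStamp;
    }
    public long getGuid() { return guid; }
    public String getSessionId() { return sessionId; }
    public String getEventId() { return eventId; }
    public long getTimeStamp() { return timeStamp; }
}

// hypothetical source that keeps emitting EventLog records until it is cancelled
class MySourceFunction implements SourceFunction<EventLog> {
    private volatile boolean running = true;
    @Override
    public void run(SourceContext<EventLog> ctx) throws Exception {
        while (running) {
            ctx.collect(new EventLog((long) (Math.random() * 1000), UUID.randomUUID().toString(), "appLaunch", System.currentTimeMillis()));
            Thread.sleep(200);
        }
    }
    @Override
    public void cancel() { running = false; }
}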
KafkaSink can work together with Flink's checkpoint mechanism to provide end-to-end exactly-once semantics
(under the hood it relies on the Kafka producer's transaction mechanism).
Flink integrates with several Kafka versions, e.g. 0.11.x, 1.x and 2.x;
since Flink 1.9 the Kafka 2.2 client is used.
Core classes: KafkaSink, KafkaRecordSerializationSchema and DeliveryGuarantee (all shown in the example below).
Code example for the new KafkaSink operator:
import com.alibaba.fastjson.JSON;
import com.yang.flink.source.MySourceFunction;
import com.yang.flink.vo.EventLog;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class KafkaSinkOperatorDemo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// enable checkpointing (required for EXACTLY_ONCE delivery)
env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");
DataStreamSource<EventLog> streamSource = env.addSource(new MySourceFunction()); // custom source; a Kafka source could be used here just as well
KafkaSink<String> kafkaSink = KafkaSink.<String>builder()
.setBootstrapServers("hadoop102:9092,hadoop103:9092")
.setRecordSerializer(KafkaRecordSerializationSchema.<String>builder()
.setTopic("event_log")
.setValueSerializationSchema(new SimpleStringSchema())
.build()
)
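// EXACTLY_ONCE requires checkpointing to be enabled and a transactional id prefix to be set;
// the prefix should be unique among all applications writing to the same Kafka cluster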
.setDeliverGuarantee(DeliveryGuarantee.EXACTLY_ONCE)
.setTransactionalIdPrefix("yang")
.build();
streamSource.map(JSON::toJSONString).disableChaining() // disableChaining() prevents operator chaining, so the map runs as a task of its own and can be scheduled independently
.sinkTo(kafkaSink);
env.execute();
}
}
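Two practical notes on EXACTLY_ONCE that the listing does not show: the producer-side transaction.timeout.ms must stay within the broker's transaction.max.timeout.ms (15 minutes by default) and should be larger than the checkpoint interval, and downstream consumers only see the data after the transaction tied to a completed checkpoint commits, so they should consume with isolation.level=read_committed. A sketch of setting the timeout explicitly on the builder (the value 600000 ms is just an assumed example, and setProperty is available on the builder in recent Flink versions); it is a drop-in replacement for the builder call above, imports unchanged:
KafkaSink<String> kafkaSink = KafkaSink.<String>builder()
        .setBootstrapServers("hadoop102:9092,hadoop103:9092")
        .setRecordSerializer(KafkaRecordSerializationSchema.<String>builder()
                .setTopic("event_log")
                .setValueSerializationSchema(new SimpleStringSchema())
                .build())
        .setDeliverGuarantee(DeliveryGuarantee.EXACTLY_ONCE)
        .setTransactionalIdPrefix("yang")
        // must not exceed the broker's transaction.max.timeout.ms, and should exceed the checkpoint interval
        .setProperty("transaction.timeout.ms", "600000")
        .build();
JdbcSink: the JDBC connector offers two variants, JdbcSink.sink() (at-least-once) and JdbcSink.exactlyOnceSink() (exactly-once on top of XA transactions in the target database). The first listing below shows the at-least-once variant: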
import com.yang.flink.source.MySourceFunction;
import com.yang.flink.vo.EventLog;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.connector.jdbc.*;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import java.sql.PreparedStatement;
import java.sql.SQLException;
public class JDBCSinkOperatorDemo {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.setInteger("rest.port", 8822);
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
// enable checkpointing
env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");
// build the data stream
DataStreamSource<EventLog> streamSource = env.addSource(new MySourceFunction());
/**
* 1. The variant that does NOT guarantee EOS (exactly-once) semantics
*/
SinkFunction<EventLog> jdbcSink = JdbcSink.sink(
"insert into t_eventlog values (?,?,?,?) on duplicate key update guid=?,sessionId=?,eventId=?,ts=?",
new JdbcStatementBuilder<EventLog>() {
@Override
public void accept(PreparedStatement preparedStatement, EventLog eventLog) throws SQLException {
preparedStatement.setLong(1, eventLog.getGuid());
preparedStatement.setString(2, eventLog.getSessionId());
preparedStatement.setString(3, eventLog.getEventId());
preparedStatement.setLong(4, eventLog.getTimeStamp());
preparedStatement.setString(5, eventLog.getSessionId());
preparedStatement.setString(6, eventLog.getEventId());
preparedStatement.setLong(7, eventLog.getTimeStamp());
}
},
JdbcExecutionOptions.builder()
.withMaxRetries(3)
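// batch size 1 flushes every record immediately; a larger batch size trades latency for throughput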
.withBatchSize(1)
.build(),
new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
.withUrl("jdbc:mysql://hadoop102:3306/abc?serverTimezone=Asia/Shanghai&useUnicode=true&characterEncoding=UTF-8")
.withUsername("root")
.withPassword("000000")
.build()
);
// write the data out
streamSource.addSink(jdbcSink);
env.execute();
}
}
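JdbcSink.sink() provides at-least-once semantics: after a failure and restart, records since the last checkpoint may be written again. The upsert statement above makes such replays harmless, assuming guid (or some other column) is the table's primary or unique key. The next listing is the exactly-once variant, which instead relies on XA (distributed) transactions in the target database: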
import com.mysql.cj.jdbc.MysqlXADataSource;
import com.yang.flink.source.MySourceFunction;
import com.yang.flink.vo.EventLog;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.connector.jdbc.*;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.util.function.SerializableSupplier;
import javax.sql.XADataSource;
import java.sql.PreparedStatement;
import java.sql.SQLException;
public class JDBCSinkOperatorDemo {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.setInteger("rest.port", 8822);
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
// enable checkpointing
env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");
// build the data stream
DataStreamSource<EventLog> streamSource = env.addSource(new MySourceFunction());
/**
* 2. The sink variant that can guarantee EOS (exactly-once) semantics
* Builds a JdbcSink operator that provides exactly-once guarantees;
* under the hood it relies on XA transactions in the target database
*/
SinkFunction<EventLog> exactlyOnceSink = JdbcSink.exactlyOnceSink(
"insert into t_eventlog values (?,?,?,?) on duplicate key update guid=?,sessionId=?,eventId=?,ts=?",
new JdbcStatementBuilder<EventLog>() {
@Override
public void accept(PreparedStatement preparedStatement, EventLog eventLog) throws SQLException {
preparedStatement.setLong(1, eventLog.getGuid());
preparedStatement.setString(2, eventLog.getSessionId());
preparedStatement.setString(3, eventLog.getEventId());
preparedStatement.setLong(4, eventLog.getTimeStamp());
preparedStatement.setString(5, eventLog.getSessionId());
preparedStatement.setString(6, eventLog.getEventId());
preparedStatement.setLong(7, eventLog.getTimeStamp());
}
},
JdbcExecutionOptions.builder()
.withMaxRetries(0) // the JDBC XA sink requires maxRetries = 0, otherwise retries could produce duplicates
.withBatchSize(1)
.build(),
JdbcExactlyOnceOptions.builder()
// MySQL does not support multiple parallel transactions on a single connection, so this must be set to true
.withTransactionPerConnection(true)
.build(),
new SerializableSupplier<XADataSource>() {
@Override
public XADataSource get() {
// An XADataSource is a JDBC data source that supports distributed (XA) transactions;
// each database vendor ships its own implementation, so construction differs per database
MysqlXADataSource xaDataSource = new MysqlXADataSource();
xaDataSource.setUrl("jdbc:mysql://hadoop102:3306/abc?serverTimezone=Asia/Shanghai&useUnicode=true&characterEncoding=UTF-8");
xaDataSource.setUser("root");
xaDataSource.setPassword("000000");
return xaDataSource;
}
}
);
// write the data out
streamSource.addSink(exactlyOnceSink);
env.execute();
}
}
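A few extra requirements for the exactly-once JDBC sink: the MySQL JDBC driver (mysql-connector-java) must be on the classpath for MysqlXADataSource; on MySQL 8 the database user needs the XA_RECOVER_ADMIN privilege so that Flink can recover pending XA transactions after a restart; and for databases such as MySQL that do not allow several XA transactions per connection, withTransactionPerConnection(true) is mandatory, as shown above.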
There is currently no official Flink sink operator for writing to Redis.
A RedisSink is available from the Apache Bahir project: https://github.com/apache/bahir-flink
Add that connector (or import its module) into your own project; a code example follows:
import com.alibaba.fastjson.JSON;
import com.yang.flink.source.MySourceFunction;
import com.yang.flink.vo.EventLog;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.redis.RedisSink;
import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommand;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommandDescription;
import org.apache.flink.streaming.connectors.redis.common.mapper.RedisMapper;
import java.util.Optional;
/**
* Write the data stream to Redis using the RedisSink operator
*/
public class RedisSinkOperatorDemo {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
configuration.setInteger("rest.port", 8822);
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(configuration);
// enable checkpointing
env.enableCheckpointing(5000, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointStorage("file:///e:/ckpt");
// build the data stream
DataStreamSource<EventLog> streamSource = env.addSource(new MySourceFunction());
// write the EventLog records into Redis; pick the Redis data structure to store them with (see the two mappers below)
FlinkJedisPoolConfig config = new FlinkJedisPoolConfig.Builder().setHost("hadoop102").build();
RedisSink<EventLog> redisSink = new RedisSink<>(config, new StringInsertMapper());
streamSource.addSink(redisSink);
env.execute();
}
/**
* 1. Write each record to Redis as a STRING
*/
static class StringInsertMapper implements RedisMapper<EventLog> {
@Override
public RedisCommandDescription getCommandDescription() {
return new RedisCommandDescription(RedisCommand.SET,"eventlogs"); // for SET the second (additionalKey) argument is not used
}
/**
* If the chosen Redis structure has no inner key, this method returns the (outer) key itself;
* if it has an inner key (e.g. HSET), this method returns the inner field, and the value passed
* to the RedisCommandDescription above is used as the outer key
* @param data
* @return
*/
@Override
public String getKeyFromData(EventLog data) {
return data.getGuid()+"-"+data.getSessionId()+"-"+data.getTimeStamp(); // this is the key of the STRING entry
}
@Override
public String getValueFromData(EventLog data) {
return JSON.toJSONString(data); // this is the value of the STRING entry
}
}
/**
* 2. Write each record into a Redis HASH
*/
static class HsetInsertMapper implements RedisMapper<EventLog>{
// optionally derive the additional (outer) key from the record; hash-like structures have such an outer key
@Override
public Optional<String> getAdditionalKey(EventLog data) {
return RedisMapper.super.getAdditionalKey(data);
}
// optionally set a per-record TTL (time to live)
@Override
public Optional<Integer> getAdditionalTTL(EventLog data) {
return RedisMapper.super.getAdditionalTTL(data);
}
@Override
public RedisCommandDescription getCommandDescription() {
return new RedisCommandDescription(RedisCommand.HSET,"event-logs");
}
/**
* If the chosen Redis structure has no inner key, this method returns the (outer) key itself;
* if it has an inner key (e.g. HSET), this method returns the inner field, and the value passed
* to the RedisCommandDescription above is used as the outer key
* @param data
* @return
*/
@Override
public String getKeyFromData(EventLog data) {
return data.getGuid()+"-"+data.getSessionId()+"-"+data.getTimeStamp(); // this is the field (inner key) inside the hash
}
@Override
public String getValueFromData(EventLog data) {
return data.getEventId(); // this is the value of that hash field
}
}
}
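The RedisSink itself offers at-least-once semantics: after a failure, records since the last checkpoint are written again, which is harmless here because SET and HSET simply overwrite the same keys. If the Redis server needs more than a host name, the Bahir pool-config builder accepts further options; a minimal sketch with assumed values:
FlinkJedisPoolConfig config = new FlinkJedisPoolConfig.Builder()
        .setHost("hadoop102")
        .setPort(6379)         // default Redis port
        .setPassword("000000") // assumed password; omit if auth is disabled
        .setDatabase(0)        // logical Redis database index
        .setTimeout(5000)      // connection/socket timeout in milliseconds
        .build();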