对于实时处理当中,我们实际工作当中的数据源一般都是使用kafka,所以我们一起来看看如何通过Flink来集成kafka。flink提供了一个特有的kafka connector去读写kafka topic的数据。flink消费kafka数据,并不是完全通过跟踪kafka消费组的offset来实现去保证exactly-once的语义,而是flink内部去跟踪offset和做checkpoint去实现exactly-once的语义,而且对于kafka的partition,Flink会启动对应的并行度去处理kafka当中的每个分区的数据
flink整合kafka官网介绍:https://ci.apache.org/projects/flink/flink-docs-release-1.6/dev/connectors/kafka.html
第一步:导入jar包
<dependency>
<groupId>org.apache.flinkgroupId>
<artifactId>flink-connector-kafka-0.11_2.11artifactId>
<version>1.8.1version>
dependency>
<dependency>
<groupId>org.apache.kafkagroupId>
<artifactId>kafka-clientsartifactId>
<version>1.1.0version>
dependency>
<dependency>
<groupId>org.slf4jgroupId>
<artifactId>slf4j-apiartifactId>
<version>1.7.25version>
dependency>
<dependency>
<groupId>org.slf4jgroupId>
<artifactId>slf4j-log4j12artifactId>
<version>1.7.25version>
dependency>
第二步:将kafka作为flink的source来使用
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
object FlinkKafkaSource {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
//隐式转换
import org.apache.flink.api.scala._
//checkpoint配置
env.enableCheckpointing(100);
env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500);
env.getCheckpointConfig.setCheckpointTimeout(60000);
env.getCheckpointConfig.setMaxConcurrentCheckpoints(1);
env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
env.setStateBackend(new RocksDBStateBackend("hdfs://node01:8020/flink/checkDir",true))
val topic = "test"
val prop = new Properties()
prop.setProperty("bootstrap.servers","node01:9092")
prop.setProperty("group.id","con1")
prop.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
prop.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
var kafkaSoruce: FlinkKafkaConsumer011[String] = new FlinkKafkaConsumer011[String](topic, new SimpleStringSchema(), prop)
kafkaSoruce.setCommitOffsetsOnCheckpoints(true)
//设置statebackend
env.setStateBackend(new RocksDBStateBackend("hdfs://node01:8020/flink_kafka/checkpoints",true));
val result: DataStream[String] = env.addSource(kafkaSoruce)
result.print()
env.execute()
}
}
第三步:将kafka作为flink的sink来使用
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011
import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper
object FlinkKafkaSink {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
//隐式转换
import org.apache.flink.api.scala._
//checkpoint配置
env.enableCheckpointing(5000);
env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500);
env.getCheckpointConfig.setCheckpointTimeout(60000);
env.getCheckpointConfig.setMaxConcurrentCheckpoints(1);
env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
//设置statebackend
env.setStateBackend(new RocksDBStateBackend("hdfs://node01:8020/flink_kafka_sink/checkpoints",true));
val text = env.socketTextStream("node01",9000)
val topic = "test"
val prop = new Properties()
prop.setProperty("bootstrap.servers","node01:9092")
prop.setProperty("group.id","kafka_group1")
//第一种解决方案,设置FlinkKafkaProducer011里面的事务超时时间
//设置事务超时时间
prop.setProperty("transaction.timeout.ms",60000*15+"");
//第二种解决方案,设置kafka的最大事务超时时间
//FlinkKafkaProducer011 myProducer = new FlinkKafkaProducer011<>(brokerList, topic, new SimpleStringSchema());
//使用支持仅一次语义的形式
val myProducer = new FlinkKafkaProducer011[String](topic,new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()), prop, FlinkKafkaProducer011.Semantic.EXACTLY_ONCE)
text.addSink(myProducer)
env.execute("StreamingFromCollectionScala")
}
}