scala Flink读取kafka 写入elasticsearch 简单实现
- 引入pom依赖
- es的Mapping
- 读取kafka 写入es 代码实现
- scala 构建kafka生产者
- scala 构建kafka消费者
引入pom依赖
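<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka_2.11</artifactId>
    <version>1.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>1.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-elasticsearch6_2.11</artifactId>
    <version>1.7.0</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>1.7.0</version>
</dependency>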
es的Mapping
{
"order": 0,
"index_patterns": [
"xiaojin_*"
],
"settings": {
"index": {
"number_of_shards": "3",
"translog": {
"flush_threshold_size": "1g"
},
"number_of_replicas": "1"
}
},
"mappings": {
"_default_": {
"dynamic_templates": [
{
"string_as_keyword": {
"mapping": {
"type": "keyword"
},
"match_mapping_type": "string",
"match": "*"
}
}
],
"_all": {
"enabled": false
},
"properties": {
"originalMsg": {
"index": false,
"type": "text"
},
"index": {
"index": false,
"type": "keyword"
},
"location": {
"type": "geo_point"
},
"id": {
"type": "keyword"
},
"table": {
"index": false,
"type": "keyword"
},
"speed": {
"type": "double"
}
}
}
},
"aliases": {}
}
读取kafka 写入es 代码实现
import java.util.Properties
import org.apache.flink.streaming.connectors.kafka._
import org.apache.flink.streaming.api.scala._
import org.apache.flink.api.common.functions.RuntimeContext
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink
import org.apache.http.HttpHost
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.elasticsearch.action.index.IndexRequest
import org.elasticsearch.client.Requests
import org.apache.flink.api.common.serialization.SimpleStringSchema
/**
 * Flink streaming job that reads comma-separated records from the Kafka topic
 * `flink_xiaojin` and indexes each one into the Elasticsearch index `xiaojin_20200724`.
 *
 * The first three comma-separated fields of every record are stored as the string
 * fields `wei`, `jing` and `time`. Records with fewer than three fields are reported
 * on stderr and skipped instead of failing the job.
 */
object Flink_kafka {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Checkpoint every 5000 ms. The original author flags this as essential for the job.
    env.enableCheckpointing(5000)
    import org.apache.flink.api.scala._

    // Kafka consumer configuration.
    val props = new Properties()
    props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "xxx:9092,xxx:9092,xxx:9092")
    props.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest")
    props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "group-8")

    // Source: every Kafka record value is decoded as a plain string.
    val consumer = new FlinkKafkaConsumer[String]("flink_xiaojin", new SimpleStringSchema(), props)
    // Only read records that arrive after the job starts.
    consumer.setStartFromLatest()

    val stream = env.addSource(consumer)
    // Echo incoming records to stdout for debugging.
    stream.print()

    val httpHosts = new java.util.ArrayList[HttpHost]
    httpHosts.add(new HttpHost("172.16.15.52", 9200, "http"))

    val esSinkBuilder = new ElasticsearchSink.Builder[String](
      httpHosts,
      new ElasticsearchSinkFunction[String] {
        /** Builds the index request from the already-split fields (at least three required). */
        def createIndexRequest(fields: Array[String]): IndexRequest = {
          val json = new java.util.HashMap[String, String]
          json.put("wei", fields(0))
          json.put("jing", fields(1))
          json.put("time", fields(2))
          Requests.indexRequest()
            .index("xiaojin_20200724")
            .`type`("location")
            .source(json)
        }

        override def process(element: String, runtimeContext: RuntimeContext, requestIndexer: RequestIndexer): Unit = {
          // Split once instead of once per field.
          val fields = element.split(",")
          if (fields.length >= 3) {
            requestIndexer.add(createIndexRequest(fields))
          } else {
            // A short record would otherwise throw ArrayIndexOutOfBoundsException here and
            // fail the whole job; after a restart the same record could be replayed and
            // fail it again, so report and skip it.
            System.err.println(s"Skipping malformed record (expected at least 3 comma-separated fields): $element")
          }
        }
      }
    )

    // Flush after every single element instead of buffering actions into larger bulk requests.
    esSinkBuilder.setBulkFlushMaxActions(1)
    stream.addSink(esSinkBuilder.build())
    env.execute("Kafka_Flink")
  }
}
scala 构建kafka生产者
package com.xiaojin
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
/**
 * Demo Kafka producer.
 *
 * Sends 100 string messages of the form `i,this is a,linys i kafka data` to the topic
 * `flink_xiaojin`, waiting for the broker acknowledgement of each message and pausing
 * 500 ms between sends. The producer is always closed, even if a send fails.
 */
object KafkaProducerDemo {
  def main(args: Array[String]): Unit = {
    val prop = new Properties
    // Kafka brokers used for the initial connection.
    prop.put("bootstrap.servers", "xxx:9092,xxx:9092,xxx:9092")
    // Acknowledgement mode: "all" waits for the full set of in-sync replicas.
    //prop.put("acks", "0")
    prop.put("acks", "all")
    // Number of retries for a failed request.
    //prop.put("retries", "3")
    // Serializer for record keys (this demo sends records without a key).
    prop.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    // Serializer for record values.
    prop.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    // Request timeout in milliseconds.
    prop.put("request.timeout.ms", "60000")
    //prop.put("batch.size", "16384")
    //prop.put("linger.ms", "1")
    //prop.put("buffer.memory", "33554432")

    val producer = new KafkaProducer[String, String](prop)
    try {
      // Generate some sample records and send them to Kafka.
      for (i <- 1 to 100) {
        val msg = s"${i},this is a,linys ${i} kafka data"
        println("send -->" + msg)
        // get() blocks until the send completes and yields the record's metadata.
        val rmd: RecordMetadata = producer.send(new ProducerRecord[String, String]("flink_xiaojin", msg)).get()
        println(rmd.toString)
        Thread.sleep(500)
      }
    } finally {
      // Release the producer's resources even when a send or the sleep throws.
      producer.close()
    }
  }
}
scala 构建kafka消费者
package com.xiaojin
import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.{ConsumerRecords, KafkaConsumer}
/**
 * Demo Kafka consumer.
 *
 * Subscribes to the topic `flink_xiaojin` as part of consumer group `group01` and prints
 * the partition, offset, key and value of every record it receives. The poll loop runs
 * until it is terminated by an exception; the consumer is closed on the way out.
 */
object KafkaConsumerDemo {
  def main(args: Array[String]): Unit = {
    // Consumer configuration.
    val prop = new Properties
    prop.put("bootstrap.servers", "xxx:9092,xxx:9092,xxx:9092")
    // Consumer group.
    prop.put("group.id", "group01")
    // Where to start when the group has no committed offset: earliest/latest/none.
    prop.put("auto.offset.reset", "earliest")
    // Deserializer for record keys.
    prop.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    // Deserializer for record values.
    prop.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    prop.put("enable.auto.commit", "true")
    prop.put("session.timeout.ms", "30000")

    val kafkaConsumer = new KafkaConsumer[String, String](prop)
    try {
      // A topic subscription is required before polling.
      kafkaConsumer.subscribe(Collections.singletonList("flink_xiaojin"))
      while (true) {
        // poll waits up to 2000 ms when no records are available and returns
        // immediately when there are records to consume.
        val msgs: ConsumerRecords[String, String] = kafkaConsumer.poll(2000)
        // println(msgs.count())
        val it = msgs.iterator()
        while (it.hasNext) {
          val msg = it.next()
          println(s"partition: ${msg.partition()}, offset: ${msg.offset()}, key: ${msg.key()}, value: ${msg.value()}")
        }
      }
    } finally {
      // The loop only ends through an exception; close the consumer when that happens.
      kafkaConsumer.close()
    }
  }
}