Preface
The spark-streaming-kafka_2.11 artifact has not been updated for a long time; the current package is spark-streaming-kafka-0-10_2.11, so by the time you read this note there is no need to follow the procedure below: it is fairly cumbersome, and the newer package can commit offsets automatically. This post is kept purely as a record. With spark-streaming-kafka-0-10_2.11, if you want the job to resume from the position it last read in a topic, it is enough to set "enable.auto.commit" -> (true: java.lang.Boolean). The params are set as follows:
kafkaParams
val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> "hadoop-mgr:9092,hadoop-node1:9092,hadoop-node2:9092",
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "group.id" -> "spark_group",
  "auto.offset.reset" -> "latest",
  "enable.auto.commit" -> (true: java.lang.Boolean)
)
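For comparison, this is roughly how the stream itself is created with the 0-10 integration. A minimal sketch, assuming an existing StreamingContext ssc; the topic name is a placeholder:
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

// placeholder topic list; replace with your own topics
val topics = Array("some_topic")
val stream = KafkaUtils.createDirectStream[String, String](
  ssc,
  PreferConsistent,
  Subscribe[String, String](topics, kafkaParams)
)
// with enable.auto.commit = true the underlying consumer commits offsets on its own,
// so a restarted job resumes from the last committed position without manual bookkeeping
stream.map(record => (record.key, record.value)).print()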
Content
pom.xml
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka_2.11</artifactId>
    <version>1.5.1</version>
</dependency>
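If the project is built with sbt rather than Maven, the equivalent dependency line would be (assuming a standard Scala 2.11 build, where %% appends the Scala version to the artifact name):
libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka" % "1.5.1"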
main.scala
val kafkaParams = HashMap[String, String](
  "bootstrap.servers" -> "hadoop-mgr:9092,hadoop-node1:9092,hadoop-node2:9092",
  "group.id" -> "spark_group")
val topics = Set(TopicConstant.TOPIC_FACE_NAME, TopicConstant.TOPIC_PERSON_NAME,
  TopicConstant.TOPIC_MOTOR_VEHICLE, TopicConstant.TOPIC_SUB_NOTIFICATION)
// Note: with the 0-10 API you would create the stream like this instead:
// val stream = KafkaUtils.createDirectStream[String, String](
//   ssc,
//   LocationStrategies.PreferConsistent,
//   Subscribe[String, String](topics, kafkaParams))
// Here we go through KafkaManager so that offsets are read from and written back to ZooKeeper.
val manager = new KafkaManager(kafkaParams)
val stream = manager.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
stream.foreachRDD(rdd => {
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd.foreachPartition(partitionOfRecords => {
    // offset range of the partition this task is processing
    val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
    partitionOfRecords.foreach(data => {
      // Log.info("------>:data:" + data._2)
      println(s"----->topic:${o.topic},partition:${o.partition},fromOffset:${o.fromOffset},untilOffset:${o.untilOffset}")
    })
  })
  // after the batch is processed, write its until-offsets back to ZooKeeper
  manager.updateZKOffsets(rdd)
})
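For reference, the snippet above assumes roughly the following imports (TopicConstant and Log are classes from the project itself):
import kafka.serializer.StringDecoder
import org.apache.spark.TaskContext
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaManager, OffsetRange}
import scala.collection.immutable.HashMap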
KafkaManager.scala
This class has to be placed in the org.apache.spark.streaming.kafka package, otherwise KafkaCluster cannot be constructed: KafkaCluster is declared private[spark], so it is only visible from inside Spark's own packages.
package org.apache.spark.streaming.kafka
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import scala.collection.immutable.HashMap
import scala.reflect.ClassTag
/**
* description: KafkaManager
* date: 2020/6/24 17:50
* author: chezi008/[email protected]
* version: 1.0
*/
class KafkaManager(val kafkaParams: HashMap[String, String]) extends Serializable {
  // KafkaCluster talks to the brokers/ZooKeeper to read and write consumer offsets
  private val kc = new KafkaCluster(kafkaParams)
  /**
   * Create a direct stream, resuming from the offsets stored in ZooKeeper when they exist.
   *
   * @param ssc         the StreamingContext
   * @param kafkaParams Kafka consumer parameters; must contain group.id
   * @param topics      the topics to subscribe to
   * @tparam K  key type
   * @tparam V  value type
   * @tparam KD key decoder
   * @tparam VD value decoder
   * @return an InputDStream of (key, value) pairs
   */
  def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K] : ClassTag, VD <: Decoder[V] : ClassTag]
  (ssc: StreamingContext, kafkaParams: HashMap[String, String], topics: Set[String]): InputDStream[(K, V)] = {
    val groupId = kafkaParams("group.id")
    // read the offsets stored in ZooKeeper and start consuming messages from there
    val partitionsE = kc.getPartitions(topics)
    if (partitionsE.isLeft)
      throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
    val partitions = partitionsE.right.get
    val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
    if (consumerOffsetsE.isRight) {
      // offsets for this group already exist: resume from them
      val consumerOffsets = consumerOffsetsE.right.get
      KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](
        ssc, kafkaParams, consumerOffsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
    } else {
      // no stored offsets yet: start from the largest (newest) available offset
      val p = kafkaParams + ("auto.offset.reset" -> "largest")
      KafkaUtils.createDirectStream[K, V, KD, VD](ssc, p, topics)
    }
  }
  /**
   * Write the until-offsets of the given RDD back to ZooKeeper for this consumer group.
   *
   * @param rdd the RDD produced by the direct stream for the current batch
   */
  def updateZKOffsets(rdd: RDD[(String, String)]): Unit = {
    val groupId = kafkaParams("group.id")
    val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    for (offsets <- offsetsList) {
      val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
      val o = kc.setConsumerOffsets(groupId, HashMap((topicAndPartition, offsets.untilOffset)))
      if (o.isLeft) {
        println(s"Error updating the offset to Kafka cluster: ${o.left.get}")
      }
    }
  }
}