spark-streaming-kafka_2.11实现存储offset的位置

前言

spark-streaming-kafka_2.11这个包的更新日期比较老了,spark-streaming-kafka最新的包是spark-streaming-kafka-0-10_2.11,所以当你看到这篇记录的时候不用按照流程操作了,因为方法过于麻烦,最新的包中有自动保存offset的功能,这篇文章只是用于记录而已。在spark-streaming-kafka-0-10_2.11中如果需要自动保存上次读取topic的位置,则只需要简单的设置"enable.auto.commit" -> (true: java.lang.Boolean),params设置如下:
kafkaParams

// Consumer configuration for the spark-streaming-kafka-0-10_2.11 integration.
// Keys and values are declared as Map[String, Object], which is why the
// boolean below is ascribed to java.lang.Boolean.
val kafkaParams = Map[String, Object](
          "bootstrap.servers" -> "hadoop-mgr:9092,hadoop-node1:9092,hadoop-node2:9092",
          "key.deserializer" -> classOf[StringDeserializer],
          "value.deserializer" -> classOf[StringDeserializer],
          "group.id" -> "spark_group",
          // NOTE(review): presumably only applies when the group has no stored
          // offset yet -- confirm against the Kafka consumer configuration docs.
          "auto.offset.reset" -> "latest",
          // The setting this article relies on to have the last read position
          // of each topic saved automatically.
          // NOTE(review): auto-commit is presumably independent of whether the
          // batch output succeeded -- confirm against the Spark integration guide.
          "enable.auto.commit" -> (true: java.lang.Boolean)
        )

内容

xml

        
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.11</artifactId>
            <version>1.5.1</version>
        </dependency>
        

main.scala

// Connection settings handed both to KafkaManager and to the stream it creates.
val kafkaParams = HashMap[String, String](
  "bootstrap.servers" -> "hadoop-mgr:9092,hadoop-node1:9092,hadoop-node2:9092",
  "group.id" -> "spark_group")

// Topics to consume; the names come from the project's TopicConstant object.
val topics = Set(
  TopicConstant.TOPIC_FACE_NAME,
  TopicConstant.TOPIC_PERSON_NAME,
  TopicConstant.TOPIC_MOTOR_VEHICLE,
  TopicConstant.TOPIC_SUB_NOTIFICATION)

// For reference only: the equivalent call with spark-streaming-kafka-0-10_2.11.
// It stays commented out because LocationStrategies and Subscribe belong to the
// 0-10 artifact rather than to spark-streaming-kafka_2.11 1.5.1, and because
// leaving it active would declare `stream` twice in the same scope.
// val stream = KafkaUtils.createDirectStream[String, String](
//   ssc,
//   LocationStrategies.PreferConsistent,
//   Subscribe[String, String](topics, kafkaParams)
// )

// KafkaManager builds the direct stream and is used again after every batch
// to write the consumed offsets back.
val manager = new KafkaManager(kafkaParams)
val stream = manager.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)

    // For every batch: print the offset range each record belongs to, then
    // hand the batch to KafkaManager so the consumed offsets are written back.
    stream.foreachRDD(
      rdd => {
        // Offset ranges of this batch; indexed below by the RDD partition id.
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        rdd.foreachPartition(partitionOfRecords => {
          // The partition id is fixed for the whole task, so the range is
          // looked up once instead of once per record.
          val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
          partitionOfRecords.foreach(data => {
//            Log.info("------>:data:" + data._2)
            println(s"----->topic:${o.topic},partition:${o.partition},fromOffset:${o.fromOffset},untilOffset:${o.untilOffset}")
          })
        })
        // Called after the partition loop above, so offsets are stored only
        // once the batch has been walked.
        manager.updateZKOffsets(rdd)
      }
    )

KafkaManager.scala

这个类需要放在org.apache.spark.streaming.kafka包下面,不然KafkaCluster初始化不成功。

package org.apache.spark.streaming.kafka

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream

import scala.collection.immutable.HashMap
import scala.reflect.ClassTag

/**
 * Creates Kafka direct streams that resume from the consumer offsets stored
 * for a group, and writes the offsets of processed batches back through
 * [[KafkaCluster]].
 *
 * date: 2020/6/24 17:50
 * author: chezi008/[email protected]
 * version: 1.0
 *
 * @param kafkaParams Kafka settings used to build the [[KafkaCluster]];
 *                    `updateZKOffsets` reads "group.id" from this map
 */
class KafkaManager(val kafkaParams: HashMap[String, String]) extends Serializable {

  private val kc = new KafkaCluster(kafkaParams)

  /**
   * Looks up the consumer group id in the given parameters.
   *
   * @throws NoSuchElementException if "group.id" is not configured
   */
  private def groupIdOf(params: HashMap[String, String]): String =
    params.getOrElse(
      "group.id",
      throw new NoSuchElementException("kafkaParams must contain \"group.id\""))

  /**
   * Creates the data stream.
   *
   * If consumer offsets can be fetched for the group, the stream starts from
   * them; otherwise it is created for the topics with "auto.offset.reset" set
   * to "largest".
   *
   * @param ssc         streaming context the stream is attached to
   * @param kafkaParams Kafka settings for the stream; must contain "group.id"
   * @param topics      topics to consume
   * @tparam K  key type
   * @tparam V  value type
   * @tparam KD decoder type for keys
   * @tparam VD decoder type for values
   * @return a stream of (key, message) pairs
   * @throws SparkException         if the partitions of `topics` cannot be fetched
   * @throws NoSuchElementException if "group.id" is not configured
   */
  def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K] : ClassTag, VD <: Decoder[V] : ClassTag](
      ssc: StreamingContext,
      kafkaParams: HashMap[String, String],
      topics: Set[String]): InputDStream[(K, V)] = {
    val groupId = groupIdOf(kafkaParams)

    val partitions = kc.getPartitions(topics) match {
      case Left(err) => throw new SparkException(s"get kafka partition failed: $err")
      case Right(found) => found
    }

    // Original comment: start consuming messages from the offsets read from ZooKeeper.
    kc.getConsumerOffsets(groupId, partitions) match {
      case Right(consumerOffsets) =>
        KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](
          ssc,
          kafkaParams,
          consumerOffsets,
          (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
      case Left(_) =>
        // No usable offsets for this group: fall back to a topic subscription.
        val p = kafkaParams + ("auto.offset.reset" -> "largest")
        // Type parameters are passed explicitly so the decoder types are not
        // left to inference, which has no argument to derive KD/VD from here.
        KafkaUtils.createDirectStream[K, V, KD, VD](ssc, p, topics)
    }
  }

  /**
   * Updates the consumed offsets.
   *
   * For each offset range of `rdd`, stores the range's `untilOffset` for the
   * group configured in the constructor's `kafkaParams`. A failed update is
   * printed and does not stop the remaining updates.
   *
   * @param rdd a batch RDD that is cast to `HasOffsetRanges`
   * @throws NoSuchElementException if "group.id" is not configured
   */
  def updateZKOffsets(rdd: RDD[(String, String)]): Unit = {
    val groupId = groupIdOf(kafkaParams)
    rdd.asInstanceOf[HasOffsetRanges].offsetRanges.foreach { range =>
      val topicAndPartition = TopicAndPartition(range.topic, range.partition)
      kc.setConsumerOffsets(groupId, HashMap(topicAndPartition -> range.untilOffset))
        .left
        .foreach(err => println(s"Error updating the offset to Kafka cluster: $err"))
    }
  }
}

你可能感兴趣的:(spark-streaming-kafka_2.11实现存储offset的位置)