Spark Streaming: Consuming Kafka Data and Saving Offsets to Zookeeper

Spark Streaming application:

package com.cloudera.saveoffsettoZk

import com.cloudera.utils.KafkaZkUtils
import kafka.utils.ZkUtils
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkSaveOffsetToZkApp {

  private val logger: Logger = Logger.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setIfMissing("spark.master", "local[2]").setAppName("Spark Save Offset To Zookeeper App")

    val streamingContext = new StreamingContext(conf, Seconds(30))
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.1.100:9092,192.168.1.101:9092,192.168.1.102:9092",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.GROUP_ID_CONFIG -> "group-01",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest",
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
    )

    val topic: String = "my-topic"
    val zkUrl = "192.168.1.100:2181,192.168.1.101:2181,192.168.1.102:2181"
    val sessionTimeout = 1000
    val connectionTimeout = 1000

    val zkClient = ZkUtils.createZkClient(zkUrl, sessionTimeout, connectionTimeout)

    val kafkaStream = KafkaZkUtils.createDirectStream(zkClient, streamingContext, kafkaParams, topic)

    // Process each batch of messages
    kafkaStream.foreachRDD(rdd => {
      // Get the offset ranges of the current batch's RDD
      val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      // Process the data fetched from Kafka
      val result = rdd.map(_.value()).map(_.split("\\|\\|")).map(x => (x(0), x(1), x(2)))
      result.foreach(println(_))
      println("=============== Total " + rdd.count() + " events in this batch ..")

      // Write the updated offsets back to Zookeeper
      KafkaZkUtils.saveOffsets(zkClient, topic, KafkaZkUtils.getZkPath(kafkaParams, topic), rdd)
    })
    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
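
For reference, here is a tiny standalone sketch (the object name ZkPathDemo is made up for illustration; the group id and topic are the ones configured above) showing the Zookeeper znode path that KafkaZkUtils.getZkPath resolves through ZKGroupTopicDirs:

import kafka.utils.ZKGroupTopicDirs

object ZkPathDemo {
  def main(args: Array[String]): Unit = {
    // Same group id and topic as in SparkSaveOffsetToZkApp
    val dirs = new ZKGroupTopicDirs("group-01", "my-topic")
    // Prints: /consumers/group-01/offsets/my-topic
    println(dirs.consumerOffsetDir)
  }
}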

KafkaZkUtils utility class:

package com.cloudera.utils

import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.TopicPartition
import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils}

import scala.collection.immutable.Map

object KafkaZkUtils {
  private val logger: Logger = Logger.getLogger(this.getClass)


  /**
    * Get the consumer's offset path in Zookeeper
    * @param kafkaParams
    * @param topic
    * @return
    */
  def getZkPath(kafkaParams: Map[String, Object], topic: String): String = {
    // kafkaParams is a Scala Map, so use apply() here; Map.get() would return an
    // Option and "Some(group-01)" would end up in the Zookeeper path
    val topicDirs = new ZKGroupTopicDirs(kafkaParams(ConsumerConfig.GROUP_ID_CONFIG).toString, topic)
    s"${topicDirs.consumerOffsetDir}"
  }

  /**
    * Create a direct stream, resuming from offsets stored in Zookeeper when available
    * @param zkClient
    * @param streamingContext
    * @param kafkaParams
    * @param topic
    * @return
    */
  def createDirectStream(zkClient: ZkClient, streamingContext: StreamingContext, kafkaParams: Map[String, Object], topic: String): InputDStream[ConsumerRecord[String, String]] = {

    val zkPath = getZkPath(kafkaParams, topic)

    // Read the stored offsets for this topic
    val storedOffsets = readOffsets(zkClient, topic, zkPath)

    val kafkaStream: InputDStream[ConsumerRecord[String, String]] = storedOffsets match {
      // No offsets were saved by a previous run: fall back to auto.offset.reset
      case None =>
        KafkaUtils.createDirectStream[String, String](
          streamingContext,
          PreferConsistent,
          ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParams)
        )
      case Some(fromOffsets) => {
        KafkaUtils.createDirectStream[String, String](
          streamingContext,
          PreferConsistent,
          // Assign pins the consumer to the listed partitions and cannot pick up partition changes dynamically
//          ConsumerStrategies.Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets)
          ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParams, fromOffsets)
        )
      }
    }
    kafkaStream
  }

  /**
    * Save offsets to Zookeeper
    * @param zkClient
    * @param topic
    * @param zkPath
    * @param rdd
    */
  def saveOffsets(zkClient: ZkClient, topic: String, zkPath: String, rdd: RDD[_]): Unit = {

    logger.info("Saving offsets to zookeeper")

    val offsetsRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

    offsetsRanges.foreach(offsetRange => logger.debug(s"Using ${offsetRange}"))

    val offsetsRangesStr = offsetsRanges.map(offsetRange => s"${offsetRange.partition}:${offsetRange.untilOffset}").mkString(",")

    logger.info(s"Writing offsets to Zookeeper: ${offsetsRangesStr}")

    ZkUtils(zkClient, false).updatePersistentPath(zkPath, offsetsRangesStr)
  }

  /**
    * Read offsets from Zookeeper
    * @param zkClient
    * @param topic
    * @param zkPath
    * @return
    */
  def readOffsets(zkClient: ZkClient, topic: String, zkPath: String): Option[Map[TopicPartition, Long]] = {
    logger.info("Reading offsets from zookeeper")

    val (offsetsRangesStrOpt, _) = ZkUtils(zkClient, false).readDataMaybeNull(zkPath)
    offsetsRangesStrOpt match {
      case Some(offsetsRangesStr) => {
        logger.debug(s"Read offset ranges: ${
          offsetsRangesStr
        }")
        val offsets: Map[TopicPartition, Long] = offsetsRangesStr.split(",").map(s => s.split(":"))
          .map({
            case Array(partitionStr, offsetStr) =>
              (new TopicPartition(topic, partitionStr.toInt) -> offsetStr.toLong)
          }).toMap
        Some(offsets)
      }
      case None =>
        logger.info("No offsets found in Zookeeper")
        None
    }
  }
}
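
The offsets are stored in the znode as a single comma-separated string of partition:untilOffset pairs (for example 0:764,1:765,2:765, as seen in the sample run below). Here is a minimal, Spark-free sketch of how readOffsets turns that string back into a Map[TopicPartition, Long] (the object name OffsetFormatDemo is hypothetical):

import org.apache.kafka.common.TopicPartition

object OffsetFormatDemo {
  def main(args: Array[String]): Unit = {
    // The exact string format written by saveOffsets above
    val stored = "0:764,1:765,2:765"
    val offsets: Map[TopicPartition, Long] = stored
      .split(",")
      .map(_.split(":"))
      .map { case Array(partition, offset) => new TopicPartition("my-topic", partition.toInt) -> offset.toLong }
      .toMap
    // Prints something like: Map(my-topic-0 -> 764, my-topic-1 -> 765, my-topic-2 -> 765)
    println(offsets)
  }
}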

Kafka producer (test data generator):

package com.cloudera.saveoffsettoZk

import java.text.SimpleDateFormat
import java.util.concurrent.Future
import java.util.{Date, Properties, UUID}

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord, RecordMetadata}
import org.apache.kafka.common.serialization.StringSerializer

import scala.util.Random

object KafkaProducerDemo {

  def main(args: Array[String]): Unit = {

    // Kafka producer configuration
    val props = new Properties
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.1.100:9092,192.168.1.101:9092,192.168.1.102:9092")
    props.put(ProducerConfig.ACKS_CONFIG, "all")
    props.put(ProducerConfig.RETRIES_CONFIG, "0")
    props.put(ProducerConfig.BATCH_SIZE_CONFIG, "16384") // batch size in bytes
    props.put(ProducerConfig.LINGER_MS_CONFIG, "1") // fewer requests, at the cost of up to 1 ms extra latency
    props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, "33554432") // memory used to buffer records awaiting send
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)

    val producer = new KafkaProducer[String, String](props)

    val TOPIC_NAME = "my-topic"

    try {
      // Generate and send messages
      while (true) {
        val runtime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date().getTime)

        // 2019-09-30 08:46:10||24016FFD664A4C32AE3A4276416329B0||6
        val message: String = s"${runtime}||${UUID.randomUUID().toString.replace("-", "").toUpperCase}||${Random.nextInt()}"

        val record: ProducerRecord[String, String] = new ProducerRecord[String, String](TOPIC_NAME, message)
        // Send the message and get back a Future
        val metadataFuture: Future[RecordMetadata] = producer.send(record)
        // Block on the Future to get the record metadata (synchronous send)
        val recordMetadata: RecordMetadata = metadataFuture.get()

        Thread.sleep(500)
        println(message)
      }
    } catch {
      case e: Exception => {
        e.printStackTrace()
      }
    }
    // Close the producer
    producer.flush()
    producer.close()
  }
}
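
Each message is three fields joined by "||", which is why the streaming job splits on \|\| and prints three-element tuples. A tiny sketch of that parsing step in isolation (MessageParseDemo is just an illustrative name):

object MessageParseDemo {
  def main(args: Array[String]): Unit = {
    val message = "2019-09-30 08:46:10||24016FFD664A4C32AE3A4276416329B0||6"
    // "||" must be escaped because String.split takes a regular expression
    val Array(time, uuid, value) = message.split("\\|\\|")
    // Prints: (2019-09-30 08:46:10,24016FFD664A4C32AE3A4276416329B0,6)
    println((time, uuid, value))
  }
}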

pom.xml

    
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
        <scala.version>2.11</scala.version>
        <kafka.version>0.10.0.1</kafka.version>
        <spark.version>2.2.0</spark.version>
        <zkclient.version>0.11</zkclient.version>
        <redis.version>3.0.0</redis.version>
        <provided.scope>compile</provided.scope>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}.12</version>
            <scope>${provided.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${provided.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${provided.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${provided.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${provided.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-yarn_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${provided.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${provided.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
            <version>${spark.version}</version>
            <scope>${provided.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_${scala.version}</artifactId>
            <version>${kafka.version}</version>
            <scope>${provided.scope}</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>${kafka.version}</version>
            <scope>${provided.scope}</scope>
        </dependency>
        <dependency>
            <groupId>com.101tec</groupId>
            <artifactId>zkclient</artifactId>
            <version>${zkclient.version}</version>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>${redis.version}</version>
        </dependency>
    </dependencies>

Sample output:

...
(2019-09-30 10:09:05,0F9EE952FEC548D2B2E91E4EDCF8DDC4,99555169)
(2019-09-30 10:08:59,2837DA6C8B3A4F29973B2FC210D20D54,-591678628)
(2019-09-30 10:09:12,A19DE366F9F640CFB239BF3197C5E10B,513230648)
(2019-09-30 10:09:12,E8EAF40E636E461C86C27161B794F60A,691784811)
(2019-09-30 10:09:14,CC09A901AEA0478F8C60346150FF87FA,-1644884384)
(2019-09-30 10:09:13,1317E46EE6154D2F9EF69B8382246E93,135563951)
(2019-09-30 10:09:15,121156B463AC4A22BF5CF07E22FA9CE0,-414732657)
(2019-09-30 10:09:15,DAECF018F6384933A40364A906BC99D0,-145448624)
(2019-09-30 10:09:17,46EDA56DE868495BAC44613EBFFB0BDF,1356997081)
(2019-09-30 10:09:06,5191550588544F229C33839655D8ECB8,-1222599956)
(2019-09-30 10:09:13,65833427C68A44B79FC27B14652A916A,1140020871)
(2019-09-30 10:09:14,655D6FB67C5F43AFB3EABB842818742F,1760913134)
(2019-09-30 10:09:16,752F6AFA9EA54EF48E0D2124DFBC3BC7,1518003893)
19/09/30 10:09:30 INFO Executor: Finished task 2.0 in stage 0.0 (TID 2). 622 bytes result sent to driver
19/09/30 10:09:30 INFO TaskSetManager: Finished task 2.0 in stage 0.0 (TID 2) in 19 ms on localhost (executor driver) (3/3)
19/09/30 10:09:30 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
19/09/30 10:09:30 INFO DAGScheduler: ResultStage 0 (foreach at Consumer.scala:46) finished in 0.128 s
19/09/30 10:09:30 INFO DAGScheduler: Job 0 finished: foreach at Consumer.scala:46, took 0.284038 s
=============== Total 13 events in this    batch ..
19/09/30 10:09:30 INFO KafkaZkUtils$: Saving offsets to zookeeper
19/09/30 10:09:30 INFO KafkaZkUtils$: Writing offsets to Zookeeper: 0:764,1:765,2:765
19/09/30 10:09:30 INFO JobScheduler: Finished job streaming job 1569809370000 ms.0 from job set of time 1569809370000 ms
19/09/30 10:09:30 INFO JobScheduler: Total delay: 0.465 s for time 1569809370000 ms (execution: 0.335 s)
......
(2019-09-30 10:09:17,D9BD82C2A7CD45D6BA6193DF9B44F2EC,1363055585)
19/09/30 10:10:00 INFO Executor: Finished task 0.0 in stage 1.0 (TID 3). 622 bytes result sent to driver
19/09/30 10:10:00 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 4) in 13 ms on localhost (executor driver) (1/3)
19/09/30 10:10:00 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 3) in 14 ms on localhost (executor driver) (2/3)
19/09/30 10:10:00 INFO KafkaRDD: Beginning offset 765 is the same as ending offset skipping my-topic 2
19/09/30 10:10:00 INFO Executor: Finished task 2.0 in stage 1.0 (TID 5). 622 bytes result sent to driver
19/09/30 10:10:00 INFO TaskSetManager: Finished task 2.0 in stage 1.0 (TID 5) in 9 ms on localhost (executor driver) (3/3)
19/09/30 10:10:00 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool 
19/09/30 10:10:00 INFO DAGScheduler: ResultStage 1 (foreach at Consumer.scala:46) finished in 0.018 s
19/09/30 10:10:00 INFO DAGScheduler: Job 1 finished: foreach at Consumer.scala:46, took 0.026031 s
=============== Total 1 events in this    batch ..
19/09/30 10:10:00 INFO KafkaZkUtils$: Saving offsets to zookeeper
19/09/30 10:10:00 INFO KafkaZkUtils$: Writing offsets to Zookeeper: 0:765,1:765,2:765
19/09/30 10:10:00 INFO JobScheduler: Finished job streaming job 1569809400000 ms.0 from job set of time 1569809400000 ms
19/09/30 10:10:00 INFO JobScheduler: Total delay: 0.051 s for time 1569809400000 ms (execution: 0.048 s)
...
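
To double-check that the offsets reported in the log (for example "Writing offsets to Zookeeper: 0:765,1:765,2:765") actually landed in Zookeeper, a minimal standalone reader can be built with the same zkclient/ZkUtils APIs used above (the object name InspectOffsets and the hard-coded path are assumptions based on the configuration in this post):

import kafka.utils.ZkUtils

object InspectOffsets {
  def main(args: Array[String]): Unit = {
    val zkClient = ZkUtils.createZkClient(
      "192.168.1.100:2181,192.168.1.101:2181,192.168.1.102:2181", 1000, 1000)
    // Path produced by ZKGroupTopicDirs("group-01", "my-topic").consumerOffsetDir
    val path = "/consumers/group-01/offsets/my-topic"
    // readDataMaybeNull returns (Option[String], Stat)
    val (data, _) = ZkUtils(zkClient, false).readDataMaybeNull(path)
    // Prints e.g. Some(0:765,1:765,2:765) once the streaming job has committed a batch
    println(data)
    zkClient.close()
  }
}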
