Learning the Spark Streaming 2.2.0 Kafka Source Code (Part 1)

Let's start with a complete piece of code, then step through the important parts of it:


import java.text.SimpleDateFormat
import java.util
import java.util.{Date, TimeZone}

import com.alibaba.fastjson.JSON
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.elasticsearch.spark._
import org.slf4j.LoggerFactory

case class KafkaAlarmInfo(ip: String, AlarmMessage: String, AlarmStartTime: String, AlarmEndTime: String, AlarmDuration: Long)

class KafkaAlarm {
}

object KafkaAlarm {
  def main(args: Array[String]): Unit = {
    val log = LoggerFactory.getLogger("KafkaAlarm")
    val Array(second,brokers,topics,groupid,indexType,log_level) = args
    val sparkconf = new SparkConf().setAppName("KafkaAlarm").setMaster("local[2]")
    val ssc = new StreamingContext(sparkconf, Seconds(second.toLong))
    ssc.sparkContext.setLogLevel(log_level)
    val topicSet = topics.split(",").toSet
    val kafkaParams = Map[String, Object]("bootstrap.servers" -> brokers,
      "group.id" -> groupid,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean))

    val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
    dateFormat.setTimeZone(TimeZone.getTimeZone("Asia/Shanghai"))
    val dfMin: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
    dfMin.setTimeZone(TimeZone.getTimeZone("Asia/Shanghai"))

    // Query the data from ES once
    val sc = ssc.sparkContext
    val esRdd = sc.esRDD("spj/EQStatusTransForm").values
    esRdd.collect().foreach(println)
    val rddMap = esRdd.map(value => (value("AlarmMessage").toString, value("type").toString))
    rddMap.collect().foreach(println)
    val rddAlarmMessage = rddMap.collect().toMap
    log.debug(rddAlarmMessage.toString)

    // Used to hold alarm / wheel-change info
    var alarmOrWheelInfo = new util.HashMap[String, String]()
    // Broadcast the alarm / wheel-change info
    var alarmOrWheelValue = sc.broadcast(alarmOrWheelInfo)

    val dStream = KafkaUtils.createDirectStream(ssc, PreferConsistent, Subscribe[String, String](topicSet, kafkaParams))
    val messgeDStream = dStream.map(
      message => {
        try {
          val json = JSON.parseObject(message.value())
          log.debug("Kafka json received: " + json)
          // concrete business logic omitted here; it builds the key/value pairs consumed in foreachRDD below
        } catch {
          case e: Exception => log.error("failed to parse Kafka message", e)
        }
      }
    )
    // Modify / re-broadcast the broadcast variable
    alarmOrWheelValue.unpersist()
    alarmOrWheelValue = sc.broadcast(alarmOrWheelInfo)

    dStream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // some time later, after outputs have completed
      dStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
    messgeDStream.print()
    messgeDStream.foreachRDD(
      rdd => try {
        val toESRdd = rdd.map(value => {
          value._1 + "&" + value._2
        }).filter(_.contains("alarm"))

        toESRdd.map(value => {
          val values = value.split("&")
          val ip = values(0)
          val AlarmMessage = values(2)
          var AlarmStartTime = values(3)
          var AlarmEndTime = values(4)

          val begin: Date = dateFormat.parse(AlarmStartTime)
          var AlarmDuration: Long = 0

          if (!"0".equals(AlarmEndTime)) {
            val end: Date = dateFormat.parse(AlarmEndTime)
            AlarmDuration = (end.getTime - begin.getTime) / 1000 //3600000.00
            AlarmEndTime = dfMin.format(end)
          } else {
            AlarmEndTime = ""
          }

          AlarmStartTime = dfMin.format(begin)
          KafkaAlarmInfo(ip, AlarmMessage, AlarmStartTime, AlarmEndTime, AlarmDuration)
        }).saveToEs(indexType, Map("es.mapping.id" -> "AlarmStartTime"))
      } catch {
        case e: Exception => {
          println(rdd.count())
          println("检查elasticsearch是否正常!")
        }
      }
    )
    ssc.start()
    ssc.awaitTermination()
  }
}

val sparkconf = new SparkConf().setAppName("KafkaAlarm").setMaster("local[2]")
This line creates the Spark configuration. local[2] means run in local mode with two cores; if you specify only one core you will see a warning, for reasons explained shortly.

val ssc = new StreamingContext(sparkconf, Seconds(second.toLong))
This sets the batch interval and passes in the Spark configuration. Let's trace the StreamingContext source:

class StreamingContext private[streaming] (
    _sc: SparkContext,
    _cp: Checkpoint,
    _batchDur: Duration
  ) extends Logging 

First, note that this constructor is private[streaming], so it can only be accessed from within the streaming package, and it takes three parameters (a SparkContext, a Checkpoint and a Duration). The constructor we actually call is:

def this(conf: SparkConf, batchDuration: Duration) = {
    this(StreamingContext.createNewSparkContext(conf), null, batchDuration)
  }

This class initializes much of what Spark Streaming needs at runtime, including the check behind the question raised above: why does local mode need two cores?

if (sc.conf.get("spark.master") == "local" || sc.conf.get("spark.master") == "local[1]") {
    logWarning("spark.master should be set as local[n], n > 1 in local mode if you have receivers" +
      " to get data, otherwise Spark jobs will not get resources to process the received data.")
  }

In local mode, if you have a receiver fetching data and only one core configured, Spark jobs will not get resources to process the received data: receiving data occupies one core, and processing it needs at least one more.
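To make this concrete, here is a minimal sketch (only the master strings matter; the app name is illustrative): a master of local or local[1] triggers the warning above, while local[2] leaves one core for the receiver and one for processing.

// Sketch: which master settings trigger the "spark.master should be set as local[n], n > 1" warning.
val oneCoreConf = new SparkConf().setAppName("KafkaAlarm").setMaster("local")     // warns: the receiver starves the job
val twoCoreConf = new SparkConf().setAppName("KafkaAlarm").setMaster("local[2]")  // one core to receive, one to process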

Two important objects are also created here: DStreamGraph and JobScheduler. The DStreamGraph stores the DStream/RDD template from which each batch's RDDs are created, and the JobScheduler executes the batches.

private[streaming] val graph: DStreamGraph = {
    if (isCheckpointPresent) {
      _cp.graph.setContext(this)
      _cp.graph.restoreCheckpointData()
      _cp.graph
    } else {
      require(_batchDur != null, "Batch duration for StreamingContext cannot be null")
      val newGraph = new DStreamGraph()
      newGraph.setBatchDuration(_batchDur)
      newGraph
    }
  }

  private[streaming] val scheduler = new JobScheduler(this)

The listener/communication machinery is also established here; most distributed systems today synchronize by passing messages:

private[streaming] val progressListener = new StreamingJobProgressListener(this)

The checkpoint settings are also established at this point; checkpointing is the mechanism that keeps a long-running job recoverable. If you do not configure a separate checkpoint interval, the checkpoint duration defaults to the batch duration:

private[streaming] val checkpointDuration: Duration = {
    if (isCheckpointPresent) _cp.checkpointDuration else graph.batchDuration
  }
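For reference, a minimal sketch of enabling checkpointing in user code (the directory path and interval are illustrative; the example job at the top of this post does not checkpoint at all):

// Sketch: enable checkpointing and, optionally, override the interval for one stream.
ssc.checkpoint("hdfs:///tmp/streaming-checkpoint")  // metadata and data checkpoints go here (illustrative path)
// a stateful stream can checkpoint less often than every batch:
// someStatefulStream.checkpoint(Seconds(30))       // someStatefulStream is hypothetical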

The rough principle behind how Spark Streaming repeats the same work every batch: the RDD operations to be repeated are captured as a static RDD DAG and stored in the graph. Let's look at the concrete class behind createDirectStream:

private[spark] class DirectKafkaInputDStream[K, V](
    _ssc: StreamingContext,
    locationStrategy: LocationStrategy,
    consumerStrategy: ConsumerStrategy[K, V],
    ppc: PerPartitionConfig
  ) extends InputDStream[ConsumerRecord[K, V]](_ssc) with Logging with CanCommitOffsets {

We can see this class extends InputDStream and initializes it on construction. Look at the InputDStream source:

abstract class InputDStream[T: ClassTag](_ssc: StreamingContext)
  extends DStream[T](_ssc) {

  private[streaming] var lastValidTime: Time = null

  ssc.graph.addInputStream(this)

When the InputDStream is created, the underlying DStream is created with it, and the stream registers itself into the graph via ssc.graph.addInputStream(this). At this point the RDD DAG template the JobScheduler needs is fully prepared.

Next, after the business-logic part of our code, we call ssc.start(). Let's trace the start() source:

def start(): Unit = synchronized {
    state match {
      case INITIALIZED =>
        startSite.set(DStream.getCreationSite())
        StreamingContext.ACTIVATION_LOCK.synchronized {
          StreamingContext.assertNoOtherContextIsActive()
          try {
            validate()

            // Start the streaming scheduler in a new thread, so that thread local properties
            // like call sites and job groups can be reset without affecting those of the
            // current thread.
            ThreadUtils.runInNewThread("streaming-start") {
              sparkContext.setCallSite(startSite.get)
              sparkContext.clearJobGroup()
              sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
              savedProperties.set(SerializationUtils.clone(sparkContext.localProperties.get()))
              scheduler.start()
            }
            state = StreamingContextState.ACTIVE
            scheduler.listenerBus.post(
              StreamingListenerStreamingStarted(System.currentTimeMillis()))
          } catch {
            case NonFatal(e) =>
              logError("Error starting the context, marking it as stopped", e)
              scheduler.stop(false)
              state = StreamingContextState.STOPPED
              throw e
          }
          StreamingContext.setActiveContext(this)
        }
        logDebug("Adding shutdown hook") // force eager creation of logger
        shutdownHookRef = ShutdownHookManager.addShutdownHook(
          StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
        // Registering Streaming Metrics at the start of the StreamingContext
        assert(env.metricsSystem != null)
        env.metricsSystem.registerSource(streamingSource)
        uiTab.foreach(_.attach())
        logInfo("StreamingContext started")
      case ACTIVE =>
        logWarning("StreamingContext has already been started")
      case STOPPED =>
        throw new IllegalStateException("StreamingContext has already been stopped")
    }
  }

When the StreamingContext is created, state is initialized to:

private var state: StreamingContextState = INITIALIZED

The most important line here is:
scheduler.start()
Keep tracing into its source:

def start(): Unit = synchronized {
    if (eventLoop != null) return // scheduler has already been started

    logDebug("Starting JobScheduler")
    eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
      override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)

      override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
    }
    eventLoop.start()

    // attach rate controllers of input streams to receive batch completion updates
    for {
      inputDStream <- ssc.graph.getInputStreams
      rateController <- inputDStream.rateController
    } ssc.addStreamingListener(rateController)

    listenerBus.start()
    receiverTracker = new ReceiverTracker(ssc)
    inputInfoTracker = new InputInfoTracker(ssc)

    val executorAllocClient: ExecutorAllocationClient = ssc.sparkContext.schedulerBackend match {
      case b: ExecutorAllocationClient => b.asInstanceOf[ExecutorAllocationClient]
      case _ => null
    }

    executorAllocationManager = ExecutorAllocationManager.createIfEnabled(
      executorAllocClient,
      receiverTracker,
      ssc.conf,
      ssc.graph.batchDuration.milliseconds,
      clock)
    executorAllocationManager.foreach(ssc.addStreamingListener)
    receiverTracker.start()
    jobGenerator.start()
    executorAllocationManager.foreach(_.start())
    logInfo("Started JobScheduler")
  }

This starts the eventLoop, listenerBus, receiverTracker and inputInfoTracker:
eventLoop: dispatches the JobScheduler's events
listenerBus: the message bus for streaming listener events (a small listener sketch follows below)
receiverTracker: manages the receivers that produce the data
inputInfoTracker: tracks information about the input streams
Each of these is quite complex in its own right; for details you can refer to the blog post "streaming源码详解".
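As a small aside, the listenerBus started above is also what user code hooks into through addStreamingListener. A minimal sketch (this listener is illustrative, not part of the example job):

import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted}

// Sketch: watch batch completions through the listener bus.
ssc.addStreamingListener(new StreamingListener {
  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
    val info = batchCompleted.batchInfo
    println(s"batch ${info.batchTime}: ${info.numRecords} records, " +
      s"processing delay ${info.processingDelay.getOrElse(-1L)} ms")
  }
})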

Here I only want to look at how the Kafka streaming actually works: how offsets are obtained, how data is fetched using those offsets, and where the offsets are written after the asynchronous commit.
The main thing to follow is:
jobGenerator.start()

Straight into the code:

def start(): Unit = synchronized {
    if (eventLoop != null) return // generator has already been started

    // Call checkpointWriter here to initialize it before eventLoop uses it to avoid a deadlock.
    // See SPARK-10125
    checkpointWriter

    eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") {
      override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event)

      override protected def onError(e: Throwable): Unit = {
        jobScheduler.reportError("Error in job generator", e)
      }
    }
    eventLoop.start()

    if (ssc.isCheckpointPresent) {
      restart()
    } else {
      startFirstTime()
    }
  }

On the first run, startFirstTime is executed. Continuing with the code:

  /** Starts the generator for the first time */
  private def startFirstTime() {
    val startTime = new Time(timer.getStartTime())
    graph.start(startTime - graph.batchDuration)
    timer.start(startTime.milliseconds)
    logInfo("Started JobGenerator at " + startTime)
  }

Focus on graph.start:

def start(time: Time) {
    this.synchronized {
      require(zeroTime == null, "DStream graph computation already started")
      zeroTime = time
      startTime = time
      outputStreams.foreach(_.initialize(zeroTime))
      outputStreams.foreach(_.remember(rememberDuration))
      outputStreams.foreach(_.validateAtStart())
      numReceivers = inputStreams.count(_.isInstanceOf[ReceiverInputDStream[_]])
      inputStreamNameAndID = inputStreams.map(is => (is.name, is.id))
      inputStreams.par.foreach(_.start())
    }
  }

This really just performs a series of initialization steps:

All outputStreams are initialized with the first execution time (zeroTime), and their dependent DStreams are set up along with them.
If a remember duration is set, remember is applied to all outputStreams and their dependencies as well.
A pre-start validation runs, mainly checking that the checkpoint settings and the various Durations do not conflict.
All inputStreams are started; for receiver-based streaming, the inputStreams are already handed over to the ReceiverTracker.

The outputStreams correspond to the actual actions in our code, such as foreachRDD. Trace the source:

 /**
   * Apply a function to each RDD in this DStream. This is an output operator, so
   * 'this' DStream will be registered as an output stream and therefore materialized.
   * @param foreachFunc foreachRDD function
   * @param displayInnerRDDOps Whether the detailed callsites and scopes of the RDDs generated
   *                           in the `foreachFunc` to be displayed in the UI. If `false`, then
   *                           only the scopes and callsites of `foreachRDD` will override those
   *                           of the RDDs on the display.
   */
  private def foreachRDD(
      foreachFunc: (RDD[T], Time) => Unit,
      displayInnerRDDOps: Boolean): Unit = {
    new ForEachDStream(this,
      context.sparkContext.clean(foreachFunc, false), displayInnerRDDOps).register()
  }

The comment already tells us that it will be registered as an output stream.
Look at register():

/**
   * Register this streaming as an output stream. This would ensure that RDDs of this
   * DStream will be generated.
   */
  private[streaming] def register(): DStream[T] = {
    ssc.graph.addOutputStream(this)
    this
  }

Very simple: it just adds this DStream to the graph's output streams.
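A quick way to see why this registration matters: if a job only defines transformations and never calls an output operation (foreachRDD, print, saveToEs, ...), the graph ends up with no output streams and ssc.start() fails validation. A minimal sketch (the socket source and port are illustrative):

// Sketch: no output operation means nothing gets registered, so start-up validation fails
// with an error along the lines of "No output operations registered, so nothing to execute".
val demoConf = new SparkConf().setAppName("no-output-demo").setMaster("local[2]")
val demoSsc = new StreamingContext(demoConf, Seconds(5))
val lines = demoSsc.socketTextStream("localhost", 9999) // the InputDStream registers itself into the graph
val words = lines.flatMap(_.split(" "))                 // transformations alone register nothing
// words.print()                                        // uncommenting this would call register()
demoSsc.start()                                         // throws: no output operations registered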

Next comes timer.start(startTime.milliseconds); this timer is created in JobGenerator:

private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
    longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")

Every batch interval it posts a GenerateJobs event to the eventLoop, and the eventLoop dispatches it to the event-handling method. Recall that when the JobGenerator's eventLoop was created, onReceive was overridden:

eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") {
      override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event)

      override protected def onError(e: Throwable): Unit = {
        jobScheduler.reportError("Error in job generator", e)
      }
    }

It overrides onReceive to call processEvent. Now let's look at the eventLoop.post method:

/**
   * Put the event into the event queue. The event thread will process it later.
   */
  def post(event: E): Unit = {
    eventQueue.put(event)
  }
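Before reading EventLoop's own source, here is a minimal, self-contained sketch of the pattern described above (pure JDK/Scala, not Spark code; the 2-second interval is illustrative): a recurring timer posts events into a blocking queue, and a daemon thread drains and processes them.

import java.util.concurrent.{Executors, LinkedBlockingQueue, TimeUnit}

object TimerEventLoopSketch {
  case class GenerateJobs(timeMs: Long)

  def main(args: Array[String]): Unit = {
    val queue = new LinkedBlockingQueue[GenerateJobs]()

    // plays the role of RecurringTimer: post a GenerateJobs event every 2 seconds
    val timer = Executors.newSingleThreadScheduledExecutor()
    timer.scheduleAtFixedRate(new Runnable {
      override def run(): Unit = queue.put(GenerateJobs(System.currentTimeMillis()))
    }, 0, 2, TimeUnit.SECONDS)

    // plays the role of EventLoop: a daemon thread takes events off the queue and handles them
    val eventThread = new Thread(new Runnable {
      override def run(): Unit = while (true) {
        val event = queue.take()
        println(s"processing $event") // Spark would call generateJobs(time) here
      }
    })
    eventThread.setDaemon(true)
    eventThread.start()

    Thread.sleep(7000) // let a few events flow, then stop the timer
    timer.shutdownNow()
  }
}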

Then, in the EventLoop source itself:

private[spark] abstract class EventLoop[E](name: String) extends Logging {

  private val eventQueue: BlockingQueue[E] = new LinkedBlockingDeque[E]()

  private val stopped = new AtomicBoolean(false)

  private val eventThread = new Thread(name) {
    setDaemon(true)

    override def run(): Unit = {
      try {
        while (!stopped.get) {
          val event = eventQueue.take()
          try {
            onReceive(event)
          } catch {
            case NonFatal(e) =>
              try {
                onError(e)
              } catch {
                case NonFatal(e) => logError("Unexpected error in " + name, e)
              }
          }
        }
      } catch {
        case ie: InterruptedException => // exit even if eventQueue is not empty
        case NonFatal(e) => logError("Unexpected error in " + name, e)
      }
    }

  }

  def start(): Unit = {
    if (stopped.get) {
      throw new IllegalStateException(name + " has already been stopped")
    }
    // Call onStart before starting the event thread to make sure it happens before onReceive
    onStart()
    eventThread.start()
  }

start() actually just starts this thread, which keeps running until the loop is stopped; eventQueue is a blocking queue, and whenever an event arrives the thread takes it and handles it. Next, look at the processEvent method:

/** Processes all events */
  private def processEvent(event: JobGeneratorEvent) {
    logDebug("Got event " + event)
    event match {
      case GenerateJobs(time) => generateJobs(time)
      case ClearMetadata(time) => clearMetadata(time)
      case DoCheckpoint(time, clearCheckpointDataLater) =>
        doCheckpoint(time, clearCheckpointDataLater)
      case ClearCheckpointData(time) => clearCheckpointData(time)
    }
  }

The event posted earlier matches case GenerateJobs(time) => generateJobs(time); keep tracing:

/** Generate jobs and perform checkpointing for the given `time`.  */
  private def generateJobs(time: Time) {
    // Checkpoint all RDDs marked for checkpointing to ensure their lineages are
    // truncated periodically. Otherwise, we may run into stack overflows (SPARK-6847).
    ssc.sparkContext.setLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS, "true")
    Try {
      jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
      graph.generateJobs(time) // generate jobs using allocated block
    } match {
      case Success(jobs) =>
        val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
        jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
      case Failure(e) =>
        jobScheduler.reportError("Error generating jobs for time " + time, e)
        PythonDStream.stopStreamingContextIfPythonProcessIsDead(e)
    }
    eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
  }

Focus on graph.generateJobs(time) (generate jobs using the allocated blocks):

def generateJobs(time: Time): Seq[Job] = {
    logDebug("Generating jobs for time " + time)
    val jobs = this.synchronized {
      outputStreams.flatMap { outputStream =>
        val jobOption = outputStream.generateJob(time)
        jobOption.foreach(_.setCallSite(outputStream.creationSite))
        jobOption
      }
    }
    logDebug("Generated " + jobs.length + " jobs for time " + time)
    jobs
  }

Continue into outputStream.generateJob(time); the output stream here is actually the one created by foreachRDD. Look at its generateJob:

/**
   * Generate a SparkStreaming job for the given time. This is an internal method that
   * should not be called directly. This default implementation creates a job
   * that materializes the corresponding RDD. Subclasses of DStream may override this
   * to generate their own jobs.
   */
  private[streaming] def generateJob(time: Time): Option[Job] = {
    getOrCompute(time) match {
      case Some(rdd) =>
        val jobFunc = () => {
          val emptyFunc = { (iterator: Iterator[T]) => {} }
          context.sparkContext.runJob(rdd, emptyFunc)
        }
        Some(new Job(time, jobFunc))
      case None => None
    }
  }

Notice that it goes to the parent class's getOrCompute method, and what getOrCompute returns is an RDD. Here is the key revelation: a DStream in essence just produces one RDD after another to do the computation, so a DStream is really only a template over RDDs. Look at DStream's getOrCompute code:

private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
    // If RDD was already generated, then retrieve it from HashMap,
    // or else compute the RDD
    generatedRDDs.get(time).orElse {
      // Compute the RDD if time is valid (e.g. correct time in a sliding window)
      // of RDD generation, else generate nothing.
      if (isTimeValid(time)) {

        val rddOption = createRDDWithLocalProperties(time, displayInnerRDDOps = false) {
          // Disable checks for existing output directories in jobs launched by the streaming
          // scheduler, since we may need to write output to an existing directory during checkpoint
          // recovery; see SPARK-4835 for more details. We need to have this call here because
          // compute() might cause Spark jobs to be launched.
          SparkHadoopWriterUtils.disableOutputSpecValidation.withValue(true) {
            compute(time)
          }
        }

        rddOption.foreach { case newRDD =>
          // Register the generated RDD for caching and checkpointing
          if (storageLevel != StorageLevel.NONE) {
            newRDD.persist(storageLevel)
            logDebug(s"Persisting RDD ${newRDD.id} for time $time to $storageLevel")
          }
          if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) {
            newRDD.checkpoint()
            logInfo(s"Marking RDD ${newRDD.id} for time $time for checkpointing")
          }
          generatedRDDs.put(time, newRDD)
        }
        rddOption
      } else {
        None
      }
    }
  }

Note compute(time): DStream itself does not implement it. Here the DStream is actually a DirectKafkaInputDStream, so look at its compute method:

override def compute(validTime: Time): Option[KafkaRDD[K, V]] = {
    val untilOffsets = clamp(latestOffsets())
    val offsetRanges = untilOffsets.map { case (tp, uo) =>
      val fo = currentOffsets(tp)
      OffsetRange(tp.topic, tp.partition, fo, uo)
    }
    val useConsumerCache = context.conf.getBoolean("spark.streaming.kafka.consumer.cache.enabled",
      true)
    val rdd = new KafkaRDD[K, V](context.sparkContext, executorKafkaParams, offsetRanges.toArray,
      getPreferredHosts, useConsumerCache)

    // Report the record number and metadata of this batch interval to InputInfoTracker.
    val description = offsetRanges.filter { offsetRange =>
      // Don't display empty ranges.
      offsetRange.fromOffset != offsetRange.untilOffset
    }.map { offsetRange =>
      s"topic: ${offsetRange.topic}\tpartition: ${offsetRange.partition}\t" +
        s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}"
    }.mkString("\n")
    // Copy offsetRanges to immutable.List to prevent from being modified by the user
    val metadata = Map(
      "offsets" -> offsetRanges.toList,
      StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
    val inputInfo = StreamInputInfo(id, rdd.count, metadata)
    ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)

    currentOffsets = untilOffsets
    commitAll()
    Some(rdd)
  }
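A side note on the clamp(latestOffsets()) call at the top of compute: it bounds how far each batch is allowed to read per partition, and that bound is driven by the rate-limit settings fed in through the PerPartitionConfig (ppc) constructor parameter. A hedged configuration sketch (the values are illustrative):

// Sketch: the knobs that feed into clamp().
val ratedConf = new SparkConf()
  .setAppName("KafkaAlarm")
  .setMaster("local[2]")
  .set("spark.streaming.kafka.maxRatePerPartition", "1000") // hard cap: records per second per partition
  .set("spark.streaming.backpressure.enabled", "true")      // let the rate controller adapt the cap dynamically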

Finally it returns the KafkaRDD for this batch. commitAll() does nothing on the first batch; in our example code the offsets are committed asynchronously. Look at the commitAll() code:

protected def commitAll(): Unit = {
    val m = new ju.HashMap[TopicPartition, OffsetAndMetadata]()
    var osr = commitQueue.poll()
    while (null != osr) {
      val tp = osr.topicPartition
      val x = m.get(tp)
      val offset = if (null == x) { osr.untilOffset } else { Math.max(x.offset, osr.untilOffset) }
      m.put(tp, new OffsetAndMetadata(offset))
      osr = commitQueue.poll()
    }
    if (!m.isEmpty) {
      consumer.commitAsync(m, commitCallback.get)
    }
  }

You can see that commitQueue is empty on the first run; it only has entries after commitAsync has been called at least once:

/**
   * Queue up offset ranges for commit to Kafka at a future time.  Threadsafe.
   * @param offsetRanges The maximum untilOffset for a given partition will be used at commit.
   * @param callback Only the most recently provided callback will be used at commit.
   */
  def commitAsync(offsetRanges: Array[OffsetRange], callback: OffsetCommitCallback): Unit = {
    commitCallback.set(callback)
    commitQueue.addAll(ju.Arrays.asList(offsetRanges: _*))
  }
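Putting this together with the example at the top of the post, here is a minimal sketch of committing the consumed ranges with a callback (the callback body is illustrative; the commitAsync overload with a callback is the one shown above):

import java.{util => ju}
import org.apache.kafka.clients.consumer.{OffsetAndMetadata, OffsetCommitCallback}
import org.apache.kafka.common.TopicPartition

// Sketch: commit offsets asynchronously after the batch output has been written, and log failures.
dStream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... write the batch out (e.g. saveToEs) before committing ...
  dStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges, new OffsetCommitCallback {
    override def onComplete(offsets: ju.Map[TopicPartition, OffsetAndMetadata], e: Exception): Unit = {
      if (e != null) println("async offset commit failed: " + e)
    }
  })
}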

Many posts discuss the offset question. Reading the source shows that the offsets come from the messages themselves (the offset ranges of each batch), and once committed they are maintained by Kafka itself, so as long as Kafka has not lost its data completely, the offsets can be kept track of. That is why my code does not use checkpointing; if my understanding is wrong, please point it out, thanks. Note this part of the example code again:

val kafkaParams = Map[String, Object]("bootstrap.servers" -> brokers,
      "group.id" -> groupid,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean))

The official docs say enable.auto.commit is best set to false, but reading the code shows that even if you do not set it, it is forced to false for the executor-side consumers anyway:

private[spark] class DirectKafkaInputDStream[K, V](
    _ssc: StreamingContext,
    locationStrategy: LocationStrategy,
    consumerStrategy: ConsumerStrategy[K, V],
    ppc: PerPartitionConfig
  ) extends InputDStream[ConsumerRecord[K, V]](_ssc) with Logging with CanCommitOffsets {

  val executorKafkaParams = {
    val ekp = new ju.HashMap[String, Object](consumerStrategy.executorKafkaParams)
    KafkaUtils.fixKafkaParams(ekp)
    ekp
  }

These executorKafkaParams are used in compute (they are passed into the KafkaRDD); look at KafkaUtils.fixKafkaParams(ekp):

/**
   * Tweak kafka params to prevent issues on executors
   */
  private[kafka010] def fixKafkaParams(kafkaParams: ju.HashMap[String, Object]): Unit = {
// ENABLE_AUTO_COMMIT_CONFIG is forcibly overridden to false
    logWarning(s"overriding ${ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG} to false for executor")
    kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false: java.lang.Boolean)

// AUTO_OFFSET_RESET_CONFIG is forcibly overridden to none
    logWarning(s"overriding ${ConsumerConfig.AUTO_OFFSET_RESET_CONFIG} to none for executor")
    kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "none")

    // driver and executor should be in different consumer groups
    val originalGroupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG)
    if (null == originalGroupId) {
      logError(s"${ConsumerConfig.GROUP_ID_CONFIG} is null, you should probably set it")
    }
    val groupId = "spark-executor-" + originalGroupId
    logWarning(s"overriding executor ${ConsumerConfig.GROUP_ID_CONFIG} to ${groupId}")
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId)

    // possible workaround for KAFKA-3135
    val rbb = kafkaParams.get(ConsumerConfig.RECEIVE_BUFFER_CONFIG)
    if (null == rbb || rbb.asInstanceOf[java.lang.Integer] < 65536) {
      logWarning(s"overriding ${ConsumerConfig.RECEIVE_BUFFER_CONFIG} to 65536 see KAFKA-3135")
      kafkaParams.put(ConsumerConfig.RECEIVE_BUFFER_CONFIG, 65536: java.lang.Integer)
    }
  }

That concludes this source reading; if anything is wrong, please let me know. Thank you.
