First, here is the full program; then we will step through the important pieces in it:
import java.text.SimpleDateFormat
import java.util
import java.util.{Date, TimeZone}
import com.alibaba.fastjson.JSON
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.elasticsearch.spark._
import org.slf4j.LoggerFactory
case class KafkaAlarmInfo(ip: String, AlarmMessage: String, AlarmStartTime: String, AlarmEndTime: String, AlarmDuration: Long)
class KafkaAlarm {
}
object KafkaAlarm {
def main(args: Array[String]): Unit = {
val log = LoggerFactory.getLogger("KafkaAlarm")
val Array(second,brokers,topics,groupid,indexType,log_level) = args
val sparkconf = new SparkConf().setAppName("KafkaAlarm").setMaster("local[2]")
val ssc = new StreamingContext(sparkconf, Seconds(second.toLong))
ssc.sparkContext.setLogLevel(log_level)
val topicSet = topics.split(",").toSet
val kafkaParams = Map[String, Object]("bootstrap.servers" -> brokers,
"group.id" -> groupid,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean))
val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
dateFormat.setTimeZone(TimeZone.getTimeZone("Asia/Shanghai"))
val dfMin: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
dfMin.setTimeZone(TimeZone.getTimeZone("Asia/Shanghai"))
//first query the existing data from ES
val sc = ssc.sparkContext
val esRdd = sc.esRDD("spj/EQStatusTransForm").values
esRdd.collect().foreach(println)
val rddMap = esRdd.map(value => (value("AlarmMessage").toString, value("type").toString))
rddMap.collect().foreach(println)
val rddAlarmMessage = rddMap.collect().toMap
log.debug(rddAlarmMessage.toString)
// holds the alarm / wheel-change information
var alarmOrWheelInfo = new util.HashMap[String, String]()
// broadcast the alarm / wheel-change information
var alarmOrWheelValue = sc.broadcast(alarmOrWheelInfo)
val dStream = KafkaUtils.createDirectStream(ssc, PreferConsistent, Subscribe[String, String](topicSet, kafkaParams))
val messageDStream = dStream.map(
message => {
try {
val json = JSON.parseObject(message.value())
log.debug("获取的Kafka json数据:" + json)
//具体业务逻辑代码实现,就不贴出来了
}
)
// update the broadcast variable
alarmOrWheelValue.unpersist()
alarmOrWheelValue = sc.broadcast(alarmOrWheelInfo)
dStream.foreachRDD { rdd =>
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
// some time later, after outputs have completed
dStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
messageDStream.print()
messageDStream.foreachRDD(
rdd => try {
val toESRdd = rdd.map(value => {
value._1 + "&" + value._2
}).filter(_.contains("alarm"))
toESRdd.map(value => {
val values = value.split("&")
val ip = values(0)
val AlarmMessage = values(2)
var AlarmStartTime = values(3)
var AlarmEndTime = values(4)
val begin: Date = dateFormat.parse(AlarmStartTime)
var AlarmDuration: Long = 0
if (!"0".equals(AlarmEndTime)) {
val end: Date = dateFormat.parse(AlarmEndTime)
AlarmDuration = (end.getTime - begin.getTime) / 1000 // duration in seconds
AlarmEndTime = dfMin.format(end)
} else {
AlarmEndTime = ""
}
AlarmStartTime = dfMin.format(begin)
KafkaAlarmInfo(ip, AlarmMessage, AlarmStartTime, AlarmEndTime, AlarmDuration)
}).saveToEs(indexType, Map("es.mapping.id" -> "AlarmStartTime"))
} catch {
case e: Exception => {
println(rdd.count())
println("检查elasticsearch是否正常!")
}
}
)
ssc.start()
ssc.awaitTermination()
}
}
val sparkconf = new SparkConf().setAppName("KafkaAlarm").setMaster("local[2]")
This line creates the Spark configuration. local[2] means run in local mode with two cores; if you specify only one core a warning is printed, and the reason will come up shortly.
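As an aside, here is a minimal sketch of my own (not part of the original program) for keeping the master out of the code so the same jar can run locally or on a cluster; setIfMissing only fills in local[2] when nothing was supplied via spark-submit --master:
import org.apache.spark.SparkConf

object ConfSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("KafkaAlarm")
      .setIfMissing("spark.master", "local[2]") // used only when --master was not supplied
    println(conf.get("spark.master"))
  }
}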
val ssc = new StreamingContext(sparkconf, Seconds(second.toLong))
This sets the batch interval and passes in the Spark configuration. Let's trace into the StreamingContext source:
class StreamingContext private[streaming] (
_sc: SparkContext,
_cp: Checkpoint,
_batchDur: Duration
) extends Logging
First, note that the primary constructor is private[streaming], i.e. only accessible from the streaming package, and that it takes three parameters (SparkContext, Checkpoint, Duration). The auxiliary constructor we call is:
def this(conf: SparkConf, batchDuration: Duration) = {
this(StreamingContext.createNewSparkContext(conf), null, batchDuration)
}
This class initializes much of the state Spark Streaming needs to run, including the check behind the warning mentioned above about why local mode needs more than one core:
if (sc.conf.get("spark.master") == "local" || sc.conf.get("spark.master") == "local[1]") {
logWarning("spark.master should be set as local[n], n > 1 in local mode if you have receivers" +
" to get data, otherwise Spark jobs will not get resources to process the received data.")
}
In local mode, if you have a receiver ingesting data and only one core is configured, Spark jobs will not get resources to process the received data: receiving ties up one core, and processing needs at least one more.
Two important objects are also created here: DStreamGraph and JobScheduler. DStreamGraph holds the RDD template information from which each batch is built, and JobScheduler runs the batches.
private[streaming] val graph: DStreamGraph = {
if (isCheckpointPresent) {
_cp.graph.setContext(this)
_cp.graph.restoreCheckpointData()
_cp.graph
} else {
require(_batchDur != null, "Batch duration for StreamingContext cannot be null")
val newGraph = new DStreamGraph()
newGraph.setBatchDuration(_batchDur)
newGraph
}
}
private[streaming] val scheduler = new JobScheduler(this)
The communication and monitoring machinery is also set up; as in most distributed systems, the components coordinate by passing messages. A progress listener is created here:
private[streaming] val progressListener = new StreamingJobProgressListener(this)
The checkpoint settings are also established at this time; checkpointing is the mechanism that keeps a long-running job recoverable:
private[streaming] val checkpointDuration: Duration = {
if (isCheckpointPresent) _cp.checkpointDuration else graph.batchDuration
}
If you do not configure a separate checkpoint interval, checkpoints happen at the same cadence as the batch duration, as the snippet above shows.
The rough idea behind how Spark Streaming re-executes the same work every batch: the RDD lineage that needs to be repeated is captured as a static RDD DAG template and stored in the graph. Let's look at the concrete DStream that createDirectStream gives us:
private[spark] class DirectKafkaInputDStream[K, V](
_ssc: StreamingContext,
locationStrategy: LocationStrategy,
consumerStrategy: ConsumerStrategy[K, V],
ppc: PerPartitionConfig
) extends InputDStream[ConsumerRecord[K, V]](_ssc) with Logging with CanCommitOffsets {
We can see that this class extends InputDStream and initializes it in its constructor. Let's look at the InputDStream source:
abstract class InputDStream[T: ClassTag](_ssc: StreamingContext)
extends DStream[T](_ssc) {
private[streaming] var lastValidTime: Time = null
ssc.graph.addInputStream(this)
Notice that constructing an InputDStream also constructs the underlying DStream, and that the stream registers itself in the graph via ssc.graph.addInputStream(this). At this point the RDD DAG template that JobScheduler needs is ready.
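To make that registration side effect visible, here is a stripped-down custom InputDStream of my own (illustration only, not Spark code); merely constructing it runs ssc.graph.addInputStream(this) through the InputDStream constructor, exactly like DirectKafkaInputDStream above:
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Time}
import org.apache.spark.streaming.dstream.InputDStream
import scala.reflect.ClassTag

// Emits the same RDD on every batch; constructing it is enough to register it in the graph.
class RepeatRDDInputDStream[T: ClassTag](_ssc: StreamingContext, rdd: RDD[T])
  extends InputDStream[T](_ssc) {
  override def start(): Unit = {}   // nothing to set up
  override def stop(): Unit = {}    // nothing to tear down
  override def compute(validTime: Time): Option[RDD[T]] = Some(rdd)
}
Spark itself ships a ConstantInputDStream that does essentially the same thing.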
Next, with the business logic in place, our code calls ssc.start(). Let's trace the start source:
def start(): Unit = synchronized {
state match {
case INITIALIZED =>
startSite.set(DStream.getCreationSite())
StreamingContext.ACTIVATION_LOCK.synchronized {
StreamingContext.assertNoOtherContextIsActive()
try {
validate()
// Start the streaming scheduler in a new thread, so that thread local properties
// like call sites and job groups can be reset without affecting those of the
// current thread.
ThreadUtils.runInNewThread("streaming-start") {
sparkContext.setCallSite(startSite.get)
sparkContext.clearJobGroup()
sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
savedProperties.set(SerializationUtils.clone(sparkContext.localProperties.get()))
scheduler.start()
}
state = StreamingContextState.ACTIVE
scheduler.listenerBus.post(
StreamingListenerStreamingStarted(System.currentTimeMillis()))
} catch {
case NonFatal(e) =>
logError("Error starting the context, marking it as stopped", e)
scheduler.stop(false)
state = StreamingContextState.STOPPED
throw e
}
StreamingContext.setActiveContext(this)
}
logDebug("Adding shutdown hook") // force eager creation of logger
shutdownHookRef = ShutdownHookManager.addShutdownHook(
StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
// Registering Streaming Metrics at the start of the StreamingContext
assert(env.metricsSystem != null)
env.metricsSystem.registerSource(streamingSource)
uiTab.foreach(_.attach())
logInfo("StreamingContext started")
case ACTIVE =>
logWarning("StreamingContext has already been started")
case STOPPED =>
throw new IllegalStateException("StreamingContext has already been stopped")
}
}
When the StreamingContext was created, its state was set to:
private var state: StreamingContextState = INITIALIZED
The most important line here is:
scheduler.start()
Tracing into its source:
def start(): Unit = synchronized {
if (eventLoop != null) return // scheduler has already been started
logDebug("Starting JobScheduler")
eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)
override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
}
eventLoop.start()
// attach rate controllers of input streams to receive batch completion updates
for {
inputDStream <- ssc.graph.getInputStreams
rateController <- inputDStream.rateController
} ssc.addStreamingListener(rateController)
listenerBus.start()
receiverTracker = new ReceiverTracker(ssc)
inputInfoTracker = new InputInfoTracker(ssc)
val executorAllocClient: ExecutorAllocationClient = ssc.sparkContext.schedulerBackend match {
case b: ExecutorAllocationClient => b.asInstanceOf[ExecutorAllocationClient]
case _ => null
}
executorAllocationManager = ExecutorAllocationManager.createIfEnabled(
executorAllocClient,
receiverTracker,
ssc.conf,
ssc.graph.batchDuration.milliseconds,
clock)
executorAllocationManager.foreach(ssc.addStreamingListener)
receiverTracker.start()
jobGenerator.start()
executorAllocationManager.foreach(_.start())
logInfo("Started JobScheduler")
}
This starts the eventLoop, listenerBus, receiverTracker and inputInfoTracker:
eventLoop - dispatches job scheduling events (job started/completed, errors)
listenerBus - the streaming message bus (see the sketch right after this list)
receiverTracker - manages the receivers and the data blocks they produce
inputInfoTracker - tracks metadata about the input streams
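For example (a sketch of my own that reuses the ssc from the program above), user code can hook into that same listener bus before ssc.start() and get per-batch statistics:
import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted}

ssc.addStreamingListener(new StreamingListener {
  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
    val info = batchCompleted.batchInfo
    println(s"batch ${info.batchTime}: ${info.numRecords} records, " +
      s"processing delay ${info.processingDelay.getOrElse(-1L)} ms")
  }
})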
Each of these is quite involved internally; for the full picture, refer to a dedicated Spark Streaming source-code walkthrough (for example the blog post 《streaming源码详解》).
Here I only want to follow how the Kafka direct stream actually works: how it obtains offsets, how it fetches data for those offsets, and where the offsets end up after we call the asynchronous commit.
Here I mainly want to follow:
jobGenerator.start()
Straight into the code:
def start(): Unit = synchronized {
if (eventLoop != null) return // generator has already been started
// Call checkpointWriter here to initialize it before eventLoop uses it to avoid a deadlock.
// See SPARK-10125
checkpointWriter
eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") {
override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event)
override protected def onError(e: Throwable): Unit = {
jobScheduler.reportError("Error in job generator", e)
}
}
eventLoop.start()
if (ssc.isCheckpointPresent) {
restart()
} else {
startFirstTime()
}
}
On the first run, startFirstTime is executed; continuing into it:
/** Starts the generator for the first time */
private def startFirstTime() {
val startTime = new Time(timer.getStartTime())
graph.start(startTime - graph.batchDuration)
timer.start(startTime.milliseconds)
logInfo("Started JobGenerator at " + startTime)
}
Focus on graph.start:
def start(time: Time) {
this.synchronized {
require(zeroTime == null, "DStream graph computation already started")
zeroTime = time
startTime = time
outputStreams.foreach(_.initialize(zeroTime))
outputStreams.foreach(_.remember(rememberDuration))
outputStreams.foreach(_.validateAtStart())
numReceivers = inputStreams.count(_.isInstanceOf[ReceiverInputDStream[_]])
inputStreamNameAndID = inputStreams.map(is => (is.name, is.id))
inputStreams.par.foreach(_.start())
}
}
This really just performs a series of initialization steps:
initialize all outputStreams with the start time (zeroTime), which propagates to the DStreams they depend on;
if a remember duration is set, apply remember to all outputStreams and their dependencies;
validate before start, mainly that the checkpoint settings and the various Durations do not conflict;
start all inputStreams (in receiver-based streaming these are already managed by ReceiverTracker).
The outputStreams are the places in your code where an action actually happens, such as foreachRDD.
Tracing that source:
/**
* Apply a function to each RDD in this DStream. This is an output operator, so
* 'this' DStream will be registered as an output stream and therefore materialized.
* @param foreachFunc foreachRDD function
* @param displayInnerRDDOps Whether the detailed callsites and scopes of the RDDs generated
* in the `foreachFunc` to be displayed in the UI. If `false`, then
* only the scopes and callsites of `foreachRDD` will override those
* of the RDDs on the display.
*/
private def foreachRDD(
foreachFunc: (RDD[T], Time) => Unit,
displayInnerRDDOps: Boolean): Unit = {
new ForEachDStream(this,
context.sparkContext.clean(foreachFunc, false), displayInnerRDDOps).register()
}
The Scaladoc already tells us it will be registered as an output stream.
Now look at register():
/**
* Register this streaming as an output stream. This would ensure that RDDs of this
* DStream will be generated.
*/
private[streaming] def register(): DStream[T] = {
ssc.graph.addOutputStream(this)
this
}
Very simple: it just adds this stream to the graph's output streams.
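A quick sketch of my own shows the effect: with only transformations and no output operator, start() fails validation because graph.outputStreams is empty; it is the output operators (print, foreachRDD, saveToEs, ...) that end up registered there.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NoOutputSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("NoOutputSketch").setMaster("local[2]"), Seconds(5))
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.map(_.length)   // a transformation alone registers nothing in the graph
    // lines.print()      // uncomment: print() goes through foreachRDD -> register()
    ssc.start()           // throws: "No output operations registered, so nothing to execute"
    ssc.awaitTermination()
  }
}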
Next comes timer.start(startTime.milliseconds); this timer was created inside JobGenerator:
private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
longTime => eventLoop.post(GenerateJobs(new Time(longTime))), "JobGenerator")
Every batch interval the timer posts a GenerateJobs event to the JobGenerator's eventLoop, and the eventLoop then invokes its handler. Recall how that eventLoop was initialized in JobGenerator.start(), overriding onReceive:
eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") {
override protected def onReceive(event: JobGeneratorEvent): Unit = processEvent(event)
override protected def onError(e: Throwable): Unit = {
jobScheduler.reportError("Error in job generator", e)
}
}
With onReceive overridden, now look at the EventLoop.post method:
/**
* Put the event into the event queue. The event thread will process it later.
*/
def post(event: E): Unit = {
eventQueue.put(event)
}
And in the EventLoop source itself:
private[spark] abstract class EventLoop[E](name: String) extends Logging {
private val eventQueue: BlockingQueue[E] = new LinkedBlockingDeque[E]()
private val stopped = new AtomicBoolean(false)
private val eventThread = new Thread(name) {
setDaemon(true)
override def run(): Unit = {
try {
while (!stopped.get) {
val event = eventQueue.take()
try {
onReceive(event)
} catch {
case NonFatal(e) =>
try {
onError(e)
} catch {
case NonFatal(e) => logError("Unexpected error in " + name, e)
}
}
}
} catch {
case ie: InterruptedException => // exit even if eventQueue is not empty
case NonFatal(e) => logError("Unexpected error in " + name, e)
}
}
}
def start(): Unit = {
if (stopped.get) {
throw new IllegalStateException(name + " has already been stopped")
}
// Call onStart before starting the event thread to make sure it happens before onReceive
onStart()
eventThread.start()
}
So start() really just launches the event thread, a daemon that keeps running until the loop is stopped; eventQueue is a blocking queue, so the thread waits until an event is available and then handles it.
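Here is a stripped-down sketch of the same pattern in plain Scala (my own code, not Spark's EventLoop), just the blocking queue plus a daemon dispatcher thread:
import java.util.concurrent.LinkedBlockingDeque
import java.util.concurrent.atomic.AtomicBoolean

class MiniEventLoop[E](name: String)(handle: E => Unit) {
  private val queue = new LinkedBlockingDeque[E]()
  private val stopped = new AtomicBoolean(false)
  private val thread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit =
      try {
        while (!stopped.get) handle(queue.take())   // take() blocks until an event arrives
      } catch { case _: InterruptedException => }   // stop() interrupts a blocked take()
  }
  def start(): Unit = thread.start()
  def post(event: E): Unit = queue.put(event)
  def stop(): Unit = { stopped.set(true); thread.interrupt() }
}

object MiniEventLoopDemo {
  def main(args: Array[String]): Unit = {
    val loop = new MiniEventLoop[String]("demo")(e => println(s"got event: $e"))
    loop.start()
    loop.post("GenerateJobs(t1)")
    loop.post("DoCheckpoint(t1)")
    Thread.sleep(200)   // give the daemon thread time to drain the queue
    loop.stop()
  }
}
Back in JobGenerator, onReceive delegates to processEvent: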
/** Processes all events */
private def processEvent(event: JobGeneratorEvent) {
logDebug("Got event " + event)
event match {
case GenerateJobs(time) => generateJobs(time)
case ClearMetadata(time) => clearMetadata(time)
case DoCheckpoint(time, clearCheckpointDataLater) =>
doCheckpoint(time, clearCheckpointDataLater)
case ClearCheckpointData(time) => clearCheckpointData(time)
}
}
The event we posted earlier hits case GenerateJobs(time) => generateJobs(time); continuing into generateJobs:
/** Generate jobs and perform checkpointing for the given `time`. */
private def generateJobs(time: Time) {
// Checkpoint all RDDs marked for checkpointing to ensure their lineages are
// truncated periodically. Otherwise, we may run into stack overflows (SPARK-6847).
ssc.sparkContext.setLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS, "true")
Try {
jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
graph.generateJobs(time) // generate jobs using allocated block
} match {
case Success(jobs) =>
val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
case Failure(e) =>
jobScheduler.reportError("Error generating jobs for time " + time, e)
PythonDStream.stopStreamingContextIfPythonProcessIsDead(e)
}
eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}
Focus on graph.generateJobs(time), which generates the jobs using the allocated blocks:
def generateJobs(time: Time): Seq[Job] = {
logDebug("Generating jobs for time " + time)
val jobs = this.synchronized {
outputStreams.flatMap { outputStream =>
val jobOption = outputStream.generateJob(time)
jobOption.foreach(_.setCallSite(outputStream.creationSite))
jobOption
}
}
logDebug("Generated " + jobs.length + " jobs for time " + time)
jobs
}
Continue into outputStream.generateJob(time). The output stream in our case is the ForEachDStream registered by foreachRDD (it overrides generateJob to run the user function); the default implementation in DStream looks like this:
/**
* Generate a SparkStreaming job for the given time. This is an internal method that
* should not be called directly. This default implementation creates a job
* that materializes the corresponding RDD. Subclasses of DStream may override this
* to generate their own jobs.
*/
private[streaming] def generateJob(time: Time): Option[Job] = {
getOrCompute(time) match {
case Some(rdd) =>
val jobFunc = () => {
val emptyFunc = { (iterator: Iterator[T]) => {} }
context.sparkContext.runJob(rdd, emptyFunc)
}
Some(new Job(time, jobFunc))
case None => None
}
}
Note that generateJob goes through getOrCompute, and what it retrieves is an RDD. This is the key insight: a DStream works by materializing one RDD per batch for computation, so a DStream is really just a template for RDDs.
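A small sketch of my own (socket source and logic are placeholders) makes the template idea concrete: transform() hands you the RDD generated for the current batch, and the same function is re-applied to a fresh RDD every interval:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TransformSketch {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("TransformSketch").setMaster("local[2]"), Seconds(5))
    val lines = ssc.socketTextStream("localhost", 9999)
    val cleaned = lines.transform { rdd =>   // rdd is the RDD materialized for this batch's time
      rdd.filter(_.nonEmpty).map(_.trim)
    }
    cleaned.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
With that in mind, here is DStream.getOrCompute: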
private[streaming] final def getOrCompute(time: Time): Option[RDD[T]] = {
// If RDD was already generated, then retrieve it from HashMap,
// or else compute the RDD
generatedRDDs.get(time).orElse {
// Compute the RDD if time is valid (e.g. correct time in a sliding window)
// of RDD generation, else generate nothing.
if (isTimeValid(time)) {
val rddOption = createRDDWithLocalProperties(time, displayInnerRDDOps = false) {
// Disable checks for existing output directories in jobs launched by the streaming
// scheduler, since we may need to write output to an existing directory during checkpoint
// recovery; see SPARK-4835 for more details. We need to have this call here because
// compute() might cause Spark jobs to be launched.
SparkHadoopWriterUtils.disableOutputSpecValidation.withValue(true) {
compute(time)
}
}
rddOption.foreach { case newRDD =>
// Register the generated RDD for caching and checkpointing
if (storageLevel != StorageLevel.NONE) {
newRDD.persist(storageLevel)
logDebug(s"Persisting RDD ${newRDD.id} for time $time to $storageLevel")
}
if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) {
newRDD.checkpoint()
logInfo(s"Marking RDD ${newRDD.id} for time $time for checkpointing")
}
generatedRDDs.put(time, newRDD)
}
rddOption
} else {
None
}
}
}
Now note compute(time): DStream itself does not implement it. In our case the DStream is a DirectKafkaInputDStream, so look at its compute:
override def compute(validTime: Time): Option[KafkaRDD[K, V]] = {
val untilOffsets = clamp(latestOffsets())
val offsetRanges = untilOffsets.map { case (tp, uo) =>
val fo = currentOffsets(tp)
OffsetRange(tp.topic, tp.partition, fo, uo)
}
val useConsumerCache = context.conf.getBoolean("spark.streaming.kafka.consumer.cache.enabled",
true)
val rdd = new KafkaRDD[K, V](context.sparkContext, executorKafkaParams, offsetRanges.toArray,
getPreferredHosts, useConsumerCache)
// Report the record number and metadata of this batch interval to InputInfoTracker.
val description = offsetRanges.filter { offsetRange =>
// Don't display empty ranges.
offsetRange.fromOffset != offsetRange.untilOffset
}.map { offsetRange =>
s"topic: ${offsetRange.topic}\tpartition: ${offsetRange.partition}\t" +
s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}"
}.mkString("\n")
// Copy offsetRanges to immutable.List to prevent from being modified by the user
val metadata = Map(
"offsets" -> offsetRanges.toList,
StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
val inputInfo = StreamInputInfo(id, rdd.count, metadata)
ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
currentOffsets = untilOffsets
commitAll()
Some(rdd)
}
It finally returns the KafkaRDD for this batch. commitAll() does nothing on the first pass; in our example the offsets are committed asynchronously from foreachRDD. Here is commitAll():
protected def commitAll(): Unit = {
val m = new ju.HashMap[TopicPartition, OffsetAndMetadata]()
var osr = commitQueue.poll()
while (null != osr) {
val tp = osr.topicPartition
val x = m.get(tp)
val offset = if (null == x) { osr.untilOffset } else { Math.max(x.offset, osr.untilOffset) }
m.put(tp, new OffsetAndMetadata(offset))
osr = commitQueue.poll()
}
if (!m.isEmpty) {
consumer.commitAsync(m, commitCallback.get)
}
}
Notice that commitQueue is empty the first time around; it only gets entries once commitAsync has been called:
/**
* Queue up offset ranges for commit to Kafka at a future time. Threadsafe.
* @param offsetRanges The maximum untilOffset for a given partition will be used at commit.
* @param callback Only the most recently provided callback will be used at commit.
*/
def commitAsync(offsetRanges: Array[OffsetRange], callback: OffsetCommitCallback): Unit = {
commitCallback.set(callback)
commitQueue.addAll(ju.Arrays.asList(offsetRanges: _*))
}
Many posts discuss the offset question. From the source we can see that the committed offsets are taken from the processed messages and handed back to Kafka via the consumer's commitAsync, so they should be maintained by Kafka itself; as long as Kafka has not lost its data, those offsets remain available, which is why my code drops checkpointing. If my understanding is wrong, please point it out, thanks. Also note the kafkaParams in the example code:
val kafkaParams = Map[String, Object]("bootstrap.servers" -> brokers,
"group.id" -> groupid,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean))
The official docs say enable.auto.commit is best set to false, but reading the code shows that even if you do not set it, it is forced to false for the executor-side consumers:
private[spark] class DirectKafkaInputDStream[K, V](
_ssc: StreamingContext,
locationStrategy: LocationStrategy,
consumerStrategy: ConsumerStrategy[K, V],
ppc: PerPartitionConfig
) extends InputDStream[ConsumerRecord[K, V]](_ssc) with Logging with CanCommitOffsets {
val executorKafkaParams = {
val ekp = new ju.HashMap[String, Object](consumerStrategy.executorKafkaParams)
KafkaUtils.fixKafkaParams(ekp)
ekp
}
This executorKafkaParams value is what compute() uses; look at KafkaUtils.fixKafkaParams(ekp):
/**
* Tweak kafka params to prevent issues on executors
*/
private[kafka010] def fixKafkaParams(kafkaParams: ju.HashMap[String, Object]): Unit = {
// ENABLE_AUTO_COMMIT_CONFIG is forcibly set to false
logWarning(s"overriding ${ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG} to false for executor")
kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false: java.lang.Boolean)
// AUTO_OFFSET_RESET_CONFIG is forcibly set to none
logWarning(s"overriding ${ConsumerConfig.AUTO_OFFSET_RESET_CONFIG} to none for executor")
kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "none")
// driver and executor should be in different consumer groups
val originalGroupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG)
if (null == originalGroupId) {
logError(s"${ConsumerConfig.GROUP_ID_CONFIG} is null, you should probably set it")
}
val groupId = "spark-executor-" + originalGroupId
logWarning(s"overriding executor ${ConsumerConfig.GROUP_ID_CONFIG} to ${groupId}")
kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId)
// possible workaround for KAFKA-3135
val rbb = kafkaParams.get(ConsumerConfig.RECEIVE_BUFFER_CONFIG)
if (null == rbb || rbb.asInstanceOf[java.lang.Integer] < 65536) {
logWarning(s"overriding ${ConsumerConfig.RECEIVE_BUFFER_CONFIG} to 65536 see KAFKA-3135")
kafkaParams.put(ConsumerConfig.RECEIVE_BUFFER_CONFIG, 65536: java.lang.Integer)
}
}
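One last note related to the offset discussion above (a sketch of my own, with placeholder topic and offset values): if you prefer to control the starting position yourself, for example from offsets kept in your own store, ConsumerStrategies.Subscribe also accepts an explicit offsets map instead of relying on the group's committed offsets:
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

// Placeholder offsets: in practice they would come from your own storage (a database, ZooKeeper, ...).
val fromOffsets = Map(new TopicPartition("alarm-topic", 0) -> 12345L)
val seededStream = KafkaUtils.createDirectStream[String, String](
  ssc, PreferConsistent, Subscribe[String, String](topicSet, kafkaParams, fromOffsets))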
That concludes this read through the source. Corrections are welcome if anything is wrong, thank you.