序
本文主要讨论一下kafka consumer offset lag的监控
方案
- 利用官方的类库
ConsumerOffsetChecker
ConsumerGroupCommand
- 利用官方的JMX
ConsumerOffsetChecker
在0.8.2.2版本如下
kafka_2.10-0.8.2.2-sources.jar!/kafka/tools/ConsumerOffsetChecker.scala
/**
 * Command-line tool (excerpt) that collects the committed offsets of a consumer group and
 * walks every partition of the selected topics. Sections marked `//...` are elided in this
 * excerpt, so some declarations used below (e.g. `zkClient`, `group`, `topics`, `options`)
 * are not visible here.
 */
object ConsumerOffsetChecker extends Logging {
// Consumers keyed by an Int id; `printBrokerInfo` names that key `bid` and prints the consumer's host/port.
private val consumerMap: mutable.Map[Int, Option[SimpleConsumer]] = mutable.Map()
// Offset per topic-partition; filled in `main` from the offset fetch response or from ZooKeeper.
private val offsetMap: mutable.Map[TopicAndPartition, Long] = mutable.Map()
// Topic name -> partition ids; assigned in `main` from ZkUtils.getPartitionsForTopics.
private var topicPidMap: immutable.Map[String, Seq[Int]] = immutable.Map()
// Body elided in this excerpt. Presumably looks up or creates the SimpleConsumer for `bid` — confirm against the full source.
private def getConsumer(zkClient: ZkClient, bid: Int): Option[SimpleConsumer] = {
//...
}
// Body elided in this excerpt. Called once per (topic, partition id) by `processTopic`.
private def processPartition(zkClient: ZkClient,
group: String, topic: String, pid: Int) {
//...
}
/**
 * Calls `processPartition` for every partition id recorded for `topic` in `topicPidMap`,
 * in ascending partition-id order. A topic that is not in `topicPidMap` is skipped silently.
 */
private def processTopic(zkClient: ZkClient, group: String, topic: String) {
topicPidMap.get(topic) match {
case Some(pids) =>
pids.sorted.foreach {
pid => processPartition(zkClient, group, topic, pid)
}
case None => // ignore
}
}
/**
 * Prints a "BROKER INFO" heading followed by one `id -> host:port` line for every
 * `consumerMap` entry that holds a consumer; entries holding `None` are skipped.
 */
private def printBrokerInfo() {
println("BROKER INFO")
for ((bid, consumerOpt) <- consumerMap)
consumerOpt match {
case Some(consumer) =>
println("%s -> %s:%d".format(bid, consumer.host, consumer.port))
case None => // ignore
}
}
/**
 * Entry point. Option parsing and the declarations of `zkClient`, `group`, `topics`,
 * `options`, `groupDirs` etc. are elided (`//...`). The visible part:
 *  1. connects to ZooKeeper and resolves the topic list and its partitions,
 *  2. fetches the group's offsets over a channel, falling back to ZooKeeper for
 *     partitions that report no offset,
 *  3. prints a header row and processes every topic,
 *  4. closes consumers, the ZooKeeper client and the channel.
 */
def main(args: Array[String]) {
//...
try {
// The two 30000 arguments are presumably the session and connection timeouts in ms — confirm against ZkClient's constructor.
zkClient = new ZkClient(zkConnect, 30000, 30000, ZKStringSerializer)
// Topics come from the comma-separated `topics` value when present; otherwise from the
// children of the group's "/owners" path in ZooKeeper.
val topicList = topics match {
case Some(x) => x.split(",").view.toList
case None => ZkUtils.getChildren(zkClient, groupDirs.consumerGroupDir + "/owners").toList
}
topicPidMap = immutable.Map(ZkUtils.getPartitionsForTopics(zkClient, topicList).toSeq:_*)
// Flatten topic -> partitions into one TopicAndPartition per partition for the fetch request.
val topicPartitions = topicPidMap.flatMap { case(topic, partitionSeq) => partitionSeq.map(TopicAndPartition(topic, _)) }.toSeq
// NOTE(review): `channel` is introduced with `val` inside this try block, yet `finally` below also
// refers to a `channel`. That one must be a separate outer declaration (elided above); if so, this
// `val` shadows it and the outer one is never assigned here — confirm against the full source.
val channel = ClientUtils.channelToOffsetManager(group, zkClient, channelSocketTimeoutMs, channelRetryBackoffMs)
debug("Sending offset fetch request to coordinator %s:%d.".format(channel.host, channel.port))
channel.send(OffsetFetchRequest(group, topicPartitions))
val offsetFetchResponse = OffsetFetchResponse.readFrom(channel.receive().buffer)
debug("Received offset fetch response %s.".format(offsetFetchResponse))
offsetFetchResponse.requestInfo.foreach { case (topicAndPartition, offsetAndMetadata) =>
// No offset in the fetch response: fall back to the offset stored under the group's ZooKeeper path.
if (offsetAndMetadata == OffsetMetadataAndError.NoOffset) {
val topicDirs = new ZKGroupTopicDirs(group, topicAndPartition.topic)
// this group may not have migrated off zookeeper for offsets storage (we don't expose the dual-commit option in this tool
// (meaning the lag may be off until all the consumers in the group have the same setting for offsets storage)
try {
val offset = ZkUtils.readData(zkClient, topicDirs.consumerOffsetDir + "/%d".format(topicAndPartition.partition))._1.toLong
offsetMap.put(topicAndPartition, offset)
} catch {
case z: ZkNoNodeException =>
// The partition's offset node is missing. If the offset directory itself exists, record -1
// for this partition; otherwise rethrow the original exception.
if(ZkUtils.pathExists(zkClient,topicDirs.consumerOffsetDir))
offsetMap.put(topicAndPartition,-1)
else
throw z
}
}
else if (offsetAndMetadata.error == ErrorMapping.NoError)
offsetMap.put(topicAndPartition, offsetAndMetadata.offset)
else {
// Any other error code: report it and leave this partition out of offsetMap.
println("Could not fetch offset for %s due to %s.".format(topicAndPartition, ErrorMapping.exceptionFor(offsetAndMetadata.error)))
}
}
// Success path only: an exception thrown before this line skips this disconnect.
channel.disconnect()
println("%-15s %-30s %-3s %-15s %-15s %-15s %s".format("Group", "Topic", "Pid", "Offset", "logSize", "Lag", "Owner"))
topicList.sorted.foreach {
topic => processTopic(zkClient, group, topic)
}
if (options.has("broker-info"))
printBrokerInfo()
// Consumers are closed here and then again in the `finally` block below.
for ((_, consumerOpt) <- consumerMap)
consumerOpt match {
case Some(consumer) => consumer.close()
case None => // ignore
}
}
catch {
// Every Throwable is caught and reduced to a one-line message on stdout; no stack trace is printed.
case t: Throwable =>
println("Exiting due to: %s.".format(t.getMessage))
}
finally {
// Cleanup that runs on both the success and the failure path.
for (consumerOpt <- consumerMap.values) {
consumerOpt match {
case Some(consumer) => consumer.close()
case None => // ignore
}
}
if (zkClient != null)
zkClient.close()
if (channel != null)
channel.disconnect()
}
}
}
缺点是该类是为命令行调用设计的,每调用一次就会new一次zkClient,对于监控场景来说不太合适,需要改造一下,把zkClient抽取出来复用
ConsumerGroupCommand
0.8.2.2之后的版本使用ConsumerGroupCommand替代了ConsumerOffsetChecker
kafka_2.11-0.10.2.1-sources.jar!/kafka/admin/ConsumerGroupCommand.scala
/**
 * Command-line entry point (excerpt) for listing consumer groups, describing one group,
 * or deleting consumer group info. Other members of the object are elided (`//...`).
 */
object ConsumerGroupCommand extends Logging {
//...
/**
 * Parses `args`, requires exactly one of the list / describe / delete options, runs that
 * action against the selected consumer group service, and always closes the service.
 *
 * @param args raw command-line arguments
 */
def main(args: Array[String]) {
val opts = new ConsumerGroupCommandOptions(args)
if (args.length == 0)
CommandLineUtils.printUsageAndDie(opts.parser, "List all consumer groups, describe a consumer group, or delete consumer group info.")
// should have exactly one action
val actions = Seq(opts.listOpt, opts.describeOpt, opts.deleteOpt).count(opts.options.has _)
if (actions != 1)
CommandLineUtils.printUsageAndDie(opts.parser, "Command must include exactly one action: --list, --describe, --delete")
opts.checkArgs()
// Pick the service implementation from `opts.useOldConsumer`; the note about which kind of
// consumers will be shown goes to stderr.
val consumerGroupService = {
if (opts.useOldConsumer) {
System.err.println("Note: This will only show information about consumers that use ZooKeeper (not those using the Java consumer API).\n")
new ZkConsumerGroupService(opts)
} else {
System.err.println("Note: This will only show information about consumers that use the Java consumer API (non-ZooKeeper-based consumers).\n")
new KafkaConsumerGroupService(opts)
}
}
try {
if (opts.options.has(opts.listOpt))
consumerGroupService.listGroups().foreach(println(_))
else if (opts.options.has(opts.describeOpt)) {
val (state, assignments) = consumerGroupService.describeGroup()
// NOTE(review): `.head` takes the first value of the group option — assumes checkArgs() guarantees one is present; confirm.
val groupId = opts.options.valuesOf(opts.groupOpt).asScala.head
assignments match {
case None =>
// applies to both old and new consumer
printError(s"The consumer group '$groupId' does not exist.")
case Some(assignments) =>
// The old-consumer path ignores `state`; the new-consumer path branches on it.
// NOTE(review): the meaning of printAssignment's boolean argument is not visible in this excerpt.
if (opts.useOldConsumer)
printAssignment(assignments, false)
else
state match {
case Some("Dead") =>
printError(s"Consumer group '$groupId' does not exist.")
case Some("Empty") =>
System.err.println(s"Consumer group '$groupId' has no active members.")
printAssignment(assignments, true)
case Some("PreparingRebalance") | Some("AwaitingSync") =>
System.err.println(s"Warning: Consumer group '$groupId' is rebalancing.")
printAssignment(assignments, true)
case Some("Stable") =>
printAssignment(assignments, true)
case other =>
// the control should never reach here
throw new KafkaException(s"Expected a valid consumer group state, but found '${other.getOrElse("NONE")}'.")
}
}
}
else if (opts.options.has(opts.deleteOpt)) {
// Only the ZooKeeper-based service can delete. For any other service the exception thrown here
// is caught by the handler below and reported through printError.
consumerGroupService match {
case service: ZkConsumerGroupService => service.deleteGroups()
case _ => throw new IllegalStateException(s"delete is not supported for $consumerGroupService.")
}
}
} catch {
// Every Throwable raised by the action is reported via printError rather than propagated.
case e: Throwable =>
printError(s"Executing consumer group command failed due to ${e.getMessage}", Some(e))
} finally {
// Runs on both the success and the failure path.
consumerGroupService.close()
}
}
}
也是基于命令行来设计的
JMX
这个是利用Kafka本身暴露的JMX数据,就不用像ConsumerOffsetChecker那样自己额外建立连接再去获取。比如
// Example: enumerate every MBean matching the pattern below and read each of its attributes.
// `mBeanServer` is not declared in this fragment; it is assumed to be an MBeanServer (or
// MBeanServerConnection) obtained elsewhere.
// NOTE(review): the pattern selects the kafka.producer domain, while the surrounding article is
// about consumer lag — presumably a lag monitor would query the consumer domain instead; confirm
// the exact MBean names against the Kafka monitoring documentation.
ObjectName oName = new ObjectName("kafka.producer:*");
// queryNames returns Set<ObjectName>; with a raw Set the typed for-each below does not compile
// because the element type would be Object.
Set<ObjectName> metricsBeans = mBeanServer.queryNames(oName, null);
for (ObjectName mBeanName : metricsBeans) {
    MBeanInfo metricsBean = mBeanServer.getMBeanInfo(mBeanName);
    MBeanAttributeInfo[] metricsAttrs = metricsBean.getAttributes();
    for (MBeanAttributeInfo metricsAttr : metricsAttrs) {
        //get value
        Object value = mBeanServer.getAttribute(mBeanName, metricsAttr.getName());
        //process ...
    }
}
小结
可以自己改造ConsumerOffsetChecker或者ConsumerGroupCommand,然后上报到statsd或者Prometheus。当然能利用JMX是最省事的了。
doc
- kafka官方JMX+Reporters