Spark Streaming 程序:
package com.cloudera.saveoffsettoZk
import com.cloudera.utils.KafkaZkUtils
import kafka.utils.ZkUtils
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}
object SparkSaveOffsetToZkApp {

  private val logger: Logger = Logger.getLogger(this.getClass)

  /**
   * Entry point: consumes a Kafka topic with Spark Streaming, restoring the
   * starting offsets from ZooKeeper on startup and persisting the end offsets
   * back to ZooKeeper after each successfully processed batch
   * (at-least-once delivery semantics).
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setIfMissing("spark.master", "local[2]")
      .setAppName("Spark Save Offset To Zookeeper App")
    val streamingContext = new StreamingContext(conf, Seconds(30))

    // Auto-commit is disabled because offsets are managed manually in ZooKeeper.
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.1.100:9092,192.168.1.101:9092,192.168.1.102:9092",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.GROUP_ID_CONFIG -> "group-01",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest",
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
    )

    val topic: String = "my-topic"
    val zkUrl = "192.168.1.100:2181,192.168.1.101:2181,192.168.1.102:2181"
    val sessionTimeout = 1000
    val connectionTimeout = 1000

    val zkClient = ZkUtils.createZkClient(zkUrl, sessionTimeout, connectionTimeout)
    val kafkaStream = KafkaZkUtils.createDirectStream(zkClient, streamingContext, kafkaParams, topic)

    // Process each micro-batch.
    kafkaStream.foreachRDD { rdd =>
      // Offset ranges of the current batch; was previously computed into an
      // unused val — now logged so the computation has a purpose.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      offsetRanges.foreach(range => logger.debug(s"Batch offset range: $range"))

      // Parse the "timestamp||uuid||number" payload into a 3-tuple.
      val result = rdd.map(_.value()).map(_.split("\\|\\|")).map(x => (x(0), x(1), x(2)))
      result.foreach(println(_))
      println("=============== Total " + rdd.count() + " events in this batch ..")

      // Persist this batch's end offsets to ZooKeeper only AFTER the batch
      // has been processed, so a crash mid-batch replays the batch.
      KafkaZkUtils.saveOffsets(zkClient, topic, KafkaZkUtils.getZkPath(kafkaParams, topic), rdd)
    }

    streamingContext.start()
    streamingContext.awaitTermination()
    // NOTE: the original called streamingContext.stop() after awaitTermination(),
    // which is unreachable in normal operation and has been removed.
  }
}
KafkaZkUtils 工具类:
package com.cloudera.utils
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.TopicPartition
import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils}
import scala.collection.immutable.Map
object KafkaZkUtils {

  private val logger: Logger = Logger.getLogger(this.getClass)

  /**
   * Builds the ZooKeeper path under which this consumer group's offsets for
   * the given topic are stored.
   *
   * @param kafkaParams consumer configuration; must contain ConsumerConfig.GROUP_ID_CONFIG
   * @param topic       topic name
   * @return the consumer-offset directory path in ZooKeeper
   */
  def getZkPath(kafkaParams: Map[String, Object], topic: String): String = {
    // BUG FIX: Map.get returns an Option, so the original
    // `kafkaParams.get(...).toString` produced "Some(group-01)" and corrupted
    // the ZooKeeper path. apply() yields the raw value.
    val groupId = kafkaParams(ConsumerConfig.GROUP_ID_CONFIG).toString
    val topicDirs = new ZKGroupTopicDirs(groupId, topic)
    topicDirs.consumerOffsetDir
  }

  /**
   * Creates a Kafka direct stream, seeding the consumer from offsets
   * previously stored in ZooKeeper when they exist.
   *
   * @param zkClient         ZooKeeper client used to read stored offsets
   * @param streamingContext active StreamingContext
   * @param kafkaParams      Kafka consumer configuration
   * @param topic            topic to subscribe to
   * @return an InputDStream of consumer records
   */
  def createDirectStream(zkClient: ZkClient, streamingContext: StreamingContext, kafkaParams: Map[String, Object], topic: String): InputDStream[ConsumerRecord[String, String]] = {
    val zkPath = getZkPath(kafkaParams, topic)
    // Read the topic's previously-saved offsets, if any.
    val storedOffsets = readOffsets(zkClient, topic, zkPath)
    val kafkaStream: InputDStream[ConsumerRecord[String, String]] = storedOffsets match {
      // No offsets were saved on a previous run: start per auto.offset.reset.
      case None =>
        KafkaUtils.createDirectStream[String, String](
          streamingContext,
          PreferConsistent,
          ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParams)
        )
      case Some(fromOffsets) =>
        KafkaUtils.createDirectStream[String, String](
          streamingContext,
          PreferConsistent,
          // Assign would pin the consumer to fixed partitions and could not
          // react to partition changes, so Subscribe is used instead:
          // ConsumerStrategies.Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets)
          // BUG FIX: the original referenced an undefined `topics` symbol here.
          ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParams, fromOffsets)
        )
    }
    kafkaStream
  }

  /**
   * Persists the offset ranges of the given RDD to ZooKeeper as a single
   * "partition:untilOffset,..." string.
   *
   * @param zkClient ZooKeeper client
   * @param topic    topic name (for logging context)
   * @param zkPath   ZooKeeper node to write
   * @param rdd      a KafkaRDD (must implement HasOffsetRanges)
   */
  def saveOffsets(zkClient: ZkClient, topic: String, zkPath: String, rdd: RDD[_]): Unit = {
    logger.info("Saving offsets to zookeeper")
    val offsetsRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    offsetsRanges.foreach(offsetRange => logger.debug(s"Using ${offsetRange}"))
    val offsetsRangesStr = offsetsRanges.map(offsetRange => s"${offsetRange.partition}:${offsetRange.untilOffset}").mkString(",")
    logger.info(s"Writing offsets to Zookeeper: ${offsetsRangesStr}")
    ZkUtils(zkClient, false).updatePersistentPath(zkPath, offsetsRangesStr)
  }

  /**
   * Reads previously-saved offsets from ZooKeeper.
   *
   * @param zkClient ZooKeeper client
   * @param topic    topic the offsets belong to
   * @param zkPath   ZooKeeper node holding the "partition:offset,..." string
   * @return Some(partition -> offset) map, or None if nothing is stored
   */
  def readOffsets(zkClient: ZkClient, topic: String, zkPath: String): Option[Map[TopicPartition, Long]] = {
    logger.info("Reading offsets from zookeeper")
    val (offsetsRangesStrOpt, _) = ZkUtils(zkClient, false).readDataMaybeNull(zkPath)
    offsetsRangesStrOpt match {
      case Some(offsetsRangesStr) =>
        logger.debug(s"Read offset ranges: ${offsetsRangesStr}")
        // Parse "0:764,1:765,..." back into TopicPartition -> offset pairs.
        val offsets: Map[TopicPartition, Long] = offsetsRangesStr.split(",").map(s => s.split(":"))
          .map({
            case Array(partitionStr, offsetStr) =>
              new TopicPartition(topic, partitionStr.toInt) -> offsetStr.toLong
          }).toMap
        Some(offsets)
      case None =>
        logger.info("No offsets found in Zookeeper")
        None
    }
  }
}
KafkaProducer
package com.cloudera.saveoffsettoZk
import java.text.SimpleDateFormat
import java.util.concurrent.Future
import java.util.{Date, Properties, UUID}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord, RecordMetadata}
import org.apache.kafka.common.serialization.StringSerializer
import scala.util.Random
object KafkaProducerDemo {

  /**
   * Entry point: produces a "timestamp||uuid||randomInt" message to
   * `my-topic` every 500 ms until interrupted. Each send is made synchronous
   * by blocking on the returned Future.
   */
  def main(args: Array[String]): Unit = {
    // Kafka producer configuration.
    val props = new Properties
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.1.100:9092,192.168.1.101:9092,192.168.1.102:9092")
    props.put(ProducerConfig.ACKS_CONFIG, "all")
    props.put(ProducerConfig.RETRIES_CONFIG, "0")
    props.put(ProducerConfig.BATCH_SIZE_CONFIG, "16384") // max bytes per batch
    props.put(ProducerConfig.LINGER_MS_CONFIG, "1") // fewer requests at the cost of up to 1 ms extra latency
    props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, "33554432") // total memory for buffering records
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName)

    val producer = new KafkaProducer[String, String](props)
    val TOPIC_NAME = "my-topic"
    try {
      // Generate and send messages until an exception (e.g. interrupt) stops us.
      while (true) {
        val runtime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date().getTime)
        // e.g. 2019-09-30 08:46:10||24016FFD664A4C32AE3A4276416329B0||6
        val message: String = s"${runtime}||${UUID.randomUUID().toString.replace("-", "").toUpperCase}||${Random.nextInt()}"
        val record: ProducerRecord[String, String] = new ProducerRecord[String, String](TOPIC_NAME, message)
        // send() is asynchronous; blocking on the Future makes it synchronous
        // so the message is acknowledged before the next one is produced.
        val metadataFuture: Future[RecordMetadata] = producer.send(record)
        metadataFuture.get()
        Thread.sleep(500)
        println(message)
      }
    } catch {
      case e: Exception =>
        e.printStackTrace()
    } finally {
      // BUG FIX: flush/close previously sat after the try/catch, so a
      // non-Exception throwable would leak the producer. `finally`
      // guarantees cleanup on every exit path.
      producer.flush()
      producer.close()
    }
  }
}
pom.xml
<!-- NOTE(review): the XML tags of this <properties> section were stripped during
     extraction; reconstructed from the values and the ${...} references used by
     the dependencies below — verify against the original pom.xml. -->
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <java.version>1.8</java.version>
    <scala.version>2.11</scala.version>
    <kafka.version>0.10.0.1</kafka.version>
    <spark.version>2.2.0</spark.version>
    <zkclient.version>0.11</zkclient.version>
    <redis.version>3.0.0</redis.version>
    <!-- two "compile" values appeared here; presumably the scope properties -->
    <provided.scope>compile</provided.scope>
    <compile.scope>compile</compile.scope>
</properties>
<!-- NOTE(review): dependency tags were stripped during extraction; reconstructed
     mechanically from the group/artifact/version/scope quadruples. -->
<dependencies>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}.12</version>
        <scope>${provided.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>${provided.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>${provided.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>${provided.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>${provided.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-yarn_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>${provided.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql-kafka-0-10_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>${provided.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
        <version>${spark.version}</version>
        <scope>${provided.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_${scala.version}</artifactId>
        <version>${kafka.version}</version>
        <scope>${provided.scope}</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>${kafka.version}</version>
        <scope>${provided.scope}</scope>
    </dependency>
    <dependency>
        <groupId>com.101tec</groupId>
        <artifactId>zkclient</artifactId>
        <version>${zkclient.version}</version>
    </dependency>
    <dependency>
        <groupId>redis.clients</groupId>
        <artifactId>jedis</artifactId>
        <version>${redis.version}</version>
    </dependency>
</dependencies>
运行结果:
...
(2019-09-30 10:09:05,0F9EE952FEC548D2B2E91E4EDCF8DDC4,99555169)
(2019-09-30 10:08:59,2837DA6C8B3A4F29973B2FC210D20D54,-591678628)
(2019-09-30 10:09:12,A19DE366F9F640CFB239BF3197C5E10B,513230648)
(2019-09-30 10:09:12,E8EAF40E636E461C86C27161B794F60A,691784811)
(2019-09-30 10:09:14,CC09A901AEA0478F8C60346150FF87FA,-1644884384)
(2019-09-30 10:09:13,1317E46EE6154D2F9EF69B8382246E93,135563951)
(2019-09-30 10:09:15,121156B463AC4A22BF5CF07E22FA9CE0,-414732657)
(2019-09-30 10:09:15,DAECF018F6384933A40364A906BC99D0,-145448624)
(2019-09-30 10:09:17,46EDA56DE868495BAC44613EBFFB0BDF,1356997081)
(2019-09-30 10:09:06,5191550588544F229C33839655D8ECB8,-1222599956)
(2019-09-30 10:09:13,65833427C68A44B79FC27B14652A916A,1140020871)
(2019-09-30 10:09:14,655D6FB67C5F43AFB3EABB842818742F,1760913134)
(2019-09-30 10:09:16,752F6AFA9EA54EF48E0D2124DFBC3BC7,1518003893)
19/09/30 10:09:30 INFO Executor: Finished task 2.0 in stage 0.0 (TID 2). 622 bytes result sent to driver
19/09/30 10:09:30 INFO TaskSetManager: Finished task 2.0 in stage 0.0 (TID 2) in 19 ms on localhost (executor driver) (3/3)
19/09/30 10:09:30 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
19/09/30 10:09:30 INFO DAGScheduler: ResultStage 0 (foreach at Consumer.scala:46) finished in 0.128 s
19/09/30 10:09:30 INFO DAGScheduler: Job 0 finished: foreach at Consumer.scala:46, took 0.284038 s
=============== Total 13 events in this batch ..
19/09/30 10:09:30 INFO KafkaZkUtils$: Saving offsets to zookeeper
19/09/30 10:09:30 INFO KafkaZkUtils$: Writing offsets to Zookeeper: 0:764,1:765,2:765
19/09/30 10:09:30 INFO JobScheduler: Finished job streaming job 1569809370000 ms.0 from job set of time 1569809370000 ms
19/09/30 10:09:30 INFO JobScheduler: Total delay: 0.465 s for time 1569809370000 ms (execution: 0.335 s)
......
(2019-09-30 10:09:17,D9BD82C2A7CD45D6BA6193DF9B44F2EC,1363055585)
19/09/30 10:10:00 INFO Executor: Finished task 0.0 in stage 1.0 (TID 3). 622 bytes result sent to driver
19/09/30 10:10:00 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 4) in 13 ms on localhost (executor driver) (1/3)
19/09/30 10:10:00 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 3) in 14 ms on localhost (executor driver) (2/3)
19/09/30 10:10:00 INFO KafkaRDD: Beginning offset 765 is the same as ending offset skipping my-topic 2
19/09/30 10:10:00 INFO Executor: Finished task 2.0 in stage 1.0 (TID 5). 622 bytes result sent to driver
19/09/30 10:10:00 INFO TaskSetManager: Finished task 2.0 in stage 1.0 (TID 5) in 9 ms on localhost (executor driver) (3/3)
19/09/30 10:10:00 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
19/09/30 10:10:00 INFO DAGScheduler: ResultStage 1 (foreach at Consumer.scala:46) finished in 0.018 s
19/09/30 10:10:00 INFO DAGScheduler: Job 1 finished: foreach at Consumer.scala:46, took 0.026031 s
=============== Total 1 events in this batch ..
19/09/30 10:10:00 INFO KafkaZkUtils$: Saving offsets to zookeeper
19/09/30 10:10:00 INFO KafkaZkUtils$: Writing offsets to Zookeeper: 0:765,1:765,2:765
19/09/30 10:10:00 INFO JobScheduler: Finished job streaming job 1569809400000 ms.0 from job set of time 1569809400000 ms
19/09/30 10:10:00 INFO JobScheduler: Total delay: 0.051 s for time 1569809400000 ms (execution: 0.048 s)
...