Spark Streaming program:
package com.cloudera.savekafkaoffset
import com.cloudera.utils.{JedisPoolUtils, KafkaRedisUtils, RedisConfig}
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Pipeline
import scala.collection.immutable.Map
object SparkSaveOffsetToRedisApp {
  private val logger: Logger = Logger.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {

    // Kafka offsets are stored in Redis as a hash keyed by module:groupId:topic
    val module: String = "Test"
    val groupId: String = "groupId-01"
    val topics: Array[String] = "my-topic".split(",")

    // Kafka consumer parameters used when Spark Streaming consumes from Kafka
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.1.100:9092,192.168.1.101:9092,192.168.1.102:9092",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest",
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
    )

    // Initialize the Redis connection pool
    JedisPoolUtils.makePool(RedisConfig("192.168.1.100", 6379, 30000, 1000, 100, 50))

    val conf = new SparkConf().setIfMissing("spark.master", "local[2]").setAppName("Spark Save Offset To Redis App")
    val streamingContext = new StreamingContext(conf, Seconds(30))

    val kafkaStream = KafkaRedisUtils.createDirectStream(streamingContext, kafkaParams, module, groupId, topics)

    // Process each batch
    kafkaStream.foreachRDD(rdd => {
      // Get the offset ranges of the current batch's RDD
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      // Process the batch only if it contains data
      if (!rdd.isEmpty()) {
        // Get a Redis connection from the pool
        val jedisClient = JedisPoolUtils.getPool.getResource
        // Open a transaction on a pipeline
        val pipeline: Pipeline = jedisClient.pipelined()
        pipeline.multi()
        try {
          // Process the data fetched from Kafka
          val result = rdd.map(_.value()).map(_.split("\\|\\|")).map(x => (x(0), x(1), x(2)))
          logger.info("==========> Total " + rdd.count() + " events in this batch ..")
          result.foreach(println(_))

          // Update the offsets in Redis
          offsetRanges.foreach({ offsetRange =>
            logger.info("==========> partition : " + offsetRange.partition + " fromOffset: " + offsetRange.fromOffset
              + " untilOffset: " + offsetRange.untilOffset)
            // Kafka offsets are stored in Redis as a hash keyed by module:groupId:topic
            val key = s"${module}:${groupId}:${offsetRange.topic}"
            pipeline.hset(key, offsetRange.partition.toString, offsetRange.untilOffset.toString)
          })

          // Commit the transaction
          pipeline.exec()
          // Flush the pipeline and read back the responses
          pipeline.sync()
        } catch {
          case e: Exception => {
            logger.error("Error while processing the batch", e)
            pipeline.discard()
          }
        } finally {
          // Release the connection
          pipeline.close()
          jedisClient.close()
        }
      }
    })

    streamingContext.start()
    streamingContext.awaitTermination()
    streamingContext.stop()
  }
}
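After a batch commits, the offsets can be read back from Redis using the same module:groupId:topic key layout, which is a convenient way to verify the job. A minimal sketch under the values used above; the PrintStoredOffsets object name is only illustrative:

package com.cloudera.savekafkaoffset

import com.cloudera.utils.{JedisPoolUtils, RedisConfig}
import scala.collection.JavaConversions._

object PrintStoredOffsets {
  def main(args: Array[String]): Unit = {
    // Same pool settings as the streaming job above
    JedisPoolUtils.makePool(RedisConfig("192.168.1.100", 6379, 30000, 1000, 100, 50))
    val jedis = JedisPoolUtils.getPool.getResource
    try {
      // Offsets are stored as a hash: field = partition, value = untilOffset
      val key = "Test:groupId-01:my-topic"
      jedis.hgetAll(key).foreach { case (partition, offset) =>
        println(s"partition $partition -> offset $offset")
      }
    } finally {
      jedis.close()
    }
  }
}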
JedisPoolUtils utility class:
package com.cloudera.utils
import redis.clients.jedis.{JedisPool, JedisPoolConfig}
case class RedisConfig(redisHost: String, redisPort: Int, redisTimeout: Int, maxTotal: Int, maxIdle: Int, minIdle: Int) extends Serializable
object JedisPoolUtils extends Serializable {
  @transient private var pool: JedisPool = null

  def makePool(redisConfig: RedisConfig): Unit = {
    makePool(redisConfig.redisHost, redisConfig.redisPort, redisConfig.redisTimeout, redisConfig.maxTotal, redisConfig.maxIdle, redisConfig.minIdle)
  }

  def makePool(redisHost: String, redisPort: Int, redisTimeout: Int, maxTotal: Int, maxIdle: Int, minIdle: Int): Unit = {
    makePool(redisHost, redisPort, redisTimeout, maxTotal, maxIdle, minIdle, true, false, 10000)
  }

  def makePool(redisHost: String, redisPort: Int, redisTimeout: Int, maxTotal: Int, maxIdle: Int, minIdle: Int, testOnBorrow: Boolean, testOnReturn: Boolean, maxWaitMillis: Long): Unit = {
    if (pool == null) {
      val poolConfig = new JedisPoolConfig()
      poolConfig.setMaxTotal(maxTotal)
      poolConfig.setMaxIdle(maxIdle)
      poolConfig.setMinIdle(minIdle)
      poolConfig.setTestOnBorrow(testOnBorrow)
      poolConfig.setTestOnReturn(testOnReturn)
      poolConfig.setMaxWaitMillis(maxWaitMillis)
      pool = new JedisPool(poolConfig, redisHost, redisPort, redisTimeout)

      val hook = new Thread {
        override def run = pool.destroy()
      }
      sys.addShutdownHook(hook.run)
    }
  }

  def getPool: JedisPool = {
    assert(pool != null)
    pool
  }
}
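A quick standalone check of the pool, assuming a Redis instance is reachable at the host and port used above; the JedisPoolSmokeTest object name is only illustrative:

package com.cloudera.utils

object JedisPoolSmokeTest {
  def main(args: Array[String]): Unit = {
    JedisPoolUtils.makePool(RedisConfig("192.168.1.100", 6379, 30000, 1000, 100, 50))
    val jedis = JedisPoolUtils.getPool.getResource
    try {
      // PING returns "PONG" when the connection is healthy
      println(jedis.ping())
    } finally {
      // Return the connection to the pool
      jedis.close()
    }
  }
}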
KafkaRedisUtils utility class:
package com.cloudera.utils
import java.util
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.log4j.Logger
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils}
import scala.collection.JavaConversions._
import scala.collection.immutable.Map
object KafkaRedisUtils {
  private val logger: Logger = Logger.getLogger(this.getClass)

  /**
   * Create a DirectStream
   *
   * @param streamingContext streaming context
   * @param kafkaParams      Kafka consumer parameters
   * @param module           module name
   * @param groupId          consumer group
   * @param topics           topics
   * @return
   */
  def createDirectStream(streamingContext: StreamingContext, kafkaParams: Map[String, Object],
                         module: String, groupId: String, topics: Array[String]): InputDStream[ConsumerRecord[String, String]] = {

    // Read the stored offsets for the topics
    val storedOffsets = readOffsets(module, groupId, kafkaParams, topics)

    val kafkaStream: InputDStream[ConsumerRecord[String, String]] = storedOffsets match {
      // No offsets were saved on the previous run
      case None => {
        KafkaUtils.createDirectStream[String, String](
          streamingContext,
          PreferConsistent,
          ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
        )
      }
      case Some(fromOffsets) => {
        KafkaUtils.createDirectStream[String, String](
          streamingContext,
          PreferConsistent,
          // Assign consumes from fixed partitions and cannot detect partition changes dynamically
          // ConsumerStrategies.Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets)
          ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, fromOffsets)
        )
      }
    }
    kafkaStream
  }

  /**
   * Read the stored offsets
   *
   * @param module      module name
   * @param groupId     consumer group
   * @param kafkaParams Kafka consumer parameters
   * @param topics      topics
   * @return
   */
  def readOffsets(module: String, groupId: String, kafkaParams: Map[String, Object], topics: Array[String]): Option[Map[TopicPartition, Long]] = {
    logger.info("Reading offsets from Redis")
    val jedis = JedisPoolUtils.getPool.getResource

    // Starting offset for each partition
    var fromOffSets: Map[TopicPartition, Long] = Map()
    try {
      topics.foreach(topic => {
        var topicFromOffsets: Map[TopicPartition, Long] = Map()
        val key = s"${module}:${groupId}:${topic}"
        if (jedis.exists(key)) {
          val offsetMap: util.Map[String, String] = jedis.hgetAll(key)
          offsetMap.map(record => {
            // Check whether the field (i.e. the partition) exists
            if (jedis.hexists(key, record._1)) {
              logger.info(s"topic ${topic} partition ${record._1} get lastSavedOffset from redis: ${record._2}")
              topicFromOffsets += new TopicPartition(topic, record._1.toInt) -> record._2.toLong
            } else {
              jedis.hset(key, record._1, "0")
            }
          })
        }
        fromOffSets ++= topicFromOffsets
      })
    } catch {
      case e: Exception =>
        logger.error("readOffsets error ", e)
        System.exit(1)
    } finally {
      jedis.close()
    }
    if (fromOffSets.isEmpty) {
      None
    } else {
      Some(fromOffSets)
    }
  }
}
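readOffsets returns None when no hash exists for the key, so on the very first run the stream falls back to Subscribe with auto.offset.reset. A minimal sketch that seeds one partition offset by hand and reads it back; the ReadOffsetsDemo object name and the offset value 42 are only illustrative, and since readOffsets does not consult kafkaParams an empty map suffices:

package com.cloudera.utils

object ReadOffsetsDemo {
  def main(args: Array[String]): Unit = {
    JedisPoolUtils.makePool(RedisConfig("192.168.1.100", 6379, 30000, 1000, 100, 50))

    val jedis = JedisPoolUtils.getPool.getResource
    try {
      // Seed partition 0 of my-topic under the module:groupId:topic key
      jedis.hset("Test:groupId-01:my-topic", "0", "42")
    } finally {
      jedis.close()
    }

    // Expected output: Some(Map(my-topic-0 -> 42))
    val offsets = KafkaRedisUtils.readOffsets("Test", "groupId-01", Map.empty[String, Object], Array("my-topic"))
    println(offsets)
  }
}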