Saving Spark Streaming Kafka offsets in Redis

What this implements:

Read the saved offsets for a given Kafka topic and consumer group from Redis.

Create the Kafka direct stream starting from the offsets that were read.

After each batch is processed, save the offset of each partition back to Redis.
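Throughout the post the offsets live in Redis as a plain string: the key is kafka:<topic>:<groupid> and the value is <partition>:<offset>;<partition>:<offset>. The read path below does not handle a missing key, so it helps to seed the key once before the first run. The following is a minimal sketch of doing that with Jedis; the Redis address and the partition numbers are placeholders, not part of the original code.

import redis.clients.jedis.Jedis

object SeedOffsets {
  def main(args: Array[String]): Unit = {
    // Placeholder Redis address; adjust to your environment
    val jedis = new Jedis("localhost", 6379)
    // Key layout: kafka:<topic>:<groupid>
    val key = "kafka:datacollection:SparkKafka010"
    // Value layout: <partition>:<offset>;<partition>:<offset>
    // Assumes the topic has partitions 0, 1 and 2, all starting at offset 0
    jedis.set(key, "0:0;1:0;2:0")
    println(jedis.get(key))
    jedis.close()
  }
}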


import Kafka010.Utils.{MyKafkaUtils, RedisUtilsDemo}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}

object Test {
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(s"${this.getClass.getCanonicalName}")
    // Create the StreamingContext with a 2-second batch interval
    val ssc = new StreamingContext(conf, Seconds(2))
    // Kafka consumer group and topic
    val groupId = "SparkKafka010"
    val topics = List("datacollection")
    // Build the Kafka consumer parameters; MyKafkaUtils is a custom helper shown below
    val kafkaParams = MyKafkaUtils.getKafkaConsumerParams(groupId, "false")
    // Read the saved offsets from Redis
    val offsets: Map[TopicPartition, Long] = RedisUtilsDemo.getOffsetFromRedis("datacollection", groupId)
    // Create the direct stream, starting from the offsets read above
    val ds: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsets)
    )
    ds.foreachRDD(rdd => {
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // Process the data (here we simply count the records)
      if (!rdd.isEmpty())
        println(rdd.count)
      // Inspect the offset range of each partition in this batch
      ranges.foreach(offset =>
        println(s"${offset.partition}, ${offset.fromOffset}, ${offset.untilOffset}")
      )
      // Save the offsets back to Redis
      RedisUtilsDemo.saveOffsetToRedis(ranges, groupId)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
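On the very first run there is nothing stored in Redis yet, so getOffsetFromRedis would fail when jedis.get returns null. The following is a minimal sketch of one way to handle that, assuming the same ssc, topics, kafkaParams and groupId as above and using the getOffsetFromRedis1 variant (defined further down) that returns an Option:

// Sketch only: reuses ssc, topics, kafkaParams and groupId from the program above
val maybeOffsets: Option[Map[TopicPartition, Long]] =
  RedisUtilsDemo.getOffsetFromRedis1("datacollection", groupId)

val stream: InputDStream[ConsumerRecord[String, String]] = maybeOffsets match {
  case Some(offsets) =>
    // Offsets were found in Redis: resume from them
    KafkaUtils.createDirectStream[String, String](ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsets))
  case None =>
    // Nothing saved yet: subscribe without offsets and let auto.offset.reset decide where to start
    KafkaUtils.createDirectStream[String, String](ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
}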
The MyKafkaUtils utility class
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer

object MyKafkaUtils {
  def getKafkaConsumerParams(groupId: String = "SparkStreaming010", autoCommit: String = "true"): Map[String, String] = {
    val kafkaParams = Map[String, String](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "mini1:9092,mini2:9092",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer].getName,
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer].getName,
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> autoCommit)
    kafkaParams
  }

  def main(args: Array[String]): Unit = {
  }
}
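The helper above sets only the minimum consumer properties. As a hedged extension, not part of the original class, one could also pin auto.offset.reset so the behaviour is explicit when no saved offsets exist yet:

// Sketch only: adds an explicit starting position to the original parameters.
// "earliest" is an assumption; use "latest" to skip data produced before the first run.
val paramsWithReset: Map[String, String] =
  MyKafkaUtils.getKafkaConsumerParams("SparkKafka010", "false") +
    (ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest")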

The RedisUtilsDemo utility class
import cn.bigdata.antispider.common.util.jedis.{JedisConUtil, JedisConnectionUtil}
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange

object RedisUtilsDemo {
  // Obtain a Jedis connection
  private val jedis = JedisConUtil.getJedisClient()

  // Read offsets.
  // key:   kafka:<topic>:<groupid>
  // value: <partition>:<offset>;<partition>:<offset>
  // Does not handle a missing key; for testing, set the key in advance.
  def getOffsetFromRedis(topic: String, groupid: String): Map[TopicPartition, Long] = {
    val key = s"kafka:$topic:$groupid"
    val offsetStr: String = jedis.get(key)
    offsetStr.split(";").map(str => {
      val fields = str.split(":")
      val partition: Int = fields.head.toInt
      val offset: Long = fields.last.toLong
      new TopicPartition(topic, partition) -> offset
    }).toMap
  }

  // Accepts multiple topics and returns the offsets stored for each of them
  def getOffsetFromRedis2(topics: Iterator[String], groupid: String): Iterator[Option[Map[TopicPartition, Long]]] = {
    topics.map { topic =>
      val key = s"kafka:$topic:$groupid"
      val offsetStr: String = jedis.get(key)
      if (offsetStr != null && offsetStr.trim.size > 0) {
        val offsets = offsetStr.split(";").map { str =>
          val fields = str.split(":")
          val partition: Int = fields.head.toInt
          val offset: Long = fields.last.toLong
          new TopicPartition(topic, partition) -> offset
        }.toMap
        Some(offsets)
      }
      else None
    }
  }

  // Single-topic variant that returns None when no offsets are stored yet
  def getOffsetFromRedis1(topic: String, groupid: String): Option[Map[TopicPartition, Long]] = {
    val key = s"kafka:$topic:$groupid"
    val offsetStr: String = jedis.get(key)
    if (offsetStr != null && offsetStr.trim.size > 0) {
      val offsets = offsetStr.split(";").map(str => {
        val fields = str.split(":")
        val partition: Int = fields.head.toInt
        val offset: Long = fields.last.toLong
        new TopicPartition(topic, partition) -> offset
      }).toMap
      Some(offsets)
    }
    else
      None
  }

  // Save offsets.
  // key:   kafka:<topic>:<groupid>
  // value: <partition>:<offset>;<partition>:<offset>
  def saveOffsetToRedis(ranges: Array[OffsetRange], groupId: String): Unit = {
    ranges.map(offsets => (offsets.topic, (offsets.partition, offsets.untilOffset)))
      .groupBy(_._1)
      .foreach { case (topic, buffer) =>
        val key = s"kafka:$topic:$groupId"
        val value = buffer.map { case (_, (partition, untilOffset)) => s"$partition:$untilOffset" }.mkString(";")
        jedis.set(key, value)
      }
  }
}
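JedisConUtil comes from the author's own project (cn.bigdata.antispider) and is not shown in the post. Below is a minimal sketch of what such a connection helper might look like, assuming a Jedis pool against a single Redis instance with a placeholder host and port:

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

// Hypothetical stand-in for the JedisConUtil used above
object JedisConUtilSketch {
  // Placeholder host/port; the real project presumably reads these from configuration
  private val pool = new JedisPool(new JedisPoolConfig(), "localhost", 6379)

  // Borrow a connection from the pool; close it when finished to return it
  def getJedisClient(): Jedis = pool.getResource
}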

 
