Spark Streaming 2.2: testing three ways to save Kafka offsets — with checkpoints, with Kafka (1.0.1) itself, and with Redis

1. Test code: saving the Kafka offsets with each of the three methods (unoptimized version)

package kafka.comsumer

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.PropUtil

/**
  * @author yanghb
  * @date 2019/7/25 10:03
  * @description: let Kafka itself maintain the offsets; reads multiple topics
  */
object KafkaOffset {

  //Load the configuration values
  val prop = new PropUtil("config.properties")
  val oracleUrl = prop.getProp("ORACLE_URL")
  val oracleUser = prop.getProp("ORACLE_USER")
  val oraclePassword = prop.getProp("ORACLE_PASSWORD")
  val brokers = prop.getProp("KAFKA_BROKERS")
  val groupName:String = this.getClass.getName

  def main(args: Array[String]): Unit = {

    //Get the SparkSession
    val spark = SparkSession.builder().appName(groupName).master("local[4]").getOrCreate()
//        val spark = SparkSession.builder().appName("SparkToOracleStatus").getOrCreate()
    val sc = spark.sparkContext
    //Set the log level
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(5))

    //Topics to read
    //    val topics=Array("DC_HISTORY_STATUS_T2","DC_HISTORY_STATUS_T")
    val topics = Array("testTopic2")

    //Kafka parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupName,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    //Create the direct stream
    val messages = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    var offsetRanges = Array[OffsetRange]()

    //The business logic inside messages.foreachRDD runs on the driver
    messages.foreachRDD { kafkaRDD =>
      //Only process batches whose RDD actually contains data
      if (!kafkaRDD.isEmpty()) {

        //Only a KafkaRDD can be cast to HasOffsetRanges, which exposes the offsets
        offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges

        //Extract the message values from the records
        val kafkaData: RDD[String] = kafkaRDD.map(_.value())

        //TODO: debug output, this loop can be commented out
        for (o <- offsetRanges) {
          println(o)
        }

        // Data processing: parse the JSON and convert the timestamps
        kafkaData.foreachPartition(rdds => {

            // Data push: write the records to Oracle in batches
            rdds.foreach(x => {
              println(x)
            })

        })

        // Update the offsets: after processing, commit them back to the Kafka consumer group
        messages.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }

}
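In the KafkaOffset job above, the offsets live in the consumer group itself, and commitAsync writes them back once each batch has been processed. commitAsync in spark-streaming-kafka-0-10 also has an overload that takes an OffsetCommitCallback, which is useful for confirming whether a commit actually landed. A minimal sketch of that variant (the logging is illustrative; messages and offsetRanges are the variables from the code above):

import java.util

import org.apache.kafka.clients.consumer.{OffsetAndMetadata, OffsetCommitCallback}
import org.apache.kafka.common.TopicPartition

// Commit the processed offsets and log the result of the asynchronous commit
messages.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges, new OffsetCommitCallback {
  override def onComplete(offsets: util.Map[TopicPartition, OffsetAndMetadata], exception: Exception): Unit = {
    if (exception != null) {
      // The commit failed; these offsets may be reprocessed after a restart (at-least-once)
      println(s"offset commit failed: ${exception.getMessage}")
    } else {
      println(s"offset commit succeeded: $offsets")
    }
  }
})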
package kafka.comsumer

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Jedis
import utils.PropUtil

/**
  * @author yanghb
  * @date 2019/7/25 10:03
  * @description: topics default to 3 partitions; a direct stream connection is established for the subscribed topics;
  *                  if partitions are changed after going live, or a topic is added, the Redis keys must be deleted so the data is pulled again
  */
object RedisOffset {

  //Load the configuration values
  val prop = new PropUtil("config.properties")
  val oracleUrl = prop.getProp("ORACLE_URL")
  val oracleUser = prop.getProp("ORACLE_USER")
  val oraclePassword = prop.getProp("ORACLE_PASSWORD")
  val brokers = prop.getProp("KAFKA_BROKERS")
  val groupName: String = this.getClass.getName

  def main(args: Array[String]): Unit = {

    //Get the SparkSession, creating it if it does not exist
    val spark = SparkSession.builder().appName(groupName).master("local[4]").getOrCreate()
    //        val spark = SparkSession.builder().appName("SparkToOracleStatus").getOrCreate()
    val sc = spark.sparkContext
    //Set the log level
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(5))

    //Topics to read
    //    val topics=Array("DC_HISTORY_STATUS_T2","DC_HISTORY_STATUS_T")
    val topics = Array("testTopic2","DC_HISTORY_STATUS_T","DC_HISTORY_STATUS_T_2")

    //Kafka parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupName,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    var kafkaStream: InputDStream[ConsumerRecord[String, String]] = null
    var fromOffsets: Map[TopicPartition, Long] = Map()

    //Redis host; a connection pool could also be used
    val redisHost = "10.1.168.140"

    //Short-lived connection, used only to read the saved offsets at startup
    val js = new Jedis(redisHost)
    try {

      //TODO: only partition 0 of the first topic is checked here; better to loop over every topic for this check
      if (js.exists(topics(0) + "-0")) {
        //Loop over the topics and collect their saved offsets
        for (i <- 0 until topics.length) {
          if (js.exists(topics(i) + "-0")) {
            //TODO: the partition count is hard-coded; it should be the largest partition count among the topics (5 is assumed here); each topic's offsets are added to fromOffsets as a map
            for (j <- 0 until 5) {
              //If the key exists, read its offset
              if (js.exists(topics(i) + "-" + j)) {
                val tp = new TopicPartition(topics(i), j)
                val offset: Long = js.get(topics(i) + "-" + j).toLong
                fromOffsets += (tp -> offset)
              }
            }
          }
        }
        //Create the stream from the saved offsets (outside the loop above, so only one stream is created)
        kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, ConsumerStrategies.Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets))
        println("Created stream from offsets saved in Redis")
      } else {
        //First run: create a fresh stream via Subscribe
        kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
        println("Created new stream")
      }

    } catch {
      case ex: Exception => println(ex)
    } finally {
      //Close the startup connection; it must not be reused inside foreachRDD, which only runs after ssc.start()
      js.close()
    }

    var offsetRanges = Array[OffsetRange]()

    //The business logic inside kafkaStream.foreachRDD runs on the driver
    kafkaStream.foreachRDD { kafkaRDD =>
      //Only process batches whose RDD actually contains data
      if (!kafkaRDD.isEmpty()) {
        //Only a KafkaRDD can be cast to HasOffsetRanges, which exposes the offsets
        offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges

        //Extract the message values
        val kafkaData: RDD[String] = kafkaRDD.map(_.value())

        kafkaData.foreachPartition(da => {
          da.foreach(x => {
            //            println(x)
          })
        })

        //Commit the offsets to Redis with a fresh connection, opened per batch on the driver
        val jedis = new Jedis(redisHost)
        try {
          for (o <- offsetRanges) {
            jedis.set(o.topic + "-" + o.partition, o.untilOffset.toString)
            println(o)
          }
        } finally {
          jedis.close()
        }
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }

}
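The comment in RedisOffset notes that a connection pool could be used instead of creating Jedis connections directly. A minimal sketch with JedisPool (host, port, and pool sizes are illustrative; in a real job the pool would typically live in a lazily initialized singleton on each JVM):

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

object RedisPool {
  private val config = new JedisPoolConfig()
  config.setMaxTotal(20) // at most 20 connections in the pool
  config.setMaxIdle(5)
  private val pool = new JedisPool(config, "10.1.168.140", 6379)

  // Borrow a connection, run the supplied function, and always return the connection
  def withJedis[T](f: Jedis => T): T = {
    val jedis = pool.getResource
    try f(jedis) finally jedis.close() // close() returns a pooled connection to the pool
  }
}

// Example: committing one topic-partition's offset through the pool
RedisPool.withJedis(j => j.set("testTopic2-0", "12345"))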
package kafka.comsumer

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import utils.PropUtil

/**
  * @author yanghb
  * @date 2019/7/25 10:03
  * @description: a checkpoint can maintain the offsets of several topics at once and ensures the data is read exactly once, but any code change requires clearing the checkpoint directory
  */

object CheckpointOffset {

  //Load the configuration values
  val prop = new PropUtil("config.properties")
  val oracleUrl = prop.getProp("ORACLE_URL")
  val oracleUser = prop.getProp("ORACLE_USER")
  val oraclePassword = prop.getProp("ORACLE_PASSWORD")
  val brokers = prop.getProp("KAFKA_BROKERS")
  //  val checkpointDir = prop.getProp("checkpointDir")
  val checkpointDir = "./CheckpointOffset"
  val groupName: String = this.getClass.getName

  def functionToCreateContext(): StreamingContext = {

    //Get the SparkSession, creating it if it does not exist
    val spark = SparkSession.builder().appName(groupName).master("local[3]").getOrCreate()
    //        val spark = SparkSession.builder().appName("SparkToOracleStatus").getOrCreate()
    val sc = spark.sparkContext
    //Set the log level
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(3))
    ssc.checkpoint(checkpointDir)

    //Topics to read
    val topics = Array("testTopic2", "DC_HISTORY_STATUS_T")

    //Kafka parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupName,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    var kafkaStream: InputDStream[ConsumerRecord[String, String]] = null

    kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))

    kafkaStream.foreachRDD(kafkaRDD => {

      //TODO: debug output, can be commented out
      val offsetRanges: Array[OffsetRange] = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges

      for (o <- offsetRanges) {
        println(o)
      }

      //Extract the message values
      val value: RDD[String] = kafkaRDD.map(x => {
        x.value()
      })

      //Business logic
      value.foreachPartition(rdds => {
        rdds.foreach(x => {
          println(x)
        })
      })

    })

    ssc
  }

  def main(args: Array[String]): Unit = {
    // Create the context, recovering from the checkpoint if one exists
    val context = StreamingContext.getOrCreate(checkpointDir, functionToCreateContext _)

    // Start the streaming computation
    context.start()
    context.awaitTermination()
  }
}
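As the description above says, the checkpoint stores the serialized DStream graph, so a job whose code has changed usually cannot recover from an old checkpoint; in that case the checkpoint directory has to be cleared before the new version starts. A minimal sketch of clearing it with the Hadoop FileSystem API (works for local paths as well as HDFS; the ClearCheckpoint object is just for illustration, not part of the original code):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object ClearCheckpoint {
  def main(args: Array[String]): Unit = {
    val checkpointDir = "./CheckpointOffset" // same directory the streaming job uses
    val path = new Path(checkpointDir)
    val fs: FileSystem = path.getFileSystem(new Configuration())
    if (fs.exists(path)) {
      fs.delete(path, true) // recursive delete
      println(s"deleted checkpoint directory: $checkpointDir")
    }
  }
}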

2. Utils

package utils

import java.io.InputStream
import java.util.Properties

/**
  * Reads values from a properties file on the classpath
  * @param file name of the properties file, e.g. "config.properties"
  */

class PropUtil(val file: String) {
  private val prop = new Properties()

  //Load the properties file from the classpath once, when the instance is created
  private val ipStream: InputStream = this.getClass.getResourceAsStream("/" + file)
  try {
    prop.load(ipStream)
  } finally {
    if (ipStream != null) ipStream.close()
  }

  def getProp(key: String): String = prop.getProperty(key)
}
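For reference, a config.properties matching the keys the jobs read (typically placed under src/main/resources so it ends up on the classpath) might look like the following; all values here are placeholders:

ORACLE_URL=jdbc:oracle:thin:@//dbhost:1521/orcl
ORACLE_USER=test_user
ORACLE_PASSWORD=test_password
KAFKA_BROKERS=broker1:9092,broker2:9092,broker3:9092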

3. POM


    <properties>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.2.0</spark.version>
        <hadoop.version>3.0.0</hadoop.version>
        <!-- 2.0.0 -->
        <ojdbc7>12.1.0.2</ojdbc7>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>1.0.0</version>
            <exclusions>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.6.6</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>compile</scope>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>

        <dependency>
            <groupId>com.github.noraui</groupId>
            <artifactId>ojdbc7</artifactId>
            <version>${ojdbc7}</version>
        </dependency>

        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.9.0</version>
        </dependency>

    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

 
