[Spark Basics]-- Spark Streaming: consuming Kafka data from a specified offset (approach one)

一、Scenario: when a Spark Streaming application exits unexpectedly, data keeps being pushed into Kafka; because Kafka is read from the latest offset by default, the records produced while the application was down are lost to the application. To avoid this data loss, we need to record the offsets of each consumption so that the next run can check them and resume reading from the specified offsets.
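The overall flow, as a minimal sketch (the StreamBuilder object and the savedOffsets parameter below are illustrative assumptions, not code from this post): if offsets recorded by a previous run are available, pass them to createDirectStream; otherwise fall back to the overload that lets auto.offset.reset decide where to start.

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

object StreamBuilder {
  // savedOffsets: offsets recorded by the previous run, if any (how they are stored is up to you)
  def createStream(ssc: StreamingContext,
                   kafkaParams: Map[String, String],
                   topics: Set[String],
                   savedOffsets: Option[Map[TopicAndPartition, Long]]): InputDStream[(String, String)] =
    savedOffsets match {
      case Some(fromOffsets) =>
        // Resume exactly where the previous run stopped
        val handler = (mam: MessageAndMetadata[String, String]) => (mam.topic, mam.message())
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
          ssc, kafkaParams, fromOffsets, handler)
      case None =>
        // First run: nothing recorded yet, so auto.offset.reset decides the starting position
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
          ssc, kafkaParams, topics)
    }
}

The "resume from specified offsets" branch is what the rest of this post implements in full.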

二、Environment: kafka-0.9.0, spark-1.6.0, jdk-1.7, scala-2.10.5, IntelliJ IDEA 16

三、Implementation:

      1、Add the Spark and Kafka related dependencies (pom.xml)

  
  
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.ngaa</groupId>
    <artifactId>test-my</artifactId>
    <version>1.0-SNAPSHOT</version>
    <inceptionYear>2008</inceptionYear>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.10.5</scala.version>
        <!-- 2.11.7 -->
        <jackson.version>2.3.0</jackson.version>
        <slf4j-version>1.7.20</slf4j-version>
        <spark.cdh.version>1.6.0-cdh5.8.0</spark.cdh.version>
        <spark.streaming.cdh.version>1.6.0-cdh5.8.0</spark.streaming.cdh.version>
        <kafka.spark.cdh.version>1.6.0-cdh5.8.0</kafka.spark.cdh.version>
        <hadoop.cdh.version>2.6.0-cdh5.8.0</hadoop.cdh.version>
        <httpclient.version>4.2.5</httpclient.version>
        <httpcore.version>4.2.5</httpcore.version>
        <fastjson.version>1.1.39</fastjson.version>
    </properties>

    <repositories>
        <repository>
            <id>scala-tools.org</id>
            <name>Scala-Tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>scala-tools.org</id>
            <name>Scala-Tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </pluginRepository>
    </pluginRepositories>

    <dependencies>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>${httpclient.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>${httpcore.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j-version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.cdh.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>javax.servlet</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.cdh.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>javax.servlet</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.cdh.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>javax.servlet</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.10</artifactId>
            <version>${spark.streaming.cdh.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.10</artifactId>
            <version>${kafka.spark.cdh.version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-assembly_2.10</artifactId>
            <version>${spark.cdh.version}</version>
            <scope>system</scope>
            <systemPath>D:/crt_send_document/spark-assembly-1.6.0-cdh5.8.0-hadoop2.6.0-cdh5.8.0.jar</systemPath>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-server-web-proxy</artifactId>
            <version>2.6.0-cdh5.8.0</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>test-my</finalName>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.7</arg>
                    </args>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-eclipse-plugin</artifactId>
                <configuration>
                    <downloadSources>true</downloadSources>
                    <buildcommands>
                        <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
                    </buildcommands>
                    <additionalProjectnatures>
                        <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
                    </additionalProjectnatures>
                    <classpathContainers>
                        <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                        <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
                    </classpathContainers>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <reporting>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>
        </plugins>
    </reporting>
</project>
 

      2、Create the test class

 

     

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.slf4j.LoggerFactory

/**
  * Created by yangjf on 2016/12/18
  * Update date:
  * Time: 11:10
  * Description: read Kafka data starting from a specified offset
  * Result of Test:
  * Command:
  * Email: [email protected]
  */
object ReadBySureOffsetTest {
  val logger = LoggerFactory.getLogger(ReadBySureOffsetTest.getClass)

  def main(args: Array[String]) {
    // Set log levels to reduce noise
    Logger.getLogger("org.apache.kafka").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.zookeeper").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    logger.info("Main program started: consuming Kafka from specified offsets")
    if (args.length < 1) {
      System.err.println("Your arguments were " + args.mkString(","))
      logger.info("Main program exiting unexpectedly: missing checkpoint directory argument")
      System.exit(1)
    }
    //e.g. hdfs://hadoop1:8020/user/root/spark/checkpoint
    val Array(checkpointDirectory) = args
    logger.info("checkpoint directory: " + checkpointDirectory)
    val ssc = StreamingContext.getOrCreate(checkpointDirectory,
      () => {
        createContext(checkpointDirectory)
      })
    logger.info("streaming开始启动")
    ssc.start()
    ssc.awaitTermination()
  }

  def createContext(checkpointDirectory: String): StreamingContext = {
    // Configuration
    val brokers = "hadoop3:9092,hadoop4:9092"
    val topics = "20161218a"

    // Batch interval in seconds (the original comment notes 5 seconds as the default; 8 is used here)
    val split_rdd_time = 8
    // Create the streaming context
    val sparkConf = new SparkConf()
      .setAppName("SendSampleKafkaDataToApple").setMaster("local[2]")
      .set("spark.app.id", "streaming_kafka")

    val ssc = new StreamingContext(sparkConf, Seconds(split_rdd_time))

    ssc.checkpoint(checkpointDirectory)

    // Create a direct Kafka stream against the given brokers and topics
    val topicsSet: Set[String] = topics.split(",").toSet
    // Kafka configuration parameters
    val kafkaParams: Map[String, String] = Map[String, String](
      "metadata.broker.list" -> brokers,
      "group.id" -> "apple_sample",
      "serializer.class" -> "kafka.serializer.StringEncoder"
//      "auto.offset.reset" -> "largest"   //自动将偏移重置为最新偏移(默认)
//      "auto.offset.reset" -> "earliest"  //自动将偏移重置为最早的偏移
//      "auto.offset.reset" -> "none"      //如果没有为消费者组找到以前的偏移,则向消费者抛出异常
    )
    /**
      * Read Kafka data starting from the specified offsets.
      * Note: with the direct approach, Spark itself reads each record exactly once;
      *       end-to-end exactly-once still depends on how the results are written out.
      *       Given the starting offsets, reading resumes from where the previous run
      *       of the streaming program stopped.
      */
    val offsetList = List((topics, 0, 22753623L), (topics, 1, 327041L))                              // (topic, partition number, offset)
    val fromOffsets = setFromOffsets(offsetList)                                                     // build the fromOffsets map
    val messageHandler = (mam: MessageAndMetadata[String, String]) => (mam.topic, mam.message())     // map each MessageAndMetadata to (topic, message)
    // Consume from the specified offsets using the direct (simple-consumer) API; for details see
    // "http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.streaming.kafka.KafkaUtils$"
    val messages: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)

    // Process the data
    messages.foreachRDD(mess => {
      // Get the offset ranges of this batch
      val offsetsList = mess.asInstanceOf[HasOffsetRanges].offsetRanges
      mess.foreachPartition(lines => {
        lines.foreach(line => {
          val o: OffsetRange = offsetsList(TaskContext.get.partitionId)
          logger.info("++++++++++++++++++++++++++++++ record the offsets here +++++++++++++++++++++++++++++++++++++++")
          logger.info(s"${o.topic}  ${o.partition}  ${o.fromOffset}  ${o.untilOffset}")
          logger.info("+++++++++++++++++++++++++++++++ consume the data here ++++++++++++++++++++++++++++++++++++++")
          logger.info("The kafka line is " + line)
        })
      })
    })
    ssc
  }

  // Build the Map[TopicAndPartition, Long] expected by createDirectStream
  def setFromOffsets(list: List[(String, Int, Long)]): Map[TopicAndPartition, Long] = {
    var fromOffsets: Map[TopicAndPartition, Long] = Map()
    for (offset <- list) {
      val tp = TopicAndPartition(offset._1, offset._2) // topic and partition number
      fromOffsets += (tp -> offset._3)                 // starting offset
    }
    fromOffsets
  }
}
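The foreachRDD above only logs the offset ranges. To complete the scenario from section 一, those ranges have to be persisted somewhere and turned back into the fromOffsets map on the next start. A minimal sketch, assuming a plain local file as the store (the OffsetStore name, the file path argument and the CSV line format are assumptions made for illustration; ZooKeeper, HDFS or a database would serve the same purpose):

import java.io.{File, PrintWriter}
import scala.io.Source
import kafka.common.TopicAndPartition
import org.apache.spark.streaming.kafka.OffsetRange

object OffsetStore {
  // Write one "topic,partition,untilOffset" line per partition
  def save(path: String, ranges: Array[OffsetRange]): Unit = {
    val writer = new PrintWriter(new File(path))
    try ranges.foreach(r => writer.println(s"${r.topic},${r.partition},${r.untilOffset}"))
    finally writer.close()
  }

  // Read the file back into the Map expected by KafkaUtils.createDirectStream
  def load(path: String): Map[TopicAndPartition, Long] =
    Source.fromFile(path).getLines().map { line =>
      val Array(topic, partition, offset) = line.split(",")
      TopicAndPartition(topic, partition.toInt) -> offset.toLong
    }.toMap
}

With something like this in place, OffsetStore.save(path, offsetsList) could be called at the end of each foreachRDD batch, and OffsetStore.load(path) could replace the hard-coded offsetList in createContext on the next start.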

 

四、References:

    1、Spark API: http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.streaming.kafka.KafkaUtils$

    2、Kafka configuration documentation: http://kafka.apache.org/documentation.html#configuration

    3、Kafka SimpleConsumer example: https://cwiki.apache.org/confluence/display/KAFKA/0.8.0+SimpleConsumer+Example

    4、Spark Streaming + Kafka integration guide (offset handling): http://spark.apache.org/docs/1.6.0/streaming-kafka-integration.html

    5、Kafka consumer API docs: http://kafka.apache.org/090/javadoc/index.html?org/apache/kafka/clients/consumer/KafkaConsumer.html

Note: the code above has been tested and works; adjust it as needed. If you have any questions, please leave a comment!

 

 
