Spark Streaming and Kafka Development in Practice (Part 2)

Part 1 covered basic development and testing of writing data to and reading data from Kafka with Spark Streaming. This part deals with recording the Kafka offsets while consuming, so that data is not lost if the Spark Streaming job terminates unexpectedly. I use the direct read approach (createDirectStream), which does not go through ZooKeeper, so the consumed offsets have to be maintained by the application itself.

  • Consumer code
    package com.baofeng.dataparse
    
    import org.apache.spark.{SparkConf, TaskContext}
    import org.apache.spark.streaming.StreamingContext
    import org.apache.spark.streaming.Seconds
    import kafka.serializer.StringDecoder
    import org.apache.spark.rdd.RDD
    
    import org.apache.spark.streaming.kafka.HasOffsetRanges
    import org.apache.spark.streaming.kafka.OffsetRange
    
    import org.apache.spark.streaming.kafka.KafkaManager
    
    import spray.json._
    
    object Consumer {
    
        def main(args: Array[String]): Unit = {
          println("Consumer")
          val conf = new SparkConf().setMaster("local[2]").setAppName("ReadAndSave")
          val ssc = new StreamingContext(conf, Seconds(5))
    
          val topics = Set("user_msg", "mytopic")
          val brokers = "192.168.201.117:9092"
          val kafkaParams = Map[String, String](
            "metadata.broker.list" -> brokers,
            "serializer.class" -> "kafka.serializer.StringEncoder",
            "group.id" -> "group_stream_id", "auto.offset.reset" -> "largest")
    
          val km = new KafkaManager(kafkaParams)
          // KafkaManager wraps createDirectStream and restores the offsets stored in ZooKeeper
          val kafkaStream = km.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
          var offsetRanges = Array[OffsetRange]()
    
          kafkaStream.transform(rdd => {
            // capture the offset ranges here, while the RDD is still a KafkaRDD
            offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
            rdd
          }).foreachRDD { rdd =>
            for (o <- offsetRanges) {
              println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
            }
            // several topics, each with its own log format that needs its own
            // parsing logic, implemented in LogParser
            rdd.foreach(r => {
              // KafkaRDD partitions map 1:1 to offsetRanges, so the partition id
              // recovers the topic the current record came from
              val offsetRange: OffsetRange = offsetRanges(TaskContext.get.partitionId)
              val obj = LogParser.getObject(offsetRange.topic)
              if (obj == null) {
                println(offsetRange.topic + " error, parser not found")
              } else {
                obj.deal(r)
              }
            })
            // commit the offsets back once the batch has been processed; this must
            // run inside foreachRDD (once per batch), not once on the driver before
            // ssc.start(), where offsetRanges would still be empty
            km.updateZKOffsetsFromoffsetRanges(offsetRanges, 1)
          }
    
          ssc.start()
          ssc.awaitTermination()
        }
    
    }
    

    One point that puzzled me: the messages read from Kafka carry no topic information, so the current topic has to be recovered manually via offsetRanges(TaskContext.get.partitionId), as in the excerpt below:

         kafkaStream.transform(rdd => {
            // according to the official docs, this must run here, in the
            // first transformation applied to the stream
            offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
            rdd
          }).foreachRDD { rdd =>
            for (o <- offsetRanges) {
              println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
            }
            rdd.foreach(r => {
              // the partition id indexes into offsetRanges and yields the topic
              val offsetRange: OffsetRange = offsetRanges(TaskContext.get.partitionId)
              println(offsetRange.topic)
            })
          }


     The HasOffsetRanges cast must be performed in the first method applied to the stream: the subsequent chain of map operations on the RDD turns the KafkaRDD into a plain RDD, and the HasOffsetRanges information is lost.
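
     As an aside, the topic can also be attached to every record up front, instead of being recovered from offsetRanges in each task. The direct-stream API accepts a messageHandler, and MessageAndMetadata exposes the topic directly. A minimal sketch, assuming fromOffsets is the Map[TopicAndPartition, Long] already read back from ZooKeeper (which is what KafkaManager builds internally); this bypasses KafkaManager and calls KafkaUtils directly:

         import kafka.message.MessageAndMetadata
         import kafka.serializer.StringDecoder
         import org.apache.spark.streaming.kafka.KafkaUtils

         // emit (topic, message) instead of (key, message), so every record
         // already knows which topic it came from
         val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
           ssc, kafkaParams, fromOffsets,
           (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message))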

  • Log-parsing (business logic) code
     

    package com.baofeng.dataparse
    
    import spray.json._
    
    trait LogParser {
        // record is the (key, message) pair delivered by the direct stream
        def deal(record: (String, String)): Unit
    }
    
    object LogParser {
      // map each topic name to the parser that understands its log format;
      // the keys must match the topic names the consumer subscribes to
      val objList: Map[String, LogParser] = Map("mytopic" -> new CmsLog, "user_msg" -> new UserLog)
      def getObject(name: String): LogParser = objList.getOrElse(name, null)
    }
    
    class CmsLog() extends LogParser {
      val name: String = "mytopic"
      override def deal(record: (String, String)): Unit = {
        val r = record._2
        // space-separated access-log line
        val data = r.split(" ")
        println(r)
      }
    }
    
    class UserLog() extends LogParser {
      val name: String = "user_msg"
    
      override def deal(record: (String, String)): Unit = {
        // JSON log line; extract the fields of interest
        val data = JsonParser(record._2).asJsObject()
        println(data.getFields("userid") + " " + data.getFields("access"))
      }
    }
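
    Supporting a new topic then just means writing another LogParser implementation and registering it in objList. A sketch, with a hypothetical topic name click_log (the name is made up for illustration):

    class ClickLog() extends LogParser {
      val name: String = "click_log" // hypothetical topic name, for illustration only
    
      override def deal(record: (String, String)): Unit = {
        // record._1 is the Kafka key, record._2 the message body
        println("click: " + record._2)
      }
    }
    
    // and register it alongside the existing parsers:
    // val objList: Map[String, LogParser] =
    //   Map("mytopic" -> new CmsLog, "user_msg" -> new UserLog, "click_log" -> new ClickLog)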
    
    The KafkaManager implementation (code copied and adapted from the web):
    
    
    package org.apache.spark.streaming.kafka
    
    import kafka.common.TopicAndPartition
    import kafka.message.MessageAndMetadata
    import kafka.serializer.Decoder
    import scala.reflect.ClassTag
    import org.apache.spark.SparkException
    import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
    import org.apache.spark.rdd.RDD
    import org.apache.spark.streaming.dstream.InputDStream
    import org.apache.spark.streaming.StreamingContext
    
    class KafkaManager(val kafkaParams: Map[String, String]) extends Serializable {
    
      private val kc = new KafkaCluster(kafkaParams)
    
      private val flag = 1150 * 10000L // 11,500,000 messages; apparently treated as one day's volume (see updateZKOffsetsFromoffsetRanges)
    
    
      def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](
          ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(K, V)] = {
        val groupId = kafkaParams.get("group.id").get
        // before reading offsets from ZooKeeper, bring them up to date with the actual situation
        setOrUpdateOffsets(topics, groupId)
    
        // start consuming messages from the offsets read from ZooKeeper
        val messages = {
          val partitionsE = kc.getPartitions(topics)
          if (partitionsE.isLeft)
            throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
          val partitions = partitionsE.right.get
          val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
          if (consumerOffsetsE.isLeft)
            throw new SparkException(s"get kafka consumer offsets failed: ${consumerOffsetsE.left.get}")
          val consumerOffsets = consumerOffsetsE.right.get
          KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](
            ssc, kafkaParams, consumerOffsets, (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
        }
        messages
      }
    
      /**
        * Before creating the stream, bring the consumer offsets up to
        * date according to the actual consumption state.
        * @param topics
        * @param groupId
        */
      private def setOrUpdateOffsets(topics: Set[String], groupId: String): Unit = {
        topics.foreach(topic => {
          var hasConsumed = true
          val partitionsE = kc.getPartitions(Set(topic))
          if (partitionsE.isLeft)
            throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
          val partitions = partitionsE.right.get
          val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
          if (consumerOffsetsE.isLeft) hasConsumed = false
          if (hasConsumed) { 
            val earliestLeaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
            if (earliestLeaderOffsetsE.isLeft)
              throw new SparkException(s"get earliest leader offsets failed: ${earliestLeaderOffsetsE.left.get}")
            val earliestLeaderOffsets = earliestLeaderOffsetsE.right.get
            val consumerOffsets = consumerOffsetsE.right.get
    
            // possibly only some partitions' consumerOffsets are stale, so update only those to earliestLeaderOffsets
            var offsets: Map[TopicAndPartition, Long] = Map()
            consumerOffsets.foreach({
              case (tp, n) =>
                val earliestLeaderOffset = earliestLeaderOffsets(tp).offset
                if (n < earliestLeaderOffset) {
                  println("consumer group:" + groupId + ",topic:" + tp.topic + ",partition:" + tp.partition +
                    " offsets已经过时,更新为" + earliestLeaderOffset)
                  offsets += (tp -> earliestLeaderOffset)
                }
            })
            if (!offsets.isEmpty) {
              kc.setConsumerOffsets(groupId, offsets)
            }
          } else { // this group has never consumed these partitions
            val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
            var leaderOffsets: Map[TopicAndPartition, LeaderOffset] = null
            if (reset == Some("smallest")) {
              val leaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
              if (leaderOffsetsE.isLeft)
                throw new SparkException(s"get earliest leader offsets failed: ${leaderOffsetsE.left.get}")
              leaderOffsets = leaderOffsetsE.right.get
            } else {
              val leaderOffsetsE = kc.getLatestLeaderOffsets(partitions)
              if (leaderOffsetsE.isLeft)
                throw new SparkException(s"get latest leader offsets failed: ${leaderOffsetsE.left.get}")
              leaderOffsets = leaderOffsetsE.right.get
            }
            val offsets = leaderOffsets.map {
              case (tp, offset) => (tp, offset.offset)
            }
            kc.setConsumerOffsets(groupId, offsets)
          }
        })
      }
    
      /**
        * Update the consumer offsets stored in ZooKeeper,
        * committing the ranges consumed in the current batch.
        *
        * @param rdd
        */
      def updateZKOffsets(rdd: RDD[(String, String)]): Unit = {
        val groupId = kafkaParams.get("group.id").get
        val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    
        for (offsets <- offsetsList) {
          val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
          val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsets.untilOffset)))
          if (o.isLeft) {
            println(s"Error updating the offset to Kafka cluster: ${o.left.get}")
          }
        }
      }
    
      /**
        * Update the consumer offsets stored in ZooKeeper,
        * first rewinding the current offsets by day * flag
        * messages, then committing them.
        *
        * @param offsetRanges
        * @param day
        */
      def updateZKOffsetsFromoffsetRanges(offsetRanges: Array[OffsetRange], day: Double): Unit = {
        val groupId = kafkaParams.get("group.id").get
    
        for (offsets <- offsetRanges) {
          val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
    
          var offsetStreaming = 0L
    
          println("offsets.untilOffset " + offsets.untilOffset)
          if (offsets.untilOffset >= flag) {
            offsetStreaming = offsets.untilOffset - (flag * day).toLong
          } else {
            offsetStreaming = 0
          }
    
          println("offsetStreaming " + offsetStreaming)
    
          val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsetStreaming)))
          if (o.isLeft) {
            println(s"Error updating the offset to Kafka cluster: ${o.left.get}")
          }
        }
      }
    }
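
    To make the rewind arithmetic in updateZKOffsetsFromoffsetRanges concrete, here is a worked example with assumed numbers (the untilOffset of 20,000,000 is made up for illustration):

        // assumed values, purely for illustration
        val flag = 1150 * 10000L                         // 11,500,000 ≈ one day's messages
        val untilOffset = 20000000L                      // position reached by the current batch
        val rewound = untilOffset - (flag * 1.0).toLong  // 8,500,000 is written to ZooKeeper
        // a restarted job resumes from 8,500,000, replaying roughly one day of
        // data; a partition with fewer than flag messages restarts from 0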
    

     
