Spark Streaming connecting to Kafka: preventing duplicate consumption
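The idea: turn off the consumer's auto-commit, start the direct stream from offsets previously saved in ZooKeeper (Assign when saved offsets exist, Subscribe on the first run), and write the processed offsets back to ZooKeeper after each batch. On a restart the job then resumes from the last recorded position instead of re-reading messages that were already handled. The code below uses the spark-streaming-kafka-0-10 direct API together with the old Kafka SimpleConsumer API to look up partition leaders and the earliest offsets Kafka still retains.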

package com.manulife.mbps.behavior.data.stream

import java.util.Properties

import com.cloudera.com.amazonaws.util.json.JSONObject
import com.manulife.mbps.behavior.common.utils.{IpUtil, ValidateUtil}
import kafka.api.{OffsetRequest, PartitionOffsetRequestInfo, TopicMetadataRequest}
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer
import kafka.producer.{KeyedMessage, Producer, ProducerConfig}
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.{ZkClient, ZkConnection}
import org.I0Itec.zkclient.serialize.ZkSerializer
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.{Assign, Subscribe}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}


/**
  * Created by nickliu on 7/5/2017.
  */
object SparkStream_kafka {

  def main(args: Array[String]) {
    val topic =   // Kafka topic name
    val topics = Set( /* Kafka topic names */ )
    val group =   // consumer group id
    val zkHosts = // ZooKeeper connection string (host:port[,host:port...])
    val brokers = // Kafka broker list (host:port[,host:port...])

    val sparkConf = new SparkConf().setAppName("Test-SparkDemo-kafka").setMaster("local[3]")
    /** Cap the number of messages consumed per second from each partition of the topic */
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "1000")

    val ssc = new StreamingContext(sparkConf, Seconds(1))

    /** kafka config */
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> group,
      "zookeeper.set.acl" -> "false",
      "auto.offset.reset" -> "latest", //kafka.api.OffsetRequest.SmallestTimeString
      "enable.auto.commit" -> (false: java.lang.Boolean) // 关闭自动提交
    )

    /** Read the offsets previously saved in ZooKeeper */
    val topicDirs = new ZKGroupTopicDirs(group, topic)
    /** consumerOffsetDir: the offset path for this group/topic in ZooKeeper, here /consumers/example/offsets/ICPRecommend */
    val consumerOffsetDir = topicDirs.consumerOffsetDir
    /** zookeeper connection */
    val zkClient = new ZkClient(zkHosts, 10000, 10000, new ZkSerializer {
      override def serialize(data: scala.Any): Array[Byte] = data.asInstanceOf[String].getBytes("UTF-8")

      override def deserialize(bytes: Array[Byte]): AnyRef = if (bytes == null) null else new String(bytes, "UTF-8")
    })

    // ZkUtils is used below to write the updated offsets back to ZooKeeper
    val zkUtils = new ZkUtils(zkClient, new ZkConnection(zkHosts), false)


    /** Number of partitions that already have an offset node stored in ZooKeeper */
    val children = zkClient.countChildren(consumerOffsetDir)

    var kafkaStream: InputDStream[ConsumerRecord[String, String]] = null
    var fromOffsets: Map[TopicPartition, Long] = Map()

    if (children > 0) {
      /**  get partition leader begin */
      val topicList = List(topic)
      /** Fetch metadata for the topic: which brokers hold which partitions */
      val req = new TopicMetadataRequest(topicList, 0)


      /** Low-level (SimpleConsumer) API: look up the leader broker of each partition */
      val getLeaderConsumer = new SimpleConsumer("master", 9092, 10000, 10000, "OffsetLookup")
      val res = getLeaderConsumer.send(req) // the response carries topic / broker / partition metadata
      val topicMetaOption = res.topicsMetadata.headOption
      getLeaderConsumer.close()

      /** Map each partition id to the host of its leader broker */
      val partitions = topicMetaOption match {
        case Some(tm) =>
          tm.partitionsMetadata.map(pm => (pm.partitionId, pm.leader.get.host)).toMap[Int, String] // partition id -> leader host
        case None =>
          Map[Int, String]()
      }
      /**  get partition leader end */

      /** Work out the starting offset for each partition */
      for (i <- 0 until children) {
        val zkPartitionOffset = zkClient.readData[String](s"${topicDirs.consumerOffsetDir}/${i}")
        val tp = new TopicPartition(topic, i)
        val tap = new TopicAndPartition(topic, i)

        /** additional begin */
        val requestMin = OffsetRequest(Map(tap -> PartitionOffsetRequestInfo(OffsetRequest.EarliestTime, 1))) // EarliestTime = -2, request a single offset
        val getMinOffsetConsumer = new SimpleConsumer(partitions(i), 9092, 10000, 10000, "getMinOffset")
        val kafkaCurMinOffset = getMinOffsetConsumer.getOffsetsBefore(requestMin).partitionErrorAndOffsets(tap).offsets
        getMinOffsetConsumer.close()
        var nextOffset = zkPartitionOffset.toLong
        if (kafkaCurMinOffset.length > 0 && nextOffset < kafkaCurMinOffset.head) {
          /** The offset saved in ZooKeeper is older than the earliest offset Kafka still retains
            * (those messages have already been deleted), so fall back to the earliest available offset */
          nextOffset = kafkaCurMinOffset.head
        }
        /** additional end */
        fromOffsets += (tp -> nextOffset)
      }

      /** Saved offsets exist: assign the partitions explicitly and start from those offsets */
      kafkaStream = KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets)
      )
    } else {
      /** No offsets saved yet (first run): subscribe normally and rely on auto.offset.reset */
      kafkaStream = KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Subscribe[String, String](topics, kafkaParams)
      )
    }

    kafkaStream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      rdd.foreachPartition(
        message => {
          while (message.hasNext) {
            val msg = message.next()
            println("message: " + msg)
            sendData(msg)
          }
        })

      /** Only after the batch has been processed, save each partition's end offset (untilOffset)
        * to ZooKeeper, so a restart resumes after the last fully processed batch instead of
        * re-consuming it */
      for (offset <- offsetRanges) {
        val zkPath = s"${topicDirs.consumerOffsetDir}/${offset.partition}"
        zkUtils.updatePersistentPath(zkPath, offset.untilOffset.toString)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }

  def sendData(value: ConsumerRecord[String, String]) = {
    val events = value.value()
    val js = new JSONObject(events)

    val channel = js.getString("channel")
    var ip = js.getString("ip")
    val user_id = js.getString("user_id")
    val user_id_md5 = js.getString("user_id_md5")
    val plan_code = js.getString("plan_code")
    val oper_type = js.getString("oper_type")
    val category = js.getString("category")
    val time = js.getString("time")
    val mid = js.getString("mid")

    // Validate the ip address
    ip = ValidateUtil.ipCheck(ip)

    // Resolve the ip address to a state / territory; fall back to "-" when it cannot be resolved
    val ipObject = IpUtil.ipToBean(IpUtil.ipToLong(ip))
    var stat: String = null
    var territory: String = null
    if (ipObject == null) {
      stat = "-"
      territory = "-"
    } else {
      stat = ipObject.getState
      territory = ipObject.getTerritory
    }

    val topic1 =  // name of the Kafka topic the enriched events are written to
    val brokers = // Kafka broker list
    val props = new Properties()
    props.put("metadata.broker.list", brokers)
    props.put("serializer.class", "kafka.serializer.StringEncoder")
    val kafkaConfig = new ProducerConfig(props)
    val producer = new Producer[String, String](kafkaConfig)

    // Build the enriched event and send it downstream once per consumed record
    val event = new JSONObject()
    event.put("user_id", user_id)
      .put("channel", channel)
      .put("ip", ip)
      .put("user_id_md5", user_id_md5)
      .put("plan_code", plan_code)
      .put("oper_type", oper_type)
      .put("category", category)
      .put("time", time)
      .put("stat", stat)
      .put("territory", territory)
      .put("mid", mid)
    producer.send(new KeyedMessage[String, String](topic1, event.toString))
    println("Message sent: " + event)
    producer.close()
  }
}
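If the offsets do not need to live in ZooKeeper, the Kafka 0.10 integration can also commit them back to Kafka itself once a batch has been processed. A minimal sketch of that variant, reusing the ssc, topics and kafkaParams defined above (enable.auto.commit still false); CanCommitOffsets comes from the same org.apache.spark.streaming.kafka010 package already imported:

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // process the batch first ...
      rdd.foreach(record => println(record.value()))
      // ... then commit the consumed ranges back to Kafka, stored under the consumer group.id
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }

This keeps the offset bookkeeping inside Spark and Kafka, at the cost of no longer being able to inspect or edit the offsets in ZooKeeper.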
