Kafka Direct Approach: Storing Offsets in HBase

In an earlier post I introduced the two ways Kafka integrates with Spark Streaming and mentioned that our team uses the Direct approach. This time I'll share how to store the consumed offsets in HBase.
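The code assumes an HBase table named spark_kafka_offsets with a single column family offsets already exists. As a minimal sketch, it could be created with the HBase 1.x admin API like this (the table and family names simply match the ones used later in this post):

import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}

object CreateOffsetsTable {
  def main(args: Array[String]): Unit = {
    val conn = ConnectionFactory.createConnection(HBaseConfiguration.create())
    val admin = conn.getAdmin
    val desc = new HTableDescriptor(TableName.valueOf("spark_kafka_offsets"))
    desc.addFamily(new HColumnDescriptor("offsets")) // one column per Kafka partition
    if (!admin.tableExists(desc.getTableName)) admin.createTable(desc)
    admin.close()
    conn.close()
  }
}

Equivalently, from the HBase shell: create 'spark_kafka_offsets', 'offsets'.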
The full code is as follows:

package kafka1
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.ZkUtils
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Scan}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaHbaseManager {
  // Save the offsets of the current batch to HBase
  def saveOffsets(TOPIC_NAME: String, GROUP_ID: String, offsetRanges: Array[OffsetRange],
                  hbaseTableName: String, batchTime: org.apache.spark.streaming.Time) = {
    val hbaseConf = HBaseConfiguration.create()
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(hbaseTableName))
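    // Row key layout: <topic>:<group>:<batchTimeMillis>, so each batch writes its own row
    // and row keys for the same topic/group sort by batch time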
    val rowKey = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(batchTime.milliseconds)
    val put = new Put(rowKey.getBytes())
    for (offset <- offsetRanges) {
      put.addColumn(Bytes.toBytes("offsets"), Bytes.toBytes(offset.partition.toString),
        Bytes.toBytes(offset.untilOffset.toString))
    }
    table.put(put)
    table.close()
    conn.close()
  }

  // Get the number of partitions of a topic from ZooKeeper
  def getNumberOfPartitionsForTopicFromZK(TOPIC_NAME: String, GROUP_ID: String,
                                          zkQuorum: String, zkRootDir: String, sessTimeout: Int, connTimeOut: Int): Int = {
    val zkUrl = zkQuorum + "/" + zkRootDir
    val zkClientAndConn = ZkUtils.createZkClientAndConnection(zkUrl, sessTimeout, connTimeOut)
    val zkUtils = new ZkUtils(zkClientAndConn._1, zkClientAndConn._2, false)
    // Number of partitions of the topic
    val zkPartitions = zkUtils.getPartitionsForTopics(Seq(TOPIC_NAME)).get(TOPIC_NAME).toList.head.size
    println("Partitions in ZK: " + zkPartitions)
    zkClientAndConn._1.close()
    zkClientAndConn._2.close()
    zkPartitions
  }

  // Read the last committed offsets from HBase
  def getLatestOffsets(TOPIC_NAME: String, GROUP_ID: String, hTableName: String,
                       zkQuorum: String, zkRootDir: String, sessTimeout: Int, connTimeOut: Int): Map[TopicAndPartition, Long] = {

    // Connect to ZK to get the topic's partition count
    val zKNumberOfPartitions = getNumberOfPartitionsForTopicFromZK(TOPIC_NAME, GROUP_ID, zkQuorum, zkRootDir, sessTimeout, connTimeOut)

    val hbaseConf = HBaseConfiguration.create()

    // Fetch the most recently committed offsets from HBase
    val conn = ConnectionFactory.createConnection(hbaseConf)
    val table = conn.getTable(TableName.valueOf(hTableName))
    val startRow = TOPIC_NAME + ":" + GROUP_ID + ":" + String.valueOf(System.currentTimeMillis())
    val stopRow = TOPIC_NAME + ":" + GROUP_ID + ":" + 0
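    // Reversed scan from the newest possible key (current time) back to <topic>:<group>:0,
    // so the first result is the row written by the most recent batch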
    val scan = new Scan()
    val scanner = table.getScanner(scan.setStartRow(startRow.getBytes).setStopRow(stopRow.getBytes).setReversed(true))
    val result = scanner.next()

    var hbaseNumberOfPartitions = 0 // number of partitions recorded in HBase
    if (result != null) {
      // the number of cells in the newest row equals the partition count at the last save
      hbaseNumberOfPartitions = result.listCells().size()
    }

    val fromOffsets = collection.mutable.Map[TopicAndPartition, Long]()
    if (hbaseNumberOfPartitions == 0) { // no offsets have been saved yet, i.e. the first run
      // initialize every partition to start from offset 0
      for (partition <- 0 until zKNumberOfPartitions) {
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), 0))
      }

    } else if (zKNumberOfPartitions > hbaseNumberOfPartitions) { // ZK reports more partitions than HBase: the topic gained partitions, so the new ones need separate handling
      // restore the offsets of the partitions that existed at the last save
      for (partition <- 0 until hbaseNumberOfPartitions) {
        val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),
          Bytes.toBytes(partition.toString)))
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), fromOffset.toLong))
      }
      // newly added partitions start from offset 0
      for (partition <- hbaseNumberOfPartitions until zKNumberOfPartitions) {
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), 0))
      }
    } else { // no new partitions and not the first run
      // restore the offsets committed by the last run
      for (partition <- 0 until hbaseNumberOfPartitions) {
        val fromOffset = Bytes.toString(result.getValue(Bytes.toBytes("offsets"),
          Bytes.toBytes(partition.toString)))
        fromOffsets += ((TopicAndPartition(TOPIC_NAME, partition), fromOffset.toLong))
      }
    }

    scanner.close()
    conn.close()
    fromOffsets.toMap
  }

  def main(args: Array[String]): Unit = {
    val processingInterval = 2
    val brokers = "192.168.85.200:9092,192.168.85.201:9092,192.168.85.202:9092"
    val topics = "test01"
    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("kafkahbase").setMaster("local[2]")
    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers,
      "auto.offset.reset" -> "smallest")

    val ssc = new StreamingContext(sparkConf, Seconds(processingInterval))
    val groupId = "gp01"
    val hbaseTableName = "spark_kafka_offsets"

    // Build the direct Kafka stream, resuming from the offsets recovered from HBase
    val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
    // Recover the last committed offsets (every partition falls back to 0 on the first run)
    val fromOffsets = getLatestOffsets("test01", groupId, hbaseTableName, "192.168.85.200:2181,192.168.85.201:2181,192.168.85.202:2181", "kafka0.9", 30000, 30000)

    val kafkaStream: InputDStream[(String, String)] =
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)

    kafkaStream.foreachRDD((rdd, btime) => {
      if (!rdd.isEmpty()) {
        println("==========================:" + rdd.count())
        println("==========================btime:" + btime)
        saveOffsets(topics, groupId, rdd.asInstanceOf[HasOffsetRanges].offsetRanges, hbaseTableName, btime)
      }

    })

    ssc.start()
    ssc.awaitTermination()
  }
}
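
Once the job has processed a few batches, each batch leaves one row in the offsets table. With the row-key and column scheme above, a row would look roughly like this (the batch time and offset values here are made up purely for illustration):

row key                      offsets:0   offsets:1   offsets:2
test01:gp01:1538102400000    128         131         127

On a restart, the reversed scan in getLatestOffsets returns the newest such row first, so the job resumes each partition from its last untilOffset instead of reprocessing or skipping data.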

Storing offsets in HBase takes a fair amount of code; in upcoming posts I'll introduce two simpler approaches.

									 summed up by JiaMingcan
									 Please credit JiaMingcan when reposting.
