Spark Stream对接Kafka的两种版本选择

对接kafka 0.8以及0.8以上的版本

  • Spark版本在2.3.0以下时选择这种方式较好:这些Spark版本对接Kafka时,0.8版本的集成包比较稳定,而1.0(0-10)版本的集成包当时还处于实验阶段
    导入依赖
        
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        

不保存offset(默认从最新的数据开始消费)

package com.spark.streaming.day01.kafka


import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Author 
  * Date 2020/1/13 16:40
  */
object WordCount1 {
    /**
      * Runs a streaming word count over the Kafka topic "s0830" using the
      * Kafka 0.8 direct-stream API with 3-second batches.
      *
      * This program does not store any offsets itself.
      *
      * @param args command-line arguments (not used)
      */
    def main(args: Array[String]): Unit = {
        // Build the streaming context: local mode with two threads, 3-second batches.
        val ssc: StreamingContext = new StreamingContext(
            new SparkConf().setAppName("a").setMaster("local[2]"),
            Seconds(3))

        // Kafka consumer settings: consumer group and broker list.
        val params: Map[String, String] = Map[String, String](
            "group.id" -> "8888",
            "bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092")
        val topics = Set("s0830")

        // Direct stream from Kafka; the type parameters are the key/value types
        // and their decoders, so each element is a (key, value) pair.
        val sourceStream: InputDStream[(String, String)] =
            KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, topics)

        // Keep only the message value, split it on runs of non-word characters
        // and count the occurrences of every token within the batch.
        val wordCounts = sourceStream
            .map(kv => kv._2)
            .flatMap(line => line.split("\\W+"))
            .map(word => (word, 1))
            .reduceByKey((left, right) => left + right)

        // Print up to 1000 result entries per batch.
        wordCounts.print(1000)

        ssc.start()
        ssc.awaitTermination()
    }
}

使用checkpoint保存offset(第一次从最新的数据开始消费,之后从checkpoint中保存的offset继续消费)

package com.spark.streaming.day02.kafka

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WordCount {
  /**
    * Builds a fresh StreamingContext with the whole word-count pipeline attached.
    *
    * It is handed to `StreamingContext.getActiveOrCreate` in `main` as the
    * function that creates a new context.
    *
    * @return the newly configured (not yet started) streaming context
    */
  def creatSSC() = {
    val ssc = new StreamingContext(
      new SparkConf().setMaster("local[2]").setAppName("wd"),
      Seconds(3))
    // Enable checkpointing; this is the extra step compared with the
    // variant that does not store offsets.
    ssc.checkpoint("./ck1")

    val params: Map[String, String] = Map[String, String](
      "group.id" -> "0830",
      "bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092")

    // Read (key, value) pairs from Kafka, split every value on single spaces
    // and count each word within the batch.
    val wordCounts = KafkaUtils
      .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, params, Set("s0830"))
      .flatMap(kv => kv._2.split(" ").map(word => (word, 1)))
      .reduceByKey((left, right) => left + right)
    wordCounts.print()

    ssc
  }

  /**
    * Obtains a StreamingContext for the checkpoint path "./ck1" (a new one is
    * built through `creatSSC` when the checkpoint directory does not exist),
    * starts it and blocks until termination.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    val ssc = StreamingContext.getActiveOrCreate("./ck1", () => creatSSC())
    ssc.start()
    ssc.awaitTermination()
  }
}

kafka保存offset(第一次从0的位置消费)

package com.spark.streaming.day02.kafka

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WordCount2 {
  // Kafka parameters shared by the direct stream and the KafkaCluster helper.
  val params: Map[String, String] = Map[String, String](
    "group.id" -> "0830",
    "bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092")
  val topics = Set("s0830")
  // Consumer group used both when reading and when committing offsets.
  val groupId = "0830"
  // Low-level helper used to look up partitions and to read/write consumer offsets.
  private val cluster = new KafkaCluster(params)

  /**
    * Determines the offset to start from for every partition of `topics`.
    *
    * - If the group already has committed offsets, those are returned.
    * - Otherwise (treated as "first run") the earliest offset still available
    *   on each partition leader is used. A literal 0 is only correct while the
    *   beginning of the log has not been removed by retention; after that,
    *   starting at 0 fails with an offset-out-of-range error.
    * - If the partitions cannot be looked up at all, the error is logged and
    *   an empty map is returned (same result as before, but no longer silent).
    *
    * @return starting offset per topic partition
    */
  def readOffsets(): Map[TopicAndPartition, Long] = {
    cluster.getPartitions(topics) match {
      case Right(topicAndPartitionSet) =>
        cluster.getConsumerOffsets(groupId, topicAndPartitionSet) match {
          // Not the first run: resume from the committed offsets.
          case Right(committed) => committed
          // No committed offsets could be read: treat as first run.
          case Left(_) => earliestOffsets(topicAndPartitionSet)
        }
      case Left(errors) =>
        System.err.println(
          s"Could not look up partitions for topics $topics: ${errors.mkString("; ")}")
        Map.empty[TopicAndPartition, Long]
    }
  }

  /** Earliest available offset per partition; falls back to 0 if the lookup fails. */
  private def earliestOffsets(partitions: Set[TopicAndPartition]): Map[TopicAndPartition, Long] = {
    cluster.getEarliestLeaderOffsets(partitions) match {
      case Right(leaderOffsets) =>
        leaderOffsets.map { case (partition, leaderOffset) => partition -> leaderOffset.offset }
      case Left(errors) =>
        System.err.println(
          s"Could not fetch earliest offsets, falling back to 0: ${errors.mkString("; ")}")
        partitions.map(partition => partition -> 0L).toMap
    }
  }

  /**
    * Registers an output operation that, for every batch, commits the end
    * offset of each partition for `groupId` through the KafkaCluster helper.
    *
    * A failed commit is logged instead of being silently dropped; the job
    * keeps running and the next batch commits again.
    *
    * @param sourceStream the stream returned by `KafkaUtils.createDirectStream`;
    *                     its RDDs are cast to `HasOffsetRanges`
    */
  def saveOffsets(sourceStream: InputDStream[String]): Unit = {
    sourceStream.foreachRDD { rdd =>
      // The RDDs of the direct stream expose the offset range they cover.
      val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // untilOffset is the end of this batch's range, i.e. where the next read starts.
      val offsets: Map[TopicAndPartition, Long] = offsetRanges
        .map(range => range.topicAndPartition() -> range.untilOffset)
        .toMap
      cluster.setConsumerOffsets(groupId, offsets) match {
        case Left(errors) =>
          System.err.println(
            s"Failed to commit offsets for group $groupId: ${errors.mkString("; ")}")
        case Right(_) => // committed
      }
    }
  }

  /**
    * Word count over Kafka with manually maintained offsets: start from the
    * offsets returned by `readOffsets`, process each batch, then commit.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the application name is literally "local[2]", which looks
    // like a copy of the master URL; left unchanged here.
    val conf: SparkConf = new SparkConf().setAppName("local[2]").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(3))
    ssc.checkpoint("./ck2")

    // Direct stream starting at explicit offsets; the handler keeps only the message value.
    val sourceStream: InputDStream[String] =
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
        ssc,
        params,
        // Starting offsets, of type Map[TopicAndPartition, Long].
        readOffsets(),
        (handler: MessageAndMetadata[String, String]) => handler.message()
      )

    sourceStream
      .flatMap(_.split("\\W+"))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print(10000)

    // Commit offsets after the processing step has been registered. This gives
    // at-least-once semantics only: if the job dies after processing a batch
    // but before the commit, that batch is read again on restart. Exactly-once
    // would require making processing and commit transactional.
    saveOffsets(sourceStream)
    ssc.start()
    ssc.awaitTermination()
  }
}

对接kafka 1.0以及以上的版本

  • Spark版本在2.3.0及以上时选择这种方式较好:这些Spark版本对接Kafka时,1.0(0-10)版本的集成包已经稳定,而0.8版本的集成包已经过时
  • 导入依赖
        
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.4.0</version>
        </dependency>
        

自动维护偏移量

package com.atguigu.realtime.util

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

/**
  * Author 8888
  * Date 2020/1/15 16:36
  */
object MyKafkaUtil {
    // Consumer configuration used for every stream created by this utility.
    // Offsets are committed automatically by the Kafka consumer
    // ("enable.auto.commit" is true), and "auto.offset.reset" is "latest".
    val kafkaParams = Map[String, Object](
        "bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092",
        "key.deserializer" -> classOf[StringDeserializer],
        "value.deserializer" -> classOf[StringDeserializer],
        "group.id" -> "bigdata",
        "auto.offset.reset" -> "latest",
        "enable.auto.commit" -> (true: java.lang.Boolean)
    )

    /**
      * Creates a direct Kafka stream that yields only the record values.
      *
      * @param ssc         streaming context the stream is attached to
      * @param topic       topic to subscribe to
      * @param otherTopics optional additional topics to subscribe to
      * @return a stream of the values of the consumed records
      */
    def getKafkaStream(ssc: StreamingContext, topic: String, otherTopics: String*): DStream[String] = {
        // Subscription order is the additional topics followed by the main topic.
        val allTopics = otherTopics :+ topic
        val records = KafkaUtils.createDirectStream[String, String](
            ssc,
            PreferConsistent,
            Subscribe[String, String](allTopics, kafkaParams))
        // Each element is a ConsumerRecord; keep just its value.
        records.map(_.value())
    }
}

手动Mysql维护偏移量

package com.qzpoint.streaming.aa

import java.lang
import java.sql.ResultSet

import com.atguigu.qzpoint.util.{DataSourceUtil, QueryCallback, SqlProxy}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object RegisterStreaming {
  private val groupid = "register_group_test"

  /**
    * Consumes "register_topic" through the Kafka 0-10 direct API and keeps the
    * consumer offsets in the MySQL table `offset_manager`.
    *
    * Flow: load any stored offsets for this group, create the stream (resuming
    * from those offsets when present), and after every batch write the end
    * offset of each partition back to MySQL.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "atguigu")

    val conf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      // Maximum number of records read per second from each Kafka partition.
      .set("spark.streaming.kafka.maxRatePerPartition", "100")
      .setMaster("local[*]")
    // Entry point of Spark Streaming; the second argument is the batch interval.
    // A SparkContext is created internally.
    val ssc = new StreamingContext(conf, Seconds(3))
    // The SparkContext behind the streaming context (not used further in this block).
    val sparkContext: SparkContext = ssc.sparkContext

    // Topics to consume (may be several); needed when creating the direct stream.
    val topics = Array("register_topic")
    // Kafka consumer parameters; needed when creating the direct stream.
    val kafkaMap: Map[String, Object] = Map[String, Object](
      // Broker addresses.
      "bootstrap.servers" -> "hadoop102:9092,hadoop103:9092,hadoop104:9092",
      // Deserializers for key and value.
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupid,
      // On first consumption start from the beginning of the topic.
      "auto.offset.reset" -> "earliest",
      // With true the consumer commits offsets automatically in the background;
      // with false the offsets have to be maintained manually, as done below.
      "enable.auto.commit" -> (false: lang.Boolean)
    )

    // Stateful operations need a checkpoint directory in which the state is saved.
    ssc.checkpoint("/user/atguigu/sparkstreaming/checkpoint")

    // Look up previously stored offsets for this consumer group in MySQL.
    val sqlProxy = new SqlProxy()
    // Offset per topic partition; handed to the direct stream when non-empty.
    val offsetMap = new mutable.HashMap[TopicPartition, Long]()
    // JDBC connection obtained from the connection pool.
    val client = DataSourceUtil.getConnection
    try {
      // The callback copies every returned row into offsetMap: column 2 and 3
      // form the TopicPartition (topic, partition), column 4 is the offset.
      sqlProxy.executeQuery(client, "select * from `offset_manager` where groupid=?", Array(groupid), new QueryCallback {
        override def process(rs: ResultSet): Unit = {
          while (rs.next()) {
            val topicPartition = new TopicPartition(rs.getString(2), rs.getInt(3))
            val storedOffset = rs.getLong(4)
            offsetMap.put(topicPartition, storedOffset)
          }
          rs.close() // release the result set
        }
      })
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      // Release the statement resources and the connection.
      sqlProxy.shutdown(client)
    }

    // Choose the subscription: resume from the stored offsets when there are
    // any, otherwise subscribe without explicit offsets (start from the beginning).
    val strategy: ConsumerStrategy[String, String] =
      if (offsetMap.nonEmpty) {
        ConsumerStrategies.Subscribe[String, String](topics, kafkaMap, offsetMap)
      } else {
        ConsumerStrategies.Subscribe[String, String](topics, kafkaMap)
      }
    val stream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, strategy)

    // Registered after the business logic, so it runs after it for each batch:
    // store the offsets of the batch in MySQL. The RDDs of the Kafka stream
    // carry the offset ranges that were read.
    stream.foreachRDD { rdd =>
      val batchProxy = new SqlProxy()
      val batchClient = DataSourceUtil.getConnection
      try {
        // The RDD of a direct stream exposes the offset range of each partition.
        val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        // Upsert group, topic, partition and end offset for every range.
        offsetRanges.foreach { range =>
          batchProxy.executeUpdate(batchClient, "replace into `offset_manager` (groupid,topic,`partition`,untilOffset) values(?,?,?,?)",
            Array(groupid, range.topic, range.partition.toString, range.untilOffset))
        }
      } catch {
        case e: Exception => e.printStackTrace()
      } finally {
        // Release the statement resources and the connection.
        batchProxy.shutdown(batchClient)
      }
    }

    // Start the job and block until it terminates.
    ssc.start()
    ssc.awaitTermination()
  }

}

两者区别

  • 0.8 版本的直连模式,返回的是 InputDStream[(String, String)],kafka中的K,V。
  • 1.0版本的直连模式,返回的是 InputDStream[ConsumerRecord[String, String]] ,将K,V包装为ConsumerRecord,需要map(record => record.value())得到value的值。

没有克服不了的困难,只有畏惧的心。 生活之所以耀眼,是因为磨难与辉煌会同时出现。所以,别畏惧暂时的困顿,即使无人鼓掌,也要全情投入,优雅坚持。请相信:不管多险峻的高山,总会给勇敢的人留一条攀登的路。只要你肯迈步,路就会在你脚下延伸。

你可能感兴趣的:(Spark Stream对接Kafka的两种版本选择)