Using Spark to Save Kafka Data Back into Kafka (Option 1)

Dependencies

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.9.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-auth</artifactId>
            <version>2.9.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.4.3</version>
        </dependency>

Code

package com.baizhi

import com.baizhi.demo08.KafkaSink
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}

object WordCountKafka {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount").setMaster("local[6]")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    ssc.sparkContext.setLogLevel("FATAL")
    ssc.checkpoint("file:///D:/checkp")
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "spark:9092",
      ConsumerConfig.GROUP_ID_CONFIG -> "g1",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer])

    val kafkaSinkBroadcast=ssc.sparkContext.broadcast(new KafkaSink("topic02","spark:9092"))

    // Read the data from Kafka directly; the topic offsets are maintained independently and checkpointed
    val messages = KafkaUtils.createDirectStream[String, String](ssc,
      LocationStrategies.PreferConsistent, // location strategy: use PreferConsistent when the Spark executors and the Kafka brokers do not run on the same physical hosts
      ConsumerStrategies.Subscribe[String, String](List("topic01"), kafkaParams))

    messages.map(record=>record.value)
      .flatMap(line=>line.split(" "))
      .map(word => (word, 1))
      .mapWithState(StateSpec.function((k: String, v: Option[Int], stage: State[Int]) => {
        var total: Int = 0
        if (stage.exists()) {
          total = stage.getOption().getOrElse(0)
        }
        total += v.getOrElse(0)
        stage.update(total) // persist the running count for this word
        (k, total)          // emit the word together with its updated count
      }))
      // write the running word counts back to Kafka (topic02) through the broadcast KafkaSink
      .foreachRDD(rdd => rdd.foreachPartition(iter => kafkaSinkBroadcast.value.save(iter)))

    ssc.start()
    ssc.awaitTermination()
  }
}
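
To check that the counts actually land in topic02, a small standalone consumer can be pointed at the output topic. The following is a minimal sketch and not part of the original code (the object name Topic02Peek and the g2 group id are made up here); it assumes the same broker address spark:9092 used above:

package com.baizhi.demo08

import java.time.Duration
import java.util.{Collections, Properties}

import org.apache.kafka.clients.consumer.{ConsumerConfig, KafkaConsumer}
import org.apache.kafka.common.serialization.StringDeserializer

import scala.collection.JavaConverters._

// Illustrative only: poll topic02 and print each word/count pair produced by the streaming job
object Topic02Peek {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "spark:9092")
    props.put(ConsumerConfig.GROUP_ID_CONFIG, "g2")
    props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")
    props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, classOf[StringDeserializer].getName)
    props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, classOf[StringDeserializer].getName)

    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Collections.singletonList("topic02"))

    while (true) {
      val records = consumer.poll(Duration.ofSeconds(1))
      records.asScala.foreach(r => println(s"${r.key()} -> ${r.value()}"))
    }
  }
}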

package com.baizhi.demo08

import java.util.{Properties, UUID}

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer

class KafkaSink(topic:String,severs:String) extends Serializable {

  def createKafkaConnection(): KafkaProducer[String, String] = {
    val props = new Properties()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,severs)
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,classOf[StringSerializer].getName)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,classOf[StringSerializer].getName)
    props.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG,"true")// enable idempotent writes
    props.put(ProducerConfig.RETRIES_CONFIG,"2")// number of retries
    props.put(ProducerConfig.BATCH_SIZE_CONFIG,"100")// batch size in bytes
    props.put(ProducerConfig.LINGER_MS_CONFIG,"1000")// wait at most 1000 ms before sending a batch
    new KafkaProducer[String,String](props)
  }

  lazy val kafkaProducer:KafkaProducer[String,String]= createKafkaConnection()
  Runtime.getRuntime.addShutdownHook(new Thread(){
    override def run(): Unit = {
      kafkaProducer.close()
    }
  })
  def save(vs: Iterator[(String, Int)]): Unit = {

    try{
      vs.foreach(tuple=>{
        val record = new ProducerRecord[String,String](topic,tuple._1,tuple._2.toString)
        kafkaProducer.send(record)
      })

    }catch {
      case e:Exception=> println("Send an alert email: something went wrong")
    }

  }
}
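
The KafkaSink class can also be exercised on its own, outside of Spark, to confirm the producer settings work against the broker. A minimal sketch, assuming the broker at spark:9092 and an existing topic02 (the object name KafkaSinkSmokeTest is made up here):

package com.baizhi.demo08

// Illustrative only: push a couple of hand-made word counts through KafkaSink
object KafkaSinkSmokeTest {
  def main(args: Array[String]): Unit = {
    val sink = new KafkaSink("topic02", "spark:9092")
    // save() takes the same Iterator[(String, Int)] shape that the streaming job hands over per partition
    sink.save(Iterator(("hello", 3), ("spark", 1)))
    // the shutdown hook registered inside KafkaSink closes (and flushes) the producer on JVM exit
  }
}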

