6. Spark Streaming: Consuming Data from Kafka and Writing It Back

Requirements:
1. Flume collects the socket stream data into Kafka (a minimal agent config sketch follows this list).
2. Spark Streaming reads the data from Kafka and cleans it.
3. The cleaned data is written back into Kafka.
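
For step 1, a minimal Flume agent sketch is shown below. The agent, source, channel, bind host, and port are assumptions for illustration; the sink properties follow the Flume 1.6-era KafkaSink, which pairs with the Kafka 0.8 client API used in the Spark code further down.

a1.sources = r1
a1.channels = c1
a1.sinks = k1

# Netcat source reading the socket stream (bind host/port assumed)
a1.sources.r1.type = netcat
a1.sources.r1.bind = node01
a1.sources.r1.port = 44444
a1.sources.r1.channels = c1

a1.channels.c1.type = memory

# Kafka sink writing into the topic that Spark Streaming consumes
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = mytest1
a1.sinks.k1.brokerList = node01:9092,node02:9092,node03:9092
a1.sinks.k1.channel = c1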

[Figure 1: pipeline overview — Flume collects the socket stream into Kafka, Spark Streaming cleans it and writes the result back to Kafka]

The cleaned records have the following pipe-delimited format:
houseid|gathertime|srcip:srcport|destip:destport|url
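
The listing below delegates the cleaning itself to ParseUtils.parseMsg, which the original post does not include. Judging from the output format above and the .filter(_.length != 1) guard in the code, a minimal sketch could look like the following; the raw field layout is an assumption for illustration.

object ParseUtils {
  // Clean one raw record into "houseid|gathertime|srcip:srcport|destip:destport|url".
  // Assumed raw layout: houseid|gathertime|srcip|srcport|destip|destport|url|...
  // Returns the single character "0" for malformed input, so callers can drop
  // bad records with .filter(_.length != 1).
  def parseMsg(msg: String): String = {
    val fields = msg.split("\\|")
    if (fields.length >= 7) {
      Array(fields(0), fields(1),
        fields(2) + ":" + fields(3),
        fields(4) + ":" + fields(5),
        fields(6)).mkString("|")
    } else {
      "0"
    }
  }
}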

Spark Streaming code that consumes from Kafka, cleans the records, and writes them back to Kafka:

import java.util
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.kafka.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.JavaConversions._

/**
  * Consume records from Kafka, clean them, and write the results back to Kafka,
  * batch by batch. Two earlier attempts and a broadcast-producer variant are
  * kept below as comments; the active code creates one producer per RDD partition.
  */
object Kafka2KafkaPerBatch {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)

    val processingInterval = 2
    val brokers = "node01:9092,node02:9092,node03:9092"
    val topic = "mytest1"
    val topicsSet = topic.split(",").toSet
    val groupName = "group02"
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "smallest")

    val ssc = new StreamingContext(sc, Seconds(processingInterval))

    val streaming = MyKafkaUtils.createMyDirectKafkaStream(
      ssc, kafkaParams, topicsSet, groupName)

    val sinkTopic = "mykafka"

    // Attempt 1: create a single producer on the driver and use it inside foreach.
    // Fails, because KafkaProducer is not serializable and cannot be shipped to executors:
    // Caused by: java.io.NotSerializableException: org.apache.kafka.clients.producer.KafkaProducer
    //    streaming.foreachRDD(rdd=>{
    //      if(!rdd.isEmpty()){
    //        val props = new util.HashMap[String, Object]()
    //        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    //        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
    //          "org.apache.kafka.common.serialization.StringSerializer")
    //        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
    //          "org.apache.kafka.common.serialization.StringSerializer")
    //        val producer = new KafkaProducer[String,String](props)
    //
    //        rdd.map(x=>x._2).map(msg=>ParseUtils.parseMsg(msg)).foreach(msg=>{
    //
    //          val message=new ProducerRecord[String, String]( sinkTopic ,null,msg)
    //          producer.send(message)
    //        })
    //      }
    //    })


    // Attempt 2: the data does reach Kafka, but performance is poor because
    // a new producer is created for every single record.
    // streaming.foreachRDD(rdd=>{
    //      if(!rdd.isEmpty()){
    //        rdd.map(x=>x._2).map(msg=>ParseUtils.parseMsg(msg)).filter(_.length!=1).foreach(msg=>{
    //
    //          val props = new util.HashMap[String, Object]()
    //          props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    //          props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
    //            "org.apache.kafka.common.serialization.StringSerializer")
    //          props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
    //            "org.apache.kafka.common.serialization.StringSerializer")
    //          val producer = new KafkaProducer[String,String](props)
    //          val message=new ProducerRecord[String, String]( sinkTopic ,null,msg)
    //          producer.send(message)
    //        })
    //      }
    //    })

    // Alternative (commented out): broadcast a lazily initialized KafkaProducer
    // wrapper to all executor nodes, so every executor writes to Kafka through
    // one shared producer (see the MyKafkaProducer sketch after the listing).
//    val kafkaProducer: Broadcast[MyKafkaProducer[String, String]] = {
//      val kafkaProducerConfig = {
//        val p = new Properties()
//        p.setProperty("bootstrap.servers", brokers)
//        p.setProperty("key.serializer", classOf[StringSerializer].getName)
//        p.setProperty("value.serializer", classOf[StringSerializer].getName)
//        p
//      }
//      ssc.sparkContext.broadcast(MyKafkaProducer[String, String](kafkaProducerConfig))
//    }
//
//    streaming.foreachRDD(rdd => {
//      if (!rdd.isEmpty()) {
//        rdd.map(x => x._2).map(msg => ParseUtils.parseMsg(msg)).filter(_.length != 1).foreach(msg => {
//          kafkaProducer.value.send(sinkTopic, msg)
//        })
//        MyKafkaUtils.saveOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, groupName)
//      }
//    })

    // Recommended: one KafkaProducer per RDD partition. With the direct
    // stream, each RDD partition corresponds to one Kafka topic partition.
    streaming.foreachRDD(rdd => {
      // Read the offset ranges on the driver, before foreachPartition;
      // the RDD itself must not be referenced inside the partition closure.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      rdd.foreachPartition(partition => {
        val props = new util.HashMap[String, Object]()
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
          classOf[StringSerializer].getName)
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
          classOf[StringSerializer].getName)
        // One producer per partition per batch instead of one per record.
        val producer = new KafkaProducer[String, String](props)

        partition.map(msg => ParseUtils.parseMsg(msg._2)).filter(_.length != 1).foreach(msg => {
          producer.send(new ProducerRecord[String, String](sinkTopic, null, msg))
        })

        // Flush buffered records and release connections.
        producer.close()
      })

      // Save offsets once per batch on the driver, after the writes succeed.
      MyKafkaUtils.saveOffsets(offsetRanges, groupName)
    })

    ssc.start()
    ssc.awaitTermination()
  }

}
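
The listing relies on two helpers that the original post does not include: MyKafkaProducer, the serializable wrapper used by the commented-out broadcast variant, and MyKafkaUtils, which creates the direct stream and saves offsets per consumer group. The sketches below show one plausible shape for each, assuming the Kafka 0.8 direct-stream API used above; the in-memory offset map is a stand-in for whatever ZooKeeper or database persistence the real helper uses.

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

// Serializable wrapper around the non-serializable KafkaProducer. Only the
// factory function is shipped with the broadcast; the producer itself is
// built lazily, on first send(), on each executor.
class MyKafkaProducer[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
  lazy val producer = createProducer()

  def send(topic: String, value: V): Unit =
    producer.send(new ProducerRecord[K, V](topic, value))
}

object MyKafkaProducer {
  def apply[K, V](config: Properties): MyKafkaProducer[K, V] =
    new MyKafkaProducer(() => {
      val producer = new KafkaProducer[K, V](config)
      // Flush pending records when the executor JVM shuts down.
      sys.addShutdownHook(producer.close())
      producer
    })
}

And a sketch of the offset-aware stream helper:

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}

object MyKafkaUtils {
  // Offsets keyed by topic-partition; a real implementation would key by
  // groupName as well and persist to ZooKeeper or a database.
  private val offsetStore = scala.collection.mutable.Map[TopicAndPartition, Long]()

  def createMyDirectKafkaStream(ssc: StreamingContext,
                                kafkaParams: Map[String, String],
                                topics: Set[String],
                                groupName: String): InputDStream[(String, String)] = {
    if (offsetStore.isEmpty) {
      // First run: no saved offsets, start from auto.offset.reset.
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
        ssc, kafkaParams, topics)
    } else {
      // Restart: resume from the saved offsets.
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
        ssc, kafkaParams, offsetStore.toMap, messageHandler)
    }
  }

  def saveOffsets(offsetRanges: Array[OffsetRange], groupName: String): Unit =
    offsetRanges.foreach { or =>
      offsetStore(TopicAndPartition(or.topic, or.partition)) = or.untilOffset
    }
}

The lazy createProducer function is what makes the wrapper broadcast-safe: only the small serializable factory is shipped to the executors, and the heavyweight KafkaProducer is instantiated once per executor on first use.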
