Writing Data to Kafka from Spark

1. Wrap the KafkaProducer in a lazy val (a short sketch of the resulting lazy-initialization behaviour follows the code)

package cn.com.m.util

import java.util.concurrent.Future

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
  /* This is the key idea that allows us to work around running into
     NotSerializableExceptions. */
  lazy val producer = createProducer()
  def send(topic: String, key: K, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, key, value))
  def send(topic: String, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, value))
}

object KafkaSink {
  import scala.collection.JavaConversions._
  def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
    val createProducerFunc = () => {
      val producer = new KafkaProducer[K, V](config)
      sys.addShutdownHook {
        // Ensure that, on executor JVM shutdown, the Kafka producer sends
        // any buffered messages to Kafka before shutting down.
        producer.close()
      }
      producer
    }
    new KafkaSink(createProducerFunc)
  }
  def apply[K, V](config: java.util.Properties): KafkaSink[K, V] = apply(config.toMap)
}
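
For reference, here is a minimal sketch of the lazy-initialization behaviour this wrapper gives you (the broker address localhost:9092 and the topic name "test" are placeholders, not values from this article). Constructing a KafkaSink does not open any connection; the KafkaProducer is only instantiated on the first call to send, on whichever JVM makes that call, which is what lets the sink be shipped inside Spark closures without a NotSerializableException.

import org.apache.kafka.common.serialization.StringSerializer

val sink = KafkaSink[String, String](Map[String, Object](
  "bootstrap.servers" -> "localhost:9092",  // placeholder broker address
  "key.serializer"    -> classOf[StringSerializer].getName,
  "value.serializer"  -> classOf[StringSerializer].getName
))
// No producer exists yet; it is created lazily here, on the first send().
sink.send("test", "hello")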

2. To use it, broadcast the wrapped producer (the KafkaSink) to every executor as a broadcast variable (a brief usage sketch follows the snippet)

val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
  val kafkaProducerConfig = {
    val p = new Properties()
    p.setProperty("bootstrap.servers", conf("bootstrapServers"))
    p.setProperty("key.serializer", classOf[StringSerializer].getName)
    p.setProperty("value.serializer", classOf[StringSerializer].getName)
    p
  }
  spark.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
}
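
For reference, a minimal usage sketch (messages below is a hypothetical RDD[String], and "test" a placeholder topic). Every task reads kafkaProducer.value, which resolves to the same KafkaSink on a given executor, so the lazy KafkaProducer behind it is created once per executor JVM and reused across tasks:

messages.foreachPartition { partition =>
  partition.foreach(record => kafkaProducer.value.send("test", record))
}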

3. Complete application example:

package cn.com.m.work

import java.util.Properties

import cn.com.m.util.{DataUtils, KafkaSink}
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import org.json4s.jackson.Serialization
import org.json4s.{Formats, NoTypeHints}

object SendKafkaDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SendKafkaDemo") /*.setMaster("local[*]")*/
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.registerKryoClasses(Array(classOf[com.qianxinyao.analysis.jieba.keyword.Keyword]))
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    val sc = spark.sparkContext
    
    val conf = DataUtils.getConfig(spark)
    val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
      val kafkaProducerConfig = {
        val p = new Properties()
        p.setProperty("bootstrap.servers", conf("bootstrapServers"))
        p.setProperty("key.serializer", classOf[StringSerializer].getName)
        p.setProperty("value.serializer", classOf[StringSerializer].getName)
        p
      }
      spark.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
    }

    val action_topic = "test"
    val csvPath = "/tmp/tmp/mysqlInfo/part-00020-3e214cd7-b222-42fe-be3f-46c77fe1f916-c000.csv"
    val df = spark.read.format("csv").csv(csvPath).toDF("userid", "title", "content", "words")
    df.foreach(row => {
      val userid = row.getString(row.fieldIndex("userid"))
      val title = row.getString(row.fieldIndex("title"))
      val words = row.getString(row.fieldIndex("words"))
      val columnMap = Map(
        "userid" -> userid,
        "title" -> title,
        "words" -> words
      )
      implicit val formats: Formats = Serialization.formats(NoTypeHints)
      val formatStr = Serialization.write(columnMap)
      println(s"Pushed to topic $action_topic: $formatStr")
      kafkaProducer.value.send(action_topic, formatStr)
    })
    sc.stop()
    spark.stop()
  }

}
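
A possible refinement of the demo above (a sketch, not part of the original code): iterating per partition via the RDD API lets the json4s Formats instance be created once per partition instead of once per row, while the Kafka sending logic stays the same.

df.rdd.foreachPartition { rows =>
  // Build the json4s serialization formats once per partition rather than per row.
  implicit val formats: Formats = Serialization.formats(NoTypeHints)
  rows.foreach { row =>
    val columnMap = Map(
      "userid" -> row.getString(row.fieldIndex("userid")),
      "title"  -> row.getString(row.fieldIndex("title")),
      "words"  -> row.getString(row.fieldIndex("words"))
    )
    // The broadcast KafkaSink lazily creates a single KafkaProducer per executor JVM.
    kafkaProducer.value.send(action_topic, Serialization.write(columnMap))
  }
}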
