1. Wrap the KafkaProducer in a lazy val
package cn.com.m.util

import java.util.concurrent.Future

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}

class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {

  /* This is the key idea that allows us to work around running into
     NotSerializableExceptions. */
  lazy val producer = createProducer()

  def send(topic: String, key: K, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, key, value))

  def send(topic: String, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, value))
}

object KafkaSink {

  import scala.collection.JavaConversions._

  def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
    val createProducerFunc = () => {
      val producer = new KafkaProducer[K, V](config)
      sys.addShutdownHook {
        // Ensure that, on executor JVM shutdown, the Kafka producer sends
        // any buffered messages to Kafka before shutting down.
        producer.close()
      }
      producer
    }
    new KafkaSink(createProducerFunc)
  }

  def apply[K, V](config: java.util.Properties): KafkaSink[K, V] = apply(config.toMap)
}
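For reference, a minimal driver-side sketch of using the wrapper directly (the broker address, topic, and messages here are placeholders, not part of the project): only the () => KafkaProducer[K, V] closure is captured by KafkaSink, and the real producer is built lazily on the first send().

// Sketch only: broker address and topic are placeholders.
val sink = KafkaSink[String, String](Map[String, Object](
  "bootstrap.servers" -> "localhost:9092",
  "key.serializer"    -> "org.apache.kafka.common.serialization.StringSerializer",
  "value.serializer"  -> "org.apache.kafka.common.serialization.StringSerializer"
))
sink.send("test", "hello")           // the producer is created here, on first use
sink.send("test", "key-1", "hello")  // keyed overload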
2. At usage time, broadcast the KafkaSink (and the KafkaProducer it lazily creates) to every executor as a broadcast variable
val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
  val kafkaProducerConfig = {
    val p = new Properties()
    p.setProperty("bootstrap.servers", conf("bootstrapServers"))
    p.setProperty("key.serializer", classOf[StringSerializer].getName)
    p.setProperty("value.serializer", classOf[StringSerializer].getName)
    p
  }
  spark.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
}
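As an alternative to looking up the broadcast per record (as the full example in step 3 does), the handle can also be resolved once per partition. A sketch, assuming the kafkaProducer broadcast above and a DataFrame df shaped like the one in step 3; the topic name "test" is a placeholder:

// Per-partition variant (sketch): resolve the broadcast once per partition;
// the lazy producer is still created at most once per executor JVM.
df.rdd.foreachPartition { rows =>
  val sink = kafkaProducer.value
  rows.foreach(row => sink.send("test", row.getString(row.fieldIndex("words"))))
}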
3. Complete example application:
package cn.com.m.work

import java.util.Properties

import cn.com.m.util.{DataUtils, KafkaSink}
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
import org.json4s.jackson.Serialization
import org.json4s.{Formats, NoTypeHints}

object SendKafkaDemo {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SendKafkaDemo") /*.setMaster("local[*]")*/
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sparkConf.registerKryoClasses(Array(classOf[com.qianxinyao.analysis.jieba.keyword.Keyword]))

    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    val sc = spark.sparkContext
    val conf = DataUtils.getConfig(spark)

    // Broadcast the KafkaSink wrapper (not the producer itself) to every executor.
    val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
      val kafkaProducerConfig = {
        val p = new Properties()
        p.setProperty("bootstrap.servers", conf("bootstrapServers"))
        p.setProperty("key.serializer", classOf[StringSerializer].getName)
        p.setProperty("value.serializer", classOf[StringSerializer].getName)
        p
      }
      spark.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
    }

    val action_topic = "test"
    val csvPath = "/tmp/tmp/mysqlInfo/part-00020-3e214cd7-b222-42fe-be3f-46c77fe1f916-c000.csv"
    val df = spark.read.format("csv").csv(csvPath).toDF("userid", "title", "content", "words")

    // Runs on the executors: each record is serialized to JSON and sent to Kafka.
    df.foreach(row => {
      val userid = row.getString(row.fieldIndex("userid"))
      val title = row.getString(row.fieldIndex("title"))
      val words = row.getString(row.fieldIndex("words"))
      val columnMap = Map(
        "userid" -> userid,
        "title" -> title,
        "words" -> words
      )
      implicit val formats: Formats = Serialization.formats(NoTypeHints)
      val formatStr = Serialization.write(columnMap)
      println(s"pushed to $action_topic: " + formatStr)
      kafkaProducer.value.send(action_topic, formatStr)
    })

    sc.stop()
    spark.stop()
  }
}
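Because send() returns the producer's Future[RecordMetadata], delivery can also be confirmed per record while debugging by blocking on that future. A sketch that would replace the send call inside the foreach above (blocking like this serializes sends, so it is for troubleshooting only):

// Debug-only sketch: .get() blocks until the broker acknowledges the record.
val metadata = kafkaProducer.value.send(action_topic, formatStr).get()
println(s"written to ${metadata.topic()}-${metadata.partition()} @ offset ${metadata.offset()}")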