Spark Streaming: Consuming Data from a Kafka Source

1. A custom helper method that returns a kafkaDStream

The helper below wraps KafkaUtils.createDirectStream (the receiver-less direct API from the spark-streaming-kafka 0.8 integration), so callers only need to supply a StreamingContext, the Kafka parameters, and a set of topics.

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils

object SparkUtils {
    // Create a direct (receiver-less) Kafka stream; keys and values arrive as Strings.
    def kafkaDStream(ssc: StreamingContext,
                     kafkaParams: Map[String, String],
                     topics: Set[String]): InputDStream[(String, String)] = {
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
            ssc, kafkaParams, topics
        )
    }
}
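With the helper in place, a call site only needs a StreamingContext and a parameter map. Below is a minimal sketch of how it might be used; the broker address, topic name, and group id are placeholders, and the spark-streaming-kafka 0.8 dependency is assumed to be on the classpath:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkUtilsDemo {
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setAppName("SparkUtilsDemo").setMaster("local[*]")
        val ssc = new StreamingContext(conf, Seconds(3))

        // Placeholder connection settings; adjust to your cluster.
        val kafkaParams: Map[String, String] = Map(
            "bootstrap.servers" -> "hadoop102:9092",
            "group.id" -> "bigdata"
        )

        // The helper hides the createDirectStream type parameters.
        SparkUtils.kafkaDStream(ssc, kafkaParams, Set("kafkaDStream"))
            .map(_._2)   // keep only the message value
            .print()

        ssc.start()
        ssc.awaitTermination()
    }
}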

2. Obtaining the kafkaDStream object

The driver program below builds the Kafka parameters, creates the direct stream, and prints each record's value in 3-second batches.

import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreaming04_Kafka {
    def main(args: Array[String]): Unit = {

        // Consume messages from a Kafka topic and print each record's value

        // TODO Create the streaming context
        val sparkConf: SparkConf = new SparkConf().setAppName("SparkStreaming04_Kafka").setMaster("local[*]")
        val ssc: StreamingContext = new StreamingContext(sparkConf, Seconds(3))

        // TODO Obtain the discretized stream (DStream)
        // by consuming the data from the Kafka source
        val brokers = "hadoop102:9092,hadoop103:9092,hadoop104:9092"
        val topic = "kafkaDStream"
        val group = "bigdata"
        val deserialization = "org.apache.kafka.common.serialization.StringDeserializer"
        // Consumer settings: group id, broker list, and key/value deserializers.
        val kafkaParams: Map[String, String] = Map(
            ConsumerConfig.GROUP_ID_CONFIG -> group,
            ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
            ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> deserialization,
            ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> deserialization
        )

        val kafkaDStream: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
            ssc, kafkaParams, Set(topic)
        )
        // Each record is a (key, value) tuple; keep the value and print it.
        kafkaDStream.map(_._2).foreachRDD(rdd => rdd.foreach(println))

        // TODO Start the streaming computation
        ssc.start()

        // TODO Keep the driver alive until the streaming computation terminates
        ssc.awaitTermination()
    }
}
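To go from printing raw values to the classic word count, the value stream can be tokenized and reduced within each batch. A minimal sketch that could replace the foreachRDD line above, assuming whitespace-separated words in the message values:

// Count words within each 3-second batch (a sketch; assumes space-delimited text).
kafkaDStream.map(_._2)            // keep only the message value
    .flatMap(_.split(" "))        // tokenize on spaces
    .map(word => (word, 1))       // pair each word with a count of 1
    .reduceByKey(_ + _)           // sum counts per word within the batch
    .print()                      // print the first results of each batch

For a quick test, messages can be pushed into the kafkaDStream topic with Kafka's console producer (kafka-console-producer.sh) pointed at the same brokers.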
