Kafka+SparkStreaming+MongoDB

快放假了,不想说废话

主要操作类

package com.action

import com.conf.{ConfigManager, ConstantsInterface}
import com.until.LocalKafkaUntils
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * @Author: king
  * @Datetime: 2018/12/27
  * @Desc: Spark Streaming job that consumes string records from Kafka, parses each
  *        record value as JSON against a fixed all-string schema and appends the
  *        resulting rows to MongoDB.
  *
  */
object Kafka2SparkStreaming2MongoDB {

  /**
    * Builds the Spark session and streaming context, wires the Kafka direct stream to
    * the MongoDB sink and then blocks until the streaming context terminates.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    // Spark SQL entry point; it is also used to turn each micro-batch into a DataFrame.
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Kafka2SparkStream2Mongo")
      .getOrCreate()

    // Streaming context sharing the session's SparkContext, with 1-second micro-batches.
    val ssc = new StreamingContext(spark.sparkContext, Seconds(1))

    // Kafka consumer settings.
    // NOTE(review): KAFKA_BOOTSTRAP_LIST is passed as the broker list itself, whereas the
    // topics constant is resolved through ConfigManager.getProperty elsewhere — confirm
    // this constant holds the actual "host:port" list and not a property key.
    val kafkaParams = LocalKafkaUntils
      .getKafkaParams(ConstantsInterface.KAFKA_BOOTSTRAP_LIST, "saprk_to_mongo")

    // Direct stream of Kafka records; the subscribed topics are resolved inside getSteam.
    val stream: InputDStream[ConsumerRecord[String, String]] =
      LocalKafkaUntils.getSteam(ssc, kafkaParams)

    // Schema applied to the incoming JSON documents: three nullable string columns.
    val schemaString = "field1 field2 field3"
    val fields = schemaString
      .split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)

    // MongoDB sink settings. The output URI is the configured URI immediately followed by
    // the database name (presumably "mongodb.uri" ends with "/" — verify in the config).
    val url = ConfigManager.getProperty("mongodb.uri")
    val dbName = ConfigManager.getProperty("mongodb.dbname")
    val mongoWriteOptions: Map[String, String] = Map[String, String](
      "spark.mongodb.output.uri" -> url.concat(dbName),
      "spark.mongodb.output.replaceDocument" -> "false"
    )

    // Parse every non-empty micro-batch as JSON and append it to MongoDB.
    stream.foreachRDD { records =>
      // Skip empty batches so idle seconds do not trigger a JSON read and a MongoDB write.
      // The check is done on the un-mapped Kafka RDD, which is presumably answered from
      // the batch's offset ranges without launching a job — confirm for the Spark version.
      if (!records.isEmpty()) {
        val mongoDF = spark.read.schema(schema).json(records.map(_.value()))
        mongoDF.write
          .format("com.mongodb.spark.sql.DefaultSource")
          .mode("append")
          .options(mongoWriteOptions)
          .save()
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }

}

sparkstreaming搭载kafka

package com.until


import com.conf.{ConfigManager, ConstantsInterface}
import kafka.message.MessageAndMetadata
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, SparkContext, SparkException, TaskContext}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import kafka.serializer.Decoder

import scala.reflect.ClassTag


/**
  * @Author: king
  * @Datetime: 2018/10/11
  * @Desc: Helpers for building Kafka consumer settings, resolving the configured topics
  *        and creating a direct Kafka input stream for Spark Streaming.
  *
  */
object LocalKafkaUntils {

  /**
    * Builds the Kafka consumer configuration used by the direct stream.
    *
    * Keys and values are deserialized as strings, consumption starts from the latest
    * offset when the group has no committed offset, and auto-commit is disabled.
    *
    * @param brokers value for "bootstrap.servers"
    * @param groupId value for "group.id"
    * @return kafkaParams the consumer configuration map
    */
  def getKafkaParams(brokers: String, groupId: String): Map[String, Object] =
    // "group.id" must appear only once: a second, hard-coded entry would replace the
    // caller-supplied groupId because later pairs win when a Map is built.
    Map[String, Object](
      "bootstrap.servers" -> brokers,
      "group.id" -> groupId,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

  /**
    * Reads the comma-separated topic list from the configuration.
    *
    * Entries are trimmed and blank entries are dropped, so values such as "a, b" or
    * "a,b," do not yield topic names containing spaces or an empty topic name.
    *
    * @return the set of configured topic names
    */
  def getKafkaTopics(): Set[String] =
    ConfigManager
      .getProperty(ConstantsInterface.KAFKA_TOPICS)
      .split(",")
      .map(_.trim)
      .filter(_.nonEmpty)
      .toSet

  /**
    * Creates the direct Kafka stream subscribed to the configured topics.
    *
    * @param ssc         streaming context the stream is attached to
    * @param kafkaParams Kafka consumer configuration, e.g. from [[getKafkaParams]]
    * @return stream of Kafka consumer records with string keys and values
    */
  def getSteam(ssc: StreamingContext, kafkaParams: Map[String, Object])
    : InputDStream[ConsumerRecord[String, String]] = {
    val topicSet = getKafkaTopics()
    KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topicSet, kafkaParams))
  }

  /**
    * Registers an output operation that prints, for every partition of every batch,
    * the topic, partition, starting offset and ending offset that the partition covers.
    *
    * @param stream the direct Kafka stream whose offset ranges are printed
    */
  def getOffsets(stream: InputDStream[ConsumerRecord[String, String]]): Unit = {
    stream.foreachRDD { rdd =>
      // Offset ranges are read from the RDD handed to foreachRDD, before any transformation.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreachPartition { _ =>
        // The offset range for this task is looked up by the task's partition id.
        val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
      }
    }
  }
}

你可能感兴趣的:(spark)