Spark batch writes to MongoDB and parameter configuration (gracefully stopping a Spark Streaming job)

Spark Streaming parameter configuration

Set spark.streaming.receiver.writeAheadLog.enable to true so that received data is written to a write-ahead log and cannot be lost.
Then persist the stream (cached with StorageLevel.MEMORY_AND_DISK, i.e. memory first, spilling to disk).
conf.set("spark.streaming.stopGracefullyOnShutdown", "true") // shut down gracefully
conf.set("spark.streaming.receiver.writeAheadLog.enable", "true") // prevent data loss
conf.set("spark.streaming.backpressure.enabled", "true") // enable backpressure to smooth out ingest spikes
conf.set("spark.streaming.backpressure.initialRate", firstCount.toString) // maximum rate for the first batch
conf.set("spark.streaming.kafka.maxRatePerPartition", threadCount.toString) // maximum records read from Kafka per partition per second
conf.set("spark.mongodb.input.uri", Property.getProperty("bigScreenInUri1"))
conf.set("spark.mongodb.output.uri", Property.getProperty("bigScreenOutUri1"))
conf.set("spark.streaming.kafka.consumer.poll.ms", "10000") // Kafka consumer poll timeout (ms)
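
These settings only take effect once they are wired into a StreamingContext and a Kafka direct stream. The following is a minimal sketch of that wiring, not the original job: the batch interval, broker list, group id and topic name are placeholders.

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

// conf is the SparkConf configured above; the 10-second batch interval is a placeholder
val ssc = new StreamingContext(conf, Seconds(10))

val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> "broker1:9092,broker2:9092", // placeholder brokers
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "group.id" -> "big-screen-group",                   // placeholder group id
  "auto.offset.reset" -> "latest",
  "enable.auto.commit" -> (false: java.lang.Boolean)
)

val stream = KafkaUtils.createDirectStream[String, String](
  ssc, PreferConsistent, Subscribe[String, String](Seq("trade-topic"), kafkaParams)) // placeholder topic

// Cache the stream as described above so it is not re-read from Kafka
stream.persist(StorageLevel.MEMORY_AND_DISK)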

Batch writing to MongoDB ($inc adds the given amount to the value already stored)

import com.mongodb.client.MongoCollection
import com.mongodb.client.model.{UpdateOneModel, UpdateOptions}
import com.mongodb.spark.MongoConnector
import com.mongodb.spark.config.WriteConfig
import org.bson.BsonDocument
import scala.collection.JavaConverters._

val mongoConnector = MongoConnector(writeAllTrade.asOptions)
allTrades.foreachPartition(
  iter => if (iter.nonEmpty) {
    val writeConfig = WriteConfig(Map(
      "database" -> Property.getProperty("resultDatabase"),
      "collection" -> Property.getProperty("bigScreenAllTradeTable"),
      "writeConcern.w" -> "1",
      "spark.mongodb.output.uri" -> Property.getProperty("uri")))
    mongoConnector.withCollectionDo(writeConfig, { collection: MongoCollection[BsonDocument] =>
      iter.grouped(writeConfig.maxBatchSize).foreach(batch => {
        val updateOptions = new UpdateOptions().upsert(true)
        val requests = batch.map(doc => {
          // Filter on _id, then $inc the remaining fields so counters
          // accumulate across micro-batches; upsert(true) creates the
          // document the first time a key is seen.
          val queryDocument = new BsonDocument()
          queryDocument.append("_id", doc.get("_id"))
          doc.remove("_id")
          new UpdateOneModel[BsonDocument](queryDocument, new BsonDocument("$inc", doc), updateOptions)
        })
        collection.bulkWrite(requests.toList.asJava)
      })
    })
  }
)
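
For context, each element of allTrades above is a BsonDocument keyed by _id whose remaining fields are numeric counters; the exact field names are not shown in the original, so the ones below are illustrative only.

import org.bson.{BsonDocument, BsonInt64, BsonString}

// Illustrative accumulator document: every numeric field is added to the
// stored value by the $inc update above, so totals build up across batches.
val doc = new BsonDocument()
doc.append("_id", new BsonString("2019-11-13"))    // hypothetical key
doc.append("TRADE_COUNT", new BsonInt64(25L))      // hypothetical counter
doc.append("TRADE_AMOUNT", new BsonInt64(980000L)) // hypothetical counter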

StreamUtils.stopByMarkFile(server, ssc, hdfs_file_path) // Option 2: shut down gracefully by checking for a marker file on HDFS
/**
 * Periodically checks whether a marker file exists to decide when to stop the streaming job
 * @param ssc StreamingContext
 */

def stopByMarkFile(server: Server, ssc: StreamingContext, hdfs_file_path: String): Unit = {
  val intervalMills = 10 * 1000 // check every 10 seconds whether the marker file exists
  var isStop = false
  while (!isStop) {
    isStop = ssc.awaitTerminationOrTimeout(intervalMills)
    if (!isStop && isExistsMarkFile(hdfs_file_path)) {
      server.stop()
      log.warn("Shutting down the Spark Streaming job in 2 seconds.....")
      Thread.sleep(2000)
      ssc.stop(true, true) // also stop the SparkContext, and finish in-flight batches first
    }
  }
}
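
The isExistsMarkFile helper called above is not shown in the original post; a minimal sketch using the Hadoop FileSystem API could look like this (the name and signature come from the call above, the body is an assumption):

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Returns true once the HDFS marker file has been created,
// which signals the streaming job to shut down gracefully.
def isExistsMarkFile(hdfs_file_path: String): Boolean = {
  val fs = FileSystem.get(URI.create(hdfs_file_path), new Configuration())
  fs.exists(new Path(hdfs_file_path))
}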

Updating with $set (upsert: inserts the document if it does not exist, otherwise updates it)

mercTradesByDay.foreachPartition(
  iter => if (iter.nonEmpty) {
    val writeConfig = WriteConfig(Map(
      "database" -> Property.getProperty("resultDatabase"),
      "collection" -> Property.getProperty("bigScreenByMercTable"),
      "writeConcern.w" -> "1",
      "spark.mongodb.output.uri" -> Property.getProperty("uri")))
    mongoConnector.withCollectionDo(writeConfig, { collection: MongoCollection[BsonDocument] =>
      iter.grouped(writeConfig.maxBatchSize).foreach(batch => {
        val updateOptions = new UpdateOptions().upsert(true)
        val requests = batch.map(doc => {
          // Filter on _id and overwrite the document's fields with $set;
          // upsert(true) inserts the document when no match exists.
          val queryDocument = new BsonDocument()
          queryDocument.append("_id", doc.get("_id"))
          new UpdateOneModel[BsonDocument](queryDocument, new BsonDocument("$set", doc), updateOptions)
        })
        collection.bulkWrite(requests.toList.asJava)
      })
    })
  }
)
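
By contrast with the $inc block, fields written through $set are overwritten rather than accumulated, and upsert(true) inserts the document when the _id is not found. The document shape below is illustrative; only MERC_NAME appears in the original snippet.

import org.bson.{BsonDocument, BsonString}

// Illustrative merchant document: MERC_NAME is simply replaced on every upsert.
val mercDoc = new BsonDocument()
mercDoc.append("_id", new BsonString("MERC_0001"))        // hypothetical merchant key
mercDoc.append("MERC_NAME", new BsonString("Demo Store")) // overwritten, not incremented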



package com.eptok.scala.offline.testm

import com.mongodb.spark.MongoSpark
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.bson.Document

/**
 * @author: ljx
 * @date: 2019/11/13
 * @description:
 */
object Demo {
  def main(args: Array[String]): Unit = {
    // Getting started: operating MongoDB from Spark
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("ConnAppTest")
      // MongoDB input URI (host list abbreviated in the original)
      .config("spark.mongodb.input.uri", "mongodb://interface_manager:2wsxCDE#@10.213.:50000,10.213.32.85:50000/xzq_test.collection?authSource=admin")
      // MongoDB output URI (host list abbreviated in the original)
      .config("spark.mongodb.output.uri", "mongodb://interface_manager:2wsxCDE#@10.213.32.:50000,10.213.35:50000/xzq_test.collection?authSource=admin")
      .getOrCreate()

    // Generate test data
    val documents = spark.sparkContext.parallelize((8 to 10).map(i => Document.parse(s"{test: $i}")))
    // Save the documents to MongoDB
    MongoSpark.save(documents)

    // Load the collection back
    val rdd: DataFrame = MongoSpark.load(spark)
    // Print it
    rdd.show()
  }
}
    

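To compile the example, the MongoDB Spark connector has to be on the classpath. A possible sbt configuration is sketched below; the versions are illustrative and should be matched to your Spark and Scala versions.

// build.sbt (illustrative versions)
libraryDependencies ++= Seq(
  "org.apache.spark"  %% "spark-sql"             % "2.4.4" % "provided",
  "org.mongodb.spark" %% "mongo-spark-connector" % "2.4.1"
)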