Spark Streaming parameter configuration
Set spark.streaming.receiver.writeAheadLog.enable to true to guard against losing received data (write-ahead log).
Then persist the received stream with StorageLevel.MEMORY_AND_DISK (cache in memory, spill to disk when it does not fit).
conf.set("spark.streaming.stopGracefullyOnShutdown", "true") // graceful shutdown
conf.set("spark.streaming.receiver.writeAheadLog.enable", "true") // prevent data loss (write-ahead log)
conf.set("spark.streaming.backpressure.enabled", "true") // enable backpressure to smooth load spikes
conf.set("spark.streaming.backpressure.initialRate", firstCount.toString) // max ingestion rate for the first batch
conf.set("spark.streaming.kafka.maxRatePerPartition", threadCount.toString) // max records read per Kafka partition per second
conf.set("spark.mongodb.input.uri", Property.getProperty("bigScreenInUri1"))
conf.set("spark.mongodb.output.uri", Property.getProperty("bigScreenOutUri1"))
conf.set("spark.streaming.kafka.consumer.poll.ms", "10000") // Kafka consumer poll timeout (ms)
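These options are set on the SparkConf before the StreamingContext is created. A minimal sketch of how they might be wired together; the 5-second batch interval, socket source, checkpoint path, and downstream count are illustrative assumptions, not taken from the original job:

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Assumed 5-second batch interval; conf is the SparkConf configured above.
val ssc = new StreamingContext(conf, Seconds(5))
ssc.checkpoint("hdfs:///tmp/streaming-checkpoint") // a checkpoint dir is required when the receiver WAL is enabled

// Hypothetical receiver-based source standing in for the real input.
val lines = ssc.socketTextStream("localhost", 9999)
// Cache the stream in memory and spill to disk when it does not fit,
// matching the StorageLevel.MEMORY_AND_DISK recommendation above.
lines.persist(StorageLevel.MEMORY_AND_DISK)

lines.count().print() // placeholder downstream processing
ssc.start()
ssc.awaitTermination()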
Batch writes to MongoDB ($inc adds the given amount on top of the value already stored)
val mongoConnector = MongoConnector(writeAllTrade.asOptions)
allTrades.foreachPartition(
  iter => if (iter.nonEmpty) {
    val writeConfig = WriteConfig(Map(
      "database" -> Property.getProperty("resultDatabase"),
      "collection" -> Property.getProperty("bigScreenAllTradeTable"),
      "writeConcern.w" -> "1",
      "spark.mongodb.output.uri" -> Property.getProperty("uri")))
    mongoConnector.withCollectionDo(writeConfig, { collection: MongoCollection[BsonDocument] =>
      iter.grouped(writeConfig.maxBatchSize).foreach(batch => {
        val updateOptions = new UpdateOptions().upsert(true)
        val requests = batch.map(doc => {
          // Query by _id, then $inc the remaining fields on top of the stored values.
          val queryDocument = new BsonDocument()
          queryDocument.append("_id", doc.get("_id"))
          doc.remove("_id")
          new UpdateOneModel[BsonDocument](queryDocument, new BsonDocument("$inc", doc), updateOptions)
        })
        collection.bulkWrite(requests.toList.asJava)
      })
    })
  }
)
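For context, a sketch of the kind of element allTrades is assumed to hold here; the key format and the field names (TRADE_COUNT, TRADE_AMT) are hypothetical:

import org.bson.{BsonDocument, BsonDouble, BsonInt64, BsonString}

// Hypothetical element of allTrades: _id identifies the aggregation bucket,
// the other numeric fields are this micro-batch's deltas.
val sample = new BsonDocument()
  .append("_id", new BsonString("2019-11-13"))
  .append("TRADE_COUNT", new BsonInt64(25L))
  .append("TRADE_AMT", new BsonDouble(1234.50))
// The $inc upsert above adds 25 and 1234.50 to the stored totals for
// "2019-11-13", or creates the document with these values if it does not exist.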
StreamUtils.stopByMarkFile(server, ssc, hdfs_file_path) // Option 2: graceful shutdown triggered by scanning for an HDFS marker file
/**
 * Periodically checks for a marker file and, when it appears, shuts down the streaming job.
 * @param server         server to stop before the StreamingContext
 * @param ssc            StreamingContext
 * @param hdfs_file_path HDFS path of the marker file
 */
def stopByMarkFile(server: Server, ssc: StreamingContext, hdfs_file_path: String): Unit = {
  val intervalMills = 10 * 1000 // check for the marker file every 10 seconds
  var isStop = false
  while (!isStop) {
    isStop = ssc.awaitTerminationOrTimeout(intervalMills)
    if (!isStop && isExistsMarkFile(hdfs_file_path)) {
      server.stop()
      log.warn("Shutting down the Spark Streaming application in 2 seconds...")
      Thread.sleep(2000)
      ssc.stop(stopSparkContext = true, stopGracefully = true)
    }
  }
}
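isExistsMarkFile is called above but not shown; a minimal sketch of one possible implementation using the Hadoop FileSystem API (only the name and usage come from the snippet above, the body is an assumption):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

def isExistsMarkFile(hdfsFilePath: String): Boolean = {
  val path = new Path(hdfsFilePath)
  val fs = path.getFileSystem(new Configuration())
  fs.exists(path) // the job is asked to stop once this marker file appears
}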
Updates with $set (upsert semantics: if no matching document exists, one is inserted automatically; otherwise the existing document is updated)
mercTradesByDay.foreachPartition(
  iter => if (iter.nonEmpty) {
    val writeConfig = WriteConfig(Map(
      "database" -> Property.getProperty("resultDatabase"),
      "collection" -> Property.getProperty("bigScreenByMercTable"),
      "writeConcern.w" -> "1",
      "spark.mongodb.output.uri" -> Property.getProperty("uri")))
    mongoConnector.withCollectionDo(writeConfig, { collection: MongoCollection[BsonDocument] =>
      iter.grouped(writeConfig.maxBatchSize).foreach(batch => {
        val updateOptions = new UpdateOptions().upsert(true)
        val requests = batch.map(doc => {
          // Query by _id and $set the remaining fields (e.g. MERC_NAME), replacing their stored values.
          val queryDocument = new BsonDocument()
          queryDocument.append("_id", doc.get("_id"))
          doc.remove("_id") // _id is immutable and must not appear in the $set document
          new UpdateOneModel[BsonDocument](queryDocument, new BsonDocument("$set", doc), updateOptions)
        })
        collection.bulkWrite(requests.toList.asJava)
      })
    })
  }
)
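If only the merchant name needs refreshing, the $set document can be narrowed to that single field instead of carrying every remaining field. A sketch of that variant, meant for the same grouped(...) loop; apart from MERC_NAME, the names are assumptions:

// Update only MERC_NAME instead of $set-ing the whole document.
val updateOptions = new UpdateOptions().upsert(true)
val requests = batch.map { doc =>
  val query = new BsonDocument("_id", doc.get("_id"))
  val update = new BsonDocument("$set",
    new BsonDocument("MERC_NAME", doc.get("MERC_NAME")))
  new UpdateOneModel[BsonDocument](query, update, updateOptions)
}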
package com.eptok.scala.offline.testm
import com.mongodb.spark.MongoSpark
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.bson.Document
/*
  @author: ljx
  @date: 2019/11/13
  @description: getting-started demo for reading and writing MongoDB with Spark
 */
object Demo {
  def main(args: Array[String]): Unit = {
    // Getting started with operating MongoDB from Spark
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("ConnAppTest")
      .config("spark.mongodb.input.uri", "mongodb://interface_manager:2wsxCDE#@10.213.:50000,10.213.32.85:50000/xzq_test.collection?authSource=admin") // MongoDB input
      .config("spark.mongodb.output.uri", "mongodb://interface_manager:2wsxCDE#@10.213.32.:50000,10.213.35:50000/xzq_test.collection?authSource=admin") // MongoDB output
      .getOrCreate()
    // Generate test data
    val documents = spark.sparkContext.parallelize((8 to 10).map(i => Document.parse(s"{test: $i}")))
    // Save the data to MongoDB
    MongoSpark.save(documents)
    // Load the data back
    val df: DataFrame = MongoSpark.load(spark)
    // Print the result
    df.show()
  }
}
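As an optional follow-up inside main (an assumption, not part of the original demo), the loaded DataFrame can be inspected like any other DataFrame:

// Inspect the schema and filter the loaded collection.
df.printSchema()
df.filter("test >= 9").show()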