Updating Broadcast Variables in Spark Streaming and Spark Structured Streaming

1) Updating a broadcast variable in Structured Streaming

Structured Streaming exposes StreamingQueryListener, whose onQueryProgress callback runs on the driver after every micro-batch, which makes it a convenient hook for periodically rebuilding a broadcast. The listener below refreshes the rules broadcast whenever the current minute is a multiple of 5:

import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.sql.streaming.StreamingQueryListener

val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val broadcastWrapper = new BroadcastWrapper // wrapper class defined below

val enSpark = enSparkSession.session() // custom helper that returns a SparkSession
enSpark.streams.addListener(new StreamingQueryListener {
  override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {}

  override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
    // Called on the driver after each micro-batch: substring(14, 16) of
    // "yyyy-MM-dd HH:mm:ss" is the minute, so this refreshes every 5 minutes.
    val mins = sdf.format(new Date()).substring(14, 16).toInt
    if (mins % 5 == 0 && broadcastWrapper.rulebroadcast != null) {
      broadcastWrapper.update(enSpark.sparkContext, blocking = true)
    }
  }

  override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {}
})
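
The post does not show how the query consumes the broadcast. A minimal sketch of one way to wire it up, assuming Spark 2.4+ and a streaming DataFrame named inputDf (the name and the per-partition logic are placeholders, not part of the original):

import org.apache.spark.sql.{DataFrame, Row}

// foreachBatch executes on the driver once per micro-batch, so each batch
// picks up whatever broadcast instance the listener has installed by then.
val query = inputDf.writeStream
  .foreachBatch { (batchDf: DataFrame, batchId: Long) =>
    val rules = broadcastWrapper.getInstance(enSpark.sparkContext)
    batchDf.foreachPartition((rows: Iterator[Row]) => {
      val ruleSet = rules.value // executor-side read of the current snapshot
      val matched = rows.count(_ => ruleSet.nonEmpty) // stand-in for real rule matching
      println(s"partition matched $matched rows")
    })
    println(s"batch $batchId processed with ${rules.value.size} rules")
  }
  .start()
query.awaitTermination()

Re-reading the broadcast handle inside foreachBatch matters: a closure that captured it once at query start would keep the stale instance for the lifetime of the query.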
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast

import scala.collection.mutable.ArrayBuffer

/**
  * Wrapper around the rules broadcast variable.
  */
class BroadcastWrapper extends Serializable {
  // @volatile so the double-checked locking in getInstance is safe.
  @volatile var rulebroadcast: Broadcast[ArrayBuffer[(Int, String, String, String, String, String, String, String)]] = _

  /**
    * Rebuild the broadcast from the rules source.
    * @param sc       the SparkContext
    * @param blocking whether unpersist blocks until all blocks are removed (default false)
    */
  def update(sc: SparkContext, blocking: Boolean = false): Broadcast[ArrayBuffer[(Int, String, String, String, String, String, String, String)]] = {
    synchronized {
      if (rulebroadcast != null) {
        rulebroadcast.unpersist(blocking) // drop the stale copy from the executors
        rulebroadcast = sc.broadcast(new JdbcUtil().getRuleBroadcast)
      }
      rulebroadcast
    }
  }

  /**
    * Lazily initialize the broadcast on first use (double-checked locking).
    * @param sc the SparkContext
    * @return the current broadcast instance
    */
  def getInstance(sc: SparkContext): Broadcast[ArrayBuffer[(Int, String, String, String, String, String, String, String)]] = {
    if (rulebroadcast == null) {
      synchronized {
        if (rulebroadcast == null) {
          rulebroadcast = sc.broadcast(new JdbcUtil().getRuleBroadcast)
        }
      }
    }
    rulebroadcast
  }
}
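
JdbcUtil is referenced above but never shown. A minimal sketch of what getRuleBroadcast might look like; the JDBC URL, credentials, and the eight-column rule_table schema are assumptions for illustration only:

import java.sql.DriverManager

import scala.collection.mutable.ArrayBuffer

// Hypothetical implementation of the JdbcUtil used by BroadcastWrapper.
class JdbcUtil extends Serializable {
  def getRuleBroadcast: ArrayBuffer[(Int, String, String, String, String, String, String, String)] = {
    val rules = new ArrayBuffer[(Int, String, String, String, String, String, String, String)]()
    val conn = DriverManager.getConnection("jdbc:mysql://host:3306/db", "user", "pwd")
    try {
      val rs = conn.createStatement().executeQuery(
        "SELECT id, c1, c2, c3, c4, c5, c6, c7 FROM rule_table")
      while (rs.next()) {
        rules += ((rs.getInt(1), rs.getString(2), rs.getString(3), rs.getString(4),
          rs.getString(5), rs.getString(6), rs.getString(7), rs.getString(8)))
      }
    } finally {
      conn.close() // always release the connection, even if the query fails
    }
    rules
  }
}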

2) Updating a broadcast variable in Spark Streaming

With DStreams the body of transform() is evaluated on the driver at every batch interval, so the broadcast can be checked and rebuilt there before the per-record logic ships to the executors:

import java.text.SimpleDateFormat
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

  def sparkStreaming(): Unit = {
    // Local StreamingContext with a 15-second batch interval.
    // The master needs at least 2 cores so the receiver does not starve the jobs.
    val conf = new SparkConf().setMaster("local[*]").setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(15))

    // DStream connected to hostname:port (ipAddr is assumed defined elsewhere).
    val lines = ssc.socketTextStream(ipAddr, 19999)
    val mro = lines.map(row => {
      val fields = row.split(",")
      Mro(fields(0), fields(1))
    })

    val cellJoinMro = mro.transform(rdd => {
      // Placeholder guard from the original post: always true, so the broadcast
      // is rebuilt every batch. Replace it with a real trigger (a time-based
      // sketch follows the BroadcastWrapper object below).
      if (1 < 3) {
        println("Refreshing broadcast... " + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new java.util.Date()))
        BroadcastWrapper.update(ssc.sparkContext)
      }
      val broadcastCellRes = BroadcastWrapper.getInstance(ssc.sparkContext)
      rdd.map(row => {
        val int_id: String = row.int_id
        val rsrp: String = row.rsrp
        // Join the record with the timestamps broadcast for its cell id.
        val timeStamps: String = String.join(",", broadcastCellRes.value(int_id))
        CellJoinMro(int_id, rsrp, timeStamps)
      })
    })

    cellJoinMro.print()

    ssc.start()            // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }
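
The snippet references two case classes the post never defines; their shape is inferred from the field accesses above:

// Inferred from usage: Mro carries the two parsed input fields,
// CellJoinMro adds the joined timestamp string.
case class Mro(int_id: String, rsrp: String)
case class CellJoinMro(int_id: String, rsrp: String, timeStamps: String)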
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast

object BroadcastWrapper {
  // @volatile so the double-checked locking in getInstance is safe.
  @volatile private var instance: Broadcast[Map[String, java.util.List[String]]] = null
  private val baseDir = "/user/my/streaming/test/"

  def loadData(): Map[String, java.util.List[String]] = {
    val files = HdfsUtil.getFiles(baseDir)

    // Pick the lexicographically largest file name, i.e. the newest file
    // when names sort by date.
    var latest: String = null
    for (key <- files.keySet) {
      if (latest == null || latest.compareTo(key) <= 0) latest = key
    }

    val filePath = baseDir + latest
    HdfsUtil.getFileContent(filePath)
  }

  def update(sc: SparkContext, blocking: Boolean = false): Unit = {
    if (instance != null)
      instance.unpersist(blocking) // drop the stale blocks from the executors
    instance = sc.broadcast(loadData())
  }

  def getInstance(sc: SparkContext): Broadcast[Map[String, java.util.List[String]]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = sc.broadcast(loadData())
        }
      }
    }
    instance
  }

}
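
The 1 < 3 guard in the transform above fires every batch. A time-based variant (an assumption, not in the original post) could keep the last refresh time on the driver, since transform() runs there:

object BroadcastRefresher {
  // Driver-side state; transform() bodies execute on the driver, so a var
  // guarded by this object's synchronized method is sufficient here.
  private var lastRefreshMs = 0L
  private val intervalMs = 5 * 60 * 1000L // refresh at most every 5 minutes

  def refreshIfStale(sc: org.apache.spark.SparkContext): Unit = synchronized {
    val now = System.currentTimeMillis()
    if (now - lastRefreshMs >= intervalMs) {
      BroadcastWrapper.update(sc)
      lastRefreshMs = now
    }
  }
}

The whole if block in the transform would then shrink to a call to BroadcastRefresher.refreshIfStale(ssc.sparkContext).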

import java.io.{BufferedReader, InputStreamReader}
import java.text.SimpleDateFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.fs.FileSystem
import scala.collection.mutable

object HdfsUtil {
  // '0' and ':' are not pattern letters, so the time part of the pattern is a
  // literal: every formatted value is pinned to midnight of its day.
  private val sdf = new SimpleDateFormat("yyyy-MM-dd 00:00:00")

  def getFiles(path: String): mutable.Map[String, String] = {
    val fileItems = new mutable.LinkedHashMap[String, String]
    val fs = FileSystem.get(new Configuration())
    val files = fs.listStatus(new Path(path))
    for (file <- files) {
      if (file.isFile) {
        // getPath.getName is already the bare file name, so it serves as the key.
        val name = file.getPath.getName
        fileItems.put(name, name)
      }
    }

    // Caution: FileSystem.get returns a JVM-wide cached instance; closing it
    // affects any other code in this process that shares the same instance.
    fs.close()

    fileItems
  }

  def getFileContent(filePath: String): Map[String, java.util.List[String]] = {
    val map = new mutable.LinkedHashMap[String, java.util.List[String]]

    val fs = FileSystem.get(new Configuration())
    val path = new Path(filePath)
    if (fs.exists(path)) {
      val bufferedReader = new BufferedReader(new InputStreamReader(fs.open(path)))
      var line: String = bufferedReader.readLine()
      while (line != null) {
        // Each line is comma-separated; field 0 is the cell id and field 2 an
        // epoch timestamp in milliseconds. Collect the day strings per cell id.
        val fields: Array[String] = line.split(",")
        val int_id: String = fields(0)
        val date = new java.util.Date(java.lang.Long.valueOf(fields(2)))
        val time = sdf.format(date)
        System.out.println(line + " (" + time + ")")

        if (!map.contains(int_id))
          map.put(int_id, new java.util.ArrayList[String])
        map(int_id).add(time)

        line = bufferedReader.readLine()
      }
      bufferedReader.close()

      map.toMap
    } else {
      throw new RuntimeException("the file does not exist: " + filePath)
    }
  }
}
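
For reference, a hypothetical input file for getFileContent; the cell ids and values are made up, but the layout matches the parsing above (field 0 = cell id, field 2 = epoch millis):

CELL_001,-95,1546300800000
CELL_001,-90,1546387200000
CELL_002,-87,1546300800000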

References:

https://blog.csdn.net/weixin_34255793/article/details/86026293

https://www.cnblogs.com/yy3b2007com/p/10610845.html
