GitHub项目LearningSpark代码讲解(九)

package streaming

import java.text.SimpleDateFormat
import java.util.Date

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Demonstrates `DStream.window(windowDuration, slideDuration)`.
  *
  * Recall from reading the DStream source that `slideDuration` is the period
  * at which a DStream produces RDDs. `window` takes two parameters
  * (windowDuration, slideDuration); this example shows which one determines
  * how often a windowed RDD is emitted, and what data each RDD contains.
  */
object Windowing {
  def main (args: Array[String]) {
    val conf = new SparkConf().setAppName("Windowing").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // Batch interval: the underlying stream produces one RDD per second.
    val ssc = new StreamingContext(sc, Seconds(1))
    val qm = new QueueMaker(sc, ssc)

    // create the stream
    val stream = qm.inputStream

    // A five-second window sliding every two seconds.
    /**
      * From the official docs: returns a new DStream in which each RDD
      * contains all the elements seen in a sliding window of time over this
      * DStream.
      *
      * Important: both arguments to
      * window(windowDuration: Duration, slideDuration: Duration)
      * must be integer multiples of the batch interval given to
      * new StreamingContext(sc, Seconds(1)).
      *
      * `window` has two overloads:
      * windowDuration: width of the window
      * slideDuration:  sliding interval (how often the windowed RDD is emitted)
      *
      * def window(windowDuration: Duration): DStream[T] =
      *   window(windowDuration, this.slideDuration)
      * def window(windowDuration: Duration, slideDuration: Duration): DStream[T] = ssc.withScope {
      *   new WindowedDStream(this, windowDuration, slideDuration)
      * }
      *
      * When the second argument is omitted, slideDuration defaults to the
      * parent stream's slideDuration, i.e. the Seconds(1) batch interval.
      *
      * When both are given, each windowed RDD is computed as:
      *
      * override def compute(validTime: Time): Option[RDD[T]] = {
      *   val currentWindow = new Interval(validTime - windowDuration + parent.slideDuration, validTime)
      *   val rddsInWindow = parent.slice(currentWindow)
      *   Some(ssc.sc.union(rddsInWindow))
      * }
      *
      * How currentWindow is computed:
      * - validTime is the timestamp at which the window closes
      * - windowDuration is the first argument to window(Seconds(5), Seconds(2)), i.e. 5 seconds
      * - parent.slideDuration comes from new StreamingContext(sc, Seconds(1)), i.e. 1 second
      * so the interval endpoints span 4 seconds, which covers 5 one-second batches
      * (the batch ending at validTime is included).
      * parent.slice(currentWindow) selects those parent RDDs, which are then unioned.
      *
      * Overall flow:
      * 1. new StreamingContext(sc, Seconds(1)) reads from QueueMaker once per second
      * 2. stream.window(Seconds(5), Seconds(2)) emits, every 2 seconds, the
      *    records of the last 5 seconds (5 because the current second counts).
      */
    stream.window(Seconds(5), Seconds(2)).foreachRDD(r => {
      println(new SimpleDateFormat("yyyyMMddHHmmss").format(new Date().getTime))
      // count() launches a Spark job; compute it once instead of twice.
      val n = r.count()
      if (n == 0)
        println("Empty")
      else
        println("Count = " + n + " min = " + r.min()+ " max = " + r.max())
    })

    // start streaming
    ssc.start()

    // Feed the queue, let the job run for 20 seconds, then shut down.
    new Thread("Delayed Termination") {
      override def run() {
        qm.populateQueue()
        Thread.sleep(20000)
        println("*** stopping streaming")
        ssc.stop()
      }
    }.start()

    try {
      ssc.awaitTermination()
      println("*** streaming terminated")
    } catch {
      case e: Exception => {
        println("*** streaming exception caught in monitor thread")
        // Don't swallow the failure silently — show what went wrong.
        e.printStackTrace()
      }
    }
  }
}

你可能感兴趣的:(Spark)