Spark Learning (13): Other Spark Streaming Data Sources

So far our Spark Streaming examples have all read data from a socket. Spark Streaming also supports several other input sources.
File streams:
Monitor a directory for new files: whenever a new file appears in the directory, it is read into the stream. Note that textFileStream only picks up files created after the stream starts, so to test it you should move or copy complete files into the monitored directory while the job is running.


import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkConf

object MyFileStreamingDemo {
  def main(args: Array[String]): Unit = {
    // Suppress noisy log output
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    // Create the StreamingContext with a 3-second batch interval
    val conf = new SparkConf().setAppName("StreamTest2").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(3))
    // Create a DStream by monitoring a directory for new files
    val dstream = ssc.textFileStream("G:/msdownld.tmp")
    dstream.print()
    // Start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}
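
Printing raw lines is just the simplest case; any DStream transformation can hang off the file stream. As a minimal sketch, assuming the input files contain space-separated text, the following word count would replace dstream.print() in the example above:

    // Word count over the file stream (assumes space-separated text)
    val counts = dstream.flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    counts.print()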

RDD queue streams:
A DStream can also be backed by a queue of RDDs, which is mainly useful for testing. By default, the stream consumes one RDD from the queue per batch interval.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.rdd.RDD
import scala.collection.mutable.Queue

object MyRDDQueueStreamDemo {
  def main(args: Array[String]): Unit = {
    // Suppress noisy log output
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    // Create the StreamingContext with a 3-second batch interval
    val conf = new SparkConf().setAppName("StreamTest2").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(3))
    // Define a queue of RDDs to feed the stream
    val queue = new Queue[RDD[Int]]()
    // The DStream must be created before the context starts
    val inputStream = ssc.queueStream(queue)
    inputStream.print()
    // Start the streaming computation
    ssc.start()
    // Produce one RDD per second and push it into the queue;
    // Spark Streaming picks them up one RDD per batch
    for (i <- 1 to 3) {
      queue.synchronized {
        queue += ssc.sparkContext.makeRDD(1 to 5)
      }
      Thread.sleep(1000)
    }
    ssc.awaitTermination()
  }
}
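
By default queueStream hands the stream one RDD from the queue per batch (oneAtATime = true). The API also has an overload taking an explicit oneAtATime flag; as a small sketch, passing false makes each batch consume all RDDs currently in the queue:

    // Consume every queued RDD in each batch instead of one at a time
    val allAtOnce = ssc.queueStream(queue, oneAtATime = false)
    allAtOnce.print()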
