Spark Streaming with a socket data source

1. Write a socket simulator program that listens on a port and pushes data
2. Write SocketWordCount
3. Stateful cumulative word counts
4. Window-based word counts

1. Write a socket simulator program that listens on a port and pushes data

import java.io.PrintWriter
import java.net.ServerSocket

import scala.io.Source

object DataFlowSimulator {

  // Returns a random integer in [0, length), used to pick a random line from the file
  def index(length:Int)={
    import java.util.Random
    val rdm = new Random();
    rdm.nextInt(length)
  }

  def main(args: Array[String]): Unit = {

    // The simulator takes three arguments: the file path, the port, and the send interval (in milliseconds)
    if(args.length != 3){
      System.err.println("Usage: DataFlowSimulator <filename> <port> <millisecond>")
      System.exit(-1)
    }

    // Read the file and get its total number of lines
    val filename = args(0)
    val lines = Source.fromFile(filename).getLines().toList
    val filerow = lines.length

    // Bind the given port and establish a connection whenever an external program connects
    val lister = new ServerSocket(args(1).toInt)
    while (true){
      val socket = lister.accept()
      new Thread(){
        override def run(): Unit ={
          println("Got client connection from:"+socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream,true)
          while(true){
            Thread.sleep(args(2).toLong)
            // Each interval, pick a random line from the file and send it to the client
            val content = lines(index(filerow))
            println(content)
            out.write(content+'\n')
            out.flush()
          }
          out.close()
          socket.close()
        }
      }.start()
    }

  }
}
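
To check the simulator without Spark, a small client can connect to the port and read a few lines. This is a minimal sketch (SimulatorClient is a hypothetical helper, not part of the post); it assumes the simulator was started locally on port 8089.

import java.net.Socket
import scala.io.Source

object SimulatorClient {
  def main(args: Array[String]): Unit = {
    // Connect to the running simulator (host and port here are assumptions)
    val socket = new Socket("localhost", 8089)
    // Print the first 5 lines the simulator pushes, then disconnect
    Source.fromInputStream(socket.getInputStream).getLines().take(5).foreach(println)
    socket.close()
  }
}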

2. Write SocketWordCount

package streaming

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SocketSparkStreaming {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("socketSparkStreaming").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc,Seconds(5))

    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val lines = ssc.socketTextStream("spark02",8089,StorageLevel.MEMORY_ONLY)

    val words = lines.flatMap(_.split(","))

    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    //wordCounts.saveAsTextFiles("file:///home/spark/test/sparktest")
    ssc.start()
    ssc.awaitTermination()
  }
}
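
Each 5-second batch here is counted on its own; nothing is carried over between batches, which is what the stateful and windowed versions below address. As a rough sketch of what a single batch goes through, the same transformations can be traced on a plain Scala collection (the sample lines below are made up for illustration):

object BatchWordCountDemo {
  def main(args: Array[String]): Unit = {
    // One batch worth of lines received from the socket (illustrative only)
    val batch = Seq("spark,streaming,spark", "hello,spark")
    val counts = batch
      .flatMap(_.split(","))   // split every line into words
      .map(word => (word, 1))  // pair each word with a count of 1
      .groupBy(_._1)           // plain-collection stand-in for reduceByKey
      .map { case (word, pairs) => (word, pairs.map(_._2).sum) }
    println(counts) // e.g. Map(spark -> 3, streaming -> 1, hello -> 1)
  }
}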

3. Stateful cumulative word counts

package streaming

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StateWorldCount {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)


    // Update function: values holds this word's counts from the current batch, state holds its accumulated count from previous batches
    val updateFunc = (values:Seq[Int],state:Option[Int])=>{
      val currentCount = values.foldLeft(0)(_+_)
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }

    val conf = new SparkConf().setAppName("stateSparkStreaming").setMaster("local[2]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc,Seconds(5))
    ssc.checkpoint("file:///d:/checkpoint")
    val lines = ssc.socketTextStream(args(0),args(1).toInt,StorageLevel.MEMORY_ONLY)

    val wordCounts = lines.flatMap(_.split(",")).map((_, 1))

    // Use updateStateByKey to maintain state and count word frequencies from the start of the job until now
    val stateDStream = wordCounts.updateStateByKey(updateFunc)
    stateDStream.print()

    ssc.start()
    ssc.awaitTermination()

  }
}
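
The update function is a plain Scala function, so its behaviour can be checked by hand. Below is a minimal sketch (UpdateFuncDemo is only for illustration) that applies it to made-up counts for a single word across two batches:

object UpdateFuncDemo {
  def main(args: Array[String]): Unit = {
    // Same update function as in StateWorldCount
    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.foldLeft(0)(_ + _)
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }

    // Batch 1: the word appears twice, no previous state yet
    val afterBatch1 = updateFunc(Seq(1, 1), None)            // Some(2)
    // Batch 2: the word appears three times, carried-over state is 2
    val afterBatch2 = updateFunc(Seq(1, 1, 1), afterBatch1)  // Some(5)
    println(s"$afterBatch1 -> $afterBatch2")
  }
}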

4. Window-based word counts

package streaming

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
object WindowWordCount {
  def main(args: Array[String]): Unit = {

//    if(args.length != 4){
//      System.err.println("Usage: WindowWordCount <hostname> <port> <windowDuration> <slideDuration>")
//      System.exit(-1)
//    }

    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val conf = new SparkConf().setAppName("window").setMaster("local[2]")
    val ssc = new StreamingContext(conf,Seconds(5))

    ssc.checkpoint("file:///H:/checkpoint")

    val lines = ssc.socketTextStream("spark02",8089,StorageLevel.MEMORY_ONLY)
    val words = lines.flatMap(_.split(",")).map((_,1))

    // Window operations: the first (commented-out) form recomputes the whole window each time,
    // the second updates the previous window's result incrementally using an inverse (subtract) function
    //val wordCounts = words.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(10))
    val wordCounts = words.reduceByKeyAndWindow((a: Int, b: Int) => a + b, (a: Int, b: Int) => a - b, Seconds(10), Seconds(10))

    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()

  }
}
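
With the inverse-function form, Spark does not recompute the whole 10-second window from scratch: it takes the previous window's result, adds the counts from batches entering the window, and subtracts the counts from batches leaving it, which is also why the checkpoint directory is required. A minimal sketch of that arithmetic for a single key, using made-up counts:

object IncrementalWindowDemo {
  def main(args: Array[String]): Unit = {
    val previousWindowCount = 7  // count of the key in the previous window
    val enteringBatchCount  = 4  // counts from batches sliding in  (combined with a + b)
    val leavingBatchCount   = 3  // counts from batches sliding out (removed with a - b)
    val newWindowCount = previousWindowCount + enteringBatchCount - leavingBatchCount
    println(newWindowCount)      // 8
  }
}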
