package cn.testdemo.dstream.socket
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
//todo: use Spark Streaming to compute the hot (top) words within a sliding time window
object SparkStreamingSocketHotWord {

  /**
   * Entry point. Connects to a socket text source, counts words over a
   * 10-second window sliding every 5 seconds, and prints the current
   * top-3 hot words for each batch.
   *
   * @param args optional overrides: args(0) = socket host, args(1) = socket port.
   *             Defaults (192.168.216.121:9999) keep the original behaviour
   *             when no arguments are supplied.
   */
  def main(args: Array[String]): Unit = {
    // Allow host/port to be supplied on the command line; fall back to the
    // original hard-coded endpoint so existing invocations are unaffected.
    val host = if (args.length >= 1) args(0) else "192.168.216.121"
    val port = if (args.length >= 2) args(1).toInt else 9999

    // 1. Spark configuration. local[2] is required: one thread is consumed by
    //    the socket receiver, the other processes the batches.
    val sparkConf: SparkConf = new SparkConf()
      .setAppName("SparkStreamingSocketHotWord")
      .setMaster("local[2]")
    // 2. Create the SparkContext and reduce log noise.
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")
    // 3. StreamingContext with a 5-second batch interval.
    val ssc = new StreamingContext(sc, Seconds(5))
    // 4. Receive newline-delimited text from the socket.
    val stream: ReceiverInputDStream[String] = ssc.socketTextStream(host, port)
    // 5. Split each line into words and pair each word with a count of 1.
    val wordAndOne: DStream[(String, Int)] = stream.flatMap(_.split(" ")).map((_, 1))
    // 6. Aggregate counts per word over a 10s window, sliding every 5s.
    val result: DStream[(String, Int)] =
      wordAndOne.reduceByKeyAndWindow((x: Int, y: Int) => x + y, Seconds(10), Seconds(5))
    // 7. Sort by count descending and print the top-3 hot words.
    //    transform's closure runs on the driver each batch, so calling the
    //    action take(3) and printing here is safe.
    val sortedDstream: DStream[(String, Int)] = result.transform { rdd =>
      val sortedRDD: RDD[(String, Int)] = rdd.sortBy(_._2, ascending = false)
      sortedRDD.take(3).foreach(println)
      sortedRDD
    }
    sortedDstream.print()
    // Start the streaming job and block the main thread until termination.
    ssc.start()
    ssc.awaitTermination()
  }
}