Spark Streaming Join与优雅关闭

join
两个流之间的 join 需要两个流的批次大小一致,这样才能做到同时触发计算。计算过程就是对当前批次的两个流中各自的 RDD 进行 join,与两个 RDD 的 join 效果相同。

linux>nc -lk 1111
linux>telnet 192.168.58.200 1111
linux>nc -lk 2222
linux>telnet 192.168.58.200 2222
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
object JoinTest {
 def main(args: Array[String]): Unit = {
 //1.创建 SparkConf
 val sparkConf: SparkConf = new 
SparkConf().setMaster("local[*]").setAppName("JoinTest")
 //2.创建 StreamingContext
 val ssc = new StreamingContext(sparkConf, Seconds(5))
 //3.从端口获取数据创建流
 val lineDStream1: ReceiverInputDStream[String] = 
ssc.socketTextStream("192.168.58.200", 1111)
 val lineDStream2: ReceiverInputDStream[String] = 
ssc.socketTextStream("192.168.58.200", 2222)
 //4.将两个流转换为 KV 类型
 val wordToOneDStream: DStream[(String, Int)] = lineDStream1.flatMap(_.split(" ")).map((_, 1))
 val wordToADStream: DStream[(String, String)] = lineDStream2.flatMap(_.split(" ")).map((_, "a"))
 //5.流的 JOIN
 val joinDStream: DStream[(String, (Int, String))] = 
wordToOneDStream.join(wordToADStream)
 //6.打印
 joinDStream.print()
 //7.启动任务
 ssc.start()
 ssc.awaitTermination()
 }
}

优雅关闭
流式任务需要 7*24 小时执行,但是有时涉及到升级代码需要主动停止程序,但是分布式程序,没办法做到一个个进程去杀死,所有需要配置优雅的关闭。
使用外部文件系统来控制内部程序关闭。

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.streaming.{StreamingContext, StreamingContextState}
class MonitorStop(ssc: StreamingContext) extends Runnable {
 override def run(): Unit = {
 val fs: FileSystem = FileSystem.get(new URI("hdfs://192.168.58.200:9000"), new 
Configuration(), "atguigu")
 while (true) {
 try
 Thread.sleep(5000)
 catch {
 case e: InterruptedException =>
 e.printStackTrace()
 }
 val state: StreamingContextState = ssc.getState
 val bool: Boolean = fs.exists(new Path("hdfs://192.168.58.200:9000/StopSpark"))
 if (bool) {
 if (state == StreamingContextState.ACTIVE) {
 ssc.stop(stopSparkContext = true, stopGracefully = true)
 System.exit(0)
 }
 }
 }
 }
}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object SparkTest {
 def createSSC(): _root_.org.apache.spark.streaming.StreamingContext = {
 val update: (Seq[Int], Option[Int]) => Some[Int] = (values: Seq[Int], status: 
Option[Int]) => {
 //当前批次内容的计算
 val sum: Int = values.sum
 //取出状态信息中上一次状态
 val lastStatu: Int = status.getOrElse(0)
 Some(sum + lastStatu)
 }
 val sparkConf: SparkConf = new 
SparkConf().setMaster("local[4]").setAppName("SparkTest")
 //设置优雅的关闭
 sparkConf.set("spark.streaming.stopGracefullyOnShutdown", "true")
 val ssc = new StreamingContext(sparkConf, Seconds(5))
 ssc.checkpoint("./ck")
 val line: ReceiverInputDStream[String] = ssc.socketTextStream("192.168.58.200", 1111)
 val word: DStream[String] = line.flatMap(_.split(" "))
 val wordAndOne: DStream[(String, Int)] = word.map((_, 1))
 val wordAndCount: DStream[(String, Int)] = wordAndOne.updateStateByKey(update)
 wordAndCount.print()
 ssc
 }
 def main(args: Array[String]): Unit = {
 val ssc: StreamingContext = StreamingContext.getActiveOrCreate("./ck", () => 
createSSC())
 new Thread(new MonitorStop(ssc)).start()
 ssc.start()
 ssc.awaitTermination()
 }
}

你可能感兴趣的:(spark,spark,hadoop,大数据,scala)