Spark Streaming将流式计算分解成一系列短小的批处理作业。这里的批处理引擎是Spark Engine:Spark Streaming把输入数据按照batch size(如1秒)切分成一段一段的数据(即Discretized Stream,简称DStream),每一段数据都转换成Spark中的RDD(Resilient Distributed Dataset),然后将Spark Streaming中对DStream的Transformation操作转变为Spark中对RDD的Transformation操作,并将RDD经过操作得到的中间结果保存在内存中。整个流式计算可以根据业务需求对中间结果进行叠加,或者存储到外部设备。Spark Streaming是Spark核心API的扩展,支持高吞吐量、具备容错能力的实时流数据处理。它可以接收来自Kafka、Flume、Twitter、ZeroMQ和TCP Socket等数据源的数据,使用map、reduce、join、window等简单的API函数进行处理,还可以直接使用内置的机器学习算法、图算法包来处理数据。
Spark Streaming基本操作:
import org.apache.log4j.Level
import org.apache.spark.SparkContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

/**
 * Word-count walkthrough of basic Spark Streaming operations.
 *
 * Builds four word-count streams (one from a TCP socket, three from a
 * monitored directory, the last of them windowed), prints each batch, and
 * saves the windowed counts as object files and text files.
 */
object SparkStreaming {

  /** Splits each line on single spaces and pairs every token with a count of 1. */
  private def wordPairs(lines: DStream[String]): DStream[(String, Int)] =
    lines.flatMap(_.split(" ")).map(word => (word, 1))

  /**
   * Program entry point: wires up the streams, starts the streaming context
   * and blocks until it terminates.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("spark://centos.host1:7077", "Spark Streaming")

    // Everything after the SparkContext is created runs under try/finally so
    // the context is stopped even when stream setup or awaitTermination()
    // throws.
    try {
      // Create the StreamingContext with one batch every 20 seconds.
      val ssc = new StreamingContext(sc, Seconds(20))

      // DStream fed by a TCP connection (host and port to connect to).
      val serverIP = "localhost"
      val serverPort = 9999
      val socketLines = ssc.socketTextStream(serverIP, serverPort)
      val socketCounts = wordPairs(socketLines).reduceByKey(_ + _)
      // Print to the console.
      socketCounts.print()

      // InputDStream that monitors a directory for files.
      val dataDirectory = "/user/hadoop/data/temp/streaming/"
      val keyedFileStream =
        ssc.fileStream[LongWritable, Text, TextInputFormat](dataDirectory)
      // Only the value of each (key, value) record is used: the line text.
      val keyedFileCounts =
        wordPairs(keyedFileStream.map(_._2.toString)).reduceByKey(_ + _)
      keyedFileCounts.print()

      // Same directory, read through the text-only convenience API.
      val textFileLines = ssc.textFileStream(dataDirectory)
      val textFileCounts = wordPairs(textFileLines).reduceByKey(_ + _)
      textFileCounts.print()

      // Window operation. It takes two durations: the width of the sliding
      // window (Window Duration) and how often the window slides
      // (Slide Duration). Here: a 40-second window sliding every 20 seconds.
      val windowedFileStream =
        ssc.fileStream[LongWritable, Text, TextInputFormat](dataDirectory)
      val windowedCounts =
        wordPairs(windowedFileStream.map(_._2.toString))
          .reduceByKeyAndWindow((x: Int, y: Int) => x + y, Seconds(40), Seconds(20))
      windowedCounts.print()

      // Saving the stream contents. Original author's note: by default the
      // files are saved under the user's directory.
      // Save as SequenceFile; output directory name is
      // "prefix-TIME_IN_MS.suffix", e.g. rdd-1411894750000.seq
      windowedCounts.saveAsObjectFiles("/user/hadoop/data/temp/rdd", "seq")
      // Save as TextFile; output directory name is
      // "prefix-TIME_IN_MS.suffix", e.g. rdd-1411894750000.txt
      windowedCounts.saveAsTextFiles("/user/hadoop/data/temp/rdd", "txt")
      // Save as HadoopFile; output directory name is
      // "prefix-TIME_IN_MS.suffix", e.g. rdd-1411894750000.hadoop
      // Original author's note: this API has not been made to work yet.
      // NOTE(review): the second attempt below uses Java's `X.class` syntax
      // (Scala uses classOf[X]) and needs IntWritable, TextOutputFormat and
      // `conf`, none of which are imported/defined here.
      //windowedCounts.saveAsHadoopFiles("/user/hadoop/data/temp/rdd", "hadoop")
      //windowedCounts.saveAsHadoopFiles("/user/hadoop/data/temp/rdd", "hadoop", Text.class, IntWritable.class, TextOutputFormat.class, conf)

      // Start the computation.
      ssc.start()
      // Block until the computation terminates.
      ssc.awaitTermination()
    } finally {
      sc.stop()
    }
  }
}