First, here is the example source code:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Durations, StreamingContext}
/**
 * Thanks to 王家林 (Wang Jialin), founder of DT大数据梦工厂, for sharing this material.
 */
object StreamingWordCountSelfScala {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setMaster("spark://master:7077").setAppName("StreamingWordCountSelfScala")
val ssc = new StreamingContext(sparkConf, Durations.seconds(5)) // batch interval: collect data every 5 seconds
val lines = ssc.socketTextStream("localhost", 9999) // listen on the local socket port 9999
val words = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _) // flatMap, then map and reduceByKey
words.print() // print the result
ssc.start() // start the streaming computation
ssc.awaitTermination()
ssc.stop(true)
}
}
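The example above submits to a standalone master (spark://master:7077). If you just want to try it out locally, a minimal sketch with a local master looks like the following; note that the socket receiver occupies one thread, so the local master needs at least two. The local[2] master and the object name are assumptions for illustration, not part of the original example.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Durations, StreamingContext}

object LocalStreamingWordCount {
  def main(args: Array[String]): Unit = {
    // local[2]: one thread for the socket receiver, at least one for processing
    val conf = new SparkConf().setMaster("local[2]").setAppName("LocalStreamingWordCount")
    val ssc = new StreamingContext(conf, Durations.seconds(5))
    val words = ssc.socketTextStream("localhost", 9999)
      .flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    words.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
To feed it data, run nc -lk 9999 in another terminal and type some words.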
The previous post analyzed the instantiation of StreamingContext from the source code; the next step is
val lines = ssc.socketTextStream("localhost", 9999)
Drill into the source:
// StreamingContext line 327
def socketTextStream(
hostname: String,
port: Int,
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
): ReceiverInputDStream[String] = withNamedScope("socket text stream") {
socketStream[String](hostname, port, SocketReceiver.bytesToLines, storageLevel)
}
// StreamingContext line 345
def socketStream[T: ClassTag](
hostname: String,
port: Int,
converter: (InputStream) => Iterator[T],
storageLevel: StorageLevel
): ReceiverInputDStream[T] = {
new SocketInputDStream[T](this, hostname, port, converter, storageLevel)
}
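As the two overloads show, socketTextStream is just socketStream with SocketReceiver.bytesToLines as the converter and MEMORY_AND_DISK_SER_2 as the default storage level. Below is a minimal sketch of passing your own converter; the bufferedLines helper is hypothetical (not part of Spark), and ssc is the StreamingContext from the example.
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.nio.charset.StandardCharsets
import org.apache.spark.storage.StorageLevel

// Hypothetical converter: turn the raw socket InputStream into an Iterator of trimmed lines
def bufferedLines(in: InputStream): Iterator[String] = {
  val reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))
  Iterator.continually(reader.readLine()).takeWhile(_ != null).map(_.trim)
}

val trimmed = ssc.socketStream[String](
  "localhost", 9999, bufferedLines, StorageLevel.MEMORY_AND_DISK_SER_2)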
This creates a SocketInputDStream:
// SocketInputDStream line 33
private[streaming]
class SocketInputDStream[T: ClassTag](
ssc_ : StreamingContext,
host: String,
port: Int,
bytesToObjects: InputStream => Iterator[T],
storageLevel: StorageLevel
) extends ReceiverInputDStream[T](ssc_) {
// getReceiver implements the abstract getReceiver of ReceiverInputDStream (the override modifier is optional here).
// It only describes how to build the Receiver; nothing is instantiated or started during construction.
def getReceiver(): Receiver[T] = {
new SocketReceiver(host, port, bytesToObjects, storageLevel)
}
// Apart from defining the Receiver, there is nothing else here.
}
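So a SocketInputDStream merely knows how to produce its Receiver; when and where the receiver actually runs is decided elsewhere. User code can plug into the same mechanism through StreamingContext.receiverStream with a custom Receiver. A minimal illustrative sketch follows; ConstantReceiver is made up for demonstration.
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

// Illustrative receiver that emits the same message once per second from a background thread
class ConstantReceiver(msg: String)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_SER_2) {

  def onStart(): Unit = {
    new Thread("constant-receiver") {
      override def run(): Unit = {
        while (!isStopped()) {
          store(msg)          // hand one record to Spark Streaming
          Thread.sleep(1000)
        }
      }
    }.start()
  }

  def onStop(): Unit = {
    // the loop above checks isStopped(), so nothing else to clean up
  }
}

// val ticks = ssc.receiverStream(new ConstantReceiver("tick"))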
Next, look at the constructor of ReceiverInputDStream.
// ReceiverInputDStream line 41
// this is an abstract class
abstract class ReceiverInputDStream[T: ClassTag](ssc_ : StreamingContext)
extends InputDStream[T](ssc_) {
/**
* Asynchronously maintains & sends new rate limits to the receiver through the receiver tracker.
*/
// A RateController is defined here. What is it for? We will analyze it later.
override protected[streaming] val rateController: Option[RateController] = {
if (RateController.isBackPressureEnabled(ssc.conf)) {
Some(new ReceiverRateController(id, RateEstimator.create(ssc.conf, ssc.graph.batchDuration)))
} else {
None
}
}
// other methods
}
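Whether this Option is defined is purely a matter of configuration: RateController.isBackPressureEnabled reads the spark.streaming.backpressure.enabled flag, which is off by default. A sketch of turning it on; the specific values are examples, not recommendations:
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("StreamingWordCountSelfScala")
  .set("spark.streaming.backpressure.enabled", "true") // gives the stream a ReceiverRateController
  .set("spark.streaming.receiver.maxRate", "10000")    // hard cap on records per second per receiver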
Next, follow the chain up to the parent class InputDStream:
// InputDStream.scala line
abstract class InputDStream[T: ClassTag] (ssc_ : StreamingContext)
extends DStream[T](ssc_) {
private[streaming] var lastValidTime: Time = null
// line 47
// This is the key point: during construction, the stream passes itself to the graph via addInputStream.
ssc.graph.addInputStream(this)
/** This is an unique identifier for the input stream. */
// line 50
// allocate a new ID for this InputStream
val id = ssc.getNewInputStreamId()
// Keep track of the freshest rate for this stream using the rateEstimator
protected[streaming] val rateController: Option[RateController] = None
// ... some methods
// line 68
/**
* The base scope associated with the operation that created this DStream.
*
* For InputDStreams, we use the name of this DStream as the scope name.
* If an outer scope is given, we assume that it includes an alternative name for this stream.
*/
protected[streaming] override val baseScope: Option[String] = {
val scopeName = Option(ssc.sc.getLocalProperty(SparkContext.RDD_SCOPE_KEY))
.map { json => RDDOperationScope.fromJson(json).name + s" [$id]" }
.getOrElse(name.toLowerCase)
Some(new RDDOperationScope(scopeName).toJson)
}
Let's take a look at
// line 47
ssc.graph.addInputStream(this)
// DStreamGraph.scala line 83
def addInputStream(inputStream: InputDStream[_]) {
this.synchronized {
// hand the current graph to the InputDStream
inputStream.setGraph(this)
inputStreams += inputStream
}
}
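addInputStream is a simple bidirectional registration: the stream gets a reference to its graph, and the graph records the stream. A stripped-down illustration of the pattern (toy classes, not Spark's):
import scala.collection.mutable.ArrayBuffer

// Toy versions of the two sides of the registration, for illustration only
class ToyGraph {
  private val inputStreams = new ArrayBuffer[ToyInputStream]()
  def addInputStream(stream: ToyInputStream): Unit = this.synchronized {
    stream.setGraph(this)   // the stream learns which graph it belongs to
    inputStreams += stream  // the graph remembers the stream for later scheduling
  }
  def numInputStreams: Int = inputStreams.size
}

class ToyInputStream {
  private var graph: ToyGraph = _
  def setGraph(g: ToyGraph): Unit = { graph = g }
}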
Now look at the constructor of the parent class DStream.
First, validateAtInit() (see the method near the end of the listing) ensures that the StreamingContext is still in the StreamingContextState.INITIALIZED state when the DStream is created.
slideDuration: how often this DStream generates an RDD; this Duration is not the same concept as the batchDuration seen earlier.
dependencies: the dependency list, and note that these are dependencies between DStreams.
def compute(validTime: Time): Option[RDD[T]]: the actual computation, analyzed in detail later.
private[streaming] var generatedRDDs = new HashMap[Time, RDD[T]](): the generated RDDs, keyed by batch time.
rememberDuration: yet another Duration with a different meaning (how long each generated RDD is remembered).
abstract class DStream[T: ClassTag] (
@transient private[streaming] var ssc: StreamingContext
) extends Serializable with Logging {
validateAtInit()
// =======================================================================
// Methods that should be implemented by subclasses of DStream
// =======================================================================
// note: yet another Duration here
/** Time interval after which the DStream generates a RDD */
def slideDuration: Duration
// the DStreams this one depends on
/** List of parent DStreams on which this DStream depends on */
def dependencies: List[DStream[_]]
// the computation
/** Method that generates a RDD for the given time */
def compute(validTime: Time): Option[RDD[T]]
// =======================================================================
// Methods and fields available on all DStreams
// =======================================================================
// RDDs generated, marked as private[streaming] so that testsuites can access it
// the generated RDDs
@transient
private[streaming] var generatedRDDs = new HashMap[Time, RDD[T]] ()
// Time zero for the DStream
private[streaming] var zeroTime: Time = null
// Duration for which the DStream will remember each RDD created
private[streaming] var rememberDuration: Duration = null
// Storage level of the RDDs in the stream
private[streaming] var storageLevel: StorageLevel = StorageLevel.NONE
// Checkpoint details
private[streaming] val mustCheckpoint = false
private[streaming] var checkpointDuration: Duration = null
// checkpoint data for this DStream
private[streaming] val checkpointData = new DStreamCheckpointData(this)
@transient
private var restoredFromCheckpointData = false
// Reference to whole DStream graph
private[streaming] var graph: DStreamGraph = null
// some methods
/** Return the StreamingContext associated with this DStream */
def context: StreamingContext = ssc
/* Set the creation call site */
private[streaming] val creationSite = DStream.getCreationSite()
/**
* The base scope associated with the operation that created this DStream.
*
* This is the medium through which we pass the DStream operation name (e.g. updatedStateByKey)
* to the RDDs created by this DStream. Note that we never use this scope directly in RDDs.
* Instead, we instantiate a new scope during each call to `compute` based on this one.
*
* This is not defined if the DStream is created outside of one of the public DStream operations.
*/
protected[streaming] val baseScope: Option[String] = {
Option(ssc.sc.getLocalProperty(SparkContext.RDD_SCOPE_KEY))
}
private def validateAtInit(): Unit = {
ssc.getState() match {
case StreamingContextState.INITIALIZED =>
// good to go
case StreamingContextState.ACTIVE =>
throw new IllegalStateException(
"Adding new inputs, transformations, and output operations after " +
"starting a context is not supported")
case StreamingContextState.STOPPED =>
throw new IllegalStateException(
"Adding new inputs, transformations, and output operations after " +
"stopping a context is not supported")
}
}
// some methods
}
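validateAtInit is the reason you cannot declare new inputs, transformations, or output operations after the context has started. A quick sketch of the failure mode, assuming the ssc from the example (the second port is hypothetical):
ssc.start()
// Throws IllegalStateException: "Adding new inputs, transformations, and output
// operations after starting a context is not supported"
val tooLate = ssc.socketTextStream("localhost", 9998)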
Now look at dependencies and compare it with an RDD's dependencies.
Here it returns List[DStream[_]], whereas an RDD's dependencies returns Seq[Dependency[_]]:
//Dependency.scala line 32
abstract class Dependency[T] extends Serializable {
def rdd: RDD[T]
}
Then compare the compute methods:
DStream's compute returns Option[RDD[T]], while RDD's compute returns Iterator[T].
Recall 王家林's point that a DStream is a template for RDDs; the two comparisons above bear this out.
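To make the "template" idea concrete, here is a simplified sketch in the spirit of Spark's internal MappedDStream (not the exact source; it relies on private[streaming] members such as ssc and getOrCompute, so it would only compile inside the org.apache.spark.streaming package): each batch interval, compute asks the parent for that batch's RDD and applies the same transformation to it.
import scala.reflect.ClassTag
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.streaming.dstream.DStream

// Simplified: one parent dependency; compute maps the parent's RDD for the same batch time
class SimpleMappedDStream[T: ClassTag, U: ClassTag](
    parent: DStream[T],
    mapFunc: T => U
  ) extends DStream[U](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    // the parent's RDD for this batch time, if one exists, mapped with the same function
    parent.getOrCompute(validTime).map(_.map(mapFunc))
  }
}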
At this point, the creation of the InputDStream is complete:
val lines = ssc.socketTextStream("localhost", 9999)
Note, however, that only the DStream has been created; nothing has executed yet. Even if you send messages to the socket now, they will not be consumed.
Why? That will be explained in the next post.