In the previous post, the source-code walkthrough reached the point where ReceiverSupervisorImpl had been instantiated.
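As a quick reminder of the running example: the receiver analyzed in this post is the SocketReceiver created by socketTextStream. A minimal driver program that sets it up might look like this (host, port, batch interval and app name are just placeholders):
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SocketWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SocketWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // socketTextStream builds a SocketInputDStream; its getReceiver() returns the
    // SocketReceiver whose onStart()/receive() are traced below
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
    ssc.start()            // eventually leads to ReceiverSupervisorImpl.start() on an executor
    ssc.awaitTermination()
  }
}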
Here is the surrounding code for context:
// ReceiverTracker.scala line 573
val supervisor = new ReceiverSupervisorImpl(
receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption)
supervisor.start()
supervisor.awaitTermination()
The next step is start():
// ReceiverSupervisor.scala line 128
/** Start the supervisor */
def start() {
onStart()
startReceiver()
}
onStart()
// ReceiverSupervisorImpl.scala line 172
override protected def onStart() {
registeredBlockGenerators.foreach { _.start() }
}
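A quick note on registeredBlockGenerators: ReceiverSupervisorImpl keeps a buffer of BlockGenerators, and the default one is created through createBlockGenerator when the supervisor is constructed. Roughly, paraphrased from ReceiverSupervisorImpl.scala (Spark 1.6.x; details may differ slightly by version):
// ReceiverSupervisorImpl.scala (paraphrased)
private val registeredBlockGenerators = new mutable.ArrayBuffer[BlockGenerator]
  with mutable.SynchronizedBuffer[BlockGenerator]

// the generator used by pushSingle below; defaultBlockGeneratorListener is the
// BlockGeneratorListener whose onPushBlock we will meet later in this post
private val defaultBlockGenerator = createBlockGenerator(defaultBlockGeneratorListener)

override def createBlockGenerator(
    blockGeneratorListener: BlockGeneratorListener): BlockGenerator = {
  // clean up BlockGenerators that have already been stopped
  registeredBlockGenerators --= registeredBlockGenerators.filter { _.isStopped() }
  val newBlockGenerator = new BlockGenerator(blockGeneratorListener, streamId, env.conf)
  registeredBlockGenerators += newBlockGenerator
  newBlockGenerator
}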
_.start() starts each registered BlockGenerator, and with it the two threads mentioned in the previous post. These two threads matter a lot; let's plant that seed here and come back to it shortly.
// BlockGenerator.scala line 114
/** Start block generating and pushing threads. */
def start(): Unit = synchronized {
if (state == Initialized) {
state = Active
blockIntervalTimer.start()
blockPushingThread.start()
logInfo("Started BlockGenerator")
} else {
throw new SparkException(
s"Cannot start BlockGenerator as its not in the Initialized state [state = $state]")
}
}
Then startReceiver() is called. Two steps matter here:
onReceiverStart: registers the receiver with the driver; it returns false if the driver refuses to start it.
receiver.onStart: starts the Receiver itself, which in this example is a SocketReceiver.
// ReceiverSupervisor.scala line 143
/** Start receiver */
def startReceiver(): Unit = synchronized {
try {
if (onReceiverStart()) {
logInfo("Starting receiver")
receiverState = Started
receiver.onStart()
logInfo("Called receiver onStart")
} else {
// The driver refused us
stop("Registered unsuccessfully because Driver refused to start receiver " + streamId, None)
}
} catch {
case NonFatal(t) =>
stop("Error starting receiver " + streamId, Some(t))
}
}
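As for onReceiverStart: in ReceiverSupervisorImpl it registers the receiver with the ReceiverTracker on the driver and returns whether the driver accepted the registration. Roughly, paraphrased (the exact fields of RegisterReceiver differ between versions):
// ReceiverSupervisorImpl.scala (paraphrased)
override protected def onReceiverStart(): Boolean = {
  val msg = RegisterReceiver(
    streamId, receiver.getClass.getSimpleName, host, executorId, endpoint)
  trackerEndpoint.askWithRetry[Boolean](msg)
}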
SocketReceiver.onStart
// SocketInputDStream.scala line 55
def onStart() {
// Start the thread that receives data over a connection
new Thread("Socket Receiver") {
setDaemon(true)
override def run() { receive() }
}.start()
}
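bytesToObjects is the function handed to SocketInputDStream when the stream is created; for socketTextStream it is SocketReceiver.bytesToLines, which turns the socket's InputStream into an iterator of text lines. A conceptual sketch of what it does (not the exact Spark code, which uses a NextIterator):
import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.nio.charset.StandardCharsets

// conceptual sketch only: read the stream line by line until EOF
def bytesToLines(in: InputStream): Iterator[String] = {
  val reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))
  Iterator.continually(reader.readLine()).takeWhile(_ != null)
}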
receive()
// SocketInputDStream.scala line 69
/** Create a socket connection and receive data until receiver is stopped */
def receive() {
var socket: Socket = null
try {
logInfo("Connecting to " + host + ":" + port)
socket = new Socket(host, port)
logInfo("Connected to " + host + ":" + port)
val iterator = bytesToObjects(socket.getInputStream())
while(!isStopped && iterator.hasNext) {
store(iterator.next)
}
if (!isStopped()) {
restart("Socket data stream had no more data")
} else {
logInfo("Stopped receiving")
}
} catch { // ... code omitted
} finally { // ... code omitted
}
}
// Receiver.scala line 113
/**
* Store a single item of received data to Spark's memory.
* These single items will be aggregated together into data blocks before
* being pushed into Spark's memory.
*/
def store(dataItem: T) {
supervisor.pushSingle(dataItem)
}
supervisor.pushSingle
// ReceiverSupervisorImpl.scala line 118
/** Push a single record of received data into block generator. */
def pushSingle(data: Any) {
defaultBlockGenerator.addData(data)
}
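addData itself does two things: it applies rate limiting (BlockGenerator extends RateLimiter, so waitToPush() blocks when the configured spark.streaming.receiver.maxRate is exceeded) and appends the record to currentBuffer. Roughly, paraphrased from BlockGenerator.scala:
// BlockGenerator.scala (paraphrased)
def addData(data: Any): Unit = {
  if (state == Active) {
    waitToPush() // rate limiting
    synchronized {
      if (state == Active) {
        currentBuffer += data // just buffer the record; blocks are cut by the timer thread
      } else {
        throw new SparkException(
          "Cannot add data as BlockGenerator has not been started or has been stopped")
      }
    }
  } else {
    throw new SparkException(
      "Cannot add data as BlockGenerator has not been started or has been stopped")
  }
}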
At this point the data has been received and handed off to the BlockGenerator.
But don't celebrate yet: the data has only reached the Receiver side, and the Receiver is not where the actual computation runs. So how does the data get downstream?
Let's go back to the seed we planted earlier.
// BlockGenerator.scala line 118
blockIntervalTimer.start()
This starts the timer:
private[streaming]
class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String)
extends Logging {
private val thread = new Thread("RecurringTimer - " + name) {
setDaemon(true)
override def run() { loop } // the thread body runs the loop method
}
// ... code omitted
// RecurringTimer.scala line 66
/**
* Start at the earliest time it can start based on the period.
*/
def start(): Long = {
start(getStartTime()) // delegates to start(startTime)
}
// RecurringTimer.scala line 56
/**
* Start at the given start time.
*/
def start(startTime: Long): Long = synchronized {
nextTime = startTime
thread.start() // the timer thread is finally started here
logInfo("Started timer for " + name + " at time " + nextTime)
nextTime
}
// ... code omitted
// line 92
private def triggerActionForNextInterval(): Unit = {
clock.waitTillTime(nextTime) // wait until the scheduled trigger time
callback(nextTime) // invoke the callback passed to the constructor
prevTime = nextTime
nextTime += period
logDebug("Callback for " + name + " called at time " + prevTime)
}
/**
* Repeatedly call the callback every interval.
*/
private def loop() {
try {
while (!stopped) {
triggerActionForNextInterval() // as long as not stopped, keep calling triggerActionForNextInterval
}
triggerActionForNextInterval() // one final trigger after stop() has been requested
} catch {
case e: InterruptedException =>
}
}
// ... code omitted
}
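For completeness, getStartTime() (used by the no-argument start()) aligns the first trigger time to the next multiple of the period, so callbacks fire on period boundaries. Roughly, paraphrased:
// RecurringTimer.scala (paraphrased)
private def getStartTime(): Long = {
  (math.floor(clock.getTimeMillis().toDouble / period) + 1).toLong * period
}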
Now look at the callback block passed in through the primary constructor: callback = updateCurrentBuffer.
// BlockGenerator.scala line 105
private val blockIntervalTimer =
new RecurringTimer(clock, blockIntervalMs, updateCurrentBuffer, "BlockGenerator")
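blockIntervalMs comes from the spark.streaming.blockInterval configuration, 200ms by default, so by default a block is cut roughly every 200ms. Paraphrased from the same file:
// BlockGenerator.scala (paraphrased)
private val blockIntervalMs = conf.getTimeAsMs("spark.streaming.blockInterval", "200ms")
require(blockIntervalMs > 0, s"'spark.streaming.blockInterval' should be a positive value")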
The definition of updateCurrentBuffer:
// BlockGenerator.scala line 231
/** Change the buffer to which single records are added to. */
private def updateCurrentBuffer(time: Long): Unit = {
try {
var newBlock: Block = null
synchronized {
if (currentBuffer.nonEmpty) {
val newBlockBuffer = currentBuffer
currentBuffer = new ArrayBuffer[Any]
val blockId = StreamBlockId(receiverId, time - blockIntervalMs)
listener.onGenerateBlock(blockId)
newBlock = new Block(blockId, newBlockBuffer) // a new Block is created from the buffered records
}
}
if (newBlock != null) {
blocksForPushing.put(newBlock) // put() blocks when the queue is full; the block is added to the to-be-pushed queue
}
} catch {
case ie: InterruptedException =>
logInfo("Block updating timer thread was interrupted")
case e: Exception =>
reportError("Error in block updating thread", e)
}
}
At this point, the records received by the receiver have been moved into the to-be-pushed queue.
So how does this data reach the BlockManager on the executor?
BlockGenerator.scala starts one more thread for exactly this.
When that thread starts, it calls the keepPushingBlocks method.
// BlockGenerator.scala line 109
private val blockPushingThread = new Thread() { override def run() { keepPushingBlocks() } }
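The two threads communicate through blocksForPushing, a bounded ArrayBlockingQueue defined nearby in the same file; its limited capacity is why the put() in updateCurrentBuffer can block. Paraphrased:
// BlockGenerator.scala (paraphrased)
private val blockQueueSize = conf.getInt("spark.streaming.blockQueueSize", 10)
private val blocksForPushing = new ArrayBlockingQueue[Block](blockQueueSize)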
// BlockGenerator.scala line 256
/** Keep pushing blocks to the BlockManager. */
private def keepPushingBlocks() {
logInfo("Started block pushing thread")
def areBlocksBeingGenerated: Boolean = synchronized {
state != StoppedGeneratingBlocks
}
try {
// While blocks are being generated, keep polling for to-be-pushed blocks and push them.
// keep polling as long as the state is not StoppedGeneratingBlocks
while (areBlocksBeingGenerated) {
Option(blocksForPushing.poll(10, TimeUnit.MILLISECONDS)) match {
case Some(block) => pushBlock(block)
case None =>
}
}
// At this point, state is StoppedGeneratingBlock. So drain the queue of to-be-pushed blocks.
// Reaching this point means the while loop above has exited, i.e. the state is StoppedGeneratingBlocks, so drain whatever blocks remain in the queue.
logInfo("Pushing out the last " + blocksForPushing.size() + " blocks")
while (!blocksForPushing.isEmpty) {
val block = blocksForPushing.take()
logDebug(s"Pushing block $block")
pushBlock(block)
logInfo("Blocks left to push " + blocksForPushing.size())
}
logInfo("Stopped block pushing thread")
} catch {
case ie: InterruptedException =>
logInfo("Block pushing thread was interrupted")
case e: Exception =>
reportError("Error in block pushing thread", e)
}
}
// BlockGenerator.scala
private def pushBlock(block: Block) {
listener.onPushBlock(block.id, block.buffer)
logInfo("Pushed block " + block.id)
}
listener.onPushBlock(block.id, block.buffer); listener here is the BlockGeneratorListener that ReceiverSupervisorImpl registered when creating this BlockGenerator:
// ReceiverSupervisorImpl.scala line 108
def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]) {
pushArrayBuffer(arrayBuffer, None, Some(blockId))
}
pushArrayBuffer
// ReceiverSupervisorImpl.scala line 122
/** Store an ArrayBuffer of received data as a data block into Spark's memory. */
def pushArrayBuffer(
arrayBuffer: ArrayBuffer[_],
metadataOption: Option[Any],
blockIdOption: Option[StreamBlockId]
) {
pushAndReportBlock(ArrayBufferBlock(arrayBuffer), metadataOption, blockIdOption) // wrapped as an ArrayBufferBlock, which is pattern-matched later
}
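ReceivedBlock is a small sealed hierarchy, and storeBlock pattern-matches on it; paraphrased from ReceivedBlock.scala:
// ReceivedBlock.scala (paraphrased)
private[streaming] sealed trait ReceivedBlock
private[streaming] case class ArrayBufferBlock(arrayBuffer: ArrayBuffer[_]) extends ReceivedBlock
private[streaming] case class IteratorBlock(iterator: Iterator[_]) extends ReceivedBlock
private[streaming] case class ByteBufferBlock(byteBuffer: ByteBuffer) extends ReceivedBlock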
pushAndReportBlock
// ReceiverSupervisorImpl.scala line 149
/** Store block and report it to driver */
def pushAndReportBlock(
receivedBlock: ReceivedBlock,
metadataOption: Option[Any],
blockIdOption: Option[StreamBlockId]
) {
val blockId = blockIdOption.getOrElse(nextBlockId)
val time = System.currentTimeMillis
val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock)
logDebug(s"Pushed block $blockId in ${(System.currentTimeMillis - time)} ms")
val numRecords = blockStoreResult.numRecords
val blockInfo = ReceivedBlockInfo(streamId, numRecords, metadataOption, blockStoreResult)
trackerEndpoint.askWithRetry[Boolean](AddBlock(blockInfo))
logDebug(s"Reported block $blockId")
}
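Which receivedBlockHandler gets used depends on whether the receiver write-ahead log is enabled; without the WAL (the path traced below) it is a BlockManagerBasedBlockHandler. Roughly, paraphrased from ReceiverSupervisorImpl.scala:
// ReceiverSupervisorImpl.scala (paraphrased)
private val receivedBlockHandler: ReceivedBlockHandler = {
  if (WriteAheadLogUtils.enableReceiverLog(env.conf)) {
    // spark.streaming.receiver.writeAheadLog.enable = true:
    // store to the BlockManager and to a write-ahead log under the checkpoint directory
    new WriteAheadLogBasedBlockHandler(env.blockManager, receiver.streamId,
      receiver.storageLevel, env.conf, hadoopConf, checkpointDirOption.get)
  } else {
    // the case analyzed below
    new BlockManagerBasedBlockHandler(env.blockManager, receiver.storageLevel)
  }
}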
storeBlock(blockId, receivedBlock)
// ReceivedBlockHandler.scala (class BlockManagerBasedBlockHandler) line 70
def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {
var numRecords = None: Option[Long]
val putResult: Seq[(BlockId, BlockStatus)] = block match {
case ArrayBufferBlock(arrayBuffer) =>
numRecords = Some(arrayBuffer.size.toLong)
blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel,
tellMaster = true)
// ... other ReceivedBlock cases omitted
}
if (!putResult.map { _._1 }.contains(blockId)) {
throw new SparkException(
s"Could not store $blockId to block manager with storage level $storageLevel")
}
BlockManagerBasedStoreResult(blockId, numRecords)
}
blockManager.putIterator
// BlockManager.scala line 638
def putIterator(
blockId: BlockId,
values: Iterator[Any],
level: StorageLevel,
tellMaster: Boolean = true,
effectiveStorageLevel: Option[StorageLevel] = None): Seq[(BlockId, BlockStatus)] = {
require(values != null, "Values is null")
doPut(blockId, IteratorValues(values), level, tellMaster, effectiveStorageLevel)
}
doPut
// BlockManager.scala line 797
case IteratorValues(iterator) =>
blockStore.putIterator(blockId, iterator, putLevel, returnValues)
At this point the data is entirely in the hands of the BlockManager, and the data-receiving path has been traced end to end.
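One loose end: the AddBlock message sent in pushAndReportBlock is handled on the driver by the ReceiverTracker, which records the block metadata in its ReceivedBlockTracker so that a later batch job can consume the block. Roughly, paraphrased (ignoring the write-ahead-log batching path):
// ReceiverTracker.scala (paraphrased)
case AddBlock(receivedBlockInfo) =>
  context.reply(addBlock(receivedBlockInfo))

private def addBlock(receivedBlockInfo: ReceivedBlockInfo): Boolean = {
  receivedBlockTracker.addBlock(receivedBlockInfo)
}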
Next up: how is the received data actually processed?
Thanks to teacher Wang Jialin for sharing his knowledge.