spark 版本定制 第5课:基于案例一节课贯通Spark Streaming流计算框架运行源码10

上文从源码分析到 ReceiverSupervisorImpl 已经实例化完成。

关联下代码上下文

// ReceiverTracker.scala line 573
    // Wrap the receiver in a ReceiverSupervisorImpl, handing it the current SparkEnv,
    // the Hadoop configuration and the optional checkpoint directory.
    val supervisor = new ReceiverSupervisorImpl(
      receiver, SparkEnv.get, serializableHadoopConf.value, checkpointDirOption)
    // start() runs onStart() and then startReceiver() (see ReceiverSupervisor.start).
    supervisor.start()
    // Presumably blocks until the supervisor has terminated — confirm in ReceiverSupervisor.
    supervisor.awaitTermination()

下一步就是start

// ReceiverSupervisor.scala line 128
/**
 * Starts this supervisor: runs the `onStart()` hook first, then starts the receiver
 * via `startReceiver()`.
 */
def start(): Unit = {
  onStart()
  startReceiver()
}

onStart()

// ReceiverSupervisorImpl.scala line 172
/** Supervisor start hook: starts every registered block generator. */
override protected def onStart(): Unit = {
  for (generator <- registeredBlockGenerators) {
    generator.start()
  }
}

_.start():启动上文中提到的两条线程(blockIntervalTimer 与 blockPushingThread)。这两条线程很重要,先抖个包袱,后面再细说。

// BlockGenerator.scala line 114
/**
 * Starts the block-interval timer and the block-pushing thread.
 *
 * Only legal while the generator is in the `Initialized` state; the state is moved to
 * `Active` before the timer and thread are started. Runs under this object's lock.
 *
 * Throws SparkException if the generator is in any state other than `Initialized`.
 */
def start(): Unit = synchronized {
  // Reject the call up front when the generator is not freshly initialized.
  if (state != Initialized) {
    throw new SparkException(
      s"Cannot start BlockGenerator as its not in the Initialized state [state = $state]")
  }

  state = Active
  blockIntervalTimer.start()
  blockPushingThread.start()
  logInfo("Started BlockGenerator")
}

再启动startReceiver

  1. onReceiverStart:向 Driver 注册 Receiver,只有 Driver 同意后才会继续启动(被拒绝则走 stop 分支)

  2. receiver.onStart;启动Receiver,本例中是SocketReceiver

// ReceiverSupervisor.scala line 143
/**
 * Starts the receiver, provided `onReceiverStart()` reports success.
 *
 * On success the receiver state becomes `Started` and `receiver.onStart()` is invoked.
 * If `onReceiverStart()` returns false, or any non-fatal error is thrown along the way,
 * the supervisor is stopped with an explanatory message. Runs under this object's lock.
 */
def startReceiver(): Unit = synchronized {
  try {
    val acceptedByDriver = onReceiverStart()
    if (!acceptedByDriver) {
      // The driver refused us
      stop("Registered unsuccessfully because Driver refused to start receiver " + streamId, None)
    } else {
      logInfo("Starting receiver")
      receiverState = Started
      receiver.onStart()
      logInfo("Called receiver onStart")
    }
  } catch {
    case NonFatal(t) =>
      stop("Error starting receiver " + streamId, Some(t))
  }
}

SocketReceiver.onStart

// SocketInputDStream.scala line 55
/**
 * Launches a daemon thread named "Socket Receiver" that runs `receive()`,
 * so this method returns without waiting for any data.
 */
def onStart(): Unit = {
  val receiverThread = new Thread("Socket Receiver") {
    override def run(): Unit = receive()
  }
  // Mark as daemon before starting, so it never keeps the JVM alive on its own.
  receiverThread.setDaemon(true)
  receiverThread.start()
}

receive()

// SocketInputDStream.scala line 69
/**
 * Create a socket connection and receive data until receiver is stopped.
 *
 * Connects to `host:port`, turns the socket's input stream into an iterator with
 * `bytesToObjects`, and hands every item to `store`. When the iterator runs dry while
 * the receiver has not been stopped, a restart is requested.
 *
 * The catch and finally bodies are omitted in this excerpt.
 */
def receive() {
  // Declared outside the try block so the (omitted) finally block can still see it.
  var socket: Socket = null
  try {
    logInfo("Connecting to " + host + ":" + port)
    socket = new Socket(host, port)
    logInfo("Connected to " + host + ":" + port)
    val iterator = bytesToObjects(socket.getInputStream())
    // Store items one at a time until the receiver is stopped or the stream is exhausted.
    while(!isStopped && iterator.hasNext) {
      store(iterator.next)
    }
    if (!isStopped()) {
      // The stream ended although nobody stopped the receiver: ask for a restart.
      restart("Socket data stream had no more data")
    } else {
      logInfo("Stopped receiving")
    }
  } catch {// ... exception handlers omitted in this excerpt
  } finally { // ... cleanup omitted in this excerpt
  }
}
// Receiver.scala line 113
/**
 * Stores one received item by forwarding it to the supervisor's `pushSingle`.
 * Single items are aggregated into data blocks before being pushed into
 * Spark's memory.
 *
 * @param dataItem the record that was just received
 */
def store(dataItem: T): Unit = supervisor.pushSingle(dataItem)

supervisor.pushSingle

// ReceiverSupervisorImpl.scala line 118
/**
 * Adds a single received record to the default block generator.
 *
 * @param data the record to add
 */
def pushSingle(data: Any): Unit = defaultBlockGenerator.addData(data)

至此,数据就已经接收到了,并且交给了BlockGenerator。

先别高兴,数据只是到了Receiver,而真正执行计算的可不是Receiver哦。那么数据是如何到下游的呢?

我们再回到之前抖的包袱里。

// BlockGenerator.scala line 118
// Starts the RecurringTimer; start() returns the first scheduled time (a Long), unused here.
blockIntervalTimer.start()

启动定时器。RecurringTimer 的相关源码如下:

/**
 * Timer that repeatedly invokes `callback` from its own daemon thread.
 *
 * Only part of the class is reproduced here: `nextTime`, `prevTime`, `stopped` and
 * `getStartTime()` are defined in the omitted sections.
 *
 * @param clock    clock used to wait until the next scheduled time
 * @param period   amount added to `nextTime` after every callback (presumably
 *                 milliseconds, since BlockGenerator passes `blockIntervalMs` — confirm)
 * @param callback function invoked with the scheduled time of each tick
 * @param name     label used in the thread name and in log messages
 */
private[streaming]
class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String)
  extends Logging {
  
    // Daemon thread whose body is the timer loop.
    private val thread = new Thread("RecurringTimer - " + name) {
      setDaemon(true)
      override def run() { loop } // run the loop method
    }
    
    // ... code omitted in this excerpt
    // RecurringTimer.scala line 66
    /**
     * Start at the earliest time it can start based on the period.
     *
     * @return the time of the first scheduled callback
     */
    def start(): Long = {
      start(getStartTime()) // delegate to start(startTime)
    }
    
    // RecurringTimer.scala line 56
    /**
     * Start at the given start time.
     *
     * @param startTime time at which the first callback should fire
     * @return the time of the first scheduled callback (equal to `startTime`)
     */
    def start(startTime: Long): Long = synchronized {
      nextTime = startTime
      thread.start()  // this is where the timer thread actually starts running
      logInfo("Started timer for " + name + " at time " + nextTime)
      nextTime
    }
    // ... code omitted in this excerpt
    // line 92
    /** Waits for `nextTime`, fires the callback once, then schedules the following tick. */
    private def triggerActionForNextInterval(): Unit = {
      clock.waitTillTime(nextTime)        // wait until the previously scheduled nextTime
      callback(nextTime)         // invoke the function passed to the constructor
      prevTime = nextTime
      nextTime += period
      logDebug("Callback for " + name + " called at time " + prevTime)
    }
    /**
     * Repeatedly call the callback every interval.
     */
    private def loop() {
      try {
        while (!stopped) {
          triggerActionForNextInterval()  // keep ticking for as long as the timer is not stopped
        }
        // One more tick after the loop exits, i.e. after `stopped` has been observed.
        triggerActionForNextInterval()
      } catch {
        // An interrupt ends the loop silently; nothing is logged or rethrown.
        case e: InterruptedException =>
      }
    }
    // ... code omitted in this excerpt
}

再看主构造中传入的代码块callback。callback = updateCurrentBuffer

// BlockGenerator.scala line 105
// Recurring timer that calls updateCurrentBuffer once every blockIntervalMs.
private val blockIntervalTimer = new RecurringTimer(
  clock,
  period = blockIntervalMs,
  callback = updateCurrentBuffer,
  name = "BlockGenerator")

updateCurrentBuffer方法定义

// BlockGenerator.scala line 231
/**
 * Timer callback: swaps out the buffer that single records are appended to and, when
 * the old buffer held data, queues it as a new block for pushing.
 *
 * @param time scheduled time of this tick; the block id is built from
 *             `time - blockIntervalMs`
 */
private def updateCurrentBuffer(time: Long): Unit = {
  try {
    // Swap the buffer under the lock; the possibly blocking enqueue happens outside it.
    val sealedBlock: Option[Block] = synchronized {
      if (currentBuffer.isEmpty) {
        None
      } else {
        val filledBuffer = currentBuffer
        currentBuffer = new ArrayBuffer[Any]
        val blockId = StreamBlockId(receiverId, time - blockIntervalMs)
        listener.onGenerateBlock(blockId)
        Some(new Block(blockId, filledBuffer)) // a new block wrapping the old buffer
      }
    }

    sealedBlock.foreach { block =>
      // put is blocking when queue is full; the block now waits in the to-be-pushed queue
      blocksForPushing.put(block)
    }
  } catch {
    case ie: InterruptedException =>
      logInfo("Block updating timer thread was interrupted")
    case e: Exception =>
      reportError("Error in block updating thread", e)
  }
}

至此,receiver接收到的数据已经被封装成Block,放入了待推送的队列(blocksForPushing)中。

那么,这些数据又是如何到executor的呢?

BlockGenerator.scala 中还有另一条启动的线程。

启动时调用 keepPushingBlocks方法。

// BlockGenerator.scala line 109
// Thread whose entire body is keepPushingBlocks().
private val blockPushingThread = new Thread() {
  override def run(): Unit = keepPushingBlocks()
}

// BlockGenerator.scala line 256
/**
 * Body of the block-pushing thread.
 *
 * While the state is not `StoppedGeneratingBlocks`, polls the to-be-pushed queue with a
 * 10 ms timeout and pushes whatever arrives. Once generation has stopped, drains and
 * pushes every block still left in the queue, then returns.
 */
private def keepPushingBlocks(): Unit = {
  logInfo("Started block pushing thread")

  // Reads the state under the lock so the check is consistent with state changes.
  def stillGenerating: Boolean = synchronized {
    state != StoppedGeneratingBlocks
  }

  try {
    // Phase 1: blocks are still being generated, so keep polling and pushing.
    while (stillGenerating) {
      // poll returns null on timeout; Option(...) turns that into None, which is skipped.
      val polled = Option(blocksForPushing.poll(10, TimeUnit.MILLISECONDS))
      polled.foreach(pushBlock)
    }

    // Phase 2: the loop above has exited, so the state is StoppedGeneratingBlocks.
    // Drain whatever is still queued.
    logInfo("Pushing out the last " + blocksForPushing.size() + " blocks")
    while (!blocksForPushing.isEmpty) {
      val pending = blocksForPushing.take()
      logDebug(s"Pushing block $pending")
      pushBlock(pending)
      logInfo("Blocks left to push " + blocksForPushing.size())
    }
    logInfo("Stopped block pushing thread")
  } catch {
    case ie: InterruptedException =>
      logInfo("Block pushing thread was interrupted")
    case e: Exception =>
      reportError("Error in block pushing thread", e)
  }
}

// BlockGenerator.scala line 256
  private def pushBlock(block: Block) {
  listener.onPushBlock(block.id, block.buffer)
  logInfo("Pushed block " + block.id)
}

listener.onPushBlock(block.id,block.buffer)

// ReceiverSupervisorImpl.scala line 108
/**
 * Listener callback for a block that is ready to be pushed: forwards the buffer to
 * `pushArrayBuffer` with no metadata and the given block id.
 */
def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]): Unit = {
  pushArrayBuffer(arrayBuffer, metadataOption = None, blockIdOption = Some(blockId))
}

pushArrayBuffer

// ReceiverSupervisorImpl.scala line 122
/**
 * Stores an ArrayBuffer of received data as one data block.
 *
 * @param arrayBuffer    the records making up the block
 * @param metadataOption optional metadata passed through unchanged
 * @param blockIdOption  optional block id passed through unchanged
 */
def pushArrayBuffer(
    arrayBuffer: ArrayBuffer[_],
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ): Unit = {
  // Wrap as ArrayBufferBlock so that the block handler can pattern-match on it later.
  val wrappedBlock = ArrayBufferBlock(arrayBuffer)
  pushAndReportBlock(wrappedBlock, metadataOption, blockIdOption)
}

pushAndReportBlock

// ReceiverSupervisorImpl.scala line 149
/**
 * Stores one received block through the block handler, then reports it to the tracker
 * endpoint with an `AddBlock` message.
 *
 * @param receivedBlock  the block to store
 * @param metadataOption optional metadata recorded in the block info
 * @param blockIdOption  id to store the block under; when absent, `nextBlockId` is used
 */
def pushAndReportBlock(
    receivedBlock: ReceivedBlock,
    metadataOption: Option[Any],
    blockIdOption: Option[StreamBlockId]
  ): Unit = {
  val blockId = blockIdOption match {
    case Some(suppliedId) => suppliedId
    case None => nextBlockId
  }

  // Step 1: store the block, timing the call for the debug log.
  val time = System.currentTimeMillis
  val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock)
  logDebug(s"Pushed block $blockId in ${(System.currentTimeMillis - time)} ms")

  // Step 2: describe the stored block and report it. The Boolean reply is not inspected.
  val blockInfo =
    ReceivedBlockInfo(streamId, blockStoreResult.numRecords, metadataOption, blockStoreResult)
  trackerEndpoint.askWithRetry[Boolean](AddBlock(blockInfo))
  logDebug(s"Reported block $blockId")
}

storeBlock(blockId, receivedBlock)

// BlockManagerBasedBlockHandler.scala line 70
/**
 * Stores a received block in the block manager and returns a result describing it.
 *
 * Only the `ArrayBufferBlock` case is shown; the other `ReceivedBlock` cases are omitted
 * in this excerpt.
 *
 * @param blockId id under which the block is stored
 * @param block   the received data to store
 * @return a `BlockManagerBasedStoreResult` carrying the block id and, when known, the
 *         number of records
 * Throws SparkException if the put result does not contain `blockId`.
 */
def storeBlock(blockId: StreamBlockId, block: ReceivedBlock): ReceivedBlockStoreResult = {

  // Record count; set by the ArrayBufferBlock case below.
  var numRecords = None: Option[Long]

  val putResult: Seq[(BlockId, BlockStatus)] = block match {
    case ArrayBufferBlock(arrayBuffer) =>
      numRecords = Some(arrayBuffer.size.toLong)
      blockManager.putIterator(blockId, arrayBuffer.iterator, storageLevel,
        tellMaster = true)
    // ... other ReceivedBlock cases omitted in this excerpt
  }
  // The put counts as successful only if the returned statuses mention this block id.
  if (!putResult.map { _._1 }.contains(blockId)) {
    throw new SparkException(
      s"Could not store $blockId to block manager with storage level $storageLevel")
  }
  BlockManagerBasedStoreResult(blockId, numRecords)
}

blockManager.putIterator

// BlockManager.scala line 638
/**
 * Puts a block whose contents are supplied as an iterator, by wrapping the iterator in
 * `IteratorValues` and delegating to `doPut`.
 *
 * @param blockId               id of the block to store
 * @param values                the block's contents; must not be null
 * @param level                 requested storage level
 * @param tellMaster            forwarded to `doPut` (defaults to true)
 * @param effectiveStorageLevel forwarded to `doPut` (defaults to None)
 * @return whatever `doPut` returns: a sequence of (block id, block status) pairs
 */
def putIterator(
    blockId: BlockId,
    values: Iterator[Any],
    level: StorageLevel,
    tellMaster: Boolean = true,
    effectiveStorageLevel: Option[StorageLevel] = None): Seq[(BlockId, BlockStatus)] = {
  require(values != null, "Values is null")
  val wrappedValues = IteratorValues(values)
  doPut(blockId, wrappedValues, level, tellMaster, effectiveStorageLevel)
}

doPut

// BlockManager.scala line 797
// Fragment of doPut's match: iterator-backed values are handed to blockStore.putIterator.
case IteratorValues(iterator) =>
  blockStore.putIterator(blockId, iterator, putLevel, returnValues)

至此,可见数据完全交由BlockManager掌管。同时数据接收也已经解析完。

那就下一步:接收到的数据是如何运行的?

感谢王家林老师的知识分享

王家林老师名片:

中国Spark第一人

新浪微博:http://weibo.com/ilovepains

微信公众号:DT_Spark

博客:http://blog.sina.com.cn/ilovepains

手机:18610086859

QQ:1740415547

邮箱:[email protected]

YY课堂:每天20:00免费现场授课频道68917580

王家林:DT大数据梦工厂创始人、Spark亚太研究院院长和首席专家、大数据培训专家、大数据架构师。

 

你可能感兴趣的:(spark 版本定制 第5课:基于案例一节课贯通Spark Streaming流计算框架运行源码10)