BlockManager运行在每个节点上(包括Driver和Executor),提供对本地或远端节点上的内存、磁盘及堆外内存中Block的管理。存储体系从狭义上来说指的就是BlockManager,从广义上来说,则包括整个Spark集群中的各个 BlockManager、BlockInfoManager、DiskBlockManager、DiskStore、MemoryManager、MemoryStore、对集群中的所有BlockManager进行管理的BlockManagerMaster及各个节点上对外提供Block上传与下载服务的BlockTransferService。
每个Driver或Executor在创建自身的SparkEnv时都会创建BlockManager,BlockManager只有在其initialize方法被调用手才能发挥作用
//org.apache.spark.storage.BlockManager
def initialize(appId: String): Unit = {
blockTransferService.init(this)
shuffleClient.init(appId)
blockManagerId = BlockManagerId(
executorId, blockTransferService.hostName, blockTransferService.port)
shuffleServerId = if (externalShuffleServiceEnabled) {
logInfo(s"external shuffle service port = $externalShuffleServicePort")
BlockManagerId(executorId, blockTransferService.hostName, externalShuffleServicePort)
} else {
blockManagerId
}
master.registerBlockManager(blockManagerId, maxMemory, slaveEndpoint)
// Register Executors' configuration with the local shuffle service, if one should exist.
if (externalShuffleServiceEnabled && !blockManagerId.isDriver) {
registerWithExternalShuffleServer()
}
}
2.1 reregister
用于向BlockManagerMaster重新注册BlockManager,并向BlockManagerMaster报告所有的Block信息
//org.apache.spark.storage.BlockManager
def reregister(): Unit = {
logInfo(s"BlockManager $blockManagerId re-registering with master")
master.registerBlockManager(blockManagerId, maxMemory, slaveEndpoint)
reportAllBlocks()
}
//org.apache.spark.storage.BlockManager
private def reportAllBlocks(): Unit = {
logInfo(s"Reporting ${blockInfoManager.size} blocks to the master.")
for ((blockId, info) <- blockInfoManager.entries) {
val status = getCurrentBlockStatus(blockId, info)
if (info.tellMaster && !tryToReportBlockStatus(blockId, status)) {
logError(s"Failed to report $blockId to master; giving up.")
return
}
}
}
//org.apache.spark.storage.BlockManager
private def getCurrentBlockStatus(blockId: BlockId, info: BlockInfo): BlockStatus = {
info.synchronized {
info.level match {
case null =>
BlockStatus.empty
case level =>
val inMem = level.useMemory && memoryStore.contains(blockId)
val onDisk = level.useDisk && diskStore.contains(blockId)
val deserialized = if (inMem) level.deserialized else false
val replication = if (inMem || onDisk) level.replication else 1
val storageLevel = StorageLevel(
useDisk = onDisk,
useMemory = inMem,
useOffHeap = level.useOffHeap,
deserialized = deserialized,
replication = replication)
val memSize = if (inMem) memoryStore.getSize(blockId) else 0L
val diskSize = if (onDisk) diskStore.getSize(blockId) else 0L
BlockStatus(storageLevel, memSize, diskSize)
}
}
}
private def tryToReportBlockStatus(
blockId: BlockId,
status: BlockStatus,
droppedMemorySize: Long = 0L): Boolean = {
val storageLevel = status.storageLevel
val inMemSize = Math.max(status.memSize, droppedMemorySize)
val onDiskSize = status.diskSize
master.updateBlockInfo(blockManagerId, blockId, storageLevel, inMemSize, onDiskSize)
}
根据上述代码可知,向BlockManagerMaster汇报Block的状态信息是通过调用BlockManagerMaster的updateBlockInfo方法完成的。BlockManagerMaster的updateBlockInfo方法将向BlockManagerMasterEndpoint发送UpdateBlockInfo消息。
2.2 getLocalBytes
用于存储体系获取BlockId所对应Block的数据,并封装为ChunkedByteBuffer后返回
def getLocalBytes(blockId: BlockId): Option[ChunkedByteBuffer] = {
logDebug(s"Getting local block $blockId as bytes")
if (blockId.isShuffle) {
val shuffleBlockResolver = shuffleManager.shuffleBlockResolver
Option(
new ChunkedByteBuffer(
shuffleBlockResolver.getBlockData(blockId.asInstanceOf[ShuffleBlockId]).nioByteBuffer()))
} else {
blockInfoManager.lockForReading(blockId).map { info => doGetLocalBytes(blockId, info) }
}
}
//org.apache.spark.storage.BlockManager
private def doGetLocalBytes(blockId: BlockId, info: BlockInfo): ChunkedByteBuffer = {
val level = info.level //获取Block的存储级别
logDebug(s"Level for block $blockId is $level")
if (level.deserialized) {//BLock没有被序列化,按照DiskStore、MemoryStore的顺序获取Block数据
if (level.useDisk && diskStore.contains(blockId)) {
diskStore.getBytes(blockId)
} else if (level.useMemory && memoryStore.contains(blockId)) {
serializerManager.dataSerializeWithExplicitClassTag(
blockId, memoryStore.getValues(blockId).get, info.classTag)
} else {
handleLocalReadFailure(blockId)
}
} else { // Block被序列化了,那么按照MemoryStore、DiskStore的顺序获取Block数据
if (level.useMemory && memoryStore.contains(blockId)) {
memoryStore.getBytes(blockId).get
} else if (level.useDisk && diskStore.contains(blockId)) {
val diskBytes = diskStore.getBytes(blockId)
maybeCacheDiskBytesInMemory(info, blockId, level, diskBytes).getOrElse(diskBytes)
} else {
handleLocalReadFailure(blockId)
}
}
}
doGetLocalBytes的执行步骤如下:
2.3 getBlockData
此方法用于获取本地Block的数据。
//org.apache.spark.storage.BlockManager
override def getBlockData(blockId: BlockId): ManagedBuffer = {
if (blockId.isShuffle) {
shuffleManager.shuffleBlockResolver.getBlockData(blockId.asInstanceOf[ShuffleBlockId])
} else {
getLocalBytes(blockId) match {
case Some(buffer) => new BlockManagerManagedBuffer(blockInfoManager, blockId, buffer)
case None =>
reportBlockStatus(blockId, BlockStatus.empty)
throw new BlockNotFoundException(blockId.toString)
}
}
}
//org.apache.spark.storage.BlockManager
private def reportBlockStatus(
blockId: BlockId,
status: BlockStatus,
droppedMemorySize: Long = 0L): Unit = {
val needReregister = !tryToReportBlockStatus(blockId, status, droppedMemorySize)
if (needReregister) {
logInfo(s"Got told to re-register updating block $blockId")
asyncReregister()
}
logDebug(s"Told master about block $blockId")
}
reportBlockStatus的执行步骤如下:
asyncReregister方法实际另起线程调用 reregister,来实现异步注册BlockManager。
private def asyncReregister(): Unit = {
asyncReregisterLock.synchronized {
if (asyncReregisterTask == null) {
asyncReregisterTask = Future[Unit] {
// This is a blocking action and should run in futureExecutionContext which is a cached
// thread pool
reregister()
asyncReregisterLock.synchronized {
asyncReregisterTask = null
}
}(futureExecutionContext)
}
}
}
2.4 putBytes
要介绍putBytes,需要首先介绍doPut。doPut用于执行Block的写入
private def doPut[T](
blockId: BlockId,
level: StorageLevel,
classTag: ClassTag[_],
tellMaster: Boolean,
keepReadLock: Boolean)(putBody: BlockInfo => Option[T]): Option[T] = {
require(blockId != null, "BlockId is null")
require(level != null && level.isValid, "StorageLevel is null or invalid")
val putBlockInfo = {
val newInfo = new BlockInfo(level, classTag, tellMaster)
if (blockInfoManager.lockNewBlockForWriting(blockId, newInfo)) {//获取Block的写锁
newInfo
} else {
logWarning(s"Block $blockId already exists on this machine; not re-adding it")
if (!keepReadLock) {
releaseLock(blockId)
}
return None
}
}
val startTimeMs = System.currentTimeMillis
var exceptionWasThrown: Boolean = true
val result: Option[T] = try {
val res = putBody(putBlockInfo) //执行Block写入
exceptionWasThrown = false
if (res.isEmpty) { //Block成功存储,执行锁降级或释放锁
if (keepReadLock) {
blockInfoManager.downgradeLock(blockId)
} else {
blockInfoManager.unlock(blockId)
}
} else { //Block存储失败,移除此BLock
removeBlockInternal(blockId, tellMaster = false)
logWarning(s"Putting block $blockId failed")
}
res
} finally {
if (exceptionWasThrown) {
logWarning(s"Putting block $blockId failed due to an exception")
removeBlockInternal(blockId, tellMaster = tellMaster)
addUpdatedBlockStatusToTaskMetrics(blockId, BlockStatus.empty)
}
}
if (level.replication > 1) {
logDebug("Putting block %s with replication took %s"
.format(blockId, Utils.getUsedTimeMs(startTimeMs)))
} else {
logDebug("Putting block %s without replication took %s"
.format(blockId, Utils.getUsedTimeMs(startTimeMs)))
}
result
}
上述代码中,doPut有一个函数参数putBody,putBody将执行真正的Block数据写入。doPut的执行步骤如下:
在doPut方法调用了removeBlockInternal方法来移除Block
private def removeBlockInternal(blockId: BlockId, tellMaster: Boolean): Unit = {
val removedFromMemory = memoryStore.remove(blockId)
val removedFromDisk = diskStore.remove(blockId)
if (!removedFromMemory && !removedFromDisk) {
logWarning(s"Block $blockId could not be removed as it was not found on disk or in memory")
}
blockInfoManager.removeBlock(blockId)
if (tellMaster) {
reportBlockStatus(blockId, BlockStatus.empty)
}
}
了解了doPut,现在来看看putBytes的实现。
def putBytes[T: ClassTag](
blockId: BlockId,
bytes: ChunkedByteBuffer,
level: StorageLevel,
tellMaster: Boolean = true): Boolean = {
require(bytes != null, "Bytes is null")
doPutBytes(blockId, bytes, level, implicitly[ClassTag[T]], tellMaster)
}
根据上述代码可知, putBytes实际调用的是doPutBytes方法
private def doPutBytes[T](
blockId: BlockId,
bytes: ChunkedByteBuffer,
level: StorageLevel,
classTag: ClassTag[T],
tellMaster: Boolean = true,
keepReadLock: Boolean = false): Boolean = {
doPut(blockId, level, classTag, tellMaster = tellMaster, keepReadLock = keepReadLock) { info =>
val startTimeMs = System.currentTimeMillis
val replicationFuture = if (level.replication > 1) {
Future {//创建异步线程,通过调用replicate方法复制Block数据到其它节点的存储体系中
replicate(blockId, bytes, level, classTag)
}(futureExecutionContext)
} else {
null
}
val size = bytes.size
if (level.useMemory) { //优先写入内存
val putSucceeded = if (level.deserialized) {
val values =
serializerManager.dataDeserializeStream(blockId, bytes.toInputStream())(classTag)
memoryStore.putIteratorAsValues(blockId, values, classTag) match {
case Right(_) => true
case Left(iter) =>
iter.close()
false
}
} else {
memoryStore.putBytes(blockId, size, level.memoryMode, () => bytes)
}
if (!putSucceeded && level.useDisk) { //内存不足,写入磁盘
logWarning(s"Persisting block $blockId to disk instead.")
diskStore.putBytes(blockId, bytes)
}
} else if (level.useDisk) { //不能使用内存时,写入磁盘
diskStore.putBytes(blockId, bytes)
}
val putBlockStatus = getCurrentBlockStatus(blockId, info)
val blockWasSuccessfullyStored = putBlockStatus.storageLevel.isValid
if (blockWasSuccessfullyStored) {
info.size = size
if (tellMaster && info.tellMaster) {
reportBlockStatus(blockId, putBlockStatus) //向BlockManagerMaster报告Block的状态
}
addUpdatedBlockStatusToTaskMetrics(blockId, putBlockStatus)
}
logDebug("Put block %s locally took %s".format(blockId, Utils.getUsedTimeMs(startTimeMs)))
if (level.replication > 1) {
//等待异步的复制线程完成
try {
Await.ready(replicationFuture, Duration.Inf)
} catch {
case NonFatal(t) =>
throw new Exception("Error occurred while waiting for replication to finish", t)
}
}
if (blockWasSuccessfullyStored) {
None
} else {
Some(bytes)
}
}.isEmpty
}
根据doPutBytes的实现,其首先定义了偏函数,这个偏函数将作为doPut的putBody参数,然后调用doPut方法,doPut方法将调用此偏函数,偏函数写入数据的步骤如下:
2.5 putBlockData
用于将Block数据写入本地
override def putBlockData(
blockId: BlockId,
data: ManagedBuffer,
level: StorageLevel,
classTag: ClassTag[_]): Boolean = {
putBytes(blockId, new ChunkedByteBuffer(data.nioByteBuffer()), level)(classTag)
}
2.6 getStatus
用于获取Block的状态
def getStatus(blockId: BlockId): Option[BlockStatus] = {
blockInfoManager.get(blockId).map { info =>
val memSize = if (memoryStore.contains(blockId)) memoryStore.getSize(blockId) else 0L
val diskSize = if (diskStore.contains(blockId)) diskStore.getSize(blockId) else 0L
BlockStatus(info.level, memSize = memSize, diskSize = diskSize)
}
}
2.7 getMatchingBlockIds
用于获取匹配过滤器条件的BlockId 的序列
def getMatchingBlockIds(filter: BlockId => Boolean): Seq[BlockId] = {
(blockInfoManager.entries.map(_._1) ++ diskBlockManager.getAllBlocks())
.filter(filter)
.toArray
.toSeq
}
代码中除了从BlockInfoManager的entries缓存中获取BlockId外,还需要从DiskBlockManager中获取,这是因为DiskBlockManager中可能存在BlockInfoManager不知道的Block
2.8 getLocalValues
用于从本地的BlockManager中获取Block数据
def getLocalValues(blockId: BlockId): Option[BlockResult] = {
logDebug(s"Getting local block $blockId")
blockInfoManager.lockForReading(blockId) match {
case None =>
logDebug(s"Block $blockId was not found")
None
case Some(info) =>
val level = info.level
logDebug(s"Level for block $blockId is $level")
if (level.useMemory && memoryStore.contains(blockId)) {
//优先从MemoryStore中读取Block数据
val iter: Iterator[Any] = if (level.deserialized) {
memoryStore.getValues(blockId).get
} else {
serializerManager.dataDeserializeStream(
blockId, memoryStore.getBytes(blockId).get.toInputStream())(info.classTag)
}
val ci = CompletionIterator[Any, Iterator[Any]](iter, releaseLock(blockId))
Some(new BlockResult(ci, DataReadMethod.Memory, info.size))
} else if (level.useDisk && diskStore.contains(blockId)) {
//从DiskStore中读取Block数据
val iterToReturn: Iterator[Any] = {
val diskBytes = diskStore.getBytes(blockId)
if (level.deserialized) {
val diskValues = serializerManager.dataDeserializeStream(
blockId,
diskBytes.toInputStream(dispose = true))(info.classTag)
maybeCacheDiskValuesInMemory(info, blockId, level, diskValues)
} else {
val stream = maybeCacheDiskBytesInMemory(info, blockId, level, diskBytes)
.map {_.toInputStream(dispose = false)}
.getOrElse { diskBytes.toInputStream(dispose = true) }
serializerManager.dataDeserializeStream(blockId, stream)(info.classTag)
}
}
val ci = CompletionIterator[Any, Iterator[Any]](iterToReturn, releaseLock(blockId))
Some(new BlockResult(ci, DataReadMethod.Disk, info.size))
} else {
handleLocalReadFailure(blockId)
}
}
}
2.9 getRemoteBytes
getRemoteBytes方法的作用为从远端的BlockManager以序列化的字节形式获取Block数据。但在此之前,首先介绍获取Block位置信息的方法getLocations
private def getLocations(blockId: BlockId): Seq[BlockManagerId] = {
val locs = Random.shuffle(master.getLocations(blockId))
val (preferredLocs, otherLocs) = locs.partition { loc => blockManagerId.host == loc.host }
preferredLocs ++ otherLocs
}
其执行步骤如下:
有了对getLocations方法的了解,现在来看看getRemoteBytes的实现:
def getRemoteBytes(blockId: BlockId): Option[ChunkedByteBuffer] = {
logDebug(s"Getting remote block $blockId")
require(blockId != null, "BlockId is null")
var runningFailureCount = 0
var totalFailureCount = 0
val locations = getLocations(blockId)
val maxFetchFailures = locations.size
var locationIterator = locations.iterator
while (locationIterator.hasNext) {
val loc = locationIterator.next()
logDebug(s"Getting remote block $blockId from $loc")
val data = try { //以同步方式从远端下载Block
blockTransferService.fetchBlockSync(
loc.host, loc.port, loc.executorId, blockId.toString).nioByteBuffer()
} catch {
case NonFatal(e) =>
runningFailureCount += 1
totalFailureCount += 1
if (totalFailureCount >= maxFetchFailures) { //没能下载成功
logWarning(s"Failed to fetch block after $totalFailureCount fetch failures. " +
s"Most recent failure cause:", e)
return None
}
logWarning(s"Failed to fetch remote block $blockId " +
s"from $loc (failed attempt $runningFailureCount)", e)
//刷新Block所在的所有位置信息
if (runningFailureCount >= maxFailuresBeforeLocationRefresh) {
locationIterator = getLocations(blockId).iterator
logDebug(s"Refreshed locations from the driver " +
s"after ${runningFailureCount} fetch failures.")
runningFailureCount = 0
}
null
}
if (data != null) {
return Some(new ChunkedByteBuffer(data))
}
logDebug(s"The value of block $blockId is null")
}
logDebug(s"Block $blockId not found")
None
}
2.10 get
用于优先从本地获取Block数据,当本地获取不到所需的Block数据,再从远端获取Block数据
def get[T: ClassTag](blockId: BlockId): Option[BlockResult] = {
val local = getLocalValues(blockId)
if (local.isDefined) {
logInfo(s"Found block $blockId locally")
return local
}
val remote = getRemoteValues[T](blockId)
if (remote.isDefined) {
logInfo(s"Found block $blockId remotely")
return remote
}
None
}
2.11 downgradeLock
将当前线程持有的Block的写锁降级为读锁
def downgradeLock(blockId: BlockId): Unit = {
blockInfoManager.downgradeLock(blockId)
}
实际代理了BlockInfoManager的downgradeLock方法
2.12 releaseLock
用于当前线程对持有的Block的锁进行释放
def releaseLock(blockId: BlockId): Unit = {
blockInfoManager.unlock(blockId)
}
实际调用了BlockInfoManager的unlock方法
2.13 registerTask
用于将任务尝试线程注册到BlockInfoManager
def registerTask(taskAttemptId: Long): Unit = {
blockInfoManager.registerTask(taskAttemptId)
}
实际代理了BlockInfoManager的registerTask方法
2.14 releaseAllLocksForTask
用于任务尝试线程对持有的所有Block的锁进行释放
def releaseAllLocksForTask(taskAttemptId: Long): Seq[BlockId] = {
blockInfoManager.releaseAllLocksForTask(taskAttemptId)
}
2.15 getOrElseUpdate
用于获取Block。如果Block存在,则获取此Block并返回BlockResult,否则调用makeIterator方法计算Block,并持久化后返回BlockResult或Iterator
def getOrElseUpdate[T](
blockId: BlockId,
level: StorageLevel,
classTag: ClassTag[T],
makeIterator: () => Iterator[T]): Either[BlockResult, Iterator[T]] = {
get[T](blockId)(classTag) match { //从本地或远端的BlockManager获取Block
case Some(block) =>
return Left(block)
case _ =>
}
doPutIterator(blockId, makeIterator, level, classTag, keepReadLock = true) match {
case None => //Block已经成功存储到内存
val blockResult = getLocalValues(blockId).getOrElse {
releaseLock(blockId)
throw new SparkException(s"get() failed for block $blockId even though we held a lock")
}
releaseLock(blockId)
Left(blockResult)
case Some(iter) => //Block存储到内存时发生了错误
Right(iter)
}
}
执行步骤如下:
2.16 putIterator
此方法用于将Block数据写入存储体系
def putIterator[T: ClassTag](
blockId: BlockId,
values: Iterator[T],
level: StorageLevel,
tellMaster: Boolean = true): Boolean = {
require(values != null, "Values is null")
doPutIterator(blockId, () => values, level, implicitly[ClassTag[T]], tellMaster) match {
case None =>
true
case Some(iter) =>
iter.close()
false
}
}
putIterator内部实际也调用了doPutIterator方法,当doPutIterator返回None,说明计算得到的Block已经成功存储到内存,因此再次读取此Block。doPutIterator方法的返回结果匹配Some,则说明计算得到的Block存储到内存时发生了错误。
2.17 getDiskWriter
用于创建并获取DiskBlockObjectWriter,通过DiskBlockObjectWriter可以跳过对DiskStore的使用,直接将数据写入磁盘
def getDiskWriter(
blockId: BlockId,
file: File,
serializerInstance: SerializerInstance,
bufferSize: Int,
writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = {
val compressStream: OutputStream => OutputStream =
serializerManager.wrapForCompression(blockId, _)
val syncWrites = conf.getBoolean("spark.shuffle.sync", false)
new DiskBlockObjectWriter(file, serializerInstance, bufferSize, compressStream,
syncWrites, writeMetrics, blockId)
}
属性spark.shuffle.sync将决定DiskBlockObjectWrite把数据写入磁盘时是采用同步方式还是异步方式,默认是异步方式。
2.18 dropFromMemory
用于从内存中删除Block,当Block的存储级别允许写入磁盘,Block将被写入磁盘。此方法主要在内存不足,需要从内存腾出空闲空间时使用。
private[storage] override def dropFromMemory[T: ClassTag](
blockId: BlockId,
data: () => Either[Array[T], ChunkedByteBuffer]): StorageLevel = {
logInfo(s"Dropping block $blockId from memory")
//确认当前任务尝试线程是否已经持有BlockId对应的写锁
val info = blockInfoManager.assertBlockIsLockedForWriting(blockId)
var blockIsUpdated = false
val level = info.level
// 将Block写入磁盘
if (level.useDisk && !diskStore.contains(blockId)) {
logInfo(s"Writing block $blockId to disk")
data() match {
case Left(elements) =>
diskStore.put(blockId) { fileOutputStream =>
serializerManager.dataSerializeStream(
blockId,
fileOutputStream,
elements.toIterator)(info.classTag.asInstanceOf[ClassTag[T]])
}
case Right(bytes) =>
diskStore.putBytes(blockId, bytes)
}
blockIsUpdated = true
}
// 将内存中的Block删除
val droppedMemorySize =
if (memoryStore.contains(blockId)) memoryStore.getSize(blockId) else 0L
val blockIsRemoved = memoryStore.remove(blockId)
if (blockIsRemoved) {
blockIsUpdated = true
} else {
logWarning(s"Block $blockId could not be dropped from memory as it does not exist")
}
val status = getCurrentBlockStatus(blockId, info)
if (info.tellMaster) {
reportBlockStatus(blockId, status, droppedMemorySize) //向BlockManagerMaster报告Block状态
}
if (blockIsUpdated) {
addUpdatedBlockStatusToTaskMetrics(blockId, status) //更新任务度量信息
}
status.storageLevel //返回Block的存储级别
}
执行步骤如下:
2.19 removeRdd
移除属于指定RDD的所有Block
def removeRdd(rddId: Int): Int = {
// TODO: Avoid a linear scan by creating another mapping of RDD.id to blocks.
logInfo(s"Removing RDD $rddId")
val blocksToRemove = blockInfoManager.entries.flatMap(_._1.asRDDId).filter(_.rddId == rddId)
blocksToRemove.foreach { blockId => removeBlock(blockId, tellMaster = false) }
blocksToRemove.size
}
执行步骤:
2.20 removeBroadcast
移除属于指定Broadcast的所有Block
def removeBroadcast(broadcastId: Long, tellMaster: Boolean): Int = {
logDebug(s"Removing broadcast $broadcastId")
val blocksToRemove = blockInfoManager.entries.map(_._1).collect {
case bid @ BroadcastBlockId(`broadcastId`, _) => bid
}
blocksToRemove.foreach { blockId => removeBlock(blockId, tellMaster) }
blocksToRemove.size
}