// Runs this task: deserializes the (RDD, ShuffleDependency) pair from taskBinary, obtains a
// ShuffleWriter from the shuffle manager, writes the partition's records through it, and yields
// the unwrapped result of writer.stop(success = true).
// NOTE(review): this excerpt appears to have lost its brace-only lines (the if/try/catch blocks
// and the method body are never closed here) -- restore them from upstream before compiling.
override def runTask(context: TaskContext): MapStatus = {
// Deserialize the RDD using the broadcast variable.
val deserializeStartTime = System.currentTimeMillis()
// Obtain a fresh instance of the closure serializer from the current SparkEnv.
val ser = SparkEnv.get.closureSerializer.newInstance()
// Deserialize the (RDD, ShuffleDependency) pair from the bytes held in taskBinary.value,
// resolving classes through the current thread's context class loader.
val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
// Record how long the deserialization took, in milliseconds.
_executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
metrics = Some(context.taskMetrics)
// Starts as null so the failure handler can tell whether a writer was ever obtained.
var writer: ShuffleWriter[Any, Any] = null
try {
val manager = SparkEnv.get.shuffleManager
// Ask the shuffle manager for the writer that serves this task's partition, using the
// dependency's shuffle handle.
writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
// Feed every record of this partition's iterator to the writer as key/value pairs.
writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
writer.stop(success = true).get
} catch {
case e: Exception =>
// On failure, try to stop the writer with success = false; a failure of that stop call is
// only logged at debug level.
try {
if (writer != null) {
writer.stop(success = false)
} catch {
case e: Exception =>
log.debug("Could not stop writer", e)
// NOTE(review): with the braces missing it is unclear from this excerpt which `e` is rethrown;
// presumably the original write failure -- confirm against upstream.
throw e
/**
 * Write a bunch of records to this task's output
 * 将一批记录写入此任务的输出
 * 主要处理两件事:
 * 1)判断是否需要进行聚合,比如同一个key的多条记录(如<hello, 1>和<hello, 1>)都要写入的话,
 *    那么先生成合并后的记录(如<hello, 2>),然后再进行后续的写入工作
 * 2)利用Partitioner来决定每条记录写入哪一个文件中.
 */
// Writes the given records to this task's shuffle output: when an aggregator is defined and
// map-side combine is enabled the records are first combined by key, then each record is handed
// to the writer of the bucket chosen by the partitioner.
// NOTE(review): both else branches visible here are empty and no closing braces remain;
// presumably the lines yielding `records` and the brace-only lines were lost from this
// excerpt -- confirm against upstream.
override def write(records: Iterator[Product2[K, V]]): Unit = {
val iter = if (dep.aggregator.isDefined) {
if (dep.mapSideCombine) {// Map-side combine is enabled: aggregate the records by key before writing.
dep.aggregator.get.combineValuesByKey(records, context)
} else {
} else {
// Without an aggregator, map-side combine must not be requested.
require(!dep.mapSideCombine, "Map-side combine without Aggregator specified!")
for (elem <- iter) {
val bucketId = dep.partitioner.getPartition(elem._1)// Ask the partitioner which bucket this record's key maps to.
// bucketId selects the writer; elem._1 is the key and elem._2 is the value.
shuffle.writers(bucketId).write(elem._1, elem._2)
* Manages assigning disk-based block writers to shuffle tasks. Each shuffle task gets one file
* per reducer (this set of files is called a ShuffleFileGroup).
* 负责为shuffle任务分配基于磁盘的块写入器,每个shuffle任务为每个reducer获得一个文件(这组文件称为ShuffleFileGroup)
* As an optimization to reduce the number of physical shuffle files produced, multiple shuffle
* blocks are aggregated into the same file. There is one "combined shuffle file" per reducer
* per concurrently executing shuffle task. As soon as a task finishes writing to its shuffle
* files, it releases them for another task.
* 作为减少物理shuffle文件数量的一种优化,多个shuffle块会被聚合到同一个文件中;每个并发执行的shuffle任务在每个reducer上对应一个“合并的shuffle文件”
* 一旦某个任务写完它的shuffle文件,就会释放这些文件供其他任务使用。
* Regarding the implementation of this feature, shuffle files are identified by a 3-tuple:
* 关于此功能的实现,shuffle文件由一个3元组标识:
* - shuffleId: The unique id given to the entire shuffle stage. (分配给整个shuffle阶段的唯一id)
* - bucketId: The id of the output partition (i.e., reducer id). (输出分区的id,即reducer id)
* - fileId: The unique id identifying a group of "combined shuffle files." Only one task at a
* time owns a particular fileId, and this id is returned to a pool when the task finishes.
* 识别一组“组合的shuffle文件”的唯一ID,一次只有一个任务拥有一个特定的fileId,当任务完成时,这个id返回给一个池
* Each shuffle file is then mapped to a FileSegment, which is a 3-tuple (file, offset, length)
* that specifies where in a given file the actual block data is located.
* 然后将每个shuffle文件映射到一个FileSegment,FileSegment是一个3元组(文件,偏移量,长度),用于指定实际块数据在给定文件中的位置
* Shuffle file metadata is stored in a space-efficient manner. Rather than simply mapping
* ShuffleBlockIds directly to FileSegments, each ShuffleFileGroup maintains a list of offsets for
* each block stored in each file. In order to find the location of a shuffle block, we search the
* files within a ShuffleFileGroups associated with the block's reducer.
* shuffle文件的元数据以节省空间的方式存储:不是简单地把ShuffleBlockId直接映射到FileSegment,
* 而是由每个ShuffleFileGroup为每个文件中存储的每个块维护一个偏移量列表;为了找到某个shuffle块的位置,
* 我们搜索与该块的reducer相关联的ShuffleFileGroup中的文件。
上面这个类的 forMapTask方法如下
* Get a ShuffleWriterGroup for the given map task, which will register it as complete
* when the writers are closed successfully
* 为给定的map任务获取一个ShuffleWriterGroup;当其中的writer全部成功关闭后,该任务会被登记为已完成
* mapId对应RDD的分区(partition)ID
def forMapTask(shuffleId: Int, mapId: Int, numBuckets: Int, serializer: Serializer,
writeMetrics: ShuffleWriteMetrics): ShuffleWriterGroup = {
new ShuffleWriterGroup {
shuffleStates.putIfAbsent(shuffleId, new ShuffleState(numBuckets))
private val shuffleState = shuffleStates(shuffleId)
private var fileGroup: ShuffleFileGroup = null
val openStartTime = System.nanoTime
val serializerInstance = serializer.newInstance()
val writers: Array[DiskBlockObjectWriter] = if (consolidateShuffleFiles) {
fileGroup = getUnusedFileGroup()//获取没有使用的FileGroup
Array.tabulate[DiskBlockObjectWriter](numBuckets) { bucketId =>
val blockId = ShuffleBlockId(shuffleId, mapId, bucketId)
blockManager.getDiskWriter(blockId, fileGroup(bucketId), serializerInstance, bufferSize,
} else {
Array.tabulate[DiskBlockObjectWriter](numBuckets) { bucketId =>
val blockId = ShuffleBlockId(shuffleId, mapId, bucketId)
val blockFile = blockManager.diskBlockManager.getFile(blockId)
val tmp = Utils.tempFileWith(blockFile)
blockManager.getDiskWriter(blockId, tmp, serializerInstance, bufferSize, writeMetrics)
// Creating the file to write to and creating a disk writer both involve interacting with
// the disk, so should be included in the shuffle write time.
writeMetrics.incShuffleWriteTime(System.nanoTime - openStartTime)
override def releaseWriters(success: Boolean) {
if (consolidateShuffleFiles) {
if (success) {
val offsets = writers.map(_.fileSegment().offset)
val lengths = writers.map(_.fileSegment().length)
fileGroup.recordMapOutput(mapId, offsets, lengths)
} else {