MapReduce源码解读之MapTask-output

写在前面

我们知道MapTask执行完之后,会调用context.write()方法将map的输出结果写入到磁盘中,然后Reducer从磁盘中拉取结果,那么在源码层面,这一过程到底是怎么执行的呢?前面我们分析了MapTask类的input部分代码,现在我们来分析MapTask类的run()方法中关于output的部分代码,看看shuffle过程是怎么执行的。

注:本人使用的版本为Hadoop2.6.5,所以源码分析也是基于Hadoop2.6.5,如果对源码感兴趣的话可以点击这里下载源码

1. 入口:MapTask 类

  /**
   * Entry point of the task: registers the progress phases, starts the
   * reporter, initializes the task and then either runs one of the auxiliary
   * setup/cleanup tasks or the mapper itself.
   *
   * @param job       job configuration for this task
   * @param umbilical protocol handle stored on the task and passed to every
   *                  step below
   * @throws IOException            declared by the steps invoked here
   * @throws ClassNotFoundException declared by the steps invoked here
   * @throws InterruptedException   declared by the steps invoked here
   */
  @Override
  public void run(final JobConf job, final TaskUmbilicalProtocol umbilical)
    throws IOException, ClassNotFoundException, InterruptedException {
    this.umbilical = umbilical;

    if (isMapTask()) {
      // Without reduce tasks the "map" phase is the whole progress (1.0).
      // Otherwise progress is split into "map" (0.667) and "sort" (0.333).
      final boolean mapOnly = conf.getNumReduceTasks() == 0;
      mapPhase = getProgress().addPhase("map", mapOnly ? 1.0f : 0.667f);
      if (!mapOnly) {
        sortPhase = getProgress().addPhase("sort", 0.333f);
      }
    }

    final TaskReporter reporter = startReporter(umbilical);
    final boolean newApi = job.getUseNewMapper();
    initialize(job, getJobID(), reporter, newApi);

    // Auxiliary tasks (job cleanup, job setup, task cleanup) are mutually
    // exclusive with the real map work: the first matching flag wins and
    // nothing else runs afterwards.
    if (jobCleanup) {
      runJobCleanupTask(umbilical, reporter);
    } else if (jobSetup) {
      runJobSetupTask(umbilical, reporter);
    } else if (taskCleanup) {
      runTaskCleanupTask(umbilical, reporter);
    } else {
      // The walkthrough follows the new-API path, i.e. runNewMapper().
      if (newApi) {
        runNewMapper(job, splitMetaInfo, umbilical, reporter);
      } else {
        runOldMapper(job, splitMetaInfo, umbilical, reporter);
      }
      done(umbilical, reporter);
    }
  }

2.进入runNewMapper()方法

// Excerpt of runNewMapper(): only the part that builds the mapper, the input
// reader and the output writer is shown; the method continues beyond this
// excerpt.
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
  void runNewMapper(final JobConf job,
                    final TaskSplitIndex splitIndex,
                    final TaskUmbilicalProtocol umbilical,
                    TaskReporter reporter
                    ) throws IOException, ClassNotFoundException,
                             InterruptedException {
    // Build the task-attempt context from the job configuration, the task id
    // and the reporter.
    // NOTE(review): the original annotation says the job already carries the
    // configuration downloaded from HDFS - not visible in this excerpt.
    org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
      new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, 
                                                                  getTaskID(),
                                                                  reporter);
    // Create the mapper instance; its class is obtained from the task context
    // (presumably the Mapper class the user configured on the job).
    org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
      (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
      
        // Instantiated reflectively from taskContext.getMapperClass().
        ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
        
    // Create the input format instance.
    org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
      (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
      
// Instantiated reflectively from taskContext.getInputFormatClass().
// NOTE(review): the original annotation says the default is TextInputFormat - confirm.
ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);

    /* Rebuild the InputSplit for this task from the split location and start
       offset carried by splitIndex.
       NOTE(review): per the original annotation, the split file describes the
       splits of all map tasks, and each task only loads its own entry -
       confirm against getSplitDetails(). */
    org.apache.hadoop.mapreduce.InputSplit split = null;
    split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
        splitIndex.getStartOffset());
    LOG.info("Processing split: " + split);

// Wrap split, inputFormat, reporter and taskContext into the RecordReader "input".
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
      new NewTrackingRecordReader<INKEY,INVALUE>
        (split, inputFormat, reporter, taskContext);
    job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
    // The record writer that receives the map output.
    org.apache.hadoop.mapreduce.RecordWriter output = null;
    
    // With zero reduce tasks the output goes through NewDirectOutputCollector
    // (no sorting collector is created on this path).
    if (job.getNumReduceTasks() == 0) {
      output = 
        new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
    } else {
    // With at least one reduce task the output goes through
    // NewOutputCollector; the walkthrough follows this path.
      output = new NewOutputCollector(taskContext, job, umbilical, reporter);
    }

3. 进入NewOutputCollector类

 private class NewOutputCollector<K,V>
    extends org.apache.hadoop.mapreduce.RecordWriter<K,V> {
    private final MapOutputCollector<K,V> collector;
    private final org.apache.hadoop.mapreduce.Partitioner<K,V> partitioner;
    private final int partitions;

    /**
     * Creates the sorting collector and chooses the partitioner for the map
     * output.
     *
     * @param jobContext supplies the number of reduce tasks and the
     *                   partitioner class
     * @param job        job configuration handed to the collector and to the
     *                   reflectively created partitioner
     * @param umbilical  not used by this constructor
     * @param reporter   passed on to the collector
     * @throws IOException            declared by createSortingCollector
     * @throws ClassNotFoundException declared by createSortingCollector and
     *                                getPartitionerClass
     */
    @SuppressWarnings("unchecked")
    NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
                       JobConf job,
                       TaskUmbilicalProtocol umbilical,
                       TaskReporter reporter
                       ) throws IOException, ClassNotFoundException {
      // The collector buffers, sorts and spills the records; it is examined
      // later in the walkthrough.
      collector = createSortingCollector(job, reporter);

      // One partition per reduce task.
      partitions = jobContext.getNumReduceTasks();

      if (partitions <= 1) {
        // No real partitioning is needed: every record gets the same
        // partition number, partitions - 1 (0 when there is one partition).
        partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
          @Override
          public int getPartition(K key, V value, int numPartitions) {
            return partitions - 1;
          }
        };
      } else {
        // Several partitions: instantiate the partitioner class reported by
        // the job context through reflection.
        partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
          ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
      }
    }

3. 进入getPartitionerClass()方法

/**
 * Looks up the partitioner class in the configuration.
 * Per the original annotation, this implementation belongs to JobContextImpl.
 *
 * @return the class stored under PARTITIONER_CLASS_ATTR, with
 *         HashPartitioner passed as the default value
 * @throws ClassNotFoundException declared by the method signature
 */
  public Class<? extends Partitioner<?,?>> getPartitionerClass() 
     throws ClassNotFoundException {
    // HashPartitioner is the default handed to conf.getClass(), so it is the
    // class to look at next when the user configured no partitioner.
    final Class<?> configured =
        conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
    return (Class<? extends Partitioner<?,?>>) configured;
  }

4. 进入HashPartitioner类

public class HashPartitioner<K, V> extends Partitioner<K, V> {

  /**
   * Use {@link Object#hashCode()} to partition.
   *
   * @param key            key whose hash code selects the partition
   * @param value          not used for partitioning
   * @param numReduceTasks number of partitions to spread the keys over
   * @return a partition number in the range [0, numReduceTasks)
   */
  public int getPartition(K key, V value,
                          int numReduceTasks) {
    // Masking with Integer.MAX_VALUE clears the sign bit, so the remainder
    // below can never be negative.
    final int nonNegativeHash = key.hashCode() & Integer.MAX_VALUE;

    // Keys with the same hash code always land in the same partition; with
    // numReduceTasks == 3, for example, the result is 0, 1 or 2.
    return nonNegativeHash % numReduceTasks;
  }

}

5. 进入用户实现的map()方法

/**
 * The user-written map method of the example: splits the input value into
 * whitespace-separated tokens and emits one (word, one) pair per token
 * through context.write().
 */
    // NOTE(review): the original annotation describes key as the byte offset
    // of the line's first byte within the source file; the key is not used
    // here - confirm against the configured input format.
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        for (StringTokenizer tokens = new StringTokenizer(value.toString());
             tokens.hasMoreTokens(); ) {
            word.set(tokens.nextToken());

            // context.write() is the next stop of the walkthrough.
            context.write(word, one);
        }
    }

6.进入context.write()方法

    /**
     * Hands one map output record to the collector together with its
     * partition number. Per the original annotation, this implementation
     * belongs to NewOutputCollector.
     *
     * @param key   output key
     * @param value output value
     * @throws IOException          declared by the method signature
     * @throws InterruptedException declared by the method signature
     */
    @Override
    public void write(K key, V value) throws IOException, InterruptedException {
      // Ask the partitioner which partition this record belongs to ...
      final int partition = partitioner.getPartition(key, value, partitions);
      // ... then pass the record to the collector created in the
      // NewOutputCollector constructor.
      collector.collect(key, value, partition);
    }

7. 回到NewOutputCollector()方法

/**
 * Constructor of NewOutputCollector, revisited (excerpt: only its first two
 * statements are shown here).
 */
    NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
                       JobConf job,
                       TaskUmbilicalProtocol umbilical,
                       TaskReporter reporter
                       ) throws IOException, ClassNotFoundException {
                      
      // The collector is produced by createSortingCollector(), examined next.
      collector = createSortingCollector(job, reporter);
      partitions = jobContext.getNumReduceTasks();

8.进入createSortingCollector()方法

  /**
   * Creates and initializes the map output collector. The candidate classes
   * are read from the job configuration (MapOutputBuffer is passed as the
   * default) and tried in order; the first one that can be instantiated and
   * initialized is returned.
   *
   * @param job      job configuration
   * @param reporter reporter handed to the collector context
   * @return the first successfully initialized collector
   * @throws IOException            if no candidate could be initialized
   * @throws ClassNotFoundException declared by the method signature
   */
  private <KEY, VALUE> MapOutputCollector<KEY, VALUE>
          createSortingCollector(JobConf job, TaskReporter reporter)
    throws IOException, ClassNotFoundException {

    // Context object handed to the collector's init() method.
    final MapOutputCollector.Context collectorContext =
      new MapOutputCollector.Context(this, job, reporter);

    // Candidate classes from MAP_OUTPUT_COLLECTOR_CLASS_ATTR, with
    // MapOutputBuffer as the default.
    final Class<?>[] candidates = job.getClasses(
      JobContext.MAP_OUTPUT_COLLECTOR_CLASS_ATTR, MapOutputBuffer.class);
    int untried = candidates.length;

    for (Class candidate : candidates) {
      try {
        if (!MapOutputCollector.class.isAssignableFrom(candidate)) {
          throw new IOException("Invalid output collector class: " + candidate.getName() +
            " (does not implement MapOutputCollector)");
        }

        // Narrow the candidate to a MapOutputCollector subtype.
        Class<? extends MapOutputCollector> collectorClass =
          candidate.asSubclass(MapOutputCollector.class);
        LOG.debug("Trying map output collector class: " + collectorClass.getName());

        // Instantiate through reflection, then initialize; init() is the
        // next stop of the walkthrough.
        MapOutputCollector<KEY, VALUE> created =
          ReflectionUtils.newInstance(collectorClass, job);
        created.init(collectorContext);
        LOG.info("Map output collector class = " + created.getClass().getName());
        return created;
      } catch (Exception e) {
        // Any failure, including the type check above, only disqualifies
        // this candidate; log it and move on to the next one.
        untried--;
        String warning = "Unable to initialize MapOutputCollector " + candidate.getName();
        if (untried > 0) {
          warning = warning + " (" + untried + " more collector(s) to try)";
        }
        LOG.warn(warning, e);
      }
    }
    throw new IOException("Unable to initialize any output collector");
  }

9. 进入init()方法(最重要)

    /**
     * Initializes the collector (the walkthrough's most important step):
     * reads the spill/sort settings, allocates the in-memory buffer, creates
     * the sorter, comparator, serializers, counters, optional codec and
     * optional combiner, then starts the spill thread and waits until it
     * reports that it is running.
     *
     * @param context supplies the job configuration, the reporter and the
     *                owning map task
     * @throws IOException if the spill percentage or sort buffer size is
     *                     invalid, or if the spill thread fails to start
     * @throws ClassNotFoundException declared by the method signature
     */
    public void init(MapOutputCollector.Context context
                    ) throws IOException, ClassNotFoundException {
      job = context.getJobConf();
      reporter = context.getReporter();
      mapTask = context.getMapTask();
      mapOutputFile = mapTask.getMapOutputFile();
      sortPhase = mapTask.getSortPhase();
      spilledRecordsCounter = reporter.getCounter(TaskCounter.SPILLED_RECORDS);
      partitions = job.getNumReduceTasks();
      rfs = ((LocalFileSystem)FileSystem.getLocal(job)).getRaw();

      // Spill threshold as a fraction of the buffer; defaults to 0.8.
      final float spillper =
        job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8);
      
      // Size of the sort buffer in MB; defaults to 100.
      final int sortmb = job.getInt(JobContext.IO_SORT_MB, 100);
      indexCacheMemoryLimit = job.getInt(JobContext.INDEX_CACHE_MEMORY_LIMIT,
                                         INDEX_CACHE_MEMORY_LIMIT_DEFAULT);
      // The spill fraction must lie in (0.0, 1.0].
      if (spillper > (float)1.0 || spillper <= (float)0.0) {
        throw new IOException("Invalid \"" + JobContext.MAP_SORT_SPILL_PERCENT +
            "\": " + spillper);
      }
      // Only values that fit in the low 11 bits (0..2047) are accepted.
      if ((sortmb & 0x7FF) != sortmb) {
        throw new IOException(
            "Invalid \"" + JobContext.IO_SORT_MB + "\": " + sortmb);
      }

      // Create the sorter through reflection; QuickSort is the default when
      // "map.sort.class" is not configured.
      sorter = ReflectionUtils.newInstance(job.getClass("map.sort.class",
            QuickSort.class, IndexedSorter.class), job);
      // buffers and accounting
      // Convert MB to bytes and round down to a multiple of METASIZE.
      int maxMemUsage = sortmb << 20;
      maxMemUsage -= maxMemUsage % METASIZE;
      kvbuffer = new byte[maxMemUsage];
      bufvoid = kvbuffer.length;
      // kvmeta is an int view over the same byte array as kvbuffer.
      kvmeta = ByteBuffer.wrap(kvbuffer)
         .order(ByteOrder.nativeOrder())
         .asIntBuffer();
      setEquator(0);
      bufstart = bufend = bufindex = equator;
      kvstart = kvend = kvindex;

      maxRec = kvmeta.capacity() / NMETA;

      // Soft limit = buffer length * spill fraction (80% by default).
      // NOTE(review): per the original annotation, a spill is triggered once
      // the buffered data exceeds this limit; the trigger itself is outside
      // this excerpt.
      softLimit = (int)(kvbuffer.length * spillper);
      bufferRemaining = softLimit;
      LOG.info(JobContext.IO_SORT_MB + ": " + sortmb);
      LOG.info("soft limit at " + softLimit);
      LOG.info("bufstart = " + bufstart + "; bufvoid = " + bufvoid);
      LOG.info("kvstart = " + kvstart + "; length = " + maxRec);

      // Comparator used to order the output keys.
      // NOTE(review): per the original annotation, a user-defined sort
      // comparator takes precedence and the key class's own comparator is
      // the fallback - confirm against JobConf.getOutputKeyComparator().
      comparator = job.getOutputKeyComparator();
      keyClass = (Class<K>)job.getMapOutputKeyClass();
      valClass = (Class<V>)job.getMapOutputValueClass();

      // Obtain serializers for the key and value classes and open both on bb.
      // NOTE(review): bb is defined outside this excerpt; presumably it is
      // the stream that writes into kvbuffer - confirm.
      serializationFactory = new SerializationFactory(job);
      keySerializer = serializationFactory.getSerializer(keyClass);
      keySerializer.open(bb);
      valSerializer = serializationFactory.getSerializer(valClass);
      valSerializer.open(bb);

      // output counters
      mapOutputByteCounter = reporter.getCounter(TaskCounter.MAP_OUTPUT_BYTES);
      mapOutputRecordCounter =
        reporter.getCounter(TaskCounter.MAP_OUTPUT_RECORDS);
      fileOutputByteCounter = reporter
          .getCounter(TaskCounter.MAP_OUTPUT_MATERIALIZED_BYTES);

      // compression
      if (job.getCompressMapOutput()) {
        Class<? extends CompressionCodec> codecClass =
          job.getMapOutputCompressorClass(DefaultCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, job);
      } else {
        codec = null;
      }

      // Create the combiner runner; it may be null (checked below).
      // Per the original annotation, a combiner is a map-side reduce that
      // cuts shuffle I/O: in a word count, 100,000 (hello,1) records in one
      // partition can be combined into a single (hello,100000) record.
      final Counters.Counter combineInputCounter =
        reporter.getCounter(TaskCounter.COMBINE_INPUT_RECORDS);
      combinerRunner = CombinerRunner.create(job, getTaskID(), 
                                             combineInputCounter,
                                             reporter, null);
      if (combinerRunner != null) {
        final Counters.Counter combineOutputCounter =
          reporter.getCounter(TaskCounter.COMBINE_OUTPUT_RECORDS);
        combineCollector= new CombineOutputCollector<K,V>(combineOutputCounter, reporter, job);
      } else {
        combineCollector = null;
      }
      spillInProgress = false;
      minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
      spillThread.setDaemon(true);
      spillThread.setName("SpillThread");
      spillLock.lock();
      try {

        // Start the spill thread (examined next) and wait on spillDone until
        // spillThreadRunning becomes true.
        spillThread.start();
        while (!spillThreadRunning) {
          spillDone.await();
        }
      } catch (InterruptedException e) {
        throw new IOException("Spill thread failed to initialize", e);
      } finally {
        spillLock.unlock();
      }
      // A failure recorded in sortSpillException is reported as a startup
      // failure.
      if (sortSpillException != null) {
        throw new IOException("Spill thread failed to initialize",
            sortSpillException);
      }
    }

10. 进入溢写线程spillThread

    /**
     * Background thread that waits until a spill is requested
     * (spillInProgress), runs sortAndSpill() and then resets the buffer
     * bookkeeping so the spilled region can be reused.
     */
    protected class SpillThread extends Thread {

      @Override
      public void run() {
        spillLock.lock();
        spillThreadRunning = true;
        try {
          while (true) {
            // Signal spillDone, then wait on spillReady until a spill has
            // been requested.
            spillDone.signal();
            while (!spillInProgress) {
              spillReady.await();
            }
            try {
              // The lock is released while the spill runs and re-acquired in
              // the finally block below.
              spillLock.unlock();

              // Sort first, then spill; sortAndSpill() is examined next.
              sortAndSpill();
            } catch (Throwable t) {
              // Record the failure in sortSpillException instead of letting
              // it end the thread.
              sortSpillException = t;
            } finally {
              spillLock.lock();
              if (bufend < bufstart) {
                bufvoid = kvbuffer.length;
              }
              // Advance the start markers to the end markers and clear the
              // in-progress flag.
              kvstart = kvend;
              bufstart = bufend;
              spillInProgress = false;
            }
          }
        } catch (InterruptedException e) {
          // Restore the interrupt status; the thread ends after the finally
          // block.
          Thread.currentThread().interrupt();
        } finally {
          spillLock.unlock();
          spillThreadRunning = false;
        }
      }
    }

11. 进入sortAndSpill()

注: 这里发生了溢写,详细过程可以参考MapReduce shuffle过程详解,我认为这篇文章写得非常详细,值得推荐。

/**
 * Sorts the buffered records and writes them, partition by partition, into a
 * new spill file, running them through the combiner when one is configured.
 * The index of the spill is either kept in memory or written to an index
 * file, depending on the index cache limit.
 *
 * @throws IOException            declared by the file and writer operations
 * @throws ClassNotFoundException declared by the method signature
 * @throws InterruptedException   declared by the method signature
 */
private void sortAndSpill() throws IOException, ClassNotFoundException,
                                       InterruptedException {
      //approximate the length of the output file to be the length of the
      //buffer + header lengths for the partitions
      final long size = distanceTo(bufstart, bufend, bufvoid) +
                  partitions * APPROX_HEADER_LENGTH;
      FSDataOutputStream out = null;
      try {
        // Create the spill file for this spill number.
        final SpillRecord spillRec = new SpillRecord(partitions);
        final Path filename =
            mapOutputFile.getSpillFileForWrite(numSpills, size);
        out = rfs.create(filename);

        // mstart is the index of the first record to spill.
        // NOTE(review): NMETA is defined outside this excerpt. The original
        // annotation gives it as 16, but that looks like METASIZE (bytes per
        // record); NMETA is presumably the number of ints per record - confirm.
        final int mstart = kvend / NMETA;
        // The metadata is addressed modulo its capacity (see the
        // "% maxRec" lookups below), so when kvstart < kvend the end index
        // is shifted by kvmeta.capacity().
        final int mend = 1 + // kvend is a valid record
          (kvstart >= kvend
          ? kvstart
          : kvmeta.capacity() + kvstart) / NMETA;
          
          // Sort the records in [mstart, mend).
        sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
        int spindex = mstart;
        final IndexRecord rec = new IndexRecord();
        final InMemValBytes value = new InMemValBytes();
        
        // Write the records one partition at a time.
        for (int i = 0; i < partitions; ++i) {
          IFile.Writer<K, V> writer = null;
          try {
            long segmentStart = out.getPos();
            FSDataOutputStream partitionOut = CryptoUtils.wrapIfNecessary(job, out);
            writer = new Writer<K, V>(job, partitionOut, keyClass, valClass, codec,
                                      spilledRecordsCounter);
            if (combinerRunner == null) {
            
              // No combiner: append each record of partition i directly.
              DataInputBuffer key = new DataInputBuffer();
              while (spindex < mend &&
                  kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
                final int kvoff = offsetFor(spindex % maxRec);
                int keystart = kvmeta.get(kvoff + KEYSTART);
                int valstart = kvmeta.get(kvoff + VALSTART);
                key.reset(kvbuffer, keystart, valstart - keystart);
                getVBytesForOffset(kvoff, value);
                writer.append(key, value);
                ++spindex;
              }
            } else {

              // Combiner present: find the run of records belonging to
              // partition i, then feed that run to the combiner.
              int spstart = spindex;
              while (spindex < mend &&
                  kvmeta.get(offsetFor(spindex % maxRec)
                            + PARTITION) == i) {
                ++spindex;
              }
             
              // Skip the combiner when the partition has no records.
              if (spstart != spindex) {
                combineCollector.setWriter(writer);
                RawKeyValueIterator kvIter =
                  new MRResultIterator(spstart, spindex);
                combinerRunner.combine(kvIter, combineCollector);
              }
            }

            // close the writer
            writer.close();

            // record offsets
            rec.startOffset = segmentStart;
            rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
            rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
            spillRec.putIndex(rec, i);

            // Null marks the writer as already closed for the finally block.
            writer = null;
          } finally {
            if (null != writer) writer.close();
          }
        }

        // Once the in-memory index cache has reached its limit, the index
        // goes to a file; otherwise it is kept in indexCacheList.
        if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
          // create spill index file
          Path indexFilename =
              mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
                  * MAP_OUTPUT_INDEX_RECORD_LENGTH);
          spillRec.writeToFile(indexFilename, job);
        } else {
          indexCacheList.add(spillRec);
          totalIndexCacheMemory +=
            spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
        }
        LOG.info("Finished spill " + numSpills);
        ++numSpills;
      } finally {
        if (out != null) out.close();
      }
    }

至此,output的工作已经完成。溢写完成之后,剩下的工作交给ReduceTask。

你可能感兴趣的:(MapReduce)