The previous short post showed the simplest possible MapReduce program (https://blog.csdn.net/qq_26437925/article/details/78458471). This post follows the source code to analyze the Job submission and MapReduce execution process, to build a deeper understanding of the code.
/**
 * Internal method for submitting jobs to the system.
 *
 * The job submission process involves:
 *   1. Checking the input and output specifications of the job.
 *   2. Computing the {@link InputSplit}s for the job.
 *   3. Setup the requisite accounting information for the
 *      {@link DistributedCache} of the job, if necessary.
 *   4. Copying the job's jar and configuration to the map-reduce system
 *      directory on the distributed file-system.
 *   5. Submitting the job to the JobTracker and optionally
 *      monitoring it's status.
 *
 * @param job the configuration to submit
 * @param cluster the handle to the Cluster
 * @throws ClassNotFoundException
 * @throws InterruptedException
 * @throws IOException
 */
JobStatus submitJobInternal(Job job, Cluster cluster)
// Upload the job-related files, configuration, and jars
copyAndConfigureFiles(job, submitJobDir);
// Get the path of the job configuration file (job.xml)
Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);
// Compute the splits of the input files and write the split info into the job submission directory
// Create the splits for the job
LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
int maps = writeSplits(job, submitJobDir);
conf.setInt(MRJobConfig.NUM_MAPS, maps);
LOG.info("number of splits:" + maps);
int maxMaps = conf.getInt(MRJobConfig.JOB_MAX_MAP,
MRJobConfig.DEFAULT_JOB_MAX_MAP);
if (maxMaps >= 0 && maxMaps < maps) {
throw new IllegalArgumentException("The number of map tasks " + maps +
" exceeded limit " + maxMaps);
}
// Set the resource queue the job is submitted to
// write "queue admins of the queue to which job is being submitted"
// to job file.
String queue = conf.get(MRJobConfig.QUEUE_NAME,
JobConf.DEFAULT_QUEUE_NAME);
AccessControlList acl = submitClient.getQueueAdmins(queue);
conf.set(toFullPropertyName(queue,
QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());
// removing jobtoken referrals before copying the jobconf to HDFS
// as the tasks don't need this setting, actually they may break
// because of it if present as the referral will point to a
// different job.
TokenCache.cleanUpTokenReferral(conf);
if (conf.getBoolean(
MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,
MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {
// Add HDFS tracking ids
ArrayList<String> trackingIds = new ArrayList<String>();
for (Token<? extends TokenIdentifier> t :
job.getCredentials().getAllTokens()) {
trackingIds.add(t.decodeIdentifier().getTrackingId());
}
conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,
trackingIds.toArray(new String[trackingIds.size()]));
}
// Set reservation info if it exists
ReservationId reservationId = job.getReservationId();
if (reservationId != null) {
conf.set(MRJobConfig.RESERVATION_ID, reservationId.toString());
}
// Write job file to submit dir
writeConf(conf, submitJobFile);
// submitClient.submitJob actually submits the job
//
// Now, actually submit the job (using the submit name)
//
printTokens(jobId, job.getCredentials());
status = submitClient.submitJob(
jobId, submitJobDir.toString(), job.getCredentials());
if (status != null) {
return status;
} else {
throw new IOException("Could not launch job");
}
} finally {
if (status == null) {
LOG.info("Cleaning up the staging area " + submitJobDir);
if (jtFs != null && submitJobDir != null)
jtFs.delete(submitJobDir, true);
}
}
@SuppressWarnings("unchecked")
private <T extends InputSplit>
int writeNewSplits(JobContext job, Path jobSubmitDir) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = job.getConfiguration();
// Get the input format; the default is TextInputFormat
InputFormat<?, ?> input =
ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
// Compute the splits
List<InputSplit> splits = input.getSplits(job);
T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
// Sort so that the biggest splits are processed first
// sort the splits into order based on size, so that the biggest
// go first
Arrays.sort(array, new SplitComparator());
// Write the split file info to the submit directory
JobSplitWriter.createSplitFiles(jobSubmitDir, conf,
jobSubmitDir.getFileSystem(conf), array);
return array.length;
}
The number of mappers can be controlled by tuning the block size and the split min/max size parameters. (Note: even if the program explicitly sets mapred.map.tasks or mapreduce.job.maps on the conf, it will not necessarily get that number of mappers.)
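As a rough sketch of that tuning (not from the original post: the helper class name and the 64 MB bound are invented for illustration; the FileInputFormat setters are the standard new-API ones), the split bounds can be set per job like this:
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitSizeTuning {
    // Lower the max split size below the block size so a single block yields several splits,
    // which increases the number of map tasks; raising the min size has the opposite effect.
    public static void configureSplits(Job job) {
        // mapreduce.input.fileinputformat.split.minsize
        FileInputFormat.setMinInputSplitSize(job, 1L);
        // mapreduce.input.fileinputformat.split.maxsize (64 MB here, an arbitrary example)
        FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);
    }
}
With splitSize = max(minSize, min(maxSize, blockSize)), capping maxSize below the block size increases the split count (and hence the mapper count), while pushing minSize above the block size decreases it.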
The split computation itself is done in FileInputFormat:
/**
* Generate the list of files and make them into FileSplits.
* @param job the job context
* @throws IOException
*/
public List<InputSplit> getSplits(JobContext job) throws IOException {
StopWatch sw = new StopWatch().start();
// The minimum and maximum split sizes, both configurable
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
long maxSize = getMaxSplitSize(job);
// List the job's input files and generate splits from them
// generate splits
List<InputSplit> splits = new ArrayList<InputSplit>();
List<FileStatus> files = listStatus(job);
boolean ignoreDirs = !getInputDirRecursive(job)
&& job.getConfiguration().getBoolean(INPUT_DIR_NONRECURSIVE_IGNORE_SUBDIRS, false);
for (FileStatus file: files) {
if (ignoreDirs && file.isDirectory()) {
continue;
}
Path path = file.getPath();
long length = file.getLen();
if (length != 0) {
// Get all the block locations of this input file
BlockLocation[] blkLocations;
if (file instanceof LocatedFileStatus) {
blkLocations = ((LocatedFileStatus) file).getBlockLocations();
} else {
FileSystem fs = path.getFileSystem(job.getConfiguration());
blkLocations = fs.getFileBlockLocations(file, 0, length);
}
if (isSplitable(job, path)) {
// Compute splitSize: Math.max(minSize, Math.min(maxSize, blockSize))
long blockSize = file.getBlockSize();
long splitSize = computeSplitSize(blockSize, minSize, maxSize);
long bytesRemaining = length;
while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(makeSplit(path, length-bytesRemaining, splitSize,
blkLocations[blkIndex].getHosts(),
blkLocations[blkIndex].getCachedHosts()));
bytesRemaining -= splitSize;
}
if (bytesRemaining != 0) {
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
blkLocations[blkIndex].getHosts(),
blkLocations[blkIndex].getCachedHosts()));
}
} else { // not splitable
if (LOG.isDebugEnabled()) {
// Log only if the file is big enough to be splitted
if (length > Math.min(file.getBlockSize(), minSize)) {
LOG.debug("File is not splittable so no parallelization "
+ "is possible: " + file.getPath());
}
}
splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
blkLocations[0].getCachedHosts()));
}
} else {
//Create empty hosts array for zero length files
splits.add(makeSplit(path, 0, length, new String[0]));
}
}
// Save the number of input files for metrics/loadgen
job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
sw.stop();
if (LOG.isDebugEnabled()) {
LOG.debug("Total # of splits generated by getSplits: " + splits.size()
+ ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
}
return splits;
}
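To make the split arithmetic concrete, here is a small self-contained sketch of the same computation (the constants mirror FileInputFormat's logic; the 300 MB file length is an invented example):
public class SplitMath {
    private static final double SPLIT_SLOP = 1.1; // same slack factor FileInputFormat uses

    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    public static void main(String[] args) {
        long blockSize = 128L * 1024 * 1024; // 128 MB block
        long minSize = 1L;
        long maxSize = Long.MAX_VALUE;
        long length = 300L * 1024 * 1024;    // a hypothetical 300 MB file

        long splitSize = computeSplitSize(blockSize, minSize, maxSize); // 128 MB
        long bytesRemaining = length;
        int splits = 0;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            bytesRemaining -= splitSize;
            splits++;
        }
        if (bytesRemaining != 0) {
            splits++;
        }
        // 300 MB with 128 MB splits => 128 + 128 + a 44 MB tail: 3 splits
        System.out.println("splitSize=" + splitSize + ", splits=" + splits);
    }
}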
// Compute the splits of the input files and write the split info into the job submission directory
// Create the splits for the job
LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
int maps = writeSplits(job, submitJobDir);
conf.setInt(MRJobConfig.NUM_MAPS, maps);
LOG.info("number of splits:" + maps);
int maxMaps = conf.getInt(MRJobConfig.JOB_MAX_MAP,
MRJobConfig.DEFAULT_JOB_MAX_MAP);
if (maxMaps >= 0 && maxMaps < maps) {
throw new IllegalArgumentException("The number of map tasks " + maps +
" exceeded limit " + maxMaps);
}
The number of splits determines the number of Map Tasks, because each split is handed to one Map Task.
A user's MapReduce program is submitted to the JobTracker through the Client, and the user can query the job's running state through interfaces the Client provides. Inside Hadoop, a MapReduce program is represented as a "job" (Job). One MapReduce program may correspond to several jobs, and each job is decomposed into a number of Map/Reduce tasks (Task).
The JobTracker is mainly responsible for resource monitoring and job scheduling. It monitors the health of all TaskTrackers and jobs, and once it detects a failure it moves the affected tasks to other nodes. It also tracks task progress and resource usage and passes that information to the task scheduler, which assigns idle resources to suitable tasks. In Hadoop the task scheduler is a pluggable module, so users can plug in a scheduler of their own design.
The TaskTracker periodically reports the resource usage and task progress of its node to the JobTracker via heartbeats, and receives and executes commands sent back by the JobTracker (start a new task, kill a task, and so on). The TaskTracker divides its node's resources into equal units called "slots". A slot represents computing resources (CPU, memory, etc.); a Task can run only after it has been granted a slot, and the Hadoop scheduler's job is to assign the idle slots on the TaskTrackers to Tasks. Slots come in two kinds, Map slots and Reduce slots, used by Map Tasks and Reduce Tasks respectively, and the (configurable) slot count limits the task concurrency on a node.
Tasks are either Map Tasks or Reduce Tasks, and both are started by the TaskTracker. As the previous section showed, HDFS stores data in fixed-size blocks, whereas the unit of processing in MapReduce is the split.
MapTask's run method and its overall flow:
@Override
public void run(final JobConf job, final TaskUmbilicalProtocol umbilical)
throws IOException, ClassNotFoundException, InterruptedException {
this.umbilical = umbilical;
if (isMapTask()) {
// If there are no reducers then there won't be any sort. Hence the map
// phase will govern the entire attempt's progress.
if (conf.getNumReduceTasks() == 0) {
mapPhase = getProgress().addPhase("map", 1.0f);
} else {
// If there are reducers then the entire attempt's progress will be
// split between the map phase (67%) and the sort phase (33%).
mapPhase = getProgress().addPhase("map", 0.667f);
sortPhase = getProgress().addPhase("sort", 0.333f);
}
}
TaskReporter reporter = startReporter(umbilical);
boolean useNewApi = job.getUseNewMapper();
initialize(job, getJobID(), reporter, useNewApi);
// check if it is a cleanupJobTask
if (jobCleanup) {
runJobCleanupTask(umbilical, reporter);
return;
}
if (jobSetup) {
runJobSetupTask(umbilical, reporter);
return;
}
if (taskCleanup) {
runTaskCleanupTask(umbilical, reporter);
return;
}
if (useNewApi) {
runNewMapper(job, splitMetaInfo, umbilical, reporter);
} else {
runOldMapper(job, splitMetaInfo, umbilical, reporter);
}
done(umbilical, reporter);
}
When there are reducers, the map phase accounts for 66.7% of the attempt's progress and the sort phase for the remaining 33.3%.
(Note: the point of sorting here is to make the later reduce-side reads cheaper and cut down the number of IO operations.)
@SuppressWarnings("unchecked")
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
final TaskSplitIndex splitIndex,
final TaskUmbilicalProtocol umbilical,
TaskReporter reporter
) throws IOException, ClassNotFoundException,
InterruptedException {
// make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
getTaskID(),
reporter);
// Construct the mapper by reflection from the user-written Mapper class
// Each mapper handles one split; the job holds the full array of split info
// make a mapper
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
(org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
// make the input format
org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
(org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
// rebuild the input split
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
splitIndex.getStartOffset());
LOG.info("Processing split: " + split);
// Use the split info to read the underlying block(s) and get the raw records one by one
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
new NewTrackingRecordReader<INKEY,INVALUE>
(split, inputFormat, reporter, taskContext);
job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
org.apache.hadoop.mapreduce.RecordWriter output = null;
// get an output object
if (job.getNumReduceTasks() == 0) {
output =
new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
} else {
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
}
org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
mapContext =
new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
input, output,
committer,
reporter, split);
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
mapperContext =
new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
mapContext);
try {
// initialize the input (the split)
input.initialize(split, mapperContext);
// the map phase: a loop keeps reading raw records and applying map to each
mapper.run(mapperContext);
// the map phase is done
mapPhase.complete();
// the sort phase
setPhase(TaskStatus.Phase.SORT);
statusUpdate(umbilical);
// close the input
input.close();
input = null;
// flush the output back to disk
output.close(mapperContext);
output = null;
} finally {
closeQuietly(input);
closeQuietly(output, mapperContext);
}
}
The map execution flow: read the split, do the computation and sorting in memory, then close the input, flush the output back to disk, and close it.
Reading the input requires a record reader (RecordReader):
// Use the split info to read the underlying block(s) and get the raw records one by one
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
new NewTrackingRecordReader<INKEY,INVALUE>
(split, inputFormat, reporter, taskContext);
The RecordReader itself is created by the TextInputFormat:
this.real = inputFormat.createRecordReader(split, taskContext);
and then initialized via
input.initialize(split, mapperContext);
which lands in LineRecordReader's initialize method. First, TextInputFormat:
/** An {@link InputFormat} for plain text files. Files are broken into lines.
* Either linefeed or carriage-return are used to signal end of line. Keys are
* the position in the file, and values are the line of text.. */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class TextInputFormat extends FileInputFormat<LongWritable, Text> {
@Override
public RecordReader<LongWritable, Text>
createRecordReader(InputSplit split,
TaskAttemptContext context) {
String delimiter = context.getConfiguration().get(
"textinputformat.record.delimiter");
byte[] recordDelimiterBytes = null;
if (null != delimiter)
recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
return new LineRecordReader(recordDelimiterBytes);
}
@Override
protected boolean isSplitable(JobContext context, Path file) {
final CompressionCodec codec =
new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
if (null == codec) {
return true;
}
return codec instanceof SplittableCompressionCodec;
}
}
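Since createRecordReader reads textinputformat.record.delimiter, a custom record delimiter can be supplied through the job configuration. A minimal sketch, assuming a freshly created job (the "|" delimiter is just an example value):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class DelimiterExample {
    public static Job newJob() throws Exception {
        Configuration conf = new Configuration();
        // LineRecordReader will then split records on "|" instead of \n / \r\n
        conf.set("textinputformat.record.delimiter", "|");
        Job job = Job.getInstance(conf, "custom-delimiter");
        job.setInputFormatClass(TextInputFormat.class);
        return job;
    }
}
The delimiter bytes end up as this.recordDelimiterBytes and are handed to the line readers inside LineRecordReader's initialize method, shown next.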
public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
start = split.getStart();
end = start + split.getLength();
final Path file = split.getPath();
// open the file and seek to the start of the split
final FileSystem fs = file.getFileSystem(job);
fileIn = fs.open(file);
CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
if (null!=codec) {
isCompressedInput = true;
decompressor = CodecPool.getDecompressor(codec);
if (codec instanceof SplittableCompressionCodec) {
final SplitCompressionInputStream cIn =
((SplittableCompressionCodec)codec).createInputStream(
fileIn, decompressor, start, end,
SplittableCompressionCodec.READ_MODE.BYBLOCK);
in = new CompressedSplitLineReader(cIn, job,
this.recordDelimiterBytes);
start = cIn.getAdjustedStart();
end = cIn.getAdjustedEnd();
filePosition = cIn;
} else {
if (start != 0) {
// So we have a split that is only part of a file stored using
// a Compression codec that cannot be split.
throw new IOException("Cannot seek in " +
codec.getClass().getSimpleName() + " compressed stream");
}
in = new SplitLineReader(codec.createInputStream(fileIn,
decompressor), job, this.recordDelimiterBytes);
filePosition = fileIn;
}
} else {
fileIn.seek(start);
in = new UncompressedSplitLineReader(
fileIn, job, this.recordDelimiterBytes, split.getLength());
filePosition = fileIn;
}
// If this is not the first split, we always throw away first record
// because we always (except the last split) read one extra line in
// next() method.
if (start != 0) {
start += in.readLine(new Text(), 0, maxBytesToConsume(start));
}
this.pos = start;
}
Reading the HDFS file amounts to constructing an fs object and opening the file:
final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file);
Each map only processes the split it is responsible for; a split's key properties are:
start = split.getStart();
end = start + split.getLength();
final Path file = split.getPath();
The call itself is a single line, mapper.run(mapperContext); so what is mapperContext?
org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
mapContext =
new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
input, output,
committer,
reporter, split);
The run method of class Mapper:
/**
* Expert users can override this method for more complete control over the
* execution of the Mapper.
* @param context
* @throws IOException
*/
public void run(Context context) throws IOException, InterruptedException {
setup(context);
try {
while (context.nextKeyValue()) {
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
} finally {
cleanup(context);
}
}
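The javadoc above notes that expert users may override run() for finer control. A minimal sketch of such an override (CountingMapper is an invented example; it keeps the default loop and merely counts the records of its split):
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Keeps the default read/map loop but tracks how many records this split produced.
public class CountingMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    public void run(Context context) throws IOException, InterruptedException {
        setup(context);
        long records = 0;
        try {
            while (context.nextKeyValue()) {
                records++;
                map(context.getCurrentKey(), context.getCurrentValue(), context);
            }
        } finally {
            cleanup(context);
            System.out.println("records in this split: " + records);
        }
    }
}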
context.nextKeyValue() eventually delegates to LineRecordReader's nextKeyValue method:
public boolean nextKeyValue() throws IOException {
if (key == null) {
key = new LongWritable();
}
key.set(pos);
if (value == null) {
value = new Text();
}
int newSize = 0;
// We always read one extra line, which lies outside the upper
// split limit i.e. (end - 1)
while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
if (pos == 0) {
newSize = skipUtfByteOrderMark();
} else {
newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
pos += newSize;
}
if ((newSize == 0) || (newSize < maxLineLength)) {
break;
}
// line too long. try again
LOG.info("Skipped line of size " + newSize + " at pos " +
(pos - newSize));
}
if (newSize == 0) {
key = null;
value = null;
return false;
} else {
return true;
}
}
Back in class MapTask: an output collector is constructed and the mapperContext wraps it; at the end the output is flushed to disk and closed. The actual writes are triggered by the context.write(k, v) call in the user-defined map class.
// get an output object
if (job.getNumReduceTasks() == 0) {
output =
new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
} else {
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
}
output.close(mapperContext);
output = null;
Since there are reduce tasks, output = new NewOutputCollector(taskContext, job, umbilical, reporter);
@SuppressWarnings("unchecked")
NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
JobConf job,
TaskUmbilicalProtocol umbilical,
TaskReporter reporter
) throws IOException, ClassNotFoundException {
collector = createSortingCollector(job, reporter);
// the number of reduce tasks equals the number of partitions
partitions = jobContext.getNumReduceTasks();
if (partitions > 1) { // more than one reduce task
partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
} else {
partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
@Override
public int getPartition(K key, V value, int numPartitions) {
return partitions - 1;
}
};
}
}
/**
* Get the {@link Partitioner} class for the job.
*
* @return the {@link Partitioner} class for the job.
*/
@SuppressWarnings("unchecked")
public Class<? extends Partitioner<?,?>> getPartitionerClass()
throws ClassNotFoundException {
return (Class<? extends Partitioner<?,?>>)
conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
}
/**
 * Partitions the key space.
 *
 * Partitioner controls the partitioning of the keys of the
 * intermediate map-outputs. The key (or a subset of the key) is used to derive
 * the partition, typically by a hash function. The total number of partitions
 * is the same as the number of reduce tasks for the job. Hence this controls
 * which of the m reduce tasks the intermediate key (and hence the
 * record) is sent for reduction.
 *
 * Note: A Partitioner is created only when there are multiple reducers.
 *
 * Note: If you require your Partitioner class to obtain the Job's
 * configuration object, implement the {@link Configurable} interface.
 *
 * @see Reducer
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class Partitioner<KEY, VALUE> {
/** Partition keys by their {@link Object#hashCode()}. */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class HashPartitioner<K, V> extends Partitioner<K, V> {
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value,
int numReduceTasks) {
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
}
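Because the partition number decides which reduce task receives a key, a custom Partitioner can replace HashPartitioner. A small sketch (the class name and the two-way routing rule are invented for illustration):
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Route words starting with a..m to the first reducer and everything else to the last one.
public class FirstLetterPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String s = key.toString();
        char c = s.isEmpty() ? 'a' : Character.toLowerCase(s.charAt(0));
        return (c <= 'm') ? 0 : numPartitions - 1;
    }
}
It would be registered with job.setPartitionerClass(FirstLetterPartitioner.class) together with job.setNumReduceTasks(2); as the NewOutputCollector constructor above shows, a custom partitioner only comes into play when there is more than one reducer.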
The map output first goes into buffer memory (a circular buffer). The context.write call in the mapper ends up here:
@Override
public void write(K key, V value) throws IOException, InterruptedException {
collector.collect(key, value,
partitioner.getPartition(key, value, partitions));
}
The collect() method first serializes the key and value and then places them into the buffer. The buffer implementation, MapOutputBuffer, is chosen in createSortingCollector:
@SuppressWarnings("unchecked")
private <KEY, VALUE> MapOutputCollector<KEY, VALUE>
createSortingCollector(JobConf job, TaskReporter reporter)
throws IOException, ClassNotFoundException {
MapOutputCollector.Context context =
new MapOutputCollector.Context(this, job, reporter);
Class<?>[] collectorClasses = job.getClasses(
JobContext.MAP_OUTPUT_COLLECTOR_CLASS_ATTR, MapOutputBuffer.class);
int remainingCollectors = collectorClasses.length;
Exception lastException = null;
for (Class clazz : collectorClasses) {
try {
if (!MapOutputCollector.class.isAssignableFrom(clazz)) {
throw new IOException("Invalid output collector class: " + clazz.getName() +
" (does not implement MapOutputCollector)");
}
Class<? extends MapOutputCollector> subclazz =
clazz.asSubclass(MapOutputCollector.class);
LOG.debug("Trying map output collector class: " + subclazz.getName());
MapOutputCollector<KEY, VALUE> collector =
ReflectionUtils.newInstance(subclazz, job);
collector.init(context);
LOG.info("Map output collector class = " + collector.getClass().getName());
return collector;
} catch (Exception e) {
String msg = "Unable to initialize MapOutputCollector " + clazz.getName();
if (--remainingCollectors > 0) {
msg += " (" + remainingCollectors + " more collector(s) to try)";
}
lastException = e;
LOG.warn(msg, e);
}
}
@SuppressWarnings("unchecked")
public void init(MapOutputCollector.Context context
) throws IOException, ClassNotFoundException {
job = context.getJobConf();
reporter = context.getReporter();
mapTask = context.getMapTask();
mapOutputFile = mapTask.getMapOutputFile();
sortPhase = mapTask.getSortPhase();
spilledRecordsCounter = reporter.getCounter(TaskCounter.SPILLED_RECORDS);
partitions = job.getNumReduceTasks();
rfs = ((LocalFileSystem)FileSystem.getLocal(job)).getRaw();
//sanity checks
final float spillper =
job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8);
final int sortmb = job.getInt(MRJobConfig.IO_SORT_MB,
MRJobConfig.DEFAULT_IO_SORT_MB);
indexCacheMemoryLimit = job.getInt(JobContext.INDEX_CACHE_MEMORY_LIMIT,
INDEX_CACHE_MEMORY_LIMIT_DEFAULT);
if (spillper > (float)1.0 || spillper <= (float)0.0) {
throw new IOException("Invalid \"" + JobContext.MAP_SORT_SPILL_PERCENT +
"\": " + spillper);
}
if ((sortmb & 0x7FF) != sortmb) {
throw new IOException(
"Invalid \"" + JobContext.IO_SORT_MB + "\": " + sortmb);
}
sorter = ReflectionUtils.newInstance(job.getClass(
MRJobConfig.MAP_SORT_CLASS, QuickSort.class,
IndexedSorter.class), job);
// buffers and accounting
int maxMemUsage = sortmb << 20;
maxMemUsage -= maxMemUsage % METASIZE;
kvbuffer = new byte[maxMemUsage];
bufvoid = kvbuffer.length;
kvmeta = ByteBuffer.wrap(kvbuffer)
.order(ByteOrder.nativeOrder())
.asIntBuffer();
setEquator(0);
bufstart = bufend = bufindex = equator;
kvstart = kvend = kvindex;
maxRec = kvmeta.capacity() / NMETA;
softLimit = (int)(kvbuffer.length * spillper);
bufferRemaining = softLimit;
LOG.info(JobContext.IO_SORT_MB + ": " + sortmb);
LOG.info("soft limit at " + softLimit);
LOG.info("bufstart = " + bufstart + "; bufvoid = " + bufvoid);
LOG.info("kvstart = " + kvstart + "; length = " + maxRec);
// k/v serialization
comparator = job.getOutputKeyComparator();
keyClass = (Class<K>)job.getMapOutputKeyClass();
valClass = (Class<V>)job.getMapOutputValueClass();
serializationFactory = new SerializationFactory(job);
keySerializer = serializationFactory.getSerializer(keyClass);
keySerializer.open(bb);
valSerializer = serializationFactory.getSerializer(valClass);
valSerializer.open(bb);
// output counters
mapOutputByteCounter = reporter.getCounter(TaskCounter.MAP_OUTPUT_BYTES);
mapOutputRecordCounter =
reporter.getCounter(TaskCounter.MAP_OUTPUT_RECORDS);
fileOutputByteCounter = reporter
.getCounter(TaskCounter.MAP_OUTPUT_MATERIALIZED_BYTES);
// compression
if (job.getCompressMapOutput()) {
Class<? extends CompressionCodec> codecClass =
job.getMapOutputCompressorClass(DefaultCodec.class);
codec = ReflectionUtils.newInstance(codecClass, job);
} else {
codec = null;
}
// combiner
final Counters.Counter combineInputCounter =
reporter.getCounter(TaskCounter.COMBINE_INPUT_RECORDS);
combinerRunner = CombinerRunner.create(job, getTaskID(),
combineInputCounter,
reporter, null);
if (combinerRunner != null) {
final Counters.Counter combineOutputCounter =
reporter.getCounter(TaskCounter.COMBINE_OUTPUT_RECORDS);
combineCollector= new CombineOutputCollector<K,V>(combineOutputCounter, reporter, job);
} else {
combineCollector = null;
}
spillInProgress = false;
minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
spillThread.setDaemon(true);
spillThread.setName("SpillThread");
spillLock.lock();
try {
spillThread.start();
while (!spillThreadRunning) {
spillDone.await();
}
} catch (InterruptedException e) {
throw new IOException("Spill thread failed to initialize", e);
} finally {
spillLock.unlock();
}
if (sortSpillException != null) {
throw new IOException("Spill thread failed to initialize",
sortSpillException);
}
}
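The buffer size and spill threshold read in init() come from the job configuration, so they can be tuned per job. A sketch assuming a Configuration named conf (the 200 MB and 0.9 values are arbitrary examples, not recommendations from the post):
import org.apache.hadoop.conf.Configuration;

public class SortBufferTuning {
    public static void tune(Configuration conf) {
        // mapreduce.task.io.sort.mb: size of kvbuffer in MB (the "sortmb" read above)
        conf.setInt("mapreduce.task.io.sort.mb", 200);
        // mapreduce.map.sort.spill.percent: soft limit that wakes the spill thread (the "spillper" read above)
        conf.setFloat("mapreduce.map.sort.spill.percent", 0.9f);
        // mapreduce.map.combine.minspills: minimum number of spills before the combiner runs during the merge
        conf.setInt("mapreduce.map.combine.minspills", 3);
    }
}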
private void sortAndSpill() throws IOException, ClassNotFoundException,
InterruptedException {
// Estimate up front how many bytes this spill will be: the raw k/v data plus per-partition header info
//approximate the length of the output file to be the length of the
//buffer + header lengths for the partitions
final long size = distanceTo(bufstart, bufend, bufvoid) +
partitions * APPROX_HEADER_LENGTH;
FSDataOutputStream out = null;
FSDataOutputStream partitionOut = null;
try {
// Build the SpillRecord and the spill file name, create the spill file, and get back an FSDataOutputStream
// create spill file
final SpillRecord spillRec = new SpillRecord(partitions);
final Path filename =
mapOutputFile.getSpillFileForWrite(numSpills, size);
out = rfs.create(filename);
final int mstart = kvend / NMETA;
final int mend = 1 + // kvend is a valid record
(kvstart >= kvend
? kvstart
: kvmeta.capacity() + kvstart) / NMETA;
// Sort (QuickSort by default) by partition and then by key; see MapOutputBuffer's compare implementation
sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
int spindex = mstart;
final IndexRecord rec = new IndexRecord();
final InMemValBytes value = new InMemValBytes();
// Iterate over the partitions
for (int i = 0; i < partitions; ++i) {
IFile.Writer<K, V> writer = null;
try {
long segmentStart = out.getPos();
partitionOut = CryptoUtils.wrapIfNecessary(job, out, false);
writer = new Writer<K, V>(job, partitionOut, keyClass, valClass, codec,
spilledRecordsCounter);
if (combinerRunner == null) {
// spill directly
DataInputBuffer key = new DataInputBuffer();
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
final int kvoff = offsetFor(spindex % maxRec);
int keystart = kvmeta.get(kvoff + KEYSTART);
int valstart = kvmeta.get(kvoff + VALSTART);
key.reset(kvbuffer, keystart, valstart - keystart);
getVBytesForOffset(kvoff, value);
writer.append(key, value);
++spindex;
}
} else {
int spstart = spindex;
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec)
+ PARTITION) == i) {
++spindex;
}
// Note: we would like to avoid the combiner if we've fewer
// than some threshold of records for a partition
if (spstart != spindex) {
combineCollector.setWriter(writer);
RawKeyValueIterator kvIter =
new MRResultIterator(spstart, spindex);
combinerRunner.combine(kvIter, combineCollector);
}
}
// close the writer
writer.close();
if (partitionOut != out) {
partitionOut.close();
partitionOut = null;
}
// record offsets
rec.startOffset = segmentStart;
rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
spillRec.putIndex(rec, i);
writer = null;
} finally {
if (null != writer) writer.close();
}
}
// Handle the spill index info
if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
// create spill index file
Path indexFilename =
mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
* MAP_OUTPUT_INDEX_RECORD_LENGTH);
spillRec.writeToFile(indexFilename, job);
} else {
indexCacheList.add(spillRec);
totalIndexCacheMemory +=
spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
}
// One spill file finished
LOG.info("Finished spill " + numSpills);
++numSpills;
} finally {
if (out != null) out.close();
if (partitionOut != null) {
partitionOut.close();
}
}
}
If a combiner is configured, a combine pass is applied while spilling.
All of the spill files and their index files are then merged, and the combiner (if one is set) runs once more here:
public void flush() throws IOException, ClassNotFoundException,
InterruptedException {
LOG.info("Starting flush of map output");
if (kvbuffer == null) {
LOG.info("kvbuffer is null. Skipping flush.");
return;
}
spillLock.lock();
try {
while (spillInProgress) {
reporter.progress();
spillDone.await();
}
checkSpillException();
final int kvbend = 4 * kvend;
if ((kvbend + METASIZE) % kvbuffer.length !=
equator - (equator % METASIZE)) {
// spill finished
resetSpill();
}
if (kvindex != kvend) {
kvend = (kvindex + NMETA) % kvmeta.capacity();
bufend = bufmark;
LOG.info("Spilling map output");
LOG.info("bufstart = " + bufstart + "; bufend = " + bufmark +
"; bufvoid = " + bufvoid);
LOG.info("kvstart = " + kvstart + "(" + (kvstart * 4) +
"); kvend = " + kvend + "(" + (kvend * 4) +
"); length = " + (distanceTo(kvend, kvstart,
kvmeta.capacity()) + 1) + "/" + maxRec);
sortAndSpill();
}
} catch (InterruptedException e) {
throw new IOException("Interrupted while waiting for the writer", e);
} finally {
spillLock.unlock();
}
assert !spillLock.isHeldByCurrentThread();
// shut down spill thread and wait for it to exit. Since the preceding
// ensures that it is finished with its work (and sortAndSpill did not
// throw), we elect to use an interrupt instead of setting a flag.
// Spilling simultaneously from this thread while the spill thread
// finishes its work might be both a useful way to extend this and also
// sufficient motivation for the latter approach.
try {
spillThread.interrupt();
spillThread.join();
} catch (InterruptedException e) {
throw new IOException("Spill failed", e);
}
// release sort buffer before the merge
kvbuffer = null;
mergeParts();
Path outputPath = mapOutputFile.getOutputFile();
fileOutputByteCounter.increment(rfs.getFileStatus(outputPath).getLen());
// If necessary, make outputs permissive enough for shuffling.
if (!SHUFFLE_OUTPUT_PERM.equals(
SHUFFLE_OUTPUT_PERM.applyUMask(FsPermission.getUMask(job)))) {
Path indexPath = mapOutputFile.getOutputIndexFile();
rfs.setPermission(outputPath, SHUFFLE_OUTPUT_PERM);
rfs.setPermission(indexPath, SHUFFLE_OUTPUT_PERM);
}
}
After the last sortAndSpill completes, the spill files are merged into a single final output file.
input -> map -> output
input: comes from the record reader object actually returned by the input format class
TextInputFormat -> LineRecordReader
split: file, offset, seek(offset)
init():
in = fs.open(file).seek(offset)
nextKeyValue()
* reads one record from the raw block data and assigns it to key and value
* returns a boolean
getCurrentKey()
getCurrentValue()
map: the MapperClass we define ourselves,
e.g. job.setMapperClass(TempMapper.class);
applies the map function to the input
output:
MapOutputBuffer
init():
spill percent: 0.8
io.sort.mb: 100 MB
sorter = QuickSort
comparator = jobConf.getOutputKeyComparator(); // by default, the key's own Comparator
combiner (essentially a small reduce run on the map side)
reduce, minSpillsForCombine
SpillThread
sortAndSpill()
combiner
Sorting is done in memory rather than writing to disk in random order, which reduces the IO that follows.
The result is several files that are each internally sorted but unordered relative to one another; producing a total order therefore calls for a merge sort (see the sketch below).
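Conceptually, merging several internally sorted spill files is a standard k-way merge. The sketch below works over in-memory lists rather than the real on-disk segments handled by mergeParts/Merger, just to show the idea:
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;

public class KWayMergeSketch {
    // Merge several individually sorted runs into one sorted output,
    // the same idea the map task applies to its spill files on disk.
    static List<Integer> merge(List<List<Integer>> runs) {
        PriorityQueue<int[]> heap = new PriorityQueue<>((a, b) -> Integer.compare(a[0], b[0]));
        List<Iterator<Integer>> its = new ArrayList<>();
        for (int i = 0; i < runs.size(); i++) {
            Iterator<Integer> it = runs.get(i).iterator();
            its.add(it);
            if (it.hasNext()) {
                heap.add(new int[]{it.next(), i}); // {value, run index}
            }
        }
        List<Integer> out = new ArrayList<>();
        while (!heap.isEmpty()) {
            int[] top = heap.poll();
            out.add(top[0]);
            Iterator<Integer> it = its.get(top[1]);
            if (it.hasNext()) {
                heap.add(new int[]{it.next(), top[1]});
            }
        }
        return out;
    }

    public static void main(String[] args) {
        List<List<Integer>> runs = List.of(List.of(1, 4, 9), List.of(2, 3, 8), List.of(5, 6, 7));
        System.out.println(merge(runs)); // [1, 2, 3, 4, 5, 6, 7, 8, 9]
    }
}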
Now suppose there were no sorting at all: a map task's output file would simply contain records from different partitions interleaved.
In the reduce phase, since one partition corresponds to one reduce task, each reduce task would then have to read the entire file on disk to gather its partition (reading record by record and checking whether each record belongs to the partition it handles). That is "partitions" passes over the data, each one reading the whole file.
If instead the output is sorted by partition, each reduce task only reads its own contiguous region.
Even so, to process one key group the reducer must traverse all of that group's records before it can produce the group's result; if the records are additionally sorted by key, the cost of computing each group drops considerably, and the processing can proceed much like a merge sort.
So the final map output is ordered by partition and, within each partition, by key.
Doing a combine in the map stage lowers the computation load on the downstream reduce tasks; it is effectively a small reduce performed early.
It reduces both disk IO and network IO.
ReduceTask
input -> reduce -> output
map: run: while (context.nextKeyValue())
map is called once per record
reduce: run: while (context.nextKey())
reduce is called once per group of records
shuffle: pull the data (keys of the same partition are pulled to the same reduce task; one partition per reduceTask)
sort: merge sort, plus the grouping comparator
reduce run: iterator based
The reduce task pulls the data belonging to its own partition and wraps it in an iterator.
When the reduce method is called, the group of values is not actually loaded into memory; instead an iterator (values) is passed in.
hasNext: checks nextKeyIsSame, i.e. whether the next record still belongs to the current group
next: calls nextKeyValue to read the next record and also updates nextKeyIsSame
/**
* Advanced application writers can use the
* {@link #run(org.apache.hadoop.mapreduce.Reducer.Context)} method to
* control how the reduce task works.
*/
public void run(Context context) throws IOException, InterruptedException {
setup(context);
try {
while (context.nextKey()) {
reduce(context.getCurrentKey(), context.getValues(), context);
// If a back up store is used, reset it
Iterator<VALUEIN> iter = context.getValues().iterator();
if(iter instanceof ReduceContext.ValueIterator) {
((ReduceContext.ValueIterator<VALUEIN>)iter).resetBackupStore();
}
}
} finally {
cleanup(context);
}
}
@Override
@SuppressWarnings("unchecked")
public void run(JobConf job, final TaskUmbilicalProtocol umbilical)
throws IOException, InterruptedException, ClassNotFoundException {
job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
if (isMapOrReduce()) {
copyPhase = getProgress().addPhase("copy");
sortPhase = getProgress().addPhase("sort");
reducePhase = getProgress().addPhase("reduce");
}
// start thread that will handle communication with parent
TaskReporter reporter = startReporter(umbilical);
boolean useNewApi = job.getUseNewReducer();
initialize(job, getJobID(), reporter, useNewApi);
// check if it is a cleanupJobTask
if (jobCleanup) {
runJobCleanupTask(umbilical, reporter);
return;
}
if (jobSetup) {
runJobSetupTask(umbilical, reporter);
return;
}
if (taskCleanup) {
runTaskCleanupTask(umbilical, reporter);
return;
}
// Initialize the codec
codec = initCodec();
RawKeyValueIterator rIter = null;
ShuffleConsumerPlugin shuffleConsumerPlugin = null;
Class combinerClass = conf.getCombinerClass();
CombineOutputCollector combineCollector =
(null != combinerClass) ?
new CombineOutputCollector(reduceCombineOutputCounter, reporter, conf) : null;
Class<? extends ShuffleConsumerPlugin> clazz =
job.getClass(MRConfig.SHUFFLE_CONSUMER_PLUGIN, Shuffle.class, ShuffleConsumerPlugin.class);
shuffleConsumerPlugin = ReflectionUtils.newInstance(clazz, job);
LOG.info("Using ShuffleConsumerPlugin: " + shuffleConsumerPlugin);
ShuffleConsumerPlugin.Context shuffleContext =
new ShuffleConsumerPlugin.Context(getTaskID(), job, FileSystem.getLocal(job), umbilical,
super.lDirAlloc, reporter, codec,
combinerClass, combineCollector,
spilledRecordsCounter, reduceCombineInputCounter,
shuffledMapsCounter,
reduceShuffleBytes, failedShuffleCounter,
mergedMapOutputsCounter,
taskStatus, copyPhase, sortPhase, this,
mapOutputFile, localMapFiles);
shuffleConsumerPlugin.init(shuffleContext);
rIter = shuffleConsumerPlugin.run();
// free up the data structures
mapOutputFilesOnDisk.clear();
sortPhase.complete(); // sort is complete
setPhase(TaskStatus.Phase.REDUCE);
statusUpdate(umbilical);
Class keyClass = job.getMapOutputKeyClass();
Class valueClass = job.getMapOutputValueClass();
RawComparator comparator = job.getOutputValueGroupingComparator();
if (useNewApi) {
runNewReducer(job, umbilical, reporter, rIter, comparator,
keyClass, valueClass);
} else {
runOldReducer(job, umbilical, reporter, rIter, comparator,
keyClass, valueClass);
}
shuffleConsumerPlugin.close();
done(umbilical, reporter);
}
copyPhase = getProgress().addPhase("copy");
sortPhase = getProgress().addPhase("sort");
reducePhase = getProgress().addPhase("reduce");
A merge sort combines the pulled files so the reduce task can process them; mapreduce.reduce.shuffle.parallelcopies controls how many fetcher threads pull map output concurrently.
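The number of fetcher threads comes from the configuration (MRJobConfig.SHUFFLE_PARALLEL_COPIES, default 5, as the Shuffle.run() method below shows), so it can be raised for jobs with many map outputs. A tiny sketch with an arbitrary value of 10:
import org.apache.hadoop.conf.Configuration;

public class ShuffleTuning {
    public static void tune(Configuration conf) {
        // mapreduce.reduce.shuffle.parallelcopies: number of Fetcher threads per reduce task (default 5)
        conf.setInt("mapreduce.reduce.shuffle.parallelcopies", 10);
    }
}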
@Override
public RawKeyValueIterator run() throws IOException, InterruptedException {
// Scale the maximum events we fetch per RPC call to mitigate OOM issues
// on the ApplicationMaster when a thundering herd of reducers fetch events
// TODO: This should not be necessary after HADOOP-8942
int eventsPerReducer = Math.max(MIN_EVENTS_TO_FETCH,
MAX_RPC_OUTSTANDING_EVENTS / jobConf.getNumReduceTasks());
int maxEventsToFetch = Math.min(MAX_EVENTS_TO_FETCH, eventsPerReducer);
// Start the map-completion events fetcher thread
final EventFetcher<K, V> eventFetcher =
new EventFetcher<K, V>(reduceId, umbilical, scheduler, this,
maxEventsToFetch);
eventFetcher.start();
// Start the map-output fetcher threads
boolean isLocal = localMapFiles != null;
final int numFetchers = isLocal ? 1 :
jobConf.getInt(MRJobConfig.SHUFFLE_PARALLEL_COPIES, 5);
Fetcher<K, V>[] fetchers = new Fetcher[numFetchers];
if (isLocal) {
fetchers[0] = new LocalFetcher<K, V>(jobConf, reduceId, scheduler,
merger, reporter, metrics, this, reduceTask.getShuffleSecret(),
localMapFiles);
fetchers[0].start();
} else {
for (int i=0; i < numFetchers; ++i) {
fetchers[i] = new Fetcher<K, V>(jobConf, reduceId, scheduler, merger,
reporter, metrics, this,
reduceTask.getShuffleSecret());
fetchers[i].start();
}
}
// Wait for shuffle to complete successfully
while (!scheduler.waitUntilDone(PROGRESS_FREQUENCY)) {
reporter.progress();
synchronized (this) {
if (throwable != null) {
throw new ShuffleError("error in shuffle in " + throwingThreadName,
throwable);
}
}
}
// Stop the event-fetcher thread
eventFetcher.shutDown();
// Stop the map-output fetcher threads
for (Fetcher<K, V> fetcher : fetchers) {
fetcher.shutDown();
}
// stop the scheduler
scheduler.close();
copyPhase.complete(); // copy is already complete
taskStatus.setPhase(TaskStatus.Phase.SORT);
reduceTask.statusUpdate(umbilical);
// Finish the on-going merges...
RawKeyValueIterator kvIter = null;
try {
kvIter = merger.close();
} catch (Throwable e) {
throw new ShuffleError("Error while doing final merge ", e);
}
// Sanity check
synchronized (this) {
if (throwable != null) {
throw new ShuffleError("error in shuffle in " + throwingThreadName,
throwable);
}
}
return kvIter;
}
Each Fetcher thread runs the following loop:
public void run() {
try {
while (!stopped && !Thread.currentThread().isInterrupted()) {
MapHost host = null;
try {
// If merge is on, block
merger.waitForResource();
// Get a host to shuffle from
host = scheduler.getHost();
metrics.threadBusy();
// Shuffle
copyFromHost(host);
} finally {
if (host != null) {
scheduler.freeHost(host);
metrics.threadFree();
}
}
}
} catch (InterruptedException ie) {
return;
} catch (Throwable t) {
exceptionReporter.reportException(t);
}
}
Once the shuffle has finished pulling data, the fetch threads are shut down and a RawKeyValueIterator is returned. Before the actual reduce runs there is one final merge sort, so each reduce task sees an ordered data set.
The data stays on disk and is read through the iterator with direct file IO, which avoids running out of memory when the data set is large.
runNewReducer(job, umbilical, reporter, rIter, comparator,
keyClass, valueClass);
The reducer object is created from the reducerClass set on the Job:
@SuppressWarnings("unchecked")
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewReducer(JobConf job,
final TaskUmbilicalProtocol umbilical,
final TaskReporter reporter,
RawKeyValueIterator rIter,
RawComparator<INKEY> comparator,
Class<INKEY> keyClass,
Class<INVALUE> valueClass
) throws IOException,InterruptedException,
ClassNotFoundException {
// wrap value iterator to report progress.
final RawKeyValueIterator rawIter = rIter;
rIter = new RawKeyValueIterator() {
public void close() throws IOException {
rawIter.close();
}
public DataInputBuffer getKey() throws IOException {
return rawIter.getKey();
}
public Progress getProgress() {
return rawIter.getProgress();
}
public DataInputBuffer getValue() throws IOException {
return rawIter.getValue();
}
public boolean next() throws IOException {
boolean ret = rawIter.next();
reporter.setProgress(rawIter.getProgress().getProgress());
return ret;
}
};
// make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
getTaskID(), reporter);
// make a reducer
org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE> reducer =
(org.apache.hadoop.mapreduce.Reducer<INKEY,INVALUE,OUTKEY,OUTVALUE>)
ReflectionUtils.newInstance(taskContext.getReducerClass(), job);
org.apache.hadoop.mapreduce.RecordWriter<OUTKEY,OUTVALUE> trackedRW =
new NewTrackingRecordWriter<OUTKEY, OUTVALUE>(this, taskContext);
job.setBoolean("mapred.skip.on", isSkipping());
job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
org.apache.hadoop.mapreduce.Reducer.Context
reducerContext = createReduceContext(reducer, job, getTaskID(),
rIter, reduceInputKeyCounter,
reduceInputValueCounter,
trackedRW,
committer,
reporter, comparator, keyClass,
valueClass);
try {
reducer.run(reducerContext);
} finally {
trackedRW.close(reducerContext);
}
}
This is again the iterator pattern, but at two levels: nextKey, and then the ValueIterator:
/**
* Advanced application writers can use the
* {@link #run(org.apache.hadoop.mapreduce.Reducer.Context)} method to
* control how the reduce task works.
*/
public void run(Context context) throws IOException, InterruptedException {
setup(context);
try {
while (context.nextKey()) {
reduce(context.getCurrentKey(), context.getValues(), context);
// If a back up store is used, reset it
Iterator<VALUEIN> iter = context.getValues().iterator();
if(iter instanceof ReduceContext.ValueIterator) {
((ReduceContext.ValueIterator<VALUEIN>)iter).resetBackupStore();
}
}
} finally {
cleanup(context);
}
}
The reduce result is whatever the programmer emits with context.write:
/**
* This method is called once for each key. Most applications will define
* their reduce class by overriding this method. The default implementation
* is an identity function.
*/
@SuppressWarnings("unchecked")
protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context
) throws IOException, InterruptedException {
for(VALUEIN value: values) {
context.write((KEYOUT) key, (VALUEOUT) value);
}
}
Shuffle covers everything from the map output to the data fed into reduce.
WordCount example
HDFS commands:
hadoop fs -put ~/words.txt hdfs://localhost:9000/input/wcinput
hadoop fs -ls hdfs://localhost:9000/
hadoop fs -mkdir hdfs://localhost:9000/output
hadoop fs -cat hdfs://localhost:9000/output/wcoutput/part-r-00000
Original input:
hello hadoop
hello world hello
hadoop hello
Final MapReduce output:
hadoop 2
hello 4
world 1
2020-04-30 00:04:57,430 INFO [org.apache.hadoop.mapreduce.Job] - Counters: 36
File System Counters
FILE: Number of bytes read=524
FILE: Number of bytes written=709206
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=88
HDFS: Number of bytes written=25
HDFS: Number of read operations=15
HDFS: Number of large read operations=0
HDFS: Number of write operations=4
HDFS: Number of bytes read erasure-coded=0
Map-Reduce Framework
Map input records=3
Map output records=7
Map output bytes=72
Map output materialized bytes=92
Input split bytes=100
Combine input records=0
Combine output records=0
Reduce input groups=3
Reduce shuffle bytes=92
Reduce input records=7
Reduce output records=3
Spilled Records=14
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=7
Total committed heap usage (bytes)=586153984
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=44
File Output Format Counters
Bytes Written=25
2020-04-30 00:04:57,430 DEBUG [org.apache.hadoop.security.UserGroupInformation] - PrivilegedAction as:mubi (auth:SIMPLE) from:org.apache.hadoop.mapreduce.Job.updateStatus(Job.java:328)
The map output, shown per input line, is:
// line 1
hello 1
hadoop 1
// line 2
hello 1
world 1
hello 1
// line 3
hadoop 1
hello 1
Now add a combiner:
job.setCombinerClass(WordCountReducer.class);
2020-04-30 00:22:10,512 INFO [org.apache.hadoop.mapreduce.Job] - Counters: 36
File System Counters
FILE: Number of bytes read=426
FILE: Number of bytes written=709865
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=88
HDFS: Number of bytes written=25
HDFS: Number of read operations=17
HDFS: Number of large read operations=0
HDFS: Number of write operations=6
HDFS: Number of bytes read erasure-coded=0
Map-Reduce Framework
Map input records=3
Map output records=7
Map output bytes=72
Map output materialized bytes=43
Input split bytes=100
Combine input records=7
Combine output records=3
Reduce input groups=3
Reduce shuffle bytes=43
Reduce input records=3
Reduce output records=3
Spilled Records=6
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=79
Total committed heap usage (bytes)=801112064
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=44
File Output Format Counters
Bytes Written=25
2020-04-30 00:22:10,512 DEBUG [org.apache.hadoop.security.UserGroupInformation] - PrivilegedAction as:mubi (auth:SIMPLE) from:org.apache.hadoop.mapreduce.Job.updateStatus(Job.java:328)
With the combiner, the combine counters are no longer zero and the reduce input is now the combiner's output:
Combine input records=7
Combine output records=3
Reduce input groups=3
Reduce shuffle bytes=43
Reduce input records=3
Reduce output records=3
job.setSortComparatorClass
Word count again, but with the words output in reverse (descending) order:
static class KeyComparator extends WritableComparator{
public KeyComparator() {
super(Text.class, true);
}
@Override
public int compare(WritableComparable w1, WritableComparable w2) {
Text key1 = (Text)w1;
Text key2 = (Text)w2;
String s1 = key1.toString();
String s2 = key2.toString();
return s2.compareTo(s1);
}
}
Then the program sets:
// set the key sort comparator
job.setSortComparatorClass(KeyComparator.class);
The final output is:
world 1
hello 4
hadoop 2
import java.io.IOException;
import java.net.URI;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* @Author mubi
* @Date 2020/4/28 23:39
*/
public class WC {
/**
 * The map function.
 *
 * The four generic types are:
 * KeyIn    the Mapper's input key: the byte offset at which each line starts (0, 11, ...)
 * ValueIn  the Mapper's input value: the text of the line
 * KeyOut   the Mapper's output key: a "word" from the line
 * ValueOut the Mapper's output value: the count "1" for that word
 */
static class WordCountMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
@Override
public void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
// Convert the text content handed to us by the map task into a String
String line = value.toString();
// Split the line into words on spaces
String[] words = line.split(" ");
// Emit each word as <word, 1>
for(String word: words){
// Use the word as the key and "1" as the value, so records are distributed by word and the same word ends up in the same reduce task.
context.write(new Text(word), new IntWritable(1));
}
}
}
/**
 * The reduce function.
 *
 * The four generic types are:
 * KeyIn    the Reducer's input key: a "word"
 * ValueIn  the Reducer's input value: the list of counts for that word
 * KeyOut   the Reducer's output key: the distinct "word"
 * ValueOut the Reducer's output value: the total count for that word
 */
static class WordCountReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context)throws IOException, InterruptedException {
int count=0;
Iterator<IntWritable> it = values.iterator();
while(it.hasNext()){
count += it.next().get();
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) throws Exception {
// input path
String dst = "hdfs://localhost:9000/input/wcinput";
// String dst = "hdfs://localhost:9000" + args[1];
// output path; it must not already exist (not even as an empty directory)
String dstOut = "hdfs://localhost:9000/output/wcoutput";
// String dstOut = "hdfs://localhost:9000" + args[2];
Configuration hadoopConfig = new Configuration();
hadoopConfig.set("fs.hdfs.impl",
org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()
);
hadoopConfig.set("fs.file.impl",
org.apache.hadoop.fs.LocalFileSystem.class.getName()
);
// If the output directory already exists, delete it first
FileSystem fileSystem = FileSystem.get(new URI("hdfs://localhost:9000"), hadoopConfig);
Path outputPath = new Path("/output/wcoutput");
if(fileSystem.exists(outputPath)){
fileSystem.delete(outputPath,true);
}
Job job = new Job(hadoopConfig);
// Needed if the job is packaged as a jar and run on a cluster
job.setJarByClass(WC.class);
job.setJobName("wordcount");
// Input and output paths used when the job runs
FileInputFormat.addInputPath(job, new Path(dst));
FileOutputFormat.setOutputPath(job, new Path(dstOut));
// Use the custom Mapper and Reducer as the processing classes of the two phases
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
// Set the combiner
job.setCombinerClass(WordCountReducer.class);
// Set the key and value types of the final output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Run the job until it completes
job.waitForCompletion(true);
System.out.println("Job Finished");
System.exit(0);
}
}
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.2.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.2.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-jobclient -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>3.2.1</version>
<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>3.2.1</version>
</dependency>
</dependencies>
// https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client
compile group: 'org.apache.hadoop', name: 'hadoop-client', version: '3.2.1'
// https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common
compile group: 'org.apache.hadoop', name: 'hadoop-common', version: '3.2.1'
// https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-jobclient
compile group: 'org.apache.hadoop', name: 'hadoop-mapreduce-client-jobclient', version: '3.2.1'