Hadoop First Read -- setInputFormatClass

This post is part of the "Hadoop First Read" series, which records problems from day-to-day Hadoop practice together with a rough reading of the Hadoop source code. Each post has two parts: a Question, which is an exception or point of confusion met in practice, and an Answer, which is a rough reading of the relevant source (explained through inline comments).

Problem code (intended to set up multiple inputs, but throws "No input paths specified in job"):

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ReduceSideJoin extends Configured implements Tool {
    private static final Logger logger = LoggerFactory
            .getLogger(ReduceSideJoin.class);

    public static class LeftOutJoinReducer extends
            Reducer<IntWritable, OrderGoodsInfoWritble, IntWritable, OrderGoodsInfoWritble> {
        private ArrayList<OrderGoodsInfoWritble> leftTable = new ArrayList<OrderGoodsInfoWritble>();
        private ArrayList<OrderGoodsInfoWritble> rightTable = new ArrayList<OrderGoodsInfoWritble>();
        private Text secondPar = null;
        private Text output = new Text();
        @Override
        protected void reduce(IntWritable key, Iterable<OrderGoodsInfoWritble> value,
                              Context context) throws IOException, InterruptedException {
            leftTable.clear();
            rightTable.clear();
            for (OrderGoodsInfoWritble cv : value) {
                String mRecordFlog = cv.getmRecordflag().toString().trim();
                if ("orderInfo".equals(mRecordFlog)) {
                    leftTable.add(cv);
                }else {
                    rightTable.add(cv);
                }
            }
            for (OrderGoodsInfoWritble leftPart : leftTable) {
                for (OrderGoodsInfoWritble rightPart : rightTable) {
                    if ((leftPart.getOrderId().toString()).equals((rightPart.getOrderId().toString()))) {
                        leftPart.setGoodsSn(rightPart.getGoodsSn());
                        leftPart.setGoodsName(rightPart.getGoodsName());
                        leftPart.setmRecordflag(new Text("orderGoodsInfo"));
                        context.write(key, leftPart);
                    }
                }
            }
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf(); 
        Job job = Job.getInstance(conf);

        Path path1 = new Path(args[0]);
        Path path2 = new Path(args[1]);
        FileSystem fs = FileSystem.get(URI.create("hdfs://***:9000/venus/ordergoodsinfo"), conf);
        Path outputPath = new Path("hdfs://***:9000/venus/ordergoodsinfo");
        fs.deleteOnExit(outputPath);
        fs.close();

        MultipleInputs.addInputPath(job, path1, TextInputFormat.class,
                OrderInfoMapper.class);

        MultipleInputs.addInputPath(job, path2, TextInputFormat.class,
                OrderGoodsMapper.class);
        FileOutputFormat.setOutputPath(job, outputPath); 

        job.setJobName("LeftOutJoinMR");
        job.setJarByClass(ReduceSideJoin.class);
        job.setReducerClass(LeftOutJoinReducer.class);

        /** Problem code: setInputFormatClass is called even though MultipleInputs.addInputPath
            has already been used, which causes the "No input paths specified in job" exception **/
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(OrderGoodsInfoWritble.class);

        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(OrderGoodsInfoWritble.class);
        job.waitForCompletion(true);
        return job.isSuccessful() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Tool rdf = new ReduceSideJoin();
        int returnCode = ToolRunner.run(rdf, args);
        System.exit(returnCode);
    }
}  

Question:
1. The multiple input paths were already set via MultipleInputs.addInputPath, so why does the job still throw the "No input paths specified in job" exception?

Answer:
Because the later setInputFormatClass call overrides the configuration made by MultipleInputs. The details follow from the source code below.

1) Source of job.setInputFormatClass(TextInputFormat.class):

  public void setInputFormatClass(Class<? extends InputFormat> cls
                                  ) throws IllegalStateException {
    ensureState(JobState.DEFINE);
    /**
    The InputFormatClass is set to whatever class is passed in, here TextInputFormat
    **/
    conf.setClass(INPUT_FORMAT_CLASS_ATTR, cls, 
                  InputFormat.class);
  }
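
For completeness, the counterpart getter reads this same single configuration key back. A rough, trimmed sketch of JobContextImpl.getInputFormatClass from the Hadoop 2.x source is shown below; it is why only the last setInputFormatClass call takes effect:

  @SuppressWarnings("unchecked")
  public Class<? extends InputFormat<?,?>> getInputFormatClass()
      throws ClassNotFoundException {
    // INPUT_FORMAT_CLASS_ATTR is the single key "mapreduce.job.inputformat.class";
    // whichever class was written last wins, and TextInputFormat is the default.
    return (Class<? extends InputFormat<?,?>>)
        conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
  }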

2) Source of MultipleInputs.addInputPath:

public static void addInputPath(Job job, Path path,
      Class<? extends InputFormat> inputFormatClass,
      Class<? extends Mapper> mapperClass) {
    /**
    Delegates to the other addInputPath overload
    **/
    addInputPath(job, path, inputFormatClass);
    Configuration conf = job.getConfiguration();
    String mapperMapping = path.toString() + ";" + mapperClass.getName();
    String mappers = conf.get(DIR_MAPPERS);
    conf.set(DIR_MAPPERS, mappers == null ? mapperMapping
       : mappers + "," + mapperMapping);

    job.setMapperClass(DelegatingMapper.class);
  }
 ===========================================
 public static void addInputPath(Job job, Path path,
      Class<? extends InputFormat> inputFormatClass) {
    String inputFormatMapping = path.toString() + ";"
       + inputFormatClass.getName();
    Configuration conf = job.getConfiguration();
    String inputFormats = conf.get(DIR_FORMATS);
    conf.set(DIR_FORMATS,
       inputFormats == null ? inputFormatMapping : inputFormats + ","
           + inputFormatMapping);
    /**
    Here the InputFormatClass is finally set to DelegatingInputFormat
    **/
    job.setInputFormatClass(DelegatingInputFormat.class);
  }
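
For contrast, the single-input helper FileInputFormat.addInputPath is what populates the mapreduce.input.fileinputformat.inputdir property that TextInputFormat later reads; MultipleInputs never touches that key. A trimmed sketch based on the Hadoop 2.x FileInputFormat source:

  public static void addInputPath(Job job, Path path) throws IOException {
    Configuration conf = job.getConfiguration();
    path = path.getFileSystem(conf).makeQualified(path);
    String dirStr = StringUtils.escapeString(path.toString());
    // INPUT_DIR is "mapreduce.input.fileinputformat.inputdir"
    String dirs = conf.get(INPUT_DIR);
    conf.set(INPUT_DIR, dirs == null ? dirStr : dirs + "," + dirStr);
  }

The problem driver never calls this method, so the only way the input paths can reach the job is through the DelegatingInputFormat mapping set up above.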

When the job is submitted (submitter.submitJobInternal), the getSplits method of the configured InputFormat is eventually called, as shown below:

JobStatus submitJobInternal(Job job, Cluster cluster) 
  throws ClassNotFoundException, InterruptedException, IOException {

    checkSpecs(job);

    Configuration conf = job.getConfiguration();
    addMRFrameworkToDistributedCache(conf);

    Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
    //configure the command line options correctly on the submitting dfs
    InetAddress ip = InetAddress.getLocalHost();
    if (ip != null) {
      submitHostAddress = ip.getHostAddress();
      submitHostName = ip.getHostName();
      conf.set(MRJobConfig.JOB_SUBMITHOST,submitHostName);
      conf.set(MRJobConfig.JOB_SUBMITHOSTADDR,submitHostAddress);
    }
    JobID jobId = submitClient.getNewJobID();
    job.setJobID(jobId);
    Path submitJobDir = new Path(jobStagingArea, jobId.toString());
    JobStatus status = null;
    try {
      conf.set(MRJobConfig.USER_NAME,
          UserGroupInformation.getCurrentUser().getShortUserName());
      conf.set("hadoop.http.filter.initializers", 
          "org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
      conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, submitJobDir.toString());
      LOG.debug("Configuring job " + jobId + " with " + submitJobDir 
          + " as the submit dir");
      TokenCache.obtainTokensForNamenodes(job.getCredentials(),
          new Path[] { submitJobDir }, conf);

      populateTokenCache(conf, job.getCredentials());


      if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
        KeyGenerator keyGen;
        try {
          keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);
          keyGen.init(SHUFFLE_KEY_LENGTH);
        } catch (NoSuchAlgorithmException e) {
          throw new IOException("Error generating shuffle secret key", e);
        }
        SecretKey shuffleKey = keyGen.generateKey();
        TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),
            job.getCredentials());
      }
      if (CryptoUtils.isEncryptedSpillEnabled(conf)) {
        conf.setInt(MRJobConfig.MR_AM_MAX_ATTEMPTS, 1);
        LOG.warn("Max job attempts set to 1 since encrypted intermediate" +
                "data spill is enabled");
      }

      copyAndConfigureFiles(job, submitJobDir);

      Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);

      LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
      /**
      This eventually calls the InputFormat's getSplits method
      **/
      int maps = writeSplits(job, submitJobDir);
      conf.setInt(MRJobConfig.NUM_MAPS, maps);
      LOG.info("number of splits:" + maps);
      String queue = conf.get(MRJobConfig.QUEUE_NAME,
          JobConf.DEFAULT_QUEUE_NAME);
      AccessControlList acl = submitClient.getQueueAdmins(queue);
      conf.set(toFullPropertyName(queue,
          QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());

      TokenCache.cleanUpTokenReferral(conf);

      if (conf.getBoolean(
          MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,
          MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {
        ArrayList<String> trackingIds = new ArrayList<String>();
        for (Token<? extends TokenIdentifier> t :
            job.getCredentials().getAllTokens()) {
          trackingIds.add(t.decodeIdentifier().getTrackingId());
        }
        conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,
            trackingIds.toArray(new String[trackingIds.size()]));
      }
      ReservationId reservationId = job.getReservationId();
      if (reservationId != null) {
        conf.set(MRJobConfig.RESERVATION_ID, reservationId.toString());
      }
      writeConf(conf, submitJobFile);


      printTokens(jobId, job.getCredentials());
      status = submitClient.submitJob(
          jobId, submitJobDir.toString(), job.getCredentials());
      if (status != null) {
        return status;
      } else {
        throw new IOException("Could not launch job");
      }
    } finally {
      if (status == null) {
        LOG.info("Cleaning up the staging area " + submitJobDir);
        if (jtFs != null && submitJobDir != null)
          jtFs.delete(submitJobDir, true);

      }
    }
  }
 ===========================================
  private <T extends InputSplit>
  int writeNewSplits(JobContext job, Path jobSubmitDir) throws IOException,
      InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    InputFormat<?, ?> input =
      ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    /**
    Calls getSplits on whatever InputFormat class was configured last
    **/
    List<InputSplit> splits = input.getSplits(job);
    T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);

    Arrays.sort(array, new SplitComparator());
    JobSplitWriter.createSplitFiles(jobSubmitDir, conf, 
        jobSubmitDir.getFileSystem(conf), array);
    return array.length;
  }
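
Whatever class the last setInputFormatClass call recorded is the one instantiated here. In the problem job that is TextInputFormat, whose getSplits goes through FileInputFormat.listStatus; a trimmed sketch of the relevant check from the Hadoop 2.x FileInputFormat:

  protected List<FileStatus> listStatus(JobContext job) throws IOException {
    // getInputPaths reads "mapreduce.input.fileinputformat.inputdir",
    // which MultipleInputs never populated in this job.
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
      throw new IOException("No input paths specified in job");
    }
    // ... normal filtering and listing of the input directories follows ...
  }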

So if the InputFormatClass that ends up configured is TextInputFormat, its getSplits method (inherited from FileInputFormat) obtains the concrete input paths from the property named mapreduce.input.fileinputformat.inputdir. MultipleInputs never sets that property, so it is empty and the "No input paths specified in job" exception is thrown.
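
The fix is therefore to let MultipleInputs own the input side of the job and simply delete the setInputFormatClass call (setOutputFormatClass is unrelated and can stay). A minimal corrected fragment of run(), using the same paths and classes as above:

        MultipleInputs.addInputPath(job, path1, TextInputFormat.class,
                OrderInfoMapper.class);
        MultipleInputs.addInputPath(job, path2, TextInputFormat.class,
                OrderGoodsMapper.class);
        FileOutputFormat.setOutputPath(job, outputPath);

        job.setJobName("LeftOutJoinMR");
        job.setJarByClass(ReduceSideJoin.class);
        job.setReducerClass(LeftOutJoinReducer.class);

        // Do NOT call job.setInputFormatClass(...) here: it would overwrite the
        // DelegatingInputFormat that MultipleInputs.addInputPath just configured.
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(OrderGoodsInfoWritble.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(OrderGoodsInfoWritble.class);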
