This post is part of the "Hadoop first reading" series, notes collected from everyday Hadoop practice and from a rough reading of the Hadoop source code. Each post has two parts, a question and an answer: the question is an exception or point of confusion hit in practice, and the answer is a rough reading of the relevant source (explained through inline comments).
Problem code (a reduce-side join driver using multiple input paths, which fails with "No input paths specified in job"):
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class ReduceSideJoin extends Configured implements Tool {
private static final Logger logger = LoggerFactory
.getLogger(ReduceSideJoin.class);
public static class LeftOutJoinReducer extends
Reducer<IntWritable, OrderGoodsInfoWritble, IntWritable, OrderGoodsInfoWritble> {
private ArrayList<OrderGoodsInfoWritble> leftTable = new ArrayList<OrderGoodsInfoWritble>();
private ArrayList<OrderGoodsInfoWritble> rightTable = new ArrayList<OrderGoodsInfoWritble>();
private Text secondPar = null;
private Text output = new Text();
@Override
protected void reduce(IntWritable key, Iterable<OrderGoodsInfoWritble> value,
Context context) throws IOException, InterruptedException {
leftTable.clear();
rightTable.clear();
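// Note: Hadoop reuses the value object handed out by this iterator, so in a real
// job each element added to leftTable/rightTable should be a copy of cv (e.g. via a
// copy constructor on OrderGoodsInfoWritble), not the reused instance itself.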
for (OrderGoodsInfoWritble cv : value) {
String mRecordFlog = cv.getmRecordflag().toString().trim();
if ("orderInfo".equals(mRecordFlog)) {
leftTable.add(cv);
}else {
rightTable.add(cv);
}
}
for (OrderGoodsInfoWritble leftPart : leftTable) {
for (OrderGoodsInfoWritble rightPart : rightTable) {
if ((leftPart.getOrderId().toString()).equals((rightPart.getOrderId().toString()))) {
leftPart.setGoodsSn(rightPart.getGoodsSn());
leftPart.setGoodsName(rightPart.getGoodsName());
leftPart.setmRecordflag(new Text("orderGoodsInfo"));
context.write(key, leftPart);
}
}
}
}
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
Job job = Job.getInstance(conf);
Path path1 = new Path(args[0]);
Path path2 = new Path(args[1]);
FileSystem fs = FileSystem.get(URI.create("hdfs://***:9000/venus/ordergoodsinfo"), conf);
Path outputPath = new Path("hdfs://***:9000/venus/ordergoodsinfo");
fs.deleteOnExit(outputPath);
fs.close();
MultipleInputs.addInputPath(job, path1, TextInputFormat.class,
OrderInfoMapper.class);
MultipleInputs.addInputPath(job, path2, TextInputFormat.class,
OrderGoodsMapper.class);
FileOutputFormat.setOutputPath(job, outputPath);
job.setJobName("LeftOutJoinMR");
job.setJarByClass(ReduceSideJoin.class);
job.setReducerClass(LeftOutJoinReducer.class);
/** Problem code: MultipleInputs.addInputPath was already called above, and setInputFormatClass is called here as well, which triggers the "No input paths specified in job" exception **/
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(OrderGoodsInfoWritble.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(OrderGoodsInfoWritble.class);
job.waitForCompletion(true);
return job.isSuccessful() ? 0 : 1;
}
public static void main(String[] args) throws Exception {
Tool rdf = new ReduceSideJoin();
int returnCode = ToolRunner.run(rdf, args);
System.exit(returnCode);
}
}
Question:
1. Multiple input paths were already set via MultipleInputs.addInputPath, so why does the job still fail with the "No input paths specified in job" exception?
Answer:
Because the later setInputFormatClass call overrides what MultipleInputs set up. The details are in the source walkthrough below.
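Before the source walkthrough, the short version of the fix: when MultipleInputs.addInputPath is used, do not call job.setInputFormatClass afterwards. A minimal sketch of the corrected run(), with everything else unchanged:
MultipleInputs.addInputPath(job, path1, TextInputFormat.class, OrderInfoMapper.class);
MultipleInputs.addInputPath(job, path2, TextInputFormat.class, OrderGoodsMapper.class);
FileOutputFormat.setOutputPath(job, outputPath);
// job.setInputFormatClass(TextInputFormat.class); // removed: MultipleInputs already installed DelegatingInputFormat
job.setOutputFormatClass(TextOutputFormat.class);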
1) The implementation of job.setInputFormatClass(TextInputFormat.class):
public void setInputFormatClass(Class<? extends InputFormat> cls
) throws IllegalStateException {
ensureState(JobState.DEFINE);
/**
The InputFormat class attribute is set to the TextInputFormat passed in
**/
conf.setClass(INPUT_FORMAT_CLASS_ATTR, cls,
InputFormat.class);
}
2) The implementation of MultipleInputs.addInputPath:
public static void addInputPath(Job job, Path path,
Class<? extends InputFormat> inputFormatClass,
Class<? extends Mapper> mapperClass) {
/**
Delegates to the other addInputPath overload below
**/
addInputPath(job, path, inputFormatClass);
Configuration conf = job.getConfiguration();
String mapperMapping = path.toString() + ";" + mapperClass.getName();
String mappers = conf.get(DIR_MAPPERS);
conf.set(DIR_MAPPERS, mappers == null ? mapperMapping
: mappers + "," + mapperMapping);
job.setMapperClass(DelegatingMapper.class);
}
===========================================
public static void addInputPath(Job job, Path path,
Class<? extends InputFormat> inputFormatClass) {
String inputFormatMapping = path.toString() + ";"
+ inputFormatClass.getName();
Configuration conf = job.getConfiguration();
String inputFormats = conf.get(DIR_FORMATS);
conf.set(DIR_FORMATS,
inputFormats == null ? inputFormatMapping : inputFormats + ","
+ inputFormatMapping);
/**
The InputFormat class ends up set to DelegatingInputFormat
**/
job.setInputFormatClass(DelegatingInputFormat.class);
}
When the job is submitted, JobSubmitter.submitJobInternal eventually calls the InputFormat's getSplits method, as follows:
JobStatus submitJobInternal(Job job, Cluster cluster)
throws ClassNotFoundException, InterruptedException, IOException {
checkSpecs(job);
Configuration conf = job.getConfiguration();
addMRFrameworkToDistributedCache(conf);
Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
//configure the command line options correctly on the submitting dfs
InetAddress ip = InetAddress.getLocalHost();
if (ip != null) {
submitHostAddress = ip.getHostAddress();
submitHostName = ip.getHostName();
conf.set(MRJobConfig.JOB_SUBMITHOST,submitHostName);
conf.set(MRJobConfig.JOB_SUBMITHOSTADDR,submitHostAddress);
}
JobID jobId = submitClient.getNewJobID();
job.setJobID(jobId);
Path submitJobDir = new Path(jobStagingArea, jobId.toString());
JobStatus status = null;
try {
conf.set(MRJobConfig.USER_NAME,
UserGroupInformation.getCurrentUser().getShortUserName());
conf.set("hadoop.http.filter.initializers",
"org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, submitJobDir.toString());
LOG.debug("Configuring job " + jobId + " with " + submitJobDir
+ " as the submit dir");
TokenCache.obtainTokensForNamenodes(job.getCredentials(),
new Path[] { submitJobDir }, conf);
populateTokenCache(conf, job.getCredentials());
if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
KeyGenerator keyGen;
try {
keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);
keyGen.init(SHUFFLE_KEY_LENGTH);
} catch (NoSuchAlgorithmException e) {
throw new IOException("Error generating shuffle secret key", e);
}
SecretKey shuffleKey = keyGen.generateKey();
TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),
job.getCredentials());
}
if (CryptoUtils.isEncryptedSpillEnabled(conf)) {
conf.setInt(MRJobConfig.MR_AM_MAX_ATTEMPTS, 1);
LOG.warn("Max job attempts set to 1 since encrypted intermediate" +
"data spill is enabled");
}
copyAndConfigureFiles(job, submitJobDir);
Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);
LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
/**
This is where the InputFormat's getSplits method ends up being called
**/
int maps = writeSplits(job, submitJobDir);
conf.setInt(MRJobConfig.NUM_MAPS, maps);
LOG.info("number of splits:" + maps);
String queue = conf.get(MRJobConfig.QUEUE_NAME,
JobConf.DEFAULT_QUEUE_NAME);
AccessControlList acl = submitClient.getQueueAdmins(queue);
conf.set(toFullPropertyName(queue,
QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());
TokenCache.cleanUpTokenReferral(conf);
if (conf.getBoolean(
MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,
MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {
ArrayList<String> trackingIds = new ArrayList<String>();
for (Token<? extends TokenIdentifier> t :
job.getCredentials().getAllTokens()) {
trackingIds.add(t.decodeIdentifier().getTrackingId());
}
conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,
trackingIds.toArray(new String[trackingIds.size()]));
}
ReservationId reservationId = job.getReservationId();
if (reservationId != null) {
conf.set(MRJobConfig.RESERVATION_ID, reservationId.toString());
}
writeConf(conf, submitJobFile);
printTokens(jobId, job.getCredentials());
status = submitClient.submitJob(
jobId, submitJobDir.toString(), job.getCredentials());
if (status != null) {
return status;
} else {
throw new IOException("Could not launch job");
}
} finally {
if (status == null) {
LOG.info("Cleaning up the staging area " + submitJobDir);
if (jtFs != null && submitJobDir != null)
jtFs.delete(submitJobDir, true);
}
}
}
private <T extends InputSplit>
int writeNewSplits(JobContext job, Path jobSubmitDir) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = job.getConfiguration();
InputFormat<?, ?> input =
ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
/**
The InputFormat's getSplits is invoked here
**/
List<InputSplit> splits = input.getSplits(job);
T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
Arrays.sort(array, new SplitComparator());
JobSplitWriter.createSplitFiles(jobSubmitDir, conf,
jobSubmitDir.getFileSystem(conf), array);
return array.length;
}
If the InputFormat class finally in effect is TextInputFormat, then TextInputFormat.getSplits (inherited from FileInputFormat) looks up the input paths through the property named mapreduce.input.fileinputformat.inputdir. MultipleInputs never sets that property (it records its paths only under its own DIR_FORMATS/DIR_MAPPERS keys), so the property is empty and the
"No input paths specified in job" exception is thrown.
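For reference, the check that produces the message sits in FileInputFormat, the parent of TextInputFormat. A simplified sketch of the Hadoop 2.x code path (details vary slightly between versions):
// getSplits() -> listStatus() -> getInputPaths() reads mapreduce.input.fileinputformat.inputdir
protected List<FileStatus> listStatus(JobContext job) throws IOException {
Path[] dirs = getInputPaths(job); // empty here, because only MultipleInputs' own properties were set
if (dirs.length == 0) {
throw new IOException("No input paths specified in job");
}
// ... otherwise list the files under each input path ...
}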