# Example job submission: run xx.jar through the hadoop launcher script.
# The -D options set job properties (job name, number of reduce tasks);
# -files / -libjars / -archives name extra resources to configure for the job;
# -input / -output are the job's input and output paths.
$HADOOP_HOME/bin/hadoop jar xx.jar \
-D mapred.job.name="wordCount" \
-D mapred.reduce.tasks=5 \
-files=resources1.txt,resources2.txt \
-libjars=depend.jar \
-archives=dictionary.zip \
-input /test/input \
-output /test/output
然后,我们再来看看$HADOOP_HOME/bin/hadoop脚本对作业提交jar命令的处理:它调用了org.apache.hadoop.util.RunJar类。
在RunJar类中,先创建相应的临时目录,再通过unJar(File jarFile, File toDir)方法将jar解压到该目录,然后将运行参数传递给MapReduce程序的main方法运行。源码定位到org.apache.hadoop.util.RunJar类的main方法:
/**
 * Run a Hadoop job jar.  If the main class is not in the jar's manifest,
 * then it must be provided on the command line.
 *
 * <p>Steps: resolve the main class (manifest {@code Main-Class} first,
 * otherwise the next command-line argument), unpack the jar into a fresh
 * working directory under {@code hadoop.tmp.dir}, build a class loader over
 * the unpacked contents, and invoke the main class's {@code main} with the
 * remaining arguments.
 *
 * @param args {@code jarFile [mainClass] args...}
 * @throws Throwable whatever the invoked {@code main} throws (unwrapped from
 *         the reflective {@link InvocationTargetException}), or any failure
 *         while opening/unpacking the jar
 */
public static void main(String[] args) throws Throwable {
  String usage = "RunJar jarFile [mainClass] args...";

  if (args.length < 1) {
    System.err.println(usage);
    System.exit(-1);
  }

  int firstArg = 0;
  String fileName = args[firstArg++];
  File file = new File(fileName);
  String mainClassName = null;

  JarFile jarFile;
  try {
    jarFile = new JarFile(fileName);
  } catch (IOException io) {
    throw new IOException("Error opening job jar: " + fileName)
      .initCause(io);
  }

  Manifest manifest = jarFile.getManifest();
  if (manifest != null) {
    mainClassName = manifest.getMainAttributes().getValue("Main-Class");
  }
  jarFile.close();

  // No Main-Class in the manifest: the class name must be the next argument.
  if (mainClassName == null) {
    if (args.length < 2) {
      System.err.println(usage);
      System.exit(-1);
    }
    mainClassName = args[firstArg++];
  }

  // Directory preparation: normalise the class name and create the working dirs.
  mainClassName = mainClassName.replaceAll("/", ".");

  File tmpDir = new File(new Configuration().get("hadoop.tmp.dir"));
  tmpDir.mkdirs();
  if (!tmpDir.isDirectory()) {
    System.err.println("Mkdirs failed to create " + tmpDir);
    System.exit(-1);
  }

  // createTempFile is used only to obtain a unique name; the file is
  // deleted and a directory with the same name is created in its place.
  final File workDir = File.createTempFile("hadoop-unjar", "", tmpDir);
  workDir.delete();
  workDir.mkdirs();
  if (!workDir.isDirectory()) {
    System.err.println("Mkdirs failed to create " + workDir);
    System.exit(-1);
  }

  // Remove the unpacked jar contents when the JVM exits.
  Runtime.getRuntime().addShutdownHook(new Thread() {
      public void run() {
        try {
          FileUtil.fullyDelete(workDir);
        } catch (IOException e) {
          // cleanup failure is ignored here
        }
      }
    });

  // Unpack the jar.
  unJar(file, workDir);

  // Class path: the unpacked root, the jar itself, classes/, and every file in lib/.
  ArrayList<URL> classPath = new ArrayList<URL>();
  classPath.add(new File(workDir + "/").toURL());
  classPath.add(file.toURL());
  classPath.add(new File(workDir, "classes/").toURL());
  File[] libs = new File(workDir, "lib").listFiles();
  if (libs != null) {
    for (int i = 0; i < libs.length; i++) {
      classPath.add(libs[i].toURL());
    }
  }

  ClassLoader loader =
    new URLClassLoader(classPath.toArray(new URL[0]));

  Thread.currentThread().setContextClassLoader(loader);
  Class<?> mainClass = Class.forName(mainClassName, true, loader);
  Method main = mainClass.getMethod("main", new Class[] {
    Array.newInstance(String.class, 0).getClass()
  });

  // Arguments after the jar (and optional main class) go to the job's main.
  String[] newArgs = Arrays.asList(args)
    .subList(firstArg, args.length).toArray(new String[0]);
  try {
    main.invoke(null, new Object[] { newArgs });
  } catch (InvocationTargetException e) {
    // Surface the job's own exception rather than the reflection wrapper.
    throw e.getTargetException();
  }
}
/** * Internal method for submitting jobs to the system. * @param job the configuration to submit * @return a proxy object for the running job * @throws FileNotFoundException * @throws ClassNotFoundException * @throws InterruptedException * @throws IOException */ public RunningJob submitJobInternal(final JobConf job ) throws FileNotFoundException, ClassNotFoundException, InterruptedException, IOException { /* * configure the command line options correctly on the submitting dfs */ return ugi.doAs(new PrivilegedExceptionAction() { public RunningJob run() throws FileNotFoundException, ClassNotFoundException, InterruptedException, IOException{ JobConf jobCopy = job; Path jobStagingArea = JobSubmissionFiles.getStagingDir(JobClient.this, jobCopy); JobID jobId = jobSubmitClient.getNewJobId(); Path submitJobDir = new Path(jobStagingArea, jobId.toString()); jobCopy.set("mapreduce.job.dir", submitJobDir.toString()); JobStatus status = null; try { populateTokenCache(jobCopy, jobCopy.getCredentials()); copyAndConfigureFiles(jobCopy, submitJobDir); // get delegation token for the dir TokenCache.obtainTokensForNamenodes(jobCopy.getCredentials(), new Path [] {submitJobDir}, jobCopy); Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir); int reduces = jobCopy.getNumReduceTasks(); InetAddress ip = InetAddress.getLocalHost(); if (ip != null) { job.setJobSubmitHostAddress(ip.getHostAddress()); job.setJobSubmitHostName(ip.getHostName()); } JobContext context = new JobContext(jobCopy, jobId); // Check the output specification if (reduces == 0 ? 
jobCopy.getUseNewMapper() : jobCopy.getUseNewReducer()) { org.apache.hadoop.mapreduce.OutputFormat,?> output = ReflectionUtils.newInstance(context.getOutputFormatClass(), jobCopy); output.checkOutputSpecs(context); } else { jobCopy.getOutputFormat().checkOutputSpecs(fs, jobCopy); } jobCopy = (JobConf)context.getConfiguration(); // Create the splits for the job FileSystem fs = submitJobDir.getFileSystem(jobCopy); LOG.debug("Creating splits at " + fs.makeQualified(submitJobDir)); int maps = writeSplits(context, submitJobDir); jobCopy.setNumMapTasks(maps); // write "queue admins of the queue to which job is being submitted" // to job file. String queue = jobCopy.getQueueName(); AccessControlList acl = jobSubmitClient.getQueueAdmins(queue); jobCopy.set(QueueManager.toFullPropertyName(queue, QueueACL.ADMINISTER_JOBS.getAclName()), acl.getACLString()); // Write job file to JobTracker's fs FSDataOutputStream out = FileSystem.create(fs, submitJobFile, new FsPermission(JobSubmissionFiles.JOB_FILE_PERMISSION)); try { jobCopy.writeXml(out); } finally { out.close(); } // // Now, actually submit the job (using the submit name) // printTokens(jobId, jobCopy.getCredentials()); status = jobSubmitClient.submitJob( jobId, submitJobDir.toString(), jobCopy.getCredentials()); JobProfile prof = jobSubmitClient.getJobProfile(jobId); if (status != null && prof != null) { return new NetworkedJob(status, prof, jobSubmitClient); } else { throw new IOException("Could not launch job"); } } finally { if (status == null) { LOG.info("Cleaning up the staging area " + submitJobDir); if (fs != null && submitJobDir != null) fs.delete(submitJobDir, true); } } } }); }
逐行看,首先看第25行(Path jobStagingArea = JobSubmissionFiles.getStagingDir(JobClient.this, jobCopy);),JobClient会向JobTracker要到一个StagingAreaDir目录,其主要用途是作为作业文件上传到HDFS的目录,管理员可以自行配置,具体配置可看JobTracker的getStagingAreaDirInternal(String user)方法:
private String getStagingAreaDirInternal(String user) throws IOException { final Path stagingRootDir = new Path(conf.get("mapreduce.jobtracker.staging.root.dir", "/tmp/hadoop/mapred/staging"));//默认的StagingAreaDir配置项 final FileSystem fs = stagingRootDir.getFileSystem(conf); return fs.makeQualified(new Path(stagingRootDir, user+"/.staging")).toString(); }
接着再看第33行的copyAndConfigureFiles(jobCopy, submitJobDir)方法,该方法主要是将作业文件上传到HDFS,然后通过DistributedCache放到Cache中:
/** * configure the jobconf of the user with the command line options of * -libjars, -files, -archives * @param job the JobConf * @param submitJobDir * @throws IOException */ private void copyAndConfigureFiles(JobConf job, Path jobSubmitDir) throws IOException, InterruptedException { short replication = (short)job.getInt("mapred.submit.replication", 10);//这里默认将作业文件的副本数调整为10的 copyAndConfigureFiles(job, jobSubmitDir, replication); // Set the working directory if (job.getWorkingDirectory() == null) { job.setWorkingDirectory(fs.getWorkingDirectory()); } }
注意一点:作业文件在HDFS上的副本数默认为10(由mapred.submit.replication配置项控制)。进入copyAndConfigureFiles(job, jobSubmitDir, replication)看一小段代码:
. . FileSystem.mkdirs(fs, submitJobDir, mapredSysPerms); Path filesDir = JobSubmissionFiles.getJobDistCacheFiles(submitJobDir); Path archivesDir = JobSubmissionFiles.getJobDistCacheArchives(submitJobDir); Path libjarsDir = JobSubmissionFiles.getJobDistCacheLibjars(submitJobDir); // add all the command line files/ jars and archive // first copy them to jobtrackers filesystem if (files != null) { FileSystem.mkdirs(fs, filesDir, mapredSysPerms); String[] fileArr = files.split(","); for (String tmpFile: fileArr) { URI tmpURI; try { tmpURI = new URI(tmpFile); } catch (URISyntaxException e) { throw new IllegalArgumentException(e); } Path tmp = new Path(tmpURI); Path newPath = copyRemoteFiles(fs,filesDir, tmp, job, replication);// upload the job's local file to HDFS try { URI pathURI = getPathURI(newPath, tmpURI.getFragment()); DistributedCache.addCacheFile(pathURI, job);// add the job file to the cache via the DistributedCache utility } catch(URISyntaxException ue) { //should not throw a uri exception throw new IOException("Failed to create uri for " + tmpFile, ue); } DistributedCache.createSymlink(job); } } if (libjars != null) { FileSystem.mkdirs(fs, libjarsDir, mapredSysPerms); . .
/** * This represents the meta information about the task split. * The main fields are * - start offset in actual split * - data length that will be processed in this split * - hosts on which this split is local */ public static class SplitMetaInfo implements Writable { private long startOffset;//该InputSplit元信息在job.split文件中的偏移量 private long inputDataLength;//该InputSplit数据长度 private String[] locations;//该InputSplit的host列表 ... }
/** * This represents the meta information about the task split that the * JobTracker creates */ public static class TaskSplitMetaInfo { private TaskSplitIndex splitIndex;//Split元信息在job.split文件中的位置 private long inputDataLength;//InputSplit的长度 private String[] locations;//InputSplit的hosts列表 .... }
/** * This represents the meta information about the task split that the * task gets */ public static class TaskSplitIndex { private String splitLocation; private long startOffset; ..... }
