Version: Apache Hadoop 2.7.2
Environment: Windows 10
Eclipse Debug: https://blog.csdn.net/qq_40794973/article/details/87876772
This post focuses on what happens after you finish writing an MR job and submit it to the cluster: which operations that process actually performs. We write a simple WordCount and step through it in a debugger to see how the program executes.
1. Create a Maven project
package com.atguigu.mapreduce.wordcountDemo.map;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
//Map phase
//KEYIN: the type of the input key; here it is the byte offset of the line (the official example uses Object)
//VALUEIN: the type of the input value (one line of text)
//KEYOUT: the type of the output key, e.g. (atguigu,1) (ss,1)
//VALUEOUT: the type of the output value
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
Text k = new Text();
IntWritable v = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {
// 1. Get one line
String line = value.toString();// one line is read per map() call
System.out.println("key: "+key+"\t\t\t"+"value: "+line);
// 2. Split the line into words
String[] words = line.split(" +");
// 3. Emit (word, 1) pairs
for (String word : words) {
k.set(word);
context.write(k, v);// written to the map output buffer
}
}
}
package com.atguigu.mapreduce.wordcountDemo.reduce;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
int sum;
IntWritable v = new IntWritable();
/**
* All values that share the same key are grouped into one Iterable (values)
*/
@Override
protected void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
// 1. Accumulate the sum
sum = 0;
for (IntWritable count : values) {
sum += count.get();//System.out.println("count.get(): "+count.get());
}
// 2. Emit the total
v.set(sum);//System.out.println("output: "+key+" "+sum);
context.write(key,v);
}
}
package com.atguigu.mapreduce.wordcountDemo;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Job.JobState;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.atguigu.mapreduce.wordcountDemo.map.WordcountMapper;
import com.atguigu.mapreduce.wordcountDemo.reduce.WordcountReducer;
public class WordcountDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
// Adjust the input and output paths to the actual paths on your own machine
args = new String[] { "F:/Test/input.txt", "F:/Test/output" };
// 1. Get the configuration information and set up the job
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);// get a Job instance
// 2. Set the jar load path
job.setJarByClass(WordcountDriver.class);// locates the jar via reflection; not strictly needed for a local run
// 3. Set the Mapper and Reducer classes
job.setMapperClass(WordcountMapper.class);
job.setReducerClass(WordcountReducer.class);
// 4. Set the map output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 5. Set the final output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//---------------------------------------------------
// If no InputFormat is set, TextInputFormat.class is used by default
//job.setInputFormatClass(CombineTextInputFormat.class);
// Set the maximum virtual-storage split size to 4 MB
//CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);// 4 MB
//---------------------------------------------------
// 6. Set the input and output paths
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// 7. Submit
boolean result = job.waitForCompletion(true);
//job.submit() only submits and returns immediately; job.waitForCompletion(false) submits and waits but does not print progress
System.exit(result ? 0 : 1);// exit code 0 on success, 1 on failure; this line is optional
}
}
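One optional tweak, not in the original driver: since WordcountReducer only sums IntWritable counts, it can also be registered as a map-side combiner. This is a sketch of my own; it would make the "Combine input/output records" counters shown later non-zero, while the final result stays the same.
// Optional, my own addition: reuse the reducer as a combiner (add before job.waitForCompletion)
job.setCombinerClass(WordcountReducer.class);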
pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.atguigu</groupId>
    <artifactId>mr-Demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>junit</groupId><artifactId>junit</artifactId><version>RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId><artifactId>log4j-core</artifactId><version>2.8.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId><artifactId>hadoop-common</artifactId><version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId><artifactId>hadoop-client</artifactId><version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId><artifactId>hadoop-hdfs</artifactId><version>2.7.2</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source><target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs><descriptorRef>jar-with-dependencies</descriptorRef></descriptorRefs>
                    <archive><manifest><mainClass>com.atguigu.mapreduce.wordcountDemo.WordcountDriver</mainClass></manifest></archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals><goal>single</goal></goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
My input path here is F:\Test\input.txt, with the following contents:
hehe hehe
xixi xixi
haha haha
jiao
Xxx
xue
hadoop
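For reference, with the mapper and reducer above this input should produce a single result file part-r-00000 under F:/Test/output that looks roughly like this (keys sorted, tab-separated):
Xxx	1
hadoop	1
haha	2
hehe	2
jiao	1
xixi	2
xue	1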
2. With the project created, start debugging
First, set a breakpoint in the WordcountDriver class on the job.waitForCompletion(true) line, then step into the waitForCompletion(true) method.
WordcountDriver :
-> job.waitForCompletion(true); (set a breakpoint here)
Job:
/**
* Submit the job to the cluster and wait for it to finish.
* @param verbose print the progress to the user
* @return true if the job succeeded
* @throws IOException thrown if the communication with the
*         JobTracker is lost
*/
public boolean waitForCompletion(boolean verbose ) throws IOException, InterruptedException, ClassNotFoundException {
if (state == JobState.DEFINE) {// is the state still DEFINE? JobState.DEFINE means the job has not been submitted yet, so it may proceed; later, when the job is submitted, the state is set to JobState.RUNNING
submit();// submit; step into
}// after submission, some completion information is printed
if (verbose) {
monitorAndPrintJob();
} else {
// get the completion poll interval from the client.
int completionPollIntervalMillis =
Job.getCompletionPollInterval(cluster.getConf());
while (!isComplete()) {
try {
Thread.sleep(completionPollIntervalMillis);
} catch (InterruptedException ie) {
}
}
}
return isSuccessful();
}
Inside the method, it first checks that state == JobState.DEFINE before continuing. submit() is the code that actually submits the job; the code below submit() is only responsible for monitoring the job and printing completion information. If the driver instead ends with job.waitForCompletion(false) or job.submit(), this completion information is not printed.
The printed information looks like this:
File System Counters
FILE: Number of bytes read=712
FILE: Number of bytes written=562435
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
Map-Reduce Framework
Map input records=7
Map output records=10
Map output bytes=95
Map output materialized bytes=121
Input split bytes=115
Combine input records=0
Combine output records=0
Reduce input groups=7
Reduce shuffle bytes=121
Reduce input records=10
Reduce output records=7
Spilled Records=20
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=0
Total committed heap usage (bytes)=514850816
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=66
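As noted above, job.submit() returns immediately and prints none of this. If you submit that way and want to watch the job yourself, the public Job API is enough; this is only a sketch of mine that could replace step 7 in the driver above (the surrounding main method already declares the checked exceptions):
// My own illustration, not code from the Hadoop source:
job.submit();                               // submit and return immediately
while (!job.isComplete()) {                 // poll until the job finishes
System.out.printf("map %.0f%%  reduce %.0f%%%n", job.mapProgress() * 100, job.reduceProgress() * 100);
Thread.sleep(1000);
}
System.exit(job.isSuccessful() ? 0 : 1);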
Step into the submit() method.
Job:
/**
* Submit the job to the cluster and return immediately.
* @throws IOException
*/
public void submit() throws IOException, InterruptedException, ClassNotFoundException {
ensureState(JobState.DEFINE);// check the job state
setUseNewAPI();// switch old API settings to the new API where possible
connect();// establish the connection
final JobSubmitter submitter = getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
status = ugi.doAs(new PrivilegedExceptionAction<JobStatus>() {
public JobStatus run() throws IOException, InterruptedException, ClassNotFoundException {
return submitter.submitJobInternal(Job.this, cluster);// does the detailed submission work (set a breakpoint here)
}
});
state = JobState.RUNNING;
LOG.info("The url to track the job: " + getTrackingURL());
}
-------------------------------------------ensureState(JobState.DEFINE)------------- start ------------------------------
Step into the ensureState(JobState.DEFINE) method. ensureState() simply checks that the job is in the expected JobState; if it is not, an IllegalStateException is thrown and the method exits.
Job:
private void ensureState(JobState state) throws IllegalStateException {
if (state != this.state) {
throw new IllegalStateException("Job in state "+ this.state +
" instead of " + state);
}
if (state == JobState.RUNNING && cluster == null) {
throw new IllegalStateException("Job in state " + this.state
+ ", but it isn't attached to any job tracker!");
}
}
-------------------------------------------ensureState(JobState.DEFINE)------------ end ------------------------------
-------------------------------------------setUseNewAPI()------------- start ------------------------------
Step Into (F5) the setUseNewAPI() method. Its main purpose is backward compatibility: it "translates" the old mapred API settings into the new mapreduce API settings.
Job:
// handles the old API
// "translates" the old API into the new API for compatibility
/**
* Default to the new APIs unless they are explicitly set or the old mapper or
* reduce attributes are used.
* @throws IOException if the configuration is inconsistant
*/
private void setUseNewAPI() throws IOException {
int numReduces = conf.getNumReduceTasks();
String oldMapperClass = "mapred.mapper.class";
String oldReduceClass = "mapred.reducer.class";
conf.setBooleanIfUnset("mapred.mapper.new-api",
conf.get(oldMapperClass) == null);
if (conf.getUseNewMapper()) {
String mode = "new map API";
ensureNotSet("mapred.input.format.class", mode);
ensureNotSet(oldMapperClass, mode);
if (numReduces != 0) {
ensureNotSet("mapred.partitioner.class", mode);
} else {
ensureNotSet("mapred.output.format.class", mode);
}
} else {
String mode = "map compatability";
ensureNotSet(INPUT_FORMAT_CLASS_ATTR, mode);
ensureNotSet(MAP_CLASS_ATTR, mode);
if (numReduces != 0) {
ensureNotSet(PARTITIONER_CLASS_ATTR, mode);
} else {
ensureNotSet(OUTPUT_FORMAT_CLASS_ATTR, mode);
}
}
if (numReduces != 0) {
conf.setBooleanIfUnset("mapred.reducer.new-api",
conf.get(oldReduceClass) == null);
if (conf.getUseNewReducer()) {
String mode = "new reduce API";
ensureNotSet("mapred.output.format.class", mode);
ensureNotSet(oldReduceClass, mode);
} else {
String mode = "reduce compatability";
ensureNotSet(OUTPUT_FORMAT_CLASS_ATTR, mode);
ensureNotSet(REDUCE_CLASS_ATTR, mode);
}
}
}
-------------------------------------------setUseNewAPI()------------- end ------------------------------
Step Return (F7) out of setUseNewAPI(), then Step Into (F5) the connect() method. connect() establishes the connection; depending on the runtime environment it creates a different object: a LocalJobRunner locally, a YARNRunner on a cluster.
-----------------------------------------------------------------connect() method start-----------------------------------------------------------------
// establish the connection
// a different object is created depending on the runtime environment: LocalJobRunner locally, YARNRunner on a cluster
private synchronized void connect() throws IOException, InterruptedException, ClassNotFoundException {
if (cluster == null) {// the cluster object has not been created yet
cluster = ugi.doAs(new PrivilegedExceptionAction<Cluster>() {
public Cluster run() throws IOException, InterruptedException, ClassNotFoundException {
return new Cluster(getConfiguration());
}
});
}
}
connect() first checks whether the cluster object has already been created; if cluster is null, it creates one. Since cluster is still null at this point, the if branch is executed. Set a breakpoint at ugi.doAs(new PrivilegedExceptionAction...), step into it, and keep stepping until you reach the constructor public Cluster(InetSocketAddress jobTrackAddr, Configuration conf); inside it, set a breakpoint on the initialize(jobTrackAddr, conf) call.
Cluster:
public Cluster(InetSocketAddress jobTrackAddr, Configuration conf) throws IOException { (set a breakpoint here)
this.conf = conf;
this.ugi = UserGroupInformation.getCurrentUser();
initialize(jobTrackAddr, conf);// conf holds the configuration from Hadoop's etc/hadoop/ files (e.g. yarn-site.xml); step into (set a breakpoint here)
}
The conf in this constructor is essentially the configuration information from the files under Hadoop's etc/hadoop directory (e.g. yarn-site.xml).
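As a quick side check (my own sketch, not part of the submission flow), you can print what a fresh Configuration object picks up in this project; with no *-site.xml files on the classpath it falls back to the bundled defaults:
// My own illustration; add temporarily to WordcountDriver.main if you want to try it
Configuration conf = new Configuration();      // loads core-default.xml plus any core-site.xml on the classpath
System.out.println(conf.get("fs.defaultFS"));                        // typically file:/// for a local run
System.out.println(conf.get("mapreduce.framework.name", "local"));   // "local" selects LocalJobRunner; "yarn" selects the YARN runner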
Step into the initialize(jobTrackAddr, conf) method.
Cluster:
// The main job of this method is to decide whether you are connecting to YARN or running locally; a different object is created for each environment
private void initialize(InetSocketAddress jobTrackAddr, Configuration conf) throws IOException {
synchronized (frameworkLoader) { // take the lock (set a breakpoint here)
for (ClientProtocolProvider provider : frameworkLoader) {
LOG.debug("Trying ClientProtocolProvider : "+ provider.getClass().getName());
ClientProtocol clientProtocol = null; // the client protocol is still null at this point
try {
if (jobTrackAddr == null) {
clientProtocol = provider.create(conf);// create a client protocol; which one depends on where the job runs: LocalJobRunner for a local run, the YARN one when running on a Hadoop cluster
} else {
clientProtocol = provider.create(jobTrackAddr, conf);
}
if (clientProtocol != null) {
clientProtocolProvider = provider;
client = clientProtocol;
LOG.debug("Picked " + provider.getClass().getName() + " as the ClientProtocolProvider");
break;
}
else {
LOG.debug("Cannot pick " + provider.getClass().getName() + " as the ClientProtocolProvider - returned null protocol");
}
}
catch (Exception e) {
LOG.info("Failed to use " + provider.getClass().getName() + " due to error: ", e);
}
}
}
if (null == clientProtocolProvider || null == client) {
throw new IOException( "Cannot initialize Cluster. Please check your configuration for " + MRConfig.FRAMEWORK_NAME + " and the correspond server addresses.");
}
}
The main purpose of this method is to determine whether you are connecting to YARN or running locally (driven by the mapreduce.framework.name setting, i.e. the MRConfig.FRAMEWORK_NAME referenced in the exception message above); a different ClientProtocol implementation is created for each environment.
-----------------------------------------------------------------connect() method end-----------------------------------------------------------------
Keep stepping until you return to the submit() method (in the Job class).
Set a breakpoint on the line
return submitter.submitJobInternal(Job.this, cluster);
and step into the submitJobInternal(Job.this, cluster) method (stepping in may bounce you back out the first time; just step in again).
---------------------------------------------submitJobInternal(Job.this, cluster) method start-----------------------------------------------------------------
/**
 * Internal method for submitting jobs to the system.
 *
 * The job submission process involves:
 *   1. Checking the input and output specifications of the job.
 *   2. Computing the {@link InputSplit}s for the job.
 *   3. Setup the requisite accounting information for the
 *      {@link DistributedCache} of the job, if necessary.
 *   4. Copying the job's jar and configuration to the map-reduce system
 *      directory on the distributed file-system.
 *   5. Submitting the job to the JobTracker and optionally
 *      monitoring it's status.
 *
 * @param job the configuration to submit
 * @param cluster the handle to the Cluster
 * @throws ClassNotFoundException
 * @throws InterruptedException
 * @throws IOException
 */
JobStatus submitJobInternal(Job job, Cluster cluster)
throws ClassNotFoundException, InterruptedException, IOException {
//validate the jobs output specs // the output path is validated very early, before anything else happens
checkSpecs(job);// step into
Configuration conf = job.getConfiguration(); // the configuration (the eight .xml config files)
addMRFrameworkToDistributedCache(conf);// distributed-cache handling
Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);// every submission creates a temporary staging path whose data is deleted once the submission finishes; on this machine you can find it by searching for tmp
//configure the command line options correctly on the submitting dfs
InetAddress ip = InetAddress.getLocalHost();
if (ip != null) {
submitHostAddress = ip.getHostAddress();// the submitting host's IP address
submitHostName = ip.getHostName();
conf.set(MRJobConfig.JOB_SUBMITHOST,submitHostName);
conf.set(MRJobConfig.JOB_SUBMITHOSTADDR,submitHostAddress);
}
JobID jobId = submitClient.getNewJobID();// the jobId; every job is assigned an id when it is submitted
job.setJobID(jobId);
Path submitJobDir = new Path(jobStagingArea, jobId.toString());// the submit directory for this job
JobStatus status = null;
try {
conf.set(MRJobConfig.USER_NAME,
UserGroupInformation.getCurrentUser().getShortUserName());
conf.set("hadoop.http.filter.initializers",
"org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer");
conf.set(MRJobConfig.MAPREDUCE_JOB_DIR, submitJobDir.toString());
LOG.debug("Configuring job " + jobId + " with " + submitJobDir
+ " as the submit dir");
// get delegation token for the dir
TokenCache.obtainTokensForNamenodes(job.getCredentials(),
new Path[] { submitJobDir }, conf);
populateTokenCache(conf, job.getCredentials());
// generate a secret to authenticate shuffle transfers
if (TokenCache.getShuffleSecretKey(job.getCredentials()) == null) {
KeyGenerator keyGen;
try {
keyGen = KeyGenerator.getInstance(SHUFFLE_KEYGEN_ALGORITHM);
keyGen.init(SHUFFLE_KEY_LENGTH);
} catch (NoSuchAlgorithmException e) {
throw new IOException("Error generating shuffle secret key", e);
}
SecretKey shuffleKey = keyGen.generateKey();
TokenCache.setShuffleSecretKey(shuffleKey.getEncoded(),
job.getCredentials());
}
if (CryptoUtils.isEncryptedSpillEnabled(conf)) {
conf.setInt(MRJobConfig.MR_AM_MAX_ATTEMPTS, 1);
LOG.warn("Max job attempts set to 1 since encrypted intermediate" +
"data spill is enabled");
}
copyAndConfigureFiles(job, submitJobDir);// uploads the job's files; step into (set the breakpoint in advance)
Path submitJobFile = JobSubmissionFiles.getJobConfPath(submitJobDir);
// Create the splits for the job
LOG.debug("Creating splits at " + jtFs.makeQualified(submitJobDir));
int maps = writeSplits(job, submitJobDir);// computes all the split information and writes it to the submitJobDir path; returns the number of splits
conf.setInt(MRJobConfig.NUM_MAPS, maps);
LOG.info("number of splits:" + maps);
// write "queue admins of the queue to which job is being submitted"
// to job file.
String queue = conf.get(MRJobConfig.QUEUE_NAME,
JobConf.DEFAULT_QUEUE_NAME);
AccessControlList acl = submitClient.getQueueAdmins(queue);
conf.set(toFullPropertyName(queue,
QueueACL.ADMINISTER_JOBS.getAclName()), acl.getAclString());
// removing jobtoken referrals before copying the jobconf to HDFS
// as the tasks don't need this setting, actually they may break
// because of it if present as the referral will point to a
// different job.
TokenCache.cleanUpTokenReferral(conf);
if (conf.getBoolean(
MRJobConfig.JOB_TOKEN_TRACKING_IDS_ENABLED,
MRJobConfig.DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED)) {
// Add HDFS tracking ids
ArrayList<String> trackingIds = new ArrayList<String>();
for (Token<? extends TokenIdentifier> t :
job.getCredentials().getAllTokens()) {
trackingIds.add(t.decodeIdentifier().getTrackingId());
}
conf.setStrings(MRJobConfig.JOB_TOKEN_TRACKING_IDS,
trackingIds.toArray(new String[trackingIds.size()]));
}
// Set reservation info if it exists
ReservationId reservationId = job.getReservationId();
if (reservationId != null) {
conf.set(MRJobConfig.RESERVATION_ID, reservationId.toString());
}
// Write job file to submit dir
writeConf(conf, submitJobFile);// writes the job configuration to the submit directory; step into
//
// Now, actually submit the job (using the submit name)
//
printTokens(jobId, job.getCredentials());
status = submitClient.submitJob(
jobId, submitJobDir.toString(), job.getCredentials());
if (status != null) {
return status;
} else {
throw new IOException("Could not launch job");
}
} finally {
if (status == null) {
LOG.info("Cleaning up the staging area " + submitJobDir);
if (jtFs != null && submitJobDir != null)
jtFs.delete(submitJobDir, true);
}
}
}
Step into the checkSpecs(job) method.
// validate the output path
private void checkSpecs(Job job) throws ClassNotFoundException,InterruptedException, IOException { // step into
JobConf jConf = (JobConf)job.getConfiguration();
// Check the output specification
if (jConf.getNumReduceTasks() == 0 ?
jConf.getUseNewMapper() : jConf.getUseNewReducer()) {
org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
ReflectionUtils.newInstance(job.getOutputFormatClass(),job.getConfiguration());
output.checkOutputSpecs(job);// step into; ensures the output path is set and does not already exist, otherwise an exception is thrown (set the breakpoint in advance)
} else {
jConf.getOutputFormat().checkOutputSpecs(jtFs, jConf);
}
}
Set a breakpoint at output.checkOutputSpecs(job) ahead of time so you do not step past it.
public void checkOutputSpecs(JobContext job
) throws FileAlreadyExistsException, IOException{
// Ensure that the output directory is set and not already there
Path outDir = getOutputPath(job);
if (outDir == null) {
throw new InvalidJobConfException("Output directory not set.");
}
// get delegation token for outDir's file system
TokenCache.obtainTokensForNamenodes(job.getCredentials(),
new Path[] { outDir }, job.getConfiguration());
if (outDir.getFileSystem(job.getConfiguration()).exists(outDir)) {
throw new FileAlreadyExistsException("Output directory " + outDir +
" already exists"); //路径存在会报异常
}
}
checkSpecs(job) verifies that the input and output specifications are set up correctly; inside it, checkOutputSpecs(job) checks that the output path has been set and does not already exist, otherwise an error is raised.
If the output path has not been set, or it already exists, an exception is thrown at this point, for example:
2019-03-07 16:08:39,711 INFO [org.apache.hadoop.conf.Configuration.deprecation] - session.id is deprecated. Instead, use dfs.metrics.session-id
2019-03-07 16:08:39,713 INFO [org.apache.hadoop.metrics.jvm.JvmMetrics] - Initializing JVM Metrics with processName=JobTracker, sessionId=
Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory file:/F:/Test/output already exists
at org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:146)
at org.apache.hadoop.mapreduce.JobSubmitter.checkSpecs(JobSubmitter.java:266)
at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:139)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1290)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1287)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1287)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1308)
at com.atguigu.mapreduce.wordcountDemo.WordcountDriver.main(WordcountDriver.java:52)
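To avoid this error on repeated test runs, you can remove the old output directory in the driver before submitting. A small sketch of my own using the FileSystem API (add org.apache.hadoop.fs.FileSystem to the driver's imports):
// My own addition; place in WordcountDriver.main before job.waitForCompletion(true)
FileSystem fs = FileSystem.get(configuration);  // `configuration` is the driver's Configuration object
Path output = new Path(args[1]);                // F:/Test/output in this example
if (fs.exists(output)) {
fs.delete(output, true);                        // recursively delete the stale output directory
}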
Keep stepping until execution returns to the JobStatus submitJobInternal(Job job, Cluster cluster) method in the JobSubmitter class (listed above).
Set the breakpoint on the copyAndConfigureFiles(job, submitJobDir) line ahead of time (and slow down once you get close: the debugger can lag and it is very easy to step past it, don't ask me how I know); we will step into it in a moment.
A closer look at submitJobInternal(Job job, Cluster cluster):
Since I am running locally, the ip obtained here is just my local machine's IP address.
Step into the copyAndConfigureFiles(job, submitJobDir) method.
/**
JobSubmitter.class
* configure the jobconf of the user with the command line options of
* -libjars, -files, -archives.
* @param job
* @throws IOException
*/
private void copyAndConfigureFiles(Job job, Path jobSubmitDir)
throws IOException {
JobResourceUploader rUploader = new JobResourceUploader(jtFs);
rUploader.uploadFiles(job, jobSubmitDir); // step into (set the breakpoint in advance)
// Get the working directory. If not set, sets it to filesystem working dir
// This code has been added so that working directory reset before running
// the job. This is necessary for backward compatibility as other systems
// might use the public API JobConf#setWorkingDirectory to reset the working
// directory.
job.getWorkingDirectory();
}
You also need to set a breakpoint in advance at rUploader.uploadFiles(job, jobSubmitDir).
/**
JobResourceUploader
* Upload and configure files, libjars, jobjars, and archives pertaining to
* the passed job.
*
* @param job the job containing the files to be uploaded
* @param submitJobDir the submission directory of the job
* @throws IOException
*/
public void uploadFiles(Job job, Path submitJobDir) throws IOException {
Configuration conf = job.getConfiguration();
short replication =
(short) conf.getInt(Job.SUBMIT_REPLICATION,
Job.DEFAULT_SUBMIT_REPLICATION);
if (!(conf.getBoolean(Job.USED_GENERIC_PARSER, false))) {
LOG.warn("Hadoop command-line option parsing not performed. "
+ "Implement the Tool interface and execute your application "
+ "with ToolRunner to remedy this.");
}
// get all the command line arguments passed in by the user conf
String files = conf.get("tmpfiles");
String libjars = conf.get("tmpjars");
String archives = conf.get("tmparchives");
String jobJar = job.getJar();
//
// Figure out what fs the JobTracker is using. Copy the
// job to it, under a temporary name. This allows DFS to work,
// and under the local fs also provides UNIX-like object loading
// semantics. (that is, if the job file is deleted right after
// submission, we can still run the submission to completion)
//
// Create a number of filenames in the JobTracker's fs namespace
LOG.debug("default FileSystem: " + jtFs.getUri());
if (jtFs.exists(submitJobDir)) {
throw new IOException("Not submitting job. Job directory " + submitJobDir
+ " already exists!! This is unexpected.Please check what's there in"
+ " that directory");
}
submitJobDir = jtFs.makeQualified(submitJobDir);
submitJobDir = new Path(submitJobDir.toUri().getPath());
FsPermission mapredSysPerms =
new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
FileSystem.mkdirs(jtFs, submitJobDir, mapredSysPerms);
Path filesDir = JobSubmissionFiles.getJobDistCacheFiles(submitJobDir);
Path archivesDir = JobSubmissionFiles.getJobDistCacheArchives(submitJobDir);
Path libjarsDir = JobSubmissionFiles.getJobDistCacheLibjars(submitJobDir);
// add all the command line files/ jars and archive
// first copy them to jobtrackers filesystem
if (files != null) {
FileSystem.mkdirs(jtFs, filesDir, mapredSysPerms);
String[] fileArr = files.split(",");
for (String tmpFile : fileArr) {
URI tmpURI = null;
try {
tmpURI = new URI(tmpFile);
} catch (URISyntaxException e) {
throw new IllegalArgumentException(e);
}
Path tmp = new Path(tmpURI);
Path newPath = copyRemoteFiles(filesDir, tmp, conf, replication);
try {
URI pathURI = getPathURI(newPath, tmpURI.getFragment());
DistributedCache.addCacheFile(pathURI, conf);
} catch (URISyntaxException ue) {
// should not throw a uri exception
throw new IOException("Failed to create uri for " + tmpFile, ue);
}
}
}
if (libjars != null) {
FileSystem.mkdirs(jtFs, libjarsDir, mapredSysPerms);
String[] libjarsArr = libjars.split(",");
for (String tmpjars : libjarsArr) {
Path tmp = new Path(tmpjars);
Path newPath = copyRemoteFiles(libjarsDir, tmp, conf, replication);
DistributedCache.addFileToClassPath(
new Path(newPath.toUri().getPath()), conf, jtFs);
}
}
if (archives != null) {
FileSystem.mkdirs(jtFs, archivesDir, mapredSysPerms);
String[] archivesArr = archives.split(",");
for (String tmpArchives : archivesArr) {
URI tmpURI;
try {
tmpURI = new URI(tmpArchives);
} catch (URISyntaxException e) {
throw new IllegalArgumentException(e);
}
Path tmp = new Path(tmpURI);
Path newPath = copyRemoteFiles(archivesDir, tmp, conf, replication);
try {
URI pathURI = getPathURI(newPath, tmpURI.getFragment());
DistributedCache.addCacheArchive(pathURI, conf);
} catch (URISyntaxException ue) {
// should not throw an uri excpetion
throw new IOException("Failed to create uri for " + tmpArchives, ue);
}
}
}
if (jobJar != null) { // copy jar to JobTracker's fs
// use jar name if job is not named.
if ("".equals(job.getJobName())) {
job.setJobName(new Path(jobJar).getName());
}
Path jobJarPath = new Path(jobJar);
URI jobJarURI = jobJarPath.toUri();
// If the job jar is already in a global fs,
// we don't need to copy it from local fs
if (jobJarURI.getScheme() == null || jobJarURI.getScheme().equals("file")) {
copyJar(jobJarPath, JobSubmissionFiles.getJobJar(submitJobDir),
replication);
job.setJar(JobSubmissionFiles.getJobJar(submitJobDir).toString());
}
} else {
LOG.warn("No job jar file set. User classes may not be found. "
+ "See Job or Job#setJar(String).");
}
addLog4jToDistributedCache(job, submitJobDir);
// set the timestamps of the archives and files
// set the public/private visibility of the archives and files
ClientDistributedCacheManager.determineTimestampsAndCacheVisibilities(conf);
// get DelegationToken for cached file
ClientDistributedCacheManager.getDelegationTokens(conf,
job.getCredentials());
}
Initially the mapred folder does not exist (I had deleted it beforehand); executing FileSystem.mkdirs(jtFs, submitJobDir, mapredSysPerms); creates the corresponding directories.
PS F:\tmp\hadoop-yuanyu\mapred> tree
F:.
└─staging
└─yuanyu1686068182
└─.staging
└─job_local1686068182_0001
Go into the F:\tmp\hadoop-yuanyu\mapred\staging\yuanyu1686068182\.staging\job_local1686068182_0001 directory, then continue stepping.
Keep stepping until execution returns to the JobStatus submitJobInternal(Job job, Cluster cluster) method in the JobSubmitter class (listed above) and continues after copyAndConfigureFiles.
Step through the rest of the method line by line: writeSplits(job, submitJobDir) computes and writes the split information, writeConf(conf, submitJobFile) writes the job configuration to the submit directory, and submitClient.submitJob(...) finally hands the job off; then keep running to completion.
-------------------------------------------submitJobInternal(Job.this, cluster) method end-----------------------------------------------------------------
Output: the job completes and prints the counters shown earlier.
Summary:
waitForCompletion()
  submit();
    // 1. Establish the connection
    connect();
      // 1) Create the proxy used to submit the Job
      new Cluster(getConfiguration());
        // (1) Decide whether this is a local run or a remote YARN run
        initialize(jobTrackAddr, conf);
    // 2. Submit the job
    submitter.submitJobInternal(Job.this, cluster)
      // 1) Create the staging path used to submit data to the cluster
      Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
      // 2) Get the jobId and create the job path
      JobID jobId = submitClient.getNewJobID();
      // 3) Copy the jar package to the cluster
      copyAndConfigureFiles(job, submitJobDir);
        rUploader.uploadFiles(job, jobSubmitDir);
      // 4) Compute the splits and generate the split plan files
      writeSplits(job, submitJobDir);
        maps = writeNewSplits(job, jobSubmitDir);
          input.getSplits(job);
      // 5) Write the XML configuration file to the staging path
      writeConf(conf, submitJobFile);
        conf.writeXml(out);
      // 6) Submit the job and return the submission status
      status = submitClient.submitJob(jobId, submitJobDir.toString(), job.getCredentials());
Job submission flow and split computation: key points
After the MR job is written, how does it get submitted to the cluster, and what does that process actually do?
1. Data block: a Block is how HDFS physically splits the data into chunks.
2. Data split: an input split only divides the input logically; the data is not physically cut into pieces on disk.
3. The map-phase parallelism of a job is determined by the number of splits the client computes when submitting the job (one MapTask per split).
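Because the split count decides the MapTask count, it can be influenced from the driver. A sketch of my own showing the usual knobs (the 4 MB value mirrors the commented-out CombineTextInputFormat lines in the driver above):
// Option A: keep the default TextInputFormat but bound the split size.
// splitSize = max(minSize, min(maxSize, blockSize)), so raising minSize or lowering maxSize changes the split count.
FileInputFormat.setMinInputSplitSize(job, 1);
FileInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024L);  // 4 MB upper bound
// Option B: pack many small files into fewer splits.
job.setInputFormatClass(CombineTextInputFormat.class);
CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);    // 4 MB, as in the driver's commented-out code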
Recall the driver code where the debugging session starts:
// 1. Get the configuration information and set up the job
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);// get a Job instance
job.waitForCompletion(true);// submit (set a breakpoint here)
The writeConf method (step 5 in the summary above) serializes all of the job's configuration properties, the complete Hadoop configuration for this job, into a single XML file in the submit directory:
private void writeConf(Configuration conf, Path jobFile)
throws IOException {
// Write job file to JobTracker's fs
FSDataOutputStream out =
FileSystem.create(jtFs, jobFile,
new FsPermission(JobSubmissionFiles.JOB_FILE_PERMISSION));
try {
conf.writeXml(out);
} finally {
out.close();
}
}
Reference: https://blog.csdn.net/chengyuqiang/article/details/78640294