作业提交之YARN
作业提交过程之HDFS & MapReduce
作业提交全过程详解(1)作业提交
作业提交全过程详解(2)作业初始化
作业提交全过程详解(3)任务分配
作业提交全过程详解(4)任务运行
作业提交全过程详解(5)进度和状态更新
作业提交全过程详解(6)作业完成
<property>
<description>The class to use as the resource scheduler.</description>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
</property>
# 执行WordCount案例,并在 http://hadoop103:8088 页面查看任务
myhadoop.sh start
hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar wordcount /input /output
# yarn application查看任务
# 列出所有Application
yarn application -list
# 根据Application状态过滤:yarn application -list -appStates <States>(所有状态:ALL、NEW、NEW_SAVING、SUBMITTED、ACCEPTED、RUNNING、FINISHED、FAILED、KILLED)
yarn application -list -appStates FINISHED
# Kill掉Application
yarn application -kill application_1612577921195_0001
# yarn logs查看日志
# 查询Application日志:yarn logs -applicationId <ApplicationId>
yarn logs -applicationId application_1612577921195_0001
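# 查询Container日志:yarn logs -applicationId <ApplicationId> -containerId <ContainerId>
yarn logs -applicationId application_1612577921195_0001 -containerId container_1612577921195_0001_01_000001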
# yarn applicationattempt查看尝试运行的任务
# 列出所有Application尝试的列表:yarn applicationattempt -list <ApplicationId>
yarn applicationattempt -list application_1612577921195_0001
# 打印ApplicationAttempt状态:yarn applicationattempt -status <ApplicationAttemptId>
yarn applicationattempt -status appattempt_1612577921195_0001_000001
# yarn container查看容器
# 列出所有Container:yarn container -list <ApplicationAttemptId>
yarn container -list appattempt_1612577921195_0001_000001
# 打印Container状态:yarn container -status <ContainerId>
yarn container -status container_1612577921195_0001_01_000001
# yarn node查看节点状态
# 列出所有节点:yarn node -list -all
yarn node -list -all
# yarn rmadmin更新配置
# 加载队列配置:yarn rmadmin -refreshQueues
yarn rmadmin -refreshQueues
# yarn queue查看队列
# 打印队列信息:yarn queue -status <QueueName>
yarn queue -status default
# 需求分析:
# 1G / 128m = 8个MapTask;1个ReduceTask;1个mrAppMaster
# 平均每个节点运行10个 / 3台 ≈ 3个任务(4 3 3)
<property>
<description>The class to use as the resource scheduler.</description>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
</property>
<property>
<description>Number of threads to handle scheduler interface.</description>
<name>yarn.resourcemanager.scheduler.client.thread-count</name>
<value>8</value>
</property>
<property>
<description>Enable auto-detection of node capabilities such as
memory and CPU.
</description>
<name>yarn.nodemanager.resource.detect-hardware-capabilities</name>
<value>false</value>
</property>
<property>
<description>Flag to determine if logical processors(such as
hyperthreads) should be counted as cores. Only applicable on Linux
when yarn.nodemanager.resource.cpu-vcores is set to -1 and
yarn.nodemanager.resource.detect-hardware-capabilities is true.
</description>
<name>yarn.nodemanager.resource.count-logical-processors-as-cores</name>
<value>false</value>
</property>
<property>
<description>Multiplier to determine how to convert physical cores to
vcores. This value is used if yarn.nodemanager.resource.cpu-vcores
is set to -1(which implies auto-calculate vcores) and
yarn.nodemanager.resource.detect-hardware-capabilities is set to true. The number of vcores will be calculated as number of CPUs * multiplier.
</description>
<name>yarn.nodemanager.resource.pcores-vcores-multiplier</name>
<value>1.0</value>
</property>
<property>
<description>Amount of physical memory, in MB, that can be allocated
for containers. If set to -1 and
yarn.nodemanager.resource.detect-hardware-capabilities is true, it is
automatically calculated(in case of Windows and Linux).
In other cases, the default is 8192MB.
</description>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>4096</value>
</property>
<property>
<description>Number of vcores that can be allocated
for containers. This is used by the RM scheduler when allocating
resources for containers. This is not used to limit the number of
CPUs used by YARN containers. If it is set to -1 and
yarn.nodemanager.resource.detect-hardware-capabilities is true, it is
automatically determined from the hardware in case of Windows and Linux.
In other cases, number of vcores is 8 by default.</description>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>4</value>
</property>
<property>
<description>The minimum allocation for every container request at the RM in MBs. Memory requests lower than this will be set to the value of this property. Additionally, a node manager that is configured to have less memory than this value will be shut down by the resource manager.
</description>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
<property>
<description>The maximum allocation for every container request at the RM in MBs. Memory requests higher than this will throw an InvalidResourceRequestException.
</description>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>2048</value>
</property>
<property>
<description>The minimum allocation for every container request at the RM in terms of virtual CPU cores. Requests lower than this will be set to the value of this property. Additionally, a node manager that is configured to have fewer virtual cores than this value will be shut down by the resource manager.
</description>
<name>yarn.scheduler.minimum-allocation-vcores</name>
<value>1</value>
</property>
<property>
<description>The maximum allocation for every container request at the RM in terms of virtual CPU cores. Requests higher than this will throw an
InvalidResourceRequestException.</description>
<name>yarn.scheduler.maximum-allocation-vcores</name>
<value>2</value>
</property>
<property>
<description>Whether virtual memory limits will be enforced for
containers.</description>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<property>
<description>Ratio between virtual memory to physical memory when setting memory limits for containers. Container allocations are expressed in terms of physical memory, and virtual memory usage is allowed to exceed this allocation by this ratio.
</description>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>2.1</value>
</property>
# 重启集群
sbin/stop-yarn.sh
sbin/start-yarn.sh
# 执行WordCount程序
hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar wordcount /input /output
# 观察Yarn任务执行页面
http://hadoop103:8088/cluster/apps
<property>
<name>yarn.scheduler.capacity.root.queues</name>
<value>default,hive</value>
<description>
The queues at this level (root is the root queue).
</description>
</property>
<property>
<name>yarn.scheduler.capacity.root.default.capacity</name>
<value>40</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
<value>60</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.hive.capacity</name>
<value>60</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.hive.user-limit-factor</name>
<value>1</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.hive.maximum-capacity</name>
<value>80</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.hive.state</name>
<value>RUNNING</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.hive.acl_submit_applications</name>
<value>*</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.hive.acl_administer_queue</name>
<value>*</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.hive.acl_application_max_priority</name>
<value>*</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.hive.maximum-application-lifetime</name>
<value>-1</value>
</property>
<property>
<name>yarn.scheduler.capacity.root.hive.default-application-lifetime</name>
<value>-1</value>
</property>
yarn rmadmin -refreshQueues # 这里只修改了队列参数,可以不重启yarn
# 执行队列提交job 注: -D表示运行时改变参数值
hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar wordcount -D mapreduce.job.queuename=hive /input /output
/**
 * Driver snippet showing how to pick the target scheduler queue from code:
 * the queue name is written into the Configuration that the Job is created from.
 * The job-setup steps between 1 and 6 are elided in these notes.
 */
public class WcDrvier {
/**
 * Sets mapreduce.job.queuename to "hive", creates a Job from that configuration,
 * waits for the job to complete and exits with 0 on success or 1 on failure.
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
// Route the job to the "hive" queue; set on conf before the Job is created from it.
conf.set("mapreduce.job.queuename","hive");
// 1. Obtain a Job instance
Job job = Job.getInstance(conf);
。。。 。。。
// 6. Submit the Job and block until it finishes (true = print progress)
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}
<property>
<name>yarn.cluster.max-application-priority</name>
<value>5</value>
</property>
xsync yarn-site.xml
sbin/stop-yarn.sh
sbin/start-yarn.sh
hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar pi 5 2000000
hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar pi -D mapreduce.job.priority=5 5 2000000
# 修改正在执行的任务的优先级:yarn application -appID <ApplicationID> -updatePriority 优先级
yarn application -appID application_1611133087930_0009 -updatePriority 5
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
<description>配置使用公平调度器</description>
</property>
<property>
<name>yarn.scheduler.fair.allocation.file</name>
<value>/opt/module/hadoop-3.1.3/etc/hadoop/fair-scheduler.xml</value>
<description>指明公平调度器队列分配配置文件</description>
</property>
<property>
<name>yarn.scheduler.fair.preemption</name>
<value>false</value>
<description>禁止队列间资源抢占</description>
</property>
<allocations>
<queueMaxAMShareDefault>0.5</queueMaxAMShareDefault>
<queueMaxResourcesDefault>4096mb,4vcores</queueMaxResourcesDefault>
<queue name="test">
<minResources>2048mb,2vcores</minResources>
<maxResources>4096mb,4vcores</maxResources>
<maxRunningApps>4</maxRunningApps>
<maxAMShare>0.5</maxAMShare>
<weight>1.0</weight>
<schedulingPolicy>fair</schedulingPolicy>
</queue>
<queue name="atguigu" type="parent">
<minResources>2048mb,2vcores</minResources>
<maxResources>4096mb,4vcores</maxResources>
<maxRunningApps>4</maxRunningApps>
<maxAMShare>0.5</maxAMShare>
<weight>1.0</weight>
<schedulingPolicy>fair</schedulingPolicy>
</queue>
<queuePlacementPolicy>
<rule name="specified" create="false"/>
<rule name="nestedUserQueue" create="true">
<rule name="primaryGroup" create="false"/>
</rule>
<rule name="reject" />
</queuePlacementPolicy>
</allocations>
xsync yarn-site.xml
xsync fair-scheduler.xml
sbin/stop-yarn.sh
sbin/start-yarn.sh
# 测试提交任务
# 提交任务时指定队列,按照配置规则,任务会到指定的root.test队列
hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar pi -Dmapreduce.job.queuename=root.test 1 1
# 提交任务时不指定队列,按照配置规则,任务会到root.atguigu.atguigu队列
hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar pi 1 1
hadoop jar wc.jar com.atguigu.mapreduce.wordcount2.WordCountDriver /input /output1
# 期望可以动态传参,结果报错,误认为是第一个输入参数。
hadoop jar wc.jar com.atguigu.mapreduce.wordcount2.WordCountDriver -Dmapreduce.job.queuename=root.test /input /output1
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu.hadoop</groupId>
<artifactId>yarn_tool_test</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.3</version>
</dependency>
</dependencies>
</project>
新建com.atguigu.yarn包,创建类WordCount并实现Tool接口:
package com.atguigu.yarn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import java.io.IOException;
/**
 * Word-count MapReduce job packaged as a Hadoop {@link Tool}.
 *
 * <p>The job is built from the {@link Configuration} handed in through
 * {@link #setConf(Configuration)}, so settings supplied by the caller
 * (for example a queue name) are carried into the submitted job.
 */
public class WordCount implements Tool {

    /** Configuration injected through {@link #setConf(Configuration)}; used to create the Job. */
    private Configuration conf;

    /**
     * Configures the word-count job and blocks until it completes.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if the two paths were not supplied
     * @throws Exception if creating or running the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Fail with a readable usage message instead of an ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 2) {
            System.err.println("Usage: wordcount [generic options] <input path> <output path>");
            return 2;
        }

        Job job = Job.getInstance(conf);

        // The mapper and reducer are nested in this class, so locate the job jar through it.
        job.setJarByClass(WordCount.class);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Stores the configuration that {@link #run(String[])} will build the job from. */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /** Returns the configuration previously stored by {@link #setConf(Configuration)}. */
    @Override
    public Configuration getConf() {
        return conf;
    }

    /** Emits {@code (word, 1)} for every space-separated, non-empty token of each input line. */
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Reused across calls to avoid allocating a new key/value per token.
        private Text outK = new Text();
        private IntWritable outV = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            for (String word : line.split(" ")) {
                // Consecutive or leading spaces yield empty tokens; they are not words.
                if (word.isEmpty()) {
                    continue;
                }
                outK.set(word);
                context.write(outK, outV);
            }
        }
    }

    /** Sums the counts received for a word and emits {@code (word, total)}. */
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused across calls to avoid allocating a new value per key.
        private IntWritable outV = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outV.set(sum);
            context.write(key, outV);
        }
    }
}
package com.atguigu.yarn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.util.Arrays;
/**
 * Command-line entry point that selects a {@link Tool} by name and runs it through
 * {@link ToolRunner}.
 *
 * <p>The first argument names the tool; the remaining arguments are passed on to
 * {@code ToolRunner.run} together with a fresh {@link Configuration}.
 */
public class WordCountDriver {

    /**
     * Resolves the tool named by {@code args[0]}, runs it with the remaining arguments
     * and exits the JVM with the tool's return code.
     *
     * @param args {@code args[0]} is the tool name (only {@code "wordcount"} is known);
     *             everything after it is forwarded to the tool
     * @throws IllegalArgumentException if no tool name is given
     * @throws RuntimeException if the tool name is not recognised
     * @throws Exception if the tool itself fails
     */
    public static void main(String[] args) throws Exception {
        // Without this guard an empty command line dies with ArrayIndexOutOfBoundsException.
        if (args.length == 0) {
            throw new IllegalArgumentException(
                    "Usage: WordCountDriver <tool> [generic options] <tool args...>");
        }

        // 1. Create the configuration
        Configuration conf = new Configuration();

        // 2. Look up the Tool implementation for the requested name
        final Tool tool;
        switch (args[0]) {
            case "wordcount":
                tool = new WordCount();
                break;
            default:
                throw new RuntimeException(" No such tool: "+ args[0] );
        }

        // 3. Run the tool; drop args[0] (the tool name) and forward the rest
        int run = ToolRunner.run(conf, tool, Arrays.copyOfRange(args, 1, args.length));
        System.exit(run);
    }
}
yarn jar YarnDemo.jar com.atguigu.yarn.WordCountDriver wordcount /input /output
# 注意此时提交的3个参数,第一个用于生成特定的Tool,第二个和第三个为输入输出目录。此时如果我们希望加入设置参数,可以在wordcount后面添加参数,例如:
yarn jar YarnDemo.jar com.atguigu.yarn.WordCountDriver wordcount -Dmapreduce.job.queuename=root.test /input /output1