Planning
Host | IP | Roles
---|---|---
172.16.10.67 | 172.16.10.67 | NameNode, SecondaryNameNode, ResourceManager, DataNode, NodeManager
172.16.10.68 | 172.16.10.68 | DataNode, NodeManager
172.16.10.69 | 172.16.10.69 | DataNode, NodeManager
1. Configure hosts (all three machines). I did not do this on my cluster; if you want hostnames, configure them as follows:
# Set the hostname (use node02 / node03 on the other two machines)
hostnamectl set-hostname node01
# hosts mapping (the same on all three machines)
cat /etc/hosts
172.16.10.67 node01
172.16.10.68 node02
172.16.10.69 node03
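If you do set up hostnames, the same mappings must exist on all three machines. A minimal sketch (run on node01, then copy the file to the other two; passwords will be prompted if SSH keys are not yet configured):
cat >> /etc/hosts <<'EOF'
172.16.10.67 node01
172.16.10.68 node02
172.16.10.69 node03
EOF
scp /etc/hosts root@172.16.10.68:/etc/hosts
scp /etc/hosts root@172.16.10.69:/etc/hosts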
2. Disable the firewall (all three machines)
# Check the firewall status
firewall-cmd --state
# Stop the firewalld service
systemctl stop firewalld.service
# Prevent firewalld from starting at boot
systemctl disable firewalld.service
3. Passwordless SSH login
Note: only node01 needs passwordless login to node01, node02 and node03 (use the IP addresses instead if you did not configure hostnames).
# Generate a key pair on node01 (press Enter at every prompt)
ssh-keygen
# Copy node01's public key to node01, node02 and node03
ssh-copy-id node01
ssh-copy-id node02
ssh-copy-id node03
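A quick sanity check (assuming the hostnames above resolve; substitute IPs otherwise): each command should print the remote hostname without asking for a password.
ssh node01 hostname
ssh node02 hostname
ssh node03 hostname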
4. Cluster time synchronization (all three machines)
yum install ntpdate
# One-off sync against a network time server
ntpdate ntp5.aliyun.com
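ntpdate only performs a one-off sync; to keep the clocks aligned over time you can, for example, run it periodically from root's crontab (an illustrative entry, not part of the original setup):
# crontab -e
*/30 * * * * /usr/sbin/ntpdate ntp5.aliyun.com >/dev/null 2>&1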
5. Create a unified working directory (all three machines)
mkdir -p /root/hadoop/server
6. Install Hadoop (all nodes)
[root@172 server]# wget --no-check-certificate -c https://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-3.3.4/hadoop-3.3.4.tar.gz
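After the download finishes, extract the archive into the working directory and point the HADOOP_HOME path used below at it (a minimal sketch; the symlink is my assumption, renaming the directory works just as well):
tar -zxvf hadoop-3.3.4.tar.gz -C /root/hadoop/server/
ln -s /root/hadoop/server/hadoop-3.3.4 /root/hadoop/server/hadoop
Then append the following environment variables to /etc/profile: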
JAVA_HOME=/usr/java/jdk
PATH=$JAVA_HOME/bin:$PATH
CLASSPATH=.:$JAVA_HOME/jre/lib/ext:$JAVA_HOME/lib/tools.jar
export PATH JAVA_HOME CLASSPATH
export HADOOP_HOME=/root/hadoop/server/hadoop
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
# Reload the profile so the changes take effect
source /etc/profile
# Verify the installation
[root@172 server]# hadoop version
Hadoop 3.3.4
Source code repository https://github.com/apache/hadoop.git -r a585a73c3e02ac62350c136643a5e7f6095a3dbb
Compiled by stevel on 2022-07-29T12:32Z
Compiled with protoc 3.7.1
From source with checksum fb9dd8918a7b8a5b430d61af858f6ec
This command was run using /root/hadoop/server/hadoop/share/hadoop/common/hadoop-common-3.3.4.jar
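Rather than repeating the download on every node, the extracted directory and the profile changes can be pushed out from the first machine. A sketch, assuming the same directory layout on 172.16.10.68 and 172.16.10.69:
scp -r /root/hadoop/server/hadoop-3.3.4 root@172.16.10.68:/root/hadoop/server/
scp -r /root/hadoop/server/hadoop-3.3.4 root@172.16.10.69:/root/hadoop/server/
scp /etc/profile root@172.16.10.68:/etc/profile
scp /etc/profile root@172.16.10.69:/etc/profile
# Recreate the hadoop symlink and run `source /etc/profile` on each node afterwards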
7. Modify the configuration files (all located in $HADOOP_HOME/etc/hadoop)
File to edit | Default file location (inside the Hadoop jars) | Purpose
---|---|---
core-site.xml | hadoop-common-*.jar / core-default.xml | Hadoop core (common) settings
hdfs-site.xml | hadoop-hdfs-*.jar / hdfs-default.xml | HDFS settings
yarn-site.xml | hadoop-yarn-common-*.jar / yarn-default.xml | YARN settings
mapred-site.xml | hadoop-mapreduce-client-core-*.jar / mapred-default.xml | MapReduce settings
core-site.xml:
[root@172 hadoop]# cat core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://172.16.10.67:8020</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/root/hadoop/server/hadoop</value>
  </property>
  <property>
    <name>hadoop.http.staticuser.user</name>
    <value>root</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>fs.trash.interval</name>
    <value>1440</value>
  </property>
</configuration>
hdfs-site.xml:
[root@172 hadoop]# cat hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>172.16.10.67:9868</value>
  </property>
</configuration>
yarn-site.xml:
[root@172 hadoop]# cat yarn-site.xml
<configuration>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>172.16.10.67</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.log.server.url</name>
    <value>http://172.16.10.67:19888/jobhistory/logs</value>
  </property>
  <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
  </property>
  <property>
    <name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name>
    <value>95.0</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-pmem-ratio</name>
    <value>2.1</value>
  </property>
</configuration>
mapred-site.xml:
[root@172 hadoop]# cat mapred-site.xml
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>172.16.10.67:10020</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>172.16.10.67:19888</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.env</name>
    <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
  </property>
  <property>
    <name>mapreduce.map.env</name>
    <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
  </property>
  <property>
    <name>mapreduce.reduce.env</name>
    <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
  </property>
  <property>
    <name>mapreduce.application.classpath</name>
    <value>$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/mapreduce/lib-examples/*</value>
  </property>
</configuration>
workers:
[root@172 hadoop]# cat workers
172.16.10.67
172.16.10.68
172.16.10.69
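The four site files and workers should be identical on every node. After editing them on 172.16.10.67 they can be copied out, for example (run from the etc/hadoop directory shown in the prompts above):
scp core-site.xml hdfs-site.xml yarn-site.xml mapred-site.xml workers root@172.16.10.68:/root/hadoop/server/hadoop/etc/hadoop/
scp core-site.xml hdfs-site.xml yarn-site.xml mapred-site.xml workers root@172.16.10.69:/root/hadoop/server/hadoop/etc/hadoop/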
8. Start the cluster
1. Error: the MapReduce ApplicationMaster and task containers fail to start because HADOOP_MAPRED_HOME is not set in their environment.
Solution: add the following to mapred-site.xml:
<property>
  <name>yarn.app.mapreduce.am.env</name>
  <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
  <name>mapreduce.map.env</name>
  <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
  <name>mapreduce.reduce.env</name>
  <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
2. INFO conf.Configuration: resource-types.xml not found
INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
Solution: add the following to mapred-site.xml:
<property>
  <name>mapreduce.application.classpath</name>
  <value>$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*</value>
</property>
Note: formatting the NameNode generates a new cluster ID. If old DataNode data is left in place, the NameNode and DataNode cluster IDs no longer match and the cluster cannot see the previous data, so always delete the data directories and logs before reformatting the NameNode.
Note: the NameNode only needs to be formatted when the cluster is started for the first time (before formatting, stop all NameNode and DataNode processes from any previous start, then delete the data and log directories).
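A minimal cleanup sketch before reformatting (the paths follow the hadoop.tmp.dir and HADOOP_HOME set above; run the rm on every node):
stop-all.sh
rm -rf /root/hadoop/server/hadoop/dfs /root/hadoop/server/hadoop/logs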
[root@172 hadoop]# hdfs namenode -format
[root@172 hadoop]# start-all.sh    # start all daemons (HDFS and YARN)
[root@172 hadoop]# jcmd | grep hadoo
23942 org.apache.hadoop.hdfs.server.namenode.NameNode
24726 org.apache.hadoop.yarn.server.nodemanager.NodeManager
24087 org.apache.hadoop.hdfs.server.datanode.DataNode
24587 org.apache.hadoop.yarn.server.resourcemanager.ResourceManager
24335 org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode
[root@172 hadoop]# jps
23942 NameNode
24726 NodeManager
24087 DataNode
24587 ResourceManager
17643 Jps
24335 SecondaryNameNode
To start the NameNode individually (note: this cluster has only one NameNode):
[root@172 hadoop]# hdfs --daemon start namenode
Run on the other nodes:
Start the DataNode and the SecondaryNameNode:
[root@173 hadoop]# hdfs --daemon start datanode
[root@173 hadoop]# hdfs --daemon start secondarynamenode
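The job history server referenced by mapreduce.jobhistory.* and yarn.log.server.url is not started by start-all.sh; it has to be started separately on 172.16.10.67, after which its web UI is reachable at http://172.16.10.67:19888/:
[root@172 hadoop]# mapred --daemon start historyserver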
HDFS web UI: http://172.16.10.67:9870/
YARN web UI: http://172.16.10.67:8088/
9. File transfer test
HDFS web UI: http://172.16.10.67:9870/
Use it to check the NameNode status.
This is the NameNode web UI port (dfs.namenode.http-address, default 9870); it can be changed in hdfs-site.xml.
If the page loads on that port, the NameNode is alive.
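The same check can be done from the shell; the following reports the live DataNodes and their capacity:
[root@172 hadoop]# hdfs dfsadmin -report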
1. Upload a file to the cluster
Prepare a test text file and an HDFS directory.
Create the directory:
[root@172 hadoop]# hadoop fs -mkdir /jettech
[root@172 work]# cat input.txt
I love runoob
I like runoob
I love hadoop
I like hadoop
Upload:
[root@172 hadoop]# hadoop fs -put input.txt /jettech
List:
[root@172 work]# hadoop fs -ls /jettech
Found 5 items
-rw-r--r-- 3 root supergroup 15217 2022-09-02 16:59 /jettech/LICENSE.txt
-rw-r--r-- 3 root supergroup 56 2022-09-02 17:00 /jettech/input.txt
drwxr-xr-x - root supergroup 0 2022-09-02 17:05 /jettech/output
drwxr-xr-x - root supergroup 0 2022-09-02 17:07 /jettech/output1
drwxr-xr-x - root supergroup 0 2022-09-02 17:21 /jettech/output2
[root@172 work]# hadoop fs -cat /jettech/input.txt
I love runoob
I like runoob
I love hadoop
I like hadoop
Files you no longer need can be deleted with rm; the subcommands of hdfs dfs (or hadoop fs) are largely the same as the corresponding Linux commands.
Size is the current file size; Block Size is the maximum capacity of a single storage block, 128 MB by default.
Files can also be downloaded from the web UI. If an uploaded file is larger than the 128 MB block size, it is split into multiple blocks for storage; you can also find the corresponding blk_* files in the DataNode storage directory and concatenate them in order to reconstruct the original file.
[root@172 work]# ls /root/hadoop/server/hadoop/dfs/data/current/BP-512546310-172.16.10.67-1662109116057/current/finalized/subdir0/subdir0/
blk_1073741825 blk_1073741839_1015.meta blk_1073741851 blk_1073741862_1038.meta blk_1073741874 blk_1073741885_1061.meta
blk_1073741825_1001.meta blk_1073741840 blk_1073741851_1027.meta blk_1073741863 blk_1073741874_1050.meta blk_1073741886
blk_1073741826 blk_1073741840_1016.meta blk_1073741852 blk_1073741863_1039.meta blk_1073741875 blk_1073741886_1062.meta
blk_1073741826_1002.meta blk_1073741841 blk_1073741852_1028.meta blk_1073741870 blk_1073741875_1051.meta blk_1073741887
blk_1073741836 blk_1073741841_1017.meta blk_1073741859 blk_1073741870_1046.meta blk_1073741882 blk_1073741887_1063.meta
blk_1073741836_1012.meta blk_1073741848 blk_1073741859_1035.meta blk_1073741871 blk_1073741882_1058.meta blk_1073741888
blk_1073741837 blk_1073741848_1024.meta blk_1073741860 blk_1073741871_1047.meta blk_1073741883 blk_1073741888_1064.meta
blk_1073741837_1013.meta blk_1073741849 blk_1073741860_1036.meta blk_1073741872 blk_1073741883_1059.meta
blk_1073741838 blk_1073741849_1025.meta blk_1073741861 blk_1073741872_1048.meta blk_1073741884
blk_1073741838_1014.meta blk_1073741850 blk_1073741861_1037.meta blk_1073741873 blk_1073741884_1060.meta
blk_1073741839 blk_1073741850_1026.meta blk_1073741862 blk_1073741873_1049.meta blk_1073741885
Create tmp.file to hold the reassembled content and concatenate the blocks in order:
# cat blk_1073741836>>tmp.file
# cat blk_1073741837>>tmp.file
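If the blocks were concatenated in the right order, tmp.file is byte-identical to the original file. A quick check (the HDFS path below is a placeholder for whichever file these blocks belong to):
# hadoop fs -cat /jettech/<original-file> | md5sum
# md5sum tmp.file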
Example: in cluster mode the input file must first be uploaded to HDFS.
Word-count example:
[root@172 work]# yarn jar /root/hadoop/server/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.4.jar wordcount /jettech/input.txt /jettech/output4
2022-09-05 11:27:12,265 INFO client.DefaultNoHARMFailoverProxyProvider: Connecting to ResourceManager at /172.16.10.67:8032
2022-09-05 11:27:12,846 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/root/.staging/job_1662112035948_0001
2022-09-05 11:27:13,571 INFO input.FileInputFormat: Total input files to process : 1
2022-09-05 11:27:14,477 INFO mapreduce.JobSubmitter: number of splits:1
2022-09-05 11:27:14,635 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1662112035948_0001
2022-09-05 11:27:14,635 INFO mapreduce.JobSubmitter: Executing with tokens: []
2022-09-05 11:27:14,817 INFO conf.Configuration: resource-types.xml not found
2022-09-05 11:27:14,818 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2022-09-05 11:27:15,265 INFO impl.YarnClientImpl: Submitted application application_1662112035948_0001
2022-09-05 11:27:15,305 INFO mapreduce.Job: The url to track the job: http://172.16.10.67:8088/proxy/application_1662112035948_0001/
2022-09-05 11:27:15,306 INFO mapreduce.Job: Running job: job_1662112035948_0001
2022-09-05 11:27:23,514 INFO mapreduce.Job: Job job_1662112035948_0001 running in uber mode : false
2022-09-05 11:27:23,517 INFO mapreduce.Job: map 0% reduce 0%
2022-09-05 11:27:28,659 INFO mapreduce.Job: map 100% reduce 0%
2022-09-05 11:27:32,710 INFO mapreduce.Job: map 100% reduce 100%
2022-09-05 11:27:33,737 INFO mapreduce.Job: Job job_1662112035948_0001 completed successfully
2022-09-05 11:27:33,875 INFO mapreduce.Job: Counters: 54
File System Counters
FILE: Number of bytes read=62
FILE: Number of bytes written=553631
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=163
HDFS: Number of bytes written=36
HDFS: Number of read operations=8
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
HDFS: Number of bytes read erasure-coded=0
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Rack-local map tasks=1
Total time spent by all maps in occupied slots (ms)=2599
Total time spent by all reduces in occupied slots (ms)=2680
Total time spent by all map tasks (ms)=2599
Total time spent by all reduce tasks (ms)=2680
Total vcore-milliseconds taken by all map tasks=2599
Total vcore-milliseconds taken by all reduce tasks=2680
Total megabyte-milliseconds taken by all map tasks=5322752
Total megabyte-milliseconds taken by all reduce tasks=5488640
Map-Reduce Framework
Map input records=4
Map output records=12
Map output bytes=104
Map output materialized bytes=62
Input split bytes=107
Combine input records=12
Combine output records=5
Reduce input groups=5
Reduce shuffle bytes=62
Reduce input records=5
Reduce output records=5
Spilled Records=10
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=135
CPU time spent (ms)=1340
Physical memory (bytes) snapshot=576929792
Virtual memory (bytes) snapshot=5615321088
Total committed heap usage (bytes)=486014976
Peak Map Physical memory (bytes)=345632768
Peak Map Virtual memory (bytes)=2804011008
Peak Reduce Physical memory (bytes)=231297024
Peak Reduce Virtual memory (bytes)=2811310080
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=56
File Output Format Counters
Bytes Written=36
View the result:
[root@172 work]# hadoop fs -cat /jettech/output4/part-r-00000
I 4
hadoop 2
like 2
love 2
runoob 2
Now that we know how to use MapReduce, we can already handle statistics and retrieval tasks such as Word Count, but MapReduce can do far more than that.
MapReduce relies mainly on code written by developers: data processing is implemented by providing the Map- and Reduce-related methods.
To show this process in a simple way, we will hand-write a Word Count program.
Note: MapReduce programs depend on the Hadoop libraries, so real development work (including debugging) needs a machine with a full Hadoop development environment; here we only cover deploying an already written program.
[root@172 work]# mkdir -p com/runoob/hadoop/
[root@172 work]# ls com/runoob/hadoop/
Map.class MyWordCount.class MyWordCount.java Reduce.class
[root@172 work]# cat com/runoob/hadoop/MyWordCount.java
/**
 * Attribution:
 * This program is adapted from http://hadoop.apache.org/docs/r1.0.4/cn/mapred_tutorial.html
 */
package com.runoob.hadoop;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
/**
 * The Map implementation
 */
class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key,
Text value,
OutputCollector<Text, IntWritable> output,
Reporter reporter)
throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
/**
 * The Reduce implementation
 */
class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key,
Iterator<IntWritable> values,
OutputCollector<Text, IntWritable> output,
Reporter reporter)
throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
public class MyWordCount {
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(MyWordCount.class);
conf.setJobName("my_word_count");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
conf.setMapperClass(Map.class);
conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
// The first argument is the input path
FileInputFormat.setInputPaths(conf, new Path(args[0]));
// The second argument is the output path
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
JobClient.runJob(conf);
}
}
Compile (note: javac only honors the last -classpath option it sees, so the two jars must be joined into a single classpath):
[root@172 work]# javac -classpath /root/hadoop/server/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-client-core-3.3.4.jar:/root/hadoop/server/hadoop/share/hadoop/client/hadoop-client-api-3.3.4.jar com/runoob/hadoop/MyWordCount.java
Package:
[root@172 work]# jar -cf wubo-word-count.jar com
Run:
[root@172 work]# hadoop jar wubo-word-count.jar com.runoob.hadoop.MyWordCount /jettech/input.txt /jettech/output5
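Once the job finishes, the result can be read back from HDFS; with the old org.apache.hadoop.mapred API the reducer output is written as part-00000:
[root@172 work]# hadoop fs -cat /jettech/output5/part-00000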