# Download links for the Hadoop 2.9.2 and JDK 8u231 tarballs
https://mirror.bit.edu.cn/apache/hadoop/common/hadoop-2.9.2/hadoop-2.9.2.tar.gz
https://download.oracle.com/otn/java/jdk/8u231-b11/5b13a193868b4bf28bcb45c792fce896/jdk-8u231-linux-x64.tar.gz
# Pull the CentOS 7 image
docker pull centos:centos7.8.2003
# Create a base container
docker run -d --name basecentos --privileged=true centos:centos7.8.2003 /usr/sbin/init
# Enter the container
docker exec -it basecentos /bin/bash
# Switch the yum repo to a domestic mirror to speed up downloads
yum -y install wget
cd /etc/yum.repos.d/
mkdir repo_backup
mv *.repo repo_backup/
wget http://mirrors.163.com/.help/CentOS7-Base-163.repo
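After swapping in the 163 mirror, it usually helps to rebuild the yum metadata cache so the new repo actually takes effect; a minimal follow-up:
yum clean all
yum makecache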
# Install common packages
yum install -y rsync openssh-server vim* lrzsz gcc-c++ pcre pcre-devel zlib zlib-devel ruby openssl openssl-devel patch bash-completion zlib.i686 libstdc++.i686 lsof unzip zip initscripts openssh-clients net-tools.x86_64 telnet* firewalld
# Edit sshd_config to allow root password login
vim /etc/ssh/sshd_config
Port 22
PermitRootLogin yes
# Set the root password
passwd root
# Create directories for installation packages and installed software
mkdir -p /root/tools
mkdir -p /root/servers
# Upload the JDK and Hadoop tarballs (rz is provided by the lrzsz package installed above)
cd /root/tools
rz
# Extract the JDK
tar -zxvf jdk-8u231-linux-x64.tar.gz -C /root
# Configure environment variables (append the following to /etc/profile)
vim /etc/profile
export JAVA_HOME=/root/jdk1.8.0_231
export CLASSPATH=.:$JAVA_HOME/jre/lib/rt.jar:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=$PATH:$JAVA_HOME/bin
source /etc/profile
# Check the Java version
java -version
# On the host, commit the container as a custom base image (replace 0f539d7878ee with your own container ID)
docker commit -a "lx" -m "base centos" 0f539d7878ee basecentos:v1
# List the new image
docker images basecentos:v1
| Framework | hadoop1 | hadoop2 | hadoop3 |
| --- | --- | --- | --- |
| HDFS | NameNode, DataNode | DataNode | SecondaryNameNode, DataNode |
| YARN | NodeManager | NodeManager | ResourceManager, NodeManager |
# Create a dedicated subnet for the cluster
docker network create --subnet=172.18.0.0/16 hadoopgroup
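To double-check that the subnet came up with the intended address range, the network can be inspected (hadoopgroup is simply the name chosen above):
docker network inspect hadoopgroup | grep Subnet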
# Start the three containers (hadoop1 maps the HDFS and JobHistory web ports, hadoop3 maps the YARN web port)
docker run -d --privileged=true -it --name hadoop1 -h hadoop1 --net hadoopgroup --ip 172.18.0.2 -P -p 50070:50070 -p 19888:19888 basecentos:v1 /usr/sbin/init
docker run -d --privileged=true -it --name hadoop2 -h hadoop2 --net hadoopgroup --ip 172.18.0.3 -P basecentos:v1 /usr/sbin/init
docker run -d --privileged=true -it --name hadoop3 -h hadoop3 --net hadoopgroup --ip 172.18.0.4 -P -p 8088:8088 basecentos:v1 /usr/sbin/init
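A quick sanity check that all three containers are up; this just filters docker ps by the name prefix used above:
docker ps --filter "name=hadoop"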
# Enter hadoop1
docker exec -it hadoop1 /bin/bash
# 1. Set the DNS server
vim /etc/resolv.conf
nameserver 114.114.114.114
# 2. Configure ssh to add hosts to known_hosts automatically
vim /etc/ssh/ssh_config
StrictHostKeyChecking no
# 3. Restart sshd
systemctl restart sshd
# Repeat steps 1-3 on hadoop2 and hadoop3
# Configure hostname mappings for later synchronization; do this on all three servers
vim /etc/hosts
172.18.0.2 hadoop1
172.18.0.3 hadoop2
172.18.0.4 hadoop3
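Once /etc/hosts is updated on every node, a short loop (run from any container) can confirm that the three hostnames resolve; getent reads the same sources as the resolver, so no extra tools are needed:
for h in hadoop1 hadoop2 hadoop3; do getent hosts $h; done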
# Generate an ssh key pair (on hadoop1)
ssh-keygen -t rsa
cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys
# After the key is generated, distribute the public key to the slave hosts with scp
ssh root@hadoop2 'mkdir ~/.ssh'
scp ~/.ssh/authorized_keys root@hadoop2:~/.ssh/
ssh root@hadoop3 'mkdir ~/.ssh'
scp ~/.ssh/authorized_keys root@hadoop3:~/.ssh/
# Test passwordless login: ssh to each of the other machines; no password should be required
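One way to test all three targets in a single pass from hadoop1 is the loop below; each line should print the remote hostname without a password prompt:
for h in hadoop1 hadoop2 hadoop3; do ssh root@$h hostname; done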
# Extract Hadoop on hadoop1
tar -zxvf hadoop-2.9.2.tar.gz -C /root/servers/
# Add environment variables (append to /etc/profile)
vim /etc/profile
export HADOOP_HOME=/root/servers/hadoop-2.9.2
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
source /etc/profile
# Verify the installation
hadoop version
cd /root/servers/hadoop-2.9.2/etc/hadoop
vim hadoop-env.sh
export JAVA_HOME=/root/jdk1.8.0_231
vim core-site.xml
<property>
    <name>fs.defaultFS</name>
    <value>hdfs://hadoop1:9000</value>
</property>
<property>
    <name>hadoop.tmp.dir</name>
    <value>/root/servers/hadoop-2.9.2/data/tmp</value>
</property>
vim hdfs-site.xml
<property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>hadoop3:50090</value>
</property>
<property>
    <name>dfs.replication</name>
    <value>3</value>
</property>
vim slaves
hadoop1
hadoop2
hadoop3
vim mapred-env.sh
export JAVA_HOME=/root/jdk1.8.0_231
mv mapred-site.xml.template mapred-site.xml
vim mapred-site.xml
<property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
</property>
vim yarn-env.sh
export JAVA_HOME=/root/jdk1.8.0_231
vim yarn-site.xml
<property>
    <name>yarn.resourcemanager.hostname</name>
    <value>hadoop3</value>
</property>
<property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
</property>
chown -R root:root /root/servers/hadoop-2.9.2
# Write a cluster file-distribution script based on rsync
cd /usr/local/bin
vim rsync-script
#!/bin/bash
# 1. Get the number of arguments; exit immediately if none were given
paramnum=$#
if((paramnum==0));
then
echo no params;
exit;
fi
# 2. Get the file name from the argument
p1=$1
file_name=`basename $p1`
echo fname=$file_name
# 3. Get the absolute path of the argument
pdir=`cd -P $(dirname $p1); pwd`
echo pdir=$pdir
# 4. Get the current user name
user=`whoami`
# 5. Loop over the cluster hosts and rsync the file to each one
for((host=1; host<4; host++));
do
echo ------------------- hadoop$host --------------
rsync -rvl $pdir/$file_name $user@hadoop$host:$pdir
done
chmod 777 rsync-script
# Distribute the Hadoop installation to the other nodes
rsync-script /root/servers/hadoop-2.9.2
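Since hadoop2 and hadoop3 will run Hadoop daemons too, the same script can push /etc/profile (with the JAVA_HOME and HADOOP_HOME exports) to them; this is just a convenience sketch, and the variables take effect on the next login or after sourcing the file:
rsync-script /etc/profile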
If the cluster is being started for the first time, format the NameNode on the node where the NameNode runs; on any later startup, do NOT format the NameNode again!
# Format the NameNode on hadoop1
hadoop namenode -format
# Start the NameNode on hadoop1
hadoop-daemon.sh start namenode
# Start the DataNode on hadoop2 and hadoop3 (you can also add the environment variables to hadoop2 and hadoop3)
hadoop-daemon.sh start datanode
# Check that the daemons started
jps
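If startup succeeded, jps should list a NameNode process on hadoop1 and a DataNode process on hadoop2 and hadoop3 (PIDs will differ); roughly:
# hadoop1: NameNode, Jps
# hadoop2 / hadoop3: DataNode, Jps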
# View the HDFS web UI
http://localhost:50070/dfshealth.html#tab-overview
# Start the YARN cluster daemon by daemon
# On hadoop3
yarn-daemon.sh start resourcemanager
# On hadoop1 and hadoop2
yarn-daemon.sh start nodemanager
# Check that the daemons started
jps
# Stop the daemons
hadoop-daemon.sh stop namenode
hadoop-daemon.sh stop datanode
yarn-daemon.sh stop resourcemanager
yarn-daemon.sh stop nodemanager
# Format on hadoop1. Note: if it was already formatted earlier, do NOT format again!
hadoop namenode -format
# Start HDFS (on hadoop1)
start-dfs.sh
# Start YARN (on hadoop3)
start-yarn.sh
# Stop the services
stop-dfs.sh
stop-yarn.sh
# HDFS
hadoop-daemon.sh start / stop namenode / datanode / secondarynamenode
start-dfs.sh / stop-dfs.sh
# YARN
yarn-daemon.sh start / stop resourcemanager / nodemanager
start-yarn.sh / stop-yarn.sh
# On any node
cd /root
vim test.txt
hello hdfs
# Create a directory on HDFS. Note: the firewalls on the other servers must be turned off
hdfs dfs -mkdir -p /test/input
# Upload a local Linux file to HDFS
hdfs dfs -put /root/test.txt /test/input
# Download a file from HDFS to the local Linux filesystem
hdfs dfs -get /test/input/test.txt
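To confirm the round trip, list the HDFS directory and the downloaded copy (assuming -get was run from the current directory):
hdfs dfs -ls /test/input
ls -l ./test.txt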
# Create a /wcinput directory on HDFS
hdfs dfs -mkdir /wcinput
# Create wc.txt under the local /root directory
vim /root/wc.txt
hadoop mapreduce yarn
hdfs hadoop mapreduce
mapreduce yarn lx
lx
lx
# Upload wc.txt to /wcinput on HDFS
hdfs dfs -put /root/wc.txt /wcinput
# Run the example job. Note: do not create /wcoutput in advance; let the job create it
hadoop jar /root/servers/hadoop-2.9.2/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.2.jar wordcount /wcinput /wcoutput
# View the result
hdfs dfs -cat /wcoutput/part-r-00000
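For the wc.txt above the counts can be checked by hand, so the cat output should look roughly like the following (word, then a tab-separated count):
# hadoop	2
# hdfs	1
# lx	3
# mapreduce	3
# yarn	2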
# Configure mapred-site.xml (on hadoop1)
vim mapred-site.xml
<property>
    <name>mapreduce.jobhistory.address</name>
    <value>hadoop1:10020</value>
</property>
<property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>hadoop1:19888</value>
</property>
# Distribute mapred-site.xml to the other nodes
rsync-script mapred-site.xml
# Start the JobHistory server
mr-jobhistory-daemon.sh start historyserver
# Check that the JobHistory server started
jps
# View the JobHistory web UI
http://hadoop1:19888/jobhistory
# Configure log aggregation (in yarn-site.xml)
vim yarn-site.xml
<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>
<property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
</property>
# Distribute yarn-site.xml to the other cluster nodes
rsync-script yarn-site.xml
# Stop the NodeManagers, ResourceManager, and JobHistory server
# On hadoop3
stop-yarn.sh
# On hadoop1
mr-jobhistory-daemon.sh stop historyserver
# Start the NodeManagers, ResourceManager, and JobHistory server
# On hadoop3
start-yarn.sh
# On hadoop1
mr-jobhistory-daemon.sh start historyserver
# Delete the existing output directory on HDFS
hdfs dfs -rm -R /wcoutput
# Run the WordCount job again
hadoop jar /root/servers/hadoop-2.9.2/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.2.jar wordcount /wcinput /wcoutput
# View the job logs in the JobHistory web UI
http://hadoop1:19888/jobhistory