# Install wget
apt-get install wget
# Download the Hadoop binary distribution
wget http://mirrors.sonic.net/apache/hadoop/common/hadoop-2.7.7/hadoop-2.7.7.tar.gz
# Extract the archive
tar -xvzf hadoop-2.7.7.tar.gz
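The Dockerfile below copies jdk1.8.0_144 and hadoop-2.7.7 from the build context, so both extracted directories are assumed to sit next to the Dockerfile (the JDK is not downloaded above and has to be obtained and unpacked separately):
ls
# Expected: Dockerfile  hadoop-2.7.7  jdk1.8.0_144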
# Start from an existing OS image as the base
FROM centos
# Image maintainer
MAINTAINER 1919698075
# Install Java
COPY jdk1.8.0_144 /usr/local/jdk1.8
ENV JAVA_HOME /usr/local/jdk1.8
ENV PATH $JAVA_HOME/bin:$PATH
# Install Hadoop
COPY hadoop-2.7.7 /usr/local/hadoop
ENV HADOOP_HOME /usr/local/hadoop
ENV PATH $HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
# Install the required support packages: openssh-server/clients, sudo and which
RUN yum install -y openssh-server sudo
RUN yum install -y which
RUN sed -i 's/UsePAM yes/UsePAM no/g' /etc/ssh/sshd_config
RUN yum install -y openssh-clients
# Set a password for the image's root user and grant it sudo; this step is required
RUN echo "root:root" | chpasswd
RUN echo "root ALL=(ALL) ALL" >> /etc/sudoers
#给SSH生成密钥对
RUN ssh-keygen -t dsa -f /etc/ssh/ssh_host_dsa_key
RUN ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key
# Start the sshd service and expose its port
RUN mkdir /var/run/sshd
EXPOSE 22
CMD [ "/usr/sbin/sshd","-D"]
# -t specifies the image name (you can choose your own); note the trailing "." for the build context
docker build -t ruiclear/clean-hadoop .
docker pull registry.cn-hangzhou.aliyuncs.com/ruiclear/clean-hadoop:1.0.0
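If you pull the pre-built image from the Aliyun registry instead of building it yourself, its name will not match the ruiclear/clean-hadoop name used by the docker run commands below; one way to bridge that (a retag, not needed if you built the image locally) is:
docker tag registry.cn-hangzhou.aliyuncs.com/ruiclear/clean-hadoop:1.0.0 ruiclear/clean-hadoop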
# Publish the web UI port mappings on the master node so the cluster can be monitored from a browser (the slave nodes do not need them)
docker run --name master -d -h master -P -p 50070:50070 -p 8088:8088 ruiclear/clean-hadoop
docker run --name slave1 -d -h slave1 -P ruiclear/clean-hadoop
docker run --name slave2 -d -h slave2 -P ruiclear/clean-hadoop
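Before attaching to the containers, it can be worth confirming that all three are running and that the master's 50070/8088 port mappings are in place:
docker ps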
docker exec -it master /bin/bash
docker exec -it slave1 /bin/bash
docker exec -it slave2 /bin/bash
# The container's IP address usually appears two lines below the last eth0 entry
ip addr
# My IPs are used as an example; yours may differ, so adjust accordingly. Add these hostname mappings to /etc/hosts on every node (see the sketch below)
172.17.0.2 master
172.17.0.3 slave1
172.17.0.4 slave2
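A minimal sketch for appending the mappings on one node, assuming the example IPs above (repeat inside master, slave1 and slave2):
cat >> /etc/hosts <<EOF
172.17.0.2 master
172.17.0.3 slave1
172.17.0.4 slave2
EOF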
ssh-keygen -t rsa
ssh-copy-id master
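The start scripts below need passwordless SSH to every node, not only to master, so after generating a key on a node you will likely want to copy its public key to all three hosts. A rough sketch (the root password is "root", as set in the Dockerfile):
for h in master slave1 slave2; do ssh-copy-id root@$h; done
# Quick check: this should log in without prompting for a password
ssh slave1 hostname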
Configure the following files under /usr/local/hadoop/etc/hadoop/ (only the <property> entries inside <configuration> are shown).
core-site.xml:
<property>
    <name>fs.defaultFS</name>
    <value>hdfs://master:9000</value>
</property>
<property>
    <name>hadoop.tmp.dir</name>
    <value>/usr/local/hadoop/data/tmp</value>
</property>
hdfs-site.xml:
<property>
    <name>dfs.replication</name>
    <value>3</value>
</property>
<property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>slave2:50090</value>
</property>
yarn-site.xml:
<property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>20480</value>
</property>
<property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>2048</value>
</property>
<property>
    <name>yarn.nodemanager.vmem-pmem-ratio</name>
    <value>2.1</value>
</property>
<property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
</property>
<property>
    <name>yarn.resourcemanager.hostname</name>
    <value>slave1</value>
</property>
<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>
<property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
</property>
mapred-site.xml:
<property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
</property>
<property>
    <name>mapreduce.jobhistory.address</name>
    <value>slave2:10020</value>
</property>
<property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>slave2:19888</value>
</property>
slaves (one worker hostname per line):
master
slave1
slave2
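The scp commands below also distribute hadoop-env.sh, yarn-env.sh and mapred-env.sh. Their contents are not shown in the original text; the usual edit (an assumption here) is to pin JAVA_HOME to the path set in the Dockerfile so the daemons can find the JDK:
# In hadoop-env.sh (same idea for yarn-env.sh and mapred-env.sh)
export JAVA_HOME=/usr/local/jdk1.8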
scp hadoop-env.sh yarn-env.sh mapred-env.sh core-site.xml hdfs-site.xml yarn-site.xml mapred-site.xml slaves root@slave1:/usr/local/hadoop/etc/hadoop/
scp hadoop-env.sh yarn-env.sh mapred-env.sh core-site.xml hdfs-site.xml yarn-site.xml mapred-site.xml slaves root@slave2:/usr/local/hadoop/etc/hadoop/
Note: scp always overwrites the remote copy, transferring every file whether or not it has changed. You can also use rsync, which copies only files that differ and skips identical ones, so rsync is more efficient; see the sketch below.
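A rough rsync equivalent of the scp commands above, assuming rsync is available in the containers (the Dockerfile does not install it, so yum install -y rsync may be needed on both ends):
rsync -av /usr/local/hadoop/etc/hadoop/ root@slave1:/usr/local/hadoop/etc/hadoop/
rsync -av /usr/local/hadoop/etc/hadoop/ root@slave2:/usr/local/hadoop/etc/hadoop/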
hdfs namenode -format
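The format step runs once, on master. If the cluster ever needs to be re-formatted, the contents of hadoop.tmp.dir should be wiped on every node first, otherwise the DataNodes will refuse to start because of a cluster ID mismatch. A rough sketch (destructive, removes all HDFS data; path taken from the hadoop.tmp.dir setting above):
rm -rf /usr/local/hadoop/data/tmp/*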
# Run from $HADOOP_HOME/sbin on master (the NameNode, per fs.defaultFS)
start-dfs.sh
# Run from $HADOOP_HOME/sbin on slave1 (the ResourceManager, per yarn.resourcemanager.hostname)
start-yarn.sh
# Run from $HADOOP_HOME/sbin on slave2 (the JobHistoryServer, per mapreduce.jobhistory.address)
mr-jobhistory-daemon.sh start historyserver
master:
[root@master hadoop]# jps
5361 DataNode
5225 NameNode
6459 Jps
slave1:
[root@slave1 hadoop]# jps
3936 ResourceManager
4049 NodeManager
3798 DataNode
5084 Jps
slave2:
[root@slave2 hadoop]# jps
2962 Jps
2548 SecondaryNameNode
2437 DataNode
2647 NodeManager
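Besides jps, the DataNode registrations can be checked from the command line on any node; the HDFS web UI is also reachable from the Docker host at http://localhost:50070 thanks to the -p 50070:50070 mapping on the master container.
hdfs dfsadmin -report
# Should report 3 live datanodes (master, slave1, slave2)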
cd /usr/local/hadoop
mkdir input
# Type a few words into the file, separated by spaces
vim input/input
# Upload the local input directory to the root of HDFS
hadoop fs -put input /
Note: you can check the directory with the hadoop fs -ls / command
hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.7.jar wordcount /input /output
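When the job finishes, the word counts are written to the /output directory given on the command line; they can be inspected with (part-r-00000 is the typical single-reducer output file name):
hadoop fs -ls /output
hadoop fs -cat /output/part-r-00000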