[TOC]
hadoop283.dockerfile
# hadoop283-jdk.dockerfile
#
# Builds a Hadoop 2.8.3 node image on top of the Debian 8 / JDK 8 / SSH base.
#
# The distribution is pruned in a throw-away first stage: files removed by a
# RUN that comes after the COPY would still be stored in the COPY layer, so
# deleting them in the same image would not make it any smaller.

# ---- stage 1: stage the Hadoop distribution and drop what is not needed ----
FROM xiaows/debian8-jdk8-ssh:3.0 AS hadoop-dist
COPY hadoop-2.8.3/ /usr/local/hadoop-2.8.3
# Bundled documentation and the Windows *.cmd launchers are useless on Linux.
RUN rm -rf /usr/local/hadoop-2.8.3/share/doc \
 && rm -f /usr/local/hadoop-2.8.3/bin/*.cmd \
          /usr/local/hadoop-2.8.3/sbin/*.cmd

# ---- stage 2: runtime image ----
FROM xiaows/debian8-jdk8-ssh:3.0
# MAINTAINER XIAOWS
WORKDIR /root

ENV HADOOP_HOME=/usr/local/hadoop-2.8.3
ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

# install hadoop (already pruned by the first stage)
COPY --from=hadoop-dist /usr/local/hadoop-2.8.3 $HADOOP_HOME
# copy hadoop-config
# COPY hadoop-conf/ $HADOOP_HOME/etc/hadoop/

# Disabled steps kept from the original recipe:
#   mkdir -p $HADOOP_HOME/logs
#   ln -s $HADOOP_HOME/etc/hadoop /root/hadoop-conf
#
# `set -e` aborts the build on the first failing command; with plain `;`
# chaining only the exit status of the last command was ever checked.
# run-wordcount.sh is optional, so it is moved only when it is present.
# NOTE(review): /root/Dockerfile-debian8-jdk8-ssh is presumably left behind by
# the base image; it is moved only if it exists - confirm against that image.
RUN set -e; \
    mkdir -p /data/hdfs/namenode /data/hdfs/datanode; \
    mv "$HADOOP_HOME/etc/hadoop/start-hadoop.sh" /root/; \
    if [ -f "$HADOOP_HOME/etc/hadoop/run-wordcount.sh" ]; then \
        mv "$HADOOP_HOME/etc/hadoop/run-wordcount.sh" /root/; \
    fi; \
    if [ -f /root/Dockerfile-debian8-jdk8-ssh ]; then \
        mv /root/Dockerfile-debian8-jdk8-ssh /; \
    fi; \
    chmod +x /root/*.sh; \
    hdfs namenode -format

# Keep a copy of this recipe inside the image.
# NOTE(review): the build context must contain a file with exactly this name -
# confirm it matches the file passed to `docker build -f`.
COPY hadoop283-jdk.dockerfile /
# ENTRYPOINT hdfs namenode -format
修改如下配置文件
- $HADOOP_HOME/etc/hadoop/core-site.xml
```xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://hadoop-1:9000/</value>
  </property>
</configuration>
```
- $HADOOP_HOME/etc/hadoop/hadoop-env.sh
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Set Hadoop-specific environment variables here.
# The only required environment variable is JAVA_HOME. All others are
# optional. When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.
# The java implementation to use.
# Hard-coded to the Debian OpenJDK 8 (amd64) location.
# NOTE(review): presumably matches the JDK shipped in the base image - confirm.
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
# The jsvc implementation to use. Jsvc is required to run secure datanodes
# that bind to privileged ports to provide authentication of data transfer
# protocol. Jsvc is not required if SASL is configured for authentication of
# data transfer protocol using non-privileged ports.
#export JSVC_HOME=${JSVC_HOME}
# Keep an already-exported HADOOP_CONF_DIR; fall back to /etc/hadoop only when
# it is unset or empty.
export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
# Each matching jar is appended to HADOOP_CLASSPATH (or becomes its first
# entry when the variable is still empty). If the glob matches nothing, the
# shell leaves the pattern unexpanded and that literal string is appended.
for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
if [ "$HADOOP_CLASSPATH" ]; then
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
else
export HADOOP_CLASSPATH=$f
fi
done
# The maximum amount of heap to use, in MB. Default is 1000.
#export HADOOP_HEAPSIZE=
#export HADOOP_NAMENODE_INIT_HEAPSIZE=""
# Extra Java runtime options. Empty by default.
# Appends the IPv4-preference flag to whatever HADOOP_OPTS already holds.
export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"
# Command specific options appended to HADOOP_OPTS when specified
# The logger settings below are placed before any pre-existing value of the
# same variable; the ${VAR:-default} forms let the environment override them.
export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
# Re-exported unchanged; no extra NFS3 options are added here.
export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"
# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
# On secure datanodes, user to run the datanode as after dropping privileges.
# This **MUST** be uncommented to enable secure HDFS if using privileged ports
# to provide authentication of data transfer protocol. This **MUST NOT** be
# defined if SASL is configured for authentication of data transfer protocol
# using non-privileged ports.
# Passes through whatever the caller exported; expands to empty when unset.
export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
# Where log files are stored. $HADOOP_HOME/logs by default.
#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
# Where log files are stored in the secure data environment.
# Built from HADOOP_LOG_DIR and HADOOP_HDFS_USER as inherited from the
# environment; neither is assigned in the part of this file shown here.
export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
###
# HDFS Mover specific parameters
###
# Specify the JVM options to be used when starting the HDFS Mover.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HADOOP_MOVER_OPTS=""
###
# Advanced Users Only!
###
# The directory where pid files are stored. /tmp by default.
# NOTE: this should be set to a directory that can only be written to by
# the user that will run the hadoop daemons. Otherwise there is the
# potential for a symlink attack.
# Both PID directories take the inherited HADOOP_PID_DIR (empty when unset).
export HADOOP_PID_DIR=${HADOOP_PID_DIR}
export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
# A string representing this instance of hadoop. $USER by default.
export HADOOP_IDENT_STRING=$USER
- $HADOOP_HOME/etc/hadoop/hdfs-site.xml
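```xml
<configuration>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///data/hdfs/namenode</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///data/hdfs/datanode</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
</configuration>
```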
- $HADOOP_HOME/etc/hadoop/mapred-site.xml
```xml
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
```
- $HADOOP_HOME/etc/hadoop/slaves
hadoop-1
hadoop-2
hadoop-3
#hadoop-4
- $HADOOP_HOME/etc/hadoop/yarn-site.xml
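```xml
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>hadoop-1</value>
  </property>
</configuration>
```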
- $HADOOP_HOME/etc/hadoop/run-wordcount.sh [可选]
#!/bin/bash
# Smoke test for the Hadoop cluster: stage two tiny text files, run the
# bundled WordCount example on them, then print the inputs and the result.

# Local staging directory with the sample documents.
mkdir input
echo "Hello Hadoop" >input/file1.txt
echo "Hello Docker" >input/file2.txt

# Mirror the staging directory into HDFS (relative to the user's HDFS home).
hadoop fs -mkdir -p input
hdfs dfs -put ./input/* input

# Run the WordCount example: reads "input", writes "output".
hadoop jar \
    $HADOOP_HOME/share/hadoop/mapreduce/sources/hadoop-mapreduce-examples-2.8.3-sources.jar \
    org.apache.hadoop.examples.WordCount \
    input output

# Echo back each input file as stored in HDFS.
for sample in file1.txt file2.txt; do
    echo -e "\ninput ${sample}:"
    hdfs dfs -cat input/${sample}
done

# Show the reducer output.
echo -e "\nwordcount output:"
hdfs dfs -cat output/part-r-00000
- run-container.sh
#!/bin/bash
# start-cluster.sh
#
# Start a three-container cluster from a single image and open an interactive
# shell on the first node.
#
# Usage: ./run-container.sh <image> <service-name> [network]
#   $1  docker image to run on every node
#   $2  service name; containers and hostnames become <service-name>-1..3
#   $3  docker network to attach to (default: cluster); this script does not
#       create the network, so it has to exist already

# Both the image and the service name are mandatory. Checking only for zero
# arguments would let a single-argument call continue with an empty service
# name and operate on containers called "-1", "-2" and "-3".
if [ $# -lt 2 ]
then
    echo "1. Please specify the docker-image of cluster !"
    echo "2. Please specify name of cluster service !"
    exit 1
fi

image_name=$1
service_name=$2
net_name=${3:-cluster}

# Remove containers left over from a previous run. docker prints an error for
# names that do not exist; the script carries on regardless.
docker rm -f "${service_name}"-{1,2,3}

i=1
while [ "$i" -lt 4 ]
do
    echo "start ${service_name}-$i container..."
    docker run -itd \
        --net="${net_name}" \
        --hostname "${service_name}-$i" \
        --name "${service_name}-$i" \
        --restart=always \
        "$image_name"
    i=$(( i + 1 ))
done

# Drop into a shell on the first node.
docker exec -it "${service_name}-1" bash
- 为了使镜像构建得尽量小,可以删除 Hadoop 中的 doc/ 文档、src/ 源码以及 bin/*.cmd、sbin/*.cmd
- 构建镜像
docker build -f hadoop283.dockerfile -t xiaows/hadoop283:3.0 .
- 启动Hadoop集群
./run-container.sh xiaows/hadoop283:3.0 hadoop