docker-compose部署hadoop集群(高可用)—— 筑梦之路

hadoop高可用依赖组件zookeeper

1. 下载二进制文件

# Download the release tarballs that the Dockerfile ADDs into the image.
#
# The versions are pinned, so they are fetched from archive.apache.org: the
# dlcdn.apache.org mirror only carries the current releases and drops older
# ones. TLS verification is left enabled (no --no-check-certificate); if wget
# rejects the certificate, update the host's CA bundle instead of disabling
# the check.
#
# jdk-8u212-linux-x64.tar.gz is also ADDed by the Dockerfile but is not
# downloaded here; place it in the build directory yourself.

### 1. ZooKeeper
# Release index: https://zookeeper.apache.org/releases.html
# ZooKeeper is only needed for the high-availability deployment.
wget https://archive.apache.org/dist/zookeeper/zookeeper-3.8.0/apache-zookeeper-3.8.0-bin.tar.gz
tar -xf  apache-zookeeper-3.8.0-bin.tar.gz

### 2. Hadoop
# Release index: https://archive.apache.org/dist/hadoop/common/
wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz

### 3. Hive
# The Dockerfile ADDs apache-hive-3.1.3-bin.tar.gz, so it has to be present too.
wget https://archive.apache.org/dist/hive/hive-3.1.3/apache-hive-3.1.3-bin.tar.gz

### 4. Spark
# Download page: http://spark.apache.org/downloads.html
wget https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz

### 5. Flink
wget https://archive.apache.org/dist/flink/flink-1.17.0/flink-1.17.0-bin-scala_2.12.tgz

2. 编写Dockerfile

# One image for every role of the cluster (ZooKeeper, HDFS, YARN, MapReduce
# history server); the role is selected at run time via /opt/apache/bootstrap.sh.
# "centos:7.9" is not a tag of the official image; 7.9.2009 is the full tag of
# the 7.9 release.
FROM centos:7.9.2009

# Use the Asia/Shanghai time zone inside the container.
RUN ln -sfv /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo "Asia/Shanghai" > /etc/timezone

# A variable exported inside RUN is gone when that build step ends; ENV is what
# persists into later layers and the running container.
# NOTE(review): the stock image is believed to ship only en_US locale data, so a
# UTF-8 locale that exists is used here; generate zh_CN.UTF-8 first if Chinese
# messages are required.
ENV LANG=en_US.UTF-8

# Create the service account; uid/gid 10000 is what the compose file's `user:`
# setting is expected to resolve to.
RUN groupadd --system --gid=10000 hadoop && \
    useradd --system --create-home --home-dir /home/hadoop --uid=10000 --gid=hadoop hadoop

# Install the tools used by bootstrap.sh and the compose health checks, then
# grant the hadoop user password-less sudo.
#  - CentOS 7 is end-of-life, so the repositories are pointed at vault.centos.org.
#  - nmap-ncat provides `nc`, which bootstrap.sh's wait_for relies on.
#    NOTE(review): confirm the packaged ncat accepts `-z`.
#  - Every step is chained with && so a failed install fails the build, and the
#    yum cache is removed in the same layer.
#  - The sudo rule goes into a 0440 drop-in instead of editing /etc/sudoers.
RUN sed -i \
        -e 's|^mirrorlist=|#mirrorlist=|' \
        -e 's|^#[[:space:]]*baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|' \
        /etc/yum.repos.d/CentOS-*.repo && \
    yum -y install \
        curl \
        net-tools \
        nmap-ncat \
        sudo \
        telnet \
        wget && \
    yum clean all && \
    rm -rf /var/cache/yum && \
    echo "hadoop ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/hadoop && \
    chmod 0440 /etc/sudoers.d/hadoop

RUN mkdir -p /opt/apache/

# JDK (ADD unpacks the local tarball into /opt/apache/).
ADD jdk-8u212-linux-x64.tar.gz /opt/apache/
ENV JAVA_HOME=/opt/apache/jdk1.8.0_212
ENV PATH=$JAVA_HOME/bin:$PATH

# ZooKeeper: unpack, expose it under a version-independent symlink, add config.
ENV ZOOKEEPER_VERSION=3.8.0
ADD apache-zookeeper-${ZOOKEEPER_VERSION}-bin.tar.gz /opt/apache/
ENV ZOOKEEPER_HOME=/opt/apache/zookeeper
RUN ln -s /opt/apache/apache-zookeeper-${ZOOKEEPER_VERSION}-bin $ZOOKEEPER_HOME
COPY config/zookeeper-config/* ${ZOOKEEPER_HOME}/conf/

# Hadoop
ENV HADOOP_VERSION=3.3.5
ADD hadoop-${HADOOP_VERSION}.tar.gz /opt/apache/
ENV HADOOP_HOME=/opt/apache/hadoop
RUN ln -s /opt/apache/hadoop-${HADOOP_VERSION} $HADOOP_HOME

ENV HADOOP_COMMON_HOME=${HADOOP_HOME} \
    HADOOP_HDFS_HOME=${HADOOP_HOME} \
    HADOOP_MAPRED_HOME=${HADOOP_HOME} \
    HADOOP_YARN_HOME=${HADOOP_HOME} \
    HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop \
    PATH=${PATH}:${HADOOP_HOME}/bin

# Hive
ENV HIVE_VERSION=3.1.3
ADD apache-hive-${HIVE_VERSION}-bin.tar.gz /opt/apache/
ENV HIVE_HOME=/opt/apache/hive
ENV PATH=$HIVE_HOME/bin:$PATH
RUN ln -s /opt/apache/apache-hive-${HIVE_VERSION}-bin ${HIVE_HOME}

# Spark
ENV SPARK_VERSION=3.3.2
ADD spark-${SPARK_VERSION}-bin-hadoop3.tgz /opt/apache/
ENV SPARK_HOME=/opt/apache/spark
ENV PATH=$SPARK_HOME/bin:$PATH
RUN ln -s /opt/apache/spark-${SPARK_VERSION}-bin-hadoop3 ${SPARK_HOME}

# Flink
ENV FLINK_VERSION=1.17.0
ADD flink-${FLINK_VERSION}-bin-scala_2.12.tgz /opt/apache/
ENV FLINK_HOME=/opt/apache/flink
ENV PATH=$FLINK_HOME/bin:$PATH
RUN ln -s /opt/apache/flink-${FLINK_VERSION} ${FLINK_HOME}

# Storage directories for the JournalNode, NameNode, DataNode and YARN.
# The paths are listed explicitly because brace expansion ({a,b}, {1..3}) is a
# bash feature and RUN executes through /bin/sh.
RUN mkdir -p \
        ${HADOOP_HOME}/data/hdfs/journalnode \
        ${HADOOP_HOME}/data/hdfs/namenode \
        ${HADOOP_HOME}/data/hdfs/datanode/data1 \
        ${HADOOP_HOME}/data/hdfs/datanode/data2 \
        ${HADOOP_HOME}/data/hdfs/datanode/data3 \
        ${HADOOP_HOME}/data/yarn/local-dirs \
        ${HADOOP_HOME}/data/yarn/log-dirs \
        ${HADOOP_HOME}/data/yarn/apps

COPY bootstrap.sh /opt/apache/

COPY config/hadoop-config/* ${HADOOP_HOME}/etc/hadoop/

# bootstrap.sh is executed directly by the compose `command`, so it must be
# executable regardless of the file mode in the build context.
RUN chmod 0755 /opt/apache/bootstrap.sh && \
    chown -R hadoop:hadoop /opt/apache

# Kept from the original image: this defines an environment variable named
# "ll"; it does not create a shell alias.
ENV ll="ls -l"

WORKDIR /opt/apache

# Run as the unprivileged service account by default (the compose file sets the
# same user explicitly); sudo remains available for maintenance.
USER hadoop

3. 构建镜像

# Build the image from the Dockerfile in the current directory.
#   --no-cache : do not reuse cached layers
#   -t         : name (and tag) of the resulting image
#   .          : use the current directory and its Dockerfile
#   -f         : (not used here) explicit path to a Dockerfile
docker build --no-cache -t hadoop-ha:v1 .

4. 准备配置文件

1、Hadoop 配置
主要有以下几个文件:core-site.xml、dfs.hosts、dfs.hosts.exclude、hdfs-site.xml、mapred-site.xml、yarn-hosts-exclude、yarn-hosts-include、yarn-site.xml

2、Hive 配置
主要有以下几个文件:hive-env.sh、hive-site.xml
3、启动脚本 bootstrap.sh
cat bootstrap.sh

#!/usr/bin/env sh

# wait_for HOST PORT
# Block until HOST accepts TCP connections on PORT, probing once per second
# with `nc -z`. Exits the script with status 1 if `nc` is not available.
wait_for() {
    # Without nc every probe "fails" (command not found), so the loop below
    # would spin forever; stop with a clear message instead.
    if ! command -v nc >/dev/null 2>&1; then
        echo "wait_for: 'nc' not found in PATH" >&2
        exit 1
    fi

    echo "Waiting for $1 to listen on $2..."
    while ! nc -z "$1" "$2"; do
        echo waiting...
        sleep 1
    done
}

# start_zookeeper
# Run the ZooKeeper server as the main process of this script.
start_zookeeper() {

        # start-foreground keeps the server attached instead of daemonizing it,
        # and exec replaces this shell with it. The container therefore stops
        # when ZooKeeper stops (instead of idling on `tail -f`), and stop
        # signals reach the server directly. Server output goes to
        # stdout/stderr.
        exec "${ZOOKEEPER_HOME}/bin/zkServer.sh" start-foreground
}

# start_hdfs_journalnode HOST PORT
# Wait until HOST:PORT is reachable, then run an HDFS JournalNode as the main
# process of this script.
start_hdfs_journalnode() {

        wait_for "$1" "$2"

        # Running without `--daemon start` keeps the JournalNode in the
        # foreground; exec makes it the script's process, so the container
        # exits when the daemon dies rather than hanging on `tail -f`, and
        # there is no race on a log file that may not exist yet.
        # Log output goes to the console instead of ${HADOOP_HOME}/logs.
        exec "${HADOOP_HOME}/bin/hdfs" --loglevel INFO journalnode
}

# start_hdfs_namenode HOST PORT
# Wait until HOST:PORT is reachable, initialise this NameNode on first start
# (format it as the first NameNode, or bootstrap it as a standby), then start
# ZKFC in the background and run the NameNode as the main process.
# Exits with status 1 if initialisation fails, so the container can be
# restarted and the initialisation retried.
start_hdfs_namenode() {

        wait_for "$1" "$2"

        # Marker written after a successful first-time initialisation.
        # The (misspelled) file name is kept for compatibility with existing
        # containers.
        nn_marker=/opt/apache/hadoop/data/hdfs/namenode/formated

        if [ ! -f "${nn_marker}" ]; then
                # The /hadoop-ha znode tells the first NameNode (znode absent)
                # from a later one (znode present).
                # Assumes zkCli.sh exits non-zero when the znode is absent, as
                # the original check did.
                # NOTE(review): a failed ZooKeeper connection is not
                # distinguished from a missing znode here.
                if "${ZOOKEEPER_HOME}/bin/zkCli.sh" -server "zookeeper:${ZOOKEEPER_PORT}" ls /hadoop-ha 1>/dev/null; then
                        "${HADOOP_HOME}/bin/hdfs" namenode -bootstrapStandby -force -nonInteractive || exit 1
                else
                        # Format the NameNode before creating the znode: if
                        # either step fails, the znode is still absent on the
                        # next attempt and this branch is retried from scratch.
                        "${HADOOP_HOME}/bin/hdfs" namenode -format -force -nonInteractive || exit 1
                        "${HADOOP_HOME}/bin/hdfs" zkfc -formatZK -nonInteractive || exit 1
                fi
                echo 1 > "${nn_marker}"
        fi

        "${HADOOP_HOME}/bin/hdfs" --loglevel INFO --daemon start zkfc

        # The NameNode runs in the foreground (no --daemon) and replaces this
        # shell, so the container exits when it dies. Its log output goes to
        # the console; ZKFC still logs under ${HADOOP_HOME}/logs.
        exec "${HADOOP_HOME}/bin/hdfs" --loglevel INFO namenode
}

# start_hdfs_datanode HOST PORT
# Wait until HOST:PORT is reachable, then run an HDFS DataNode as the main
# process of this script.
start_hdfs_datanode() {

        wait_for "$1" "$2"

        # Foreground run (no `--daemon start`) + exec: the container exits when
        # the DataNode dies instead of idling on `tail -f`, and no log-file
        # glob has to match. Log output goes to the console.
        exec "${HADOOP_HOME}/bin/hdfs" --loglevel INFO datanode
}

# start_yarn_resourcemanager HOST PORT
# Wait until HOST:PORT is reachable, then run a YARN ResourceManager as the
# main process of this script.
start_yarn_resourcemanager() {

        wait_for "$1" "$2"

        # Foreground run (no `--daemon start`) + exec: the container exits when
        # the ResourceManager dies instead of idling on `tail -f`, and no
        # log-file glob has to match. Log output goes to the console.
        exec "${HADOOP_HOME}/bin/yarn" --loglevel INFO resourcemanager
}

# start_yarn_nodemanager HOST PORT
# Wait until HOST:PORT is reachable, then run a YARN NodeManager as the main
# process of this script.
start_yarn_nodemanager() {

        wait_for "$1" "$2"

        # Foreground run (no `--daemon start`) + exec: the container exits when
        # the NodeManager dies instead of idling on `tail -f`, and no log-file
        # glob has to match. Log output goes to the console.
        exec "${HADOOP_HOME}/bin/yarn" --loglevel INFO nodemanager
}

# start_yarn_proxyserver HOST PORT
# Wait until HOST:PORT is reachable, then run the YARN web proxy server as the
# main process of this script.
start_yarn_proxyserver() {

        wait_for "$1" "$2"

        # Foreground run (no `--daemon start`) + exec: the container exits when
        # the proxy server dies instead of idling on `tail -f`, and no log-file
        # glob has to match. Log output goes to the console.
        exec "${HADOOP_HOME}/bin/yarn" --loglevel INFO proxyserver
}

# start_mr_historyserver HOST PORT
# Wait until HOST:PORT is reachable, then run the MapReduce JobHistory server
# as the main process of this script.
start_mr_historyserver() {

        wait_for "$1" "$2"

        # Foreground run (no `--daemon start`) + exec: the container exits when
        # the history server dies instead of idling on `tail -f`, and no
        # log-file glob has to match. Log output goes to the console.
        exec "${HADOOP_HOME}/bin/mapred" --loglevel INFO historyserver
}

# Entry point: $1 selects the service to start; $2 and $3 are the host and port
# the service waits for before starting (unused for zookeeper).
# An unknown service name is reported on stderr and the script exits with
# status 1, so a mistyped command is not mistaken for a clean exit.
case "$1" in
        zookeeper)
                start_zookeeper
                ;;
        hadoop-hdfs-jn)
                start_hdfs_journalnode "$2" "$3"
                ;;
        hadoop-hdfs-nn)
                start_hdfs_namenode "$2" "$3"
                ;;
        hadoop-hdfs-dn)
                start_hdfs_datanode "$2" "$3"
                ;;
        hadoop-yarn-rm)
                start_yarn_resourcemanager "$2" "$3"
                ;;
        hadoop-yarn-nm)
                start_yarn_nodemanager "$2" "$3"
                ;;
        hadoop-yarn-proxyserver)
                start_yarn_proxyserver "$2" "$3"
                ;;
        hadoop-mr-historyserver)
                start_mr_historyserver "$2" "$3"
                ;;
        *)
                echo "请输入正确的服务启动命令~" >&2
                exit 1
                ;;
esac
# Create the .env file next to docker-compose.yml. It holds the service ports:
# docker-compose.yml substitutes them as ${...} and also passes the file to the
# containers through env_file, where bootstrap.sh reads ZOOKEEPER_PORT.
# The delimiter is quoted so the body is written out literally.
cat <<'EOF' > .env
ZOOKEEPER_PORT=2181
HADOOP_HDFS_JN_PORT=8485
HADOOP_HDFS_NN_PORT=9870
HADOOP_HDFS_DN_PORT=9864
HADOOP_YARN_RM_PORT=8088
HADOOP_YARN_NM_PORT=8042
HADOOP_YARN_PROXYSERVER_PORT=9111
HADOOP_MR_HISTORYSERVER_PORT=19888
EOF

5. 编写docker-compose.yml

version: '3'
services:
  # The only ZooKeeper node of this deployment; started through bootstrap.sh.
  zookeeper:
    image: hadoop-ha:v1
    container_name: zookeeper
    hostname: zookeeper
    user: "hadoop:hadoop"
    restart: always
    networks:
      - hadoopha_network
    env_file:
      - .env
    # Only the container port is given, so the host port is chosen by Docker.
    ports:
      - "${ZOOKEEPER_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh zookeeper"
    healthcheck:
      # Healthy once something listens on the ZooKeeper client port.
      test:
        - CMD-SHELL
        - "netstat -tnlp|grep :${ZOOKEEPER_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # JournalNode 0. bootstrap.sh waits for zookeeper:${ZOOKEEPER_PORT} before
  # starting the daemon.
  hadoop-hdfs-jn-0:
    # Use the locally built image, like the other services of this file,
    # instead of a copy pulled from a third-party registry.
    image: hadoop-ha:v1
    container_name: hadoop-hdfs-jn-0
    hostname: hadoop-hdfs-jn-0
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - zookeeper
    networks:
      - hadoopha_network
    env_file:
      - .env
    expose:
      - "${HADOOP_HDFS_JN_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-jn zookeeper ${ZOOKEEPER_PORT}"
    healthcheck:
      # Healthy once something listens on the JournalNode port.
      test:
        - CMD-SHELL
        - "netstat -tnlp|grep :${HADOOP_HDFS_JN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # JournalNode 1. Started after hadoop-hdfs-jn-0; bootstrap.sh waits for
  # zookeeper:${ZOOKEEPER_PORT} before starting the daemon.
  hadoop-hdfs-jn-1:
    image: hadoop-ha:v1
    container_name: hadoop-hdfs-jn-1
    hostname: hadoop-hdfs-jn-1
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-hdfs-jn-0
    networks:
      - hadoopha_network
    env_file:
      - .env
    expose:
      - "${HADOOP_HDFS_JN_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-jn zookeeper ${ZOOKEEPER_PORT}"
    healthcheck:
      # Healthy once something listens on the JournalNode port.
      test:
        - CMD-SHELL
        - "netstat -tnlp|grep :${HADOOP_HDFS_JN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # JournalNode 2. Started after hadoop-hdfs-jn-1; bootstrap.sh waits for
  # zookeeper:${ZOOKEEPER_PORT} before starting the daemon.
  hadoop-hdfs-jn-2:
    # Use the locally built image, like the other services of this file,
    # instead of a copy pulled from a third-party registry.
    image: hadoop-ha:v1
    container_name: hadoop-hdfs-jn-2
    hostname: hadoop-hdfs-jn-2
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-hdfs-jn-1
    networks:
      - hadoopha_network
    env_file:
      - .env
    expose:
      - "${HADOOP_HDFS_JN_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-jn zookeeper ${ZOOKEEPER_PORT}"
    healthcheck:
      # Healthy once something listens on the JournalNode port.
      test:
        - CMD-SHELL
        - "netstat -tnlp|grep :${HADOOP_HDFS_JN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # NameNode 0. bootstrap.sh waits for hadoop-hdfs-jn-2 to listen on the
  # JournalNode port before initialising and starting the NameNode.
  # Web UI published on host port 30070.
  hadoop-hdfs-nn-0:
    image: hadoop-ha:v1
    container_name: hadoop-hdfs-nn-0
    hostname: hadoop-hdfs-nn-0
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-hdfs-jn-2
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30070:${HADOOP_HDFS_NN_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-nn hadoop-hdfs-jn-2 ${HADOOP_HDFS_JN_PORT}"
    healthcheck:
      # Healthy once something listens on the NameNode HTTP port.
      test:
        - CMD-SHELL
        - "netstat -tnlp|grep :${HADOOP_HDFS_NN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # NameNode 1. bootstrap.sh waits for hadoop-hdfs-nn-0 to listen on the
  # NameNode HTTP port before initialising and starting this NameNode.
  # Web UI published on host port 30071.
  hadoop-hdfs-nn-1:
    # Use the locally built image, like the other services of this file,
    # instead of a copy pulled from a third-party registry.
    image: hadoop-ha:v1
    container_name: hadoop-hdfs-nn-1
    hostname: hadoop-hdfs-nn-1
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-hdfs-nn-0
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30071:${HADOOP_HDFS_NN_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-nn hadoop-hdfs-nn-0 ${HADOOP_HDFS_NN_PORT}"
    healthcheck:
      # Healthy once something listens on the NameNode HTTP port.
      test:
        - CMD-SHELL
        - "netstat -tnlp|grep :${HADOOP_HDFS_NN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 6
  # DataNode 0. bootstrap.sh waits for hadoop-hdfs-nn-1 to listen on the
  # NameNode HTTP port before starting the DataNode.
  # Web UI published on host port 30864.
  hadoop-hdfs-dn-0:
    image: hadoop-ha:v1
    container_name: hadoop-hdfs-dn-0
    hostname: hadoop-hdfs-dn-0
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-hdfs-nn-1
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30864:${HADOOP_HDFS_DN_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-dn hadoop-hdfs-nn-1 ${HADOOP_HDFS_NN_PORT}"
    healthcheck:
      # Healthy once the DataNode HTTP endpoint answers successfully.
      test:
        - CMD-SHELL
        - "curl --fail http://localhost:${HADOOP_HDFS_DN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 8
  # DataNode 1. bootstrap.sh waits for hadoop-hdfs-nn-1 to listen on the
  # NameNode HTTP port before starting the DataNode.
  # Web UI published on host port 30865.
  hadoop-hdfs-dn-1:
    image: hadoop-ha:v1
    container_name: hadoop-hdfs-dn-1
    hostname: hadoop-hdfs-dn-1
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-hdfs-nn-1
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30865:${HADOOP_HDFS_DN_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-dn hadoop-hdfs-nn-1 ${HADOOP_HDFS_NN_PORT}"
    healthcheck:
      # Healthy once the DataNode HTTP endpoint answers successfully.
      test:
        - CMD-SHELL
        - "curl --fail http://localhost:${HADOOP_HDFS_DN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 8
  # DataNode 2. bootstrap.sh waits for hadoop-hdfs-nn-1 to listen on the
  # NameNode HTTP port before starting the DataNode.
  # Web UI published on host port 30866.
  hadoop-hdfs-dn-2:
    image: hadoop-ha:v1
    container_name: hadoop-hdfs-dn-2
    hostname: hadoop-hdfs-dn-2
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-hdfs-nn-1
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30866:${HADOOP_HDFS_DN_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-hdfs-dn hadoop-hdfs-nn-1 ${HADOOP_HDFS_NN_PORT}"
    healthcheck:
      # Healthy once the DataNode HTTP endpoint answers successfully.
      test:
        - CMD-SHELL
        - "curl --fail http://localhost:${HADOOP_HDFS_DN_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 8
  # ResourceManager 0. bootstrap.sh waits for zookeeper:${ZOOKEEPER_PORT}
  # before starting the ResourceManager. Web UI published on host port 30888.
  hadoop-yarn-rm-0:
    image: hadoop-ha:v1
    container_name: hadoop-yarn-rm-0
    hostname: hadoop-yarn-rm-0
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - zookeeper
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30888:${HADOOP_YARN_RM_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-rm zookeeper ${ZOOKEEPER_PORT}"
    healthcheck:
      # Healthy once something listens on the ResourceManager web port.
      test:
        - CMD-SHELL
        - "netstat -tnlp|grep :${HADOOP_YARN_RM_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # ResourceManager 1. bootstrap.sh waits for hadoop-yarn-rm-0 to listen on
  # the ResourceManager web port before starting this ResourceManager.
  # Web UI published on host port 30889.
  hadoop-yarn-rm-1:
    image: hadoop-ha:v1
    container_name: hadoop-yarn-rm-1
    hostname: hadoop-yarn-rm-1
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-yarn-rm-0
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30889:${HADOOP_YARN_RM_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-rm hadoop-yarn-rm-0 ${HADOOP_YARN_RM_PORT}"
    healthcheck:
      # Healthy once something listens on the ResourceManager web port.
      test:
        - CMD-SHELL
        - "netstat -tnlp|grep :${HADOOP_YARN_RM_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # NodeManager 0. bootstrap.sh waits for hadoop-yarn-rm-1 to listen on the
  # ResourceManager web port before starting the NodeManager.
  # Web UI published on host port 30042.
  hadoop-yarn-nm-0:
    image: hadoop-ha:v1
    container_name: hadoop-yarn-nm-0
    hostname: hadoop-yarn-nm-0
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-yarn-rm-1
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30042:${HADOOP_YARN_NM_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-nm hadoop-yarn-rm-1 ${HADOOP_YARN_RM_PORT}"
    healthcheck:
      # Healthy once the NodeManager HTTP endpoint answers successfully.
      test:
        - CMD-SHELL
        - "curl --fail http://localhost:${HADOOP_YARN_NM_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # NodeManager 1. bootstrap.sh waits for hadoop-yarn-rm-1 to listen on the
  # ResourceManager web port before starting the NodeManager.
  # Web UI published on host port 30043.
  hadoop-yarn-nm-1:
    image: hadoop-ha:v1
    container_name: hadoop-yarn-nm-1
    hostname: hadoop-yarn-nm-1
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-yarn-rm-1
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30043:${HADOOP_YARN_NM_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-nm hadoop-yarn-rm-1 ${HADOOP_YARN_RM_PORT}"
    healthcheck:
      # Healthy once the NodeManager HTTP endpoint answers successfully.
      test:
        - CMD-SHELL
        - "curl --fail http://localhost:${HADOOP_YARN_NM_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # NodeManager 2. bootstrap.sh waits for hadoop-yarn-rm-1 to listen on the
  # ResourceManager web port before starting the NodeManager.
  # Web UI published on host port 30044.
  hadoop-yarn-nm-2:
    image: hadoop-ha:v1
    container_name: hadoop-yarn-nm-2
    hostname: hadoop-yarn-nm-2
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-yarn-rm-1
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30044:${HADOOP_YARN_NM_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-nm hadoop-yarn-rm-1 ${HADOOP_YARN_RM_PORT}"
    healthcheck:
      # Healthy once the NodeManager HTTP endpoint answers successfully.
      test:
        - CMD-SHELL
        - "curl --fail http://localhost:${HADOOP_YARN_NM_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # YARN web proxy server. bootstrap.sh waits for hadoop-yarn-rm-1 to listen
  # on the ResourceManager web port before starting it.
  # Published on host port 30911.
  hadoop-yarn-proxyserver:
    image: hadoop-ha:v1
    container_name: hadoop-yarn-proxyserver
    hostname: hadoop-yarn-proxyserver
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-yarn-rm-1
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "30911:${HADOOP_YARN_PROXYSERVER_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-yarn-proxyserver hadoop-yarn-rm-1 ${HADOOP_YARN_RM_PORT}"
    healthcheck:
      # Healthy once something listens on the proxy server port.
      test:
        - CMD-SHELL
        - "netstat -tnlp|grep :${HADOOP_YARN_PROXYSERVER_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 3
  # MapReduce JobHistory server. bootstrap.sh waits for hadoop-yarn-rm-1 to
  # listen on the ResourceManager web port before starting it.
  # Published on host port 31988.
  hadoop-mr-historyserver:
    image: hadoop-ha:v1
    container_name: hadoop-mr-historyserver
    hostname: hadoop-mr-historyserver
    user: "hadoop:hadoop"
    restart: always
    depends_on:
      - hadoop-yarn-rm-1
    networks:
      - hadoopha_network
    env_file:
      - .env
    ports:
      - "31988:${HADOOP_MR_HISTORYSERVER_PORT}"
    command:
      - sh
      - "-c"
      - "/opt/apache/bootstrap.sh hadoop-mr-historyserver hadoop-yarn-rm-1 ${HADOOP_YARN_RM_PORT}"
    healthcheck:
      # Healthy once something listens on the history server port.
      test:
        - CMD-SHELL
        - "netstat -tnlp|grep :${HADOOP_MR_HISTORYSERVER_PORT} || exit 1"
      interval: 10s
      timeout: 5s
      retries: 6

# Network definitions for the compose project.
networks:
  # Bridge network that every service in this file attaches to; the services
  # address each other by their hostnames on it.
  hadoopha_network:
    driver: bridge

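6. 启动验证

先在 docker-compose.yml 所在目录执行 docker-compose up -d 启动集群,待 docker-compose ps 显示各容器状态为 healthy 后,再进行下面的验证。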

HDFS:http://ip:30070 、http://ip:30071

# Open a shell in any one of the containers (hadoop-hdfs-jn-0 is just an example).
docker exec -it hadoop-hdfs-jn-0 bash

# The following commands are typed inside that container shell:
# list the HDFS root, create an empty file and a directory, then list again
# to confirm both entries appear.
hdfs dfs -ls /
hdfs dfs -touchz /test
hdfs dfs -mkdir /test123
hdfs dfs -ls /

YARN:http://ip:30888 、http://ip:30889

你可能感兴趣的:(数据库技术,大数据,Java技术,hadoop,docker,大数据)