Quickly Setting Up a Big Data Test Cluster with Docker

As a big data learner, you sometimes want a simple big data cluster running in a virtual machine on your own laptop for testing. Setting up a CDH cluster across several virtual machines can demand far more hardware than a laptop has. There is a much simpler and faster way to do it with Docker, and the steps below have been tested and work.

Preparation: install a virtual machine, then install Docker and docker-compose inside it.
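You can confirm that both tools are available before continuing:

docker --version
docker-compose --version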

1. First, create a directory at any path on the Linux virtual machine, for example spark, and then create the following two files inside it.
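A minimal sketch of this step, assuming the directory is created under /opt (the path and name are arbitrary):

mkdir -p /opt/spark
cd /opt/spark
touch docker-compose.yml hadoop.env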

docker-compose.yml

version: "2.2"
services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.1.3-java8
    container_name: namenode
    volumes:
      - ./hadoop/namenode:/hadoop/dfs/name
      - ./input_files:/input_files
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop.env
    ports:
      - 50070:9870   # NameNode web UI (port 9870 inside the Hadoop 3 container, 50070 on the host)
      - 8020:8020
  
  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.1.3-java8
    container_name: resourcemanager
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    env_file:
      - ./hadoop.env
  
  historyserver:
    image: bde2020/hadoop-historyserver:2.0.0-hadoop3.1.3-java8
    container_name: historyserver
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    volumes:
      - ./hadoop/historyserver:/hadoop/yarn/timeline
    env_file:
      - ./hadoop.env
  
  nodemanager:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.1.3-java8
    container_name: nodemanager
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    env_file:
      - ./hadoop.env
  
  datanode1:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.3-java8
    container_name: datanode1
    depends_on:
      - namenode
    volumes:
      - ./hadoop/datanode1:/hadoop/dfs/data
    env_file:
      - ./hadoop.env
    ports:
      - "50075:9864"
  datanode2:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.3-java8
    container_name: datanode2
    depends_on:
      - namenode
    volumes:
      - ./hadoop/datanode2:/hadoop/dfs/data
    env_file:
      - ./hadoop.env
    ports:
      - "50076:9864"
  
  datanode3:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.3-java8
    container_name: datanode3
    depends_on:
      - namenode
    volumes:
      - ./hadoop/datanode3:/hadoop/dfs/data
    env_file:
      - ./hadoop.env
    ports:
      - "50077:9864"

  master:
    image: gettyimages/spark:2.4.1-hadoop-3.0
    container_name: master
    command: bin/spark-class org.apache.spark.deploy.master.Master -h master
    hostname: master
    environment:
      MASTER: spark://master:7077
      SPARK_CONF_DIR: /conf
      SPARK_PUBLIC_DNS: 192.168.174.88   # replace with your own VM's IP address (same for the workers below)
    links:
      - namenode
    expose:
      - 4040
      - 7001
      - 7002
      - 7003
      - 7004
      - 7005
      - 7077
      - 6066
    ports:
      - "49100:22"
      - 4040:4040
      - 6066:6066
      - 7077:7077
      - 8080:8080
    volumes:
      - ./conf/master:/conf
      - ./data:/tmp/data
      - ./jars:/root/jars

  worker1:
    image: gettyimages/spark:2.4.1-hadoop-3.0
    container_name: worker1
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
    hostname: worker1
    environment:
      SPARK_CONF_DIR: /conf
      SPARK_WORKER_CORES: 2
      SPARK_WORKER_MEMORY: 2g
      SPARK_WORKER_PORT: 8881
      SPARK_WORKER_WEBUI_PORT: 8081
      SPARK_PUBLIC_DNS: 192.168.174.88
    links:
      - master
    expose:
      - 7012
      - 7013
      - 7014
      - 7015
      - 8881
      - 8081
    ports:
      - 8081:8081
    volumes:
      - ./conf/worker1:/conf
      - ./data/worker1:/tmp/data
  worker2:
    image: gettyimages/spark:2.4.1-hadoop-3.0
    container_name: worker2
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
    hostname: worker2
    environment:
      SPARK_CONF_DIR: /conf
      SPARK_WORKER_CORES: 2
      SPARK_WORKER_MEMORY: 2g
      SPARK_WORKER_PORT: 8881
      SPARK_WORKER_WEBUI_PORT: 8082
      SPARK_PUBLIC_DNS: 192.168.174.88
    links:
      - master
    expose:
      - 7012
      - 7013
      - 7014
      - 7015
      - 8881
      - 8082
    ports:
      - 8082:8082
    volumes:
      - ./conf/worker2:/conf
      - ./data/worker2:/tmp/data  
  hive-server:
    image: bde2020/hive:2.3.2-postgresql-metastore
    container_name: hive-server
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
      - hive-metastore
    env_file:
      - ./hadoop.env
    environment:
      HIVE_CORE_CONF_javax_jdo_option_ConnectionURL: "jdbc:postgresql://hive-metastore/metastore"
      SERVICE_PRECONDITION: "hive-metastore:9083"
    ports:
      - "10000:10000"
    volumes:
      - ./conf/hive-server:/conf
      - ./data/hive-server:/tmp/data
  hive-metastore:
    image: bde2020/hive:2.3.2-postgresql-metastore
    container_name: hive-metastore
    depends_on:
      - namenode
      - datanode1
      - datanode2
      - datanode3
    env_file:
      - ./hadoop.env
    command: /opt/hive/bin/hive --service metastore
    ports:
      - "9083:9083"
  hive-metastore-postgresql:
    image: bde2020/hive-metastore-postgresql:2.3.0
    container_name: hive-metastore-postgresql
    ports:
      - "5432:5432"
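Once both files are in place, the compose file can be syntax-checked without starting anything (docker-compose prints nothing when the file is valid):

docker-compose config -q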

hadoop.env 

HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false

CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*

HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false

YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
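For reference, the bde2020 images generate the Hadoop and Hive XML configuration files from these variables at container startup: the prefix (CORE_CONF_, HDFS_CONF_, YARN_CONF_, HIVE_SITE_CONF_) selects the target file, a single underscore in the property name stands for a dot, and a triple underscore stands for a dash. For example:

# CORE_CONF_fs_defaultFS=hdfs://namenode:8020       ->  fs.defaultFS in core-site.xml
# YARN_CONF_yarn_log___aggregation___enable=true    ->  yarn.log-aggregation-enable in yarn-site.xml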

2. Run the following command in that directory to start all the services.

docker-compose up -d

A normal startup looks like this:

[Figure 1: console output of a successful startup]
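You can also check the container status from the terminal:

docker-compose ps
docker ps --format "table {{.Names}}\t{{.Status}}"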

If one of the services fails to start, you can restart the failing container directly with "docker start <container name>".

Some other commands that may come in handy:

# Stop and remove all containers
docker stop $(docker ps -a -q)
docker rm $(docker ps -a -q)

# View a container's logs to troubleshoot errors
docker logs <container name>

# Enter the hive-server container and open the Hive command line for a quick test
docker-compose exec hive-server bash
/opt/hive/bin/beeline -u jdbc:hive2://localhost:10000
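Alternatively, a one-line smoke test can be run from the host shell once the services are up (SHOW DATABASES is just an arbitrary query to confirm that HiveServer2 responds):

docker-compose exec hive-server /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -e "SHOW DATABASES;"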

A supplementary note on Hive failing to display Chinese characters correctly; note also that the hive-server container does not even ship with vim:

1. First, update the package sources:

1) Running "cat /etc/issue" shows that the container is based on Debian.

2) cp /etc/apt/sources.list /etc/apt/sources.list_bak  # back up the original source list

3) Set the contents of /etc/apt/sources.list as follows:

deb http://mirrors.163.com/debian/ stretch main non-free contrib
deb http://mirrors.163.com/debian/ stretch-updates main non-free contrib
deb http://mirrors.163.com/debian/ stretch-backports main non-free contrib
deb-src http://mirrors.163.com/debian/ stretch main non-free contrib
deb-src http://mirrors.163.com/debian/ stretch-updates main non-free contrib
deb-src http://mirrors.163.com/debian/ stretch-backports main non-free contrib
deb http://mirrors.163.com/debian-security/ stretch/updates main non-free contrib
deb-src http://mirrors.163.com/debian-security/ stretch/updates main non-free contrib

4) apt update  # refresh the package index

5)apt-get install vim

2. Fix garbled Chinese characters in Hive

1) List the locales installed on the system with "locale -a".

2) Update the system's default locale.

Open "~/.bashrc" and add the following line. (I originally wanted to install and use "zh_CN.UTF-8", but the installation failed; the built-in "C.UTF-8" works just as well.)

export LANG=C.UTF-8
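For example, to append the line and apply it to the current shell session:

echo 'export LANG=C.UTF-8' >> ~/.bashrc
source ~/.bashrc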

3) Check whether the metastore database encoding is correct (this step is not required; it is recorded here for later reference when studying Hive metadata).

# Enter Hive's PostgreSQL metastore container
docker-compose exec hive-metastore-postgresql bash
# Switch to the postgres user
su postgres
# Open the psql command line
psql
# List databases and their encodings
\l
# If the encoding is wrong, it can be updated with the following command
update pg_database set encoding = pg_char_to_encoding('UTF8') where datname = 'metastore';
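To double-check the result, the encoding can also be queried directly (still as the postgres user inside the metastore container):

psql -c "SELECT datname, pg_encoding_to_char(encoding) FROM pg_database WHERE datname = 'metastore';"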

References:

docker-compose tutorial: https://www.jianshu.com/p/4fbe3de8f416

SparkML (1) environment setup: https://www.jianshu.com/p/35c0e291f4ea

https://github.com/big-data-europe/docker-hive
