作为一个大数据的学习者,有时候我们希望基于自己笔记本中的虚拟机简单配置一个大数据集群用于测试。如果基于 CDH 在多个虚拟机中配置集群,对笔记本的硬件要求会很高;其实有更简单快速的基于 docker 的搭建方式,以下方式亲测可用。
前期需要准备的工作:自行安装虚拟机,并在虚拟机中安装 docker 和 docker-compose。
docker-compose.yml 文件内容如下:
version: "2.2"
services:
  # HDFS NameNode: holds filesystem metadata. Hadoop 3.x serves the
  # web UI on container port 9870 (mapped to the old 50070 on the host).
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.1.3-java8
    container_name: namenode
    volumes:
      - ./hadoop/namenode:/hadoop/dfs/name
      - ./input_files:/input_files
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop.env
    ports:
      # Fix: quote host:container mappings — unquoted digit:digit values can
      # be misread by YAML 1.1 parsers (sexagesimal ints) and Compose warns.
      - "50070:9870"
      - "8020:8020"
resourcemanager:
image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.1.3-java8
container_name: resourcemanager
depends_on:
- namenode
- datanode1
- datanode2
- datanode3
env_file:
- ./hadoop.env
historyserver:
image: bde2020/hadoop-historyserver:2.0.0-hadoop3.1.3-java8
container_name: historyserver
depends_on:
- namenode
- datanode1
- datanode2
- datanode3
volumes:
- ./hadoop/historyserver:/hadoop/yarn/timeline
env_file:
- ./hadoop.env
nodemanager:
image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.1.3-java8
container_name: nodemanager
depends_on:
- namenode
- datanode1
- datanode2
- datanode3
env_file:
- ./hadoop.env
datanode1:
image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.3-java8
container_name: datanode1
depends_on:
- namenode
volumes:
- ./hadoop/datanode1:/hadoop/dfs/data
env_file:
- ./hadoop.env
ports:
- "50075:9864"
datanode2:
image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.3-java8
container_name: datanode2
depends_on:
- namenode
volumes:
- ./hadoop/datanode2:/hadoop/dfs/data
env_file:
- ./hadoop.env
ports:
- "50076:9864"
datanode3:
image: bde2020/hadoop-datanode:2.0.0-hadoop3.1.3-java8
container_name: datanode3
depends_on:
- namenode
volumes:
- ./hadoop/datanode3:/hadoop/dfs/data
env_file:
- ./hadoop.env
ports:
- "50077:9864"
master:
image: gettyimages/spark:2.4.1-hadoop-3.0
container_name: master
command: bin/spark-class org.apache.spark.deploy.master.Master -h master
hostname: master
environment:
MASTER: spark://master:7077
SPARK_CONF_DIR: /conf
SPARK_PUBLIC_DNS: 192.168.174.88
links:
- namenode
expose:
- 4040
- 7001
- 7002
- 7003
- 7004
- 7005
- 7077
- 6066
ports:
- "49100:22"
- 4040:4040
- 6066:6066
- 7077:7077
- 8080:8080
volumes:
- ./conf/master:/conf
- ./data:/tmp/data
- ./jars:/root/jars
worker1:
image: gettyimages/spark:2.4.1-hadoop-3.0
container_name: worker1
command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
hostname: worker1
environment:
SPARK_CONF_DIR: /conf
SPARK_WORKER_CORES: 2
SPARK_WORKER_MEMORY: 2g
SPARK_WORKER_PORT: 8881
SPARK_WORKER_WEBUI_PORT: 8081
SPARK_PUBLIC_DNS: 192.168.174.88
links:
- master
expose:
- 7012
- 7013
- 7014
- 7015
- 8881
- 8081
ports:
- 8081:8081
volumes:
- ./conf/worker1:/conf
- ./data/worker1:/tmp/data
worker2:
image: gettyimages/spark:2.4.1-hadoop-3.0
container_name: worker2
command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
hostname: worker2
environment:
SPARK_CONF_DIR: /conf
SPARK_WORKER_CORES: 2
SPARK_WORKER_MEMORY: 2g
SPARK_WORKER_PORT: 8881
SPARK_WORKER_WEBUI_PORT: 8082
SPARK_PUBLIC_DNS: 192.168.174.88
links:
- master
expose:
- 7012
- 7013
- 7014
- 7015
- 8881
- 8082
ports:
- 8082:8082
volumes:
- ./conf/worker2:/conf
- ./data/worker2:/tmp/data
hive-server:
image: bde2020/hive:2.3.2-postgresql-metastore
container_name: hive-server
depends_on:
- namenode
- datanode1
- datanode2
- datanode3
- hive-metastore
env_file:
- ./hadoop.env
environment:
HIVE_CORE_CONF_javax_jdo_option_ConnectionURL: "jdbc:postgresql://hive-metastore/metastore"
SERVICE_PRECONDITION: "hive-metastore:9083"
ports:
- "10000:10000"
volumes:
- ./conf/hive-server:/conf
- ./data/hive-server:/tmp/data
hive-metastore:
image: bde2020/hive:2.3.2-postgresql-metastore
container_name: hive-metastore
depends_on:
- namenode
- datanode1
- datanode2
- datanode3
env_file:
- ./hadoop.env
command: /opt/hive/bin/hive --service metastore
ports:
- "9083:9083"
hive-metastore-postgresql:
image: bde2020/hive-metastore-postgresql:2.3.0
container_name: hive-metastore-postgresql
ports:
- "5432:5432
hadoop.env 文件内容如下:
# hadoop.env — shared environment for all Hadoop/Hive containers (via env_file).
# NOTE(review): the bde2020 images appear to translate each VAR into an XML
# config entry, e.g. HIVE_SITE_CONF_a_b_c=v -> hive-site.xml <a.b.c>v</a.b.c>,
# with a triple underscore "___" standing in for a literal "-" in the property
# name — confirm against the image docs before renaming anything.

# --- hive-site.xml: metastore JDBC connection + Thrift URI ---
HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hive-metastore:9083

# --- hdfs-site.xml / core-site.xml ---
# Allow datanodes to register by container name rather than resolvable IP.
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
HDFS_CONF_dfs_webhdfs_enabled=true
# Permissions disabled: convenient for a local test cluster, unsafe elsewhere.
HDFS_CONF_dfs_permissions_enabled=false

# --- yarn-site.xml: RM recovery, log aggregation, timeline service ---
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
# Hostnames/addresses all reference the compose service names above.
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
docker-compose up -d
正常启动如下
如果某个服务无法启动,可以直接使用“docker start 容器名称”重启报错的容器。
其他一些可能用到的命令如下:
#停止删除所有容器
docker stop $(docker ps -a -q)
docker rm $(docker ps -a -q)
#查看docker错误日志
docker logs 容器名称
#进入hive-server容器,并启动hive命令行进行测试
docker-compose exec hive-server bash
/opt/hive/bin/beeline -u jdbc:hive2://localhost:10000
补充有关hive中文无法正常显示的问题,而且“hive-server”容器还没有常用的vim:
1. 首先更新安装源:
1)使用“cat /etc/issue”发现容器中使用的是Debian版本
2)cp /etc/apt/sources.list /etc/apt/sources.list_bak #备份源信息
3)设置/etc/apt/sources.list内容如下
deb http://mirrors.163.com/debian/ stretch main non-free contrib
deb http://mirrors.163.com/debian/ stretch-updates main non-free contrib
deb http://mirrors.163.com/debian/ stretch-backports main non-free contrib
deb-src http://mirrors.163.com/debian/ stretch main non-free contrib
deb-src http://mirrors.163.com/debian/ stretch-updates main non-free contrib
deb-src http://mirrors.163.com/debian/ stretch-backports main non-free contrib
deb http://mirrors.163.com/debian-security/ stretch/updates main non-free contrib
deb-src http://mirrors.163.com/debian-security/ stretch/updates main non-free contrib
4)apt update #更新源信息
5)apt-get install vim
2. 解决 hive 中文乱码:
1)查看系统已安装的语言“locale -a”
2)更新系统默认语言
打开 “~/.bashrc”增加如下一行,(本来想安装并配置为“zh_CN.UTF-8”,但是安装失败,而且默认带的“C.UTF-8”就能用)
export LANG=C.UTF-8
3)查看元数据编码是否有问题(不需要此步骤,记录下来方便后续学习hive元数据时使用)
#进入hive的postgresql元数据容器
docker-compose exec hive-metastore-postgresql bash
#切换用户
su postgres
#进入psql命令行
psql
#查看数据库编码
\l
#如果有问题可以用如下命令更新编码
update pg_database set encoding = pg_char_to_encoding('UTF8') where datname = 'metastore';
参考文档:
docker-compose使用教程:https://www.jianshu.com/p/4fbe3de8f416
SparkML(1)环境构建:https://www.jianshu.com/p/35c0e291f4ea
https://github.com/big-data-europe/docker-hive