以下基于Hadoop2.7+Spark 2.4,Mac机器。三个虚拟机(一主两从)做集群。
0.基础工作
0.1 hosts
修改
vim /etc/hosts
192.168.165.130 hadoop03
192.168.165.129 hadoop02
192.168.165.128 hadoop01
:wq
传一下
scp /etc/hosts root@hadoop01:/etc/
scp /etc/hosts root@hadoop02:/etc/
scp /etc/hosts root@hadoop03:/etc/
0.2 ssh免密登录
- 创建root用户的密钥
cd ~/.ssh
ssh-keygen -t rsa
一路回车
- 互相认证
# 追加
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
# 复制一下
cp ~/.ssh/id_rsa.pub ~/.ssh/id_rsa.pub.01
# 传给大家
scp ~/.ssh/id_rsa.pub.01 root@hadoop02:~/.ssh/
# 大家都这么相互传之后,把大家的公钥加到认证文件(以01为例)
cat id_rsa.pub.0* >> ~/.ssh/authorized_keys
在需要免密登录的机器之间互相拷贝公钥,然后追加到认证文件中。
0.3 装Java
下载->解压->上传
0.4 装scala
下载->解压->上传
0.5 装maven
下载->解压->上传
0.6 装zookeeper
- 下载->解压->上传
- 配置文件:
cp ./conf/zoo_sample.cfg ./conf/zoo.cfg
vim ./conf/zoo.cfg
#dataDir=/tmp/zookeeper
dataDir=/usr/local/zookeeper/data
dataLogDir=/usr/local/zookeeper/logs
server.1=hadoop01:2888:3888
server.2=hadoop02:2888:3888
server.3=hadoop03:2888:3888
myid
在dataDir中创建myid文件,里面写上server.X中的X
- 启动
./bin/zkServer.sh start
- 验证
[root@localhost logs]# zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /usr/local/zookeeper/bin/../conf/zoo.cfg
Mode: follower
1. 安装Hadoop
1.1 下载Hadoop安装包或源码包(自行编译)
下载地址:http://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-2.7.7/hadoop-2.7.7.tar.gz
1.2 解压缩
tar -zxvf ./hadoop-2.7.7.tar.gz
1.3 修改配置文件(高可用)
以下文件都在/Users/pengjunzhe/Downloads/hadoop-2.7.7/etc/hadoop
中.
如果不需要配置高可用可以大大简化配置需要的工作,百度很多,不多赘述。
1.3.1 hadoop-env.sh
# The java implementation to use.
# export JAVA_HOME=${JAVA_HOME}
export JAVA_HOME=/usr/local/jdk
1.3.2 core-site.xml
fs.defaultFS
hdfs://hadoop01:9000
hadoop.tmp.dir
/usr/local/hadoop/tmp
ha.zookeeper.quorum
hadoop01:2181,hadoop02:2181,hadoop03:2181
1.3.3 hdfs-site.xml
dfs.nameservices
ns1
dfs.ha.namenodes.ns1
nn1,nn2
dfs.namenode.http-address.ns1.nn1
hadoop01:50070
dfs.namenode.rpc-address.ns1.nn1
hadoop01:9000
dfs.namenode.http-address.ns1.nn2
hadoop02:50070
dfs.namenode.rpc-address.ns1.nn2
hadoop02:9000
dfs.namenode.shared.edits.dir
qjournal://hadoop01:8485;hadoop02:8485;hadoop03:8485/ns1
dfs.journalnode.edits.dir
/usr/local/hadoop/journal
dfs.ha.automatic-failover.enabled
true
dfs.client.failover.proxy.provider.ns1
org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
dfs.ha.fencing.methods
sshfence
dfs.ha.fencing.ssh.private-key-files
/root/.ssh/id_rsa
1.3.4 mapred-site.xml
mapreduce.jobhistory.address
hadoop01:10020
mapreduce.jobhistory.webapp.address
hadoop01:19888
mapreduce.jobhistory.joblist.cache.size
20000
mapreduce.framework.name
yarn
mapreduce.map.memory.mb
128
mapreduce.reduce.memory.mb
512
mapreduce.map.java.opts
-Xmx128m -Xms64m
mapreduce.reduce.java.opts
-Xmx128m -Xms64m
mapreduce.client.submit.file.replication
20
1.3.5 yarn-site.xml
yarn.resourcemanager.ha.enabled
true
yarn.resourcemanager.cluster-id
yarn-ha
yarn.resourcemanager.ha.automatic-failover.enabled
true
yarn.resourcemanager.recovery.enabled
true
yarn.resourcemanager.ha.rm-ids
rm1,rm2
yarn.scheduler.fair.user-as-default-queue
true
yarn.resourcemanager.store.class
org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore
yarn.resourcemanager.hostname.rm1
hadoop01
yarn.resourcemanager.webapp.address.rm1
${yarn.resourcemanager.hostname.rm1}:8088
yarn.resourcemanager.hostname.rm2
hadoop02
yarn.resourcemanager.webapp.address.rm2
${yarn.resourcemanager.hostname.rm2}:8088
yarn.nodemanager.resource.memory-mb
1024
yarn.nodemanager.resource.cpu-vcores
3
yarn.resourcemanager.zk-address
hadoop01:2181,hadoop02:2181,hadoop03:2181
yarn.nodemanager.log-dirs
file:///usr/local/hadoop/data1/yarn/log,file:///usr/local/hadoop/data2/yarn/log
yarn.app.mapreduce.am.resource.mb
512
yarn.scheduler.minimum-allocation-mb
128
yarn.scheduler.maximum-allocation-mb
2048
yarn.scheduler.minimum-allocation-vcores
1
yarn.scheduler.maximum-allocation-vcores
2
yarn.log-aggregation-enable
true
yarn.log-aggregation.retain-seconds
3600
yarn.nodemanager.remote-app-log-dir
/usr/local/hadoop/data/yarn-logs
yarn.resourcemanager.scheduler.class
org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler
yarn.scheduler.fair.allocation.file
/usr/local/hadoop/etc/hadoop/fair-scheduler.xml
1.3.5.0 fair-scheduler.xml
50000 mb, 10 vcores
10
1.0
fair
80000 mb, 20 vcores
20
1.0
fair
99
1.3.6 slaves
hadoop01
hadoop02
hadoop03
1.4 分发
scp -r ./hadoop-2.7.7 root@hadoop01:/usr/local/hadoop
scp -r ./hadoop-2.7.7 root@hadoop02:/usr/local/hadoop
scp -r ./hadoop-2.7.7 root@hadoop03:/usr/local/hadoop
注意:若目标机器上 /usr/local/hadoop 目录已存在,scp 会在其下生成 hadoop-2.7.7 子目录,需把内容移动到 /usr/local/hadoop 根目录,与后面配置的 HADOOP_HOME 保持一致。
1.5 配置环境变量
vim ~/.bashrc
export JAVA_HOME=/usr/local/jdk
export MAVEN_HOME=/usr/local/maven
export SCALA_HOME=/usr/local/scala
export ZK_HOME=/usr/local/zookeeper
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$MAVEN_HOME/bin:$SCALA_HOME/bin:$ZK_HOME/bin:$HADOOP_HOME/bin
source ~/.bashrc
1.6 启动journalnode(每台虚拟机上分别执行)
/usr/local/hadoop/sbin/hadoop-daemon.sh start journalnode
(若只想在一台机器上统一启动所有节点的journalnode,可改用 hadoop-daemons.sh,它会按 slaves 文件在各节点上执行。)
验证
jps
9324 JournalNode
9373 Jps
1.7 初始化HDFS
hdfs namenode -format
格式化之后会根据core-site.xml中的hadoop.tmp.dir配置生成一个文件夹,将这个文件夹scp到别的NameNode节点(这里是Hadoop02)。
scp -r /usr/local/hadoop/tmp/ root@hadoop02:/usr/local/hadoop/
1.8 格式化ZK(Hadoop01执行)
hdfs zkfc -formatZK
1.9 启动HDFS和YARN(Hadoop01执行)
./sbin/start-all.sh
1.10 验证安装和启动是否成功
1.10.1 检查进程
jps
11776 QuorumPeerMain
15840 DataNode
16026 JournalNode
16524 Jps
15742 NameNode
16206 DFSZKFailoverController
16414 NodeManager
1.10.2 检查RM
yarn rmadmin -getServiceState rm1
active
1.10.3 检查web界面
-
Standby的NameNode
http://hadoop01:50070/dfshealth.html#tab-overview
-
Active的NameNode节点
http://hadoop02:50070/dfshealth.html#tab-overview
-
active 的RM
1.10.4 运行例子程序
2. 安装Spark
2.1 下载Spark安装包,源码包(自行编译)
下载地址:http://spark.apache.org/downloads.html
2.2 解压缩
tar -zxvf spark-2.4.0-bin-hadoop2.7.tgz
2.3 修改配置文件
2.3.1 spark-env.sh
export JAVA_HOME=/usr/local/jdk
export SPARK_MASTER_HOST=hadoop01
export SPARK_MASTER_PORT=7077
2.3.2 slaves
hadoop01
hadoop02
hadoop03
2.5 分发
文件夹
scp -r ./spark-2.4.0-bin-hadoop2.7 root@hadoop01:/usr/local/spark
scp -r ./spark-2.4.0-bin-hadoop2.7 root@hadoop02:/usr/local/spark
scp -r ./spark-2.4.0-bin-hadoop2.7 root@hadoop03:/usr/local/spark
2.6 配置环境变量
vim ~/.bashrc
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin
:wq
source ~/.bashrc
2.7 启动
./sbin/start-all.sh
2.8 验证
- hadoop01主机中有Master和Worker,其他主机中有Worker。
jps
22741 Worker
22668 Master
-
浏览器管理平台
地址http://hadoop01:8080
运行例子程序
./bin/run-example SparkPi 10
# 可以找到
Pi is roughly 3.1385551385551387
3 运行Spark的三种方式
3.1 run-example
./bin/run-example SparkPi 10
3.2 spark-submit
spark-submit \
--class org.apache.spark.examples.SparkPi \
--master spark://hadoop01:7077 \
--driver-memory 512m \
--executor-memory 512m \
--total-executor-cores 2 \
/usr/local/spark/examples/jars/spark-examples_2.11-2.4.0.jar \
100
3.3 spark-shell
spark-shell \
--master spark://hadoop01:7077