需要提前部署好 Zookeeper/Hadoop/Hive 环境
下载链接
链接:https://pan.baidu.com/s/1rLq39ddxh7np7JKiuRAhDA?pwd=e20h
提取码:e20h
将spark-3.1.2-bin-hadoop3.2.tgz压缩包上传到node1下的/export/server目录
# Unpack the Spark distribution into /export/server (-x extract, -z gunzip, -v verbose, -f archive).
tar -xzvf /export/server/spark-3.1.2-bin-hadoop3.2.tgz -C /export/server/
如果有权限问题,可以修改为root,方便学习时操作,实际中使用运维分配的用户和权限即可
# Hand the unpacked distribution to the root user and root group, then shorten its path.
spark_dist=/export/server/spark-3.1.2-bin-hadoop3.2
chown -R root "$spark_dist"
chgrp -R root "$spark_dist"
mv "$spark_dist" /export/server/spark

# Register SPARK_HOME and put bin/sbin on PATH for every login shell.
{
  echo 'export SPARK_HOME=/export/server/spark'
  echo 'export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin'
} >> /etc/profile
source /etc/profile

# Smoke test: launch the interactive Spark shell.
spark-shell
# 进入配置目录
# Work from Spark's configuration directory.
cd /export/server/spark/conf
# Activate the workers file from its shipped template.
mv workers.template workers
# List the three worker hosts, one per line; '>' truncates the template content
# first (same effect as the original echo > / >> sequence).
printf '%s\n' node1 node2 node3 > workers
cd /export/server/spark/conf
# Activate spark-env.sh from its shipped template.
mv spark-env.sh.template spark-env.sh
# Append the cluster settings in one shot; the quoted EOF delimiter keeps every
# line literal, so the file content is identical to the original echo sequence.
# - JAVA_HOME / PYSPARK_PYTHON: interpreter locations — adjust to your installs.
# - HADOOP_CONF_DIR: lets Spark read files stored on HDFS.
# - SPARK_MASTER_* / SPARK_WORKER_*: master host+ports and per-worker resources.
# - SPARK_HISTORY_OPTS: history server reads event logs from HDFS, cleaner enabled.
cat >> spark-env.sh <<'EOF'
JAVA_HOME=/export/server/jdk1.8.0_65
PYSPARK_PYTHON=/export/server/python3/bin/python3
HADOOP_CONF_DIR=/export/server/hadoop-3.3.0/etc/hadoop
SPARK_MASTER_HOST=node1
SPARK_MASTER_PORT=7077
SPARK_MASTER_WEBUI_PORT=8080
SPARK_WORKER_CORES=1
SPARK_WORKER_MEMORY=1g
SPARK_WORKER_PORT=7078
SPARK_WORKER_WEBUI_PORT=8081
SPARK_HISTORY_OPTS="-Dspark.history.fs.logDirectory=hdfs://node1:8020/sparklog/ -Dspark.history.fs.cleaner.enabled=true"
EOF
启动HDFS服务,创建应用运行事件日志目录
# Create the event-log directory on HDFS and open it to the hadoop user / root group.
sparklog=/sparklog
hdfs dfs -mkdir -p "$sparklog"/
hdfs dfs -chown hadoop:root "$sparklog"
hdfs dfs -chmod 775 "$sparklog"
## 进入配置目录
# Enter the configuration directory.
cd /export/server/spark/conf
# Activate spark-defaults.conf from its shipped template.
mv spark-defaults.conf.template spark-defaults.conf
# Enable event logging to HDFS (compressed) so the history server can replay apps.
cat >> spark-defaults.conf <<'EOF'
spark.eventLog.enabled true
spark.eventLog.dir hdfs://node1:8020/sparklog/
spark.eventLog.compress true
EOF
## 进入目录
# Enter the configuration directory.
cd /export/server/spark/conf
# Activate the log4j properties file from its template.
mv log4j.properties.template log4j.properties
# Quieten the console: within the first 25 lines, turn INFO down to WARN.
sed -i '1,25s/INFO/WARN/' /export/server/spark/conf/log4j.properties
避免和Hadoop的启动文件名字冲突
# Rename Spark's start/stop-all scripts so they cannot shadow Hadoop's
# identically-named scripts on PATH.
for action in stop start; do
  mv "/export/server/spark/sbin/${action}-all.sh" \
     "/export/server/spark/sbin/${action}-all-spark.sh"
done
# Ship Spark, the Python runtime and the environment profile to the other nodes.
for host in node2 node3; do
  scp -r /export/server/spark "${host}:/export/server/"
  scp -r /export/server/python3 "${host}:/export/server/"
  scp /etc/profile "${host}:/etc/"
done
# Start the standalone Spark cluster (master + workers; script renamed earlier to avoid clashing with Hadoop's start-all.sh)
start-all-spark.sh
# Start the Spark history server (reads event logs from hdfs://node1:8020/sparklog/)
start-history-server.sh
# Open an interactive shell connected to the standalone master
spark-shell --master spark://node1:7077
浏览器访问 Master Web UI:http://node1:8080
# Stop the standalone Spark cluster (renamed counterpart of Spark's stop-all.sh)
stop-all-spark.sh
将/export/server/spark/conf/spark-env.sh文件中的SPARK_MASTER_HOST注释
# SPARK_MASTER_HOST=node1
# Enable ZooKeeper-based master HA: recovery state is kept under /spark-ha in ZK.
cat >> /export/server/spark/conf/spark-env.sh <<'EOF'
SPARK_DAEMON_JAVA_OPTS="-Dspark.deploy.recoveryMode=ZOOKEEPER -Dspark.deploy.zookeeper.url=node1:2181,node2:2181,node3:2181 -Dspark.deploy.zookeeper.dir=/spark-ha"
EOF
# Push the updated spark-env.sh to the other nodes.
cd /export/server/spark/conf
for host in node2 node3; do
  scp -r spark-env.sh "${host}:$PWD"
done
# Start ZooKeeper (run on each ZK node).
zkServer.sh start
# NOTE(review): this creates /spark-ha on HDFS, while zookeeper.dir above is a
# ZooKeeper path (Spark creates it in ZK itself) — confirm this step is needed.
hadoop fs -mkdir /spark-ha
# Start the cluster, then start an additional standby master
# (presumably run start-master.sh on node2 — verify).
start-all-spark.sh
start-master.sh
浏览器分别访问 http://node1:8080 和 http://node2:8080,查看两个 Master 的状态
链接:https://pan.baidu.com/s/1LkpjREnLXLzebki4VTz1Ag?pwd=6bs5
提取码:6bs5
将python3.tar.gz压缩包上传到node1下的/export/server目录
# Unpack the Python runtime and expose it on PATH for every login shell.
tar -xzvf /export/server/python3.tar.gz -C /export/server
{
  echo 'export PYTHON_HOME=/export/server/python3'
  echo 'export PATH=$PATH:$PYTHON_HOME/bin'
} >> /etc/profile
source /etc/profile

# Copy the runtime to the other two nodes.
for host in node2 node3; do
  scp -r /export/server/python3 "${host}:/export/server/"
done

# Smoke test: launch the PySpark shell.
pyspark
当前spark依赖的版本为3.1.2
# Install the pyspark package matching the deployed Spark version (3.1.2),
# using the Tsinghua PyPI mirror for faster downloads in mainland China
pip3 install pyspark==3.1.2 -i https://pypi.tuna.tsinghua.edu.cn/simple/
链接:https://pan.baidu.com/s/1bZD0KbpXlUYb4UZBtCodAw?pwd=zcsx
提取码:zcsx
将 spark_packages 目录上传到 /root 目录下
# Offline install: resolve every requirement from the wheel files that were
# uploaded to /root/spark_packages, never touching a package index.
cd /root/spark_packages
# Fix: we are already inside spark_packages, so the wheel directory is '.'.
# The original '--find-links=spark_packages' pointed at a nonexistent
# subdirectory /root/spark_packages/spark_packages, making the install fail.
pip3 install --no-index --find-links=. -r requirements.txt
# Collect every environment variable in /etc/bashrc so that non-login shells
# (e.g. commands launched over ssh) also see them. The quoted EOF delimiter
# keeps $PATH/$JAVA_HOME literal, matching the original single-quoted echos.
# NOTE(review): the JDK path here (jdk1.8.0_241) differs from spark-env.sh's
# jdk1.8.0_65 — confirm which version is actually installed.
cat >> /etc/bashrc <<'EOF'
export ZOOKEEPER_HOME=/export/server/zookeeper
export PATH=$PATH:$ZOOKEEPER_HOME/bin
export JAVA_HOME=/export/server/jdk1.8.0_241
export PATH=$PATH:$JAVA_HOME/bin
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export HADOOP_HOME=/export/server/hadoop-3.3.0
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export HIVE_HOME=/export/server/hive3.1.2
export PATH=$PATH:$HIVE_HOME/bin:$HIVE_HOME/sbin
export PYTHON_HOME=/export/server/python3
export PATH=$PATH:$PYTHON_HOME/bin
export PYSPARK_PYTHON=/export/server/python3/bin/python3
export PYSPARK_DRIVER_PYTHON=/export/server/python3/bin/python3
export SPARK_HOME=/export/server/spark
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
EOF
source /etc/bashrc
# Tell Spark where Hadoop's YARN configuration lives, then sync to all nodes.
cd /export/server/spark/conf
echo 'YARN_CONF_DIR=/export/server/hadoop-3.3.0/etc/hadoop' >> spark-env.sh
for host in node2 node3; do
  scp -r spark-env.sh "${host}:/export/server/spark/conf"
done
需要修改Hadoop的yarn-site.xml文件
# Open Hadoop's configuration directory; edit yarn-site.xml there with the content below
cd /export/server/hadoop-3.3.0/etc/hadoop
<configuration>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>node1</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>20480</value>
    </property>
    <property>
        <name>yarn.scheduler.minimum-allocation-mb</name>
        <value>2048</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-pmem-ratio</name>
        <value>2.1</value>
    </property>
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>604800</value>
    </property>
    <property>
        <name>yarn.log.server.url</name>
        <value>http://node1:19888/jobhistory/logs</value>
    </property>
    <property>
        <name>yarn.nodemanager.pmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>
</configuration>
# Distribute the updated yarn-site.xml to the other nodes.
cd /export/server/hadoop-3.3.0/etc/hadoop
for host in node2 node3; do
  scp -r yarn-site.xml "${host}:/export/server/hadoop-3.3.0/etc/hadoop"
done
# Point Spark at the history server address used under YARN, then sync the file.
cd /export/server/spark/conf
echo 'spark.yarn.historyServer.address node1:18080' >> spark-defaults.conf
for host in node2 node3; do
  scp -r spark-defaults.conf "${host}:/export/server/spark/conf"
done
# Start HDFS and YARN (Hadoop's own start-all.sh)
start-all.sh
# Start the MapReduce job-history server (serves logs referenced by yarn.log.server.url)
mapred --daemon start historyserver
# Start the Spark history server
start-history-server.sh
# Submit an application to YARN in client mode;
# replace '文件' with the path to your application file (.py or .jar)
spark-submit --master yarn --deploy-mode client 文件