echo "echo never > /sys/kernel/mm/transparent_hugepage/enabled" >> /etc/rc.local
echo "echo never > /sys/kernel/mm/transparent_hugepage/defrag" >> /etc/rc.local
Over roughly two months of study I quickly picked up how to use many of these tools and how they work, but I had never built a complete cluster end to end. With this phase wrapping up and a hands-on project coming, I'm using the gap to set up a highly available cluster.
My learning order was hadoop -> hive -> hue -> hbase -> zookeeper -> redis -> kafka, but a highly available cluster depends on ZooKeeper, and Kafka and Redis are largely independent of the Hadoop stack, so I set up zookeeper + kafka + redis first and then build out the Hadoop ecosystem on top.
Versions used here: ZooKeeper 3.5.8, Kafka kafka_2.12-2.4.1, Redis redis-5.0.5.
Extract the ZooKeeper archive
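The archive name and location are not shown above; a sketch assuming the 3.5.8 binary tarball was downloaded to /opt/software (adjust the file name to your download):
cd /opt/software
tar -zxvf apache-zookeeper-3.5.8-bin.tar.gz -C /opt/
mv /opt/apache-zookeeper-3.5.8-bin /opt/zookeeper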
Create the directories ZooKeeper needs
## data directory and transaction-log directory for ZooKeeper
mkdir /opt/zookeeper/data
mkdir /opt/zookeeper/data/logs
Copy and edit the ZooKeeper configuration file
# copy the sample config
cp /opt/zookeeper/conf/zoo_sample.cfg /opt/zookeeper/conf/zoo.cfg
Edit the configuration file (zoo.cfg)
# update dataDir
dataDir=/opt/zookeeper/data
# add dataLogDir
dataLogDir=/opt/zookeeper/data/logs
# add the ensemble configuration
## server.<server id>=<server host>:<peer communication port>:<leader election port>
server.1=master:2888:3888
server.2=slave0:2888:3888
server.3=slave2:2888:3888
# uncomment the autopurge setting below
# ZooKeeper can purge old transaction logs and snapshot files automatically; this parameter sets the purge interval in hours
autopurge.purgeInterval=1
Add the myid file
Create a myid file in the ZooKeeper data directory containing 1; this file records each server's ID.
cd /opt/zookeeper/data
echo 1 > myid
Distribute the installation to the other nodes
scp -r zookeeper slave0:/opt/
scp -r zookeeper slave2:/opt/
## or
rsync -av zookeeper slave0:/opt/
rsync -av zookeeper slave2:/opt/
Change the myid value on slave0 and slave2
## on slave0
echo 2 > /opt/zookeeper/data/myid
## on slave2
echo 3 > /opt/zookeeper/data/myid
Update the environment variables (add the following to /etc/profile on all three nodes)
export JAVA_HOME=/opt/jdk8
export KAFKA_HOME=/opt/kafka
export ZOOKEEPER_HOME=/opt/zookeeper
export PATH=$PATH:$JAVA_HOME/bin:$KAFKA_HOME/bin:$ZOOKEEPER_HOME/bin
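Reload the profile on each node so the variables take effect in the current shell (the same step is shown explicitly in the Kafka section below):
source /etc/profile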
Start ZooKeeper on all nodes with the following script (saved as zk.sh):
#!/bin/bash
echo "start zookeeper server..."
if (( $# == 0 )); then
    echo "no params"
    exit 1
fi
hosts="master slave0 slave2"
for host in $hosts
do
    ssh "$host" "source /etc/profile; /opt/zookeeper/bin/zkServer.sh $1"
done
./zk.sh start
Check the startup status with:
./zk.sh status
If startup succeeded, zkServer.sh status reports one node as leader and the other two as followers.
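Besides zk.sh status, a quick way to confirm the ensemble is serving requests is the bundled CLI; connecting to any node and listing the root znode should work:
zkCli.sh -server master:2181
# at the ZooKeeper prompt:
#   ls /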
Extract the Kafka archive and rename the directory
tar -zxvf /opt/kafka_2.12-2.4.1.tgz -C /opt/
mv /opt/kafka_2.12-2.4.1 /opt/kafka
Update the environment variables and make them take effect
# configure the environment variables on all three machines
vim /etc/profile
# add the following:
export KAFKA_HOME=/opt/kafka
export PATH=$PATH:$KAFKA_HOME/bin
source /etc/profile
Edit the Kafka configuration file server.properties
vim /opt/kafka/config/server.properties
broker.id=0
listeners=PLAINTEXT://:9092
## on the other brokers, change broker.id and the advertised host name accordingly
advertised.listeners=PLAINTEXT://master:9092
log.dirs=/var/kafka/kafka-logs
zookeeper.connect=master:2181,slave0:2181,slave2:2181/kafka
Start Kafka with the startup command
kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties
Startup script (starts Kafka on all three brokers):
#!/bin/bash
echo "start kafka server..."
hosts="master slave0 slave2"
for host in $hosts
do
    ssh "$host" "source /etc/profile; kafka-server-start.sh -daemon \$KAFKA_HOME/config/server.properties"
done
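A simple smoke test, run from any broker, assuming a throw-away topic named test (the --bootstrap-server form is supported by the 2.4.1 tools):
# create a replicated topic and confirm it is visible
kafka-topics.sh --create --bootstrap-server master:9092 --replication-factor 3 --partitions 3 --topic test
kafka-topics.sh --list --bootstrap-server master:9092
# produce a few messages, then read them back
kafka-console-producer.sh --broker-list master:9092 --topic test
kafka-console-consumer.sh --bootstrap-server master:9092 --topic test --from-beginning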
Extract Hadoop; the version used here is 2.9.2
tar -zxvf hadoop-2.9.2.tar.gz
Edit hadoop-env.sh
vim hadoop-env.sh
export JAVA_HOME=/opt/java8
Specify the NameNode address and the data storage directory (core-site.xml)
vim core-site.xml
<property>
    <name>fs.defaultFS</name>
    <value>hdfs://master:9000</value>
</property>
<property>
    <name>hadoop.tmp.dir</name>
    <value>/opt/hadoop-2.9.2/data</value>
</property>
Specify the SecondaryNameNode address and the replication factor (hdfs-site.xml)
vim hdfs-site.xml
<property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>slave0:50090</value>
</property>
<property>
    <name>dfs.replication</name>
    <value>3</value>
</property>
Specify the DataNode worker nodes (slaves file)
vim slaves
master
slave0
slave2
Specify the JDK path used by MapReduce (edit mapred-env.sh)
export JAVA_HOME=/opt/jdk8
Configure MapReduce to run on the YARN resource scheduler (edit mapred-site.xml)
cp mapred-site.xml.template mapred-site.xml
vim mapred-site.xml
<property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
</property>
Specify the JDK path for YARN (edit yarn-env.sh)
vim yarn-env.sh
export JAVA_HOME=/opt/java8
Specify the ResourceManager master node (edit yarn-site.xml)
vim yarn-site.xml
<property>
    <name>yarn.resourcemanager.hostname</name>
    <value>master</value>
</property>
<property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
</property>
Configure the job history server addresses (mapred-site.xml)
<property>
    <name>mapreduce.jobhistory.address</name>
    <value>master:10020</value>
</property>
<property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>master:19888</value>
</property>
Enable log aggregation
Configure yarn-site.xml
<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>
<property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
</property>
The fully assembled yarn-site.xml:
<configuration>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>604800</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>yarn.nodemanager.pmem-check-enabled</name>
        <value>false</value>
        <description>Whether to start a thread that checks the physical memory each task is using and kills tasks that exceed their allocation; default is true</description>
    </property>
    <property>
        <name>mapred.child.java.opts</name>
        <value>-Xmx1024m</value>
    </property>
</configuration>
Configure the environment variables
vim /etc/profile
export HADOOP_HOME=/opt/hadoop-2.9.2
export PATH=$PATH:$JAVA_HOME/bin:$KAFKA_HOME/bin:$ZOOKEEPER_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
source /etc/profile
Start HDFS and YARN with the following commands
# only before the first HDFS start: format the NameNode
hadoop namenode -format
## start the daemons
start-dfs.sh
start-yarn.sh
## start the job history server
mr-jobhistory-daemon.sh start historyserver
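To check that everything came up, a quick verification with the stock tools (the example jar path matches the 2.9.2 binary layout):
jps                     # expect NameNode/DataNode, ResourceManager/NodeManager; SecondaryNameNode on slave0
hdfs dfsadmin -report   # all three DataNodes should be listed
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.9.2.jar pi 2 10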
Download Hive and extract it
cd /opt
tar zxvf apache-hive-2.3.7-bin.tar.gz
mv apache-hive-2.3.7-bin hive
Update the environment variables
# add the following to /etc/profile
export HIVE_HOME=/opt/hive
export PATH=$PATH:$HIVE_HOME/bin
# apply the changes
source /etc/profile
Edit the Hive configuration ($HIVE_HOME/conf/hive-site.xml)
vi hive-site.xml
<configuration>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://master:3306/hive?createDatabaseIfNotExist=true&amp;useSSL=false</value>
        <description>JDBC connect string for a JDBC metastore</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
        <description>Driver class name for a JDBC metastore</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>hive</value>
        <description>username to use against metastore database</description>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>12345678</value>
        <description>password to use against metastore database</description>
    </property>
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive/warehouse</value>
        <description>location of default database for the warehouse</description>
    </property>
    <property>
        <name>hive.cli.print.current.db</name>
        <value>true</value>
        <description>Whether to include the current database in the Hive prompt.</description>
    </property>
    <property>
        <name>hive.server2.thrift.client.user</name>
        <value>root</value>
        <description>Username to use against thrift client</description>
    </property>
    <property>
        <name>hive.server2.thrift.client.password</name>
        <value>12345678</value>
        <description>Password to use against thrift client</description>
    </property>
</configuration>
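The configuration above assumes a MySQL instance on master with a hive database and user already in place, and the MySQL JDBC driver jar present in $HIVE_HOME/lib (it is referenced again in the Druid section). A minimal sketch of the MySQL side, with the password taken from hive-site.xml:
mysql -uroot -p -e "CREATE DATABASE hive DEFAULT CHARACTER SET utf8; \
  CREATE USER 'hive'@'%' IDENTIFIED BY '12345678'; \
  GRANT ALL PRIVILEGES ON hive.* TO 'hive'@'%'; \
  FLUSH PRIVILEGES;"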
# initialize the Hive metastore schema
schematool -dbType mysql -initSchema
HiveServer2 configuration
Modify core-site.xml across the cluster and add the following:
<property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
</property>
<property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
</property>
<property>
    <name>hadoop.proxyuser.hadoop.hosts</name>
    <value>*</value>
</property>
<property>
    <name>hadoop.proxyuser.hadoop.groups</name>
    <value>*</value>
</property>
Modify hdfs-site.xml across the cluster and add the following:
<property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
</property>
Remote metastore setup: sync the Hive configuration to all three machines, then start the metastore service (on the metastore nodes, here master and slave2):
nohup hive --service metastore &
On slave0, edit hive-site.xml: remove the MySQL configuration (connection URL, username, password), and add the remote metastore configuration:
<property>
    <name>hive.metastore.uris</name>
    <value>thrift://master:9083,thrift://slave2:9083</value>
</property>
Start HiveServer2 with:
nohup hiveserver2 &
## HiveServer2 can now be reached from IDEA or any other JDBC client
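For a quick check from the shell, beeline can connect with the user and password configured in hive-site.xml (HiveServer2 listens on port 10000 by default):
beeline -u jdbc:hive2://master:10000 -n root -p 12345678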
Extract HBase and rename the directory
tar -zxvf hbase-1.3.1-bin.tar.gz
mv hbase-1.3.1 hbase
# add to /etc/profile
export HBASE_HOME=/opt/hbase
export PATH=$PATH:$HBASE_HOME/bin
Edit the configuration files. Hadoop's core-site.xml and hdfs-site.xml must be visible in HBase's conf directory; here this is done with symlinks:
cd $HBASE_HOME/conf
ln -s $HADOOP_HOME/etc/hadoop/core-site.xml core-site.xml
ln -s $HADOOP_HOME/etc/hadoop/hdfs-site.xml hdfs-site.xml
Edit the files under the conf directory
Edit hbase-env.sh
# add the Java environment variable
export JAVA_HOME=/opt/java8
# use the external ZooKeeper ensemble instead of the managed one
export HBASE_MANAGES_ZK=FALSE
Edit hbase-site.xml
<configuration>
    <property>
        <name>hbase.rootdir</name>
        <value>hdfs://master:9000/hbase</value>
    </property>
    <property>
        <name>hbase.cluster.distributed</name>
        <value>true</value>
    </property>
    <property>
        <name>hbase.zookeeper.quorum</name>
        <value>master,slave0,slave2</value>
    </property>
</configuration>
Edit the regionservers file
master
slave0
slave2
Start HBase
## prerequisite: start the Hadoop and ZooKeeper clusters first
## start HBase:
start-hbase.sh
## stop HBase:
stop-hbase.sh
Once the HBase cluster is up, the web UI is available at http://<HMaster hostname>:16010.
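A short smoke test from the HBase shell (table and column family names here are arbitrary):
hbase shell
# at the hbase shell prompt:
#   create 'smoke_test', 'cf'
#   put 'smoke_test', 'row1', 'cf:a', 'value1'
#   scan 'smoke_test'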
Extract apache-tez-0.9.2-bin.tar.gz and upload the Tez archive to HDFS
tar -zxvf apache-tez-0.9.2-bin.tar.gz -C /opt/software
cd /opt/software
mv apache-tez-0.9.2-bin tez
cd /opt/software/tez/share
hdfs dfs -mkdir -p /user/tez
hdfs dfs -put tez.tar.gz /user/tez
Create tez-site.xml under $HADOOP_CONF_DIR pointing at the HDFS location of the Tez archive:
<configuration>
    <property>
        <name>tez.lib.uris</name>
        <value>hdfs://master:9000/user/tez/tez.tar.gz</value>
    </property>
</configuration>
Distribute the configuration to all nodes, and add the Tez jars to the Hadoop classpath (e.g. in hadoop-env.sh):
export TEZ_CONF_DIR=$HADOOP_CONF_DIR
export TEZ_JARS=/opt/software/tez/*:/opt/software/tez/lib/*
export HADOOP_CLASSPATH=$TEZ_CONF_DIR:$TEZ_JARS:$HADOOP_CLASSPATH
To switch the engine for the current session:
hive> set hive.execution.engine=tez;
To make Tez the default, add to hive-site.xml:
<property>
    <name>hive.execution.engine</name>
    <value>tez</value>
</property>
Download Spark, extract it, and move it to the target location
cd /opt/software/
tar -zxvf spark-2.4.5-bin-without-hadoop.tgz -C ../
mv spark-2.4.5-bin-without-hadoop spark-2.4.5
Set the environment variables
vi /etc/profile
export SPARK_HOME=/opt/spark-2.4.5
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
source /etc/profile
Modify the configuration
File location: $SPARK_HOME/conf
Files to modify: slaves, spark-defaults.conf, spark-env.sh, log4j.properties
slaves
master
slave0
slave2
spark-defaults.conf
spark.master spark://master:7077
spark.eventLog.enabled true
spark.eventLog.dir hdfs://master:9000/spark-eventlog
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.driver.memory 512m
spark.yarn.jars hdfs:///spark-yarn/jars/*.jar
spark.yarn.historyServer.address master:18080
spark.history.ui.port 18080
Create the HDFS directory: hdfs dfs -mkdir /spark-eventlog
Notes:
spark.master: the master node; the default port is 7077
spark.eventLog.enabled: turns on event logging
spark.eventLog.dir: where event logs are stored
spark.serializer: a more efficient serializer
spark.driver.memory: driver memory size (default 1G)
Edit spark-env.sh
export JAVA_HOME=/opt/jdk8
export HADOOP_HOME=/opt/hadoop-2.9.2
export HADOOP_CONF_DIR=/opt/hadoop-2.9.2/etc/hadoop
export SPARK_DIST_CLASSPATH=$(/opt/hadoop-2.9.2/bin/hadoop classpath)
export SPARK_MASTER_HOST=master
export SPARK_MASTER_PORT=7077
Distribute Spark to the cluster and update the environment variables on the other nodes
cd /opt/
scp -r spark-2.4.5/ slave0:$PWD
scp -r spark-2.4.5/ slave2:$PWD
Start the cluster
cd $SPARK_HOME/sbin
./start-all.sh
The above is a standalone-mode deployment.
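A quick way to validate the standalone cluster is to submit the bundled SparkPi example (the examples jar name assumes the Scala 2.11 build of 2.4.5):
spark-submit --master spark://master:7077 \
  --class org.apache.spark.examples.SparkPi \
  $SPARK_HOME/examples/jars/spark-examples_2.11-2.4.5.jar 100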
To enable the Spark history server, add the following:
# spark-defaults.conf
# history server
spark.eventLog.enabled true
spark.eventLog.dir hdfs://master:9000/spark-eventlog
spark.eventLog.compress true
# spark-env.sh
export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.retainedApplications=50 -Dspark.history.fs.logDirectory=hdfs://master:9000/spark-eventlog"
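With these settings in place, make sure the event-log directory exists and start the history server on master; its UI then listens on port 18080:
hdfs dfs -mkdir -p /spark-eventlog
$SPARK_HOME/sbin/start-history-server.sh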
Software | Version
---|---
Hive | 2.3.7
Hadoop | 2.9.2
Hbase | 1.1.2
Zookeeper | 3.4.14
Kafka | 1.0.2
Spark | 2.4.5
Extract the Kylin package
cd /opt/software
tar -zxvf apache-kylin-3.1.1-bin-hbase1x.tar.gz -C ../
mv apache-kylin-3.1.1-bin-hbase1x kylin-3.1.1
Add the environment variables
vi /etc/profile
# add the following
export KYLIN_HOME=/opt/kylin-3.1.1
export PATH=$PATH:$KYLIN_HOME/bin
source /etc/profile
Link in the configuration of the components Kylin depends on
cd $KYLIN_HOME/conf
ln -s $HADOOP_HOME/etc/hadoop/hdfs-site.xml hdfs-site.xml
ln -s $HADOOP_HOME/etc/hadoop/core-site.xml core-site.xml
ln -s $HBASE_HOME/conf/hbase-site.xml hbase-site.xml
ln -s $HIVE_HOME/conf/hive-site.xml hive-site.xml
ln -s $SPARK_HOME/conf/spark-defaults.conf spark-defaults.conf
Edit kylin.sh
cd $KYLIN_HOME/bin
vim kylin.sh
# add at the top of kylin.sh
export HADOOP_HOME=/opt/hadoop-2.9.2
export HIVE_HOME=/opt/hive-2.3.7
export HBASE_HOME=/opt/hbase-1.1.2
export SPARK_HOME=/opt/spark-2.4.5
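After these edits, Kylin's dependency check and startup can be run on master; the web UI defaults to port 7070 with the ADMIN/KYLIN account:
$KYLIN_HOME/bin/check-env.sh    # verifies that Hadoop/Hive/HBase are reachable
kylin.sh start
# Web UI: http://master:7070/kylin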
cd /opt/software
wget https://mirrors.tuna.tsinghua.edu.cn/apache/druid/0.19.0/apache-druid-0.19.0-bin.tar.gz
tar -zxvf apache-druid-0.19.0-bin.tar.gz -C ../
cd ../
mv apache-druid-0.19.0 druid-0.19.0
vim /etc/profile
# add the following to the file
export DRUID_HOME=/opt/druid-0.19.0
export PATH=$PATH:$DRUID_HOME/bin
Create the Druid metadata database and user in MySQL:
CREATE DATABASE druid DEFAULT CHARACTER SET utf8mb4;
CREATE USER 'druid'@'%' IDENTIFIED BY '12345678';
GRANT ALL PRIVILEGES ON druid.* TO 'druid'@'%';
Link the Hadoop configuration files core-site.xml, hdfs-site.xml, yarn-site.xml, and mapred-site.xml into conf/druid/cluster/_common
cd $DRUID_HOME/conf/druid/cluster/_common
ln -s $HADOOP_HOME/etc/hadoop/core-site.xml core-site.xml
ln -s $HADOOP_HOME/etc/hadoop/hdfs-site.xml hdfs-site.xml
ln -s $HADOOP_HOME/etc/hadoop/yarn-site.xml yarn-site.xml
ln -s $HADOOP_HOME/etc/hadoop/mapred-site.xml mapred-site.xml
Configure the MySQL driver (the JDBC jar must be visible to the mysql-metadata-storage extension)
cd $DRUID_HOME/extensions/mysql-metadata-storage
ln -s $HIVE_HOME/lib/mysql-connector-java-5.1.46.jar mysql-connector-java-5.1.46.jar
Edit the configuration file ($DRUID_HOME/conf/druid/cluster/_common/common.runtime.properties)
# add "mysql-metadata-storage" to the extension load list
druid.extensions.loadList=["mysql-metadata-storage", "druid-hdfs-storage", "druid-kafka-indexing-service", "druid-datasketches"]
# each machine sets its own hostname or IP here
druid.host=master
# the ZooKeeper connection string
druid.zk.service.host=master:2181,slave0:2181,slave2:2181
druid.zk.paths.base=/druid
# comment out the derby settings above
# and add the MySQL settings
druid.metadata.storage.type=mysql
druid.metadata.storage.connector.connectURI=jdbc:mysql://slave1:3306/druid
druid.metadata.storage.connector.user=druid
druid.metadata.storage.connector.password=12345678
# comment out the local storage settings
# and add the HDFS settings, i.e. use HDFS as deep storage
druid.storage.type=hdfs
druid.storage.storageDirectory=/druid/segments
# comment out the "indexer.logs for local disk" settings
# and add the "indexer.logs for HDFS" settings
druid.indexer.logs.type=hdfs
druid.indexer.logs.directory=/druid/indexing-logs
Configure the master node file (size the parameters to your actual hardware)
$DRUID_HOME/conf/druid/cluster/master/coordinator-overlord/jvm.config
-server
-Xms512m
-Xmx512m
-XX:+ExitOnOutOfMemoryError
-XX:+UseG1GC
-Duser.timezone=UTC+8
-Dfile.encoding=UTF-8
-Djava.io.tmpdir=var/tmp
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
Configure the data node files (size the parameters to your actual hardware)
$DRUID_HOME/conf/druid/cluster/data/historical/jvm.config
-server
-Xms512m
-Xmx512m
-XX:MaxDirectMemorySize=1g
-XX:+ExitOnOutOfMemoryError
-Duser.timezone=UTC+8
-Dfile.encoding=UTF-8
-Djava.io.tmpdir=var/tmp
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
Edit $DRUID_HOME/conf/druid/cluster/data/historical/runtime.properties
# change only this parameter
druid.processing.buffer.sizeBytes=50000000
Note: druid.processing.buffer.sizeBytes is the size of the off-heap hash table each query uses for aggregation. The direct memory must satisfy
maxDirectMemory >= druid.processing.buffer.sizeBytes * (druid.processing.numMergeBuffers + druid.processing.numThreads + 1)
If druid.processing.buffer.sizeBytes is set too large, MaxDirectMemorySize must be raised accordingly, otherwise the historical service will not start. For example, with the 50 MB buffer above and, say, 2 merge buffers and 7 processing threads, at least 50 MB x (2 + 7 + 1) = 500 MB of direct memory is needed, which fits within the 1g MaxDirectMemorySize configured above.
Edit $DRUID_HOME/conf/druid/cluster/data/middleManager/jvm.config
-server
-Xms128m
-Xmx128m
-XX:+ExitOnOutOfMemoryError
-Duser.timezone=UTC+8
-Dfile.encoding=UTF-8
-Djava.io.tmpdir=var/tmp
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
Configure the query node files (size the parameters to your actual hardware): $DRUID_HOME/conf/druid/cluster/query/broker/jvm.config
-server
-Xms512m
-Xmx512m
-XX:MaxDirectMemorySize=512m
-XX:+ExitOnOutOfMemoryError
-Duser.timezone=UTC+8
-Dfile.encoding=UTF-8
-Djava.io.tmpdir=var/tmp
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
$DRUID_HOME/conf/druid/cluster/query/broker/runtime.properties
# change only this parameter
druid.processing.buffer.sizeBytes=50000000
Note:
druid.processing.buffer.sizeBytes is the size of the off-heap hash table each query uses for aggregation, and the same constraint applies:
maxDirectMemory >= druid.processing.buffer.sizeBytes * (druid.processing.numMergeBuffers + druid.processing.numThreads + 1)
If druid.processing.buffer.sizeBytes is too large, MaxDirectMemorySize must be raised, otherwise the broker service will not start.
$DRUID_HOME/conf/druid/cluster/query/router/jvm.config
-server
-Xms128m
-Xmx128m
-XX:+UseG1GC
-XX:MaxDirectMemorySize=128m
-XX:+ExitOnOutOfMemoryError
-Duser.timezone=UTC+8
-Dfile.encoding=UTF-8
-Djava.io.tmpdir=var/tmp
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
Summary of the JVM memory allocation per service:
coordinator-overlord: 512m heap
historical: 512m heap, 1g direct memory
middleManager: 128m heap
broker: 512m heap, 512m direct memory
router: 128m heap, 128m direct memory
Distribute the Druid directory to slave0 and slave2 (run from /opt):
scp -r druid-0.19.0/ slave0:$PWD
scp -r druid-0.19.0/ slave2:$PWD
Start the services on the master node
nohup start-cluster-master-no-zk-server &
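The script above only brings up the coordinator/overlord. The Druid distribution ships matching scripts for the other node roles, started the same way (the router's web console then listens on port 8888):
# on the data node(s)
nohup start-cluster-data-server &
# on the query node(s)
nohup start-cluster-query-server &
# console: http://<query node>:8888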