Big Data Technology: Real-Time Data Warehouse
Version: V1.0
Chapter 1 Environment Preparation
1.1 Virtual Machine Preparation
Clone three virtual machines (hadoop101, hadoop102, hadoop103); configure each host's network IP and hostname, and disable the firewall.
Set the memory of hadoop101, hadoop102, and hadoop103 to 4 GB each.
1.2 Configure Passwordless SSH Login
(1) Configure SSH passwordless login
[root@hadoop101 ~]# vim /etc/hosts
127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.1.101 hadoop101
192.168.1.102 hadoop102
192.168.1.103 hadoop103
[root@hadoop101 ~]# ssh-keygen -t rsa
[root@hadoop101 ~]# ssh-copy-id hadoop101
[root@hadoop101 ~]# ssh-copy-id hadoop102
[root@hadoop101 ~]# ssh-copy-id hadoop103
Repeat the same steps on the other two machines.
1.3 Install the JDK
(1) Remove the OpenJDK packages that ship with Linux; do the same on the other two machines.
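Before moving on, you can confirm that passwordless login works with a quick check from hadoop101 (a minimal sanity check; the hostnames follow the /etc/hosts entries above):
[root@hadoop101 ~]# ssh hadoop102 hostname
[root@hadoop101 ~]# ssh hadoop103 hostname
If each command prints the remote hostname without asking for a password, the keys are in place.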
[root@hadoop101 ~]# rpm -qa | grep jdk
java-1.8.0-openjdk-1.8.0.161-2.b14.el7.x86_64
copy-jdk-configs-3.3-2.el7.noarch
java-1.8.0-openjdk-headless-1.8.0.161-2.b14.el7.x86_64
[root@hadoop101 ~]# rpm -e --nodeps java-1.8.0-openjdk-1.8.0.161-2.b14.el7.x86_64
[root@hadoop101 ~]# rpm -e --nodeps copy-jdk-configs-3.3-2.el7.noarch
[root@hadoop101 ~]# rpm -e --nodeps java-1.8.0-openjdk-headless-1.8.0.161-2.b14.el7.x86_64
(2) Create a directory for software packages
[root@hadoop101 ~]# mkdir /opt/software
[root@hadoop101 ~]# cd /opt/software/
(3) Upload the JDK archive, extract it, and add the environment variables
[root@hadoop101 software]# mkdir /opt/module
[root@hadoop101 software]# tar -zxvf jdk-8u211-linux-x64.tar.gz -C /opt/module/
/opt/module/jdk1.8.0_211
[root@hadoop101 jdk1.8.0_211]# vim /etc/profile
Append the JDK path at the end of /etc/profile:
#JAVA_HOME
export JAVA_HOME=/opt/module/jdk1.8.0_211
export PATH=$PATH:$JAVA_HOME/bin
(4) Source the profile
[root@hadoop101 jdk1.8.0_211]# source /etc/profile
[root@hadoop101 jdk1.8.0_211]# java -version
java version "1.8.0_211"
Java(TM) SE Runtime Environment (build 1.8.0_211-b12)
Java HotSpot(TM) 64-Bit Server VM (build 25.211-b12, mixed mode)
[root@hadoop101 jdk1.8.0_211]#
(5) Copy the JDK directory under /opt/module to the other two machines, then configure the JDK environment variables there and source the profile
[root@hadoop101 module]# scp -r /opt/module/jdk1.8.0_211/ hadoop102:/opt/module/
[root@hadoop101 module]# scp -r /opt/module/jdk1.8.0_211/ hadoop103:/opt/module/
[root@hadoop101 module]# scp /etc/profile hadoop102:/etc/
[root@hadoop101 module]# scp /etc/profile hadoop103:/etc/
[root@hadoop102 module]# source /etc/profile
[root@hadoop102 module]# java -version
java version "1.8.0_211"
Java(TM) SE Runtime Environment (build 1.8.0_211-b12)
Java HotSpot(TM) 64-Bit Server VM (build 25.211-b12, mixed mode)
[root@hadoop103 ~]# source /etc/profile
[root@hadoop103 ~]# java -version
java version "1.8.0_211"
Java(TM) SE Runtime Environment (build 1.8.0_211-b12)
Java HotSpot(TM) 64-Bit Server VM (build 25.211-b12, mixed mode)
1.4 Disable the Firewall
[root@hadoop101 ~]# systemctl stop firewalld.service
[root@hadoop101 ~]# systemctl disable firewalld.service
[root@hadoop102 ~]# systemctl stop firewalld.service
[root@hadoop102 ~]# systemctl disable firewalld.service
[root@hadoop103 ~]# systemctl stop firewalld.service
[root@hadoop103 ~]# systemctl disable firewalld.service
Chapter 2 Installing ZooKeeper 3.5.7
(1) Upload the archive to the software directory and extract it
[root@hadoop101 module]# cd /opt/software/
[root@hadoop101 software]# tar -zxvf apache-zookeeper-3.5.7-bin.tar.gz -C /opt/module/
(2) Distribute it to the other nodes
[root@hadoop101 software]# cd /opt/module/
[root@hadoop101 module]# scp -r apache-zookeeper-3.5.7-bin/ hadoop102:/opt/module/
[root@hadoop101 module]# scp -r apache-zookeeper-3.5.7-bin/ hadoop103:/opt/module/
(3) Create a zkData directory inside the ZooKeeper directory
[root@hadoop101 module]# cd apache-zookeeper-3.5.7-bin/
[root@hadoop101 apache-zookeeper-3.5.7-bin]# mkdir zkData
(4) Create a myid file in zkData, write this node's ID (1), and save it
[root@hadoop101 apache-zookeeper-3.5.7-bin]# cd zkData/
[root@hadoop101 zkData]# vim myid
1
(5) Distribute the zkData directory
[root@hadoop101 zkData]# cd ..
[root@hadoop101 apache-zookeeper-3.5.7-bin]# scp -r zkData/ hadoop102:/opt/module/apache-zookeeper-3.5.7-bin/
[root@hadoop101 apache-zookeeper-3.5.7-bin]# scp -r zkData/ hadoop103:/opt/module/apache-zookeeper-3.5.7-bin/
(6) Configure zoo.cfg
[root@hadoop101 apache-zookeeper-3.5.7-bin]# cd conf/
[root@hadoop101 conf]# mv zoo_sample.cfg zoo.cfg
[root@hadoop101 conf]# vim zoo.cfg
Modify the data storage path:
dataDir=/opt/module/apache-zookeeper-3.5.7-bin/zkData
Add the cluster configuration at the end of the file:
server.1=hadoop101:2888:3888
server.2=hadoop102:2888:3888
server.3=hadoop103:2888:3888
Distribute zoo.cfg:
[root@hadoop101 conf]# scp zoo.cfg hadoop102:/opt/module/apache-zookeeper-3.5.7-bin/conf/
[root@hadoop101 conf]# scp zoo.cfg hadoop103:/opt/module/apache-zookeeper-3.5.7-bin/conf/
(7) Modify the myid file on the other two machines to 2 and 3 respectively
[root@hadoop102 apache-zookeeper-3.5.7-bin]# vim zkData/myid
2
[root@hadoop103 apache-zookeeper-3.5.7-bin]# vim zkData/myid
3
(8) Start the cluster
[root@hadoop101 ~]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
[root@hadoop102 ~]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
[root@hadoop103 ~]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
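After all three nodes are started, you can verify the ensemble by running zkServer.sh status on each machine; one node should report "leader" and the other two "follower" (this is just a sanity check):
[root@hadoop101 ~]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh status
[root@hadoop102 ~]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh status
[root@hadoop103 ~]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh status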
Chapter 3 Installing Hadoop 3.1.3
3.1 HDFS HA Setup
(1) Upload the archive to the software directory and extract it
[root@hadoop101 module]# cd /opt/software/
[root@hadoop101 software]# tar -zxvf hadoop-3.1.3.tar.gz -C /opt/module/
(2) Distribute the hadoop folder under /opt/module
[root@hadoop101 software]# cd /opt/module/
[root@hadoop101 module]# scp -r hadoop-3.1.3/ hadoop102:/opt/module/
[root@hadoop101 module]# scp -r hadoop-3.1.3/ hadoop103:/opt/module/
(3) Configure the Hadoop environment variables by appending the Hadoop path at the end of /etc/profile; do the same on the other two machines
[root@hadoop101 hadoop-3.1.3]# vim /etc/profile
#HADOOP_HOME
export HADOOP_HOME=/opt/module/hadoop-3.1.3
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
[root@hadoop101 hadoop-3.1.3]# source /etc/profile
[root@hadoop101 hadoop-3.1.3]# hadoop version
Hadoop 3.1.3
Source code repository https://gitbox.apache.org/repos/asf/hadoop.git -r ba631c436b806728f8ec2f54ab1e289526c90579
Compiled by ztang on 2019-09-12T02:47Z
Compiled with protoc 2.5.0
From source with checksum ec785077c385118ac91aadde5ec9799
This command was run using /opt/module/hadoop-3.1.3/share/hadoop/common/hadoop-common-3.1.3.jar
(4) Configure the nameservice; edit hdfs-site.xml
[root@hadoop101 hadoop-3.1.3]# cd etc/hadoop/
[root@hadoop101 hadoop]# vim hdfs-site.xml
<property><name>dfs.ha.fencing.methods</name><value>sshfence</value></property>
<property><name>dfs.ha.fencing.ssh.private-key-files</name><value>/root/.ssh/id_rsa</value></property>
<property><name>dfs.permissions.enabled</name><value>false</value></property>
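Note that the nameservice definition itself is not shown here. For reference, a minimal sketch of the hdfs-site.xml properties such an HA nameservice would typically include (mycluster with NameNodes nn1/nn2/nn3 on hadoop101-103, matching the later synchronization step) is given below; the ports are assumptions and must match your own plan:
<property><name>dfs.nameservices</name><value>mycluster</value></property>
<property><name>dfs.ha.namenodes.mycluster</name><value>nn1,nn2,nn3</value></property>
<property><name>dfs.namenode.rpc-address.mycluster.nn1</name><value>hadoop101:8020</value></property>
<property><name>dfs.namenode.rpc-address.mycluster.nn2</name><value>hadoop102:8020</value></property>
<property><name>dfs.namenode.rpc-address.mycluster.nn3</name><value>hadoop103:8020</value></property>
<property><name>dfs.namenode.http-address.mycluster.nn1</name><value>hadoop101:9870</value></property>
<property><name>dfs.namenode.http-address.mycluster.nn2</name><value>hadoop102:9870</value></property>
<property><name>dfs.namenode.http-address.mycluster.nn3</name><value>hadoop103:9870</value></property>
<property><name>dfs.namenode.shared.edits.dir</name><value>qjournal://hadoop101:8485;hadoop102:8485;hadoop103:8485/mycluster</value></property>
<property><name>dfs.client.failover.proxy.provider.mycluster</name><value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value></property>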
(5) Edit core-site.xml
<property><name>fs.defaultFS</name><value>hdfs://mycluster</value></property>
<property><name>dfs.journalnode.edits.dir</name><value>/opt/module/hadoop-3.1.3/JN/data</value></property>
<property><name>hadoop.tmp.dir</name><value>/opt/module/hadoop-3.1.3/tmp</value></property>
(6) Add automatic failover in hdfs-site.xml
[root@hadoop101 hadoop]# vim hdfs-site.xml
<property><name>dfs.ha.automatic-failover.enabled</name><value>true</value></property>
(7) Add the ZooKeeper addresses in core-site.xml
<property><name>ha.zookeeper.quorum</name><value>hadoop101:2181,hadoop102:2181,hadoop103:2181</value></property>
3.2 ResourceManager HA Setup
(1) Edit yarn-site.xml
[root@hadoop101 hadoop]# vim yarn-site.xml
<property><name>yarn.nodemanager.aux-services</name><value>mapreduce_shuffle</value></property>
<property><name>yarn.resourcemanager.ha.enabled</name><value>true</value></property>
<property><name>yarn.resourcemanager.cluster-id</name><value>cluster1</value></property>
<property><name>yarn.resourcemanager.ha.rm-ids</name><value>rm1,rm2</value></property>
<property><name>yarn.resourcemanager.hostname.rm1</name><value>hadoop101</value></property>
<property><name>yarn.resourcemanager.hostname.rm2</name><value>hadoop103</value></property>
<property><name>yarn.resourcemanager.webapp.address.rm1</name><value>hadoop101:8088</value></property>
<property><name>yarn.resourcemanager.webapp.address.rm2</name><value>hadoop103:8088</value></property>
<property><name>hadoop.zk.address</name><value>hadoop101:2181,hadoop102:2181,hadoop103:2181</value></property>
<property><name>yarn.resourcemanager.recovery.enabled</name><value>true</value></property>
<property><name>yarn.resourcemanager.store.class</name><value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value></property>
3.3 Starting the Cluster
(1) Configure workers (called slaves in older versions)
[root@hadoop101 hadoop]# vim workers
hadoop101
hadoop102
hadoop103
(2) Distribute the configuration files
[root@hadoop101 hadoop]# cd ..
[root@hadoop101 etc]# scp -r hadoop/ hadoop102:/opt/module/hadoop-3.1.3/etc/
[root@hadoop101 etc]# scp -r hadoop/ hadoop103:/opt/module/hadoop-3.1.3/etc/
(3) Start the journalnode service on every machine
[root@hadoop101 hadoop-3.1.3]# sbin/hadoop-daemon.sh start journalnode
[root@hadoop102 hadoop-3.1.3]# sbin/hadoop-daemon.sh start journalnode
[root@hadoop103 hadoop-3.1.3]# sbin/hadoop-daemon.sh start journalnode
(4) Format the NameNode on nn1
[root@hadoop101 hadoop-3.1.3]# bin/hdfs namenode -format
(5) Declare the root user in start-dfs.sh and stop-dfs.sh by adding the following at the top of both scripts
[root@hadoop101 hadoop-3.1.3]# vim sbin/start-dfs.sh
HDFS_DATANODE_USER=root
HADOOP_SECURE_DN_USER=hdfs
HDFS_NAMENODE_USER=root
HDFS_SECONDARYNAMENODE_USER=root
HDFS_JOURNALNODE_USER=root
HDFS_ZKFC_USER=root
[root@hadoop101 hadoop-3.1.3]# vim sbin/stop-dfs.sh
HDFS_DATANODE_USER=root
HADOOP_SECURE_DN_USER=hdfs
HDFS_NAMENODE_USER=root
HDFS_SECONDARYNAMENODE_USER=root
HDFS_JOURNALNODE_USER=root
HDFS_ZKFC_USER=root
(6) Declare the root user in start-yarn.sh and stop-yarn.sh by adding the following at the top of both scripts
[root@hadoop101 hadoop-3.1.3]# vim sbin/start-yarn.sh
YARN_RESOURCEMANAGER_USER=root
HADOOP_SECURE_DN_USER=yarn
YARN_NODEMANAGER_USER=root
[root@hadoop101 hadoop-3.1.3]# vim sbin/stop-yarn.sh
YARN_RESOURCEMANAGER_USER=root
HADOOP_SECURE_DN_USER=yarn
YARN_NODEMANAGER_USER=root
(7) Edit hadoop-env.sh, uncomment JAVA_HOME, and set it
[root@hadoop101 hadoop-3.1.3]# vim etc/hadoop/hadoop-env.sh
export JAVA_HOME=/opt/module/jdk1.8.0_211
[root@hadoop102 hadoop-3.1.3]# vim etc/hadoop/hadoop-env.sh
export JAVA_HOME=/opt/module/jdk1.8.0_211
[root@hadoop103 hadoop-3.1.3]# vim etc/hadoop/hadoop-env.sh
export JAVA_HOME=/opt/module/jdk1.8.0_211
(8) Distribute the scripts edited above
[root@hadoop101 hadoop-3.1.3]# scp -r sbin/ hadoop102:/opt/module/hadoop-3.1.3/
[root@hadoop101 hadoop-3.1.3]# scp -r sbin/ hadoop103:/opt/module/hadoop-3.1.3/
(9) Synchronize: start the NameNode on nn1, then bootstrap nn2 and nn3 from it
[root@hadoop101 hadoop-3.1.3]# sbin/hadoop-daemon.sh start namenode
[root@hadoop102 hadoop-3.1.3]# bin/hdfs namenode -bootstrapStandby
[root@hadoop103 hadoop-3.1.3]# bin/hdfs namenode -bootstrapStandby
[root@hadoop102 hadoop-3.1.3]# sbin/hadoop-daemon.sh start namenode
[root@hadoop103 hadoop-3.1.3]# sbin/hadoop-daemon.sh start namenode
(10) Stop all HDFS services
[root@hadoop101 hadoop-3.1.3]# sbin/stop-all.sh
(11) Initialize the HA state in ZooKeeper:
[root@hadoop101 hadoop-3.1.3]# bin/hdfs zkfc -formatZK
(12) Start the cluster services
[root@hadoop101 hadoop-3.1.3]# sbin/start-all.sh
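Once start-all.sh finishes, a quick way to confirm HA is healthy is to check the process list and the NameNode/ResourceManager states (illustrative commands; the nn/rm IDs follow the configuration above):
[root@hadoop101 hadoop-3.1.3]# jps
[root@hadoop101 hadoop-3.1.3]# bin/hdfs haadmin -getServiceState nn1
[root@hadoop101 hadoop-3.1.3]# bin/hdfs haadmin -getServiceState nn2
[root@hadoop101 hadoop-3.1.3]# bin/yarn rmadmin -getServiceState rm1
[root@hadoop101 hadoop-3.1.3]# bin/yarn rmadmin -getServiceState rm2
Exactly one NameNode and one ResourceManager should report active; the rest should be standby.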
Chapter 4 Installing MySQL
4.1 Install the MySQL Server
(1) Remove the mysql-libs dependency. Even though MySQL is not installed on the machine, this step must not be skipped.
[root@hadoop101 software]# yum remove mysql-libs
(2) Download and install the dependencies
[root@hadoop101 software]# yum install libaio
[root@hadoop101 software]# yum -y install autoconf
[root@hadoop101 software]# wget https://downloads.mysql.com/archives/get/p/23/file/MySQL-shared-compat-5.6.24-1.el6.x86_64.rpm
[root@hadoop101 software]# wget https://downloads.mysql.com/archives/get/p/23/file/MySQL-shared-5.6.24-1.el7.x86_64.rpm
[root@hadoop101 software]# rpm -ivh MySQL-shared-5.6.24-1.el7.x86_64.rpm
[root@hadoop101 software]# rpm -ivh MySQL-shared-compat-5.6.24-1.el6.x86_64.rpm
(3) Upload mysql-libs.zip and unzip it
[root@hadoop101 software]# yum install unzip
[root@hadoop101 software]# unzip mysql-libs.zip
(4) Enter the mysql-libs directory
[root@hadoop101 software]# cd mysql-libs/
[root@hadoop101 mysql-libs]# ls
MySQL-client-5.6.24-1.el6.x86_64.rpm mysql-connector-java-5.1.27.tar.gz MySQL-server-5.6.24-1.el6.x86_64.rpm
(5) Install the MySQL server
[root@hadoop101 mysql-libs]# rpm -ivh MySQL-server-5.6.24-1.el6.x86_64.rpm
(6) View the generated random password
[root@hadoop101 mysql-libs]# cat /root/.mysql_secret
(7) Check the MySQL service status
[root@hadoop101 mysql-libs]# service mysql status
ERROR! MySQL is not running
(8) Start MySQL
[root@hadoop101 mysql-libs]# service mysql start
Starting MySQL… SUCCESS!
4.2 Install the MySQL Client
(1) Install the MySQL client
[root@hadoop101 mysql-libs]# rpm -ivh MySQL-client-5.6.24-1.el6.x86_64.rpm
(2) Log in to MySQL
[root@hadoop101 mysql-libs]# mysql -uroot -p8n2FEY8yf4vBMmLa
(3) Change the password
mysql> SET PASSWORD=PASSWORD('123456');
(4) Exit MySQL
mysql> exit;
4.3 Configure Access Permissions in the user Table
(1) Log in to MySQL and switch to the mysql database
[root@hadoop101 mysql-libs]# mysql -uroot -p123456
mysql> show databases;
+--------------------+
| Database           |
+--------------------+
| information_schema |
| mysql              |
| performance_schema |
| test               |
+--------------------+
mysql> use mysql
mysql> show tables;
(2) Modify the user table
mysql> select User, Host, Password from user;
mysql> update user set host='%' where host='localhost';
(3) Delete the other host entries for the root user
mysql> delete from user where host!='%';
(4) Flush privileges
mysql> flush privileges;
(5) Exit
mysql> exit;
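To confirm that access now works through the hostname rather than only the local socket, you can reconnect and inspect the user table (a simple check using the client installed above):
[root@hadoop101 mysql-libs]# mysql -h hadoop101 -uroot -p123456 -e "select user,host from mysql.user;"
The root entry should show % in the host column.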
Chapter 5 Installing Hive 3.1.2
(1) Upload the Hive archive and extract it
[root@hadoop101 software]# tar -zxvf apache-hive-3.1.2-bin.tar.gz -C /opt/module/
(2) Copy the MySQL driver into Hive's lib directory
[root@hadoop101 software]# cd mysql-libs/
[root@hadoop101 mysql-libs]# tar -zxvf mysql-connector-java-5.1.27.tar.gz
[root@hadoop101 mysql-libs]# cd mysql-connector-java-5.1.27/
[root@hadoop101 mysql-connector-java-5.1.27]# cp mysql-connector-java-5.1.27-bin.jar /opt/module/apache-hive-3.1.2-bin/lib/
(3) Configure Hive to store its metadata in MySQL
[root@hadoop101 mysql-connector-java-5.1.27]# cd /opt/module/apache-hive-3.1.2-bin/conf/
[root@hadoop101 conf]# vim hive-site.xml
<property><name>javax.jdo.option.ConnectionDriverName</name><value>com.mysql.jdbc.Driver</value><description>Driver class name for a JDBC metastore</description></property>
<property><name>javax.jdo.option.ConnectionUserName</name><value>root</value><description>username to use against metastore database</description></property>
<property><name>javax.jdo.option.ConnectionPassword</name><value>123456</value><description>password to use against metastore database</description></property>
<property><name>hive.metastore.warehouse.dir</name><value>/user/hive/warehouse</value><description>location of default database for the warehouse</description></property>
<property><name>hive.cli.print.header</name><value>true</value></property>
<property><name>hive.cli.print.current.db</name><value>true</value></property>
<property><name>hive.metastore.schema.verification</name><value>false</value></property>
<property><name>datanucleus.schema.autoCreateAll</name><value>true</value></property>
<property><name>hive.metastore.uris</name><value>thrift://hadoop101:9083</value></property>
<property><name>hive.server2.thrift.port</name><value>10000</value></property>
<property><name>hive.server2.thrift.bind.host</name><value>hadoop101</value></property>
<property><name>hive.metastore.event.db.notification.api.auth</name><value>false</value></property>
<property><name>hive.server2.active.passive.ha.enable</name><value>true</value></property>
(4) Configure the Hive environment variables by appending the following at the end of /etc/profile
[root@hadoop101 apache-hive-3.1.2-bin]# vim /etc/profile
#HIVE_HOME
export HIVE_HOME=/opt/module/apache-hive-3.1.2-bin
export PATH=$PATH:$HIVE_HOME/bin
[root@hadoop101 apache-hive-3.1.2-bin]# source /etc/profile
(5) Replace the guava jar shipped with Hive
[root@hadoop101 apache-hive-3.1.2-bin]# cd lib/
[root@hadoop101 lib]# ls |grep guava
guava-19.0.jar  jersey-guava-2.25.1.jar
The version is 19.0. Now check the version shipped with Hadoop:
[root@hadoop101 lib]# cd /opt/module/hadoop-3.1.3/share/hadoop/common/lib/
[root@hadoop101 lib]# ls |grep guava
guava-27.0-jre.jar  listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar
Hadoop ships guava 27.0, so copy Hadoop's guava-27.0-jre.jar into Hive's lib directory and delete Hive's own guava jar:
[root@hadoop101 lib]# cp guava-27.0-jre.jar /opt/module/apache-hive-3.1.2-bin/lib/
[root@hadoop101 lib]# cd /opt/module/apache-hive-3.1.2-bin/lib/
[root@hadoop101 lib]# ls |grep guava
guava-19.0.jar  guava-27.0-jre.jar  jersey-guava-2.25.1.jar
[root@hadoop101 lib]# rm -f guava-19.0.jar
(6) Start the metadata service, running the services in the background.
Note: Hive 2.x and later needs both the metastore and hiveserver2 services started; otherwise you will get the error
Exception in thread "main" java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
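Note that the hive-site.xml shown above does not include javax.jdo.option.ConnectionURL, which the metastore also needs in order to reach MySQL. A typical entry, assuming a metastore database named metastore that may be created automatically, would look like this (adjust the database name to your environment):
<property><name>javax.jdo.option.ConnectionURL</name><value>jdbc:mysql://hadoop101:3306/metastore?createDatabaseIfNotExist=true&amp;useSSL=false</value></property>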
If an error prevents Hive from starting:
org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException)
Solution:
This is because HA is enabled and namenode1 is in standby state; switch it to active:
hdfs haadmin -transitionToActive nn1
Create a test table:
create table student(name string,age int);
insert into student values(“aa”,111);
select * from student;
[root@hadoop101 apache-hive-3.1.2-bin]# nohup hive --service metastore > metastore.log 2>&1 &
[root@hadoop101 apache-hive-3.1.2-bin]# nohup hive --service hiveserver2 > hiveserver2.log 2>&1 &
(7) Start Hive
[root@hadoop101 apache-hive-3.1.2-bin]# hive
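(Optional) With the metastore and HiveServer2 running, you can also verify the JDBC endpoint through Beeline; the host and port follow the hive-site.xml above:
[root@hadoop101 apache-hive-3.1.2-bin]# bin/beeline -u jdbc:hive2://hadoop101:10000 -n root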
Chapter 6 Installing Kafka 2.11-2.4.0
(1) Upload the archive and extract it
[root@hadoop101 software]# tar -zxvf kafka_2.11-2.4.0.tgz -C /opt/module/
(2) Enter the Kafka directory and create a logs directory
[root@hadoop101 software]# cd /opt/module/kafka_2.11-2.4.0/
[root@hadoop101 kafka_2.11-2.4.0]# mkdir logs
(3) Modify the configuration file
[root@hadoop101 kafka_2.11-2.4.0]# cd config/
[root@hadoop101 config]# vim server.properties
Enter the following content:
#Globally unique broker ID; must not be duplicated
broker.id=0
#Enable topic deletion
delete.topic.enable=true
#Number of threads handling network requests
num.network.threads=3
#Number of threads handling disk I/O
num.io.threads=8
#Send socket buffer size
socket.send.buffer.bytes=102400
#Receive socket buffer size
socket.receive.buffer.bytes=102400
#Maximum request socket buffer size
socket.request.max.bytes=104857600
#Path where Kafka data logs are stored
log.dirs=/opt/module/kafka_2.11-2.4.0/logs
#Number of partitions per topic on this broker
num.partitions=1
#Number of threads used to recover and clean up data under the data dirs
num.recovery.threads.per.data.dir=1
#Maximum time a segment file is retained before deletion
#The default 7-day retention is left commented out
#log.retention.hours=168
#ZooKeeper cluster connection string
zookeeper.connect=hadoop101:2181,hadoop102:2181,hadoop103:2181/kafka_2.4
Note: the /kafka_2.4 suffix on zookeeper.connect makes Kafka register its metadata under the /kafka_2.4 znode instead of the ZooKeeper root. The ZooKeeper argument of the Kafka commands must include the same chroot.
(4) Distribute Kafka to the other nodes and adjust broker.id accordingly: 1 on hadoop102 and 2 on hadoop103
[root@hadoop101 kafka_2.11-2.4.0]# cd /opt/module/
[root@hadoop101 module]# scp -r /opt/module/kafka_2.11-2.4.0/ hadoop102:/opt/module/
[root@hadoop101 module]# scp -r /opt/module/kafka_2.11-2.4.0/ hadoop103:/opt/module/
[root@hadoop102 config]# pwd
/opt/module/kafka_2.11-2.4.0/config
[root@hadoop102 config]# vim server.properties
broker.id=1
[root@hadoop103 config]# pwd
/opt/module/kafka_2.11-2.4.0/config
[root@hadoop103 config]# vim server.properties
broker.id=2
(5) Start the ZooKeeper cluster first, then start Kafka
[root@hadoop101 module]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
[root@hadoop102 module]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
[root@hadoop103 module]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
[root@hadoop101 module]# /opt/module/kafka_2.11-2.4.0/bin/kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.0/config/server.properties
[root@hadoop102 config]# /opt/module/kafka_2.11-2.4.0/bin/kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.0/config/server.properties
[root@hadoop103 config]# /opt/module/kafka_2.11-2.4.0/bin/kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.0/config/server.properties
(6) After startup, you can check the registration info in ZooKeeper
[root@hadoop101 module]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkCli.sh
[zk: localhost:2181(CONNECTED) 0] ls /
[hadoop-ha, kafka_2.4, rmstore, yarn-leader-election, zookeeper]
The brokers registered under /kafka_2.4 rather than the root znode; you can inspect the contents further
[zk: localhost:2181(CONNECTED) 1] ls /kafka_2.4
[admin, brokers, cluster, config, consumers, controller, controller_epoch, isr_change_notification, latest_producer_id_block, log_dir_event_notification]
(7) Create a topic. Because the registration info is not under the root znode, the ZooKeeper argument must include the chroot
[root@hadoop101 module]# /opt/module/kafka_2.11-2.4.0/bin/kafka-topics.sh --zookeeper hadoop101:2181/kafka_2.4 --create --replication-factor 2 --partitions 3 --topic test
Created topic test.
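To make sure the brokers really exchange data, you can run a quick console producer/consumer round trip against the test topic created above (an illustrative smoke test; type a few lines into the producer and they should appear in the consumer):
[root@hadoop101 module]# /opt/module/kafka_2.11-2.4.0/bin/kafka-console-producer.sh --broker-list hadoop101:9092,hadoop102:9092,hadoop103:9092 --topic test
[root@hadoop102 module]# /opt/module/kafka_2.11-2.4.0/bin/kafka-console-consumer.sh --bootstrap-server hadoop101:9092,hadoop102:9092,hadoop103:9092 --topic test --from-beginning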
Chapter 7 Other Configuration and Summary
7.1 Set the Physical-to-Virtual Core Ratio
(1) The current VM has a 2-core processor; virtualize it as 4 cores for a 1:2 ratio by setting
yarn.nodemanager.resource.cpu-vcores to 4
[root@hadoop101 module]# cd /opt/module/hadoop-3.1.3/etc/hadoop/
[root@hadoop101 hadoop]# vim yarn-site.xml
<property><name>yarn.nodemanager.resource.cpu-vcores</name><value>4</value></property>
7.2 Limit the Maximum CPU Request per Container
When submitting a job (for example with spark-submit), the --executor-cores parameter must not exceed 4
[root@hadoop101 hadoop]# vim yarn-site.xml
<property><name>yarn.scheduler.maximum-allocation-vcores</name><value>4</value></property>
7.3 Set the Per-Container Memory Limit and Node Memory Size
This caps the memory each submitted container may request as well as the total memory YARN may occupy; for example, with 4 GB of VM memory, each container is limited to at most 4 GB
[root@hadoop101 hadoop]# vim yarn-site.xml
<property><name>yarn.scheduler.maximum-allocation-mb</name><value>4096</value></property>
<property><name>yarn.nodemanager.resource.memory-mb</name><value>7168</value></property>
7.4 Configure Capacity Scheduler Queues
The capacity scheduler ships with only the default root queue; change it to two queues, spark and hive, giving spark 80% of the resources and hive 20%
[root@hadoop101 hadoop]# vim capacity-scheduler.xml
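A minimal sketch of the queue definitions, splitting root into spark (80%) and hive (20%) as described above, could look like the following; treat it as an illustrative starting point rather than a complete scheduler configuration:
<property><name>yarn.scheduler.capacity.root.queues</name><value>spark,hive</value></property>
<property><name>yarn.scheduler.capacity.root.spark.capacity</name><value>80</value></property>
<property><name>yarn.scheduler.capacity.root.hive.capacity</name><value>20</value></property>
<property><name>yarn.scheduler.capacity.root.spark.maximum-capacity</name><value>100</value></property>
<property><name>yarn.scheduler.capacity.root.hive.maximum-capacity</name><value>100</value></property>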
7.5 Configure the Trash
Keep deleted data in the trash for half an hour
[root@hadoop101 hadoop]# vim core-site.xml
<property><name>fs.trash.interval</name><value>30</value></property>
7.6 Configure the History Server
[root@hadoop101 hadoop]# vim yarn-site.xml
<property><name>yarn.nodemanager.pmem-check-enabled</name><value>false</value></property>
<property><name>yarn.nodemanager.vmem-check-enabled</name><value>false</value></property>
<property><name>yarn.log-aggregation-enable</name><value>true</value></property>
<property><name>yarn.nodemanager.remote-app-log-dir</name><value>/opt/module/hadoop-3.1.3/yarn-logs</value></property>
<property><name>yarn.log-aggregation.retain-seconds</name><value>604800</value></property>
<property><name>yarn.log.server.url</name><value>http://hadoop102:19888/jobhistory/logs</value></property>
Edit mapred-site.xml:
<property><name>mapreduce.framework.name</name><value>yarn</value><description>Run MapReduce on YARN</description></property>
<property><name>mapreduce.jobhistory.address</name><value>hadoop102:10020</value><description>History server address</description></property>
<property><name>mapreduce.jobhistory.webapp.address</name><value>hadoop102:19888</value><description>History server web UI address</description></property>
7.7 Summary
(1) Distribute core-site.xml, yarn-site.xml, and mapred-site.xml
[root@hadoop101 hadoop]# scp yarn-site.xml hadoop102:/opt/module/hadoop-3.1.3/etc/hadoop/
[root@hadoop101 hadoop]# scp yarn-site.xml hadoop103:/opt/module/hadoop-3.1.3/etc/hadoop/
[root@hadoop101 hadoop]# scp core-site.xml hadoop102:/opt/module/hadoop-3.1.3/etc/hadoop/
[root@hadoop101 hadoop]# scp core-site.xml hadoop103:/opt/module/hadoop-3.1.3/etc/hadoop/
[root@hadoop101 hadoop]# scp mapred-site.xml hadoop102:/opt/module/hadoop-3.1.3/etc/hadoop/
[root@hadoop101 hadoop]# scp mapred-site.xml hadoop103:/opt/module/hadoop-3.1.3/etc/hadoop/
(2) Restart the cluster and check the YARN web UI on port 8088; the maximum memory, maximum vcores, and maximum schedulable container memory should all have changed
(3) All startup commands
Start ZooKeeper
[root@hadoop101 hadoop]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
[root@hadoop102 hadoop]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
[root@hadoop103 hadoop]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
Start Kafka
[root@hadoop101 hadoop]# /opt/module/kafka_2.11-2.4.0/bin/kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.0/config/server.properties
[root@hadoop102 hadoop]# /opt/module/kafka_2.11-2.4.0/bin/kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.0/config/server.properties
[root@hadoop103 hadoop]# /opt/module/kafka_2.11-2.4.0/bin/kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.0/config/server.properties
Start the Hive services
[root@hadoop101 apache-hive-3.1.2-bin]# nohup hive --service metastore > metastore.log 2>&1 &
[root@hadoop101 apache-hive-3.1.2-bin]# nohup hive --service hiveserver2 > hiveserver2.log 2>&1 &
Start Hue
[root@hadoop102 hue-master]# build/env/bin/supervisor
Start the HDFS cluster
[root@hadoop101 hadoop]# start-all.sh
Start the Hadoop history server
[root@hadoop102 hadoop]# mr-jobhistory-daemon.sh start historyserver
Start the Spark history server
[root@hadoop102 hadoop]# start-history-server.sh
Chapter 8 Installing Flink
8.1 YARN Mode
(1) Upload the archive to hadoop103 and extract it
[root@hadoop103 ~]# mkdir -p /opt/software
[root@hadoop103 software]# tar -zxvf flink-1.10.0-bin-scala_2.11.tgz -C /opt/module/
[root@hadoop103 software]# cd /opt/module/flink-1.10.0/
(2) Enter the lib directory and upload flink-shaded-hadoop-2-uber-2.8.3-10.0.jar
[root@hadoop103 flink-1.10.0]# cd lib/
[root@hadoop103 lib]# ls
flink-dist_2.11-1.10.0.jar flink-table_2.11-1.10.0.jar log4j-1.2.17.jar
flink-shaded-hadoop-2-uber-2.8.3-10.0.jar flink-table-blink_2.11-1.10.0.jar slf4j-log4j12-1.7.15.jar
(3) Edit flink-conf.yaml
jobmanager.rpc.address: hadoop103
jobmanager.rpc.port: 6123
jobmanager.heap.size: 1024m
jobmanager.execution.failover-strategy: region
rest.port: 8081
web.submit.enable: true
env.java.home: /opt/module/jdk1.8.0_211
env.java.opts: -XX:+UseConcMarkSweepGC -XX:+PrintGCDetails -XX:-UseGCOverheadLimit -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/wormhole/gc
yarn.application-attempts: 2
(4) Start a yarn-session
[root@hadoop103 lib]# cd ..
[root@hadoop103 flink-1.10.0]# bin/yarn-session.sh --queue flink
(5) Check the Web UI address printed by yarn-session
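(Optional) To confirm the session accepts jobs, you can submit one of the example jobs bundled with the Flink distribution to the running yarn-session (a simple smoke test):
[root@hadoop103 flink-1.10.0]# bin/flink run examples/batch/WordCount.jar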
8.2 High Availability (HA)
The JobManager coordinates every Flink deployment; it is responsible for scheduling and resource management.
By default, each Flink cluster has a single JobManager instance, which creates a single point of failure: if the JobManager crashes, no new programs can be submitted and running programs fail.
With JobManager high availability you can recover from JobManager failures and thereby eliminate the single point of failure.
The following describes high availability in YARN mode.
In YARN-mode high availability, multiple JobManagers are not started; only one JobManager instance runs, and YARN restarts it when it fails.
(1) Edit yarn-site.xml to raise the maximum number of ApplicationMaster attempts (the default is 2)
[root@hadoop101 hadoop]# vim /opt/module/hadoop-3.1.3/etc/hadoop/yarn-site.xml
<property><name>yarn.resourcemanager.am.max-attempts</name><value>4</value><description>The maximum number of application master execution attempts.</description></property>
(2) Distribute it to the other machines
[root@hadoop101 hadoop]# scp yarn-site.xml hadoop102:/opt/module/hadoop-3.1.3/etc/hadoop/
[root@hadoop101 hadoop]# scp yarn-site.xml hadoop103:/opt/module/hadoop-3.1.3/etc/hadoop/
(3) Edit flink-conf.yaml and add the number of attempts
[root@hadoop103 conf]# vim flink-conf.yaml
yarn.application-attempts: 4
Note: yarn.resourcemanager.am.max-attempts is the upper limit on application restarts, so the number of application attempts configured in Flink cannot exceed the value set when starting the YARN cluster.
(4) Configure the ZooKeeper addresses in flink-conf.yaml
[root@hadoop103 conf]# vim flink-conf.yaml
high-availability: zookeeper
high-availability.storageDir: hdfs://mycluster/flink/ha/
high-availability.zookeeper.quorum: hadoop101:2181,hadoop102:2181,hadoop103:2181
high-availability.zookeeper.path.root: /flink
(5) Start the cluster
[root@hadoop101 ~]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
[root@hadoop102 ~]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
[root@hadoop103 ~]# /opt/module/apache-zookeeper-3.5.7-bin/bin/zkServer.sh start
[root@hadoop101 ~]# /opt/module/hadoop-3.1.3/sbin/start-dfs.sh
[root@hadoop103 ~]# /opt/module/hadoop-3.1.3/sbin/start-yarn.sh
(6) Start Flink
[root@hadoop103 flink-1.10.0]# bin/yarn-session.sh --queue flink
(7) Configure the Flink environment variables
[root@hadoop103 flink-1.10.0]# vim /etc/profile
#FLINK_HOME
export FLINK_HOME=/opt/module/flink-1.10.0
export PATH=$PATH:$FLINK_HOME/bin
[root@hadoop103 flink-1.10.0]# source /etc/profile
Startup successful.
(8) If ZooKeeper runs in Kerberos secure mode, configure the following parameters (optional)
[root@hadoop103 flink-1.10.0]# vim conf/flink-conf.yaml
zookeeper.sasl.service-name: zookeeper
zookeeper.sasl.login-context-name: Client
Chapter 9 Installing HBase
9.1 Overview
HBase is a key-value database built on Hadoop: a distributed, scalable big data store.
HBase is suited to real-time read/write access and is modeled on Google's Bigtable.
9.2 Features
(1) Linear and modular scalability
(2) Strictly consistent reads and writes
(3) Automatic and configurable sharding of tables
(4) Automatic failover support between RegionServers
(5) Hadoop MapReduce jobs backed by HBase tables
(6) An easy-to-use Java API
(7) Block cache and Bloom filters for real-time queries
9.3 Architecture
Figure 9-1 HBase architecture
9.4 Fully Distributed Installation
In a fully distributed setup the cluster contains multiple nodes, each running one or more HBase daemons, including a primary and a backup Master instance, multiple ZooKeeper nodes, and multiple RegionServer nodes.
Node Name   Master   ZooKeeper   RegionServer
hadoop101   yes      yes         yes
hadoop102   backup   yes         yes
hadoop103   no       yes         yes
(1) Upload and extract hbase-2.2.4-bin.tar.gz
[root@hadoop101 hadoop]# cd /opt/software/
[root@hadoop101 software]# tar -zxvf hbase-2.2.4-bin.tar.gz -C /opt/module/
(2) Edit conf/regionservers: remove localhost and add each host's domain name or IP
[root@hadoop101 software]# cd /opt/module/hbase-2.2.4/
[root@hadoop101 hbase-2.2.4]# vim conf/regionservers
hadoop101
hadoop102
hadoop103
(3) Create a file named backup-masters under conf and add hadoop102's domain name to it
[root@hadoop101 hbase-2.2.4]# vim conf/backup-masters
hadoop102
(4) Edit conf/hbase-site.xml
[root@hadoop101 hbase-2.2.4]# cd conf/
[root@hadoop101 conf]# vim hbase-site.xml
<property><name>hbase.rootdir</name><value>hdfs://mycluster/hbase</value></property>
<property><name>hbase.cluster.distributed</name><value>true</value></property>
<property><name>hbase.master.port</name><value>16000</value></property>
<property><name>hbase.zookeeper.property.dataDir</name><value>/home/root/zookeeper</value></property>
<property><name>hbase.zookeeper.quorum</name><value>hadoop101,hadoop102,hadoop103</value></property>
<property><name>hbase.unsafe.stream.capability.enforce</name><value>false</value></property>
(5) Edit hbase-env.sh: declare the JDK path and disable HBase's bundled ZooKeeper
[root@hadoop101 conf]# vim hbase-env.sh
export JAVA_HOME=/opt/module/jdk1.8.0_211
export HBASE_MANAGES_ZK=false
(6) Copy hdfs-site.xml into HBase's conf directory
[root@hadoop101 conf]# cp /opt/module/hadoop-3.1.3/etc/hadoop/hdfs-site.xml /opt/module/hbase-2.2.4/conf/
(7) Distribute HBase to the other nodes
[root@hadoop101 module]# scp -r hbase-2.2.4/ hadoop102:/opt/module/
[root@hadoop101 module]# scp -r hbase-2.2.4/ hadoop103:/opt/module/
(8) Configure the HBase environment variables
[root@hadoop101 module]# vim /etc/profile
#HBASE_HOME
export HBASE_HOME=/opt/module/hbase-2.2.4
export PATH=$PATH:$HBASE_HOME/bin
[root@hadoop101 module]# source /etc/profile
[root@hadoop102 module]# vim /etc/profile
#HBASE_HOME
export HBASE_HOME=/opt/module/hbase-2.2.4
export PATH=$PATH:$HBASE_HOME/bin
[root@hadoop102 module]# source /etc/profile
[root@hadoop103 module]# vim /etc/profile
#HBASE_HOME
export HBASE_HOME=/opt/module/hbase-2.2.4
export PATH=$PATH:$HBASE_HOME/bin
[root@hadoop103 module]# source /etc/profile
(9) Start HBase
[root@hadoop101 module]# start-hbase.sh
(10) Access the web UI at http://hadoop101:16010
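After HBase starts, a quick sanity check is to confirm the daemons match the role table above and ask the shell for cluster status (illustrative commands; HMaster should run on hadoop101, a backup HMaster on hadoop102, and an HRegionServer on every node):
[root@hadoop101 module]# jps
[root@hadoop101 module]# hbase shell
hbase(main):001:0> status
hbase(main):002:0> exit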
Chapter 10 Real-Time Data Warehouse Preparation
10.1 Table Model
(1) Wide tables
(2) Base tables
10.2 Create the Corresponding Kafka Topics
[root@hadoop101 module]# cd /opt/module/kafka_2.11-2.4.0/
[root@hadoop101 kafka_2.11-2.4.0]# bin/kafka-topics.sh --zookeeper hadoop102:2181/kafka_2.4 --create --replication-factor 2 --partitions 3 --topic basewebsite
[root@hadoop101 kafka_2.11-2.4.0]# bin/kafka-topics.sh --zookeeper hadoop102:2181/kafka_2.4 --create --replication-factor 2 --partitions 3 --topic basead
Created topic basead.
[root@hadoop101 kafka_2.11-2.4.0]# bin/kafka-topics.sh --zookeeper hadoop102:2181/kafka_2.4 --create --replication-factor 2 --partitions 3 --topic member
[root@hadoop101 kafka_2.11-2.4.0]# bin/kafka-topics.sh --zookeeper hadoop102:2181/kafka_2.4 --create --replication-factor 2 --partitions 3 --topic memberpaymoney
Created topic memberpaymoney.
[root@hadoop101 kafka_2.11-2.4.0]# bin/kafka-topics.sh --zookeeper hadoop102:2181/kafka_2.4 --create --replication-factor 2 --partitions 3 --topic memberregtype
Created topic memberregtype.
[root@hadoop101 kafka_2.11-2.4.0]# bin/kafka-topics.sh --zookeeper hadoop102:2181/kafka_2.4 --create --replication-factor 2 --partitions 3 --topic membervip
[root@hadoop101 kafka_2.11-2.4.0]# bin/kafka-topics.sh --zookeeper hadoop102:2181/kafka_2.4 --create --replication-factor 2 --partitions 3 --topic dwdmember
[root@hadoop101 kafka_2.11-2.4.0]# bin/kafka-topics.sh --zookeeper hadoop102:2181/kafka_2.4 --create --replication-factor 2 --partitions 3 --topic dwdmemberpaymoney
[root@hadoop101 kafka_2.11-2.4.0]# bin/kafka-topics.sh --zookeeper hadoop102:2181/kafka_2.4 --create --replication-factor 2 --partitions 3 --topic dwdmemberregtype
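Before moving on, you can list the topics to confirm they were all created under the /kafka_2.4 chroot (a quick verification step):
[root@hadoop101 kafka_2.11-2.4.0]# bin/kafka-topics.sh --zookeeper hadoop102:2181/kafka_2.4 --list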
10.3 Create the Corresponding HBase Tables
[root@hadoop101 kafka_2.11-2.4.0]# hbase shell
hbase(main):001:0> create_namespace 'education'
hbase(main):002:0> create 'education:dwd_basewebsite',{NAME => 'info', VERSIONS => '3', TTL => 'FOREVER'}
hbase(main):003:0> create 'education:dwd_basead',{NAME => 'info', VERSIONS => '3', TTL => 'FOREVER'}
hbase(main):004:0> create 'education:dwd_membervip',{NAME => 'info', VERSIONS => '3', TTL => 'FOREVER'}
hbase(main):005:0> create 'education:dim_member',{NAME => 'info', VERSIONS => '3', TTL => 'FOREVER'},{NUMREGIONS => 15, SPLITALGO => 'HexStringSplit'}
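As a final check, you can list the namespace and its tables to confirm the creates above succeeded:
hbase(main):006:0> list_namespace
hbase(main):007:0> list_namespace_tables 'education'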