1. 环境
2台虚机 (VirtualBox,CentOS 7.4):B=172.16.45.126, C=172.16.45.127
mysql vip:172.16.45.100
mysql .rpm下载:
https://downloads.mysql.com/archives/community/
5.6.25, Red Hat 7 (x86, 64bit)
2. mysql
B卸载
service mysql stop
# (https://www.cnblogs.com/kerrycode/p/4364465.html)
rpm -qa | grep -i mysql
rpm -ev MySQL-server-5.6.25-1.el7.x86_64 MySQL-client-5.6.25-1.el7.x86_64
# 删除旧数据 (否则可能导致新安装后无法启动) - 注意是否要备份旧数据!!!
rm -rf /var/lib/mysql/*
whereis mysql
rm -rf /usr/lib64/mysql /etc/mysql /usr/local/mysql /usr/my.cnf
reboot
C卸载
service mysql stop
rpm -qa | grep -i mysql
rpm -ev MySQL-server-5.6.25-1.el7.x86_64 MySQL-client-5.6.25-1.el7.x86_64
rm -rf /var/lib/mysql/*
whereis mysql
rm -rf /usr/lib64/mysql /etc/mysql /usr/local/mysql /usr/my.cnf
reboot
B安装
# (在.rpm文件下载目录执行)
yum install MySQL-server-5.6.25-1.el7.x86_64.rpm
yum install MySQL-client-5.6.25-1.el7.x86_64.rpm
service mysql start
# 启动成功,生成文件: .pid, .sock, .err (出错可查看err文件)
ls /var/lib/mysql
C安装
yum install MySQL-server-5.6.25-1.el7.x86_64.rpm
yum install MySQL-client-5.6.25-1.el7.x86_64.rpm
service mysql start
ls /var/lib/mysql
B+复制配置
mysql --help | grep my.cnf
mkdir -p /etc/mysql/
# 'server-id' 是节点标志,与C 的不同
vi /etc/mysql/my.cnf
[mysqld]
server-id=1
log-bin=/usr/local/mysql/log/bin-log
relay-log=/usr/local/mysql/log/relay-log
log-slave-updates=ON
replicate-wild-ignore-table=mysql.%
replicate-wild-ignore-table=information_schema.%
replicate-wild-ignore-table=performance_schema.%
# 自增主键:1,3,5,7,... - 与C错开以避免复制冲突
auto-increment-increment=2
auto-increment-offset=1
mkdir -p /usr/local/mysql/log
chown -R mysql:mysql /usr/local/mysql/log
service mysql restart
# 查看生成了文件:bin-log.000001 bin-log.index
ls /usr/local/mysql/log
C+复制配置
mysql --help | grep my.cnf
mkdir -p /etc/mysql/
vi /etc/mysql/my.cnf
[mysqld]
server-id=2
log-bin=/usr/local/mysql/log/bin-log
relay-log=/usr/local/mysql/log/relay-log
log-slave-updates=ON
replicate-wild-ignore-table=mysql.%
replicate-wild-ignore-table=information_schema.%
replicate-wild-ignore-table=performance_schema.%
# 自增主键:2,4,6,8,...
auto-increment-increment=2
auto-increment-offset=2
mkdir -p /usr/local/mysql/log
chown -R mysql:mysql /usr/local/mysql/log
service mysql restart
ls /usr/local/mysql/log
B安全设置
cat ~/.mysql_secret ## vW4vUtsf23zxsuw8
# 修改root密码;Disallow root login remotely - n;Remove test database - n
mysql_secure_installation
mysql -uroot -p
use mysql;
select host,user from user;
# 允许root远程登入。有一条记录:%--root
update user set host='%' where host='localhost' and user='root';
flush privileges;
# 创建复制用户,允许从127=C 访问
grant replication slave on *.* to 'repl_user'@'172.16.45.127' identified by 'repl_passwd';
select host,user from user;
flush privileges;
# 记住输出:bin-log.000001 2072
show master status;
C安全设置
cat ~/.mysql_secret ## B7vsNQLY7jieRe40
mysql_secure_installation
mysql -uroot -p
use mysql;
select host,user from user;
update user set host='%' where host='localhost' and user='root';
flush privileges;
# 126=B
grant replication slave on *.* to 'repl_user'@'172.16.45.126' identified by 'repl_passwd';
select host,user from user;
flush privileges;
# bin-log.000001 2157
show master status;
BC互联验证
#(在B用'repl_user'连C;在C用'repl_user'连B)
B开启同步
mysql
# 指向C
CHANGE MASTER TO MASTER_HOST='172.16.45.127', MASTER_USER='repl_user', MASTER_PASSWORD='repl_passwd', MASTER_LOG_FILE='bin-log.000001', MASTER_LOG_POS=2157;
start slave;
# Slave_IO_Running|Slave_SQL_Running 都是Yes 则开启成功
show slave status\G;
C开启同步
mysql
# 指向B
CHANGE MASTER TO MASTER_HOST='172.16.45.126', MASTER_USER='repl_user', MASTER_PASSWORD='repl_passwd', MASTER_LOG_FILE='bin-log.000001', MASTER_LOG_POS=2072;
start slave;
show slave status\G;
BC测试
B-mysql
use test;
create table t1 (id int auto_increment primary key, name text);
insert into t1 values (null, 'a1'), (null, 'a2'), (null, 'a3');
C-mysql
use test;
# 查到:1-a1,3-a2,5-a3
select * from t1;
insert into t1 values (null, 'b1'), (null, 'b2'), (null, 'b3');
B-mysql
# 查到多了:6-b1,8-b2,10-b3 (注意B、C插入的id是错开的)
select * from t1;
3. keepalived
# ip_vs模块
lsmod | grep ip_vs
yum install ipvsadm
modprobe ip_vs
lsmod | grep ip_vs
# 开启路由转发
vi /etc/sysctl.conf
net.ipv4.ip_forward=1
sysctl -p
# 关闭SElinux
vi /etc/selinux/config
SELINUX=disabled
setenforce 0
getenforce
# 防火墙要关闭
systemctl status firewalld
# 安装keepalived (1.3.5)
yum install keepalived
service keepalived start
# 配置 /etc/keepalived/keepalived.conf
# B、C机器注意主从不同的配置项
--------------------------------------------------------------------------
! Configuration File for keepalived
global_defs {
notification_email {
# [email protected]
}
# notification_email_from root@localhost
# smtp_server 127.0.0.1
# smtp_connect_timeout 30
# 服务标志
router_id mysql_msun
}
vrrp_script check_running {
# 健康检查脚本路径
script "/etc/keepalived/check_slave.sh"
interval 2
}
vrrp_instance mysql_msun {
# #两台都是BACKUP(只有MASTER和BACKUP两种,必须大写)
state BACKUP
# 网卡,通过ifconfig命令查看
interface enp0s3
# 虚拟路由标识,同一个vrrp实例使用唯一的标识。同一个vrrp_instance下,MASTER和BACKUP必须一致
virtual_router_id 68
# 另一台从服务器改为90。数字越大,优先级越高。在一个vrrp_instance下,MASTER的优先级必须大于BACKUP的优先级
priority 100
# MASTER与BACKUP负载均衡器之间同步检查的时间间隔,单位是秒
advert_int 1
# 不抢占,只在优先级高的机器上设置即可,优先级低的机器不设置,另一台需要删除此项
nopreempt
# 认证信息
authentication {
auth_type PASS
auth_pass 123456
}
track_script {
check_running
}
virtual_ipaddress {
# mysql的对外服务IP,即VIP
172.16.45.100
}
}
--------------------------------------------------------------------------
# 健康检查脚本 /etc/keepalived/check_slave.sh
# (https://zhuanlan.zhihu.com/p/83046628)
--------------------------------------------------------------------------
#!/bin/bash
# This script checks Mysql Slave status
Mysqlbin=/usr/bin/mysql
# 数据库账号密码
user='root'
pw='1'
port=3306
host=127.0.0.1
#最大延时
sbmax=120
# Check for $Mysqlbin
if [ ! -f $Mysqlbin ];then
echo 'Mysqlbin not found, check the variable Mysqlbin'
exit 99
fi
# Get Mysql Slave Status
IOThread=`$Mysqlbin -h $host -P $port -u$user -p$pw -e 'show slave status\G' 2>/dev/null|grep 'Slave_IO_Running:'|awk '{print $NF}'`
SQLThread=`$Mysqlbin -h $host -P $port -u$user -p$pw -e 'show slave status\G' 2>/dev/null|grep 'Slave_SQL_Running:'|awk '{print $NF}'`
SBM=`$Mysqlbin -h $host -P $port -u$user -p$pw -e 'show slave status\G' 2>/dev/null|grep 'Seconds_Behind_Master:'|awk '{print $NF}'`
# Check if the mysql run
if [[ -z "$IOThread" ]];then
exit 1
fi
# Check if the thread run
if [[ "$IOThread" == "No" || "$SQLThread" == "No" ]];then
exit 1
elif [[ $SBM -ge $sbmax ]];then
exit 1
else
exit 0
fi
--------------------------------------------------------------------------
chmod +x /etc/keepalived/check_slave.sh
service keepalived restart
# 试验:
ip addr 查看vip在哪台机器;
mysql -h172.16.45.100 -uroot -p
停止主机器的mysql服务,查看vip漂移到了从机器。
4. 讨论
4.1 阿里云不支持keepalived
出于安全考虑(如ARP欺骗):
https://mp.weixin.qq.com/s?src=11×tamp=1571897162&ver=1931&signature=G1XKflcROhYnI*DE8qFEapu7D9kO31TkS3pcDDoyXJSkgFto6ASyvT0hOMl79LdL5gTAlQXW2JYCAkNeZP2PtjlrtJtKe8IWresWes6HRQuniPbvpof9yz0vLQdfEywv&new=1
HAVIP似乎没有了,厂商建议SLB:
https://www.aliyun.com/ss/?k=havip
https://yq.aliyun.com/articles/24155?spm=5176.10695662.1996646101.searchclickresult.6d242bb5Bnf3Fa&aly_as=9BYdZ06S
测试阶段可以改用一台nginx代理多台mysql - 不考虑nginx的单点问题。
4.2 mysql复制的可靠性
在网络慢时同步延迟有几秒。
还没同步完就切换机器,好像发生过数据错乱 (丢失/留下部分旧的?),待验证 (用nginx方式)。
vip漂移后,mysql命令行可能会卡死,需要手工重启。
经试验,Java Druid 连接池在漂移期间更新/查询会报错,漂移完成后恢复正常。
4.2.1 jdbc报错“Communications link failure”
本地B,C + nginx 测试:
默认请求C,断开C时:
com.mysql.jdbc.exceptions.jdbc4.MySQLQueryInterruptedException: Query execution was interrupted
较快就切到B,不再报错。 B->C 类似。
本地B,C + keepalived 测试:
初始vip=B,断开B时:
先报 Query execution was interrupted
再报 Communications link failure ... The last packet successfully received from the server was 152,580 milliseconds ago. The last packet sent successfully to the server was 904 milliseconds ago. ==> last packet sent有0的,应该是last packet received 过大导致. 但断开到报错只有1秒
恢复B,断开C时:
先报 Query execution was interrupted
再报 create connection error ... Communications link failure ... The last packet sent successfully to the server was 0 milliseconds ago. The driver has not received any packets from the server ... Caused by: ... Connection refused
在本地虚机用nginx、keepalived两种方式都试了,只有切vip时会报Communications link failure。
初步猜测是:
last packet sent 一般较小,是在一个连接上发送数据。
last packet received 较大,是说发送后没有收到响应,而该连接上最近成功接收响应是last packet received 之前,报错原因是接收响应超时。注意也有可能这个连接是首次使用,从来没有接到过响应,此时last packet received = 创建连接的时间。
所以nginx报错的情况 估计是网络慢导致,nginx到B,C 都没出问题没有发生切换。
建议解决方法:在Druid连接池配置加上validation和闲时检测,自动丢弃问题连接。
4.3 mysql 保活
# crontab -e
* * * * * /usr/sbin/service mysql start;sleep 10;/usr/sbin/service mysql start;sleep 10;/usr/sbin/service mysql start;sleep 10;/usr/sbin/service mysql start;sleep 10;/usr/sbin/service mysql start;sleep 10;/usr/sbin/service mysql start;sleep 10
# 日志:tail -f /var/log/cron
4.4 冲突的mariadb libs
# 先卸载maria再装mysql
yum list installed | grep mariadb
yum erase mariadb-libs
# keepalived->依赖net-snmp->依赖mariadb-libs 与已安装的mysql冲突
rpm -ivh net-snmp-libs-5.7.2-43.el7.x86_64.rpm net-snmp-agent-libs-5.7.2-43.el7.x86_64.rpm --nodeps --force
yum install keepalived
5. 补充
5.1 my.cnf
在主库勿使用以下选项
#binlog-ignore-db = mysql
#binlog-ignore-db = test
#binlog-ignore-db = information_schema
#binlog-ignore-db = performance_schema
在从库勿使用以下选项
#binlog-do-db
#binlog-ignore-db
5.2 可以在/etc/hosts 配置B、C机器的域名,简化测试配置。