MHA + keepalived mysql高可用方案部署

目录

    原理

    安装mysql主从

    安装mha

    测试切换

    恢复故障

    增加keepalived,避免跨网段无法访问虚地址


原理

        master出现故障时,可以自动把最新数据的slave提升为新的master,把所有其他的slave重新指向新的master。切换过程对应用程序透明。迁移时间20-30秒左右。

        自动切换过程中,mha尝试从原master保存binlog,最大程度保证数据不丢失;但如果服务器宕机(无法ssh连接),mha无法保存二进制日志,只能直接进行故障转移,会丢失少量最新数据。
        使用mysql5.5版半同步机制可以大大降低数据丢失风险。如果只有一个slave收到最新的binlog,会同步数据到所有slave,保证所有节点的数据一致性。


部署信息

    主 10.78.72.73 centos6.5
    从1(备主) 10.78.72.74 centos6.5
    从2 10.78.72.75 centos6.5
    管理节点 10.78.72.76 centos7
    VIP 10.78.72.77


安装mysql主从

# 1主2从 安装mysql(版本:Ver 14.14 Distrib 5.1.71, for redhat-linux-gnu (x86_64) using readline 5.1)
[root@mysql01 ~]# yum install mysql mysql-server mysql-libs mysql-devel -y

# 配置文件

# 另外两个将server-id修改为不同值

[root@mysql01 ~]# cat /etc/my.cnf  | grep -v '^$' | grep -v '^#'
[client]
port            = 3306
socket          = /var/lib/mysql/mysql.sock
[mysqld]
port            = 3306
socket          = /var/lib/mysql/mysql.sock
skip-external-locking
key_buffer_size = 256M
max_allowed_packet = 1M
table_open_cache = 256
sort_buffer_size = 1M
read_buffer_size = 1M
read_rnd_buffer_size = 4M
myisam_sort_buffer_size = 64M
thread_cache_size = 8
query_cache_size= 16M
thread_concurrency = 8
wait_timeout = 10
max_connections = 1000
log-bin=mysql-bin
relay-log=relay-bin
relay-log-index=relay-bin-index
binlog-ignore-db=mysql
replicate-ignore-db=mysql
lower_case_table_names = 1
skip-name-resolve
binlog_format=mixed
log-slave-updates
relay_log_purge=0
read_only=1
server-id       = 1
[mysqldump]
quick
max_allowed_packet = 16M
[mysql]
no-auto-rehash
[myisamchk]
key_buffer_size = 128M
sort_buffer_size = 128M
read_buffer = 2M
write_buffer = 2M
[mysqlhotcopy]
interactive-timeout

# 启动mysql
# /etc/init.d/mysqld start

# 设置slave01、slave02主从同步
# 在master上创建复制用户(允许从使用repl登录)
grant replication slave, file, select on *.* to 'repl'@'10.78.72.74' identified by 'xxxxx';
grant replication slave, file, select on *.* to 'repl'@'10.78.72.75' identified by 'xxxxx';
flush privileges;

# 在备master上创建复制用户(允许从使用repl登录) (不要忘记,否则切换到这个的时候没法同步)
grant replication slave, file, select on *.* to 'repl'@'10.78.72.73' identified by 'xxxxx';
grant replication slave, file, select on *.* to 'repl'@'10.78.72.75' identified by 'xxxxx';
flush privileges;

# 在master上查询
mysql> show master status;
+------------------+----------+--------------+------------------+
| File | Position | Binlog_Do_DB | Binlog_Ignore_DB |
+------------------+----------+--------------+------------------+
| mysql-bin.000003 | 663 | | mysql |
+------------------+----------+--------------+------------------+
1 row in set (0.00 sec)

# 在slave01上同步
mysql> STOP SLAVE;
mysql> RESET SLAVE;
mysql> CHANGE MASTER TO MASTER_HOST='10.78.72.73',
MASTER_USER='repl',
MASTER_PASSWORD='xxxxx',
MASTER_LOG_FILE='mysql-bin.000003',
MASTER_LOG_POS=663;
mysql> START SLAVE;
mysql> SHOW SLAVE STATUS \G

# 在slave02上同步
mysql> STOP SLAVE;
mysql> RESET SLAVE;
mysql> CHANGE MASTER TO MASTER_HOST='10.78.72.73',
MASTER_USER='repl',
MASTER_PASSWORD='xxxxx',
MASTER_LOG_FILE='mysql-bin.000003',
MASTER_LOG_POS=663;
mysql> START SLAVE;
mysql> SHOW SLAVE STATUS \G

安装mha

设置免密码登录
# 在master、slave、manager上做免密登录
# master
ssh-keygen -t rsa
一直回车
# 依次把公钥追加到其余各节点(从1、从2、管理节点),IP请按实际环境核对
ssh root@10.78.72.74 "cat >> ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys" < ~/.ssh/id_rsa.pub
ssh root@10.78.72.75 "cat >> ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys" < ~/.ssh/id_rsa.pub
ssh root@10.78.72.76 "cat >> ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys" < ~/.ssh/id_rsa.pub

# slave
...

# manager
...

创建mha复制账号
# 在master、slave01、slave02上创建mha复制账号(mha_rep)
grant all privileges on *.* to 'mha_rep'@'10.78.72.73' identified by 'xxxxx';
grant all privileges on *.* to 'mha_rep'@'10.78.72.74' identified by 'xxxxx';
grant all privileges on *.* to 'mha_rep'@'10.78.72.75' identified by 'xxxxx';
grant all privileges on *.* to 'mha_rep'@'10.78.72.76' identified by 'xxxxx';
flush privileges;

安装包
# 在master、slave01、slave02、manager上安装mha4mysql-node包
# yum install perl-DBD-MySQL -y
# rpm -ivh mha4mysql-node-0.56-0.el6.noarch.rpm
Preparing... ########################################### [100%]
1:mha4mysql-node ########################################### [100%]

# 用于识别差异日志并应用于其他slave
# cp /usr/bin/apply_diff_relay_logs /usr/local/bin/

# 用于保存和复制二进制日志
# cp /usr/bin/save_binary_logs /usr/local/bin/

# cp /usr/bin/filter_mysqlbinlog /usr/local/bin/

# 用于清除中继日志
# cp /usr/bin/purge_relay_logs /usr/local/bin/

# 在管理节点(manager01)上安装manager
# yum install epel-release -y && yum clean all && yum makecache
# yum install perl-DBD-MySQL perl-Config-Tiny perl-Log-Dispatch perl-Parallel-ForkManager perl-Time-HiRes -y
# tar -xf mha4mysql-manager-0.56.tar.gz && cd mha4mysql-manager-0.56
[root@manager01 mha4mysql-manager-0.56]# yum install perl* -y && perl Makefile.PL && make && make install

# 复制脚本
[root@manager01]# cp /usr/bin/masterha_* /usr/local/bin/

# 查看masterha脚本
# ll /usr/local/bin/
total 84
-rwxr-xr-x 1 root root 16367 May 7 11:27 apply_diff_relay_logs
-rwxr-xr-x 1 root root 4807 May 7 11:27 filter_mysqlbinlog
-r-xr-xr-x 1 root root 1995 May 7 11:41 masterha_check_repl
-r-xr-xr-x 1 root root 1779 May 7 11:41 masterha_check_ssh
-r-xr-xr-x 1 root root 1865 May 7 11:41 masterha_check_status
-r-xr-xr-x 1 root root 3201 May 7 11:41 masterha_conf_host
-r-xr-xr-x 1 root root 2517 May 7 11:41 masterha_manager
-r-xr-xr-x 1 root root 2165 May 7 11:41 masterha_master_monitor
-r-xr-xr-x 1 root root 2373 May 7 11:41 masterha_master_switch
-r-xr-xr-x 1 root root 5171 May 7 11:41 masterha_secondary_check
-r-xr-xr-x 1 root root 1739 May 7 11:41 masterha_stop
-rwxr-xr-x 1 root root 8261 May 7 11:27 purge_relay_logs
-rwxr-xr-x 1 root root 7525 May 7 11:27 save_binary_logs

配置manager
# mkdir -p /etc/mha/ && cp samples/conf/app1.cnf /etc/mha/
# mkdir -p /var/log/masterha/app1
# mkdir -p /etc/mha/scripts

# 配置文件
# cat /etc/mha/app1.cnf
[server default]
manager_workdir=/var/log/masterha/app1
manager_log=/var/log/masterha/app1/manager.log
user=mha_rep # MHA管理mysql的用户名
password=xxxxx                         # MHA管理mysql的密码
ssh_user=root # 免密码登录的用户名
repl_user=repl # 主从复制账号名
repl_password=xxxxx                         # 主从复制账号密码
ping_interval=1 # 用来检查master是否正常

[server1]
hostname=10.78.72.73
candidate_master=1 # master宕机后,优先启用这台作为master

[server2]
hostname=10.78.72.74
candidate_master=1 # master宕机后,优先启用这台作为master

[server3]
hostname=10.78.72.75
no_master=1 # 使服务器不能成为master


# 测试ssh互信
[deploy@manager01 ~]$ masterha_check_ssh --conf=/etc/mha/app1.cnf
Mon May 7 13:53:40 2018 - [warning] Global configuration file /etc/masterha_default.cnf not found. Skipping.
Mon May 7 13:53:40 2018 - [info] Reading application default configuration from /etc/mha/app1.cnf..
Mon May 7 13:53:40 2018 - [info] Reading server configuration from /etc/mha/app1.cnf..
Mon May 7 13:53:40 2018 - [info] Starting SSH connection tests..
Mon May 7 13:53:41 2018 - [debug]
Mon May 7 13:53:40 2018 - [debug] Connecting via SSH from root@10.78.72.73(10.78.72.73:22) to root@10.78.72.74(10.78.72.74:22)..
Mon May 7 13:53:40 2018 - [debug] ok.
Mon May 7 13:53:40 2018 - [debug] Connecting via SSH from root@10.78.72.73(10.78.72.73:22) to root@10.78.72.75(10.78.72.75:22)..
Mon May 7 13:53:40 2018 - [debug] ok.
Mon May 7 13:53:41 2018 - [debug]
Mon May 7 13:53:41 2018 - [debug] Connecting via SSH from root@10.78.72.74(10.78.72.74:22) to root@10.78.72.73(10.78.72.73:22)..
Mon May 7 13:53:41 2018 - [debug] ok.
Mon May 7 13:53:41 2018 - [debug] Connecting via SSH from root@10.78.72.74(10.78.72.74:22) to root@10.78.72.75(10.78.72.75:22)..
Mon May 7 13:53:41 2018 - [debug] ok.
Mon May 7 13:53:42 2018 - [debug]
Mon May 7 13:53:41 2018 - [debug] Connecting via SSH from root@10.78.72.75(10.78.72.75:22) to root@10.78.72.73(10.78.72.73:22)..
Mon May 7 13:53:41 2018 - [debug] ok.
Mon May 7 13:53:41 2018 - [debug] Connecting via SSH from root@10.78.72.75(10.78.72.75:22) to root@10.78.72.74(10.78.72.74:22)..
Mon May 7 13:53:41 2018 - [debug] ok.
Mon May 7 13:53:42 2018 - [info] All SSH connection tests passed successfully.

# 测试mysql主从复制是否成功
[deploy@manager01 ~]$ masterha_check_repl --conf=/etc/mha/app1.cnf
Mon May 7 14:12:13 2018 - [warning] Global configuration file /etc/masterha_default.cnf not found. Skipping.
Mon May 7 14:12:13 2018 - [info] Reading application default configuration from /etc/mha/app1.cnf..
Mon May 7 14:12:13 2018 - [info] Reading server configuration from /etc/mha/app1.cnf..
Mon May 7 14:12:13 2018 - [info] MHA::MasterMonitor version 0.56.
Mon May 7 14:12:14 2018 - [info] GTID failover mode = 0
Mon May 7 14:12:14 2018 - [info] Dead Servers:
Mon May 7 14:12:14 2018 - [info] Alive Servers:
Mon May 7 14:12:14 2018 - [info] 10.78.72.73(10.78.72.73:3306)
Mon May 7 14:12:14 2018 - [info] 10.78.72.74(10.78.72.74:3306)
Mon May 7 14:12:14 2018 - [info] 10.78.72.75(10.78.72.75:3306)
Mon May 7 14:12:14 2018 - [info] Alive Slaves:
Mon May 7 14:12:14 2018 - [info] 10.78.72.74(10.78.72.74:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Mon May 7 14:12:14 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Mon May 7 14:12:14 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Mon May 7 14:12:14 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Mon May 7 14:12:14 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Mon May 7 14:12:14 2018 - [info] Not candidate for the new Master (no_master is set)
Mon May 7 14:12:14 2018 - [info] Current Alive Master: 10.78.72.73(10.78.72.73:3306)
Mon May 7 14:12:14 2018 - [info] Checking slave configurations..
Mon May 7 14:12:15 2018 - [info] Checking replication filtering settings..
Mon May 7 14:12:15 2018 - [info] binlog_do_db= , binlog_ignore_db= mysql
Mon May 7 14:12:15 2018 - [info] Replication filtering check ok.
Mon May 7 14:12:15 2018 - [info] GTID (with auto-pos) is not supported
Mon May 7 14:12:15 2018 - [info] Starting SSH connection tests..
Mon May 7 14:12:16 2018 - [info] All SSH connection tests passed successfully.
Mon May 7 14:12:16 2018 - [info] Checking MHA Node version..
Mon May 7 14:12:16 2018 - [info] Version check ok.
Mon May 7 14:12:16 2018 - [info] Checking SSH publickey authentication settings on the current master..
Mon May 7 14:12:16 2018 - [info] HealthCheck: SSH to 10.78.72.73 is reachable.
Mon May 7 14:12:17 2018 - [info] Master MHA Node version is 0.56.
Mon May 7 14:12:17 2018 - [info] Checking recovery script configurations on 10.78.72.73(10.78.72.73:3306)..
Mon May 7 14:12:17 2018 - [info] Executing command: save_binary_logs --command=test --start_pos=4 --binlog_dir=/var/lib/mysql,/var/log/mysql --output_file=/var/tmp/save_binary_logs_test --manager_version=0.56 --start_file=mysql-bin.000003
Mon May 7 14:12:17 2018 - [info] Connecting to root@10.78.72.73(10.78.72.73:22)..
Creating /var/tmp if not exists.. ok.
Checking output directory is accessible or not..
ok.
Binlog found at /var/lib/mysql, up to mysql-bin.000003
Mon May 7 14:12:17 2018 - [info] Binlog setting check done.
Mon May 7 14:12:17 2018 - [info] Checking SSH publickey authentication and checking recovery script configurations on all alive slave servers..
Mon May 7 14:12:17 2018 - [info] Executing command : apply_diff_relay_logs --command=test --slave_user='mha_rep' --slave_host=10.78.72.74 --slave_ip=10.78.72.74 --slave_port=3306 --workdir=/var/tmp --target_version=5.1.71-log --manager_version=0.56 --relay_log_info=/var/lib/mysql/relay-log.info --relay_dir=/var/lib/mysql/ --slave_pass=xxx
Mon May 7 14:12:17 2018 - [info] Connecting to root@10.78.72.74(10.78.72.74:22)..
Checking slave recovery environment settings..
Opening /var/lib/mysql/relay-log.info ... ok.
Relay log found at /var/lib/mysql, up to relay-bin.000004
Temporary relay log file is /var/lib/mysql/relay-bin.000004
Testing mysql connection and privileges.. done.
Testing mysqlbinlog output.. done.
Cleaning up test file(s).. done.
Mon May 7 14:12:17 2018 - [info] Executing command : apply_diff_relay_logs --command=test --slave_user='mha_rep' --slave_host=10.78.72.75 --slave_ip=10.78.72.75 --slave_port=3306 --workdir=/var/tmp --target_version=5.1.71-log --manager_version=0.56 --relay_log_info=/var/lib/mysql/relay-log.info --relay_dir=/var/lib/mysql/ --slave_pass=xxx
Mon May 7 14:12:17 2018 - [info] Connecting to root@10.78.72.75(10.78.72.75:22)..
Checking slave recovery environment settings..
Opening /var/lib/mysql/relay-log.info ... ok.
Relay log found at /var/lib/mysql, up to relay-bin.000004
Temporary relay log file is /var/lib/mysql/relay-bin.000004
Testing mysql connection and privileges.. done.
Testing mysqlbinlog output.. done.
Cleaning up test file(s).. done.
Mon May 7 14:12:17 2018 - [info] Slaves settings check done.
Mon May 7 14:12:17 2018 - [info]
10.78.72.73(10.78.72.73:3306) (current master)
+--10.78.72.74(10.78.72.74:3306)
+--10.78.72.75(10.78.72.75:3306)

Mon May 7 14:12:17 2018 - [info] Checking replication health on 10.78.72.74..
Mon May 7 14:12:17 2018 - [info] ok.
Mon May 7 14:12:17 2018 - [info] Checking replication health on 10.78.72.75..
Mon May 7 14:12:17 2018 - [info] ok.
Mon May 7 14:12:17 2018 - [warning] master_ip_failover_script is not defined.
Mon May 7 14:12:17 2018 - [warning] shutdown_script is not defined.
Mon May 7 14:12:17 2018 - [info] Got exit code 0 (Not master dead).

MySQL Replication Health is OK.
[deploy@manager01 ~]$

# 注意事项
log-slave-updates // 只有从库开启log_slave_updates时,从库binlog才会记录主库同步的操作日志。
// relay_log_purge默认为1:mysql主从默认从库的relay logs会在SQL线程执行完毕后被自动删除。但在MHA场景下,某些滞后从库的恢复依赖于其他从库的relay log,因此需要禁用自动删除功能(relay_log_purge=0),改为定期清理。清理过多过大的relay log时需要注意由此引起的复制延迟和资源开销。MHA通过purge_relay_logs脚本配合cronjob完成。
// https://blog.csdn.net/leshami/article/details/45688503
relay_log_purge=0
read_only=1 // 防止误写,在mha将slave切换为master时,会自动设置read_only=0


所有slave设置purge
# mysql数据库主从复制在默认情况下从库的relay logs会在SQL线程执行完毕后被自动删除
# 但是在MHA场景下,某些滞后从库的恢复依赖其他从库的relay log,因此需要禁用自动删除功能,改为定期清理。
# 对于过多过大的relay log需要注意引起的复制延迟资源开销等。
# MHA通过purge_relay_logs脚本配合crontab完成

mkdir -p /var/log/masterha/
crontab -e
# 每天凌晨5点调用purge_relay_logs
0 5 * * * /usr/bin/purge_relay_logs --user=root --disable_relay_log_purge >> /var/log/masterha/purge_relay_logs.log 2>&1

测试切换

增加VIP

mha检测failover的源码文件
/usr/local/share/perl5/MHA/MasterFailover.pm
333行,关注发送stopssh命令,和发送stop命令的命令行参数
在master_ip_failover脚本中处理逻辑不同,
在发送stopssh命令时,原master ssh是通的(手动关闭mysql,但是ssh服务可连)
发送stop命令时,检测原master ssh不通,所以不需要执行ssh命令删除VIP(服务器重启的情况)

脚本:
[root@manager01 app1]# cat /etc/mha/scripts/master_ip_failover
#!/usr/bin/env perl
# master_ip_failover: moves a virtual IP (VIP) between MySQL masters by
# running ifconfig on the affected host over ssh.
use strict;
use warnings FATAL => 'all';
use Getopt::Long;

# Command-line values; each one is filled in by GetOptions below and stays
# undefined when its option is not given.
my $command;
my $ssh_user;
my ( $orig_master_host, $orig_master_ip, $orig_master_port );
my ( $new_master_host,  $new_master_ip,  $new_master_port );

my $vip = '10.78.72.77';    # VIP address
my $key = '1';              # alias index: the VIP is configured on eth0:$key

# Remote commands that bind the VIP to, or release it from, the chosen
# network interface.
my $ssh_start_vip = "/sbin/ifconfig eth0:$key $vip netmask 255.255.255.0";
my $ssh_stop_vip  = "/sbin/ifconfig eth0:$key down";

# Option name/type => destination variable.
my @option_spec = (
    'command=s'          => \$command,
    'ssh_user=s'         => \$ssh_user,
    'orig_master_host=s' => \$orig_master_host,
    'orig_master_ip=s'   => \$orig_master_ip,
    'orig_master_port=i' => \$orig_master_port,
    'new_master_host=s'  => \$new_master_host,
    'new_master_ip=s'    => \$new_master_ip,
    'new_master_port=i'  => \$new_master_port,
);
GetOptions(@option_spec);

exit &main();
# Entry point: dispatches on --command and terminates the process with the
# exit status the caller sees.
#   stop / stopssh : announces the VIP release on the old master; the ssh
#                    call is made for stopssh only. Exits 0 on success,
#                    1 when an error is raised.
#   start          : binds the VIP on the new master. Exits 0 on success,
#                    10 when an error is raised.
#   status         : prints an OK line and exits 0.
#   anything else  : prints the usage text and exits 1; this includes a
#                    missing --command.
sub main {
    # Under "use warnings FATAL => 'all'" comparing an undefined $command
    # with eq is a fatal error, so a missing --command has to be handled
    # before the first comparison or the usage text is never reached.
    if ( !defined $command ) {
        &usage();
        exit 1;
    }

    print "\n\nIN SCRIPT TEST====$ssh_stop_vip==$ssh_start_vip===\n\n";

    # The helper subs are called with a leading & on purpose: they are
    # defined further down with an empty prototype, and a plain call placed
    # before such a definition raises a "called too early to check
    # prototype" warning, which is fatal in this script.
    if ( $command eq "stop" || $command eq "stopssh" ) {
        my $exit_code = 1;
        eval {
            print "Disabling the VIP on old master: $orig_master_host \n";

            # Only stopssh releases the VIP over ssh; for plain stop no ssh
            # call is made.
            &stop_vip() if $command eq "stopssh";
            $exit_code = 0;
        };
        warn "Got Error: $@\n" if $@;
        exit $exit_code;
    }
    elsif ( $command eq "start" ) {
        my $exit_code = 10;
        eval {
            print "Enabling the VIP - $vip on the new master - $new_master_host \n";
            &start_vip();
            $exit_code = 0;
        };
        warn $@ if $@;
        exit $exit_code;
    }
    elsif ( $command eq "status" ) {
        print "Checking the Status of the script.. OK \n";
        exit 0;
    }
    else {
        &usage();
        exit 1;
    }
}
# Binds the VIP on the new master by running $ssh_start_vip there over ssh.
# Dies when ssh cannot be started or when it returns a non-zero status, so
# that an eval around the call can turn the failure into a non-zero exit
# status instead of reporting success.
sub start_vip() {
    # List-form system() runs ssh directly, without a local shell, and
    # returns the wait status that backticks would have thrown away.
    my $status = system( 'ssh', "$ssh_user\@$new_master_host", $ssh_start_vip );
    if ( $status == -1 ) {
        die "Could not run ssh to enable the VIP on $new_master_host: $!\n";
    }
    if ( $status != 0 ) {
        die "Enabling the VIP on $new_master_host failed, ssh exit status "
            . ( $status >> 8 ) . "\n";
    }
    return;
}
# Releases the VIP on the old master by running $ssh_stop_vip there over ssh.
# A failure is reported on STDERR but is not raised as an error: the release
# stays best-effort, as it was when the ssh result was ignored entirely.
sub stop_vip() {
    # List-form system() runs ssh directly, without a local shell, and
    # returns the wait status that backticks would have thrown away.
    my $status = system( 'ssh', "$ssh_user\@$orig_master_host", $ssh_stop_vip );
    if ( $status == -1 ) {
        warn "Could not run ssh to disable the VIP on $orig_master_host: $!\n";
    }
    elsif ( $status != 0 ) {
        warn "Disabling the VIP on $orig_master_host failed, ssh exit status "
            . ( $status >> 8 ) . "\n";
    }
    return;
}
# Prints the one-line command synopsis to standard output.
sub usage {
    my $synopsis =
"Usage: master_ip_failover --command=start|stop|stopssh|status --orig_master_host=host --orig_master_ip=ip --orig_master_port=port --new_master_host=host --new_master_ip=ip --new_master_port=port\n";
    print $synopsis;
}
[root@manager01 app1]#

配置文件增加脚本配置:
[root@manager01 app1]# cat /etc/mha/app1.cnf
[server default]
manager_workdir=/var/log/masterha/app1
manager_log=/var/log/masterha/app1/manager.log
master_ip_failover_script=/etc/mha/scripts/master_ip_failover
user=mha_rep
password=xxxxx
ssh_user=root
repl_user=repl
repl_password=xxxxx
ping_interval=1

[server1]
hostname=10.78.72.73
candidate_master=1

[server2]
hostname=10.78.72.74
candidate_master=1

[server3]
hostname=10.78.72.75
no_master=1

增加VIP
# 在master上添加vip 10.78.72.77,注意将命令行中的new_master_host替换为实际master地址
[root@manager01 app1]# /etc/mha/scripts/master_ip_failover --command=start --ssh_user=root --new_master_host=10.78.72.73


确认删除failover文件
[root@manager01 ~]# ls /var/log/masterha/app1/
[root@manager01 ~]# rm /var/log/masterha/app1/app1.failover.error -f


批量写入数据库测试
创建测试账号、测试库
grant all privileges on *.* to 'test'@'%' identified by 'test';
flush privileges;
# 注意上面两句需要在 master 和 slave-master(备主)上都执行,否则切换后无法写入slave-master
CREATE DATABASE fortest;
USE fortest;
CREATE TABLE number ( num int NOT NULL, PRIMARY KEY (num) );

[root@manager01 ~]# yum install php php-mysql -y
[root@manager01 ~]# cat write.php

<?php
// Write-load generator for the failover test: connects to MySQL through the
// VIP and inserts consecutive integers into fortest.number forever. When an
// insert fails it prints the error and keeps reconnecting once per second
// until the server answers again, so the timestamps in the output show how
// long writes were interrupted.

date_default_timezone_set('PRC');

// Remember to change the VIP to match the environment.
define('DB_HOST', '10.78.72.77');
define('DB_USER', 'test');
define('DB_PASS', 'test');
define('DB_NAME', 'fortest');

// Report why the first connection failed instead of exiting silently.
$dbc = mysqli_connect(DB_HOST, DB_USER, DB_PASS, DB_NAME)
    or die("connect failed: " . mysqli_connect_error() . "\n");
mysqli_query($dbc, "SET NAMES 'UTF8'");

// Fetch the row with the largest num inserted so far.
$query = "select * from number order by num desc limit 1";
$datas = mysqli_query($dbc, $query) or die(mysqli_error($dbc));
$task = mysqli_fetch_array($datas);

// var_dump() prints by itself and returns nothing, so no echo is needed.
var_dump($task);

// Continue after the largest existing value; an empty table starts at 1.
$num = $task ? $task["num"] + 1 : 1;

echo "num now:" . $num . "\n";

while (1) {
    $query = "INSERT INTO number (num) VALUES (" . $num . ")";
    if (mysqli_query($dbc, $query)) {
        echo date('Y-m-d H:i:s') . " : " . $query . "\n";
        $num++;
    } else {
        echo mysqli_error($dbc) . "\n";

        // Drop the broken connection before opening a new one, so failed
        // handles do not pile up over a long test run.
        mysqli_close($dbc);

        // Retry once per second until a connection can be established.
        while (!($dbc = mysqli_connect(DB_HOST, DB_USER, DB_PASS, DB_NAME))) {
            echo date('Y-m-d H:i:s') . " : sleep 1s" . "\n";
            sleep(1);
        }
        mysqli_query($dbc, "SET NAMES 'UTF8'");
        sleep(1);
    }
}

?>
[root@manager01 ~]# php write.php

启动masterha_manager
[root@manager01 ~]# nohup masterha_manager --conf=/etc/mha/app1.cnf 2>&1 &

[root@manager01 ~]# cat /var/log/masterha/app1/manager.log
Thu May 10 13:18:57 2018 - [info] MHA::MasterMonitor version 0.56.
Thu May 10 13:18:58 2018 - [info] GTID failover mode = 0
Thu May 10 13:18:58 2018 - [info] Dead Servers:
Thu May 10 13:18:58 2018 - [info] Alive Servers:
Thu May 10 13:18:58 2018 - [info] 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:18:58 2018 - [info] 10.78.72.74(10.78.72.74:3306)
Thu May 10 13:18:58 2018 - [info] 10.78.72.75(10.78.72.75:3306)
Thu May 10 13:18:58 2018 - [info] Alive Slaves:
Thu May 10 13:18:58 2018 - [info] 10.78.72.74(10.78.72.74:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:18:58 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:18:58 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 13:18:58 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:18:58 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:18:58 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 13:18:58 2018 - [info] Current Alive Master: 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:18:58 2018 - [info] Checking slave configurations..
Thu May 10 13:18:58 2018 - [info] Checking replication filtering settings..
Thu May 10 13:18:58 2018 - [info] binlog_do_db= , binlog_ignore_db= mysql
Thu May 10 13:18:58 2018 - [info] Replication filtering check ok.
Thu May 10 13:18:58 2018 - [info] GTID (with auto-pos) is not supported
Thu May 10 13:18:58 2018 - [info] Starting SSH connection tests..
Thu May 10 13:18:59 2018 - [info] All SSH connection tests passed successfully.
Thu May 10 13:18:59 2018 - [info] Checking MHA Node version..
Thu May 10 13:19:00 2018 - [info] Version check ok.
Thu May 10 13:19:00 2018 - [info] Checking SSH publickey authentication settings on the current master..
Thu May 10 13:19:00 2018 - [info] HealthCheck: SSH to 10.78.72.73 is reachable.
Thu May 10 13:19:00 2018 - [info] Master MHA Node version is 0.56.
Thu May 10 13:19:00 2018 - [info] Checking recovery script configurations on 10.78.72.73(10.78.72.73:3306)..
Thu May 10 13:19:00 2018 - [info] Executing command: save_binary_logs --command=test --start_pos=4 --binlog_dir=/var/lib/mysql,/var/log/mysql --output_file=/var/tmp/save_binary_logs_test --manager_version=0.56 --start_file=mysql-bin.000005
Thu May 10 13:19:00 2018 - [info] Connecting to root@10.78.72.73(10.78.72.73:22)..
Creating /var/tmp if not exists.. ok.
Checking output directory is accessible or not..
ok.
Binlog found at /var/lib/mysql, up to mysql-bin.000005
Thu May 10 13:19:00 2018 - [info] Binlog setting check done.
Thu May 10 13:19:00 2018 - [info] Checking SSH publickey authentication and checking recovery script configurations on all alive slave servers..
Thu May 10 13:19:00 2018 - [info] Executing command : apply_diff_relay_logs --command=test --slave_user='mha_rep' --slave_host=10.78.72.74 --slave_ip=10.78.72.74 --slave_port=3306 --workdir=/var/tmp --target_version=5.1.71-log --manager_version=0.56 --relay_log_info=/var/lib/mysql/relay-log.info --relay_dir=/var/lib/mysql/ --slave_pass=xxx
Thu May 10 13:19:00 2018 - [info] Connecting to root@10.78.72.74(10.78.72.74:22)..
Checking slave recovery environment settings..
Opening /var/lib/mysql/relay-log.info ... ok.
Relay log found at /var/lib/mysql, up to relay-bin.000003
Temporary relay log file is /var/lib/mysql/relay-bin.000003
Testing mysql connection and privileges.. done.
Testing mysqlbinlog output.. done.
Cleaning up test file(s).. done.
Thu May 10 13:19:00 2018 - [info] Executing command : apply_diff_relay_logs --command=test --slave_user='mha_rep' --slave_host=10.78.72.75 --slave_ip=10.78.72.75 --slave_port=3306 --workdir=/var/tmp --target_version=5.1.71-log --manager_version=0.56 --relay_log_info=/var/lib/mysql/relay-log.info --relay_dir=/var/lib/mysql/ --slave_pass=xxx
Thu May 10 13:19:00 2018 - [info] Connecting to root@10.78.72.75(10.78.72.75:22)..
Checking slave recovery environment settings..
Opening /var/lib/mysql/relay-log.info ... ok.
Relay log found at /var/lib/mysql, up to relay-bin.000003
Temporary relay log file is /var/lib/mysql/relay-bin.000003
Testing mysql connection and privileges.. done.
Testing mysqlbinlog output.. done.
Cleaning up test file(s).. done.
Thu May 10 13:19:01 2018 - [info] Slaves settings check done.
Thu May 10 13:19:01 2018 - [info]
10.78.72.73(10.78.72.73:3306) (current master)
+--10.78.72.74(10.78.72.74:3306)
+--10.78.72.75(10.78.72.75:3306)

Thu May 10 13:19:01 2018 - [info] Checking master_ip_failover_script status:
Thu May 10 13:19:01 2018 - [info] /etc/mha/scripts/master_ip_failover --command=status --ssh_user=root --orig_master_host=10.78.72.73 --orig_master_ip=10.78.72.73 --orig_master_port=3306


IN SCRIPT TEST====/sbin/ifconfig eth0:1 down==/sbin/ifconfig eth0:1 10.78.72.77===

Checking the Status of the script.. OK
Thu May 10 13:19:01 2018 - [info] OK.
Thu May 10 13:19:01 2018 - [warning] shutdown_script is not defined.
Thu May 10 13:19:01 2018 - [info] Set master ping interval 1 seconds.
Thu May 10 13:19:01 2018 - [warning] secondary_check_script is not defined. It is highly recommended setting it to check master reachability from two or more routes.
Thu May 10 13:19:01 2018 - [info] Starting ping health check on 10.78.72.73(10.78.72.73:3306)..
Thu May 10 13:19:01 2018 - [info] Ping(SELECT) succeeded, waiting until MySQL doesn't respond..
[root@manager01 ~]#

手动关闭mysql或重启mysql服务器
1. 关闭原master 服务器
[root@manager01 ~]# tail -f /var/log/masterha/app1/manager.log
Thu May 10 13:42:15 2018 - [warning] Got error on MySQL select ping: 2006 (MySQL server has gone away)
Thu May 10 13:42:15 2018 - [info] Executing SSH check script: save_binary_logs --command=test --start_pos=4 --binlog_dir=/var/lib/mysql,/var/log/mysql --output_file=/var/tmp/save_binary_logs_test --manager_version=0.56 --binlog_prefix=mysql-bin
Thu May 10 13:42:15 2018 - [warning] HealthCheck: SSH to 10.78.72.73 is NOT reachable.
Thu May 10 13:42:16 2018 - [warning] Got error on MySQL connect: 2003 (Can't connect to MySQL server on '10.78.72.73' (111))
Thu May 10 13:42:16 2018 - [warning] Connection failed 2 time(s)..
Thu May 10 13:42:17 2018 - [warning] Got error on MySQL connect: 2003 (Can't connect to MySQL server on '10.78.72.73' (111))
Thu May 10 13:42:17 2018 - [warning] Connection failed 3 time(s)..
Thu May 10 13:42:18 2018 - [warning] Got error on MySQL connect: 2003 (Can't connect to MySQL server on '10.78.72.73' (111))
Thu May 10 13:42:18 2018 - [warning] Connection failed 4 time(s)..
Thu May 10 13:42:18 2018 - [warning] Master is not reachable from health checker!
Thu May 10 13:42:18 2018 - [warning] Master 10.78.72.73(10.78.72.73:3306) is not reachable!
Thu May 10 13:42:18 2018 - [warning] SSH is NOT reachable.
Thu May 10 13:42:18 2018 - [info] Connecting to a master server failed. Reading configuration file /etc/masterha_default.cnf and /etc/mha/app1.cnf again, and trying to connect to all servers to check server status..
Thu May 10 13:42:18 2018 - [warning] Global configuration file /etc/masterha_default.cnf not found. Skipping.
Thu May 10 13:42:18 2018 - [info] Reading application default configuration from /etc/mha/app1.cnf..
Thu May 10 13:42:18 2018 - [info] Reading server configuration from /etc/mha/app1.cnf..
Thu May 10 13:42:19 2018 - [info] GTID failover mode = 0
Thu May 10 13:42:19 2018 - [info] Dead Servers:
Thu May 10 13:42:19 2018 - [info] 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:19 2018 - [info] Alive Servers:
Thu May 10 13:42:19 2018 - [info] 10.78.72.74(10.78.72.74:3306)
Thu May 10 13:42:19 2018 - [info] 10.78.72.75(10.78.72.75:3306)
Thu May 10 13:42:19 2018 - [info] Alive Slaves:
Thu May 10 13:42:19 2018 - [info] 10.78.72.74(10.78.72.74:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:42:19 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:19 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 13:42:19 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:42:19 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:19 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 13:42:19 2018 - [info] Checking slave configurations..
Thu May 10 13:42:19 2018 - [info] Checking replication filtering settings..
Thu May 10 13:42:19 2018 - [info] Replication filtering check ok.
Thu May 10 13:42:19 2018 - [info] Master is down!
Thu May 10 13:42:19 2018 - [info] Terminating monitoring script.
Thu May 10 13:42:19 2018 - [info] Got exit code 20 (Master dead).
Thu May 10 13:42:19 2018 - [info] MHA::MasterFailover version 0.56.
Thu May 10 13:42:19 2018 - [info] Starting master failover.
Thu May 10 13:42:19 2018 - [info]
Thu May 10 13:42:19 2018 - [info] * Phase 1: Configuration Check Phase..
Thu May 10 13:42:19 2018 - [info]
Thu May 10 13:42:20 2018 - [info] GTID failover mode = 0
Thu May 10 13:42:20 2018 - [info] Dead Servers:
Thu May 10 13:42:20 2018 - [info] 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:20 2018 - [info] Checking master reachability via MySQL(double check)...
Thu May 10 13:42:20 2018 - [info] ok.
Thu May 10 13:42:20 2018 - [info] Alive Servers:
Thu May 10 13:42:20 2018 - [info] 10.78.72.74(10.78.72.74:3306)
Thu May 10 13:42:20 2018 - [info] 10.78.72.75(10.78.72.75:3306)
Thu May 10 13:42:20 2018 - [info] Alive Slaves:
Thu May 10 13:42:20 2018 - [info] 10.78.72.74(10.78.72.74:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:42:20 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:20 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 13:42:20 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:42:20 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:20 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 13:42:20 2018 - [info] Starting Non-GTID based failover.
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] ** Phase 1: Configuration Check Phase completed.
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] * Phase 2: Dead Master Shutdown Phase..
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] Forcing shutdown so that applications never connect to the current master..
Thu May 10 13:42:20 2018 - [info] Executing master IP deactivation script:
Thu May 10 13:42:20 2018 - [info] /etc/mha/scripts/master_ip_failover --orig_master_host=10.78.72.73 --orig_master_ip=10.78.72.73 --orig_master_port=3306 --command=stop


IN SCRIPT TEST====/sbin/ifconfig eth0:1 down==/sbin/ifconfig eth0:1 10.78.72.77===

Disabling the VIP on old master: 10.78.72.73
Thu May 10 13:42:20 2018 - [info] done.
Thu May 10 13:42:20 2018 - [warning] shutdown_script is not set. Skipping explicit shutting down of the dead master.
Thu May 10 13:42:20 2018 - [info] * Phase 2: Dead Master Shutdown Phase completed.
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] * Phase 3: Master Recovery Phase..
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] * Phase 3.1: Getting Latest Slaves Phase..
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] The latest binary log file/position on all slaves is mysql-bin.000005:6369627
Thu May 10 13:42:20 2018 - [info] Latest slaves (Slaves that received relay log files to the latest):
Thu May 10 13:42:20 2018 - [info] 10.78.72.74(10.78.72.74:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:42:20 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:20 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 13:42:20 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:42:20 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:20 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 13:42:20 2018 - [info] The oldest binary log file/position on all slaves is mysql-bin.000005:6369627
Thu May 10 13:42:20 2018 - [info] Oldest slaves:
Thu May 10 13:42:20 2018 - [info] 10.78.72.74(10.78.72.74:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:42:20 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:20 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 13:42:20 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:42:20 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:20 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] * Phase 3.2: Saving Dead Master's Binlog Phase..
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [warning] Dead Master is not SSH reachable. Could not save it's binlogs. Transactions that were not sent to the latest slave (Read_Master_Log_Pos to the tail of the dead master's binlog) were lost.
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] * Phase 3.3: Determining New Master Phase..
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] Finding the latest slave that has all relay logs for recovering other slaves..
Thu May 10 13:42:20 2018 - [info] All slaves received relay logs to the same position. No need to resync each other.
Thu May 10 13:42:20 2018 - [info] Searching new master from slaves..
Thu May 10 13:42:20 2018 - [info] Candidate masters from the configuration file:
Thu May 10 13:42:20 2018 - [info] 10.78.72.74(10.78.72.74:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:42:20 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:20 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 13:42:20 2018 - [info] Non-candidate masters:
Thu May 10 13:42:20 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 13:42:20 2018 - [info] Replicating from 10.78.72.73(10.78.72.73:3306)
Thu May 10 13:42:20 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 13:42:20 2018 - [info] Searching from candidate_master slaves which have received the latest relay log events..
Thu May 10 13:42:20 2018 - [info] New master is 10.78.72.74(10.78.72.74:3306)
Thu May 10 13:42:20 2018 - [info] Starting master failover..
Thu May 10 13:42:20 2018 - [info]
From:
10.78.72.73(10.78.72.73:3306) (current master)
+--10.78.72.74(10.78.72.74:3306)
+--10.78.72.75(10.78.72.75:3306)

To:
10.78.72.74(10.78.72.74:3306) (new master)
+--10.78.72.75(10.78.72.75:3306)
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] * Phase 3.3: New Master Diff Log Generation Phase..
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] This server has all relay logs. No need to generate diff files from the latest slave.
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] * Phase 3.4: Master Log Apply Phase..
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] *NOTICE: If any error happens from this phase, manual recovery is needed.
Thu May 10 13:42:20 2018 - [info] Starting recovery on 10.78.72.74(10.78.72.74:3306)..
Thu May 10 13:42:20 2018 - [info] This server has all relay logs. Waiting all logs to be applied..
Thu May 10 13:42:20 2018 - [info] done.
Thu May 10 13:42:20 2018 - [info] All relay logs were successfully applied.
Thu May 10 13:42:20 2018 - [info] Getting new master's binlog name and position..
Thu May 10 13:42:20 2018 - [info] mysql-bin.000004:6368065
Thu May 10 13:42:20 2018 - [info] All other slaves should start replication from here. Statement should be: CHANGE MASTER TO MASTER_HOST='10.78.72.74', MASTER_PORT=3306, MASTER_LOG_FILE='mysql-bin.000004', MASTER_LOG_POS=6368065, MASTER_USER='repl', MASTER_PASSWORD='xxx';
Thu May 10 13:42:20 2018 - [info] Executing master IP activate script:
Thu May 10 13:42:20 2018 - [info] /etc/mha/scripts/master_ip_failover --command=start --ssh_user=root --orig_master_host=10.78.72.73 --orig_master_ip=10.78.72.73 --orig_master_port=3306 --new_master_host=10.78.72.74 --new_master_ip=10.78.72.74 --new_master_port=3306 --new_master_user='mha_rep' --new_master_password='xxx'
Unknown option: new_master_user
Unknown option: new_master_password


IN SCRIPT TEST====/sbin/ifconfig eth0:1 down==/sbin/ifconfig eth0:1 10.78.72.77===

Enabling the VIP - 10.78.72.77 on the new master - 10.78.72.74
Thu May 10 13:42:20 2018 - [info] OK.
Thu May 10 13:42:20 2018 - [info] Setting read_only=0 on 10.78.72.74(10.78.72.74:3306)..
Thu May 10 13:42:20 2018 - [info] ok.
Thu May 10 13:42:20 2018 - [info] ** Finished master recovery successfully.
Thu May 10 13:42:20 2018 - [info] * Phase 3: Master Recovery Phase completed.
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] * Phase 4: Slaves Recovery Phase..
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] * Phase 4.1: Starting Parallel Slave Diff Log Generation Phase..
Thu May 10 13:42:20 2018 - [info]
Thu May 10 13:42:20 2018 - [info] -- Slave diff file generation on host 10.78.72.75(10.78.72.75:3306) started, pid: 4037. Check tmp log /var/log/masterha/app1/10.78.72.75_3306_20180510134219.log if it takes time..
Thu May 10 13:42:21 2018 - [info]
Thu May 10 13:42:21 2018 - [info] Log messages from 10.78.72.75 ...
Thu May 10 13:42:21 2018 - [info]
Thu May 10 13:42:20 2018 - [info] This server has all relay logs. No need to generate diff files from the latest slave.
Thu May 10 13:42:21 2018 - [info] End of log messages from 10.78.72.75.
Thu May 10 13:42:21 2018 - [info] -- 10.78.72.75(10.78.72.75:3306) has the latest relay log events.
Thu May 10 13:42:21 2018 - [info] Generating relay diff files from the latest slave succeeded.
Thu May 10 13:42:21 2018 - [info]
Thu May 10 13:42:21 2018 - [info] * Phase 4.2: Starting Parallel Slave Log Apply Phase..
Thu May 10 13:42:21 2018 - [info]
Thu May 10 13:42:21 2018 - [info] -- Slave recovery on host 10.78.72.75(10.78.72.75:3306) started, pid: 4039. Check tmp log /var/log/masterha/app1/10.78.72.75_3306_20180510134219.log if it takes time..
Thu May 10 13:42:22 2018 - [info]
Thu May 10 13:42:22 2018 - [info] Log messages from 10.78.72.75 ...
Thu May 10 13:42:22 2018 - [info]
Thu May 10 13:42:21 2018 - [info] Starting recovery on 10.78.72.75(10.78.72.75:3306)..
Thu May 10 13:42:21 2018 - [info] This server has all relay logs. Waiting all logs to be applied..
Thu May 10 13:42:21 2018 - [info] done.
Thu May 10 13:42:21 2018 - [info] All relay logs were successfully applied.
Thu May 10 13:42:21 2018 - [info] Resetting slave 10.78.72.75(10.78.72.75:3306) and starting replication from the new master 10.78.72.74(10.78.72.74:3306)..
Thu May 10 13:42:21 2018 - [info] Executed CHANGE MASTER.
Thu May 10 13:42:21 2018 - [info] Slave started.
Thu May 10 13:42:22 2018 - [info] End of log messages from 10.78.72.75.
Thu May 10 13:42:22 2018 - [info] -- Slave recovery on host 10.78.72.75(10.78.72.75:3306) succeeded.
Thu May 10 13:42:22 2018 - [info] All new slave servers recovered successfully.
Thu May 10 13:42:22 2018 - [info]
Thu May 10 13:42:22 2018 - [info] * Phase 5: New master cleanup phase..
Thu May 10 13:42:22 2018 - [info]
Thu May 10 13:42:22 2018 - [info] Resetting slave info on the new master..
Thu May 10 13:42:22 2018 - [info] 10.78.72.74: Resetting slave info succeeded.
Thu May 10 13:42:22 2018 - [info] Master failover to 10.78.72.74(10.78.72.74:3306) completed successfully.
Thu May 10 13:42:22 2018 - [info]

----- Failover Report -----

app1: MySQL Master failover 10.78.72.73(10.78.72.73:3306) to 10.78.72.74(10.78.72.74:3306) succeeded

Master 10.78.72.73(10.78.72.73:3306) is down!

Check MHA Manager logs at manager01:/var/log/masterha/app1/manager.log for details.

Started automated(non-interactive) failover.
Invalidated master IP address on 10.78.72.73(10.78.72.73:3306)
The latest slave 10.78.72.74(10.78.72.74:3306) has all relay logs for recovery.
Selected 10.78.72.74(10.78.72.74:3306) as a new master.
10.78.72.74(10.78.72.74:3306): OK: Applying all logs succeeded.
10.78.72.74(10.78.72.74:3306): OK: Activated master IP address.
10.78.72.75(10.78.72.75:3306): This host has the latest relay log events.
Generating relay diff files from the latest slave succeeded.
10.78.72.75(10.78.72.75:3306): OK: Applying all logs succeeded. Slave started, replicating from 10.78.72.74(10.78.72.74:3306)
10.78.72.74(10.78.72.74:3306): Resetting slave info succeeded.
Master failover to 10.78.72.74(10.78.72.74:3306) completed successfully.

# 删除failover.complete文件
[root@manager01 ~]# rm /var/log/masterha/app1/app1.failover.complete -f
# 启动masterha_manager
[root@manager01 ~]# nohup masterha_manager --conf=/etc/mha/app1.cnf 2>&1 &

# 恢复数据库参见 恢复故障
2. 关闭当前master(10.78.72.74)的mysql服务(服务器仍可ssh连接)
[root@manager01 ~]# tail -f /var/log/masterha/app1/manager.log
Thu May 10 18:21:20 2018 - [warning] Got error on MySQL select ping: 2006 (MySQL server has gone away)
Thu May 10 18:21:20 2018 - [info] Executing SSH check script: save_binary_logs --command=test --start_pos=4 --binlog_dir=/var/lib/mysql,/var/log/mysql --output_file=/var/tmp/save_binary_logs_test --manager_version=0.56 --binlog_prefix=mysql-bin
Thu May 10 18:21:20 2018 - [info] HealthCheck: SSH to 10.78.72.74 is reachable.
Thu May 10 18:21:21 2018 - [warning] Got error on MySQL connect: 2003 (Can't connect to MySQL server on '10.78.72.74' (111))
Thu May 10 18:21:21 2018 - [warning] Connection failed 2 time(s)..
Thu May 10 18:21:22 2018 - [warning] Got error on MySQL connect: 2003 (Can't connect to MySQL server on '10.78.72.74' (111))
Thu May 10 18:21:22 2018 - [warning] Connection failed 3 time(s)..
Thu May 10 18:21:23 2018 - [warning] Got error on MySQL connect: 2003 (Can't connect to MySQL server on '10.78.72.74' (111))
Thu May 10 18:21:23 2018 - [warning] Connection failed 4 time(s)..
Thu May 10 18:21:23 2018 - [warning] Master is not reachable from health checker!
Thu May 10 18:21:23 2018 - [warning] Master 10.78.72.74(10.78.72.74:3306) is not reachable!
Thu May 10 18:21:23 2018 - [warning] SSH is reachable.
Thu May 10 18:21:23 2018 - [info] Connecting to a master server failed. Reading configuration file /etc/masterha_default.cnf and /etc/mha/app1.cnf again, and trying to connect to all servers to check server status..
Thu May 10 18:21:23 2018 - [warning] Global configuration file /etc/masterha_default.cnf not found. Skipping.
Thu May 10 18:21:23 2018 - [info] Reading application default configuration from /etc/mha/app1.cnf..
Thu May 10 18:21:23 2018 - [info] Reading server configuration from /etc/mha/app1.cnf..
Thu May 10 18:21:24 2018 - [info] GTID failover mode = 0
Thu May 10 18:21:24 2018 - [info] Dead Servers:
Thu May 10 18:21:24 2018 - [info] 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:24 2018 - [info] Alive Servers:
Thu May 10 18:21:24 2018 - [info] 10.78.72.73(10.78.72.73:3306)
Thu May 10 18:21:24 2018 - [info] 10.78.72.75(10.78.72.75:3306)
Thu May 10 18:21:24 2018 - [info] Alive Slaves:
Thu May 10 18:21:24 2018 - [info] 10.78.72.73(10.78.72.73:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 18:21:24 2018 - [info] Replicating from 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:24 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 18:21:24 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 18:21:24 2018 - [info] Replicating from 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:24 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 18:21:24 2018 - [info] Checking slave configurations..
Thu May 10 18:21:24 2018 - [info] Checking replication filtering settings..
Thu May 10 18:21:24 2018 - [info] Replication filtering check ok.
Thu May 10 18:21:24 2018 - [info] Master is down!
Thu May 10 18:21:24 2018 - [info] Terminating monitoring script.
Thu May 10 18:21:24 2018 - [info] Got exit code 20 (Master dead).
Thu May 10 18:21:24 2018 - [info] MHA::MasterFailover version 0.56.
Thu May 10 18:21:24 2018 - [info] Starting master failover.
Thu May 10 18:21:24 2018 - [info]
Thu May 10 18:21:24 2018 - [info] * Phase 1: Configuration Check Phase..
Thu May 10 18:21:24 2018 - [info]
Thu May 10 18:21:25 2018 - [info] GTID failover mode = 0
Thu May 10 18:21:25 2018 - [info] Dead Servers:
Thu May 10 18:21:25 2018 - [info] 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:25 2018 - [info] Checking master reachability via MySQL(double check)...
Thu May 10 18:21:25 2018 - [info] ok.
Thu May 10 18:21:25 2018 - [info] Alive Servers:
Thu May 10 18:21:25 2018 - [info] 10.78.72.73(10.78.72.73:3306)
Thu May 10 18:21:25 2018 - [info] 10.78.72.75(10.78.72.75:3306)
Thu May 10 18:21:25 2018 - [info] Alive Slaves:
Thu May 10 18:21:25 2018 - [info] 10.78.72.73(10.78.72.73:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 18:21:25 2018 - [info] Replicating from 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:25 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 18:21:25 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 18:21:25 2018 - [info] Replicating from 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:25 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 18:21:25 2018 - [info] Starting Non-GTID based failover.
Thu May 10 18:21:25 2018 - [info]
Thu May 10 18:21:25 2018 - [info] ** Phase 1: Configuration Check Phase completed.
Thu May 10 18:21:25 2018 - [info]
Thu May 10 18:21:25 2018 - [info] * Phase 2: Dead Master Shutdown Phase..
Thu May 10 18:21:25 2018 - [info]
Thu May 10 18:21:25 2018 - [info] Forcing shutdown so that applications never connect to the current master..
Thu May 10 18:21:25 2018 - [info] Executing master IP deactivation script:
Thu May 10 18:21:25 2018 - [info] /etc/mha/scripts/master_ip_failover --orig_master_host=10.78.72.74 --orig_master_ip=10.78.72.74 --orig_master_port=3306 --command=stopssh --ssh_user=root


IN SCRIPT TEST====/sbin/ifconfig eth0:1 down==/sbin/ifconfig eth0:1 10.78.72.77===

Disabling the VIP on old master: 10.78.72.74
Thu May 10 18:21:25 2018 - [info] done.
Thu May 10 18:21:25 2018 - [warning] shutdown_script is not set. Skipping explicit shutting down of the dead master.
Thu May 10 18:21:25 2018 - [info] * Phase 2: Dead Master Shutdown Phase completed.
Thu May 10 18:21:25 2018 - [info]
Thu May 10 18:21:25 2018 - [info] * Phase 3: Master Recovery Phase..
Thu May 10 18:21:25 2018 - [info]
Thu May 10 18:21:25 2018 - [info] * Phase 3.1: Getting Latest Slaves Phase..
Thu May 10 18:21:25 2018 - [info]
Thu May 10 18:21:25 2018 - [info] The latest binary log file/position on all slaves is mysql-bin.000002:5344542
Thu May 10 18:21:25 2018 - [info] Latest slaves (Slaves that received relay log files to the latest):
Thu May 10 18:21:25 2018 - [info] 10.78.72.73(10.78.72.73:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 18:21:25 2018 - [info] Replicating from 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:25 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 18:21:25 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 18:21:25 2018 - [info] Replicating from 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:25 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 18:21:25 2018 - [info] The oldest binary log file/position on all slaves is mysql-bin.000002:5344542
Thu May 10 18:21:25 2018 - [info] Oldest slaves:
Thu May 10 18:21:25 2018 - [info] 10.78.72.73(10.78.72.73:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 18:21:25 2018 - [info] Replicating from 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:25 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 18:21:25 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 18:21:25 2018 - [info] Replicating from 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:25 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 18:21:25 2018 - [info]
Thu May 10 18:21:25 2018 - [info] * Phase 3.2: Saving Dead Master's Binlog Phase..
Thu May 10 18:21:25 2018 - [info]
Thu May 10 18:21:25 2018 - [info] Fetching dead master's binary logs..
Thu May 10 18:21:25 2018 - [info] Executing command on the dead master 10.78.72.74(10.78.72.74:3306): save_binary_logs --command=save --start_file=mysql-bin.000002 --start_pos=5344542 --binlog_dir=/var/lib/mysql,/var/log/mysql --output_file=/var/tmp/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog --handle_raw_binlog=1 --disable_log_bin=0 --manager_version=0.56
Creating /var/tmp if not exists.. ok.
Concat binary/relay logs from mysql-bin.000002 pos 5344542 to mysql-bin.000002 EOF into /var/tmp/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog ..
Dumping binlog format description event, from position 0 to 106.. ok.
Dumping effective binlog data from /var/lib/mysql/mysql-bin.000002 position 5344542 to tail(5344561).. ok.
Concat succeeded.
Thu May 10 18:21:25 2018 - [info] scp from root@10.78.72.74:/var/tmp/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog to local:/var/log/masterha/app1/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog succeeded.
Thu May 10 18:21:26 2018 - [info] HealthCheck: SSH to 10.78.72.73 is reachable.
Thu May 10 18:21:26 2018 - [info] HealthCheck: SSH to 10.78.72.75 is reachable.
Thu May 10 18:21:26 2018 - [info]
Thu May 10 18:21:26 2018 - [info] * Phase 3.3: Determining New Master Phase..
Thu May 10 18:21:26 2018 - [info]
Thu May 10 18:21:26 2018 - [info] Finding the latest slave that has all relay logs for recovering other slaves..
Thu May 10 18:21:26 2018 - [info] All slaves received relay logs to the same position. No need to resync each other.
Thu May 10 18:21:26 2018 - [info] Searching new master from slaves..
Thu May 10 18:21:26 2018 - [info] Candidate masters from the configuration file:
Thu May 10 18:21:26 2018 - [info] 10.78.72.73(10.78.72.73:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 18:21:26 2018 - [info] Replicating from 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:26 2018 - [info] Primary candidate for the new Master (candidate_master is set)
Thu May 10 18:21:26 2018 - [info] Non-candidate masters:
Thu May 10 18:21:26 2018 - [info] 10.78.72.75(10.78.72.75:3306) Version=5.1.71-log (oldest major version between slaves) log-bin:enabled
Thu May 10 18:21:26 2018 - [info] Replicating from 10.78.72.74(10.78.72.74:3306)
Thu May 10 18:21:26 2018 - [info] Not candidate for the new Master (no_master is set)
Thu May 10 18:21:26 2018 - [info] Searching from candidate_master slaves which have received the latest relay log events..
Thu May 10 18:21:26 2018 - [info] New master is 10.78.72.73(10.78.72.73:3306)
Thu May 10 18:21:26 2018 - [info] Starting master failover..
Thu May 10 18:21:26 2018 - [info]
From:
10.78.72.74(10.78.72.74:3306) (current master)
+--10.78.72.73(10.78.72.73:3306)
+--10.78.72.75(10.78.72.75:3306)

To:
10.78.72.73(10.78.72.73:3306) (new master)
+--10.78.72.75(10.78.72.75:3306)
Thu May 10 18:21:26 2018 - [info]
Thu May 10 18:21:26 2018 - [info] * Phase 3.3: New Master Diff Log Generation Phase..
Thu May 10 18:21:26 2018 - [info]
Thu May 10 18:21:26 2018 - [info] This server has all relay logs. No need to generate diff files from the latest slave.
Thu May 10 18:21:26 2018 - [info] Sending binlog..
Thu May 10 18:21:26 2018 - [info] scp from local:/var/log/masterha/app1/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog to root@10.78.72.73:/var/tmp/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog succeeded.
Thu May 10 18:21:26 2018 - [info]
Thu May 10 18:21:26 2018 - [info] * Phase 3.4: Master Log Apply Phase..
Thu May 10 18:21:26 2018 - [info]
Thu May 10 18:21:26 2018 - [info] *NOTICE: If any error happens from this phase, manual recovery is needed.
Thu May 10 18:21:26 2018 - [info] Starting recovery on 10.78.72.73(10.78.72.73:3306)..
Thu May 10 18:21:26 2018 - [info] Generating diffs succeeded.
Thu May 10 18:21:26 2018 - [info] Waiting until all relay logs are applied.
Thu May 10 18:21:26 2018 - [info] done.
Thu May 10 18:21:26 2018 - [info] Getting slave status..
Thu May 10 18:21:26 2018 - [info] This slave(10.78.72.73)'s Exec_Master_Log_Pos equals to Read_Master_Log_Pos(mysql-bin.000002:5344542). No need to recover from Exec_Master_Log_Pos.
Thu May 10 18:21:26 2018 - [info] Connecting to the target slave host 10.78.72.73, running recover script..
Thu May 10 18:21:26 2018 - [info] Executing command: apply_diff_relay_logs --command=apply --slave_user='mha_rep' --slave_host=10.78.72.73 --slave_ip=10.78.72.73 --slave_port=3306 --apply_files=/var/tmp/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog --workdir=/var/tmp --target_version=5.1.71-log --timestamp=20180510182124 --handle_raw_binlog=1 --disable_log_bin=0 --manager_version=0.56 --slave_pass=xxx
Thu May 10 18:21:27 2018 - [info]
Applying differential binary/relay log files /var/tmp/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog on 10.78.72.73:3306. This may take long time...
Applying log files succeeded.
Thu May 10 18:21:27 2018 - [info] All relay logs were successfully applied.
Thu May 10 18:21:27 2018 - [info] Getting new master's binlog name and position..
Thu May 10 18:21:27 2018 - [info] mysql-bin.000002:106
Thu May 10 18:21:27 2018 - [info] All other slaves should start replication from here. Statement should be: CHANGE MASTER TO MASTER_HOST='10.78.72.73', MASTER_PORT=3306, MASTER_LOG_FILE='mysql-bin.000002', MASTER_LOG_POS=106, MASTER_USER='repl', MASTER_PASSWORD='xxx';
Thu May 10 18:21:27 2018 - [info] Executing master IP activate script:
Thu May 10 18:21:27 2018 - [info] /etc/mha/scripts/master_ip_failover --command=start --ssh_user=root --orig_master_host=10.78.72.74 --orig_master_ip=10.78.72.74 --orig_master_port=3306 --new_master_host=10.78.72.73 --new_master_ip=10.78.72.73 --new_master_port=3306 --new_master_user='mha_rep' --new_master_password='xxx'
Unknown option: new_master_user
Unknown option: new_master_password


IN SCRIPT TEST====/sbin/ifconfig eth0:1 down==/sbin/ifconfig eth0:1 10.78.72.77===

Enabling the VIP - 10.78.72.77 on the new master - 10.78.72.73
Thu May 10 18:21:27 2018 - [info] OK.
Thu May 10 18:21:27 2018 - [info] Setting read_only=0 on 10.78.72.73(10.78.72.73:3306)..
Thu May 10 18:21:27 2018 - [info] ok.
Thu May 10 18:21:27 2018 - [info] ** Finished master recovery successfully.
Thu May 10 18:21:27 2018 - [info] * Phase 3: Master Recovery Phase completed.
Thu May 10 18:21:27 2018 - [info]
Thu May 10 18:21:27 2018 - [info] * Phase 4: Slaves Recovery Phase..
Thu May 10 18:21:27 2018 - [info]
Thu May 10 18:21:27 2018 - [info] * Phase 4.1: Starting Parallel Slave Diff Log Generation Phase..
Thu May 10 18:21:27 2018 - [info]
Thu May 10 18:21:27 2018 - [info] -- Slave diff file generation on host 10.78.72.75(10.78.72.75:3306) started, pid: 25781. Check tmp log /var/log/masterha/app1/10.78.72.75_3306_20180510182124.log if it takes time..
Thu May 10 18:21:28 2018 - [info]
Thu May 10 18:21:28 2018 - [info] Log messages from 10.78.72.75 ...
Thu May 10 18:21:28 2018 - [info]
Thu May 10 18:21:27 2018 - [info] This server has all relay logs. No need to generate diff files from the latest slave.
Thu May 10 18:21:28 2018 - [info] End of log messages from 10.78.72.75.
Thu May 10 18:21:28 2018 - [info] -- 10.78.72.75(10.78.72.75:3306) has the latest relay log events.
Thu May 10 18:21:28 2018 - [info] Generating relay diff files from the latest slave succeeded.
Thu May 10 18:21:28 2018 - [info]
Thu May 10 18:21:28 2018 - [info] * Phase 4.2: Starting Parallel Slave Log Apply Phase..
Thu May 10 18:21:28 2018 - [info]
Thu May 10 18:21:28 2018 - [info] -- Slave recovery on host 10.78.72.75(10.78.72.75:3306) started, pid: 25788. Check tmp log /var/log/masterha/app1/10.78.72.75_3306_20180510182124.log if it takes time..
Thu May 10 18:21:29 2018 - [info]
Thu May 10 18:21:29 2018 - [info] Log messages from 10.78.72.75 ...
Thu May 10 18:21:29 2018 - [info]
Thu May 10 18:21:28 2018 - [info] Sending binlog..
Thu May 10 18:21:28 2018 - [info] scp from local:/var/log/masterha/app1/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog to root@10.78.72.75:/var/tmp/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog succeeded.
Thu May 10 18:21:28 2018 - [info] Starting recovery on 10.78.72.75(10.78.72.75:3306)..
Thu May 10 18:21:28 2018 - [info] Generating diffs succeeded.
Thu May 10 18:21:28 2018 - [info] Waiting until all relay logs are applied.
Thu May 10 18:21:28 2018 - [info] done.
Thu May 10 18:21:28 2018 - [info] Getting slave status..
Thu May 10 18:21:28 2018 - [info] This slave(10.78.72.75)'s Exec_Master_Log_Pos equals to Read_Master_Log_Pos(mysql-bin.000002:5344542). No need to recover from Exec_Master_Log_Pos.
Thu May 10 18:21:28 2018 - [info] Connecting to the target slave host 10.78.72.75, running recover script..
Thu May 10 18:21:28 2018 - [info] Executing command: apply_diff_relay_logs --command=apply --slave_user='mha_rep' --slave_host=10.78.72.75 --slave_ip=10.78.72.75 --slave_port=3306 --apply_files=/var/tmp/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog --workdir=/var/tmp --target_version=5.1.71-log --timestamp=20180510182124 --handle_raw_binlog=1 --disable_log_bin=0 --manager_version=0.56 --slave_pass=xxx
Thu May 10 18:21:28 2018 - [info]
Applying differential binary/relay log files /var/tmp/saved_master_binlog_from_10.78.72.74_3306_20180510182124.binlog on 10.78.72.75:3306. This may take long time...
Applying log files succeeded.
Thu May 10 18:21:28 2018 - [info] All relay logs were successfully applied.
Thu May 10 18:21:28 2018 - [info] Resetting slave 10.78.72.75(10.78.72.75:3306) and starting replication from the new master 10.78.72.73(10.78.72.73:3306)..
Thu May 10 18:21:28 2018 - [info] Executed CHANGE MASTER.
Thu May 10 18:21:28 2018 - [info] Slave started.
Thu May 10 18:21:29 2018 - [info] End of log messages from 10.78.72.75.
Thu May 10 18:21:29 2018 - [info] -- Slave recovery on host 10.78.72.75(10.78.72.75:3306) succeeded.
Thu May 10 18:21:29 2018 - [info] All new slave servers recovered successfully.
Thu May 10 18:21:29 2018 - [info]
Thu May 10 18:21:29 2018 - [info] * Phase 5: New master cleanup phase..
Thu May 10 18:21:29 2018 - [info]
Thu May 10 18:21:29 2018 - [info] Resetting slave info on the new master..
Thu May 10 18:21:29 2018 - [info] 10.78.72.73: Resetting slave info succeeded.
Thu May 10 18:21:29 2018 - [info] Master failover to 10.78.72.73(10.78.72.73:3306) completed successfully.
Thu May 10 18:21:29 2018 - [info]

----- Failover Report -----

app1: MySQL Master failover 10.78.72.74(10.78.72.74:3306) to 10.78.72.73(10.78.72.73:3306) succeeded

Master 10.78.72.74(10.78.72.74:3306) is down!

Check MHA Manager logs at manager01:/var/log/masterha/app1/manager.log for details.

Started automated(non-interactive) failover.
Invalidated master IP address on 10.78.72.74(10.78.72.74:3306)
The latest slave 10.78.72.73(10.78.72.73:3306) has all relay logs for recovery.
Selected 10.78.72.73(10.78.72.73:3306) as a new master.
10.78.72.73(10.78.72.73:3306): OK: Applying all logs succeeded.
10.78.72.73(10.78.72.73:3306): OK: Activated master IP address.
10.78.72.75(10.78.72.75:3306): This host has the latest relay log events.
Generating relay diff files from the latest slave succeeded.
10.78.72.75(10.78.72.75:3306): OK: Applying all logs succeeded. Slave started, replicating from 10.78.72.73(10.78.72.73:3306)
10.78.72.73(10.78.72.73:3306): Resetting slave info succeeded.
Master failover to 10.78.72.73(10.78.72.73:3306) completed successfully.

# 删除failover.complete文件
[root@manager01 ~]# rm /var/log/masterha/app1/app1.failover.complete -f
# 启动masterha_manager
[root@manager01 ~]# nohup masterha_manager --conf=/etc/mha/app1.cnf 2>&1 &
# 恢复数据库参见 恢复故障

恢复故障

启动原master,手动切换到主库(恢复)

在原master上设置同步新master
mysql> reset master;
Query OK, 0 rows affected (0.04 sec)

# 从manager日志中查询change master to日志
# cat /var/log/masterha/app1/manager.log | grep "CHANGE MASTER"
mysql> CHANGE MASTER TO MASTER_HOST='10.78.72.73', MASTER_PORT=3306, MASTER_LOG_FILE='mysql-bin.000002', MASTER_LOG_POS=106, MASTER_USER='repl', MASTER_PASSWORD='xxx';
Query OK, 0 rows affected (0.08 sec)

mysql> start slave;
Query OK, 0 rows affected (0.00 sec)

mysql> show slave status \G

在原master上执行purge脚本(原master已变成slave,需要执行purge,否则mha检查时会出现警告:[warning] relay_log_purge=0 is not set on slave)
[root@mysql03 ~]# /usr/bin/purge_relay_logs --user=root --disable_relay_log_purge

删除failover文件
[root@manager01 ~]# rm /var/log/masterha/app1/app1.failover.complete -f

启动masterha_manager
[root@manager01 ~]# nohup masterha_manager --conf=/etc/mha/app1.cnf 2>&1 &

查看manager日志
[root@manager01 ~]# tail -f /var/log/masterha/app1/manager.log

增加keepalived

由于之前的切换脚本是直接通过ssh远程执行 ifconfig eth0:1 添加子接口来绑定虚地址,跨网段访问虚地址时会不通,因此改为通过基于vrrp协议的keepalived来切换虚地址

编译安装keepalived

# 在master和slave-master(备主)上安装keepalived
yum -y install openssl-devel popt-devel libnl-devel
yum groupinstall "Development Tools" -y
ldconfig
tar xvf keepalived-1.2.7.tar.gz -C /usr/local/src/
cd /usr/local/src/keepalived-1.2.7/
./configure && make && make install
cp /usr/local/etc/rc.d/init.d/keepalived /etc/init.d/
cp /usr/local/etc/sysconfig/keepalived /etc/sysconfig/
mkdir /etc/keepalived
cp /usr/local/etc/keepalived/keepalived.conf /etc/keepalived/
cp /usr/local/sbin/keepalived /usr/sbin/

#配置文件
# master
[root@mysql01 ~]# cat /etc/keepalived/keepalived.conf
! Configuration File for keepalived

global_defs {
notification_email {
xxx
}
notification_email_from [email protected]
smtp_server 127.0.0.1
smtp_connect_timeout 30
router_id mysqlmha
}

vrrp_instance VI_1 {
# state设置为BACKUP而不是MASTER,避免抢占;从库仍按原来的配置
state BACKUP
interface eth0
virtual_router_id 77
priority 100
advert_int 1
# 这个参数配置在主上,并且主state为BACKUP
nopreempt
authentication {
auth_type PASS
auth_pass formha
}
virtual_ipaddress {
10.78.72.77
}
}

# master-slave
! Configuration File for keepalived

global_defs {
notification_email {
xxx
}
notification_email_from [email protected]
smtp_server 127.0.0.1
smtp_connect_timeout 30
router_id mysqlmha
}

vrrp_instance VI_1 {
state BACKUP
interface eth0
virtual_router_id 77
priority 100
advert_int 1
authentication {
auth_type PASS
auth_pass formha
}
virtual_ipaddress {
10.78.72.77
}
}

# 只在主库上启动keepalived,不需要两台同时启动;mha进行切换时,master_ip_failover脚本会在新主上启动keepalived、在原主上关闭keepalived


修改master_ip_failover脚本

[root@manager01 app1]# cat /etc/mha/scripts/master_ip_failover
#!/usr/bin/env perl
use strict;
use warnings FATAL => 'all';
use Getopt::Long;
# Command-line arguments that MHA Manager passes to this script.
# $new_master_user and $new_master_password are accepted only so that
# Getopt::Long no longer reports them as "Unknown option" when MHA calls
# the script with --command=start; the script itself never reads them.
my (
    $command,         $ssh_user,         $orig_master_host,
    $orig_master_ip,  $orig_master_port, $new_master_host,
    $new_master_ip,   $new_master_port,  $new_master_user,
    $new_master_password,
);

# Virtual IP address (VIP); used in the progress messages printed by main.
my $vip = '10.78.72.77';

# Commands run on the remote host over SSH to start / stop keepalived.
my $ssh_start_keepalived = "/etc/init.d/keepalived start";
my $ssh_stop_keepalived  = "/etc/init.d/keepalived stop";

# The return value is deliberately not checked: an unrecognised option only
# produces a message on STDERR and parsing continues, so options added by
# other MHA versions do not abort a failover.
GetOptions(
    'command=s'             => \$command,
    'ssh_user=s'            => \$ssh_user,
    'orig_master_host=s'    => \$orig_master_host,
    'orig_master_ip=s'      => \$orig_master_ip,
    'orig_master_port=i'    => \$orig_master_port,
    'new_master_host=s'     => \$new_master_host,
    'new_master_ip=s'       => \$new_master_ip,
    'new_master_port=i'     => \$new_master_port,
    'new_master_user=s'     => \$new_master_user,
    'new_master_password=s' => \$new_master_password,
);

# Run the requested command and hand its status back to the caller (MHA).
exit main();
# Dispatch on the --command value and return the process exit status.
#
#   stop / stopssh  Announce that the VIP is being disabled on the old master;
#                   keepalived is actually stopped only for stopssh.
#                   Returns 0 on success, 1 if an error was raised.
#   start           Start keepalived on the new master.
#                   Returns 0 on success, 10 if an error was raised.
#   status          Print an OK message and return 0.
#   anything else   (including a missing --command) print usage, return 1.
sub main {
    # With "use warnings FATAL => 'all'" comparing an undefined $command via
    # eq would die with an "uninitialized value" error; fall back to the empty
    # string so a missing --command ends up in the usage branch instead.
    my $action = defined $command ? $command : '';

    if ( $action eq 'stop' || $action eq 'stopssh' ) {
        my $exit_code = 1;
        eval {
            print "Disabling the VIP on old master: $orig_master_host \n";

            # Only the stopssh command actually stops the VIP.
            # The &name() call form is kept on purpose: the helper is defined
            # further down the file, and if it carries a prototype a plain
            # call here triggers a (fatal) "called too early" warning.
            if ( $action eq 'stopssh' ) {
                &stop_vip();
            }
            $exit_code = 0;
        };
        warn "Got Error: $@\n" if $@;
        return $exit_code;
    }

    if ( $action eq 'start' ) {
        my $exit_code = 10;
        eval {
            print "Enabling the VIP - $vip on the new master - $new_master_host \n";
            &start_vip();    # see note above about the &name() call form
            $exit_code = 0;
        };
        warn $@ if $@;
        return $exit_code;
    }

    if ( $action eq 'status' ) {
        print "Checking the Status of the script.. OK \n";
        return 0;
    }

    usage();
    return 1;
}
# Start keepalived on the new master over SSH.
#
# Reads the file-level $ssh_user, $new_master_host and $ssh_start_keepalived.
# Dies if ssh cannot be launched or the remote command exits non-zero, so the
# caller can report the failure instead of treating the VIP as enabled.
# Output of the remote command is passed through to this script's STDOUT.
sub start_vip {
    my @ssh_command =
      ( 'ssh', "$ssh_user\@$new_master_host", $ssh_start_keepalived );

    # List-form system() bypasses the local shell and, unlike backticks whose
    # result was discarded, lets us inspect the exit status.
    my $wait_status = system(@ssh_command);
    return if $wait_status == 0;

    die "Failed to run ssh to $new_master_host: $!\n" if $wait_status == -1;
    die "Starting keepalived on $new_master_host failed with exit status "
      . ( $wait_status >> 8 ) . "\n";
}
# Stop keepalived on the old master over SSH, which disables the VIP there.
#
# Reads the file-level $ssh_user, $orig_master_host and $ssh_stop_keepalived.
# This stays best-effort: a failure is reported with warn() but does not
# raise an error, so the caller's exit status is the same as before.
# Output of the remote command is passed through to this script's STDOUT.
sub stop_vip {
    my @ssh_command =
      ( 'ssh', "$ssh_user\@$orig_master_host", $ssh_stop_keepalived );

    # List-form system() bypasses the local shell and exposes the exit status
    # that the previous backtick call silently dropped.
    my $wait_status = system(@ssh_command);
    if ( $wait_status != 0 ) {
        warn "Stopping keepalived on $orig_master_host failed "
          . "(wait status $wait_status)\n";
    }
    return;
}
# Print the one-line command synopsis to the currently selected output handle.
sub usage {
    my $synopsis =
"Usage: master_ip_failover --command=start|stop|stopssh|status --orig_master_host=host --orig_master_ip=ip --orig_master_port=port --new_master_host=host --new_master_ip=ip --new_master_port=port\n";
    return print $synopsis;
}

你可能感兴趣的:(数据库)