1. 使用单主机模式 进行测试.
用到的配置和脚本如下:
node1 - my.cnf
[mysqld]
user=mysql
server_id=1
gtid_mode=ON
enforce_gtid_consistency=ON
master_info_repository=TABLE
relay_log_info_repository=TABLE
binlog_checksum=NONE
log_slave_updates=ON
log_bin=binlog
binlog_format=ROW
transaction_write_set_extraction=XXHASH64
loose-group_replication_group_name="aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
loose-group_replication_start_on_boot=off
loose-group_replication_local_address= "node1:33060"
loose-group_replication_group_seeds= "node1:33060,node2:33060,node3:33060"
loose-group_replication_bootstrap_group=off
loose-group_replication_recovery_user=repl
loose-group-replication-single-primary-mode='ON'
loose-group-replication-enforce-update-everywhere-checks='OFF'
relay-log=replay-bin
node2-my.cnf
[mysqld]
user=mysql
server_id=2
gtid_mode=ON
enforce_gtid_consistency=ON
master_info_repository=TABLE
relay_log_info_repository=TABLE
binlog_checksum=NONE
log_slave_updates=ON
log_bin=binlog
binlog_format=ROW
transaction_write_set_extraction=XXHASH64
loose-group_replication_group_name="aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
loose-group_replication_start_on_boot=off
loose-group_replication_local_address= "node2:33060"
loose-group_replication_group_seeds= "node1:33060,node2:33060,node3:33060"
loose-group_replication_bootstrap_group=off
loose-group_replication_recovery_user=repl
loose-group-replication-single-primary-mode='ON'
loose-group-replication-enforce-update-everywhere-checks='OFF'
relay-log=replay-bin
node3-my.cnf
[mysqld]
user=mysql
server_id=3
gtid_mode=ON
enforce_gtid_consistency=ON
master_info_repository=TABLE
relay_log_info_repository=TABLE
binlog_checksum=NONE
log_slave_updates=ON
log_bin=binlog
binlog_format=ROW
transaction_write_set_extraction=XXHASH64
loose-group_replication_group_name="aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
loose-group_replication_start_on_boot=off
loose-group_replication_local_address= "node3:33060"
loose-group_replication_group_seeds= "node1:33060,node2:33060,node3:33060"
loose-group_replication_bootstrap_group=off
loose-group_replication_recovery_user=repl
loose-group-replication-single-primary-mode=on
loose-group-replication-enforce-update-everywhere-checks=off
relay-log=replay-bin
[mysqld_safe]
docker启动脚本:run.sh
#!/bin/bash
# Start one MySQL 5.7 group-replication test node inside Docker.
# Usage: ./run.sh <node-suffix>    e.g. ./run.sh 1  -> container "node1"
# The suffix must be non-empty and must not be 3306.

if [ "${1:-}" = "" ] || [ "$1" = "3306" ]; then
  echo "usage: ./run.sh <node-suffix>  (e.g. ./run.sh 1); suffix must not be empty or 3306!" >&2
  exit 1
fi

# You can update this test dir for another dir.
base_dir=/home/mysql_test/mysql_5.7/
if [ ! -d "$base_dir" ]; then
  echo "$base_dir not exist, please update base_dir in the shell!" >&2
  exit 1
fi

nodeDir=node$1
base_dir=${base_dir}${nodeDir}
conf_dir=${base_dir}/conf
data_dir=${base_dir}/data
# Host-port mapping; only used if the commented -p line below is enabled.
port=3306$1:3306
container=mysql/mysql-server:5.7

# Create the per-node directory layout (idempotent).
mkdir -p "$conf_dir" "$data_dir"

# Seed a minimal my.cnf if this node does not have one yet; replace it
# with the full group-replication config before starting the cluster.
if [ ! -f "$conf_dir/my.cnf" ]; then
  printf "[mysqld]\nuser=mysql\n[mysqld_safe]\n" > "$conf_dir/my.cnf"
fi

# Remove any stale container with the same name, then start a fresh one
# attached to the "groupnet" docker network (must be created beforehand).
docker rm -f "$nodeDir"
docker run --name="$nodeDir" --net=groupnet \
  --mount type=bind,src="$conf_dir"/my.cnf,dst=/etc/my.cnf \
  --mount type=bind,src="$data_dir",dst=/var/lib/mysql \
  -d "$container"
#-p "$port" -d "$container"
测试步骤:
1. docker安装参照:centos6 安装docker17.06
docker pull mysql/mysql-server:5.7
2. 创建虚拟网络:docker network create groupnet (
删除使用docker network rm groupnet
禁用某个container net: docker network disconnect groupnet node3
)
3. cd /home/mysql_test/mysql_5.7 没有的话需要创建
mkdir -p node1/conf/ && touch node1/conf/my.cnf && {将node1对应的配置写入node1/conf/my.cnf}
mkdir -p node2/conf/ && touch node2/conf/my.cnf && {将node2对应的配置写入node2/conf/my.cnf}
mkdir -p node3/conf/ && touch node3/conf/my.cnf && {将node3对应的配置写入node3/conf/my.cnf}
./run.sh 1
./run.sh 2
./run.sh 3
4. 将node1作为主节点:
INSTALL PLUGIN group_replication SONAME 'group_replication.so';
SET SQL_LOG_BIN=0;
#必须设置SQL_LOG_BIN为0,否则该GRANT语句会写入binlog并被复制到组内其他节点执行
GRANT REPLICATION SLAVE ON *.* TO repl@'%' IDENTIFIED BY '123456';
SET SQL_LOG_BIN=1;
SET GLOBAL group_replication_bootstrap_group=ON;
START GROUP_REPLICATION;
SET GLOBAL group_replication_bootstrap_group=OFF;
5. node2, node3执行从配置:
INSTALL PLUGIN group_replication SONAME 'group_replication.so';
SET SQL_LOG_BIN=0;
#必须设置SQL_LOG_BIN为0,否则该GRANT语句会写入binlog并被复制到组内其他节点执行
GRANT REPLICATION SLAVE ON *.* TO repl@'%' IDENTIFIED BY '123456';
SET SQL_LOG_BIN=1;
START GROUP_REPLICATION;
6. 简单测试数据库创建和表创建:
mysql> CREATE DATABASE test;
mysql> USE test;
mysql> CREATE TABLE t1 (c1 INT PRIMARY KEY, c2 TEXT NOT NULL);
mysql> INSERT INTO t1 VALUES (1, 'Luis');
7. MGR状态监控:
mysql> select * from performance_schema.replication_group_member_stats \G
*************************** 1. row ***************************
CHANNEL_NAME: group_replication_applier
VIEW_ID: 15426117843744977:39
MEMBER_ID: 7390fffa-e96e-11e8-a664-0242ac110001
COUNT_TRANSACTIONS_IN_QUEUE: 0
COUNT_TRANSACTIONS_CHECKED: 9
COUNT_CONFLICTS_DETECTED: 0
COUNT_TRANSACTIONS_ROWS_VALIDATING: 0
TRANSACTIONS_COMMITTED_ALL_MEMBERS: 7390fffa-e96e-11e8-a664-0242ac110001:1-3,
aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa:1-48
LAST_CONFLICT_FREE_TRANSACTION: aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa:46
1 row in set (0.00 sec)
mysql> SELECT * FROM performance_schema.replication_group_members \G
*************************** 1. row ***************************
CHANNEL_NAME: group_replication_applier
MEMBER_ID: 7390fffa-e96e-11e8-a664-0242ac110001
MEMBER_HOST: 72057b10e109
MEMBER_PORT: 3306
MEMBER_STATE: ONLINE
*************************** 2. row ***************************
CHANNEL_NAME: group_replication_applier
MEMBER_ID: 9d32e2f1-e96e-11e8-a6a0-0242ac110002
MEMBER_HOST: f0f94ac3a074
MEMBER_PORT: 3306
MEMBER_STATE: ONLINE
*************************** 3. row ***************************
CHANNEL_NAME: group_replication_applier
MEMBER_ID: 9f9b2e75-e96e-11e8-a7f4-0242ac110003
MEMBER_HOST: 0da2cce13664
MEMBER_PORT: 3306
MEMBER_STATE: ONLINE
3 rows in set (0.00 sec)
mysql> SELECT * FROM performance_schema.replication_connection_status \G
*************************** 1. row ***************************
CHANNEL_NAME: group_replication_applier
GROUP_NAME: aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa
SOURCE_UUID: aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa
THREAD_ID: NULL
SERVICE_STATE: ON
COUNT_RECEIVED_HEARTBEATS: 0
LAST_HEARTBEAT_TIMESTAMP: 0000-00-00 00:00:00
RECEIVED_TRANSACTION_SET: 7390fffa-e96e-11e8-a664-0242ac110001:1-3,
aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa:1-48
LAST_ERROR_NUMBER: 0
LAST_ERROR_MESSAGE:
LAST_ERROR_TIMESTAMP: 0000-00-00 00:00:00
mysql> SELECT * FROM performance_schema.replication_applier_status \G
*************************** 1. row ***************************
CHANNEL_NAME: group_replication_applier
SERVICE_STATE: ON
REMAINING_DELAY: NULL
COUNT_TRANSACTIONS_RETRIES: 0
1 row in set (0.00 sec)
#查看已经执行过的事务。
select @@global.gtid_executed;
遇见的问题:
1 . [GCS] There is no local IP address matching the one configured for the local node
需要配置虚拟网络,并且所有组中的节点需要使用这个网络。
docker network create groupnet
--net=groupnet
2. 如下错误:
2018-11-19T07:20:01.743346Z 0 [ERROR] Plugin group_replication reported: 'The member contains transactions not present in the group. The member will now exit the group.'
2018-11-19T07:20:01.743353Z 0 [Note] Plugin group_replication reported: 'To force this member into the group you can use the group_replication_allow_local_disjoint_gtids_join option'
2018-11-19T08:20:20.702632Z 197 [Note] Slave I/O thread for channel 'group_replication_recovery': connected to master 'repl@72057b10e109:3306',replication started in log 'FIRST' at position 4
2018-11-19T08:20:20.735420Z 198 [Note] Slave SQL thread for channel 'group_replication_recovery' initialized, starting replication in log 'FIRST' at position 0, relay log './replay-bin-group_replication_recovery.000001' position: 4
2018-11-19T08:20:21.047090Z 198 [ERROR] Slave SQL for channel 'group_replication_recovery': Error 'Can't drop database 'test'; database doesn't exist' on query. Default database: 'test'. Query: 'drop database test', Error_code: 1008
2018-11-19T08:20:21.047151Z 198 [Warning] Slave: Can't drop database 'test'; database doesn't exist Error_code: 1008
2018-11-19T08:20:21.047198Z 198 [ERROR] Error running query, slave SQL thread aborted. Fix the problem, and restart the slave SQL thread with "SLAVE START". We stopped at log 'binlog.000008' position 594.
2018-11-19T08:20:21.047227Z 195 [Note] Plugin group_replication reported: 'Terminating existing group replication donor connection and purging the corresponding logs.'
2018-11-19T08:20:21.167739Z 197 [Note] Slave I/O thread exiting for channel 'group_replication_recovery', read up to log 'binlog.000009', position 4
解决方法:
set global group_replication_allow_local_disjoint_gtids_join=on;
START GROUP_REPLICATION;
set global group_replication_allow_local_disjoint_gtids_join=off;
如果上述执行完还不正常,需要查看冲突的点。比如上述错误很明显是主库有删除数据库test的操作,但是从库并没有相应的数据库。只要在当前库中创建test库即可。步骤如下:
STOP GROUP_REPLICATION;
set global read_only=false;
SET SQL_LOG_BIN=0;
create database test;
SET SQL_LOG_BIN=1;
START GROUP_REPLICATION;
set global group_replication_allow_local_disjoint_gtids_join=off;
3. [ERROR] Plugin group_replication reported: 'Member was expelled from the group due to network failures, changing member status to ERROR.'
需要在网络错误的节点执行:STOP GROUP_REPLICATION; START GROUP_REPLICATION;
4. 2018-11-22T09:53:00.999736Z 3656 [ERROR] Slave SQL for channel 'group_replication_applier': Error 'FUNCTION GTID_COUNT already exists' on query. Default database: 'sys'. Query: 'CREATE DEFINER=`root`@`localhost` FUNCTION `GTID_COUNT`(gtid_set TEXT(10000)) RETURNS int(11)
DETERMINISTIC
BEGIN
DECLARE result BIGINT DEFAULT 0;
DECLARE colon_pos INT;
DECLARE next_dash_pos INT;
DECLARE next_colon_pos INT;
DECLARE next_comma_pos INT;
SET gtid_set = GTID_NORMALIZE(gtid_set);
SET colon_pos = LOCATE2(':', gtid_set, 1);
WHILE colon_pos != LENGTH(gtid_set) + 1 DO
SET next_dash_pos = LOCATE2('-', gtid_set, colon_pos + 1);
SET next_colon_pos = LOCATE2(':', gtid_set, colon_pos + 1);
SET next_comma_pos = LOCATE2(',', gtid_set, colon_pos + 1);
IF next_dash_pos < next_colon_pos AND next_dash_pos < next_comma_pos THEN
SET result = result +
SUBSTR(gtid_set, next_dash_pos + 1,
LEAST(next_colon_pos, next_comma_pos) - (next_dash_pos + 1)) -
SUBSTR(gtid_set, colon_pos + 1, next_dash_pos - (co
2018-11-22T09:53:00.999766Z 3656 [Warning] Slave: FUNCTION GTID_COUNT already exists Error_code: 1304
2018-11-22T09:53:00.999776Z 3656 [ERROR] Plugin group_replication reported: 'The applier thread execution was aborted. Unable to process more transactions, this member will now leave the group.'
2018-11-22T09:53:00.999823Z 3656 [ERROR] Error running query, slave SQL thread aborted. Fix the problem, and restart the slave SQL thread with "SLAVE START". We stopped at log 'FIRST' position 0.
2018-11-22T09:53:00.999858Z 3653 [ERROR] Plugin group_replication reported: 'Fatal error during execution on the Applier process of Group Replication. The server will now leave the group.'
2018-11-22T09:53:00.999897Z 3653 [ERROR] Plugin group_replication reported: '[GCS] The member is already leaving or joining a group.'
2018-11-22T09:53:00.999912Z 3653 [ERROR] Plugin group_replication reported: 'Unable to confirm whether the server has left the group or not. Check performance_schema.replication_group_members to check group membership information.'
2018-11-22T09:53:00.999998Z 3653 [Note] Plugin group_replication reported: 'Going to wait for view modification'
2018-11-22T09:53:01.002868Z 0 [Note] Plugin group_replication reported: 'XCom protocol version: 3'
2018-11-22T09:53:01.002898Z 0 [Note] Plugin group_replication reported: 'XCom initialized and ready to accept incoming connections on port 33100'
2018-11-22T09:53:03.245164Z 0 [ERROR] Plugin group_replication reported: 'There was a previous plugin error while the member joined the group. The member will now exit the group.'
mysql> set global sql_slave_skip_counter=1;
ERROR 1858 (HY000): sql_slave_skip_counter can not be set when the server is running with @@GLOBAL.GTID_MODE = ON. Instead, for each transaction that you want to skip, generate an empty transaction with the same GTID as the transaction
出现如上错误时,需要执行:
SELECT * FROM performance_schema.replication_group_members \G
获取当前的写节点是哪台主机。假设该主机是localhost:3320。则需要访问3320的binlog,查看错误sql发生的gtid。
假如发生错误时的 gtid为:1bb1b861-f776-11e6-be42-782bcb377193:14,
则只要按照如下操作即可:
#查看当前执行过的gtid
SELECT @@global.gtid_executed;
SET GTID_NEXT='1bb1b861-f776-11e6-be42-782bcb377193:14';
BEGIN;
COMMIT;
SET GTID_NEXT='AUTOMATIC';
#验证当前已经执行过的gtid
SELECT @@global.gtid_executed;
这之后再开启组复制: START GROUP_REPLICATION; 查看是否有错误发生。如果还有类似记录已存在的错误,再继续按照上述方法跳过事务即可。
5. 2018-11-26T08:43:19.536306Z 0 [ERROR] Plugin group_replication reported: 'This member has more executed transactions than those present in the group. Local transactions: 2928f39d-f14f-11e8-acb4-005056a1794a:1-11 > Group transactions: 46c82b76-f14c-11e8-8bef-005056a17264:1-2,
f65d4372-f145-11e8-94e9-005056a17264:1-46,
fb3cb695-f152-11e8-83b1-005056a17264:1-15'
2018-11-26T08:43:19.536358Z 0 [ERROR] Plugin group_replication reported: 'The member contains transactions not present in the group. The member will now exit the group.'
执行reset master即可。
参考:
1. mgr相关参数:https://mysqlhighavailability.com/getting-started-with-mysql-group-replication/
2. oracle官方文档:https://dev.mysql.com/doc/refman/5.7/en/group-replication-adding-instances.html
3. docker测试mgr: https://mysqlhighavailability.com/setting-up-mysql-group-replication-with-mysql-docker-images/
4. 添加实例入组时遇到的问题: https://ronniethedba.wordpress.com/2017/04/22/this-member-has-more-executed-transactions-than-those-present-in-the-group/
5. MGR常见问题: http://drmingdrmer.github.io/tech/mysql/2018/08/04/mysql-group-replication.html
6. MGR测试用例:
https://www.cnblogs.com/paul8339/p/9667701.html
https://blog.csdn.net/Mlztesoft/article/details/79927425
7.mgr原理和配置: https://blog.csdn.net/poxiaonie/article/details/73505948