OpenStack Trove 主从切换(promote slave to master)的源代码分析:
def promote_to_replica_source(self, context, instance_id):
    """Promote the replica ``instance_id`` to be the new replication source.

    Loads the promotion candidate, its current source (looked up through
    ``slave_of_id``) and every replica of that source, then performs the
    switchover inside an ``EndNotification`` context.

    :param context: request context handed to ``BuiltInstanceTasks.load``
        and ``EndNotification``.
    :param instance_id: id of the replica that should become the source.
    :raises ReplicationSlaveAttachError: if at least one replica could not
        be moved to the new source, or the old source could not be demoted.
        Those instances are set to ``InstanceTasks.PROMOTION_ERROR``.
    :raises Exception: any other failure is re-raised after the old source
        and all replicas are set to ``InstanceTasks.PROMOTION_ERROR``.
    """
    # TODO(atomic77) Promote and eject need to be able to handle the case
    # where a datastore like Postgresql needs to treat the slave to be
    # promoted differently from the old master and the slaves which will
    # be simply reassigned to a new master. See:
    # https://bugs.launchpad.net/trove/+bug/1553339

    def _promote_to_replica_source(old_master, master_candidate,
                                   replica_models):
        """Run the ordered switchover from old_master to master_candidate.

        Failures while moving an individual replica (``TroveError``) or
        while demoting the old master are collected rather than raised
        immediately; they are reported together at the end through a
        single ``ReplicationSlaveAttachError``.
        """
        # First, we transition from the old master to new as quickly as
        # possible to minimize the scope of unrecoverable error

        # NOTE(zhaochao): we cannot reattach the old master to the new
        # one immediately after the new master is up, because for MariaDB
        # the other replicas are still connecting to the old master, and
        # during reattaching the old master as a slave, new GTID may be
        # created and synced to the replicas. After that, when attaching
        # the replicas to the new master, 'START SLAVE' will fail by
        # 'fatal error 1236' if the binlog of the replica diverged from
        # the new master. So the proper order should be:
        # -1. make the old master read only (and detach floating ips)
        # -2. make sure the new master is up-to-date
        # -3. detach the new master from the old one
        # -4. enable the new master (and attach floating ips)
        # -5. attach the other replicas to the new master
        # -6. attach the old master to the new one
        #     (and attach floating ips)
        # -7. demote the old master
        # What we changed here is the order of the 6th step, previously
        # this step took place right after step 4, which causes failures
        # with MariaDB replications.
        old_master.make_read_only(True)
        latest_txn_id = old_master.get_latest_txn_id()
        master_candidate.wait_for_txn(latest_txn_id)
        master_candidate.detach_replica(old_master, for_failover=True)
        master_candidate.enable_as_master()
        master_candidate.make_read_only(False)

        # At this point, should something go wrong, there
        # should be a working master with some number of working slaves,
        # and possibly some number of "orphaned" slaves

        exception_replicas = []
        error_messages = ""
        for replica in replica_models:
            try:
                # The candidate itself is part of replica_models; it has
                # already been detached above and must not be re-attached.
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError as ex:
                log_fmt = ("Unable to migrate replica %(slave)s from "
                           "old replica source %(old_master)s to "
                           "new source %(new_master)s on promote.")
                exc_fmt = _("Unable to migrate replica %(slave)s from "
                            "old replica source %(old_master)s to "
                            "new source %(new_master)s on promote.")
                msg_content = {
                    "slave": replica.id,
                    "old_master": old_master.id,
                    "new_master": master_candidate.id}
                LOG.error(log_fmt, msg_content)
                exception_replicas.append(replica)
                error_messages += "%s (%s)\n" % (
                    exc_fmt % msg_content, ex)

        # dealing with the old master after all the other replicas
        # have been migrated.
        # NOTE: this attach is not guarded; an exception here propagates
        # to the caller's generic ``except Exception`` handler.
        old_master.attach_replica(master_candidate)
        try:
            old_master.demote_replication_master()
        except Exception as ex:
            log_fmt = "Exception demoting old replica source %s."
            exc_fmt = _("Exception demoting old replica source %s.")
            LOG.error(log_fmt, old_master.id)
            exception_replicas.append(old_master)
            error_messages += "%s (%s)\n" % (
                exc_fmt % old_master.id, ex)

        # Clear the task status on everything first, then flag only the
        # instances that actually failed.
        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            msg = (_("promote-to-replica-source %(id)s: The following "
                     "replicas may not have been switched: %(replicas)s:"
                     "\n%(err)s") %
                   {"id": master_candidate.id,
                    "replicas": [repl.id for repl in exception_replicas],
                    "err": error_messages})
            raise ReplicationSlaveAttachError(msg)

        LOG.info('Finished to promote %s as master.', instance_id)

    with EndNotification(context):
        LOG.info('Promoting %s as replication master', instance_id)

        master_candidate = BuiltInstanceTasks.load(context, instance_id)
        old_master = BuiltInstanceTasks.load(context,
                                             master_candidate.slave_of_id)

        # Build the list of replica task objects, reusing the already
        # loaded candidate instead of loading it a second time.
        replicas = []
        for replica_dbinfo in old_master.slaves:
            if replica_dbinfo.id == instance_id:
                replica = master_candidate
            else:
                replica = BuiltInstanceTasks.load(context,
                                                  replica_dbinfo.id)
            replicas.append(replica)

        try:
            _promote_to_replica_source(old_master, master_candidate,
                                       replicas)
        except ReplicationSlaveAttachError:
            # Task statuses were already set by the helper.
            raise
        except Exception:
            self._set_task_status([old_master] + replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            raise
切换流程如下:
主从切换适合:一主多从的结构,而且绑定了floating_ip的情况。
- 根据要切换的slave_id获取实例,作为新的master;
- 将旧master的所有slave(包括即将成为新master的这个slave)添加到一个replicas列表;
- 根据旧的master_id获取旧的master实例。
以下开始进入正式切换步骤:_promote_to_replica_source(old_master, master_candidate,replicas)
注意: 以下说的公网IP是指floating ip。
- 设置旧的master实例为只读
- 旧master 实例detach公网ip
- 新的master实例detach公网ip
- 获取旧master实例的同步位置
- 等待新的master实例同步到相同位置
- 解除新旧master实例的主从关系
- 新的master实例启用为master实例。
- 新的master实例attach公网IP
- 新的master设置read_only为False,例如mysql: set global read_only = False
- 遍历replicas列表中的slave实例,如果slave实例的id不等于新的master实例的id:
  - 解除该slave与旧的master的主从关系
  - attach该slave到新的master实例上
- 旧的master实例和新的master实例建立主从关系(旧的master作为新的master的slave)
- 旧的master实例添加公网ip