本章主要介绍mon选举的过程。
选举过程
整个选举过程,在Elector类中实现。此类之中实现了一个election_epoch:
当这个election_epoch为偶数的时候,表示处于稳定状态,为奇数的时候,表示还在选举过程中,mon leader 还未定。
以下是自己环境的mon情况,当前 "election_epoch": 120,说明目前处于稳定状态,没有在选举。
[root@ceph1 ~]# ceph quorum_status | json_reformat
{
"election_epoch": 120,
"quorum": [
0,
1,
2
],
"quorum_names": [
"ceph2",
"ceph1",
"ceph3"
],
"quorum_leader_name": "ceph2",
"monmap": {
"epoch": 1,
"fsid": "5cd6b9b7-2ebd-47b1-a1d2-7362bb077fa3",
"modified": "2020-08-04 19:55:21.613792",
"created": "2020-08-04 19:55:21.613792",
"features": {
"persistent": [
"kraken",
"luminous"
],
"optional": [
]
},
"mons": [
{
"rank": 0,
"name": "ceph2",
"addr": "192.168.111.14:6789/0",
"public_addr": "192.168.111.14:6789/0"
},
{
"rank": 1,
"name": "ceph1",
"addr": "192.168.111.20:6789/0",
"public_addr": "192.168.111.20:6789/0"
},
{
"rank": 2,
"name": "ceph3",
"addr": "192.168.111.34:6789/0",
"public_addr": "192.168.111.34:6789/0"
}
]
}
}
[root@ceph1 ~]#
选举过程关键变量
a.acked_me
/**
* Set containing all those that acked our proposal to become the Leader.
*
* If we are acked by everyone in the MonMap, we will declare
* victory. Also note each peer's feature set.
*/
//发起选举本端,经过选举,其他所有mon 返回ack消息,承认选自己为leader
map acked_me;
b.leader_acked
/**
* Indicates who we have acked
*/
本端已经acked,承认别人是leader
int leader_acked;
/*
默认值-1,代表从未承认别人是leader。
如果defer 了mon.1,那么leader_acked=1.
Elector::defer(int who)函数中进行赋值
*/
c.electing_me
/**
* Indicates if we are the ones being elected.
*
* We always attempt to be the one being elected if we are the ones starting
* the election. If we are not the ones that started it, we will only attempt
* to be elected if we think we might have a chance (i.e., the other guy's
* rank is lower than ours).
*/
是否选取本端作为leader,选举发起端一发起,该值就置位true
bool electing_me;
d.epoch
/**
* Latest epoch we've seen.
*
* @remarks if its value is odd, we're electing; if it's even, then we're
* stable.
*/
//选举的版本,奇数为选举状态,偶数为稳定状态
epoch_t epoch;
以下开始介绍选举过程:
选举整体流程图
图一是mon选举的主体流程
图二是mon选举的消息调度模块
阶段1:mon端发起选举
注意,这里leader端和peon端都可以发起选举。
void Monitor::start_election()
{
dout(10) << "start_election" << dendl;
wait_for_paxos_write();
_reset();
state = STATE_ELECTING;//设置状态
logger->inc(l_mon_num_elections);
logger->inc(l_mon_election_call);
clog->info() << "mon." << name << " calling monitor election";
elector.call_election();
}
void call_election() {
start();
}
void Elector::start()
{
dout(5) << "start -- can i be leader?" << dendl;
acked_me.clear();
init();
// start by trying to elect me /*从稳定态进入选举态,需要将版本号从偶数往上抬,抬成奇数*/
if (epoch % 2 == 0) {
bump_epoch(epoch+1); // odd == election cycle
} else {
// do a trivial db write just to ensure it is writeable.
auto t(std::make_shared());
t->put(Monitor::MONITOR_NAME, "election_writeable_test", rand());
int r = mon->store->apply_transaction(t);
assert(r >= 0);
}
start_stamp = ceph_clock_now();
electing_me = true;
acked_me[mon->rank].cluster_features = CEPH_FEATURES_ALL;
acked_me[mon->rank].mon_features = ceph::features::mon::get_supported();
mon->collect_metadata(&acked_me[mon->rank].metadata);
leader_acked = -1;
// bcast to everyone else /*向每一个成员广播消息,提议开始重新选举*/
for (unsigned i=0; imonmap->size(); ++i) {
if ((int)i == mon->rank) continue;
MMonElection *m =
new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);// OP_PROPOSE
m->mon_features = ceph::features::mon::get_supported();
mon->messenger->send_message(m, mon->monmap->get_inst(i)); //发送选举信号
}
/*
*注意,如果所有的人都承认自己leader地位,那么可以宣布获胜。但是有些情况下,无法等到所有的回应。
*比如某个ceph-mon进程已经不在了,是不可能得到其承认的。为了防止出现这种情况下,在通知其他节点
*选自己的start函数设置了定时器“
*/
reset_timer();
}
当选举超时后,会进入expire函数。
void Elector::expire()
{
dout(5) << "election timer expired" << dendl;
// did i win? 从这里可以看到只要有超过一半的monitor 回复,仍然认为当前节点获胜为leader节点,否则就调用mon->bootstrap() 重新开始选举
if (electing_me &&
acked_me.size() > (unsigned)(mon->monmap->size() / 2)) {
// i win
victory();
} else {
// whoever i deferred to didn't declare victory quickly enough.
if (mon->has_ever_joined)
start();
else
mon->bootstrap();
}
}
阶段2:选举消息调度
消息发送后,会根据不同的消息类型,经过以下流程进行处理。注意这里一次选举过程,可能要经过N次以下处理流程。
bool ms_dispatch(Message *m) ->Monitor::_ms_dispatch(Message *m)-> Monitor::dispatch_op -> Elector::dispatch(MonOpRequestRef op) ->handle_propose| handle_ack|handle_victory|handle_nak
void Elector::dispatch(MonOpRequestRef op)
{
//选举的地址如果是非monmap中的地址,那么就退出
if (!mon->monmap->contains(em->get_source_addr())) {
dout(1) << "discarding election message: " << em->get_source_addr()
<< " not in my monmap " << *mon->monmap << dendl;
return;
}
...
switch (em->op) {
case MMonElection::OP_PROPOSE://发起选举提议消息
handle_propose(op);
return;
}
if (em->epoch < epoch) {
dout(5) << "old epoch, dropping" << dendl;
break;
}
switch (em->op) {
case MMonElection::OP_ACK://返回ack消息到发起端
handle_ack(op);
return;
case MMonElection::OP_VICTORY://选举成功,leader端发起消息
handle_victory(op);//peon端进行处理
return;
case MMonElection::OP_NAK:
handle_nak(op);
return;
}
阶段3:非发起端处理
//处理发起propose的mon端发过来的消息。
void Elector::handle_propose(MonOpRequestRef op)
{
op->mark_event("elector:handle_propose");
MMonElection *m = static_cast(op->get_req());
dout(5) << "handle_propose from " << m->get_source() << dendl;
int from = m->get_source().num();
assert(m->epoch % 2 == 1); // election
uint64_t required_features = mon->get_required_features();
mon_feature_t required_mon_features = mon->get_required_mon_features();
dout(10) << __func__ << " required features " << required_features
<< " " << required_mon_features
<< ", peer features " << m->get_connection()->get_features()
<< " " << m->mon_features
<< dendl;
if ((required_features ^ m->get_connection()->get_features()) &
required_features) {
dout(5) << " ignoring propose from mon" << from
<< " without required features" << dendl;
nak_old_peer(op);
return;
} else if (!m->mon_features.contains_all(required_mon_features)) {
// all the features in 'required_mon_features' not in 'm->mon_features'
mon_feature_t missing = required_mon_features.diff(m->mon_features);
dout(5) << " ignoring propose from mon." << from
<< " without required mon_features " << missing
<< dendl;
nak_old_peer(op);
} else if (m->epoch > epoch) {
bump_epoch(m->epoch);//正常选举流程
} else if (m->epoch < epoch) {//说明是较老的propose
// got an "old" propose,
if (epoch % 2 == 0 && // in a non-election cycle
mon->quorum.count(from) == 0) { // from someone outside the quorum
// a mon just started up, call a new election so they can rejoin!
dout(5) << " got propose from old epoch, quorum is " << mon->quorum
<< ", " << m->get_source() << " must have just started" << dendl;
// we may be active; make sure we reset things in the monitor appropriately.
mon->start_election();
} else {
dout(5) << " ignoring old propose" << dendl;
return;
}
}
if (mon->rank < from) {//如果从未收到过更强者(rank更小者)发来的选举请求,调用start_election,给所有成员发消息,让他们选自己为mon leader
// i would win over them.
if (leader_acked >= 0) { // we already acked someone 自己曾经认过怂,消息来源的mon还不如自己,直接不理
assert(leader_acked < from); // and they still win, of course
dout(5) << "no, we already acked " << leader_acked << dendl;
} else {
// wait, i should win!
/*注意,electing_me记录了自己是否发出过选我为leader的请求
*如果先后收到两个弱小者发来的选举请求,处理第一个的时候,本节点已经发出了选自己当leader的请求,
*当第二个弱者消息到来的时候,没必要再发送选自己当leader的请求*/
if (!electing_me) {
mon->start_election();
}
}
} else {//赞同其他人做leader
// they would win over me
if (leader_acked < 0 || // haven't acked anyone yet, or /*从未承认过别人,从未认过怂*/
leader_acked > from || // they would win over who you did ack, or /*虽然承认过别人,认过怂,无奈这次来的更强大,所以还是得认怂,承认它*/
leader_acked == from) { // this is the guy we're already deferring to
defer(from); /*defer函数的作用是认可对方可以当leader*/
} else {//leader_acked < from
// ignore them! /*曾经认可过更强者,不可能向不够强的mon发送认可,不理*/
dout(5) << "no, we already acked " << leader_acked << dendl;
}
}
}
void Elector::bump_epoch(epoch_t e)
{
dout(10) << "bump_epoch " << epoch << " to " << e << dendl;
assert(epoch <= e);
epoch = e;//epoch 奇数
auto t(std::make_shared());
t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
mon->store->apply_transaction(t);
mon->join_election();
// clear up some state
electing_me = false;//重置选举状态
acked_me.clear();//重置选举状态
}
上述handle_propose函数是判断选举条件的核心。已经注释了关键的信息。
大家可能会发现这么个现象,mon发起选举,有时所有mon中都会有选举自己的信息(见下),有时候部分日志中有,部分日志中没有。这个其实和发起选举的序号大小有关。
mon.ceph2 calling monitor election
- 本端mon序号 < mon发起端,才有可能选举自己;
- 本端mon序号 > mon发起端,不可能选举自己;
if (mon->rank < from) {
// i would win over them.
if (leader_acked >= 0) {//说明认怂过,此时也不可能选举自己
assert(leader_acked < from); // and they still win, of course
dout(5) << "no, we already acked " << leader_acked << dendl;
} else {
// wait, i should win!
if (!electing_me) {
mon->start_election();//本端发起选举
}
}
}
阶段3:非发起端向发起端认怂
此阶段,关键处理函数为defer(),该函数可能被调用多次。如果本端mon defer过序号更小的mon,当再来次小的mon,就不再调用defer() 了。
if (mon->rank < from) {
...
} else {//赞同其他人做leader
// they would win over me
if (leader_acked < 0 || // haven't acked anyone yet, or /*从未承认过别人,从未认过怂*/
leader_acked > from || // they would win over who you did ack, or /*虽然承认过别人,认过怂,无奈这次来的更强大,所以还是得认怂,承认它*/
leader_acked == from) { // this is the guy we're already deferring to
defer(from); /*defer函数的作用是认可对方可以当leader*/
} else {//leader_acked < from
// ignore them! /*曾经认可过更强者,不可能向不够强的mon发送认可,不理*/
dout(5) << "no, we already acked " << leader_acked << dendl;
}
void Elector::defer(int who)
{
dout(5) << "defer to " << who << dendl;
if (electing_me) { /*注意,认怂就要清零,即使自己曾经要求别人选过自己*/
// drop out
acked_me.clear();
electing_me = false;
}
// ack them
leader_acked = who;
ack_stamp = ceph_clock_now();
MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);// OP_ACK /*发送OP_ACK承认对方可以当leader*/
m->mon_features = ceph::features::mon::get_supported();
mon->collect_metadata(&m->metadata);
// This field is unused completely in luminous, but jewel uses it to
// determine whether we are a dumpling mon due to some crufty old
// code. It only needs to see this buffer non-empty, so put
// something useless there.
m->sharing_bl = mon->get_local_commands_bl(mon->get_required_mon_features());
mon->messenger->send_message(m, mon->monmap->get_inst(who));//发送消息
// set a timer
reset_timer(1.0); // give the leader some extra time to declare victory
}
阶段4:发起端处理非发起端的ack消息
基于阶段3,如果非发起端认怂后,会发送OP_ACK消息,那么发起端要进行对应消息的处理。主要处理函数为 Elector::handle_ack(),一般发生在leader端(临时优先级更高的mon端)。
void Elector::handle_ack(MonOpRequestRef op)
{
op->mark_event("elector:handle_ack");
MMonElection *m = static_cast(op->get_req());
dout(5) << "handle_ack from " << m->get_source() << dendl;
int from = m->get_source().num();
assert(m->epoch % 2 == 1); // election
assert(m->epoch == epoch);
if (electing_me) {
// thanks
acked_me[from].cluster_features = m->get_connection()->get_features();
acked_me[from].mon_features = m->mon_features;
acked_me[from].metadata = m->metadata;
dout(5) << " so far i have {";
for (map::const_iterator p = acked_me.begin();
p != acked_me.end();
++p) {
if (p != acked_me.begin())
*_dout << ",";
*_dout << " mon." << p->first << ":"
<< " features " << p->second.cluster_features
<< " " << p->second.mon_features;
}
*_dout << " }" << dendl;
// is that _everyone_? /*如果所有成员都承认了自己的leader地位,那么宣布获胜,调用victory*/
/*
*注意,如果所有的人都承认自己leader地位,那么可以宣布获胜。但是有些情况下,无法等到所有的回应。
*比如某个ceph-mon进程已经不在了,是不可能得到其承认的。为了防止出现这种情况下,在通知其他节点
*选自己的start函数设置了定时器“
*/
if (acked_me.size() == mon->monmap->size()) {//如果所有的mon端都进行了ack,说明都defer了自己,自己成为了leader
// if yes, shortcut to election finish
victory();
}
} else {
// ignore, i'm deferring already.
assert(leader_acked >= 0);
}
}
当发起端接收到所有mon的ack信息后,开始调用victory()函数,说明本端成为了leader。继续分析
void Elector::victory()
{
leader_acked = -1;//已经选举成功了,那么把相关标记信息复位
electing_me = false;//同上
cancel_timer();
assert(epoch % 2 == 1); // election
bump_epoch(epoch+1); // is over! 这里epoch变为偶数,稳定状态。
// tell everyone!
for (set::iterator p = quorum.begin();
p != quorum.end();
++p) {
if (*p == mon->rank) continue;
MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch,/*注意这里会发送OP_VICTORY 消息来帮自己设置为leader节点*/
mon->monmap);
m->quorum = quorum;
m->quorum_features = cluster_features;
m->mon_features = mon_features;
m->sharing_bl = mon->get_local_commands_bl(mon_features);
mon->messenger->send_message(m, mon->monmap->get_inst(*p));
}
// tell monitor
mon->win_election(epoch, quorum,
cluster_features, mon_features, metadata);
}
void Monitor::win_election(epoch_t epoch, set& active, uint64_t features,
const mon_feature_t& mon_features,
const map& metadata)
{
assert(is_electing());
state = STATE_LEADER;// 修改选举状态
clog->info() << "mon." << name << " is new leader, mons " << get_quorum_names()
<< " in quorum (ranks " << quorum << ")";
}
paxos->leader_init();//函数里paoxs状态变为, state=STATE_RECOVERING,开始进行数据恢复操作
monmon()->election_finished();
_finish_svc_election();
health_monitor->start(epoch);
阶段5:选举失败端(peon)处理OP_VICTORY消息
//OP_VICTORY消息对应的处理函数
void Elector::handle_victory(MonOpRequestRef op)
{
op->mark_event("elector:handle_victory");
MMonElection *m = static_cast(op->get_req());
dout(5) << "handle_victory from " << m->get_source()
<< " quorum_features " << m->quorum_features
<< " " << m->mon_features
<< dendl;
int from = m->get_source().num();
assert(from < mon->rank);// 已经选举成功了,远端的rank自然是小的
assert(m->epoch % 2 == 0); //victory 函数中已经对epoch+1 了
leader_acked = -1;
bump_epoch(m->epoch);//设置本端的epoch以及重置选举状态。此时非leader端mon也进入稳定状态
// they win 输掉选举的话,把自己设置为peon节点
mon->lose_election(epoch, m->quorum, from,
m->quorum_features, m->mon_features);
// cancel my timer
cancel_timer();
// stash leader's commands
assert(m->sharing_bl.length());
vector new_cmds;
bufferlist::iterator bi = m->sharing_bl.begin();
MonCommand::decode_vector(new_cmds, bi);
mon->set_leader_commands(new_cmds);
}
void Monitor::lose_election(epoch_t epoch, set &q, int l,
uint64_t features,
const mon_feature_t& mon_features)
{
dout(10) << "lose_election, epoch " << epoch << " leader is mon" << leader
<< " quorum is " << quorum << " features are " << quorum_con_features
<< " mon_features are " << quorum_mon_features
<< dendl;
state = STATE_PEON;//设置peon状态
paxos->peon_init();
_finish_svc_election();
health_monitor->start(epoch);
}
void Paxos::peon_init()
{
cancel_events();
new_value.clear();
state = STATE_RECOVERING;// peon 端的paxos也进入STATE_RECOVERING状态
}
选举实战
选举前mon dump如下,下边会分别进行三次mon的重启,触发选举,分别是mon.ceph2重启、mon.ceph1重启、mon.ceph3重启
[root@ceph1 ~]# ceph mon dump
dumped monmap epoch 1
epoch 1
fsid 5cd6b9b7-2ebd-47b1-a1d2-7362bb077fa3
last_changed 2020-08-04 19:55:21.613792
created 2020-08-04 19:55:21.613792
0: 192.168.111.14:6789/0 mon.ceph2
1: 192.168.111.20:6789/0 mon.ceph1
2: 192.168.111.34:6789/0 mon.ceph3
1.第一次重启mon.ceph2
----restart ceph2
node-1
2020-08-11 15:50:28.341174 7f2b02d26700 5 mon.ceph1@1(peon).elector(178) handle_propose from mon.0
2020-08-11 15:50:28.361621 7f2b02d26700 5 mon.ceph1@1(electing).elector(179) defer to 0
2020-08-11 15:50:28.440163 7f2b02d26700 5 mon.ceph1@1(electing).elector(179) handle_victory from mon.0 quorum_features 4611087853746454523 mon_feature_t([kraken,luminous])
node-2
2020-08-11 15:50:28.322994 7f611fff5700 0 log_channel(cluster) log [INF] : mon.ceph2 calling monitor election
2020-08-11 15:50:28.323030 7f611fff5700 5 mon.ceph2@0(electing).elector(0) start -- can i be leader?
2020-08-11 15:50:28.323052 7f611fff5700 1 mon.ceph2@0(electing).elector(178) init, last seen epoch 178
2020-08-11 15:50:28.323746 7f611cfef700 0 -- 192.168.111.14:6789/0 >> 192.168.111.20:6789/0 conn(0x7f6134106800 :-1 s=STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH pgs=400 cs=1 l=0).process missed message? skipped from seq 0 to 1350655075
2020-08-11 15:50:28.340142 7f611fff5700 5 mon.ceph2@0(electing) e1 _ms_dispatch setting monitor caps on this connection
2020-08-11 15:50:28.340178 7f611fff5700 5 mon.ceph2@0(electing) e1 _ms_dispatch setting monitor caps on this connection
2020-08-11 15:50:28.357795 7f611fff5700 5 mon.ceph2@0(electing).elector(179) handle_ack from mon.2
2020-08-11 15:50:28.357802 7f611fff5700 5 mon.ceph2@0(electing).elector(179) so far i have { mon.0: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.2: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
2020-08-11 15:50:28.362990 7f611fff5700 5 mon.ceph2@0(electing).elector(179) handle_ack from mon.1
2020-08-11 15:50:28.362999 7f611fff5700 5 mon.ceph2@0(electing).elector(179) so far i have { mon.0: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.1: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.2: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
2020-08-11 15:50:28.438989 7f611fff5700 0 log_channel(cluster) log [INF] : mon.ceph2 is new leader, mons ceph2,ceph1,ceph3 in quorum (ranks 0,1,2)
node-3
2020-08-11 15:50:28.341453 7f4c2a432700 5 mon.ceph3@2(peon).elector(178) handle_propose from mon.0
2020-08-11 15:50:28.356866 7f4c2a432700 5 mon.ceph3@2(electing).elector(179) defer to 0
2020-08-11 15:50:28.440894 7f4c2a432700 5 mon.ceph3@2(electing).elector(179) handle_victory from mon.0 quorum_features 4611087853746454523 mon_feature_t([kraken,luminous])
2.第二次重启mon.ceph3
----restart ceph3
node-1
2020-08-11 16:01:21.896774 7f2b02d26700 5 mon.ceph1@1(peon).elector(180) handle_propose from mon.2
2020-08-11 16:01:21.914156 7f2b02d26700 0 log_channel(cluster) log [INF] : mon.ceph1 calling monitor election
2020-08-11 16:01:21.914264 7f2b02d26700 5 mon.ceph1@1(electing).elector(181) start -- can i be leader?
2020-08-11 16:01:21.914305 7f2b02d26700 1 mon.ceph1@1(electing).elector(181) init, last seen epoch 181, mid-election, bumping
2020-08-11 16:01:21.950028 7f2b02d26700 5 mon.ceph1@1(electing).elector(183) handle_propose from mon.0
2020-08-11 16:01:21.950037 7f2b02d26700 5 mon.ceph1@1(electing).elector(183) defer to 0
2020-08-11 16:01:21.976960 7f2b02d26700 5 mon.ceph1@1(electing).elector(183) handle_victory from mon.0 quorum_features 4611087853746454523 mon_feature_t([kraken,luminous])
node-2
2020-08-11 16:01:21.893859 7f611fff5700 5 mon.ceph2@0(leader).elector(180) handle_propose from mon.2
2020-08-11 16:01:21.899238 7f611fff5700 2 mon.ceph2@0(leader) e1 send_reply 0x7f61350ef200 0x7f613459de00 log(last 294977) v1
2020-08-11 16:01:21.899307 7f611fff5700 0 log_channel(cluster) log [INF] : mon.ceph2 calling monitor election
2020-08-11 16:01:21.899355 7f611fff5700 5 mon.ceph2@0(electing).elector(181) start -- can i be leader?
2020-08-11 16:01:21.899374 7f611fff5700 1 mon.ceph2@0(electing).elector(181) init, last seen epoch 181, mid-election, bumping
2020-08-11 16:01:21.945892 7f611fff5700 5 mon.ceph2@0(electing).elector(183) handle_ack from mon.2
2020-08-11 16:01:21.945898 7f611fff5700 5 mon.ceph2@0(electing).elector(183) so far i have { mon.0: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.2: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
2020-08-11 16:01:21.949932 7f611fff5700 5 mon.ceph2@0(electing).elector(183) handle_propose from mon.1
2020-08-11 16:01:21.951660 7f611fff5700 5 mon.ceph2@0(electing).elector(183) handle_ack from mon.1
2020-08-11 16:01:21.951664 7f611fff5700 5 mon.ceph2@0(electing).elector(183) so far i have { mon.0: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.1: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.2: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
2020-08-11 16:01:21.975195 7f611fff5700 0 log_channel(cluster) log [INF] : mon.ceph2 is new leader, mons ceph2,ceph1,ceph3 in quorum (ranks 0,1,2)
node-3
2020-08-11 16:01:21.882833 7f710765d700 0 log_channel(cluster) log [INF] : mon.ceph3 calling monitor election
2020-08-11 16:01:21.882883 7f710765d700 5 mon.ceph3@2(electing).elector(0) start -- can i be leader?
2020-08-11 16:01:21.882911 7f710765d700 1 mon.ceph3@2(electing).elector(180) init, last seen epoch 180
2020-08-11 16:01:21.894141 7f710765d700 5 mon.ceph3@2(electing) e1 _ms_dispatch setting monitor caps on this connection
2020-08-11 16:01:21.897180 7f7103e56700 0 -- 192.168.111.34:6789/0 >> 192.168.111.20:6789/0 conn(0x7f711b1ba000 :-1 s=STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH pgs=407 cs=1 l=0).process missed message? skipped from seq 0 to 1841887457
2020-08-11 16:01:21.897262 7f710765d700 5 mon.ceph3@2(electing) e1 _ms_dispatch setting monitor caps on this connection
2020-08-11 16:01:21.913331 7f710765d700 5 mon.ceph3@2(electing).elector(181) handle_propose from mon.0
2020-08-11 16:01:21.944842 7f710765d700 5 mon.ceph3@2(electing).elector(183) defer to 0
2020-08-11 16:01:21.950680 7f710765d700 5 mon.ceph3@2(electing).elector(183) handle_propose from mon.1
2020-08-11 16:01:21.950691 7f710765d700 5 mon.ceph3@2(electing).elector(183) no, we already acked 0
2020-08-11 16:01:21.976848 7f710765d700 5 mon.ceph3@2(electing).elector(183) handle_victory from mon.0 quorum_features 4611087853746454523 mon_feature_t([kraken,luminous])
3.第三次重启mon.ceph1
---restart ceph1
node-1
2020-08-11 16:11:09.012046 7fd3ff7b4700 0 log_channel(cluster) log [INF] : mon.ceph1 calling monitor election
2020-08-11 16:11:09.012091 7fd3ff7b4700 5 mon.ceph1@1(electing).elector(0) start -- can i be leader?
2020-08-11 16:11:09.012118 7fd3ff7b4700 1 mon.ceph1@1(electing).elector(184) init, last seen epoch 184
2020-08-11 16:11:09.013813 7fd3fbfad700 0 -- 192.168.111.20:6789/0 >> 192.168.111.34:6789/0 conn(0x7fd413340800 :-1 s=STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH pgs=242 cs=1 l=0).process missed message? skipped from seq 0 to 1113628032
2020-08-11 16:11:09.041742 7fd3ff7b4700 5 mon.ceph1@1(electing) e1 _ms_dispatch setting monitor caps on this connection
2020-08-11 16:11:09.041785 7fd3ff7b4700 5 mon.ceph1@1(electing) e1 _ms_dispatch setting monitor caps on this connection
2020-08-11 16:11:09.054821 7fd3ff7b4700 5 mon.ceph1@1(electing).elector(185) handle_ack from mon.2
2020-08-11 16:11:09.054834 7fd3ff7b4700 5 mon.ceph1@1(electing).elector(185) so far i have { mon.1: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.2: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
2020-08-11 16:11:09.152191 7fd3ff7b4700 5 mon.ceph1@1(electing).elector(185) handle_propose from mon.0
2020-08-11 16:11:09.172100 7fd3ff7b4700 5 mon.ceph1@1(electing).elector(187) defer to 0
2020-08-11 16:11:09.188490 7fd3ff7b4700 5 mon.ceph1@1(electing).elector(187) handle_victory from mon.0 quorum_features 4611087853746454523 mon_feature_t([kraken,luminous])
node-2
2020-08-11 16:11:09.041814 7f611fff5700 5 mon.ceph2@0(leader).elector(184) handle_propose from mon.1
2020-08-11 16:11:09.091197 7f611fff5700 0 log_channel(cluster) log [INF] : mon.ceph2 calling monitor election
2020-08-11 16:11:09.091296 7f611fff5700 5 mon.ceph2@0(electing).elector(185) start -- can i be leader?
2020-08-11 16:11:09.091317 7f611fff5700 1 mon.ceph2@0(electing).elector(185) init, last seen epoch 185, mid-election, bumping
2020-08-11 16:11:09.161185 7f611fff5700 5 mon.ceph2@0(electing).elector(187) handle_ack from mon.2
2020-08-11 16:11:09.161198 7f611fff5700 5 mon.ceph2@0(electing).elector(187) so far i have { mon.0: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.2: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
2020-08-11 16:11:09.173833 7f611fff5700 5 mon.ceph2@0(electing).elector(187) handle_ack from mon.1
2020-08-11 16:11:09.173848 7f611fff5700 5 mon.ceph2@0(electing).elector(187) so far i have { mon.0: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.1: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.2: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
2020-08-11 16:11:09.187207 7f611fff5700 0 log_channel(cluster) log [INF] : mon.ceph2 is new leader, mons ceph2,ceph1,ceph3 in quorum (ranks 0,1,2)
node-3
2020-08-11 16:11:09.042667 7f710765d700 5 mon.ceph3@2(peon).elector(184) handle_propose from mon.1
2020-08-11 16:11:09.053677 7f710765d700 5 mon.ceph3@2(electing).elector(185) defer to 1
2020-08-11 16:11:09.152215 7f710765d700 5 mon.ceph3@2(electing).elector(185) handle_propose from mon.0
2020-08-11 16:11:09.159910 7f710765d700 5 mon.ceph3@2(electing).elector(187) defer to 0
2020-08-11 16:11:09.189147 7f710765d700 5 mon.ceph3@2(electing).elector(187) handle_victory from mon.0 quorum_features 4611087853746454523 mon_feature_t([kraken,luminous])
较复杂的一个选举过程
这是个生产环境由于node-5节点网络异常导致“election timer expired”,最终node-5节点发起选举的过程。
monmap,其中node-5发起了选举:
node-1,node-2,node-3,node-4,node-5 in quorum (ranks 0,1,2,3,4)
node-1 节点:
840197 2020-07-22 11:54:48.338 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.338387 ffffb674d990 5 mon.node-1@0(leader).elector(36700) handle_propose from mon.4
840198 2020-07-22 11:54:48.339 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.339107 ffffb674d990 2 mon.node-1@0(leader) e1 send_reply 0xaaab167109a0 0xaaab3d230ee0 log(last 4352576) v1
840199 2020-07-22 11:54:48.339 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.339193 ffffb674d990 0 log_channel(cluster) log [INF] : mon.node-1 calling monitor election
840200 2020-07-22 11:54:48.339 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.339215 ffffb674d990 5 mon.node-1@0(electing).elector(36701) start -- can i be leader?
840201 2020-07-22 11:54:48.339 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.339239 ffffb674d990 1 mon.node-1@0(electing).elector(36701) init, last seen epoch 36701, mid-election, bumping
840202 2020-07-22 11:54:48.341 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341004 ffffb674d990 5 mon.node-1@0(electing).elector(36703) handle_propose from mon.1
840203 2020-07-22 11:54:48.341 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341062 ffffb674d990 5 mon.node-1@0(electing).elector(36703) handle_propose from mon.2
840204 2020-07-22 11:54:48.341 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341095 ffffb674d990 5 mon.node-1@0(electing).elector(36703) handle_propose from mon.3
840205 2020-07-22 11:54:48.342 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342444 ffffb674d990 5 mon.node-1@0(electing).elector(36703) handle_ack from mon.1
840206 2020-07-22 11:54:48.342 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342475 ffffb674d990 5 mon.node-1@0(electing).elector(36703) so far i have { mon.0: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.1: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
840207 2020-07-22 11:54:48.342 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342595 ffffb674d990 5 mon.node-1@0(electing).elector(36703) handle_ack from mon.3
840208 2020-07-22 11:54:48.342 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342618 ffffb674d990 5 mon.node-1@0(electing).elector(36703) so far i have { mon.0: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.1: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.3: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
840209 2020-07-22 11:54:48.342 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342697 ffffb674d990 5 mon.node-1@0(electing).elector(36703) handle_ack from mon.2
840210 2020-07-22 11:54:48.342 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342716 ffffb674d990 5 mon.node-1@0(electing).elector(36703) so far i have { mon.0: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.1: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.2: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.3: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
840211 2020-07-22 11:54:48.343 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.343219 ffffb674d990 5 mon.node-1@0(electing).elector(36703) handle_ack from mon.4
840212 2020-07-22 11:54:48.343 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.343248 ffffb674d990 5 mon.node-1@0(electing).elector(36703) so far i have { mon.0: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.1: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.2: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.3: features 4611087853746454523 mon_feature_t([kraken,luminous]), mon.4: features 4611087853746454523 mon_feature_t([kraken,luminous]) }
840213 2020-07-22 11:54:48.344 +0800 ¦ node-1 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.344907 ffffb674d990 0 log_channel(cluster) log [INF] : mon.node-1 is new leader, mons node-1,node-2,node-3,
node-4,node-5 in quorum (ranks 0,1,2,3,4)
node-2节点
651253 2020-07-22 11:54:48.338 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.338391 ffffb53bd990 5 mon.node-2@1(peon).elector(36700) handle_propose from mon.4
651254 2020-07-22 11:54:48.339 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.339071 ffffb53bd990 0 log_channel(cluster) log [INF] : mon.node-2 calling monitor election
651255 2020-07-22 11:54:48.339 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.339103 ffffb53bd990 5 mon.node-2@1(electing).elector(36701) start -- can i be leader?
651256 2020-07-22 11:54:48.339 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.339128 ffffb53bd990 1 mon.node-2@1(electing).elector(36701) init, last seen epoch 36701, mid-election, bumping
651257 2020-07-22 11:54:48.340 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.340796 ffffb53bd990 5 mon.node-2@1(electing).elector(36703) handle_propose from mon.2
651258 2020-07-22 11:54:48.340 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.340850 ffffb53bd990 5 mon.node-2@1(electing).elector(36703) handle_propose from mon.3
651259 2020-07-22 11:54:48.341 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341086 ffffb53bd990 5 mon.node-2@1(electing).elector(36703) handle_propose from mon.0
651260 2020-07-22 11:54:48.341 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341100 ffffb53bd990 5 mon.node-2@1(electing).elector(36703) defer to 0
651261 2020-07-22 11:54:48.342 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342007 ffffb53bd990 5 mon.node-2@1(electing).elector(36703) handle_ack from mon.2
651262 2020-07-22 11:54:48.342 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342181 ffffb53bd990 5 mon.node-2@1(electing).elector(36703) handle_ack from mon.3
651263 2020-07-22 11:54:48.345 +0800 ¦ node-2 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.345388 ffffb53bd990 5 mon.node-2@1(electing).elector(36703) handle_victory from mon.0 quorum_features
node-3节点
671979 2020-07-22 11:54:48.338 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.338274 ffffa9c0d990 5 mon.node-3@2(peon).elector(36700) handle_propose from mon.4
671980 2020-07-22 11:54:48.338 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.338926 ffffa9c0d990 0 log_channel(cluster) log [INF] : mon.node-3 calling monitor election
671981 2020-07-22 11:54:48.338 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.338949 ffffa9c0d990 5 mon.node-3@2(electing).elector(36701) start -- can i be leader?
671982 2020-07-22 11:54:48.338 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.338971 ffffa9c0d990 1 mon.node-3@2(electing).elector(36701) init, last seen epoch 36701, mid-election, bumping
671983 2020-07-22 11:54:48.340 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.340627 ffffa9c0d990 5 mon.node-3@2(electing).elector(36703) handle_propose from mon.3
671984 2020-07-22 11:54:48.340 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.340863 ffffa9c0d990 5 mon.node-3@2(electing).elector(36703) handle_propose from mon.1
671985 2020-07-22 11:54:48.340 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.340872 ffffa9c0d990 5 mon.node-3@2(electing).elector(36703) defer to 1
671986 2020-07-22 11:54:48.341 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341417 ffffa9c0d990 5 mon.node-3@2(electing).elector(36703) handle_propose from mon.0
671987 2020-07-22 11:54:48.341 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341432 ffffa9c0d990 5 mon.node-3@2(electing).elector(36703) defer to 0
671988 2020-07-22 11:54:48.341 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341933 ffffa9c0d990 5 mon.node-3@2(electing).elector(36703) handle_ack from mon.3
671989 2020-07-22 11:54:48.342 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342660 ffffa9c0d990 5 mon.node-3@2(electing).elector(36703) handle_ack from mon.4
671990 2020-07-22 11:54:48.345 +0800 ¦ node-3 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.345443 ffffa9c0d990 5 mon.node-3@2(electing).elector(36703) handle_victory from mon.0 quorum_features
4611087853746454523 mon_feature_t([kraken,luminous])
node-4节点
820192 2020-07-22 11:54:48.338 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.338205 ffffaab3d990 5 mon.node-4@3(peon).elector(36700) handle_propose from mon.4
820193 2020-07-22 11:54:48.338 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.338803 ffffaab3d990 0 log_channel(cluster) log [INF] : mon.node-4 calling monitor election
820194 2020-07-22 11:54:48.338 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.338825 ffffaab3d990 5 mon.node-4@3(electing).elector(36701) start -- can i be leader?
820195 2020-07-22 11:54:48.338 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.338845 ffffaab3d990 1 mon.node-4@3(electing).elector(36701) init, last seen epoch 36701, mid-election, bumping
820196 2020-07-22 11:54:48.340 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.340735 ffffaab3d990 5 mon.node-4@3(electing).elector(36703) handle_propose from mon.2
820197 2020-07-22 11:54:48.340 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.340748 ffffaab3d990 5 mon.node-4@3(electing).elector(36703) defer to 2
820198 2020-07-22 11:54:48.341 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341219 ffffaab3d990 5 mon.node-4@3(electing).elector(36703) handle_propose from mon.1
820199 2020-07-22 11:54:48.341 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341231 ffffaab3d990 5 mon.node-4@3(electing).elector(36703) defer to 1
820200 2020-07-22 11:54:48.341 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341676 ffffaab3d990 5 mon.node-4@3(electing).elector(36703) handle_propose from mon.0
820201 2020-07-22 11:54:48.341 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341687 ffffaab3d990 5 mon.node-4@3(electing).elector(36703) defer to 0
820202 2020-07-22 11:54:48.342 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342134 ffffaab3d990 5 mon.node-4@3(electing).elector(36703) handle_ack from mon.4
820203 2020-07-22 11:54:48.342 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342459 ffffaab3d990 5 mon.node-4@3(electing) e1 waitlisting message auth(proto 0 31 bytes epoch 0) v1
820204 2020-07-22 11:54:48.345 +0800 ¦ node-4 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.345361 ffffaab3d990 5 mon.node-4@3(electing).elector(36703) handle_victory from mon.0 quorum_features
4611087853746454523 mon_feature_t([kraken,luminous])
node-5节点
759421 2020-07-22 11:54:48.336 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.336457 ffff9620d990 5 mon.node-5@4(electing).elector(36699) election timer expired
759422 2020-07-22 11:54:48.336 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.336479 ffff9620d990 5 mon.node-5@4(electing).elector(36699) start -- can i be leader?
759423 2020-07-22 11:54:48.336 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.336513 ffff9620d990 1 mon.node-5@4(electing).elector(36699) init, last seen epoch 36699, mid-election, bumping
759424 2020-07-22 11:54:48.340 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.340446 ffff939bd990 5 mon.node-5@4(electing).elector(36701) handle_propose from mon.3
759425 2020-07-22 11:54:48.341 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341000 ffff939bd990 5 mon.node-5@4(electing).elector(36703) defer to 3
759426 2020-07-22 11:54:48.341 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341555 ffff939bd990 5 mon.node-5@4(electing).elector(36703) handle_propose from mon.2
759427 2020-07-22 11:54:48.341 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.341567 ffff939bd990 5 mon.node-5@4(electing).elector(36703) defer to 2
759428 2020-07-22 11:54:48.342 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342027 ffff939bd990 5 mon.node-5@4(electing).elector(36703) handle_propose from mon.0
759429 2020-07-22 11:54:48.342 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342041 ffff939bd990 5 mon.node-5@4(electing).elector(36703) defer to 0
759430 2020-07-22 11:54:48.342 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342479 ffff939bd990 5 mon.node-5@4(electing).elector(36703) handle_propose from mon.1
759431 2020-07-22 11:54:48.342 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342490 ffff939bd990 5 mon.node-5@4(electing).elector(36703) no, we already acked 0
759432 2020-07-22 11:54:48.342 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.342505 ffff939bd990 5 mon.node-5@4(electing) e1 waitlisting message auth(proto 0 31 bytes epoch 0) v1
759433 2020-07-22 11:54:48.345 +0800 ¦ node-5 ¦ ceph ¦ ¦ 2020-07-22 11:54:48.345545 ffff939bd990 5 mon.node-5@4(electing).elector(36703) handle_victory from mon.0 quorum_features 461108853746454523 mon_feature_t([kraken,luminous])
相关link:https://blog.51cto.com/wendashuai/2517286