monitor选主的下半段就是真正的选主
// Excerpt of the election start path: resets local election state, moves the
// election epoch to an odd value and sends OP_PROPOSE to every other monitor
// in the monmap.
// NOTE(review): the lines after elector.call_election() look like the inlined
// bodies of the Elector helpers (call_election/start/init/bump_epoch) with the
// braces dropped -- confirm against the full source. `t` is not declared in
// this excerpt; presumably it is a store transaction.
void Monitor::start_election()
_reset();
elector.call_election();
start();
// Drop any acks collected before this election.
acked_me.clear();
init();
// Read the persisted election epoch from the monitor store.
epoch = mon->store->get(Monitor::MONITOR_NAME, "election_epoch");
// A zero epoch is replaced by 1.
if (!epoch)
epoch = 1;
// If epoch is odd, an election was in progress; add 1 to step past that phase.
else if (epoch % 2)
++epoch;
// Write the (possibly adjusted) epoch back to the store.
t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
mon->store->apply_transaction(t);
// If epoch is even, add 1 so that it becomes odd, meaning an election is under way.
if (epoch % 2 == 0)
bump_epoch(epoch+1);
// We are proposing ourselves and have not acked any other candidate.
electing_me = true;
leader_acked = -1;
// Send OP_PROPOSE, carrying the current epoch, to every monmap entry except ourselves.
for (unsigned i=0; i<mon->monmap->size(); ++i)
if ((int)i == mon->rank) continue;
MMonElection *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
mon->messenger->send_message(m, mon->monmap->get_inst(i));
reset_timer();
主动启动选举的节点先将选举的epoch加1,置为奇数(表示处于选举阶段),然后向monmap中的其它节点发送OP_PROPOSE请求。
// Handles an OP_PROPOSE message from another monitor. It first reconciles the
// election epochs, then either starts its own election (when this monitor has
// the lower rank) or defers to the sender by replying with OP_ACK.
// NOTE(review): braces are missing in this excerpt and some helper bodies
// appear to be pasted inline, so the exact nesting must be confirmed against
// the full source.
void Elector::handle_propose(MonOpRequestRef op)
MMonElection *m = static_cast<MMonElection*>(op->get_req());
// Numeric id of the sender; it is compared against mon->rank below.
int from = m->get_source().num();
// The peer's election epoch is newer than ours.
if (m->epoch > epoch)
bump_epoch(m->epoch);
// NOTE(review): the lines from `epoch = e;` through `acked_me.clear();` look
// like the inlined body of bump_epoch(e) (`e` is not declared here) -- confirm.
epoch = e;
t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
mon->store->apply_transaction(t);
mon->join_election();
electing_me = false;
acked_me.clear();
// The peer's election epoch is older than ours.
else if (m->epoch < epoch)
// Our epoch is even and the sender is not in our quorum: start an election.
if (epoch % 2 == 0 && mon->quorum.count(from) == 0)
mon->start_election();
else
dout(5) << " ignoring old propose" << dendl;
return;
// Our rank is lower than the sender's, so we would win an election ourselves.
if (mon->rank < from)
// If we already acked someone, that candidate must have a lower rank than the sender.
if (leader_acked >= 0)
assert(leader_acked < from);
else
// We have acked nobody and are not proposing ourselves yet: start our own election.
if (!electing_me)
mon->start_election();
else
// Defer when we have acked nobody, or the sender's rank is lower than or
// equal to the rank we already acked.
if (leader_acked < 0 || leader_acked > from || leader_acked == from)
// The peer will win the election.
defer(from);
// NOTE(review): the lines below, down to reset_timer(1.0), look like the
// inlined body of defer(who) (`who` is not declared here) -- confirm.
if (electing_me)
acked_me.clear();
electing_me = false;
// ack them
leader_acked = who;
MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);
m->mon_features = ceph::features::mon::get_supported();
mon->collect_metadata(&m->metadata);
mon->messenger->send_message(m, mon->monmap->get_inst(who));
// set a timer
reset_timer(1.0);
else
dout(5) << "no, we already acked " << leader_acked << dendl;
其它节点收到OP_PROPOSE请求后,
(1) 如果对方的选举版本大于自己,则将自己的选举版本设置为对方的选举版本。
(2) 如果对方的选举版本小于自己,并且自己不处于选举阶段(epoch为偶数)、对方也不在本节点的quorum缓存里,则说明对方可能是新加入的节点,这种情况下自己主动开启选举,以便让其加入到quorum;否则忽略该请求。
如果没有忽略该请求,继续采取如下行为:
(1) 如果自己的rank小于对方,则一定不会选举对方为主节点;如果这时本节点没有回应过其它节点,并且自己尚未发起选举,则自己会发起选举。
(2) 如果对方的rank小于自己,并且自己尚未回应过任何节点,或者对方的rank小于等于自己已经回应过的节点,则选举对方为主节点(向对方回复OP_ACK)。
// Handles an OP_ACK reply. While this monitor is proposing itself, it records
// the sender's features and metadata in acked_me, and declares victory once
// acked_me has as many entries as the monmap.
// NOTE(review): `m` and `from` are not declared in this excerpt; presumably
// they are the decoded message and the sender's rank -- confirm.
void Elector::handle_ack(MonOpRequestRef op)
// This monitor started the election and asked to be chosen.
if (electing_me)
acked_me[from].cluster_features = m->get_connection()->get_features();
acked_me[from].mon_features = m->mon_features;
acked_me[from].metadata = m->metadata;
// Every monitor in the monmap must agree before we can become leader.
if (acked_me.size() == mon->monmap->size())
victory();
// NOTE(review): with braces stripped it is unclear which branch this assert
// belongs to; presumably it is the not-electing_me branch -- confirm.
assert(leader_acked >= 0);
收到OP_ACK后,将回应的节点插入到acked_me中,如果acked_me的大小和monmap的大小相同,则说明全部节点都同意我作为主节点。
// Excerpt of the winning path: bumps the epoch to end the election, builds the
// quorum from the monitors in acked_me, sends OP_VICTORY to each other quorum
// member and then switches this monitor to leader state.
// NOTE(review): braces are unbalanced in this excerpt (the `{` on the first
// loop is never closed), and the lines after mon->win_election(...) look like
// the inlined body of win_election -- confirm against the full source.
victory()
bump_epoch(epoch+1); // is over! the epoch becomes even, ending the election
// Every monitor that acked us becomes a quorum member.
for (map<int, elector_info_t>::iterator p = acked_me.begin(); p != acked_me.end(); ++p) {
quorum.insert(p->first);
// Announce the result to every quorum member except ourselves.
for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
if (*p == mon->rank) continue;
MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, mon->monmap);
m->quorum = quorum;
mon->messenger->send_message(m, mon->monmap->get_inst(*p));
mon->win_election(epoch, quorum, cluster_features, mon_features, metadata);
// Record leader state: ourselves as leader, plus the quorum with its features and metadata.
state = STATE_LEADER;
leader_since = ceph_clock_now();
leader = rank;
quorum = active;
quorum_con_features = features;
quorum_mon_features = mon_features;
pending_metadata = metadata;
outside_quorum.clear();
// Start Paxos leader initialisation.
paxos->leader_init();
(1) 将选举版本加1,变为偶数。
(2) 将acked_me的节点赋值给quorum。
(3) 对quorum的每个节点,发送OP_VICTORY消息。
(4) 初始化paxos,主要是提交上次没有提交的消息。
// Excerpt of Paxos leader initialisation: enters the recovering state, picks
// up any locally stored uncommitted value, generates a new proposal number and
// sends OP_COLLECT to every other quorum member.
// NOTE(review): the lines after collect(0) look like the inlined body of
// collect(oldpn); `oldpn` is not declared in this excerpt -- confirm.
leader_init
pending_proposal.reset();
state = STATE_RECOVERING;
lease_expire = utime_t();
collect(0); // a function in Paxos
// There is an unacknowledged (uncommitted) value stored at last_committed+1.
if (get_store()->exists(get_name(), last_committed+1))
version_t v = get_store()->get(get_name(), "pending_v");
version_t pn = get_store()->get(get_name(), "pending_pn");
// Use the stored pending_pn only when pending_v matches last_committed+1;
// otherwise fall back to the current accepted_pn.
if (v && pn && v == last_committed + 1)
uncommitted_pn = pn;
else
uncommitted_pn = accepted_pn;
uncommitted_v = last_committed+1;
get_store()->get(get_name(), last_committed+1, uncommitted_value);
// Generate a new accepted_pn; a new one is only generated after each successful election.
accepted_pn = get_new_proposal_number(std::max(accepted_pn, oldpn));
accepted_pn_from = last_committed;
// Send OP_COLLECT with our commit range and the new pn to every quorum member except ourselves.
for (set<int>::const_iterator p = mon->get_quorum().begin(); p != mon->get_quorum().end(); ++p)
if (*p == mon->rank) continue;
MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT, ceph_clock_now());
collect->last_committed = last_committed;
collect->first_committed = first_committed;
collect->pn = accepted_pn;
mon->messenger->send_message(collect, mon->monmap->get_inst(*p));
(1) 如果存在last_committed+1版本的日志,则说明存在没有完成同步的消息,此时获取pending_v(未提交日志的版本号)、pending_pn(未提交日志的accepted_pn)和last_committed+1版本的日志。
(2) 生成新的accepted_pn,该值只在每次选举完成后重新生成。
(3) 向quorum其它节点发送OP_COLLECT消息。
// Handles an OP_COLLECT message: builds an OP_LAST reply carrying our commit
// range and accepted pn, adds any committed values the sender is missing and
// any locally stored uncommitted value, then sends the reply back.
// NOTE(review): `collect` is not declared in this excerpt; presumably it is
// the decoded request message. Braces are stripped, so nesting must be
// confirmed against the full source.
void Paxos::handle_collect(MonOpRequestRef op)
state = STATE_RECOVERING;
MMonPaxos *last = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LAST, ceph_clock_now());
last->last_committed = last_committed;
last->first_committed = first_committed;
// Remember the pn we held before possibly accepting the sender's pn.
version_t previous_pn = accepted_pn;
// Accept the peer's accepted_pn when it is higher than ours, and persist it.
if (collect->pn > accepted_pn)
accepted_pn = collect->pn;
accepted_pn_from = collect->pn_from;
t->put(get_name(), "accepted_pn", accepted_pn);
get_store()->apply_transaction(t);
last->pn = accepted_pn;
last->pn_from = accepted_pn_from;
// The sender is behind us: attach the committed values it is missing.
if (collect->last_committed < last_committed)
share_state(last, collect->first_committed, collect->last_committed);
// NOTE(review): the next four lines look like the inlined body of
// share_state(m, peer_first_committed, peer_last_committed) -- confirm.
version_t v = peer_last_committed + 1;
for ( ; v <= last_committed; v++)
get_store()->get(get_name(), v, m->values[v]);
m->last_committed = last_committed;
// We hold a stored value at last_committed+1: include it together with its pn.
if (collect->last_committed <= last_committed && get_store()->exists(get_name(), last_committed+1))
get_store()->get(get_name(), last_committed+1, bl);
last->values[last_committed+1] = bl;
version_t v = get_store()->get(get_name(), "pending_v");
version_t pn = get_store()->get(get_name(), "pending_pn");
// Use the stored pending_pn only when pending_v matches last_committed+1;
// otherwise report the pn we held before this collect.
if (v && pn && v == last_committed + 1)
last->uncommitted_pn = pn;
else
last->uncommitted_pn = previous_pn;
collect->get_connection()->send_message(last);
副节点收到OP_COLLECT消息后,如果主节点的last_committed小于自己的,说明主节点缺失部分日志,就将缺失的这段日志分享给主节点;如果自己有未确认的日志,则一并发送给对方。
// Handles an OP_LAST reply on the collecting side: records the peer's commit
// range, stores any newer state the peer sent, brings lagging peers up to
// date (or bootstraps when one is too far behind), and tracks the newest
// uncommitted value. Once every quorum member has replied it either
// re-proposes that value or extends the lease.
// NOTE(review): `from` and `last` are not declared in this excerpt; presumably
// they are the sender's rank and the decoded message. Braces are stripped, so
// nesting must be confirmed against the full source.
void Paxos::handle_last(MonOpRequestRef op)
peer_first_committed[from] = last->first_committed;
peer_last_committed[from] = last->last_committed;
// If the peer's log is newer than ours, persist the entries we are missing.
need_refresh = store_state(last);
// peer_last_committed is re-scanned on every LAST message because each new
// LAST message may change our own log, so we must compare again with every
// peer whose LAST was already received and update those that are too old.
for (map<int,version_t>::iterator p = peer_last_committed.begin(); p != peer_last_committed.end(); ++p)
// When the peer receives OP_PROBE and sees its version is too far behind, it will bootstrap.
if (p->second + 1 < first_committed && first_committed > 1)
mon->bootstrap();
return;
// The peer is not too far behind: send it the missing entries directly.
if (p->second < last_committed)
MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, ceph_clock_now());
share_state(commit, peer_first_committed[p->first], p->second);
mon->messenger->send_message(commit, mon->monmap->get_inst(p->first));
// The peer holds a higher pn than ours: collect again using its pn.
if (last->pn > accepted_pn)
collect(last->pn);
// The peer replied with our pn: count the reply.
else if (last->pn == accepted_pn)
num_last++;
// The peer reported an uncommitted value; keep it if its pn and version are
// at least as new as what we already track.
if (last->uncommitted_pn)
if (last->uncommitted_pn >= uncommitted_pn && last->last_committed >= last_committed && last->last_committed + 1 >= uncommitted_v)
uncommitted_v = last->last_committed+1;
uncommitted_pn = last->uncommitted_pn;
uncommitted_value = last->values[uncommitted_v];
// Every quorum member has replied.
if (num_last == mon->get_quorum().size())
// There is a non-empty uncommitted value for last_committed+1: propose it again.
if (uncommitted_v == last_committed+1 && uncommitted_value.length())
state = STATE_UPDATING_PREVIOUS;
begin(uncommitted_value);
else
extend_lease();
主节点收到OP_LAST消息后,如果副节点的日志比自己新,则更新自己的日志。然后遍历peer_last_committed,对比本节点更新后的日志和其它节点的日志:如果其它节点的日志过于落后(对方的last_committed+1小于本节点的first_committed),本节点重新调用bootstrap,bootstrap会发送OP_PROBE消息,对方收到OP_PROBE消息时,检测到自己的日志太过落后,就会主动调用bootstrap;如果peer_last_committed中其它节点的日志只是稍微落后于本节点,就主动将缺失的日志发送给对方,不需要重新调用bootstrap。
如果副节点的accepted_pn大于主节点的accepted_pn,则主节点重新collect,并生成新的accepted_pn。
如果副节点的accepted_pn等于本节点的accepted_pn,则说明副节点接受了本节点的accepted_pn。此时判断副节点是否有未提交、并且版本不低于主节点未提交日志的日志,如果有,则记录下来。待quorum中全部节点都回应之后,如果存在未提交的日志,则将这个未提交的日志广播出去,这是通过正常的paxos过程实现的;如果没有未提交的日志,就调用extend_lease扩展副本的租约。