欢迎关注存储老小伙的博客。
上篇是Phase 1,即leader当选后确定PN的部分。这篇主要是Phase 2,即正常工作过程中的Propose、accept和commit过程。
Ceph的paxos实现,不算很精妙,近期修改也不大活跃。但是对于我们理解paxos协议仍然有帮助。
1. 几个要点说明
1.1 Phase 2的交互过程
| Leader | Peons | 说明 |
|---|---|---|
| begin() => | | Leader给quorum中各个成员发送提议,包含PN、version和内容 |
| | <= handle_begin() | Peon处理提议,有可能会拒绝 |
| handle_accept() | | 只有quorum中所有成员都同意,才算成功 |
| commit_start() | | handle_accept在收到所有ack后调用,用事务写commit记录,并设置回调函数 |
| commit_finish() => | | 上一步的回调函数,在实际事务完成时执行 |
| | handle_commit() | Peon根据leader的commit消息同步状态 |
从begin()到commit_finish()称为一轮,即一次提议的完成过程。
1.2 串行化提议
Ceph的paxos实现,每次只允许一个提议在执行中,即上面提到的一轮完成之后才能执行下一轮。在这个过程中,会有多次写盘操作,因此实际上比较慢。对于ceph自身来说,osd等的状态变更并不频繁,无需极高的paxos性能。但是如果用来做分布式数据库等系统的日志,这种方式则有不足。
2. 代码
2.1 Leader的提议
// Phase 2 starts here: while the leader is operating normally, every proposal
// begins with this function.
// `v` is the encoded payload of the proposal; the Paxos layer does not need to
// understand its semantics.
// leader
void Paxos::begin(bufferlist& v)
{
  assert(mon->is_leader());
  // STATE_UPDATING_PREVIOUS covers the case right after an election in which
  // an uncommitted value for the next version (last_committed+1) was found.
  assert(is_updating() || is_updating_previous());

  // we must already have a majority for this to work.
  assert(mon->get_quorum().size() == 1 ||
         num_last > (unsigned)mon->monmap->size()/2);

  // and no value, yet.
  assert(new_value.length() == 0);

  // reset the set of members that have accepted this proposal ...
  accepted.clear();
  // ... and record that we accept it ourselves
  accepted.insert(mon->rank);

  new_value = v;

  if (last_committed == 0) {
    // Paxos has never committed anything: fold the initialisation of
    // "first_committed" into the same transaction as the proposed value.
    // This branch needs its own transaction object; the one used for the
    // pending keys below is declared later and is not in scope here.
    MonitorDBStore::TransactionRef t(new MonitorDBStore::Transaction);
    // initial base case; set first_committed too
    t->put(get_name(), "first_committed", 1);
    decode_append_transaction(t, new_value);

    bufferlist tx_bl;
    t->encode(tx_bl);

    new_value = tx_bl;
  }

  // store the proposed value in the store. IF it is accepted, we will then
  // have to decode it into a transaction and apply it.
  MonitorDBStore::TransactionRef t(new MonitorDBStore::Transaction);
  // The following key/values are written in one transaction.  Only
  // last_committed+1 is ever proposed, so there is never more than one
  // pending version.
  t->put(get_name(), last_committed+1, new_value);
  // pending_v is the version written above
  t->put(get_name(), "pending_v", last_committed + 1);
  // the matching proposal number; all of these are recorded as pending
  t->put(get_name(), "pending_pn", accepted_pn);

  logger->inc(l_paxos_begin);
  logger->inc(l_paxos_begin_keys, t->get_keys());
  logger->inc(l_paxos_begin_bytes, t->get_bytes());

  // persist the three key/values put above, timing the write
  utime_t start = ceph_clock_now(NULL);
  get_store()->apply_transaction(t);
  utime_t end = ceph_clock_now(NULL);
  logger->tinc(l_paxos_begin_latency, end - start);

  assert(g_conf->paxos_kill_at != 3);

  // quorum of one: per the original author this only happens in an
  // all-in-one (single monitor) setup
  if (mon->get_quorum().size() == 1) {
    // we're alone, take it easy
    commit_start();
    return;
  }

  // send the proposal to every other member of the quorum
  for (set<int>::const_iterator p = mon->get_quorum().begin();
       p != mon->get_quorum().end();
       ++p) {
    if (*p == mon->rank) continue;

    MMonPaxos *begin = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_BEGIN,
                                     ceph_clock_now(g_ceph_context));
    // the message carries the following three values
    begin->values[last_committed+1] = new_value;
    begin->last_committed = last_committed;  // sent as-is
    begin->pn = accepted_pn;

    mon->messenger->send_message(begin, mon->monmap->get_inst(*p));
  }

  // set timeout event
  accept_timeout_event = new C_AcceptTimeout(this);
  mon->timer.add_event_after(g_conf->mon_accept_timeout, accept_timeout_event);
}
2.2 Peon处理提议
// Phase 2: how a peon handles a proposal (OP_BEGIN) received from the leader.
// The proposed value is stored as pending and an OP_ACCEPT is sent back on
// the connection the proposal arrived on; proposals carrying a proposal
// number lower than the one already accepted are ignored.
void Paxos::handle_begin(MonOpRequestRef op)
{
  op->mark_paxos_event("handle_begin");
  MMonPaxos *begin = static_cast<MMonPaxos*>(op->get_req());

  // Compare proposal numbers to decide whether to accept; this is the
  // standard Paxos rule.
  if (begin->pn < accepted_pn) {
    // Someone may have started a new election and the new leader's collect
    // raised the PN -- e.g. the previous leader had a network failure,
    // recovered, and is still running with its stale state.
    dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl;
    op->mark_paxos_event("have higher pn, ignore");
    return;
  }

  // In effect this checks that the leader is still the same leader.
  assert(begin->pn == accepted_pn);
  // The leader has not changed and requests are processed one at a time, so
  // both sides agree on last_committed and therefore on the next version.
  assert(begin->last_committed == last_committed);

  assert(g_conf->paxos_kill_at != 4);

  logger->inc(l_paxos_begin);

  // set state.
  state = STATE_UPDATING;
  lease_expire = utime_t();  // cancel lease

  // yes.
  // NOTE(review): presumably no uncommitted value is outstanding here because
  // recovery has already completed -- confirm.
  version_t v = last_committed+1;
  dout(10) << "accepting value for " << v << " pn " << accepted_pn << dendl;

  // store the accepted value onto our store. We will have to decode it and
  // apply its transaction once we receive permission to commit.
  MonitorDBStore::TransactionRef t(new MonitorDBStore::Transaction);
  // the following key/values are written in a single transaction
  t->put(get_name(), v, begin->values[v]);

  // note which pn this pending value is for.
  // (the transaction is applied below, but these keys only describe a
  // pending, not yet committed, value)
  t->put(get_name(), "pending_v", v);
  t->put(get_name(), "pending_pn", accepted_pn);

  logger->inc(l_paxos_begin_bytes, t->get_bytes());

  utime_t start = ceph_clock_now(NULL);
  get_store()->apply_transaction(t);
  utime_t end = ceph_clock_now(NULL);
  logger->tinc(l_paxos_begin_latency, end - start);

  assert(g_conf->paxos_kill_at != 5);

  // reply
  MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT,
                                    ceph_clock_now(g_ceph_context));
  accept->pn = accepted_pn;
  accept->last_committed = last_committed;
  begin->get_connection()->send_message(accept);
}
2.3 Leader收到ack后处理
// Phase 2: how the leader handles an ack (OP_ACCEPT) from a peon.
// Records the sender in `accepted` and starts the commit once the accepted
// set equals the whole quorum.
// leader
void Paxos::handle_accept(MonOpRequestRef op)
{
  op->mark_paxos_event("handle_accept");
  MMonPaxos *accept = static_cast<MMonPaxos*>(op->get_req());
  dout(10) << "handle_accept " << *accept << dendl;
  int from = accept->get_source().num();

  // PN mismatch: possibly another monitor has been elected leader meanwhile
  if (accept->pn != accepted_pn) {
    // we accepted a higher pn, from some other leader
    dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl;
    op->mark_paxos_event("have higher pn, ignore");
    return;
  }

  // stale response (e.g. delayed by the network): drop it
  if (last_committed > 0 &&
      accept->last_committed < last_committed-1) {
    dout(10) << " this is from an old round, ignoring" << dendl;
    op->mark_paxos_event("old round, ignore");
    return;
  }

  // the peon may lag behind us by at most one version
  assert(accept->last_committed == last_committed ||   // not committed
         accept->last_committed == last_committed-1);  // committed

  assert(is_updating() || is_updating_previous());
  // the same peon must not ack twice
  assert(accepted.count(from) == 0);
  accepted.insert(from);
  dout(10) << " now " << accepted << " have accepted" << dendl;

  assert(g_conf->paxos_kill_at != 6);

  // only commit (and expose committed state) when we get *all* quorum
  // members to accept.  otherwise, they may still be sharing the now
  // stale state.
  // FIXME: we can improve this with an additional lease revocation message
  // that doesn't block for the persist.
  if (accepted == mon->get_quorum()) {
    // Every quorum member has answered; if some never do, the accept timeout
    // armed in begin() is not cancelled.
    // yay, commit!
    op->mark_paxos_event("commit_start");
    // queues the commit transaction; its completion callback runs
    // commit_finish(), which advances last_committed
    commit_start();
  }
}
// Handles expiry of the leader's accept timeout (presumably invoked through
// the C_AcceptTimeout event armed in begin() -- confirm).  Logs the timeout,
// clears the stored timer handle, bumps the l_paxos_accept_timeout counter
// and calls mon->bootstrap().
void Paxos::accept_timeout()
{
dout(1) << "accept timeout, calling fresh election" << dendl;
// the event is no longer pending, so forget the handle
accept_timeout_event = 0;
assert(mon->is_leader());
// valid only while a proposal is in flight (updating or writing)
assert(is_updating() || is_updating_previous() || is_writing() ||
is_writing_previous());
logger->inc(l_paxos_accept_timeout);
// Note: this bootstraps the monitor directly, i.e. the quorum is re-checked
// and a leader is elected again; the original author points this out as a
// peculiarity of the Ceph implementation.
mon->bootstrap();
}
// Completion context passed to the store together with the commit
// transaction (see commit_start()).  When it fires it re-enters Paxos via
// commit_finish() while holding the monitor lock.
struct C_Committed : public Context {
  Paxos *paxos;

  C_Committed(Paxos *owner) : paxos(owner) {}

  void finish(int r) {
    assert(r >= 0);
    // Per the original author, Mutex::Locker takes the lock in its
    // constructor and releases it in its destructor, so the call below runs
    // under mon->lock.
    Mutex::Locker guard(paxos->mon->lock);
    paxos->commit_finish();
  }
};
// Starts committing the accepted value: builds a transaction that bumps
// "last_committed" and contains the decoded proposal, queues it on the store
// with a C_Committed completion, moves the state to WRITING(_PREVIOUS) and,
// when the quorum has more than one member, cancels the accept timeout.
// Per the original author only the leader calls commit_start(), and hence
// commit_finish() is leader-only as well.
void Paxos::commit_start()
{
dout(10) << __func__ << " " << (last_committed+1) << dendl;
assert(g_conf->paxos_kill_at != 7);
MonitorDBStore::TransactionRef t(new MonitorDBStore::Transaction);
// commit locally
// (this put is only queued below with queue_transaction, not applied
// synchronously)
t->put(get_name(), "last_committed", last_committed + 1);
// decode the value and apply its transaction to the store.
// this value can now be read from last_committed.
decode_append_transaction(t, new_value);
// the formatter lines below sit between dout(30) and dendl and write to
// *_dout, so they belong to that log statement
dout(30) << __func__ << " transaction dump:\n";
JSONFormatter f(true);
t->dump(&f);
f.flush(*_dout);
*_dout << dendl;
logger->inc(l_paxos_commit);
logger->inc(l_paxos_commit_keys, t->get_keys());
logger->inc(l_paxos_commit_bytes, t->get_bytes());
// start of the commit latency measurement; read back in commit_finish()
commit_start_stamp = ceph_clock_now(NULL);
// NOTE(review): C_Committed::finish() acquires mon->lock.  The current
// context presumably holds that lock, so the callback can only make
// progress once it has been released -- confirm where that happens.
get_store()->queue_transaction(t, new C_Committed(this));
// Per the original author's reading of MonitorDBStore.h, the completion is
// invoked after the transaction has been applied.
// Once one of the two WRITING states is set, it is left to the asynchronous
// commit completion to move on; this context returns quickly and does not
// touch the state again.  See Paxos::commit_finish().
if (is_updating_previous())
state = STATE_WRITING_PREVIOUS;
else if (is_updating())
state = STATE_WRITING;
else
assert(0);
if (mon->get_quorum().size() > 1) {
// cancel timeout event
mon->timer.cancel_event(accept_timeout_event);
accept_timeout_event = 0;
}
}
//Leader的函数
void Paxos::commit_finish()
{
dout(20) << __func__ << " " << (last_committed+1) << dendl;
utime_t end = ceph_clock_now(NULL);
logger->tinc(l_paxos_commit_latency, end - commit_start_stamp);
assert(g_conf->paxos_kill_at != 8);
// cancel lease - it was for the old value.
// (this would only happen if message layer lost the 'begin', but
// leader still got a majority and committed with out us.)
lease_expire = utime_t(); // cancel lease
//这里才修改last_committed
last_committed++;//这里才修改last_committed
last_commit_time = ceph_clock_now(NULL);
// refresh first_committed; this txn may have trimmed. //说了可能trim log。流程还么仔细看
first_committed = get_store()->get(get_name(), "first_committed");
_sanity_check_store();
// tell everyone
for (set::const_iterator p = mon->get_quorum().begin();
p != mon->get_quorum().end();
++p) {
if (*p == mon->rank) continue;
dout(10) << " sending commit to mon." << *p << dendl;
MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT,
ceph_clock_now(g_ceph_context));
//leader 在commit完成后,通知peon
commit->values[last_committed] = new_value;
commit->pn = accepted_pn;
commit->last_committed = last_committed;
mon->messenger->send_message(commit, mon->monmap->get_inst(*p));
//本地的last_committed等信息已经修改了,通知其他的
}
assert(g_conf->paxos_kill_at != 9);
// get ready for a new round.
new_value.clear();
remove_legacy_versions();
// WRITING -> REFRESH
// among other things, this lets do_refresh() -> mon->bootstrap() know
// it doesn't need to flush the store queue
assert(is_writing() || is_writing_previous());
state = STATE_REFRESH;
//do_refresh(),是让服务层刷新知道该propose了。
if (do_refresh()) {//do_refresh的注释中(.h),在它返回false时,abort,应该是异常情况吧。
commit_proposal();//这个应该是调用上次注册的complete函数
if (mon->get_quorum().size() > 1) {//如果只有我自己,单个monitor,那就无需lease了。
extend_lease();
}
//唤醒等待者
finish_contexts(g_ceph_context, waiting_for_commit);
assert(g_conf->paxos_kill_at != 10);
finish_round();//修改状态,本轮结束。让pending的proposal可以继续执行。
//修改了last_committed和status。但是这个同步写完再做下一个的方式,比较慢。
}
}
2.4 Peon处理commit消息
// Peon side: the leader tells us which values have been committed; per the
// original author these can be trusted.  Stores the committed state, then
// refreshes the services and wakes the waiting_for_commit waiters unless the
// refresh triggered a bootstrap.
void Paxos::handle_commit(MonOpRequestRef op)
{
  op->mark_paxos_event("handle_commit");
  MMonPaxos *commit = static_cast<MMonPaxos*>(op->get_req());
  dout(10) << "handle_commit on " << commit->last_committed << dendl;

  logger->inc(l_paxos_commit);

  if (!mon->is_peon()) {
    dout(10) << "not a peon, dropping" << dendl;
    assert(0);
    return;
  }

  op->mark_paxos_event("store_state");
  store_state(commit);

  // let the service layer refresh its state
  if (do_refresh()) {
    // Only the waiting_for_commit waiters are woken here; per the original
    // author a peon has no proposals waiting to be proposed.
    finish_contexts(g_ceph_context, waiting_for_commit);
  }
}
2.5 让上层服务刷新状态的工具函数
/*一个paxos过程结束后,需要让上层的各个service(monitor)刷新状态。
因为paxos这层本身不知道语义,只是确定执行顺序而已。一个paxos决议可能
包含了几个上层service的内容。
*/
bool Paxos::do_refresh()
{
bool need_bootstrap = false;
utime_t start = ceph_clock_now(NULL);
// make sure we have the latest state loaded up
mon->refresh_from_paxos(&need_bootstrap);
utime_t end = ceph_clock_now(NULL);
logger->inc(l_paxos_refresh);
logger->tinc(l_paxos_refresh_latency, end - start);
if (need_bootstrap) {//需要bootstrap才返回false,正常都是成功
dout(10) << " doing requested bootstrap" << dendl;
mon->bootstrap();
return false;
}
return true;
}
//唤醒上层等待当前提议完成的上下文
void Paxos::commit_proposal()
{
dout(10) << __func__ << dendl;
assert(mon->is_leader());
assert(is_refresh());
list ls;
ls.swap(committing_finishers);
//从pending_finishers ==swap==> committing_finishers ==swap==> ls
finish_contexts(g_ceph_context, ls);
//做callback,paxosservice调用 queue_pending_finisher()注册的钩子
}
2.6 完成本轮,开始下一轮
// The previous proposal round is complete, so the next one may start.
// Switches to STATE_ACTIVE, wakes the waiting_for_active /
// waiting_for_readable / waiting_for_writeable waiters in that order, trims
// if should_trim() says so, and proposes a pending proposal if we are still
// active afterwards.  Leader only.
void Paxos::finish_round()
{
dout(10) << __func__ << dendl;
assert(mon->is_leader());
// ok, now go active!
// (propose_pending() asserts is_active(), so nothing is proposed before this)
state = STATE_ACTIVE;
// note: the log text says "acting"; the list drained is waiting_for_active
dout(20) << __func__ << " waiting_for_acting" << dendl;
finish_contexts(g_ceph_context, waiting_for_active);
dout(20) << __func__ << " waiting_for_readable" << dendl;
finish_contexts(g_ceph_context, waiting_for_readable);
dout(20) << __func__ << " waiting_for_writeable" << dendl;
finish_contexts(g_ceph_context, waiting_for_writeable);
dout(10) << __func__ << " done w/ waiters, state " << state << dendl;
if (should_trim()) {
trim();
}
// is_active() is re-checked rather than assumed (the state is logged again
// after the waiters above have run)
if (is_active() && pending_proposal) {
propose_pending();
}
}
2.7 其他
/*
 * return a globally unique, monotonically increasing proposal number
 *
 * `gt` is a floor: the stored last_pn is first raised to `gt` if it is
 * smaller.  The new number is persisted under the "last_pn" key before it is
 * returned.
 */
version_t Paxos::get_new_proposal_number(version_t gt)
{
  if (gt > last_pn)
    last_pn = gt;

  // update.  make it unique among all monitors.
  // Every monitor uses its own rank as the low two decimal digits of the
  // numbers it generates, e.g. rank 5 only ever produces 105, 205, 305, ...
  // (n*100 + 5).  Because `gt` may come from another monitor and therefore
  // carry that monitor's rank, the low digits are dropped before moving on to
  // the next hundred and adding our own rank: with last_pn == 306 and rank 5
  // the result is 405.
  last_pn = (last_pn / 100 + 1) * 100 + (version_t)mon->rank;

  // write
  MonitorDBStore::TransactionRef txn(new MonitorDBStore::Transaction);
  txn->put(get_name(), "last_pn", last_pn);  // persisted in the key/value store

  dout(30) << __func__ << " transaction dump:\n";
  JSONFormatter formatter(true);
  txn->dump(&formatter);
  formatter.flush(*_dout);
  *_dout << dendl;

  logger->inc(l_paxos_new_pn);

  utime_t began = ceph_clock_now(NULL);
  get_store()->apply_transaction(txn);
  utime_t finished = ceph_clock_now(NULL);
  logger->tinc(l_paxos_new_pn_latency, finished - began);

  dout(10) << "get_new_proposal_number = " << last_pn << dendl;
  return last_pn;
}
// Cancels every Paxos timer event that is currently set (collect timeout,
// accept timeout, lease renew, lease ack timeout, lease timeout) and clears
// the corresponding handle.  Handles that are already 0 are left alone.
void Paxos::cancel_events()
{
if (collect_timeout_event) {
mon->timer.cancel_event(collect_timeout_event);
collect_timeout_event = 0;
}
if (accept_timeout_event) {
mon->timer.cancel_event(accept_timeout_event);
accept_timeout_event = 0;
}
if (lease_renew_event) {
mon->timer.cancel_event(lease_renew_event);
lease_renew_event = 0;
}
if (lease_ack_timeout_event) {
mon->timer.cancel_event(lease_ack_timeout_event);
lease_ack_timeout_event = 0;
}
if (lease_timeout_event) {
mon->timer.cancel_event(lease_timeout_event);
lease_timeout_event = 0;
}
}
// Shuts Paxos down: drops the pending proposal, completes every waiter and
// finisher list with -ECANCELED, and removes and deletes the perf-counter
// logger.
void Paxos::shutdown()
{
dout(10) << __func__ << " cancel all contexts" << dendl;
// discard pending transaction
pending_proposal.reset();
finish_contexts(g_ceph_context, waiting_for_writeable, -ECANCELED);
finish_contexts(g_ceph_context, waiting_for_commit, -ECANCELED);
finish_contexts(g_ceph_context, waiting_for_readable, -ECANCELED);
finish_contexts(g_ceph_context, waiting_for_active, -ECANCELED);
finish_contexts(g_ceph_context, pending_finishers, -ECANCELED);
finish_contexts(g_ceph_context, committing_finishers, -ECANCELED);
// unregister the logger only if there is one; the delete below is
// unconditional
if (logger)
g_ceph_context->get_perfcounters_collection()->remove(logger);
delete logger;
}
// Resets Paxos for the leader role (presumably called when this monitor has
// just become leader -- confirm against the caller).  Cancels timers, drops
// the proposed value and the pending proposal, fails the finisher lists with
// -EAGAIN, then either goes straight to ACTIVE (quorum of one) or enters
// RECOVERING and starts a collect.
void Paxos::leader_init()
{
cancel_events();
new_value.clear();
// discard pending transaction
// (whatever was queued before we were elected leader is abandoned)
pending_proposal.reset();
finish_contexts(g_ceph_context, pending_finishers, -EAGAIN);
finish_contexts(g_ceph_context, committing_finishers, -EAGAIN);
logger->inc(l_paxos_start_leader);
// a quorum of one needs no recovery round
if (mon->get_quorum().size() == 1) {
state = STATE_ACTIVE;
return;
}
state = STATE_RECOVERING;
// an empty utime_t is assigned to cancel the lease (the same idiom is
// commented "cancel lease" in handle_begin())
lease_expire = utime_t();
dout(10) << "leader_init -- starting paxos recovery" << dendl;
collect(0);
}
// Resets Paxos for the peon role: cancels timers, drops the proposed value,
// enters RECOVERING with the lease cleared, arms the lease timeout, drops the
// pending proposal and fails the write-related waiters and finishers with
// -EAGAIN.
void Paxos::peon_init()
{
cancel_events();
new_value.clear();
state = STATE_RECOVERING;
lease_expire = utime_t();
dout(10) << "peon_init -- i am a peon" << dendl;
// start a timer, in case the leader never manages to issue a lease
reset_lease_timeout();
// discard pending transaction
pending_proposal.reset();
// no chance to write now!
finish_contexts(g_ceph_context, waiting_for_writeable, -EAGAIN);
finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN);
finish_contexts(g_ceph_context, pending_finishers, -EAGAIN);
finish_contexts(g_ceph_context, committing_finishers, -EAGAIN);
logger->inc(l_paxos_start_peon);
}
// Restarts Paxos: cancels timers, drops the proposed value, flushes the
// store if a commit write is in flight, enters RECOVERING, drops the pending
// proposal and fails the finisher lists plus the waiting_for_commit and
// waiting_for_active waiters with -EAGAIN.
void Paxos::restart()
{
dout(10) << "restart -- canceling timeouts" << dendl;
cancel_events();
new_value.clear();
if (is_writing() || is_writing_previous()) {
dout(10) << __func__ << " flushing" << dendl;
// mon->lock is released around the flush and re-taken afterwards.
// NOTE(review): presumably so that a queued C_Committed completion, which
// itself takes mon->lock, can run during the flush -- confirm.
mon->lock.Unlock();
mon->store->flush();
mon->lock.Lock();
dout(10) << __func__ << " flushed" << dendl;
}
state = STATE_RECOVERING;
// discard pending transaction
pending_proposal.reset();
finish_contexts(g_ceph_context, committing_finishers, -EAGAIN);
finish_contexts(g_ceph_context, pending_finishers, -EAGAIN);
finish_contexts(g_ceph_context, waiting_for_commit, -EAGAIN);
finish_contexts(g_ceph_context, waiting_for_active, -EAGAIN);
logger->inc(l_paxos_restart);
}
// Entry point for incoming Paxos messages: drops them while an election is
// in progress, checks that the sender is acceptable, and routes MSG_MON_PAXOS
// messages to the handler matching their op code.  Any other message type or
// op code is a fatal assertion failure.
void Paxos::dispatch(MonOpRequestRef op)
{
  assert(op->is_type_paxos());
  op->mark_paxos_event("dispatch");
  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());

  // election in progress?
  if (!mon->is_leader() && !mon->is_peon()) {
    dout(5) << "election in progress, dropping " << *m << dendl;
    return;
  }

  // check sanity
  // i.e. a peon only accepts Paxos messages that come from the leader, never
  // from other peons
  assert(mon->is_leader() ||
         (mon->is_peon() && m->get_source().num() == mon->get_leader()));

  switch (m->get_type()) {

  case MSG_MON_PAXOS:
    {
      // NOTE(review): reinterpret_cast kept as in the upstream code; the
      // relationship between PaxosServiceMessage and MMonPaxos is not visible
      // here.
      MMonPaxos *pm = reinterpret_cast<MMonPaxos*>(m);

      // NOTE: these ops are defined in messages/MMonPaxos.h
      switch (pm->op) {
        // learner
      case MMonPaxos::OP_COLLECT:
        handle_collect(op);
        break;
      case MMonPaxos::OP_LAST:
        handle_last(op);
        break;
      case MMonPaxos::OP_BEGIN:
        handle_begin(op);
        break;
      case MMonPaxos::OP_ACCEPT:
        handle_accept(op);
        break;
      case MMonPaxos::OP_COMMIT:
        handle_commit(op);
        break;
      case MMonPaxos::OP_LEASE:
        handle_lease(op);
        break;
      case MMonPaxos::OP_LEASE_ACK:
        handle_lease_ack(op);
        break;
      default:
        assert(0);
      }
    }
    break;

  default:
    assert(0);
  }
}
// -- WRITE --

// Writes are possible only when this monitor is the leader, Paxos is active
// and the lease is valid.  The checks run in that order and stop at the first
// one that fails.
bool Paxos::is_writeable()
{
  if (!mon->is_leader())
    return false;
  if (!is_active())
    return false;
  return is_lease_valid();
}
void Paxos::propose_pending()
{
assert(is_active());
assert(pending_proposal);
cancel_events();
bufferlist bl;
pending_proposal->encode(bl);
dout(10) << __func__ << " " << (last_committed + 1)
<< " " << bl.length() << " bytes" << dendl;
dout(30) << __func__ << " transaction dump:\n";
JSONFormatter f(true);
pending_proposal->dump(&f);
f.flush(*_dout);
*_dout << dendl;
pending_proposal.reset();//让pending_proposal不再ref里面的Transaction类型对象,
//参见http://en.cppreference.com/w/cpp/memory/shared_ptr/reset
committing_finishers.swap(pending_finishers);
//list::swap()的含义: Exchanges the contents of two lists,
// http://www.cplusplus.com/reference/list/list/swap-free/
state = STATE_UPDATING;
begin(bl);
}
// Appends a completion context to pending_finishers.  propose_pending() later
// swaps that list into committing_finishers, and commit_proposal() runs it.
// `onfinished` must not be null.
void Paxos::queue_pending_finisher(Context *onfinished)
{
dout(5) << __func__ << " " << onfinished << dendl;
assert(onfinished);
pending_finishers.push_back(onfinished);
}
// Returns the pending transaction (a transaction, not yet a proposal),
// creating an empty one on first use.  Per the original author the upper
// layer fetches the transaction here and then adds its own operations, so a
// single proposal may bundle several operations.  Leader only.
MonitorDBStore::TransactionRef Paxos::get_pending_transaction()
{
  assert(mon->is_leader());

  // reuse the transaction that is already being filled
  if (pending_proposal)
    return pending_proposal;

  pending_proposal.reset(new MonitorDBStore::Transaction);
  // a fresh transaction must not have finishers queued for it already
  assert(pending_finishers.empty());
  return pending_proposal;
}
// Proposes the pending transaction right away when Paxos is active.
// Returns true if the proposal was started, false if it has to wait.
bool Paxos::trigger_propose()
{
  if (!is_active()) {
    dout(10) << __func__ << " not active, will propose later" << dendl;
    return false;
  }

  dout(10) << __func__ << " active, proposing now" << dendl;
  propose_pending();
  return true;
}
// The committed range is consistent when its upper bound is not below its
// lower bound.
bool Paxos::is_consistent()
{
  const bool range_ordered = last_committed >= first_committed;
  return range_ordered;
}