ceph源码分析--monitor的lease机制

概述

在monitor节点中,存在着Leader和Peon两种角色。monitor采用了一种lease机制,保证了副本在一定时间内可读写。同时lease机制也保证了整个集群中的monitor当前都是可用状态。

Leader节点会定时向所有的Peon节点发送lease消息,延长各个节点的lease时间,同时收集所有节点的ack消息。只要有一个节点没有在超时时间内回复ack消息,就会重新发起选举。
同理,Peon节点一直在等待Leader向自己发送lease消息。超时之后,也会重新发起选举。
这样就保证了整个monitor集群的可用性。

源码分析

从lease的发起者extend_lease()开始讲

// Leader side: start a new lease round.
//
// Pushes lease_expire forward to now + mon_lease, resets the set of monitors
// that have acknowledged the lease (seeding it with the leader's own rank),
// sends an OP_LEASE message to every other member of the quorum, and arms two
// timer events:
//   - lease_ack_timeout_event: calls lease_ack_timeout() unless it is
//     cancelled first (only armed if no such event is already pending);
//   - lease_renew_event: calls lease_renew_timeout(), which starts the next
//     round.
void Paxos::extend_lease()
{
  // Only the leader may hand out leases.
  assert(mon->is_leader());
  //assert(is_active());
  // The lease runs for mon_lease seconds from now (the article assumes the
  // value is 5 s; the configured default is not visible in this excerpt).
  lease_expire = ceph_clock_now();
  lease_expire += g_conf->mon_lease;
  // Forget the acks from the previous round; the leader counts as having
  // acked its own lease.
  acked_lease.clear();
  acked_lease.insert(mon->rank);

  dout(7) << "extend_lease now+" << g_conf->mon_lease 
      << " (" << lease_expire << ")" << dendl;

  // bcast
  for (set<int>::const_iterator p = mon->get_quorum().begin();
      p != mon->get_quorum().end(); ++p) {
    // Send the lease to every quorum member except ourselves.
    if (*p == mon->rank) continue;
    MMonPaxos *lease = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE,
                     ceph_clock_now());
    lease->last_committed = last_committed;
    lease->lease_timestamp = lease_expire;
    lease->first_committed = first_committed;
    mon->messenger->send_message(lease, mon->monmap->get_inst(*p));
  }

  // set timeout event.
  //  if old timeout is still in place, leave it.
  if (!lease_ack_timeout_event) {
    lease_ack_timeout_event = mon->timer.add_event_after(
    // Delay = mon_lease_ack_timeout_factor * mon_lease
    // (the article's example: 2 * 5 = 10 s).
      g_conf->mon_lease_ack_timeout_factor * g_conf->mon_lease,
      new C_MonContext(mon, [this](int r) {
      if (r == -ECANCELED)
        return;
        // Not cancelled in time: lease_ack_timeout() calls a new election.
      lease_ack_timeout();
    }));
  }

  // set renew event
  // Schedule the next extend_lease() round at
  // (lease_expire - mon_lease) + mon_lease_renew_interval_factor * mon_lease.
  utime_t at = lease_expire;
  at -= g_conf->mon_lease;
  at += g_conf->mon_lease_renew_interval_factor * g_conf->mon_lease;
  lease_renew_event = mon->timer.add_event_at(
    at, new C_MonContext(mon, [this](int r) {
    if (r == -ECANCELED)
      return;
    lease_renew_timeout();
    }));
}

注册一个dispatch来接收ack消息

// Entry point for Paxos messages: drops the message while an election is in
// progress, otherwise routes it to the handler matching its MMonPaxos op.
// (Excerpt: the article quotes the function only up to the OP_LEASE_ACK case.)
void Paxos::dispatch(MonOpRequestRef op)
{
  assert(op->is_type_paxos());
  op->mark_paxos_event("dispatch");
  // The cast target was lost in the article's extraction; it is restored here
  // from the declared type of `m`.
  PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
  // election in progress?
  if (!mon->is_leader() && !mon->is_peon()) {
    dout(5) << "election in progress, dropping " << *m << dendl;
    return;
  }

  // check sanity: a peon only accepts paxos messages coming from the leader
  assert(mon->is_leader() ||
     (mon->is_peon() && m->get_source().num() == mon->get_leader()));

  switch (m->get_type()) {

  case MSG_MON_PAXOS:
    {
      // Cast target restored from the declared type of `pm`.
      MMonPaxos *pm = reinterpret_cast<MMonPaxos*>(m);

      // NOTE: these ops are defined in messages/MMonPaxos.h
      switch (pm->op) {
    // learner
      case MMonPaxos::OP_COLLECT:
    handle_collect(op);
    break;
      case MMonPaxos::OP_LAST:
    handle_last(op);
    break;
      case MMonPaxos::OP_BEGIN:
    handle_begin(op);
    break;
      case MMonPaxos::OP_ACCEPT:
    handle_accept(op);
    break;
      case MMonPaxos::OP_COMMIT:
    handle_commit(op);
    break;
      case MMonPaxos::OP_LEASE:
    handle_lease(op);
    break;
    // lease acknowledgements are handled here
      case MMonPaxos::OP_LEASE_ACK:
    handle_lease_ack(op);
    break;
    // (excerpt ends here; the remaining cases and closing braces are not shown)

handle_lease_ack

// Records a lease acknowledgement.
//
// - If no ack-timeout event is pending, the ack is logged as stray and
//   otherwise ignored.
// - A first ack from a given rank is added to acked_lease and, when the
//   message carries one, its feature map is decoded into
//   mon->quorum_feature_map[from]. Once acked_lease equals the quorum, the
//   pending ack-timeout event is cancelled.
// - A repeated ack from the same rank is logged and ignored.
//
// In every case the sender's timestamp is passed to warn_on_future_time().
void Paxos::handle_lease_ack(MonOpRequestRef op)
{
  op->mark_paxos_event("handle_lease_ack");
  // The cast target was lost in the article's extraction; it is restored here
  // to match the declared type of `ack`.
  MMonPaxos *ack = static_cast<MMonPaxos*>(op->get_req());
  int from = ack->get_source().num();

  if (!lease_ack_timeout_event) {
    dout(10) << "handle_lease_ack from " << ack->get_source()
         << " -- stray (probably since revoked)" << dendl;

  } else if (acked_lease.count(from) == 0) {
    acked_lease.insert(from);
    if (ack->feature_map.length()) {
      auto p = ack->feature_map.begin();
      FeatureMap& t = mon->quorum_feature_map[from];
      ::decode(t, p);
    }
    if (acked_lease == mon->get_quorum()) {
      // yay!
      // Every quorum member has acked: cancel the ack-timeout event.
      dout(10) << "handle_lease_ack from " << ack->get_source()
           << " -- got everyone" << dendl;
      mon->timer.cancel_event(lease_ack_timeout_event);
      lease_ack_timeout_event = 0;
    } else {
      dout(10) << "handle_lease_ack from " << ack->get_source()
           << " -- still need "
           << mon->get_quorum().size() - acked_lease.size()
           << " more" << dendl;
    }
  } else {
    // Already present in acked_lease: duplicate ack, ignore it.
    dout(10) << "handle_lease_ack from " << ack->get_source()
         << " dup (lagging!), ignoring" << dendl;
  }

  warn_on_future_time(ack->sent_timestamp, ack->get_source());
}

如果超时时间内没有收集到所有的ack消息,
那么extend_lease()中注册的超时事件就会触发,lease_ack_timeout()会被调用

 lease_renew_event = mon->timer.add_event_at(
    at, new C_MonContext(mon, [this](int r) {
    if (r == -ECANCELED)
      return;
    lease_renew_timeout();
    }));

// Context used for the timer callbacks in this article: it stores a callback
// (forwarded to the FunctionContext base) together with the Monitor it
// belongs to. finish() is only declared here; its definition is not part of
// this excerpt.
class C_MonContext final : public FunctionContext {
  // Monitor this context is associated with (held as pointer-to-const).
  const Monitor *mon;
public:
  // Takes the callback by rvalue reference and moves it into the base class.
  explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback)
    : FunctionContext(std::move(callback)), mon(m) {}
  void finish(int r) override;
};


void Paxos::lease_ack_timeout(),发起选举

// Leader side: runs when the lease-ack timeout event fires (see
// extend_lease()). Bumps the l_paxos_lease_ack_timeout counter, clears the
// event handle and calls mon->bootstrap(); the log line describes this as
// calling a new election.
void Paxos::lease_ack_timeout()
{
  dout(1) << "lease_ack_timeout -- calling new election" << dendl;
  assert(mon->is_leader());
  assert(is_active());
  logger->inc(l_paxos_lease_ack_timeout);
  // The event has fired, so the stored handle is no longer pending.
  lease_ack_timeout_event = 0;
  mon->bootstrap();
}

关于Leader节点差不多到此就结束了

下面是peon节点的lease处理
那么peon节点在收到leader节点发出的lease消息做了哪些处理呢?

// Peon side: processes an OP_LEASE message.
//
// The message is dropped unless this monitor is a peon and the message's
// last_committed equals ours. Otherwise lease_expire is moved forward to the
// timestamp carried by the message (never backwards), the state becomes
// STATE_ACTIVE, an OP_LEASE_ACK carrying our encoded feature map is sent back
// on the connection the lease arrived on, the lease timeout is re-armed, and
// contexts waiting for active/readable are completed.
void Paxos::handle_lease(MonOpRequestRef op)
{
  op->mark_paxos_event("handle_lease");
  MMonPaxos *lease = static_cast<MMonPaxos*>(op->get_req());
  // sanity
  // Only two conditions are actually tested here: our role and the
  // last_committed version.
  if (!mon->is_peon() ||
      last_committed != lease->last_committed) {
    dout(10) << "handle_lease i'm not a peon, or they're not the leader,"
         << " or the last_committed doesn't match, dropping" << dendl;
    op->mark_paxos_event("invalid lease, ignore");
    return;
  }

  warn_on_future_time(lease->sent_timestamp, lease->get_source());

  // extend lease
  // Adopt the expiry sent by the leader, but only if it is later than ours.
  if (lease_expire < lease->lease_timestamp) {
    lease_expire = lease->lease_timestamp;

    utime_t now = ceph_clock_now();
    // The freshly extended lease is already in the past: report it. Per the
    // message below, the monitors are probably laggy or the clocks skewed.
    if (lease_expire < now) {
      utime_t diff = now - lease_expire;
      derr << "lease_expire from " << lease->get_source_inst() << " is " << diff << " seconds in the past; mons are probably laggy (or possibly clocks are too skewed)" << dendl;
    }
  }

  state = STATE_ACTIVE;

  dout(10) << "handle_lease on " << lease->last_committed
       << " now " << lease_expire << dendl;

  // ack
  // Reply to the leader over the connection the lease came in on.
  MMonPaxos *ack = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE_ACK,
                 ceph_clock_now());
  ack->last_committed = last_committed;
  ack->first_committed = first_committed;
  ack->lease_timestamp = ceph_clock_now();
  ::encode(mon->session_map.feature_map, ack->feature_map);
  lease->get_connection()->send_message(ack);

  // (re)set timeout event.
  // Restart the countdown that triggers lease_timeout() if no further lease
  // arrives.
  reset_lease_timeout();

  // kick waiters
  finish_contexts(g_ceph_context, waiting_for_active);
  if (is_readable())
    finish_contexts(g_ceph_context, waiting_for_readable);
}

那么当leader节点不给peon节点lease消息的时候,peon是如何触发选举的呢
关键就是void Paxos::reset_lease_timeout(),当超时时间内一直未收到leader的消息,则进行重新选举

// Peon side: (re)arms the lease timeout. Any pending lease_timeout_event is
// cancelled and a new one is scheduled
// mon_lease_ack_timeout_factor * mon_lease from now; unless it is cancelled,
// it calls lease_timeout().
void Paxos::reset_lease_timeout()
{
  dout(20) << "reset_lease_timeout - setting timeout event" << dendl;
  // Drop the previous countdown before starting a new one.
  if (lease_timeout_event)
    mon->timer.cancel_event(lease_timeout_event);
  lease_timeout_event = mon->timer.add_event_after(
    g_conf->mon_lease_ack_timeout_factor * g_conf->mon_lease,
    new C_MonContext(mon, [this](int r) {
    // A cancelled event must not trigger the timeout handling.
    if (r == -ECANCELED)
      return;
    lease_timeout();
      }));
}

// Peon side: runs when the lease timeout event fires (see
// reset_lease_timeout()). Bumps the l_paxos_lease_timeout counter, clears the
// event handle and calls mon->bootstrap(); the log line describes this as
// calling a new election.
void Paxos::lease_timeout()
{
  dout(1) << "lease_timeout -- calling new election" << dendl;
  assert(mon->is_peon());
  logger->inc(l_paxos_lease_timeout);
  // The event has fired, so the stored handle is no longer pending.
  lease_timeout_event = 0;
  mon->bootstrap();
}

行文至此,lease的主要工作机理已经清楚。
但还遗留着几个问题。
1.extend_lease由谁来调用呢?
2.消息传递的整个流程是怎么样的呢?

关于问题2:消息传递的博客还在写。
关于问题1:首先可以看到extend_lease注册的定时事件会调用lease_renew_timeout()函数,在其中又调用了extend_lease,以进行不间断的lease消息发送。

// Runs when the renew event scheduled by extend_lease() fires: clears the
// event handle and starts the next lease round.
void Paxos::lease_renew_timeout()
{
  // The event has fired, so the stored handle is no longer pending.
  lease_renew_event = 0;
  extend_lease();
}

可以看到

// leader
void Paxos::handle_last(MonOpRequestRef op)

这个函数中调用了extend_lease()
那关于这个函数的作用呢,留在ceph paxos博客中再进行探讨

你可能感兴趣的:(Ceph)