Ceph source code analysis -- onode LRU

The onode is BlueStore's per-object metadata; since BlueStore writes directly to a raw block device, it needs onodes to manage its objects. This article covers the onode caching algorithm.

BlueStore's cache comes in two flavors, LRU and TwoQ, but in both of them the onode metadata is cached with an LRU algorithm.
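
Before diving into the source, here is a minimal, generic LRU sketch (plain C++, not Ceph code; the SimpleLRU class and its string keys are made up for illustration). It maps directly onto the three questions discussed below: a hit moves the entry to the front, a new entry is pushed to the front, and trimming evicts from the tail.

#include <list>
#include <string>
#include <unordered_map>

class SimpleLRU {
  std::list<std::string> lru_;   // front = most recently used
  std::unordered_map<std::string, std::list<std::string>::iterator> map_;
public:
  // question 1: on a hit, move the entry to the front
  bool touch(const std::string& key) {
    auto it = map_.find(key);
    if (it == map_.end())
      return false;
    lru_.splice(lru_.begin(), lru_, it->second);
    return true;
  }
  // question 2: new entries are inserted at the front
  void add(const std::string& key) {
    if (touch(key))
      return;
    lru_.push_front(key);
    map_[key] = lru_.begin();
  }
  // question 3: trim evicts from the tail (least recently used)
  void trim(size_t max_entries) {
    while (lru_.size() > max_entries) {
      map_.erase(lru_.back());
      lru_.pop_back();
    }
  }
};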

1. How does an element already on the LRU move to the front when it is accessed?

This starts with get_onode, which in turn calls

BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)

BlueStore::OnodeRef BlueStore::Collection::get_onode(
  const ghobject_t& oid,
  bool create)
{
  assert(create ? lock.is_wlocked() : lock.is_locked());

  spg_t pgid;
  if (cid.is_pg(&pgid)) {
    if (!oid.match(cnode.bits, pgid.ps())) {
      lderr(store->cct) << __func__ << " oid " << oid << " not part of "
            << pgid << " bits " << cnode.bits << dendl;
      ceph_abort();
    }
  }

  OnodeRef o = onode_map.lookup(oid);
  if (o)
    return o;

  mempool::bluestore_cache_other::string key;
  get_object_key(store->cct, oid, &key);

  ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
            << pretty_binary_string(key) << dendl;

  bufferlist v;
  int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
  ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
  Onode *on;
  if (v.length() == 0) {
    assert(r == -ENOENT);
    if (!store->cct->_conf->bluestore_debug_misc &&
    !create)
      return OnodeRef();

    // new object, new onode
    on = new Onode(this, oid, key);
  } else {
    // loaded
    assert(r >= 0);
    on = new Onode(this, oid, key);
    on->exists = true;
    bufferptr::iterator p = v.front().begin_deep();
    on->onode.decode(p);
    for (auto& i : on->onode.attrs) {
      i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
    }

    // initialize extent_map
    on->extent_map.decode_spanning_blobs(p);
    if (on->onode.extent_map_shards.empty()) {
      denc(on->extent_map.inline_bl, p);
      on->extent_map.decode_some(on->extent_map.inline_bl);
      on->extent_map.inline_bl.reassign_to_mempool(
    mempool::mempool_bluestore_cache_other);
    } else {
      on->extent_map.init_shards(false, false);
    }
  }
  o.reset(on);
  return onode_map.add(oid, o);
}

Next, look at the lookup function: on a cache hit it calls the cache's
void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)

BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;
  OnodeRef o;
  bool hit = false;

  {
    std::lock_guard<std::recursive_mutex> l(cache->lock);
    ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
    if (p == onode_map.end()) {
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << p->second
                << dendl;
      cache->_touch_onode(p->second);
      hit = true;
      o = p->second;
    }
  }

  if (hit) {
    cache->logger->inc(l_bluestore_onode_hits);
  } else {
    cache->logger->inc(l_bluestore_onode_misses);
  }
  return o;
}

So void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
removes the hit onode from the list and pushes it back onto the front:

void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o);
  onode_lru.erase(p);
  onode_lru.push_front(*o);
}
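
onode_lru here is a boost::intrusive::list: the Onode object itself carries the list hook, so iterator_to can locate its position in O(1) without searching the list. A minimal standalone sketch of the same erase-then-push_front pattern (not Ceph code; the Node type and touch helper are made up for illustration):

#include <boost/intrusive/list.hpp>
#include <iostream>

struct Node : public boost::intrusive::list_base_hook<> {
  int id;
  explicit Node(int i) : id(i) {}
};

using lru_list_t = boost::intrusive::list<Node>;

// move an element that is already on the list to the front, as _touch_onode does
void touch(lru_list_t& lru, Node& n) {
  auto p = lru.iterator_to(n);   // O(1): derived from the hook embedded in n
  lru.erase(p);
  lru.push_front(n);
}

int main() {
  Node a(1), b(2), c(3);
  lru_list_t lru;
  lru.push_front(a);             // a
  lru.push_front(b);             // b a
  lru.push_front(c);             // c b a
  touch(lru, a);                 // a c b
  for (auto& n : lru)
    std::cout << n.id << " ";    // prints: 1 3 2
  std::cout << std::endl;
  lru.clear();                   // unlink before the nodes go out of scope
  return 0;
}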

2. How are elements added to the LRU?

Back to get_onode, which takes two parameters. When the onode is not found in onode_map, the object key is looked up in the KV store; if it is not there either and create is false, an empty OnodeRef is returned. Otherwise (create is true, or the onode was loaded from the KV store) a new Onode object is constructed and
BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
is called, which inserts the onode into onode_map and then calls the cache's
_add_onode(OnodeRef& o, int level) to put it on the LRU.

BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
{
  std::lock_guard<std::recursive_mutex> l(cache->lock);
  auto p = onode_map.find(oid);
  if (p != onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
              << " raced, returning existing " << p->second
              << dendl;
    return p->second;
  }
  ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
  onode_map[oid] = o;
  cache->_add_onode(o, 1);
  return o;
}

_add_onode puts the onode onto the LRU; every call site in the source passes level = 1, so onodes always go to the front:

void _add_onode(OnodeRef& o, int level) override {
  if (level > 0)
    onode_lru.push_front(*o);
  else
    onode_lru.push_back(*o);
}

3. When is the LRU trimmed?

In void *BlueStore::MempoolThread::entry() we can see that trim is driven by a timer, with an interval of bluestore_cache_trim_interval (0.2 s by default). It also shows that an OSD does not have just a single onode LRU but several: each cache shard has its own LRU. By default an HDD OSD has 5 shards and an SSD OSD has 8.

void *BlueStore::MempoolThread::entry()
{
  Mutex::Locker l(lock);
  while (!stop) {
    uint64_t meta_bytes =
      mempool::bluestore_cache_other::allocated_bytes() +
      mempool::bluestore_cache_onode::allocated_bytes();
    uint64_t onode_num =
      mempool::bluestore_cache_onode::allocated_items();

    if (onode_num < 2) {
      onode_num = 2;
    }

    float bytes_per_onode = (float)meta_bytes / (float)onode_num;
    size_t num_shards = store->cache_shards.size();
    float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
    // A little sloppy but should be close enough
    uint64_t shard_target = target_ratio * (store->cache_size / num_shards);

    for (auto i : store->cache_shards) {
      i->trim(shard_target,
          store->cache_meta_ratio,
          store->cache_data_ratio,
          bytes_per_onode);
    }

    store->_update_cache_logger();

    utime_t wait;
    wait += store->cct->_conf->bluestore_cache_trim_interval;
    cond.WaitInterval(lock, wait);
  }
  stop = false;
  return NULL;
}

Looking at the trim function, the cache's _trim is only called when current > target_bytes:

void BlueStore::Cache::trim(
  uint64_t target_bytes,
  float target_meta_ratio,
  float target_data_ratio,
  float bytes_per_onode)
{
  std::lock_guard<std::recursive_mutex> l(lock);
  uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
  uint64_t current_buffer = _get_buffer_bytes();
  uint64_t current = current_meta + current_buffer;

  uint64_t target_meta = target_bytes * target_meta_ratio;
  uint64_t target_buffer = target_bytes * target_data_ratio;

  // correct for overflow or float imprecision
  target_meta = min(target_bytes, target_meta);
  target_buffer = min(target_bytes - target_meta, target_buffer);

  if (current <= target_bytes) {
    dout(10) << __func__
         << " shard target " << pretty_si_t(target_bytes)
         << " meta/data ratios " << target_meta_ratio
         << " + " << target_data_ratio << " ("
         << pretty_si_t(target_meta) << " + "
         << pretty_si_t(target_buffer) << "), "
         << " current " << pretty_si_t(current) << " ("
         << pretty_si_t(current_meta) << " + "
         << pretty_si_t(current_buffer) << ")"
         << dendl;
    return;
  }

  uint64_t need_to_free = current - target_bytes;
  uint64_t free_buffer = 0;
  uint64_t free_meta = 0;
  if (current_buffer > target_buffer) {
    free_buffer = current_buffer - target_buffer;
    if (free_buffer > need_to_free) {
      free_buffer = need_to_free;
    }
  }
  free_meta = need_to_free - free_buffer;

  // start bounds at what we have now
  uint64_t max_buffer = current_buffer - free_buffer;
  uint64_t max_meta = current_meta - free_meta;
  uint64_t max_onodes = max_meta / bytes_per_onode;

  dout(10) << __func__
       << " shard target " << pretty_si_t(target_bytes)
       << " ratio " << target_meta_ratio << " ("
       << pretty_si_t(target_meta) << " + "
       << pretty_si_t(target_buffer) << "), "
       << " current " << pretty_si_t(current) << " ("
       << pretty_si_t(current_meta) << " + "
       << pretty_si_t(current_buffer) << "),"
       << " need_to_free " << pretty_si_t(need_to_free) << " ("
       << pretty_si_t(free_meta) << " + "
       << pretty_si_t(free_buffer) << ")"
       << " -> max " << max_onodes << " onodes + "
       << max_buffer << " buffer"
       << dendl;
  _trim(max_onodes, max_buffer);
}
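
To make the arithmetic concrete, take some purely hypothetical numbers: suppose a shard's target_bytes is 100 MB with target_meta_ratio 0.4 and target_data_ratio 0.6, bytes_per_onode is 4 KB, and the shard currently holds current_meta = 60 MB and current_buffer = 80 MB. Then current = 140 MB > 100 MB, so trimming proceeds: target_meta = 40 MB, target_buffer = 60 MB, and need_to_free = 40 MB. The buffer cache is 20 MB over its target, so free_buffer = 20 MB and the remaining free_meta = 20 MB. That leaves max_buffer = 60 MB and max_meta = 40 MB, i.e. max_onodes = 40 MB / 4 KB = 10240, and _trim(10240, 60 MB) is called.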

The first half of _trim deals with the data (buffer) cache; that part is not our focus and is omitted here:

void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
  dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
       << " buffers " << buffer_bytes << " / " << buffer_max
       << dendl;

  _audit("trim start");

  ···

  // onodes
  // trim when the LRU size exceeds the onode maximum
  int num = onode_lru.size() - onode_max;
  if (num <= 0)
    return; // don't even try
  // trim from the tail forward: the tail holds the least recently used onodes
  auto p = onode_lru.end();
  assert(p != onode_lru.begin());
  --p;
  int skipped = 0;
  int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
  while (num > 0) {
    Onode *o = &*p;
    dout(20) << __func__ << " considering " << o << dendl;
    int refs = o->nref.load();
    // check whether anyone else is still holding a reference
    if (refs > 1) {
      dout(20) << __func__ << "  " << o->oid << " has " << refs
           << " refs; skipping" << dendl;
      // once the maximum number of skipped (pinned) onodes is reached, stop trimming.
      // I think the intent is to finish the trim quickly and avoid stalling the main
      // IO path, since both the IO path and this trim take the same cache lock
      // (std::lock_guard l(lock)) when touching the LRU and related data
      if (++skipped >= max_skipped) {
        dout(20) << __func__ << " maximum skip pinned reached; stopping with "
                 << num << " left to trim" << dendl;
        break;
      }

      if (p == onode_lru.begin()) {
        break;
      } else {
        p--;
        num--;
        continue;
      }
    }
    dout(30) << __func__ << " " << o->oid << " num=" << num <<" lru size="<<onode_lru.size()<< dendl;
    // unlink from the LRU
    if (p != onode_lru.begin()) {
      onode_lru.erase(p--);
    } else {
      onode_lru.erase(p);
      assert(num == 1);
    }
    o->get();  // paranoia
    // remove from the onode_map
    o->c->onode_map.remove(o->oid);
    o->put();
    --num;
  }
}
