集群ceph df中pool 与 global不一致问题

一 引言
某地存储项目中由于每天都需要统计存储使用量及业务量,在某一日统计存储使用量时发现global中容量与各池的可用容量不一致,由此产生疑问。
二 Global & Pool统计
分析代码,发现ceph df在monitor中的处理逻辑是monito.cc handle_command:
这部分代码里主要分为两部分:一部分是对于Global输出的统计,另外一部分是对于各池的统计。

{
      bool verbose = (detail == "detail");
      if (f)
        f->open_object_section("stats");

      pgservice->dump_fs_stats(&ds, f.get(), verbose);//Global统计
      if (!f)
        ds << '\n';
      pgservice->dump_pool_stats(osdmon()->osdmap, &ds, f.get(), verbose);//各存储池统计

      if (f) {
        f->close_section();
        f->flush(ds);
        ds << '\n';
      }
    }

非detail版本的Global主要依赖于osd_sum这个变量,这个变量的来源是osd_stat_t。osd_stat_t这个结构的值统计在update_osd_stat中。

三 Global统计

**monitor.cc:dump_fs_stats--->mgrstatmonitor.cc:dump_fs_stats**
void dump_fs_stats(stringstream *ss,
		     Formatter *f,
		     bool verbose) const override {
    digest.dump_fs_stats(ss, f, verbose);
  }
  **mgrstatmonitor.cc:dump_fs_stats:**
  
void PGMapDigest::dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const
{
  if (f) {
    f->open_object_section("stats");
    f->dump_int("total_bytes", osd_sum.kb * 1024ull);
    f->dump_int("total_used_bytes", osd_sum.kb_used * 1024ull);
    f->dump_int("total_avail_bytes", osd_sum.kb_avail * 1024ull);
    if (verbose) {
      f->dump_int("total_objects", pg_sum.stats.sum.num_objects);
    }
    f->close_section();
  } else {
    assert(ss != nullptr);
    TextTable tbl;
    tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
    if (verbose) {
      tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
    }
    tbl << stringify(si_t(osd_sum.kb*1024))
        << stringify(si_t(osd_sum.kb_avail*1024))
        << stringify(si_t(osd_sum.kb_used*1024));
    float used = 0.0;
    if (osd_sum.kb > 0) {
      used = ((float)osd_sum.kb_used / osd_sum.kb);
    }
    tbl << percentify(used*100);
    if (verbose) {
      tbl << stringify(si_t(pg_sum.stats.sum.num_objects));
    }
    tbl << TextTable::endrow;

    *ss << "GLOBAL:\n";
    tbl.set_indent(4);
    *ss << tbl;
  }
}


void OSDService::update_osd_stat(vector& hb_peers)
{
  // load osd stats first
  struct store_statfs_t stbuf;
  int r = osd->store->statfs(&stbuf);//获取osd 对应fs信息入buf
  if (r < 0) {
    derr << "statfs() failed: " << cpp_strerror(r) << dendl;
    return;
  }

  auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());//更新osd_stat_t结构
  dout(20) << "update_osd_stat " << new_stat << dendl;
  assert(new_stat.kb);
  float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
  check_full_status(ratio);
}

由上可知,ceph df输出中Global一项来自于各osd磁盘的文件系统统计,与实际情况比较相符,符合线上项目现象。

四 POOL统计

此图引用至其他博客,能很好的描述pool统计的过程
集群ceph df中pool 与 global不一致问题_第1张图片


void PGMapDigest::dump_pool_stats_full(
  const OSDMap &osd_map,
  stringstream *ss,
  Formatter *f,
  bool verbose) const
{
  TextTable tbl;

/*按需输出格式*/
  if (f) {
    f->open_array_section("pools");
  } else {
    tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
    if (verbose) {
      tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
      tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
    }

    tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
    if (verbose) {
      tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
      tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
      tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
      tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
    }
  }

  map avail_by_rule;
  for (auto p = osd_map.get_pools().begin();
       p != osd_map.get_pools().end(); ++p) {//遍历各pool
    int64_t pool_id = p->first;
    if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
      continue;
    const string& pool_name = osd_map.get_pool_name(pool_id);
    const pool_stat_t &stat = pg_pool_sum.at(pool_id);//获取对应pool统计信息

    const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
    int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
                                         pool->get_type(),
                                         pool->get_size());
    int64_t avail;
    float raw_used_rate;
    if (avail_by_rule.count(ruleno) == 0) {
      // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
      avail = get_rule_avail(ruleno);
      if (avail < 0)
	avail = 0;
      avail_by_rule[ruleno] = avail;
    } else {
      avail = avail_by_rule[ruleno];
    }

    raw_used_rate = ::pool_raw_used_rate(osd_map, pool_id);

    if (f) {
      f->open_object_section("pool");
      f->dump_string("name", pool_name);
      f->dump_int("id", pool_id);
      f->open_object_section("stats");
    } else {
      tbl << pool_name
          << pool_id;
      if (verbose) {
        if (pool->quota_max_objects == 0)
          tbl << "N/A";
        else
          tbl << si_t(pool->quota_max_objects);

        if (pool->quota_max_bytes == 0)
          tbl << "N/A";
        else
          tbl << si_t(pool->quota_max_bytes);
      }

    }
    dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);
    if (f)
      f->close_section();  // stats
    else
      tbl << TextTable::endrow;

    if (f)
      f->close_section();  // pool
  }
  if (f)
    f->close_section();
  else {
    assert(ss != nullptr);
    *ss << "POOLS:\n";
    tbl.set_indent(4);
    *ss << tbl;
  }
}

其核心是dump_object_stat_sum这个函数:


void PGMapDigest::dump_pool_stats_full(
  const OSDMap &osd_map,
  stringstream *ss,
  Formatter *f,
  bool verbose) const
{
  TextTable tbl;

  if (f) {
    f->open_array_section("pools");
  } else {
    tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
    if (verbose) {
      tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
      tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
    }

    tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
    if (verbose) {
      tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
      tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
      tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
      tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
    }
  }

  map avail_by_rule;
  for (auto p = osd_map.get_pools().begin();
       p != osd_map.get_pools().end(); ++p) {
    int64_t pool_id = p->first;
    if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
      continue;
    const string& pool_name = osd_map.get_pool_name(pool_id);
    const pool_stat_t &stat = pg_pool_sum.at(pool_id);//查找对应存储池对应的stat值

    const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
    int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
                                         pool->get_type(),
                                         pool->get_size());//查找对应存储池的crushmap 以及副本策略
    int64_t avail;
    float raw_used_rate;
    if (avail_by_rule.count(ruleno) == 0) {
      // FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
      avail = get_rule_avail(ruleno);
      if (avail < 0)
	avail = 0;
      avail_by_rule[ruleno] = avail;
    } else {
      avail = avail_by_rule[ruleno];
    }

    raw_used_rate = ::pool_raw_used_rate(osd_map, pool_id);//根据副本类型获取已用容量

    if (f) {
      f->open_object_section("pool");
      f->dump_string("name", pool_name);
      f->dump_int("id", pool_id);
      f->open_object_section("stats");
    } else {
      tbl << pool_name
          << pool_id;
      if (verbose) {
        if (pool->quota_max_objects == 0)
          tbl << "N/A";
        else
          tbl << si_t(pool->quota_max_objects);

        if (pool->quota_max_bytes == 0)
          tbl << "N/A";
        else
          tbl << si_t(pool->quota_max_bytes);
      }

    }
    dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);//统计输出
    if (f)
      f->close_section();  // stats
    else
      tbl << TextTable::endrow;

    if (f)
      f->close_section();  // pool
  }
  if (f)
    f->close_section();
  else {
    assert(ss != nullptr);
    *ss << "POOLS:\n";
    tbl.set_indent(4);
    *ss << tbl;
  }
}

以上,是按照pool的格式进行统计输出,其中最核心的各池stat信息来源并未说明,需要继续查找stat来源。利用stat结构反向查找发现基本无好的线索来入手,结合pool的used是通过osd来更新,我们知道在发生write、delete等操作时会更新统计信息,更新ctx->delta_stats。举例的话,可以从处理WRITE的op为入手点,当处理CEPH_OSD_OP_WRITE类型的op的时候,会调用write_update_size_and_usage(),里面会更新ctx->delta_stats。当IO处理完,也就是applied和commited之后,会publish_stats_to_osd()。 这里会将变化的pg的stat_queue_item入队到pg_stat_queue中。然后设置osd_stat_updated为True。入队之后,由tick_timer在C_Tick_WithoutOSDLock这个ctx中通过send_pg_stats()将PG的状态发送给Monitor。这样Monitor就可以知道pg的的变化了。
可用空间,即MAX AVAIL的值,计算稍微有点复杂。Ceph是先计算Available的值,然后根据副本策略再计算MAX AVAIL的值。Available的值是在get_rule_avail()中计算的。在该函数中通过get_rule_weight_osd_map()算出来一个有weight的osd列表。 注意这里的weight一般是小于1的,因为它除以了sum。而sum就是pool中所有osd weight的总和。在拿到weight列表后,就会根据pg_map.osd_stat中kb_avail的值进行除以weight,选出其中最小的,作为Available的值。 这么描述有些抽象了,具体举一个例子。比如这里我们的pool中有三个osd,假设kb_avail都是400G 即, {osd_0: 0.9, osd_1, 0.8, osd_2: 0.7}。计算出来的weight值是{osd_0: 0.9/2.4,osd_1: 0.8/2.4,osd_2: 0.7/2.4} 这样后面用osd的available 空间除以这里的weight值,这里的Available的值就是400G*0.7/2.4。这一节引用至unitedstatck博客,博客里对于这个值的计算非常清晰。

你可能感兴趣的:(ceph)