一 引言
某地存储项目中由于每天都需要统计存储使用量及业务量,在某一日统计存储使用量时发现global中容量与各池的可用容量不一致,由此产生疑问。
二 Global & Pool统计
分析代码,发现ceph df在monitor中的处理逻辑位于Monitor.cc的handle_command中:
这部分代码里主要分为两部分:一部分是对于Global输出的统计,另外一部分是对于各池的统计。
// Excerpt of the "df" handling: emits the cluster-wide (GLOBAL) statistics
// first and the per-pool statistics second. When a formatter `f` is present
// both parts are wrapped in a "stats" section and flushed into `ds`;
// otherwise plain text is appended to `ds` directly.
{
bool verbose = (detail == "detail");
if (f)
f->open_object_section("stats");
pgservice->dump_fs_stats(&ds, f.get(), verbose);// global (cluster-wide) statistics
if (!f)
ds << '\n';
pgservice->dump_pool_stats(osdmon()->osdmap, &ds, f.get(), verbose);// per-pool statistics
if (f) {
f->close_section();
f->flush(ds);
ds << '\n';
}
}
非detail版本的Global主要依赖于osd_sum这个变量,这个变量的来源是osd_stat_t。osd_stat_t这个结构的值统计在update_osd_stat中。
三 Global统计
**monitor.cc:dump_fs_stats--->mgrstatmonitor.cc:dump_fs_stats**
// Delegates the cluster-wide statistics dump to `digest`, passing the text
// stream, the formatter and the verbosity flag through unchanged.
void dump_fs_stats(stringstream *ss,
Formatter *f,
bool verbose) const override {
digest.dump_fs_stats(ss, f, verbose);
}
**PGMap.cc:PGMapDigest::dump_fs_stats:**
// Emits the cluster-wide usage totals taken from osd_sum.
//
// With a formatter, writes a "stats" object holding the total, used and
// available byte counts (plus the object count when verbose). Without a
// formatter, renders a one-row "GLOBAL:" table into *ss, which must then be
// non-null.
void PGMapDigest::dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const
{
  if (!f) {
    // Plain-text output path.
    assert(ss != nullptr);

    TextTable tbl;
    tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
    if (verbose) {
      tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
    }

    tbl << stringify(si_t(osd_sum.kb*1024));
    tbl << stringify(si_t(osd_sum.kb_avail*1024));
    tbl << stringify(si_t(osd_sum.kb_used*1024));

    // Used fraction of the raw capacity; stays 0 when no capacity is reported.
    const float used_ratio =
      (osd_sum.kb > 0) ? static_cast<float>(osd_sum.kb_used) / osd_sum.kb
                       : 0.0f;
    tbl << percentify(used_ratio*100);

    if (verbose) {
      tbl << stringify(si_t(pg_sum.stats.sum.num_objects));
    }
    tbl << TextTable::endrow;

    *ss << "GLOBAL:\n";
    tbl.set_indent(4);
    *ss << tbl;
    return;
  }

  // Structured output path: osd_sum values are in KiB, reported in bytes.
  f->open_object_section("stats");
  f->dump_int("total_bytes", osd_sum.kb * 1024ull);
  f->dump_int("total_used_bytes", osd_sum.kb_used * 1024ull);
  f->dump_int("total_avail_bytes", osd_sum.kb_avail * 1024ull);
  if (verbose) {
    f->dump_int("total_objects", pg_sum.stats.sum.num_objects);
  }
  f->close_section();
}
void OSDService::update_osd_stat(vector& hb_peers)
{
// load osd stats first
struct store_statfs_t stbuf;
int r = osd->store->statfs(&stbuf);//获取osd 对应fs信息入buf
if (r < 0) {
derr << "statfs() failed: " << cpp_strerror(r) << dendl;
return;
}
auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());//更新osd_stat_t结构
dout(20) << "update_osd_stat " << new_stat << dendl;
assert(new_stat.kb);
float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
check_full_status(ratio);
}
由上可知,ceph df输出中Global一项来自于各osd磁盘的文件系统统计,与实际情况比较相符,符合线上项目现象。
四 POOL统计
void PGMapDigest::dump_pool_stats_full(
const OSDMap &osd_map,
stringstream *ss,
Formatter *f,
bool verbose) const
{
TextTable tbl;
/*按需输出格式*/
if (f) {
f->open_array_section("pools");
} else {
tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
if (verbose) {
tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
}
tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
if (verbose) {
tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
}
}
map avail_by_rule;
for (auto p = osd_map.get_pools().begin();
p != osd_map.get_pools().end(); ++p) {//遍历各pool
int64_t pool_id = p->first;
if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
continue;
const string& pool_name = osd_map.get_pool_name(pool_id);
const pool_stat_t &stat = pg_pool_sum.at(pool_id);//获取对应pool统计信息
const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
pool->get_type(),
pool->get_size());
int64_t avail;
float raw_used_rate;
if (avail_by_rule.count(ruleno) == 0) {
// FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
avail = get_rule_avail(ruleno);
if (avail < 0)
avail = 0;
avail_by_rule[ruleno] = avail;
} else {
avail = avail_by_rule[ruleno];
}
raw_used_rate = ::pool_raw_used_rate(osd_map, pool_id);
if (f) {
f->open_object_section("pool");
f->dump_string("name", pool_name);
f->dump_int("id", pool_id);
f->open_object_section("stats");
} else {
tbl << pool_name
<< pool_id;
if (verbose) {
if (pool->quota_max_objects == 0)
tbl << "N/A";
else
tbl << si_t(pool->quota_max_objects);
if (pool->quota_max_bytes == 0)
tbl << "N/A";
else
tbl << si_t(pool->quota_max_bytes);
}
}
dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);
if (f)
f->close_section(); // stats
else
tbl << TextTable::endrow;
if (f)
f->close_section(); // pool
}
if (f)
f->close_section();
else {
assert(ss != nullptr);
*ss << "POOLS:\n";
tbl.set_indent(4);
*ss << tbl;
}
}
其核心是dump_object_stat_sum这个函数,下面再次列出调用它的dump_pool_stats_full,并在关键步骤处加上注释:
void PGMapDigest::dump_pool_stats_full(
const OSDMap &osd_map,
stringstream *ss,
Formatter *f,
bool verbose) const
{
TextTable tbl;
if (f) {
f->open_array_section("pools");
} else {
tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
if (verbose) {
tbl.define_column("QUOTA OBJECTS", TextTable::LEFT, TextTable::LEFT);
tbl.define_column("QUOTA BYTES", TextTable::LEFT, TextTable::LEFT);
}
tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
if (verbose) {
tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
}
}
map avail_by_rule;
for (auto p = osd_map.get_pools().begin();
p != osd_map.get_pools().end(); ++p) {
int64_t pool_id = p->first;
if ((pool_id < 0) || (pg_pool_sum.count(pool_id) == 0))
continue;
const string& pool_name = osd_map.get_pool_name(pool_id);
const pool_stat_t &stat = pg_pool_sum.at(pool_id);//查找对应存储池对应的stat值
const pg_pool_t *pool = osd_map.get_pg_pool(pool_id);
int ruleno = osd_map.crush->find_rule(pool->get_crush_rule(),
pool->get_type(),
pool->get_size());//查找对应存储池的crushmap 以及副本策略
int64_t avail;
float raw_used_rate;
if (avail_by_rule.count(ruleno) == 0) {
// FIXME: we don't guarantee avail_space_by_rule is up-to-date before this function is invoked
avail = get_rule_avail(ruleno);
if (avail < 0)
avail = 0;
avail_by_rule[ruleno] = avail;
} else {
avail = avail_by_rule[ruleno];
}
raw_used_rate = ::pool_raw_used_rate(osd_map, pool_id);//根据副本类型获取已用容量
if (f) {
f->open_object_section("pool");
f->dump_string("name", pool_name);
f->dump_int("id", pool_id);
f->open_object_section("stats");
} else {
tbl << pool_name
<< pool_id;
if (verbose) {
if (pool->quota_max_objects == 0)
tbl << "N/A";
else
tbl << si_t(pool->quota_max_objects);
if (pool->quota_max_bytes == 0)
tbl << "N/A";
else
tbl << si_t(pool->quota_max_bytes);
}
}
dump_object_stat_sum(tbl, f, stat.stats.sum, avail, raw_used_rate, verbose, pool);//统计输出
if (f)
f->close_section(); // stats
else
tbl << TextTable::endrow;
if (f)
f->close_section(); // pool
}
if (f)
f->close_section();
else {
assert(ss != nullptr);
*ss << "POOLS:\n";
tbl.set_indent(4);
*ss << tbl;
}
}
以上,是按照pool的格式进行统计输出,其中最核心的各池stat信息来源并未说明,需要继续查找stat来源。利用stat结构反向查找,基本找不到好的线索来入手;但pool的used是通过osd来更新的,我们知道在发生write、delete等操作时会更新统计信息,即更新ctx->delta_stats。举例来说,可以把处理WRITE的op作为入手点:当处理CEPH_OSD_OP_WRITE类型的op时,会调用write_update_size_and_usage(),里面会更新ctx->delta_stats。当IO处理完,也就是applied和committed之后,会调用publish_stats_to_osd()。这里会将发生变化的pg的stat_queue_item入队到pg_stat_queue中,然后设置osd_stat_updated为True。入队之后,由tick_timer在C_Tick_WithoutOSDLock这个ctx中通过send_pg_stats()将PG的状态发送给Monitor。这样Monitor就可以知道pg的变化了。
可用空间,即MAX AVAIL的值,计算稍微有点复杂。Ceph是先计算Available的值,然后根据副本策略再计算MAX AVAIL的值。Available的值是在get_rule_avail()中计算的。在该函数中通过get_rule_weight_osd_map()算出来一个带weight的osd列表。注意这里的weight一般是小于1的,因为它除以了sum,而sum就是pool中所有osd weight的总和。在拿到weight列表后,就会用pg_map.osd_stat中各osd的kb_avail值除以对应的weight,选出其中最小的,作为Available的值。这么描述有些抽象,具体举一个例子。比如这里我们的pool中有三个osd,假设kb_avail都是400G,各osd的weight为{osd_0: 0.9, osd_1: 0.8, osd_2: 0.7},sum为2.4。计算出来的weight值是{osd_0: 0.9/2.4, osd_1: 0.8/2.4, osd_2: 0.7/2.4}。后面用各osd的available空间除以这里对应的weight值并取最小者:osd_0的weight最大,所得的商最小,所以这里的Available的值就是400G÷(0.9/2.4)=400G×2.4/0.9≈1066.7G。这一节引用自UnitedStack博客,博客里对于这个值的计算讲得非常清晰。