Ceph BlueFS sits underneath RocksDB: it is a stripped-down file system that integrates the raw disk devices below it and exposes to RocksDB the minimal set of interfaces RocksDB requires. Its position in the Ceph BlueStore architecture is shown in the figure below.
BlueFS is brought up at OSD start through the _open_db function.
When an OSD is being created it is called as _open_db(true); when an OSD is being started it is called as _open_db(false, !open_db).
The call stack of _open_db is as follows:
```cpp
string fn = path + "/db";  // here path = "/var/lib/ceph/osd/ceph-0/"
if (create)
  kv_backend = cct->_conf->bluestore_kvbackend;  // "rocksdb"
else
  read_meta("kv_backend", &kv_backend);  // read_meta reads from the block device; if absent, falls back to the path/kv_backend file
if (create)
  do_bluefs = cct->_conf->bluestore_bluefs;
else
  read_meta("bluefs", &s);  // read_meta reads from the block device; if absent, falls back to the path/bluefs file
  if (s == "1")
    do_bluefs = true;
  else if (s == "0")
    do_bluefs = false;
```
When creating an OSD, kv_backend is simply set to "rocksdb"; otherwise read_meta is called to read the value of kv_backend from the block device. Similarly, when creating an OSD, do_bluefs is taken from the config option (default true); when starting an OSD, read_meta is again used to read the bluefs flag from the block device. read_meta is implemented as follows:
```cpp
string p = path + "/block";  // the block device is the main data device
_read_bdev_label(cct, p, &label);
  int fd = ::open(path.c_str(), O_RDONLY);
  bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);  // 4096 bytes
  ::close(fd);
  bufferlist::iterator p = bl.begin();
  decode(*label, p);
auto i = label.meta.find(key);
if (i == label.meta.end())
  return ObjectStore::read_meta(key, value);
*value = i->second;
```
read_meta thus reads the first 4096 bytes of the block device (the first 4096 bytes are reserved for the label), decodes the label, and looks up the requested key; if the key is not found there, ObjectStore::read_meta is called to read it from the corresponding file under /var/lib/ceph/osd/ceph-num.
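To make the label mechanism concrete, here is a minimal standalone sketch (read_label_block is a hypothetical helper, not the Ceph implementation) that reads the reserved 4096-byte label region at the start of a device; in BlueStore the buffer would then be decoded into a bluestore_bdev_label_t and its meta map searched for the key:

```cpp
// Standalone sketch, assuming only POSIX; read_label_block is a hypothetical helper.
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>
#include <vector>

static constexpr size_t BDEV_LABEL_BLOCK_SIZE = 4096;  // label area at the head of the device

int read_label_block(const char* dev_path, std::vector<char>& out) {
  int fd = ::open(dev_path, O_RDONLY);
  if (fd < 0) {
    perror("open");
    return -1;
  }
  out.resize(BDEV_LABEL_BLOCK_SIZE);
  ssize_t r = ::pread(fd, out.data(), out.size(), 0);  // the label lives at offset 0
  ::close(fd);
  if (r != static_cast<ssize_t>(out.size())) {
    fprintf(stderr, "short read of label block\n");
    return -1;
  }
  // In BlueStore, decode(label, bl) would now parse osd_uuid, size, btime,
  // description and the meta map (where keys such as "kv_backend" live).
  return 0;
}
```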
Back in _open_db:
```cpp
if (do_bluefs)
  bluefs = new BlueFS(cct);
    discard_cb[BDEV_WAL] = wal_discard_cb;
    discard_cb[BDEV_DB] = db_discard_cb;
    discard_cb[BDEV_SLOW] = slow_discard_cb;
  if (read_meta("path_block.db", &bfn) < 0)
    bfn = path + "/block.db";
  if (::stat(bfn.c_str(), &st) == 0)  // bfn is the block.db device
    bluefs->add_block_device(BlueFS::BDEV_DB, bfn, create && cct->_conf->bdev_enable_discard);  // BDEV_DB = 1
    if (bluefs->bdev_support_label(BlueFS::BDEV_DB))
      _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(BlueFS::BDEV_DB), "bluefs db", create);
    if (create)
      bluefs->add_block_extent(BlueFS::BDEV_DB, SUPER_RESERVED, bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
    bluefs_shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_single_shared_device = false;
  if (read_meta("path_block", &bfn) < 0)  // read_meta reads from the block device; if absent, falls back to the path/path_block file
    bfn = path + "/block";
  bluefs->add_block_device(bluefs_shared_bdev, bfn, false);
  if (create)
    uint64_t initial = bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio + cct->_conf->bluestore_bluefs_gift_ratio);
    initial = std::max(initial, cct->_conf->bluestore_bluefs_min);
    initial = p2roundup(initial, cct->_conf->bluefs_alloc_size);  // round up to the allocation unit
    uint64_t start = p2align((bdev->get_size() - initial) / 2, cct->_conf->bluefs_alloc_size);  // place the BlueFS extent roughly in the middle of the device
    bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
    bluefs_extents.insert(start, initial);
  if (read_meta("path_block.wal", &bfn) < 0)  // read_meta reads from the block device; if absent, falls back to the path/path_block.wal file
    bfn = path + "/block.wal";
  if (::stat(bfn.c_str(), &st) == 0)
    bluefs->add_block_device(BlueFS::BDEV_WAL, bfn, create && cct->_conf->bdev_enable_discard);
    if (bluefs->bdev_support_label(BlueFS::BDEV_WAL))
      _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(BlueFS::BDEV_WAL), "bluefs wal", create);
    if (create)
      bluefs->add_block_extent(BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE, bluefs->get_block_device_size(BlueFS::BDEV_WAL) - BDEV_LABEL_BLOCK_SIZE);
    kv_options["separate_wal_dir"] = "1";
    bluefs_single_shared_device = false;
```
If BlueFS is in use, a BlueFS instance is created and the discard callbacks are registered in it. Next, the three devices (block.db, block, block.wal) are set up:
(1) The block.db device is added to BlueFS's management via add_block_device; block.db stores the metadata generated internally by BlueStore. When creating an OSD, add_block_extent is also called to hand the device's space over to BlueFS, namely everything after SUPER_RESERVED (8192 bytes); the first SUPER_RESERVED bytes hold the label and the BlueFS super block. Finally, _check_or_set_bdev_label either writes label information to the block.db device or reads the label back from it.
(2) The block device is added to BlueFS's management; the block device mainly stores object data. The portion of the block device handed to BlueFS is computed (see the sketch after this list) and this space is also inserted into bluefs_extents; when the OSD starts, _open_fm persists the usage recorded in bluefs_extents into RocksDB, and BlueStore's allocator removes this space from its free-extent btree so it is not allocated twice.
(3) The block.wal device is added to BlueFS's management; _check_or_set_bdev_label either writes label information to the block.wal device or reads it back, and add_block_extent hands the device's space to BlueFS (by default from BDEV_LABEL_BLOCK_SIZE (4096) to the end).
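The size/offset arithmetic in step (2) can be illustrated with a small standalone sketch; the ratios, sizes and the p2roundup/p2align helpers below are re-implemented here purely for illustration (the ratio and size values are assumed defaults) and assume power-of-two alignment values:

```cpp
// Standalone sketch of the math used when gifting block-device space to BlueFS
// at mkfs time. Values are illustrative defaults, not authoritative.
#include <algorithm>
#include <cstdint>
#include <cstdio>

// p2roundup/p2align equivalents: round up / down to a power-of-two boundary.
static uint64_t p2roundup(uint64_t x, uint64_t align) { return (x + align - 1) & ~(align - 1); }
static uint64_t p2align(uint64_t x, uint64_t align) { return x & ~(align - 1); }

int main() {
  uint64_t dev_size   = 4ull << 40;   // assume a 4 TiB block device
  double   min_ratio  = 0.02;         // bluestore_bluefs_min_ratio (assumed default)
  double   gift_ratio = 0.02;         // bluestore_bluefs_gift_ratio (assumed default)
  uint64_t bluefs_min = 1ull << 30;   // bluestore_bluefs_min (assumed 1 GiB)
  uint64_t alloc_size = 1ull << 20;   // bluefs_alloc_size (assumed 1 MiB)

  uint64_t initial = static_cast<uint64_t>(dev_size * (min_ratio + gift_ratio));
  initial = std::max(initial, bluefs_min);
  initial = p2roundup(initial, alloc_size);            // round the gift up to the allocation unit
  // Start the extent roughly in the middle of the device, aligned down to alloc_size.
  uint64_t start = p2align((dev_size - initial) / 2, alloc_size);

  printf("gift %llu bytes to bluefs starting at offset %llu\n",
         (unsigned long long)initial, (unsigned long long)start);
  return 0;
}
```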
The call stack of add_block_device is as follows:
```cpp
bluefs->add_block_device(BlueFS::BDEV_DB, bfn, create && cct->_conf->bdev_enable_discard);  // BDEV_DB = 1
  BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, discard_cb[id], static_cast<void*>(this));  // path = bfn
    string type = "kernel";
    ::readlink(path.c_str(), buf, sizeof(buf) - 1);  // path is block.db
    char *bname = ::basename(buf);
    return new KernelDevice(cct, cb, cbpriv, d_cb, d_cbpriv);
  b->open(path);  // everything below runs against block.db
    path = p;  // p is the path passed to open, here block.db
    fd_direct = ::open(path.c_str(), O_RDWR | O_DIRECT);  // path is ".../block.db"
    fd_buffered = ::open(path.c_str(), O_RDWR);
    aio = cct->_conf->bdev_aio;
    dio = true;
    posix_fadvise(fd_buffered, 0, 0, POSIX_FADV_RANDOM);
    _lock();  // lock the file so only this process can use the device
      struct flock l;
      memset(&l, 0, sizeof(l));
      l.l_type = F_WRLCK;
      l.l_whence = SEEK_SET;
      ::fcntl(fd_direct, F_SETLK, &l);
    ::fstat(fd_direct, &st);
    if (S_ISBLK(st.st_mode))
      get_block_device_size(fd_direct, &s);
      size = s;
    else
      size = st.st_size;  // get the device size
    get_device_by_fd(fd_buffered, partition, devname, sizeof(devname));
    _aio_start();
      if (aio)
        aio_queue.init();
        aio_thread.create("bstore_aio");
    _discard_start();
      discard_thread.create("bstore_discard");
    size &= ~(block_size - 1);
  bdev[id] = b;
  ioc[id] = new IOContext(cct, NULL);
```
Its implementation:
(1) Create a KernelDevice instance for the device.
(2) Open the device twice via the open(2) system call, once with O_DIRECT (unbuffered) and once buffered.
(3) Take a write lock on the device with fcntl, so that only this process can write to it.
(4) Start the device's aio_thread, which reaps completed aio requests and their results.
(5) Start the device's discard_thread, which is responsible for issuing discards for freed space.
As can be seen, block, block.db and block.wal all go through this same processing. A minimal standalone sketch of steps (2) and (3) follows.
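The sketch below (open_kernel_device is a hypothetical helper; error handling is abbreviated) shows how a device could be opened the way KernelDevice::open does: one O_DIRECT descriptor, one buffered descriptor, an exclusive fcntl write lock, and the size taken either from the block device or from stat():

```cpp
// Standalone Linux sketch, not the Ceph implementation.
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <unistd.h>
#include <cstdint>
#include <cstring>

struct OpenedDevice {
  int fd_direct = -1;
  int fd_buffered = -1;
  uint64_t size = 0;
};

int open_kernel_device(const char* path, OpenedDevice* out) {
  out->fd_direct = ::open(path, O_RDWR | O_DIRECT);
  out->fd_buffered = ::open(path, O_RDWR);
  if (out->fd_direct < 0 || out->fd_buffered < 0)
    return -1;
  ::posix_fadvise(out->fd_buffered, 0, 0, POSIX_FADV_RANDOM);

  struct flock l;                      // advisory write lock: only this process may write
  memset(&l, 0, sizeof(l));
  l.l_type = F_WRLCK;
  l.l_whence = SEEK_SET;
  if (::fcntl(out->fd_direct, F_SETLK, &l) < 0)
    return -1;

  struct stat st;
  if (::fstat(out->fd_direct, &st) < 0)
    return -1;
  if (S_ISBLK(st.st_mode)) {
    uint64_t s = 0;
    ::ioctl(out->fd_direct, BLKGETSIZE64, &s);   // block device: ask the kernel for its size
    out->size = s;
  } else {
    out->size = st.st_size;                      // regular file: size from stat()
  }
  return 0;
}
```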
The call stack of add_block_extent is as follows:
```cpp
bluefs->add_block_extent(BlueFS::BDEV_DB, SUPER_RESERVED, bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
  block_all[id].insert(offset, length);  // space available for storage; for block.db this is everything after SUPER_RESERVED
  if (id < alloc.size() && alloc[id])
    log_t.op_alloc_add(id, offset, length);
    _flush_and_sync_log(l);
    alloc[id]->init_add_free(offset, length);
```
(1) Insert the extent into block_all.
(2) Call the allocator's init_add_free to insert the free extent into the allocator's btree (a conceptual sketch of this bookkeeping follows).
Because no StupidAllocator instances have been created yet at this point, step (2) is not executed here.
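As a conceptual stand-in for the allocator's free-extent bookkeeping (this is not StupidAllocator, which keeps btree-backed interval sets binned by extent size; FreeExtentSet below is purely illustrative), consider an offset-ordered map of free extents:

```cpp
// Conceptual sketch only: an offset-ordered set of free extents playing the
// init_add_free / allocate roles mentioned above.
#include <cstdint>
#include <map>
#include <optional>
#include <utility>

class FreeExtentSet {
  std::map<uint64_t, uint64_t> free_;   // offset -> length

public:
  // Counterpart of init_add_free: register [offset, offset+length) as free.
  void init_add_free(uint64_t offset, uint64_t length) {
    free_[offset] = length;             // merging of adjacent extents omitted
  }

  // Counterpart of a (greatly simplified) allocate: first-fit, single extent.
  std::optional<std::pair<uint64_t, uint64_t>> allocate(uint64_t want) {
    for (auto it = free_.begin(); it != free_.end(); ++it) {
      if (it->second >= want) {
        uint64_t off = it->first;
        uint64_t left = it->second - want;
        free_.erase(it);
        if (left)
          free_[off + want] = left;     // keep the unused tail as free space
        return std::make_pair(off, want);
      }
    }
    return std::nullopt;                // no single extent is large enough
  }
};
```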
The call stack of _check_or_set_bdev_label is as follows:
```cpp
if (create)
  label.osd_uuid = fsid;
  label.size = size;
  label.btime = ceph_clock_now();
  label.description = desc;
  _write_bdev_label(cct, path, label);
    ::open(path.c_str(), O_WRONLY);
    bl.write_fd(fd);
    ::fsync(fd);
else
  _read_bdev_label(cct, path, &label);  // read the bluestore_bdev_label_t from the first BDEV_LABEL_BLOCK_SIZE bytes of the device
```
When creating an OSD, the label information is written to the device; when starting an OSD, the label is simply read back.
Back in _open_db:
```cpp
if (create)
  bluefs->mkfs(fsid);
    _init_alloc();
      alloc.resize(MAX_BDEV);  // MAX_BDEV = 3
      pending_release.resize(MAX_BDEV);
      for (unsigned id = 0; id < bdev.size(); ++id)  // create an allocator for each device
        alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator /* "stupid" */, bdev[id]->get_size(), cct->_conf->bluefs_alloc_size);
        interval_set<uint64_t>& p = block_all[id];  // the space registered earlier for storing data
        for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q)
          alloc[id]->init_add_free(q.get_start(), q.get_len());  // insert into each allocator's btree
    super.version = 1;
    super.block_size = bdev[BDEV_DB]->get_block_size();
    super.osd_uuid = osd_uuid;
    log_file->fnode.ino = 1;
    log_file->fnode.prefer_bdev = BDEV_WAL;  // BDEV_WAL = 0
    _allocate(log_file->fnode.prefer_bdev, cct->_conf->bluefs_max_log_runway, &log_file->fnode);  // bluefs_max_log_runway = 4194304
      left = round_up_to(len, min_alloc_size);
      PExtentVector extents;
      alloc[id]->reserve(left);
      extents.reserve(4);
      alloc_len = alloc[id]->allocate(left, min_alloc_size, hint, &extents);  // take free space from the btree and hand it out as extents
      for (auto& p : extents)
        node->append_extent(bluefs_extent_t(id, p.offset, p.length));
    log_writer = _create_writer(log_file);
      FileWriter *w = new FileWriter(f);
      for (unsigned i = 0; i < MAX_BDEV; ++i)
        w->iocv[i] = new IOContext(cct, NULL);
      return w;
    super.log_fnode = log_file->fnode;
    _write_super();
      encode(super, bl);
      bdev[BDEV_DB]->write(get_super_offset(), bl, false);  // write to block.db
    flush_bdev();
      for (auto p : bdev)
        if (p)
          p->flush();
    super = bluefs_super_t();
    _close_writer(log_writer);
      for (unsigned i = 0; i < MAX_BDEV; ++i)
        if (h->iocv[i])
          bdev[i]->queue_reap_ioc(h->iocv[i]);
    block_all.clear();
    _stop_alloc();
    _shutdown_logger();
```
When creating an OSD, mkfs is also called. mkfs does the following:
(1) Call _init_alloc to add each device's free space to its StupidAllocator; the free space was inserted into block_all earlier by add_block_extent, and StupidAllocator keeps free extents in a btree.
(2) Call _allocate to allocate space for the log file. The log file's ino is 1 and block.wal is the preferred device; the first allocation is 4 MB. BlueFS's _allocate eventually calls StupidAllocator's allocate to carve out concrete space, and finally appends the allocated pextents to the log file's extents.
(3) Write the BlueFS super block via _write_super. The super block contains the version, block_size, the log file's fnode information, and so on; _write_super writes it at offset 4096 of the block.db device (a simplified sketch follows this list).
(4) Journal the contents of block_all: for every free extent, log_t.op_alloc_add(bdev, q.get_start(), q.get_len()) is issued, and the records are persisted to the log in **_flush_and_sync_log**.
(5) Do some cleanup, such as clearing block_all and calling _close_writer, because _mount will read everything back from the log and replay it.
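To visualize step (3), here is a heavily simplified standalone sketch of writing a superblock-like structure into the 4 KiB slot that follows the label; MiniSuper and the flat memcpy encoding are illustrative only, since the real code encodes bluefs_super_t with Ceph's encode() machinery:

```cpp
// Standalone sketch: write a simplified superblock into the 4 KiB slot at offset
// 4096 of the DB device (right after the 4 KiB label).
#include <fcntl.h>
#include <unistd.h>
#include <cstdint>
#include <cstring>

struct MiniSuper {              // simplified stand-in for bluefs_super_t
  uint64_t version;
  uint64_t block_size;
  uint64_t log_ino;             // the log file's inode number (1)
};

int write_super(const char* db_dev, const MiniSuper& super) {
  char block[4096];
  memset(block, 0, sizeof(block));
  memcpy(block, &super, sizeof(super));                   // illustrative flat encoding
  int fd = ::open(db_dev, O_WRONLY);
  if (fd < 0)
    return -1;
  ssize_t r = ::pwrite(fd, block, sizeof(block), 4096);   // super slot: offset 4096, length 4096
  ::fsync(fd);
  ::close(fd);
  return r == (ssize_t)sizeof(block) ? 0 : -1;
}
```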
Back in _open_db:
Immediately afterwards, mount is called; its call stack is as follows:
```cpp
_open_super();
  bdev[BDEV_DB]->read(get_super_offset(), get_super_length(), &bl, ioc[BDEV_DB], false);  // a pread(2) under the hood
  bufferlist::iterator p = bl.begin();
  decode(super, p);
block_all.clear();
block_all.resize(MAX_BDEV);
_init_alloc();
_replay(false, false);
log_writer = _create_writer(_get_file(1));
log_writer->pos = log_writer->file->fnode.size;
```
(1) Read the BlueFS super block.
(2) Call _init_alloc to create a StupidAllocator instance for each device.
(3) Call _replay to replay the log, which restores the allocated space, the created files and directories, etc. into memory (see the conceptual sketch below).
(4) Create the log file writer handle.
(5) For the files recovered by _replay, call init_rm_free to remove the space they occupy from the StupidAllocator btrees.
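Conceptually, the replay in step (3) walks the journaled metadata operations in order and re-applies each one to the in-memory state. The sketch below illustrates that pattern only; the record layout and op set are invented for illustration, while BlueFS's real log transactions carry ops such as op_alloc_add, op_dir_create, op_dir_link and op_file_update:

```cpp
// Conceptual replay sketch: rebuild in-memory maps from an ordered list of ops.
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

enum class Op { AllocAdd, DirCreate, FileUpdate };

struct LogRecord {
  Op op;
  unsigned bdev = 0;            // for AllocAdd
  uint64_t offset = 0, length = 0;
  std::string name;             // dir name for DirCreate
  uint64_t ino = 0, size = 0;   // for FileUpdate
};

struct InMemoryState {
  std::map<unsigned, std::vector<std::pair<uint64_t, uint64_t>>> free_space;
  std::map<std::string, bool> dirs;
  std::map<uint64_t, uint64_t> file_sizes;   // ino -> size
};

void replay(const std::vector<LogRecord>& log, InMemoryState& st) {
  for (const auto& rec : log) {
    switch (rec.op) {
    case Op::AllocAdd:                 // space handed to BlueFS becomes free space
      st.free_space[rec.bdev].emplace_back(rec.offset, rec.length);
      break;
    case Op::DirCreate:                // directories exist only in the log + dir_map
      st.dirs[rec.name] = true;
      break;
    case Op::FileUpdate:               // (re)create or resize a file's fnode
      st.file_sizes[rec.ino] = rec.size;
      break;
    }
  }
}
```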
Back in _open_db:
The last part of _open_db is the RocksDB-related initialization.
```cpp
env = new BlueRocksEnv(bluefs);
fn = "db";
if (create)
  env->CreateDir(fn);
    fs->mkdir(dirname);
      dir_map[dirname] = new Dir;
      log_t.op_dir_create(dirname);  // mkdir writes no directory data to disk; it only appends an op_dir_create record to the log
  if (kv_options.count("separate_wal_dir"))
    env->CreateDir(fn + ".wal");
  if (kv_options.count("rocksdb_db_paths"))
    env->CreateDir(fn + ".slow");
```
BlueRocksEnv inherits from rocksdb::EnvWrapper and provides RocksDB with the customized interfaces it needs.
When creating an OSD, three directories are created, named db, db.wal and db.slow. Creating a directory merely inserts an entry into dir_map and records the creation in the log via op_dir_create.
```cpp
db = KeyValueDB::create(cct, kv_backend, fn, kv_options, static_cast<void*>(env));
  return new RocksDBStore(cct, dir, options, p);
FreelistManager::setup_merge_operators(db);
  BitmapFreelistManager::setup_merge_operator(db, "b");
    db->set_merge_operator(prefix, merge_op);  // prefix = "b"
      merge_ops.push_back(std::make_pair(prefix, mop));
db->set_merge_operator(PREFIX_STAT, merge_op);
  merge_ops.push_back(std::make_pair(prefix, mop));
db->set_cache_size(cache_size * cache_kv_ratio);
cct->_conf->with_val<string>("bluestore_rocksdb_cfs", get_str_map, &cf_map, " \t");
for (auto& i : cf_map) {
  cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));  // only L, M, P?
}
db->init(options);
if (to_repair_db)
  return 0;
if (create)
  if (cct->_conf->get_val<bool>("bluestore_rocksdb_cf"))  // false
    r = db->create_and_open(err, cfs);
  else
    r = db->create_and_open(err);
      create_db_dir();
        unique_ptr<rocksdb::Directory> dir;
        env->NewDirectory(path, &dir);
          result->reset(new BlueRocksDirectory(fs));  // fs is of type BlueFS
      if (cfs.empty())
        return do_open(out, true, nullptr);
      else
        return do_open(out, true, &cfs);
        load_rocksdb_options(create_if_missing, opt);
        if (create_if_missing)
          rocksdb::DB::Open(opt, path, &db);
        default_cf = db->DefaultColumnFamily();
```
This part of the code does the following:
(1) Create a RocksDBStore instance (the KeyValueDB implementation backed by RocksDB).
(2) Set up the RocksDB column families, named L, M and P.
(3) Call init to initialize RocksDB's option settings.
(4) Call do_open to open RocksDB, and obtain the default column family via RocksDB's DefaultColumnFamily interface (a minimal sketch of this open path follows).
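Step (4) ultimately boils down to the standard RocksDB open path with an injected Env. Below is a minimal sketch using the stock RocksDB API, with rocksdb::Env::Default() standing in for BlueRocksEnv; the options and the db path ("db") are illustrative, not the exact settings BlueStore uses:

```cpp
// Minimal sketch of opening RocksDB with a custom Env, in the spirit of
// RocksDBStore::do_open. Env::Default() is a stand-in for BlueRocksEnv.
#include <rocksdb/db.h>
#include <rocksdb/env.h>
#include <rocksdb/options.h>
#include <cstdio>

int open_db_sketch() {
  rocksdb::Options opt;
  opt.create_if_missing = true;          // corresponds to the create path above
  opt.env = rocksdb::Env::Default();     // BlueStore would pass its BlueRocksEnv here

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(opt, "db", &db);
  if (!s.ok()) {
    fprintf(stderr, "open failed: %s\n", s.ToString().c_str());
    return -1;
  }
  rocksdb::ColumnFamilyHandle* default_cf = db->DefaultColumnFamily();
  (void)default_cf;                      // BlueStore keeps this handle for later reads/writes
  delete db;
  return 0;
}
```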
After _open_db finishes, if this is OSD creation, the following two lines are also executed to write kv_backend and bluefs into the block device's label area and into the corresponding files under /var/lib/ceph/osd/ceph-num; these two labels are loaded when the OSD starts.
```cpp
write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
write_meta("bluefs", stringify(bluefs ? 1 : 0));
```