OSD::mkfs()
----BlueStore::mkfs()
--------BlueStore::_open_db()
-----------bluefs->add_block_device()
-----------bluefs->add_block_extent()
-----------bluefs->mkfs()
-----------bluefs->mount()
---------------BlueFS::_open_super()
---------------BlueFS::_init_alloc()
---------------BlueFS::_replay()
在osd的部署过程中会对BlueStore进行初始化
1、设置fsid,设置为 osd的uuid
2、初始化bluestore
3、挂载bluestore
4、读取bluestore的superblock信息,如果已存在superblock,则检查相关信息;如果不存在,则先设置cluster_fsid、osd_fsid、whoami、compat_features信息,然后通过事务创建superblock。
5、写fsid文件
// Abridged excerpt of OSD::mkfs(): formats and mounts the object store, creates
// the OSD superblock when none can be read, then writes the OSD metadata.
// Declarations (ret, sb, bl, sbbl, osr) and error handling are omitted here.
int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
uuid_d fsid, int whoami)
{
// if we are fed a uuid for this osd, use it.
store->set_fsid(cct->_conf->osd_uuid); // set the store's fsid to the configured osd uuid
ret = store->mkfs(); // initialize (format) the object store, e.g. BlueStore
store->set_cache_shards(1); // doesn't matter for mkfs!
ret = store->mount(); // mount the freshly formatted store
ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl); // try to read the OSD superblock object
if (ret >= 0) {
// a superblock already exists: validate its contents (checks elided in this excerpt)
} else {
// no superblock yet: fill in a new one
sb.cluster_fsid = fsid;
sb.osd_fsid = store->get_fsid();
sb.whoami = whoami;
sb.compat_features = get_osd_initial_compat_set();
// create the meta collection and write the superblock object in one store transaction
// (the encoding of sb into bl is elided in this excerpt)
ObjectStore::Transaction t;
t.create_collection(coll_t::meta(), 0);
t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
ret = store->apply_transaction(osr.get(), std::move(t));
}
// write the metadata files (the original note calls this "the fsid file")
ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
}
bluestore的mkfs过程
// Abridged excerpt of BlueStore::mkfs(): checks existing metadata, prepares the
// fsid and block files, opens the block device, the key/value DB (via BlueFS)
// and the freelist manager, then records the chosen backends in the metadata.
// Declarations, error handling and cleanup paths are omitted here.
int BlueStore::mkfs()
{
r = read_meta("mkfs_done", &done); // read the "mkfs_done" meta key -- NOTE(review): presumably a previously completed mkfs is detected here and fsck'ed instead; confirm
r = read_meta("type", &type); // read the "type" meta key to check whether this store is a bluestore
freelist_type = "bitmap"; // use the bitmap format for free-space tracking
r = _open_path(); // open the store path -- NOTE(review): the original note attaches BlueStore's 4GB hard limit check on osd_max_object_size to this line; confirm where that check lives
r = _open_fsid(true); // NOTE(review): presumably opens (create=true) the fsid file; the original author was unsure
r = _lock_fsid(); // lock the fsid file; per the original note, failing to lock may mean another ceph-osd is still running
r = _read_fsid(&old_fsid); // read the existing fsid -- NOTE(review): presumably validated, with a random fsid generated when none exists; confirm
r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
cct->_conf->bluestore_block_size,
cct->_conf->bluestore_block_create); // set up the "block" symlink or file
if (cct->_conf->bluestore_bluefs) {
r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
cct->_conf->bluestore_block_wal_size,
cct->_conf->bluestore_block_wal_create); // set up the "block.wal" symlink or file
r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
cct->_conf->bluestore_block_db_size,
cct->_conf->bluestore_block_db_create); // set up the "block.db" symlink or file
}
r = _open_bdev(true); // open the block device (create=true)
// set min_alloc_size (elided in this excerpt)
...
r = _open_db(true); // initialize BlueFS and the key/value DB; per the original note the block device is closed on failure (cleanup path elided)
r = _open_fm(true); // initialize the FreelistManager; per the original note the db is closed on failure (cleanup path elided)
r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend); // record which key/value backend is used
r = write_meta("bluefs", stringify(bluefs ? 1 : 0)); // record whether BlueFS is in use
return r;
}
初始化rocksdb
// Abridged excerpt of BlueStore::_open_db(): when BlueFS is enabled, registers
// the block.db / block / block.wal devices with a new BlueFS instance (adding
// their extents and formatting BlueFS when `create` is true), mounts BlueFS,
// then creates and initializes the key/value database.
// Declarations and error handling are omitted here.
int BlueStore::_open_db(bool create)
{
// after the store has been created, check the kv_backend and bluefs metadata (elided)
...
// initialize BlueFS
rocksdb::Env *env = NULL;
if (do_bluefs) {
bluefs = new BlueFS(cct);
// register <path>/block.db as the BDEV_DB device, if that file exists
bfn = path + "/block.db";
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
if (create) {
// hand BlueFS the device space from SUPER_RESERVED up to the end of the device;
// per the original note SUPER_RESERVED is 8192, i.e. the extent starts at the third 4K block
bluefs->add_block_extent(
BlueFS::BDEV_DB,
SUPER_RESERVED,
bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
}
}
// shared device
// register <path>/block as the shared device and, on create, add its initial extent
// (computation of start/initial is elided in this excerpt)
bfn = path + "/block";
r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
if (create) {
bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
bluefs_extents.insert(start, initial);
}
// register <path>/block.wal as the BDEV_WAL device, if that file exists, and add its space on create
bfn = path + "/block.wal";
if (::stat(bfn.c_str(), &st) == 0) {
r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
if (create) {
bluefs->add_block_extent(BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE, bluefs->get_block_device_size(BlueFS::BDEV_WAL) -BDEV_LABEL_BLOCK_SIZE);
}
}
if (create) {
bluefs->mkfs(fsid); // format BlueFS, passing in the store fsid
}
r = bluefs->mount(); // mount BlueFS
// create and configure the BlueRocksEnv (elided in this excerpt)
...
}
db = KeyValueDB::create(cct, kv_backend, fn, static_cast(env)); // create the key/value db
// install merge operators (FreelistManager's and the PREFIX_STAT one) and size the cache
FreelistManager::setup_merge_operators(db);
db->set_merge_operator(PREFIX_STAT, merge_op);
db->set_cache_size(cache_kv_ratio * cache_size);
if (kv_backend == "rocksdb")
options = cct->_conf->bluestore_rocksdb_options;
db->init(options); // initialize the db with the configured rocksdb options
}
bluefs的mkfs过程
// Abridged excerpt of BlueFS::mkfs(): fills in a fresh BlueFS superblock,
// initializes the log file, writes the superblock to disk and then tears down
// the temporary in-memory state it set up.
int BlueFS::mkfs(uuid_d osd_uuid)
{
// set up the allocators and the logger needed while formatting
_init_alloc();
_init_logger();
// fill in the superblock fields
super.version = 1;
super.block_size = bdev[BDEV_DB]->get_block_size();
super.osd_uuid = osd_uuid;
super.uuid.generate_random();
// initialize the log file (log_file) and the log transaction (log_t) -- elided in this excerpt
...
// write the superblock, recording where the log file lives
super.log_fnode = log_file->fnode;
_write_super();
flush_bdev();
// clean up -- NOTE(review): the original author was unsure what this targets; it presumably
// undoes the _init_alloc()/_init_logger() calls above and drops the in-memory extent bookkeeping
block_all.clear();
block_total.clear();
_stop_alloc();
_shutdown_logger();
}
//加载BlueFS
BlueFS加载(BlueFS::mount())
加载superblock到内存
初始化各存储空间的块分配器
BlueFS将存储空间划分为三层:慢速(Slow)空间、高速(DB)空间、超高速(WAL)空间,每种类型空间使用各自的块分配器,块分配器负责该存储空间中空闲空间的分配与回收,块分配器的工作原理我们将在以后的章节中讨论。
日志回放
BlueFS元数据都是作为日志持久化在硬盘中,在加载BlueFS时候对日志进行replay还原到内存中,由于日志在持久化时都是根据操作顺序append到日志文件当中,因此在replay的时候只要顺序逐条对日志进行解析就能将BlueFS的当前元数据还原到内存中。
日志回放后会在内存中建立dir_map和file_map,此外块分配器中会添加为不同存储空间分配的磁盘空间。
标记已分配空间
日志回放过程中并未将分配给文件的空间从空闲空间中移除,仅当日志回放完成后,所有文件元数据全部加载到内存中,再通过遍历file_map中文件的地址空间映射信息,移除相应的块分配器中的空闲空间,防止已分配空间的重复分配。
创建log_writer
log_writer为日志文件的句柄,用于向日志中追加日志项。
// Abridged excerpt of BlueFS::mount(): loads the superblock, initializes the
// allocators, replays the log to rebuild the in-memory metadata, removes the
// extents owned by files from the allocators, and opens the log writer.
int BlueFS::mount()
{
int r = _open_super(); // load the superblock
// initialize the allocators (the original note: to all of the disk space)
_init_alloc();
// replay the file system log; each log entry is a transaction of ops, and replaying
// them updates the in-memory dir_map / file_map
r = _replay(false);
// for every file in file_map, remove the extents it occupies from the free space
for (auto& p : file_map) {
for (auto& q : p.second->fnode.extents) {
alloc[q.bdev]->init_rm_free(q.offset, q.length); // mark this file extent as no longer free
}
}
// create log_writer, the handle used to append entries to the log file; the log file has fnode.ino == 1
log_writer = _create_writer(_get_file(1));
assert(log_writer->file->fnode.ino == 1);
log_writer->pos = log_writer->file->fnode.size; // continue appending at the current end of the log
_init_logger();
}
Superblock部分:
加载superblock到内存,并且使用校验和的方式对superblock内容进行校验
// Abridged excerpt of BlueFS::_open_super(): reads the superblock from the
// BDEV_DB device, decodes it and verifies its crc32c checksum.
// Returns -EIO when the stored checksum does not match the computed one.
int BlueFS::_open_super()
{
// always the second block: the superblock is read from get_super_offset()
r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
&bl, ioc[BDEV_DB], false);
bufferlist::iterator p = bl.begin();
// decode the superblock contents
::decode(super, p);
// compute the checksum over exactly the bytes consumed by the decode above
{
t.substr_of(bl, 0, p.get_off());
crc = t.crc32c(-1);
}
// decode the checksum stored right after the superblock
::decode(expected_crc, p);
// reject the superblock if the computed checksum differs from the stored one
if (crc != expected_crc) {
return -EIO;
}
}
对bluefs进行还原。读取log_file,根据其中记录的历史事务来还原bluefs:对每一个事务,逐个判断其中操作的类型,并根据操作类型执行相应的操作,从而还原之前的文件系统环境。log_file的元数据信息fnode保存在superblock的log_fnode中。
// Abridged excerpt of BlueFS::_replay(): reads the BlueFS log file block by
// block and applies every op of each logged transaction to rebuild the
// in-memory state (block_all/block_total, allocators, dir_map, file_map).
// When `noop` is true the ops are still decoded, but the state updates guarded
// by `if (!noop)` are skipped.
// Declarations, the decoding of the transaction/op header, loop termination
// and the function's closing brace are omitted in this excerpt.
int BlueFS::_replay(bool noop)
{
// the superblock's log_fnode holds the fnode of the log file
log_file->fnode = super.log_fnode;
// set up a reader over log_file, prefetching up to bluefs_max_prefetch
FileReader *log_reader = new FileReader(
log_file, cct->_conf->bluefs_max_prefetch,
false, // !random
true); // ignore eof
while (true) {
{
// read super.block_size bytes of the log per iteration
int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
&bl, NULL);
read_pos += r;
}
{
bluefs_transaction_t t;
bufferlist::iterator p = t.op_bl.begin();
// walk through the ops encoded in the transaction's op_bl, one at a time
while (!p.end()) {
switch (op) {
case bluefs_transaction_t::OP_INIT:
...
break;
case bluefs_transaction_t::OP_JUMP:
{
// jump to a new sequence number and log offset, skipping the bytes in between
::decode(next_seq, p);
::decode(offset, p);
log_seq = next_seq - 1; // we will increment it below
uint64_t skip = offset - read_pos;
if (skip) {
bufferlist junk;
int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk, NULL);
}
}
break;
case bluefs_transaction_t::OP_JUMP_SEQ:
{
// jump to a new sequence number only
::decode(next_seq, p);
log_seq = next_seq - 1; // we will increment it below
}
break;
case bluefs_transaction_t::OP_ALLOC_ADD:
{
::decode(id, p);
::decode(offset, p);
::decode(length, p);
if (!noop) {
// block_all tracks the (offset, length) extents added for device `id`,
// block_total their summed length; the extent is also added to the allocator as free
block_all[id].insert(offset, length);
block_total[id] += length;
alloc[id]->init_add_free(offset, length);
}
}
break;
case bluefs_transaction_t::OP_ALLOC_RM:
{
::decode(id, p);
::decode(offset, p);
::decode(length, p);
if (!noop) {
// reverse of OP_ALLOC_ADD: drop the extent from the bookkeeping and from the allocator's free space
block_all[id].erase(offset, length);
block_total[id] -= length;
alloc[id]->init_rm_free(offset, length);
}
}
break;
case bluefs_transaction_t::OP_DIR_LINK:
{
::decode(dirname, p);
::decode(filename, p);
::decode(ino, p);
if (!noop) {
// link file `ino` into an existing directory under a name that must not exist yet
FileRef file = _get_file(ino);
assert(file->fnode.ino);
map::iterator q = dir_map.find(dirname);
assert(q != dir_map.end());
map::iterator r = q->second->file_map.find(filename);
assert(r == q->second->file_map.end());
q->second->file_map[filename] = file;
++file->refs;
}
}
break;
case bluefs_transaction_t::OP_DIR_UNLINK:
{
::decode(dirname, p);
::decode(filename, p);
if (!noop) {
// remove an existing directory entry and drop one reference on the file
map::iterator q = dir_map.find(dirname);
assert(q != dir_map.end());
map::iterator r = q->second->file_map.find(filename);
assert(r != q->second->file_map.end());
assert(r->second->refs > 0);
--r->second->refs;
q->second->file_map.erase(r);
}
}
break;
case bluefs_transaction_t::OP_DIR_CREATE:
{
::decode(dirname, p);
if (!noop) {
// create a directory that must not exist yet
map::iterator q = dir_map.find(dirname);
assert(q == dir_map.end());
dir_map[dirname] = new Dir;
}
}
break;
case bluefs_transaction_t::OP_DIR_REMOVE:
{
::decode(dirname, p);
if (!noop) {
// remove an existing directory, which must be empty
map::iterator q = dir_map.find(dirname);
assert(q != dir_map.end());
assert(q->second->file_map.empty());
dir_map.erase(q);
}
}
break;
case bluefs_transaction_t::OP_FILE_UPDATE:
{
::decode(fnode, p);
if (!noop) {
// install the logged fnode on the file and keep ino_last at the highest ino seen
FileRef f = _get_file(fnode.ino);
f->fnode = fnode;
if (fnode.ino > ino_last) {
ino_last = fnode.ino;
}
}
}
break;
case bluefs_transaction_t::OP_FILE_REMOVE:
{
::decode(ino, p);
if (!noop) {
// drop the file from file_map; note this `p` shadows the op iterator inside this scope
auto p = file_map.find(ino);
assert(p != file_map.end());
file_map.erase(p);
}
}
break;
}
}
assert(p.end());
// we successfully replayed the transaction; bump the seq and log size
++log_seq;
log_file->fnode.size = log_reader->buf.pos;
}
}
bluefs事务类型。
// A BlueFS log transaction: the fs uuid, a sequence number and a buffer of
// encoded ops whose kinds are listed in op_t.
// The rest of the struct is elided in this excerpt.
struct bluefs_transaction_t {
typedef enum {
OP_NONE = 0,
OP_INIT, ///< initial (empty) file system marker
// add/remove extents to/from the block storage available to BlueFS
OP_ALLOC_ADD, ///< add extent to available block storage (extent)
OP_ALLOC_RM, ///< remove extent from available block storage (extent)
// create/remove a directory entry, i.e. a (directory, file name) -> ino mapping
OP_DIR_LINK, ///< (re)set a dir entry (dirname, filename, ino)
OP_DIR_UNLINK, ///< remove a dir entry (dirname, filename)
// create/remove a directory
OP_DIR_CREATE, ///< create a dir (dirname)
OP_DIR_REMOVE, ///< remove a dir (dirname)
// update/remove a file
OP_FILE_UPDATE, ///< set/update file metadata (file)
OP_FILE_REMOVE, ///< remove file (ino)
// NOTE(review): presumably emitted by BlueFS log compaction -- the original author was unsure; confirm
OP_JUMP, ///< jump the seq # and offset
OP_JUMP_SEQ, ///< jump the seq #
} op_t;
uuid_d uuid; ///< fs uuid
uint64_t seq; ///< sequence number
bufferlist op_bl; ///< encoded transaction ops
...
block_all记录各设备上已添加给BlueFS的空间段(offset, length),block_total记录这些空间段的总长度。