ceph bluefs 启动加载源码解析(上)

ceph bluefs负责对接rocksdb,是一个精简的文件系统,往下整合磁盘设备,往上给rocksdb提供一些必须的接口,其在ceph bluestore架构中所处的位置如下图所示。
ceph bluefs 启动加载源码解析(上)_第1张图片
ceph bluefs的上电加载过程是利用_open_db函数实现的。
如果是创建osd,则调用方式是_open_db(true),如果是启动osd,则调用方式是_open_db(false);

_open_db函数的调用栈如下:

string fn = path + "/db"; //这里的path="/var/lib/ceph/osd/ceph-0/"
if (create)
    kv_backend = cct->_conf->bluestore_kvbackend; //rocksdb
else    
    read_meta("kv_backend", &kv_backend);//read_meta均是从block设备读 ,读不到就从path/kv_backend文件中读出来
if(create)
    do_bluefs = cct->_conf->bluestore_bluefs;
else
    read_meta("bluefs", &s);  //read_meta均是从block设备读 ,读不到就从path/bluefs文件中读出来
    if (s == "1")
        do_bluefs = true;
    else if (s == "0") 
        do_bluefs = false;    

如果是创建osd,则直接赋值kv_backend为"rocksdb",否则就调用read_meta从block磁盘设备中读取kv_backend的值。同时如果是创建osd就设置do_bluefs的值,默认为true,如果是启动osd,同样调用read_meta从block磁盘设备中读取do_bluefs的值。read_meta的实现如下:

string p = path + "/block";  //block设备就是主存储设备
_read_bdev_label(cct, p, &label);
    string p = path + "/block";
    int fd = ::open(path.c_str(), O_RDONLY);
    bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);//4096
    ::close(fd)
    bufferlist::iterator p = bl.begin();
    decode(*label, p);
auto i = label.meta.find(key);
if (i == label.meta.end())
    return ObjectStore::read_meta(key, value);
*value = i->second;

read_meta就是读取block设备的前4096字节(block设备前4096字节保留给label使用),然后解析出来对应的key的值,如果没有找到调用ObjectStore::read_meta从类似/var/lib/ceph/osd/ceph-num目录下对应的文件中读取key的值。

回到_open_db函数

if (do_bluefs)
    bluefs = new BlueFS(cct);
        discard_cb[BDEV_WAL] = wal_discard_cb;
        discard_cb[BDEV_DB] = db_discard_cb;
        discard_cb[BDEV_SLOW] = slow_discard_cb;
    if (read_meta("path_block.db", &bfn) < 0)
        bfn = path + "/block.db";  
    if (::stat(bfn.c_str(), &st) == 0)  //bfn是block.db设备
        bluefs->add_block_device(BlueFS::BDEV_DB, bfn, create && cct->_conf->bdev_enable_discard); //BDEV_DB = 1;
        if (bluefs->bdev_support_label(BlueFS::BDEV_DB))
            _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(BlueFS::BDEV_DB), "bluefs db", create);  

                    
    if (create)
        bluefs->add_block_extent(BlueFS::BDEV_DB, SUPER_RESERVED, bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);       
    bluefs_shared_bdev = BlueFS::BDEV_SLOW;
    bluefs_single_shared_device = false;

    if (read_meta("path_block", &bfn) < 0) //read_meta均是从block设备读 ,读不到就从path/path_block文件中读出来   
        bfn = path + "/block";        
    bluefs->add_block_device(bluefs_shared_bdev, bfn, false);  
    if (create)
        uint64_t initial =  bdev->get_size() * (cct->_conf->bluestore_bluefs_min_ratio + cct->_conf->bluestore_bluefs_gift_ratio); 
        initial = std::max(initial, cct->_conf->bluestore_bluefs_min);    
        initial = p2roundup(initial, cct->_conf->bluefs_alloc_size);   //往上对齐
        uint64_t start = p2align((bdev->get_size() - initial) / 2,  cct->_conf->bluefs_alloc_size);  //将起始偏移按bluefs_alloc_size向下对齐,使bluefs的初始空间位于block设备的中部
        bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
        bluefs_extents.insert(start, initial);    
        
    if (read_meta("path_block.wal", &bfn) < 0)     //read_meta均是从block设备读 ,读不到就从path/path_block.wal文件中读出来   
        bfn = path + "/block.wal";
    if (::stat(bfn.c_str(), &st) == 0)
        bluefs->add_block_device(BlueFS::BDEV_WAL, bfn, create && cct->_conf->bdev_enable_discard);
        if (bluefs->bdev_support_label(BlueFS::BDEV_WAL))
            _check_or_set_bdev_label(bfn,bluefs->get_block_device_size(BlueFS::BDEV_WAL),"bluefs wal", create);
        if (create)
            bluefs->add_block_extent(BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE, bluefs->get_block_device_size(BlueFS::BDEV_WAL) - BDEV_LABEL_BLOCK_SIZE);
        kv_options["separate_wal_dir"] = "1";
        bluefs_single_shared_device = false;

如果是使用bluefs,则创建一个BlueFS实例,在BlueFS实例中会设置discard回调函数。接下来会创建三个设备(block.db、block、block.wal)的相关信息
(1)首先将block.db设备加入到bluefs的管理之中,其是通过调用add_block_device来实现的,block.db负责存储BlueStore内部产生的元数据。如果是创建osd,则还需要调用add_block_extent将这个磁盘设备中的空间加入到bluefs的管理之中,其空间为SUPER_RESERVED(8192字节)后的空间,前SUPER_RESERVED字节空间负责存放一些label信息和bluefs的super块信息。最后会调用_check_or_set_bdev_label函数把一些label写入到block.db设备中,或从block.db设备中读取一些label。
(2)将block设备加入到bluefs的管理之中,block设备主要用于存储对象数据。并计算block设备中给bluefs使用的空间,还要将这些空间加入到bluefs_extents中,osd上电启动时,_open_fm函数会将bluefs_extents中的空间使用记录持久化到rocksdb中,同时bluestore的allocator会将这些空间从空闲块btree中删除,以免重复分配。
(3)将block.wal设备加入到bluefs的管理之中,调用_check_or_set_bdev_label函数把一些label写入到block.wal设备中,或从block.wal设备中读取一些label。同时调用add_block_extent来将这个磁盘设备中的空间加入到bluefs的管理之中(默认从BDEV_LABEL_BLOCK_SIZE(4096)到最后)。

add_block_device的调用栈如下

bluefs->add_block_device(BlueFS::BDEV_DB, bfn, create && cct->_conf->bdev_enable_discard); //BDEV_DB = 1;
    BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL, discard_cb[id], static_cast<void*>(this));//path=bfn
        string type = "kernel";
        ::readlink(path.c_str(), buf, sizeof(buf) - 1); //path 是block.db
        char *bname = ::basename(buf);
        return new KernelDevice(cct, cb, cbpriv, d_cb, d_cbpriv);            
    b->open(path); //该部分都属于block.db
        path = p; //p就是open传进来的path,在这里就是block.db
        fd_direct = ::open(path.c_str(), O_RDWR | O_DIRECT)  //path是"/block.db"
        fd_buffered = ::open(path.c_str(), O_RDWR);
        aio = cct->_conf->bdev_aio;
        dio = true;
        posix_fadvise(fd_buffered, 0, 0, POSIX_FADV_RANDOM);
        _lock(); //对文件加锁,只能本进程使用
            struct flock l;
            memset(&l, 0, sizeof(l));
            l.l_type = F_WRLCK;
            l.l_whence = SEEK_SET;
            ::fcntl(fd_direct, F_SETLK, &l);
        ::fstat(fd_direct, &st);
        if (S_ISBLK(st.st_mode)) 
            get_block_device_size(fd_direct, &s);
            size = s;
        else
            size = st.st_size; //获取磁盘大小
        get_device_by_fd(fd_buffered, partition, devname, sizeof(devname));
        _aio_start();
            if (aio)
                aio_queue.init();
                aio_thread.create("bstore_aio");
        _discard_start();
            discard_thread.create("bstore_discard");
        size &= ~(block_size - 1);      
    bdev[id] = b;
    ioc[id] = new IOContext(cct, NULL);

其实现如下:
(1)创建设备的KernelDevice实例
(2)利用系统接口open两次打开设备,一次是不带缓存打开,一次带缓存打开
(3)利用系统接口fcntl对设备加写锁,只有该进程可以写该设备
(4)启动该设备上的aio_thread,aio_thread负责获取已完成的aio返回信息
(5)启动该设备上的discard_thread,该线程负责回收空间
可见,对block、block.db、block.wal设备均做了上面相同的处理。

add_block_extent的调用栈如下

bluefs->add_block_extent(BlueFS::BDEV_DB, SUPER_RESERVED, bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);       
    block_all[id].insert(offset, length);  //用于存储的空间,对于block.db就是从SUPER_RESERVED到最后
    if (id < alloc.size() && alloc[id])
        log_t.op_alloc_add(id, offset, length);
        _flush_and_sync_log(l);
        alloc[id]->init_add_free(offset, length);

(1)将空间插入到block_all中
(2)调用allocator的init_add_free函数将空闲空间插入到分配器的btree中
因为此时alloc仍然没有分配StupidAllocator实例,因此(2)在这里不会被执行

_check_or_set_bdev_label的调用栈如下:

if (create)
    label.osd_uuid = fsid;
    label.size = size;
    label.btime = ceph_clock_now();
    label.description = desc;
    _write_bdev_label(cct, path, label);
        ::open(path.c_str(), O_WRONLY)
        bl.write_fd(fd);
        ::fsync(fd);
else
    _read_bdev_label(cct, path, &label); //从磁盘设备前BDEV_LABEL_BLOCK_SIZE字节中获取bluestore_bdev_label_t label; 

如果是创建osd,则会将一些label信息写入到磁盘设备中,如果是上电启动osd则直接读取一些label。

回到_open_db函数

if (create)
    bluefs->mkfs(fsid);
        _init_alloc();
            alloc.resize(MAX_BDEV);//3
            pending_release.resize(MAX_BDEV);
            for (unsigned id = 0; id < bdev.size(); ++id)//对每个设备创建分配器
                alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator, //stupid
                interval_set<uint64_t>& p = block_all[id];//前面设置的用于存放数据的空间
                for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q)
                    alloc[id]->init_add_free(q.get_start(), q.get_len());//插入每个分配器的btree中
        super.version = 1;
        super.block_size = bdev[BDEV_DB]->get_block_size();
        super.osd_uuid = osd_uuid;
        
        log_file->fnode.ino = 1;
        log_file->fnode.prefer_bdev = BDEV_WAL;//0
        _allocate(log_file->fnode.prefer_bdev, cct->_conf->bluefs_max_log_runway, &log_file->fnode); //bluefs_max_log_runway=4194304
            left = round_up_to(len, min_alloc_size);  
            PExtentVector extents;
            alloc[id]->reserve(left);
            extents.reserve(4);
            alloc_len = alloc[id]->allocate(left, min_alloc_size, hint, &extents); //从btree中获取空闲空间,并分配给extents
            for (auto& p : extents)
                node->append_extent(bluefs_extent_t(id, p.offset, p.length));
         
        log_writer = _create_writer(log_file); 
            FileWriter *w = new FileWriter(f);
            for (unsigned i = 0; i < MAX_BDEV; ++i)
                w->iocv[i] = new IOContext(cct, NULL);
            return w;
        super.log_fnode = log_file->fnode;   
        _write_super();
            encode(super, bl);
            bdev[BDEV_DB]->write(get_super_offset(), bl, false); //block.db写
        flush_bdev
            for (auto p : bdev)
                if (p)
                    p->flush();
        super = bluefs_super_t();
        _close_writer(log_writer);
            for (unsigned i = 0; i < MAX_BDEV; ++i)
                if (bdev[i])
                    bdev[i]->queue_reap_ioc(h->iocv[i]);
        block_all.clear();
        _stop_alloc();
        _shutdown_logger();

如果是创建osd,还要调用mkfs,mkfs实现如下:
(1)调用_init_alloc将每个设备的空闲空间加入到对应的stupidallocator中,空闲空间由前面的add_block_extent插入到block_all,stupidallocator是利用btree来保存空闲块的。
(2)调用_allocate函数给log file分配空间,log file的ino为1,优先选择block.wal设备存储,第一次分配的大小为4M,BlueFs的_allocate最终会调用StupidAllocator的allocate来分配具体的空间。最后_allocate将分配的pextent插入到log file的extents中。
(3)写bluefs的super block,其是调用_write_super实现的。super block包含version、block_size、日志文件的fnode信息等。_write_super会把super block写到block.db设备的4096偏移处。
(4)对block_all中的信息加日志,对每块空闲快,都执行log_t.op_alloc_add(bdev, q.get_start(), q.get_len());,其会在**_flush_and_sync_log**中持久化到日志中。
(5)做一些清理工作,比如清除block_all,_close_writer。因为会在_mount函数中继续从日志中读取信息,来进行回放操作。

回到_open_db函数
紧接着调用mount函数,mount函数调用栈如下:

_open_super();
    bdev[BDEV_DB]->read(get_super_offset(), get_super_length(), &bl, ioc[BDEV_DB], false);//系统的pread
    bufferlist::iterator p = bl.begin();
    decode(super, p);
block_all.clear();
block_all.resize(MAX_BDEV);
_init_alloc(); 
_replay(false, false);
log_writer = _create_writer(_get_file(1));
log_writer->pos = log_writer->file->fnode.size;

(1)读取bluefs 的super block
(2)调用_init_alloc为每块磁盘创建StupidAllocator实例
(3)调用_replay来回放日志,其会将分配的空间、创建的文件等恢复到内存
(4)创建日志文件句柄
(5)对于_replay回放得出的文件,利用init_rm_free来将文件所占用的空间从StupidAllocator的btree中删除

回到_open_db函数
_open_db函数的最后一部分就是rocksdb相关的初始化了

env = new BlueRocksEnv(bluefs)
fn = "db";
if (create)
    env->CreateDir(fn);
        fs->mkdir(dirname);
            dir_map[dirname] = new Dir;
            log_t.op_dir_create(dirname); //mkdir操作不将dir的相关数据写入磁盘,仅仅将创建dir的日志记录写到日志中
    if (kv_options.count("separate_wal_dir"))
        env->CreateDir(fn + ".wal");
    if (kv_options.count("rocksdb_db_paths"))
        env->CreateDir(fn + ".slow");

BlueRocksEnv继承rocksdb::EnvWrapper,BlueRocksEnv负责给rocksdb提供一些定制化的接口。
如果是创建osd,则创建三个目录,名称分别是db、db.wal、db.slow。创建目录只是在dir_map插入元素,并利用op_dir_create来记录目录创建这个日志。

db = KeyValueDB::create(cct, kv_backend, fn, kv_options, static_cast<void*>(env)); 
    return new RocksDBStore(cct, dir, options, p);
FreelistManager::setup_merge_operators(db);
    BitmapFreelistManager::setup_merge_operator(db, "b");
        db->set_merge_operator(prefix, merge_op); //prefix=b
            merge_ops.push_back(std::make_pair(prefix,mop));
db->set_merge_operator(PREFIX_STAT, merge_op);
    merge_ops.push_back(std::make_pair(prefix,mop))
db->set_cache_size(cache_size * cache_kv_ratio);    
cct->_conf->with_val<string>("bluestore_rocksdb_cfs", get_str_map, &cf_map, " \t");    
for (auto& i : cf_map) {    
    cfs.push_back(KeyValueDB::ColumnFamily(i.first, i.second));  //就只有L M P???
db->init(options);

if (to_repair_db)
    return 0;
if (create)
    if (cct->_conf->get_val<bool>("bluestore_rocksdb_cf")) //false
        r = db->create_and_open(err, cfs);
    else
        r = db->create_and_open(err);
            create_db_dir();
                unique_ptr<rocksdb::Directory> dir;
                env->NewDirectory(path, &dir);
                    result->reset(new BlueRocksDirectory(fs)); //fs类型为BlueFs
            if (cfs.empty())
                return do_open(out, true, nullptr);
            else
                return do_open(out, true, &cfs);
                    load_rocksdb_options(create_if_missing, opt);
                    if (create_if_missing)
                        rocksdb::DB::Open(opt, path, &db);
                        default_cf = db->DefaultColumnFamily();

这部分代码实现的功能如下:
(1)创建一个RocksDB实例
(2)设置rocksdb的列簇,列簇名为L、M、P
(3)调用init来初始化rocksdb的一些参数配置
(4)调用do_open打开RocksDB,并利用RocksDB的DefaultColumnFamily接口获取默认列簇

在_open_db函数执行结束后,如果是创建osd,还会执行以下两行代码将kv_backend和bluefs写入到block的label区域和/var/lib/ceph/osd/ceph-num目录下对应的文件中,osd上电时会加载这两个label。
write_meta("kv_backend", cct->_conf->bluestore_kvbackend);
write_meta("bluefs", stringify(bluefs ? 1 : 0));

你可能感兴趣的:(ceph)