Bluestore--bluefs初始化部分源码解析

OSD::mkfs()
----BlueStore::mkfs()
--------BlueStore::_open_db()
-----------Bluefs->add_block_device()
-----------Bluefs->add_block_extent()
-----------Bluefs->mkfs()
-----------Bluefs->mount()
---------------Bluefs->_open_super()
---------------Bluefs->_init_alloc()
---------------Bluefs->_replay()

在osd的部署过程中会对BlueStore进行初始化
1、设置fsid,设置为 osd的uuid
2、初始化bluestore
3、挂载bluestore
4、读取bluestore的superblock信息,如果已存在superblock,则检查相关信息;如果不存在,则先设置cluster_fsid、osd_fsid、whoami、compat_features信息,然后通过事务创建superblock。
5、写fsid文件

  // Abridged excerpt of OSD::mkfs (declarations of ret/sb/bl/sbbl/osr and all
  // error handling are not shown here).
  // Visible flow: set the store fsid, mkfs and mount the store, try to read the
  // OSD superblock object, create it through a transaction when the read fails,
  // and finally write the meta/fsid information.
  int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
    	      uuid_d fsid, int whoami)
    {
      // if we are fed a uuid for this osd, use it.
      store->set_fsid(cct->_conf->osd_uuid);   // set the store's fsid to the osd uuid
      ret = store->mkfs();      // initialise (mkfs) the bluestore
      store->set_cache_shards(1);  // doesn't matter for mkfs!
      ret = store->mount();    // mount the bluestore
      ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);    // fetch the superblock data from the store
      if (ret >= 0) {
        // a superblock already exists: validate its contents (elided)
      } else {
    // no superblock yet: fill in the superblock fields
        sb.cluster_fsid = fsid;
        sb.osd_fsid = store->get_fsid();
        sb.whoami = whoami;
        sb.compat_features = get_osd_initial_compat_set();
    
        // create the superblock object through a store transaction
        ObjectStore::Transaction t;
        t.create_collection(coll_t::meta(), 0);
        t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
        ret = store->apply_transaction(osr.get(), std::move(t));
      }
       // write the fsid file (per the original note)
      ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);  
    }

bluestore的mkfs过程

// Abridged excerpt of BlueStore::mkfs (declarations and error handling are
// not shown). Visible flow: read existing meta keys, open/lock/read the fsid,
// set up the block (and optionally block.wal / block.db) symlinks or files,
// open the block device, open the db and the freelist manager, then write the
// "kv_backend" and "bluefs" meta keys.
int BlueStore::mkfs()
{
  r = read_meta("mkfs_done", &done);   // read the "mkfs_done" meta key (original author unsure; notes that an fsck may follow)

  r = read_meta("type", &type);     // read the "type" meta key; per the original note, used to check that the store is bluestore
  freelist_type = "bitmap";         // use "bitmap" as the freelist (free-space) format
  r = _open_path();               // NOTE(review): the original note ties this to the osd_max_object_size < 4GB check (BlueStore has hard limit of 4GB) — confirm what _open_path itself does
  r = _open_fsid(true);            // presumably opens/creates the fsid file (original author unsure)
  r = _lock_fsid();                // lock the fsid file; per the original note, failure may mean another ceph-osd is still running
  r = _read_fsid(&old_fsid);        // read the existing fsid and validate it; per the original note a random fsid is generated when none exists (author unsure)
  r = _setup_block_symlink_or_file("block", cct->_conf->bluestore_block_path,
				   cct->_conf->bluestore_block_size,
				   cct->_conf->bluestore_block_create);    // set up the "block" symlink or file (author unsure)
  if (cct->_conf->bluestore_bluefs) {
    r = _setup_block_symlink_or_file("block.wal", cct->_conf->bluestore_block_wal_path,
	cct->_conf->bluestore_block_wal_size,
	cct->_conf->bluestore_block_wal_create);             // set up the "block.wal" symlink or file (author unsure)
    r = _setup_block_symlink_or_file("block.db", cct->_conf->bluestore_block_db_path,
	cct->_conf->bluestore_block_db_size,
	cct->_conf->bluestore_block_db_create);              // set up the "block.db" symlink or file (author unsure)
  }
  r = _open_bdev(true);           // open/create the block device
  // min_alloc_size is set here (elided)
  ...
  r = _open_db(true);             // initialise BlueFS and prepare the rocksdb environment; per the original note the block device is closed on failure
  r = _open_fm(true);             // initialise the FreelistManager; per the original note the db is closed on failure
  r = write_meta("kv_backend", cct->_conf->bluestore_kvbackend);    // record the kv backend in the meta data
  r = write_meta("bluefs", stringify(bluefs ? 1 : 0));      // record whether bluefs is in use
  return r;
}

初始化rocksdb

// Abridged excerpt of BlueStore::_open_db (declarations, error handling and
// several sections are elided with "...").
// Visible flow: when BlueFS is in use, create it, register the block.db /
// shared block / block.wal devices (adding their extents when `create` is
// true), mkfs it on creation and mount it; then create the key-value db,
// install the merge operators and cache size, and init it with the rocksdb
// options.
//
// @param create  true when called from mkfs: device extents are added to
//                BlueFS and BlueFS::mkfs is run before mounting.
int BlueStore::_open_db(bool create)
{
  // After bluestore has been created, the kv_backend and bluefs meta data
  // are checked here (elided).
  ...
  // Initialise BlueFS.
  rocksdb::Env *env = NULL;
  if (do_bluefs) {
    bluefs = new BlueFS(cct);

    // Dedicated DB device: register <path>/block.db if it exists.
    bfn = path + "/block.db";
    if (::stat(bfn.c_str(), &st) == 0) {
      r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
      if (create) {
        // Hand BlueFS the device space from SUPER_RESERVED up to the end of
        // the device. Per the original note SUPER_RESERVED is 8192, i.e. the
        // usable space starts at the third 4K block.
        bluefs->add_block_extent(
          BlueFS::BDEV_DB,
          SUPER_RESERVED,
          bluefs->get_block_device_size(BlueFS::BDEV_DB) - SUPER_RESERVED);
      }
    }

    // shared device
    // Register <path>/block and, on creation, give BlueFS the
    // [start, start + initial) extent of it, remembering it in bluefs_extents.
    bfn = path + "/block";
    r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
    if (create) {
      bluefs->add_block_extent(bluefs_shared_bdev, start, initial);
      bluefs_extents.insert(start, initial);
    }

    // Dedicated WAL device: register <path>/block.wal if it exists and, on
    // creation, add everything after the first BDEV_LABEL_BLOCK_SIZE bytes.
    bfn = path + "/block.wal";
    if (::stat(bfn.c_str(), &st) == 0) {
      r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
      if (create) {
        bluefs->add_block_extent(
          BlueFS::BDEV_WAL,
          BDEV_LABEL_BLOCK_SIZE,
          bluefs->get_block_device_size(BlueFS::BDEV_WAL) - BDEV_LABEL_BLOCK_SIZE);
      }
    }

    if (create) {
      bluefs->mkfs(fsid);       // initialise BlueFS and stamp it with the fsid
    }
    r = bluefs->mount();        // mount BlueFS
    // The BlueRocksEnv is created and configured here (elided).
    ...
  }

  // Create the key-value db. The env pointer is passed as an opaque void*;
  // the template argument of the cast had been lost in this excerpt.
  db = KeyValueDB::create(cct, kv_backend, fn, static_cast<void*>(env));

  // Freelist-manager related setup (original author unsure).
  FreelistManager::setup_merge_operators(db);
  db->set_merge_operator(PREFIX_STAT, merge_op);
  db->set_cache_size(cache_kv_ratio * cache_size);

  if (kv_backend == "rocksdb")
    options = cct->_conf->bluestore_rocksdb_options;
  db->init(options);      // initialise the db with the selected options
}

bluefs的mkfs过程

// Abridged excerpt of BlueFS::mkfs (log/transaction setup is elided).
// Visible flow: initialise the allocators and logger, fill in the superblock,
// write it and flush the devices, then clear block_all / block_total and stop
// the allocators and logger again.
//
// osd_uuid: stored into super.osd_uuid.
int BlueFS::mkfs(uuid_d osd_uuid)
{
  // run the initialisation steps
  _init_alloc();
  _init_logger();
  // fill in the superblock fields
  super.version = 1;
  super.block_size = bdev[BDEV_DB]->get_block_size();
  super.osd_uuid = osd_uuid;
  super.uuid.generate_random();
  // initialise the log file (log_file) and the log transaction (log_t) — elided
  ...
  // write the superblock
  super.log_fnode = log_file->fnode;
  _write_super();
  flush_bdev();
  // clean up (the original author was unsure what exactly this targets)
  block_all.clear();
  block_total.clear();
  _stop_alloc();
  _shutdown_logger();
}

//加载BlueFS
BlueFS加载(BlueFS::mount())
加载superblock到内存
初始化各存储空间的块分配器
BlueFS将存储空间划分为三层:慢速(Slow)空间、高速(DB)空间、超高速(WAL)空间,每种类型空间使用各自的块分配器,块分配器负责该存储空间中空闲空间的分配与回收,块分配器的工作原理我们将在以后的章节中讨论。
日志回放
BlueFS元数据都是作为日志持久化在硬盘中,在加载BlueFS时候对日志进行replay还原到内存中,由于日志在持久化时都是根据操作顺序append到日志文件当中,因此在replay的时候只要顺序逐条对日志进行解析就能将BlueFS的当前元数据还原到内存中。
日志回放后会在内存中建立dir_map和file_map,此外块分配器中会添加为不同存储空间分配的磁盘空间。
标记已分配空间
日志回放过程中并未将分配给文件的空间从空闲空间中移除,仅当日志回放完成后,所有文件元数据全部加载到内存中,再通过遍历file_map中文件的地址空间映射信息,移除相应的块分配器中的空闲空间,防止已分配空间的重复分配。
创建log_writer
log_writer为日志文件的句柄,用于向日志中追加日志项。

// Abridged excerpt of BlueFS::mount (error handling is not shown).
// Visible flow: open the superblock, initialise the allocators, replay the
// log, remove every extent owned by a file in file_map from the allocators,
// then create the log writer positioned at the end of the log file.
int BlueFS::mount()
{
  int r = _open_super();   // open (load) the superblock
  // initialise the allocators (per the original note: covering all of the device space — TODO confirm)
  _init_alloc();
  // replay the filesystem log; per the original note each log entry is a transaction of ops, and replaying them updates dir_map / file_map
  r = _replay(false);
  // for every file in file_map, remove the extents assigned to it from the free space   
  for (auto& p : file_map) {
    for (auto& q : p.second->fnode.extents) {
      alloc[q.bdev]->init_rm_free(q.offset, q.length);      // drop the space already occupied by the file from the allocator
    }
  }
  // create log_writer, the handle used to append entries to the log file; the log file's fnode.ino is 1
  log_writer = _create_writer(_get_file(1));
  assert(log_writer->file->fnode.ino == 1);
  log_writer->pos = log_writer->file->fnode.size;
  _init_logger();
 }

Superblock部分:
加载superblock到内存,并且使用校验和的方式对superblock内容进行校验

// Abridged excerpt of BlueFS::_open_super (declarations are not shown).
// Reads the superblock from the BDEV_DB device, decodes it, computes a crc32c
// over the decoded bytes and compares it with the checksum stored right after
// the superblock; returns -EIO when they differ.
int BlueFS::_open_super()
{
  // always the second block: the superblock is always stored in the second block
  r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
			  &bl, ioc[BDEV_DB], false);
  bufferlist::iterator p = bl.begin(); 
  // decode the superblock contents
  ::decode(super, p);
  // compute the checksum over the bytes consumed so far
  {
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
  }
  // decode the stored (expected) checksum
  ::decode(expected_crc, p);
  // the freshly computed checksum must match the stored one
  if (crc != expected_crc) {
    return -EIO;
  }
}

对bluefs进行还原。读取log_file,根据其中记录的历史事务来还原bluefs:对每一个事务,先判断其操作类型,再根据操作类型执行相应的操作,从而还原之前的文件系统环境。log_file的元数据信息fnode保存在superblock的log_fnode中。

int BlueFS::_replay(bool noop)
{
  //superblock中的log_fnode保存的是log_file的fnode
  log_file->fnode = super.log_fnode;
  //预读log_file
  FileReader *log_reader = new FileReader(
    log_file, cct->_conf->bluefs_max_prefetch,
    false,  // !random
    true);  // ignore eof
  while (true) {
    {
      //每次读取一个block_size的内容
      int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
    		&bl, NULL);
      read_pos += r;
    }
    {
      bluefs_transaction_t t;
      bufferlist::iterator p = t.op_bl.begin();
     //对bufferlist中的逐个事务进行处理
      while (!p.end()) {
         switch (op) {
          	case bluefs_transaction_t::OP_INIT:
            ...
    		break;	
          	case bluefs_transaction_t::OP_JUMP:
            {
                ::decode(next_seq, p);
    	  		::decode(offset, p);
    	  		log_seq = next_seq - 1; // we will increment it below
    	  		uint64_t skip = offset - read_pos;
    	  		if (skip) {
    	    		bufferlist junk;
    	    		int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk, NULL);
    	  		}
    		}
    		break;
          	case bluefs_transaction_t::OP_JUMP_SEQ:
            {
                ::decode(next_seq, p);
    	  		log_seq = next_seq - 1; // we will increment it below
    		}
    		break;
          	case bluefs_transaction_t::OP_ALLOC_ADD:
            {
                ::decode(id, p);
    	  		::decode(offset, p);
    	  		::decode(length, p);
    	  		if (!noop) {
    	  			//block_all记录已分配段(offset, length),block_total记录已分配长度
    	    		block_all[id].insert(offset, length);
    	    		block_total[id] += length;
    	    		alloc[id]->init_add_free(offset, length);
    	  		}	
    		}
    		break;
            case bluefs_transaction_t::OP_ALLOC_RM:
            {
    ::decode(id, p);
    ::decode(offset, p);
    ::decode(length, p);
    	  		if (!noop) {
    	    		block_all[id].erase(offset, length);
    	    		block_total[id] -= length;
    	    		alloc[id]->init_rm_free(offset, length);
    	  		}
    		}
    		break;
          	case bluefs_transaction_t::OP_DIR_LINK:
            {
                ::decode(dirname, p);
    	  		::decode(filename, p);
    	  		::decode(ino, p);
    	  		if (!noop) {
    	    		FileRef file = _get_file(ino);
    	    		assert(file->fnode.ino);
    	    		map::iterator q = dir_map.find(dirname);
    	    		assert(q != dir_map.end());
    	    		map::iterator r = q->second->file_map.find(filename);
    	    		assert(r == q->second->file_map.end());
    	    		q->second->file_map[filename] = file;
    	    		++file->refs;
    	  		}
    		}
    		break;
          	case bluefs_transaction_t::OP_DIR_UNLINK:
            {
    	  	  	::decode(dirname, p);
    	  		::decode(filename, p);
    	  		if (!noop) {
    	    		map::iterator q = dir_map.find(dirname);
    	    		assert(q != dir_map.end());
    	    		map::iterator r = q->second->file_map.find(filename);
    	    		assert(r != q->second->file_map.end());
                	assert(r->second->refs > 0); 
    	    		--r->second->refs;
    	    		q->second->file_map.erase(r);
    	  		}
    		}
    		break;
          	case bluefs_transaction_t::OP_DIR_CREATE:
            {
    	  		::decode(dirname, p);
    	  		if (!noop) {
    	    		map::iterator q = dir_map.find(dirname);
    	    		assert(q == dir_map.end());
    	    		dir_map[dirname] = new Dir;
    	  		}
    		}
    		break;
          	case bluefs_transaction_t::OP_DIR_REMOVE:
            {
    	  		::decode(dirname, p);
    	  		if (!noop) {
    	    		map::iterator q = dir_map.find(dirname);
    	    		assert(q != dir_map.end());
    	    		assert(q->second->file_map.empty());
    	    		dir_map.erase(q);
    	  		}
    		}
    		break;
          	case bluefs_transaction_t::OP_FILE_UPDATE:
            {
    	  		 ::decode(fnode, p);
    	  		if (!noop) {
    	    		FileRef f = _get_file(fnode.ino);
    	    		f->fnode = fnode;
    	    		if (fnode.ino > ino_last) {
    	      			ino_last = fnode.ino;
    	    		}
    	  		}
    		}
    		break;
          	case bluefs_transaction_t::OP_FILE_REMOVE:
            {
    	    	::decode(ino, p);
    	  		if (!noop) {
    	    		auto p = file_map.find(ino);
    	    		assert(p != file_map.end());
    	    		file_map.erase(p);
    	  		}
    		}
    		break;
          }
        }
        assert(p.end());
    
        // we successfully replayed the transaction; bump the seq and log size
        ++log_seq;
        log_file->fnode.size = log_reader->buf.pos;
      }
    }

bluefs事务类型。

  // Excerpt of the BlueFS log transaction record: the op codes that can appear
  // in a transaction, followed by the transaction header fields and the
  // encoded op buffer. (The excerpt is truncated after these members.)
  struct bluefs_transaction_t {
      typedef enum {
        OP_NONE = 0,
        OP_INIT,        ///< initial (empty) file system marker
         // add / remove extents of block storage available to the file system
        OP_ALLOC_ADD,   ///< add extent to available block storage (extent)
        OP_ALLOC_RM,    ///< remove extent from available block storage (extent)
       // create / remove a directory entry, i.e. a (directory, file name) -> inode mapping
        OP_DIR_LINK,    ///< (re)set a dir entry (dirname, filename, ino)
        OP_DIR_UNLINK,  ///< remove a dir entry (dirname, filename)
        // create / remove a directory
        OP_DIR_CREATE,  ///< create a dir (dirname)
        OP_DIR_REMOVE,  ///< remove a dir (dirname)
        // update / remove a file
        OP_FILE_UPDATE, ///< set/update file metadata (file)
        OP_FILE_REMOVE, ///< remove file (ino)
        // presumably used by compaction of the bluefs log file — TODO confirm (original author unsure)
        OP_JUMP,        ///< jump the seq # and offset
        OP_JUMP_SEQ,    ///< jump the seq #
      } op_t;
    
      uuid_d uuid;          ///< fs uuid
      uint64_t seq;         ///< sequence number
      bufferlist op_bl;     ///< encoded transaction ops
    
    ...

block_all记录已分配段(offset, length),block_total记录已分配长度

你可能感兴趣的:(ceph)