osd存储空间中的目录结构
[root@ceph-osd-241 ceph-1]# tree -L 1
.
├── ceph_fsid
├── current
├── fsid
├── journal
├── keyring
├── lltxt
├── magic
├── ready
├── store_version
├── superblock
└── whoami
FileStore::mount()
1.打开、读取fsid文件中的uuid
...
if (::access(basedir.c_str(), R_OK | W_OK)) { //basedir是osd的数据目录
ret = -errno;
derr << "FileStore::mount: unable to access basedir '" << basedir << "': "
<< cpp_strerror(ret) << dendl;
goto done;
}
// get fsid
snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str());
fsid_fd = ::open(buf, O_RDWR, 0644); //打开fsid文件
if (fsid_fd < 0) {
ret = -errno;
derr << "FileStore::mount: error opening '" << buf << "': "
<< cpp_strerror(ret) << dendl;
goto done;
}
ret = read_fsid(fsid_fd, &fsid); //读取fsid文件中的内容,并放入到uuid_d结构的fsid对象中。
if (ret < 0) {
derr << "FileStore::mount: error reading fsid_fd: " << cpp_strerror(ret)
<< dendl;
goto close_fsid_fd;
}
if (lock_fsid() < 0) { //对fsid文件上lock
derr << "FileStore::mount: lock_fsid failed" << dendl;
ret = -EBUSY;
goto close_fsid_fd;
}
...
2.读取store_version,判断当前版本戳(version stamp)是否有效(在0.94.9版本中,该版本号为4)。
uint32_t version_stamp;
ret = version_stamp_is_valid(&version_stamp);
if (ret < 0) {
derr << "FileStore::mount : error in version_stamp_is_valid: "
<< cpp_strerror(ret) << dendl;
goto close_fsid_fd;
} else if (ret == 0) {
if (do_update || (int)version_stamp < g_conf->filestore_update_to) {
derr << "FileStore::mount : stale version stamp detected: "
<< version_stamp
<< ". Proceeding, do_update "
<< "is set, performing disk format upgrade."
<< dendl;
do_update = true;
} else {
ret = -EINVAL;
derr << "FileStore::mount : stale version stamp " << version_stamp
<< ". Please run the FileStore update script before starting the "
<< "OSD, or set filestore_update_to to " << target_version
<< " (currently " << g_conf->filestore_update_to << ")"
<< dendl;
goto close_fsid_fd;
}
}
3.读取superblock中的内容并填充到FileStore::superblock中去。
ret = read_superblock();
if (ret < 0) {
ret = -EINVAL;
goto close_fsid_fd;
}
4.检查当前FileStore是否支持挂载所需的全部特性(compat features)。
/*
 * Compare this CompatSet to another.
 * CAREFULLY NOTE: This operation is NOT commutative.
 * a > b DOES NOT imply that b < a.
 * If returns:
 *  0: The CompatSets have the same feature set.
 *  1: This CompatSet's features are a strict superset of the other's.
 * -1: This CompatSet is missing at least one feature
 *     described in the other. It may still have more features, though.
 */
// Check if this FileStore supports all the necessary features to mount
if (supported_compat_set.compare(superblock.compat_features) == -1) {
derr << "FileStore::mount : Incompatible features set "
<< superblock.compat_features << dendl;
ret = -EINVAL;
goto close_fsid_fd;
}
5.获取数据目录的文件描述符。
// open some dir handles
basedir_fd = ::open(basedir.c_str(), O_RDONLY);
if (basedir_fd < 0) {
ret = -errno;
derr << "FileStore::mount: failed to open " << basedir << ": "
<< cpp_strerror(ret) << dendl;
basedir_fd = -1;
goto close_fsid_fd;
}
6.检测底层文件系统的类型及其特性(btrfs、xattr等)。
// test for btrfs, xattrs, etc.
ret = _detect_fs();
if (ret < 0) {
derr << "FileStore::mount : error in _detect_fs: "
<< cpp_strerror(ret) << dendl;
goto close_basedir_fd;
}
7.列出检查点(checkpoint):该功能与底层文件系统相关,如果底层文件系统支持快照功能,则会获取数据目录下的所有快照,并分别保存到对应的snaps和cluster_snaps中(xfs不支持快照,btrfs和zfs支持快照)。
{
list<string> ls;
ret = backend->list_checkpoints(ls);
if (ret < 0) {
derr << "FileStore::mount : error in _list_snaps: "<< cpp_strerror(ret) << dendl;
goto close_basedir_fd;
}
long long unsigned c, prev = 0;
char clustersnap[NAME_MAX];
for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) {
if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) {
assert(c > prev);
prev = c;
snaps.push_back(c);
} else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1)
cluster_snaps.insert(*it);
}
}
8.检查是否需要回滚快照,xfs不支持,btrfs和zfs支持。
if (backend->can_checkpoint()) {
if (snaps.empty()) {
dout(0) << "mount WARNING: no consistent snaps found, store may be in inconsistent state" << dendl;
} else {
char s[NAME_MAX];
uint64_t curr_seq = 0;
if (m_osd_rollback_to_cluster_snap.length()) {
derr << TEXT_RED
<< " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **"
<< TEXT_NORMAL
<< dendl;
assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap));
snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str());
} else {
{
int fd = read_op_seq(&curr_seq);
if (fd >= 0) {
VOID_TEMP_FAILURE_RETRY(::close(fd));
}
}
if (curr_seq)
dout(10) << " current/ seq was " << curr_seq << dendl;
else
dout(10) << " current/ missing entirely (unusual, but okay)" << dendl;
uint64_t cp = snaps.back();
dout(10) << " most recent snap from " << snaps << " is " << cp << dendl;
// if current/ is marked as non-snapshotted, refuse to roll
// back (without clear direction) to avoid throwing out new
// data.
struct stat st;
if (::stat(nosnapfn, &st) == 0) {
if (!m_osd_use_stale_snap) {
derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl;
derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl;
derr << "config option for --osd-use-stale-snap startup argument." << dendl;
ret = -ENOTSUP;
goto close_basedir_fd;
}
derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq
<< ", newest snap is " << cp << dendl;
cerr << TEXT_YELLOW
<< " ** WARNING: forcing the use of stale snapshot data **"
<< TEXT_NORMAL << std::endl;
}
dout(10) << "mount rolling back to consistent snap " << cp << dendl;
snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
}
// drop current?
ret = backend->rollback_to(s);
if (ret) {
derr << "FileStore::mount: error rolling back to " << s << ": "
<< cpp_strerror(ret) << dendl;
goto close_basedir_fd;
}
}
}
9.获取osd提交最后一个日志时的op序号,并赋值给initial_op_seq;该序号保存在current/commit_op_seq文件中。
initial_op_seq = 0;
...
op_fd = read_op_seq(&initial_op_seq);
if (op_fd < 0) {
derr << "FileStore::mount: read_op_seq failed" << dendl;
goto close_current_fd;
}
dout(5) << "mount op_seq is " << initial_op_seq << dendl;
if (initial_op_seq == 0) {
derr << "mount initial op seq is 0; something is wrong" << dendl;
ret = -EINVAL;
goto close_current_fd;
}
//标识current目录无快照。
if (!backend->can_checkpoint()) {
// mark current/ as non-snapshotted so that we don't rollback away
// from it.
int r = ::creat(nosnapfn, 0644);
if (r < 0) {
derr << "FileStore::mount: failed to create current/nosnap" << dendl;
goto close_current_fd;
}
VOID_TEMP_FAILURE_RETRY(::close(r));
} else {
// clear nosnap marker, if present.
::unlink(nosnapfn);
}
10.创建KeyValueDB(LevelDBStore)实例,并初始化;构建DBObjectMap对象,最后初始化object_map。
if (!(generic_flags & SKIP_MOUNT_OMAP)) {
KeyValueDB * omap_store = KeyValueDB::create(g_ceph_context,
superblock.omap_backend,
omap_dir);
if (omap_store == NULL)
{
derr << "Error creating " << superblock.omap_backend << dendl;
ret = -1;
goto close_current_fd;
}
omap_store->init();
stringstream err;
if (omap_store->create_and_open(err)) {
delete omap_store;
derr << "Error initializing " << superblock.omap_backend
<< " : " << err.str() << dendl;
ret = -1;
goto close_current_fd;
}
DBObjectMap *dbomap = new DBObjectMap(omap_store);
ret = dbomap->init(do_update);
if (ret < 0) {
delete dbomap;
derr << "Error initializing DBObjectMap: " << ret << dendl;
goto close_current_fd;
}
stringstream err2;
if (g_conf->filestore_debug_omap_check && !dbomap->check(err2)) {
derr << err2.str() << dendl;
delete dbomap;
ret = -EINVAL;
goto close_current_fd;
}
object_map.reset(dbomap);
}
11.打开journal,主要是实例化FileJournal对象,并赋值给FileStore::journal。
open_journal();
12.选择日志模式:writeahead、parallel、trailing;或者无日志。
// select journal mode?
if (journal) {
if (!m_filestore_journal_writeahead &&
!m_filestore_journal_parallel &&
!m_filestore_journal_trailing) {
if (!backend->can_checkpoint()) {
m_filestore_journal_writeahead = true;
dout(0) << "mount: enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl;
} else {
m_filestore_journal_parallel = true;
dout(0) << "mount: enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl;
}
} else {
if (m_filestore_journal_writeahead)
dout(0) << "mount: WRITEAHEAD journal mode explicitly enabled in conf" << dendl;
if (m_filestore_journal_parallel)
dout(0) << "mount: PARALLEL journal mode explicitly enabled in conf" << dendl;
if (m_filestore_journal_trailing)
dout(0) << "mount: TRAILING journal mode explicitly enabled in conf" << dendl;
}
if (m_filestore_journal_writeahead)
journal->set_wait_on_full(true);
} else {
dout(0) << "mount: no journal" << dendl;
}
13.检查所选的日志模式是否合理(如果底层文件系统不是btrfs且没有日志,则发生crash时可能会丢失数据,同时延迟也会增加)。
ret = _sanity_check_fs();
14.清理pg目录中无效的元素
// Cleanup possibly invalid collections
{
vector<coll_t> collections;
ret = list_collections(collections);
if (ret < 0) {
derr << "Error " << ret << " while listing collections" << dendl;
goto close_current_fd;
}
for (vector<coll_t>::iterator i = collections.begin();
i != collections.end();
++i) {
Index index;
ret = get_index(*i, &index);
if (ret < 0) {
derr << "Unable to mount index " << *i
<< " with error: " << ret << dendl;
goto close_current_fd;
}
assert(NULL != index.index);
RWLock::WLocker l((index.index)->access_lock);
index->cleanup();
}
}
15.启动throttle线程和sync_thread线程
wbthrottle.start();
sync_thread.create();
16.重做(replay)日志中的操作记录:以initial_op_seq为起点回放日志,也可以通过配置指定从某个op序号开始重做日志中记录的操作。
if (!(generic_flags & SKIP_JOURNAL_REPLAY)) {
ret = journal_replay(initial_op_seq);
if (ret < 0) {
derr << "mount failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl;
if (ret == -ENOTTY) {
derr << "maybe journal is not pointing to a block device and its size "
<< "wasn't configured?" << dendl;
}
// stop sync thread
lock.Lock();
stop = true;
sync_cond.Signal();
lock.Unlock();
sync_thread.join();
wbthrottle.stop();
goto close_current_fd;
}
}
17.启动journal中的finisher线程。
journal_start();
18.启动以下线程/线程池:
op_tp.start(); //处理op请求
op_finisher.start();//处理op操作完成的回调
ondisk_finisher.start(); //处理op写入磁盘的回调