FileStore::mount处理过程

osd存储空间中的目录结构

[root@ceph-osd-241 ceph-1]# tree -L 1
.
├── ceph_fsid
├── current
├── fsid
├── journal
├── keyring
├── lltxt
├── magic
├── ready
├── store_version
├── superblock
└── whoami
FileStore::mount()

1.打开、读取fsid文件中的uuid

...
  if (::access(basedir.c_str(), R_OK | W_OK)) { //basedir是osd的数据目录
    ret = -errno;
    derr << "FileStore::mount: unable to access basedir '" << basedir << "': "
     << cpp_strerror(ret) << dendl;
    goto done;
  }

  // get fsid
  snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str());
  fsid_fd = ::open(buf, O_RDWR, 0644); //打开fsid文件
  if (fsid_fd < 0) {
    ret = -errno;
    derr << "FileStore::mount: error opening '" << buf << "': "
     << cpp_strerror(ret) << dendl;
    goto done;
  }

  ret = read_fsid(fsid_fd, &fsid); //读取fsid文件中的内容,并放入到uuid_d结构的fsid对象中。
  if (ret < 0) {
    derr << "FileStore::mount: error reading fsid_fd: " << cpp_strerror(ret)
     << dendl;
    goto close_fsid_fd;
  }
    if (lock_fsid() < 0) {  //对fsid文件上lock
    derr << "FileStore::mount: lock_fsid failed" << dendl;
    ret = -EBUSY;
    goto close_fsid_fd;
  }
  ...

2.读取store_version,判断当前版本是否有效(0.94.9版本中该值为4)。

 uint32_t version_stamp;
  ret = version_stamp_is_valid(&version_stamp);
  if (ret < 0) {
    derr << "FileStore::mount : error in version_stamp_is_valid: "
     << cpp_strerror(ret) << dendl;
    goto close_fsid_fd;
  } else if (ret == 0) {
    if (do_update || (int)version_stamp < g_conf->filestore_update_to) {
      derr << "FileStore::mount : stale version stamp detected: "
       << version_stamp 
       << ". Proceeding, do_update "
       << "is set, performing disk format upgrade."
       << dendl;
      do_update = true;
    } else {
      ret = -EINVAL;
      derr << "FileStore::mount : stale version stamp " << version_stamp
       << ". Please run the FileStore update script before starting the "
       << "OSD, or set filestore_update_to to " << target_version
       << " (currently " << g_conf->filestore_update_to << ")"
       << dendl;
      goto close_fsid_fd;
    }
  }

3.读取superblock中的内容并填充到FileStore::superblock中去。

ret = read_superblock();
  if (ret < 0) {
    ret = -EINVAL;
    goto close_fsid_fd;
  }
 4.Check if this FileStore supports all the necessary features to mount

   /*
    * Compare this CompatSet to another.
    * CAREFULLY NOTE: This operation is NOT commutative.
    * a > b DOES NOT imply that b < a.
    * If returns:
    * 0: The CompatSets have the same feature set.
    * 1: This CompatSet's features are a strict superset of the other's.
    * -1: This CompatSet is missing at least one feature
    *     described in the other. It may still have more features, though.
    */

   // Check if this FileStore supports all the necessary features to mount
  if (supported_compat_set.compare(superblock.compat_features) == -1) {
    derr << "FileStore::mount : Incompatible features set "
       << superblock.compat_features << dendl;
    ret = -EINVAL;
    goto close_fsid_fd;
  }
 5.获取数据目录的描述符。
  // open some dir handles
  basedir_fd = ::open(basedir.c_str(), O_RDONLY);
  if (basedir_fd < 0) {
    ret = -errno;
    derr << "FileStore::mount: failed to open " << basedir << ": "
     << cpp_strerror(ret) << dendl;
    basedir_fd = -1;
    goto close_fsid_fd;
  }
 6.探测挂载的文件系统,创建FileStore的FileStoreBackend,探测其特性,并测试文件系统的xattr等。
  // test for btrfs, xattrs, etc.
  ret = _detect_fs();
  if (ret < 0) {
    derr << "FileStore::mount : error in _detect_fs: "
     << cpp_strerror(ret) << dendl;
    goto close_basedir_fd;
  }

7.列出检查点(checkpoint)。该功能跟底层的文件系统相关:如果底层文件系统支持快照功能,则会获取该目录下的所有快照,保存到对应的snaps和cluster_snaps中去(xfs不支持快照,btrfs和zfs支持快照)。

 {
    list<string> ls;
    ret = backend->list_checkpoints(ls);
    if (ret < 0) {
      derr << "FileStore::mount : error in _list_snaps: "<< cpp_strerror(ret) << dendl;
      goto close_basedir_fd;
    }

    long long unsigned c, prev = 0;
    char clustersnap[NAME_MAX];
    for (list<string>::iterator it = ls.begin(); it != ls.end(); ++it) {
      if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) {
    assert(c > prev);
    prev = c;
    snaps.push_back(c);
      } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1)
    cluster_snaps.insert(*it);
    }
  }

8.检查是否需要回滚快照,xfs不支持,btrfs和zfs支持。

 if (backend->can_checkpoint()) {
    if (snaps.empty()) {
      dout(0) << "mount WARNING: no consistent snaps found, store may be in inconsistent state" << dendl;
    } else {
      char s[NAME_MAX];
      uint64_t curr_seq = 0;

      if (m_osd_rollback_to_cluster_snap.length()) {
    derr << TEXT_RED
         << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **"
         << TEXT_NORMAL
         << dendl;
    assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap));
    snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str());
      } else {
    {
      int fd = read_op_seq(&curr_seq);
      if (fd >= 0) {
        VOID_TEMP_FAILURE_RETRY(::close(fd));
      }
    }
    if (curr_seq)
      dout(10) << " current/ seq was " << curr_seq << dendl;
    else
      dout(10) << " current/ missing entirely (unusual, but okay)" << dendl;

    uint64_t cp = snaps.back();
    dout(10) << " most recent snap from " << snaps << " is " << cp << dendl;

    // if current/ is marked as non-snapshotted, refuse to roll
    // back (without clear direction) to avoid throwing out new
    // data.
    struct stat st;
    if (::stat(nosnapfn, &st) == 0) {
      if (!m_osd_use_stale_snap) {
        derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl;
        derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl;
        derr << "config option for --osd-use-stale-snap startup argument." << dendl;
        ret = -ENOTSUP;
        goto close_basedir_fd;
      }
      derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq
           << ", newest snap is " << cp << dendl;
      cerr << TEXT_YELLOW
           << " ** WARNING: forcing the use of stale snapshot data **"
           << TEXT_NORMAL << std::endl;
    }

        dout(10) << "mount rolling back to consistent snap " << cp << dendl;
    snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp);
      }

      // drop current?
      ret = backend->rollback_to(s);
      if (ret) {
    derr << "FileStore::mount: error rolling back to " << s << ": "
         << cpp_strerror(ret) << dendl;
    goto close_basedir_fd;
      }
    }
  }

9.获取osd提交最后一个日志时的op序号,并赋值给initial_op_seq,该序号保存在current/commit_op_seq文件中。

 initial_op_seq = 0;

。。。

  op_fd = read_op_seq(&initial_op_seq);
  if (op_fd < 0) {
    derr << "FileStore::mount: read_op_seq failed" << dendl;
    goto close_current_fd;
  }

  dout(5) << "mount op_seq is " << initial_op_seq << dendl;
  if (initial_op_seq == 0) {
    derr << "mount initial op seq is 0; something is wrong" << dendl;
    ret = -EINVAL;
    goto close_current_fd;
  }
//标识current目录无快照。
  if (!backend->can_checkpoint()) {
    // mark current/ as non-snapshotted so that we don't rollback away
    // from it.
    int r = ::creat(nosnapfn, 0644);
    if (r < 0) {
      derr << "FileStore::mount: failed to create current/nosnap" << dendl;
      goto close_current_fd;
    }
    VOID_TEMP_FAILURE_RETRY(::close(r));
  } else {
    // clear nosnap marker, if present.
    ::unlink(nosnapfn);
  }

10.创建KeyValueDB(LevelDBStore)实例,并初始化;构建DBObjectMap对象,最后初始化object_map。

  if (!(generic_flags & SKIP_MOUNT_OMAP)) {
    KeyValueDB * omap_store = KeyValueDB::create(g_ceph_context,
                         superblock.omap_backend,
                         omap_dir);
    if (omap_store == NULL)
    {
      derr << "Error creating " << superblock.omap_backend << dendl;
      ret = -1;
      goto close_current_fd;
    }

    omap_store->init();

    stringstream err;
    if (omap_store->create_and_open(err)) {
      delete omap_store;
      derr << "Error initializing " << superblock.omap_backend
       << " : " << err.str() << dendl;
      ret = -1;
      goto close_current_fd;
    }

    DBObjectMap *dbomap = new DBObjectMap(omap_store);
    ret = dbomap->init(do_update);
    if (ret < 0) {
      delete dbomap;
      derr << "Error initializing DBObjectMap: " << ret << dendl;
      goto close_current_fd;
    }
    stringstream err2;

    if (g_conf->filestore_debug_omap_check && !dbomap->check(err2)) {
      derr << err2.str() << dendl;
      delete dbomap;
      ret = -EINVAL;
      goto close_current_fd;
    }
    object_map.reset(dbomap);
  } 

11.打开journal,主要是实例化FileJournal对象,并赋值给FileStore::journal。

   open_journal();

12.选择日志模式:writeahead、parallel、trailing;或者无日志。

// select journal mode?
  if (journal) {
    if (!m_filestore_journal_writeahead &&
    !m_filestore_journal_parallel &&
    !m_filestore_journal_trailing) {
      if (!backend->can_checkpoint()) {
    m_filestore_journal_writeahead = true;
    dout(0) << "mount: enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl;
      } else {
    m_filestore_journal_parallel = true;
    dout(0) << "mount: enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl;
      }
    } else {
      if (m_filestore_journal_writeahead)
    dout(0) << "mount: WRITEAHEAD journal mode explicitly enabled in conf" << dendl;
      if (m_filestore_journal_parallel)
    dout(0) << "mount: PARALLEL journal mode explicitly enabled in conf" << dendl;
      if (m_filestore_journal_trailing)
    dout(0) << "mount: TRAILING journal mode explicitly enabled in conf" << dendl;
    }
    if (m_filestore_journal_writeahead)
      journal->set_wait_on_full(true);
  } else {
    dout(0) << "mount: no journal" << dendl;
  }

13.检查选择的日志模式(如果选择非 btrfs,且无日志,则当发生crash时可能会丢失数据;也会增加延迟)

 ret = _sanity_check_fs();

14.清理pg目录中无效的元素

  // Cleanup possibly invalid collections
  {
    vector<coll_t> collections;
    ret = list_collections(collections);
    if (ret < 0) {
      derr << "Error " << ret << " while listing collections" << dendl;
      goto close_current_fd;
    }
    for (vector<coll_t>::iterator i = collections.begin();
     i != collections.end();
     ++i) {
      Index index;
      ret = get_index(*i, &index);
      if (ret < 0) {
    derr << "Unable to mount index " << *i 
         << " with error: " << ret << dendl;
    goto close_current_fd;
      }
      assert(NULL != index.index);
      RWLock::WLocker l((index.index)->access_lock);

      index->cleanup();
    }
  }

15.启动throttle线程和sync_thread线程

   wbthrottle.start();
  sync_thread.create();

16.重做日志中的操作记录:可以通过配置,从指定的op序号开始重做日志中记录的操作。

 if (!(generic_flags & SKIP_JOURNAL_REPLAY)) {
    ret = journal_replay(initial_op_seq);
    if (ret < 0) {
      derr << "mount failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl;
      if (ret == -ENOTTY) {
        derr << "maybe journal is not pointing to a block device and its size "
         << "wasn't configured?" << dendl;
      }

      // stop sync thread
      lock.Lock();
      stop = true;
      sync_cond.Signal();
      lock.Unlock();
      sync_thread.join();

      wbthrottle.stop();

      goto close_current_fd;
    }
  }

17.开启journal中的finish线程;

 journal_start();

18.开启以下线程/线程池

  op_tp.start(); //处理op请求
  op_finisher.start();//处理op操作完成的回调
  ondisk_finisher.start(); //处理op写入磁盘的回调

你可能感兴趣的:(存储)