学习Linux已经有一段时间了,最近看了下mount这个系统调用的一些流程,把它用博客记录下来,方便自己以后查找,也可以给那些有需要的人提供一些帮助。
当在用户层或者启动脚本中调用mount函数,把一个设备用相应的文件系统挂载起来之后,我们就可以很方便地访问这个设备中的文件。在内核中,mount的入口函数在fs/namespace.c中:
- SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
- char __user *, type, unsigned long, flags, void __user *, data)
- { // mount system call entry: copies the four user-space pointer arguments into kernel memory, calls do_mount(), then frees the copies in reverse order of acquisition
- int ret;
- char *kernel_type;
- char *kernel_dir;
- char *kernel_dev;
- unsigned long data_page;
- ret = copy_mount_string(type, &kernel_type); // copy the filesystem type string into kernel space
- if (ret < 0)
- goto out_type;
- kernel_dir = getname(dir_name); // copy the mount directory name into kernel space
- if (IS_ERR(kernel_dir)) {
- ret = PTR_ERR(kernel_dir);
- goto out_dir;
- }
- ret = copy_mount_string(dev_name, &kernel_dev); // copy the device name into kernel space
- if (ret < 0)
- goto out_dev;
- ret = copy_mount_options(data, &data_page); // copy the mount options into kernel space
- if (ret < 0)
- goto out_data;
- ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
- (void *) data_page); // the real work; its return value becomes the syscall result
- free_page(data_page);
- out_data: // unwind ladder: each label releases what was acquired before the step that failed
- kfree(kernel_dev);
- out_dev:
- putname(kernel_dir);
- out_dir:
- kfree(kernel_type);
- out_type:
- return ret;
- }
用户空间传递了dev_name、dir_name、type、flags和data五个参数到内核中,由于dev_name、dir_name、type和data四个参数都是指针,都指向用户空间的某区域,所以需要用特定的函数将这些数据从用户层拷贝到内核。
这个函数的主要实现都在do_mount函数中:
- long do_mount(char *dev_name, char *dir_name, char *type_page,
- unsigned long flags, void *data_page)
- { // Validates the arguments, resolves dir_name to a struct path, derives MNT_* flags from the MS_* flags and dispatches to the remount / bind / change-type / move / new-mount handler; returns that handler's result or a negative errno
- struct path path;
- int retval = 0;
- int mnt_flags = 0;
- /* Discard magic */
- if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
- flags &= ~MS_MGC_MSK;
- /* Basic sanity checks */
- if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) // dir_name must be non-NULL, non-empty and NUL-terminated within PAGE_SIZE bytes
- return -EINVAL;
- if (data_page)
- ((char *)data_page)[PAGE_SIZE - 1] = 0; // force NUL termination of the options page
- /* ... and get the mountpoint */
- retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
- if (retval)
- return retval;
- retval = security_sb_mount(dev_name, &path,
- type_page, flags, data_page);
- if (retval)
- goto dput_out;
- /* Default to relatime unless overridden */
- if (!(flags & MS_NOATIME))
- mnt_flags |= MNT_RELATIME;
- /* Separate the per-mountpoint flags */
- if (flags & MS_NOSUID)
- mnt_flags |= MNT_NOSUID;
- if (flags & MS_NODEV)
- mnt_flags |= MNT_NODEV;
- if (flags & MS_NOEXEC)
- mnt_flags |= MNT_NOEXEC;
- if (flags & MS_NOATIME)
- mnt_flags |= MNT_NOATIME;
- if (flags & MS_NODIRATIME)
- mnt_flags |= MNT_NODIRATIME;
- if (flags & MS_STRICTATIME)
- mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); // strictatime cancels both the relatime default and noatime
- if (flags & MS_RDONLY)
- mnt_flags |= MNT_READONLY;
- flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
- MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
- MS_STRICTATIME); // clear these bits from flags before dispatching
- if (flags & MS_REMOUNT) // the first matching flag in this chain selects the operation
- retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
- data_page);
- else if (flags & MS_BIND)
- retval = do_loopback(&path, dev_name, flags & MS_REC);
- else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
- retval = do_change_type(&path, flags);
- else if (flags & MS_MOVE)
- retval = do_move_mount(&path, dev_name);
- else
- retval = do_new_mount(&path, type_page, flags, mnt_flags,
- dev_name, data_page); // none of the special flags set: an ordinary new mount
- dput_out:
- path_put(&path); // reached on success and on failure once kern_path() has filled path
- return retval;
- }
前面都是对一些参数的检查以及挂载标志的转换。函数kern_path根据给定的路径字符串查找将要挂载到哪个目录,查找成功后会通过path这个指针带回查找的结果;之后对于普通的新挂载(没有设置MS_REMOUNT、MS_BIND、MS_MOVE等标志),调用do_new_mount这个函数进行下一步的挂载。
kern_path函数中只调用了函数do_path_lookup
- static int do_path_lookup(int dfd, const char *name,
- unsigned int flags, struct nameidata *nd)
- { // Initialises nd with path_init(), walks name with path_walk(), optionally audits the result, and releases nd->root if it was set; returns 0 or the error from either step
- int retval = path_init(dfd, name, flags, nd);
- if (!retval)
- retval = path_walk(name, nd);
- if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
- nd->path.dentry->d_inode))
- audit_inode(name, nd->path.dentry); // only on success, with a non-dummy audit context and a dentry that has an inode
- if (nd->root.mnt) { // nd->root is only needed during the walk; drop it before returning
- path_put(&nd->root);
- nd->root.mnt = NULL;
- }
- return retval;
- }
分为两部分看:第一部分调用path_init,用于初始化查找的根目录;第二部分在根目录的基础上对所给的字符串目录进行逐级查找。
先看path_init
- static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
- { // Chooses the starting point of the walk and stores it in nd->path: nd->root for a name starting with '/', the process's pwd for AT_FDCWD, otherwise the directory open on dfd; returns 0 or a negative errno
- int retval = 0;
- int fput_needed;
- struct file *file;
- nd->last_type = LAST_ROOT; /* if there are only slashes... */
- nd->flags = flags;
- nd->depth = 0;
- nd->root.mnt = NULL;
- if (*name=='/') { // absolute path: start from the root installed by set_root()
- set_root(nd);
- nd->path = nd->root;
- path_get(&nd->root);
- } else if (dfd == AT_FDCWD) { // relative path: start from the current working directory
- struct fs_struct *fs = current->fs;
- read_lock(&fs->lock);
- nd->path = fs->pwd;
- path_get(&fs->pwd);
- read_unlock(&fs->lock);
- } else { // relative path resolved against the file referred to by dfd
- struct dentry *dentry;
- file = fget_light(dfd, &fput_needed);
- retval = -EBADF;
- if (!file)
- goto out_fail;
- dentry = file->f_path.dentry;
- retval = -ENOTDIR;
- if (!S_ISDIR(dentry->d_inode->i_mode)) // dfd must refer to a directory
- goto fput_fail;
- retval = file_permission(file, MAY_EXEC); // MAY_EXEC permission on that directory is required
- if (retval)
- goto fput_fail;
- nd->path = file->f_path;
- path_get(&file->f_path);
- fput_light(file, fput_needed);
- }
- return 0;
- fput_fail:
- fput_light(file, fput_needed);
- out_fail:
- return retval;
- }
这个函数就是一个if…else if…else语句:如果第一个字符是'/',则说明是绝对路径,从当前进程描述符的fs的root成员中得到根目录作为查找起点;如果dfd等于AT_FDCWD,则以fs的pwd成员中保存的当前工作目录作为查找起点;否则以文件描述符dfd所对应的目录作为查找起点(要求它必须是目录,并且具有MAY_EXEC权限)。
再来看path_walk
- static int path_walk(const char *name, struct nameidata *nd)
- { // Runs link_path_walk(); if it returns -ESTALE, retries once from the saved starting path with LOOKUP_REVAL set
- struct path save = nd->path; // remember the starting point for a possible retry
- int result;
- current->total_link_count = 0;
- /* make sure the stuff we saved doesn't go away */
- path_get(&save);
- result = link_path_walk(name, nd);
- if (result == -ESTALE) {
- /* nd->path had been dropped */
- current->total_link_count = 0;
- nd->path = save;
- path_get(&nd->path);
- nd->flags |= LOOKUP_REVAL;
- result = link_path_walk(name, nd);
- }
- path_put(&save);
- return result;
- }
path_walk函数的主要工作就是调用link_path_walk;如果第一次调用返回-ESTALE,则设置LOOKUP_REVAL标志,从事先保存的起点重新调用一次link_path_walk。
- static int link_path_walk(const char *name, struct nameidata *nd)
- { // Resolves name one '/'-separated component at a time starting from nd->path; returns 0 on success (result left in nd) or a negative errno
- struct path next;
- struct inode *inode;
- int err;
- unsigned int lookup_flags = nd->flags;
- while (*name=='/') // skip the leading '/' characters
- name++;
- if (!*name) // nothing but slashes: the starting directory itself is the result
- goto return_reval;
- inode = nd->path.dentry->d_inode;
- if (nd->depth) // NOTE(review): non-zero depth presumably means a nested walk while following a symlink - confirm
- lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
- /* At this point we know we have a real path component. */
- for(;;) {
- unsigned long hash;
- struct qstr this; // holds the component currently being looked up
- unsigned int c;
- nd->flags |= LOOKUP_CONTINUE;
- err = exec_permission(inode); // presumably the permission check needed to search the current directory
- if (err)
- break;
- this.name = name;
- c = *(const unsigned char *)name;
- hash = init_name_hash();
- do { // advance to the end of this component ('/' or NUL), hashing every byte
- name++;
- hash = partial_name_hash(c, hash); // accumulate the hash value
- c = *(const unsigned char *)name;
- } while (c && (c != '/'));
- this.len = name - (const char *) this.name;
- this.hash = end_name_hash(hash);
- /* remove trailing slashes? */
- if (!c)
- goto last_component; // end of string: go handle the final component
- while (*++name == '/'); // skip the run of '/' that follows the component
- if (!*name) // only slashes remained: this was the final component, with trailing slashes
- goto last_with_slashes;
- /*
- * "." and ".." are special - ".." especially so because it has
- * to be able to know about the current root directory and
- * parent relationships.
- */
- if (this.name[0] == '.') switch (this.len) {
- default:
- break;
- case 2:
- if (this.name[1] != '.')
- break;
- follow_dotdot(nd);
- inode = nd->path.dentry->d_inode; // "..": the current directory becomes the parent directory
- /* fallthrough */
- case 1:
- continue; // ".": nothing to do, move on to the next component
- }
- /* This does the actual lookups.. */
- err = do_lookup(nd, &this, &next); // the function that really performs the lookup
- if (err)
- break;
- err = -ENOENT;
- inode = next.dentry->d_inode;
- if (!inode) // dentry without an inode: fail with -ENOENT
- goto out_dput;
- if (inode->i_op->follow_link) { // inode provides follow_link: resolve the link before continuing
- err = do_follow_link(&next, nd);
- if (err)
- goto return_err;
- err = -ENOENT;
- inode = nd->path.dentry->d_inode;
- if (!inode)
- break;
- } else
- path_to_nameidata(&next, nd);
- err = -ENOTDIR;
- if (!inode->i_op->lookup) // an intermediate component must support lookup, otherwise -ENOTDIR
- break;
- continue;
- /* here ends the main loop */
- last_with_slashes:
- lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
- last_component:
- /* Clear LOOKUP_CONTINUE iff it was previously unset */
- nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
- if (lookup_flags & LOOKUP_PARENT) // caller asked for the parent only: record the last component instead of looking it up
- goto lookup_parent;
- if (this.name[0] == '.') switch (this.len) {
- default:
- break;
- case 2:
- if (this.name[1] != '.')
- break;
- follow_dotdot(nd);
- inode = nd->path.dentry->d_inode;
- /* fallthrough */
- case 1:
- goto return_reval;
- }
- err = do_lookup(nd, &this, &next);
- if (err)
- break;
- inode = next.dentry->d_inode;
- if (follow_on_final(inode, lookup_flags)) {
- err = do_follow_link(&next, nd);
- if (err)
- goto return_err;
- inode = nd->path.dentry->d_inode;
- } else
- path_to_nameidata(&next, nd);
- err = -ENOENT;
- if (!inode)
- break;
- if (lookup_flags & LOOKUP_DIRECTORY) {
- err = -ENOTDIR;
- if (!inode->i_op->lookup)
- break;
- }
- goto return_base;
- lookup_parent:
- nd->last = this;
- nd->last_type = LAST_NORM;
- if (this.name[0] != '.')
- goto return_base;
- if (this.len == 1)
- nd->last_type = LAST_DOT;
- else if (this.len == 2 && this.name[1] == '.')
- nd->last_type = LAST_DOTDOT;
- else
- goto return_base;
- return_reval:
- /*
- * We bypassed the ordinary revalidation routines.
- * We may need to check the cached dentry for staleness.
- */
- if (nd->path.dentry && nd->path.dentry->d_sb &&
- (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
- err = -ESTALE;
- /* Note: we do not d_invalidate() */
- if (!nd->path.dentry->d_op->d_revalidate(
- nd->path.dentry, nd))
- break;
- }
- return_base:
- return 0;
- out_dput:
- path_put_conditional(&next, nd);
- break;
- }
- path_put(&nd->path); // every 'break' out of the loop is an error exit: drop nd->path and return err
- return_err:
- return err;
- }
link_path_walk函数先把给定的字符串按'/'进行拆分,取出每级目录的名字,然后调用do_lookup函数在当前目录的基础上进行查找,直到查完整个字符串。
- static int do_lookup(struct nameidata *nd, struct qstr *name,struct path *path)
- { // Finds the dentry for name under nd->path.dentry (dentry cache first, then the directory's i_op->lookup) and returns it in *path after __follow_mount(); returns 0 or a negative errno
- struct vfsmount *mnt = nd->path.mnt;
- struct dentry *dentry, *parent;
- struct inode *dir;
- /*
- * See if the low-level filesystem might want
- * to use its own hash..
- */
- if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
- int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
- if (err < 0)
- return err;
- }
- dentry = __d_lookup(nd->path.dentry, name); // first attempt: cache lookup without taking the directory mutex
- if (!dentry)
- goto need_lookup;
- if (dentry->d_op && dentry->d_op->d_revalidate) // the filesystem wants cached dentries revalidated before use
- goto need_revalidate;
- done:
- path->mnt = mnt;
- path->dentry = dentry;
- __follow_mount(path);
- return 0;
- need_lookup:
- parent = nd->path.dentry;
- dir = parent->d_inode;
- mutex_lock(&dir->i_mutex);
- /*
- * First re-do the cached lookup just in case it was created
- * while we waited for the directory semaphore..
- *
- * FIXME! This could use version numbering or similar to
- * avoid unnecessary cache lookups.
- *
- * The "dcache_lock" is purely to protect the RCU list walker
- * from concurrent renames at this point (we mustn't get false
- * negatives from the RCU list walk here, unlike the optimistic
- * fast walk).
- *
- * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
- */
- dentry = d_lookup(parent, name);
- if (!dentry) {
- struct dentry *new;
- /* Don't create child dentry for a dead directory. */
- dentry = ERR_PTR(-ENOENT);
- if (IS_DEADDIR(dir))
- goto out_unlock;
- new = d_alloc(parent, name);
- dentry = ERR_PTR(-ENOMEM); // stays in place if d_alloc() returned NULL
- if (new) {
- dentry = dir->i_op->lookup(dir, new, nd); // ask the filesystem to look the name up
- if (dentry) // non-NULL return (a dentry or an ERR_PTR) is used instead of 'new', which is released
- dput(new);
- else
- dentry = new;
- }
- out_unlock:
- mutex_unlock(&dir->i_mutex);
- if (IS_ERR(dentry))
- goto fail;
- goto done;
- }
- /*
- * Uhhuh! Nasty case: the cache was re-populated while
- * we waited on the semaphore. Need to revalidate.
- */
- mutex_unlock(&dir->i_mutex);
- if (dentry->d_op && dentry->d_op->d_revalidate) {
- dentry = do_revalidate(dentry, nd);
- if (!dentry)
- dentry = ERR_PTR(-ENOENT);
- }
- if (IS_ERR(dentry))
- goto fail;
- goto done;
- need_revalidate:
- dentry = do_revalidate(dentry, nd);
- if (!dentry) // revalidation yielded no dentry: fall back to the locked lookup path
- goto need_lookup;
- if (IS_ERR(dentry))
- goto fail;
- goto done;
- fail:
- return PTR_ERR(dentry);
- }
do_lookup先调用__d_lookup进行查找,如果查找失败,再在持有父目录i_mutex的情况下调用d_lookup重新查找一次。d_lookup其实内部还是调用__d_lookup函数,只是在这个基础上会使用顺序锁(seqlock)保护起来,以防止并发重命名造成的漏查问题;如果都查找失败,就新分配一个dentry,并调用具体文件系统的lookup函数把它建立起来。函数的最后会调用__follow_mount,用于检查当前dentry上是否存在挂载点,并用最新挂载的文件系统的根dentry和vfsmount对path进行重新赋值。__follow_mount的代码如下:
- static int __follow_mount(struct path *path)
- { // While path->dentry is a mountpoint for which lookup_mnt() finds a mount, replaces *path with that mount and its root dentry; returns 1 if *path was changed, 0 otherwise
- int res = 0;
- while (d_mountpoint(path->dentry)) {
- struct vfsmount *mounted = lookup_mnt(path);
- if (!mounted)
- break;
- dput(path->dentry);
- if (res) // path->mnt was installed by an earlier iteration of this loop, so release it; the mnt passed in by the caller is never put here
- mntput(path->mnt);
- path->mnt = mounted;
- path->dentry = dget(mounted->mnt_root);
- res = 1;
- }
- return res;
- }
再看下__d_lookup函数的实现:
- struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
- { // Searches the hash chain selected by d_hash(parent, hash) for a child of parent whose name matches; on a match increments its d_count and returns it, otherwise returns NULL
- unsigned int len = name->len;
- unsigned int hash = name->hash;
- const unsigned char *str = name->name;
- struct hlist_head *head = d_hash(parent,hash);
- struct dentry *found = NULL;
- struct hlist_node *node;
- struct dentry *dentry;
- rcu_read_lock();
- hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
- struct qstr *qstr;
- if (dentry->d_name.hash != hash) // cheap pre-filters done before taking d_lock
- continue;
- if (dentry->d_parent != parent)
- continue;
- spin_lock(&dentry->d_lock);
- /*
- * Recheck the dentry after taking the lock - d_move may have
- * changed things. Don't bother checking the hash because we're
- * about to compare the whole name anyway.
- */
- if (dentry->d_parent != parent)
- goto next;
- /* non-existing due to RCU? */
- if (d_unhashed(dentry))
- goto next;
- /*
- * It is safe to compare names since d_move() cannot
- * change the qstr (protected by d_lock).
- */
- qstr = &dentry->d_name;
- if (parent->d_op && parent->d_op->d_compare) { // the parent supplies its own name comparison; a non-zero result means no match
- if (parent->d_op->d_compare(parent, qstr, name))
- goto next;
- } else { // no d_compare hook: compare length and bytes directly, which is enough for filesystems without special naming rules
- if (qstr->len != len)
- goto next;
- if (memcmp(qstr->name, str, len))
- goto next;
- }
- atomic_inc(&dentry->d_count); // match: bump the reference count before handing the dentry back
- found = dentry;
- spin_unlock(&dentry->d_lock);
- break;
- next:
- spin_unlock(&dentry->d_lock);
- }
- rcu_read_unlock();
- return found;
- }
__d_lookup函数会遍历父目录的hash表找出相匹配的子目录。
到这里整个挂载目录的查找就结束了,kern_path完成之后会通过path变量带回挂载点的dentry和父文件系统的vfsmount到do_mount函数中。
do_mount继续调用do_new_mount函数。do_new_mount分为两部分:第一部分是生成挂载所需的超级块等数据结构;第二部分用于将第一部分生成的结构添加到内核的挂载树中去。
先看第一部分,通过do_kern_mount实现,do_kern_mount又调用了vfs_kern_mount:
- struct vfsmount *
- vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
- { // Allocates a vfsmount, has the filesystem type's get_sb() fill it in, runs the security hook, and returns the vfsmount on success or an ERR_PTR() on failure
- struct vfsmount *mnt;
- char *secdata = NULL;
- int error;
- if (!type)
- return ERR_PTR(-ENODEV);
- error = -ENOMEM; // used by both allocation failures below
- mnt = alloc_vfsmnt(name);
- if (!mnt)
- goto out;
- if (flags & MS_KERNMOUNT)
- mnt->mnt_flags = MNT_INTERNAL;
- if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { // security data is only copied when data is given and the fs type does not set FS_BINARY_MOUNTDATA
- secdata = alloc_secdata();
- if (!secdata)
- goto out_mnt;
- error = security_sb_copy_data(data, secdata);
- if (error)
- goto out_free_secdata;
- }
- error = type->get_sb(type, flags, name, data, mnt); // filesystem-specific hook; afterwards mnt->mnt_sb must be set (see BUG_ON below)
- if (error < 0)
- goto out_free_secdata;
- BUG_ON(!mnt->mnt_sb);
- WARN_ON(!mnt->mnt_sb->s_bdi);
- mnt->mnt_sb->s_flags |= MS_BORN;
- error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
- if (error)
- goto out_sb;
- /*
- * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
- * but s_maxbytes was an unsigned long long for many releases. Throw
- * this warning for a little while to try and catch filesystems that
- * violate this rule. This warning should be either removed or
- * converted to a BUG() in 2.6.34.
- */
- WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
- "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
- mnt->mnt_mountpoint = mnt->mnt_root; // not attached anywhere yet: mountpoint is its own root ...
- mnt->mnt_parent = mnt; // ... and the mount is its own parent
- up_write(&mnt->mnt_sb->s_umount); // NOTE(review): s_umount is presumably write-held on return from get_sb() - confirm
- free_secdata(secdata);
- return mnt;
- out_sb:
- dput(mnt->mnt_root);
- deactivate_locked_super(mnt->mnt_sb);
- out_free_secdata:
- free_secdata(secdata);
- out_mnt:
- free_vfsmnt(mnt);
- out:
- return ERR_PTR(error);
- }
这部分的重点在于type->get_sb(type, flags, name, data, mnt); 调用特定文件系统的get_sb函数生成超级块对象和挂载点等数据结构。
第二部分的代码为函数do_add_mount
- int do_add_mount(struct vfsmount *newmnt, struct path *path,
- int mnt_flags, struct list_head *fslist)
- { // Holding namespace_sem for write, validates newmnt against path and attaches it with graft_tree(); returns 0 on success, otherwise releases newmnt with mntput() and returns a negative errno
- int err;
- mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); // these bits are never taken from the caller-supplied flags
- down_write(&namespace_sem);
- /* Something was mounted here while we slept */
- while (d_mountpoint(path->dentry) &&
- follow_down(path))
- ;
- err = -EINVAL;
- if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
- goto unlock;
- /* Refuse the same filesystem on the same mount point */
- err = -EBUSY;
- if (path->mnt->mnt_sb == newmnt->mnt_sb &&
- path->mnt->mnt_root == path->dentry)
- goto unlock;
- err = -EINVAL;
- if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) // the root of the new mount must not be a symbolic link
- goto unlock;
- newmnt->mnt_flags = mnt_flags;
- if ((err = graft_tree(newmnt, path)))
- goto unlock;
- if (fslist) /* add to the specified expiration list */
- list_add_tail(&newmnt->mnt_expire, fslist);
- up_write(&namespace_sem);
- return 0;
- unlock:
- up_write(&namespace_sem);
- mntput(newmnt);
- return err;
- }
继续调用graft_tree
- static int graft_tree(struct vfsmount *mnt, struct path *path)
- { // Checks that mnt may be attached at path and, while holding the mountpoint inode's i_mutex, calls attach_recursive_mnt(); returns 0 or a negative errno
- int err;
- if (mnt->mnt_sb->s_flags & MS_NOUSER)
- return -EINVAL;
- if (S_ISDIR(path->dentry->d_inode->i_mode) !=
- S_ISDIR(mnt->mnt_root->d_inode->i_mode)) // mountpoint and new root must both be directories or both be non-directories
- return -ENOTDIR;
- err = -ENOENT; // result if cant_mount() is true or the mountpoint dentry is unlinked
- mutex_lock(&path->dentry->d_inode->i_mutex);
- if (cant_mount(path->dentry))
- goto out_unlock;
- if (!d_unlinked(path->dentry))
- err = attach_recursive_mnt(mnt, path, NULL); // NULL parent_path selects the fresh-attach branch of attach_recursive_mnt()
- out_unlock:
- mutex_unlock(&path->dentry->d_inode->i_mutex);
- return err;
- }
调用attach_recursive_mnt
- static int attach_recursive_mnt(struct vfsmount *source_mnt,
- struct path *path, struct path *parent_path)
- { // Attaches source_mnt at path; with a non-NULL parent_path the mount is detached from its old place and re-attached, otherwise its mountpoint is set and the tree committed; returns 0 or a negative errno
- LIST_HEAD(tree_list); // receives the mounts produced by propagate_mnt()
- struct vfsmount *dest_mnt = path->mnt;
- struct dentry *dest_dentry = path->dentry;
- struct vfsmount *child, *p;
- int err;
- if (IS_MNT_SHARED(dest_mnt)) { // shared destination: group ids are set up first
- err = invent_group_ids(source_mnt, true);
- if (err)
- goto out;
- }
- err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
- if (err)
- goto out_cleanup_ids;
- spin_lock(&vfsmount_lock);
- if (IS_MNT_SHARED(dest_mnt)) {
- for (p = source_mnt; p; p = next_mnt(p, source_mnt)) // mark every mount in the source tree as shared
- set_mnt_shared(p);
- }
- if (parent_path) {
- detach_mnt(source_mnt, parent_path);
- attach_mnt(source_mnt, path);
- touch_mnt_namespace(parent_path->mnt->mnt_ns);
- } else {
- mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
- commit_tree(source_mnt);
- }
- list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { // commit each mount collected on tree_list
- list_del_init(&child->mnt_hash);
- commit_tree(child);
- }
- spin_unlock(&vfsmount_lock);
- return 0;
- out_cleanup_ids:
- if (IS_MNT_SHARED(dest_mnt))
- cleanup_group_ids(source_mnt, NULL);
- out:
- return err;
- }
调用mnt_set_mountpoint
- void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
- struct vfsmount *child_mnt)
- { // Records in child_mnt that it is mounted on dentry within mnt, taking a reference on each, and bumps the dentry's mount counter
- child_mnt->mnt_parent = mntget(mnt); // set the parent mount
- child_mnt->mnt_mountpoint = dget(dentry); // set the mountpoint dentry
- dentry->d_mounted++; // one more mount on this dentry
- }
到这里整个mount的流程就分析完毕了,mount的流程可以分为以下几个步骤:
一、根据给定的挂载目录路径,查找它对应的目录项(dentry)以及它所在的挂载(vfsmount);
二、通过设备节点和文件系统类型生成新挂载文件系统的超级块等结构;
三、将二中生成的结构连接到一中查找到的路径中。