===================================================
作者:ietf AT doit.com.cn
源文件来自于glibc-2.1.2,Copyright (C) 1996, 1997, 1998 Free Software Foundation, Inc. 和 linux kernel 2.6.20
请在GNU Library General Public License下参考。
引用请注明出处。
===================================================
所有的注释由如下标记标识:
-------ietf add start-------
-------ietf add end -------
一直想跟踪从敲入mount命令开始,命令的参数通过mount工具经由sys_mount系统调用进入内核最终实现文件系统挂载的全过程。可惜找了整整一天,竟然没有能找到mount这个命令的源代码是怎么实现,只好从系统调用开始了。
1、系统调用的头文件
在glibc-2.1.2的sysdeps/unix/sysv/linux/sys/mount.h中,定义了为实现挂载和卸载Linux文件系统的头文件,其内容如下:
/* Header file for mounting/unmount Linux filesystems.
Copyright (C) 1996, 1997, 1998 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License as
published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with the GNU C Library; see the file COPYING.LIB. If not,
write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* This is taken from /usr/include/linux/fs.h. */
-------ietf add start-------
给出/usr/include/linux/fs.h中定义方式(2.6.20),以供参考
-------ietf add end -------
#ifndef _SYS_MOUNT_H
#define _SYS_MOUNT_H 1
#include <features.h>
#include <sys/ioctl.h>
-------ietf add start-------
#define BLOCK_SIZE_BITS 10
#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
-------ietf add end -------
#define BLOCK_SIZE 1024
#define BLOCK_SIZE_BITS 10
/* These are the fs-independent mount-flags: up to 16 flags are
supported */
-------ietf add start-------
#define MS_RDONLY 1 /* Mount read-only */
#define MS_NOSUID 2 /* Ignore suid and sgid bits */
#define MS_NODEV 4 /* Disallow access to device special files */
#define MS_NOEXEC 8 /* Disallow program execution */
#define MS_SYNCHRONOUS 16 /* Writes are synced at once */
#define MS_REMOUNT 32 /* Alter flags of a mounted FS */
#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
#define MS_DIRSYNC 128 /* Directory modifications are synchronous */
#define MS_NOATIME 1024 /* Do not update access times. */
#define MS_NODIRATIME 2048 /* Do not update directory access times */
-------ietf add end -------
enum
{
MS_RDONLY = 1, /* Mount read-only. */
#define MS_RDONLY MS_RDONLY
MS_NOSUID = 2, /* Ignore suid and sgid bits. */
#define MS_NOSUID MS_NOSUID
MS_NODEV = 4, /* Disallow access to device special files. */
#define MS_NODEV MS_NODEV
MS_NOEXEC = 8, /* Disallow program execution. */
#define MS_NOEXEC MS_NOEXEC
MS_SYNCHRONOUS = 16, /* Writes are synced at once. */
#define MS_SYNCHRONOUS MS_SYNCHRONOUS
MS_REMOUNT = 32, /* Alter flags of a mounted FS. */
#define MS_REMOUNT MS_REMOUNT
MS_MANDLOCK = 64, /* Allow mandatory locks on an FS. */
#define MS_MANDLOCK MS_MANDLOCK
S_WRITE = 128, /* Write on file/directory/symlink. */
#define S_WRITE S_WRITE
S_APPEND = 256, /* Append-only file. */
#define S_APPEND S_APPEND
S_IMMUTABLE = 512, /* Immutable file. */
#define S_IMMUTABLE S_IMMUTABLE
MS_NOATIME = 1024, /* Do not update access times. */
#define MS_NOATIME MS_NOATIME
MS_NODIRATIME = 2048 /* Do not update directory access times. */
#define MS_NODIRATIME MS_NODIRATIME
};
-------ietf add start-------
#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK)
-------ietf add end -------
/* Flags that can be altered by MS_REMOUNT */
#define MS_RMT_MASK (MS_RDONLY | MS_MANDLOCK)
/* Magic mount flag number. Has to be or-ed to the flag values. */
-------ietf add start-------
#define MS_MGC_VAL 0xC0ED0000
#define MS_MGC_MSK 0xffff0000
-------ietf add end -------
#define MS_MGC_VAL 0xc0ed0000 /* Magic flag number to indicate "new" flags */
#define MS_MGC_MSK 0xffff0000 /* Magic flag number mask */
/* The read-only stuff doesn't really belong here, but any other place
is probably as bad and I don't want to create yet another include
file. */
-------ietf add start-------
#define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */
#define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */
#define BLKRRPART _IO(0x12,95) /* re-read partition table */
#define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */
#define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */
#define BLKRASET _IO(0x12,98) /* set read ahead for block device */
#define BLKRAGET _IO(0x12,99) /* get current read ahead setting */
-------ietf add end -------
#define BLKROSET _IO(0x12, 93) /* Set device read-only (0 = read-write). */
#define BLKROGET _IO(0x12, 94) /* Get read-only status (0 = read_write). */
#define BLKRRPART _IO(0x12, 95) /* Re-read partition table. */
#define BLKGETSIZE _IO(0x12, 96) /* Return device size. */
#define BLKFLSBUF _IO(0x12, 97) /* Flush buffer cache. */
#define BLKRASET _IO(0x12, 98) /* Set read ahead for block device. */
#define BLKRAGET _IO(0x12, 99) /* Get current read ahead setting. */
/* Possible value for FLAGS parameter of `umount2'. */
enum
{
MNT_FORCE = 1 /* Force unmounting. */
#define MNT_FORCE MNT_FORCE
};
__BEGIN_DECLS
/* Mount a filesystem. */
extern int mount __P ((__const char *__special_file, __const char *__dir,
__const char *__fstype, unsigned long int __rwflag,
__const void *__data));
/* Unmount a filesystem. */
extern int umount __P ((__const char *__special_file));
/* Unmount a filesystem. Force unmounting if FLAGS is set to MNT_FORCE. */
extern int umount2 __P ((__const char *__special_file, int __flags));
__END_DECLS
#endif /* _SYS_MOUNT_H */
该文件是从fs.h中摘取的一部分,不过新版的内核有了稍许的更改。
2、系统调用号
通过给定系统调用mount和umount,查找对应的系统调用表。在文件glibc-2.1.2的sysdeps/unix/sysv/linux/mips/sys/syscall.h中如下:
/*
* Linux syscalls are in the range from 4000 to 4999
* Hopefully these syscall numbers are unused ... If not everyone using
* statically linked binaries is pretty <censored - the government>. You've
* been warned.
*/
#define SYS_Linux 4000
...... ......
#define SYS_mount (SYS_Linux + 21)
#define SYS_umount (SYS_Linux + 22)
即分别为4021和4022。
3、系统调用在内核的声明
系统调用的函数声明在内核源码的src/include/linux/syscalls.h中,具体如下:
asmlinkage long sys_mount(char __user *dev_name, char __user *dir_name,
char __user *type, unsigned long flags,
void __user *data);
-------ietf add start-------
dev_name: 字符串类型,挂载的设备名,如,/dev/sda1
dir_name: 挂载点,如,/mnt/usb
type: 在/proc/filesystems中列出的内核所支持的文件系统格式,如,"ext2", "nfs", "proc"等
flags: 标志参数,参考1中列出的MS_XXXXX类型宏定义
data: 不同文件系统特定的参数表,通常以逗号分隔
返回的错误类型请参考专门的文档详细介绍。
-------ietf add end -------
asmlinkage long sys_umount(char __user *name, int flags);
-------ietf add start-------
name: 同sys_mount中dir_name
flags: 卸载标志,如1中列出的MNT_FORCE(强制卸载);注意它与sys_mount中的MS_XXXXX标志不是同一组定义
-------ietf add end -------
asmlinkage long sys_oldumount(char __user *name);
-------ietf add start-------
name: 同sys_mount中dir_name
-------ietf add end -------
与1中的三个函数相对应。
4、系统调用的实现
具体实现在内核源码的src/fs/namespace.c中,下面就三个调用分别参考其流程。
4.1 sys_mount
sys_mount主要功能是将用户的命令行参数从用户空间传递到内核空间,并调用do_mount解析参数,完成mount过程。其实现源码如下:
asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
char __user * type, unsigned long flags,
void __user * data)
{
int retval;
unsigned long data_page;
unsigned long type_page;
unsigned long dev_page;
char *dir_page;
-------ietf add start-------
将文件系统类型参数传入内核空间,在namespace.c中实现如下:
/*
 * Copy mount data from user space into a newly allocated kernel page.
 *
 * data:  user-space pointer to the data; NULL means there is nothing to copy.
 * where: output; set to 0 on entry and to the page address on success.
 *
 * Returns 0 on success (also when data is NULL, in which case *where stays 0),
 * -ENOMEM if no page could be allocated, or -EFAULT if no byte was copied
 * (the page is freed again in that case).
 */
int copy_mount_options(const void __user * data, unsigned long *where)
{
int i;
unsigned long page;
unsigned long size;
*where = 0;
if (!data)
return 0;
if (!(page = __get_free_page(GFP_KERNEL)))
return -ENOMEM;
/* We only care that *some* data at the address the user
* gave us is valid. Just in case, we'll zero
* the remainder of the page.
*/
/* copy_from_user cannot cross TASK_SIZE ! */
/* Clamp the length to the smaller of one page and the distance up to TASK_SIZE. */
size = TASK_SIZE - (unsigned long)data;
if (size > PAGE_SIZE)
size = PAGE_SIZE;
/*
 * i is the number of bytes actually copied, assuming
 * exact_copy_from_user returns the count of bytes left uncopied
 * (its definition is not shown here -- TODO confirm).
 */
i = size - exact_copy_from_user((void *)page, data, size);
if (!i) {
free_page(page);
return -EFAULT;
}
/* Zero the part of the page that was not filled by the copy. */
if (i != PAGE_SIZE)
memset((char *)page + i, 0, PAGE_SIZE - i);
*where = page;
return 0;
}
-------ietf add end -------
retval = copy_mount_options(type, &type_page);
if (retval < 0)
return retval;
-------ietf add start-------
将挂载点名传入内核空间,并在系统配置了audit功能时,对该名字进行审计记录(audit_getname)。getname在namei.c中实现,如下:
/*
 * Obtain a kernel-side buffer for the user-supplied path name.
 *
 * filename: user-space pointer to the name.
 *
 * Returns the buffer from __getname() once do_getname() has succeeded on it,
 * ERR_PTR(-ENOMEM) if __getname() returned no buffer, or ERR_PTR(retval)
 * carrying the negative result of do_getname() (the buffer is released with
 * __putname() in that case). Callers therefore have to test the result with
 * IS_ERR() rather than against NULL.
 */
char * getname(const char __user * filename)
{
char *tmp, *result;
/* Default result if no buffer can be obtained. */
result = ERR_PTR(-ENOMEM);
tmp = __getname();
if (tmp) {
/* Presumably copies the user string into tmp -- do_getname is not shown here. */
int retval = do_getname(filename, tmp);
result = tmp;
if (retval < 0) {
__putname(tmp);
result = ERR_PTR(retval);
}
}
/* Called on every path, so it may receive an ERR_PTR value as well. */
audit_getname(result);
return result;
}
-------ietf add end -------
dir_page = getname(dir_name);
retval = PTR_ERR(dir_page);
if (IS_ERR(dir_page))
goto out1;
-------ietf add start-------
将设备名参数传入内核空间
-------ietf add end -------
retval = copy_mount_options(dev_name, &dev_page);
if (retval < 0)
goto out2;
-------ietf add start-------
将文件系统特定参数表传入内核空间
-------ietf add end -------
retval = copy_mount_options(data, &data_page);
if (retval < 0)
goto out3;
lock_kernel();
-------ietf add start-------
调用do_mount,解析参数,实现挂载
-------ietf add end -------
retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
flags, (void *)data_page);
unlock_kernel();
free_page(data_page);
out3:
free_page(dev_page);
out2:
putname(dir_page);
out1:
free_page(type_page);
return retval;
}
4.2 do_mount
do_mount根据sys_mount传递的内核空间的参数,选择对应的执行过程,在namespace.c中,其源码具体如下:
/*
* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
*
* data is a (void *) that can point to any structure up to
* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
* information (or be NULL).
*
* Pre-0.97 versions of mount() didn't have a flags word.
* When the flags word was introduced its top half was required
* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
* Therefore, if this magic number is present, it carries no information
* and must be discarded.
*/
long do_mount(char *dev_name, char *dir_name, char *type_page,
unsigned long flags, void *data_page)
{
struct nameidata nd;
int retval = 0;
int mnt_flags = 0;
/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
flags &= ~MS_MGC_MSK;
/* Basic sanity checks */
-------ietf add start-------
如果挂载点字符串为NULL或空串,或者挂载点、设备名字符串在PAGE_SIZE范围内找不到结束符(即字符串过长),则返回失败(-EINVAL);设备名允许为NULL
-------ietf add end -------
if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
return -EINVAL;
if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
return -EINVAL;
-------ietf add start-------
如果data_page超过PAGE_SIZE长度,将超出部分截断
-------ietf add end -------
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
-------ietf add start-------
根据用户参数,构建flags值
-------ietf add end -------
/* Separate the per-mountpoint flags */
if (flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;
if (flags & MS_NODEV)
mnt_flags |= MNT_NODEV;
if (flags & MS_NOEXEC)
mnt_flags |= MNT_NOEXEC;
if (flags & MS_NOATIME)
mnt_flags |= MNT_NOATIME;
if (flags & MS_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;
if (flags & MS_RELATIME)
mnt_flags |= MNT_RELATIME;
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME);
-------ietf add start-------
根据dir_name查找挂载点路径,将该挂载点引用计数增1,将挂载点数据保存在nd中
方法调用了namei.c中的path_lookup,再通过path_lookup调用do_path_lookup实现,源码如下:
/*
 * Look up the path name and fill in nd.
 *
 * Thin wrapper: forwards to do_path_lookup() with AT_FDCWD as the directory
 * descriptor, and returns whatever do_path_lookup() returns.
 */
int fastcall path_lookup(const char *name, unsigned int flags,
struct nameidata *nd)
{
return do_path_lookup(AT_FDCWD, name, flags, nd);
}
/*
 * Choose the starting point for a path walk and then walk the path.
 *
 * dfd:   directory file descriptor, or AT_FDCWD; only consulted when name
 *        does not begin with a slash.
 * name:  path to look up.
 * flags: lookup flags, stored in nd->flags.
 * nd:    receives the lookup state (mnt and dentry).
 *
 * Returns 0 and nd will be valid on success; returns an error otherwise:
 * -EBADF if dfd yields no file, -ENOTDIR if dfd is not a directory, the
 * result of file_permission() if it fails, or the result of
 * link_path_walk().
 */
static int fastcall do_path_lookup(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
int retval = 0;
int fput_needed;
struct file *file;
struct fs_struct *fs = current->fs;
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags;
nd->depth = 0;
/* Case 1: name begins with a slash -- start at the root of the current task. */
if (*name=='/') {
read_lock(&fs->lock);
/* Try the alternate root first, unless LOOKUP_NOALT is set. */
if (fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
nd->mnt = mntget(fs->altrootmnt);
nd->dentry = dget(fs->altroot);
/* The lock is dropped around __emul_lookup_dentry() and retaken afterwards. */
read_unlock(&fs->lock);
if (__emul_lookup_dentry(name,nd))
goto out; /* found in altroot */
read_lock(&fs->lock);
}
nd->mnt = mntget(fs->rootmnt);
nd->dentry = dget(fs->root);
read_unlock(&fs->lock);
/* Case 2: other name with AT_FDCWD -- start at the current working directory. */
} else if (dfd == AT_FDCWD) {
read_lock(&fs->lock);
nd->mnt = mntget(fs->pwdmnt);
nd->dentry = dget(fs->pwd);
read_unlock(&fs->lock);
/* Case 3: other name with an explicit dfd -- start at the directory dfd refers to. */
} else {
struct dentry *dentry;
file = fget_light(dfd, &fput_needed);
retval = -EBADF;
if (!file)
goto out_fail;
dentry = file->f_path.dentry;
/* dfd must refer to a directory ... */
retval = -ENOTDIR;
if (!S_ISDIR(dentry->d_inode->i_mode))
goto fput_fail;
/* ... and file_permission() must allow MAY_EXEC on it. */
retval = file_permission(file, MAY_EXEC);
if (retval)
goto fput_fail;
nd->mnt = mntget(file->f_path.mnt);
nd->dentry = dget(dentry);
fput_light(file, fput_needed);
}
current->total_link_count = 0;
retval = link_path_walk(name, nd);
out:
/* On success, hand the resulting inode to audit unless the audit context is a dummy. */
if (likely(retval == 0)) {
if (unlikely(!audit_dummy_context() && nd && nd->dentry &&
nd->dentry->d_inode))
audit_inode(name, nd->dentry->d_inode);
}
out_fail:
return retval;
/* Error exit for case 3: release the file obtained from dfd, then return retval. */
fput_fail:
fput_light(file, fput_needed);
goto out_fail;
}
持续跟踪该操作,会进入很深层次。它先查看该路径是否在dcache中,如果不在,将重新从硬盘上将该路径加入dcache。
-------ietf add end -------
/* ... and get the mountpoint */
retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
if (retval)
return retval;
-------ietf add start-------
如果没有配置安全属性,该函数为空;否则使用系统初始化时设定的安全框架函数。这一部分需要进一步分析
-------ietf add end -------
retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
if (retval)
goto dput_out;
if (flags & MS_REMOUNT)
-------ietf add start-------
对给定路径上已经挂载的文件系统进行重新挂载:挂载点不变,只更改其挂载标志和参数(参见1中MS_REMOUNT的注释"Alter flags of a mounted FS")。该操作中,内核先将该sb对应的dcache数据缩减,以减小remount时的开销,并将所有的dirty inode写回硬盘,最后调用该文件系统sb中的remount_fs方法。
-------ietf add end -------
retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
-------ietf add start-------
太复杂,还没深入看,其中分两种情况,一种copy方式,一种clone方式。
-------ietf add end -------
retval = do_loopback(&nd, dev_name, flags & MS_REC);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
-------ietf add start-------
更改挂载点的传播类型(shared/private/slave/unbindable)
-------ietf add end -------
retval = do_change_type(&nd, flags);
else if (flags & MS_MOVE)
-------ietf add start-------
将已有的挂载移动到新的挂载点(MS_MOVE),并不是卸载
-------ietf add end -------
retval = do_move_mount(&nd, dev_name);
else
-------ietf add start-------
这里是初次挂载一个新设备时的入口,也是俺所感兴趣的部分。
该函数分别调用了do_kern_mount()和do_add_mount()两个函数,代码如下:
/*
* create a new mount for userspace and request it to be added into the
* namespace's tree
*/
static int do_new_mount(struct nameidata *nd, char *type, int flags,
int mnt_flags, char *name, void *data)
{
struct vfsmount *mnt;
if (!type || !memchr(type, 0, PAGE_SIZE))
return -EINVAL;
/* we need capabilities... */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
-------ietf add start-------
分配挂载点空间,具体见4.3
-------ietf add end -------
mnt = do_kern_mount(type, flags, name, data);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
-------ietf add start-------
见下
-------ietf add end -------
return do_add_mount(mnt, nd, mnt_flags, NULL);
}
-------ietf add end -------
retval = do_new_mount(&nd, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
path_release(&nd);
return retval;
}
4.3 do_kern_mount & vfs_kern_mount
这两个函数都在/src/fs/super.c中实现
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
-------ietf add start-------
根据文件系统类型名在文件系统链表中查找对应的模块,如果模块不在内核中,则尝试加载模块
-------ietf add end -------
struct file_system_type *type = get_fs_type(fstype);
struct vfsmount *mnt;
if (!type)
return ERR_PTR(-ENODEV);
mnt = vfs_kern_mount(type, flags, name, data);
put_filesystem(type);
return mnt;
}
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
struct vfsmount *mnt;
char *secdata = NULL;
int error;
if (!type)
return ERR_PTR(-ENODEV);
error = -ENOMEM;
-------ietf add start-------
为设备分配挂载点数据结构,设置引用计数,初始化各参数,并将mnt_devname域赋值为name(即dev_name),
-------ietf add end -------
mnt = alloc_vfsmnt(name);
if (!mnt)
goto out;
-------ietf add start-------
传入的data参数不为空,又是和security框架相关的操作
-------ietf add end -------
if (data) {
secdata = alloc_secdata();
if (!secdata)
goto out_mnt;
error = security_sb_copy_data(type, data, secdata);
if (error)
goto out_free_secdata;
}
-------ietf add start-------
调用该文件系统类型定义的get_sb()方法,判断指定的块设备是否包含该方法支持的文件系统。从下面的代码可以看出,get_sb()返回的是整型错误码:执行成功返回0,superblock通过mnt参数带回(mnt->mnt_sb);出错则返回负的错误码。具体功能实现在分析特定文件系统时详细介绍。
-------ietf add end -------
error = type->get_sb(type, flags, name, data, mnt);
if (error < 0)
goto out_free_secdata;
error = security_sb_kern_mount(mnt->mnt_sb, secdata);
if (error)
goto out_sb;
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
up_write(&mnt->mnt_sb->s_umount);
free_secdata(secdata);
return mnt;
out_sb:
dput(mnt->mnt_root);
up_write(&mnt->mnt_sb->s_umount);
deactivate_super(mnt->mnt_sb);
out_free_secdata:
free_secdata(secdata);
out_mnt:
free_vfsmnt(mnt);
out:
return ERR_PTR(error);
}
4.4 do_add_mount
既然已经为新的设备分配挂载点,就该把该挂载点加入到系统的文件系统中来了。
/*
* add a mount into a namespace's mount tree
* - provide the option of adding the new mount to an expiration list
*/
int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
int mnt_flags, struct list_head *fslist)
{
int err;
-------ietf add start-------
先加锁(获取namespace_sem的写锁)
-------ietf add end -------
down_write(&namespace_sem);
/* Something was mounted here while we slept */
while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
if (!check_mnt(nd->mnt))
goto unlock;
/* Refuse the same filesystem on the same mount point */
-------ietf add start-------
相同的设备,相同的挂载点,不必再挂
-------ietf add end -------
err = -EBUSY;
if (nd->mnt->mnt_sb == newmnt->mnt_sb &&
nd->mnt->mnt_root == nd->dentry)
goto unlock;
err = -EINVAL;
-------ietf add start-------
如果新挂载的文件系统的根是符号链接(link),则不允许挂载,返回-EINVAL
-------ietf add end -------
if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
goto unlock;
newmnt->mnt_flags = mnt_flags;
-------ietf add start-------
挂载的操作,其中调用了attach_recursive_mnt(),实现挂载,具体见4.5
-------ietf add end -------
if ((err = graft_tree(newmnt, nd)))
goto unlock;
-------ietf add start-------
第一次加载时为空,忽略该操作
-------ietf add end -------
if (fslist) {
/* add to the specified expiration list */
spin_lock(&vfsmount_lock);
list_add_tail(&newmnt->mnt_expire, fslist);
spin_unlock(&vfsmount_lock);
}
up_write(&namespace_sem);
return 0;
unlock:
up_write(&namespace_sem);
mntput(newmnt);
return err;
}
4.5 attach_recursive_mnt
这是目前看到的注释最长的一个函数,瞻仰一下:
/*
* @source_mnt : mount tree to be attached
* @nd : place the mount tree @source_mnt is attached
* @parent_nd : if non-null, detach the source_mnt from its parent and
* store the parent mount and mountpoint dentry.
* (done when source_mnt is moved)
*
* NOTE: in the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
* ---------------------------------------------------------------------------
* | BIND MOUNT OPERATION |
* |**************************************************************************
* | source-->| shared | private | slave | unbindable |
* | dest | | | | |
* | | | | | | |
* | v | | | | |
* |**************************************************************************
* | shared | shared (++) | shared (+) | shared(+++)| invalid |
* | | | | | |
* |non-shared| shared (+) | private | slave (*) | invalid |
* ***************************************************************************
* A bind operation clones the source mount and mounts the clone on the
* destination mount.
*
* (++) the cloned mount is propagated to all the mounts in the propagation
* tree of the destination mount and the cloned mount is added to
* the peer group of the source mount.
* (+) the cloned mount is created under the destination mount and is marked
* as shared. The cloned mount is added to the peer group of the source
* mount.
* (+++) the mount is propagated to all the mounts in the propagation tree
* of the destination mount and the cloned mount is made slave
* of the same master as that of the source mount. The cloned mount
* is marked as 'shared and slave'.
* (*) the cloned mount is made a slave of the same master as that of the
* source mount.
*
* ---------------------------------------------------------------------------
* | MOVE MOUNT OPERATION |
* |**************************************************************************
* | source-->| shared | private | slave | unbindable |
* | dest | | | | |
* | | | | | | |
* | v | | | | |
* |**************************************************************************
* | shared | shared (+) | shared (+) | shared(+++) | invalid |
* | | | | | |
* |non-shared| shared (+*) | private | slave (*) | unbindable |
* ***************************************************************************
*
* (+) the mount is moved to the destination. And is then propagated to
* all the mounts in the propagation tree of the destination mount.
* (+*) the mount is moved to the destination.
* (+++) the mount is moved to the destination and is then propagated to
* all the mounts belonging to the destination mount's propagation tree.
* the mount is marked as 'shared and slave'.
* (*) the mount continues to be a slave at the new location.
*
* if the source mount is a tree, the operations explained above is
* applied to each mount in the tree.
* Must be called without spinlocks held, since this function can sleep
* in allocations.
*/
static int attach_recursive_mnt(struct vfsmount *source_mnt,
struct nameidata *nd, struct nameidata *parent_nd)
{
/* Collects the additional mounts that propagate_mnt() hands back (presumably the propagated copies). */
LIST_HEAD(tree_list);
struct vfsmount *dest_mnt = nd->mnt;
struct dentry *dest_dentry = nd->dentry;
struct vfsmount *child, *p;
/* Any failure reported by propagate_mnt() is returned to the caller as -EINVAL. */
if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
return -EINVAL;
/* Attaching under a shared destination: mark every mount in the source tree as shared. */
if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
}
spin_lock(&vfsmount_lock);
if (parent_nd) {
/* Move case: detach the source from its old parent (recorded in parent_nd), then attach it at nd. */
detach_mnt(source_mnt, parent_nd);
attach_mnt(source_mnt, nd);
touch_mnt_namespace(current->nsproxy->mnt_ns);
} else {
/* Otherwise: set the mountpoint of the source to (dest_mnt, dest_dentry) and commit the tree. */
mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
commit_tree(source_mnt);
}
/* Commit each mount collected in tree_list; p is reused here as the lookahead cursor of the safe iterator. */
list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
list_del_init(&child->mnt_hash);
commit_tree(child);
}
spin_unlock(&vfsmount_lock);
return 0;
}
上面解释得很清楚,就不啰嗦了。
呵呵,有点虎头蛇尾。