阅读本文需要linux文件系统基础知识。以下引用的kernel源代码,都是基于linux kernel源代码版本:3.4。
以下分析基于下面的假设:
根设备使用SD卡设备(块设备),根文件系统使用ext4文件系统。
linux kernel在初始化的最后阶段,会加载“根文件系统”,按照前面的假设,也就是加载一个ext4文件系统作为根文件系统,这个文件系统位于SD卡上。
在加载这个根文件系统前,kernel会先加载一个虚拟的根文件系统,名叫rootfs文件系统,这个rootfs文件系统才是kernel真正意义上mount的第一个文件系统。它是基于内存的文件系统(并没有对应一个非易失性存储设备,而像ext, fat这些文件系统都会对应一个外部存储设备)。
这个rootfs 文件系统会mount到自己的根目录 “/” 上,mount完毕后会将自己的根目录设为进程的根目录。现在假设这个rootfs文件系统已经mount好了(暂不分析rootfs文件系统的mount过程)。
kernel在安装根文件系统前会先安装根设备节点:/dev/root。ext4根文件系统必须存在于一个根设备上,比如硬盘,SD/MMC卡。根设备节点用来描述指定的根设备。
kernel在kernel_init函数中会调用prepare_namespace函数(init/do_mounts.c),prepare_namespace函数先从启动参数中获得根设备名称,保存在root_device_name中,这里名称就是/dev/mmcblk0p1。
调用mount_root挂载根文件系统。在mount_root函数中就会创建设备节点了。
void __init prepare_namespace(void)
{
int is_floppy;
if (root_delay) {
printk(KERN_INFO "Waiting %dsec before mounting root device...\n",
root_delay);
ssleep(root_delay);
}
/*
* wait for the known devices to complete their probing
*
* Note: this is a potential source of long boot delays.
* For example, it is not atypical to wait 5 seconds here
* for the touchpad of a laptop to initialize.
*/
wait_for_device_probe();
md_run_setup();
if (saved_root_name[0]) {
root_device_name = saved_root_name;
if (!strncmp(root_device_name, "mtd", 3) ||
!strncmp(root_device_name, "ubi", 3)) {
mount_block_root(root_device_name, root_mountflags);
goto out;
}
printk(KERN_INFO "-----ROOT_DEV_NAME----: %s\n", root_device_name);
ROOT_DEV = name_to_dev_t(root_device_name); //拿到设备名称后调用name_to_dev_t转换为设备号;
if (strncmp(root_device_name, "/dev/", 5) == 0)
root_device_name += 5; //跳过 ‘/dev/’;
}
if (initrd_load())
goto out;
/* wait for any asynchronous scanning to complete */
if ((ROOT_DEV == 0) && root_wait) {
printk(KERN_INFO "Waiting for root device %s...\n",
saved_root_name);
while (driver_probe_done() != 0 ||
(ROOT_DEV = name_to_dev_t(saved_root_name)) == 0)
msleep(100);
async_synchronize_full();
}
is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR;
if (is_floppy && rd_doload && rd_load_disk(0))
ROOT_DEV = Root_RAM0;
mount_root(); //安装根文件系统;
out:
devtmpfs_mount("dev");
sys_mount(".", "/", NULL, MS_MOVE, NULL);
sys_chroot((const char __user __force *)".");
}
mount_root(): (init/do_mounts.c)
void __init mount_root(void)
{
#ifdef CONFIG_ROOT_NFS
if (ROOT_DEV == Root_NFS) {
if (mount_nfs_root())
return;
printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
ROOT_DEV = Root_FD0;
}
#endif
#ifdef CONFIG_BLK_DEV_FD
if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
/* rd_doload is 2 for a dual initrd/ramload setup */
if (rd_doload==2) {
if (rd_load_disk(1)) {
ROOT_DEV = Root_RAM1;
root_device_name = NULL;
}
} else
change_floppy("root floppy");
}
#endif
#ifdef CONFIG_BLOCK
create_dev("/dev/root", ROOT_DEV); //创建根设备节点;
mount_block_root("/dev/root", root_mountflags);
#endif
}
参数为: name = "/dev/root", mode = S_IFBLK|0600,dev = xxx;
static inline int create_dev(char *name, dev_t dev)
{
sys_unlink(name);
return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev));
}
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
return sys_mknodat(AT_FDCWD, filename, mode, dev);
}
最后实际是调用sys_mknodat函数(fs/namei.c),SYSCALL_DEFINE4是函数定义的宏,展开后就变成sys_mknodat函数。
sys_mknodat函数调用user_path_create得到要创建节点的path,节点父目录的信息保存在path中,然后调用vfs_mknod创建节点。
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
unsigned, dev)
{
struct dentry *dentry;
struct path path;
int error;
if (S_ISDIR(mode))
return -EPERM;
dentry = user_path_create(dfd, filename, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
error = may_mknod(mode);
if (error)
goto out_dput;
error = mnt_want_write(path.mnt);
if (error)
goto out_dput;
error = security_path_mknod(&path, dentry, mode, dev);
if (error)
goto out_drop_write;
switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(path.dentry->d_inode,dentry,mode,NULL);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(path.dentry->d_inode,dentry,mode,
new_decode_dev(dev));
break;
case S_IFIFO: case S_IFSOCK:
error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
break;
}
out_drop_write:
mnt_drop_write(path.mnt);
out_dput:
dput(dentry);
mutex_unlock(&path.dentry->d_inode->i_mutex);
path_put(&path);
return error;
}
user_path_create: (fs/namei.c)
简单的调用了kern_path_create函数后返回。此时参数分别为dfd = AT_FDCWD,tmp = "/dev/root",path为输出参数,is_dir = 0;
struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
{
char *tmp = getname(pathname);
struct dentry *res;
if (IS_ERR(tmp))
return ERR_CAST(tmp);
res = kern_path_create(dfd, tmp, path, is_dir);
putname(tmp);
return res;
}
kern_path_create函数先调用do_path_lookup搜索路径,搜索方式是搜索最后一个目录项的父目录。然后再调用lookup_hash搜索目录最后一项,也就是root项。
struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
struct nameidata nd;
int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
if (error)
return ERR_PTR(error);
/*
* Yucky last component or no last component at all?
* (foo/., foo/.., /////)
*/
if (nd.last_type != LAST_NORM)
goto out;
nd.flags &= ~LOOKUP_PARENT;
nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
nd.intent.open.flags = O_EXCL;
/*
* Do the final lookup.
*/
mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
dentry = lookup_hash(&nd);
if (IS_ERR(dentry))
goto fail;
if (dentry->d_inode)
goto eexist;
/*
* Special case - lookup gave negative, but... we had foo/bar/
* From the vfs_mknod() POV we just have a negative dentry -
* all is fine. Let's be bastards - you had / on the end, you've
* been asking for (non-existent) directory. -ENOENT for you.
*/
if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
dput(dentry);
dentry = ERR_PTR(-ENOENT);
goto fail;
}
*path = nd.path;
return dentry;
eexist:
dput(dentry);
dentry = ERR_PTR(-EEXIST);
fail:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
out:
path_put(&nd.path);
return dentry;
}
do_path_lookup函数如下(fs/namei.c)。
调用path_lookupat进行搜索。加上LOOKUP_RCU参数。vfs文件系统路径搜索主要有两种mode,rcu-walk和 ref-walk。 这两种方式在源代码目录 Documentation/filesystems/path-lookup.txt 的文档中有详细说明。
static int do_path_lookup(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
if (unlikely(retval == -ECHILD))
retval = path_lookupat(dfd, name, flags, nd);
if (unlikely(retval == -ESTALE))
retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
if (likely(!retval)) {
if (unlikely(!audit_dummy_context())) {
if (nd->path.dentry && nd->inode)
audit_inode(name, nd->path.dentry);
}
}
return retval;
}
path_lookupat函数先初始化路径,可以看作将设置路径的根(搜索的起点)。VFS中目录项是用dentry和inode描述的,这里可以看作设置根路径对应的dentry。
static int path_lookupat(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
struct file *base = NULL;
struct path path;
int err;
/*
* Path walking is largely split up into 2 different synchronisation
* schemes, rcu-walk and ref-walk (explained in
* Documentation/filesystems/path-lookup.txt). These share much of the
* path walk code, but some things particularly setup, cleanup, and
* following mounts are sufficiently divergent that functions are
* duplicated. Typically there is a function foo(), and its RCU
* analogue, foo_rcu().
*
* -ECHILD is the error number of choice (just to avoid clashes) that
* is returned if some aspect of an rcu-walk fails. Such an error must
* be handled by restarting a traditional ref-walk (which will always
* be able to complete).
*/
err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); //找到搜索的起点,保存在nd中;
if (unlikely(err))
return err;
current->total_link_count = 0;
err = link_path_walk(name, nd); //从起点开始路径的搜索,其中nd用来返回搜索结果;
if (!err && !(flags & LOOKUP_PARENT)) {
err = lookup_last(nd, &path);
while (err > 0) {
void *cookie;
struct path link = path;
nd->flags |= LOOKUP_PARENT;
err = follow_link(&link, nd, &cookie);
if (!err)
err = lookup_last(nd, &path);
put_link(nd, &link, cookie);
}
}
if (!err)
err = complete_walk(nd);
if (!err && nd->flags & LOOKUP_DIRECTORY) {
if (!nd->inode->i_op->lookup) {
path_put(&nd->path);
err = -ENOTDIR;
}
}
if (base)
fput(base);
if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
path_put(&nd->root);
nd->root.mnt = NULL;
}
return err;
}
主要调用set_root_rcu设置根路径,将根路径保存在nd->root。
static int path_init(int dfd, const char *name, unsigned int flags,
struct nameidata *nd, struct file **fp)
{
int retval = 0;
int fput_needed;
struct file *file;
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED;
nd->depth = 0;
if (flags & LOOKUP_ROOT) {
struct inode *inode = nd->root.dentry->d_inode;
if (*name) {
if (!inode->i_op->lookup)
return -ENOTDIR;
retval = inode_permission(inode, MAY_EXEC);
if (retval)
return retval;
}
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
br_read_lock(vfsmount_lock);
rcu_read_lock();
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} else {
path_get(&nd->path);
}
return 0;
}
nd->root.mnt = NULL;
if (*name=='/') {
if (flags & LOOKUP_RCU) {
br_read_lock(vfsmount_lock);
rcu_read_lock();
set_root_rcu(nd); //设置根路径
} else {
set_root(nd);
path_get(&nd->root);
}
nd->path = nd->root; //搜索的初始路径设为根路径
} else if (dfd == AT_FDCWD) {
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
unsigned seq;
br_read_lock(vfsmount_lock);
rcu_read_lock();
do {
seq = read_seqcount_begin(&fs->seq);
nd->path = fs->pwd;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
}
} else {
struct dentry *dentry;
file = fget_raw_light(dfd, &fput_needed);
retval = -EBADF;
if (!file)
goto out_fail;
dentry = file->f_path.dentry;
if (*name) {
retval = -ENOTDIR;
if (!S_ISDIR(dentry->d_inode->i_mode))
goto fput_fail;
retval = inode_permission(dentry->d_inode, MAY_EXEC);
if (retval)
goto fput_fail;
}
nd->path = file->f_path;
if (flags & LOOKUP_RCU) {
if (fput_needed)
*fp = file;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
br_read_lock(vfsmount_lock);
rcu_read_lock();
} else {
path_get(&file->f_path);
fput_light(file, fput_needed);
}
}
nd->inode = nd->path.dentry->d_inode; //设置根目录的inode
return 0;
fput_fail:
fput_light(file, fput_needed);
out_fail:
return retval;
}
用当前进程的根路径设置nd->root。
static __always_inline void set_root_rcu(struct nameidata *nd)
{
if (!nd->root.mnt) {
struct fs_struct *fs = current->fs;
unsigned seq;
do {
seq = read_seqcount_begin(&fs->seq);
nd->root = fs->root; //设置nd的root为当前进程fs的root;
nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
}
}
path_init返回后,调用link_path_walk正式开始搜索,搜索的结果会保存在struct nameidata结构体nd中。
struct nameidata {
struct path path;
struct qstr last; //保存最后一个目录项
struct path root;
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags;
unsigned seq;
int last_type;
unsigned depth;
char *saved_names[MAX_NESTED_LINKS + 1];
/* Intent data */
union {
struct open_intent open;
} intent;
};
如果路径名以‘/’开头,就把它跳过去,因为在这种情况下nd->path已经指向本进程的根目录了。如果路径名中仅仅包含‘/’字符的话,那么其搜索目标就是根目录,所以任务完成。
调用hash_name,hash_name函数计算第一个目录项的hash值并返回目录项长度,对于本例中的/dev/root,第一个目录项是dev,长度为3。用name和len构造struct qstr结构体。若发现是最后一个目录项,就将struct qstr赋给nd->last, 然后直接返回,若不是最后一项,就调用walk_component搜索这个目录项。
static int link_path_walk(const char *name, struct nameidata *nd)
{
struct path next;
int err;
while (*name=='/')
name++;
if (!*name)
return 0;
/* At this point we know we have a real path component. */
for(;;) {
struct qstr this;
long len;
int type;
err = may_lookup(nd);
if (err)
break;
len = hash_name(name, &this.hash);
this.name = name;
this.len = len;
type = LAST_NORM;
if (name[0] == '.') switch (len) {
case 2:
if (name[1] == '.') {
type = LAST_DOTDOT;
nd->flags |= LOOKUP_JUMPED;
}
break;
case 1:
type = LAST_DOT;
}
if (likely(type == LAST_NORM)) {
struct dentry *parent = nd->path.dentry;
nd->flags &= ~LOOKUP_JUMPED;
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
err = parent->d_op->d_hash(parent, nd->inode,
&this);
if (err < 0)
break;
}
}
if (!name[len]) //若是最后一项,就直接返回;
goto last_component;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
len++;
} while (unlikely(name[len] == '/'));
if (!name[len])
goto last_component;
name += len; //运行到这里,表示当前节点为中间节点;
err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); //搜索目录项;
if (err < 0)
return err;
if (err) {
err = nested_symlink(&next, nd);
if (err)
return err;
}
if (can_lookup(nd->inode))
continue;
err = -ENOTDIR;
break;
/* here ends the main loop */
last_component:
nd->last = this;
nd->last_type = type;
return 0;
}
terminate_walk(nd);
return err;
}
权限检查。
static inline int may_lookup(struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
if (err != -ECHILD)
return err;
if (unlazy_walk(nd, NULL))
return -ECHILD;
}
return inode_permission(nd->inode, MAY_EXEC);
}
hash_name返回第一个目录项的长度。
static inline unsigned long hash_name(const char *name, unsigned int *hashp)
{
unsigned long hash = init_name_hash(); // #define init_name_hash() 0;
unsigned long len = 0, c;
c = (unsigned char)*name;
do {
len++;
hash = partial_name_hash(c, hash); // 计算并返回哈希值;
c = (unsigned char)name[len];
} while (c && c != '/');
*hashp = end_name_hash(hash); // return hash;
return len;
}
计算hash值;
static inline unsigned long
partial_name_hash(unsigned long c, unsigned long prevhash)
{
return (prevhash + (c << 4) + (c >> 4)) * 11;
}
walk_component: (fs/namei.c)
调用do_lookup函数搜索目录项的inode。如果找不到inode节点,则终止搜索。如果这个inode节点被mount上另一个文件系统,更新搜索路径。
static inline int walk_component(struct nameidata *nd, struct path *path,
struct qstr *name, int type, int follow)
{
struct inode *inode;
int err;
/*
* "." and ".." are special - ".." especially so because it has
* to be able to know about the current root directory and
* parent relationships.
*/
if (unlikely(type != LAST_NORM))
return handle_dots(nd, type);
err = do_lookup(nd, name, path, &inode);
if (unlikely(err)) {
terminate_walk(nd);
return err;
}
if (!inode) {
path_to_nameidata(path, nd);
terminate_walk(nd);
return -ENOENT;
}
if (should_follow_link(inode, follow)) {
if (nd->flags & LOOKUP_RCU) {
if (unlikely(unlazy_walk(nd, path->dentry))) {
terminate_walk(nd);
return -ECHILD;
}
}
BUG_ON(inode != path->dentry->d_inode);
return 1;
}
path_to_nameidata(path, nd); //将path的内容转化到nd中;
nd->inode = inode;
return 0;
}
do_lookup: (fs/namei.c) 做实际的路径搜索工作。
do_lookup先获取要搜索目录项的父目录parent,根据LOOPUP_RCU标志,调用__d_lookup_rcu或者__d_lookup。假设没有LOOKUP_RCU标志,则进入__d_lookup函数,若找到则__d_lookup返回找到的dentry,用找到的dentry给path->dentry赋值,将dentry对应的dentry->d_inode赋给输出参数inode,inode有可能为NULL。
static int do_lookup(struct nameidata *nd, struct qstr *name,
struct path *path, struct inode **inode)
{
struct vfsmount *mnt = nd->path.mnt;
struct dentry *dentry, *parent = nd->path.dentry;
int need_reval = 1;
int status = 1;
int err;
/*
* Rename seqlock is not required here because in the off chance
* of a false negative due to a concurrent rename, we're going to
* do the non-racy lookup, below.
*/
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
*inode = nd->inode;
dentry = __d_lookup_rcu(parent, name, &seq, inode);
if (!dentry)
goto unlazy;
/* Memory barrier in read_seqcount_begin of child is enough */
if (__read_seqcount_retry(&parent->d_seq, nd->seq))
return -ECHILD;
nd->seq = seq;
if (unlikely(d_need_lookup(dentry)))
goto unlazy;
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
status = d_revalidate(dentry, nd);
if (unlikely(status <= 0)) {
if (status != -ECHILD)
need_reval = 0;
goto unlazy;
}
}
path->mnt = mnt;
path->dentry = dentry;
if (unlikely(!__follow_mount_rcu(nd, path, inode)))
goto unlazy;
if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
goto unlazy;
return 0;
unlazy:
if (unlazy_walk(nd, dentry))
return -ECHILD;
} else {
dentry = __d_lookup(parent, name);
}
if (unlikely(!dentry))
goto need_lookup;
if (unlikely(d_need_lookup(dentry))) {
dput(dentry);
goto need_lookup;
}
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
status = d_revalidate(dentry, nd);
if (unlikely(status <= 0)) {
if (status < 0) {
dput(dentry);
return status;
}
if (!d_invalidate(dentry)) {
dput(dentry);
goto need_lookup;
}
}
done:
path->mnt = mnt;
path->dentry = dentry;
err = follow_managed(path, nd->flags);
if (unlikely(err < 0)) {
path_put_conditional(path, nd);
return err;
}
if (err)
nd->flags |= LOOKUP_JUMPED;
*inode = path->dentry->d_inode;
return 0;
need_lookup:
BUG_ON(nd->inode != parent->d_inode);
mutex_lock(&parent->d_inode->i_mutex);
dentry = __lookup_hash(name, parent, nd);
mutex_unlock(&parent->d_inode->i_mutex);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
goto done;
}
__d_lookup: (fs/dcache.c)
在内存中寻找该节点已经建立的dentry结构,内核中有个hash表dentry_hashtable,是一个struct hlist_bl_head结构的指针数组,每一个struct hlist_bl_head结构包含有一个队列头指针struct hlist_bl_node*。
一旦在内存中建立起一个目录节点的dentry结构,就根据其节点名的hash值和父目录的dentry值,调用d_hash获得dentry_hashtable中的一个队列头指针,然后将dentry成员变量dentry->d_hash挂入dentry_hashtable表中的这个队列,需要寻找时会根据hash值查找dentry_hashtable表。
将dentry->d_hash挂入dentry_hashtable的过程在d_rehash函数中完成,后面分析。
这里__d_lookup先调用d_hash从dentry_hashtable表中获得这个队列。遍历这个队列,看看队列中有没有目录项的hash值和要搜索的目录项的hash值相同的,如果有,再比较parent是否是同一个parent,最后调用dentry_cmp进一步比较是否是同一个目录项,如果是,表示找到了,就增加引用计数并将其返回。如果没有找到,返回NULL。
struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
{
unsigned int len = name->len;
unsigned int hash = name->hash;
const unsigned char *str = name->name;
struct hlist_bl_head *b = d_hash(parent, hash);
struct hlist_bl_node *node;
struct dentry *found = NULL;
struct dentry *dentry;
/*
* Note: There is significant duplication with __d_lookup_rcu which is
* required to prevent single threaded performance regressions
* especially on architectures where smp_rmb (in seqcounts) are costly.
* Keep the two functions in sync.
*/
/*
* The hash list is protected using RCU.
*
* Take d_lock when comparing a candidate dentry, to avoid races
* with d_move().
*
* It is possible that concurrent renames can mess up our list
* walk here and result in missing our dentry, resulting in the
* false-negative result. d_lookup() protects against concurrent
* renames using rename_lock seqlock.
*
* See Documentation/filesystems/path-lookup.txt for more details.
*/
rcu_read_lock();
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { //遍历散列
const char *tname;
int tlen;
if (dentry->d_name.hash != hash)
continue;
spin_lock(&dentry->d_lock);
if (dentry->d_parent != parent)
goto next;
if (d_unhashed(dentry))
goto next;
/*
* It is safe to compare names since d_move() cannot
* change the qstr (protected by d_lock).
*/
tlen = dentry->d_name.len;
tname = dentry->d_name.name;
if (parent->d_flags & DCACHE_OP_COMPARE) {
if (parent->d_op->d_compare(parent, parent->d_inode,
dentry, dentry->d_inode,
tlen, tname, name))
goto next;
} else {
if (dentry_cmp(tname, tlen, str, len))
goto next;
}
dentry->d_count++;
found = dentry;
spin_unlock(&dentry->d_lock);
break;
next:
spin_unlock(&dentry->d_lock);
}
rcu_read_unlock();
return found;
}
d_hash为散列函数,它将parent与hash值相结合,映射到dentry_hashtable表中,返回相应的散列链。
static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
unsigned int hash)
{
hash += (unsigned long) parent / L1_CACHE_BYTES;
hash = hash + (hash >> D_HASHBITS);
return dentry_hashtable + (hash & D_HASHMASK);
}
__d_lookup然后遍历刚刚得到的队列上的每个元素。hlist_bl_for_each_entry_rcu实现遍历。
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \
for (pos = hlist_bl_first_rcu(head); \
pos && \
({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
pos = rcu_dereference_raw(pos->next))
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
return (struct hlist_bl_node *)
((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK);
}
struct hlist_bl_head {
struct hlist_bl_node *first;
};
struct hlist_bl_node {
struct hlist_bl_node *next, **pprev;
};
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)
执行到这里,就层层返回到do_lookup,用找到的dentry给path赋值,接着调用follow_managed函数 (fs/namei.c)。
follow_managed处理一些有特殊用途的目录项。比如标记为mount点的目录项,如果目录项是一个mount点,那就切换到另一个文件系统,将path->dentry重新设为新文件系统的根。
最后,do_lookup函数将找到的dentry->d_inode赋值给输出参数inode,返回。
static int follow_managed(struct path *path, unsigned flags)
{
struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
unsigned managed;
bool need_mntput = false;
int ret = 0;
/* Given that we're not holding a lock here, we retain the value in a
* local variable for each dentry as we look at it so that we don't see
* the components of that value change under us */
while (managed = ACCESS_ONCE(path->dentry->d_flags),
managed &= DCACHE_MANAGED_DENTRY,
unlikely(managed != 0)) {
/* Allow the filesystem to manage the transit without i_mutex
* being held. */
if (managed & DCACHE_MANAGE_TRANSIT) {
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
ret = path->dentry->d_op->d_manage(path->dentry, false);
if (ret < 0)
break;
}
/* Transit to a mounted filesystem. */
if (managed & DCACHE_MOUNTED) {
struct vfsmount *mounted = lookup_mnt(path);
if (mounted) {
dput(path->dentry);
if (need_mntput)
mntput(path->mnt);
path->mnt = mounted;
path->dentry = dget(mounted->mnt_root);
need_mntput = true;
continue;
}
/* Something is mounted on this dentry in another
* namespace and/or whatever was mounted there in this
* namespace got unmounted before we managed to get the
* vfsmount_lock */
}
/* Handle an automount point */
if (managed & DCACHE_NEED_AUTOMOUNT) {
ret = follow_automount(path, flags, &need_mntput);
if (ret < 0)
break;
continue;
}
/* We didn't change the current path point */
break;
}
if (need_mntput && path->mnt == mnt)
mntput(path->mnt);
if (ret == -EISDIR)
ret = 0;
return ret < 0 ? ret : need_mntput;
}
返回到walk_component函数,检查inode是不是NULL,如果是则终止搜索。
调用path_to_nameidata函数,path_to_nameidata函数将path的内容转化到nd里面去。将找到的inode节点赋给nd->inode。
static inline void path_to_nameidata(const struct path *path,
struct nameidata *nd)
{
if (!(nd->flags & LOOKUP_RCU)) {
dput(nd->path.dentry);
if (nd->path.mnt != path->mnt)
mntput(nd->path.mnt);
}
nd->path.mnt = path->mnt;
nd->path.dentry = path->dentry;
}
返回到link_path_walk函数,继续循环搜索下一个目录项。若发现是最后一个目录项,就将struct qstr this赋给nd->last, 然后直接返回到path_lookupat函数。
如果link_path_walk返回0,调用complete_walk函数完成搜索 (fs/namei.c)。
static int complete_walk(struct nameidata *nd)
{
struct dentry *dentry = nd->path.dentry;
int status;
if (nd->flags & LOOKUP_RCU) {
nd->flags &= ~LOOKUP_RCU;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
spin_lock(&dentry->d_lock);
if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
spin_unlock(&dentry->d_lock);
rcu_read_unlock();
br_read_unlock(vfsmount_lock);
return -ECHILD;
}
BUG_ON(nd->inode != dentry->d_inode);
spin_unlock(&dentry->d_lock);
mntget(nd->path.mnt);
rcu_read_unlock();
br_read_unlock(vfsmount_lock);
}
if (likely(!(nd->flags & LOOKUP_JUMPED)))
return 0;
if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
return 0;
if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
return 0;
/* Note: we do not d_invalidate() */
status = d_revalidate(dentry, nd);
if (status > 0)
return 0;
if (!status)
status = -ESTALE;
path_put(&nd->path);
return status;
}
层层返回到kern_path_create,调用lookup_hash函数 (fs/namei.c)进行最后一个目录项的搜索。
static struct dentry *lookup_hash(struct nameidata *nd)
{
return __lookup_hash(&nd->last, nd->path.dentry, nd);
}
调用__lookup_hash函数进行搜索,__lookup_hash调用lookup_dcache搜索dcache,若找不到则lookup_real调用对应文件系统的lookup函数去查找。
static struct dentry *__lookup_hash(struct qstr *name,
struct dentry *base, struct nameidata *nd)
{
bool need_lookup;
struct dentry *dentry;
dentry = lookup_dcache(name, base, nd, &need_lookup);
if (!need_lookup)
return dentry;
return lookup_real(base->d_inode, dentry, nd);
}
调用d_lookup搜索。
static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
struct nameidata *nd, bool *need_lookup)
{
struct dentry *dentry;
int error;
*need_lookup = false;
dentry = d_lookup(dir, name);
if (dentry) {
if (d_need_lookup(dentry)) {
*need_lookup = true;
} else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
error = d_revalidate(dentry, nd);
if (unlikely(error <= 0)) {
if (error < 0) {
dput(dentry);
return ERR_PTR(error);
} else if (!d_invalidate(dentry)) {
dput(dentry);
dentry = NULL;
}
}
}
}
if (!dentry) {
dentry = d_alloc(dir, name);
if (unlikely(!dentry))
return ERR_PTR(-ENOMEM);
*need_lookup = true;
}
return dentry;
}
d_lookup进一步调用__d_lookup。__d_lookup上面已经分析过了,就是在内存的dentry_hashtable表中查找dentry结构。这里/dev下的root还没有创建,所以肯定找不到,返回NULL。层层返回到lookup_dcache,如果d_lookup返回的dentry为空,则调用d_alloc分配一个dentry对象。
struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
{
struct dentry *dentry;
unsigned seq;
do {
seq = read_seqbegin(&rename_lock);
dentry = __d_lookup(parent, name);
if (dentry)
break;
} while (read_seqretry(&rename_lock, seq));
return dentry;
}
d_alloc: (fs/dcache.c)
调用__d_alloc分配一个dcache对象并初始化。设置dentry的父目录,并将新的dentry链入其父目录的d_subdirs链表中。
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
struct dentry *dentry = __d_alloc(parent->d_sb, name);
if (!dentry)
return NULL;
spin_lock(&parent->d_lock);
/*
* don't need child lock because it is not subject
* to concurrency here
*/
__dget_dlock(parent);
dentry->d_parent = parent;
list_add(&dentry->d_u.d_child, &parent->d_subdirs);
spin_unlock(&parent->d_lock);
return dentry;
}
__d_alloc: (fs/dcache.c)
分配并初始化一个dcache对象。
struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
struct dentry *dentry;
char *dname;
dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
if (!dentry)
return NULL;
if (name->len > DNAME_INLINE_LEN-1) {
dname = kmalloc(name->len + 1, GFP_KERNEL);
if (!dname) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
} else {
dname = dentry->d_iname;
}
dentry->d_name.name = dname;
dentry->d_name.len = name->len;
dentry->d_name.hash = name->hash;
memcpy(dname, name->name, name->len);
dname[name->len] = 0;
dentry->d_count = 1;
dentry->d_flags = 0;
spin_lock_init(&dentry->d_lock);
seqcount_init(&dentry->d_seq);
dentry->d_inode = NULL;
dentry->d_parent = dentry;
dentry->d_sb = sb;
dentry->d_op = NULL;
dentry->d_fsdata = NULL;
INIT_HLIST_BL_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
INIT_LIST_HEAD(&dentry->d_alias);
INIT_LIST_HEAD(&dentry->d_u.d_child);
d_set_d_op(dentry, dentry->d_sb->s_d_op);
this_cpu_inc(nr_dentry);
printk("__d_alloc: dentry name: %s\n", dentry->d_name.name);
return dentry;
}
lookup_real: (fs/namei.c)
在缓存中无法找到指定的目录项,那就创建该目录项。通过底层文件系统的lookup函数从设备读,lookup在读目录项的过程中,也把该目录项对应的inode结构从磁盘读出来,并赋值给dentry的d_inode字段。
static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
struct dentry *old;
/* Don't create child dentry for a dead directory. */
if (unlikely(IS_DEADDIR(dir))) {
dput(dentry);
return ERR_PTR(-ENOENT);
}
old = dir->i_op->lookup(dir, dentry, nd);
if (unlikely(old)) {
dput(dentry);
dentry = old;
}
return dentry;
}
simple_lookup: (fs/libfs.c)
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
static const struct dentry_operations simple_dentry_operations = {
.d_delete = simple_delete_dentry,
};
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
d_set_d_op(dentry, &simple_dentry_operations);
d_add(dentry, NULL);
return NULL;
}
d_set_d_op: (fs/dcache.c)
用simple_dentry_operations初始化dentry->d_op。
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
WARN_ON_ONCE(dentry->d_op);
WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
DCACHE_OP_COMPARE |
DCACHE_OP_REVALIDATE |
DCACHE_OP_DELETE ));
dentry->d_op = op;
if (!op)
return;
if (op->d_hash)
dentry->d_flags |= DCACHE_OP_HASH;
if (op->d_compare)
dentry->d_flags |= DCACHE_OP_COMPARE;
if (op->d_revalidate)
dentry->d_flags |= DCACHE_OP_REVALIDATE;
if (op->d_delete)
dentry->d_flags |= DCACHE_OP_DELETE;
if (op->d_prune)
dentry->d_flags |= DCACHE_OP_PRUNE;
}
d_add: (include/linux/cache.h)
传入的inode参数为NULL。d_instantiate函数将dentry->d_inode置为NULL。
d_rehash将dentry加入hash表。
static inline void d_add(struct dentry *entry, struct inode *inode)
{
d_instantiate(entry, inode);
d_rehash(entry);
}
void d_rehash(struct dentry * entry)
{
spin_lock(&entry->d_lock);
_d_rehash(entry);
spin_unlock(&entry->d_lock);
}
_d_rehash: (fs/dcache.c)
static void _d_rehash(struct dentry * entry)
{
__d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
}
通过hlist_bl_add_head_rcu将dentry链入dentry_hashtable表。
static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
{
BUG_ON(!d_unhashed(entry));
hlist_bl_lock(b);
entry->d_flags |= DCACHE_RCUACCESS;
hlist_bl_add_head_rcu(&entry->d_hash, b);
hlist_bl_unlock(b);
}
hlist_bl_add_head_rcu: (include/linux/rculist_bl.h)
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
struct hlist_bl_head *h)
{
struct hlist_bl_node *first;
/* don't need hlist_bl_first_rcu because we're under lock */
first = hlist_bl_first(h);
n->next = first;
if (first)
first->pprev = &n->next;
n->pprev = &h->first;
/* need _rcu because we can have concurrent lock free readers */
hlist_bl_set_first_rcu(h, n);
}
lookup_real返回后,向上一直返回到kern_path_create函数,接着kern_path_create函数检查返回的dentry->d_inode是否不空,如果不空表示inode节点已经存在了,不需要创建,本例中/dev下的root节点是不存在的(需要创建),因此将nd.path赋值给path后,直接返回dentry。
返回到sys_mknodat函数,执行mnt_want_write,这个函数告诉文件系统将要执行写操作了,如果不可写,将会返回错误。
如果可写,调用vfs_mknod函数创建inode节点。
vfs_mknod : (fs/namei.c)
这个函数其实就是调用了要创建节点的父目录的inode节点的方法i_op->mknod。
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
int error = may_create(dir, dentry);
if (error)
return error;
if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
!ns_capable(inode_userns(dir), CAP_MKNOD))
return -EPERM;
if (!dir->i_op->mknod)
return -EPERM;
error = devcgroup_inode_mknod(mode, dev);
if (error)
return error;
error = security_inode_mknod(dir, dentry, mode, dev);
if (error)
return error;
error = dir->i_op->mknod(dir, dentry, mode, dev);
if (!error)
fsnotify_create(dir, dentry);
return error;
}
inode节点的方法是创建inode节点时初始化的,这里也就是/dev节点创建时初始化的,dev节点是在rootfs虚拟文件系统下创建的(前面说过rootfs虚拟文件系统是linux kernel真正意义上挂载的第一个根文件系统),它的i_op被赋值为ramfs_dir_inode_operations。
(fs/ramfs/inode.c)
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
.link = simple_link,
.unlink = simple_unlink,
.symlink = ramfs_symlink,
.mkdir = ramfs_mkdir,
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
};
所以i_op->mknod方法就是调用的ramfs_mknod函数。
ramfs_mknod: (fs/ramfs/inode.c)
ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
int error = -ENOSPC;
if (inode) {
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
}
return error;
}
ramfs_mknod函数调用了ramfs_get_inode函数创建一个新的inode。
ramfs_get_inode: (fs/ramfs/inode.c)
struct inode *ramfs_get_inode(struct super_block *sb,
const struct inode *dir, umode_t mode, dev_t dev)
{
struct inode * inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_mapping->a_ops = &ramfs_aops;
inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
break;
case S_IFREG:
inode->i_op = &ramfs_file_inode_operations;
inode->i_fop = &ramfs_file_operations;
break;
case S_IFDIR:
inode->i_op = &ramfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
break;
}
}
return inode;
}
调用new_inode分配一个新inode。
new_inode: (fs/inode.c)
new_inode进一步调用new_inode_pseudo函数新建inode。如果创建成功,则将其加入到该文件系统超级块的inode链表。
struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
spin_lock_prefetch(&inode_sb_list_lock);
inode = new_inode_pseudo(sb);
if (inode)
inode_sb_list_add(inode);
return inode;
}
new_inode_pseudo: (fs/inode.c)
pseudo是“虚拟”的意思,因为当前是在虚拟根文件系统————rootfs文件系统下创建节点,前面提过rootfs文件系统是基于内存的特殊文件系统,它的inode是只存在于内存中的。而其它像ext, fat等文件系统的inode节点都存储在硬盘等外部设备中。
new_inode_pseudo调用alloc_inode创建inode。
struct inode *new_inode_pseudo(struct super_block *sb)
{
struct inode *inode = alloc_inode(sb);
if (inode) {
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
INIT_LIST_HEAD(&inode->i_sb_list);
}
return inode;
}
如果对应文件系统超级块的s_op->alloc_inode存在,则调用它,否则调用kmem_cache_alloc函数分配inode。
static struct inode *alloc_inode(struct super_block *sb)
{
struct inode *inode;
if (sb->s_op->alloc_inode)
inode = sb->s_op->alloc_inode(sb);
else
inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
if (!inode)
return NULL;
if (unlikely(inode_init_always(sb, inode))) {
if (inode->i_sb->s_op->destroy_inode)
inode->i_sb->s_op->destroy_inode(inode);
else
kmem_cache_free(inode_cachep, inode);
return NULL;
}
return inode;
}
在挂载rootfs虚拟根文件系统的时候,对应超级块的s_op已经被设置为ramfs_ops了。可以看到,alloc_inode没有设置,所以调用kmem_cache_alloc函数来分配inode。
static const struct super_operations ramfs_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = generic_show_options,
};
返回到ramfs_get_inode。
从new_inode返回后,进行一些初始化工作(包括inode的节点号),然后进入init_special_inode函数。
init_special_inode: (fs/inode.c)
前面create_dev的时候传进来的参数mode是块设备,所以这里将inode节点操作函数设为def_blk_fops,设备号设为rdev。
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_mode = mode;
if (S_ISCHR(mode)) {
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
} else if (S_ISBLK(mode)) {
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
} else if (S_ISFIFO(mode))
inode->i_fop = &def_fifo_fops;
else if (S_ISSOCK(mode))
inode->i_fop = &bad_sock_fops;
else
printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
" inode %s:%lu\n", mode, inode->i_sb->s_id,
inode->i_ino);
}
至此,inode节点就创建好了,向上层层返回到ramfs_get_inode,ramfs_get_inode就将创建好的inode节点直接返回到ramfs_mknod。
ramfs_mknod进一步调用d_instantiate。
d_instantiate: (fs/dcache.c)
d_instantiate函数封装了__d_instantiate。
void d_instantiate(struct dentry *entry, struct inode * inode)
{
BUG_ON(!list_empty(&entry->d_alias));
if (inode)
spin_lock(&inode->i_lock);
__d_instantiate(entry, inode);
if (inode)
spin_unlock(&inode->i_lock);
security_d_instantiate(entry, inode);
}
__d_instantiate: (fs/dcache.c)
__d_instantiate将inode与dentry关联起来。
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
spin_lock(&dentry->d_lock);
if (inode) {
if (unlikely(IS_AUTOMOUNT(inode)))
dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
list_add(&dentry->d_alias, &inode->i_dentry);
}
dentry->d_inode = inode;
dentry_rcuwalk_barrier(dentry);
spin_unlock(&dentry->d_lock);
fsnotify_d_instantiate(dentry, inode);
}