在linux下,假设我们想打开文件/dev/tty,我们可以使用系统调用open,比如:
// Open /dev/tty with O_RDWR; 0 is passed as the mode argument.
int fd = open("/dev/tty", O_RDWR, 0);
本文将从源码角度看下,在linux内核中,open方法是如何打开文件的。
首先看下入口函数。
// fs/open.c
/*
 * Entry point of the open system call. It forwards the user-supplied
 * filename, flags and mode to do_sys_open(), passing AT_FDCWD as the
 * directory file descriptor.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
...
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
该方法调用了do_sys_open方法。
// fs/open.c
/*
 * Core of open(): fills in a struct open_flags, obtains a struct filename
 * for the user-space path, reserves a descriptor number, opens the file via
 * do_filp_open() and, on success, installs the file under that descriptor.
 * Returns fd; the error-handling paths are elided in this excerpt.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_flags op;
/* fd initially holds the return value of build_open_flags() */
int fd = build_open_flags(flags, mode, &op);
struct filename *tmp;
...
/* build the kernel-side struct filename from the user pointer */
tmp = getname(filename);
...
/* from here on fd is the reserved descriptor number (or a negative value) */
fd = get_unused_fd_flags(flags);
if (fd >= 0) {
/* do_filp_open() yields either a struct file pointer or an error pointer */
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
...
} else {
...
/* bind the descriptor number to the opened struct file */
fd_install(fd, f);
}
}
...
return fd;
}
该方法大致操作为:
1. 调用build_open_flags方法,初始化struct open_flags实例op。
// fs/internal.h
/*
 * Open parameters filled in by build_open_flags() and passed (as a const
 * pointer) down through do_filp_open(), path_openat() and do_last().
 */
struct open_flags {
int open_flag; /* open flags, assigned from the caller's flags */
umode_t mode; /* non-zero only when O_CREAT or __O_TMPFILE is requested */
int acc_mode; /* result of ACC_MODE(flags) */
int intent; /* 0 when O_PATH is set, otherwise LOOKUP_OPEN */
int lookup_flags; /* LOOKUP_* bits, e.g. LOOKUP_DIRECTORY for O_DIRECTORY */
};
// fs/open.c
/*
 * Translate the open() arguments (flags, mode) into the fields of *op.
 * Returns 0 on the path shown; other paths are elided in this excerpt.
 */
static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
{
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
...
/* mode only matters when a file may be created; otherwise it is zeroed */
if (flags & (O_CREAT | __O_TMPFILE))
op->mode = (mode & S_IALLUGO) | S_IFREG;
else
op->mode = 0;
...
op->open_flag = flags;
...
op->acc_mode = acc_mode;
/* O_PATH opens carry no open intent; everything else uses LOOKUP_OPEN */
op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
...
/* O_DIRECTORY is expressed to the path walk as LOOKUP_DIRECTORY */
if (flags & O_DIRECTORY)
lookup_flags |= LOOKUP_DIRECTORY;
...
op->lookup_flags = lookup_flags;
return 0;
}
2. 调用getname方法,分配并初始化struct filename实例tmp。
// include/linux/fs.h
/*
 * Kernel-side representation of a path name that was passed in from user
 * space; allocated and filled in by getname_flags().
 */
struct filename {
const char *name; /* pointer to actual string */
const __user char *uptr; /* original userland pointer */
int refcnt; /* set to 1 by getname_flags() */
struct audit_names *aname; /* not touched by the code shown; presumably audit bookkeeping */
const char iname[]; /* embedded buffer; getname_flags() points name at it */
};
// fs/namei.c
/*
 * Allocate a struct filename and copy the user-space path string into its
 * embedded buffer. The uses of 'flags' and 'empty' are elided in this
 * excerpt. Returns the new struct filename on the path shown.
 */
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
struct filename *result;
char *kname;
...
result = __getname(); // allocate memory
...
/* the string is stored in the buffer embedded at the end of the struct */
kname = (char *)result->iname;
result->name = kname;
/* copy at most EMBEDDED_NAME_MAX bytes of the path from user space */
len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
...
result->refcnt = 1;
...
/* remember the original user-space pointer */
result->uptr = filename;
...
return result;
}
/*
 * Convenience wrapper: getname_flags() with flags 0 and a NULL 'empty'
 * pointer.
 */
struct filename *
getname(const char __user * filename)
{
return getname_flags(filename, 0, NULL);
}
3. 调用get_unused_fd_flags方法获取一个未被使用的文件描述符fd。
4. 调用do_filp_open方法,继续执行open操作,并将返回值赋值给类型为struct file的实例指针f。
5. 如果do_filp_open成功,则调用fd_install方法,建立从fd到struct file的对应关系。
6. 返回fd给用户。
我们再继续看下do_filp_open方法。
// fs/namei.c
/*
 * Set up a struct nameidata for this lookup and hand the open over to
 * path_openat(). Returns whatever struct file pointer (or error pointer)
 * results; the handling between the call and the return is elided here.
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op)
{
struct nameidata nd;
int flags = op->lookup_flags;
struct file *filp;
/* record dfd and pathname in nd and make nd the task's current nameidata */
set_nameidata(&nd, dfd, pathname);
/* LOOKUP_RCU is added to the lookup flags for this call */
filp = path_openat(&nd, op, flags | LOOKUP_RCU);
...
return filp;
}
该方法先调用set_nameidata方法,初始化struct nameidata类型实例nd。
// fs/namei.c
/*
 * State carried through a path walk. It is initialised by set_nameidata()
 * and path_init(), and updated by link_path_walk() and do_last().
 */
struct nameidata {
struct path path; /* current position; nd_jump_root() sets it to root */
struct qstr last; /* most recent component, set by link_path_walk() */
struct path root; /* filled in from current->fs->root by set_root() */
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags; /* LOOKUP_* flags, set in path_init() */
unsigned seq, m_seq;
int last_type; /* type of 'last', e.g. LAST_NORM */
unsigned depth; /* reset to 0 in path_init() */
int total_link_count; /* inherited from the previous nameidata, if any */
struct saved {
struct path link;
struct delayed_call done;
const char *name;
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS]; /* stack initially points at internal */
struct filename *name; /* the path being looked up */
struct nameidata *saved; /* previous current->nameidata */
struct inode *link_inode;
unsigned root_seq;
int dfd; /* directory fd the lookup is relative to */
} __randomize_layout;
/*
 * Initialise *p for a new lookup of 'name' relative to 'dfd' and install it
 * as current->nameidata, remembering the previous one in p->saved.
 */
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
struct nameidata *old = current->nameidata;
/* start with the embedded array as the stack */
p->stack = p->internal;
p->dfd = dfd;
p->name = name;
/* carry the link count over from an enclosing lookup, otherwise start at 0 */
p->total_link_count = old ? old->total_link_count : 0;
p->saved = old;
current->nameidata = p;
}
再调用path_openat方法继续执行open操作。
// fs/namei.c
/*
 * Allocate an empty struct file and drive the path walk that opens it.
 * Returns the file when no error occurred and FMODE_OPENED is set on it;
 * the final return shown yields ERR_PTR(error).
 */
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
struct file *file;
int error;
file = alloc_empty_file(op->open_flag, current_cred());
...
if (unlikely(file->f_flags & __O_TMPFILE)) {
...
} else {
/* s is the path string to walk, as returned by path_init() */
const char *s = path_init(nd, flags);
/*
 * The loop body runs only while link_path_walk() returns 0 and
 * do_last() returns a positive value; otherwise the loop ends with
 * the result left in 'error'.
 */
while (!(error = link_path_walk(s, nd)) &&
(error = do_last(nd, file, op)) > 0) {
...
}
...
}
if (likely(!error)) {
if (likely(file->f_mode & FMODE_OPENED))
return file;
...
}
...
return ERR_PTR(error);
}
该方法中,先调用alloc_empty_file方法,分配一个空的struct file实例,再调用path_init、link_path_walk、do_last等方法执行后续的open操作,如果都成功了,返回file给上层。
先看下path_init方法。
// fs/namei.c
/*
 * Prepare nd for a walk of nd->name->name. For a path that begins with '/',
 * the walk starts from the root, and the path string is returned on success
 * or ERR_PTR(-ECHILD) if nd_jump_root() fails. Other cases are elided.
 */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
const char *s = nd->name->name;
...
nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
nd->depth = 0;
...
/* clear root and the current position before choosing a start point */
nd->root.mnt = NULL;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
...
if (*s == '/') {
/* absolute path: load the root, then move the current position onto it */
set_root(nd);
if (likely(!nd_jump_root(nd)))
return s;
return ERR_PTR(-ECHILD);
}
...
}
假设我们要open的路径为/dev/tty,该方法在进行一些初始化赋值之后,会调用set_root方法,设置nd->root字段为fs->root(fs即current->fs),即当前进程的根目录。
// fs/namei.c
/*
 * Load nd->root from the current task's fs_struct. Only the LOOKUP_RCU
 * branch is shown in this excerpt.
 */
static void set_root(struct nameidata *nd)
{
struct fs_struct *fs = current->fs;
if (nd->flags & LOOKUP_RCU) {
...
/* repeat the copy for as long as read_seqcount_retry() on fs->seq asks for a retry */
do {
...
nd->root = fs->root;
...
} while (read_seqcount_retry(&fs->seq, seq));
} else {
...
}
}
再调用nd_jump_root方法,设置nd->path字段为nd->root,nd->inode字段为nd->root.dentry->d_inode。
// fs/namei.c
/*
 * Move the current walk position to the root: nd->path becomes nd->root and
 * nd->inode becomes that dentry's inode. Only the LOOKUP_RCU branch is
 * shown. Returns 0 on the path shown.
 */
static int nd_jump_root(struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
struct dentry *d;
nd->path = nd->root;
d = nd->path.dentry;
nd->inode = d->d_inode;
...
} else {
...
}
/* record that the walk position has jumped */
nd->flags |= LOOKUP_JUMPED;
return 0;
}
如果上述方法都没有问题,最后返回s给上层,至此,path_init方法结束。
由上可见,path_init方法主要是用来初始化struct nameidata实例中的path、root、inode等字段。
我们再来看下link_path_walk方法。
// fs/namei.c
/*
 * Walk every component of 'name' except the last one. Each component is
 * recorded in nd->last; if more components follow, walk_component() is
 * called for it. When the final component is reached it is left in nd->last
 * and, with nd->depth == 0, the function returns 0.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
...
/* skip leading slashes */
while (*name=='/')
name++;
...
/* At this point we know we have a real path component. */
for(;;) {
u64 hash_len;
int type;
...
/* hash_len packs the component's hash and length (see hashlen_len() below) */
hash_len = hash_name(nd->path.dentry, name);
type = LAST_NORM;
...
/* remember this component as the most recent one */
nd->last.hash_len = hash_len;
nd->last.name = name;
nd->last_type = type;
/* advance past the component just recorded */
name += hashlen_len(hash_len);
if (!*name)
goto OK;
/* skip the separator and any repeated slashes after it */
do {
name++;
} while (unlikely(*name == '/'));
if (unlikely(!*name)) {
OK:
/* pathname body, done */
if (!nd->depth)
return 0;
...
} else {
/* not the last component */
err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
}
...
}
}
该方法的大致操作为:
1. 跳过开始的‘/’字符。
2. 调用hash_name方法,获取下一个path component的hash和len,并赋值给hash_len。
path component就是以‘/’字符分割的路径的各个部分。
3. 将该path component的信息赋值给nd->last字段。
4. 修改name的值,使其指向path的下一个component。
5. 如果下一个component为空,则goto到OK这个label,执行一些操作之后,最后return 0给上层。
6. 如果下一个component不为空,则执行walk_component方法,找到nd->last字段指向的component对应的dentry、inode等信息,并更新nd->path、nd->inode等字段,使其指向新的路径。
以open /dev/tty为例,该方法最终的结果是,更新struct nameidata实例指针nd中的path、inode字段,使其指向路径/dev/,更新nd中的last值,使其为tty。
最后,再来看下do_last方法。
// fs/namei.c
/*
 * Handle the final path component. On the non-O_CREAT path shown, the
 * component is looked up with lookup_fast(), the walk steps into it, and
 * vfs_open() is called on the resulting nd->path. Returns 'error'.
 */
static int do_last(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
...
if (!(open_flag & O_CREAT)) {
...
/* a positive return value sends control straight to finish_lookup */
error = lookup_fast(nd, &path, &inode, &seq);
if (likely(error > 0))
goto finish_lookup;
...
} else {
...
}
...
finish_lookup:
/* step into the component that was just looked up */
error = step_into(nd, &path, 0, inode, seq);
...
/* perform the actual open on the final path */
error = vfs_open(&nd->path, file);
...
return error;
}
该方法中,先调用lookup_fast,找路径中的最后一个component,如果成功,就会跳到finish_lookup对应的label,然后执行step_into方法,更新nd中的path、inode等信息,使其指向目标路径。
之后,调用vfs_open方法,继续执行open操作。
最后,返回error给上层,如果成功,error为0。
我们继续看下vfs_open方法。
// fs/open.c
/*
 * Record the path in the file and open it through do_dentry_open(), using
 * the inode returned by d_backing_inode() for the path's dentry. No explicit
 * open callback is supplied (NULL). Returns do_dentry_open()'s result.
 */
int vfs_open(const struct path *path, struct file *file)
{
file->f_path = *path;
return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
}
该方法又调用了do_dentry_open方法。
// fs/open.c
/*
 * Attach 'inode' and its file operations to 'f', run the open callback and
 * mark the file as opened. If 'open' is NULL, the callback is taken from the
 * file operations; if that is NULL too, no callback is invoked. Returns 0 on
 * the path shown; the error paths are elided in this excerpt.
 */
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
{
...
f->f_inode = inode;
...
/* the file's operations come from the inode */
f->f_op = fops_get(inode->i_fop);
...
/* no explicit callback given: fall back to the one in f_op */
if (!open)
open = f->f_op->open;
if (open) {
error = open(inode, f);
...
}
f->f_mode |= FMODE_OPENED;
...
return 0;
...
}
该方法中,设置f->f_op的值为inode->i_fop,由于参数open为NULL,所以open也被重新赋值为f->f_op->open,即inode->i_fop->open,之后再调用该open方法,继续执行open逻辑。
那inode->i_fop的值又是在哪里设置的呢?
// fs/inode.c
/*
 * Set up an inode for a special file according to 'mode': character and
 * block devices get their default file operations and the device number
 * 'rdev'; FIFOs get pipefifo_fops; sockets are left untouched; any other
 * mode is reported with a debug message.
 */
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_mode = mode;
if (S_ISCHR(mode)) {
/* character device */
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
} else if (S_ISBLK(mode)) {
/* block device */
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
} else if (S_ISFIFO(mode))
inode->i_fop = &pipefifo_fops;
else if (S_ISSOCK(mode))
; /* leave it no_open_fops */
else
printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
" inode %s:%lu\n", mode, inode->i_sb->s_id,
inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);
由上可见,是在init_special_inode方法里设置的。
由于/dev/tty是character device,所以i_fop的值为def_chr_fops。
// fs/char_dev.c
/*
 * Default file operations that init_special_inode() installs on character
 * device inodes. Opening goes through chrdev_open().
 */
const struct file_operations def_chr_fops = {
.open = chrdev_open,
.llseek = noop_llseek,
};
它对应的open方法为chrdev_open。
// fs/char_dev.c
/*
 * Generic open for character devices. It finds the cdev for the inode
 * (looking it up in cdev_map by device number when the inode has none
 * cached), replaces the file's operations with the cdev's, and calls the
 * driver's own open callback if it has one. Returns 0 on the path shown;
 * the error paths are elided in this excerpt.
 */
static int chrdev_open(struct inode *inode, struct file *filp)
{
const struct file_operations *fops;
struct cdev *p;
...
p = inode->i_cdev;
if (!p) {
/* no cdev cached on the inode yet: look it up by i_rdev */
struct kobject *kobj;
...
kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
...
/* the kobject is embedded in the cdev; recover the enclosing cdev */
new = container_of(kobj, struct cdev, kobj);
...
/* Check i_cdev again in case somebody beat us to it while
we dropped the lock. */
p = inode->i_cdev;
if (!p) {
inode->i_cdev = p = new;
...
} ...
}
...
fops = fops_get(p->ops);
...
/* from here on the file uses the driver's file operations */
replace_fops(filp, fops);
if (filp->f_op->open) {
ret = filp->f_op->open(inode, filp);
...
}
return 0;
...
}
该方法先检查inode->i_cdev,如果为空,则调用kobj_lookup方法,在cdev_map中根据inode->i_rdev找对应的cdev,找到之后把结果赋值给inode->i_cdev和p。之后获取p->ops的值,赋值给fops,再之后替换filp->f_op字段的值为fops,最后检查filp->f_op的值中是否包含open方法,如果有,则调用该方法继续执行open逻辑。
我们先看下/dev/tty对应的cdev是在哪把自己注册到cdev_map里的。
// drivers/tty/tty_io.c
/*
 * Initialisation excerpt: sets up tty_cdev with tty_fops and registers it
 * under device number MKDEV(TTYAUX_MAJOR, 0). Panics if cdev_add() returns
 * non-zero or register_chrdev_region() returns a negative value.
 * Returns 0 on the path shown.
 */
int __init tty_init(void)
{
...
/* bind tty_fops to tty_cdev */
cdev_init(&tty_cdev, &tty_fops);
if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) ||
register_chrdev_region(MKDEV(TTYAUX_MAJOR, 0), 1, "/dev/tty") < 0)
panic("Couldn't register /dev/tty driver\n");
...
return 0;
}
该方法先调用cdev_init,初始化tty_cdev,并将其ops字段设置为tty_fops,然后调用cdev_add方法把这个cdev注册到cdev_map,并调用register_chrdev_region方法登记其设备号区间。
由上可知,/dev/tty对应的cdev就是tty_cdev,而cdev->ops就是tty_fops。
// drivers/tty/tty_io.c
/*
 * File operations bound to tty_cdev by cdev_init() in tty_init().
 * chrdev_open() installs them on the file and then calls .open, i.e.
 * tty_open().
 */
static const struct file_operations tty_fops = {
.llseek = no_llseek,
.read = tty_read,
.write = tty_write,
.poll = tty_poll,
.unlocked_ioctl = tty_ioctl,
.compat_ioctl = tty_compat_ioctl,
.open = tty_open,
.release = tty_release,
.fasync = tty_fasync,
.show_fdinfo = tty_show_fdinfo,
};
由上可见,cdev->ops->open对应的方法就是tty_open,即/dev/tty的最终open逻辑。
由于此部分逻辑和open系统调用关联不是很大,在此略过。
至此,整个open逻辑就已分析完毕。