Linux虚拟文件系统--open()

open()系统调用用来打开一个文件。本文从VFS层的角度,对open()系统调用的执行过程做一个简单的分析。

 

/*
 * open(2) system call entry point.
 *
 * Adds O_LARGEFILE to the flags when force_o_largefile() says so, then
 * delegates the real work to do_sys_open() with AT_FDCWD as the directory
 * descriptor. The value produced by do_sys_open() is returned unchanged.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
	long result;

	if (force_o_largefile())
		flags = flags | O_LARGEFILE;

	result = do_sys_open(AT_FDCWD, filename, flags, mode);

	/* avoid REGPARM breakage on x86: */
	asmlinkage_protect(3, result, filename, flags, mode);

	return result;
}

force_o_largefile()用来判断系统是否为非32位平台(也就是64位平台):如果是,则自动在flags中将O_LARGEFILE置位。open()的主体工作由do_sys_open()来完成:

/*
 * do_sys_open - common implementation behind open(2).
 *
 * @dfd:      directory descriptor forwarded to do_filp_open()
 * @filename: user-space pointer to the path name
 * @flags:    open flags; also used when reserving the descriptor
 * @mode:     creation mode forwarded to do_filp_open()
 *
 * Returns the new file descriptor on success, or a negative error code
 * taken from getname(), get_unused_fd_flags() or do_filp_open().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
	struct file *filp;
	char *kname;
	int fd;

	/* Bring the path name string into kernel space. */
	kname = getname(filename);
	fd = PTR_ERR(kname);
	if (IS_ERR(kname))
		return fd;

	/* Reserve a descriptor number before touching the file itself. */
	fd = get_unused_fd_flags(flags);
	if (fd < 0)
		goto release_name;

	/* The actual open: resolve the path and obtain a struct file. */
	filp = do_filp_open(dfd, kname, flags, mode, 0);
	if (IS_ERR(filp)) {
		/* Give the reserved descriptor back and report the error. */
		put_unused_fd(fd);
		fd = PTR_ERR(filp);
		goto release_name;
	}

	fsnotify_open(filp->f_path.dentry);
	/* Bind the descriptor number to the opened file. */
	fd_install(fd, filp);

release_name:
	putname(kname);
	return fd;
}


open操作是针对某个特定进程进行的,因此会涉及VFS中特定于进程的数据结构,这里简单地介绍一下struct files_struct:

/*
 * Per-task open-file bookkeeping (reached through current->files in
 * alloc_fd()). Holds a pointer to the descriptor table in use plus
 * embedded storage for a small table.
 */
struct files_struct {
  /*
   * read mostly part
   */
	atomic_t count;		/* reference count; presumably the number of tasks sharing this structure */
	struct fdtable *fdt;	/* descriptor table currently in use; replaced by expand_fdtable() */
	struct fdtable fdtab;	/* embedded fdtable instance; presumably what fdt points at initially */
  /*
   * written part on a separate cache line in SMP
   */
	spinlock_t file_lock ____cacheline_aligned_in_smp;	/* held by alloc_fd() while it searches and updates the table */
	int next_fd;		/* where alloc_fd() starts looking for a free descriptor */
	struct embedded_fd_set close_on_exec_init;	/* presumably the initial close-on-exec bitmap */
	struct embedded_fd_set open_fds_init;		/* presumably the initial bitmap of allocated descriptors */
	struct file * fd_array[NR_OPEN_DEFAULT];	/* embedded array of NR_OPEN_DEFAULT struct file pointers */
};

count表示共享该结构的进程数

fdtab是内嵌在files_struct中的一个struct fdtable实例(注意字段名是fdtab),也就是该进程初始使用的文件描述符表

fdt指向进程当前正在使用的fdtable;当文件描述符表被扩充后,fdt会被替换为指向新分配的fdtable

next_fd表示下一次分配文件描述符时搜索空闲位的起始位置(alloc_fd()分配成功后通常会把它更新为刚分配的fd+1),它并不等同于"最大文件描述符号+1"

embedded_fd_set是一个位图结构,用来标记文件描述符,close_on_exec_init用来标记那些执行exec时要关闭的文件的文件描述符,open_fds_init用来标记已经分配出去了的文件描述符

fd_array是内嵌的初始指针数组,共NR_OPEN_DEFAULT项,用来存储进程打开的文件的struct file指针

 

do_sys_open()的一个重要任务就是调用get_unused_fd_flags()为即将打开的文件分配一个文件描述符

/*
 * Reserve an unused file descriptor: calls alloc_fd() with a search
 * floor of 0 and passes the open flags through (alloc_fd() looks at
 * O_CLOEXEC in them).
 */
#define get_unused_fd_flags(flags) alloc_fd(0, (flags))

 

int alloc_fd(unsigned start, unsigned flags)
{
	struct files_struct *files = current->files;//获取当前进程的files_struct
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files);//获取进程的fdtable
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;

	if (fd < fdt->max_fds)
		fd = find_next_zero_bit(fdt->open_fds->fds_bits,
					   fdt->max_fds, fd);//从位图中获取一个空闲位

	error = expand_files(files, fd);//这里根据需要扩充文件描述符数组
	if (error < 0)
		goto out;

	/*
	 * If we needed to expand the fs array we
	 * might have blocked - try again.
	 */
	if (error)//之前进行了扩充操作,重新进行一次空闲bit的搜索
		goto repeat;

	if (start <= files->next_fd)
		files->next_fd = fd + 1;

	FD_SET(fd, fdt->open_fds);//在open_fds的位图上置位
	if (flags & O_CLOEXEC)//如果设定了O_CLOEXEC,则在close_on_exec位图上将相应位置位
		FD_SET(fd, fdt->close_on_exec);
	else
		FD_CLR(fd, fdt->close_on_exec);
	error = fd;
#if 1
	/* Sanity check */
	if (rcu_dereference(fdt->fd[fd]) != NULL) {
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
		rcu_assign_pointer(fdt->fd[fd], NULL);
	}
#endif

out:
	spin_unlock(&files->file_lock);
	return error;
}



 

/*
 * expand_files - make sure descriptor number @nr fits in the table.
 *
 * @files: the files_struct whose descriptor table is checked
 * @nr:    descriptor number that must become addressable
 *
 * Returns 0 when the current table is already large enough, -EMFILE
 * when @nr reaches the RLIMIT_NOFILE soft limit or sysctl_nr_open, and
 * otherwise whatever expand_fdtable() returns.
 */
int expand_files(struct files_struct *files, int nr)
{
	struct fdtable *table = files_fdtable(files);

	/*
	 * N.B. For clone tasks sharing a files structure, this test
	 * will limit the total number of files that can be opened.
	 */
	if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
		return -EMFILE;

	if (nr >= table->max_fds) {
		/* Growth is needed, but never up to or beyond sysctl_nr_open. */
		if (nr >= sysctl_nr_open)
			return -EMFILE;

		/* All checks passed: perform the actual expansion. */
		return expand_fdtable(files, nr);
	}

	/* nr already fits in the current table; nothing to expand. */
	return 0;
}

 

实际的扩充操作:

/*
 * expand_fdtable - replace the descriptor table with a larger one.
 *
 * @files: the files_struct whose table is to grow
 * @nr:    descriptor number the new table must be able to hold
 *
 * Called with files->file_lock held. The lock is dropped around the
 * allocation and re-taken afterwards, so the table may have changed in
 * between.
 *
 * Returns 1 once the lock has been cycled and a large-enough table is in
 * place (installed here or by someone else), -ENOMEM when the allocation
 * fails, and -EMFILE when the freshly allocated table is still too small
 * for @nr.
 */
static int expand_fdtable(struct files_struct *files, int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *fresh;
	struct fdtable *active;

	/* Allocate the new table, sized from nr, with the lock dropped. */
	spin_unlock(&files->file_lock);
	fresh = alloc_fdtable(nr);
	spin_lock(&files->file_lock);

	if (fresh == NULL)
		return -ENOMEM;

	/*
	 * extremely unlikely race - sysctl_nr_open decreased between the check in
	 * caller and alloc_fdtable().  Cheaper to catch it here...
	 */
	if (unlikely(fresh->max_fds <= nr)) {
		free_fdarr(fresh);
		free_fdset(fresh);
		kfree(fresh);
		return -EMFILE;
	}

	/*
	 * Check again since another task may have expanded the fd table while
	 * we dropped the lock
	 */
	active = files_fdtable(files);
	if (nr < active->max_fds) {
		/* Somebody else expanded, so undo our attempt */
		free_fdarr(fresh);
		free_fdset(fresh);
		kfree(fresh);
		return 1;
	}

	/* Continue as planned: carry the old contents over and switch tables. */
	copy_fdtable(fresh, active);
	rcu_assign_pointer(files->fdt, fresh);

	/*
	 * Release the old table only if it held more than NR_OPEN_DEFAULT
	 * entries; presumably a smaller one is the table embedded in
	 * files_struct and must not be freed - TODO confirm.
	 */
	if (active->max_fds > NR_OPEN_DEFAULT)
		free_fdtable(active);

	return 1;
}


到此为止,分配新的fd的工作完成,如果分配fd成功,接下来do_sys_open()就要通过do_filp_open()函数查找文件并执行相应的打开操作

do_filp_open的工作针对两种情况进行:

1.flag中未标识O_CREAT,也就是只进行单纯的搜索打开,如果没有搜索到目标文件的话,不会进行创建,这种情况处理起来比较简单,主要工作就是通过路径解析来查找文件,查找到了的话再根据文件系统定义的open方式进行打开

2.flag中标识了O_CREAT,也就是说如果没找到目标文件要进行创建。这种情况要先查找目标文件的父目录(通过将LOOKUP_PARENT标识置位然后进行路径解析来实现),因为假如没查找到目标文件的话,创建工作需要在父目录下完成;然后再查找最后一个文件分量,也就是目标文件,并进行打开操作,其中涉及到的许多部分在前面几篇文章中也都已经分析过了

/*
 * do_filp_open - resolve @pathname and return an opened struct file.
 *
 * @dfd:       directory descriptor handed to path_lookup_open()/path_init()
 * @pathname:  path name to open
 * @open_flag: open flags as given by the caller; converted with
 *             open_to_namei_flags() into the local 'flag'
 * @mode:      creation mode, stored in nd.intent.open.create_mode and
 *             passed to __open_namei_create()
 * @acc_mode:  access mask to check; when 0 it is derived from 'flag'
 *
 * Without O_CREAT this is a plain lookup followed by the open at 'ok'.
 * With O_CREAT the parent directory is resolved first (LOOKUP_PARENT),
 * then the last component is looked up under the parent's i_mutex and
 * either created or opened. A symlink as last component is handled at
 * 'do_link', which loops back to 'do_last'.
 *
 * Returns the struct file on success or an ERR_PTR() on failure.
 */
struct file *do_filp_open(int dfd, const char *pathname,
		int open_flag, int mode, int acc_mode)
{
	struct file *filp;
	struct nameidata nd;
	int error;
	struct path path;
	struct dentry *dir;
	int count = 0;	/* number of trips through do_link; bounded below */
	int will_write;
	int flag = open_to_namei_flags(open_flag);

	if (!acc_mode)
		acc_mode = MAY_OPEN | ACC_MODE(flag);

	/* O_TRUNC implies we need access checks for write permissions */
	if (flag & O_TRUNC)
		acc_mode |= MAY_WRITE;

	/* Allow the LSM permission hook to distinguish append
	   access from general write access. */
	if (flag & O_APPEND)
		acc_mode |= MAY_APPEND;

	/*
	 * The simplest case - just a plain lookup.
	 */

	/* Without O_CREAT nothing is ever created: open purely by lookup. */
	if (!(flag & O_CREAT)) {
		error = path_lookup_open(dfd, pathname, lookup_flags(flag),
					 &nd, flag);
		if (error)
			return ERR_PTR(error);
		goto ok;  // lookup succeeded: continue with the common open path at 'ok'
	}

	/*
	 * Create - we need to know the parent.
	 */
	 /* Creation happens in the parent directory, so resolve with LOOKUP_PARENT. */
	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
	if (error)
		return ERR_PTR(error);
	/* Walk the path name; the result (the parent, given LOOKUP_PARENT) ends up in nd. */
	error = path_walk(pathname, &nd);
	if (error) {
		if (nd.root.mnt)
			path_put(&nd.root);
		return ERR_PTR(error);
	}
	if (unlikely(!audit_dummy_context()))
		audit_inode(pathname, nd.path.dentry);

	/*
	 * We have the parent and last component. First of all, check
	 * that we are not asked to creat(2) an obvious directory - that
	 * will not do.
	 */
	error = -EISDIR;

	/*
	 * The last component must be of type LAST_NORM and the byte right
	 * after it must be NUL (presumably a non-NUL byte there means a
	 * trailing slash, i.e. an obvious directory).
	 */
	if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
		goto exit_parent;

	error = -ENFILE;
	filp = get_empty_filp();// obtain an empty struct file
	if (filp == NULL)
		goto exit_parent;
	/* Record the open request (file, flags, create mode) in nd.intent. */
	nd.intent.open.file = filp;
	nd.intent.open.flags = flag;
	nd.intent.open.create_mode = mode;
	dir = nd.path.dentry;// the parent directory
	nd.flags &= ~LOOKUP_PARENT;// clear LOOKUP_PARENT
	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;// set the CREATE and OPEN lookup flags
	if (flag & O_EXCL)
		nd.flags |= LOOKUP_EXCL;
	mutex_lock(&dir->d_inode->i_mutex);

	// Look up the last component in the parent, with the parent's i_mutex held.
	// NOTE(review): lookup_hash() presumably tries the dentry cache before the
	// filesystem's own lookup method - not visible here.
	path.dentry = lookup_hash(&nd);
	path.mnt = nd.path.mnt;

do_last:
	error = PTR_ERR(path.dentry);// did the lookup return an error pointer?
	if (IS_ERR(path.dentry)) {
		mutex_unlock(&dir->d_inode->i_mutex);
		goto exit;
	}

	if (IS_ERR(nd.intent.open.file)) {// the intent's file may have been replaced by an error pointer
		error = PTR_ERR(nd.intent.open.file);
		goto exit_mutex_unlock;
	}

	/* Negative dentry, just create the file */
	if (!path.dentry->d_inode) {// no inode behind the dentry: the file does not exist yet
		/*
		 * This write is needed to ensure that a
		 * ro->rw transition does not occur between
		 * the time when the file is created and when
		 * a permanent write count is taken through
		 * the 'struct file' in nameidata_to_filp().
		 */
		error = mnt_want_write(nd.path.mnt);
		if (error)
			goto exit_mutex_unlock;
		/*
		 * Create the file. NOTE(review): presumably this ends up in the
		 * create method of the parent directory's filesystem, and it
		 * presumably also drops i_mutex, since this function does not
		 * unlock it on either outcome below - confirm.
		 */
		error = __open_namei_create(&nd, &path, flag, mode);
		if (error) {
			mnt_drop_write(nd.path.mnt);
			goto exit;
		}
		/* Turn the nameidata into the opened struct file. */
		filp = nameidata_to_filp(&nd, open_flag);
		if (IS_ERR(filp))
			ima_counts_put(&nd.path,
				       acc_mode & (MAY_READ | MAY_WRITE |
						   MAY_EXEC));
		mnt_drop_write(nd.path.mnt);
		if (nd.root.mnt)
			path_put(&nd.root);
		return filp;
	}

	/*
	 * From here on the target file already exists.
	 */
	mutex_unlock(&dir->d_inode->i_mutex);
	audit_inode(pathname, path.dentry);

	error = -EEXIST;
	if (flag & O_EXCL)
		goto exit_dput;

	/* A few checks on the existing target follow. */
	if (__follow_mount(&path)) {// non-zero presumably means a mount point was crossed
		error = -ELOOP;
		if (flag & O_NOFOLLOW)
			goto exit_dput;
	}

	error = -ENOENT;
	if (!path.dentry->d_inode)// the (possibly updated) dentry must have an inode
		goto exit_dput;
	if (path.dentry->d_inode->i_op->follow_link)// inode has a follow_link method: handle as a link
		goto do_link;

	/* Checks passed: move path into nd. */
	path_to_nameidata(&path, &nd);
	error = -EISDIR;
	if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
		goto exit;
ok:
	/*
	 * Consider:
	 * 1. may_open() truncates a file
	 * 2. a rw->ro mount transition occurs
	 * 3. nameidata_to_filp() fails due to
	 *    the ro mount.
	 * That would be inconsistent, and should
	 * be avoided. Taking this mnt write here
	 * ensures that (2) can not occur.
	 */
	will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
	if (will_write) {
		error = mnt_want_write(nd.path.mnt);
		if (error)
			goto exit;
	}
	/* Access checks for the requested mode and flags. */
	error = may_open(&nd.path, acc_mode, flag);
	if (error) {
		if (will_write)
			mnt_drop_write(nd.path.mnt);
		goto exit;
	}
	// Turn the nameidata into the opened struct file.
	filp = nameidata_to_filp(&nd, open_flag);
	if (IS_ERR(filp))
		ima_counts_put(&nd.path,
			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
	/*
	 * It is now safe to drop the mnt write
	 * because the filp has had a write taken
	 * on its behalf.
	 */
	if (will_write)
		mnt_drop_write(nd.path.mnt);
	if (nd.root.mnt)
		path_put(&nd.root);
	return filp;

	/* Error exits: each label undoes one more step, then falls through. */
exit_mutex_unlock:
	mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
	path_put_conditional(&path, &nd);
exit:
	if (!IS_ERR(nd.intent.open.file))
		release_open_intent(&nd);
exit_parent:
	if (nd.root.mnt)
		path_put(&nd.root);
	path_put(&nd.path);
	return ERR_PTR(error);

do_link:// the last component is a link
	error = -ELOOP;
	if (flag & O_NOFOLLOW)
		goto exit_dput;
	/*
	 * This is subtle. Instead of calling do_follow_link() we do the
	 * thing by hands. The reason is that this way we have zero link_count
	 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
	 * After that we have the parent and last component, i.e.
	 * we are in the same situation as after the first path_walk().
	 * Well, almost - if the last component is normal we get its copy
	 * stored in nd->last.name and we will have to putname() it when we
	 * are done. Procfs-like symlinks just set LAST_BIND.
	 */
	nd.flags |= LOOKUP_PARENT;
	error = security_inode_follow_link(path.dentry, &nd);
	if (error)
		goto exit_dput;
	error = __do_follow_link(&path, &nd);
	if (error) {
		/* Does someone understand code flow here? Or it is only
		 * me so stupid? Anathema to whoever designed this non-sense
		 * with "intent.open".
		 */
		release_open_intent(&nd);
		if (nd.root.mnt)
			path_put(&nd.root);
		return ERR_PTR(error);
	}
	nd.flags &= ~LOOKUP_PARENT;
	if (nd.last_type == LAST_BIND)
		goto ok;
	error = -EISDIR;
	if (nd.last_type != LAST_NORM)
		goto exit;
	/* Same check as before: the byte after the last component must be NUL. */
	if (nd.last.name[nd.last.len]) {
		__putname(nd.last.name);
		goto exit;
	}
	error = -ELOOP;
	/* Give up with -ELOOP once count has reached 32 trips through do_link. */
	if (count++==32) {
		__putname(nd.last.name);
		goto exit;
	}
	/* Look up the link target's last component in its parent, as above. */
	dir = nd.path.dentry;
	mutex_lock(&dir->d_inode->i_mutex);
	path.dentry = lookup_hash(&nd);
	path.mnt = nd.path.mnt;
	__putname(nd.last.name);
	goto do_last;
}


 


 

你可能感兴趣的:(Linux文件系统)