Linux read系统调用

1 read系统调用流程

本文内核版本:4.1.15
文件系统:ext3

read()
	vfs_read()
		rw_verify_area()
		__vfs_read()
			new_sync_read()
				generic_file_read_iter()	
					do_generic_file_read()

2 调用函数分析

2.1 read()

系统调用 read() :SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)

/*
 * read(2) system call entry point.
 *
 * Looks up the file behind @fd, reads up to @count bytes into the user
 * buffer @buf via vfs_read() starting at the file's current position, and
 * stores the resulting position back into the file when the read did not fail.
 *
 * Returns whatever vfs_read() returns, or -EBADF when @fd yields no file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);						// resolve the descriptor number to a struct fd (f.file is NULL on failure)
	ssize_t ret = -EBADF;								// returned unchanged when f.file is NULL

	if (f.file) {
		loff_t pos = file_pos_read(f.file);				// work on a local copy of the file's current position
		ret = vfs_read(f.file, buf, count, &pos);		// pos is passed by pointer so the callee can advance it
		if (ret >= 0)
			file_pos_write(f.file, pos);				// commit the new position only when the read did not fail
		fdput_pos(f);									// counterpart of fdget_pos(); NOTE(review): presumably drops what fdget_pos() acquired - confirm in fs/file.c
	}
	return ret;
}

每个进程都维护一张文件描述符表(一个 struct file * 数组),记录该进程打开的文件;fd 就是这张表的下标。fdget_pos() 根据 fd 查到对应的 struct file,并把它封装在 struct fd 中返回。

2.2 vfs_read()

/*
 * Validate a read request and hand it to __vfs_read().
 *
 * @file:  open file to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested
 * @pos:   in/out file position
 *
 * Returns the value of __vfs_read() on success, -EBADF / -EINVAL when the
 * file mode lacks FMODE_READ / FMODE_CAN_READ, -EFAULT when access_ok()
 * rejects the buffer, or the negative value returned by rw_verify_area().
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))				// file mode must carry FMODE_READ
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))			// file mode must also carry FMODE_CAN_READ
		return -EINVAL;
	/* VERIFY_WRITE: a read stores data INTO the user buffer, so buf is checked as a write target */
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);		// negative = error; otherwise the byte count to use
	if (ret >= 0) {
		count = ret;	/* the verified count replaces the caller's request */
		ret = __vfs_read(file, buf, count, pos);		// dispatch to the file's read / read_iter method
		if (ret > 0) {
			/* only when bytes were actually read; presumably notification + I/O accounting hooks */
			fsnotify_access(file);
			add_rchar(current, ret);
		}
		inc_syscr(current);	/* runs for every request that passed rw_verify_area(), even if the read failed */
	}

	return ret;
}

2.3 __vfs_read()

/*
 * Dispatch a read to the method registered in file->f_op.
 *
 * ->read takes precedence when present; otherwise ->read_iter is used
 * through new_sync_read(). Returns the method's result, or -EINVAL when
 * the file provides neither method.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
		   loff_t *pos)
{
	if (file->f_op->read)
		return file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		return new_sync_read(file, buf, count, pos);	/* wraps buf/count for the iterator-based method */
	else
		return -EINVAL;
}

根据文件的注册 file_operations 调用相应的 read 方法:

  • 注册的 file_operations 有 read 方法,调用 read 方法,一般的设备文件会注册此类接口
  • 若 file_operations 有 read_iter 方法,调用 new_sync_read()。一般普通文件注册此类方法。

下面分别是 tty 设备与 ext3 文件系统注册的 file_operations:

/*
 * file_operations table for tty files.
 * It sets .read and leaves .read_iter unset, so __vfs_read() takes its
 * first branch and calls tty_read() directly.
 */
static const struct file_operations tty_fops = {
	.llseek		= no_llseek,
	.read		= tty_read,
	.write		= tty_write,
	.poll		= tty_poll,
	.unlocked_ioctl	= tty_ioctl,
	.compat_ioctl	= tty_compat_ioctl,
	.open		= tty_open,
	.release	= tty_release,
	.fasync		= tty_fasync,
};

/*
 * file_operations table for ext3 files.
 * It sets .read_iter (generic_file_read_iter) and leaves .read unset, so
 * __vfs_read() reaches it through new_sync_read().
 */
const struct file_operations ext3_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.unlocked_ioctl	= ext3_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext3_compat_ioctl,
#endif
	.mmap		= generic_file_mmap,
	.open		= dquot_file_open,
	.release	= ext3_release_file,
	.fsync		= ext3_sync_file,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
};

2.4 new_sync_read()

/*
 * Adapt a plain (buf, len, *ppos) read to the iterator-based ->read_iter
 * method.
 *
 * Builds a single-element iovec around the user buffer, a kiocb carrying
 * the starting position, and an iov_iter over the iovec; calls
 * filp->f_op->read_iter(); then copies the kiocb's position back to *ppos.
 * Returns the value of ->read_iter().
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };	/* one segment: the whole user buffer */
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;	/* start at the caller's position */
	iov_iter_init(&iter, READ, &iov, 1, len);	/* 1 segment, len bytes in total */

	ret = filp->f_op->read_iter(&kiocb, &iter);			// for ext3 this is generic_file_read_iter (see ext3_file_operations)
	/* -EIOCBQUEUED is treated as a bug here; presumably because this synchronous wrapper never waits for queued I/O */
	BUG_ON(ret == -EIOCBQUEUED);
	*ppos = kiocb.ki_pos;	/* hand the (possibly advanced) position back to the caller */
	return ret;
}

这里借用了散布读(scatter read)和聚集写(gather write)所使用的 iovec 结构,只是此处的 iovec 数组只有一个元素,即用户缓冲区 buf。具体参见《UNIX 环境高级编程》14.6 节。

2.5 generic_file_read_iter()

/*
 * ->read_iter implementation used by ext3 (see ext3_file_operations).
 *
 * @iocb: carries the file (ki_filp), the position (ki_pos) and flags
 * @iter: destination iterator; its remaining count is the request size
 *
 * When IOCB_DIRECT is set, the read is first attempted through the
 * mapping's ->direct_IO method; on error, completion, EOF or a DAX inode
 * the function returns right away. In every other case the (remaining)
 * request is served by do_generic_file_read().
 *
 * Returns the direct-I/O result on the early-return paths, otherwise the
 * value of do_generic_file_read().
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;	/* alias: writes through ppos update the kiocb's position */
	loff_t pos = *ppos;		/* snapshot of the starting position */

	if (iocb->ki_flags & IOCB_DIRECT) {		// direct I/O requested: try the mapping's ->direct_IO method first
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		size_t count = iov_iter_count(iter);
		loff_t size;

		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		/* NOTE(review): presumably writes back cached data in [pos, pos+count-1] before the direct read - confirm in mm/filemap.c */
		retval = filemap_write_and_wait_range(mapping, pos,
					pos + count - 1);
		if (!retval) {
			/* ->direct_IO works on a copy; iter itself is advanced below by the actual byte count */
			struct iov_iter data = *iter;
			retval = mapping->a_ops->direct_IO(iocb, &data, pos);
		}

		if (retval > 0) {
			*ppos = pos + retval;
			iov_iter_advance(iter, retval);
		}

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
		    IS_DAX(inode)) {
			file_accessed(file);
			goto out;
		}
	}

	/* retval is 0 on the non-direct path, or the byte count already read by direct I/O on fall-through */
	retval = do_generic_file_read(file, ppos, iter, retval);
out:
	return retval;
}

该函数分为两个部分:

  • 若是在直接 I/O 模式下打开,任何读写操作都将数据在用户态地址与磁盘间直接传送而不通过页高速缓存。这种情况暂时不分析。
  • 经过高速缓存读取文件,调用 do_generic_file_read()

do_generic_file_read() 函数是读文件的核心,同时函数也比较长,我们下篇文章再来分析。

你可能感兴趣的:(linux)