本文内核版本:4.1.15
文件系统:ext3
read()
vfs_read()
rw_verify_area()
__vfs_read()
new_sync_read()
generic_file_read_iter()
do_generic_file_read()
系统调用 read() :SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
/*
 * read(2) entry point: read up to @count bytes from descriptor @fd into
 * the user buffer @buf.
 *
 * Returns the value produced by vfs_read(), or -EBADF when fdget_pos()
 * yields no file for @fd.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);
	loff_t pos;
	ssize_t ret;

	/* No file behind this descriptor: nothing was acquired, so just fail. */
	if (!f.file)
		return -EBADF;

	/*
	 * Work on a local copy of the file position; it is stored back into
	 * the file only when vfs_read() did not report an error.
	 */
	pos = file_pos_read(f.file);
	ret = vfs_read(f.file, buf, count, &pos);
	if (ret >= 0)
		file_pos_write(f.file, pos);

	/*
	 * Pairs with fdget_pos() above.
	 * NOTE(review): presumably drops the file reference / position lock
	 * taken there -- confirm against fs/file.c.
	 */
	fdput_pos(f);
	return ret;
}
每个进程都维护一张已打开文件表(files_struct 中的 struct file * 数组),文件描述符 fd 就是该数组的下标。fdget_pos() 根据 fd 找到对应的 struct file,并把它包装成 struct fd 返回。
/*
 * vfs_read - validate a read request and hand it to __vfs_read().
 * @file:  file to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested
 * @pos:   in/out file position
 *
 * Returns the result of __vfs_read(), or a negative error:
 *   -EBADF   FMODE_READ is not set in file->f_mode
 *   -EINVAL  FMODE_CAN_READ is not set in file->f_mode
 *   -EFAULT  access_ok() rejected the user buffer
 *   or whatever negative value rw_verify_area() returned.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t nread;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	/* A read stores into the user buffer, hence VERIFY_WRITE. */
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	nread = rw_verify_area(READ, file, pos, count);
	if (nread < 0)
		return nread;

	/* A non-negative result from rw_verify_area() replaces the request size. */
	count = nread;
	nread = __vfs_read(file, buf, count, pos);
	if (nread > 0) {
		/* Only reads that transferred data are notified and accounted. */
		fsnotify_access(file);
		add_rchar(current, nread);
	}
	/* Called for every request that passed verification, even if nothing was read. */
	inc_syscr(current);
	return nread;
}
/*
 * __vfs_read - dispatch a read to the method registered in file->f_op.
 *
 * ->read is preferred when present; otherwise ->read_iter is driven
 * through new_sync_read(). Returns -EINVAL when neither is set.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
		   loff_t *pos)
{
	const struct file_operations *fops = file->f_op;

	if (fops->read)
		return fops->read(file, buf, count, pos);
	if (fops->read_iter)
		return new_sync_read(file, buf, count, pos);
	return -EINVAL;
}
根据文件注册的 file_operations 调用相应的 read 方法:实现了 .read 就直接调用;否则若实现了 .read_iter,则经由 new_sync_read() 调用;两者都没有则返回 -EINVAL。
下面分别是 tty 设备与 ext3 文件系统注册的 file_operations:
/*
 * File operations registered for tty devices.
 * Only ->read is provided (no ->read_iter), so __vfs_read() calls
 * tty_read() directly.
 */
static const struct file_operations tty_fops = {
	.llseek		= no_llseek,
	.read		= tty_read,
	.write		= tty_write,
	.poll		= tty_poll,
	.unlocked_ioctl	= tty_ioctl,
	.compat_ioctl	= tty_compat_ioctl,
	.open		= tty_open,
	.release	= tty_release,
	.fasync		= tty_fasync,
};
/*
 * File operations registered for ext3 regular files.
 * There is no ->read here, only ->read_iter, so __vfs_read() reaches
 * generic_file_read_iter() through new_sync_read().
 */
const struct file_operations ext3_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.unlocked_ioctl	= ext3_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext3_compat_ioctl,
#endif
	.mmap		= generic_file_mmap,
	.open		= dquot_file_open,
	.release	= ext3_release_file,
	.fsync		= ext3_sync_file,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
};
/*
 * new_sync_read - serve a plain (buf, len) read through ->read_iter.
 * @filp: file being read
 * @buf:  user-space destination buffer
 * @len:  number of bytes requested
 * @ppos: in/out file position
 *
 * Wraps the single user buffer in a one-element iovec, builds a kiocb
 * and an iov_iter around it, and calls filp->f_op->read_iter().
 * Returns that method's result; *ppos is updated from the kiocb.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec vec = { .iov_base = buf, .iov_len = len };
	struct kiocb req;
	struct iov_iter it;
	ssize_t nread;

	/* Set up the request first, then seed it with the caller's position. */
	init_sync_kiocb(&req, filp);
	req.ki_pos = *ppos;

	/* One segment, @len bytes in total. */
	iov_iter_init(&it, READ, &vec, 1, len);

	/* For ext3 this is generic_file_read_iter (see ext3_file_operations). */
	nread = filp->f_op->read_iter(&req, &it);

	/* This path has nobody to complete a queued request later. */
	BUG_ON(nread == -EIOCBQUEUED);

	*ppos = req.ki_pos;
	return nread;
}
这里复用了散布读(scatter read)/聚集写(gather write)所使用的 iovec 接口:单个用户缓冲区被包装成只含一个元素的 iovec 数组,再交给 iov_iter 统一遍历。具体参见《UNIX 环境高级编程》14.6 节。
/*
 * generic_file_read_iter - read from iocb->ki_filp into the buffers of @iter.
 * @iocb: request descriptor; supplies the file (ki_filp), the flags
 *        (ki_flags) and the file position (ki_pos)
 * @iter: destination iterator
 *
 * Two paths:
 *  1. IOCB_DIRECT set: try the mapping's ->direct_IO method first. The
 *     function returns straight away on error, when the request is
 *     satisfied, when the position reached i_size, or for DAX inodes.
 *  2. Otherwise (or for the remainder of a short direct read): call
 *     do_generic_file_read().
 *
 * Returns 0 for an empty direct request, a negative value on error, or
 * the value produced by the direct read / do_generic_file_read().
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
ssize_t retval = 0;
/* ppos aliases iocb->ki_pos, so writes through it update the request. */
loff_t *ppos = &iocb->ki_pos;
/* pos is a snapshot of the starting position. */
loff_t pos = *ppos;
if (iocb->ki_flags & IOCB_DIRECT) { // direct I/O requested: read through ->direct_IO instead of the page cache
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
loff_t size;
/* Empty request: return 0 without calling file_accessed(). */
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
/*
 * NOTE(review): presumably writes back dirty page-cache pages in
 * [pos, pos + count - 1] so the direct read sees current data --
 * confirm against mm/filemap.c. A non-zero result skips direct_IO.
 */
retval = filemap_write_and_wait_range(mapping, pos,
pos + count - 1);
if (!retval) {
/* direct_IO works on a copy; @iter itself is advanced below. */
struct iov_iter data = *iter;
retval = mapping->a_ops->direct_IO(iocb, &data, pos);
}
/* Account for the bytes the direct read transferred. */
if (retval > 0) {
*ppos = pos + retval;
iov_iter_advance(iter, retval);
}
/*
* Btrfs can have a short DIO read if we encounter
* compressed extents, so if there was an error, or if
* we've already read everything we wanted to, or if
* there was a short read because we hit EOF, go ahead
* and return. Otherwise fallthrough to buffered io for
* the rest of the read. Buffered reads will not work for
* DAX files, so don't bother trying.
*/
if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
IS_DAX(inode)) {
file_accessed(file);
goto out;
}
}
/*
 * Buffered path. retval is 0 here unless a short direct read already
 * transferred some bytes, in which case that count is passed along.
 */
retval = do_generic_file_read(file, ppos, iter, retval);
out:
return retval;
}
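该函数分为两个部分:一是设置了 IOCB_DIRECT 时的直接 I/O 路径,绕过 page cache,由 mapping->a_ops->direct_IO() 完成读取;二是普通的缓冲读路径,由 do_generic_file_read() 经 page cache 完成读取。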
do_generic_file_read() 函数是读文件的核心,同时函数也比较长,我们下篇文章再来分析。