Linux内核的VFS是非常经典的抽象,不仅抽象出了filesystem,super_block,inode,dentry,file等结构,而且还提供了像页高速缓存层的通用接口,当然,你可以自己选择是否使用或者自己定制使用方式。本文主要根据自己阅读Linux Kernel 3.19.3系统调用read相关的源码来追踪页高速缓存在整个流程中的痕迹,以常规文件的页高速缓存为例,了解页高速缓存的实现过程,不过于追究具体bio请求的底层细节。另外,在写操作的过程中,页高速缓存的处理流程有所不同(回写),涉及的东西更多,本文主要关注读操作。Linux VFS相关的重要数据结构及概念可以参考内核源码Documentation/filesystems目录下的vfs.txt。
除了前述基本数据结构以外,struct address_space 和 struct address_space_operations也在页高速缓存中起着极其重要的作用。
//include/linux/fs.h
// Per-file page-cache descriptor: ties an inode to the set of pages that
// cache its data and to the operations used to read/write those pages.
struct address_space {
// Inode that owns this mapping; may be NULL
struct inode *host;
// Radix tree holding the pages that carry the cached data
struct radix_tree_root page_tree;
// NOTE(review): presumably the lock protecting page_tree - confirm
spinlock_t tree_lock;
atomic_t i_mmap_writable;
struct rb_root i_mmap;
struct list_head i_mmap_nonlinear;
struct rw_semaphore i_mmap_rwsem;
// Number of pages currently cached
unsigned long nrpages;
unsigned long nrshadows;
pgoff_t writeback_index;
// Operations of this address_space; these are the hooks that actually
// read and write pages
const struct address_space_operations *a_ops;
unsigned long flags;
struct backing_dev_info *backing_dev_info;
spinlock_t private_lock;
struct list_head private_list;
void *private_data;
} __attribute__((aligned(sizeof(long))));
//include/linux/fs.h
// Table of hooks a filesystem supplies to move data between the page cache
// and its backing store. The read path discussed in this article uses
// readpage/readpages, direct_IO and is_partially_uptodate.
struct address_space_operations {
// Writes one page out
int (*writepage)(struct page *page, struct writeback_control *wbc);
// Reads one page in
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
// Marks a page dirty
int (*set_page_dirty)(struct page *page);
// Multi-page variant of readpage: takes a list of pages and a page count
int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata);
int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata);
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned int, unsigned int);
int (*releasepage) (struct page *, gfp_t);
void (*freepage)(struct page *);
// Direct (O_DIRECT) I/O entry point; generic_file_read_iter calls it with READ
ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *);
int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode);
int (*launder_page) (struct page *);
// Optional hook; do_generic_file_read checks it for NULL before calling it
int (*is_partially_uptodate) (struct page *, unsigned long, unsigned long);
void (*is_dirty_writeback) (struct page *, bool *, bool *);
int (*error_remove_page)(struct address_space *, struct page *);
/* swapfile support */
int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span);
void (*swap_deactivate)(struct file *file);
};
关于挂载和打开文件的操作,不赘述(涉及的细节也很多…),(极其)简陋地理解,挂载返回挂载点的root dentry,并且读取磁盘数据生成了super_block链接到全局超级块链表中,这样,当前进程就可以通过root dentry找到其inode,从而找到并生成其子树的dentry和inode信息,从而实现查找路径的逻辑。打开文件简单理解就是分配fd,通过dentry将file结构与对应inode挂接,最后安装到进程的打开文件数组中,这里假设已经成功打开文件,返回了fd,我们从系统调用read开始。
// Definition of the read system call.
// Looks up the file for fd, reads up to count bytes into the user buffer
// at the file's current position, and stores the advanced position back
// when the read did not fail. Returns vfs_read()'s result, or -EBADF when
// fd does not resolve to a file.
//fs/read_write.c
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
// Obtain the struct fd for this fd number
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
// Current file offset
loff_t pos = file_pos_read(f.file);
// Enter vfs_read.
// Arguments: file pointer, user-space buffer pointer, length, offset.
// Per the article, vfs_read mostly performs validation and then
// enters __vfs_read.
ret = vfs_read(f.file, buf, count, &pos);
// Only publish the new offset when the read did not return an error
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
}
return ret;
}
//fs/read_write.c
// Dispatches a read to whichever hook the file's file_operations provides,
// tried in the order read, aio_read, read_iter. Returns the hook's result,
// or -EINVAL when none of the three is set.
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
// Note: a filesystem may supply its own read in file_operations, so
// whether the page cache is used at all is under its control.
// ext2 takes this branch: its ext2_file_operations sets .read = new_sync_read.
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
else if (file->f_op->aio_read)
// Taken when only aio_read is provided.
// NOTE(review): do_sync_read presumably drives f_op->aio_read
// synchronously (not read_iter) - confirm against fs/read_write.c
ret = do_sync_read(file, buf, count, pos);
else if (file->f_op->read_iter)
// Taken when only read_iter is provided; per the article,
// new_sync_read ends up calling f_op->read_iter.
// ext2 wires read_iter to generic_file_read_iter, the kernel's generic
// read implementation, which is analysed later using ext2 as the example.
ret = new_sync_read(file, buf, count, pos);
else
ret = -EINVAL;
return ret;
}
//fs/ext2/file.c
// file_operations table for ext2 regular files. Reads and writes are
// delegated to the kernel's generic helpers rather than ext2-specific code.
const struct file_operations ext2_file_operations = {
.llseek = generic_file_llseek,
.read = new_sync_read, // per the article, forwards to .read_iter, i.e. generic_file_read_iter here
.write = new_sync_write,
.read_iter = generic_file_read_iter, // kernel's generic read; this is where the page cache comes in
.write_iter = generic_file_write_iter,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
#endif
.mmap = generic_file_mmap,
.open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
};
// Generic read_iter implementation.
// For O_DIRECT files it first attempts a direct read via a_ops->direct_IO
// and returns early on error, on a fully satisfied request, or once the
// position reaches the file size; otherwise (and for all non-O_DIRECT
// files) it continues with the page-cache read in do_generic_file_read.
// Returns the byte count or negative error produced by those steps.
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
ssize_t retval = 0;
loff_t *ppos = &iocb->ki_pos;
loff_t pos = *ppos;
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (file->f_flags & O_DIRECT) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
loff_t size;
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
// Write back (and wait for) the byte range [pos, pos + count - 1] first.
// NOTE(review): presumably so that dirty page-cache data reaches the
// backing store before the direct read bypasses the cache - confirm
retval = filemap_write_and_wait_range(mapping, pos,
pos + count - 1);
if (!retval) {
// Work on a copy so the caller's iterator is only advanced below,
// by the number of bytes actually read
struct iov_iter data = *iter;
retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos);
}
if (retval > 0) {
*ppos = pos + retval;
iov_iter_advance(iter, retval);
}
/*
* Btrfs can have a short DIO read if we encounter
* compressed extents, so if there was an error, or if
* we've already read everything we wanted to, or if
* there was a short read because we hit EOF, go ahead
* and return. Otherwise fallthrough to buffered io for
* the rest of the read.
*/
if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
file_accessed(file);
goto out;
}
}
// Enter the real read: look up the page for this offset in the
// address_space's radix tree. If it is found, copy it straight to user
// space. If not, a_ops->readpage is called to issue the bio: a cache page
// is allocated, filled with the data and added to the radix tree, and
// then copied to user space, which completes the read.
retval = do_generic_file_read(file, ppos, iter, retval);
out:
return retval;
}
EXPORT_SYMBOL(generic_file_read_iter);
// Page-cache read loop (abridged excerpt).
// For each page index covered by the request: find the page in the cache
// (triggering readahead or allocating a fresh page when it is missing),
// make sure its contents are up to date (calling a_ops->readpage when they
// are not), then copy the data into the caller's iov_iter.
// NOTE(review): the declarations of mapping, inode, ra, index, last_index,
// offset, prev_offset and error, the computation of nr, the "out:" label
// and the function's closing brace are in the parts omitted from this
// excerpt - consult mm/filemap.c for the full text.
static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,struct iov_iter *iter, ssize_t written)
{
/* ... part of the function omitted in this excerpt ... */
for (;;) {
struct page *page;
pgoff_t end_index;
loff_t isize;
unsigned long nr, ret;
// Reading pages can take a while; give the scheduler a chance to run
cond_resched();
find_page:
// Look the page up in the radix tree
page = find_get_page(mapping, index);
// Not found
if (!page) {
// Read into the page cache first (synchronous readahead).
// Per the article: this allocates a list of pages,
// calls a_ops->readpages or a_ops->readpage to read the data,
// and a_ops->readpage is what submits the bio.
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
// Look again
page = find_get_page(mapping, index);
// Still not found...
if (unlikely(page == NULL))
// Go allocate a page and read it explicitly
goto no_cached_page;
}
// Readahead related: the page carries the readahead flag, so start
// asynchronous readahead
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
// Page contents are not up to date
if (!PageUptodate(page)) {
// Without an is_partially_uptodate hook, or when the block size equals
// the page size, there is no partial state to check: take the slow path
if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
if (!page->mapping)
goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
offset, iter->count))
goto page_not_up_to_date_locked;
unlock_page(page);
}
page_ok: // The cached page we hold is now usable
/* ... other checks omitted in this excerpt ... */
// At this point the data is in the page cache, either freshly read from
// disk or already cached; copy it to user space
ret = copy_page_to_iter(page, offset, nr, iter);
// Advance index/offset by the number of bytes copied
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
prev_offset = offset;
// Drop our reference to the page
page_cache_release(page);
written += ret;
if (!iov_iter_count(iter))
goto out;
// Short copy while the iterator still has room: report -EFAULT
if (ret < nr) {
error = -EFAULT;
goto out;
}
// Move on to the next page
continue;
page_not_up_to_date:
/* Get exclusive access to the page ... */
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
continue;
}
/* Did somebody else fill it already? */
if (PageUptodate(page)) {
unlock_page(page);
goto page_ok;
}
readpage: // Also the entry point used by no_cached_page
/*
* A previous I/O error may have been due to temporary
* failures, eg. multipath errors.
* PG_error will be set again if readpage fails.
*/
ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
// Again goes through a_ops->readpage
error = mapping->a_ops->readpage(filp, page);
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
error = 0;
goto find_page;
}
goto readpage_error;
}
if (!PageUptodate(page)) {
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
* invalidate_mapping_pages got it
*/
unlock_page(page);
page_cache_release(page);
goto find_page;
}
unlock_page(page);
shrink_readahead_size_eio(filp, ra);
error = -EIO;
goto readpage_error;
}
unlock_page(page);
}
// Page is up to date
goto page_ok;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
page_cache_release(page);
goto out;
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
*/
// Allocate a page; per the article it is taken from the cold page list
page = page_cache_alloc_cold(mapping);
if (!page) {
error = -ENOMEM;
goto out;
}
// Insert it into the page cache
error = add_to_page_cache_lru(page, mapping,
index, GFP_KERNEL);
if (error) {
page_cache_release(page);
// -EEXIST: a page already exists at this index, so retry the lookup
if (error == -EEXIST) {
error = 0;
goto find_page;
}
goto out;
}
goto readpage;
}
/* ... remainder of the function omitted in this excerpt ... */
ref: Linux Kernel 3.19.3 source code