Filesystem I/O --- Page Cache and Buffer Cache

There are two kinds of cache: the Page Cache and the Buffer Cache.
Application layer ----> VFS ----> Page Cache === Buffer Cache ----> concrete filesystem ----> Disk
Page Cache: the virtual filesystem exchanges data between the page cache and user-space buffers.
Buffer Cache: the concrete filesystem exchanges data between the buffer cache and disk blocks.
Data structures:
struct page {
	unsigned long private;	/* when buffers are attached, points to the first buffer_head */
	......
};
struct buffer_head {
	char *b_data;		/* pointer to the data; its meaning tracks the buffer state:
				   1. unused: the object is available, b_data is NULL;
				   2. free: b_data points to a free buffer not yet tied to a data block on the device;
				   3. in use: b_data points to a buffer currently in use;
				   4. asynchronous: b_data points to a temporary buffer used for page I/O */
	sector_t b_blocknr;	/* block number this buffer maps to */
	atomic_t b_count;	/* reference count of this buffer */
	size_t b_size;		/* block size */
	struct block_device *b_bdev;	/* block device the buffer belongs to */
	struct page *b_page;	/* page this buffer lives in */
	unsigned long b_state;	/* buffer state bitmap */
	...
};
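
To see how the two structures link up: when a page has buffers attached, page->private points to the first buffer_head, and the heads belonging to one page form a ring through b_this_page. A minimal sketch of walking that ring, using the standard page_has_buffers()/page_buffers() helpers (illustration only; assumes the page is locked):

#include <linux/buffer_head.h>

static void walk_page_buffers_sketch(struct page *page)
{
	struct buffer_head *head, *bh;

	if (!page_has_buffers(page))	/* page->private may not hold buffers */
		return;

	head = page_buffers(page);	/* first buffer_head, taken from page->private */
	bh = head;
	do {
		/* b_blocknr identifies the disk block this buffer maps */
		printk(KERN_INFO "block %llu, size %zu\n",
		       (unsigned long long)bh->b_blocknr, bh->b_size);
		bh = bh->b_this_page;	/* per-page circular list */
	} while (bh != head);
}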
#define BH_Uptodate	0	/* contains valid data */
#define BH_Dirty	1	/* is dirty */
#define BH_Lock		2	/* is locked */
#define BH_Req		3	/* has been submitted for I/O */
#define BH_Mapped	4	/* has a disk mapping */
#define BH_New		5	/* newly allocated, not yet written out */
#define BH_Protected	6	/* is protected */
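
These flags are bit positions within b_state, and <linux/buffer_head.h> generates accessor helpers for them via the BUFFER_FNS() macro. A short, hedged sketch of typical usage:

	if (buffer_uptodate(bh) && !buffer_dirty(bh)) {
		/* a clean, valid cached copy of the disk block */
	}
	set_buffer_dirty(bh);		/* atomically sets BH_Dirty in bh->b_state */
	if (buffer_mapped(bh)) {
		/* BH_Mapped: b_bdev and b_blocknr are valid */
	}
	clear_buffer_new(bh);		/* clear BH_New once the block is initialized */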

Managing buffer_head objects:
bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), 0,
		(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD), NULL);

max_buffer_heads = max_pages * (4K / sizeof(struct buffer_head))

struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);	// allocate a buffer_head
	--> kmem_cache_alloc(bh_cachep, gfp_flags | __GFP_ZERO);
void free_buffer_head(struct buffer_head *bh);		// release a buffer_head
	--> kmem_cache_free(bh_cachep, bh);
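
The pair above is the standard kmem_cache life cycle. As a self-contained sketch of the same pattern with hypothetical names (my_obj and my_cachep are not kernel symbols):

#include <linux/slab.h>

struct my_obj { int x; };			/* hypothetical object type */
static struct kmem_cache *my_cachep;		/* hypothetical dedicated cache */

static int my_cache_init(void)
{
	/* create a dedicated slab cache, just as bh_cachep is created above */
	my_cachep = kmem_cache_create("my_obj", sizeof(struct my_obj), 0, 0, NULL);
	return my_cachep ? 0 : -ENOMEM;
}

static void my_cache_use(void)
{
	/* allocate a zeroed object, mirroring alloc_buffer_head() */
	struct my_obj *p = kmem_cache_alloc(my_cachep, GFP_KERNEL | __GFP_ZERO);
	if (p)
		kmem_cache_free(my_cachep, p);	/* mirrors free_buffer_head() */
}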

Filesystem I/O flow (tracing the write path):

Application-layer system call: read/write

SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count)
{
	struct file *file;
	ssize_t ret = -EBADF;
	int fput_needed;

	file = fget_light(fd, &fput_needed);	/* look up the struct file for fd */
	if (file) {
		loff_t pos = file_pos_read(file);	/* read the current file position */
		ret = vfs_write(file, buf, count, &pos);
		file_pos_write(file, pos);	/* update the file position */
		fput_light(file, fput_needed);
	}
	return ret;
}
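
From user space, the path traced here starts with nothing more than write(2). A minimal user-space program that drives the whole chain below (the path /tmp/demo.txt is just an example):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "hello page cache\n";
	int fd = open("/tmp/demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* enters sys_write -> vfs_write -> ... -> generic_perform_write */
	if (write(fd, msg, strlen(msg)) < 0)
		perror("write");

	close(fd);
	return 0;
}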
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;
	...	/* various permission and range checks */

	if (file->f_op->write)
		ret = file->f_op->write(file, buf, count, pos);	/* concrete filesystems (ext2/3/4, xfs) all instantiate this as do_sync_write/read */
	else
		ret = do_sync_write(file, buf, count, pos);
	...
	return ret;
}
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };	/* data and length */
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);	/* kiocb.ki_filp = filp; bind the kiocb to the file */
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;
	kiocb.ki_nbytes = len;
	...
	ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
	/* the filesystem's asynchronous I/O entry point: ext2/3/4 instantiate it as
	   generic_file_aio_read/write, xfs as xfs_file_aio_read/write */
	...
}
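
What makes this wrapper synchronous is the elided tail: if the filesystem queued the I/O asynchronously, do_sync_write() blocks until it completes. A hedged sketch of the usual 2.6-era pattern (not a verbatim copy):

	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);	/* sleep until the aio completes */
	*ppos = kiocb.ki_pos;	/* propagate the new file position */
	return ret;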
/* concrete filesystems instantiate this as generic_file_aio_write/read */
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;
	......
	mutex_lock(&inode->i_mutex);
	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
	mutex_unlock(&inode->i_mutex);

	if (ret > 0 || ret == -EIOCBQUEUED) {
		ssize_t err;
		err = generic_write_sync(file, pos, ret);	/* perform the synchronous flush; a wrapper around vfs_fsync_range() */
		if (err < 0 && ret > 0)
			ret = err;
	}
	return ret;
}
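
For reference, generic_write_sync() really is a thin wrapper over vfs_fsync_range(); a sketch along the lines of the 2.6-era implementation (exact signature and guard conditions vary across kernel versions):

int generic_write_sync(struct file *file, loff_t pos, loff_t count)
{
	/* nothing to do unless the file demands synchronous writes */
	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
		return 0;
	/* flush only the byte range that was just written */
	return vfs_fsync_range(file, file->f_path.dentry, pos, pos + count - 1,
			       (file->f_flags & __O_SYNC) ? 0 : 1);
}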
/* the actual write routine: iocb carries the I/O state (file, offset, ...),
   iov is the array of data vectors, nr_segs its length, ppos the file offset */
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
{
	......
	pos = *ppos;
	......
	/* with O_DIRECT set, write straight to disk, bypassing the filesystem's page cache */
	if (unlikely(file->f_flags & O_DIRECT)) {
		loff_t endbyte;
		ssize_t written_buffered;
		/* direct write to disk */
		written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, count, ocount);
		......
	} else {
		/* buffered write into the page cache */
		written = generic_file_buffered_write(iocb, iov, nr_segs, pos, ppos, count, written);
	}
	......
}
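
To exercise the O_DIRECT branch from user space, the buffer, offset, and length generally must be aligned to the device's logical block size. A hedged example assuming 512-byte alignment (path and sizes are illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	/* O_DIRECT needs an aligned buffer; 512 is a common logical block size */
	if (posix_memalign(&buf, 512, 4096))
		return 1;
	memset(buf, 'A', 4096);

	int fd = open("/tmp/direct.bin", O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* takes the generic_file_direct_write() path, bypassing the page cache */
	if (write(fd, buf, 4096) < 0)
		perror("write");

	close(fd);
	free(buf);
	return 0;
}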
/* the normal buffered write */
ssize_t generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos, loff_t *ppos, size_t count /* bytes to write */, ssize_t written /* bytes already written */)
{
	struct file *file = iocb->ki_filp;
	ssize_t status;
	struct iov_iter i;	/* I/O data vector iterator */

	iov_iter_init(&i, iov, nr_segs, count, written);	/* i->iov = iov; i->nr_segs = nr_segs; i->count = count + written; */
	status = generic_perform_write(file, &i, pos);
	......
}
static ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;	/* the address space that organizes the page cache */
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;
	......
	do {
		struct page *page;
		pgoff_t index;		/* pagecache index of the current page */
		unsigned long offset;	/* offset into the pagecache page */
		unsigned long bytes;	/* bytes to write to the page */
		size_t copied;		/* bytes copied from user space */
		void *fsdata;

		offset = (pos & (PAGE_CACHE_SIZE - 1));	/* offset within the page (page size: 4K) */
		index = pos >> PAGE_CACHE_SHIFT;	/* page index (shift: 12 bits) */
		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i));
again:
		......
		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);	/* the concrete filesystem looks up or allocates a cache page */
		if (unlikely(status))
			break;

		if (mapping_writably_mapped(mapping))	/* mapping->i_mmap_writable != 0: the page may be modified through a writable user mapping */
			flush_dcache_page(page);	/* flush the CPU caches for this page so its contents are coherent */

		pagefault_disable();	/* disable page faults */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);	/* copy the data into the cache page */
		pagefault_enable();	/* re-enable page faults */
		flush_dcache_page(page);

		mark_page_accessed(page);	/* mark the page as recently used */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);	/* tell the concrete filesystem the cached page is ready to go to disk */
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();	/* give the scheduler a chance to run */
		iov_iter_advance(i, copied);	/* account for the bytes completed */
		......
		balance_dirty_pages_ratelimited(mapping);	/* throttle dirty pages, kicking writeback when needed */

	} while (iov_iter_count(i));	/* i->count */
	return written ? written : status;
}
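
A quick worked example of the index/offset arithmetic above: with 4 KiB pages, pos = 5000 lands in page index 5000 >> 12 = 1, at in-page offset 5000 & 4095 = 904, so at most 4096 - 904 = 3192 bytes fit into this page on the first loop iteration. As a standalone check:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long pos = 5000;	/* example file position */
	unsigned long index  = pos >> PAGE_CACHE_SHIFT;	/* page index: 1 */
	unsigned long offset = pos & (PAGE_CACHE_SIZE - 1);	/* in-page offset: 904 */
	unsigned long bytes  = PAGE_CACHE_SIZE - offset;	/* room left: 3192 */

	printf("index=%lu offset=%lu bytes=%lu\n", index, offset, bytes);
	return 0;
}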
write_begin() --> ext2_write_begin()/xfs_vm_write_begin() [both wrappers around block_write_begin()]
write_end() --> ext2_write_end()/xfs_vm_write_end() [both wrappers around generic_write_end() --> block_write_end()]
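
Concretely, each filesystem wires these callbacks into its address_space_operations. A sketch modeled on ext2's table in fs/ext2/inode.c (field list trimmed; exact contents vary by kernel version):

const struct address_space_operations ext2_aops = {
	.readpage	= ext2_readpage,
	.writepage	= ext2_writepage,
	.write_begin	= ext2_write_begin,	/* wraps block_write_begin() */
	.write_end	= generic_write_end,	/* wraps block_write_end() */
	/* ... remaining callbacks omitted ... */
};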
int block_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata,
			get_block_t *get_block)
{
	struct inode *inode = mapping->host;
	int status = 0;
	struct page *page;
	pgoff_t index;
	unsigned start, end;
	int ownpage = 0;

	index = pos >> PAGE_CACHE_SHIFT;	/* page index (shift: 12 bits) */
	start = pos & (PAGE_CACHE_SIZE - 1);	/* offset within the 4K page */
	end = start + len;

	page = *pagep;
	if (page == NULL) {
		ownpage = 1;
		/* look up or allocate a page in the address space */
		page = grab_cache_page_write_begin(mapping, index, flags);
		if (!page) {
			status = -ENOMEM;
			goto out;
		}
		*pagep = page;
	} else
		BUG_ON(!PageLocked(page));

	/* allocate a ring of buffer_heads for the page and initialize them */
	status = __block_prepare_write(inode, page, start, end, get_block);
	......
}
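
For completeness, a simplified sketch of what __block_prepare_write() does per page (modeled on fs/buffer.c; the real version also reads back partially-overwritten blocks and waits for I/O):

static int block_prepare_sketch(struct inode *inode, struct page *page,
				unsigned from, unsigned to, get_block_t *get_block)
{
	struct buffer_head *head, *bh;
	unsigned block_start = 0;
	sector_t block;
	int err = 0;

	/* attach a ring of buffer_heads to the page if it has none yet */
	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
	head = page_buffers(page);

	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	/* map each buffer in the page to its disk block */
	for (bh = head; bh != head || !block_start;
	     block++, block_start += bh->b_size, bh = bh->b_this_page) {
		if (!buffer_mapped(bh)) {
			/* ask the filesystem (ext2_get_block, ...) to map/allocate */
			err = get_block(inode, block, bh, 1);
			if (err)
				break;
		}
	}
	return err;
}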
generic_write_end() is the generic completion handler for a written page. It first calls block_write_end(), which marks the buffers just written in the page as dirty (BH_Dirty). It then calls unlock_page() to unlock the page (clearing the PG_locked flag). Finally, if the write changed the file size, the size recorded in the inode (the i_size member) is updated as well.
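
A hedged sketch of generic_write_end() matching that description (modeled on 2.6-era fs/buffer.c; details differ between versions):

int generic_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	/* mark the buffers just written as dirty (BH_Dirty) */
	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	/* grow i_size if this write extended the file */
	if (pos + copied > inode->i_size) {
		i_size_write(inode, pos + copied);
		mark_inode_dirty(inode);
	}

	unlock_page(page);	/* clear PG_locked */
	page_cache_release(page);	/* drop the reference taken at write_begin */

	return copied;
}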
