Linux lets a file be opened with the O_DIRECT flag, which makes read and write try to bypass the page cache and hit the disk directly (why only "try"? because when the direct write cannot complete, the kernel still falls back to writing through the cache). To make landing directly on disk possible, direct IO comes with plenty of restrictions: the file offset must be aligned to the disk block size, the memory address must be aligned to the disk block size, and the IO size must be aligned to the disk block size as well. Even so, the direct IO implementation has a small flaw. I already explained that flaw in my FUSE analysis; if the mechanism behind it is unclear, please read that FUSE flaw write-up first. Here the focus is on how direct IO is implemented, with a note on where the flaw gets introduced.
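As a concrete illustration of those alignment rules, here is a minimal userspace sketch (assuming a 4096-byte block size and a throwaway path /tmp/dio_test, both made up for the example) that writes a single aligned block with O_DIRECT:

#define _GNU_SOURCE            /* needed for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const size_t blk = 4096;   /* assumed block size */
    void *buf;
    int fd;

    /* the memory address must be block aligned */
    if (posix_memalign(&buf, blk, blk))
        return 1;
    memset(buf, 'A', blk);

    fd = open("/tmp/dio_test", O_WRONLY | O_CREAT | O_DIRECT, 0644);
    if (fd < 0)
        return 1;

    /* the file offset (0) and the size (blk) are block aligned too */
    if (pwrite(fd, buf, blk, 0) != (ssize_t)blk)
        return 1;

    close(fd);
    free(buf);
    return 0;
}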
The call path from the write system call down to direct IO is:
write --> vfs_write --> do_sync_write --> generic_file_aio_write --> __generic_file_aio_write
Direct IO comes into play starting from __generic_file_aio_write, so that is where we begin.
/*@iocb   kernel io control block declared in do_sync_write, used mainly to wait for io completion and so make the call synchronous
*@iov    the user buffer passed in by write; the array has a single element
*@nr_segs 1
*@ppos   file offset to write at
*/
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
//some irrelevant code omitted
...
if (io_is_direct(file)) {
loff_t endbyte;
ssize_t written_buffered;
//first, try to write directly to disk
written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
ppos, count, ocount);
/*
* If the write stopped short of completing, fall back to
* buffered writes. Some filesystems do this for writes to
* holes, for example. For DAX files, a buffered write will
* not succeed (even if it did, DAX does not handle dirty
* page-cache pages correctly).
*/
if (written < 0 || written == count || IS_DAX(inode))
goto out;
pos += written;
count -= written;
//if not everything could be written directly to disk, write the remainder through the page cache
written_buffered = generic_file_buffered_write(iocb, iov,
nr_segs, pos, ppos, count,
written);
/*
* If generic_file_buffered_write() returned a synchronous error
* then we want to return the number of bytes which were
* direct-written, or the error code if that was zero. Note
* that this differs from normal direct-io semantics, which
* will return -EFOO even if some bytes were written.
*/
if (written_buffered < 0) {
err = written_buffered;
goto out;
}
/*
* We need to ensure that the page cache pages are written to
* disk and invalidated to preserve the expected O_DIRECT
* semantics.
*/
//after the buffered write, flush the cache to disk here, so that a single system call still ends up with the data on disk
endbyte = pos + written_buffered - written - 1;
err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
...
}
...
}
If the file was opened with O_DIRECT, the kernel first tries to write straight to disk; if that direct write falls short without returning an error, it falls back to a buffered write, putting the data into the page cache first and then flushing the cache to disk. This is why the man page describes O_DIRECT as "Try to minimize cache effects of the I/O to and from this file."
rather than as not using the cache at all.
ssize_t
generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long *nr_segs, loff_t pos, loff_t *ppos,
size_t count, size_t ocount)
{
...
//first, flush whatever the page cache holds for the range about to be written down to disk
written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
if (written)
goto out;
/*
* After a write we want buffered reads to be sure to go to disk to get
* the new data. We invalidate clean cached page from the region we're
* about to write. We do this *before* the write so that we can return
* without clobbering -EIOCBQUEUED from ->direct_IO().
*/
if (mapping->nrpages) {
//once the write hits the disk, the cached copy no longer matches it, so the cache for this range must be invalidated. Why do it before the write rather than only afterwards? The original comment above gives the reason: doing it up front lets the function return without clobbering an -EIOCBQUEUED from ->direct_IO().
written = invalidate_inode_pages2_range(mapping,
pos >> PAGE_CACHE_SHIFT, end);
/*
* If a page can not be invalidated, return 0 to fall back
* to buffered write.
*/
if (written) {
if (written == -EBUSY)
return 0;
goto out;
}
}
written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
/*
* Finally, try again to invalidate clean pages which might have been
* cached by non-direct readahead, or faulted in by get_user_pages()
* if the source of the write was an mmap'ed region of the file
* we're writing. Either one is a pretty crazy thing to do,
* so we don't support it 100%. If this invalidation
* fails, tough, the write still worked...
*/
//why invalidate the pages again when they were already invalidated before the write? there is an awkward scenario behind this, described below
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
pos >> PAGE_CACHE_SHIFT, end);
}
...
}
generic_file_direct_write first flushes the page cache for the range being written down to disk, then invalidates the corresponding cached pages, and after the direct write completes it invalidates the pages once more. Why? Consider this scenario:
a process mmaps a page of the file, then passes that mapped memory to write for a direct IO write, and the range being written happens to overlap the mmap'ed region.
Should the page cache for that range be invalidated once the direct write finishes? This is one of the reasons the English comment above mentions; there may be other reasons I am not yet clear about. If you are not familiar with how mmap is implemented, see my analysis of the mmap implementation.
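To make the scenario concrete, here is a hedged sketch of the pattern (the path and the 4096-byte size are made up, and error handling is minimal): the buffer handed to the direct write is an mmap of the very file range being written.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/tmp/dio_test", O_RDWR | O_DIRECT);
    if (fd < 0)
        return 1;

    /* map the first page of the file; touching it pulls it into the page cache */
    char *map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (map == MAP_FAILED)
        return 1;
    map[0] = 'X';   /* dirties a page-cache page for this range */

    /* now direct-write the same range *from* that mapping: the source buffer
     * is itself a page-cache page of the region being written, which is the
     * kind of case the second invalidate_inode_pages2_range() has to cope with */
    pwrite(fd, map, 4096, 0);

    munmap(map, 4096);
    close(fd);
    return 0;
}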
generic_file_direct_write calls the direct IO routine supplied by the filesystem, but as with most other filesystem methods, the majority of filesystems just wrap the kernel's generic direct IO code. Taking ext2 as the example, what ultimately runs is do_blockdev_direct_IO. Keep in mind while reading it that this one function serves both reads and writes.
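For reference, ext2's wrapper looks roughly like this in a 3.x-era kernel (simplified from memory, not quoted verbatim): it just forwards to blockdev_direct_IO(), passing ext2_get_block as the block-mapping callback, and blockdev_direct_IO() is itself a thin inline that ends up in do_blockdev_direct_IO() with DIO_LOCKING | DIO_SKIP_HOLES.

/* fs/ext2/inode.c, simplified sketch */
static ssize_t
ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
               loff_t offset, unsigned long nr_segs)
{
    struct file *file = iocb->ki_filp;
    struct inode *inode = file->f_mapping->host;

    /* lands in do_blockdev_direct_IO() with DIO_LOCKING | DIO_SKIP_HOLES */
    return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
                              ext2_get_block);
}

The real function additionally cleans up blocks it allocated when a direct write fails; that part is omitted here.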
/*
@rw        read or write
@iocb      kernel io control block, carrying the file offset and length of the IO
@inode     target inode
@bdev      block device the target lives on
@iov       user buffer iov, usually with a single element
@offset    file offset
@nr_segs   usually 1
@get_block maps a block number within the file to a block number on disk
@end_io    callback invoked once the io completes, NULL here
@submit_io optional hook letting the filesystem submit the bio itself, NULL here
@flags     flags, DIO_LOCKING | DIO_SKIP_HOLES here
*/
static inline ssize_t
do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
int seg;
size_t size;
unsigned long addr;
unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
loff_t end = offset;
struct dio *dio;
struct dio_submit sdio = { 0, };
unsigned long user_addr;
size_t bytes;
struct buffer_head map_bh = { 0, };
struct blk_plug plug;
if (rw & WRITE)
rw = WRITE_ODIRECT;
/*
* Avoid references to bdev if not absolutely needed to give
* the early prefetch in the caller enough time.
*/
//is the IO's starting file offset aligned to the inode block size, or at least to the block device's logical block size?
if (offset & blocksize_mask) {
if (bdev)
blkbits = blksize_bits(bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
if (offset & blocksize_mask)
goto out;
}
/* Check the memory alignment. Blocks cannot straddle pages */
//every iov's address and size must likewise be aligned to the inode block size, or at least to the bdev's logical block size
//why require alignment? because the smallest unit a block device transfers is one device block, and a page holds a fixed power-of-two number of blocks,
//i.e. a page has fixed slots for blocks, and a block cannot sit at an arbitrary offset inside it.
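/*
 * Worked example (illustrative numbers, not from the source): with 4 KiB
 * inode blocks, blkbits = 12 and blocksize_mask = 0xfff, so offset 8192
 * passes while offset 8200 fails the first test; the code then retries
 * against the bdev logical block size (typically 512 bytes, blkbits = 9,
 * mask = 0x1ff), so 8200 still fails but 8704 (= 8192 + 512) would pass.
 */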
for (seg = 0; seg < nr_segs; seg++) {
addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len;
end += size;
if (unlikely((addr & blocksize_mask) ||
(size & blocksize_mask))) {
if (bdev)
blkbits = blksize_bits(
bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out;
}
}
/* watch out for a 0 len io from a tricksy fs */
//this handles a zero-length read; the check further down handles a read that starts at or beyond EOF
if (rw == READ && end == offset)
return 0;
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
retval = -ENOMEM;
if (!dio)
goto out;
/*
* Believe it or not, zeroing out the page array caused a .5%
* performance regression in a database benchmark. So, we take
* care to only zero out what's needed.
*/
memset(dio, 0, offsetof(struct dio, pages));
dio->flags = flags;
if (dio->flags & DIO_LOCKING) { //DIO_LOCKING is set on the direct IO path we came from, so this branch executes
if (rw == READ) {
struct address_space *mapping =
iocb->ki_filp->f_mapping;
/* will be released by direct_io_worker */
mutex_lock(&inode->i_mutex); //lock the inode here, because the outer read system call path does not take this lock,
//then flush the cache for the file range being read down to disk first; the reason is easy to see: for write this was already done in the caller, but this is the read path
retval = filemap_write_and_wait_range(mapping, offset,
end - 1);
if (retval) {
mutex_unlock(&inode->i_mutex);
kmem_cache_free(dio_cache, dio);
goto out;
}
}
}
/* Once we sampled i_size check for reads beyond EOF */
//this handles a read that starts at or beyond EOF; the check further up handled the zero-length read
dio->i_size = i_size_read(inode);
if (rw == READ && offset >= dio->i_size) {
if (dio->flags & DIO_LOCKING)
mutex_unlock(&inode->i_mutex);
kmem_cache_free(dio_cache, dio);
retval = 0;
goto out;
}
/*
* For file extending writes updating i_size before data writeouts
* complete can expose uninitialized blocks in dumb filesystems.
* In that case we need to wait for I/O completion even if asked
* for an asynchronous write.
*/
if (is_sync_kiocb(iocb)) //true here: do_sync_write/do_sync_read initialize the kiocb as a synchronous one
dio->is_async = false;
else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
(rw & WRITE) && end > i_size_read(inode))
dio->is_async = false;
else
dio->is_async = true;
dio->inode = inode;
dio->rw = rw;
/*
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
* so that we can call ->fsync.
*/
//this check does not pass, since dio->is_async == false
if ((dio->inode->i_sb->s_type->fs_flags & FS_HAS_DIO_IODONE2) &&
dio->is_async && (rw & WRITE) &&
((iocb->ki_filp->f_flags & O_DSYNC) ||
IS_SYNC(iocb->ki_filp->f_mapping->host))) {
retval = dio_set_defer_completion(dio);
if (retval) {
/*
* We grab i_mutex only for reads so we don't have
* to release it here
*/
kmem_cache_free(dio_cache, dio);
goto out;
}
}
/*
* Will be decremented at I/O completion time.
*/
//this check passes, so inode_dio_begin runs
if (!(dio->flags & DIO_SKIP_DIO_COUNT))
inode_dio_begin(inode);
retval = 0;
sdio.blkbits = blkbits; //blkbits is either the bdev's logical block bits or the inode's block bits; to keep the
//analysis simple, assume the IO is aligned to the inode block size
sdio.blkfactor = i_blkbits - blkbits; //0 under that assumption
sdio.block_in_file = offset >> blkbits; //file-relative block number of the first block of this IO
sdio.get_block = get_block;
dio->end_io = end_io;
sdio.submit_io = submit_io;
sdio.final_block_in_bio = -1;
sdio.next_block_for_io = -1;
dio->iocb = iocb;
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
/*
* In case of non-aligned buffers, we may need 2 more
* pages since we need to zero out first and last block.
*/
//pages_in_io here, as far as I can tell, counts the user pages that will need to be pinned and mapped into the kernel for this IO
if (unlikely(sdio.blkfactor))
sdio.pages_in_io = 2;
for (seg = 0; seg < nr_segs; seg++) {
user_addr = (unsigned long)iov[seg].iov_base;
sdio.pages_in_io +=
((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
PAGE_SIZE - user_addr / PAGE_SIZE);
}
blk_start_plug(&plug);
for (seg = 0; seg < nr_segs; seg++) {
user_addr = (unsigned long)iov[seg].iov_base;
sdio.size += bytes = iov[seg].iov_len;
/* Index into the first page of the first block */
sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; //block index, within its user page, of the first block of the seg-th user iov
sdio.final_block_in_request = sdio.block_in_file +
(bytes >> blkbits); //file-block index one past the last block covered by the seg-th iov; note that sdio.block_in_file is advanced inside do_direct_IO below, which really does not make this code reader-friendly
/* Page fetching state */
sdio.head = 0;
sdio.tail = 0;
sdio.curr_page = 0;
sdio.total_pages = 0;
if (user_addr & (PAGE_SIZE-1)) {
sdio.total_pages++;
bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
}
sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; //these lines compute the total number of pages this segment needs; when the user address is not page aligned an extra page may be required (a worked example follows after the function)
sdio.curr_user_address = user_addr;
retval = do_direct_IO(dio, &sdio, &map_bh);//the routine that does the actual reading/writing, analyzed in detail below
dio->result += iov[seg].iov_len -
((sdio.final_block_in_request - sdio.block_in_file) << //final_block_in_request was set above to block_in_file + (bytes >> blkbits); block_in_file has since advanced, so the difference is the number of blocks not transferred, and subtracting it gives the bytes actually done, accumulated into dio->result
blkbits);
if (retval) {
dio_cleanup(dio, &sdio);
break;
}
} /* end iovec loop */
if (retval == -ENOTBLK) {
/*
* The remaining part of the request will be
* be handled by buffered I/O when we return
*/
retval = 0;
}
/*
* There may be some unwritten disk at the end of a part-written
* fs-block-sized block. Go zero that now.
*/
//the point here: for a write whose tail is not aligned to the filesystem block size, the block allocated on disk by get_block_t
//can be larger than what was actually written (one fs block may span several device blocks); zero out the unwritten part of that block here
dio_zero_block(dio, &sdio, 1, &map_bh);
if (sdio.cur_page) {
ssize_t ret2;
ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
if (retval == 0)
retval = ret2;
page_cache_release(sdio.cur_page);
sdio.cur_page = NULL;
}
if (sdio.bio)
dio_bio_submit(dio, &sdio);
blk_finish_plug(&plug);
/*
* It is possible that, we return short IO due to end of file.
* In that case, we need to release all the pages we got hold on.
*/
dio_cleanup(dio, &sdio);
/*
* All block lookups have been performed. For READ requests
* we can let i_mutex go now that its achieved its purpose
* of protecting us from looking up uninitialized blocks.
*/
if (rw == READ && (dio->flags & DIO_LOCKING))
mutex_unlock(&dio->inode->i_mutex);
/*
* The only time we want to leave bios in flight is when a successful
* partial aio read or full aio write have been setup. In that case
* bio completion will call aio_complete. The only time it's safe to
* call aio_complete is when we return -EIOCBQUEUED, so we key on that.
* This had *better* be the only place that raises -EIOCBQUEUED.
*/
//at this point the bios have only been submitted; the actual disk IO is not necessarily finished yet
BUG_ON(retval == -EIOCBQUEUED);
if (dio->is_async && retval == 0 && dio->result &&
((rw == READ) || (dio->result == sdio.size)))
retval = -EIOCBQUEUED;
if (retval != -EIOCBQUEUED)
dio_await_completion(dio); //for the synchronous case, wait here for the IO to complete before returning
if (drop_refcount(dio) == 0) {
retval = dio_complete(dio, offset, retval, false);
} else
BUG_ON(retval != -EIOCBQUEUED);
out:
return retval;
}
do_blockdev_direct_IO is long, but there is not that much to it: for each contiguous chunk of user space it sets up sdio and dio and calls do_direct_IO; for a plain read/write system call the iov has only one element anyway. sdio and dio look like structures dedicated to direct IO; they track the user pages, the block mapping, and the bio submission.
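Before moving on, a quick worked example (made-up numbers) of the pages_in_io / total_pages bookkeeping above, for a single iov with iov_base = 0x601200 (not page aligned), iov_len = 8192 and PAGE_SIZE = 4096:

pages_in_io += (0x601200 + 8192 + 4095) / 4096 - 0x601200 / 4096 = 0x604 - 0x601 = 3
total_pages:  0x601200 & 0xfff = 0x200 != 0, so total_pages = 1 and bytes -= 4096 - 0x200, leaving bytes = 4608
              total_pages += (4608 + 4095) / 4096 = 2, giving total_pages = 3

So an 8 KiB request that starts 0x200 bytes into a page spans three user pages, and three is what both counters agree on. With that settled, on to do_direct_IO.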
/*
* Walk the user pages, and the file, mapping blocks to disk and generating
* a sequence of (page,offset,len,block) mappings. These mappings are injected
* into submit_page_section(), which takes care of the next stage of submission
*
* Direct IO against a blockdev is different from a file. Because we can
* happily perform page-sized but 512-byte aligned IOs. It is important that
* blockdev IO be able to have fine alignment and large sizes.
*
* So what we do is to permit the ->get_block function to populate bh.b_size
* with the size of IO which is permitted at this offset and this i_blkbits.
*
* For best results, the blockdev should be set up with 512-byte i_blkbits and
* it should set b_size to PAGE_SIZE or more inside get_block(). This gives
* fine alignment but still allows this function to work in PAGE_SIZE units.
*/
//the comment above spells out what this function does: walk the user pages and the file, produce a series of (page, offset, len, block) mappings (the block side carried in the map_bh buffer_head), and hand them to submit_page_section for the next stage of submission.
static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
struct buffer_head *map_bh)
{
const unsigned blkbits = sdio->blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits; //literally: how many blocks fit in one page, "block" meaning whatever blkbits was set to by the caller
struct page *page;
unsigned block_in_page;
int ret = 0;
/* The I/O can start at any block offset within the first page */
//sdio->first_block_in_page holds the index, within its user page, of the first block of the user buffer
//the IO can start at any block inside that page; see the while loops below for the details
block_in_page = sdio->first_block_in_page;
while (sdio->block_in_file < sdio->final_block_in_request) { //walk from the first block covered by the iov to the last
//get the page that backs the current block; this is called on every outer iteration because the inner loop below walks all the blocks inside that page. This is where the flaw gets introduced.
page = dio_get_page(dio, sdio);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
}
while (block_in_page < blocks_per_page) { //walk every block inside this page
...
//a fair amount of state is maintained here and it is not easy to follow; skipping the details, the gist is that each chunk is handed to submit_page_section, which pushes the bio down to the next layer
ret = submit_page_section(dio, sdio, page,
offset_in_page,
this_chunk_bytes,
sdio->next_block_for_io,
map_bh);
...
}
/* Drop the ref which was taken in get_user_pages() */
page_cache_release(page);
block_in_page = 0; //done with this page, reset the in-page block index
}
out:
return ret;
}
The part worth a closer look here is dio_get_page, which supplies the pages used for the bios. Where do those pages come from? Back when reading the buffered IO code this bothered me: for buffered IO the bio pages naturally come from the file's address_space, but direct IO does not go through the cache, and allocating temporary pages would be slow.
/*
* Get another userspace page. Returns an ERR_PTR on error. Pages are
* buffered inside the dio so that we can call get_user_pages() against a
* decent number of pages, less frequently. To provide nicer use of the
* L1 cache.
*/
//get another page of the user buffer; this tries to grab a whole batch of the needed pages at once and keeps them in sdio, but returns only one page per call
static inline struct page *dio_get_page(struct dio *dio,
struct dio_submit *sdio)
{
if (dio_pages_present(sdio) == 0) {//dio_pages_present returns how many pages sit in sdio's queue; if none are left, refill it
int ret;
ret = dio_refill_pages(dio, sdio);//dio_refill_pages tops up sdio's page queue
if (ret)
return ERR_PTR(ret);
BUG_ON(dio_pages_present(sdio) == 0);
}
return dio->pages[sdio->head++];//hand back the page at the head of the queue
}
/*
* Go grab and pin some userspace pages. Typically we'll get 64 at a time.
*/
static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
int ret;
int nr_pages;
nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); //how many pages to grab this round, capped at DIO_PAGES
ret = get_user_pages_fast( //pin the current process's user pages; the fast variant avoids taking mmap_sem on its fast path
sdio->curr_user_address, /* Where from? */
nr_pages, /* How many pages? */
dio->rw == READ, /* Write to memory? */
&dio->pages[0]); /* Put results here */
if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
struct page *page = ZERO_PAGE(0);
/*
* A memory fault, but the filesystem has some outstanding
* mapped blocks. We need to use those blocks up to avoid
* leaking stale data in the file.
*/
//for a direct write, the user-supplied buffer may have no page mapped behind it yet (a page is normally only mapped when the process touches that memory and faults); in that case fall back to the fixed zero page, so that blocks the filesystem has already mapped get consumed instead of exposing stale data
if (dio->page_errors == 0)
dio->page_errors = ret;
page_cache_get(page);
dio->pages[0] = page;
sdio->head = 0;
sdio->tail = 1;
ret = 0;
goto out;
}
if (ret >= 0) {
sdio->curr_user_address += ret * PAGE_SIZE; //curr_user_address tracks the start of the not-yet-pinned part of the user iov
sdio->curr_page += ret; //curr_page counts how many pages of the user iov have been pinned so far
sdio->head = 0;
sdio->tail = ret;
ret = 0;
}
out:
return ret;
}
As we can see, dio_get_page ultimately uses get_user_pages_fast to grab the very pages backing the user-supplied buffer and does direct IO against them, so the data can reach the disk without the extra copy through the page cache that buffered IO would require.
Using the user's pages directly has its benefits, but it also opens up a potentially nasty pit. For a direct read the user's own pages are used here, and if, between dio_get_page pinning the page and the bio going down and completing, the process triggers copy-on-write and the page table entry gets swapped for a new page, things get awkward. How to avoid the pit? Three ways: first, make every memory allocation in a multithreaded process page aligned; second, do not fork in a multithreaded environment, which includes system(), popen() and the like; third, do not issue direct IO reads in a multithreaded environment.
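Here is a hedged sketch of the hazardous pattern (the path and sizes are made up, and error handling is stripped); whether it actually corrupts data depends on timing and kernel version, so treat it as an illustration of the race rather than a reliable reproducer:

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

static void *reader(void *arg)
{
    int fd = open("/tmp/dio_test", O_RDONLY | O_DIRECT);
    void *buf;

    (void)arg;
    if (fd < 0 || posix_memalign(&buf, 4096, 4096))
        return NULL;
    /* direct read: the kernel pins the physical page behind buf and DMAs
     * into it while this thread keeps running */
    pread(fd, buf, 4096, 0);
    free(buf);
    close(fd);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, reader, NULL);
    /* if this fork lands between get_user_pages_fast() and bio completion,
     * the buffer's page becomes copy-on-write; the first write the parent
     * makes to that page switches it to a new physical page, while the DMA
     * still targets the old one, now visible only to the child */
    if (fork() == 0)
        _exit(0);
    pthread_join(t, NULL);
    return 0;
}

The three workarounds above all amount to the same thing: make sure no copy-on-write can hit a page while a direct IO read is in flight on it.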
Below submit_page_section, direct IO drops into the bio layer, where there is nothing special left to study, so this is where the analysis ends.