Background
Once you are familiar with a file system's data layout, understanding its main IO paths (especially the write path) is the bone you have to chew through to master it. F2FS is no exception; its main IO-related stages are: write submission, checkpoint, and GC.
Write submission
IO-related data structures
The two most important data structures are:
struct f2fs_io_info {
	struct f2fs_sb_info *sbi;	/* f2fs_sb_info pointer */
	nid_t ino;			/* inode number */
	enum page_type type;		/* contains DATA/NODE/META/META_FLUSH */
	enum temp_type temp;		/* contains HOT/WARM/COLD */
	int op;				/* contains REQ_OP_ */
	int op_flags;			/* req_flag_bits */
	block_t new_blkaddr;		/* new block address to be written */
	block_t old_blkaddr;		/* old block address before Cow */
	struct page *page;		/* page to be written */
	struct page *encrypted_page;	/* encrypted page */
	struct list_head list;		/* serialize IOs */
	bool submitted;			/* indicate IO submission */
	int need_lock;			/* indicate we need to lock cp_rwsem */
	bool in_list;			/* indicate fio is in io_list */
	bool is_meta;			/* indicate borrow meta inode mapping or not */
	bool retry;			/* need to reallocate block address */
	enum iostat_type io_type;	/* io type */
	struct writeback_control *io_wbc; /* writeback control */
	unsigned char version;		/* version of the node */
};
#define is_read_io(rw) ((rw) == READ)
struct f2fs_bio_info {
	struct f2fs_sb_info *sbi;	/* f2fs superblock */
	struct bio *bio;		/* bios to merge */
	sector_t last_block_in_bio;	/* last block number */
	struct f2fs_io_info fio;	/* store buffered io info. */
	struct rw_semaphore io_rwsem;	/* blocking op for bio */
	spinlock_t io_lock;		/* serialize DATA/NODE IOs */
	struct list_head io_list;	/* track fios */
};
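For context, f2fs keeps one f2fs_bio_info per page type in the superblock info, and for DATA/NODE additionally one per temperature, so only IO of the same type and temperature is merged into a single bio. Roughly, in 4.19 (fs/f2fs/f2fs.h and fs/f2fs/super.c, abridged):

struct f2fs_sb_info {
	...
	struct f2fs_bio_info *write_io[NR_PAGE_TYPE];	/* for write bios */
	...
};

/* allocation in f2fs_fill_super(): META gets a single queue, while
 * DATA/NODE each get one queue per temperature (HOT/WARM/COLD) */
for (i = 0; i < NR_PAGE_TYPE; i++) {
	int n = (i == META) ? 1 : NR_TEMP_TYPE;

	sbi->write_io[i] = f2fs_kmalloc(sbi,
				array_size(n, sizeof(struct f2fs_bio_info)),
				GFP_KERNEL);
	...
}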
Interaction with the page cache
The main entry points are implemented through f2fs_file_operations, declared in f2fs.h. fs/f2fs/f2fs.h lists the major operation tables for directories, files, inodes and the other core data structures, as shown below:
extern const struct file_operations f2fs_dir_operations;
extern const struct file_operations f2fs_file_operations;
extern const struct inode_operations f2fs_file_inode_operations;
extern const struct address_space_operations f2fs_dblock_aops;
extern const struct address_space_operations f2fs_node_aops;
extern const struct address_space_operations f2fs_meta_aops;
extern const struct inode_operations f2fs_dir_inode_operations;
extern const struct inode_operations f2fs_symlink_inode_operations;
extern const struct inode_operations f2fs_encrypted_symlink_inode_operations;
extern const struct inode_operations f2fs_special_inode_operations;
Among these, the file-related table is f2fs_file_operations, which supports the standard write, read, seek, open, etc. operations. See fs/f2fs/file.c for the implementation:
const struct file_operations f2fs_file_operations = {
	.llseek		= f2fs_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= f2fs_file_write_iter,
	.open		= f2fs_file_open,
	.release	= f2fs_release_file,
	.mmap		= f2fs_file_mmap,
	.flush		= f2fs_file_flush,
	.fsync		= f2fs_sync_file,
	.fallocate	= f2fs_fallocate,
	.unlocked_ioctl	= f2fs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= f2fs_compat_ioctl,
#endif
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
};
Of these, f2fs_file_write_iter passes through the kernel's page cache machinery. Taking the direct-write path (which first flushes any dirty cached pages in the range) as an example:
f2fs_file_write_iter -> __generic_file_write_iter -> generic_file_direct_write -> filemap_write_and_wait_range -> __filemap_fdatawrite_range -> do_writepages.
do_writepages then dispatches to the ->writepages callback that f2fs registered with the page cache (falling back to generic_writepages -> write_cache_pages -> ->writepage when no ->writepages is set), ultimately reaching the per-page writepage functions f2fs_write_meta_page / f2fs_write_node_page / f2fs_write_data_page, which write the meta (CP/SIT/NAT/SSA), node, and data regions respectively.
#bash:~/workspace/linux-4.19.10/fs/f2fs$grep ".writepage" * -r
checkpoint.c: .writepage = f2fs_write_meta_page,
checkpoint.c: .writepages = f2fs_write_meta_pages,
data.c: .writepage = f2fs_write_data_page,
data.c: .writepages = f2fs_write_data_pages,
node.c: .writepage = f2fs_write_node_page,
node.c: .writepages = f2fs_write_node_pages,
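The dispatch point in the generic writeback code is do_writepages (mm/page-writeback.c, 4.19, abridged), which is why f2fs's ->writepages implementations are tried first, with write_cache_pages/->writepage as the fallback:

int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;

	if (wbc->nr_to_write <= 0)
		return 0;
	while (1) {
		if (mapping->a_ops->writepages)
			ret = mapping->a_ops->writepages(mapping, wbc);
		else
			ret = generic_writepages(mapping, wbc);
		if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
			break;
		cond_resched();
		congestion_wait(BLK_RW_ASYNC, HZ/50);
	}
	return ret;
}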
Note that f2fs handles IO to the meta/node/data regions differently; see the comment below:
/*
* The below are the page types of bios used in submit_bio().
* The available types are:
* DATA User data pages. It operates as async mode.
* NODE Node pages. It operates as async mode.
* META FS metadata pages such as SIT, NAT, CP.
* NR_PAGE_TYPE The number of page types.
* META_FLUSH Make sure the previous pages are written
* with waiting the bio's completion
* ... Only can be used with META.
*/
#define PAGE_TYPE_OF_BIO(type)	((type) > META ? META : (type))
enum page_type {
	DATA,
	NODE,
	META,
	NR_PAGE_TYPE,
	META_FLUSH,
	INMEM,		/* the below types are used by tracepoints only. */
	INMEM_DROP,
	INMEM_INVALIDATE,
	INMEM_REVOKE,
	IPU,
	OPU,
};
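Each page type thus gets its own merged write stream, and META_FLUSH is used when the caller must also wait out previously merged bios. Flushing all the streams funnels through f2fs_submit_merged_write; in 4.19 (fs/f2fs/data.c) this looks roughly like:

void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type)
{
	/* force-submit whatever is sitting in the merged bio for this type */
	__submit_merged_write_cond(sbi, NULL, 0, 0, type, true);
}

void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
{
	f2fs_submit_merged_write(sbi, DATA);
	f2fs_submit_merged_write(sbi, NODE);
	f2fs_submit_merged_write(sbi, META);
}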
Let's look at each of the three write types and how they interact with the page cache.
write_meta_page
First, write_meta_page:
f2fs_write_meta_page -> __f2fs_write_meta_page -> f2fs_do_write_meta_page() (builds the fio) -> f2fs_submit_page_write() -> __submit_merged_bio() -> __submit_bio()
The core of this path, f2fs_do_write_meta_page, looks like this:
void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
					enum iostat_type io_type)
{
	struct f2fs_io_info fio = {
		.sbi = sbi,
		.type = META,
		.temp = HOT,
		.op = REQ_OP_WRITE,
		.op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
		.old_blkaddr = page->index,
		.new_blkaddr = page->index,
		.page = page,
		.encrypted_page = NULL,
		.in_list = false,
	};

	if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
		fio.op_flags &= ~REQ_META;

	set_page_writeback(page);
	ClearPageError(page);
	f2fs_submit_page_write(&fio);

	f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE);
}
The IO request is finally thrown down to the generic block layer via __submit_merged_bio. Once the call above returns, the meta page has been submitted for writeback; strictly speaking, the bio may still sit in the per-type f2fs_bio_info waiting for more pages to merge, until something (such as a META_FLUSH) forces it out.
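The hand-off itself is small; roughly, in 4.19 (fs/f2fs/data.c, abridged):

static void __submit_merged_bio(struct f2fs_bio_info *io)
{
	struct f2fs_io_info *fio = &io->fio;

	if (!io->bio)
		return;

	/* stamp the merged bio with op/flags, then hand it to the block layer */
	bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
	...
	__submit_bio(io->sbi, io->bio, fio->type);
	io->bio = NULL;
}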
write_node_page
f2fs_write_node_page calls __write_node_page(), which first builds a NODE-type f2fs_io_info, marks the page as under writeback, and calls the write-out path directly:
	.............
	set_page_writeback(page);
	ClearPageError(page);

	if (f2fs_in_warm_node_list(sbi, page)) {
		seq = f2fs_add_fsync_node_entry(sbi, page);
		if (seq_id)
			*seq_id = seq;
	}

	fio.old_blkaddr = ni.blk_addr;
	f2fs_do_write_node_page(nid, &fio);
	set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
	dec_page_count(sbi, F2FS_DIRTY_NODES);
	up_read(&sbi->node_write);
As you can see, for NODE-type pages the data has likewise been handed off to the block layer for writeback by the time the code above returns.
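f2fs_do_write_node_page itself is a thin wrapper; roughly, in 4.19 (fs/f2fs/segment.c, abridged) it records the nid in the segment summary and then goes through the same merged-bio submission path:

void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio)
{
	struct f2fs_sb_info *sbi = fio->sbi;
	struct f2fs_summary sum;

	set_summary(&sum, nid, 0, 0);
	/* do_write_page() allocates a new block (log-structured, out-of-place)
	 * and then calls f2fs_submit_page_write() */
	do_write_page(&sum, fio);

	f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE);
}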
write_data_page
The data-related operations live mainly in fs/f2fs/data.c:
const struct address_space_operations f2fs_dblock_aops = {
	.readpage	= f2fs_read_data_page,
	.readpages	= f2fs_read_data_pages,
	.writepage	= f2fs_write_data_page,
	.writepages	= f2fs_write_data_pages,
	.write_begin	= f2fs_write_begin,	/* prepare the page frames */
	.write_end	= f2fs_write_end,	/* set_page_dirty */
	.set_page_dirty	= f2fs_set_data_page_dirty,
	.invalidatepage	= f2fs_invalidate_page,
	.releasepage	= f2fs_release_page,
	.direct_IO	= f2fs_direct_IO,
	.bmap		= f2fs_bmap,
#ifdef CONFIG_MIGRATION
	.migratepage	= f2fs_migrate_page,
#endif
};
f2fs_write_data_page calls __write_data_page(), which likewise builds a DATA-type f2fs_io_info. Inodes flagged drop-cache are skipped, the inode's dirty page count is decremented via inode_dec_dirty_pages(), and the write goes through the same merged-bio mechanism; the bio may be held in the per-temperature queue for merging and only forced out later (for example by f2fs_submit_merged_write_cond in the for_reclaim branch below):
	if (f2fs_is_drop_cache(inode))
		goto out;
	........
	inode_dec_dirty_pages(inode);
	if (err)
		ClearPageUptodate(page);

	if (wbc->for_reclaim) {
		f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA);
		clear_inode_flag(inode, FI_HOT_DATA);
		f2fs_remove_dirty_inode(inode);
		submitted = NULL;
	}
	......
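Further down this path, f2fs_do_write_data_page() (fs/f2fs/data.c) chooses between in-place update (IPU) and out-of-place update (OPU), which is exactly what the IPU/OPU tracepoint values in enum page_type above refer to. A heavily abridged sketch of the 4.19 logic:

int f2fs_do_write_data_page(struct f2fs_io_info *fio)
{
	...
	if (ipu_force ||
	    (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr) &&
	     need_inplace_update(fio))) {
		...
		/* IPU: rewrite the block at old_blkaddr */
		err = f2fs_inplace_write_data(fio);
		...
		return err;
	}
	...
	/* OPU: allocate a new block and write there (log-structured) */
	f2fs_outplace_write_data(&dn, fio);
	...
}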
Checkpoint
The Main area can be thought of as the log-structured region where data and node blocks are written; the SSA (Segment Summary Area), through the journal embedded in its summary blocks, serves as a journal region for NAT/SIT updates.
A checkpoint takes the NAT/SIT changes that have been journaled there but not yet written back to the NAT/SIT areas and flushes them to their own regions.
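The overall checkpoint flow in 4.19 (fs/f2fs/checkpoint.c, abridged) matches this description:

int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	...
	err = block_operations(sbi);	/* freeze FS ops, flush dirty dentry/node pages */
	...
	f2fs_flush_nat_entries(sbi, cpc);	/* journaled NAT changes -> NAT area */
	f2fs_flush_sit_entries(sbi, cpc);	/* journaled SIT changes -> SIT area */

	/* unlock all the fs_lock[] in do_checkpoint() */
	err = do_checkpoint(sbi, cpc);	/* write the checkpoint pack itself */
	...
}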
Because of f2fs's log-structured nature, every data block write requires matching updates to the direct node, the NAT, and the SIT. For the NAT and SIT areas in particular, changing a single entry of a few bytes would force a rewrite of the whole page, which would badly hurt both file system performance and SSD lifetime. f2fs therefore uses a journal mechanism to reduce the number of NAT and SIT writes: the changes are recorded in the journal embedded in f2fs_summary_block, and only when a checkpoint is written are the dirty SIT and NAT entries written back.
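The on-disk layout of this journal (include/linux/f2fs_fs.h, abridged) shows how NAT/SIT entries piggyback on the summary block:

struct nat_journal_entry {
	__le32 nid;
	struct f2fs_nat_entry ne;
} __packed;

struct sit_journal_entry {
	__le32 segno;
	struct f2fs_sit_entry se;
} __packed;

struct f2fs_journal {
	union {
		__le16 n_nats;
		__le16 n_sits;
	};
	/* spare area is used by NAT or SIT journals or extra info */
	union {
		struct nat_journal nat_j;
		struct sit_journal sit_j;
		struct f2fs_extra_info info;
	};
} __packed;

/* 4KB-sized summary block structure */
struct f2fs_summary_block {
	struct f2fs_summary entries[ENTRIES_IN_SUM];
	struct f2fs_journal journal;
	struct summary_footer footer;
} __packed;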
GC
After the checkpoint above completes, the space freed up (NAT/SIT journal entries that have been flushed out of the SSA, along with other invalidated blocks) needs to be reclaimed; this work can be handed to GC.
GC works in units of sections.
There are two victim-selection policies (see the sketch below):
the section with the fewest valid blocks (greedy): used by the foreground cleaning process;
the section with the oldest average block age (cost-benefit): used by the background cleaning process.
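These two policies map directly onto the victim cost function; roughly, in 4.19 (fs/f2fs/gc.c, abridged):

static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
			struct victim_sel_policy *p)
{
	if (p->alloc_mode == SSR)
		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;

	/* alloc_mode == LFS */
	if (p->gc_mode == GC_GREEDY)
		return get_valid_blocks(sbi, segno, true);	/* fewest valid blocks wins */
	else
		return get_cb_cost(sbi, segno);	/* cost-benefit: valid ratio plus age */
}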