Background

Once you are familiar with a file system's data layout, understanding its main IO paths (especially the write path) is the next hard nut you have to crack to really master it. F2FS is no exception; its main IO-related stages are: write submission, checkpoint, and GC.

Write submission

IO-related data structures

The two most important data structures are shown below:

 struct f2fs_io_info {
        struct f2fs_sb_info *sbi;       /* f2fs_sb_info pointer */
        nid_t ino;              /* inode number */
        enum page_type type;    /* contains DATA/NODE/META/META_FLUSH */
        enum temp_type temp;    /* contains HOT/WARM/COLD */
        int op;                 /* contains REQ_OP_ */
        int op_flags;           /* req_flag_bits */
        block_t new_blkaddr;    /* new block address to be written */
        block_t old_blkaddr;    /* old block address before Cow */
        struct page *page;      /* page to be written */
        struct page *encrypted_page;    /* encrypted page */
        struct list_head list;          /* serialize IOs */
        bool submitted;         /* indicate IO submission */
        int need_lock;          /* indicate we need to lock cp_rwsem */
        bool in_list;           /* indicate fio is in io_list */
        bool is_meta;           /* indicate borrow meta inode mapping or not */
        bool retry;             /* need to reallocate block address */
        enum iostat_type io_type;       /* io type */
        struct writeback_control *io_wbc; /* writeback control */
        unsigned char version;          /* version of the node */
};

#define is_read_io(rw) ((rw) == READ)
struct f2fs_bio_info {
        struct f2fs_sb_info *sbi;       /* f2fs superblock */
        struct bio *bio;                /* bios to merge */
        sector_t last_block_in_bio;     /* last block number */
        struct f2fs_io_info fio;        /* store buffered io info. */
        struct rw_semaphore io_rwsem;   /* blocking op for bio */
        spinlock_t io_lock;             /* serialize DATA/NODE IOs */
        struct list_head io_list;       /* track fios */
};
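The last_block_in_bio field is what allows f2fs to keep one open bio per (type, temp) pair and merge consecutive page writes into it. Below is a minimal userspace model of that merge decision, not the kernel code itself; the names and the contiguity rule are simplified from f2fs_submit_page_write():

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int block_t;

/* Models the two f2fs_bio_info fields the merge decision depends on. */
struct bio_ctx {
        bool has_bio;                   /* models io->bio != NULL */
        block_t last_block_in_bio;
};

/* A new page may join the pending bio only if its block address directly
 * follows the last block already in it; otherwise the pending bio has to
 * be submitted first (what __submit_merged_bio() does in f2fs). */
static bool can_merge(const struct bio_ctx *io, block_t new_blkaddr)
{
        return io->has_bio && io->last_block_in_bio + 1 == new_blkaddr;
}

int main(void)
{
        struct bio_ctx io = { .has_bio = true, .last_block_in_bio = 100 };

        printf("blk 101: %s\n", can_merge(&io, 101) ? "merge" : "submit + new bio");
        printf("blk 105: %s\n", can_merge(&io, 105) ? "merge" : "submit + new bio");
        return 0;
}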

Interaction with the page cache

The main entry points are implemented behind f2fs_file_operations, declared in the header f2fs.h.

fs/f2fs/f2fs.h lists the operation tables for the main objects: directories, files, inodes and so on, as shown below:

extern const struct file_operations f2fs_dir_operations;
extern const struct file_operations f2fs_file_operations;
extern const struct inode_operations f2fs_file_inode_operations;
extern const struct address_space_operations f2fs_dblock_aops;
extern const struct address_space_operations f2fs_node_aops;
extern const struct address_space_operations f2fs_meta_aops;
extern const struct inode_operations f2fs_dir_inode_operations;
extern const struct inode_operations f2fs_symlink_inode_operations;
extern const struct inode_operations f2fs_encrypted_symlink_inode_operations;
extern const struct inode_operations f2fs_special_inode_operations;

Among these, the file-related table is f2fs_file_operations, which supports the standard write, read, seek, open, etc. operations. See fs/f2fs/file.c:

const struct file_operations f2fs_file_operations = {
        .llseek         = f2fs_llseek,
        .read_iter      = generic_file_read_iter,
        .write_iter     = f2fs_file_write_iter,
        .open           = f2fs_file_open,
        .release        = f2fs_release_file,
        .mmap           = f2fs_file_mmap,
        .flush          = f2fs_file_flush,
        .fsync          = f2fs_sync_file,
        .fallocate      = f2fs_fallocate,
        .unlocked_ioctl = f2fs_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = f2fs_compat_ioctl,
#endif
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
};

The f2fs_file_write_iter above goes through the kernel's page cache machinery; for example, on the direct-write branch the dirty pages covering the range are flushed first:
f2fs_file_write_iter->__generic_file_write_iter->generic_file_direct_write->filemap_write_and_wait_range->__filemap_fdatawrite_range->do_writepages->generic_writepages->write_cache_pages->__writepage.
(Strictly, do_writepages prefers the a_ops->writepages callback when one is registered, as f2fs does; the generic_writepages leg above is the fallback, but both converge on the same per-page write-out.)

Eventually this invokes the writepage hooks that f2fs registered with the page cache: f2fs_write_meta_page / f2fs_write_node_page / f2fs_write_data_page, which write the meta, node and data regions respectively.

bash:~/workspace/linux-4.19.10/fs/f2fs$ grep ".writepage" * -r
checkpoint.c: .writepage = f2fs_write_meta_page,
checkpoint.c: .writepages = f2fs_write_meta_pages,
data.c: .writepage = f2fs_write_data_page,
data.c: .writepages = f2fs_write_data_pages,
node.c: .writepage = f2fs_write_node_page,
node.c: .writepages = f2fs_write_node_pages,
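From userspace, an ordinary buffered write followed by fsync() is enough to drive a data page through the registered f2fs_write_data_page hook above (fsync lands in f2fs_sync_file, which also writes back the related node pages). A minimal sketch; the mount point /mnt/f2fs is an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Assumes an f2fs filesystem is mounted at /mnt/f2fs. */
        int fd = open("/mnt/f2fs/demo.txt", O_CREAT | O_WRONLY | O_TRUNC, 0644);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        const char msg[] = "hello f2fs";
        if (write(fd, msg, strlen(msg)) < 0)    /* dirties one data page */
                perror("write");

        if (fsync(fd) < 0)                      /* forces data + node writeback */
                perror("fsync");

        close(fd);
        return 0;
}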

Note that f2fs uses different IO modes for the meta/node/data regions; see the comment below:

/*
 * The below are the page types of bios used in submit_bio().
 * The available types are:
 * DATA                 User data pages. It operates as async mode.
 * NODE                 Node pages. It operates as async mode.
 * META                 FS metadata pages such as SIT, NAT, CP.
 * NR_PAGE_TYPE         The number of page types.
 * META_FLUSH           Make sure the previous pages are written
 *                      with waiting the bio's completion
 * ...                  Only can be used with META.
 */
#define PAGE_TYPE_OF_BIO(type)  ((type) > META ? META : (type))
enum page_type {
        DATA,
        NODE,
        META,
        NR_PAGE_TYPE,
        META_FLUSH,
        INMEM,          /* the below types are used by tracepoints only. */
        INMEM_DROP,
        INMEM_INVALIDATE,
        INMEM_REVOKE,
        IPU,
        OPU,
};
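The PAGE_TYPE_OF_BIO macro folds META_FLUSH (and the tracepoint-only types below it) back onto the three real bio types, so a META_FLUSH request is accounted against META. A quick userspace check of the macro, with the tracepoint-only values omitted:

#include <stdio.h>

enum page_type { DATA, NODE, META, NR_PAGE_TYPE, META_FLUSH };
#define PAGE_TYPE_OF_BIO(type)  ((type) > META ? META : (type))

int main(void)
{
        printf("DATA       -> %d\n", PAGE_TYPE_OF_BIO(DATA));       /* 0 = DATA */
        printf("NODE       -> %d\n", PAGE_TYPE_OF_BIO(NODE));       /* 1 = NODE */
        printf("META_FLUSH -> %d\n", PAGE_TYPE_OF_BIO(META_FLUSH)); /* 2 = META */
        return 0;
}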

Let's look at how each of the three write types interacts with the page cache.

write_meta_page

Start with write_meta_page:
f2fs_write_meta_page->__f2fs_write_meta_page->f2fs_do_write_meta_page() (builds the fio)->f2fs_submit_page_write()->__submit_merged_bio()->__submit_bio()

The main flow, in f2fs_do_write_meta_page, looks like this:

void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
                                        enum iostat_type io_type)
{
        struct f2fs_io_info fio = {
                .sbi = sbi,
                .type = META,
                .temp = HOT,
                .op = REQ_OP_WRITE,
                .op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
                .old_blkaddr = page->index,
                .new_blkaddr = page->index,
                .page = page,
                .encrypted_page = NULL,
                .in_list = false,
        };

        if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
                fio.op_flags &= ~REQ_META;

        set_page_writeback(page);
        ClearPageError(page);
        f2fs_submit_page_write(&fio);

        f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE);
}

The IO request is finally handed down to the generic block layer via __submit_merged_bio(). Note that old_blkaddr == new_blkaddr == page->index here: meta pages are updated in place. Once this path returns, the meta data has been pushed out toward disk via writeback, not merely left dirty in the page cache.

write_node_page

f2fs_write_node_page calls __write_node_page(), which first builds a NODE-type f2fs_io_info, then marks the page as under writeback and calls the write-out path directly:

        .............
        set_page_writeback(page);
        ClearPageError(page);

        if (f2fs_in_warm_node_list(sbi, page)) {
                seq = f2fs_add_fsync_node_entry(sbi, page);
                if (seq_id)
                        *seq_id = seq;
        }

        fio.old_blkaddr = ni.blk_addr;
        f2fs_do_write_node_page(nid, &fio);
        set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
        dec_page_count(sbi, F2FS_DIRTY_NODES);
        up_read(&sbi->node_write);

As with meta, for NODE-type data the page has been written out by the time this path returns, rather than left dirty in the page cache.
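The snippet above also shows the log-structured nature of node writes: the page goes to a freshly allocated block (fio.new_blkaddr), after which set_node_addr() repoints the NAT entry for this nid. A toy userspace model of that out-of-place update, with simplified names that are not the kernel API:

#include <stdio.h>

#define NR_NIDS 16

static unsigned int nat[NR_NIDS];          /* nid -> block address */
static unsigned int next_free_blk = 5000;  /* naive log-head allocator */

static void write_node_page(unsigned int nid)
{
        unsigned int old_blkaddr = nat[nid];
        unsigned int new_blkaddr = next_free_blk++;  /* never overwrite in place */

        printf("node %u: blk %u -> blk %u\n", nid, old_blkaddr, new_blkaddr);
        nat[nid] = new_blkaddr;                      /* models set_node_addr() */
}

int main(void)
{
        nat[3] = 1234;          /* pretend node 3 currently lives at blk 1234 */
        write_node_page(3);     /* each rewrite moves the node, NAT follows */
        write_node_page(3);
        return 0;
}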

write_data_page

The data-related operations live mainly in fs/f2fs/data.c:

const struct address_space_operations f2fs_dblock_aops = {
        .readpage       = f2fs_read_data_page,
        .readpages      = f2fs_read_data_pages,
        .writepage      = f2fs_write_data_page,
        .writepages     = f2fs_write_data_pages,
        .write_begin    = f2fs_write_begin, // prepare the page frame
        .write_end      = f2fs_write_end, // calls set_page_dirty
        .set_page_dirty = f2fs_set_data_page_dirty,
        .invalidatepage = f2fs_invalidate_page,
        .releasepage    = f2fs_release_page,
        .direct_IO      = f2fs_direct_IO,
        .bmap           = f2fs_bmap,
#ifdef CONFIG_MIGRATION
        .migratepage    = f2fs_migrate_page,
#endif
};

f2fs_write_data_page calls __write_data_page(), which likewise builds a DATA-type f2fs_io_info. For drop-cache inodes the page is simply dropped; otherwise the inode's dirty page count is decremented (inode_dec_dirty_pages), and since DATA bios operate in async mode, the data actually reaches disk when the kernel's background page cache flusher threads write it out:

        if (f2fs_is_drop_cache(inode))
                goto out;
        ........
        inode_dec_dirty_pages(inode);
        if (err)
                ClearPageUptodate(page);

        if (wbc->for_reclaim) {
                f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA);
                clear_inode_flag(inode, FI_HOT_DATA);
                f2fs_remove_dirty_inode(inode);
                submitted = NULL;
        }
        ......
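The bookkeeping in this path is plain counting: dirtying a data page bumps a per-inode counter, and __write_data_page drops it via inode_dec_dirty_pages once the page is handed off. A toy model of that accounting (userspace C, simplified names, not the kernel API):

#include <stdio.h>

struct toy_inode {
        int dirty_pages;        /* models the per-inode dirty page count */
};

static void set_page_dirty(struct toy_inode *inode)
{
        inode->dirty_pages++;   /* buffered write dirties a page */
}

static void write_data_page(struct toy_inode *inode)
{
        inode->dirty_pages--;   /* models inode_dec_dirty_pages() */
}

int main(void)
{
        struct toy_inode inode = { 0 };

        set_page_dirty(&inode);
        set_page_dirty(&inode);
        printf("dirty pages after writes: %d\n", inode.dirty_pages);  /* 2 */

        write_data_page(&inode);  /* background writeback flushes one page */
        printf("dirty pages after flush: %d\n", inode.dirty_pages);   /* 1 */
        return 0;
}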

check point

The Main area can be thought of as the region that journals data and node writes;
the SSA (Segment Summary Area) can be thought of as the journal region for the NAT/SIT;
a checkpoint takes the updates that have been journaled there but not yet applied to the NAT/SIT, and writes them back to their respective areas.

Because of f2fs's log-structured nature, every data block write requires corresponding updates to the direct node, the NAT and the SIT. For the NAT and SIT areas in particular, modifying a single entry of a few bytes would force rewriting an entire page, which would seriously hurt both file system performance and SSD lifetime. f2fs therefore uses a journal mechanism to reduce the number of NAT and SIT writes: the NAT and SIT changes are recorded in f2fs_summary_block, and only when a checkpoint is written are the dirty SIT and NAT regions written back.
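To make the batching concrete, here is a userspace model of the idea, not the kernel's actual journal format: small NAT updates accumulate (and coalesce) in an in-memory journal, and only a checkpoint turns them into real page writes:

#include <stdio.h>

#define NAT_JOURNAL_ENTRIES 8

struct nat_jentry {
        unsigned int nid;
        unsigned int blkaddr;
};

struct nat_journal {
        struct nat_jentry entries[NAT_JOURNAL_ENTRIES];
        int n_used;
};

/* Record one NAT change; repeated updates to the same nid coalesce. */
static int journal_update(struct nat_journal *j, unsigned int nid,
                          unsigned int blkaddr)
{
        for (int i = 0; i < j->n_used; i++) {
                if (j->entries[i].nid == nid) {
                        j->entries[i].blkaddr = blkaddr;
                        return 0;
                }
        }
        if (j->n_used == NAT_JOURNAL_ENTRIES)
                return -1;      /* journal full: caller must checkpoint */
        j->entries[j->n_used].nid = nid;
        j->entries[j->n_used].blkaddr = blkaddr;
        j->n_used++;
        return 0;
}

/* At checkpoint time the dirty NAT pages are really rewritten. */
static void checkpoint_flush(struct nat_journal *j)
{
        printf("checkpoint: flushing %d journaled NAT updates\n", j->n_used);
        j->n_used = 0;
}

int main(void)
{
        struct nat_journal j = { .n_used = 0 };

        journal_update(&j, 3, 1000);
        journal_update(&j, 3, 1001);   /* coalesces: still one entry */
        journal_update(&j, 7, 2000);
        checkpoint_flush(&j);          /* one flush instead of three page writes */
        return 0;
}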

GC

Once the checkpoint above is done, the NAT/SIT journal data that has been applied no longer needs to be kept around, and the space can be reclaimed; that job can be handed to GC.

GC works in units of sections.
There are two victim-selection policies (a toy sketch of the first follows below):
- the section with the fewest valid blocks: used by the foreground cleaning process (greedy);
- the section whose blocks have the oldest average age: used by the background cleaning process (cost-benefit).
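A toy sketch of the greedy (foreground) policy; the section count and valid-block numbers are made up for illustration:

#include <stdio.h>

#define NR_SECTIONS 4

int main(void)
{
        /* Valid-block counts per section; fewer valid blocks means
         * less data to migrate before the section can be erased. */
        int valid_blocks[NR_SECTIONS] = { 480, 12, 300, 75 };
        int victim = 0;

        for (int i = 1; i < NR_SECTIONS; i++)
                if (valid_blocks[i] < valid_blocks[victim])
                        victim = i;

        printf("victim: section %d (%d valid blocks to migrate)\n",
               victim, valid_blocks[victim]);
        return 0;
}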