f2fs 的 checkpoint 维护 data、node 和 meta data(SIT、NAT)的数据一致性:它把缓存在内存中、以及暂存在当前 segment 的 summary block journal 里的 SIT/NAT 条目,分别写回到 SIT/NAT 区域。

checkpoint 相关数据结构

super block区域里记录了checkpoint (CP)的起始block address,以及checkpoint 区域segment 的数量:

struct f2fs_super_block {
        __le32 magic;                   /* Magic Number */
        __le16 major_ver;               /* Major Version */
        __le16 minor_ver;               /* Minor Version */
        __le32 log_sectorsize;          /* log2 sector size in bytes */
        __le32 log_sectors_per_block;   /* log2 # of sectors per block */
        __le32 log_blocksize;           /* log2 block size in bytes */
        __le32 log_blocks_per_seg;      /* log2 # of blocks per segment */
        __le32 segs_per_sec;            /* # of segments per section */
        __le32 secs_per_zone;           /* # of sections per zone */
        __le32 checksum_offset;         /* checksum offset inside super block */
        __le64 block_count;             /* total # of user blocks */
        __le32 section_count;           /* total # of sections */
        __le32 segment_count;           /* total # of segments */
        __le32 segment_count_ckpt;      /* # of segments for checkpoint */
        __le32 segment_count_sit;       /* # of segments for SIT */
        __le32 segment_count_nat;       /* # of segments for NAT */
        __le32 segment_count_ssa;       /* # of segments for SSA */
        __le32 segment_count_main;      /* # of segments for main area */
        __le32 segment0_blkaddr;        /* start block address of segment 0 */
        __le32 cp_blkaddr;              /* start block address of checkpoint */
                .............

check point 包含的信息如下:

#define F2FS_CP_PACKS           2       /* # of checkpoint packs */

/*
 * Checkpoint record.
 *
 * Every multi-byte member uses an explicit little-endian type
 * (__le16/__le32/__le64) and the struct is __packed, so members are laid
 * out back to back in declaration order.
 * NOTE(review): this presumably mirrors the on-disk checkpoint block
 * byte for byte - confirm before reordering or resizing any member.
 */
struct f2fs_checkpoint {
        /* bumped by f2fs_write_checkpoint() before NAT/SIT entries are flushed */
        __le64 checkpoint_ver;          /* checkpoint block version number */
        __le64 user_block_count;        /* # of user blocks */
        __le64 valid_block_count;       /* # of valid blocks in main area */
        __le32 rsvd_segment_count;      /* # of reserved segments for gc */
        __le32 overprov_segment_count;  /* # of overprovision segments */
        __le32 free_segment_count;      /* # of free segments in main area */

        /* information of current node segments */
        /* one (segment number, block offset) pair per active node log */
        __le32 cur_node_segno[MAX_ACTIVE_NODE_LOGS];
        __le16 cur_node_blkoff[MAX_ACTIVE_NODE_LOGS];
        /* information of current data segments */
        /* one (segment number, block offset) pair per active data log */
        __le32 cur_data_segno[MAX_ACTIVE_DATA_LOGS];
        __le16 cur_data_blkoff[MAX_ACTIVE_DATA_LOGS];
        __le32 ckpt_flags;              /* Flags : umount and journal_present */
        __le32 cp_pack_total_block_count;       /* total # of one cp pack */
        __le32 cp_pack_start_sum;       /* start block number of data summary */
        __le32 valid_node_count;        /* Total number of valid nodes */
        __le32 valid_inode_count;       /* Total number of valid inodes */
        __le32 next_free_nid;           /* Next free node number */
        __le32 sit_ver_bitmap_bytesize; /* Default value 64 */
        __le32 nat_ver_bitmap_bytesize; /* Default value 256 */
        __le32 checksum_offset;         /* checksum offset inside cp block */
        __le64 elapsed_time;            /* mounted time */
        /* allocation type of current segment */
        unsigned char alloc_type[MAX_ACTIVE_LOGS];

        /* SIT and NAT version bitmap */
        /*
         * Declared with a single element.
         * NOTE(review): looks like a variable-length trailer whose real size
         * is given by sit_ver_bitmap_bytesize and nat_ver_bitmap_bytesize -
         * confirm against the code that reads it.
         */
        unsigned char sit_nat_version_bitmap[1];
} __packed;

checkpoint 主要工作

把cached 的SIT/NAT 信息写回到相应的SIT / NAT 区域。

主要的工作在 f2fs_write_checkpoint  中完成:

int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
        struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
        unsigned long long ckpt_ver;
        int err = 0;
                ............
                err = block_operations(sbi);
                ............
                /*
         * update checkpoint pack index
         * Increase the version number so that
         * SIT entries and seg summaries are written at correct place
         */
        ckpt_ver = cur_cp_version(ckpt);
        ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);

        /* write cached NAT/SIT entries to NAT/SIT area */
        f2fs_flush_nat_entries(sbi, cpc);
        f2fs_flush_sit_entries(sbi, cpc);

        /* unlock all the fs_lock[] in do_checkpoint() */
        err = do_checkpoint(sbi, cpc);
        if (err)
                f2fs_release_discard_addrs(sbi);
        else
                f2fs_clear_prefree_segments(sbi, cpc);

        unblock_operations(sbi);
        stat_inc_cp_count(sbi->stat_info);
                ......
}

可以看到,上面首先调用 block_operations() 冻结文件系统的操作,并在此过程中把脏的 dentry 页、inode 元数据和 node 页刷下去:

/*
 * Freeze all the FS-operations for checkpoint.
 */
static int block_operations(struct f2fs_sb_info *sbi)
{
......
         /* write all the dirty dentry pages */  //flush dirty dentry pages
        if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
                f2fs_unlock_all(sbi);
                err = f2fs_sync_dirty_inodes(sbi, DIR_INODE);
                if (err)
                        goto out;
                cond_resched();
                goto retry_flush_dents;
        }
                .....
                 if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
                up_write(&sbi->node_change);
                f2fs_unlock_all(sbi);
                err = f2fs_sync_inode_meta(sbi);
                .........

                ...... // flush dirty nodes
                if (get_pages(sbi, F2FS_DIRTY_NODES)) {
                up_write(&sbi->node_write);
                atomic_inc(&sbi->wb_sync_req[NODE]);
                err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
                ......          

接着刷回缓存的 NAT entry到 NAT 区域,这是通过f2fs_flush_nat_entries来实现的:

void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
        struct f2fs_journal *journal = curseg->journal;
        struct nat_entry_set *setvec[SETVEC_SIZE];
        struct nat_entry_set *set, *tmp;
        unsigned int found;
        nid_t set_idx = 0;
        LIST_HEAD(sets);

        /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */
        if (enabled_nat_bits(sbi, cpc)) {
                down_write(&nm_i->nat_tree_lock);
                remove_nats_in_journal(sbi);
                up_write(&nm_i->nat_tree_lock);
        }
                .....
                 /*
         * if there are no enough space in journal to store dirty nat
         * entries, remove all entries from journal and merge them
         * into nat entry set.
         */
        if (enabled_nat_bits(sbi, cpc) ||
                !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
                remove_nats_in_journal(sbi);

        while ((found = __gang_lookup_nat_set(nm_i,
                                        set_idx, SETVEC_SIZE, setvec))) {
                unsigned idx;
                set_idx = setvec[found - 1]->set + 1;
                for (idx = 0; idx < found; idx++)
                        __adjust_nat_entry_set(setvec[idx], &sets,
                                                MAX_NAT_JENTRIES(journal));
        }

        /* flush dirty nats in nat entry set */
        list_for_each_entry_safe(set, tmp, &sets, set_list)
                __flush_nat_entry_set(sbi, set, cpc);

上面的步骤完成之后,还要把缓存的 SIT entry 刷回到 SIT 区域,这是通过 f2fs_flush_sit_entries 来实现的:

void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
        struct sit_info *sit_i = SIT_I(sbi);
        unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
        struct f2fs_journal *journal = curseg->journal;
        struct sit_entry_set *ses, *tmp;
        struct list_head *head = &SM_I(sbi)->sit_entry_set;
        bool to_journal = true;
        struct seg_entry *se;
                ......
                /*
         * if there are no enough space in journal to store dirty sit
         * entries, remove all entries from journal and add and account
         * them in sit entry set.
         */
        if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL))
                remove_sits_in_journal(sbi);

        /*
         * there are two steps to flush sit entries:
         * #1, flush sit entries to journal in current cold data summary block.
         * #2, flush sit entries to sit page.
         */
        list_for_each_entry_safe(ses, tmp, head, set_list) {
                     ......
                          /* flush dirty sit entries in region of current sit set */
                for_each_set_bit_from(segno, bitmap, end) {
                        int offset, sit_offset;

上面把 NAT/SIT 都刷完了之后,会执行最关键的操作, do_checkpoint, 它主要的操作如下:

 /* Flush all the NAT/SIT pages */
        while (get_pages(sbi, F2FS_DIRTY_META)) {
                f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
                if (unlikely(f2fs_cp_error(sbi)))
                        break;
        }
                .....
                 /* write nat bits */
        if (enabled_nat_bits(sbi, cpc)) {
                __u64 cp_ver = cur_cp_version(ckpt);
                block_t blk;

                cp_ver |= ((__u64)crc32 << 32);
                *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);

                blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks;
                for (i = 0; i < nm_i->nat_bits_blocks; i++)
                        f2fs_update_meta_page(sbi, nm_i->nat_bits +
                                        (i << F2FS_BLKSIZE_BITS), blk + i);

                /* Flush all the NAT BITS pages */
                while (get_pages(sbi, F2FS_DIRTY_META)) {
                        f2fs_sync_meta_pages(sbi, META, LONG_MAX,
                                                        FS_CP_META_IO);
                        if (unlikely(f2fs_cp_error(sbi)))
                                break;
                }
        }

        /* write out checkpoint buffer at block 0 */
        f2fs_update_meta_page(sbi, ckpt, start_blk++);

        for (i = 1; i < 1 + cp_payload_blks; i++)
                f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE,
                                                        start_blk++);
             .....
              f2fs_write_data_summaries(sbi, start_blk);
        start_blk += data_sum_blocks;
                .....
                /* update user_block_counts */
        sbi->last_valid_block_count = sbi->total_valid_block_count;
        percpu_counter_set(&sbi->alloc_valid_block_count, 0);

        /* Here, we have one bio having CP pack except cp pack 2 page */
        f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);

        /* wait for previous submitted meta pages writeback */
        f2fs_wait_on_all_pages_writeback(sbi);

        /* flush all device cache */
        err = f2fs_flush_device_cache(sbi);
        if (err)
                return err;

        /* barrier and flush checkpoint cp pack 2 page if it can */
        commit_checkpoint(sbi, ckpt, start_blk);
        f2fs_wait_on_all_pages_writeback(sbi);
                ......

触发 check point的时机

  • gc: 当空闲 segment 不足、并且存在 prefree segment 的时候,可以通过做一次 checkpoint 把这些 prefree segment 变成 free segment,从而不必再触发前台 GC(FG_GC)。
    if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
                /*
                 * For example, if there are many prefree_segments below given
                 * threshold, we can make them free by checkpoint. Then, we
                 * secure free segments which doesn't need fggc any more.
                 */
                if (prefree_segments(sbi)) {
                        ret = f2fs_write_checkpoint(sbi, &cpc);
                        if (ret)
                                goto stop;
                }
                if (has_not_enough_free_secs(sbi, 0, 0))
                        gc_type = FG_GC;
        }
  • recover:
    当 f2fs_recover_fsync_data() 找到有需要 fsync 的 segment 的时候,会调用 f2fs_write_checkpoint。

  • trim:
    f2fs_trim 通过设置CP的reason 是CP_DISCARD, 然后走f2fs_write_checkpoint 实现:

    cpc.reason = CP_DISCARD;
        cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen));
        cpc.trim_start = start_segno;
        cpc.trim_end = end_segno;
    
        if (sbi->discard_blks == 0)
                goto out;
    
        mutex_lock(&sbi->gc_mutex);
        err = f2fs_write_checkpoint(sbi, &cpc);
        mutex_unlock(&sbi->gc_mutex);
  • super block
    super block 的 put_super 接口在 superblock 为 dirty,或者上一次 checkpoint 不是由 umount 完成(没有设置 CP_UMOUNT_FLAG)的情况下,需要通过 f2fs_write_checkpoint() 以 CP_UMOUNT 为 reason 重新做一次 checkpoint,避免留下可能不一致的数据。

    /*
                 .......
         * We don't need to do checkpoint when superblock is clean.
         * But, the previous checkpoint was not done by umount, it needs to do
         * clean checkpoint again.
         */
        if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
                        !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
                struct cp_control cpc = {
                        .reason = CP_UMOUNT,
                };
                f2fs_write_checkpoint(sbi, &cpc);
        }
    
        /* be sure to wait for any on-going discard commands */
        dropped = f2fs_wait_discard_bios(sbi);
    
        if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) {
                struct cp_control cpc = {
                        .reason = CP_UMOUNT | CP_TRIMMED,
                };
                                .....