学习一个文件系统,需要熟悉它的数据layout,为此就必须深入理解layout相关的数据结构。结合本人最近学习f2fs的心得,下面总结了相关的几个最重要的数据结构。

基本概念

block: 4KB对齐且连续的物理存储空间
segment: 2M连续的物理存储空间
section: 由若干连续的segment组成
zone: 由若干连续的section组成

node

node是内部用来定位的。通过下面的数据结构可以看到,f2fs里面的node 主要就是用来记录block的地址。相关的数据结构如下:

/*
 * A node: the payload is exactly one of an inode, a direct node or an
 * indirect node (the three share storage through the anonymous union),
 * and it is always followed by a node_footer.
 */
struct f2fs_node {
        /* can be one of three types: inode, direct, and indirect types */
        union {
                struct f2fs_inode i;            /* payload when the node is an inode */
                struct direct_node dn;          /* payload when the node is a direct node */
                struct indirect_node in;        /* payload when the node is an indirect node */
        };
        struct node_footer footer;      /* trailer placed after the payload */
} __packed;

/*
 * Direct node payload: a table of ADDRS_PER_BLOCK little-endian 32-bit
 * data block addresses.
 */
struct direct_node {
        __le32 addr[ADDRS_PER_BLOCK];   /* array of data block address */
} __packed;

/*
 * Indirect node payload: a table of NIDS_PER_BLOCK little-endian 32-bit
 * entries.
 *
 * NOTE(review): the field comment says "data block address", but the field
 * name (nid) and its size macro (NIDS_PER_BLOCK) suggest the entries are
 * node ids of child nodes rather than data block addresses -- confirm.
 */
struct indirect_node {
        __le32 nid[NIDS_PER_BLOCK];     /* array of data block address */
} __packed;

上面f2fs_node包含了一个union,里面可能是f2fs_inode。它里面的一个重要内容就是用来索引逻辑文件或者目录里的数据。具体结构如下。

inode

inode是用来和外部用户交互的:它保存了与VFS交互所需的元数据,包括ACL、time等相关信息。主要数据结构如下:

/*
 * Packed inode record. Multi-byte fields are declared with fixed-width
 * little-endian types. The record ends with two index tables: i_addr[]
 * (pointers to data blocks) and i_nid[] (node ids of direct, indirect and
 * double-indirect nodes).
 */
struct f2fs_inode {
        __le16 i_mode;                  /* file mode */
        __u8 i_advise;                  /* file hints */
        __u8 i_inline;                  /* file inline flags */
        __le32 i_uid;                   /* user ID */
        __le32 i_gid;                   /* group ID */
        __le32 i_links;                 /* links count */
        __le64 i_size;                  /* file size in bytes */
        __le64 i_blocks;                /* file size in blocks */
        __le64 i_atime;                 /* access time */
        __le64 i_ctime;                 /* change time */
        __le64 i_mtime;                 /* modification time */
        __le32 i_atime_nsec;            /* access time in nano scale */
        __le32 i_ctime_nsec;            /* change time in nano scale */
        __le32 i_mtime_nsec;            /* modification time in nano scale */
        __le32 i_generation;            /* file version (for NFS) */
        /* the same storage is interpreted per file type */
        union {
                __le32 i_current_depth; /* only for directory depth */
                __le16 i_gc_failures;   /*
                                         * # of gc failures on pinned file.
                                         * only for regular files.
                                         */
        };
        __le32 i_xattr_nid;             /* nid to save xattr */
        __le32 i_flags;                 /* file attributes */
        __le32 i_pino;                  /* parent inode number */
        __le32 i_namelen;               /* file name length */
        __u8 i_name[F2FS_NAME_LEN];     /* file name for SPOR */
        __u8 i_dir_level;               /* dentry_level for large dir */

        struct f2fs_extent i_ext;       /* caching a largest extent */

        union {
                /*
                 * Extra inode attributes. Through the enclosing union these
                 * fields occupy the same bytes as the beginning of i_addr[].
                 */
                struct {
                        __le16 i_extra_isize;   /* extra inode attribute size */
                        __le16 i_inline_xattr_size;     /* inline xattr size, unit: 4 bytes */
                        __le32 i_projid;        /* project id */
                        __le32 i_inode_checksum;/* inode meta checksum */
                        __le64 i_crtime;        /* creation time */
                        __le32 i_extra_end[0];  /* for attribute size calculation */
                } __packed;
                __le32 i_addr[DEF_ADDRS_PER_INODE];     /* Pointers to data blocks */
        };
        __le32 i_nid[DEF_NIDS_PER_INODE];       /* direct(2), indirect(2),
                                                double_indirect(1) node id */
} __packed;

其中上面的i_addr 可以直接指向数据块;如果数据块的数量超过了i_addr能容纳的个数(DEF_ADDRS_PER_INODE),就需要使用i_nid。i_nid 数组分别指向2个direct node、2个indirect node和1个double indirect node,它们都是block地址的索引块。

NAT

上面f2fs_inode数据结构是一个inode块里面的内容。那么这个inode块的地址如何确定呢?这就是f2fs_nat_entry的职责了, 每个f2fs_nat_entry 记录了每个inode编号和其inode块数据地址的对应关系。而专门存储f2fs_nat_entry的block,组成了f2fs_nat_block。

/*
 * For NAT entries
 *
 * NAT_ENTRY_PER_BLOCK is the number of whole f2fs_nat_entry records that fit
 * in PAGE_SIZE bytes (integer division, any remainder is unused).
 */
#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry))

/*
 * One NAT record (9 bytes, packed): a version byte, an inode number and a
 * block address.
 */
struct f2fs_nat_entry {
        __u8 version;           /* latest version of cached nat entry */
        __le32 ino;             /* inode number */
        __le32 block_addr;      /* block address */
} __packed;

/* A block filled with NAT_ENTRY_PER_BLOCK consecutive NAT records. */
struct f2fs_nat_block {
        struct f2fs_nat_entry entries[NAT_ENTRY_PER_BLOCK];
} __packed;

可是上面问题又来了, inode number如何确定?NAT block起始地址在哪,有多少个?

f2fs dir entry

f2fs_dir_entry 回答了上面的第一个问题,它把inode number和文件名通过hash关联起来了。同样,也有专门存储f2fs_dir_entry的块,叫做f2fs_dentry_block.

/* Geometry of a directory-entry block. */
#define NR_DENTRY_IN_BLOCK      214     /* dentry slots per block */
#define SIZE_OF_DIR_ENTRY       11      /* bytes per struct f2fs_dir_entry */

/* Bytes needed to hold one validity bit per dentry slot, rounded up. */
#define SIZE_OF_DENTRY_BITMAP \
        ((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / BITS_PER_BYTE)

/* Bytes left in a page after the bitmap, the dentries and their name slots. */
#define SIZE_OF_RESERVED \
        (PAGE_SIZE - (SIZE_OF_DENTRY_BITMAP + \
                      NR_DENTRY_IN_BLOCK * (SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN)))

/*
 * One directory entry slot representing F2FS_SLOT_LEN-sized file name.
 * Packed to 11 bytes (4 + 4 + 2 + 1), matching SIZE_OF_DIR_ENTRY.
 */
struct f2fs_dir_entry {
        __le32 hash_code;       /* hash code of file name */
        __le32 ino;             /* inode number */
        __le16 name_len;        /* length of file name */
        __u8 file_type;         /* file type */
} __packed;

/*
 * 4KB-sized directory entry block.
 *
 * Layout, in order: validity bitmap, reserved padding, the dentry table,
 * then one F2FS_SLOT_LEN-byte file name slot per dentry.
 */
struct f2fs_dentry_block {
        /* validity bitmap for directory entries in each block */
        __u8 dentry_bitmap[SIZE_OF_DENTRY_BITMAP];
        __u8 reserved[SIZE_OF_RESERVED];        /* padding computed by SIZE_OF_RESERVED */
        struct f2fs_dir_entry dentry[NR_DENTRY_IN_BLOCK];       /* entry table */
        __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN];       /* name slot per entry */
} __packed;

这里我之前的一个顾虑是,如果不同的file name hash到同一个hash_code,岂不是出问题了?后来通过看具体实现的代码,可以看到,查找时实际还会加上file name的比较。这样就可以避免hash碰撞带来的问题了。

那么,f2fs第一个inode节点(root inode)的inode number是怎么确定的?又存储在哪呢?

f2fs super block

f2fs super block数据结构回答了上面的问题,同时也记录了NAT block的起始地址。主要的数据结构如下:

/*
 * Packed super block record. Multi-byte fields are declared with
 * fixed-width little-endian types. It records the geometry (sector, block,
 * segment, section and zone sizes), the segment counts and start block
 * addresses of each area (checkpoint, SIT, NAT, SSA, main), and the
 * reserved inode numbers (root, node, meta).
 */
struct f2fs_super_block {
        __le32 magic;                   /* Magic Number */
        __le16 major_ver;               /* Major Version */
        __le16 minor_ver;               /* Minor Version */
        __le32 log_sectorsize;          /* log2 sector size in bytes */
        __le32 log_sectors_per_block;   /* log2 # of sectors per block */
        __le32 log_blocksize;           /* log2 block size in bytes */
        __le32 log_blocks_per_seg;      /* log2 # of blocks per segment */
        __le32 segs_per_sec;            /* # of segments per section */
        __le32 secs_per_zone;           /* # of sections per zone */
        __le32 checksum_offset;         /* checksum offset inside super block */
        __le64 block_count;             /* total # of user blocks */
        __le32 section_count;           /* total # of sections */
        __le32 segment_count;           /* total # of segments */
        __le32 segment_count_ckpt;      /* # of segments for checkpoint */
        __le32 segment_count_sit;       /* # of segments for SIT */
        __le32 segment_count_nat;       /* # of segments for NAT */
        __le32 segment_count_ssa;       /* # of segments for SSA */
        __le32 segment_count_main;      /* # of segments for main area */
        __le32 segment0_blkaddr;        /* start block address of segment 0 */
        __le32 cp_blkaddr;              /* start block address of checkpoint */
        __le32 sit_blkaddr;             /* start block address of SIT */
        __le32 nat_blkaddr;             /* start block address of NAT */
        __le32 ssa_blkaddr;             /* start block address of SSA */
        __le32 main_blkaddr;            /* start block address of main area */
        __le32 root_ino;                /* root inode number */
        __le32 node_ino;                /* node inode number */
        __le32 meta_ino;                /* meta inode number */
        __u8 uuid[16];                  /* 128-bit uuid for volume */
        __le16 volume_name[MAX_VOLUME_NAME];    /* volume name */
        __le32 extension_count;         /* # of extensions below */
        __u8 extension_list[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN];/* extension array */
        __le32 cp_payload;              /* NOTE(review): undocumented here; presumably checkpoint payload size -- confirm */
        __u8 version[VERSION_LEN];      /* the kernel version */
        __u8 init_version[VERSION_LEN]; /* the initial kernel version */
        __le32 feature;                 /* defined features */
        __u8 encryption_level;          /* versioning level for encryption */
        __u8 encrypt_pw_salt[16];       /* Salt used for string2key algorithm */
        struct f2fs_device devs[MAX_DEVICES];   /* device list */
        __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */
        __u8 hot_ext_count;             /* # of hot file extension */
        __u8 reserved[314];             /* valid reserved region */
} __packed;

而super block的位置是固定的,当以f2fs格式化一个磁盘的时候,它会写入到磁盘固定偏移的地方。

SIT

由于f2fs是LFS,追加的写的大小不固定,很可能小于一个segment的大小,这就需要记录哪些block已经使用。segment info table 就是做这个事情的,里面的valid_map记录了有效的块。

/*
 * Layout of the 16-bit f2fs_sit_entry->vblocks word:
 *   bits 15..10  allocation type such as CURSEG_XXXX_TYPE
 *   bits  9..0   valid block count
 */
#define SIT_VBLOCKS_SHIFT       10
#define SIT_VBLOCKS_MASK        ((1 << SIT_VBLOCKS_SHIFT) - 1)

/* Low field: valid block count stored in a raw SIT entry. */
#define GET_SIT_VBLOCKS(sit)    \
        (SIT_VBLOCKS_MASK & le16_to_cpu((sit)->vblocks))

/* High field: clear the count bits, then shift the type down to bit 0. */
#define GET_SIT_TYPE(sit)       \
        ((~SIT_VBLOCKS_MASK & le16_to_cpu((sit)->vblocks)) >> SIT_VBLOCKS_SHIFT)

/*
 * One SIT record for a segment: the packed vblocks word (allocation type in
 * bits 15..10, valid block count in bits 9..0), a validity bitmap and a
 * modification time.
 */
struct f2fs_sit_entry {
        __le16 vblocks;                         /* reference above */
        __u8 valid_map[SIT_VBLOCK_MAP_SIZE];    /* bitmap for valid blocks */
        __le64 mtime;                           /* segment age for cleaning */
} __packed;

/* A block filled with SIT_ENTRY_PER_BLOCK consecutive SIT records. */
struct f2fs_sit_block {
        struct f2fs_sit_entry entries[SIT_ENTRY_PER_BLOCK];
} __packed;

segment summary

f2fs 一个重要的设计特色就是避免了传统LFS 的wandering tree问题,这个主要是通过segment summary 相关的数据结构实现的。

通过上面SIT的介绍,一次写之后,需要更新对应的SIT。这个更新会记录到 sit_journal_entry 中:

/* One journaled SIT update: a segment number paired with its SIT record. */
struct sit_journal_entry {
        __le32 segno;                   /* segment number the record belongs to */
        struct f2fs_sit_entry se;       /* the journaled SIT record */
} __packed;

/*
 * SIT journal area: SIT_JOURNAL_ENTRIES journaled records followed by
 * SIT_JOURNAL_RESERVED padding bytes.
 */
struct sit_journal {
        struct sit_journal_entry entries[SIT_JOURNAL_ENTRIES];
        __u8 reserved[SIT_JOURNAL_RESERVED];    /* unused tail of the journal area */
} __packed;

如果新建一个文件或目录,并且有写操作,就需要更新NAT区域。同样,对这个inode的更新也会记录到 nat_journal_entry 中:

/* One journaled NAT update: a node id paired with its NAT record. */
struct nat_journal_entry {
        __le32 nid;                     /* node id the record belongs to */
        struct f2fs_nat_entry ne;       /* the journaled NAT record */
} __packed;

/*
 * NAT journal area: NAT_JOURNAL_ENTRIES journaled records followed by
 * NAT_JOURNAL_RESERVED padding bytes.
 *
 * The definition must be terminated with "__packed;" like struct
 * sit_journal: without the semicolon the declaration does not compile, and
 * without __packed the compiler would be free to insert padding.
 */
struct nat_journal {
        struct nat_journal_entry entries[NAT_JOURNAL_ENTRIES];
        __u8 reserved[NAT_JOURNAL_RESERVED];    /* unused tail of the journal area */
} __packed;

一个写操作,其实对NAT/SIT更新的区域很小。如果每次都直接更新这两个区域,对SSD会导致比较大的写放大。为了避免这个问题,f2fs 通过segment summary把这些零星的写攒到segment summary 区域。

/*
 * For segment summary
 *
 * One summary block contains exactly 512 summary entries, which represents
 * exactly 2MB segment by default. Not allow to change the basic units.
 *
 * NOTE: For initializing fields, you must use set_summary
 *
 * - If data page, nid represents dnode's nid
 * - If node page, nid represents the node page's nid.
 *
 * The ofs_in_node is used by only data page. It represents offset
 * from node's page's beginning to get a data block address.
 * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node)
 */
#define ENTRIES_IN_SUM          512     /* summary entries per summary block */
#define SUMMARY_SIZE            (7)     /* sizeof(struct summary) */
#define SUM_FOOTER_SIZE         (5)     /* sizeof(struct summary_footer) */
/* total bytes taken by the entry table: 7 * 512 = 3584 */
#define SUM_ENTRY_SIZE          (SUMMARY_SIZE * ENTRIES_IN_SUM)

/*
 * a summary entry for a 4KB-sized block in a segment
 *
 * Packed to 7 bytes: a 4-byte nid followed by 3 bytes that are viewed either
 * as raw reserved bytes or as a version byte plus a 16-bit ofs_in_node.
 */
struct f2fs_summary {
        __le32 nid;             /* parent node id */
        union {
                __u8 reserved[3];       /* raw view of the same 3 bytes */
                struct {
                        __u8 version;           /* node version number */
                        __le16 ofs_in_node;     /* block index in parent node */
                } __packed;
        };
} __packed;

/* summary block type, node or data, is stored to the summary_footer */
#define SUM_TYPE_NODE           (1)
#define SUM_TYPE_DATA           (0)

/*
 * Trailer of a summary block, packed to 5 bytes: a one-byte type
 * (SUM_TYPE_NODE or SUM_TYPE_DATA) followed by a 32-bit checksum.
 */
struct summary_footer {
        unsigned char entry_type;       /* SUM_TYPE_XXX */
        __le32 check_sum;               /* summary checksum */
} __packed;

/*
 * Bytes of a summary block left over for the journal once the footer and
 * the summary entry table are subtracted from the block size.
 */
#define SUM_JOURNAL_SIZE        (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\
                                SUM_ENTRY_SIZE)
/*
 * Number of whole NAT/SIT journal entries that fit in the journal area, and
 * the leftover bytes (the remainder of the same division). The 2 bytes
 * subtracted appear to correspond to the 16-bit n_nats/n_sits counter at the
 * start of struct f2fs_journal.
 */
#define NAT_JOURNAL_ENTRIES     ((SUM_JOURNAL_SIZE - 2) /\
                                sizeof(struct nat_journal_entry))
#define NAT_JOURNAL_RESERVED    ((SUM_JOURNAL_SIZE - 2) %\
                                sizeof(struct nat_journal_entry))
#define SIT_JOURNAL_ENTRIES     ((SUM_JOURNAL_SIZE - 2) /\
                                sizeof(struct sit_journal_entry))
#define SIT_JOURNAL_RESERVED    ((SUM_JOURNAL_SIZE - 2) %\
                                sizeof(struct sit_journal_entry))

/* Reserved area should make size of f2fs_extra_info equals to
 * that of nat_journal and sit_journal.
 * (2 bytes as above, and 8 bytes matching the __le64 kbytes_written field.)
 */
#define EXTRA_INFO_RESERVED     (SUM_JOURNAL_SIZE - 2 - 8)

/*
 * frequently updated NAT/SIT entries can be stored in the spare area in
 * summary blocks
 *
 * Journal type identifiers: NAT_JOURNAL is 0, SIT_JOURNAL is 1.
 */
enum {
        NAT_JOURNAL = 0,
        SIT_JOURNAL
};

/*
 * Extra-info view of the journal spare area: a 64-bit kbytes_written
 * counter followed by EXTRA_INFO_RESERVED padding bytes.
 */
struct f2fs_extra_info {
        __le64 kbytes_written;
        __u8 reserved[EXTRA_INFO_RESERVED];     /* pads to the journal area size */
} __packed;

/*
 * Journal region of a summary block: a 16-bit counter followed by a spare
 * area that is viewed as a NAT journal, a SIT journal or extra info.
 */
struct f2fs_journal {
        /* one 16-bit counter, named according to the journal in use */
        union {
                __le16 n_nats;
                __le16 n_sits;
        };
        /* spare area is used by NAT or SIT journals or extra info */
        union {
                struct nat_journal nat_j;
                struct sit_journal sit_j;
                struct f2fs_extra_info info;
        };
} __packed;

/*
 * 4KB-sized summary block structure
 *
 * Layout, in order: the summary entry table, the journal region, the footer.
 */
struct f2fs_summary_block {
        struct f2fs_summary entries[ENTRIES_IN_SUM]; /* 512 entries * 7 bytes each (SUMMARY_SIZE), one per block of the segment */
        struct f2fs_journal journal;    /* NAT/SIT journal or extra info */
        struct summary_footer footer;   /* entry type and checksum */
} __packed;

file 相关操作

f2fs.h:
file_operations f2fs_file_operations:

fs/f2fs/file.c:

/*
 * File operations table for f2fs files. Read and splice go through generic
 * helpers; the remaining entries point at f2fs-specific handlers.
 */
const struct file_operations f2fs_file_operations = {
        /* open / close */
        .open           = f2fs_file_open,
        .release        = f2fs_release_file,
        .flush          = f2fs_file_flush,

        /* positioning and data transfer */
        .llseek         = f2fs_llseek,
        .read_iter      = generic_file_read_iter,
        .write_iter     = f2fs_file_write_iter,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .mmap           = f2fs_file_mmap,

        /* persistence and space allocation */
        .fsync          = f2fs_sync_file,
        .fallocate      = f2fs_fallocate,

        /* ioctl entry points; the compat one exists only with CONFIG_COMPAT */
        .unlocked_ioctl = f2fs_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = f2fs_compat_ioctl,
#endif
};

参考

include/linux/f2fs_fs.h
fs/f2fs/
Documentation/filesystems/f2fs.txt