文件系统是linux内核的重要组成部分,涉及到vfs、块IO层的调度机制,块设备驱动以及具体文件系统所采用的数据结构。所使用linux内核版本是2.6.34.1。
以fs/omfs为例,主要学习vfs的实现、omfs的硬盘布局、它所采用的数据结构为何能够针对MPEG文件做优化,以及omfs文件系统的具体文件读写是如何实现的。
OMFS:Optimized MPEG Filesystem
OMFS是由SonicBlue公司创建的用于ReplayTV DVR和MP3 player的文件系统。该文件系统是基于extent的(现代很多文件系统都采用extent替代block来管理磁盘。Extent就是一些连续的block,可以有效减少元数据开销。),可用的block大小在2k到8k之间,目录结构是基于hash的。
该文件系统在特定的流媒体设备中性能很好,但对于一般的应用,linux主流的文件系统应该性能更优。Omfs是如何针对MPEG做性能优化,这点还在探索中。
硬盘布局格式:
Omfs区分sysblocks和一般的数据blocks。Sysblock group由superblock信息、文件的metadata元数据、目录结构和extents构成。每一个sysblock都有一个包含CRC校验的头,并且可以在硬盘上相邻的block中做镜像备份。Sysblock的大小可以比一个数据block小,但两者都用同一套64位的块号寻址,因此较小的sysblock所在block中剩余的空间不被使用。
Sysblock 头信息:
/*
 * Header placed at the start of every sysblock (24 bytes).
 * Multi-byte fields use the big-endian __be* types.
 */
struct omfs_header {
	__be64 h_self;		/* FS block number where this sysblock lives */
	__be32 h_body_size;	/* number of useful data bytes following the header */
	__be16 h_crc;		/* crc-ccitt computed over body_size bytes */
	char h_fill1[2];
	u8 h_version;		/* format version, always 1 */
	char h_type;		/* one of OMFS_INODE_X */
	u8 h_magic;		/* OMFS_IMAGIC */
	u8 h_check_xor;		/* XOR of all header bytes preceding this field */
	__be32 h_fill2;
};
文件和目录都由omfs_inode表示:
/*
 * On-disk inode; the same structure represents both files and directories.
 */
struct omfs_inode {
	struct omfs_header i_head;	/* sysblock header */
	__be64 i_parent;		/* parent that contains this inode */
	__be64 i_sibling;		/* next inode in the same hash bucket */
	__be64 i_ctime;			/* ctime, expressed in milliseconds */
	char i_fill1[35];
	char i_type;			/* OMFS_[DIR,FILE] */
	__be32 i_fill2;
	char i_fill3[64];
	char i_name[OMFS_NAMELEN];	/* file name */
	__be64 i_size;			/* file size in bytes */
};
OMFS中的目录是一个大的hash表。文件名经过hash计算,然后放到以OMFS_DIR_START开始的桶中。查找的时候需要hash文件名,然后通过i_sibling指针查找到匹配的i_name。
文件以omfs_inode结构体开头,后面跟着在OMFS_EXTENT_START开始的extent table。
/*
 * One entry of an extent table: a run of contiguous blocks.
 * Both fields are stored with the big-endian __be64 type.
 */
struct omfs_extent_entry {
__be64 e_cluster; /* start location (block number) of a set of blocks */
__be64 e_blocks; /* number of blocks after e_cluster */
};
/*
 * Header of an extent table. Tables are chained through e_next, and the
 * entries themselves start at e_entry.
 */
struct omfs_extent {
__be64 e_next; /* next extent table location */
__be32 e_extent_count; /* total # extents in this table */
__be32 e_fill; /* NOTE(review): presumably padding; confirm against the kernel header */
struct omfs_extent_entry e_entry; /* start of extent entries */
};
通过mkomfs.c可以了解omfs的基本布局:
在这里我们不使用实际的硬盘,用loop设备模拟一下:
生成一个文件:dd if=/dev/zero of=file.img bs=512 count=10000
10000+0 records in
10000+0 records out
5120000 bytes (5.1 MB) copied, 0.102142 s, 50.1 MB/s
用losetup /dev/loop0 file.img将loop设备和file.img关联上。
再用mkomfs /dev/loop0来布局:
mkomfs的默认配置参数是:
/* Default mkomfs configuration. */
fs_config_t config = {
.block_size = 8192, /* FS block size in bytes (16 sectors of 512 bytes) */
.cluster_size = 8, /* value stored in r_clustersize of the root block */
.clear_dev = 0 /* NOTE(review): presumably "do not wipe the device first"; confirm in mkomfs.c */
};
获取设备的大小:size=bs×count=512×10000=5120000
create_fs(fp, size/512, &config);中
block_size = 8192
blocks_per_sector = block_size / SECTOR_SIZE = 8192/512 = 16
blocks = sectors / blocks_per_sector = 10000/16 =625=0x271
初始化omfs_super_block结构体:
struct omfs_super_block {
char s_fill1[192];
char s_name[OMFS_SUPER_NAMELEN]; :"omfs"
__be64 s_root_block; /* block number of omfs_root_block */ :ROOT_BLK=1
__be64 s_num_blocks; /* total number of FS blocks */ :blocks=625=0x271
__be32 s_magic; /* OMFS_MAGIC */ : OMFS_MAGIC 0xC2993D87
__be32 s_blocksize; /* size of a block */ :block_size=8192=0x2000
__be32 s_mirrors; /* # of mirrors of system blocks */ :2
__be32 s_sys_blocksize; /* size of non-data blocks */ :2048=0x800
}; 288个字节
hexdump -C -s 0 -n 512 /dev/loop0
00000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
000000c0 6f 6d 66 73 00 00 00 00 00 00 00 00 00 00 00 00 |omfs............|
000000d0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000100 00 00 00 00 00 00 00 01 00 00 00 00 00 00 02 71 |...............q|
00000110 c2 99 3d 87 00 00 20 00 00 00 00 02 00 00 08 00 |..=... .........|
00000120 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00000200
将superblock的288个字节写在最开头,
初始化omfs_root_block结构体:
struct omfs_root_block {
struct omfs_header r_head; /* header */
__be64 r_fill1;
__be64 r_num_blocks; /* total number of FS blocks */ :blocks=625
__be64 r_root_dir; /* block # of root directory */ : ROOT_DIR_BLK=3
__be64 r_bitmap; /* block # of free space bitmap */ :BITMAP_BLK 5
__be32 r_blocksize; /* size of a block */ :block_size=8192
__be32 r_clustersize; /* size allocated for data blocks */ : cluster_size = 8
__be64 r_mirrors; /* # of mirrors of system blocks */ :2
char r_name[OMFS_NAMELEN]; /* partition label */ :"omfs"
__be64 r_fill2;
}; 336个字节
struct omfs_header {
__be64 h_self; /* FS block where this is located */ :ROOT_BLK=1
__be32 h_body_size; /* size of useful data after header */ :336-24=312=0x138
__be16 h_crc; /* crc-ccitt of body_size bytes */
char h_fill1[2];
u8 h_version; /* version, always 1 */ :1
char h_type; /* OMFS_INODE_X */ : OMFS_INODE_SYSTEM='s'
u8 h_magic; /* OMFS_IMAGIC */ : OMFS_IMAGIC 0xD2
u8 h_check_xor; /* XOR of header bytes before this */
__be32 h_fill2;
}; 24个字节
然后移动到第8192个字节(0x2000,即block 1=ROOT_BLK)处,写入rootblock;再移动到0x4000(block 2)处,把rootblock再写一遍作为镜像(s_mirrors=2)。
hexdump -C -s 0x2000 -n 512 /dev/loop0
00002000 00 00 00 00 00 00 00 01 00 00 01 38 00 00 00 00 |...........8....|
00002010 01 73 d2 00 00 00 00 00 00 00 00 00 00 00 00 00 |.s..............|
00002020 00 00 00 00 00 00 02 71 00 00 00 00 00 00 00 03 |.......q........|
00002030 00 00 00 00 00 00 00 05 00 00 20 00 00 00 00 08 |.......... .....|
00002040 00 00 00 00 02 00 00 00 6f 6d 66 73 00 00 00 00 |........omfs....|
00002050 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00002200
hexdump -C -s 0x4000 -n 512 /dev/loop0
00004000 00 00 00 00 00 00 00 01 00 00 01 38 00 00 00 00 |...........8....|
00004010 01 73 d2 00 00 00 00 00 00 00 00 00 00 00 00 00 |.s..............|
00004020 00 00 00 00 00 00 02 71 00 00 00 00 00 00 00 03 |.......q........|
00004030 00 00 00 00 00 00 00 05 00 00 20 00 00 00 00 08 |.......... .....|
00004040 00 00 00 00 02 00 00 00 6f 6d 66 73 00 00 00 00 |........omfs....|
00004050 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00004200
Root directory inode信息:
struct omfs_inode {
struct omfs_header i_head; /* header */
__be64 i_parent; /* parent containing this inode */ :~0
__be64 i_sibling; /* next inode in hash bucket */ :~0
__be64 i_ctime; /* ctime, in milliseconds */
char i_fill1[35];
char i_type; /* OMFS_[DIR,FILE] */ :'D'
__be32 i_fill2; :1
char i_fill3[64];
char i_name[OMFS_NAMELEN]; /* filename */
__be64 i_size; /* size of file, in bytes */ :2048
};
struct omfs_header {
__be64 h_self; /* FS block where this is located */ : ROOT_DIR_BLK 3
__be32 h_body_size; /* size of useful data after header */ :2048-24=2024
__be16 h_crc; /* crc-ccitt of body_size bytes */
char h_fill1[2];
u8 h_version; /* version, always 1 */ :1
char h_type; /* OMFS_INODE_X */ : OMFS_INODE_NORMAL 'e'
u8 h_magic; /* OMFS_IMAGIC */ :OMFS_IMAGIC 0xD2
u8 h_check_xor; /* XOR of header bytes before this */
__be32 h_fill2;
};
申请一块大小为2048字节的内存,前面416=0x1a0个字节放root inode,
从OMFS_DIR_START 0x1b8到2048=0x800全部设置为0xff(全1即~0,表示对应的hash桶为空)。
将这2048个字节写到0x6000位置,再重复将这2048个字节写到0x8000位置。
hexdump -C -s 0x6000 -n 512 /dev/loop0
00006000 00 00 00 00 00 00 00 03 00 00 07 e8 1c 5c 00 00 |.............\..|
00006010 01 65 d2 1a 00 00 00 00 ff ff ff ff ff ff ff ff |.e..............|
00006020 ff ff ff ff ff ff ff ff 00 00 01 35 5a 8e 42 bb |...........5Z.B.|
00006030 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00006050 00 00 00 44 00 00 00 01 00 00 00 00 00 00 00 00 |...D............|
00006060 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00006190 00 00 00 00 00 00 00 00 00 00 00 00 00 00 08 00 |................|
000061a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
000061b0 00 00 00 00 00 00 00 00 ff ff ff ff ff ff ff ff |................|
000061c0 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
*
00006200
hexdump -C -s 0x8000 -n 512 /dev/loop0
00008000 00 00 00 00 00 00 00 03 00 00 07 e8 1c 5c 00 00 |.............\..|
00008010 01 65 d2 1a 00 00 00 00 ff ff ff ff ff ff ff ff |.e..............|
00008020 ff ff ff ff ff ff ff ff 00 00 01 35 5a 8e 42 bb |...........5Z.B.|
00008030 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00008050 00 00 00 44 00 00 00 01 00 00 00 00 00 00 00 00 |...D............|
00008060 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
00008190 00 00 00 00 00 00 00 00 00 00 00 00 00 00 08 00 |................|
000081a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
000081b0 00 00 00 00 00 00 00 00 ff ff ff ff ff ff ff ff |................|
000081c0 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
*
00008200
最后,就是free space bitmap了。
bitmap_size = (swap_be64(super.s_num_blocks) + 7)/8; =(625+7)/8=79
dirty_size = (bitmap_size + 7)/8; =(79+7)/8=10
first_blk = BITMAP_BLK + (bitmap_size +
swap_be32(super.s_blocksize)-1) / swap_be32(super.s_blocksize); =5+(79+8192-1)/8192=6
因为bitmap的每一位代表一个block,所以可以计算出bitmap的字节数。
bitmap.bmap申请bitmap_size=79个字节的内存大小
for (i=0; i<first_blk; i++)
{
bitmap.bmap[i/8] |= 1<<(i & 7);
}
bitmap.bmap[0] = 0011 1111(二进制)= 0x3f
因为blocks 0-5被用了,所以bitmap的相应位都置上1。
bitmap.dirty申请dirty_size=10个字节的大小,每个字节都置为0xff
将bitmap写入0xa000(即BITMAP_BLK=5,5×8192=0xa000;block从0开始编号,所以它是第6个block,block 0放的是super block)。
hexdump -C -s 0xa000 -n 512 /dev/loop0
0000a000 3f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |?...............|
0000a010 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
0000a050 00 00 00 00 11 00 00 00 ff ff ff ff ff ff ff ff |................|
0000a060 ff ff 00 00 c9 0c 02 00 00 00 00 00 00 00 00 00 |................|
0000a070 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
0000a190 00 00 00 00 00 00 00 00 00 00 00 00 00 00 08 00 |................|
0000a1a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
0000a1b0 00 00 00 00 00 00 00 00 ff ff ff ff ff ff ff ff |................|
0000a1c0 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff |................|
*
0000a200
到此,omfs文件系统的格式化就完成了,mount之后就可以进行常见的文件操作了。