EXT4写文件流程

测试函数

#include<stdio.h>

#include<sys/types.h>

#include<sys/stat.h>

#include<fcntl.h>

#include<unistd.h>

/*
 * Test program for tracing the EXT4 write path.
 *
 * Creates (or opens) the file named by argv[1] and writes 20 KiB of zeros to
 * it in twenty 1 KiB write() calls.  In the article's setup (1 KiB blocks,
 * indirect-mapped file) the first 12 KiB land in direct blocks and the
 * remaining 8 KiB go through the single-indirect block.
 *
 * Returns 0 on success, 1 on a usage or I/O error.
 */
int main(int argc, char *argv[])
{
	int fd = 0, i = 0;
	char buf[1024] = {0};

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	/* O_CREAT needs an explicit mode; without it the permission bits
	 * of the new file are taken from whatever is on the stack. */
	fd = open(argv[1], O_CREAT | O_RDWR, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	for (i = 0; i < 20; i++) {
		ssize_t written = write(fd, buf, sizeof(buf));

		if (written < 0) {
			perror("write");
			close(fd);
			return 1;
		}
		if ((size_t)written != sizeof(buf)) {
			fprintf(stderr, "short write: %ld bytes\n", (long)written);
			close(fd);
			return 1;
		}
	}

	close(fd);

	return 0;
}


dump_stack查看函数堆栈:

open(argv[1],O_CREAT | O_RDWR)kernel函数调用

[<c0028650>](unwind_backtrace+0x0/0xec) from [<c00e4f14>](ext4_create+0x80/0x110)

[<c00e4f14>](ext4_create+0x80/0x110) from [<c0094d84>](vfs_create+0x74/0xa4)

[<c0094d84>](vfs_create+0x74/0xa4) from [<c009594c>] (do_last+0x2b8/0x590)

[<c009594c>](do_last+0x2b8/0x590) from [<c00974b4>](do_filp_open+0x16c/0x4c0)

[<c00974b4>](do_filp_open+0x16c/0x4c0) from [<c008b240>](do_sys_open+0x58/0xdc)

[<c008b240>](do_sys_open+0x58/0xdc) from [<c0022e60>](ret_fast_syscall+0x0/0x2c)


write(fd,buf, 1024)的函数调用

[<c0028650>](unwind_backtrace+0x0/0xec) from [<c00f8ae4>](ext4_mb_new_blocks+0x2c/0x3b4)

[<c00f8ae4>](ext4_mb_new_blocks+0x2c/0x3b4) from [<c00ddcbc>](ext4_ind_map_blocks+0x4bc/0xa44)

[<c00ddcbc>](ext4_ind_map_blocks+0x4bc/0xa44) from [<c00de490>](ext4_map_blocks+0x114/0x1cc)

[<c00de490>](ext4_map_blocks+0x114/0x1cc) from [<c00de608>](_ext4_get_block+0xc0/0x15c)

[<c00de608>](_ext4_get_block+0xc0/0x15c) from [<c00ad88c>](__block_prepare_write+0x1a8/0x464)

[<c00ad88c>](__block_prepare_write+0x1a8/0x464) from [<c00add28>](block_write_begin_newtrunc+0x88/0xdc)

[<c00add28>](block_write_begin_newtrunc+0x88/0xdc) from [<c00ae1cc>](block_write_begin+0x40/0x94)

[<c00ae1cc>](block_write_begin+0x40/0x94) from [<c00e198c>](ext4_write_begin+0x150/0x2e0)

[<c00e198c>](ext4_write_begin+0x150/0x2e0) from [<c0068170>](generic_file_buffered_write+0xdc/0x214)

[<c0068170>](generic_file_buffered_write+0xdc/0x214) from [<c0069d74>](__generic_file_aio_write+0x41c/0x46c)

[<c0069d74>](__generic_file_aio_write+0x41c/0x46c) from [<c0069e2c>](generic_file_aio_write+0x68/0xcc)

[<c0069e2c>](generic_file_aio_write+0x68/0xcc) from [<c008c74c>](do_sync_write+0x98/0xe4)

[<c008c74c>](do_sync_write+0x98/0xe4) from [<c008d120>](vfs_write+0xac/0x124)

[<c008d120>](vfs_write+0xac/0x124) from [<c008d244>] (sys_write+0x3c/0x68)

[<c008d244>](sys_write+0x3c/0x68) from [<c0022e60>](ret_fast_syscall+0x0/0x2c)


LOG输出分析:

/# mkdir /mnt;mount -t ext4 -o nosuid,nodev,nodelalloc 1M.img /mnt;cd /mnt;

dd if=/dev/zero of=1 bs=1024 count=20

[_ext4_get_block,1359]map.m_lblk = 0, map.m_len = 1//写第一个1k数据,长度为1k,文件的偏移block0

[ext4_ind_map_blocks,943]depth = 1, flag = 0x0//读测试depth表示可直接寻址的block

[ext4_map_blocks,1252]retval = 0, map->m_flags = 0x0

[ext4_map_blocks,1294]down_write

[ext4_ind_map_blocks,943]depth = 1, flag = 0x1//再次调用ext4_ind_map_blocks分配blocks

[ext4_alloc_branch,733]indirect_blks = 0//不用分配存放寻址信息的block(metablocks)

[ext4_mb_group_or_file,3980]ac->ac_o_ex.fe_logical:0, size:1,isize:0

//组分分配还是大文件分配策略(16blocks)/sys/fs/ext4/loop0/mb_stream_req

[ext4_mb_initialize_context,4060]init ac: 1 blocks @ 0, goal 190, flags a0, 2^0, left: 0/0, right0/0 to writable

[ext4_mb_use_preallocated,3167]i_prealloc_list

[ext4_mb_use_preallocated,3197]lg_prealloc_list

[ext4_mb_use_preallocated,3215]order:0, goal_block:190

[ext4_mb_normalize_group_request,2842]#35: goal 512 blocks for locality group

//默认的组预分配大小(512Blocks):/sys/fs/ext4/loop0/mb_group_prealloc

[ext4_mb_regular_allocator,2032]fls(512) = 10

[ext4_mb_new_blocks,4348]ac->ac_o_ex.fe_len:1, ac->ac_b_ex.fe_len:512

[ext4_mb_add_n_trim,4193]list_add_tail_rcu:8

//插入到组预分配的链表中去list[0]表示2^1blocks的预分配链表,list[8]则表示2^9= 512blocks

[ext4_mb_new_blocks,4404]block:38

[ext4_alloc_blocks,672]current_block = 38

[ext4_alloc_blocks,685]blk_allocated = 1

[ext4_alloc_branch,786]num = 1

[_ext4_get_block,1359]map.m_lblk = 1, map.m_len = 1

[ext4_ind_map_blocks,943]depth = 1, flag = 0x0

[ext4_map_blocks,1252]retval = 0, map->m_flags = 0x0

[ext4_map_blocks,1294]down_write

[ext4_ind_map_blocks,943]depth = 1, flag = 0x1

[ext4_alloc_branch,733]indirect_blks = 0

[ext4_mb_group_or_file,3980]ac->ac_o_ex.fe_logical:1, size:2,isize:1

[ext4_mb_initialize_context,4060]init ac: 1 blocks @ 1, goal 38, flags a0, 2^0, left: 0/0, right0/0 to writable

[ext4_mb_use_preallocated,3167]i_prealloc_list

[ext4_mb_use_preallocated,3197]lg_prealloc_list

[ext4_mb_use_preallocated,3215]order:0, goal_block:38

[ext4_mb_use_preallocated,3226]i:8, pa_deleted:0, pa_free:511

[ext4_mb_add_n_trim,4193]list_add_tail_rcu:8

[ext4_mb_new_blocks,4404]block:39

[ext4_alloc_blocks,672]current_block = 39

[ext4_alloc_blocks,685]blk_allocated = 1

[ext4_alloc_branch,786]num = 1

...

[_ext4_get_block,1350]map.m_lblk = 16, map.m_len = 1

[ext4_mb_group_or_file,3980]ac->ac_o_ex.fe_logical:16, size:17,isize:16

###s_mb_stream_request//启用大文件分配策略

[ext4_mb_initialize_context,4060]init ac: 1 blocks @ 16, goal 53, flags 820, 2^0, left: 0/0,right 0/0 to writable

[ext4_mb_use_preallocated,3167]i_prealloc_list

[ext4_mb_use_preallocated,3197]lg_prealloc_list

[ext4_mb_regular_allocator,2032]fls(32) = 6

[ext4_mb_new_blocks,4348]ac->ac_o_ex.fe_len:1, ac->ac_b_ex.fe_len:32

[ext4_mb_new_blocks,4404]block:977

[_ext4_get_block,1350]map.m_lblk = 17, map.m_len = 1

[ext4_mb_group_or_file,3980]ac->ac_o_ex.fe_logical:17, size:18,isize:17

###s_mb_stream_request

[ext4_mb_initialize_context,4060]init ac: 1 blocks @ 17, goal 977, flags 820, 2^0, left: 0/0,right 0/0 to writable

[ext4_mb_use_preallocated,3167]i_prealloc_list

[ext4_mb_new_blocks,4404]block:978



函数分析

ext4_ind_map_blocks调用ext4_block_to_path去查找inodeblock存放的位置

i_block为将分配的block在文件中的偏移blocks数,从log中可以看到我们写20k的数据到新创建的文件中去,

第一次i_block为0,第二次为1,以此类推。offsets存放block的寻址信息,我们先看一下每一种寻址的范围:

#define EXT4_NDIR_BLOCKS 12// number of direct block slots (12k of data with the article's 1k blocks)

#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS // single-indirect slot; article: reach grows to 12k + 256k = 268k

#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK+ 1) // double-indirect slot; article: 12k + 256k + 256^2 k

#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK+ 1)// triple-indirect slot; article: 12k + 256k + 256^2 k + 256^3 k

#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK+ 1)

由此可以看出offset存放的是i_block的寻址信息,即如何去查找到该block,depth为寻址的深度

1表示直接寻址,2表示一级间接寻址,3表示二级间接寻址,4表示三级间接寻址

/*
 * ext4_block_to_path - turn a file-relative block number into a lookup path
 * @inode:    inode being mapped (only i_sb and i_ino are used here)
 * @i_block:  logical block number inside the file
 * @offsets:  out: slot index to follow at each level of the lookup
 * @boundary: optional out: number of slots that remain after this one in
 *            the last-level table
 *
 * Returns the lookup depth, i.e. how many entries of @offsets were filled:
 * 1 = direct slot in the inode, 2 = single indirect, 3 = double indirect,
 * 4 = triple indirect.  Returns 0 (after logging a warning) when @i_block
 * does not fit in any of those ranges.
 *
 * Numeric examples in the comments are the article's, for a 1 KiB block
 * size where ptrs is 256.
 */
static int ext4_block_to_path(struct inode *inode,
			      ext4_lblk_t i_block,
			      ext4_lblk_t offsets[4], int *boundary)
{
	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT4_NDIR_BLOCKS,	/* 12 direct slots */
		indirect_blocks = ptrs,			/* e.g. 1024 / 4 = 256 */
		double_blocks = (1 << (ptrs_bits * 2));	/* e.g. 256^2 */
	int n = 0;
	int final = 0;

	if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ((i_block -= direct_blocks) < indirect_blocks) {
		/* Past the direct slots: offsets[0] = 12 selects the
		 * single-indirect slot, offsets[1] is the index inside the
		 * indirect block (i_block was already reduced by 12). */
		offsets[n++] = EXT4_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		/* i_block is now (original - 12 - ptrs); offsets[0] = 13. */
		offsets[n++] = EXT4_DIND_BLOCK;
		/* High part: which second-level block (i_block / ptrs). */
		offsets[n++] = i_block >> ptrs_bits;
		/* Low part: index inside that block (i_block % ptrs). */
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT4_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
			     i_block + direct_blocks +
			     indirect_blocks + double_blocks, inode->i_ino);
	}
	if (boundary)
		/* Slots left in the current table after this one,
		 * e.g. i_block = 0 gives 12 - 1 - 0 = 11. */
		*boundary = final - 1 - (i_block & (ptrs - 1));
	return n;
}


根据ext4_block_to_path函数返回的寻址信息,查看相应的block是否已经分配了空间。EXT4_I(inode)->i_data会存放一份原始inode寻址信息(EXT4物理磁盘上的inode)的备份,根据其内容可以判断block是否已经分配;如果没有分配,则返回位置信息的指针Indirect *p。根据该返回值可以判断我们需不需要为其分配额外的metablocks(用于block寻址的block,例如在分配第13个block时需要间接寻址,这时需要多分配一个额外的间接寻址block)。

/*
 * ext4_get_branch - follow a lookup path and record it in @chain
 * @inode:   inode being mapped
 * @depth:   number of levels to follow (length of @offsets)
 * @offsets: slot index to use at each level
 * @chain:   out: one Indirect entry per level that was visited
 * @err:     out: set to 0 on entry, -EIO if an indirect block could not be
 *           obtained, read or validated
 *
 * Returns NULL when every level holds a non-zero block number (the whole
 * path exists).  Otherwise returns a pointer to the chain entry at which
 * the walk stopped: the first entry whose key is 0, or the last entry
 * filled in before a failure (in which case *@err is -EIO).
 */
static Indirect *ext4_get_branch(struct inode *inode, int depth,
				 ext4_lblk_t *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
	/*
	 * For reference, add_chain() as quoted by the article:
	 *
	 *   static inline void add_chain(Indirect *p, struct buffer_head *bh,
	 *                                __le32 *v)
	 *   {
	 *           p->key = *(p->p = v);   p->p: address of the slot,
	 *                                   p->key: value read from it
	 *           p->bh = bh;             buffer the slot lives in
	 *                                   (NULL here: the slot is in i_data)
	 *   }
	 *
	 * For a block in the direct range of a newly created file the slot
	 * is still empty, so p->key is 0: no block has been allocated yet.
	 */
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_getblk(sb, le32_to_cpu(p->key));
		if (unlikely(!bh))
			goto failure;

		if (!bh_uptodate_or_lock(bh)) {
			if (bh_submit_read(bh) < 0) {
				put_bh(bh);
				goto failure;
			}
			/* validate block references */
			if (ext4_check_indirect_blockref(inode, bh)) {
				put_bh(bh);
				goto failure;
			}
		}

		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

failure:
	*err = -EIO;
no_block:
	return p;
}


ext4_find_goal调用ext4_find_near计算goal值:假如是首次分配,则根据inode的块组信息ei->i_block_group去计算;如果之前已经分配过空间了,则以上次分配的block作为goal

staticext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)

{

structext4_inode_info *ei = EXT4_I(inode);

__le32*start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;

__le32*p;

ext4_fsblk_tbg_start;

ext4_fsblk_tlast_block;

ext4_grpblk_tcolour;

ext4_group_tblock_group;

intflex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));

//s_log_groups_per_flex= log2(groups_per_flex) = 4,不是用flex特性时,flex_size= 2^0 = 1

//flex_bg特性是将几个连续的块组拼接成一个大的块组,s_log_groups_per_flex= 4即将2^4= 16 //块组拼成一个大的块组

/*Try to find previous block */

for(p = ind->p - 1; p >= start; p--) {

if(*p)

returnle32_to_cpu(*p);//直接返回上一次分配的块号,例如blocks[0]=38,goal就设//置成38

}


/*No such thing, so let's try location of indirect block */

if(ind->bh)

returnind->bh->b_blocknr;


/*

* It is going to be referred to from the inode itself? OK, just putit

* into the same cylinder group then.

*/

block_group= ei->i_block_group;//inode所在的块组

if(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {

block_group&= ~(flex_size-1);

if(S_ISREG(inode->i_mode))

block_group++;

}

bg_start= ext4_group_first_block_no(inode->i_sb, block_group);

last_block= ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;


/*

* If we are doing delayed allocation, we don't need take

* colour into account.

*/

if(test_opt(inode->i_sb, DELALLOC))

returnbg_start;


if(bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)

colour= (current->pid % 16) *

(EXT4_BLOCKS_PER_GROUP(inode->i_sb)/ 16);

else

colour= (current->pid % 16) * ((last_block - bg_start) / 16);

returnbg_start + colour;

}


ext4_ind_map_blocks先进行寻址并计算goal block,然后调用ext4_alloc_branch分配block(包含分配metablocks),最后调用ext4_splice_branch更新inode

/*
 * ext4_ind_map_blocks - map (and optionally allocate) blocks of an
 * indirect-mapped (non-extent) file
 * @handle: journal handle; must be non-NULL when allocation is requested
 * @inode:  file being mapped; must not have EXT4_INODE_EXTENTS set
 * @map:    in: m_lblk / m_len describe the request;
 *          out: m_pblk, m_len and m_flags describe the result
 * @flags:  EXT4_GET_BLOCKS_* flags; without EXT4_GET_BLOCKS_CREATE this is
 *          a pure lookup
 *
 * Returns the number of blocks mapped (> 0), 0 for a lookup that found
 * nothing, or a negative error.  The initial -EIO is what is returned when
 * ext4_block_to_path() reports depth 0.
 */
static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
			       struct ext4_map_blocks *map,
			       int flags)
{
	int err = -EIO;
	ext4_lblk_t offsets[4];
	Indirect chain[4];
	Indirect *partial;
	ext4_fsblk_t goal;
	int indirect_blks;
	int blocks_to_boundary = 0;
	int depth;
	int count = 0;
	ext4_fsblk_t first_block = 0;

	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
				   &blocks_to_boundary);

	if (depth == 0)
		goto out;

	partial = ext4_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		first_block = le32_to_cpu(chain[depth - 1].key);
		count++;
		/* map more blocks: extend while the following slots hold
		 * physically consecutive block numbers */
		while (count < map->m_len && count <= blocks_to_boundary) {
			ext4_fsblk_t blk;

			blk = le32_to_cpu(*(chain[depth-1].p + count));

			if (blk == first_block + count)
				count++;
			else
				break;
		}
		goto got_it;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
		goto cleanup;

	/*
	 * Okay, we need to do block allocation.
	 */
	goal = ext4_find_goal(inode, map->m_lblk, partial);

	/* the number of blocks need to allocate for [d,t]indirect blocks */
	indirect_blks = (chain + depth) - partial - 1;

	/*
	 * Next look up the indirect map to count the total number of
	 * direct blocks to allocate for this branch.
	 */
	count = ext4_blks_to_allocate(partial, indirect_blks,
				      map->m_len, blocks_to_boundary);
	/*
	 * Block out ext4_truncate while we alter the tree
	 */
	err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
				&count, goal,
				offsets + (partial - chain), partial);

	/*
	 * The ext4_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case.  --sct
	 */
	if (!err)
		err = ext4_splice_branch(handle, inode, map->m_lblk,
					 partial, indirect_blks, count);
	if (err)
		goto cleanup;

	map->m_flags |= EXT4_MAP_NEW;	/* blocks were newly allocated */

	ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
	map->m_flags |= EXT4_MAP_MAPPED;
	map->m_pblk = le32_to_cpu(chain[depth-1].key);
	map->m_len = count;
	if (count > blocks_to_boundary)
		map->m_flags |= EXT4_MAP_BOUNDARY;
	err = count;
	/* Clean up and exit */
	partial = chain + depth - 1;	/* the whole chain */
cleanup:
	while (partial > chain) {
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
out:
	return err;
}


在ext4_map_blocks函数中调用了两次ext4_ind_map_blocks:第一次是读尝试,得到EXT4_I(inode)->i_data_sem的读信号量后,将flags设置为0,只进行block寻址;当需要写数据时,再取得写信号量,再次调用ext4_ind_map_blocks去真正地分配block

/*
 * ext4_map_blocks - look up, and if requested allocate, blocks for a file
 * @handle: journal handle passed through to the mapping helpers
 * @inode:  file being mapped
 * @map:    request (m_lblk, m_len) and result (m_pblk, m_len, m_flags)
 * @flags:  EXT4_GET_BLOCKS_* flags
 *
 * Works in two passes.  First a lookup with flags forced to 0 under the
 * read side of i_data_sem.  If that did not produce a mapping and
 * EXT4_GET_BLOCKS_CREATE is set, the helper is called again with the
 * caller's flags under the write side of i_data_sem to allocate.
 *
 * Returns what the mapping helper returned (number of blocks, 0, or a
 * negative error), or the non-zero result of check_block_validity().
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
		    struct ext4_map_blocks *map, int flags)
{
	int retval;

	map->m_flags = 0;
	ext_debug("ext4_map_blocks():inode %lu, flag %d, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
		  (unsigned long) map->m_lblk);
	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
	 */
	down_read((&EXT4_I(inode)->i_data_sem));	/* take the read lock */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, 0);
	} else {
		/* lookup only: flags == 0 */
		retval = ext4_ind_map_blocks(handle, inode, map, 0);
	}
	up_read((&EXT4_I(inode)->i_data_sem));
	/* debug trace added for the article */
	printk("[%s,%d]retval = %d, map->m_flags = 0x%x\n",
	       __func__, __LINE__, retval, map->m_flags);
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		int ret = check_block_validity(inode, __func__, map);
		if (ret != 0) {
			return ret;
		}
	}

	/* If it is only a block(s) look up */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
		return retval;
	}
	/*
	 * Returns if the blocks have already allocated
	 *
	 * Note that if blocks have been preallocated, the create = 0
	 * lookup above returns with the buffer head unmapped.
	 */
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		return retval;
	}

	/*
	 * When we call get_blocks without the create flag, the
	 * BH_Unwritten flag could have gotten set if the blocks
	 * requested were part of a uninitialized extent.  We need to
	 * clear this flag now that we are committed to convert all or
	 * part of the uninitialized extent to be an initialized
	 * extent.  This is because we need to avoid the combination
	 * of BH_Unwritten and BH_Mapped flags being simultaneously
	 * set on the buffer_head.
	 */
	map->m_flags &= ~EXT4_MAP_UNWRITTEN;

	/*
	 * New blocks allocate and/or writing to uninitialized extent
	 * will possibly result in updating i_data, so we take
	 * the write lock of i_data_sem, and call get_blocks()
	 * with create == 1 flag.
	 */
	down_write((&EXT4_I(inode)->i_data_sem));	/* take the write lock */
	/*
	 * if the caller is from delayed allocation writeout path
	 * we have already reserved fs blocks for allocation
	 * let the underlying get_block() function know to
	 * avoid double accounting
	 */
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
		EXT4_I(inode)->i_delalloc_reserved_flag = 1;
	/*
	 * We need to check for EXT4 here because migrate
	 * could have changed the inode type in between
	 */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags);
	} else {
		/* second pass: actually allocate */
		retval = ext4_ind_map_blocks(handle, inode, map, flags);

		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
			/*
			 * We allocated new blocks which will result in
			 * i_data's format changing.  Force the migrate
			 * to fail by clearing migrate flags
			 */
			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
		}

		/*
		 * Update reserved blocks/metadata blocks after successful
		 * block allocation which had been deferred till now. We don't
		 * support fallocate for non extent files. So we can update
		 * reserve space here.
		 */
		if ((retval > 0) &&
		    (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
			ext4_da_update_reserve_space(inode, retval, 1);
	}
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
		EXT4_I(inode)->i_delalloc_reserved_flag = 0;

	up_write((&EXT4_I(inode)->i_data_sem));
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		int ret = check_block_validity(inode,
					       "ext4_map_blocks_after_alloc",
					       map);
		if (ret != 0)
			return ret;
	}
	return retval;
}


EXT4文件系统中分配block最关键的函数是ext4_mb_new_blocks。大家知道EXT4使用了MBalloc分配器,即多块分配器,在分配block时,并不是按需分配,而是进行一定的预分配,并且根据文件大小的不同使用不同的分配策略(组分配 & 大文件的inode分配)

为了完成这个功能,EXT4的超级块数据结构中多了一个名为s_buddy_cache的inode指针,用于存放bitmap cache和buddy cache。buddy cache的使用类似于linux内存的buddy分配器,即把连续空闲的block按照2的n次方去存放和分配。从proc节点中我们可以简单地了解一下buddy cache的使用。

/mnt# cat /proc/fs/ext4/loop0/mb_groups

#group:free frags first [ 2^0 2^1 2^2 2^3 2^4 2^5 2^6 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]

#0 : 469 2 550 [ 1 2 2 3 3 0 2 2 0 0 0 0 0 0 ]

这边只有一个块组的buddy信息:块组号为0,空闲block数为469,块组的碎片数为2,第一个空闲的block为550(469 = 1 + 2*2 + 2*4 + 3*8 + 3*16 + 2*64 + 2*128)。MBalloc根据这些信息去分配block


ext4_fsblk_text4_mb_new_blocks(handle_t *handle,

struct ext4_allocation_request *ar, int *errp)

{

intfreed;

structext4_allocation_context *ac = NULL;

structext4_sb_info *sbi;

structsuper_block *sb;

ext4_fsblk_tblock = 0;

unsignedint inquota = 0;

unsignedint reserv_blks = 0;


sb= ar->inode->i_sb;

sbi= EXT4_SB(sb);


trace_ext4_request_blocks(ar);


/*

* For delayed allocation, we could skip the ENOSPC and

* EDQUOT check, as blocks and quotas have been already

* reserved when data being copied into pagecache.

*/

if(EXT4_I(ar->inode)->i_delalloc_reserved_flag)

ar->flags|= EXT4_MB_DELALLOC_RESERVED;

else{

/*Without delayed allocation we need to verify

* there is enough free blocks to do block allocation

* and verify allocation doesn't exceed the quota limits.

*/

while(ar->len && ext4_claim_free_blocks(sbi, ar->len)) {

/*let others to free the space */

yield();

ar->len= ar->len >> 1;

}

if(!ar->len) {

*errp= -ENOSPC;

return0;

}

reserv_blks= ar->len;

while(ar->len && dquot_alloc_block(ar->inode, ar->len)) {

ar->flags|= EXT4_MB_HINT_NOPREALLOC;

ar->len--;

}

inquota= ar->len;

if(ar->len == 0) {

*errp= -EDQUOT;

gotoout3;

}

}


ac= kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);

if(!ac) {

ar->len= 0;

*errp= -ENOMEM;

gotoout1;

}

//初始化ext4_allocation_context*ac

//并根据文件的大小决定使用组分配还是大文件分(默认16blocks)

*errp= ext4_mb_initialize_context(ac, ar);

if(*errp) {

ar->len= 0;

gotoout2;

}


ac->ac_op= EXT4_MB_HISTORY_PREALLOC;

//首先非文件不是用预分配,如果是文件则依次查找链表

//i_prealloc_list& lg_prealloc_list是否满足条件

if(!ext4_mb_use_preallocated(ac)) {

ac->ac_op= EXT4_MB_HISTORY_ALLOC;

ext4_mb_normalize_request(ac,ar);

//根据不同的分配策略,计算预分配的大小,组预分配的大小为512blocks

repeat:

/*allocate space in core */

ext4_mb_regular_allocator(ac);

//根据buddycache分配block

/*as we've just preallocated more space than

* user requested orinally, we store allocated

* space in a special descriptor */

printk("[%s, %d]ac->ac_o_ex.fe_len:%d,ac->ac_b_ex.fe_len:%d\n",

__func__, __LINE__, ac->ac_o_ex.fe_len, ac->ac_b_ex.fe_len);

if(ac->ac_status == AC_STATUS_FOUND &&

ac->ac_o_ex.fe_len< ac->ac_b_ex.fe_len)

ext4_mb_new_preallocation(ac);

//更新预分配信息

}

if(likely(ac->ac_status == AC_STATUS_FOUND)) {

*errp= ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);

if(*errp == -EAGAIN) {

printk("ext4_mb_release_context\n");

/*

* drop the reference that we took

* in ext4_mb_use_best_found

*/

ext4_mb_release_context(ac);

ac->ac_b_ex.fe_group= 0;

ac->ac_b_ex.fe_start= 0;

ac->ac_b_ex.fe_len= 0;

ac->ac_status= AC_STATUS_CONTINUE;

gotorepeat;

}else if (*errp) {

ext4_discard_allocated_blocks(ac);

ac->ac_b_ex.fe_len= 0;

ar->len= 0;

ext4_mb_show_ac(ac);

}else {


block= ext4_grp_offs_to_block(sb, &ac->ac_b_ex);

ar->len= ac->ac_b_ex.fe_len;

}

}else {

freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);

if(freed)

gotorepeat;

*errp= -ENOSPC;

ac->ac_b_ex.fe_len= 0;

ar->len= 0;

ext4_mb_show_ac(ac);

}


ext4_mb_release_context(ac);


out2:

kmem_cache_free(ext4_ac_cachep,ac);

out1:

if(inquota && ar->len < inquota)

dquot_free_block(ar->inode,inquota - ar->len);

out3:

if(!ar->len) {

if(!EXT4_I(ar->inode)->i_delalloc_reserved_flag)

/*release all the reserved blocks if non delalloc */

percpu_counter_sub(&sbi->s_dirtyblocks_counter,

reserv_blks);

}


trace_ext4_allocate_blocks(ar,(unsigned long long)block);

printk("[%s,%d]block:%lld\n", __func__, __LINE__, block);


returnblock;

}


/*
 * ext4_mb_generate_buddy - build a group's buddy data from its block bitmap
 * @sb:     super block of the file system
 * @buddy:  buddy bitmap buffer to fill in
 * @bitmap: block bitmap to scan (a set bit is a block in use)
 * @group:  block group number
 *
 * Scans @bitmap for runs of free (zero) bits.  Each run counts as one
 * fragment; runs longer than one block are handed to
 * ext4_mb_mark_free_simple(), single blocks only bump bb_counters[0].
 * Also records bb_first_free and bb_fragments, and resets bb_free to the
 * counted value if the two disagree.
 *
 * Article notes on the helpers: mb_find_next_bit(addr, max, start)
 * searches from bit @start for the next set bit and returns its position
 * (searching low to high, the next set bit of 0x04 is bit 2);
 * ffs(0x4) == 3, i.e. ffs() reports the position 1-based.
 *
 * The positions and lengths are ext4_grpblk_t rather than unsigned short:
 * a group holds 8 * blocksize blocks, which no longer fits in 16 bits
 * once the block size reaches 8 KiB.
 */
static void ext4_mb_generate_buddy(struct super_block *sb,
				   void *buddy, void *bitmap, ext4_group_t group)
{
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
	ext4_grpblk_t i = 0;
	ext4_grpblk_t first;
	ext4_grpblk_t len;
	unsigned free = 0;
	unsigned fragments = 0;
	unsigned long long period = get_cycles();

	/* initialize buddy from bitmap which is aggregation
	 * of on-disk bitmap and preallocations */
	i = mb_find_next_zero_bit(bitmap, max, 0);
	/* first free block of the group */
	grp->bb_first_free = i;
	while (i < max) {
		fragments++;	/* one more free extent in this group */
		first = i;
		/* the next set bit ends the current free run */
		i = mb_find_next_bit(bitmap, max, i);
		len = i - first;
		free += len;
		if (len > 1)
			ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
		else
			grp->bb_counters[0]++;
		if (i < max)
			i = mb_find_next_zero_bit(bitmap, max, i);
	}
	grp->bb_fragments = fragments;

	if (free != grp->bb_free) {
		ext4_grp_locked_error(sb, group,  __func__,
			"EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
			group, free, grp->bb_free);
		/*
		 * If we intent to continue, we consider group descritor
		 * corrupt and update bb_free using bitmap value
		 */
		grp->bb_free = free;
	}

	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

	/* account the time spent building this buddy */
	period = get_cycles() - period;
	spin_lock(&EXT4_SB(sb)->s_bal_lock);
	EXT4_SB(sb)->s_mb_buddies_generated++;
	EXT4_SB(sb)->s_mb_generation_time += period;
	spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}


/*
 * ext4_mb_mark_free_simple - record one free extent in the buddy bitmaps
 * @sb:    super block of the file system
 * @buddy: buddy bitmap buffer of the group
 * @first: first free block of the extent (group-relative)
 * @len:   length of the extent in blocks
 * @grp:   in-memory group info whose bb_counters[] are updated
 *
 * Splits [@first, @first + @len) into power-of-two chunks.  Each chunk is
 * as large as both the alignment of its start and the remaining length
 * allow.  For every chunk the per-order counter is incremented and, for
 * orders above 0, the matching bit in that order's buddy bitmap is cleared.
 *
 * ffs() gives the position of the lowest set bit, fls() the position of
 * the highest set bit (both 1-based).
 */
static void ext4_mb_mark_free_simple(struct super_block *sb,
				     void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
				     struct ext4_group_info *grp)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_grpblk_t min;
	ext4_grpblk_t max;
	ext4_grpblk_t chunk;
	/*
	 * Must be wider than 16 bits: 2 << s_blocksize_bits is 65536 for
	 * 32 KiB blocks.  A wrapped value of 0 would make ffs() return 0
	 * for first == 0 and produce a negative shift below.
	 */
	unsigned int border;

	BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));

	/* caps the chunk order at s_blocksize_bits + 1 */
	border = 2 << sb->s_blocksize_bits;

	while (len > 0) {
		/* find how many blocks can be covered since this position */
		max = ffs(first | border) - 1;

		/* find how many blocks of power 2 we need to mark */
		min = fls(len) - 1;

		if (max < min)
			min = max;
		chunk = 1 << min;

		/* mark multiblock chunks only */
		grp->bb_counters[min]++;
		if (min > 0)
			/*
			 * s_mb_offsets[min] is where the order-min bitmap
			 * starts inside @buddy (set up in ext4_mb_init());
			 * first >> min is the chunk's index in that bitmap.
			 * Order 0 is not stored in @buddy, the ordinary
			 * block bitmap serves for it.
			 *
			 * Article example (1 KiB blocks, 8192 blocks per
			 * group): order 1 starts at offset 0 and has 4096
			 * entries (8192 = 2 * 4096 = 4 * 2048).  The state
			 * can be inspected with
			 * cat /proc/fs/ext4/<sda1>/mb_groups | more
			 */
			mb_clear_bit(first >> min,
				     buddy + sbi->s_mb_offsets[min]);
		len -= chunk;
		first += chunk;
	}
}


/*
 * ext4_mb_init - set up the multiblock allocator for a mounted file system
 * @sb:             super block being mounted
 * @needs_recovery: not used by the code shown here
 *
 * Allocates and fills the per-order tables s_mb_offsets[] / s_mb_maxs[],
 * initialises the buddy backend, the tunables, the per-CPU locality
 * groups and the proc/history hooks.
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or the error
 * returned by ext4_mb_init_backend().
 */
int ext4_mb_init(struct super_block *sb, int needs_recovery)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	unsigned i, j;
	unsigned offset;
	unsigned offset_incr;
	unsigned max;
	int ret;

	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);

	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
	if (sbi->s_mb_offsets == NULL) {
		return -ENOMEM;
	}

	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
	if (sbi->s_mb_maxs == NULL) {
		/* s_mb_maxs is NULL here; the table that must be released
		 * is the one allocated just above. */
		kfree(sbi->s_mb_offsets);
		return -ENOMEM;
	}

	/* order 0 is regular bitmap */
	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
	sbi->s_mb_offsets[0] = 0;

	i = 1;
	offset = 0;
	offset_incr = 1 << (sb->s_blocksize_bits - 1);
	max = sb->s_blocksize << 2;
	/*
	 * Fill the tables for orders 1 .. s_blocksize_bits + 1.
	 * s_mb_offsets[k] is the byte offset of the order-k bitmap inside
	 * the buddy block, s_mb_maxs[k] the number of bits it holds.
	 *
	 * With s_blocksize = 1024 (s_blocksize_bits = 10) this yields:
	 *   order  1: offset    0, max 4096
	 *   order  2: offset  512, max 2048
	 *   order  3: offset  768, max 1024
	 *   order  4: offset  896, max  512
	 *   order  5: offset  960, max  256
	 *   order  6: offset  992, max  128
	 *   order  7: offset 1008, max   64
	 *   order  8: offset 1016, max   32
	 *   order  9: offset 1020, max   16
	 *   order 10: offset 1022, max    8
	 *   order 11: offset 1023, max    4
	 * so the order-1 bitmap (4096 bits = 512 bytes) occupies bytes
	 * 0..511, and so on.  NOTE(article): one group's buddy cache and
	 * bitmap cache are each one block (1 KiB here).
	 *
	 * The increment is halved step by step instead of being recomputed
	 * as 1 << (s_blocksize_bits - i): that shift count is -1 on the
	 * last iteration, which is undefined behaviour.
	 */
	do {
		sbi->s_mb_offsets[i] = offset;
		sbi->s_mb_maxs[i] = max;
		offset += offset_incr;
		offset_incr = offset_incr >> 1;
		max = max >> 1;
		i++;
	} while (i <= sb->s_blocksize_bits + 1);

	/* init file for buddy data */
	ret = ext4_mb_init_backend(sb);
	if (ret != 0) {
		kfree(sbi->s_mb_offsets);
		kfree(sbi->s_mb_maxs);
		return ret;
	}

	spin_lock_init(&sbi->s_md_lock);
	spin_lock_init(&sbi->s_bal_lock);

	/* default tunables */
	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
	sbi->s_mb_stats = MB_DEFAULT_STATS;
	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
	sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
	sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;

	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
	if (sbi->s_locality_groups == NULL) {
		/* NOTE(review): whatever ext4_mb_init_backend() set up is
		 * not undone on this path - confirm whether a teardown
		 * helper should be called here. */
		kfree(sbi->s_mb_offsets);
		kfree(sbi->s_mb_maxs);
		return -ENOMEM;
	}
	for_each_possible_cpu(i) {
		struct ext4_locality_group *lg;
		lg = per_cpu_ptr(sbi->s_locality_groups, i);
		mutex_init(&lg->lg_mutex);
		for (j = 0; j < PREALLOC_TB_SIZE; j++)
			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
		spin_lock_init(&lg->lg_prealloc_lock);
	}

	ext4_mb_init_per_dev_proc(sb);
	ext4_mb_history_init(sb);

	if (sbi->s_journal)
		sbi->s_journal->j_commit_callback = release_blocks_on_commit;

	printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
	return 0;
}

你可能感兴趣的:(list,struct,ext,Flex,branch,Allocation)