Test program
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	int fd = 0, i = 0;
	char buf[1024] = {0};

	fd = open(argv[1], O_CREAT | O_RDWR, 0644);	/* create the file and open it */
	for (i = 0; i < 20; i++) {
		write(fd, buf, 1024);	/* write 20 KB into the new file (12 KB via direct addressing + 8 KB via indirect addressing) */
	}
	close(fd);
	return 0;
}
Call stacks captured with dump_stack():
Kernel call chain for open(argv[1], O_CREAT | O_RDWR):
[<c0028650>](unwind_backtrace+0x0/0xec) from [<c00e4f14>](ext4_create+0x80/0x110)
[<c00e4f14>](ext4_create+0x80/0x110) from [<c0094d84>](vfs_create+0x74/0xa4)
[<c0094d84>](vfs_create+0x74/0xa4) from [<c009594c>] (do_last+0x2b8/0x590)
[<c009594c>](do_last+0x2b8/0x590) from [<c00974b4>](do_filp_open+0x16c/0x4c0)
[<c00974b4>](do_filp_open+0x16c/0x4c0) from [<c008b240>](do_sys_open+0x58/0xdc)
[<c008b240>](do_sys_open+0x58/0xdc) from [<c0022e60>](ret_fast_syscall+0x0/0x2c)
Kernel call chain for write(fd, buf, 1024):
[<c0028650>](unwind_backtrace+0x0/0xec) from [<c00f8ae4>](ext4_mb_new_blocks+0x2c/0x3b4)
[<c00f8ae4>](ext4_mb_new_blocks+0x2c/0x3b4) from [<c00ddcbc>](ext4_ind_map_blocks+0x4bc/0xa44)
[<c00ddcbc>](ext4_ind_map_blocks+0x4bc/0xa44) from [<c00de490>](ext4_map_blocks+0x114/0x1cc)
[<c00de490>](ext4_map_blocks+0x114/0x1cc) from [<c00de608>](_ext4_get_block+0xc0/0x15c)
[<c00de608>](_ext4_get_block+0xc0/0x15c) from [<c00ad88c>](__block_prepare_write+0x1a8/0x464)
[<c00ad88c>](__block_prepare_write+0x1a8/0x464) from [<c00add28>](block_write_begin_newtrunc+0x88/0xdc)
[<c00add28>](block_write_begin_newtrunc+0x88/0xdc) from [<c00ae1cc>](block_write_begin+0x40/0x94)
[<c00ae1cc>](block_write_begin+0x40/0x94) from [<c00e198c>](ext4_write_begin+0x150/0x2e0)
[<c00e198c>](ext4_write_begin+0x150/0x2e0) from [<c0068170>](generic_file_buffered_write+0xdc/0x214)
[<c0068170>](generic_file_buffered_write+0xdc/0x214) from [<c0069d74>](__generic_file_aio_write+0x41c/0x46c)
[<c0069d74>](__generic_file_aio_write+0x41c/0x46c) from [<c0069e2c>](generic_file_aio_write+0x68/0xcc)
[<c0069e2c>](generic_file_aio_write+0x68/0xcc) from [<c008c74c>](do_sync_write+0x98/0xe4)
[<c008c74c>](do_sync_write+0x98/0xe4) from [<c008d120>](vfs_write+0xac/0x124)
[<c008d120>](vfs_write+0xac/0x124) from [<c008d244>] (sys_write+0x3c/0x68)
[<c008d244>](sys_write+0x3c/0x68) from [<c0022e60>](ret_fast_syscall+0x0/0x2c)
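These call chains were presumably captured by temporarily adding a dump_stack() call at the top of the functions of interest (e.g. ext4_create and ext4_mb_new_blocks) and rebuilding the kernel; a minimal sketch, with the rest of the function body unchanged:

/* sketch only: dump_stack() is declared in <linux/kernel.h>; only the added line is new,
 * the rest of ext4_mb_new_blocks() stays exactly as listed further below */
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
				struct ext4_allocation_request *ar, int *errp)
{
	dump_stack();	/* print the current call chain to the kernel log */
	/* ... original function body ... */
}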
Log output analysis:
/# mkdir /mnt; mount -t ext4 -o nosuid,nodev,nodelalloc 1M.img /mnt; cd /mnt;
dd if=/dev/zero of=1 bs=1024 count=20
[_ext4_get_block,1359]map.m_lblk = 0, map.m_len = 1 // writing the first 1 KB of data: length is 1 block and the logical block offset in the file is 0
[ext4_ind_map_blocks,943]depth = 1, flag = 0x0 // read attempt (flag = 0); depth = 1 means the block is reached by direct addressing
[ext4_map_blocks,1252]retval = 0, map->m_flags = 0x0
[ext4_map_blocks,1294]down_write
[ext4_ind_map_blocks,943]depth = 1, flag = 0x1 // ext4_ind_map_blocks is called again, this time to actually allocate blocks
[ext4_alloc_branch,733]indirect_blks = 0 // no extra blocks holding addressing information (metadata blocks) need to be allocated
[ext4_mb_group_or_file,3980]ac->ac_o_ex.fe_logical:0, size:1,isize:0
// decides between group (locality-group) preallocation and the large-file (per-inode stream) strategy; threshold is 16 blocks: /sys/fs/ext4/loop0/mb_stream_req
[ext4_mb_initialize_context,4060]init ac: 1 blocks @ 0, goal 190, flags a0, 2^0, left: 0/0, right0/0 to writable
[ext4_mb_use_preallocated,3167]i_prealloc_list
[ext4_mb_use_preallocated,3197]lg_prealloc_list
[ext4_mb_use_preallocated,3215]order:0, goal_block:190
[ext4_mb_normalize_group_request,2842]#35: goal 512 blocks for locality group
// default group preallocation size (512 blocks): /sys/fs/ext4/loop0/mb_group_prealloc
[ext4_mb_regular_allocator,2032]fls(512) = 10
[ext4_mb_new_blocks,4348]ac->ac_o_ex.fe_len:1, ac->ac_b_ex.fe_len:512
[ext4_mb_add_n_trim,4193]list_add_tail_rcu:8
// inserted into the locality-group preallocation lists: list[0] holds preallocations of 2^1 blocks, list[8] holds preallocations of 2^9 = 512 blocks
[ext4_mb_new_blocks,4404]block:38
[ext4_alloc_blocks,672]current_block = 38
[ext4_alloc_blocks,685]blk_allocated = 1
[ext4_alloc_branch,786]num = 1
[_ext4_get_block,1359]map.m_lblk = 1, map.m_len = 1
[ext4_ind_map_blocks,943]depth = 1, flag = 0x0
[ext4_map_blocks,1252]retval = 0, map->m_flags = 0x0
[ext4_map_blocks,1294]down_write
[ext4_ind_map_blocks,943]depth = 1, flag = 0x1
[ext4_alloc_branch,733]indirect_blks = 0
[ext4_mb_group_or_file,3980]ac->ac_o_ex.fe_logical:1, size:2,isize:1
[ext4_mb_initialize_context,4060]init ac: 1 blocks @ 1, goal 38, flags a0, 2^0, left: 0/0, right0/0 to writable
[ext4_mb_use_preallocated,3167]i_prealloc_list
[ext4_mb_use_preallocated,3197]lg_prealloc_list
[ext4_mb_use_preallocated,3215]order:0, goal_block:38
[ext4_mb_use_preallocated,3226]i:8, pa_deleted:0, pa_free:511
[ext4_mb_add_n_trim,4193]list_add_tail_rcu:8
[ext4_mb_new_blocks,4404]block:39
[ext4_alloc_blocks,672]current_block = 39
[ext4_alloc_blocks,685]blk_allocated = 1
[ext4_alloc_branch,786]num = 1
…
...
[_ext4_get_block,1350]map.m_lblk = 16, map.m_len = 1
[ext4_mb_group_or_file,3980]ac->ac_o_ex.fe_logical:16, size:17,isize:16
###s_mb_stream_request // the large-file (per-inode stream) allocation strategy is now used
[ext4_mb_initialize_context,4060]init ac: 1 blocks @ 16, goal 53, flags 820, 2^0, left: 0/0,right 0/0 to writable
[ext4_mb_use_preallocated,3167]i_prealloc_list
[ext4_mb_use_preallocated,3197]lg_prealloc_list
[ext4_mb_regular_allocator,2032]fls(32) = 6
[ext4_mb_new_blocks,4348]ac->ac_o_ex.fe_len:1, ac->ac_b_ex.fe_len:32
[ext4_mb_new_blocks,4404]block:977
[_ext4_get_block,1350]map.m_lblk = 17, map.m_len = 1
[ext4_mb_group_or_file,3980]ac->ac_o_ex.fe_logical:17, size:18,isize:17
###s_mb_stream_request
[ext4_mb_initialize_context,4060]init ac: 1 blocks @ 17, goal 977, flags 820, 2^0, left: 0/0,right 0/0 to writable
[ext4_mb_use_preallocated,3167]i_prealloc_list
[ext4_mb_new_blocks,4404]block:978
Function analysis
ext4_ind_map_blocks calls ext4_block_to_path to work out where within the inode a given block is addressed.
i_block is the logical block offset, within the file, of the block being allocated; from the log we can see that while writing 20 KB into the newly created file
the first i_block is 0, the second is 1, and so on. offsets[] stores the addressing path for the block. Let us first look at the range each addressing level covers:
#define EXT4_NDIR_BLOCKS	12			/* number of directly addressed blocks (12 KB with 1 KB blocks) */
#define EXT4_IND_BLOCK		EXT4_NDIR_BLOCKS	/* single indirect (12 KB + 256 KB = 268 KB) */
#define EXT4_DIND_BLOCK		(EXT4_IND_BLOCK + 1)	/* double indirect (12 KB + 256 KB + 256^2 KB) */
#define EXT4_TIND_BLOCK		(EXT4_DIND_BLOCK + 1)	/* triple indirect (12 KB + 256 KB + 256^2 KB + 256^3 KB) */
#define EXT4_N_BLOCKS		(EXT4_TIND_BLOCK + 1)
So offsets[] records the addressing path for i_block, i.e. how to locate that block, and depth is the addressing depth:
1 means direct addressing, 2 single indirect, 3 double indirect, and 4 triple indirect.
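As a quick sanity check of these ranges, here is a small user-space sketch (my own illustration, not kernel code; the function name block_to_path_1k and the sample blocks are assumptions) of the same calculation for a 1 KB block size, where ptrs = 256 and ptrs_bits = 8:

#include <stdio.h>

/* mirrors ext4_block_to_path for blocksize = 1 KB: ptrs = 256, ptrs_bits = 8 */
static int block_to_path_1k(long i_block, long offsets[4])
{
	const long direct = 12, ptrs = 256, ptrs_bits = 8;
	const long dbl = 1L << (ptrs_bits * 2);
	int n = 0;

	if (i_block < direct) {
		offsets[n++] = i_block;			/* i_block 0..11: direct slot */
	} else if ((i_block -= direct) < ptrs) {
		offsets[n++] = 12;			/* EXT4_IND_BLOCK */
		offsets[n++] = i_block;			/* i_block 12 -> {12, 0} */
	} else if ((i_block -= ptrs) < dbl) {
		offsets[n++] = 13;			/* EXT4_DIND_BLOCK */
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);	/* i_block 300 -> {13, 0, 32} */
	}
	return n;					/* n is the addressing depth */
}

int main(void)
{
	long off[4];
	int depth = block_to_path_1k(12, off);		/* the 13th block of the test file */
	printf("depth = %d, offsets = {%ld, %ld}\n", depth, off[0], off[1]);
	return 0;
}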
static int ext4_block_to_path(struct inode *inode,
			      ext4_lblk_t i_block,
			      ext4_lblk_t offsets[4], int *boundary)
{
	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT4_NDIR_BLOCKS,	/* number of directly addressed blocks (12) */
		indirect_blocks = ptrs,			/* blocks reachable through single indirection (1024/4 = 256) */
		double_blocks = (1 << (ptrs_bits * 2));	/* 256^2 blocks reachable through double indirection */
	int n = 0;
	int final = 0;

	if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ((i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT4_IND_BLOCK;	/* direct slots are full, offsets[0] = 12 */
		offsets[n++] = i_block;		/* single indirect; note i_block has become i_block - 12 */
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT4_DIND_BLOCK;		/* i_block = i_block - 12 - 256, offsets[0] = 13 */
		offsets[n++] = i_block >> ptrs_bits;	/* offsets[1] = i_block / 256 */
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT4_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
			     i_block + direct_blocks +
			     indirect_blocks + double_blocks, inode->i_ino);
	}
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));
		/* how many blocks remain at this addressing level,
		 * e.g. for i_block = 0, boundary = 12 - 1 = 11 */
	return n;
}
Using the addressing path returned by ext4_block_to_path, we then check whether the corresponding blocks have already been allocated. EXT4_I(inode)->i_data keeps an in-memory copy of the on-disk inode's addressing array, from which we can tell whether a block has been allocated; if it has not, the position is returned as an Indirect *p. From that return value we can decide whether additional metadata blocks have to be allocated (free blocks used only for block addressing; for example, when allocating the 13th block, single indirection is needed, so one extra indirect block has to be allocated as well).
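For reference, the Indirect descriptor used to record this chain is defined in fs/ext4/inode.c of this kernel roughly as follows (the comments are mine):

typedef struct {
	__le32	*p;			/* points at the slot holding the block number (in i_data or in an indirect block) */
	__le32	key;			/* cached value of that slot, i.e. the block number; 0 means not yet allocated */
	struct buffer_head *bh;		/* buffer of the indirect block containing the slot, NULL for the inode itself */
} Indirect;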
static Indirect *ext4_get_branch(struct inode *inode, int depth,
				 ext4_lblk_t *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
	/* static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
	 * {
	 *	p->key = *(p->p = v);	p->p points at the slot, p->key caches the block number stored there
	 *	p->bh = bh;		p->bh is the buffer_head holding that slot (NULL at the top level, i.e. the inode itself)
	 * }
	 * If i_block < 12 (direct addressing), then since the file has just been created
	 * p->key is 0, i.e. no block has been allocated yet.
	 */
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_getblk(sb, le32_to_cpu(p->key));
		if (unlikely(!bh))
			goto failure;
		if (!bh_uptodate_or_lock(bh)) {
			if (bh_submit_read(bh) < 0) {
				put_bh(bh);
				goto failure;
			}
			/* validate block references */
			if (ext4_check_indirect_blockref(inode, bh)) {
				put_bh(bh);
				goto failure;
			}
		}
		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;
failure:
	*err = -EIO;
no_block:
	return p;
}
ext4_find_goal calls ext4_find_near to compute the goal block. If this is the first allocation for the inode, the goal is derived from the inode's block group ei->i_block_group; if blocks have already been allocated, the most recently allocated block is used as the goal.
static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
	__le32 *p;
	ext4_fsblk_t bg_start;
	ext4_fsblk_t last_block;
	ext4_grpblk_t colour;
	ext4_group_t block_group;
	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
	/* s_log_groups_per_flex = log2(groups_per_flex) = 4; when the flex_bg feature is not used,
	 * flex_size = 2^0 = 1. flex_bg glues several consecutive block groups into one large group:
	 * s_log_groups_per_flex = 4 means 2^4 = 16 block groups form one flex group. */

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--) {
		if (*p)
			return le32_to_cpu(*p);	/* simply return the previously allocated block number,
						 * e.g. blocks[0] = 38, so the goal is set to 38 */
	}

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * It is going to be referred to from the inode itself? OK, just put it
	 * into the same cylinder group then.
	 */
	block_group = ei->i_block_group;	/* block group the inode lives in */
	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
		block_group &= ~(flex_size - 1);
		if (S_ISREG(inode->i_mode))
			block_group++;
	}
	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

	/*
	 * If we are doing delayed allocation, we don't need take
	 * colour into account.
	 */
	if (test_opt(inode->i_sb, DELALLOC))
		return bg_start;

	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
		colour = (current->pid % 16) *
			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	else
		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
	return bg_start + colour;
}
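Tying this to the log: the mount above uses nodelalloc, so the colour path is taken for the very first allocation. A 1 MB image with 1 KB blocks has a single block group with bg_start = 1 and last_block = 1023, so colour = (pid % 16) * ((1023 - 1) / 16) = (pid % 16) * 63, and the goal of 190 reported in the first "init ac" line appears to correspond to the dd process having pid % 16 = 3, since 1 + 3 * 63 = 190.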
ext4_ind_map_blocks resolves the addressing path, computes the goal and the number of blocks, then calls ext4_alloc_branch to allocate the blocks (including any metadata blocks), and finally calls ext4_splice_branch to hook the new branch into the inode.
static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
			       struct ext4_map_blocks *map,
			       int flags)
{
	int err = -EIO;
	ext4_lblk_t offsets[4];
	Indirect chain[4];
	Indirect *partial;
	ext4_fsblk_t goal;
	int indirect_blks;
	int blocks_to_boundary = 0;
	int depth;
	int count = 0;
	ext4_fsblk_t first_block = 0;

	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
				   &blocks_to_boundary);
	if (depth == 0)
		goto out;

	partial = ext4_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		first_block = le32_to_cpu(chain[depth - 1].key);
		count++;
		/* map more blocks */
		while (count < map->m_len && count <= blocks_to_boundary) {
			ext4_fsblk_t blk;

			blk = le32_to_cpu(*(chain[depth-1].p + count));
			if (blk == first_block + count)
				count++;
			else
				break;
		}
		goto got_it;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
		goto cleanup;

	/*
	 * Okay, we need to do block allocation.
	 */
	goal = ext4_find_goal(inode, map->m_lblk, partial);

	/* the number of blocks need to allocate for [d,t]indirect blocks */
	indirect_blks = (chain + depth) - partial - 1;	/* number of indirect (metadata) blocks to allocate */

	/*
	 * Next look up the indirect map to count the totoal number of
	 * direct blocks to allocate for this branch.
	 */
	/* returns the number of data blocks to allocate */
	count = ext4_blks_to_allocate(partial, indirect_blks,
				      map->m_len, blocks_to_boundary);
	/*
	 * Block out ext4_truncate while we alter the tree
	 */
	err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
				&count, goal,
				offsets + (partial - chain), partial);

	/*
	 * The ext4_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case. --sct
	 */
	if (!err)
		err = ext4_splice_branch(handle, inode, map->m_lblk,
					 partial, indirect_blks, count);
	if (err)
		goto cleanup;

	map->m_flags |= EXT4_MAP_NEW;	/* newly allocated blocks */

	ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
	map->m_flags |= EXT4_MAP_MAPPED;
	map->m_pblk = le32_to_cpu(chain[depth-1].key);
	map->m_len = count;
	if (count > blocks_to_boundary)
		map->m_flags |= EXT4_MAP_BOUNDARY;
	err = count;
	/* Clean up and exit */
	partial = chain + depth - 1;	/* the whole chain */
cleanup:
	while (partial > chain) {
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
out:
	return err;
}
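Tying this back to the log: when logical block 12 (the 13th block of the file) is written for the first time, ext4_block_to_path returns depth = 2 with offsets = {12, 0}; ext4_get_branch finds i_data[12] still zero and returns partial = &chain[0], so indirect_blks = (chain + 2) - partial - 1 = 1, and ext4_alloc_branch has to allocate one metadata block (the single-indirect block) in addition to the data block before ext4_splice_branch hooks the new branch into the inode.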
ext4_ind_map_blocks is called twice from this function. The first call is a read attempt: after taking the EXT4_I(inode)->i_data_sem semaphore for read, flags is set to 0 and only a lookup is performed. When data actually has to be written, the semaphore is taken for write and ext4_ind_map_blocks is called again to really allocate a block.
int ext4_map_blocks(handle_t *handle, struct inode *inode,
		    struct ext4_map_blocks *map, int flags)
{
	int retval;

	map->m_flags = 0;
	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
		  (unsigned long) map->m_lblk);
	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
	 */
	down_read((&EXT4_I(inode)->i_data_sem));	/* take i_data_sem for read */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, 0);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, 0);	/* lookup only (read attempt) */
	}
	up_read((&EXT4_I(inode)->i_data_sem));
	printk("[%s,%d]retval = %d, map->m_flags = 0x%x\n",
	       __func__, __LINE__, retval, map->m_flags);

	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		int ret = check_block_validity(inode, __func__, map);
		if (ret != 0) {
			return ret;
		}
	}

	/* If it is only a block(s) look up */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
		return retval;
	}

	/*
	 * Returns if the blocks have already allocated
	 *
	 * Note that if blocks have been preallocated
	 * ext4_ext_get_block() returns th create = 0
	 * with buffer head unmapped.
	 */
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		return retval;
	}

	/*
	 * When we call get_blocks without the create flag, the
	 * BH_Unwritten flag could have gotten set if the blocks
	 * requested were part of a uninitialized extent. We need to
	 * clear this flag now that we are committed to convert all or
	 * part of the uninitialized extent to be an initialized
	 * extent. This is because we need to avoid the combination
	 * of BH_Unwritten and BH_Mapped flags being simultaneously
	 * set on the buffer_head.
	 */
	map->m_flags &= ~EXT4_MAP_UNWRITTEN;

	/*
	 * New blocks allocate and/or writing to uninitialized extent
	 * will possibly result in updating i_data, so we take
	 * the write lock of i_data_sem, and call get_blocks()
	 * with create == 1 flag.
	 */
	down_write((&EXT4_I(inode)->i_data_sem));	/* take i_data_sem for write */

	/*
	 * if the caller is from delayed allocation writeout path
	 * we have already reserved fs blocks for allocation
	 * let the underlying get_block() function know to
	 * avoid double accounting
	 */
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
		EXT4_I(inode)->i_delalloc_reserved_flag = 1;
	/*
	 * We need to check for EXT4 here because migrate
	 * could have changed the inode type in between
	 */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags);	/* actually allocate blocks */

		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
			/*
			 * We allocated new blocks which will result in
			 * i_data's format changing. Force the migrate
			 * to fail by clearing migrate flags
			 */
			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
		}

		/*
		 * Update reserved blocks/metadata blocks after successful
		 * block allocation which had been deferred till now. We don't
		 * support fallocate for non extent files. So we can update
		 * reserve space here.
		 */
		if ((retval > 0) &&
			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
			ext4_da_update_reserve_space(inode, retval, 1);
	}
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
		EXT4_I(inode)->i_delalloc_reserved_flag = 0;

	up_write((&EXT4_I(inode)->i_data_sem));
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		int ret = check_block_validity(inode,
					       "ext4_map_blocks_after_alloc",
					       map);
		if (ret != 0)
			return ret;
	}
	return retval;
}
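A minimal sketch of how a caller drives the two phases (the flag and flag-bit names come straight from the function above; the surrounding code is illustrative, handle and inode are assumed to be in scope, and this is not a real call site from the kernel):

	struct ext4_map_blocks map = {
		.m_lblk = 0,	/* logical block in the file */
		.m_len  = 1,	/* number of blocks wanted */
	};
	int ret;

	/* lookup only: no allocation, i_data_sem is taken for read inside */
	ret = ext4_map_blocks(handle, inode, &map, 0);

	/* allocate if the block is not mapped yet: i_data_sem is taken for write inside */
	if (ret <= 0 || !(map.m_flags & EXT4_MAP_MAPPED))
		ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);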
ext4_mb_new_blocks is the key block-allocation function of the EXT4 filesystem. As is well known, EXT4 uses the mballoc (multi-block) allocator: blocks are not handed out strictly on demand, a certain amount of preallocation is performed, and different strategies are used depending on file size (locality-group preallocation vs. per-inode preallocation for large files).
To support this, the EXT4 in-memory superblock structure carries an extra inode pointer named s_buddy_cache, which holds the bitmap cache and the buddy cache. The buddy cache works much like the Linux memory buddy allocator: runs of consecutive free blocks are stored and allocated in power-of-two sizes. The proc node below gives a quick view of the buddy cache.
/mnt# cat /proc/fs/ext4/loop0/mb_groups
#group:free frags first [ 2^0 2^1 2^2 2^3 2^4 2^5 2^6 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]
#0 : 469 2 550 [ 1 2 2 3 3 0 2 2 0 0 0 0 0 0 ]
Here there is buddy information for only one block group: group number 0, with 469 free blocks, 2 free fragments, and the first free block at 550 (469 = 1 + 2*2 + 2*4 + 3*8 + 3*16 + 2*64 + 2*128). mballoc allocates blocks based on this information.
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
				struct ext4_allocation_request *ar, int *errp)
{
	int freed;
	struct ext4_allocation_context *ac = NULL;
	struct ext4_sb_info *sbi;
	struct super_block *sb;
	ext4_fsblk_t block = 0;
	unsigned int inquota = 0;
	unsigned int reserv_blks = 0;

	sb = ar->inode->i_sb;
	sbi = EXT4_SB(sb);

	trace_ext4_request_blocks(ar);

	/*
	 * For delayed allocation, we could skip the ENOSPC and
	 * EDQUOT check, as blocks and quotas have been already
	 * reserved when data being copied into pagecache.
	 */
	if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
	else {
		/* Without delayed allocation we need to verify
		 * there is enough free blocks to do block allocation
		 * and verify allocation doesn't exceed the quota limits.
		 */
		while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
			/* let others to free the space */
			yield();
			ar->len = ar->len >> 1;
		}
		if (!ar->len) {
			*errp = -ENOSPC;
			return 0;
		}
		reserv_blks = ar->len;
		while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
			ar->flags |= EXT4_MB_HINT_NOPREALLOC;
			ar->len--;
		}
		inquota = ar->len;
		if (ar->len == 0) {
			*errp = -EDQUOT;
			goto out3;
		}
	}

	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
	if (!ac) {
		ar->len = 0;
		*errp = -ENOMEM;
		goto out1;
	}

	/* initialize the ext4_allocation_context *ac and, based on the file size,
	 * decide between locality-group preallocation and per-inode (stream)
	 * allocation (the threshold defaults to 16 blocks) */
	*errp = ext4_mb_initialize_context(ac, ar);
	if (*errp) {
		ar->len = 0;
		goto out2;
	}

	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
	/* non-regular files do not use preallocation; for regular files the
	 * i_prealloc_list and lg_prealloc_list are searched in turn for a
	 * preallocation that satisfies the request */
	if (!ext4_mb_use_preallocated(ac)) {
		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
		ext4_mb_normalize_request(ac, ar);
		/* compute the preallocation size according to the chosen strategy;
		 * group preallocation defaults to 512 blocks */
repeat:
		/* allocate space in core */
		ext4_mb_regular_allocator(ac);
		/* allocate blocks according to the buddy cache */

		/* as we've just preallocated more space than
		 * user requested originally, we store allocated
		 * space in a special descriptor */
		printk("[%s, %d]ac->ac_o_ex.fe_len:%d, ac->ac_b_ex.fe_len:%d\n",
		       __func__, __LINE__, ac->ac_o_ex.fe_len, ac->ac_b_ex.fe_len);
		if (ac->ac_status == AC_STATUS_FOUND &&
		    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
			ext4_mb_new_preallocation(ac);
			/* record the surplus as a new preallocation */
	}
	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
		if (*errp == -EAGAIN) {
			printk("ext4_mb_release_context\n");
			/*
			 * drop the reference that we took
			 * in ext4_mb_use_best_found
			 */
			ext4_mb_release_context(ac);
			ac->ac_b_ex.fe_group = 0;
			ac->ac_b_ex.fe_start = 0;
			ac->ac_b_ex.fe_len = 0;
			ac->ac_status = AC_STATUS_CONTINUE;
			goto repeat;
		} else if (*errp) {
			ext4_discard_allocated_blocks(ac);
			ac->ac_b_ex.fe_len = 0;
			ar->len = 0;
			ext4_mb_show_ac(ac);
		} else {
			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
			ar->len = ac->ac_b_ex.fe_len;
		}
	} else {
		freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
		if (freed)
			goto repeat;
		*errp = -ENOSPC;
		ac->ac_b_ex.fe_len = 0;
		ar->len = 0;
		ext4_mb_show_ac(ac);
	}

	ext4_mb_release_context(ac);

out2:
	kmem_cache_free(ext4_ac_cachep, ac);
out1:
	if (inquota && ar->len < inquota)
		dquot_free_block(ar->inode, inquota - ar->len);
out3:
	if (!ar->len) {
		if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
			/* release all the reserved blocks if non delalloc */
			percpu_counter_sub(&sbi->s_dirtyblocks_counter,
						reserv_blks);
	}

	trace_ext4_allocate_blocks(ar, (unsigned long long)block);
	printk("[%s,%d]block:%lld\n", __func__, __LINE__, block);

	return block;
}
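Putting the log and the code together for the first two writes: the first 1-block request finds no existing preallocation, so ext4_mb_normalize_group_request rounds it up to the 512-block locality-group size, ext4_mb_regular_allocator finds 512 free blocks starting at block 38 in the buddy cache, one block is handed to the file and the remaining 511 are recorded as a preallocation on lg_prealloc_list[8] (the "pa_free:511" line in the log). The second 1-block request is then satisfied directly by ext4_mb_use_preallocated from that preallocation and gets block 39 without going through the regular allocator again.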
/*
 * Build the buddy data from the block bitmap.
 * mb_find_next_bit(void *addr, int max, int start)
 *   searches from bit 'start' for the next set bit and returns its position;
 *   scanning from the low end, the next "1" in 0x04 is at bit 2.
 * ffs(0x4) = 3, i.e. ffs() returns the (1-based) position of the first "1".
 */
static void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
unsigned short i = 0;
unsigned short first;
unsigned short len;
unsigned free = 0;
unsigned fragments = 0;
unsigned long long period = get_cycles();
/* initialize buddy from bitmap which is aggregation
* of on-disk bitmap and preallocations */
i = mb_find_next_zero_bit(bitmap, max, 0);
/* scan the bitmap from bit 0 for the first free block */
grp->bb_first_free = i;
while (i < max) {
fragments++; /* one more free fragment in this group */
first = i;
i = mb_find_next_bit(bitmap, max, i); /* find the end of this free run, which is then folded into the buddy data */
len = i - first;
free += len;
if (len > 1)
ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
else
grp->bb_counters[0]++;
if (i < max)
i = mb_find_next_zero_bit(bitmap, max, i);
}
grp->bb_fragments = fragments;
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, __func__,
"EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
group, free, grp->bb_free);
/*
* If we intent to continue, we consider group descritor
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
}
clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
period = get_cycles() - period;
spin_lock(&EXT4_SB(sb)->s_bal_lock);
EXT4_SB(sb)->s_mb_buddies_generated++;
EXT4_SB(sb)->s_mb_generation_time += period;
spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}
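As a tiny worked example (a hypothetical bitmap, not the one from the image above): suppose the first ten bits of a group's bitmap are 1 1 1 0 0 0 0 1 0 1 (1 = in use, 0 = free) and every later block is in use. mb_find_next_zero_bit returns bit 3, so bb_first_free = 3; the first free run is blocks 3-6 (len = 4, handed to ext4_mb_mark_free_simple) and the second is the single block 8 (len = 1, so bb_counters[0]++); the result is free = 5 and bb_fragments = 2.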
/**
 * ffs: returns the position of the first (lowest) set bit
 * fls: returns the position of the last (highest) set bit
 */
static void ext4_mb_mark_free_simple(struct super_block *sb,
void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t min;
ext4_grpblk_t max;
ext4_grpblk_t chunk;
unsigned short border;
BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
border = 2 << sb->s_blocksize_bits;
while (len > 0) {
/* find how many blocks can be covered since this position */
max = ffs(first | border) - 1;
/* find how many blocks of power 2 we need to mark */
min = fls(len) - 1;
if (max < min)
min = max;
chunk = 1 << min;
/* mark multiblock chunks only */
grp->bb_counters[min]++;
if (min > 0)
mb_clear_bit(first >> min,
buddy + sbi->s_mb_offsets[min]); /* see ext4_mb_init */
/* sbi->s_mb_offsets[min] holds the starting offset of each buddy order's bits,
 * e.g. the 2^1 order starts at offset 0 and has 4096 bits (8192 = 2 * 4096 = 4 * 2048).
 * Note: the 2^0 order is not kept in the buddy data; the bitmap buffer itself is used.
 *
 * first >> min is the position within that buddy order.
 * The mballoc state of a block device can be inspected with:
 * cat /proc/fs/ext4/<sda1>/mb_groups | more
 */
len -= chunk;
first += chunk;
}
}
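Continuing the hypothetical run above (first = 3, len = 4; 1 KB blocks, so border = 2 << 10 = 2048): on the first pass max = ffs(3 | 2048) - 1 = 0, so min = fls(4) - 1 = 2 is capped down to 0 and block 3 is counted as a 2^0 chunk; on the second pass first = 4 and len = 3, so max = ffs(4 | 2048) - 1 = 2 and min = fls(3) - 1 = 1, meaning blocks 4-5 form one 2^1 chunk and bit 4 >> 1 = 2 is cleared in the order-1 buddy; the last pass counts block 6 as another 2^0 chunk. The split of a free run into power-of-two chunks therefore depends on its alignment as well as its length.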
int ext4_mb_init(struct super_block *sb, int needs_recovery)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i, j;
unsigned offset;
unsigned max;
int ret;
i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_offsets == NULL) {
return -ENOMEM;
}
i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
kfree(sbi->s_mb_offsets); /* free the already-allocated offsets array */
return -ENOMEM;
}
/* order 0 is regular bitmap */
sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
sbi->s_mb_offsets[0] = 0;
i = 1;
offset = 0;
max = sb->s_blocksize << 2;
/* buddy offset initialisation: orders 2^0, 2^1, 2^2 ... 2^(sb->s_blocksize_bits + 1)
 *
 * With sb->s_blocksize = 1024 and sb->s_blocksize_bits = 10:
 * sbi->s_mb_offsets[1] = 0,    sbi->s_mb_maxs[1] = 4096
 * sbi->s_mb_offsets[2] = 512,  sbi->s_mb_maxs[2] = 2048
 * sbi->s_mb_offsets[3] = 768,  sbi->s_mb_maxs[3] = 1024
 * sbi->s_mb_offsets[4] = 896,  sbi->s_mb_maxs[4] = 512
 * sbi->s_mb_offsets[5] = 960,  sbi->s_mb_maxs[5] = 256
 * sbi->s_mb_offsets[6] = 992,  sbi->s_mb_maxs[6] = 128
 * sbi->s_mb_offsets[7] = 1008, sbi->s_mb_maxs[7] = 64
 * sbi->s_mb_offsets[8] = 1016, sbi->s_mb_maxs[8] = 32
 * sbi->s_mb_offsets[9] = 1020, sbi->s_mb_maxs[9] = 16
 * sbi->s_mb_offsets[10] = 1022, sbi->s_mb_maxs[10] = 8
 * sbi->s_mb_offsets[11] = 1023, sbi->s_mb_maxs[11] = 4
 * Each entry gives the starting position of that buddy order's bit information;
 * per block group the buddy cache is 1 KB and the bitmap cache is 1 KB.
 * sbi->s_mb_offsets[1] is the start of the 2^1 bit information: buddycache[0] ~ buddycache[511]
 * hold the 2^1 bits, at most 4096 of them, but with a 1 KB block size there are only 512 such
 * bits, so 512/8 = 64 bytes are enough to hold them.
 */
do {
sbi->s_mb_offsets[i] = offset;
sbi->s_mb_maxs[i] = max;
offset += 1 << (sb->s_blocksize_bits - i);
max = max >> 1;
i++;
} while (i <= sb->s_blocksize_bits + 1);
/* init file for buddy data */
ret = ext4_mb_init_backend(sb);
if (ret != 0) {
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
return ret;
}
spin_lock_init(&sbi->s_md_lock);
spin_lock_init(&sbi->s_bal_lock);
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
return -ENOMEM;
}
for_each_possible_cpu(i) {
struct ext4_locality_group *lg;
lg = per_cpu_ptr(sbi->s_locality_groups, i);
mutex_init(&lg->lg_mutex);
for (j = 0; j < PREALLOC_TB_SIZE; j++)
INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
spin_lock_init(&lg->lg_prealloc_lock);
}
ext4_mb_init_per_dev_proc(sb);
ext4_mb_history_init(sb);
if (sbi->s_journal)
sbi->s_journal->j_commit_callback = release_blocks_on_commit;
printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}