/*
 * Group-scanning part of the allocator: try the goal first, then walk the
 * block groups with progressively looser criteria (cr 0..3) for as long as
 * the allocation context stays at AC_STATUS_CONTINUE.
 *
 * Returns 0, or the error from ext4_mb_find_by_goal() / ext4_mb_load_buddy().
 * If no error occurred, nothing was found, and ext4_mb_good_group_nolock()
 * reported a non-zero result <= 0 for some group, the first such value
 * (first_err) is returned instead.
 *
 * NOTE(review): part of the inner loop is elided ("...") in this excerpt;
 * nr and prefetch_ios are only initialised in the visible code, so they are
 * presumably updated in the elided section - confirm against the full source.
 */
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
ext4_group_t prefetch_grp = 0, ngroups, group, i;
int cr = -1;
int err = 0, first_err = 0;
unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
int lost;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
ngroups = ext4_get_groups_count(sb);
/* non-extent files are limited to low blocks/groups */
if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
ngroups = sbi->s_blockfile_groups;
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
/* first, try the goal */
// Try to allocate at the goal first; on error or on success jump to out.
err = ext4_mb_find_by_goal(ac, &e4b);
if (err || ac->ac_status == AC_STATUS_FOUND)
goto out;
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
goto out;
/*
* ac->ac_2order is set only if the fe_len is a power of 2
* if ac->ac_2order is set we also set criteria to 0 so that we
* try exact allocation using buddy.
*/
i = fls(ac->ac_g_ex.fe_len);
ac->ac_2order = 0;
/*
* We search using buddy data only if the order of the request
* is greater than equal to the sbi_s_mb_order2_reqs
* You can tune it via /sys/fs/ext4//mb_order2_req
* We also support searching for power-of-two requests only for
* requests upto maximum buddy size we have constructed.
*/
if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
/*
* This should tell if fe_len is exactly power of 2
*/
// The request length is exactly 2^N; e.g. a request for 1024 blocks
// gives fls() == 11 and therefore ac->ac_2order = 10.
if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
ac->ac_2order = array_index_nospec(i - 1,
sb->s_blocksize_bits + 2);
}
/* if stream allocation is enabled, use global goal */
// The goal itself could not be used, so a stream allocation restarts from
// the position recorded in s_mb_last_group / s_mb_last_start.
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
/* TBD: may be hot point */
spin_lock(&sbi->s_md_lock);
ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
// cr expresses how strict the search is: a power-of-two request
// (ac_2order != 0) starts at cr = 0, everything else starts at cr = 1.
/*
* cr == 0 try to get exact allocation,
* cr == 3 try to get anything
*/
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
/*
* searching for the right group start
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
prefetch_grp = group;
for (i = 0; i < ngroups; group++, i++) {
int ret = 0;
cond_resched();
/*
* Artificially restricted ngroups for non-extent
* files makes group > ngroups possible on first loop.
*/
if (group >= ngroups)
group = 0;
...
/* This now checks without needing the buddy page */
ret = ext4_mb_good_group_nolock(ac, group, cr);
if (ret <= 0) {
// Remember only the first non-zero result; it is reported at out
// if the whole scan ends without finding anything.
if (!first_err)
first_err = ret;
continue;
}
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err)
goto out;
ext4_lock_group(sb, group);
/*
* We need to check again after locking the
* block group
*/
ret = ext4_mb_good_group(ac, group, cr);
if (ret == 0) {
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
continue;
}
// Reaching here means the group passed both checks and will be scanned.
ac->ac_groups_scanned++;
if (cr == 0)
// The requested block count is exactly a power of two.
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 && sbi->s_stripe &&
!(ac->ac_g_ex.fe_len % sbi->s_stripe))
// The requested length is a whole multiple of the stripe size
// (an optimisation for RAID).
ext4_mb_scan_aligned(ac, &e4b);
else
// Walk the free extents of the block group looking for the most
// suitable one.
ext4_mb_complex_scan_group(ac, &e4b);
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
if (ac->ac_status != AC_STATUS_CONTINUE)
break;
}
}
// The criteria loop has finished. The branch below handles the case where a
// candidate extent was recorded (fe_len > 0) but has not been taken yet.
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
!(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
/*
* We've been searching too long. Let's try to allocate
* the best chunk we've found so far
*/
// Try to take the best candidate recorded so far.
ext4_mb_try_best_found(ac, &e4b);
// If that did not succeed, rescan the groups once more at cr = 3 with
// EXT4_MB_HINT_FIRST set, i.e. accept the first free extent found.
if (ac->ac_status != AC_STATUS_FOUND) {
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
* found block(s)
*/
lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len, lost);
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
cr = 3;
goto repeat;
}
}
out:
// Report the first group-check result only when no real error occurred and
// nothing was found.
if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
err = first_err;
mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
ac->ac_flags, cr, err);
if (nr)
ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
return err;
}
/*
 * Try to satisfy the request exactly at the goal position
 * (ac->ac_g_ex.fe_group / fe_start).
 *
 * Does nothing unless EXT4_MB_HINT_TRY_GOAL is set, and gives up early when
 * the goal group has no free blocks or its block bitmap is marked corrupt.
 * On success the chosen extent is copied into ac->ac_b_ex and handed to
 * ext4_mb_use_best_found().
 *
 * @ac:  allocation context describing the request
 * @e4b: buddy descriptor; loaded and unloaded inside this function
 *
 * Returns 0 whether or not an extent was taken (the caller inspects
 * ac->ac_status), or the error returned by ext4_mb_load_buddy().
 */
static noinline_for_stack
int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
				struct ext4_buddy *e4b)
{
	ext4_group_t group = ac->ac_g_ex.fe_group;
	int max;
	int err;
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
	struct ext4_free_extent ex;

	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
		return 0;
	if (grp->bb_free == 0)
		return 0;

	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
	if (err)
		return err;

	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
		ext4_mb_unload_buddy(e4b);
		return 0;
	}

	ext4_lock_group(ac->ac_sb, group);
	/*
	 * Using the buddy data, measure the free run that starts at the goal
	 * block fe_start. Ideally it is at least fe_len long, but a run that
	 * large may not exist.
	 */
	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
			     ac->ac_g_ex.fe_len, &ex);
	ex.fe_logical = 0xDEADFA11; /* debug value */

	/*
	 * RAID optimisation: a request of exactly one stripe is only taken
	 * when its absolute start block is stripe-aligned as well.
	 */
	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
		ext4_fsblk_t start;

		start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
			ex.fe_start;
		/* use do_div to get remainder (would be 64-bit modulo) */
		if (do_div(start, sbi->s_stripe) == 0) {
			ac->ac_found++;
			ac->ac_b_ex = ex;
			ext4_mb_use_best_found(ac, e4b);
		}
	} else if (max >= ac->ac_g_ex.fe_len) {
		/* The goal can be satisfied in full. */
		BUG_ON(ex.fe_len <= 0);
		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
		ac->ac_found++;
		ac->ac_b_ex = ex;
		ext4_mb_use_best_found(ac, e4b);
	} else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
		/* Sometimes, caller may want to merge even small
		 * number of blocks to an existing extent */
		BUG_ON(ex.fe_len <= 0);
		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
		ac->ac_found++;
		ac->ac_b_ex = ex;
		ext4_mb_use_best_found(ac, e4b);
	}
	ext4_unlock_group(ac->ac_sb, group);
	ext4_mb_unload_buddy(e4b);

	return 0;
}
/*
 * Measure the free extent that starts at @block in the group described by
 * @e4b and store it in @ex.
 *
 * @e4b:    loaded buddy descriptor; the group lock must be held (asserted)
 * @block:  starting block inside the group, must be below the order-0 max
 * @needed: number of blocks wanted; extension stops once fe_len reaches it
 * @ex:     output extent (must not be NULL)
 *
 * Returns ex->fe_len. When @block itself is already in use, @ex is zeroed
 * and 0 is returned. The result may be smaller than @needed when the free
 * run ends early.
 *
 * NOTE(review): a section before the final return is elided ("...") in this
 * excerpt, so any post-processing of @ex is not visible here.
 */
static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
int next = block;
int max, order;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
BUG_ON(ex == NULL);
buddy = mb_find_buddy(e4b, 0, &max);
BUG_ON(buddy == NULL);
BUG_ON(block >= max);
// Check the order-0 bitmap first: if the block is already allocated,
// return an empty extent right away.
if (mb_test_bit(block, buddy)) {
ex->fe_len = 0;
ex->fe_start = 0;
ex->fe_group = 0;
return 0;
}
/* find actual order */
// order is the order of the free buddy chunk that contains block.
order = mb_find_order_for_block(e4b, block);
block = block >> order;
ex->fe_len = 1 << order;
ex->fe_start = block << order;
ex->fe_group = e4b->bd_group;
/* calc difference from given start */
// The chunk may begin before the requested block: trim the leading part so
// the extent starts exactly at the caller's block.
next = next - ex->fe_start;
ex->fe_len -= next;
ex->fe_start += next;
// Keep appending the following free chunk while the extent is still
// shorter than needed and the next block is free.
while (needed > ex->fe_len &&
mb_find_buddy(e4b, order, &max)) {
if (block + 1 >= max)
break;
next = (block + 1) * (1 << order);
if (mb_test_bit(next, e4b->bd_bitmap))
break;
order = mb_find_order_for_block(e4b, next);
block = next >> order;
ex->fe_len += 1 << order;
}
...
return ex->fe_len;
}
函数返回的ex->fe_len是从block物理块号开始的最大连续空闲块长度,有可能>=needed参数代表的请求数量,也可能小于needed,即从goal目标物理块block开始的连续空闲块不足以满足needed的申请要求。
ext4_mb_good_group_nolock函数
/*
 * Cheap pre-check of a group, made before the buddy data is loaded, followed
 * by the full ext4_mb_good_group() test.
 *
 * This could return negative error code if something goes wrong
 * during ext4_mb_init_group(). This should not be called with
 * ext4_lock_group() held.
 *
 * @ac:    allocation context
 * @group: group number to examine
 * @cr:    current search criteria
 *
 * Returns 0 when one of the quick checks rejects the group, otherwise the
 * result of ext4_mb_good_group().
 *
 * NOTE(review): the middle of the function is elided ("...") in this
 * excerpt; the ext4_mb_init_group() call mentioned above is presumably
 * there - confirm against the full source.
 */
static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
{
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct super_block *sb = ac->ac_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
/* The group lock is taken only when the caller asked for a strict check. */
bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
ext4_grpblk_t free;
int ret = 0;
if (should_lock)
ext4_lock_group(sb, group);
free = grp->bb_free;
/* No free blocks at all: reject. */
if (free == 0)
goto out;
/* For cr 0..2 the group must hold at least the requested length in total. */
if (cr <= 2 && free < ac->ac_g_ex.fe_len)
goto out;
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
goto out;
/* Drop the lock around the elided section, then retake it below. */
if (should_lock)
ext4_unlock_group(sb, group);
...
if (should_lock)
ext4_lock_group(sb, group);
ret = ext4_mb_good_group(ac, group, cr);
out:
if (should_lock)
ext4_unlock_group(sb, group);
return ret;
}
做一些基本判定后调用ext4_mb_good_group函数。
检查在cr严苛程度下能否完成分配
/*
 * Decide whether @group is worth scanning for the request in @ac at the
 * given criteria level @cr (0 = strictest, 3 = anything goes).
 *
 * This is also called BEFORE we load the buddy bitmap.
 * Returns true when the group is suitable for the allocation, false
 * otherwise. Groups with a corrupt block bitmap, no free blocks or no
 * free fragments are always rejected.
 */
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
				ext4_group_t group, int cr)
{
	ext4_grpblk_t free, fragments;
	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);

	BUG_ON(cr < 0 || cr >= 4);

	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
		return false;

	free = grp->bb_free;
	if (!free)
		return false;

	/* Also guards the division in the cr == 1 case. */
	fragments = grp->bb_fragments;
	if (!fragments)
		return false;

	switch (cr) {
	case 0:
		BUG_ON(ac->ac_2order == 0);

		/* Avoid using the first bg of a flexgroup for data files */
		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
		    flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME &&
		    group % flex_size == 0)
			return false;

		/* Fewer free blocks in total than requested. */
		if (free < ac->ac_g_ex.fe_len)
			return false;

		/*
		 * Orders above s_blocksize_bits + 1 are accepted as they are;
		 * otherwise the group's largest free order has to reach the
		 * requested order.
		 */
		return ac->ac_2order > ac->ac_sb->s_blocksize_bits + 1 ||
		       grp->bb_largest_free_order >= ac->ac_2order;
	case 1:
		/* The average free-extent length must cover the request. */
		return free / fragments >= ac->ac_g_ex.fe_len;
	case 2:
		/* The total number of free blocks must cover the request. */
		return free >= ac->ac_g_ex.fe_len;
	case 3:
		/* Least strict level: any group with free space qualifies. */
		return true;
	default:
		BUG();
	}

	return false;
}
/*
 * The routine scans the group and measures all found extents.
 * In order to optimize scanning, caller must pass number of
 * free blocks in the group, so the routine can know upper limit.
 *
 * @ac:  allocation context; scanning stops once ac_status leaves
 *       AC_STATUS_CONTINUE
 * @e4b: loaded buddy descriptor of the group being scanned
 *
 * NOTE(review): two sections inside the loop are elided ("...") in this
 * excerpt, so sanity checks on the values of i and ex are not visible here.
 */
static noinline_for_stack
void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct super_block *sb = ac->ac_sb;
void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
int i;
int free;
free = e4b->bd_info->bb_free;
if (WARN_ON(free <= 0))
return;
// Start searching at the first free block recorded for the group.
i = e4b->bd_info->bb_first_free;
// free counts the free blocks not yet visited; stop when it reaches zero.
while (free && ac->ac_status == AC_STATUS_CONTINUE) {
i = mb_find_next_zero_bit(bitmap,
EXT4_CLUSTERS_PER_GROUP(sb), i);
...
mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
...
ex.fe_logical = 0xDEADC0DE; /* debug value */
// Evaluate whether this extent is suitable.
ext4_mb_measure_extent(ac, &ex, e4b);
// Skip past the extent just measured.
i += ex.fe_len;
free -= ex.fe_len;
}
ext4_mb_check_limits(ac, e4b, 1);
}
/*
* The routine checks whether found extent is good enough. If it is,
* then the extent gets marked used and flag is set to the context
* to stop scanning. Otherwise, the extent is compared with the
* previous found extent and if new one is better, then it's stored
* in the context. Later, the best found extent will be used, if
* mballoc can't find good enough extent.
*
* FIXME: real allocation policy is to be designed yet!
*/
static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
struct ext4_free_extent *ex,
struct ext4_buddy *e4b)
{
struct ext4_free_extent *bex = &ac->ac_b_ex;
struct ext4_free_extent *gex = &ac->ac_g_ex;
ac->ac_found++;
/*
* The special case - take what you catch first
*/
if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
*bex = *ex;
ext4_mb_use_best_found(ac, e4b);
return;
}
/*
* Let's check whether the chuck is good enough
*/
//如果空闲空间长度刚好跟要求分配的长度一致那么这个就是最佳的
if (ex->fe_len == gex->fe_len) {
*bex = *ex;
ext4_mb_use_best_found(ac, e4b);
return;
}
/*
* If this is first found extent, just store it in the context
*/
if (bex->fe_len == 0) {
*bex = *ex;
return;
}
/*
* If new found extent is better, store it in the context
*/
if (bex->fe_len < gex->fe_len) {
//如果当前记录的bex不满足分配,那么只要找一个更大的空闲区域,就记录到bex中
/* if the request isn't satisfied, any found extent
* larger than previous best one is better */
if (ex->fe_len > bex->fe_len)
*bex = *ex;
} else if (ex->fe_len > gex->fe_len) {
//如果已经发现了满足条件的空闲区域记录在bex中,那么就要找到最小能满gex->fe_len的
//空闲区域即可,避免使用更大的连续空间造成碎片化
/* if the request is satisfied, then we try to find
* an extent that still satisfy the request, but is
* smaller than previous one */
if (ex->fe_len < bex->fe_len)
*bex = *ex;
}
ext4_mb_check_limits(ac, e4b, 0);
}