page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int migratetype)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
bool sync_migration = false;
bool deferred_compaction = false;
bool contended_compaction = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
* reclaim >= MAX_ORDER areas which will never succeed. Callers may
* be using allocators in order of preference for an area that is
* too large.
*/
if (order >= MAX_ORDER) {//健康检查
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
/*
* GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
* __GFP_NOWARN set) should not cause reclaim since the subsystem
* (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
* using a larger set of nodes after it has established that the
* allowed per node queues are empty and that nodes are
* over allocated.
*/
if (IS_ENABLED(CONFIG_NUMA) &&
(gfp_mask & GFP_THISNODE) == GFP_THISNODE)//不能重试
goto nopage;
restart:
if (!(gfp_mask & __GFP_NO_KSWAPD))//唤醒负责换出内存页的kswapd守护进程
wake_all_kswapd(order, zonelist, high_zoneidx,
zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
* reclaim. Now things get more complex, so set up alloc_flags according
* to how we want to proceed.
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);//集合下所有分配内存标识
/*
* Find the true preferred zone if the allocation is unconstrained by
* cpusets.
*///找到一个最合适的zone,但这是在不考虑cpu集的情况下
if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
first_zones_zonelist(zonelist, high_zoneidx, NULL,
&preferred_zone);
rebalance:
/* This is the last chance, in general, before the goto nopage. *///再用快速分配试下,这是在最低水印的情况下
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page)
goto got_pg;
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
/*
* Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
* the allocation is high priority and these type of
* allocations are system rather than user orientated
*/
zonelist = node_zonelist(numa_node_id(), gfp_mask);//这里返回的是本地节点的备用zonelist[0]
//高优先级分配,主要是 没有水印分配,如果是不能失败分配,则下面一定会分配到内存,否则就系统崩溃了
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
if (page) {
goto got_pg;
}
}
/* Atomic allocations - we can't balance anything */
if (!wait)//不能等待,则失败返回。这可以看出后面要睡眠等待
goto nopage;
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)//如果是执行公务分配内存,则失败返回,避免递归等待,产生死锁
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
/*
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*///试着直接压缩内存,先是异步,下次再用同步
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, sync_migration,
&contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
sync_migration = true;//设置为同步
/*
* If compaction is deferred for high-order allocations, it is because
* sync compaction recently failed. In this is the case and the caller
* requested a movable allocation that does not heavily disrupt the
* system then fail the allocation instead of entering direct reclaim.
*/
if ((deferred_compaction || contended_compaction) &&
(gfp_mask & __GFP_NO_KSWAPD))
goto nopage;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, &did_some_progress);
if (page)
goto got_pg;
/*
* If we failed to make any progress reclaiming, then we are
* running out of options and have to consider going OOM
*/
if (!did_some_progress) {//页面没有回收成功,只有OOM了
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {//可以调用文件系统函数,可以重试
if (oom_killer_disabled)//如果关闭了OOM,则返回NULL
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
if ((current->flags & PF_DUMPCORE) &&
!(gfp_mask & __GFP_NOFAIL))
goto nopage;
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
migratetype);
if (page)
goto got_pg;
if (!(gfp_mask & __GFP_NOFAIL)) {
/*
* The oom killer is not called for high-order
* allocations that may fail, so if no progress
* is being made, there are no other options and
* retrying is unlikely to help.
*/
if (order > PAGE_ALLOC_COSTLY_ORDER)//如果需要分配的阶太大了,就没办法了,就算杀了一个进程,释放该进程占用的所有内存估计也不够
goto nopage;//所以干脆告诉你没有内存
/*
* The oom killer is not called for lowmem
* allocations to prevent needlessly killing
* innocent tasks.
*/
if (high_zoneidx < ZONE_NORMAL)//如果合适的zone已经试到DMA区域了,那还是放弃吧,因为DMA区域的内存本来就少
goto nopage;
}
goto restart;//不能失败的话,那只好再到回去重试了
}
}
//运行到这里的话,表明上一步回收内存成功了
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, did_some_progress,
pages_reclaimed)) {//如果还要重试分配内存
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);//则等待一会再试,等待其他页面的空闲
goto rebalance;
} else {//不再试了,
/*
* High-order allocations do not necessarily loop after
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*///再一次压缩下内存,看看能否分配到内存
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, sync_migration,
&contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
}
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
return page;
got_pg:
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
return page;
}
/*
* This is called in the allocator slow-path if the allocation request is of
* sufficient urgency to ignore watermarks and take other desperate measures
*/
static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int migratetype)
{
struct page *page;
do {
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);//没有水印去分配内存,不遗余力的了
if (!page && gfp_mask & __GFP_NOFAIL)
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);//同步/异步的等待数据回写,然后空出页面
} while (!page && (gfp_mask & __GFP_NOFAIL));//不能失败,死循环,直到分配到page
return page;
}
#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
int migratetype, bool sync_migration,
bool *contended_compaction, bool *deferred_compaction,
unsigned long *did_some_progress)
{
if (!order)
return NULL;
if (compaction_deferred(preferred_zone, order)) {
*deferred_compaction = true;
return NULL;
}
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration,
contended_compaction);
current->flags &= ~PF_MEMALLOC;
if (*did_some_progress != COMPACT_SKIPPED) {
struct page *page;
/* Page migration frees to the PCP lists but we want merging */
drain_pages(get_cpu());
put_cpu();
page = get_page_from_freelist(gfp_mask, nodemask,
order, zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page) {
preferred_zone->compact_blockskip_flush = false;
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
if (order >= preferred_zone->compact_order_failed)
preferred_zone->compact_order_failed = order + 1;
count_vm_event(COMPACTSUCCESS);
return page;
}
/*
* It's bad if compaction run occurs and fails.
* The most likely reason is that pages exist,
* but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
/*
* As async compaction considers a subset of pageblocks, only
* defer if the failure was a sync compaction failure.
*/
if (sync_migration)
defer_compaction(preferred_zone, order);
cond_resched();
}
return NULL;
}