alloc_page和__get_free_page都是从Buddy系统分配页面,区别只在返回值类型:前者返回struct page指针,后者返回该page对应的内核虚拟地址(unsigned long)。
两者最终都会调用到核心函数__alloc_pages_nodemask,下面详述该函数及其快速路径(get_page_from_freelist、buffered_rmqueue)的处理流程。
/*
 * __alloc_pages_nodemask - core entry point of the buddy page allocator.
 * @gfp_mask: allocation flags; masked with gfp_allowed_mask before use
 * @order:    allocation order, handed unchanged to both allocation paths
 * @zonelist: candidate zones for this allocation
 * @nodemask: node restriction passed to the zonelist walk
 *
 * Makes one fast attempt through get_page_from_freelist() and, if that
 * yields nothing, falls back to __alloc_pages_slowpath().
 *
 * Returns the allocated page, or NULL on failure.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
		       struct zonelist *zonelist, nodemask_t *nodemask)
{
	/*
	 * Both values are derived from the caller's flags before those flags
	 * are masked below. gfp flags and migrate types do not map
	 * one-to-one, hence the explicit conversion.
	 */
	enum zone_type zone_limit = gfp_zone(gfp_mask);
	int mtype = allocflags_to_migratetype(gfp_mask);
	struct zone *first_zone;
	struct page *pg;

	gfp_mask &= gfp_allowed_mask;

	lockdep_trace_alloc(gfp_mask);

	/* An allocation carrying __GFP_WAIT is allowed to sleep. */
	might_sleep_if(gfp_mask & __GFP_WAIT);

	/*
	 * Failure-injection hook; per the original annotation it is only
	 * active with the CONFIG_FAIL_PAGE_ALLOC debug option.
	 */
	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
	 * of GFP_THISNODE and a memoryless node
	 */
	if (unlikely(zonelist->_zonerefs->zone == NULL))
		return NULL;

	/*
	 * Paired with put_mems_allowed() on every exit below; per the
	 * original annotation this pins the allocation policy so it cannot
	 * be modified while we allocate.
	 */
	get_mems_allowed();

	/* The preferred zone is used for statistics later */
	first_zones_zonelist(zonelist, zone_limit, nodemask, &first_zone);
	if (first_zone == NULL) {
		/* No usable zone at all: drop the policy reference and bail. */
		put_mems_allowed();
		return NULL;
	}

	/*
	 * First allocation attempt: the fast path, run against the low
	 * watermark with cpuset enforcement
	 * (ALLOC_WMARK_LOW | ALLOC_CPUSET, __GFP_HARDWALL).
	 */
	pg = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, nodemask, order,
				    zonelist, zone_limit,
				    ALLOC_WMARK_LOW | ALLOC_CPUSET,
				    first_zone, mtype);

	/* Fast path came back empty: hand over to the slow path. */
	if (unlikely(pg == NULL))
		pg = __alloc_pages_slowpath(gfp_mask, order,
					    zonelist, zone_limit, nodemask,
					    first_zone, mtype);

	put_mems_allowed();

	/* Tracepoint; fires for failed (NULL) allocations as well. */
	trace_mm_page_alloc(pg, order, gfp_mask, mtype);
	return pg;
}
/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 *
 * For every zone admitted by high_zoneidx and nodemask the loop applies,
 * in this order: the zonelist-cache filter (NUMA builds, once the cache is
 * active), the cpuset check (only with ALLOC_CPUSET), and the watermark
 * check (skipped with ALLOC_NO_WATERMARKS), with an optional zone reclaim
 * attempt when the watermark is not met. A zone that passes is handed to
 * buffered_rmqueue().
 *
 * Returns the page from the first zone that yields one, or NULL if no
 * zone does (on NUMA, after a second scan with the zonelist cache off).
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
 * Scan zonelist, looking for a zone with enough free.
 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 */
for_each_zone_zonelist_nodemask(zone, z, zonelist,//walk the candidate zones of the zonelist
high_zoneidx, nodemask) {
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))//zonelist cache says this zone is not worth trying
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))//cpuset / allowed-node check; see that function's header comment
goto try_next_zone;//zone not permitted, move on to the next one
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {//watermarks apply; with ALLOC_NO_WATERMARKS we go straight to the allocation
unsigned long mark;
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (zone_watermark_ok(zone, order, mark,//watermark satisfied: allocate from this zone
//NOTE(review): original annotation says the effective threshold also depends on ALLOC_HIGH / ALLOC_HARDER; that logic is inside zone_watermark_ok(), not shown here
classzone_idx, alloc_flags))
goto try_this_zone;
if (zone_reclaim_mode == 0)//zone reclaim disabled: treat the zone as full so the zonelist cache can skip it later
goto this_zone_full;
ret = zone_reclaim(zone, gfp_mask, order);//try to reclaim from this zone; the result selects the next step
//NOTE(review): original annotation says ZONE_RECLAIM_NOSCAN is returned when the caller cannot wait — confirm against zone_reclaim()
switch (ret) {
case ZONE_RECLAIM_NOSCAN://nothing was scanned: try the next zone without marking this one full
/* did not scan */
goto try_next_zone;
case ZONE_RECLAIM_FULL://scanned, but nothing could be reclaimed
/* scanned but unreclaimable */
goto this_zone_full;
default:
/* did we reclaim enough */
if (!zone_watermark_ok(zone, order, mark,//reclaim made progress: re-check the watermark; if it passes we fall through to try_this_zone
classzone_idx, alloc_flags))
goto this_zone_full;
}
}
try_this_zone:
page = buffered_rmqueue(preferred_zone, zone, order,//all checks passed (or were skipped): do the actual allocation from this zone
gfp_mask, migratetype);
if (page)
break;
this_zone_full:
/* Reached on watermark/reclaim failure and when buffered_rmqueue() returned NULL. */
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);
try_next_zone:
if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
/*
 * we do zlc_setup after the first zone is tried but only
 * if there are multiple nodes make it worthwhile
 */
allowednodes = zlc_setup(zonelist, alloc_flags);//set up the zonelist cache and remember the allowed nodes it returns
zlc_active = 1;
did_zlc_setup = 1;
}
}
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
}
return page;
}
/*
 * buffered_rmqueue - take a block of (1 << order) pages out of @zone.
 * @preferred_zone: zone passed to zone_statistics() for accounting
 * @zone:           zone to allocate from
 * @order:          allocation order; order 0 is served from the per-cpu lists
 * @gfp_flags:      allocation flags (__GFP_COLD and __GFP_NOFAIL are read here)
 * @migratetype:    selects the per-cpu list / the type passed to __rmqueue()
 *
 * Order-0 requests are served from this CPU's per-cpu page list, refilled
 * from the buddy lists in batches when empty. Larger orders go to
 * __rmqueue() under zone->lock. Interrupts stay disabled from the moment
 * the page is picked until the statistics have been updated.
 *
 * Returns the page, or NULL if nothing could be allocated.
 *
 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
 * we cheat by calling it from here, in the order > 0 path. Saves a branch
 * or two.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, int order, gfp_t gfp_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);

again:
	if (likely(order == 0)) {
		/* Single page: serve it from the per-cpu page lists. */
		struct per_cpu_pages *pcp;
		struct list_head *list;

		/* Disable interrupts and save their state. */
		local_irq_save(flags);
		pcp = &this_cpu_ptr(zone->pageset)->pcp;
		list = &pcp->lists[migratetype];
		if (list_empty(list)) {
			/*
			 * The list for this migrate type is empty: move up
			 * to pcp->batch order-0 pages from the buddy lists
			 * onto it.
			 */
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
					migratetype, cold);
			if (unlikely(list_empty(list)))
				goto failed;
		}

		/*
		 * __GFP_COLD requests take the entry at the tail of the
		 * list, all others the entry at the head.
		 * NOTE(review): presumably the head holds the most recently
		 * freed, cache-hot pages — confirm against the free path.
		 */
		if (cold)
			page = list_entry(list->prev, struct page, lru);
		else
			page = list_entry(list->next, struct page, lru);

		list_del(&page->lru);
		/* pcp->count tracks how many pages this per-cpu set holds. */
		pcp->count--;
	} else {
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			/*
			 * __GFP_NOFAIL is not to be used in new code.
			 *
			 * All __GFP_NOFAIL callers should be fixed so that they
			 * properly detect and handle allocation failures.
			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		/* Lock the zone's buddy lists, disabling interrupts. */
		spin_lock_irqsave(&zone->lock, flags);
		/*
		 * The real allocation from the buddy free lists. Per the
		 * original annotation (helpers not shown here) it has two
		 * cases:
		 * 1. __rmqueue_smallest(): the list of the requested migrate
		 *    type has room; take the first fitting block and put the
		 *    unused remainder on lower-order lists.
		 * 2. __rmqueue_fallback(): otherwise fall back to other
		 *    migrate types in a fixed sequence, move pages over to
		 *    the requested type and allocate as in case 1.
		 */
		page = __rmqueue(zone, order, migratetype);
		/*
		 * Plain spin_unlock(): interrupts deliberately stay off until
		 * local_irq_restore(flags) below (or at the failed label).
		 */
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
	}

	/* Per-cpu vm event and zone statistics, still with interrupts off. */
	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	zone_statistics(preferred_zone, zone);
	local_irq_restore(flags);

	VM_BUG_ON(bad_range(zone, page));
	/* A non-zero result from prep_new_page() rejects the page: retry. */
	if (prep_new_page(page, order, gfp_flags))
		goto again;
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}