First, the parameters used on the fast path of page allocation:
gfp_mask: on entering the fast path, __GFP_HARDWALL is ORed in; it enforces the cpuset "hardwall", i.e. the allocation must not escape the memory nodes the current cpuset allows;
nodemask: a bitmask over NUMA nodes saying which nodes the allocation may use;
order: the allocation order (a block of 2^order contiguous pages);
zonelist: when preferred_zone has no suitable pages, the fallback zones in this list are scanned in order and tried one by one;
high_zoneidx: the highest zone this allocation may use; scanning generally runs high -> normal -> dma, since memory becomes ever more precious toward DMA, so allocation starts at high and falls back toward dma;
alloc_flags: flags controlling the allocation;
preferred_zone: the suitable zone found at or below high_zoneidx; the allocation is normally served from it. If that fails, the next suitable zone in the zonelist becomes the new preferred_zone;
migratetype: the migration type, used as an index into zone->free_area[].free_list[]. It exists for anti-fragmentation: the old free_area structure gained an array indexed by migratetype, each element holding a list of pages of that type (see the sketch right after this list);
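As a reference for the migratetype indexing above, this is the free_area layout in kernels of this era (around v3.9); the comments are mine:

struct free_area {
	struct list_head	free_list[MIGRATE_TYPES];	/* one free list per migratetype */
	unsigned long		nr_free;			/* free blocks of this order, all migratetypes combined */
};

An order-2 MIGRATE_MOVABLE request, for example, is served from zone->free_area[2].free_list[MIGRATE_MOVABLE].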
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
preferred_zone, migratetype);
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
	classzone_idx = zone_idx(preferred_zone);//index of the preferred zone
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
					high_zoneidx, nodemask) {//this macro picks suitable zones out of zonelist->_zonerefs in turn; detailed explanation below
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))//skip z if its node may not be allocated from or the zone is already marked full
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))//cpuset checking is enabled and this zone may not serve allocations for the current CPU set, so skip it;
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a zone that is within its dirty
* limit, such that no single zone holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the zone's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* This may look like it could increase pressure on
* lower zones by failing allocations in higher zones
* before they are full. But the pages that do spill
* over are limited as the lower zones are protected
* by this very same mechanism. It should not become
* a practical burden to them.
*
* XXX: For now, allow allocations to potentially
* exceed the per-zone dirty limit in the slowpath
* (ALLOC_WMARK_LOW unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* zones are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
if ((alloc_flags & ALLOC_WMARK_LOW) &&
		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))//check whether this zone's dirty pages already exceed its limit
			goto this_zone_full;//over the dirty limit: jump to the end and mark the zone full, which balances dirty pages across zones
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (zone_watermark_ok(zone, order, mark,
					classzone_idx, alloc_flags))//check whether this zone has enough free pages for the allocation; detailed analysis below
goto try_this_zone;
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
* by the cpuset.
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
			// zone_watermark_ok() above said this zone cannot satisfy the allocation; if reclaiming is disabled, or this zone is outside the preferred zone's reclaim range, just mark it full so the next scan skips it
if (zone_reclaim_mode == 0 ||
!zone_allows_reclaim(preferred_zone, zone))
goto this_zone_full;
/*
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
			 *///zlc_active may have just been switched on above, so run the zlc check once more and skip this zone if it is not worth trying
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
			ret = zone_reclaim(zone, gfp_mask, order);//reaching this point means the zone is allowed to reclaim pages
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
			default://the two cases above reclaimed nothing; reaching here means some pages were reclaimed
				/* did we reclaim enough *///some pages were just freed, so re-check whether this zone can satisfy the allocation now
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
				/*
				 * Failed to reclaim enough to meet watermark.
				 * Only mark the zone full if checking the min
				 * watermark or if we failed to reclaim just
				 * 1<<order pages or else the page allocator
				 * fastpath will prematurely mark zones full
				 * when the watermark is between the low and
				 * min watermarks.
				 */
				if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
				    ret == ZONE_RECLAIM_SOME)
					goto this_zone_full;//reclaim fell short: mark the zone full so later scans skip it
				continue;
			}
		}
try_this_zone:
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);//actually take pages off this zone's free lists
		if (page)
			break;
this_zone_full:
		if (IS_ENABLED(CONFIG_NUMA))
			zlc_mark_zone_full(zonelist, z);//record in the zonelist cache that this zone is full
	}
	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		goto zonelist_scan;
	}
	if (page)
		/*
		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
		 * necessary to allocate the page. The expectation is
		 * that the caller is taking steps that will free more
		 * memory. The caller should avoid the page being used
		 * for !PFMEMALLOC purposes.
		 *///a page obtained without watermark checks means the zone has little left, so flag it with pfmemalloc to nudge the system into reclaiming
		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
	return page;
}
This macro fetches suitable zones from the zonelist->_zonerefs array:
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
zone; \
		z = next_zones_zonelist(++z, highidx, nodemask, &zone))
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
zone);
}
struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
/*
* Find the next suitable zone to use for the allocation.
* Only filter based on nodemask if it's set
*/
if (likely(nodes == NULL))
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
else
while (zonelist_zone_idx(z) > highest_zoneidx || (z->zone && !zref_in_nodemask(z, nodes)))
z++;
*zone = zonelist_zone(z);
return z;
}
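To make the scan order concrete, here is a minimal user-space mock (simplified stand-in types and names, not kernel code) of the same skipping logic: entries in _zonerefs are ordered from the costliest usable zone down to DMA, and everything above highest_zoneidx is stepped over.

#include <stdio.h>

struct mock_zoneref { const char *name; int zone_idx; };

/* Ordered from the costliest usable zone down to DMA and terminated by a
 * NULL entry, like a real zonelist's _zonerefs. */
static struct mock_zoneref zonerefs[] = {
	{ "HighMem", 2 }, { "Normal", 1 }, { "DMA", 0 }, { NULL, 0 },
};

/* Skip every entry above highest_zoneidx (no nodemask filtering here). */
static struct mock_zoneref *mock_next_zones_zonelist(struct mock_zoneref *z,
						     int highest_zoneidx)
{
	while (z->name && z->zone_idx > highest_zoneidx)
		z++;
	return z;
}

int main(void)
{
	int high_zoneidx = 1;	/* e.g. ZONE_NORMAL is the ceiling */
	struct mock_zoneref *z;

	for (z = mock_next_zones_zonelist(zonerefs, high_zoneidx);
	     z->name;
	     z = mock_next_zones_zonelist(++z, high_zoneidx))
		printf("trying zone %s (idx %d)\n", z->name, z->zone_idx);
	return 0;	/* prints Normal, then DMA; HighMem is skipped */
}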
To understand zlc_zone_worth_trying(), first look at a couple of structures:
struct zonelist {
struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
struct zonelist_cache zlcache; // optional ...
#endif
};
struct zonelist_cache {
unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */
DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */
unsigned long last_full_zap; /* when last zap'd (jiffies) */
};
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
int n; /* node that zone *z is on */
	zlc = zonelist->zlcache_ptr;//as the struct above makes clear, zlcache_ptr holds the address of zlcache
	if (!zlc)//no zlcache: this is a UMA (non-NUMA) configuration
		return 1;
	i = z - zonelist->_zonerefs;// index of z within the _zonerefs array
	n = zlc->z_to_n[i];//from i we get the id (nid) of the node this zone sits on
	/* This zone is worth trying if it is allowed but not full */
	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);//check that the node is allowed and the zone has not been marked full;
}
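The decision itself is just two lookups. A user-space sketch of the same idea (hypothetical helper names; plain arrays in place of the kernel's nodemask and fullzones bitmap):

#include <stdio.h>

#define MAX_ZONES 4

/* Plain arrays stand in for the kernel's nodemask and fullzones bitmap. */
static int allowednodes[2];			/* 1 = node may be used */
static int fullzones[MAX_ZONES];		/* 1 = zone marked full */
static int z_to_n[MAX_ZONES] = { 0, 0, 1, 1 };	/* zone index -> node id */

static int mock_zlc_zone_worth_trying(int i)
{
	int n = z_to_n[i];	/* node the zone lives on */
	return allowednodes[n] && !fullzones[i];
}

int main(void)
{
	allowednodes[0] = 1;	/* only node 0 is allowed */
	fullzones[1] = 1;	/* zone 1 was marked full on an earlier pass */

	for (int i = 0; i < MAX_ZONES; i++)
		printf("zone %d worth trying: %d\n", i, mock_zlc_zone_worth_trying(i));
	return 0;	/* zone 0 -> 1; zone 1 -> 0 (full); zones 2,3 -> 0 (node 1 not allowed) */
}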
/*
 * Return true if free pages are above 'mark'. This takes into account the
 * order of the allocation.
 */
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags, long free_pages)
{
	/* free_pages may go negative - that's OK */
long min = mark;
	long lowmem_reserve = z->lowmem_reserve[classzone_idx];//pages held back for emergencies; ordinary allocations must stay above them
int o;
long free_cma = 0;
	free_pages -= (1 << order) - 1;//deduct the requested pages (all but one)
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
	if (free_pages - free_cma <= min + lowmem_reserve)//min + lowmem_reserve is the floor; at or below it this zone may not serve the allocation
return false;
	for (o = 0; o < order; o++) {//strip out the free pages sitting at orders below the requested one; they cannot form a block of this order
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
/* Require fewer higher order pages to be free */
		min >>= 1;//fewer free pages are demanded at each higher order
if (free_pages <= min)
return false;
}
return true;
}
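A worked example makes the loop clearer. Suppose mark = 64, lowmem_reserve = 0, and a zone holding 100 order-0 pages, 20 order-1 blocks and 5 order-2 blocks (160 free pages in total). The user-space mock below (simplified: no ALLOC_HIGH/ALLOC_HARDER/CMA handling) walks the same order-by-order check for an order-2 request:

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

/* Simplified stand-in for __zone_watermark_ok() above:
 * no ALLOC_HIGH/ALLOC_HARDER adjustment and no CMA handling. */
static bool mock_watermark_ok(int order, long mark, long lowmem_reserve,
			      long free_pages, const long nr_free[MAX_ORDER])
{
	long min = mark;

	free_pages -= (1 << order) - 1;		/* all but one of the requested pages */
	if (free_pages <= min + lowmem_reserve)
		return false;

	for (int o = 0; o < order; o++) {
		/* pages at orders below the request cannot form our block */
		free_pages -= nr_free[o] << o;
		min >>= 1;			/* demand fewer free pages at each higher order */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* 100 order-0 pages, 20 order-1 blocks, 5 order-2 blocks: 160 pages free */
	long nr_free[MAX_ORDER] = { 100, 20, 5 };
	long total_free = 100 * 1 + 20 * 2 + 5 * 4;

	/* passes by a hair: 160 - 3 - 100 - 40 = 17 pages usable at order >= 2,
	 * against min = 64 >> 2 = 16 */
	printf("order-2 ok: %d\n", mock_watermark_ok(2, 64, 0, total_free, nr_free));
	return 0;
}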