First, the parameters of the fast-path page allocation:
gfp_mask: __GFP_HARDWALL is ORed in on entry to the fast path; it enforces the cpuset hardwall policy, i.e. the allocation may only come from memory nodes the current cpuset allows;
nodemask: the node mask, a bit array saying whether memory may be allocated on each node;
order: the order of the allocation;
zonelist: when preferred_zone has no suitable pages, the fallback zones listed in this zonelist are scanned in order and tried one by one;
high_zoneidx: the highest zone this allocation may use; memory gets more and more precious going from HIGHMEM to NORMAL to DMA, so allocation falls back from high towards DMA;
alloc_flags: flags controlling the allocation;
preferred_zone: the suitable zone found at or below high_zoneidx; the allocation is normally satisfied from this zone, and if that fails another suitable zone from the zonelist becomes the new preferred_zone;
migratetype: the migration type, used as an index into zone->free_area.free_list[...]; it exists for anti-fragmentation. The old free_area structure was extended with an array indexed by migration type, and each array element holds the free-page list of that migration type (see the excerpt below);
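For reference, this is roughly what the reworked free_area looks like in include/linux/mmzone.h of this kernel generation (abridged excerpt; exact fields vary between versions). zone->free_area[order].free_list[migratetype] is the list that buffered_rmqueue() ultimately takes pages from:

struct free_area {
	/* one free-page list per migration type, indexed by migratetype */
	struct list_head	free_list[MIGRATE_TYPES];
	unsigned long		nr_free;	/* number of free blocks of this order */
};

/* inside struct zone: one free_area per allocation order */
struct free_area	free_area[MAX_ORDER];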
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
		zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype);

static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
		struct zone *preferred_zone, int migratetype)
{
	struct zoneref *z;
	struct page *page = NULL;
	int classzone_idx;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */

	classzone_idx = zone_idx(preferred_zone);	/* index of the preferred zone */
zonelist_scan:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						high_zoneidx, nodemask) {
		/* this macro picks suitable zones out of zonelist->_zonerefs; see the analysis below */
		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))
				/* skip z if its node may not be used or the zone is already marked full */
				continue;
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))
				/* cpuset checking is enabled and this zone is not allowed for the current cpuset: skip */
				continue;
		/*
		 * When allocating a page cache page for writing, we
		 * want to get it from a zone that is within its dirty
		 * limit, such that no single zone holds more than its
		 * proportional share of globally allowed dirty pages.
		 * The dirty limits take into account the zone's
		 * lowmem reserves and high watermark so that kswapd
		 * should be able to balance it without having to
		 * write pages from its LRU list.
		 *
		 * This may look like it could increase pressure on
		 * lower zones by failing allocations in higher zones
		 * before they are full. But the pages that do spill
		 * over are limited as the lower zones are protected
		 * by this very same mechanism. It should not become
		 * a practical burden to them.
		 *
		 * XXX: For now, allow allocations to potentially
		 * exceed the per-zone dirty limit in the slowpath
		 * (ALLOC_WMARK_LOW unset) before going into reclaim,
		 * which is important when on a NUMA setup the allowed
		 * zones are together not big enough to reach the
		 * global limit. The proper fix for these situations
		 * will require awareness of zones in the
		 * dirty-throttling and the flusher threads.
		 */
		if ((alloc_flags & ALLOC_WMARK_LOW) &&
		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
			/* this zone exceeded its dirty limit: mark it full so dirty pages stay balanced across zones */
			goto this_zone_full;

		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
			unsigned long mark;
			int ret;

			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
			if (zone_watermark_ok(zone, order, mark,
				    classzone_idx, alloc_flags))
				/* the zone has enough free pages; zone_watermark_ok() is analysed below */
				goto try_this_zone;

			if (IS_ENABLED(CONFIG_NUMA) &&
					!did_zlc_setup && nr_online_nodes > 1) {
				/*
				 * we do zlc_setup if there are multiple nodes
				 * and before considering the first zone allowed
				 * by the cpuset.
				 */
				allowednodes = zlc_setup(zonelist, alloc_flags);
				zlc_active = 1;
				did_zlc_setup = 1;
			}

			/*
			 * zone_watermark_ok() above said the zone cannot satisfy the
			 * allocation; if it cannot reclaim either, or lies outside the
			 * reclaim range of preferred_zone, mark it full so it is not
			 * scanned again next time.
			 */
			if (zone_reclaim_mode == 0 ||
			    !zone_allows_reclaim(preferred_zone, zone))
				goto this_zone_full;

			/*
			 * As we may have just activated ZLC, check if the first
			 * eligible zone has failed zone_reclaim recently.
			 */
			/* zlc_active may have just been enabled above, so re-check and skip the zone if necessary */
			if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
				!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;

			ret = zone_reclaim(zone, gfp_mask, order);	/* reaching here means the zone may reclaim pages */
			switch (ret) {
			case ZONE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case ZONE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* the two cases above reclaimed nothing; here some pages were reclaimed */
				/* did we reclaim enough? test the watermark again now that pages came back */
				if (zone_watermark_ok(zone, order, mark,
						classzone_idx, alloc_flags))
					goto try_this_zone;

				/*
				 * Failed to reclaim enough to meet watermark.
				 * Only mark the zone full if checking the min
				 * watermark or if we failed to reclaim just
				 * 1<<order pages or else the page allocator
				 * fastpath will prematurely mark zones full
				 * when the watermark is between the low and
				 * min watermarks.
				 */
				/* nothing more can be done for this zone */
				if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
				    ret == ZONE_RECLAIM_SOME)
					goto this_zone_full;

				continue;
			}
		}

try_this_zone:
		/* the ideal case: actually allocate the pages */
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);
		if (page)
			break;
this_zone_full:
		/* the zone is full: record that in the zonelist cache */
		if (IS_ENABLED(CONFIG_NUMA))
			zlc_mark_zone_full(zonelist, z);
	}	/* end of the loop over candidate zones */

	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		goto zonelist_scan;	/* scan the zonelist one more time */
	}

	if (page)
		/*
		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
		 * necessary to allocate the page. The expectation is
		 * that the caller is taking steps that will free more
		 * memory. The caller should avoid the page being used
		 * for !PFMEMALLOC purposes.
		 */
		/*
		 * A page obtained only by ignoring the watermarks means the zone
		 * has very little memory left, so set pfmemalloc to get the
		 * system to reclaim some.
		 */
		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

	return page;
}
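One detail at the tail of the function is worth spelling out: if the whole scan failed while the zonelist cache was active, the cache is turned off and the scan is repeated, because the "zone full" bits may be stale. A minimal user-space sketch of that two-pass structure (hypothetical zone data, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 3

/* stale cache: zone 1 is marked full although it has pages again */
static bool zone_full[NR_ZONES]      = { true, true, false };
static bool zone_has_pages[NR_ZONES] = { false, true, false };

static int scan_zonelist(bool zlc_active)
{
	for (int i = 0; i < NR_ZONES; i++) {
		if (zlc_active && zone_full[i])
			continue;		/* skip zones cached as full */
		if (zone_has_pages[i])
			return i;		/* "allocation" succeeded */
	}
	return -1;
}

int main(void)
{
	int zone = scan_zonelist(true);		/* first pass: honour the cache */
	if (zone < 0)
		zone = scan_zonelist(false);	/* second pass: ignore the cache */
	printf("allocated from zone %d\n", zone);	/* prints 1 */
	return 0;
}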
This macro picks suitable zones out of the zonelist->_zonerefs array:
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) {
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
	for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone);	\
		zone;							\
		z = next_zones_zonelist(++z, highidx, nodemask, &zone))
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes,
					struct zone **zone)
{
	return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
								zone);
}
struct zoneref {
	struct zone *zone;	/* Pointer to actual zone */
	int zone_idx;		/* zone_idx(zoneref->zone) */
};
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes,
					struct zone **zone)
{
	/*
	 * Find the next suitable zone to use for the allocation.
	 * Only filter based on nodemask if it's set
	 */
	if (likely(nodes == NULL))
		while (zonelist_zone_idx(z) > highest_zoneidx)
			z++;
	else
		while (zonelist_zone_idx(z) > highest_zoneidx ||
				(z->zone && !zref_in_nodemask(z, nodes)))
			z++;

	*zone = zonelist_zone(z);
	return z;
}
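A minimal user-space sketch of the filtering (made-up zone indices): the _zonerefs of a node are ordered from the highest zone downwards, so skipping every entry whose index is above highest_zoneidx lands on the best zone the request is still allowed to use.

#include <stdio.h>

struct zoneref_demo {
	const char *name;
	int zone_idx;			/* plays the role of zonelist_zone_idx(z) */
};

/* typical per-node ordering: HIGHMEM(2) -> NORMAL(1) -> DMA(0) */
static struct zoneref_demo zonerefs[] = {
	{ "HIGHMEM", 2 }, { "NORMAL", 1 }, { "DMA", 0 },
};

static struct zoneref_demo *next_zones_demo(struct zoneref_demo *z,
					    int highest_zoneidx)
{
	while (z->zone_idx > highest_zoneidx)
		z++;			/* skip zones that are "too high" */
	return z;
}

int main(void)
{
	/* a GFP_KERNEL-style request may only use up to ZONE_NORMAL (index 1) */
	struct zoneref_demo *z = next_zones_demo(zonerefs, 1);
	printf("first candidate: %s\n", z->name);	/* prints NORMAL */
	return 0;
}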
To understand the zlc_zone_worth_trying() function, first look at a few structures:
struct zonelist {
	struct zonelist_cache *zlcache_ptr;		     // NULL or &zlcache
	struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
	struct zonelist_cache zlcache;			     // optional ...
#endif
};
struct zonelist_cache {
	unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];		/* zone->nid */
	DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);	/* zone full? */
	unsigned long last_full_zap;		/* when last zap'd (jiffies) */
};
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
						nodemask_t *allowednodes)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	int i;				/* index of *z in zonelist zones */
	int n;				/* node that zone *z is on */

	zlc = zonelist->zlcache_ptr;	/* as the struct above shows, zlcache_ptr is the address of zlcache */
	if (!zlc)			/* no zlcache: this is a UMA system */
		return 1;

	i = z - zonelist->_zonerefs;	/* index of z within _zonerefs */
	n = zlc->z_to_n[i];		/* node id (nid) the zone lives on, looked up by i */

	/* This zone is worth trying if it is allowed but not full */
	return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
	/* i.e. the node is allowed and the zone has not been marked full */
}
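The index bookkeeping is the interesting part: the position of a zoneref inside _zonerefs, recovered by pointer subtraction, doubles as the index into both z_to_n[] and the fullzones bitmap. A minimal user-space sketch with simplified types (plain arrays and a bitmask stand in for DECLARE_BITMAP()/node_isset()):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ZONES 4

struct zoneref_demo { int zone_idx; };

static struct zoneref_demo _zonerefs[MAX_ZONES];
static unsigned short z_to_n[MAX_ZONES] = { 0, 0, 1, 1 };	/* zone index -> node id */
static bool fullzones[MAX_ZONES]        = { false, true, false, false };

static bool worth_trying(struct zoneref_demo *z, unsigned long allowednodes)
{
	int i = z - _zonerefs;		/* element index, exactly as in the kernel */
	int n = z_to_n[i];		/* node the zone lives on */

	return (allowednodes & (1UL << n)) && !fullzones[i];
}

int main(void)
{
	unsigned long allowed = 1UL << 0;	/* only node 0 is allowed */

	printf("%d %d %d\n",
	       worth_trying(&_zonerefs[0], allowed),	/* 1: allowed node, not full */
	       worth_trying(&_zonerefs[1], allowed),	/* 0: cached as full */
	       worth_trying(&_zonerefs[2], allowed));	/* 0: node 1 is not allowed */
	return 0;
}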
/*
 * Return true if free pages are above 'mark'. This takes into account the order
 * of the allocation.
 */
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int alloc_flags, long free_pages)
{
	/* free_pages may go negative - that's OK */
	long min = mark;
	long lowmem_reserve = z->lowmem_reserve[classzone_idx];	/* pages reserved for emergencies only */
	int o;
	long free_cma = 0;

	free_pages -= (1 << order) - 1;		/* subtract the pages this request needs */
	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;
#ifdef CONFIG_CMA
	/* If allocation can't use CMA areas don't use free CMA pages */
	if (!(alloc_flags & ALLOC_CMA))
		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

	if (free_pages - free_cma <= min + lowmem_reserve)
		/* min + lowmem_reserve is the threshold: below it this zone may not be used */
		return false;
	for (o = 0; o < order; o++) {
		/* strip away the free pages at every order lower than the requested one */
		/* At the next order, this order's pages become unavailable */
		free_pages -= z->free_area[o].nr_free << o;

		/* Require fewer higher order pages to be free */
		min >>= 1;	/* higher orders may get by with fewer free pages */

		if (free_pages <= min)
			return false;
	}
	return true;
}
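A concrete example helps here, because the loop means it is not enough for the zone to have many free pages in total; enough of them must also sit at sufficiently high orders. Below is a simplified user-space re-implementation with made-up numbers (CMA and the ALLOC_HIGH/ALLOC_HARDER adjustments left out):

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

static bool watermark_ok(int order, long mark, long lowmem_reserve,
			 long free_pages, const long nr_free[MAX_ORDER])
{
	long min = mark;

	free_pages -= (1 << order) - 1;		/* pages this request itself needs */
	if (free_pages <= min + lowmem_reserve)
		return false;

	for (int o = 0; o < order; o++) {
		/* pages of lower orders cannot satisfy this request */
		free_pages -= nr_free[o] << o;
		min >>= 1;			/* higher orders may get by with fewer free pages */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* 298 free pages in total, but most of them are single order-0 pages:
	 * 260 * 1 + 15 * 2 + 2 * 4 = 298 */
	long nr_free[MAX_ORDER] = { 260, 15, 2 };
	long mark = 128;			/* e.g. the low watermark */

	printf("order-0 ok: %d\n", watermark_ok(0, mark, 0, 298, nr_free));	/* 1 */
	printf("order-2 ok: %d\n", watermark_ok(2, mark, 0, 298, nr_free));	/* 0: only 35 pages sit at order >= 1, below min/2 = 64 */
	return 0;
}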