Linux Memory Allocation: alloc_page and __get_free_page Annotated (the Buddy Allocator)

Both alloc_page and __get_free_page allocate pages from the buddy allocator; they differ only in their return type: the former returns a struct page pointer, while the latter returns the kernel virtual address of the allocated page.
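The address-returning variant is just a thin wrapper over the page-returning one. Below is a lightly edited sketch of how mm/page_alloc.c of this era implements __get_free_pages() (order 0 is __get_free_page()) on top of alloc_pages() and page_address():

unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	/*
	 * A highmem page has no permanent kernel virtual address,
	 * so it cannot be handed out as an address.
	 */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}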

Both ultimately funnel into the core function __alloc_pages_nodemask; the wrapper chain is sketched next, and the function itself is annotated in detail after that.
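A sketch of the wrapper chain as it looks in a 2.6.3x-era include/linux/gfp.h, shown for the non-NUMA configuration (the NUMA build routes alloc_pages() through the mempolicy code instead):

#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
						unsigned int order)
{
	/* a negative nid means "use the current node" */
	if (nid < 0)
		nid = numa_node_id();

	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}

static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist)
{
	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}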


struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	struct zone *preferred_zone;
	struct page *page;
	int migratetype = allocflags_to_migratetype(gfp_mask);//gfp flags and migrate types do not map one-to-one, so convert here

	gfp_mask &= gfp_allowed_mask;

	lockdep_trace_alloc(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_WAIT);//if this allocation may wait (sleep), check whether this task needs rescheduling and, if so, voluntarily schedule()

	if (should_fail_alloc_page(gfp_mask, order))//with CONFIG_FAIL_PAGE_ALLOC enabled, supports fault injection of allocation failures for debugging
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
	 * of GFP_THISNODE and a memoryless node
	 */
	if (unlikely(!zonelist->_zonerefs->zone))
		return NULL;

	get_mems_allowed();//pin the allocation policy so it cannot be modified underneath us
	/* The preferred zone is used for statistics later */
	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);//find the first usable zone and record it in preferred_zone
	if (!preferred_zone) {//no usable zone: release the allocation policy and return
		put_mems_allowed();
		return NULL;
	}

	/* First allocation attempt */
	//fast path: allocate with cpuset constraints (__GFP_HARDWALL) against the low watermark;
	//walk the zonelist for a suitable zone, check its watermark, trigger zone reclaim if the
	//watermark fails, and finally call buffered_rmqueue() to allocate from that zone
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
			preferred_zone, migratetype);
	if (unlikely(!page))//fast path failed: enter the slow path, which starts reclaim, relaxes the watermark, and retries the fast-path allocation
		page = __alloc_pages_slowpath(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);
	put_mems_allowed();//release the policy

	trace_mm_page_alloc(page, order, gfp_mask, migratetype);//tracepoint, for debugging
	return page;
}
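A note on the migratetype conversion at the top of the function: in kernels of this era the mapping is a small bit-packing helper, roughly as below (a sketch of the 2.6.3x allocflags_to_migratetype(); __GFP_MOVABLE and __GFP_RECLAIMABLE together select among MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE and MIGRATE_MOVABLE):

static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
	WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);

	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* pack the two mobility bits into a migrate-type index */
	return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
		((gfp_flags & __GFP_RECLAIMABLE) != 0);
}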


/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
		struct zone *preferred_zone, int migratetype)
{
	struct zoneref *z;
	struct page *page = NULL;
	int classzone_idx;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */

	classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,//iterate over the zonelist
						high_zoneidx, nodemask) {
		if (NUMA_BUILD && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))//consult the zonelist cache: skip zones already known to be full
				continue;
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))//check cpuset and allowed nodes; that function's header comment explains the details
				goto try_next_zone;//this zone is not allowed, try the next one

		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {//watermark check required; otherwise skip straight to the actual allocation
			unsigned long mark;
			int ret;

			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
			//check whether this zone's watermark is satisfied; the effective threshold
			//is computed differently depending on ALLOC_HIGH and ALLOC_HARDER;
			//if it passes, jump to the actual allocation, otherwise keep checking
			if (zone_watermark_ok(zone, order, mark,
				    classzone_idx, alloc_flags))
				goto try_this_zone;

			if (zone_reclaim_mode == 0)//zone reclaim is disabled: mark this zone full in the zonelist cache to save time on the next scan
				goto this_zone_full;

			ret = zone_reclaim(zone, gfp_mask, order);//try zone reclaim: returns ZONE_RECLAIM_NOSCAN if waiting is not allowed;
			//otherwise, for the local zone or a zone with no processors associated, calls __zone_reclaim() to reclaim pages
			switch (ret) {
			case ZONE_RECLAIM_NOSCAN://nothing was scanned, just try the next zone
				/* did not scan */
				goto try_next_zone;
			case ZONE_RECLAIM_FULL://scanned, but no space was freed
				/* scanned but unreclaimable */
				goto this_zone_full;
			default:
				/* did we reclaim enough */
				if (!zone_watermark_ok(zone, order, mark,//reclaim ran; recheck whether the watermark is now satisfied
						classzone_idx, alloc_flags))
					goto this_zone_full;
			}
		}

try_this_zone:
		page = buffered_rmqueue(preferred_zone, zone, order,//all checks passed: do the actual allocation from this zone
						gfp_mask, migratetype);
		if (page)
			break;
this_zone_full:
		if (NUMA_BUILD)
			zlc_mark_zone_full(zonelist, z);
try_next_zone:
		if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
			/*
			 * we do zlc_setup after the first zone is tried but only
			 * if there are multiple nodes make it worthwhile
			 */
			allowednodes = zlc_setup(zonelist, alloc_flags);//set up the zonelist cache
			zlc_active = 1;
			did_zlc_setup = 1;
		}
	}

	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		goto zonelist_scan;
	}
	return page;
}
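The watermark test that gates each zone deserves a closer look. Here is a sketch of zone_watermark_ok() as it looks around 2.6.32 (details vary slightly between versions): an order-N request must leave enough total free pages above the (possibly relaxed) threshold, and, walking up the orders, enough of those pages must sit in blocks of at least the requested size:

int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int alloc_flags)
{
	/* free_pages may go negative - that's OK */
	long min = mark;
	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
	int o;

	/* ALLOC_HIGH and ALLOC_HARDER relax the threshold */
	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;

	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return 0;
	for (o = 0; o < order; o++) {
		/* at the next order, this order's pages become unavailable */
		free_pages -= z->free_area[o].nr_free << o;

		/* require fewer higher-order pages to be free */
		min >>= 1;

		if (free_pages <= min)
			return 0;
	}
	return 1;
}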

/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, int order, gfp_t gfp_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);

again:
	if (likely(order == 0)) {//order-0: allocate a single page straight from the per-CPU page (pcp) lists
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);//disable interrupts and save the irq state
		pcp = &this_cpu_ptr(zone->pageset)->pcp;//get this CPU's pcp structure
		list = &pcp->lists[migratetype];//pick the pcp list for the requested migrate type
		if (list_empty(list)) {//list is empty: refill it from the buddy allocator
			pcp->count += rmqueue_bulk(zone, 0,//move pcp->batch pages from the zone's free_area into this list
					pcp->batch, list,
					migratetype, cold);
			if (unlikely(list_empty(list)))
				goto failed;
		}

		if (cold)//cold page: cache-cold, e.g. for DMA buffers
			page = list_entry(list->prev, struct page, lru);//cold pages are taken from the tail of the list
		else//hot page: likely still in the CPU cache, better performance
			page = list_entry(list->next, struct page, lru);//hot pages are taken from the head of the list

		list_del(&page->lru);//remove the page from the pcp list (the lru member doubles as the list node here)
		pcp->count--;//pcp->count tracks how many pages this pcp holds
	} else {
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			/*
			 * __GFP_NOFAIL is not to be used in new code.
			 *
			 * All __GFP_NOFAIL callers should be fixed so that they
			 * properly detect and handle allocation failures.
			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		spin_lock_irqsave(&zone->lock, flags);//take the zone lock before touching the buddy free lists
		page = __rmqueue(zone, order, migratetype);//the actual allocation from the buddy free_area lists; two cases:
		//1. __rmqueue_smallest(): if a free list of the requested migrate type has a block of
		//sufficient order, take the first one and return the unused remainder to lower-order lists
		//2. __rmqueue_fallback(): if the requested migrate type's lists are empty, fall back to
		//other migrate types in a fixed order, move pages over to this migrate type's list, then
		//allocate and split just like __rmqueue_smallest()
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
	}

	__count_zone_vm_events(PGALLOC, zone, 1 << order);//update this CPU's VM event counters
	zone_statistics(preferred_zone, zone);//update NUMA statistics for the zone
	local_irq_restore(flags);//re-enable interrupts, restoring the saved irq state

	VM_BUG_ON(bad_range(zone, page));
	if (prep_new_page(page, order, gfp_flags))//if prep_new_page() finds the page in a bad state (e.g. still mapped), retry the allocation
		goto again;
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}
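To make the __rmqueue() comment above concrete, here is a simplified sketch of __rmqueue_smallest() together with the expand() helper that hands the unused remainder of a larger block back to the lower-order free lists (close to the 2.6.3x code, with some debug checks omitted):

static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* search upward from the requested order for a non-empty free list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &zone->free_area[current_order];
		if (list_empty(&area->free_list[migratetype]))
			continue;

		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		/* split off and give back whatever we don't need */
		expand(zone, page, order, current_order, area, migratetype);
		return page;
	}

	return NULL;
}

static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area, int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		/* the upper half of the block goes onto the next-lower list */
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}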
