A Peek into Linux Memory Management (13): The Buddy System (Memory Allocation)

Once the buddy system has finished initializing (the free memory has been added, order by order, to each zone's free lists), it can be used to allocate memory.

First, let's look at the memory allocation interfaces the buddy system provides:

Function                          Purpose
alloc_pages(mask, order)          Allocates 2^order contiguous pages and returns a struct page instance representing the first page of the block
alloc_page(mask)                  Shorthand for the above with order = 0: allocates a single page
get_zeroed_page(mask)             Allocates a single page, zero-fills it, and returns its kernel virtual address (with all the other functions, page contents are undefined after allocation)
__get_free_pages(mask, order)     Works like alloc_pages, but returns the kernel virtual address of the block rather than a page instance
__get_free_page(mask)             Shorthand for __get_free_pages with order = 0: allocates a single page and returns its virtual address
__get_dma_pages(gfp_mask, order)  Allocates pages suitable for DMA

As you can see, the buddy system always hands out physical pages in power-of-two units of 2^order pages; a minimal usage sketch follows.
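For orientation, here is a minimal sketch of how a driver might use these interfaces (the function name demo_alloc is made up for illustration; the APIs themselves are the ones in the table above):

#include <linux/gfp.h>
#include <linux/mm.h>

static int demo_alloc(void)
{
	struct page *page;
	unsigned long addr;

	/* 2^2 = 4 physically contiguous pages */
	page = alloc_pages(GFP_KERNEL, 2);
	if (!page)
		return -ENOMEM;
	__free_pages(page, 2);		/* free with the same order */

	/* one zero-filled page, returned as a kernel virtual address */
	addr = get_zeroed_page(GFP_KERNEL);
	if (!addr)
		return -ENOMEM;
	free_page(addr);

	return 0;
}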

These allocation interfaces are declared in include/linux/gfp.h.

For alloc_pages(mask, order):

#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)

alloc_page(mask):

#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

get_zeroed_page(mask):

unsigned long get_zeroed_page(gfp_t gfp_mask)
{
        return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
}
EXPORT_SYMBOL(get_zeroed_page);

__get_free_pages(mask, order):

unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	/*
	 * __get_free_pages() returns a 32-bit address, which cannot represent
	 * a highmem page
	 */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);

__get_free_page(mask):

#define __get_free_page(gfp_mask) \
		__get_free_pages((gfp_mask), 0)

__get_dma_pages(gfp_mask, order):

#define __get_dma_pages(gfp_mask, order) \
		__get_free_pages((gfp_mask) | GFP_DMA, (order))

As you can see, all of the interface calls above ultimately converge on alloc_pages.


So here we analyze alloc_pages_node.

First, note the two key parameters: the allocation order, which needs no further explanation, and the gfp_mask, which controls how the memory is allocated. The raw mask bits are:

/* Plain integer GFP bitmasks. Do not use this directly. */
#define ___GFP_DMA		0x01u
#define ___GFP_HIGHMEM		0x02u
#define ___GFP_DMA32		0x04u
#define ___GFP_MOVABLE		0x08u
#define ___GFP_WAIT		0x10u
#define ___GFP_HIGH		0x20u
#define ___GFP_IO		0x40u
#define ___GFP_FS		0x80u
#define ___GFP_COLD		0x100u
#define ___GFP_NOWARN		0x200u
#define ___GFP_REPEAT		0x400u
#define ___GFP_NOFAIL		0x800u
#define ___GFP_NORETRY		0x1000u
#define ___GFP_COMP		0x4000u
#define ___GFP_ZERO		0x8000u
#define ___GFP_NOMEMALLOC	0x10000u
#define ___GFP_HARDWALL		0x20000u
#define ___GFP_THISNODE		0x40000u
#define ___GFP_RECLAIMABLE	0x80000u
#ifdef CONFIG_KMEMCHECK
#define ___GFP_NOTRACK		0x200000u
#else
#define ___GFP_NOTRACK		0
#endif
#define ___GFP_NO_KSWAPD	0x400000u
#define ___GFP_OTHER_NODE	0x800000u

These mask bits fall into two broad categories:

1. Zone modifiers: specify which zone the pages should come from. They occupy the lowest 4 bits of the mask: DMA, HIGHMEM, DMA32 and MOVABLE.

#define __GFP_DMA	((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM	((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32	((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* Page is movable */
#define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)

2. Action modifiers: these do not restrict the zone, but change how the allocation behaves:

#define __GFP_WAIT	((__force gfp_t)___GFP_WAIT)	/* Can wait and reschedule? */
#define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)	/* Should access emergency pools? */
#define __GFP_IO	((__force gfp_t)___GFP_IO)	/* Can start physical IO? */
#define __GFP_FS	((__force gfp_t)___GFP_FS)	/* Can call down to low-level FS? */
#define __GFP_COLD	((__force gfp_t)___GFP_COLD)	/* Cache-cold page required */
#define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)	/* Suppress page allocation failure warning */
#define __GFP_REPEAT	((__force gfp_t)___GFP_REPEAT)	/* See above */
#define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)	/* See above */
#define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY) /* See above */
#define __GFP_COMP	((__force gfp_t)___GFP_COMP)	/* Add compound page metadata */
#define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)	/* Return zeroed page on success */
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */
#define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
#define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */

#define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */

The most commonly used combinations are GFP_KERNEL, GFP_ATOMIC, and so on (a short usage sketch follows the definitions):

/* This equals 0, but use constants in case they ever change */
#define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
#define GFP_ATOMIC	(__GFP_HIGH)
#define GFP_NOIO	(__GFP_WAIT)
#define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_TEMPORARY	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
			 __GFP_RECLAIMABLE)
#define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
			 __GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
				 __GFP_HARDWALL | __GFP_HIGHMEM | \
				 __GFP_MOVABLE)
#define GFP_IOFS	(__GFP_IO | __GFP_FS)
#define GFP_TRANSHUGE	(GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
			 __GFP_NO_KSWAPD)

#ifdef CONFIG_NUMA
#define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
#else
#define GFP_THISNODE	((__force gfp_t)0)
#endif
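As a quick illustration of how the calling context dictates the mask (a sketch, not tied to any particular driver): GFP_KERNEL may sleep, so it is only legal in process context, while GFP_ATOMIC never sleeps and may tap the emergency reserves:

#include <linux/gfp.h>

static void gfp_examples(void)
{
	/* process context: sleeping, IO and FS access all allowed */
	struct page *p = alloc_pages(GFP_KERNEL, 0);

	/* interrupt/atomic context: must not sleep, may use reserves */
	struct page *q = alloc_pages(GFP_ATOMIC, 0);

	/* page destined for user space: prefer highmem, keep it movable */
	struct page *u = alloc_pages(GFP_HIGHUSER_MOVABLE, 0);

	if (p)
		__free_pages(p, 0);
	if (q)
		__free_pages(q, 0);
	if (u)
		__free_pages(u, 0);
}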

Now that the input parameters are clear, let's look at the implementation. The call chain is:

alloc_pages -> alloc_pages_node -> __alloc_pages -> __alloc_pages_nodemask
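The intermediate wrappers are thin. Roughly (a sketch; the exact form differs between kernel versions, and the real alloc_pages_node also validates nid and handles NUMA_NO_NODE):

static inline struct page *
alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
	return __alloc_pages(gfp_mask, order, nid);
}

static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
{
	return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
}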

__alloc_pages_nodemask is the heart of the buddy allocator, so let's dig into it:

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
							nodemask_t *nodemask)
{
	struct page *page;
	unsigned int alloc_flags = ALLOC_WMARK_LOW;
	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
	struct alloc_context ac = { };

	/*
	 * There are several places where we assume that the order value is sane
	 * so bail out early if the request is out of bound.
	 */
	if (unlikely(order >= MAX_ORDER)) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
		return NULL;
	}

	gfp_mask &= gfp_allowed_mask;
	alloc_mask = gfp_mask;
	if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
		return NULL;

	finalise_ac(gfp_mask, &ac);

	/* First allocation attempt */
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		goto out;
......
}

It first performs some basic sanity checks, then calls prepare_alloc_pages to fill in the alloc_context structure ac:

static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
		int preferred_nid, nodemask_t *nodemask,
		struct alloc_context *ac, gfp_t *alloc_mask,
		unsigned int *alloc_flags)
{
	ac->high_zoneidx = gfp_zone(gfp_mask);
	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
	ac->nodemask = nodemask;
	ac->migratetype = gfpflags_to_migratetype(gfp_mask);

	if (cpusets_enabled()) {
		*alloc_mask |= __GFP_HARDWALL;
		if (!ac->nodemask)
			ac->nodemask = &cpuset_current_mems_allowed;
		else
			*alloc_flags |= ALLOC_CPUSET;
	}

	fs_reclaim_acquire(gfp_mask);
	fs_reclaim_release(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

	if (should_fail_alloc_page(gfp_mask, order))
		return false;

	if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
		*alloc_flags |= ALLOC_CMA;

	return true;
}

The alloc_context structure gathers the basic parameters of one allocation:

struct alloc_context {
	struct zonelist *zonelist;
	nodemask_t *nodemask;
	struct zoneref *preferred_zoneref;
	int migratetype;
	enum zone_type high_zoneidx;
	bool spread_dirty_pages;
};

Looking at the key assignments:

	ac->high_zoneidx = gfp_zone(gfp_mask);
	ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
	ac->nodemask = nodemask;
	ac->migratetype = gfpflags_to_migratetype(gfp_mask);

high_zoneidx comes from gfp_zone(). Remember that gfp_mask is split into two groups of flags? gfp_zone() looks only at the lowest 4 bits, the zone modifiers, and converts them into the highest zone index this allocation is allowed to use.

gfpflags_to_migratetype() derives the migratetype of the allocation from the mask.
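Neither helper is quoted here, but their logic can be modeled in a few lines. Below is a simplified, illustrative version of both (the real gfp_zone() uses a packed lookup table, GFP_ZONE_TABLE, rather than an if-chain, and the real migratetype helper uses bit arithmetic):

/* which zone do the low 4 "zone modifier" bits select? (simplified) */
static inline enum zone_type gfp_zone_sketch(gfp_t flags)
{
	if (flags & __GFP_DMA)
		return ZONE_DMA;
	if (flags & __GFP_DMA32)
		return ZONE_DMA32;
	if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
			(__GFP_HIGHMEM | __GFP_MOVABLE))
		return ZONE_MOVABLE;	/* e.g. GFP_HIGHUSER_MOVABLE */
	if (flags & __GFP_HIGHMEM)
		return ZONE_HIGHMEM;
	return ZONE_NORMAL;		/* no zone bits set: e.g. GFP_KERNEL */
}

/* which mobility group does the allocation belong to? (simplified) */
static inline int gfpflags_to_migratetype_sketch(gfp_t flags)
{
	if (flags & __GFP_MOVABLE)
		return MIGRATE_MOVABLE;
	if (flags & __GFP_RECLAIMABLE)
		return MIGRATE_RECLAIMABLE;
	return MIGRATE_UNMOVABLE;
}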

Back in __alloc_pages_nodemask, the next call is finalise_ac:

/* Determine whether to spread dirty pages and what the first usable zone */
static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
{
	/* Dirty zone balancing only done in the fast path */
	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

	/*
	 * The preferred zone is used for statistics but crucially it is
	 * also used as the starting point for the zonelist iterator. It
	 * may get reset for allocations that ignore memory policies.
	 */
	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
					ac->high_zoneidx, ac->nodemask);
}

This yields ac->preferred_zoneref, the first zone the allocation can be satisfied from.

A short digression: the search for a usable zone walks the zonelist's _zonerefs array, which was sorted by priority at initialization time:

_zonerefs[0]  -> HIGHMEM

_zonerefs[1]  -> NORMAL

Each _zonerefs entry also records an index:

struct zoneref {
	struct zone *zone;	/* Pointer to actual zone */
	int zone_idx;		/* zone_idx(zoneref->zone) */
};

This zone_idx field is the index of the zone itself (zone_idx(zoneref->zone)), not the position in the array. With ZONE_NORMAL = 0 and ZONE_HIGHMEM = 1, for example:

_zonerefs[0]  -> ZONE_HIGHMEM   zone_idx = 1

_zonerefs[1]  -> ZONE_NORMAL    zone_idx = 0

The traversal compares each entry against the value computed earlier:

ac->high_zoneidx = gfp_zone(gfp_mask);

The walk starts at _zonerefs[0]; an entry is a hit when its zone_idx is less than or equal to ac->high_zoneidx. For example:

With gfp_mask = GFP_KERNEL, gfp_zone() resolves high_zoneidx to 0. The walk starts at _zonerefs[0], but its zone_idx is 1, so it is not a hit; the walk then hits at _zonerefs[1], i.e. ZONE_NORMAL.

With GFP_HIGHUSER_MOVABLE, gfp_zone() resolves high_zoneidx to 2. Starting at _zonerefs[0], whose zone_idx is 1 <= 2, we hit immediately, so ZONE_HIGHMEM is chosen as the zone to allocate from.
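The walk itself can be sketched in a few lines (a simplified model of what first_zones_zonelist/next_zones_zonelist do, ignoring the nodemask filtering):

/* Skip entries whose zone index is above the allowed maximum; the
 * first remaining entry is the "hit". z->zone == NULL marks the end
 * of the _zonerefs array, i.e. no usable zone. */
static struct zoneref *first_usable_zoneref(struct zoneref *z,
					    enum zone_type high_zoneidx)
{
	while (z->zone && z->zone_idx > high_zoneidx)
		z++;
	return z;
}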

OK, back to __alloc_pages_nodemask. So far we have covered:

__alloc_pages_nodemask
|----> prepare_alloc_pages
|----> finalise_ac

Next it calls the key function get_page_from_freelist to attempt the allocation; if that fails, __alloc_pages_slowpath takes over to handle the harder cases. Here we analyze get_page_from_freelist first.

The function is fairly long, so it is not quoted in full. The key point is that it uses for_next_zone_zonelist_nodemask to iterate over the candidate zones:

/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zoneref *z = ac->preferred_zoneref;
	struct zone *zone;
	struct pglist_data *last_pgdat_dirty_limit = NULL;

	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
	 */
	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {

Inside the loop, it checks whether the zone still has enough free memory relative to its watermark:

....
		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
		if (!zone_watermark_fast(zone, order, mark,
				       ac_classzone_idx(ac), alloc_flags)) {
			int ret;
....
			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
			switch (ret) {
			case NODE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case NODE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
						ac_classzone_idx(ac), alloc_flags))
					goto try_this_zone;

				continue;
			}
		}

Each zone has three watermarks: WMARK_MIN, WMARK_LOW and WMARK_HIGH; which one is checked depends on alloc_flags (ALLOC_WMARK_LOW on this fast path). A simplified model of the check is sketched below.
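To make the check concrete, here is a simplified, self-contained model of what __zone_watermark_ok() does (an assumption-laden sketch: it ignores lowmem_reserve, the ALLOC_HIGH/ALLOC_HARDER adjustments and per-migratetype accounting):

#include <stdbool.h>

#define MAX_ORDER 11	/* typical default */

struct free_area_model {
	unsigned long nr_free;
};

static bool watermark_ok_sketch(long free_pages, long mark,
				unsigned int order,
				const struct free_area_model area[MAX_ORDER])
{
	unsigned int o;

	/* after taking 2^order pages, the zone must stay above the mark;
	 * the real code subtracts (1 << order) - 1 up front */
	free_pages -= (1L << order) - 1;
	if (free_pages <= mark)
		return false;

	if (order == 0)
		return true;

	/* a high-order request also needs an actual free block of at
	 * least that order on some free list */
	for (o = order; o < MAX_ORDER; o++)
		if (area[o].nr_free)
			return true;

	return false;
}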

When the watermark check passes, rmqueue is called to do the actual allocation, which ultimately reaches __rmqueue_smallest:

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		page = list_first_entry_or_null(&area->free_list[migratetype],
							struct page, lru);
		if (!page)
			continue;
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		expand(zone, page, order, current_order, area, migratetype);
		set_pcppage_migratetype(page, migratetype);
		return page;
	}

	return NULL;
}

__rmqueue_smallest starts at the requested order and scans the zone's free lists: if the free_area at the current order has no free block on the list for the requested migratetype, it moves up to the next higher order.

Right after system boot, almost all free memory is organized into blocks of the maximum order, sitting on the MAX_ORDER - 1 free lists. When such a large block is found for a smaller request, it must be split up in the buddy-system fashion we saw earlier.

The call to expand inside __rmqueue_smallest implements this splitting (a concrete trace follows the code):

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		/*
		 * Mark as guard pages (or page), that will allow to
		 * merge back to allocator when buddy will be freed.
		 * Corresponding page table entries will not be touched,
		 * pages will stay not present in virtual address space
		 */
		if (set_page_guard(zone, &page[size], high, migratetype))
			continue;

		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}
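A worked trace makes the loop clear. Suppose order = 1 is requested and the first free block found is at current_order = 3, so expand runs with low = 1, high = 3 and size = 8 pages:

iteration 1: high = 2, size = 4 -> page[4] (pages 4..7, an order-2 block) goes onto the order-2 free list
iteration 2: high = 1, size = 2 -> page[2] (pages 2..3, an order-1 block) goes onto the order-1 free list
loop ends (high == low): pages 0..1 remain as the order-1 block handed back to the caller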

When allocation succeeds, __rmqueue_smallest (and hence __rmqueue) returns the struct page of the first page of the allocated block.

So, overall, the buddy-system allocation flow is:

alloc_pages
|
|->alloc_pages_node
   |
   |->__alloc_pages
      |
      |-> __alloc_pages_nodemask
         |
         |-> get_page_from_freelist
             |
             |-> zone_watermark_ok  -> rmqueue
                                       |
                                       |--> __rmqueue_smallest

Reference:

https://blog.csdn.net/gatieme/article/details/52704844

 
