Once the buddy system has finished its initialization (free memory has been organized into the zones' free lists, grouped by order), it can be used to allocate memory.
Let's first look at the allocation functions the buddy system provides:
| Allocation function | Purpose |
|---|---|
| alloc_pages(mask, order) | Allocates 2^order contiguous pages and returns a struct page instance representing the first page of the allocated block |
| alloc_page(mask) | Shorthand for the above with order = 0; allocates a single page |
| get_zeroed_page(mask) | Allocates a single page, fills it with zeros and returns its kernel virtual address (with all other functions the page contents are undefined after allocation) |
| __get_free_pages(mask, order) | Works like alloc_pages(), but returns the kernel virtual address of the allocated block instead of a page instance |
| __get_free_page(mask) | Allocates a single page and returns its kernel virtual address |
| __get_dma_pages(gfp_mask, order) | Used to obtain pages suitable for DMA |
As you can see, the buddy system hands out physical pages only in power-of-two units of 2^order pages.
These allocation macros and functions live in include/linux/gfp.h.
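As a quick orientation, here is a minimal usage sketch (hypothetical module code, not taken from the kernel sources; the function name demo_buddy_alloc is made up): it allocates an order-2 block (4 contiguous pages) with alloc_pages() and releases it with __free_pages():
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

static void demo_buddy_alloc(void)
{
    struct page *page;

    /* 2^2 = 4 physically contiguous pages from the buddy system */
    page = alloc_pages(GFP_KERNEL, 2);
    if (!page)
        return;

    /* page_address() is only meaningful for lowmem pages */
    pr_info("first pfn %lu, kernel vaddr %p\n",
            page_to_pfn(page), page_address(page));

    /* the order passed to __free_pages() must match the allocation */
    __free_pages(page, 2);
}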
For alloc_pages(mask, order):
#define alloc_pages(gfp_mask, order) \
    alloc_pages_node(numa_node_id(), gfp_mask, order)
alloc_page(mask):
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
get_zeroed_page(mask):
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
    return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
}
EXPORT_SYMBOL(get_zeroed_page);
__get_free_pages(mask, order):
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
    struct page *page;

    /*
     * __get_free_pages() returns a 32-bit address, which cannot represent
     * a highmem page
     */
    VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

    page = alloc_pages(gfp_mask, order);
    if (!page)
        return 0;
    return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);
__get_free_page(mask):
#define __get_free_page(gfp_mask) \
    __get_free_pages((gfp_mask), 0)
__get_dma_pages(gfp_mask, order):
#define __get_dma_pages(gfp_mask, order) \
    __get_free_pages((gfp_mask) | GFP_DMA, (order))
As you can see, all of the interfaces above eventually funnel into the alloc_pages interface,
so this is where we start analyzing alloc_pages_node.
Two parameters matter for the allocation: one is the allocation order, which needs no further explanation; the other is the gfp_mask, which describes how the memory should be allocated:
/* Plain integer GFP bitmasks. Do not use this directly. */
#define ___GFP_DMA 0x01u
#define ___GFP_HIGHMEM 0x02u
#define ___GFP_DMA32 0x04u
#define ___GFP_MOVABLE 0x08u
#define ___GFP_WAIT 0x10u
#define ___GFP_HIGH 0x20u
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
#define ___GFP_COLD 0x100u
#define ___GFP_NOWARN 0x200u
#define ___GFP_REPEAT 0x400u
#define ___GFP_NOFAIL 0x800u
#define ___GFP_NORETRY 0x1000u
#define ___GFP_COMP 0x4000u
#define ___GFP_ZERO 0x8000u
#define ___GFP_NOMEMALLOC 0x10000u
#define ___GFP_HARDWALL 0x20000u
#define ___GFP_THISNODE 0x40000u
#define ___GFP_RECLAIMABLE 0x80000u
#ifdef CONFIG_KMEMCHECK
#define ___GFP_NOTRACK 0x200000u
#else
#define ___GFP_NOTRACK 0
#endif
#define ___GFP_NO_KSWAPD 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
The flags defined in the allocation mask fall into two broad categories:
1. zone modifiers: specify which zone the requested pages should come from. They occupy the lowest 4 bits of the mask: DMA, HIGHMEM, DMA32 and MOVABLE.
#define __GFP_DMA ((__force gfp_t)___GFP_DMA)
#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* Page is movable */
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
2. action modifiers: they do not restrict which zone is used, but they change the allocator's behavior:
#define __GFP_WAIT ((__force gfp_t)___GFP_WAIT) /* Can wait and reschedule? */
#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) /* Should access emergency pools? */
#define __GFP_IO ((__force gfp_t)___GFP_IO) /* Can start physical IO? */
#define __GFP_FS ((__force gfp_t)___GFP_FS) /* Can call down to low-level FS? */
#define __GFP_COLD ((__force gfp_t)___GFP_COLD) /* Cache-cold page required */
#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) /* Suppress page allocation failure warning */
#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */
#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */
#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */
#define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */
#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */
#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */
#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
The most frequently used combinations are GFP_KERNEL, GFP_ATOMIC and so on:
/* This equals 0, but use constants in case they ever change */
#define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH)
/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \
__GFP_RECLAIMABLE)
#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
__GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
__GFP_HARDWALL | __GFP_HIGHMEM | \
__GFP_MOVABLE)
#define GFP_IOFS (__GFP_IO | __GFP_FS)
#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
__GFP_NO_KSWAPD)
#ifdef CONFIG_NUMA
#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
#else
#define GFP_THISNODE ((__force gfp_t)0)
#endif
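For instance (a hypothetical illustration, not from the original text; the handler name demo_irq_handler is made up): GFP_KERNEL sets __GFP_WAIT and may therefore sleep, so it must only be used in process context; GFP_ATOMIC never sleeps and is allowed to tap the emergency reserves, which makes it the choice in interrupt or other atomic context:
#include <linux/gfp.h>
#include <linux/interrupt.h>

static irqreturn_t demo_irq_handler(int irq, void *dev_id)
{
    /* In interrupt context we must not sleep, so GFP_ATOMIC is used;
     * in ordinary process context GFP_KERNEL would be preferred. */
    struct page *page = alloc_page(GFP_ATOMIC);

    if (page)
        __free_page(page);

    return IRQ_HANDLED;
}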
Now that we know what the parameters mean, let's look at the implementation. The call chain is:
alloc_pages -> alloc_pages_node -> __alloc_pages -> __alloc_pages_nodemask
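The intermediate hops are thin wrappers. Roughly (a sketch of the 4.x-era code; exact signatures vary between kernel versions, and the real chain has one more trivial hop, __alloc_pages_node()):
static inline struct page *
alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
    if (nid == NUMA_NO_NODE)
        nid = numa_mem_id();

    return __alloc_pages(gfp_mask, order, nid);
}

static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid)
{
    return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL);
}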
__alloc_pages_nodemask is the core function of the buddy allocator, so let's dig into it:
/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
                       nodemask_t *nodemask)
{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW;
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    struct alloc_context ac = { };

    /*
     * There are several places where we assume that the order value is sane
     * so bail out early if the request is out of bound.
     */
    if (unlikely(order >= MAX_ORDER)) {
        WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
        return NULL;
    }

    gfp_mask &= gfp_allowed_mask;
    alloc_mask = gfp_mask;
    if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
        return NULL;

    finalise_ac(gfp_mask, &ac);

    /* First allocation attempt */
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    if (likely(page))
        goto out;
    ......
}
It first performs a few basic checks, then calls prepare_alloc_pages() to fill in the alloc_context structure ac:
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
        int preferred_nid, nodemask_t *nodemask,
        struct alloc_context *ac, gfp_t *alloc_mask,
        unsigned int *alloc_flags)
{
    ac->high_zoneidx = gfp_zone(gfp_mask);
    ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    ac->nodemask = nodemask;
    ac->migratetype = gfpflags_to_migratetype(gfp_mask);

    if (cpusets_enabled()) {
        *alloc_mask |= __GFP_HARDWALL;
        if (!ac->nodemask)
            ac->nodemask = &cpuset_current_mems_allowed;
        else
            *alloc_flags |= ALLOC_CPUSET;
    }

    fs_reclaim_acquire(gfp_mask);
    fs_reclaim_release(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

    if (should_fail_alloc_page(gfp_mask, order))
        return false;

    if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
        *alloc_flags |= ALLOC_CMA;

    return true;
}
The alloc_context structure gathers the basic parameters of the allocation:
struct alloc_context {
    struct zonelist *zonelist;
    nodemask_t *nodemask;
    struct zoneref *preferred_zoneref;
    int migratetype;
    enum zone_type high_zoneidx;
    bool spread_dirty_pages;
};
Among these assignments:
ac->high_zoneidx = gfp_zone(gfp_mask);
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
ac->migratetype = gfpflags_to_migratetype(gfp_mask);
high_zoneidx comes from gfp_zone(). Remember that gfp_mask is split into two groups of flags? gfp_zone() looks only at the low 4 bits, i.e. the zone modifiers, and turns them into the highest zone index the allocation is allowed to use.
gfpflags_to_migratetype() derives the migratetype of the allocation from the mask.
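What gfp_zone() decides can be sketched like this (an illustrative re-expression, not the real implementation, which encodes the same decision in a packed lookup table, GFP_ZONE_TABLE; the CONFIG_ZONE_DMA/DMA32/HIGHMEM fallbacks are ignored here, and the function name gfp_zone_sketch is made up):
static enum zone_type gfp_zone_sketch(gfp_t flags)
{
    /* __GFP_HIGHMEM together with __GFP_MOVABLE selects ZONE_MOVABLE,
     * e.g. GFP_HIGHUSER_MOVABLE */
    if ((flags & (__GFP_HIGHMEM | __GFP_MOVABLE)) ==
        (__GFP_HIGHMEM | __GFP_MOVABLE))
        return ZONE_MOVABLE;
    if (flags & __GFP_HIGHMEM)
        return ZONE_HIGHMEM;
    if (flags & __GFP_DMA)
        return ZONE_DMA;
    if (flags & __GFP_DMA32)
        return ZONE_DMA32;
    /* no zone modifier set, e.g. GFP_KERNEL */
    return ZONE_NORMAL;
}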
Back in __alloc_pages_nodemask, the next call is finalise_ac():
/* Determine whether to spread dirty pages and what the first usable zone */
static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
{
    /* Dirty zone balancing only done in the fast path */
    ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

    /*
     * The preferred zone is used for statistics but crucially it is
     * also used as the starting point for the zonelist iterator. It
     * may get reset for allocations that ignore memory policies.
     */
    ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                    ac->high_zoneidx, ac->nodemask);
}
It fills in ac->preferred_zoneref, i.e. the zone from which the allocation should start.
A few more words on this: the zone to allocate from is found by walking the _zonerefs array of the zonelist. During initialization the entries were already sorted by preference, for example:
_zonerefs[0] -> HIGHMEM
_zonerefs[1] -> NORMAL
Each _zonerefs entry also carries a zone index:
struct zoneref {
    struct zone *zone; /* Pointer to actual zone */
    int zone_idx;      /* zone_idx(zoneref->zone) */
};
Note that zone_idx is not the position inside the _zonerefs array; it is zone_idx(zone), the index of the zone within its node. With the layout above (e.g. a 32-bit system without ZONE_DMA, where ZONE_NORMAL = 0 and ZONE_HIGHMEM = 1) the entries look like:
_zonerefs[0] -> ZONE_HIGHMEM, zone_idx = 1
_zonerefs[1] -> ZONE_NORMAL, zone_idx = 0
When the zones are walked, each entry is compared against the ceiling computed earlier:
ac->high_zoneidx = gfp_zone(gfp_mask);
The walk starts at _zonerefs[0]; an entry is a hit as soon as its zone_idx is less than or equal to ac->high_zoneidx. For example:
With gfp_mask = GFP_KERNEL, gfp_zone() yields high_zoneidx = 0. The walk starts at _zonerefs[0], whose zone_idx is 1, so it is skipped; _zonerefs[1] with zone_idx 0 is a hit, so ZONE_NORMAL is chosen.
With gfp_mask = GFP_HIGHUSER_MOVABLE, gfp_zone() yields high_zoneidx = 2. The walk again starts at _zonerefs[0], whose zone_idx is 1 <= 2, so it is a hit immediately, and ZONE_HIGHMEM is chosen as the place to allocate from.
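A minimal sketch of that selection, assuming the layout above (the function name pick_first_zoneref is made up; the real iterator, first_zones_zonelist()/next_zones_zonelist() in include/linux/mmzone.h, additionally filters by nodemask):
#include <linux/mmzone.h>

static struct zoneref *pick_first_zoneref(struct zonelist *zonelist,
                                          enum zone_type high_zoneidx)
{
    struct zoneref *z = zonelist->_zonerefs;

    /* skip every zone whose index is above the ceiling allowed by the
     * gfp mask; stop at the first entry that fits (or the NULL terminator) */
    while (z->zone && zonelist_zone_idx(z) > high_zoneidx)
        z++;

    return z;
}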
Back to __alloc_pages_nodemask. So far we have seen:
__alloc_pages_nodemask
|---------------->prepare_alloc_pages
|---------------->finalise_ac
Next comes the key function get_page_from_freelist(), which attempts the actual page allocation; if it fails, __alloc_pages_slowpath() takes over and handles the difficult cases. Here we only analyze get_page_from_freelist().
There is quite a lot of code, so it is not quoted in full. The key point is that it first uses for_next_zone_zonelist_nodemask() to walk the candidate zones:
/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                       const struct alloc_context *ac)
{
    struct zoneref *z = ac->preferred_zoneref;
    struct zone *zone;
    struct pglist_data *last_pgdat_dirty_limit = NULL;

    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
     */
    for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
                                    ac->nodemask) {
Next, it checks whether that zone still has enough free memory above its watermark:
        ....
        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
        if (!zone_watermark_fast(zone, order, mark,
                       ac_classzone_idx(ac), alloc_flags)) {
            int ret;
            ....
            ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
            switch (ret) {
            case NODE_RECLAIM_NOSCAN:
                /* did not scan */
                continue;
            case NODE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                continue;
            default:
                /* did we reclaim enough */
                if (zone_watermark_ok(zone, order, mark,
                        ac_classzone_idx(ac), alloc_flags))
                    goto try_this_zone;
                continue;
            }
        }
Each zone has three watermarks: WMARK_MIN, WMARK_LOW and WMARK_HIGH. Which one is checked depends on alloc_flags (the fast path uses ALLOC_WMARK_LOW, as set at the top of __alloc_pages_nodemask).
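Roughly, the watermark test asks: if 2^order pages are taken out of this zone, do at least `mark` free pages remain? A very simplified sketch (the function name watermark_ok_sketch is made up; the real check, __zone_watermark_ok(), also subtracts reserved pages, applies the ALLOC_HIGH/ALLOC_HARDER discounts, and verifies that a free block of at least `order` actually exists):
static bool watermark_ok_sketch(struct zone *zone, unsigned int order,
                                unsigned long mark)
{
    long free_pages = zone_page_state(zone, NR_FREE_PAGES);

    /* would the zone still be above the watermark after this allocation? */
    return free_pages - (1L << order) >= (long)mark;
}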
If the watermark check passes, rmqueue() is called to allocate the pages, which eventually ends up in __rmqueue_smallest():
/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                                int migratetype)
{
    unsigned int current_order;
    struct free_area *area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) {
        area = &(zone->free_area[current_order]);
        page = list_first_entry_or_null(&area->free_list[migratetype],
                                        struct page, lru);
        if (!page)
            continue;
        list_del(&page->lru);
        rmv_page_order(page);
        area->nr_free--;
        expand(zone, page, order, current_order, area, migratetype);
        set_pcppage_migratetype(page, migratetype);
        return page;
    }

    return NULL;
}
__rmqueue_smallest() starts at the requested order and walks up through the zone's free lists: if the free_area for the current order has no free block on the list of the required migratetype, it moves on to the next higher order.
Right after system boot, almost all free memory is organized as blocks of the maximum order, i.e. it sits on the MAX_ORDER - 1 free lists. When such a larger block is found for a smaller request, it has to be split up in the buddy-system fashion described earlier.
The expand() call inside __rmqueue_smallest() implements this splitting:
/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
* testing. Specifically, as large blocks of memory are subdivided,
* the order in which smaller blocks are delivered depends on the order
* they're subdivided in this function. This is the primary factor
* influencing the order in which pages are delivered to the IO
* subsystem according to empirical testing, and this is also justified
* by considering the behavior of a buddy system containing a single
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
* -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
    int low, int high, struct free_area *area,
    int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) {
        area--;
        high--;
        size >>= 1;
        VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

        /*
         * Mark as guard pages (or page), that will allow to
         * merge back to allocator when buddy will be freed.
         * Corresponding page table entries will not be touched,
         * pages will stay not present in virtual address space
         */
        if (set_page_guard(zone, &page[size], high, migratetype))
            continue;

        list_add(&page[size].lru, &area->free_list[migratetype]);
        area->nr_free++;
        set_page_order(&page[size], high);
    }
}
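As a concrete (made-up) example: suppose order = 1 (2 pages) is requested but the first free block found sits on the order-3 list (8 pages), so expand(zone, page, low = 1, high = 3, ...) runs:
iteration 1: high = 2, size = 4 -> the upper half, page[4..7], goes back onto free_area[2] (order 2)
iteration 2: high = 1, size = 2 -> page[2..3] goes back onto free_area[1] (order 1)
loop ends (high == low) -> page[0..1] is handed to the caller as the order-1 allocation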
Once the allocation succeeds, __rmqueue() returns the struct page of the first page of the allocated block.
So, to sum up, the buddy-system allocation flow is:
alloc_pages
  |
  |-> alloc_pages_node
        |
        |-> __alloc_pages
              |
              |-> __alloc_pages_nodemask
                    |
                    |-> get_page_from_freelist
                          |
                          |-> zone_watermark_ok -> rmqueue
                                |
                                |-> __rmqueue_smallest
References:
https://blog.csdn.net/gatieme/article/details/52704844