Part 1: Buddy system initialization functions
During system initialization, the buddy allocator is set up through the following call chain:
start_kernel
---------->mm_init
----------->mem_init
In mem_init, the free memory previously tracked by bootmem is released into the buddy system; once the buddy system has been initialized it takes over all page-level memory management.
void __init mem_init(void)
{
unsigned long reserved_pages, free_pages;
struct memblock_region *reg;
int i;
#ifdef CONFIG_HAVE_TCM
/* These pointers are filled in on TCM detection */
extern u32 dtcm_end;
extern u32 itcm_end;
#endif
max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;
/* this will put all unused low memory onto the freelists */
free_unused_memmap(&meminfo); /* free the parts of the mem_map covering holes between the
                                 banks recorded in meminfo; only one bank is set up here,
                                 so this effectively does nothing */
totalram_pages += free_all_bootmem(); /* release the memory still held by bootmem into the buddy system */
pr_err("totalram_pages=%lx \n",totalram_pages);
#ifdef CONFIG_SA1111
/* now that our DMA memory is actually so designated, we can free it */
free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, 0, NULL);
#endif
free_highpages(); /* no-op here: high memory is not configured */
reserved_pages = free_pages = 0;
for_each_bank(i, &meminfo) { /* walk every page of each bank, check its flags, and
                                count the free and reserved pages */
struct membank *bank = &meminfo.bank[i];
unsigned int pfn1, pfn2;
struct page *page, *end;
pfn1 = bank_pfn_start(bank);
pfn2 = bank_pfn_end(bank);
page = pfn_to_page(pfn1);
end = pfn_to_page(pfn2 - 1) + 1;
do {
if (PageReserved(page))
reserved_pages++;
else if (!page_count(page))
free_pages++;
page++;
} while (page < end);
}
/*
* Since our memory may not be contiguous, calculate the
* real number of pages we have in this system
*/
printk(KERN_INFO "Memory:");
num_physpages = 0;
for_each_memblock(memory, reg) {
unsigned long pages = memblock_region_memory_end_pfn(reg) -
memblock_region_memory_base_pfn(reg);
num_physpages += pages;
printk(" %ldMB", pages >> (20 - PAGE_SHIFT));
}
printk(" = %luMB total\n", num_physpages >> (20 - PAGE_SHIFT));
}
The core function that releases this memory and seeds the buddy system is:
free_all_bootmem
unsigned long __init free_all_bootmem(void)
{
unsigned long total_pages = 0;
bootmem_data_t *bdata;
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat)
reset_node_lowmem_managed_pages(pgdat); /* zero each zone's managed_pages */
list_for_each_entry(bdata, &bdata_list, list)
total_pages += free_all_bootmem_core(bdata); /* release this node's bootmem into the buddy system */
return total_pages;
}
To follow the Linux memory-management code it helps to have some familiarity with the pg_data_t, zone and page structures; the following post is a useful reference:
http://blog.chinaunix.net/uid-25845340-id-3023037.html
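For orientation, the buddy allocator's per-zone bookkeeping that the rest of this article keeps referring to looks roughly as follows (a simplified sketch; the real structures carry many more fields):
struct free_area {
	struct list_head free_list[MIGRATE_TYPES]; /* free blocks of one order, one list per migratetype */
	unsigned long nr_free;                     /* number of free blocks of that order */
};
struct zone {
	/* ... */
	struct free_area free_area[MAX_ORDER];     /* one free_area per order 0 .. MAX_ORDER-1 */
	/* ... */
};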
The core of the buddy-system initialization is free_all_bootmem_core; the bootmem state lives in the bdata structure, which is why it is passed down:
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
struct page *page;
unsigned long start, end, pages, count = 0;
if (!bdata->node_bootmem_map) /* no bootmem bitmap for this node, nothing to do */
return 0;
start = bdata->node_min_pfn;
end = bdata->node_low_pfn;
pr_err("nid=%td start=%lx end=%lx\n",
bdata - bootmem_node_data, start, end);
while (start < end) {
unsigned long *map, idx, vec;
unsigned shift;
map = bdata->node_bootmem_map; /* base address of the bootmem bitmap */
idx = start - bdata->node_min_pfn; /* index of the starting pfn within the bitmap */
shift = idx & (BITS_PER_LONG - 1); /* non-zero if idx is not aligned to a BITS_PER_LONG boundary */
/*
* vec holds at most BITS_PER_LONG map bits,
* bit 0 corresponds to start.
*/
vec = ~map[idx / BITS_PER_LONG]; /* map is an array of unsigned longs, each holding
                                    BITS_PER_LONG map bits; idx / BITS_PER_LONG selects
                                    the word containing our starting page */
if (shift) {
vec >>= shift; /* idx is not word-aligned: shift out the bits belonging to pfns below
                  'start' so that bit 0 of vec corresponds to 'start' again */
if (end - start >= BITS_PER_LONG)
vec |= ~map[idx / BITS_PER_LONG + 1] <<
(BITS_PER_LONG - shift); /* and top vec up with bits from the next word */
}
/*
* If we have a properly aligned and fully unreserved
* BITS_PER_LONG block of pages in front of us, free
* it in one go.
*/
if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
/* the group is aligned and all BITS_PER_LONG pages in it are free:
   release them in one go as a single higher-order block */
int order = ilog2(BITS_PER_LONG);
__free_pages_bootmem(pfn_to_page(start), order);
count += BITS_PER_LONG;
start += BITS_PER_LONG;
} else {
unsigned long cur = start; /* not every page in the group is free: release them one by one */
start = ALIGN(start + 1, BITS_PER_LONG); /* index of the start of the next group */
while (vec && cur != start) {
if (vec & 1) {
page = pfn_to_page(cur);
__free_pages_bootmem(page, 0);
count++;
}
vec >>= 1;
++cur;
}
}
}
/* now that all the free memory tracked by bootmem has been handed to the buddy system,
   bootmem itself is no longer needed, so free the pages occupied by its bitmap */
page = virt_to_page(bdata->node_bootmem_map);
pages = bdata->node_low_pfn - bdata->node_min_pfn;
pages = bootmem_bootmap_pages(pages);
count += pages;
while (pages--)
__free_pages_bootmem(page++, 0);
pr_err("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
return count;
}
This function works in units of BITS_PER_LONG pages (32 on a 32-bit kernel): it examines one group of 32 pages at a time, and if the whole group is free it releases it as a single block; otherwise it releases the free pages in that group one at a time.
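As a standalone illustration of that chunked walk, here is a small userspace C program (a sketch with a made-up two-word bitmap; the unaligned-start handling is left out because this walk always starts on a group boundary, and free_block stands in for __free_pages_bootmem):
#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

/* stand-in for __free_pages_bootmem(): just report what would be freed */
static void free_block(unsigned long pfn, unsigned int order)
{
	printf("free pfn %lu at order %u (%lu pages)\n", pfn, order, 1UL << order);
}

int main(void)
{
	/* hypothetical bootmem bitmap: a set bit means "still reserved" */
	unsigned long map[2] = { 0x0UL, 0xF0UL };
	unsigned long start = 0, end = 2 * BITS_PER_LONG;
	unsigned int group_order = 0;

	while ((1UL << group_order) < (unsigned long)BITS_PER_LONG)
		group_order++;                      /* ilog2(BITS_PER_LONG) */

	while (start < end) {
		unsigned long vec = ~map[start / BITS_PER_LONG];

		if (vec == ~0UL) {
			/* the whole aligned group is free: release it as one block */
			free_block(start, group_order);
			start += BITS_PER_LONG;
		} else {
			/* otherwise release the free pages of the group individually */
			unsigned long cur = start;

			start += BITS_PER_LONG;
			while (vec && cur != start) {
				if (vec & 1)
					free_block(cur, 0);
				vec >>= 1;
				cur++;
			}
		}
	}
	return 0;
}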
The function that actually releases the bootmem pages is
__free_pages_bootmem
----------->__free_pages(page, order);
__free_pages_bootmem walks every page of the block, clears its reserved flag and sets page->_count to 0, then sets the head page's refcount back to 1 and calls __free_pages, which hands the block to the buddy allocator.
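For reference, a paraphrased sketch of what __free_pages_bootmem does in this kernel generation (not the verbatim source; the prefetch hints and a few details are omitted):
void __init __free_pages_bootmem(struct page *page, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		__ClearPageReserved(page + i);  /* drop the bootmem "reserved" marking */
		set_page_count(page + i, 0);    /* nobody holds a reference yet */
	}

	page_zone(page)->managed_pages += nr_pages;
	set_page_refcounted(page);              /* head page refcount -> 1, so that ... */
	__free_pages(page, order);              /* ... put_page_testzero() drops it back to 0 */
}
From there, __free_pages decides between the per-CPU fast path and the buddy path: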
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
free_hot_cold_page(page, 0);
else
__free_pages_ok(page, order);
}
}
In __free_pages, when only a single page is freed it is released onto the per_cpu_pages lists: for single pages the CPU keeps this per-CPU cache to speed up memory management. For more than one page, __free_pages_ok releases the block straight into the buddy system. First look at how __free_pages_ok is reached:
__free_pages_ok
---------->free_one_page
------------>__free_one_page
__free_one_page is the core function for freeing memory back into the buddy system:
static inline void __free_one_page(struct page *page,
struct zone *zone, unsigned int order,
int migratetype)
{
unsigned long page_idx;
unsigned long combined_idx;
unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
VM_BUG_ON(!zone_is_initialized(zone));
if (unlikely(PageCompound(page)))
if (unlikely(destroy_compound_page(page, order)))
return;
VM_BUG_ON(migratetype == -1);
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); /* index of the page relative to its
                                  MAX_ORDER-aligned block; only these low bits are needed,
                                  because buddy merging never crosses that block */
VM_BUG_ON(page_idx & ((1 << order) - 1));
VM_BUG_ON(bad_range(zone, page));
while (order < MAX_ORDER-1) {
buddy_idx = __find_buddy_index(page_idx, order); /* the buddy's index differs from page_idx only in bit 'order' */
buddy = page + (buddy_idx - page_idx); /* struct page of the buddy */
if (!page_is_buddy(page, buddy, order)) /* if the buddy is not in this zone or is not free,
                                           stop trying to merge into a larger block */
break;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy)) { /* only taken when page debugging (guard pages) is enabled */
clear_page_guard_flag(buddy);
set_page_private(page, 0);
__mod_zone_freepage_state(zone, 1 << order,
migratetype);
} else {
list_del(&buddy->lru); /* take the buddy off the free list of its current order */
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
}
combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx; /* the merged block starts at the lower of the two indexes */
order++;                 /* go up one order and try to merge with the next buddy */
}
set_page_order(page, order);
/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
* that pages are being freed that will coalesce soon. In case,
* that is happening, add the free page to the tail of the list
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
struct page *higher_page, *higher_buddy;
combined_idx = buddy_idx & page_idx;
higher_page = page + (combined_idx - page_idx);
buddy_idx = __find_buddy_index(combined_idx, order + 1);
higher_buddy = higher_page + (buddy_idx - combined_idx);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
goto out;
}
}
/* put the (possibly merged) block on the free_list of its final order */
list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
zone->free_area[order].nr_free++;
}
The idea behind buddies: if a block is, say, 4 KB, the adjacent 4 KB block that shares the same 8 KB-aligned region is its buddy; when both are free they can be merged into one larger 8 KB block. In general, a block of order 'order' and its buddy are the two halves of an aligned block of order 'order + 1'. A more detailed discussion of buddies and how the buddy system handles them can be found here:
https://blog.csdn.net/geshifei/article/details/81914115
What __free_one_page does, then, is find the buddy of the block being freed, check whether the two can be merged into a larger block, and keep merging small blocks into larger ones (which helps reduce fragmentation) until no further contiguous merge is possible; at that point the resulting block is placed on the free_list of its final order.
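The index arithmetic is compact enough to demonstrate on its own. The userspace sketch below (hypothetical indexes; find_buddy_index mirrors __find_buddy_index) shows how a block and its buddy differ only in bit 'order', and how the merged block starts at the lower of the two indexes:
#include <stdio.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);       /* flip bit 'order' */
}

int main(void)
{
	unsigned long page_idx = 12;            /* a free block of order 2 starting at index 12 */
	unsigned int order;

	for (order = 2; order <= 4; order++) {
		unsigned long buddy_idx = find_buddy_index(page_idx, order);
		unsigned long combined_idx = buddy_idx & page_idx;

		printf("order %u: block %lu + buddy %lu -> merged block at %lu\n",
		       order, page_idx, buddy_idx, combined_idx);
		page_idx = combined_idx;        /* assume the merge succeeded, go up one order */
	}
	return 0;
}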
Now look at free_hot_cold_page, which is called every time a single page is freed:
void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
int migratetype;
if (!free_pages_prepare(page, 0))
return;
migratetype = get_pageblock_migratetype(page);
set_freepage_migratetype(page, migratetype);
local_irq_save(flags);
__count_vm_event(PGFREE);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
* offlined but treat RESERVE as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(zone, page, 0, migratetype);
goto out;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
else
list_add(&page->lru, &pcp->lists[migratetype]); /* park the page on the per-CPU pcp->lists */
pcp->count++;
if (pcp->count >= pcp->high) { /* too many pages parked on the per-CPU lists: */
free_pcppages_bulk(zone, pcp->batch, pcp); /* return a batch of them to the buddy system */
pcp->count -= pcp->batch;
}
out:
local_irq_restore(flags);
}
So a freed page first goes onto pcp->lists, and once the number of pages on those lists exceeds pcp->high, free_pcppages_bulk returns a batch of pcp->batch pages to the buddy system. For a more detailed walk-through of this code, see:
http://blog.chinaunix.net/uid-25845340-id-3039220.html
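A toy userspace illustration of that trim rule, with made-up values for pcp->high and pcp->batch (the real values are per-zone tunables):
#include <stdio.h>

int main(void)
{
	int count = 0, high = 90, batch = 30;   /* hypothetical pcp tuning values */
	int i, returned = 0;

	for (i = 0; i < 200; i++) {             /* simulate 200 order-0 frees on one CPU */
		count++;                        /* page parked on the pcp list */
		if (count >= high) {
			returned += batch;      /* free_pcppages_bulk(zone, batch, pcp) */
			count -= batch;
		}
	}
	printf("still on the pcp list: %d, returned to the buddy system: %d\n",
	       count, returned);
	return 0;
}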
Part 2: Buddy system memory allocation
Once the buddy system is initialized, it can be used to allocate memory. There are several outer-layer allocation APIs, but they all end up in one core function, __alloc_pages_nodemask:
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
lockdep_trace_alloc(gfp_mask);
might_sleep_if(gfp_mask & __GFP_WAIT);
if (should_fail_alloc_page(gfp_mask, order))
return NULL;
/*
* Check the zones suitable for the gfp_mask contain at least one
* valid zone. It's possible to have an empty zonelist as a result
* of GFP_THISNODE and a memoryless node
*/
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
/*
* Will only have any effect when __GFP_KMEMCG is set. This is
* verified in the (always inline) callee
*/
if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
return NULL;
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone); /* pick the first suitable zone in the zonelist to allocate from */
if (!preferred_zone)
goto out;
#ifdef CONFIG_CMA
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
preferred_zone, migratetype); /* first (fast-path) allocation attempt */
if (unlikely(!page)) {
/*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
*/
gfp_mask = memalloc_noio_flags(gfp_mask);
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype); /* the fast path failed: retry via the slow path */
}
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
out:
/*
* When updating a task's mems_allowed, it is possible to race with
* parallel threads in such a way that an allocation can fail while
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
memcg_kmem_commit_charge(page, memcg, order);
return page;
}
The two functions that matter most above are get_page_from_freelist and __alloc_pages_slowpath, i.e. the fast path and the slow path. The fast path simply allocates from the memory that is currently free; if it fails, the slow path is tried, which may compact and reclaim memory before attempting the allocation again.
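Before diving in, it may help to see the caller's side. The sketch below is a hypothetical kernel-module-style user of the allocator (grab_buffer/drop_buffer are invented names); the usual entry points such as alloc_pages and __get_free_pages all funnel into __alloc_pages_nodemask with the node's zonelist:
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *grab_buffer(void)
{
	/* alloc_pages() -> alloc_pages_node()/alloc_pages_current() ->
	 * __alloc_pages() -> __alloc_pages_nodemask() */
	return alloc_pages(GFP_KERNEL, 2);      /* 2^2 = 4 contiguous pages, may sleep */
}

static void drop_buffer(struct page *page)
{
	if (page)
		__free_pages(page, 2);          /* back to the pcp lists or the buddy lists */
}
Now to get_page_from_freelist: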
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
/* iterate over the zones in the zonelist */
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a zone that is within its dirty
* limit, such that no single zone holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the zone's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* This may look like it could increase pressure on
* lower zones by failing allocations in higher zones
* before they are full. But the pages that do spill
* over are limited as the lower zones are protected
* by this very same mechanism. It should not become
* a practical burden to them.
*
* XXX: For now, allow allocations to potentially
* exceed the per-zone dirty limit in the slowpath
* (ALLOC_WMARK_LOW unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* zones are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
if ((alloc_flags & ALLOC_WMARK_LOW) &&
(gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
/* the zone's watermark permits this allocation, so it is attempted from this zone */
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
* by the cpuset.
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
if (zone_reclaim_mode == 0 ||
!zone_allows_reclaim(preferred_zone, zone))
goto this_zone_full;
/*
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
*/
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
/* getting here means this zone is low on memory and needs to be reclaimed */
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
/* the reclaim freed enough pages to satisfy the watermark, so try this zone */
/*
* Failed to reclaim enough to meet watermark.
* Only mark the zone full if checking the min
* watermark or if we failed to reclaim just
* 1<<order pages or else the page allocator
* fastpath will prematurely mark zones full
* when the watermark is between the low and
* min watermarks.
*/
if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
ret == ZONE_RECLAIM_SOME)
goto this_zone_full;
continue;
}
}
try_this_zone:
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype); /* the actual allocation from this zone */
if (page)
break;
this_zone_full:
if (IS_ENABLED(CONFIG_NUMA))
zlc_mark_zone_full(zonelist, z);
}
if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
}
if (page)
/*
* page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
* necessary to allocate the page. The expectation is
* that the caller is taking steps that will free more
* memory. The caller should avoid the page being used
* for !PFMEMALLOC purposes.
*/
page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
return page;
}
The function above checks each zone's free-page watermark: if the allocation fits under the watermark it calls buffered_rmqueue directly, otherwise it may have to run zone reclaim first and then retry the allocation.
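For reference, the watermark test used above boils down to roughly the following (a simplified paraphrase of __zone_watermark_ok, not the verbatim kernel code; the ALLOC_HIGH/ALLOC_HARDER adjustments are omitted):
static bool watermark_ok_sketch(struct zone *z, int order, unsigned long mark,
				int classzone_idx, long free_pages)
{
	long min = mark;
	int o;

	free_pages -= (1 << order) - 1;
	/* keep 'mark' plus the lowmem reserve free even after this allocation */
	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return false;

	for (o = 0; o < order; o++) {
		/* blocks smaller than 'order' cannot satisfy this request */
		free_pages -= z->free_area[o].nr_free << o;
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}
With a zone that passes the check, buffered_rmqueue performs the actual allocation: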
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
again:
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;
local_irq_save(flags); /* first try to allocate from the per-CPU page cache */
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
if (list_empty(list)) { /* the per-CPU list is empty: refill it with pages from the zone */
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, cold);
if (unlikely(list_empty(list)))
goto failed;
}
if (cold) /* take a page from the (now non-empty) per-CPU list */
page = list_entry(list->prev, struct page, lru);
else
page = list_entry(list->next, struct page, lru);
list_del(&page->lru);
pcp->count--;
} else {
if (unlikely(gfp_flags & __GFP_NOFAIL)) {
/*
* __GFP_NOFAIL is not to be used in new code.
*
* All __GFP_NOFAIL callers should be fixed so that they
* properly detect and handle allocation failures.
*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with
* __GFP_NOFAIL.
*/
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags); /* for more than one page, allocate straight from the zone via __rmqueue */
page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
get_pageblock_migratetype(page));
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
goto again;
return page;
failed:
local_irq_restore(flags);
return NULL;
}
When order is 0, i.e. only a single page is requested, the allocator takes it directly from the per-CPU cache list; if that list is empty, pcp->batch pages are first pulled out of the zone into the per-CPU list and the request is then served from there. For more than one page, __rmqueue is called to allocate directly from the zone.
The function that refills the per-CPU cache list is rmqueue_bulk, with the call chain
rmqueue_bulk
-------->__rmqueue
rmqueue_bulk is just a wrapper around __rmqueue that, at order 0, hands a number of pages over to the per-CPU cache list.
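A paraphrased sketch of rmqueue_bulk (not the verbatim source; the per-page migratetype bookkeeping and CMA accounting are omitted):
static int rmqueue_bulk_sketch(struct zone *zone, unsigned int order,
			       unsigned long count, struct list_head *list,
			       int migratetype, int cold)
{
	int i;

	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order, migratetype);
		if (page == NULL)
			break;
		if (cold == 0)
			list_add(&page->lru, list);        /* hot pages go to the head */
		else
			list_add_tail(&page->lru, list);   /* cold pages go to the tail */
	}
	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
	spin_unlock(&zone->lock);
	return i;                                          /* pages actually moved */
}
__rmqueue, which it calls with the zone lock held, looks like this: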
static struct page *__rmqueue(struct zone *zone, unsigned int order,
int migratetype)
{
struct page *page;
retry_reserve: /* first try to allocate with the requested migratetype */
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
page = __rmqueue_fallback(zone, order, migratetype); /* then fall back to the other migratetypes in the fallback table */
/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
* is used because __rmqueue_smallest is an inline function
* and we want just one call site
*/
if (!page) {
migratetype = MIGRATE_RESERVE;
goto retry_reserve;
}
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
The allocator first tries the requested migratetype; if that fails, it falls back to the other migratetypes listed in the fallback table.
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area * area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
if (list_empty(&area->free_list[migratetype]))
continue;
page = list_entry(area->free_list[migratetype].next,
struct page, lru);
list_del(&page->lru); /* found a block of usable order, take it off its free list */
rmv_page_order(page); /* clear the buddy order recorded on the page */
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype); /* if the block is larger than
                                  requested, split it and put the surplus back on the buddy lists */
return page;
}
return NULL;
}
When __rmqueue_smallest cannot find a block of exactly the requested order, it has to take a block of a larger order. A larger block necessarily leaves some memory unused, so expand is called to return the unused portions to the buddy system (a standalone walk-through of the splitting follows the expand listing below).
static inline void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area,
int migratetype)
{ /* low is the requested order, high the order of the block actually taken */
unsigned long size = 1 << high;
while (high > low) { /* as long as high > low there is surplus memory to give back */
area--;
high--;
size >>= 1;
VM_BUG_ON(bad_range(zone, &page[size]));
#ifdef CONFIG_DEBUG_PAGEALLOC
if (high < debug_guardpage_minorder()) {
/*
* Mark as guard pages (or page), that will allow to
* merge back to allocator when buddy will be freed.
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
INIT_LIST_HEAD(&page[size].lru);
set_page_guard_flag(&page[size]);
set_page_private(&page[size], high);
/* Guard pages are not available for any usage */
__mod_zone_freepage_state(zone, -(1 << high),
migratetype);
continue;
}
#endif
/* put the upper half, starting at &page[size], back into the buddy system */
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
}
}
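To make the splitting concrete, here is a standalone userspace walk-through of the same arithmetic (hypothetical pfn and orders):
#include <stdio.h>

/* serving an order-'low' request out of an order-'high' block puts one
 * block of each intermediate order back on the free lists */
static void expand_demo(unsigned long base_pfn, int low, int high)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		printf("put back an order-%d block at pfn %lu (%lu pages)\n",
		       high, base_pfn + size, size);
	}
	printf("hand out the order-%d block at pfn %lu\n", low, base_pfn);
}

int main(void)
{
	expand_demo(64, 1, 3);  /* an order-1 request served from an order-3 block at pfn 64 */
	return 0;
}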
__rmqueue_fallback is also fairly straightforward: it walks the fallbacks table of alternative migratetypes and searches the free_list of the corresponding MIGRATE domains of the zone; if it ends up taking a large block, it again uses expand to give the surplus back. For a more detailed discussion of this code, see:
http://blog.chinaunix.net/uid-25845340-id-3039220.html
Now back to __alloc_pages_slowpath:
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int migratetype)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
bool sync_migration = false;
bool deferred_compaction = false;
bool contended_compaction = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
* reclaim >= MAX_ORDER areas which will never succeed. Callers may
* be using allocators in order of preference for an area that is
* too large.
*/
if (order >= MAX_ORDER) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
/*
* GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
* __GFP_NOWARN set) should not cause reclaim since the subsystem
* (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
* using a larger set of nodes after it has established that the
* allowed per node queues are empty and that nodes are
* over allocated.
*/
if (IS_ENABLED(CONFIG_NUMA) &&
(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
if (!(gfp_mask & __GFP_NO_KSWAPD))
wake_all_kswapd(order, zonelist, high_zoneidx,
zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
* reclaim. Now things get more complex, so set up alloc_flags according
* to how we want to proceed.
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
* Find the true preferred zone if the allocation is unconstrained by
* cpusets.
*/
if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
first_zones_zonelist(zonelist, high_zoneidx, NULL,
&preferred_zone);
rebalance:
/* This is the last chance, in general, before the goto nopage. */
/* try the normal allocation once more */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page)
goto got_pg;
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
/*
* Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
* the allocation is high priority and these type of
* allocations are system rather than user orientated
*/
zonelist = node_zonelist(numa_node_id(), gfp_mask);
/* ALLOC_NO_WATERMARKS is set: retry the allocation while ignoring the zone watermarks */
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
if (page) {
goto got_pg;
}
}
/* Atomic allocations - we can't balance anything */
if (!wait)
goto nopage;
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
/*
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
/* compact memory, then try the allocation again */
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, sync_migration,
&contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
sync_migration = true;
/*
* If compaction is deferred for high-order allocations, it is because
* sync compaction recently failed. In this is the case and the caller
* requested a movable allocation that does not heavily disrupt the
* system then fail the allocation instead of entering direct reclaim.
*/
if ((deferred_compaction || contended_compaction) &&
(gfp_mask & __GFP_NO_KSWAPD))
goto nopage;
/* Try direct reclaim and then allocating */
/* run direct reclaim, then try the allocation again */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, &did_some_progress);
if (page)
goto got_pg;
/*
* If we failed to make any progress reclaiming, then we are
* running out of options and have to consider going OOM
*/
if (!did_some_progress) {
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
if (oom_killer_disabled)
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
if ((current->flags & PF_DUMPCORE) &&
!(gfp_mask & __GFP_NOFAIL))
goto nopage;
/* as a last resort, try killing a process (OOM) to reclaim memory, then allocate again */
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
migratetype);
if (page)
goto got_pg;
if (!(gfp_mask & __GFP_NOFAIL)) {
/*
* The oom killer is not called for high-order
* allocations that may fail, so if no progress
* is being made, there are no other options and
* retrying is unlikely to help.
*/
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto nopage;
/*
* The oom killer is not called for lowmem
* allocations to prevent needlessly killing
* innocent tasks.
*/
if (high_zoneidx < ZONE_NORMAL)
goto nopage;
}
goto restart;
}
}
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, did_some_progress,
pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
} else {
/*
* High-order allocations do not necessarily loop after
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*/
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, sync_migration,
&contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
}
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
return page;
got_pg:
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
return page;
}
When memory is likely to be scarce, the function above works through a series of increasingly aggressive measures to recover memory (waking kswapd, compaction, direct reclaim, and finally the OOM killer) and retries the allocation after each one. For a more detailed explanation, see:
http://blog.chinaunix.net/uid-25845340-id-3033899.html