chipset: MSM8X25Q
Codebase: Android4.1
Kernel: 3.4.0
基本概念:
关于伙伴系统算法的原理还是比较好理解的,这里不作赘述。直接看下关键数据结构。
[html] view plain copy print ?
- struct zone {
- ~~snip
- struct free_area free_area[MAX_ORDER]; //每一阶以一个元素保存,平台最大11阶。
- ~~snip
- };
struct zone {
~~snip
struct free_area free_area[MAX_ORDER]; //每一阶以一个元素保存,平台最大11阶。
~~snip
};
可以看到每个zone都有它自己的free_area,效果如下图:
相关信息可以从/proc/buddyinfo读取到,各个值含义依次为节点号, zone类型,后面的值表示从0阶开始各个阶空闲的页数:
[html] view plain copy print ?
- #cat/proc/buddyinfo
- Node0, zone Normal 70 27
- 33 148 47 13 7 2
- 2 1 1
- Node0, zone HighMem 0 1 1 2 2 0 0 0
- 0 0 0
#cat/proc/buddyinfo
Node0, zone Normal 70 27
33 148 47 13 7 2
2 1 1
Node0, zone HighMem 0 1 1 2 2 0 0 0
0 0 0
[html] view plain copy print ?
- struct free_area {
- /*同样阶数保存到一个链表上 */
- struct list_head free_list[MIGRATE_TYPES];
- /*当前空闲页块的数目。*/
- unsigned long nr_free;
- };
struct free_area {
/* One list per migrate type; all blocks on a list have the same order. */
struct list_head free_list[MIGRATE_TYPES];
/* Number of free blocks of this order, summed over all migrate types. */
unsigned long nr_free;
};
效果如下图:
MIGRATE_TYPES是作为反碎片的一种机制,专有名叫迁移类型。大概的原理就是将伙伴系统的内存页分为几种类型,有可移动,不可移动,可回收等,同一类型的页放在一个区域,如不可回收的页不能放在可移动类型区域,这样对可以移动区域,伙伴系统就可以回收了。
[html] view plain copy print ?
- enum {
- MIGRATE_UNMOVABLE,
- MIGRATE_RECLAIMABLE,
- MIGRATE_MOVABLE,
- MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
- MIGRATE_RESERVE = MIGRATE_PCPTYPES,
- #ifdef CONFIG_CMA
- MIGRATE_CMA,
- #endif
- MIGRATE_ISOLATE, /* can't allocate from here */
- MIGRATE_TYPES
- };
/* Page-mobility (anti-fragmentation) migrate types; index into free_area.free_list[]. */
enum {
/* Pages with a fixed location in memory; most core kernel allocations. */
MIGRATE_UNMOVABLE,
/* Pages that cannot be moved but can be reclaimed (dropped and rebuilt). */
MIGRATE_RECLAIMABLE,
/* Pages that can be migrated by rewriting page tables; user-space pages. */
MIGRATE_MOVABLE,
MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
MIGRATE_RESERVE = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
/* Contiguous Memory Allocator pool; movable allocations may borrow from it. */
MIGRATE_CMA,
#endif
MIGRATE_ISOLATE, /* can't allocate from here */
MIGRATE_TYPES
};
MIGRATE_UNMOVABLE:不可移动页,在内存中有固定位置,不能移动。核心内核分配的大部分内存属于此类。
MIGRATE_RECLAIMABLE:可回收页,不能移动,但能删除。Kswapd内核线程会操作此区域。
MIGRATE_MOVABLE:可移动又可回收页,用户空间程序使用此类,通过页表映射实现,如果应用程序虚拟地址空间有变化,只要变化页表就可以了。
MIGRATE_RESERVE: 当系统内存相当少而且比较紧急时,才用到此区域。
MIGRATE_CMA:这个是为了避免预留大块内存实现的,当需要大块内存的时候如audio/camera等,它可以被使用;当小内存申请需要时,它也可以被使用,避免了pmem/ion的弊端,不过似乎要基于DMA。后面打算用一篇文章来分析cma.
MIGRATE_ISOLATE: 用于临时隔离页面(如内存热插拔、CMA分配时将页块隔离出来),不能从该类型分配页面。
当某个迁移类型的内存不足时,会向另外一个迁移类型去要内存。这个跟zone的申请机制很像!下面结构规定了当前迁移类型不够时下一个使用的类型,如MIGRATE_UNMOVABLE的使用顺序是: MIGRATE_RECLAIMABLE -> MIGRATE_MOVABLE -> MIGRATE_RESERVE.
[html] view plain copy print ?
- static int fallbacks[MIGRATE_TYPES][4] = {
- [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- #ifdef CONFIG_CMA
- [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
- [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
- #else
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
- #endif
- [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
- [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
- };
/*
 * Per-migrate-type fallback order, tried left to right when the requested
 * type has no free pages of a suitable order. MIGRATE_RESERVE terminates
 * the search (it is handled separately as the last resort).
 */
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
#ifdef CONFIG_CMA
[MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
[MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
#else
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#endif
[MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
[MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
};
迁移类型的信息可以从/proc/pagetypeinfo读到:
[html] view plain copy print ?
- #cat /proc/pagetypeinfo
- Page block order: 10
- Pages per block: 1024
-
- Free pages count per migrate type at order 0 1 2 3 4
- 5 6 7 8 9 10
- Node 0, zone Normal, type Unmovable 1 1 2 0 1
- 0 1 1 1 0 0
- Node 0, zone Normal, type Reclaimable 8 18 3 0 0
- 0 0 0 0 0 0
- Node 0, zone Normal, type Movable 1 1 0 115 46
- 12 5 0 0 0 0
- Node 0, zone Normal, type Reserve 0 0 0 1 0
- 1 1 1 1 1 1
- Node 0, zone Normal, type Isolate 0 0 0 0 0
- 0 0 0 0 0 0
- Node 0, zone HighMem, type Unmovable 0 0 0 0 0
- 0 0 0 0 0 0
- Node 0, zone HighMem, type Reclaimable 0 0 0 0 0
- 0 0 0 0 0 0
- Node 0, zone HighMem, type Movable 0 1 0 0 0
- 0 0 0 0 0 0
- Node 0, zone HighMem, type Reserve 0 0 1 2 2
- 0 0 0 0 0 0
- Node 0, zone HighMem, type Isolate 0 0 0 0 0
- 0 0 0 0 0 0
-
- Number of blocks type Unmovable Reclaimable Movable Reserve
- Isolate
- Node 0, zone Normal 13 8 178 2
- 0
- Node 0, zone HighMem 1 0 14 1
- 0
#cat /proc/pagetypeinfo
Page block order: 10
Pages per block: 1024
Free pages count per migrate type at order 0 1 2 3 4
5 6 7 8 9 10
Node 0, zone Normal, type Unmovable 1 1 2 0 1
0 1 1 1 0 0
Node 0, zone Normal, type Reclaimable 8 18 3 0 0
0 0 0 0 0 0
Node 0, zone Normal, type Movable 1 1 0 115 46
12 5 0 0 0 0
Node 0, zone Normal, type Reserve 0 0 0 1 0
1 1 1 1 1 1
Node 0, zone Normal, type Isolate 0 0 0 0 0
0 0 0 0 0 0
Node 0, zone HighMem, type Unmovable 0 0 0 0 0
0 0 0 0 0 0
Node 0, zone HighMem, type Reclaimable 0 0 0 0 0
0 0 0 0 0 0
Node 0, zone HighMem, type Movable 0 1 0 0 0
0 0 0 0 0 0
Node 0, zone HighMem, type Reserve 0 0 1 2 2
0 0 0 0 0 0
Node 0, zone HighMem, type Isolate 0 0 0 0 0
0 0 0 0 0 0
Number of blocks type Unmovable Reclaimable Movable Reserve
Isolate
Node 0, zone Normal 13 8 178 2
0
Node 0, zone HighMem 1 0 14 1
0
初始化:
首先对伙伴系统相关数据结构初始化,有如下调用流程:
start_kernel -> setup_arch ->paging_init -> bootmem_init -> arm_bootmem_free ->
free_area_init_node ->init_currently_empty_zone -> zone_init_free_lists
[html] view plain copy print ?
- static void __meminit zone_init_free_lists(struct zone *zone)
- {
- int order, t;
- /*对每种迁移类型每个order初始化free_list和nr_free。*/
- for_each_migratetype_order(order, t) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
- zone->free_area[order].nr_free = 0;
- }
- }
-
- #define for_each_migratetype_order(order, type) \
- for (order = 0; order < MAX_ORDER; order++) \
- for (type = 0; type < MIGRATE_TYPES; type++)
-
- 初始化好数据结构之后,先要得到系统当前空闲的可供伙伴系统分配的页,由于在伙伴系统初始化之前使用的是bootmem分配器,所以现在是该释放bootmem分配器所管理的内存部分了。调用流程如下:
-
- start_kernel -> mm_init -> mem_init.
- /*
- * mem_init() marks the free areas in the mem_map and tells us how much
- * memory is free. This is done after various parts of the system have
- * claimed their memory after the kernel image.
- */
- void __init mem_init(void)
- {
- unsigned long reserved_pages, free_pages;
- struct memblock_region *reg;
- int i;
-
- max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;
-
- /* this will put all unused low memory onto the freelists */
- /*标记可以使用的内存页*/
- free_unused_memmap(&meminfo);
- /*由bootmem分配器管理的空闲内存部分都会被释放,
- 释放的接口是__free_pages (),后面会说道调用这个接口最终会
- 被放到伙伴系统的free_list上面。另外,bootmem分配器到这里
- 也结束生命了。*/
- totalram_pages += free_all_bootmem();
- /*高端内存空闲页也被释放到free_list中。*/
- free_highpages();
-
- reserved_pages = free_pages = 0;
- /*统计当前物理内存空闲页和保留页各有多少。*/
- for_each_bank(i, &meminfo) {
- struct membank *bank = &meminfo.bank[i];
- unsigned int pfn1, pfn2;
- struct page *page, *end;
-
- pfn1 = bank_pfn_start(bank);
- pfn2 = bank_pfn_end(bank);
-
- page = pfn_to_page(pfn1);
- end = pfn_to_page(pfn2 - 1) + 1;
-
- do {
- if (PageReserved(page))
- reserved_pages++;
- else if (!page_count(page))
- free_pages++;
- page++;
- #ifdef CONFIG_SPARSEMEM
- pfn1++;
- if (!(pfn1 % PAGES_PER_SECTION))
- page = pfn_to_page(pfn1);
- } while (pfn1 < pfn2);
- #else
- } while (page < end);
- #endif
- }
-
- /*
- * Since our memory may not be contiguous, calculate the
- * real number of pages we have in this system
- */
- printk(KERN_INFO "Memory:");
- num_physpages = 0;
- for_each_memblock(memory, reg) {
- unsigned long pages = memblock_region_memory_end_pfn(reg) -
- memblock_region_memory_base_pfn(reg);
- num_physpages += pages;
- printk(" %ldMB", pages >> (20 - PAGE_SHIFT));
- }
- printk(" = %luMB total\n", num_physpages >> (20 - PAGE_SHIFT));
-
- printk(KERN_NOTICE "Memory: %luk/%luk available, %luk reserved, %luK highmem\n",
- nr_free_pages() << (PAGE_SHIFT-10),
- free_pages << (PAGE_SHIFT-10),
- reserved_pages << (PAGE_SHIFT-10),
- totalhigh_pages << (PAGE_SHIFT-10));
-
- #define MLK(b, t) b, t, ((t) - (b)) >> 10
- #define MLM(b, t) b, t, ((t) - (b)) >> 20
- #define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)
-
- /*打印出各个区域的起始和结束地址。*/
- printk(KERN_NOTICE "Virtual kernel memory layout:\n"
- " vector : 0x%08lx - 0x%08lx (%4ld kB)\n"
- #ifdef CONFIG_HAVE_TCM
- " DTCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
- " ITCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
- #endif
- " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
- " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
- " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
- #ifdef CONFIG_HIGHMEM
- " pkmap : 0x%08lx - 0x%08lx (%4ld MB)\n"
- #endif
- #ifdef CONFIG_MODULES
- " modules : 0x%08lx - 0x%08lx (%4ld MB)\n"
- #endif
- " .text : 0x%p" " - 0x%p" " (%4d kB)\n"
- " .init : 0x%p" " - 0x%p" " (%4d kB)\n"
- " .data : 0x%p" " - 0x%p" " (%4d kB)\n"
- " .bss : 0x%p" " - 0x%p" " (%4d kB)\n",
-
- MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
- (PAGE_SIZE)),
- MLK(FIXADDR_START, FIXADDR_TOP),
- MLM(VMALLOC_START, VMALLOC_END),
- MLM(PAGE_OFFSET, (unsigned long)high_memory),
- #ifdef CONFIG_HIGHMEM
- MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) *
- (PAGE_SIZE)),
- #endif
- #ifdef CONFIG_MODULES
- MLM(MODULES_VADDR, MODULES_END),
- #endif
-
- MLK_ROUNDUP(_text, _etext),
- MLK_ROUNDUP(__init_begin, __init_end),
- MLK_ROUNDUP(_sdata, _edata),
- MLK_ROUNDUP(__bss_start, __bss_stop));
- ~~snip
- }
/* Reset every buddy free list of @zone to empty and zero its counters. */
static void __meminit zone_init_free_lists(struct zone *zone)
{
int order, t;
/* Walk every (order, migratetype) pair, initialising free_list and nr_free. */
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
}
/* Iterate over every (order, migratetype) combination of a zone's free areas. */
#define for_each_migratetype_order(order, type) \
for (order = 0; order < MAX_ORDER; order++) \
for (type = 0; type < MIGRATE_TYPES; type++)
初始化好数据结构之后,先要得到系统当前空闲的可供伙伴系统分配的页,由于在伙伴系统初始化之前使用的是bootmem分配器,所以现在是该释放bootmem分配器所管理的内存部分了。调用流程如下:
start_kernel -> mm_init -> mem_init.
/*
* mem_init() marks the free areas in the mem_map and tells us how much
* memory is free. This is done after various parts of the system have
* claimed their memory after the kernel image.
*/
void __init mem_init(void)
{
unsigned long reserved_pages, free_pages;
struct memblock_region *reg;
int i;
max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;
/* this will put all unused low memory onto the freelists */
/* Mark the memmap pages that are actually usable. */
free_unused_memmap(&meminfo);
/* Release all free memory still managed by the bootmem allocator.
   Internally this ends in __free_pages(), so the pages land on the buddy
   free_lists (shown later); the bootmem allocator's life ends here. */
totalram_pages += free_all_bootmem();
/* Free highmem pages onto the buddy free_lists as well. */
free_highpages();
reserved_pages = free_pages = 0;
/* Count how many physical pages are currently free vs. reserved. */
for_each_bank(i, &meminfo) {
struct membank *bank = &meminfo.bank[i];
unsigned int pfn1, pfn2;
struct page *page, *end;
pfn1 = bank_pfn_start(bank);
pfn2 = bank_pfn_end(bank);
page = pfn_to_page(pfn1);
end = pfn_to_page(pfn2 - 1) + 1;
do {
if (PageReserved(page))
reserved_pages++;
else if (!page_count(page))
free_pages++;
page++;
#ifdef CONFIG_SPARSEMEM
pfn1++;
if (!(pfn1 % PAGES_PER_SECTION))
page = pfn_to_page(pfn1);
} while (pfn1 < pfn2);
#else
} while (page < end);
#endif
}
/*
 * Since our memory may not be contiguous, calculate the
 * real number of pages we have in this system
 */
printk(KERN_INFO "Memory:");
num_physpages = 0;
for_each_memblock(memory, reg) {
unsigned long pages = memblock_region_memory_end_pfn(reg) -
memblock_region_memory_base_pfn(reg);
num_physpages += pages;
printk(" %ldMB", pages >> (20 - PAGE_SHIFT));
}
printk(" = %luMB total\n", num_physpages >> (20 - PAGE_SHIFT));
printk(KERN_NOTICE "Memory: %luk/%luk available, %luk reserved, %luK highmem\n",
nr_free_pages() << (PAGE_SHIFT-10),
free_pages << (PAGE_SHIFT-10),
reserved_pages << (PAGE_SHIFT-10),
totalhigh_pages << (PAGE_SHIFT-10));
#define MLK(b, t) b, t, ((t) - (b)) >> 10
#define MLM(b, t) b, t, ((t) - (b)) >> 20
#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)
/* Print the start/end addresses of each virtual memory region. */
printk(KERN_NOTICE "Virtual kernel memory layout:\n"
" vector : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HAVE_TCM
" DTCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
" ITCM : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
#ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
#ifdef CONFIG_MODULES
" modules : 0x%08lx - 0x%08lx (%4ld MB)\n"
#endif
" .text : 0x%p" " - 0x%p" " (%4d kB)\n"
" .init : 0x%p" " - 0x%p" " (%4d kB)\n"
" .data : 0x%p" " - 0x%p" " (%4d kB)\n"
" .bss : 0x%p" " - 0x%p" " (%4d kB)\n",
MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
(PAGE_SIZE)),
MLK(FIXADDR_START, FIXADDR_TOP),
MLM(VMALLOC_START, VMALLOC_END),
MLM(PAGE_OFFSET, (unsigned long)high_memory),
#ifdef CONFIG_HIGHMEM
MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) *
(PAGE_SIZE)),
#endif
#ifdef CONFIG_MODULES
MLM(MODULES_VADDR, MODULES_END),
#endif
MLK_ROUNDUP(_text, _etext),
MLK_ROUNDUP(__init_begin, __init_end),
MLK_ROUNDUP(_sdata, _edata),
MLK_ROUNDUP(__bss_start, __bss_stop));
~~snip
}
到此,系统空闲的内存都交由伙伴系统管理了!
内存分配:
调用的接口有如下几个:
[html] view plain copy print ?
- #define alloc_pages(gfp_mask, order) \
- alloc_pages_node(numa_node_id(), gfp_mask, order)
- #define alloc_pages_vma(gfp_mask, order, vma, addr, node) \
- alloc_pages(gfp_mask, order)
- #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
- #define alloc_page_vma(gfp_mask, vma, addr) \
- alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
- #define alloc_page_vma_node(gfp_mask, vma, addr, node) \
- alloc_pages_vma(gfp_mask, 0, vma, addr, node)
#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)
#define alloc_pages_vma(gfp_mask, order, vma, addr, node) \
alloc_pages(gfp_mask, order)
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr) \
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
alloc_pages_vma(gfp_mask, 0, vma, addr, node)
不过最终调用的都是__alloc_pages_nodemask()
__alloc_pages_nodemask()
[html] view plain copy print ?
- struct page *
- __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
- {
- /*得到分配者所需要的对应zone区域。*/
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- struct zone *preferred_zone;
- struct page *page = NULL;
-
- /*根据分配者存在gfp_mask中的值来得到对应的表示用哪种迁移类型。*/
- int migratetype = allocflags_to_migratetype(gfp_mask);
- unsigned int cpuset_mems_cookie;
-
- gfp_mask &= gfp_allowed_mask;
-
- ~~snip
- /*第一次尝试快速分配,可能从pcp高速缓存页分配,也可以从free_list上分配。
- 这是最简单的分配状况。*/
- /* First allocation attempt */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
- preferred_zone, migratetype);
- /*如果分配失败,则尝试再一次慢速分配,可能需要等页面回收等之后才有可用内存页。*/
- if (unlikely(!page))
- page = __alloc_pages_slowpath(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
- ~snip
-
- return page;
- }
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
/* Resolve the highest zone this allocation may use from the gfp flags. */
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
struct page *page = NULL;
/* Derive the migrate type to allocate from out of the gfp flags. */
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
gfp_mask &= gfp_allowed_mask;
~~snip
/* Fast path first: may be served from the per-cpu page (pcp) cache or
   straight from a free_list. This is the simplest allocation case. */
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
preferred_zone, migratetype);
/* On failure fall back to the slow path, which may have to wait for
   page reclaim etc. before free pages become available. */
if (unlikely(!page))
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
~snip
return page;
}
get_page_from_freelist():
[html] view plain copy print ?
- static struct page *
- get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
- struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int migratetype)
- {
- struct zoneref *z;
- struct page *page = NULL;
- int classzone_idx;
- struct zone *zone;
- nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
- int zlc_active = 0; /* set if using zonelist_cache */
- int did_zlc_setup = 0; /* just call zlc_setup() one time */
-
- classzone_idx = zone_idx(preferred_zone);
- zonelist_scan:
- /*
- * Scan zonelist, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- */
- /*扫描整个zonlist列表,我们是UMA,所以只有一个了。
- 不过还是会在当前zone内存不足的情况下,依次扫描下个zone
- 是否有空闲页。*/
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask) {
- /*检查是否在运行的node上分配内存*/
- if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed_softwall(zone, gfp_mask))
- continue;
- ~~snip
- /*伙伴系统提供了一个水位线机制来合理的分配内存。从名字
- 可以想到,肯定有对应的水位高低之分,高出或者低于肯定有想对应的
- 操作。*/
- if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
- unsigned long mark;
- int ret;
-
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
- /*检查水位线是否正常。*/
- if (zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- goto try_this_zone;
-
- ~~snip
- /*跑到这里表示水位线不正常了,这里为0
- 表示没有设置页面回收模式。*/
- if (zone_reclaim_mode == 0)
- goto this_zone_full;
- /*开始回收本zone相应页面,不过函数只针对NUMA,
- UMA直接返回了0.*/
- ret = zone_reclaim(zone, gfp_mask, order);
- switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
- /* did not scan */
- continue;
- case ZONE_RECLAIM_FULL:
- /* scanned but unreclaimable */
- continue;
- default:
- /* did we reclaim enough */
- /*回收过后重新判断水位线。*/
- if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- /*不正常则表示此zone已满!*/
- goto this_zone_full;
- }
- }
- /*跑到这表示此zone可以分配。*/
- try_this_zone:
- /*根据order的值来决定从pcp还是free_list上分配。*/
- page = buffered_rmqueue(preferred_zone, zone, order,
- gfp_mask, migratetype);
- if (page)
- break;
- this_zone_full:
- ~~snip
- }
-
- ~~snip
- return page;
- }
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
 * Scan zonelist, looking for a zone with enough free.
 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 */
/* Walk the whole zonelist. A UMA system has a single node, but when the
   current zone is short of memory the following zones are still scanned
   in turn for free pages. */
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
/* Check whether this zone's node is allowed by the cpuset. */
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
~~snip
/* The buddy system paces allocations with watermarks: each zone
   carries high/low marks, and crossing one triggers the matching
   corrective action. */
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
/* Is the zone still above the selected watermark? */
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
~~snip
/* Watermark breached; zone_reclaim_mode == 0 means no page
   reclaim mode is configured, so give up on this zone. */
if (zone_reclaim_mode == 0)
goto this_zone_full;
/* Try reclaiming pages from this zone. The function is only
   effective on NUMA; on UMA it simply returns 0. */
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
/* Re-check the watermark after reclaim. */
if (!zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
/* Still below the mark: treat this zone as full. */
goto this_zone_full;
}
}
/* Reaching here means this zone can satisfy the allocation. */
try_this_zone:
/* order decides whether the pcp cache or the free_list is used. */
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
if (page)
break;
this_zone_full:
~~snip
}
~~snip
return page;
}
buffered_rmqueue():
[html] view plain copy print ?
- static inline
- struct page *buffered_rmqueue(struct zone *preferred_zone,
- struct zone *zone, int order, gfp_t gfp_flags,
- int migratetype)
- {
- unsigned long flags;
- struct page *page;
- /*分配的是pcp冷页还是热页,热页表示也存在硬件高速缓冲中。
- 而冷页没有。一般使用的都是热页。*/
- int cold = !!(gfp_flags & __GFP_COLD);
-
- again:
- /*当order为0也就是只分配一页的时候,为了提高效率,
- 直接从pcp中去获取。*/
- if (likely(order == 0)) {
- struct per_cpu_pages *pcp;
- struct list_head *list;
-
- local_irq_save(flags);
- /*每个cpu对应一个pcp,pcp名字由来就是
- Per cpu pageset.*/
- pcp = &this_cpu_ptr(zone->pageset)->pcp;
- /*从当前迁移类型上得到list,以判断是否有空闲页。*/
- list = &pcp->lists[migratetype];
- /*pcp本质上也是从伙伴系统的free list中获得的。*/
- if (list_empty(list)) {
- /*从free list上申请batch个页放入pcp备用,最终调用
- 的是__rmqueue()标准申请接口,下面会分析到。*/
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
- if (unlikely(list_empty(list)))
- goto failed;
- }
- /*冷页从list最后取,热页从最前面取, 由存放顺序决定。*/
- if (cold)
- page = list_entry(list->prev, struct page, lru);
- else
- page = list_entry(list->next, struct page, lru);
- /*被申请掉后从空闲List中移除。*/
- list_del(&page->lru);
- pcp->count--;
- } else {
- if (unlikely(gfp_flags & __GFP_NOFAIL)) {
- WARN_ON_ONCE(order > 1);
- }
- spin_lock_irqsave(&zone->lock, flags);
- /*从伙伴系统上的free list申请。*/
- page = __rmqueue(zone, order, migratetype);
- spin_unlock(&zone->lock);
- if (!page)
- goto failed;
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
- }
- /*重新统计当前zone相关信息。*/
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(preferred_zone, zone, gfp_flags);
- local_irq_restore(flags);
-
- VM_BUG_ON(bad_range(zone, page));
- /*后续准备工作,如设置一些页相关标志,是否是zero page等。*/
- if (prep_new_page(page, order, gfp_flags))
- goto again;
- return page;
-
- failed:
- local_irq_restore(flags);
- return NULL;
- }
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
/* Cold vs. hot pcp page: a hot page is expected to still be in the CPU
   cache, a cold one is not. Most allocations ask for hot pages. */
int cold = !!(gfp_flags & __GFP_COLD);
again:
/* For single-page (order-0) requests, allocate straight from the
   per-cpu page cache for efficiency. */
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;
local_irq_save(flags);
/* One pcp per CPU -- the name comes from "per-cpu pageset". */
pcp = &this_cpu_ptr(zone->pageset)->pcp;
/* Pick the list of the requested migrate type to check for free pages. */
list = &pcp->lists[migratetype];
/* The pcp itself is refilled from the buddy free lists. */
if (list_empty(list)) {
/* Pull "batch" pages off the free lists into the pcp; this ends
   up in the standard __rmqueue() path analysed below. */
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, cold);
if (unlikely(list_empty(list)))
goto failed;
}
/* Cold pages are taken from the tail, hot ones from the head,
   matching the order they were stored in. */
if (cold)
page = list_entry(list->prev, struct page, lru);
else
page = list_entry(list->next, struct page, lru);
/* Unlink the page from the free list once claimed. */
list_del(&page->lru);
pcp->count--;
} else {
if (unlikely(gfp_flags & __GFP_NOFAIL)) {
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
/* Allocate from the buddy free lists. */
page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);
if (!page)
goto failed;
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
}
/* Update this zone's allocation statistics. */
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
VM_BUG_ON(bad_range(zone, page));
/* Final preparation: set page flags, handle zeroed pages, etc. */
if (prep_new_page(page, order, gfp_flags))
goto again;
return page;
failed:
local_irq_restore(flags);
return NULL;
}
__rmqueue():
[html] view plain copy print ?
- rmqueue_bulk()只是pcp调用__rmqueue()然后设置和pcp相关的一些参数,比较简单,这里不介绍了,直接看__rmqueue().
- static struct page *__rmqueue(struct zone *zone, unsigned int order,
- int migratetype)
- {
- struct page *page;
-
- retry_reserve:
- /*使用伙伴系统算法分配内存*/
- page = __rmqueue_smallest(zone, order, migratetype);
- /*如果失败了,而且当前迁移类型不是RESERVE,
- 那么尝试从下个迁移类型分配。分配次序前面有说明过了,
- 按照fallbacks 定义的顺序,MIGRATE_RESERVE表示很紧急的时候分配
- 如果它还是失败那没戏了。*/
- if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
- page = __rmqueue_fallback(zone, order, migratetype);
- if (!page) {
- /*还是失败的话那么只能用MIGRATE_RESERVE 类型的
- 去申请了。*/
- migratetype = MIGRATE_RESERVE;
- goto retry_reserve;
- }
- }
-
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
- return page;
- }
rmqueue_bulk()只是pcp调用__rmqueue()然后设置和pcp相关的一些参数,比较简单,这里不介绍了,直接看__rmqueue().
/*
 * Core buddy allocation entry: take a block of 2^order pages of the
 * requested migrate type from @zone's free lists.
 *
 * Falls back along the fallbacks[] order when the requested type is
 * exhausted, and finally retries with MIGRATE_RESERVE (the emergency
 * pool); if even that fails, NULL is returned.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
	int migratetype)
{
	struct page *page;

	do {
		/* Plain buddy-algorithm allocation from the requested type. */
		page = __rmqueue_smallest(zone, order, migratetype);
		if (page || migratetype == MIGRATE_RESERVE)
			break;
		/* Requested type is empty: try the fallback migrate types. */
		page = __rmqueue_fallback(zone, order, migratetype);
		/* If the fallbacks failed too, loop once more against the
		   reserve pool as the last resort. */
		migratetype = MIGRATE_RESERVE;
	} while (!page);

	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}
__rmqueue_smallest():
[html] view plain copy print ?
- static inline
- struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
- int migratetype)
- {
- unsigned int current_order;
- struct free_area * area;
- struct page *page;
- /*buddy算法还是很容易理解的。从当前要申请的order开始查找,
- 只有大于order的有空闲页,那就成功了!*/
- /* Find a page of the appropriate size in the preferred list */
- for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- /*取得当前order对应的free_area*/
- area = &(zone->free_area[current_order]);
- /*没有空闲页则查找更大的order。*/
- if (list_empty(&area->free_list[migratetype]))
- continue;
- /*跑到这里表示已经找到,取出free_list中的一页。*/
- page = list_entry(area->free_list[migratetype].next,
- struct page, lru);
- /*从free_list中删掉。*/
- list_del(&page->lru);
- rmv_page_order(page);
- /*空闲页减少。*/
- area->nr_free--;
- /*拆分合并此order,因为一部分被使用了,剩下一部分还空闲着,
- 会被安排到低于当前找到order的free_list上,重新排列buddy内存布局*/
- expand(zone, page, order, current_order, area, migratetype);
- return page;
- }
-
- return NULL;
- }
/*
 * Classic buddy search: starting at the requested @order, walk upward
 * through the free areas of @zone until some order has a free block of
 * @migratetype. The block is unlinked, and any surplus (when a larger
 * block had to be taken) is split back onto the lower-order lists by
 * expand(). Returns the allocated block, or NULL if nothing fits.
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
	int migratetype)
{
	unsigned int ord;
	struct free_area *slot;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (ord = order; ord < MAX_ORDER; ++ord) {
		slot = &(zone->free_area[ord]);
		/* Nothing free at this order: try the next larger one. */
		if (list_empty(&slot->free_list[migratetype]))
			continue;

		/* Found one -- take the first block off the free list. */
		page = list_entry(slot->free_list[migratetype].next,
				  struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		/* One fewer free block at this order. */
		slot->nr_free--;
		/* Split the leftover part of an oversized block back onto
		   the lower-order free lists. */
		expand(zone, page, order, ord, slot, migratetype);
		return page;
	}

	return NULL;
}
expand():
[html] view plain copy print ?
- static inline void expand(struct zone *zone, struct page *page,
- int low, int high, struct free_area *area,
- int migratetype)
- {
- unsigned long size = 1 << high;
- /*low和high分别表示要申请的order和现在找到的order*/
- while (high > low) {
- /*使用低一阶的area, order小一阶, size也减半*/
- area--;
- high--;
- size >>= 1;
- VM_BUG_ON(bad_range(zone, &page[size]));
- /*从size开始的page插入到当前area的freelist中*/
- list_add(&page[size].lru, &area->free_list[migratetype]);
- area->nr_free++;
- /*保存当前order当struct page中。*/
- set_page_order(&page[size], high);
- }
- }
/*
 * Split a free block of order @high down to order @low.
 *
 * @page heads the whole block just removed from the order-@high free
 * list (@area). Each iteration halves the block: the upper half goes
 * onto the next-lower order's free list for @migratetype and is tagged
 * with its new order, leaving an order-@low chunk for the caller.
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long half = 1UL << high;

	while (low < high) {
		/* Step down one order: previous area, halved block size. */
		area--;
		high--;
		half >>= 1;
		VM_BUG_ON(bad_range(zone, &page[half]));
		/* Give the upper half back to this order's free list. */
		list_add(&page[half].lru, &area->free_list[migratetype]);
		area->nr_free++;
		/* Record the new order in the block's leading struct page. */
		set_page_order(&page[half], high);
	}
}
所以如果第一次分配就成功,buddy算法流程相当简单的。如果当前迁移类型分配失败,
那么就要从下一个迁移类型上去分配了!来看__rmqueue_fallback().
__rmqueue_fallback():
[html] view plain copy print ?
- static inline struct page *
- __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
- {
- struct free_area * area;
- int current_order;
- struct page *page;
- int migratetype, i;
- /*从最高阶开始搜索,大块内存申请/分割可以避免更少的碎片。*/
- /* Find the largest possible block of pages in the other list */
- for (current_order = MAX_ORDER-1; current_order >= order;
- --current_order) {
- for (i = 0;; i++) {
- /*得到数组中指定顺序的迁移类型。*/
- migratetype = fallbacks[start_migratetype][i];
- /*保留内存晚点操作。*/
- /* MIGRATE_RESERVE handled later if necessary */
- if (migratetype == MIGRATE_RESERVE)
- break;
- /*取得area,判断list,为空表示当前迁移类型也没有空闲页。*/
- area = &(zone->free_area[current_order]);
- if (list_empty(&area->free_list[migratetype]))
- continue;
-
- page = list_entry(area->free_list[migratetype].next,
- struct page, lru);
- area->nr_free--;
-
- /*
- * If breaking a large block of pages, move all free
- /*当前不是cma迁移类型,或者order比一个pageblock的order的
- 一半要大,或者是可回收类型,或者定义了迁移策略时,移动内存页到先前申请的迁移类型中去。*/
- if (!is_migrate_cma(migratetype) &&
- (unlikely(current_order >= pageblock_order / 2) ||
- start_migratetype == MIGRATE_RECLAIMABLE ||
- page_group_by_mobility_disabled)) {
- int pages;
- /*移动空闲页到先前迁移类型中去。*/
- pages = move_freepages_block(zone, page,
- start_migratetype);
- /*当当前内存块大部分已经迁移到先前类型中或者定义了迁移策略时,
- 那么就将这一整块全部迁移过去。*/
- /* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled)
- set_pageblock_migratetype(page,
- start_migratetype);
-
- migratetype = start_migratetype;
- }
-
- /* Remove the page from the freelists */
- list_del(&page->lru);
- rmv_page_order(page);
-
- /* Take ownership for orders >= pageblock_order */
- if (current_order >= pageblock_order &&
- !is_migrate_cma(migratetype))
- change_pageblock_range(page, current_order,
- start_migratetype);
- /*拆分/合并*/
- expand(zone, page, order, current_order, area,
- is_migrate_cma(migratetype)
- ? migratetype : start_migratetype);
-
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype);
-
- return page;
- }
- }
- return NULL;
- }
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area * area;
int current_order;
struct page *page;
int migratetype, i;
/* Search from the highest order downward: stealing and splitting a
   large block produces less fragmentation than many small steals. */
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1; current_order >= order;
--current_order) {
for (i = 0;; i++) {
/* Next fallback migrate type in the configured order. */
migratetype = fallbacks[start_migratetype][i];
/* The reserve pool is handled elsewhere, not here. */
/* MIGRATE_RESERVE handled later if necessary */
if (migratetype == MIGRATE_RESERVE)
break;
/* An empty list means this fallback type has no free pages
   at this order either. */
area = &(zone->free_area[current_order]);
if (list_empty(&area->free_list[migratetype]))
continue;
page = list_entry(area->free_list[migratetype].next,
struct page, lru);
area->nr_free--;
/*
 * If breaking a large block of pages, move all free
 * pages to the preferred allocation list. If falling
 * back for a reclaimable kernel allocation, be more
 * aggressive about taking ownership of free pages.
 */
/* Unless this is a CMA block: when the stolen order is at least
   half a pageblock, the original request was RECLAIMABLE, or
   grouping by mobility is disabled, move the block's free pages
   over to the originally requested migrate type. */
if (!is_migrate_cma(migratetype) &&
(unlikely(current_order >= pageblock_order / 2) ||
start_migratetype == MIGRATE_RECLAIMABLE ||
page_group_by_mobility_disabled)) {
int pages;
/* Move the free pages to the requesting migrate type. */
pages = move_freepages_block(zone, page,
start_migratetype);
/* If most of the block moved (or mobility grouping is off),
   claim the whole pageblock for the requesting type. */
/* Claim the whole block if over half of it is free */
if (pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page,
start_migratetype);
migratetype = start_migratetype;
}
/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order &&
!is_migrate_cma(migratetype))
change_pageblock_range(page, current_order,
start_migratetype);
/* Split the surplus back onto the lower-order lists. */
expand(zone, page, order, current_order, area,
is_migrate_cma(migratetype)
? migratetype : start_migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype);
return page;
}
}
return NULL;
}
以上的内存分配都是基于分配比较顺利的情况,如果分配依然失败,那么只能使用慢速分配机制了!so 继续看__alloc_pages_slowpath().
__alloc_pages_slowpath():
[html] view plain copy print ?
- static inline struct page *
- __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
- {
- const gfp_t wait = gfp_mask & __GFP_WAIT;
- struct page *page = NULL;
- int alloc_flags;
- unsigned long pages_reclaimed = 0;
- unsigned long did_some_progress;
- bool sync_migration = false;
- bool deferred_compaction = false;
- ~~snip
- restart:
- /*如果没有禁止内存回收的话就唤醒交换线程来进行内存页面回收, 写回或
- 换出很少使用的页到磁盘上。*/
- if (!(gfp_mask & __GFP_NO_KSWAPD))
- wake_all_kswapd(order, zonelist, high_zoneidx,
- zone_idx(preferred_zone));
-
- /*为了更积极地尝试分配,将水位线降低以便更有可能分配成功。*/
- alloc_flags = gfp_to_alloc_flags(gfp_mask);
-
- /*
- * Find the true preferred zone if the allocation is unconstrained by
- * cpusets.
- */
- if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
- first_zones_zonelist(zonelist, high_zoneidx, NULL,
- &preferred_zone);
-
- rebalance:
- /*降低水位线之后重新尝试分配。*/
- /* This is the last chance, in general, before the goto nopage. */
- page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
- high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
- if (page)
- goto got_pg;
- /*如果不考虑水位线,那么继续尝试分配。如果定义了__GFP_NOFAIL,
- 那么此函数会不断尝试分配,直到成功。*/
- /* Allocate without watermarks if the context allows */
- if (alloc_flags & ALLOC_NO_WATERMARKS) {
- page = __alloc_pages_high_priority(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
- if (page)
- goto got_pg;
- }
- /*没定义等待,如在中断上下文中需要原子分配,所以直接返回失败。*/
- /* Atomic allocations - we can't balance anything */
- if (!wait)
- goto nopage;
- /*分配器自身需要更多内存,避免递归调用。*/
- /* Avoid recursion of direct reclaim */
- if (current->flags & PF_MEMALLOC)
- goto nopage;
- /*oom killer选中的线程才会设置TIF_MEMDIE标志,当然如果不允许失败的
- 话那就循环等待其他线程释放内存。*/
- /* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
-
- /*尝试压缩内存再尝试分配。*/
- page = __alloc_pages_direct_compact(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- migratetype, sync_migration,
- &deferred_compaction,
- &did_some_progress);
- if (page)
- goto got_pg;
- ~~snip
- /*自己直接去回收内存,然后尝试分配。*/
- /* Try direct reclaim and then allocating */
- page = __alloc_pages_direct_reclaim(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- migratetype, &did_some_progress);
- if (page)
- goto got_pg;
-
- /*如果还是失败的话那就使用oom killer杀掉一些线程,再尝试分配。*/
- if (!did_some_progress) {
- if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
- if (oom_killer_disabled)
- goto nopage;
- /* Coredumps can quickly deplete all memory reserves */
- if ((current->flags & PF_DUMPCORE) &&
- !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
- page = __alloc_pages_may_oom(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask, preferred_zone,
- migratetype);
- if (page)
- goto got_pg;
-
- if (!(gfp_mask & __GFP_NOFAIL)) {
- /*
- * The oom killer is not called for high-order
- * allocations that may fail, so if no progress
- * is being made, there are no other options and
- * retrying is unlikely to help.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- goto nopage;
- /*
- * The oom killer is not called for lowmem
- * allocations to prevent needlessly killing
- * innocent tasks.
- */
- if (high_zoneidx < ZONE_NORMAL)
- goto nopage;
- }
-
- goto restart;
- }
- }
- /*是否需要等待一会再尝试重新分配?*/
- /* Check if we should retry the allocation */
- pages_reclaimed += did_some_progress;
- if (should_alloc_retry(gfp_mask, order, did_some_progress,
- pages_reclaimed)) {
- /* Wait for some write requests to complete then retry */
- wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
- goto rebalance;
- } else {
- /*在做过内存回收之后再使用内存压缩分配试试看。*/
- page = __alloc_pages_direct_compact(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- migratetype, sync_migration,
- &deferred_compaction,
- &did_some_progress);
- if (page)
- goto got_pg;
- }
- /*到这里表示真分配失败了,打印警告信息。*/
- nopage:
- warn_alloc_failed(gfp_mask, order, NULL);
- return page;
- got_pg:
- if (kmemcheck_enabled)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
- return page;
-
- }
static inline struct page *
/*
 * Slow path of the page allocator: the fast get_page_from_freelist()
 * attempt failed, so progressively apply stronger measures — wake kswapd,
 * retry with lowered watermarks, ignore watermarks, compact, directly
 * reclaim, and finally invoke the OOM killer.  (Body partially elided
 * by the article — see the full kernel source.)
 */
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int migratetype)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
bool sync_migration = false;
bool deferred_compaction = false;
~~snip
restart:
/* Unless reclaim is forbidden, wake the kswapd threads so rarely used
 * pages get written back or swapped out to disk in the background. */
if (!(gfp_mask & __GFP_NO_KSWAPD))
wake_all_kswapd(order, zonelist, high_zoneidx,
zone_idx(preferred_zone));
/* Compute more aggressive allocation flags (lower watermarks) so the
 * retry below is more likely to succeed. */
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
 * Find the true preferred zone if the allocation is unconstrained by
 * cpusets.
 */
if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
first_zones_zonelist(zonelist, high_zoneidx, NULL,
&preferred_zone);
rebalance:
/* Retry the allocation with the lowered watermarks. */
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page)
goto got_pg;
/* If the context allows ignoring watermarks entirely, try again; with
 * __GFP_NOFAIL this helper keeps retrying until it succeeds. */
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
if (page)
goto got_pg;
}
/* No __GFP_WAIT (e.g. atomic allocation from interrupt context):
 * we cannot sleep to reclaim, so fail immediately. */
/* Atomic allocations - we can't balance anything */
if (!wait)
goto nopage;
/* The allocator itself needs more memory — avoid recursing into
 * direct reclaim. */
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
/* TIF_MEMDIE is only set on a task picked by the OOM killer; such a
 * task must not loop here unless the caller demanded __GFP_NOFAIL. */
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
/* Try memory compaction, then allocate again. */
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, sync_migration,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
~~snip
/* Perform direct reclaim ourselves, then retry the allocation. */
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, &did_some_progress);
if (page)
goto got_pg;
/* Still failing with no reclaim progress: invoke the OOM killer to
 * free memory by killing tasks, then retry. */
if (!did_some_progress) {
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
if (oom_killer_disabled)
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
if ((current->flags & PF_DUMPCORE) &&
!(gfp_mask & __GFP_NOFAIL))
goto nopage;
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
migratetype);
if (page)
goto got_pg;
if (!(gfp_mask & __GFP_NOFAIL)) {
/*
 * The oom killer is not called for high-order
 * allocations that may fail, so if no progress
 * is being made, there are no other options and
 * retrying is unlikely to help.
 */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto nopage;
/*
 * The oom killer is not called for lowmem
 * allocations to prevent needlessly killing
 * innocent tasks.
 */
if (high_zoneidx < ZONE_NORMAL)
goto nopage;
}
goto restart;
}
}
/* Should we wait a little and retry the allocation? */
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, did_some_progress,
pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
} else {
/* After reclaim has run, give compaction one more chance. */
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, sync_migration,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
}
/* All attempts failed: warn and return NULL. */
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
return page;
got_pg:
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
return page;
}
内存分配一波三折,小结一下:
1. 先尝试快速分配,其中会从不同的zone以及迁移类型上去尝试,失败的话就进入慢速分配,里面会再划分单页面从pcp上分配以及多页面从伙伴系统中分配。
2. 尝试慢速分配,一般流程就是唤醒内存页面回收线程,然后尝试低水位分配 -> 忽略水位分配 -> 压缩内存分配 -> 直接回收内存分配 -> oom killer杀死线程分配 -> 压缩内存分配。
内存释放:
关于内存释放,使用的最终公共接口为__free_pages, 流程部分还是比较清晰的,
这里不对代码作具体分析了。分单页和多页释放。
单页:释放到pcp缓冲中,如果pcp中的空闲页面数过多,就会移动一部分到伙伴系统中。
多页:释放多页到伙伴系统,当当前的释放页面块和相邻的伙伴页面块同为空闲且阶数相同时,那就将两者合并,然后放到更高一阶的order链表上面,依次循环执行此操作直到不能合并为止。
2013/04/07