The kernel treats physical pages as the basic unit of memory management and represents every physical page in the system with a struct page structure. A simplified view of the structure:
struct page {
unsigned long flags;
atomic_t _count;
atomic_t _mapcount;
unsigned long private;
struct address_space *mapping;
pgoff_t index;
struct list_head lru;
void *virtual;
};
flags stores the page's status bits (dirty, locked in memory, and so on).
_count holds the page's reference count; a negative value means the page is free. Code should not read _count directly but call page_count(), which returns 0 for a free page and a positive number when the page is in use.
virtual is the page's virtual address. For high-memory pages, which are mapped dynamically rather than permanently, this field is NULL while the page is unmapped.
lru links the page into the least-recently-used (LRU) lists used by page reclaim.
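As a quick illustration of these fields, the following sketch (a hypothetical kernel-module snippet, not part of the code analyzed here) allocates one page and inspects it through the official accessors:

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/printk.h>

static void page_demo(void)
{
	/* Allocate a single physical page; we get back its struct page. */
	struct page *page = alloc_page(GFP_KERNEL);

	if (!page)
		return;

	/* page_count() is the safe way to read _count: > 0 means in use. */
	pr_info("refcount=%d flags=%#lx vaddr=%p\n",
		page_count(page), page->flags, page_address(page));

	__free_page(page); /* drop our reference; the page becomes free again */
}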
Linux divides the system's pages into zones, forming separate memory pools so that pages can be allocated according to their intended use. The exact set of zones depends on the hardware architecture; the common ones are ZONE_NORMAL, ZONE_HIGHMEM and ZONE_DMA. Each zone is described by a struct zone:
struct zone {
unsigned long watermark[NR_WMARK];
unsigned long percpu_drift_mark;
unsigned long lowmem_reserve[MAX_NR_ZONES];
unsigned long dirty_balance_reserve;
#ifdef CONFIG_NUMA
int node;
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif
struct per_cpu_pageset __percpu *pageset;
spinlock_t lock;
int all_unreclaimable; /* All pages pinned */
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
struct free_area free_area[MAX_ORDER];
#ifndef CONFIG_SPARSEMEM
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_COMPACTION
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
ZONE_PADDING(_pad1_)
spinlock_t lru_lock;
struct lruvec lruvec;
struct zone_reclaim_stat reclaim_stat;
unsigned long pages_scanned; /* since last reclaim */
unsigned long flags; /* zone flags, see below */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
unsigned int inactive_ratio;
ZONE_PADDING(_pad2_)
wait_queue_head_t * wait_table;
unsigned long wait_table_hash_nr_entries;
unsigned long wait_table_bits;
struct pglist_data *zone_pgdat;
unsigned long zone_start_pfn;
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
const char *name;
} ____cacheline_internodealigned_in_smp;
The watermark array holds the zone's minimum, low and high watermarks, indexed by WMARK_MIN, WMARK_LOW and WMARK_HIGH; the page allocator and kswapd use them to judge how short the zone is on free memory.
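A small illustration of how these watermarks can be read (a hypothetical debug helper; the min/low/high_wmark_pages macros come from include/linux/mmzone.h):

#include <linux/mmzone.h>
#include <linux/printk.h>

/* Print a zone's three watermarks, expressed in pages. */
static void dump_zone_watermarks(struct zone *zone)
{
	pr_info("%s: min=%lu low=%lu high=%lu\n",
		zone->name,
		min_wmark_pages(zone),   /* zone->watermark[WMARK_MIN]  */
		low_wmark_pages(zone),   /* zone->watermark[WMARK_LOW]  */
		high_wmark_pages(zone)); /* zone->watermark[WMARK_HIGH] */
}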
The kernel provides low-level mechanisms for requesting memory. The core function is alloc_pages(gfp_mask, order), which allocates 2^order contiguous physical pages:
#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
/* Unknown node is current node */
if (nid < 0)
nid = numa_node_id();
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
static inline struct page *__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}
alloc_pages() ultimately calls __alloc_pages_nodemask(), which is analyzed in detail below. alloc_pages() returns a pointer to a struct page; page_address() converts it to the corresponding logical address (shown here in its simplest form, where it just returns page->virtual):
#define page_address(page) ((page)->virtual)
If the struct page itself is not needed, __get_free_pages() can be called directly; it returns the logical address instead of a struct page pointer:
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *page;
/*
* __get_free_pages() returns a 32-bit address, which cannot represent
* a highmem page
*/
VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
page = alloc_pages(gfp_mask, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}
As the code shows, __get_free_pages() is simply alloc_pages() combined with page_address().
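For reference, a minimal usage pattern of these interfaces might look like the sketch below (buffer_demo is a made-up function name; error handling is kept to the essentials):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>

static int buffer_demo(void)
{
	/* Request 2^3 = 8 contiguous pages; a logical address comes back. */
	unsigned long addr = __get_free_pages(GFP_KERNEL, 3);

	if (!addr)
		return -ENOMEM; /* allocation can fail; always check */

	memset((void *)addr, 0, 8 * PAGE_SIZE);

	/* ... use the buffer ... */

	free_pages(addr, 3); /* the order must match the allocation */
	return 0;
}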
The allocation mask gfp_mask is an important parameter. Its flags fall into three broad categories: zone modifiers (__GFP_DMA, __GFP_HIGHMEM, ...), which say where the memory may come from; action modifiers (__GFP_WAIT, __GFP_IO, __GFP_FS, ...), which say how the allocator may behave; and type flags (GFP_KERNEL, GFP_ATOMIC, GFP_USER, ...), which are predefined combinations of the first two for common situations.
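A hedged sketch of how a caller typically builds the mask (gfp_demo is a made-up helper for illustration):

#include <linux/gfp.h>
#include <linux/types.h>

static struct page *gfp_demo(bool atomic_ctx, bool highmem_ok)
{
	gfp_t mask;

	if (atomic_ctx)
		mask = GFP_ATOMIC;     /* type flag: must not sleep, no I/O */
	else
		mask = GFP_KERNEL;     /* type flag: may sleep, may do I/O  */

	if (highmem_ok)
		mask |= __GFP_HIGHMEM; /* zone modifier: high memory is fine */

	return alloc_pages(mask, 0);   /* a single page (order 0) */
}

With the mask built, the allocation proceeds through __alloc_pages_nodemask():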
struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask); // determine from gfp_mask the highest zone the allocation may use
struct zone *preferred_zone;
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask); // derive the migrate type (page mobility class) for the allocation from gfp_mask
unsigned int cpuset_mems_cookie;
gfp_mask &= gfp_allowed_mask;
lockdep_trace_alloc(gfp_mask);
might_sleep_if(gfp_mask & __GFP_WAIT);
if (should_fail_alloc_page(gfp_mask, order))
return NULL;
/*
* Check the zones suitable for the gfp_mask contain at least one
* valid zone. It's possible to have an empty zonelist as a result
* of GFP_THISNODE and a memoryless node
*/
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
/* The preferred zone is used for statistics later. Find in the zonelist the first zone whose index does not exceed high_zoneidx, i.e. the preferred zone determined above. */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
if (!preferred_zone)
goto out;
// First allocation attempt: scan the zonelist starting from the preferred zone --> find a zone with enough free memory --> allocate from the requested migrate type's free list --> if nothing suitable is there, fall back to other migrate types
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
preferred_zone, migratetype);
if (unlikely(!page))
page = __alloc_pages_slowpath(gfp_mask, order, // if the first attempt fails, retry through the slow path, which includes waking the page-out daemon (kswapd), reclaim, and so on
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
out:
/*
* When updating a task's mems_allowed, it is possible to race with
* parallel threads in such a way that an allocation can fail while
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
}
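To make the first step above concrete, here is a deliberately simplified model of what gfp_zone() decides; the real kernel uses a packed lookup table (GFP_ZONE_TABLE) and also considers flags such as __GFP_MOVABLE, so treat this only as an approximation:

#include <linux/gfp.h>
#include <linux/mmzone.h>

/* Simplified model: map the zone modifiers in gfp_mask to the highest zone
 * the allocation is allowed to use. */
static enum zone_type gfp_zone_model(gfp_t gfp_mask)
{
#ifdef CONFIG_ZONE_DMA
	if (gfp_mask & __GFP_DMA)
		return ZONE_DMA;     /* restricted to the low, DMA-capable zone */
#endif
#ifdef CONFIG_HIGHMEM
	if (gfp_mask & __GFP_HIGHMEM)
		return ZONE_HIGHMEM; /* high memory is acceptable */
#endif
	return ZONE_NORMAL;          /* default: normally mapped memory */
}

Next, get_page_from_freelist() scans the candidate zones: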
static struct page *get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
classzone_idx = zone_idx(preferred_zone); // index of the preferred zone
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist, // walk the zones starting from the preferred one until a zone with enough free space is found;
high_zoneidx, nodemask) { // e.g. if high_zoneidx corresponds to ZONE_HIGHMEM the order is HIGHMEM --> NORMAL --> DMA,
if (NUMA_BUILD && zlc_active && // if it corresponds to ZONE_NORMAL the order is NORMAL --> DMA
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a zone that is within its dirty
* limit, such that no single zone holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the zone's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* This may look like it could increase pressure on
* lower zones by failing allocations in higher zones
* before they are full. But the pages that do spill
* over are limited as the lower zones are protected
* by this very same mechanism. It should not become
* a practical burden to them.
*
* XXX: For now, allow allocations to potentially
* exceed the per-zone dirty limit in the slowpath
* (ALLOC_WMARK_LOW unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* zones are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
if ((alloc_flags & ALLOC_WMARK_LOW) &&
(gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; // alloc_flags selects which watermark to test against: WMARK_MIN, WMARK_LOW or WMARK_HIGH
if (zone_watermark_ok(zone, order, mark, // once a watermark is chosen, the allocation may only proceed if the zone's free memory stays above that mark afterwards
classzone_idx, alloc_flags)) /* the zone's free memory is at a healthy level, so allocate from this zone */
goto try_this_zone;
if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { // the code below handles zone reclaim on NUMA systems
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
* by the cpuset.
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
if (zone_reclaim_mode == 0)
goto this_zone_full;
/*
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
*/
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (!zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto this_zone_full;
}
}
try_this_zone: // allocate 2^order pages from this zone
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
if (page)
break;
this_zone_full:
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);
}
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
}
return page;
}
/*
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
/*
 * This function handles two cases: allocating a single page frame, and
 * allocating a block of contiguous page frames.
 * For a single page the kernel allocates from the per-CPU page cache. Its
 * core data structure is again a set of MIGRATE_TYPES lists, except that the
 * elements on these lists are individual pages. The pages are divided into
 * hot and cold pages: a hot page is one that is likely still in the CPU
 * cache, a cold page is not. For single-page requests, handing out hot
 * pages improves performance.
 * Note that pages near the head of a list are hotter than pages near the
 * tail: a freed single page frame is inserted at the head of the list, so
 * pages close to the head were freed most recently and are the most likely
 * to still be cached. (A usage sketch for cold pages follows this function.)
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
again:
if (likely(order == 0)) { // order 0: allocate a single page from the per-CPU lists
struct per_cpu_pages *pcp;
struct list_head *list;
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp; // get this CPU's per-cpu page set
list = &pcp->lists[migratetype];
/* an empty list means there are no pages to hand out; refill it with pcp->batch pages taken from the buddy system */
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, cold);
if (unlikely(list_empty(list)))
goto failed;
}
if (cold) // a cold page is taken from the tail of the list
page = list_entry(list->prev, struct page, lru);
else // a hot page is taken from the head of the list
page = list_entry(list->next, struct page, lru);
list_del(&page->lru);
pcp->count--;
} else { // order > 0: allocate a block of contiguous page frames through __rmqueue()
if (unlikely(gfp_flags & __GFP_NOFAIL)) {
/*
* __GFP_NOFAIL is not to be used in new code.
*
* All __GFP_NOFAIL callers should be fixed so that they
* properly detect and handle allocation failures.
*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with
* __GFP_NOFAIL.
*/
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order, migratetype); // pick a suitable block from the zone's buddy system
spin_unlock(&zone->lock);
if (!page)
goto failed;
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
VM_BUG_ON(bad_range(zone, page));
if (prep_new_page(page, order, gfp_flags))
goto again;
return page;
failed:
local_irq_restore(flags);
return NULL;
}
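The cold/hot distinction above is driven by the __GFP_COLD flag. A hedged usage sketch (alloc_dma_buffer_page is a made-up helper): a buffer that will be filled by a device via DMA is a typical candidate for a cache-cold page, since the CPU will not touch it immediately and a hot page would be wasted on it.

#include <linux/gfp.h>

/* Ask for a cache-cold page for a DMA receive buffer (illustration only). */
static struct page *alloc_dma_buffer_page(void)
{
	return alloc_pages(GFP_KERNEL | __GFP_COLD, 0);
}

When the per-CPU lists cannot satisfy the request, or when order > 0, __rmqueue() falls through to the zone's buddy free lists: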
static struct page *__rmqueue(struct zone *zone, unsigned int order,
int migratetype)
{
struct page *page;
retry_reserve:
page = __rmqueue_smallest(zone, order, migratetype); // allocate a block from the free lists of the requested migrate type
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { // if that fails and the migrate type is not MIGRATE_RESERVE (MIGRATE_RESERVE would mean there is no other migrate type left to try)
page = __rmqueue_fallback(zone, order, migratetype);
/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
* is used because __rmqueue_smallest is an inline function
* and we want just one call site
*/
if (!page) {
migratetype = MIGRATE_RESERVE;
goto retry_reserve;
}
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area * area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]); // free_area for the current order
if (list_empty(&area->free_list[migratetype]))
continue;
page = list_entry(area->free_list[migratetype].next, // first page descriptor of a block that satisfies the request
struct page, lru);
list_del(&page->lru);
rmv_page_order(page); // clear the buddy flag and reset page->private to 0
area->nr_free--; // one fewer free block at this order
expand(zone, page, order, current_order, area, migratetype); // split the block down to the requested order
return page;
}
return NULL;
}
static inline void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area,
int migratetype)
{
unsigned long size = 1 << high;
while (high > low) { // the requested order is low, the block actually removed has order high; while high > low, keep splitting the block and putting the unused half on the next lower order's free list
area--; // step down to the free_area of the next lower order
high--; // one split has been performed
size >>= 1; // each split halves the block size
VM_BUG_ON(bad_range(zone, &page[size]));
#ifdef CONFIG_DEBUG_PAGEALLOC
if (high < debug_guardpage_minorder()) {
/*
* Mark as guard pages (or page), that will allow to
* merge back to allocator when buddy will be freed.
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
INIT_LIST_HEAD(&page[size].lru);
set_page_guard_flag(&page[size]);
set_page_private(&page[size], high);
/* Guard pages are not available for any usage */
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
continue;
}
#endif
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
set_page_order(&page[size], high);
}
}
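To see the arithmetic in expand() at work, suppose a single page (order 0) is requested but the smallest non-empty free list holds order-3 blocks (eight pages). The standalone simulation below (plain userspace C, not kernel code) mirrors the split loop: the order-3 block sheds halves of 4, 2 and 1 pages back onto the lower-order free lists, and the final single page is handed to the caller.

#include <stdio.h>

/* Simulate expand(): a block of order 'high' was removed from the free lists
 * to satisfy a request of order 'low'; the unused halves are returned to the
 * lower-order lists one by one. */
static void simulate_expand(int low, int high)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		printf("put back a block of %lu page(s) at order %d\n", size, high);
	}
	printf("hand out the remaining %lu page(s) at order %d\n", 1UL << low, low);
}

int main(void)
{
	simulate_expand(0, 3); /* 8 pages -> 4 + 2 + 1 returned, 1 handed out */
	return 0;
}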
Reference: http://blog.csdn.net/vanbreaker/article/details/7621289