注:原创作品,转载请注明。
1. Linux 内核对各个zone都有一个buddy system.
2. 数据结构:
mem_map:一个struct page数组,对应系统中所有的物理内存页。
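而每一个zone结构里都有一个zone_mem_map域指向这个zone的第一个page在mem_map中的位置,还有一个域size代表这个区的大小,即总共有多少页。(注意:下面列出的这一版struct zone中并没有名为zone_mem_map和size的域;在这一版中,区的起始位置由zone_start_pfn给出,区的大小由spanned_pages(包含空洞)和present_pages(不包含空洞)给出。)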
每一个zone都有自己的buddy system,由下面的zone结构就可以看出。
空闲块是根据其大小做的保存,特别强调的是struct free_area free_area[MAX_ORDER];
保存着zone中的空闲块。数组中的每一个元素都有一个双向链表。比如说,free_area中第k个元素保存着大小为2的k次方个页面的空闲块的链表。数组元素中保存的是表头,它指向第一个大小为2的k次方的块的第一个页面。那么块中剩余的页面怎么办?不用管,因为都是按块来操作的,只需要知道块的第一个页面即可,最后一个页面就是第一个页面加上2的k次方再减1。同属于一个链表的块与块之间,由每一个块的第一个页面的struct page中的list_head lru相互链接。
/*
 * MAX_ORDER is the number of entries in a zone's free_area[] array.
 * A build may override it through CONFIG_FORCE_MAX_ZONEORDER; otherwise
 * it is 11.
 */
#ifdef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#else
#define MAX_ORDER 11
#endif
/* 2^(MAX_ORDER - 1): the value obtained by shifting 1 left by the highest order. */
#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
/*
 * One slot of a zone's free_area[] array.  The slot index is the block
 * order used by __rmqueue() and __free_one_page() when they walk the array.
 */
struct free_area {
/*
 * Head of the list of free blocks held in this slot.  Blocks are chained
 * through the lru member of their first struct page (see the list_entry()
 * and list_add() calls on free_list).
 */
struct list_head free_list;
/* Counter decremented when a block leaves free_list and incremented when one is added. */
unsigned long nr_free;
};
/*
 * Per-zone memory management state.
 *
 * The structure is laid out in three groups separated by ZONE_PADDING():
 * fields used by the page allocator (including the per-zone buddy free
 * lists in free_area[]), fields used by the page reclaim scanner, and
 * rarely used or read-mostly fields.
 */
struct zone {
/* Fields commonly accessed by the page allocator */
/*
 * Free page counter: __rmqueue() subtracts 1UL << order from it and
 * __free_one_page() adds 1 << order to it.
 */
unsigned long free_pages;
/*
 * NOTE(review): presumably the free-page watermarks of the zone; only
 * pages_high is referenced in this listing (in the temp_priority comment
 * below) -- confirm against the allocator/reclaim code.
 */
unsigned long pages_min, pages_low, pages_high;
/*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
* GB of ram we must reserve some of the lower zone memory (otherwise we risk
* to run OOM on the lower zones despite there's tons of freeable ram
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
*/
unsigned long lowmem_reserve[MAX_NR_ZONES];
/*
 * One per_cpu_pageset slot per CPU: stored by pointer when CONFIG_NUMA is
 * set, embedded directly in the zone otherwise.
 */
#ifdef CONFIG_NUMA
struct per_cpu_pageset *pageset[NR_CPUS];
#else
struct per_cpu_pageset pageset[NR_CPUS];
#endif
/*
* free areas of different sizes
*/
/*
 * __rmqueue() must be called with this lock held, and free_pages_bulk()
 * takes it around its calls to __free_one_page().
 */
spinlock_t lock;
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
/* Buddy free lists, one struct free_area per order (index 0 .. MAX_ORDER - 1). */
struct free_area free_area[MAX_ORDER];
ZONE_PADDING(_pad1_)
/* Fields commonly accessed by the page reclaim scanner */
spinlock_t lru_lock;
struct list_head active_list;
struct list_head inactive_list;
unsigned long nr_scan_active;
unsigned long nr_scan_inactive;
unsigned long nr_active;
unsigned long nr_inactive;
/* Both fields below are reset to 0 by free_pages_bulk() whenever pages are freed. */
unsigned long pages_scanned; /* since last reclaim */
int all_unreclaimable; /* All pages pinned */
/* A count of how many reclaimers are scanning this zone */
atomic_t reclaim_in_progress;
/*
* timestamp (in jiffies) of the last zone reclaim that did not
* result in freeing of pages. This is used to avoid repeated scans
* if all memory in the zone is in use.
*/
unsigned long last_unsuccessful_zone_reclaim;
/*
* prev_priority holds the scanning priority for this zone. It is
* defined as the scanning priority at which we achieved our reclaim
* target at the previous try_to_free_pages() or balance_pgdat()
* invocation.
*
* We use prev_priority as a measure of how much stress page reclaim is
* under - it drives the swappiness decision: whether to unmap mapped
* pages.
*
* temp_priority is used to remember the scanning priority at which
* this zone was successfully refilled to free_pages == pages_high.
*
* Access to both these fields is quite racy even on uniprocessor. But
* it is expected to average out OK.
*/
int temp_priority;
int prev_priority;
ZONE_PADDING(_pad2_)
/* Rarely used or read-mostly fields */
/*
* wait_table -- the array holding the hash table
* wait_table_size -- the size of the hash table array
* wait_table_bits -- wait_table_size == (1 << wait_table_bits)
*
* The purpose of all these is to keep track of the people
* waiting for a page to become available and make them
* runnable again when possible. The trouble is that this
* consumes a lot of space, especially when so few things
* wait on pages at a given time. So instead of using
* per-page waitqueues, we use a waitqueue hash table.
*
* The bucket discipline is to sleep on the same queue when
* colliding and wake all in that wait queue when removing.
* When something wakes, it must check to be sure its page is
* truly available, a la thundering herd. The cost of a
* collision is great, but given the expected load of the
* table, they should be so rare as to be outweighed by the
* benefits from the saved space.
*
* __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
* primary users of these fields, and in mm/page_alloc.c
* free_area_init_core() performs the initialization of them.
*/
wait_queue_head_t * wait_table;
unsigned long wait_table_size;
unsigned long wait_table_bits;
/*
* Discontig memory support fields.
*/
struct pglist_data *zone_pgdat;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
/*
* zone_start_pfn, spanned_pages and present_pages are all
* protected by span_seqlock. It is a seqlock because it has
* to be read outside of zone->lock, and it is done in the main
* allocator path. But, it is written quite infrequently.
*
* The lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*/
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
/*
* rarely used fields:
*/
char *name;
} ____cacheline_internodealigned_in_smp;
分配页面块
分配一个大小为2的m次方的页面块时,首先看free_area的第m个元素:如果其nr_free大于0,就从这个链表中取出一个块来满足要求;如果不大于0,则看数组中第m+1个元素,依此类推。如果找到了能够分配的更大的块,就把其中大小为2的m次方的第一部分分配出去,剩下的部分继续保存在buddy system中。看代码会比较详细:
/*
 * Remove one block that can satisfy a request of the given order from the
 * zone's buddy free lists.
 *
 * The free lists are scanned from @order upwards.  The first block on the
 * first non-empty list is unlinked, its order marker is cleared, and the
 * counters are updated (the list's nr_free drops by one, zone->free_pages
 * drops by 1UL << order).  expand() is then given the block together with
 * the requested order and the order it was found at -- presumably so the
 * part of the block beyond the request goes back to the free lists; its
 * body is not shown here.
 *
 * Returns the first page of the block, or NULL if every list from @order
 * up to MAX_ORDER - 1 is empty.
 *
 * Must be called with zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order)
{
	unsigned int found_order;

	for (found_order = order; found_order < MAX_ORDER; found_order++) {
		struct free_area *slot = &zone->free_area[found_order];

		if (!list_empty(&slot->free_list)) {
			struct page *block;

			block = list_entry(slot->free_list.next, struct page, lru);
			list_del(&block->lru);
			rmv_page_order(block);
			slot->nr_free--;
			zone->free_pages -= 1UL << order;
			expand(zone, block, order, found_order, slot);
			return block;
		}
	}

	return NULL;
}
释放页面块
/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
*
* 1) Any buddy B1 will have an order O twin B2 which satisfies
* the following equation:
* B2 = B1 ^ (1 << O)
* For example, if the starting buddy (buddy2) is #8 its order
* 1 buddy is #10:
* B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
*
* 2) Any buddy B will have an order O+1 parent P which
* satisfies the following equation:
* P = B & ~(1 << O)
*
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
/*
 * Return page_idx with bit number @order cleared, i.e. the index of the
 * order+1 block formed by merging the order-sized block at page_idx with
 * its buddy (P = B & ~(1 << O)).
 */
static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	/* The int mask is widened to unsigned long, keeping all higher bits set. */
	unsigned long keep_mask = ~(1 << order);

	return page_idx & keep_mask;
}
/*
 * Free @count blocks of 1 << order pages each back to the zone's buddy
 * free lists.
 *
 * @list links the blocks to be freed through the lru member of their
 * struct page; entries are taken from the tail of the list.  The list must
 * hold at least @count entries (running out triggers BUG_ON).
 *
 * zone->lock is held for the whole batch, and the zone's
 * all_unreclaimable and pages_scanned fields are reset to 0 first.
 */
static void free_pages_bulk(struct zone *zone, int count,
			struct list_head *list, int order)
{
	struct page *victim;

	spin_lock(&zone->lock);
	zone->all_unreclaimable = 0;
	zone->pages_scanned = 0;

	for (; count != 0; count--) {
		BUG_ON(list_empty(list));
		victim = list_entry(list->prev, struct page, lru);
		/* Unlink first: __free_one_page() reuses page->lru for the free list. */
		list_del(&victim->lru);
		__free_one_page(victim, zone, order);
	}

	spin_unlock(&zone->lock);
}
/*
 * Return one block of 1 << order pages, starting at @page, to the zone's
 * buddy free lists, merging it with free buddies as far as possible.
 *
 * The block's index is the page frame number masked to the low MAX_ORDER
 * bits and must be aligned to the block size (BUG_ON otherwise).  While
 * the order is below MAX_ORDER - 1 and page_is_buddy() accepts the buddy
 * block, the buddy is taken off its free list and the two are combined
 * into a block of the next order.  The resulting block is tagged with its
 * final order and added to the matching free list.
 *
 * zone->free_pages grows by the size of the block originally passed in.
 * free_pages_bulk() calls this with zone->lock held.
 */
static inline void __free_one_page(struct page *page,
		struct zone *zone, unsigned int order)
{
	const int nr_pages = 1 << order;
	struct free_area *dest;
	unsigned long idx;

	if (unlikely(PageCompound(page)))
		destroy_compound_page(page, order);

	idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

	BUG_ON(idx & (nr_pages - 1));
	BUG_ON(bad_range(zone, page));

	zone->free_pages += nr_pages;

	for (; order < MAX_ORDER-1; order++) {
		struct page *buddy = __page_find_buddy(page, idx, order);
		unsigned long merged_idx;

		/* Buddy cannot be merged: stop coalescing at this order. */
		if (!page_is_buddy(buddy, order))
			break;

		/* Pull the buddy out of its free list ... */
		list_del(&buddy->lru);
		zone->free_area[order].nr_free--;
		rmv_page_order(buddy);

		/* ... and continue with the combined block one order up. */
		merged_idx = __find_combined_index(idx, order);
		page += merged_idx - idx;
		idx = merged_idx;
	}

	set_page_order(page, order);
	dest = &zone->free_area[order];
	list_add(&page->lru, &dest->free_list);
	dest->nr_free++;
}
第一个函数(free_pages_bulk)的作用是释放count个大小为1<<order个页面的块,list链接着这些块的第一个页框;而第二个函数(__free_one_page)释放一个大小为1<<order个页面的块。另外,第二个函数中调用了:
buddy = __page_find_buddy(page, page_idx, order);
/*
 * Return the struct page of the buddy of the order-sized block that starts
 * at @page, whose index is @page_idx.
 *
 * The buddy's index differs from page_idx only in bit number @order
 * (B2 = B1 ^ (1 << O)); the result is @page moved by the difference
 * between the two indices.
 */
static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
	const unsigned long buddy_idx = page_idx ^ (1 << order);
	/* Unsigned difference; wraps when the buddy lies below page_idx. */
	const unsigned long delta = buddy_idx - page_idx;

	return &page[delta];
}
page_idx ^ (1 << order)操作是找到buddy的index,根据是1) Any buddy B1 will have an order O twin B2 which satisfies the following equation:
B2 = B1 ^ (1 << O)
An exclusive OR (XOR) using the (1 << order) mask switches the value of the order-th bit of page_idx. Therefore, if the bit was previously zero, buddy_idx is equal to page_idx + order_size; conversely, if the bit was previously one, buddy_idx is equal to page_idx - order_size.
两个合并后,要得到合并后的块的index:
combined_idx = __find_combined_index(page_idx, order);
而__find_combined_index为:
/*
 * Return page_idx with bit number @order cleared, i.e. the index of the
 * order+1 block formed by merging the order-sized block at page_idx with
 * its buddy (P = B & ~(1 << O)).
 */
static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	/* The int mask is widened to unsigned long, keeping all higher bits set. */
	unsigned long keep_mask = ~(1 << order);

	return page_idx & keep_mask;
}
其原理是:
2) Any buddy B will have an order O+1 parent P which satisfies the following equation:
P = B & ~(1 << O)