Linux kernel memory management: the buddy system

Note: original work; please give attribution when reposting.

1.     The Linux kernel maintains a separate buddy system for each zone.

2.     Data structures:

mem_map: an array of struct page, with one entry for every physical page frame in the system.

Each zone structure records where the zone's first page sits in mem_map (the zone_mem_map field in older kernels; the version quoted below records zone_start_pfn instead) and how large the zone is, i.e. the total number of pages (spanned_pages/present_pages below).

Each zone has its own buddy system, as the zone structure below shows.

Free blocks are stored according to their size. The field to highlight is struct free_area free_area[MAX_ORDER];

which holds the zone's free blocks. Each element of the array heads a doubly linked list: element k of free_area keeps the list of free blocks of 2^k pages each. Only the list heads are stored, i.e. each list links the first page of every 2^k-page block. What about the rest of a block's pages? Nothing needs to track them: the buddy system always operates on whole blocks, so knowing the first page is enough, and the last page is simply the first page plus 2^k - 1. Blocks on the same list are chained through the struct page list_head lru field of each block's first page, as the short traversal sketch after the definitions below illustrates.

#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif

#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))

(With the default MAX_ORDER of 11, orders run from 0 to 10, so the largest block is 2^10 = 1024 pages, i.e. 4 MB with 4 KB pages.)

struct free_area {
        struct list_head        free_list;
        unsigned long           nr_free;
};
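
To make the lru linkage concrete, here is a minimal traversal sketch (my own illustration, not kernel source; it assumes a zone pointer and an order k in scope, with zone->lock already held, and uses the standard list_for_each_entry helper):

/* Illustrative fragment: each list entry is the first page of a free
 * block of 2^k contiguous pages, chained through that page's lru. */
struct page *pos;
list_for_each_entry(pos, &zone->free_area[k].free_list, lru)
        printk(KERN_DEBUG "free block of %lu pages at pfn %lu\n",
               1UL << k, page_to_pfn(pos));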

struct zone {
        /* Fields commonly accessed by the page allocator */
        unsigned long           free_pages;
        unsigned long           pages_min, pages_low, pages_high;
        /*
         * We don't know if the memory that we're going to allocate will be freeable
         * or/and it will be released eventually, so to avoid totally wasting several
         * GB of ram we must reserve some of the lower zone memory (otherwise we risk
         * to run OOM on the lower zones despite there's tons of freeable ram
         * on the higher zones). This array is recalculated at runtime if the
         * sysctl_lowmem_reserve_ratio sysctl changes.
         */
        unsigned long           lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
        struct per_cpu_pageset  *pageset[NR_CPUS];
#else
        struct per_cpu_pageset  pageset[NR_CPUS];
#endif
        /*
         * free areas of different sizes
         */
        spinlock_t              lock;
#ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t               span_seqlock;
#endif
        struct free_area        free_area[MAX_ORDER];

        ZONE_PADDING(_pad1_)

        /* Fields commonly accessed by the page reclaim scanner */
        spinlock_t              lru_lock;
        struct list_head        active_list;
        struct list_head        inactive_list;
        unsigned long           nr_scan_active;
        unsigned long           nr_scan_inactive;
        unsigned long           nr_active;
        unsigned long           nr_inactive;
        unsigned long           pages_scanned;     /* since last reclaim */
        int                     all_unreclaimable; /* All pages pinned */

        /* A count of how many reclaimers are scanning this zone */
        atomic_t                reclaim_in_progress;

        /*
         * timestamp (in jiffies) of the last zone reclaim that did not
         * result in freeing of pages. This is used to avoid repeated scans
         * if all memory in the zone is in use.
         */
        unsigned long           last_unsuccessful_zone_reclaim;

        /*
         * prev_priority holds the scanning priority for this zone.  It is
         * defined as the scanning priority at which we achieved our reclaim
         * target at the previous try_to_free_pages() or balance_pgdat()
         * invokation.
         *
         * We use prev_priority as a measure of how much stress page reclaim is
         * under - it drives the swappiness decision: whether to unmap mapped
         * pages.
         *
         * temp_priority is used to remember the scanning priority at which
         * this zone was successfully refilled to free_pages == pages_high.
         *
         * Access to both these fields is quite racy even on uniprocessor.  But
         * it is expected to average out OK.
         */
        int temp_priority;
        int prev_priority;

        ZONE_PADDING(_pad2_)
        /* Rarely used or read-mostly fields */

        /*
         * wait_table           -- the array holding the hash table
         * wait_table_size      -- the size of the hash table array
         * wait_table_bits      -- wait_table_size == (1 << wait_table_bits)
         *
         * The purpose of all these is to keep track of the people
         * waiting for a page to become available and make them
         * runnable again when possible. The trouble is that this
         * consumes a lot of space, especially when so few things
         * wait on pages at a given time. So instead of using
         * per-page waitqueues, we use a waitqueue hash table.
         *
         * The bucket discipline is to sleep on the same queue when
         * colliding and wake all in that wait queue when removing.
         * When something wakes, it must check to be sure its page is
         * truly available, a la thundering herd. The cost of a
         * collision is great, but given the expected load of the
         * table, they should be so rare as to be outweighed by the
         * benefits from the saved space.
         *
         * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
         * primary users of these fields, and in mm/page_alloc.c
         * free_area_init_core() performs the initialization of them.
         */
        wait_queue_head_t       *wait_table;
        unsigned long           wait_table_size;
        unsigned long           wait_table_bits;

        /*
         * Discontig memory support fields.
         */
        struct pglist_data      *zone_pgdat;
        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long           zone_start_pfn;

        /*
         * zone_start_pfn, spanned_pages and present_pages are all
         * protected by span_seqlock.  It is a seqlock because it has
         * to be read outside of zone->lock, and it is done in the main
         * allocator path.  But, it is written quite infrequently.
         *
         * The lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock.  It's good to
         * give them a chance of being in the same cacheline.
         */
        unsigned long           spanned_pages;  /* total size, including holes */
        unsigned long           present_pages;  /* amount of memory (excluding holes) */

        /*
         * rarely used fields:
         */
        char                    *name;
} ____cacheline_internodealigned_in_smp;

Allocating a block of pages

To allocate a block of 2^m pages, first look at element m of free_area: if its nr_free is greater than 0, take a block off that list to satisfy the request. If it is 0, look at element m+1 of the array, and so on upward. Once an allocatable block is found at some higher order, its first 2^m pages are handed out and the remainder is returned to the buddy system. The code makes this concrete:

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order)
{
        struct free_area *area;
        unsigned int current_order;
        struct page *page;

        for (current_order = order; current_order < MAX_ORDER; ++current_order) {
                area = zone->free_area + current_order;
                if (list_empty(&area->free_list))
                        continue;

                page = list_entry(area->free_list.next, struct page, lru);
                list_del(&page->lru);
                rmv_page_order(page);
                area->nr_free--;
                zone->free_pages -= 1UL << order;
                expand(zone, page, order, current_order, area);
                return page;
        }

        return NULL;
}
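
The splitting of an oversized block back into smaller free blocks happens in expand(), called near the end of the loop above. Its logic in this kernel generation is roughly as follows (reconstructed from memory rather than quoted verbatim, so treat it as a sketch):

static inline void expand(struct zone *zone, struct page *page,
        int low, int high, struct free_area *area)
{
        unsigned long size = 1 << high;

        /* Halve the block repeatedly: hand the upper half back to the
         * next lower free list until only a 2^low-page block is left. */
        while (high > low) {
                area--;
                high--;
                size >>= 1;
                BUG_ON(bad_range(zone, &page[size]));
                list_add(&page[size].lru, &area->free_list);
                area->nr_free++;
                set_page_order(&page[size], high);
        }
}

Each iteration splits off the upper half of the current block and queues it on the next lower order's free list, so only the requested 2^order pages leave the buddy system.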

Freeing a block of pages

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contigious at least up to MAX_ORDER
 */
static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
        return (page_idx & ~(1 << order));
}

static void free_pages_bulk(struct zone *zone, int count,
                            struct list_head *list, int order)
{
        spin_lock(&zone->lock);
        zone->all_unreclaimable = 0;
        zone->pages_scanned = 0;
        while (count--) {
                struct page *page;

                BUG_ON(list_empty(list));
                page = list_entry(list->prev, struct page, lru);
                /* have to delete it as __free_one_page list manipulates */
                list_del(&page->lru);
                __free_one_page(page, zone, order);
        }
        spin_unlock(&zone->lock);
}

static inline void __free_one_page(struct page *page,
                struct zone *zone, unsigned int order)
{
        unsigned long page_idx;
        int order_size = 1 << order;

        if (unlikely(PageCompound(page)))
                destroy_compound_page(page, order);

        page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

        BUG_ON(page_idx & (order_size - 1));
        BUG_ON(bad_range(zone, page));

        zone->free_pages += order_size;
        while (order < MAX_ORDER-1) {
                unsigned long combined_idx;
                struct free_area *area;
                struct page *buddy;

                buddy = __page_find_buddy(page, page_idx, order);
                if (!page_is_buddy(buddy, order))
                        break;          /* Move the buddy up one level. */

                list_del(&buddy->lru);
                area = zone->free_area + order;
                area->nr_free--;
                rmv_page_order(buddy);
                combined_idx = __find_combined_index(page_idx, order);
                page = page + (combined_idx - page_idx);
                page_idx = combined_idx;
                order++;
        }
        set_page_order(page, order);
        list_add(&page->lru, &zone->free_area[order].free_list);
        zone->free_area[order].nr_free++;
}

The first function frees count blocks of 1 << order pages each; list chains together the first page frame of each block. The second function frees a single block of 1 << order pages. Also, inside the second function, note the call

        buddy = __page_find_buddy(page, page_idx, order);

where __page_find_buddy is:

static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
        unsigned long buddy_idx = page_idx ^ (1 << order);

        return page + (buddy_idx - page_idx);
}

The page_idx ^ (1 << order) operation finds the buddy's index, based on rule 1) above: any buddy B1 will have an order-O twin B2 which satisfies

        B2 = B1 ^ (1 << O)

An exclusive OR (XOR) with the (1 << order) mask flips the order-th bit of page_idx. Therefore, if that bit was previously zero, buddy_idx equals page_idx + order_size; conversely, if the bit was previously one, buddy_idx equals page_idx - order_size.

After two buddies are merged, we need the index of the combined block:

combined_idx = __find_combined_index(page_idx, order);

where __find_combined_index is:

static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
        return (page_idx & ~(1 << order));
}

The principle is rule 2) above: any buddy B will have an order O+1 parent P which satisfies

        P = B & ~(1 << O)
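
Both formulas can be checked with a tiny standalone program (ordinary userspace C, not kernel code; the starting index and orders are made-up values for illustration). It traces how the indices evolve when page 10 is freed at order 0 and every buddy happens to be free:

#include <stdio.h>

/* Userspace demo of the index arithmetic in __page_find_buddy()
 * and __find_combined_index(). */
int main(void)
{
        unsigned long page_idx = 10;    /* pretend this block was just freed */
        unsigned int order;

        for (order = 0; order < 3; order++) {
                unsigned long buddy_idx = page_idx ^ (1UL << order);
                unsigned long combined_idx = page_idx & ~(1UL << order);

                printf("order %u: block %2lu, buddy %2lu, merged block %2lu\n",
                       order, page_idx, buddy_idx, combined_idx);
                page_idx = combined_idx;        /* merge and move up one order */
        }
        return 0;
}

At order 0 the buddy of 10 is 11 and the merged block still starts at 10; at order 1 the buddy is 8 and the merged block starts at 8. XOR flips the order-th bit to find the buddy on either side, while AND-NOT clears it so the merged block always starts on a 2^(order+1) boundary.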
