3.4.3 启动过程期间的内存管理

在启动过程期间,尽管内存管理尚未初始化,但内核仍然需要分配内存以创建各种数据结构。bootmem分配器用于在启动阶段早期分配内存。

显然,对该分配器的需求集中于简单性方面,而不是性能和通用性。因此内核开发者决定实现一个最先适配(first-fit)分配器用于在启动阶段管理内存,这是可能想到的最简单方式。

该分配器使用一个位图来管理页,位图比特位的数目与系统中物理内存页的数目相同。比特位为1,表示已用页;比特位为0,表示空闲页。

在需要分配内存时,分配器逐位扫描位图,直至找到一个能提供足够连续页的位置,即所谓的最先最佳(first-best)或最先适配位置。

该过程不是很高效,因为每次分配都必须从头扫描比特链。因此在内核完全初始化之后,不能将该分配器用于内存管理。伙伴系统,以及slab、slub或slob分配器,是好得多的备选方案。

1. 数据结构

    即使最先适配分配器也必须管理一些数据,内核(为系统中的每个结点)提供了一个bootmem_data结构的实例,用于该用途。当然,该结构所需的内存无法动态分配,必须在编译时分配给内核。

 在UMA系统上该分配的实现与CPU无关(NUMA系统采用了特定于体系结构的解决方案)。bootmem_data结构定义如下

Linux-3.18.3

./include/linux/bootmem.h

/*
 * Per-node state of the boot-time (bootmem) allocator, Linux 3.18.3 layout.
 * One instance exists per node; all instances are chained through @list.
 */
typedef struct bootmem_data {
        /* First page frame number managed by this node (set from "start" in init_bootmem_core()). */
        unsigned long node_min_pfn;
        /* End page frame number of the managed range (set from "end" in init_bootmem_core()). */
        unsigned long node_low_pfn;
        /* Virtual address of the allocation bitmap; a set bit marks a reserved page. */
        void *node_bootmem_map;
        /* NOTE(review): not used in the listings shown here; presumably the end offset of the previous allocation -- confirm. */
        unsigned long last_end_off;
        /* NOTE(review): not used in the listings shown here; presumably a bitmap index hint for the next search -- confirm. */
        unsigned long hint_idx;
        /* Link in the global list of registered allocators (bdata_list). */
        struct list_head list;
} bootmem_data_t;

Linux-2.6.25:

/*
 * Per-node state of the boot-time (bootmem) allocator, Linux 2.6.25 layout.
 * The field notes below follow the description given in the accompanying text.
 */
typedef struct bootmem_data {
/* Start of the memory managed by this node; described as the first page, zero on most architectures. */
unsigned long node_boot_start;
/* Last page of the directly managed physical address space (end of ZONE_NORMAL). */
unsigned long node_low_pfn;
/* Pointer to the memory area that holds the allocation bitmap. */
void *node_bootmem_map;
/* Offset inside the page at last_pos when that page was only partially used. */
unsigned long last_offset;
/* Number of the page that was allocated last. */
unsigned long last_pos;
unsigned long last_success;	/* Previous allocation point.  To speed
 * up searching */
/* Link in the global list of registered allocators (bdata_list). */
struct list_head list;
} bootmem_data_t;

在下面提到页时,总是指物理页帧。

node_boot_start(node_min_pfn):保存了系统中第一个页的编号,大多数体系结构下都是零。

node_low_pfn:是可以直接管理的物理地址空间中最后一页的编号。即ZONE_NORMAL的结束页。

node_bootmem_map:指向存储分配位图的内存区的指针。在IA-32系统上,用于该用途的内核区紧接着内核映像之后。对应的地址保存在_end变量中,该变量在链接期间自动地插入到内核映像中。

last_pos:上一次分配的页的编号。如果没有请求分配整个页,则last_offset用作该页内部的偏移量。这使得bootmem分配器可以分配小于一整页的内存区(伙伴系统无法做到这一点)。

last_success:指定位图中上一次成功分配内存的位置,新的分配将由此开始。尽管这使得最先适配算法稍快一点,但仍然无法真正代替更复杂的技术。

内存不连续的系统可能需要多个bootmem分配器。一个典型的例子是NUMA计算机,其中每个结点注册了一个bootmem分配器,但如果物理地址空间中散布着空洞,也可以为每个连续内存区注册一个bootmem分配器。

注册新的自举分配器可使用init_bootmem_core,所有注册的分配器保存在一个链表中,表头是全局变量bdata_list

UMA系统上,只需一个bootmem_data_t实例,即contig_bootmem_data。它通过bdata成员与contig_page_data关联起来。

mm/page_alloc.c

/*
 * Without CONFIG_NEED_MULTIPLE_NODES a single bootmem instance is enough:
 * contig_bootmem_data is defined statically and attached to the single
 * node descriptor contig_page_data through its bdata member.
 */
#ifndef CONFIG_NEED_MULTIPLE_NODES
static bootmem_data_t contig_bootmem_data;
struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
EXPORT_SYMBOL(contig_page_data);
#endif

2. 初始化

bootmem分配器的初始化是一个特定于体系结构的过程,此外还取决于所属计算机的内存布局。正如前文的讨论,IA-32使用setup_memory,该函数又调用setup_bootmem_allocator来初始化bootmem分配器,而AMD64则使用contig_initmem_init

如下代码流程图说明了IA-32和AMD64系统上初始化bootmem分配器所涉及的各个步骤。

//3.18.3

setup_arch

->setup_memory(kernel_end);

  ->”/* Find free clusters, and init and free the bootmem accordingly.  */”

  ->bootmem_bootmap_pages(max_low_pfn);

  ->init_bootmem(bootmap_start, max_low_pfn);

  ->reserve_bootmem(PFN_PHYS(bootmap_start), bootmap_size, BOOTMEM_DEFAULT);

  ->reserve_bootmem(virt_to_phys((void *)initrd_start), INITRD_SIZE, BOOTMEM_DEFAULT);

 

//arch/x86/kernel/setup_64.c(2.6.25)

contig_initmem_init

  ->bootmem_bootmap_pages

  ->find_e820_area

  ->init_bootmem

  ->e820_register_active_regions

  ->reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);

 

/*
 * contig_initmem_init - set up the bootmem allocator on AMD64 (2.6.25)
 * @start_pfn: first page frame number of the range to register
 * @end_pfn: end page frame number of the range
 *
 * Sizes the allocation bitmap, finds room for it in the e820 map,
 * initializes the allocator, frees the active regions and finally
 * reserves the bitmap itself.
 */
static void __init
contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long bootmap_size, bootmap;
 
        /* Bitmap size in bytes: number of bitmap pages shifted by PAGE_SHIFT. */
        bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
        /* Search the e820 map for an area that can hold the bitmap. */
        bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
                                 PAGE_SIZE);
        /* -1 is treated as "no suitable area found"; booting cannot continue. */
        if (bootmap == -1L)
                panic("Cannot find bootmem map of size %ld\n", bootmap_size);
        /* init_bootmem() takes the bitmap location as a page frame number. */
        bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
        e820_register_active_regions(0, start_pfn, end_pfn);
        /* Every page starts out reserved; release the regions that are usable. */
        free_bootmem_with_active_regions(0, end_pfn);
        /* Protect the bitmap itself from being handed out. */
        reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}


//arch/ia64/mm/contig.c(3.18.3)

find_memory

->bootmem_bootmap_pages

->init_bootmem_node

->reserve_bootmem

/**
 * find_memory - setup memory map
 *
 * Walk the EFI memory map and find usable memory for the system, taking
 * into account reserved areas.
 *
 * Also sizes and places the bootmem bitmap, initializes the bootmem
 * allocator for node 0, frees the usable ranges and reserves the bitmap.
 */
void __init
find_memory (void)
{
        unsigned long bootmap_size;
 
        reserve_memory();
 
        /* first find highest page frame number */
        min_low_pfn = ~0UL;
        max_low_pfn = 0;
        efi_memmap_walk(find_max_min_low_pfn, NULL);
        max_pfn = max_low_pfn;
        /* how many bytes to cover all the pages */
        bootmap_size = bootmem_bootmap_pages(max_pfn) << PAGE_SHIFT;
 
        /* look for a location to hold the bootmap */
        /* ~0UL is the "not found" marker checked after the walk */
        bootmap_start = ~0UL;
        efi_memmap_walk(find_bootmap_location, &bootmap_size);
        if (bootmap_start == ~0UL)
                panic("Cannot find %ld bytes for bootmap\n", bootmap_size);
 
        /* bitmap location is passed as a page frame number; range is 0..max_pfn */
        bootmap_size = init_bootmem_node(NODE_DATA(0),
                        (bootmap_start >> PAGE_SHIFT), 0, max_pfn);
 
        /* Free all available memory, then mark bootmem-map as being in use. */
        efi_memmap_walk(filter_rsvd_memory, free_bootmem);
        reserve_bootmem(bootmap_start, bootmap_size, BOOTMEM_DEFAULT);
        
        find_initrd();
                     
        alloc_per_cpu_data();
}


(1)IA-32的初始化

setup_arch

->setup_memory(kernel_end);

  ->”/* Find free clusters, and init and free the bootmem accordingly.  */”

  ->bootmem_bootmap_pages(max_low_pfn);

  ->init_bootmem(bootmap_start, max_low_pfn);

  ->reserve_bootmem(PFN_PHYS(bootmap_start), bootmap_size, BOOTMEM_DEFAULT);

  ->reserve_bootmem(virt_to_phys((void *)initrd_start), INITRD_SIZE, BOOTMEM_DEFAULT);

 

setup_memory分析检测到的内存区,以找到低端内存区中最大的页帧号。由于高端内存处理太麻烦,因此对bootmem分配器无用。全局变量max_low_pfn保存了可映射的最高页的编号。内核会在启动日志中报告找到的内存数量。

/*
 * setup_memory (excerpt; the remainder of the function is elided below).
 *
 * Walks the memory cluster descriptors reached through hwrpb and records
 * the highest end page frame number of any usable cluster in max_low_pfn.
 */
setup_memory(void *kernel_end)
{
        struct memclust_struct * cluster;
        struct memdesc_struct * memdesc;
        unsigned long start_kernel_pfn, end_kernel_pfn;
        unsigned long bootmap_size, bootmap_pages, bootmap_start;
        unsigned long start, end;
        unsigned long i;
 
        /* Find free clusters, and init and free the bootmem accordingly.  */
        /* The descriptor table lives at mddt_offset bytes past hwrpb. */
        memdesc = (struct memdesc_struct *)
          (hwrpb->mddt_offset + (unsigned long) hwrpb);
 
        for_each_mem_cluster(memdesc, cluster, i) {
                printk("memcluster %lu, usage %01lx, start %8lu, end %8lu\n",
                       i, cluster->usage, cluster->start_pfn,
                       cluster->start_pfn + cluster->numpages);
 
                /* Bit 0 is console/PALcode reserved.  Bit 1 is
                   non-volatile memory -- we might want to mark
                   this for later.  */
                if (cluster->usage & 3)
                        continue;
 
                /* Track the largest end pfn seen among usable clusters. */
                end = cluster->start_pfn + cluster->numpages;
                if (end > max_low_pfn)
                        max_low_pfn = end;
        }
......
}


Dmesg显示的启动Log:

[    0.000000] 872MB HIGHMEM available.

[    0.000000] 887MB LOWMEM available.

[    0.000000]   mapped low ram: 0 - 377fe000

[    0.000000]   low ram: 0 - 377fe000

[    0.000000] BRK [0x01dcc000, 0x01dccfff] PGTABLE

[    0.000000] Zone ranges:

[    0.000000]   DMA      [mem 0x00001000-0x00ffffff]

[    0.000000]   Normal   [mem 0x01000000-0x377fdfff]

[    0.000000]   HighMem  [mem 0x377fe000-0x6dffffff]

基于该信息,setup_bootmem_allocator接下来负责发起所有必要的步骤,以初始化bootmem分配器。它首先调用通用函数init_bootmem,该函数是init_bootmem_core的一个前端。

/*
 * setup_memory (excerpt): the point where the bootmem allocator is
 * initialized. init_bootmem() receives the bitmap start and max_low_pfn,
 * and its return value is kept as the bitmap size.
 */
static void __init
setup_memory(void *kernel_end)
{    
... 
   /* Allocate the bootmap and mark the whole MM as reserved.  */
        bootmap_size = init_bootmem(bootmap_start, max_low_pfn);
...
}
 
/*
 * init_bootmem - front end of init_bootmem_core() for node 0
 * @start: page frame number where the allocation bitmap is placed
 * @pages: end page frame number of the managed range
 *
 * Records the global limits max_low_pfn and min_low_pfn, then registers
 * node 0's allocator covering page frames 0..@pages.
 *
 * Returns whatever init_bootmem_core() returns (the bitmap size in bytes).
 */
unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
{               
        max_low_pfn = pages;
        min_low_pfn = start;    
        return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}   
 
/*
 * Called once to set up the allocator itself.
 *
 * @bdata: allocator instance to initialize
 * @mapstart: page frame number at which the allocation bitmap is stored
 * @start: first page frame number of the managed range
 * @end: end page frame number of the managed range
 *
 * Returns the size of the allocation bitmap in bytes.
 */
static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
        unsigned long mapstart, unsigned long start, unsigned long end)
{
        unsigned long mapsize;
 
        mminit_validate_memmodel_limits(&start, &end);
        /* The bitmap is accessed through its virtual address from here on. */
        bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
        bdata->node_min_pfn = start;
        bdata->node_low_pfn = end;
        /* Register this instance with the list of bootmem allocators. */
        link_bootmem(bdata);
 
        /*
         * Initially all pages are reserved - setup_arch() has to
         * register free RAM areas explicitly.
         */
        mapsize = bootmap_bytes(end - start);
        /* 0xff sets every bit, i.e. marks every page as in use. */
        memset(bdata->node_bootmem_map, 0xff, mapsize);
 
        bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
                bdata - bootmem_node_data, start, mapstart, end, mapsize);
 
        return mapsize;
}


init_bootmem_core的目的在于执行bootmem分配器的第一个初始化步骤。先前检测到的低端内存页帧的范围输入到相应的bootmem_data_t实例中,这里是contig_bootmem_data。最初在位图contig_bootmem_data->node_bootmem_map中,所有的页都标记为已用。由于init_bootmem_core是一个体系结构无关的函数,它尚无法知道哪些页可用,哪些页不能用。因为体系结构方面的原因,有些页需要特殊的处理,例如IA-32系统上的0页。有些页已经使用,例如内核映像占用的页。实际可用的页必须由体系结构相关的代码显式标记出来。

该标记过程由两个特定于体系结构的函数完成。register_bootmem_low_pages(2.6.25中存在,3.18.3中不存在)通过将该位图中对应的比特位清零,释放所有潜在可用的内存页。

/*
 * setup_bootmem_allocator - IA-32 bootmem setup (2.6.25)
 *
 * Initializes the allocator for low memory, frees the usable low pages
 * and then reserves every region that is already in use: the range from
 * _text up to the end of the bitmap, page 0, the EBDA, and a number of
 * configuration-dependent areas.
 */
void __init setup_bootmem_allocator(void)//2.6.25
{
        unsigned long bootmap_size;
        /*
         * Initialize the boot-time allocator (with low memory only):
         */
        bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
 
        /* Clear the bitmap bits of all potentially usable low-memory pages. */
        register_bootmem_low_pages(max_low_pfn);
 
        /*
         * Reserve the bootmem bitmap itself as well. We do this in two
         * steps (first step was init_bootmem()) because this catches
         * the (very unlikely) case of us accidentally initializing the
         * bootmem allocator with an invalid RAM area.
         */
        reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
                         bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
                         BOOTMEM_DEFAULT);
 
        /*
         * reserve physical page 0 - it's a special BIOS page on many boxes,
         * enabling clean reboots, SMP operation, laptop functions.
         */
        reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
 
        /* reserve EBDA region, it's a 4K region */
        reserve_ebda_region();
 
    /* could be an AMD 768MPX chipset. Reserve a page  before VGA to prevent
       PCI prefetch into it (errata #56). Usually the page is reserved anyways,
       unless you have no PS/2 mouse plugged in. */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
            boot_cpu_data.x86 == 6)
             reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
 
#ifdef CONFIG_SMP
        /*
         * But first pinch a few for the stack/trampoline stuff
         * FIXME: Don't need the extra page at 4K, but need to fix
         * trampoline before removing it. (see the GDT stuff)
         */
        reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
#endif
#ifdef CONFIG_ACPI_SLEEP
        /*
         * Reserve low memory region for sleep support.
         */
        acpi_reserve_bootmem();
#endif
#ifdef CONFIG_X86_FIND_SMP_CONFIG
        /*
         * Find and reserve possible boot-time SMP configuration:
         */
        find_smp_config();
#endif
#ifdef CONFIG_BLK_DEV_INITRD
        /* Keep the initial ramdisk from being handed out. */
        reserve_initrd();
#endif
        numa_kva_reserve();
        reserve_crashkernel();
}
 
/*
 * register_bootmem_low_pages - free all usable RAM below @max_low_pfn
 * @max_low_pfn: upper page frame limit; anything at or above it is ignored
 *
 * Walks the e820 map and calls free_bootmem() for the whole pages of every
 * E820_RAM entry, clipped to @max_low_pfn.
 */
void __init register_bootmem_low_pages(unsigned long max_low_pfn)
{       
        int i;
 
        for (i = 0; i < e820.nr_map; i++) {
                unsigned long curr_pfn, last_pfn, size;
                /*
                 * Reserve usable low memory
                 */      
                if (e820.map[i].type != E820_RAM)
                        continue;
                /*
                 * We are rounding up the start address of usable memory:
                 */
                curr_pfn = PFN_UP(e820.map[i].addr);
                if (curr_pfn >= max_low_pfn)
                        continue;
                /*
                 * ... and at the end of the usable range downwards:
                 */
                last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
 
                /* Clip entries that extend beyond the low-memory limit. */
                if (last_pfn > max_low_pfn)
                        last_pfn = max_low_pfn;
 
                /*
                 * .. finally, did all the rounding and playing
                 * around just make the area go away?
                 */
                if (last_pfn <= curr_pfn)
                        continue;
 
                /* free_bootmem() takes a physical address and a byte count. */
                size = last_pfn - curr_pfn;
                free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
        }
}


IA-32系统上BIOS对该任务提供了支持,BIOS向内核提供了可用内存区的列表,即初始化过程中更早一点提供的e820映射。

由于bootmem分配器需要一些内存页管理分配位图,必须首先调用reserve_bootmem分配这些内存页。

但还有一些其他的内存区已经在使用中,必须相应标记起来。因此,还需要使用reserve_bootmem注册相应的页。需要注册的内存区的确切数目,高度依赖于内核配置。例如,需要保留0页,因为在许多计算机上该页是一个特殊的BIOS页,有些特定于计算机的功能需要该页才能运作正常。其他的reserve_bootmem调用则分配与内核配置相关的内存区,例如,用于ACPI数据或SMP启动时的配置。

/*
 * setup_memory (excerpt; the earlier part of the function is elided).
 *
 * Reserves the pages that hold the bootmem bitmap and, when an initial
 * ramdisk is configured and present, either reserves its memory or tries
 * to move it if it extends past the end of low memory.
 */
static void __init
setup_memory(void *kernel_end)
{
......
        /* The bitmap must never be handed out by the allocator it describes. */
        reserve_bootmem(PFN_PHYS(bootmap_start), bootmap_size,
                        BOOTMEM_DEFAULT);
        printk("reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
        
#ifdef CONFIG_BLK_DEV_INITRD
        initrd_start = INITRD_START;
        if (initrd_start) { 
                initrd_end = initrd_start+INITRD_SIZE;
                printk("Initial ramdisk at: 0x%p (%lu bytes)\n",
                       (void *) initrd_start, INITRD_SIZE);
                                       
                /* Does the ramdisk end beyond the last low-memory page? */
                if ((void *)initrd_end > phys_to_virt(PFN_PHYS(max_low_pfn))) {
                        if (!move_initrd(PFN_PHYS(max_low_pfn)))
                                printk("initrd extends beyond end of memory "
                                       "(0x%08lx > 0x%p)\ndisabling initrd\n",
                                       initrd_end,
                                       phys_to_virt(PFN_PHYS(max_low_pfn)));
                } else {
                        /* It fits: reserve the memory it occupies. */
                        reserve_bootmem(virt_to_phys((void *)initrd_start),
                                        INITRD_SIZE, BOOTMEM_DEFAULT);
                }
        }
#endif /* CONFIG_BLK_DEV_INITRD */
}


setup_arch

->setup_memory

  ->init_bootmem()

  ->reserve_bootmem

->initmem_init(void)

 ->setup_bootmem_allocator(); 

  ->register_bootmem_low_pages

(2)AMD64的初始化

//arch/x86/kernel/setup_64.c(2.6.25)

contig_initmem_init

  ->bootmem_bootmap_pages

  ->find_e820_area

  ->init_bootmem

  ->e820_register_active_regions

  ->reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);

 

//arch/ia64/mm/contig.c(3.18.3)

find_memory

->bootmem_bootmap_pages

->init_bootmem_node

->reserve_bootmem

 

  虽然AMD64上bootmem初始化的技术细节不同,但通用结构与IA-32的情况类似。这一次由contig_initmem_init负责分配任务。

  首先,bootmem_bootmap_pages计算bootmem位图所需的页的数目。随后利用BIOS的e820映射提供的信息,类似于IA-32,查找一个长度适当、可容纳该位图的连续内存区。

  然后使用init_bootmem将该信息填充到体系结构无关的bootmem数据结构中。如前所述,该函数将所有的页标记为已分配,而现在必须选出空闲页。free_bootmem_with_active_regions可以再次使用e820映射中的信息,按照BIOS报告的使用情况,释放所有实际空闲的内存区。最后,调用一次reserve_bootmem注册bootmem分配位图所需的空间。与IA-32相反,AMD64不再需要为遗留信息在内存中分配空间。

3. 对内核的接口

(1)分配内存

/*
 * Boot-time allocation front ends. Each macro fixes the alignment
 * (SMP_CACHE_BYTES or PAGE_SIZE) and the goal address at which the search
 * starts (BOOTMEM_LOW_LIMIT, or 0 for the *_low variants).
 *
 *   *_pages    - align to PAGE_SIZE instead of SMP_CACHE_BYTES
 *   *_node     - take an extra pgdat argument selecting the node
 *   *_nopanic  - map to the __alloc_*_nopanic() functions
 */
#define alloc_bootmem(x) \
        __alloc_bootmem(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_align(x, align) \
        __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_nopanic(x) \
        __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_pages(x) \
        __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_pages_nopanic(x) \
        __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_node(pgdat, x) \
        __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_node_nopanic(pgdat, x) \
        __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_pages_node(pgdat, x) \
        __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
        __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
                
/* Low-memory variants: the search goal is address 0. */
#define alloc_bootmem_low(x) \
        __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
#define alloc_bootmem_low_pages_nopanic(x) \
        __alloc_bootmem_low_nopanic(x, PAGE_SIZE, 0)
#define alloc_bootmem_low_pages(x) \
        __alloc_bootmem_low(x, PAGE_SIZE, 0)
#define alloc_bootmem_low_pages_node(pgdat, x) \
        __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)


内核提供了各种函数,用于在初始化期间分配内存。在UMA系统上有下列函数可用。

alloc_bootmem(size)和alloc_bootmem_pages(size)按指定大小在ZONE_NORMAL内存域分配内存。数据是对齐的,这使得内存或者从适用于L1高速缓存的理想位置开始,或者从页边界开始。

alloc_bootmem_low和alloc_bootmem_low_pages的工作方式类似于上述函数,只是从ZONE_DMA内存域分配内存。因此,只有需要DMA内存时,才应使用这两个函数。

基本上NUMA系统的API是相同的,但函数名增加了_node后缀。与UMA系统的函数相比,还需要一个额外的参数,指定用于内存分配的结点。

这些函数都是__alloc_bootmem的前端,后者将实际工作委托给__alloc_bootmem_nopanic。由于可以注册多个bootmem分配器,__alloc_bootmem_core会遍历所有的分配器,直至分配成功为止。

NUMA系统上,__alloc_bootmem_node则用于实现该API函数。首先,工作传递到__alloc_bootmem_core,尝试在该结点的bootmem分配器进行分配,如果失败,则后退到__alloc_bootmem并将尝试所有的结点。

/*
 * __alloc_bootmem - allocate boot memory, panicking on failure
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the search
 *
 * Thin wrapper that calls ___alloc_bootmem() with a limit of 0.
 */
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
                              unsigned long goal)
{
        unsigned long limit = 0;
                
        return ___alloc_bootmem(size, align, goal, limit);
}
 
/*
 * ___alloc_bootmem - allocate boot memory or panic
 *
 * Tries ___alloc_bootmem_nopanic() and returns its result when it is
 * non-NULL. Otherwise logs the failed size and panics.
 */
static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
                                        unsigned long goal, unsigned long limit)
{
        void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
 
        if (mem)                      
                return mem;
        /*
         * Whoops, we cannot satisfy the allocation request.
         */
        printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
        panic("Out of memory");
        /* Follows the panic() call; kept so every path returns a value. */
        return NULL;
} 
 
/*
 * ___alloc_bootmem_nopanic - allocate boot memory, returning NULL on failure
 *
 * Calls alloc_bootmem_core() with the requested goal. If that fails and a
 * non-zero goal was given, the goal is dropped and the allocation is tried
 * once more. Returns the allocated pointer, or NULL.
 */
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
                                              unsigned long align,
                                              unsigned long goal,
                                              unsigned long limit)
{
        void *ptr;
 
restart:
        ptr = alloc_bootmem_core(size, align, goal, limit);
        if (ptr)
                return ptr;
        /* Retry without the goal; goal is 0 afterwards, so this happens at most once. */
        if (goal) {
                goal = 0;
                goto restart;
        }
 
        return NULL;
}
 
/*
 * alloc_bootmem_core - try each registered bootmem allocator in turn
 *
 * Walks bdata_list and returns the first region that
 * alloc_bootmem_bdata() can provide, or NULL when no allocator can
 * satisfy the request.
 */
static void * __init alloc_bootmem_core(unsigned long size,
                                        unsigned long align,
                                        unsigned long goal,
                                        unsigned long limit)
{
        bootmem_data_t *bdata;
        void *region;
 
        /* If slab is already available, warn once and use kzalloc() instead. */
        if (WARN_ON_ONCE(slab_is_available()))
                return kzalloc(size, GFP_NOWAIT);
 
        list_for_each_entry(bdata, &bdata_list, list) {
                /* Skip allocators whose range ends at or below the goal. */
                if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
                        continue;
                /*
                 * Stop at the first allocator that starts at or above the limit.
                 * NOTE(review): presumably the list is ordered by ascending
                 * pfn, so later entries cannot qualify either -- confirm.
                 */
                if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
                        break;
 
                region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
                if (region)
                        return region;
        }
 
        return NULL;
}


所需分配内存的长度(x)未做改变直接传递给__alloc_bootmem,但内存对齐方式有两个选项。SMP_CACHE_BYTES会对齐数据,使之在大多数体系结构上能够理想地置于L1高速缓存中。PAGE_SIZE将数据对齐到页边界。后一种对齐方式用于分配一个或多个整页,但前者在分配涉及部分页时能够产生更好的结果。

低端DMA内存与普通内存的区别在于其起始地址。搜索适用于DMA的内存从地址0开始,而请求普通内存时则从MAX_DMA_ADDRESS向上。

__alloc_bootmem_core函数的功能相对而言很广泛。该函数不仅能够分配整个的内存页,还能分配页的一部分。

该函数执行下列操作:

(1)从goal开始,扫描位图,查找满足分配请求的空闲内存区。

(2)如果目标页紧接着上一次分配的页,即bootmem_data->last_pos,内核会检查bootmem_data->last_offset,判断所需的内存是否能够在上一页分配或从上一页开始分配。

(3)新分配的页在位图对应的比特位设置为1,最后一页的编号也保存在bootmem_data->last_pos。如果该页未完全分配,则相应的偏移量保存在bootmem_data->last_offset;否则,该值设置为0。

 

(2)释放内存

内核提供了free_bootmem函数来释放内存。它需要两个参数:需要释放的内存区的起始地址和长度。不出意外,NUMA系统上等价函数的名称为free_bootmem_node,它需要一个额外的参数来指定结点。

/**
 * free_bootmem_node - mark a page range as usable
 * @pgdat: node the range resides on
 * @physaddr: starting address of the range
 * @size: size of the range in bytes
 *
 * Partial pages will be considered reserved and left as they are.
 *
 * The range must reside completely on the specified node.
 */
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
                              unsigned long size)
{
        unsigned long start, end;
 
        kmemleak_free_part(__va(physaddr), size);
 
        /*
         * Round the start up and the end down so that only pages lying
         * completely inside the range are touched.
         */
        start = PFN_UP(physaddr);
        end = PFN_DOWN(physaddr + size);
 
        /* Operates on this node's own bootmem instance only. */
        mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
}
 
/**
 * free_bootmem - mark a page range as usable
 * @physaddr: starting physical address of the range
 * @size: size of the range in bytes
 *
 * Partial pages will be considered reserved and left as they are.
 *
 * The range must be contiguous but may span node boundaries.
 */
void __init free_bootmem(unsigned long physaddr, unsigned long size)
{
        unsigned long start, end;
 
        kmemleak_free_part(__va(physaddr), size);
 
        /*
         * Round the start up and the end down so that only pages lying
         * completely inside the range are touched.
         */
        start = PFN_UP(physaddr);
        end = PFN_DOWN(physaddr + size);
 
        /* No node is named here; mark_bootmem() gets only the pfn range. */
        mark_bootmem(start, end, 0, 0);
}

 

4. 停用bootmem分配器

在系统初始化进行到伙伴系统分配器能够承担内存管理的责任后,必须停用bootmem分配器,毕竟不能同时用两个分配器管理内存。在UMA和NUMA系统上,停用分别由free_all_bootmem和free_all_bootmem_node完成,在伙伴系统建立之后,特定于体系结构的初始化代码还需要调用这两个函数。

 

/**
 * free_all_bootmem - release free pages to the buddy allocator
 *
 * Resets the managed-page counters of all zones, runs
 * free_all_bootmem_core() for every registered bootmem allocator and adds
 * the number of released pages to totalram_pages.
 *
 * Returns the number of pages actually released.
 */
unsigned long __init free_all_bootmem(void)
{
        unsigned long total_pages = 0; 
        bootmem_data_t *bdata;
 
        reset_all_zones_managed_pages();
 
        /* Every allocator on bdata_list hands its free pages over. */
        list_for_each_entry(bdata, &bdata_list, list)
                total_pages += free_all_bootmem_core(bdata);
 
        totalram_pages += total_pages; 
 
        return total_pages;   
}
 
/*
 * free_all_bootmem_core - release one allocator's free pages
 * @bdata: bootmem instance to drain
 *
 * Scans the bitmap from node_min_pfn to node_low_pfn and passes every page
 * whose bit is clear to __free_pages_bootmem(). Afterwards the pages that
 * hold the bitmap itself are released as well.
 *
 * Returns the number of pages released, including the bitmap pages, or 0
 * when the instance has no bitmap.
 */
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
        struct page *page;
        unsigned long *map, start, end, pages, count = 0;
 
        if (!bdata->node_bootmem_map)
                return 0;
 
        map = bdata->node_bootmem_map;
        start = bdata->node_min_pfn;
        end = bdata->node_low_pfn;
 
        bdebug("nid=%td start=%lx end=%lx\n",
                bdata - bootmem_node_data, start, end);
 
        while (start < end) {
                unsigned long idx, vec;
                unsigned shift;
 
                /* idx is the bit index in the bitmap, shift its offset within one word. */
                idx = start - bdata->node_min_pfn;
                shift = idx & (BITS_PER_LONG - 1);
                /*
                 * vec holds at most BITS_PER_LONG map bits,
                 * bit 0 corresponds to start.
                 */
                /* Inverted, so a set bit in vec means "page is free". */
                vec = ~map[idx / BITS_PER_LONG];
 
                if (shift) {
                        /* Unaligned position: drop the bits below start ... */
                        vec >>= shift;
                        /* ... and refill the top from the next bitmap word. */
                        if (end - start >= BITS_PER_LONG)
                                vec |= ~map[idx / BITS_PER_LONG + 1] <<
                                        (BITS_PER_LONG - shift);
                }
                /*
                 * If we have a properly aligned and fully unreserved
                 * BITS_PER_LONG block of pages in front of us, free
                 * it in one go.
                 */
                if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
                        int order = ilog2(BITS_PER_LONG);
 
                        __free_pages_bootmem(pfn_to_page(start), order);
                        count += BITS_PER_LONG;
                        start += BITS_PER_LONG;
                } else {
                        unsigned long cur = start;
 
                        /* Advance start to the next word boundary, then free page by page. */
                        start = ALIGN(start + 1, BITS_PER_LONG);
                        while (vec && cur != start) {
                                if (vec & 1) {
                                        page = pfn_to_page(cur);
                                        __free_pages_bootmem(page, 0);
                                        count++;
                                }
                                vec >>= 1;
                                ++cur;
                        }
                }
        }
 
        /* The bitmap is no longer needed: release the pages it occupies. */
        page = virt_to_page(bdata->node_bootmem_map);
        pages = bdata->node_low_pfn - bdata->node_min_pfn;
        pages = bootmem_bootmap_pages(pages);
        count += pages;
        while (pages--)
                __free_pages_bootmem(page++, 0);
 
        bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
 
        return count;
}
 
/*
 * __free_pages_bootmem - hand a block of boot-time pages to __free_pages()
 * @page: first page of the block
 * @order: the block consists of 1 << @order pages
 *
 * Clears the reserved flag and zeroes the page count of every page in the
 * block, adds the block to the zone's managed_pages counter, and releases
 * it through __free_pages().
 */
void __init __free_pages_bootmem(struct page *page, unsigned int order)
{                       
        unsigned int nr_pages = 1 << order;
        struct page *p = page;
        unsigned int loop;
                                
        prefetchw(p);                   
        /* Handles all pages but the last, prefetching one page ahead. */
        for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
                prefetchw(p + 1);       
                __ClearPageReserved(p);
                set_page_count(p, 0);
        }                       
        /* The last page is handled outside the loop, with no further prefetch. */
        __ClearPageReserved(p);
        set_page_count(p, 0);
        
        page_zone(page)->managed_pages += nr_pages;
        /* Give the first page a reference for __free_pages() to drop. */
        set_page_refcounted(page);
        __free_pages(page, order);
}  
 
/*
 * __free_pages - drop a reference and free the block if it was the last one
 * @page: first page of the block
 * @order: the block consists of 1 << @order pages
 *
 * Nothing is freed unless put_page_testzero() reports true. A single page
 * goes through free_hot_cold_page(), larger blocks through
 * __free_pages_ok().
 */
void __free_pages(struct page *page, unsigned int order)
{
        if (put_page_testzero(page)) {
                if (order == 0)
                        free_hot_cold_page(page, false);
                else
                        __free_pages_ok(page, order);
        }
}
 
EXPORT_SYMBOL(__free_pages);

首先扫描bootmem分配器页位图,释放每个未用的页。到伙伴系统的接口是__free_pages_bootmem函数,该函数对每个空闲页调用。该函数内部依赖于标准函数__free_pages。它使得这些页并入伙伴系统的数据结构,在其中作为空闲页管理,可用于分配。

在页位图已经完全扫描之后,它占据的内存空间也必须释放。此后,只有伙伴系统可用于内存分配。

 

5. 释放初始化数据

许多内核代码块和数据表只在系统初始化阶段需要。例如,对于链接到内核的驱动程序而言,则不必要在内核内存中保持其数据结构的初始化例程。在结构建立之后,这些例程就不再需要。驱动程序用于检测其设备的硬件数据,在相关的设备已经识别之后,也不再需要。

内核提供了两个属性(__init和__initdata)用于标记初始化函数和数据。这些必须置于函数或数据的声明之前。

__init属性插入到函数声明中返回类型和函数名之间,例如:

./include/xen/xen-ops.h:static inline efi_system_table_t __init *xen_efi_probe(void);

数据段也可以标记为初始化数据:

./include/linux/init.h:extern char __initdata boot_command_line[];

__init和__initdata不能使用普通的C语言实现,因此内核必须再一次借助于特殊的GNU C编译器语句。

//3.18.3 

/*
 * Section annotations (3.18.3). Each macro places the annotated function
 * or data in the named section; __init additionally applies __cold and
 * notrace.
 */
#define __init          __section(.init.text) __cold notrace

#define __initdata      __section(.init.data)

#define __initconst     __constsection(.init.rodata)

#define __exitdata      __section(.exit.data)

#define __exit_call     __used __section(.exitcall.exit)

__attribute__是一个特殊的GNU C关键字,属性即通过该关键字使用。__section属性用于通知编译器将随后的数据或函数分别写入二进制文件的.init.data和.init.text段。前缀__cold还通知编译器,通向该函数的代码路径可能性比较低,即该函数不会经常调用,对初始化函数通常这样。

Readelf工具可以用于显示内核映像的各个段:

root@linux:/study/linux-git/linux-3.18.3# readelf --sections vmlinux

There are 78 section headers, starting at offset 0xe2b808c:

 

Section Headers:

  [Nr] Name              Type            Addr     Off    Size   ES Flg Lk Inf Al

  [ 0]                   NULL            00000000 000000 000000 00      0   0  0

  [ 1] .text             PROGBITS        c1000000 001000 7e64e2 00  AX  0   0 4096

  [ 2] .rel.text         REL             00000000 9d8cc9c 20db10 08   I 76   1  4

  [ 3] .notes            NOTE            c17e64e4 7e74e4 000190 00  AX  0   0  4

  [ 4] .rel.notes        REL             00000000 9f9a7ac 000010 08   I 76   3  4

  [ 5] __ex_table        PROGBITS        c17e6680 7e7680 001198 00   A  0   0  8

  [ 6] .rel__ex_table    REL             00000000 9f9a7bc 002330 08   I 76   5  4

  [ 7] .rodata           PROGBITS        c17e8000 7e9000 30229e 00   A  0   0 64

  [ 8] .rel.rodata       REL             00000000 9f9caec 0efee0 08   I 76   7  4

  [ 9] __bug_table       PROGBITS        c1aea2a0 aeb2a0 008a9c 00   A  0   0  1

  [10] .rel__bug_table   REL             00000000 a08c9cc 00b8d0 08   I 76   9  4

  [11] .pci_fixup        PROGBITS        c1af2d3c af3d3c 001f60 00   A  0   0  4

  [12] .rel.pci_fixup    REL             00000000 a09829c 000fb0 08   I 76  11  4

  [13] .builtin_fw       PROGBITS        c1af4c9c af5c9c 0000cc 00   A  0   0  4

  [14] .rel.builtin_fw   REL             00000000 a09924c 000110 08   I 76  13  4

  [15] .tracedata        PROGBITS        c1af4d68 af5d68 000024 00   A  0   0  1

  [16] .rel.tracedata    REL             00000000 a09935c 000030 08   I 76  15  4

  [17] __ksymtab         PROGBITS        c1af4d8c af5d8c 008b70 00   A  0   0  4

  [18] .rel__ksymtab     REL             00000000 a09938c 0116e0 08   I 76  17  4

  [19] __ksymtab_gpl     PROGBITS        c1afd8fc afe8fc 007690 00   A  0   0  4

  [20] .rel__ksymtab_gpl REL             00000000 a0aaa6c 00ed20 08   I 76  19  4

  [21] __kcrctab         PROGBITS        c1b04f8c b05f8c 0045b8 00   A  0   0  4

  [22] .rel__kcrctab     REL             00000000 a0b978c 008b70 08   I 76  21  4

  [23] __kcrctab_gpl     PROGBITS        c1b09544 b0a544 003b48 00   A  0   0  4

  [24] .rel__kcrctab_gpl REL             00000000 a0c22fc 007690 08   I 76  23  4

  [25] __ksymtab_strings PROGBITS        c1b0d08c b0e08c 026121 00   A  0   0  1

  [26] __init_rodata     PROGBITS        c1b331b0 b341b0 000050 00   A  0   0  4

  [27] .rel__init_rodata REL             00000000 a0c998c 000078 08   I 76  26  4

  [28] __param           PROGBITS        c1b33200 b34200 001530 00   A  0   0  4

  [29] .rel__param       REL             00000000 a0c9a04 001f88 08   I 76  28  4

  [30] __modver          PROGBITS        c1b34730 b35730 0008d0 00   A  0   0  4

  [31] .rel__modver      REL             00000000 a0cb98c 000128 08   I 76  30  4

  [32] .data             PROGBITS        c1b35000 b36000 0c7dc0 00  WA  0   0 4096

  [33] .rel.data         REL             00000000 a0cbab4 061e20 08   I 76  32  4

  [34] .vvar             PROGBITS        c1bfd000 bfe000 001000 00  WA  0   0 16

  [35] .init.text        PROGBITS        c1bfe000 bff000 057b76 00  AX  0   0  4

  [36] .rel.init.text    REL             00000000 a12d8d4 0393c8 08   I 76  35  4

  [37] .init.data        PROGBITS        c1c56000 c57000 06c48c 00  WA  0   0 4096

  [38] .rel.init.data    REL             00000000 a166c9c 047f08 08   I 76  37  4

  [39] .x86_cpu_dev.init PROGBITS        c1cc248c cc348c 00001c 00   A  0   0  4

  [40] .rel.x86_cpu_dev. REL             00000000 a1aeba4 000038 08   I 76  39  4

  [41] .x86_intel_mid_de PROGBITS        c1cc24a8 cc34a8 000048 00   A  0   0  4

  [42] .rel.x86_intel_mi REL             00000000 a1aebdc 000090 08   I 76  41  4

  [43] .parainstructions PROGBITS        c1cc24f0 cc34f0 0086b0 00   A  0   0  4

  [44] .rel.parainstruct REL             00000000 a1aec6c 0086b0 08   I 76  43  4

  [45] .altinstructions  PROGBITS        c1ccaba0 ccbba0 007b0c 00   A  0   0  1

  [46] .rel.altinstructi REL             00000000 a1b731c 00a180 08   I 76  45  4

  [47] .altinstr_replace PROGBITS        c1cd26ac cd36ac 001f3f 00  AX  0   0  1

  [48] .iommu_table      PROGBITS        c1cd45ec cd55ec 000050 00   A  0   0  4

  [49] .rel.iommu_table  REL             00000000 a1c149c 000060 08   I 76  48  4

  [50] .apicdrivers      PROGBITS        c1cd4640 cd5640 000004 00  WA  0   0  4

  [51] .rel.apicdrivers  REL             00000000 a1c14fc 000008 08   I 76  50  4

  [52] .exit.text        PROGBITS        c1cd4648 cd5648 002b0a 00  AX  0   0  1

  [53] .rel.exit.text    REL             00000000 a1c1504 002c68 08   I 76  52  4

  [54] .data..percpu     PROGBITS        c1cd8000 cd9000 008f80 00  WA  0   0 4096

  [55] .rel.data..percpu REL             00000000 a1c416c 000050 08   I 76  54  4

  [56] .smp_locks        PROGBITS        c1ce1000 ce2000 00a000 00   A  0   0  4

  [57] .rel.smp_locks    REL             00000000 a1c41bc 0124e0 08   I 76  56  4

  [58] .bss              NOBITS          c1ceb000 cec000 0cb000 00  WA  0   0 4096

  [59] .brk              NOBITS          c1db6000 cec000 284000 00  WA  0   0  1

  [60] .comment          PROGBITS        00000000 cec000 000024 01  MS  0   0  1

  [61] .debug_aranges    PROGBITS        00000000 cec028 016d30 00      0   0  8

  [62] .rel.debug_arange REL             00000000 a1d669c 00dd10 08   I 76  61  4

  [63] .debug_info       PROGBITS        00000000 d02d58 762b8e9 00      0   0  1

  [64] .rel.debug_info   REL             00000000 a1e43ac 39ce1c8 08   I 76  63  4

  [65] .debug_abbrev     PROGBITS        00000000 832e641 32d1c8 00      0   0  1

  [66] .debug_line       PROGBITS        00000000 865b809 754116 00      0   0  1

  [67] .rel.debug_line   REL             00000000 dbb2574 007768 08   I 76  66  4

  [68] .debug_frame      PROGBITS        00000000 8daf920 1b0df4 00      0   0  4

  [69] .rel.debug_frame  REL             00000000 dbb9cdc 097350 08   I 76  68  4

  [70] .debug_str        PROGBITS        00000000 8f60714 2cf407 01  MS  0   0  1

  [71] .debug_loc        PROGBITS        00000000 922fb1b 5bfdb1 00      0   0  1

  [72] .rel.debug_loc    REL             00000000 dc5102c 47aad0 08   I 76  71  4

  [73] .debug_ranges     PROGBITS        00000000 97ef8d0 1df588 00      0   0  8

  [74] .rel.debug_ranges REL             00000000 e0cbafc 1ec590 08   I 76  73  4

  [75] .shstrtab         STRTAB          00000000 99cee58 00028a 00      0   0  1

  [76] .symtab           SYMTAB          00000000 99cf0e4 196090 10     77 67737  4

  [77] .strtab           STRTAB          00000000 9b65174 227b27 00      0   0  1

Key to Flags:

  W (write), A (alloc), X (execute), M (merge), S (strings)

  I (info), L (link order), G (group), T (TLS), E (exclude), x (unknown)

  O (extra OS processing required) o (OS specific), p (processor specific)

root@linux:/study/linux-git/linux-3.18.3# 

为从内存中释放初始化数据,内核不必知道数据的性质,即哪些数据和函数保存在内存中以及它们的用途都是完全不相干的。唯一相关的信息是这些数据和函数在内存中开始和结束的地址。

由于该信息在编译时无法得到,它是内核在链接时插入的。为提供该信息,内核定义了一对变量__init_begin和__init_end。

free_initmem负责释放用于初始化的内存区,并将相关的页返回给伙伴系统。在启动过程刚好结束时会调用该函数,紧接其后init作为系统中的第一个进程启动。启动日志包含了一条信息,指出释放了多少内存:

[    0.158495] Freeing SMP alternatives memory: 40K (c1ce1000 - c1ceb000)

 

 

 

你可能感兴趣的:(3.4.3 启动过程期间的内存管理)