在内核启动期间,伙伴系统内存管理器还没有建立之前,内核此时也要分配内存以及创建一些用于建立内存管理等机制的数据结构,此时内存分配和管理就是由bootmem内存分配器来完成的。
bootmem的建立要求就是简单,越简单越好,因为一旦伙伴系统建立之后,就不需要bootmem了,因此对性能和通用性等要服从一切从简的原则。在了解这个分配器之后,就会知道它真的很简单。
该分配器使用一个位图来管理页,位图比特位的数目与系统中物理内存页的数目相同,比特位为1时,表示这个页已经分配,为0时,表示当前指示的页是空闲的。在需要分配内存时,分配器扫描整个位图,直到找到一个能够提供足够连续页的位置。
下面分析一下这个分配器。
一,前提
在这个分配器被建立之前,先了解一下内核此时是一个什么样的状态,主要说内存方面的。
内核在检测系统可用内存之后,检测结果被存入一个数组之中,其结构如下:
struct e820map{ _u32 nr_map; struct e820entry map[E820MAX]; } struct e820entry{ _u64 addr; _u64 size; _u32 type; } __attribute__((packed));在用中断检测可用内存之后,内存被存入e820map e820变量中,然后根据这个数组,确定下面一些值:
/*
 * State of one boot-time (bootmem) allocator instance.
 *
 * The allocator tracks pages with a bitmap in which a set bit means
 * "reserved" and a clear bit means "free".
 */
typedef struct bootmem_data {
	/* Physical address of the start of the managed range (stored as PFN_PHYS(start)). */
	unsigned long node_boot_start;
	/* Page frame number bounding the managed range; scans treat it as an exclusive end. */
	unsigned long node_low_pfn;
	/* Virtual address of the allocation bitmap; NULL on nodes without memory. */
	void *node_bootmem_map;
	/* Byte offset inside page 'last_pos' up to which the previous allocation extends. */
	unsigned long last_offset;
	/* Bitmap index of the last page used by the previous allocation. */
	unsigned long last_pos;
	/* Previous allocation point (stored as PFN_PHYS(start)); used to speed up searching. */
	unsigned long last_success;
	/* Links this allocator into the global bdata_list. */
	struct list_head list;
} bootmem_data_t;
node_boot_start:这个字段保存了系统中第一个页的物理地址(由页帧编号经PFN_PHYS转换得到),这一般都是0;
node_low_pfn:可以被直接管理的物理地址空间中最后一页的编号;
node_bootmem_map:指向分配位图的内存指针,前面说这种分配方式主要是维护一个大的位图。
last_offset:上一次分配的页内的偏移,上一次分配的页的编号由last_pos指定,而last_offset指定该页内已经分配到的偏移。
last_success:指定位图中上一次成功分配内存的位置,下一个分配从这里开始分配。
list:因为内存并不都是连续的,对于不连续的内存,系统需要多个bootmem分配器,所有的分配器都保存在一个链表中,表头由全局变量bdata_list指定。
void __init setup_bootmem_allocator(void) { unsigned long bootmap_size; /* * Initialize the boot-time allocator (with low memory only): */ bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);调用init_bootmem函数完成初始化,这个函数会调用init_bootmem_core函数完成初始化。
/*
 * init_bootmem - initialise the boot-time allocator for node 0.
 * @start: page frame number recorded as min_low_pfn and passed on as the
 *         location of the allocation bitmap
 * @pages: page frame number recorded as max_low_pfn and passed on as the
 *         end of the managed range
 *
 * Returns whatever init_bootmem_core() returns (the bitmap size).
 */
unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
{
	unsigned long bitmap_bytes;

	/* Publish the low-memory bounds in the globals. */
	min_low_pfn = start;
	max_low_pfn = pages;

	/* The managed range itself always begins at page frame 0. */
	bitmap_bytes = init_bootmem_core(NODE_DATA(0), start, 0, pages);

	return bitmap_bytes;
}
这里的两个参数需要注意,是由min_low_pfn和max_low_pfn传递而来,表示低内存域的最小和最大的页帧编号。
在下面的函数实现中,可以看到mapstart的值就是min_low_pfn,end的值是max_low_pfn,而start的值为0,这个值要赋给bootmem_data_t结构的node_boot_start字段,参数中的NODE_DATA(0),表示当前内存结点。
static unsigned long __init init_bootmem_core(pg_data_t *pgdat, unsigned long mapstart, unsigned long start, unsigned long end) { bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize; bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));//内存的最开始,由分配位图占用一部分 bdata->node_boot_start = PFN_PHYS(start); bdata->node_low_pfn = end;//注意这个赋值 link_bootmem(bdata); /* * Initially all pages are reserved - setup_arch() has to * register free RAM areas explicitly. */ mapsize = get_mapsize(bdata); memset(bdata->node_bootmem_map, 0xff, mapsize); return mapsize; }PFN_PHYS宏是将页帧的编号转换为对应的页内的物理地址,该操作通过左移页内偏移的位数来达到。
/*
 * Convert a page frame number into a physical address by shifting it
 * left by PAGE_SHIFT (a shift of 12 bits in the setup discussed here).
 */
#define PFN_PHYS(x) ((x) << PAGE_SHIFT)
/*
 * link_bootmem - insert an allocator into bdata_list.
 * @bdata: allocator to link in
 *
 * The list is kept sorted by ascending node_boot_start: the new entry is
 * placed in front of the first existing entry with a larger start
 * address, or at the tail when there is none.
 */
static void __init link_bootmem(bootmem_data_t *bdata)
{
	/* Default insertion point: the list head, i.e. append at the tail. */
	struct list_head *insert_before = &bdata_list;
	bootmem_data_t *cur;

	list_for_each_entry(cur, &bdata_list, list) {
		if (bdata->node_boot_start < cur->node_boot_start) {
			insert_before = &cur->list;
			break;
		}
	}

	/*
	 * Adding "before the head" also covers the empty list, where it
	 * yields the same single-element list as list_add() would.
	 */
	list_add_tail(&bdata->list, insert_before);
}
/*
 * get_mapsize - size in bytes of the allocation bitmap for @bdata.
 *
 * One bit is needed per page between node_boot_start and node_low_pfn;
 * the byte count is rounded up to a multiple of sizeof(long).
 */
static unsigned long __init get_mapsize(bootmem_data_t *bdata)
{
	unsigned long first_pfn = PFN_DOWN(bdata->node_boot_start);
	unsigned long nr_pages = bdata->node_low_pfn - first_pfn;
	unsigned long nr_bytes = (nr_pages + 7) / 8;	/* 8 pages per byte, rounded up */

	return ALIGN(nr_bytes, sizeof(long));
}
在前面已经初始化了一个位图,该位图的位置从min_low_pfn开始占用,其实就是被内核映射之后的第一个页。但是标记了所有的内存页都是已经被使用了,这时系统中就不存在可用的内存了,需要从刚标识的内存位图中释放一些潜在的、可用的内存。这通过调用register_bootmem_low_pages函数完成。
register_bootmem_low_pages(max_low_pfn); /* * Reserve the bootmem bitmap itself as well. We do this in two * steps (first step was init_bootmem()) because this catches * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text)); /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. */ reserve_bootmem(0, PAGE_SIZE); /* reserve EBDA region, it's a 4K region */ reserve_ebda_region(); /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent PCI prefetch into it (errata #56). Usually the page is reserved anyways, unless you have no PS/2 mouse plugged in. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 == 6) reserve_bootmem(0xa0000 - 4096, 4096); #ifdef CONFIG_SMP /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ reserve_bootmem(PAGE_SIZE, PAGE_SIZE); #endif #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. 
*/ acpi_reserve_bootmem(); #endif #ifdef CONFIG_X86_FIND_SMP_CONFIG /* * Find and reserve possible boot-time SMP configuration: */ find_smp_config(); #endif numa_kva_reserve(); #ifdef CONFIG_BLK_DEV_INITRD if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; unsigned long ramdisk_end = ramdisk_image + ramdisk_size; unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; if (ramdisk_end <= end_of_lowmem) { reserve_bootmem(ramdisk_image, ramdisk_size); initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; } else { printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", ramdisk_end, end_of_lowmem); initrd_start = 0; } } #endif reserve_crashkernel(); }下面是register_bootmem_low_pages的代码。
/*
 * register_bootmem_low_pages - hand usable low memory to the bootmem allocator.
 * @max_low_pfn: page frame number above which memory is not registered
 *
 * With EFI enabled the EFI memory map is walked instead. Otherwise every
 * E820_RAM entry is trimmed to whole pages below @max_low_pfn and freed
 * with free_bootmem().
 */
void __init register_bootmem_low_pages(unsigned long max_low_pfn)
{
	int idx;

	if (efi_enabled) {
		efi_memmap_walk(free_available_memory, NULL);
		return;
	}

	for (idx = 0; idx < e820.nr_map; idx++) {
		unsigned long first_pfn, end_pfn;

		/* Only usable RAM is of interest. */
		if (e820.map[idx].type != E820_RAM)
			continue;

		/* Round the start of the region up to a page boundary. */
		first_pfn = PFN_UP(e820.map[idx].addr);
		if (first_pfn >= max_low_pfn)
			continue;

		/* Round the end down and clamp it to the low-memory limit. */
		end_pfn = PFN_DOWN(e820.map[idx].addr + e820.map[idx].size);
		if (end_pfn > max_low_pfn)
			end_pfn = max_low_pfn;

		/* Rounding may have left no whole page at all. */
		if (end_pfn <= first_pfn)
			continue;

		free_bootmem(PFN_PHYS(first_pfn), PFN_PHYS(end_pfn - first_pfn));
	}
}
前面说了由BIOS的中断给我们提供了可用的内存区列表,并且存在变量e820中,那么我们应该要将e820中标识的可用的内存列表都在内存分配器中标识为可用的内存。这个函数就是完成这个功能的,它通过对列表的遍历,找到每一个可用的内存域所在的页,然后标识该页可用。标记为可用是通过调用free_bootmem函数完成的。
/*
 * free_bootmem - mark a physical range as free in node 0's bootmem bitmap.
 * @addr: physical start address of the range
 * @size: length of the range in bytes
 */
void __init free_bootmem(unsigned long addr, unsigned long size)
{
	bootmem_data_t *node_data = NODE_DATA(0)->bdata;

	free_bootmem_core(node_data, addr, size);
}

/*
 * free_bootmem_core - clear the bitmap bits of the pages inside a range.
 * @bdata: allocator whose bitmap is updated
 * @addr:  physical start address of the range
 * @size:  length of the range in bytes; must be non-zero
 *
 * The start is rounded up and the end rounded down, so pages that are
 * only partially covered stay reserved. Freeing a page whose bit is
 * already clear triggers BUG().
 */
static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
				     unsigned long size)
{
	unsigned long first_bit, end_bit, bit;

	BUG_ON(!size);
	BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn);

	/* Let the next search start no later than the memory just freed. */
	if (addr < bdata->last_success)
		bdata->last_success = addr;

	/* Start rounds up, end rounds down: whole pages only. */
	first_bit = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
	end_bit = PFN_DOWN(addr + size - bdata->node_boot_start);

	bit = first_bit;
	while (bit < end_bit) {
		if (unlikely(!test_and_clear_bit(bit, bdata->node_bootmem_map)))
			BUG();
		bit++;
	}
}
这个函数其实是释放内存的函数,就是释放从addr开始、长度为size的区间内的内存页:起始地址向上取整、结束地址向下取整,因此只有完全落在该区间内的页才会被标记为空闲。
在上面把系统的可用内存都标记为可用,但此时系统正在使用一些内存,需要把这些内存相应的标记出来,这通过调用reserve_bootmem函数完成。
#define reserve_bootmem(addr, size) \ reserve_bootmem_node(NODE_DATA(0), (addr), (size)) void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { reserve_bootmem_core(pgdat->bdata, physaddr, size); } static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) { unsigned long sidx, eidx; unsigned long i; /* * round up, partially reserved pages are considered * fully reserved. */ BUG_ON(!size); BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn); BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn); sidx = PFN_DOWN(addr - bdata->node_boot_start); eidx = PFN_UP(addr + size - bdata->node_boot_start); for (i = sidx; i < eidx; i++) if (test_and_set_bit(i, bdata->node_bootmem_map)) { #ifdef CONFIG_DEBUG_BOOTMEM printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); #endif } }
这个函数和前面的有些类似,就是将对应的页标记为已经使用。注意这个内存区域的计算,和前面的计算是相反的:前面(释放时)起始地址向上取整、结束地址向下取整,只处理完全包含在给定区域内的内存页;这里(保留时)起始地址向下取整、结束地址向上取整,只要与给定区域有部分重叠的内存页都会被整页保留。
五,内核分配内存
内核提供一些函数,用于向bootmem分配器索要内存,提供了很多系列的接口,如alloc_bootmem(size)、alloc_bootmem_pages(size)等等,这些接口最终调用__alloc_bootmem函数完成分配。
/*
 * __alloc_bootmem - allocate boot memory, panicking on failure.
 * @size:  number of bytes requested
 * @align: requested alignment
 * @goal:  preferred starting address
 *
 * Thin wrapper around __alloc_bootmem_nopanic(): a successful result is
 * returned unchanged, a failed one is logged and followed by panic().
 */
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
			      unsigned long goal)
{
	void *mem = __alloc_bootmem_nopanic(size, align, goal);

	if (!mem) {
		/* Whoops, we cannot satisfy the allocation request. */
		printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
		panic("Out of memory");
	}

	/* Still NULL here in the (unexpected) case that panic() returned. */
	return mem;
}
这个函数也是__alloc_bootmem_nopanic函数的一个前端。
/*
 * __alloc_bootmem_nopanic - allocate boot memory from the first allocator
 * that can satisfy the request.
 * @size:  number of bytes requested
 * @align: requested alignment
 * @goal:  preferred starting address
 *
 * Walks bdata_list in order and asks each allocator in turn, with no
 * address limit. Returns the allocated memory, or NULL when no allocator
 * succeeds.
 */
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
				      unsigned long goal)
{
	bootmem_data_t *bdata;
	void *mem = NULL;

	list_for_each_entry(bdata, &bdata_list, list) {
		mem = __alloc_bootmem_core(bdata, size, align, goal, 0);
		if (mem)
			break;
	}

	return mem;
}
这个函数会实际的分配,之前我们知道多个分配器被一个全局变量链接到一个链表上,这里会遍历整个链表,调用__alloc_bootmem_core来分配内存,这个函数比较长,函数所需要参数有
/*
 * __alloc_bootmem_core - allocate memory from one bootmem allocator.
 * @bdata: allocator to allocate from
 * @size:  number of bytes requested; zero triggers BUG()
 * @align: requested alignment; must be zero or a power of two
 * @goal:  preferred starting address; the scan begins there and falls
 *         back to lower pages when nothing is found above it
 * @limit: when non-zero, upper bound for the allocation
 *
 * Scans the bitmap for a run of free pages large enough for @size,
 * reserves it and returns its virtual address with the memory zeroed.
 * A request with sub-page alignment may start inside the partially used
 * last page of the previous allocation. Returns NULL when the allocator
 * has no bitmap, starts at or above @limit, or has no suitable free run.
 */
void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit)
{
	unsigned long offset, remaining_size, areasize, preferred;
	unsigned long i, start = 0, incr, eidx, end_pfn;
	void *ret;

	if (!size) {
		printk("__alloc_bootmem_core(): zero-sized request\n");
		BUG();
	}
	/* A value with more than one bit set is not a valid alignment. */
	BUG_ON(align & (align-1));

	if (limit && bdata->node_boot_start >= limit)
		return NULL;

	/* on nodes without memory - bootmem_map is NULL */
	if (!bdata->node_bootmem_map)
		return NULL;

	/* From here on 'limit' is a page frame number, not a byte address. */
	end_pfn = bdata->node_low_pfn;
	limit = PFN_DOWN(limit);
	if (limit && end_pfn > limit)
		end_pfn = limit;

	/* eidx: exclusive end of the scan, as a bitmap index. */
	eidx = end_pfn - PFN_DOWN(bdata->node_boot_start);
	offset = 0;
	/* Compensate for a node start that is not itself aligned to 'align'. */
	if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
		offset = align - (bdata->node_boot_start & (align - 1UL));
	offset = PFN_DOWN(offset);

	/*
	 * We try to allocate bootmem pages above 'goal'
	 * first, then we try to allocate lower pages.
	 *
	 * NOTE(review): 'limit' is a page frame number at this point while
	 * last_success is stored as PFN_PHYS(start); the comparison below
	 * appears to mix units - confirm intent.
	 */
	if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) {
		preferred = goal - bdata->node_boot_start;

		if (bdata->last_success >= preferred)
			if (!limit || (limit && limit > bdata->last_success))
				preferred = bdata->last_success;
	} else
		preferred = 0;

	/* Convert the preferred byte offset into a bitmap index. */
	preferred = PFN_DOWN(ALIGN(preferred, align)) + offset;
	/* Number of whole pages needed to hold 'size' bytes. */
	areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
	/* Scan step in pages; the GNU "?:" form yields 1 when align is below one page. */
	incr = align >> PAGE_SHIFT ? : 1;

restart_scan:
	for (i = preferred; i < eidx; i += incr) {
		unsigned long j;
		/* Jump to the next free page, then re-align the candidate index. */
		i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
		i = ALIGN(i, incr);
		if (i >= eidx)
			break;
		if (test_bit(i, bdata->node_bootmem_map))
			continue;
		/* Check that the following areasize - 1 pages are free as well. */
		for (j = i + 1; j < i + areasize; ++j) {
			if (j >= eidx)
				goto fail_block;
			if (test_bit(j, bdata->node_bootmem_map))
				goto fail_block;
		}
		start = i;
		goto found;
	fail_block:
		/* Resume at the page that broke the run (the loop then adds incr). */
		i = ALIGN(j, incr);
	}

	/* Nothing at or above the preferred index: retry once from the bottom. */
	if (preferred > offset) {
		preferred = offset;
		goto restart_scan;
	}
	return NULL;

found:
	bdata->last_success = PFN_PHYS(start);
	BUG_ON(start >= eidx);

	/*
	 * Is the next page of the previous allocation-end the start
	 * of this allocation's buffer? If yes then we can 'merge'
	 * the previous partial page with this allocation.
	 */
	if (align < PAGE_SIZE && bdata->last_offset && bdata->last_pos+1 == start) {
		offset = ALIGN(bdata->last_offset, align);
		BUG_ON(offset > PAGE_SIZE);
		remaining_size = PAGE_SIZE - offset;
		if (size < remaining_size) {
			/* Fits entirely in the partial page: no new page is reserved. */
			areasize = 0;
			/* last_pos unchanged */
			bdata->last_offset = offset + size;
			ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + offset + bdata->node_boot_start);
		} else {
			/* Use up the partial page and continue into fresh pages. */
			remaining_size = size - remaining_size;
			areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
			ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + offset + bdata->node_boot_start);
			bdata->last_pos = start + areasize - 1;
			bdata->last_offset = remaining_size;
		}
		/* Presumably reduces last_offset to an in-page offset - assumes PAGE_MASK covers the page-number bits; TODO confirm. */
		bdata->last_offset &= ~PAGE_MASK;
	} else {
		bdata->last_pos = start + areasize - 1;
		bdata->last_offset = size & ~PAGE_MASK;
		ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
	}

	/*
	 * Reserve the area now; a page that is already marked reserved
	 * at this point triggers BUG().
	 */
	for (i = start; i < start + areasize; i++)
		if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map)))
			BUG();
	/* Callers always receive zeroed memory. */
	memset(ret, 0, size);
	return ret;
}
这个函数的中心思想是从指定的位置开始扫描位图,如果找到了满足分配要求的,就马上分配。执行以下一些操作:
page = virt_to_page(bdata->node_bootmem_map); count = 0; idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; for (i = 0; i < idx; i++, page++) { __free_pages_bootmem(page, 0); count++; } total += count; bdata->node_bootmem_map = NULL;