Linux 启动内存分配器(bootmem)是在伙伴系统、slab 机制建立之前,为满足内核启动阶段的内存分配需求而建立的。它本身的机制比较简单:使用位图来标记页面的分配和释放。
一、数据结构介绍
1,保留区间
在建立启动内存分配器的时候,会涉及保留内存。也就是说,之前已经保留给页表、分配器本身(用于映射的位图)、I/O 等的内存,在分配器建立之后、用它来分配内存空间时,就不能再被分配了。Linux 中对保留内存空间用下列数据结构表示:
/* * Early reserved memory areas. */ #define MAX_EARLY_RES 20/*保留空间最大块数*/ struct early_res {/*保留空间结构*/ u64 start, end; char name[16]; char overlap_ok; }; /*保留内存空间全局变量*/ static struct early_res early_res[MAX_EARLY_RES] __initdata = { { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ {} };2,bootmem分配器
/*
 * node_bootmem_map is a map pointer - the bits represent all physical
 * memory pages (including holes) on the node.
 */
/* Per-node state of the bootmem allocator. */
typedef struct bootmem_data {
	/*
	 * First page frame number handled for this node; bitmap index 0
	 * corresponds to this PFN (indices are computed as
	 * pfn - node_min_pfn).
	 */
	unsigned long node_min_pfn;
	/*
	 * End PFN of the range handled for this node (exclusive upper
	 * bound of valid indices).  setup_bootmem_allocator() never passes
	 * more than max_low_pfn here.
	 * NOTE(review): the original note says this is at most 896MB,
	 * presumably the 32-bit x86 low-memory limit - confirm.
	 */
	unsigned long node_low_pfn;
	/*
	 * Virtual address of the bitmap.  A set bit means the page is
	 * reserved/allocated, a clear bit means it is free.
	 */
	void *node_bootmem_map;
	/*
	 * Byte offset, relative to the physical address of node_min_pfn,
	 * of the first byte after the previous allocation.
	 */
	unsigned long last_end_off;
	/*
	 * Bitmap index at which the next allocation starts searching:
	 * set to PFN_UP(end of the previous allocation) and lowered by
	 * __free() when pages below it are released.
	 */
	unsigned long hint_idx;
	/* Links this node into bdata_list. */
	struct list_head list;
} bootmem_data_t;
全局链表
/*
 * Global list of all registered bootmem_data descriptors; starts out
 * empty and is filled by link_bootmem().
 */
static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
二、启动分配器的建立
启动分配器的建立主要流程为:初始化映射位图(所有位先置1)、将活动内存区对应的位图位清0(表示可用)、处理保留内存区域。其中保留区存放在上面介绍的全局数组 early_res 中,这里只是将分配器映射位图中对应的位设置为1,表示已经分配。
下面我们看内核中具体的初始化流程。
start_kernel()->setup_arch()->initmem_init()
void __init setup_arch(char **cmdline_p)
{
.......
/*
 * initmem_init() first does the preparatory work for bootmem and then
 * calls the functions that actually set the bootmem allocator up.
 */
initmem_init(0, max_pfn);
.......
}
/*
 * initmem_init - register the active memory ranges, record the low/high
 * memory limits and finally install the bootmem allocator.
 *
 * Neither start_pfn nor end_pfn is referenced in the body shown here.
 */
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
#ifdef CONFIG_HIGHMEM
	highstart_pfn = highend_pfn = max_pfn;
	if (max_pfn > max_low_pfn)
		highstart_pfn = max_low_pfn;
	/*
	 * Put the active memory into early_node_map (per the original
	 * note; covered in an earlier article of this series).
	 */
	e820_register_active_regions(0, 0, highend_pfn);
	/*
	 * Mark the memory registered above as present.
	 * NOTE(review): the original note says the related config macro
	 * is not set in the configuration being discussed.
	 */
	sparse_memory_present_with_active_regions(0);
	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
		pages_to_mb(highend_pfn - highstart_pfn));
	num_physpages = highend_pfn;
	/* Kernel virtual address that corresponds to the start of high memory. */
	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
	e820_register_active_regions(0, 0, max_low_pfn);
	sparse_memory_present_with_active_regions(0);
	num_physpages = max_low_pfn;
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
#ifdef CONFIG_FLATMEM
	max_mapnr = num_physpages;
#endif
	__vmalloc_start_set = true;
	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
		pages_to_mb(max_low_pfn));
	/*
	 * Install the bootmem allocator, which manages memory allocation
	 * until the buddy system is up.
	 */
	setup_bootmem_allocator();
}

/*
 * setup_bootmem_allocator - find room for the bootmem bitmap, reserve it
 * and initialise the allocator on every online node.
 */
void __init setup_bootmem_allocator(void)
{
	int nodeid;
	unsigned long bootmap_size, bootmap;
	/*
	 * Initialize the boot-time allocator (with low memory only):
	 */
	/*
	 * Size of the bitmap in bytes, in whole pages: the page count
	 * returned by bootmem_bootmap_pages() shifted by PAGE_SHIFT.
	 * The bitmap needs one bit per page frame (not one bit per byte),
	 * i.e. roughly max_low_pfn / 8 bytes.
	 */
	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
	/*
	 * Look in the e820 map for a suitably sized, PAGE_SIZE-aligned
	 * block below max_pfn_mapped and take its base address; -1 means
	 * nothing was found.
	 */
	bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* Reserve the pages that will hold the bitmap. */
	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
	printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
		 max_pfn_mapped<<PAGE_SHIFT);
	printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
	/* For every online node: */
	for_each_online_node(nodeid) {
		unsigned long start_pfn, end_pfn;
#ifdef CONFIG_NEED_MULTIPLE_NODES	/* not set (in the configuration discussed here) */
		start_pfn = node_start_pfn[nodeid];
		end_pfn = node_end_pfn[nodeid];
		if (start_pfn > max_low_pfn)
			continue;
		if (end_pfn > max_low_pfn)
			end_pfn = max_low_pfn;
#else
		start_pfn = 0;
		end_pfn = max_low_pfn;
#endif
		/*
		 * Install the boot allocator on this node; the return value
		 * is where the next node's bitmap may start.
		 */
		bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
						 bootmap);
	}
	/*
	 * The bootmem allocator is fully set up at this point; flag that
	 * by setting after_bootmem to 1.
	 */
	after_bootmem = 1;
}

/*
 * setup_node_bootmem - set up the bootmem allocator for one node.
 * @nodeid: node to initialise
 * @start_pfn: first PFN of the node's low memory
 * @end_pfn: end PFN of the node's low memory
 * @bootmap: physical address at which this node's bitmap is placed
 *
 * Returns the address just past this node's bitmap.
 */
static unsigned long __init setup_node_bootmem(int nodeid,
				 unsigned long start_pfn,
				 unsigned long end_pfn,
				 unsigned long bootmap)
{
	unsigned long bootmap_size;
	/* don't touch min_low_pfn */
	/* Initialise the bitmap; every bit starts out set to 1. */
	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap >> PAGE_SHIFT,
					 start_pfn, end_pfn);
	printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
		nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
	printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
		 nodeid, bootmap, bootmap + bootmap_size);
	/* Clear the bits of the active memory ranges: those pages become allocatable. */
	free_bootmem_with_active_regions(nodeid, end_pfn);
	/*
	 * Set the bits of the early-reserved pages back to 1: they are
	 * already in use or otherwise unavailable and must not be handed out.
	 */
	early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
	/* Return the end address of this bitmap so the next one can start there. */
	return bootmap + bootmap_size;
}
对于初始化映射位图,最终调用init_bootmem_core()
/*
 * Called once to set up the allocator itself.
 *
 * @bdata: per-node descriptor to initialise
 * @mapstart: PFN of the page at which the bitmap is placed
 * @start: first PFN covered by the bitmap
 * @end: end PFN covered by the bitmap
 *
 * Returns the size of the bitmap in bytes.
 */
static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
	unsigned long mapstart, unsigned long start, unsigned long end)
{
	unsigned long mapsize;

	mminit_validate_memmodel_limits(&start, &end);
	bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
	bdata->node_min_pfn = start;
	bdata->node_low_pfn = end;
	/* Add this bdata to the global list. */
	link_bootmem(bdata);

	/*
	 * Initially all pages are reserved - setup_arch() has to
	 * register free RAM areas explicitly.
	 */
	/*
	 * Bitmap size in bytes for (end - start) pages.
	 * NOTE(review): bootmap_bytes() is not shown; presumably about
	 * one eighth of the page count (one bit per page) - confirm.
	 */
	mapsize = bootmap_bytes(end - start);
	/* Set every bit of the map to 1. */
	memset(bdata->node_bootmem_map, 0xff, mapsize);

	bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
		bdata - bootmem_node_data, start, mapstart, end, mapsize);

	return mapsize;
}

/*
 * link bdata in order
 */
/*
 * Insert bdata in front of the first list entry that has a larger
 * node_min_pfn (or at the tail if there is none), so the list stays
 * sorted by ascending start PFN.
 */
static void __init link_bootmem(bootmem_data_t *bdata)
{
	struct list_head *iter;

	/* Walk the global list bdata_list to find the insertion point. */
	list_for_each(iter, &bdata_list) {
		bootmem_data_t *ent;

		ent = list_entry(iter, bootmem_data_t, list);
		if (bdata->node_min_pfn < ent->node_min_pfn)
			break;
	}
	list_add_tail(&bdata->list, iter);
}

/**
 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
 *
 * If an architecture guarantees that all ranges registered with
 * add_active_ranges() contain no holes and may be freed, this
 * function may be used instead of calling free_bootmem() manually.
 */
/* Initialise the bootmem allocator from the active regions, limited to low memory. */
void __init free_bootmem_with_active_regions(int nid,
						unsigned long max_low_pfn)
{
	int i;

	/* For every active memory range on the node: */
	for_each_active_range_index_in_nid(i, nid) {
		unsigned long size_pages = 0;
		unsigned long end_pfn = early_node_map[i].end_pfn;

		/* Skip ranges that lie entirely at or above the limit. */
		if (early_node_map[i].start_pfn >= max_low_pfn)
			continue;

		/* Clamp ranges that cross the limit. */
		if (end_pfn > max_low_pfn)
			end_pfn = max_low_pfn;

		/* Number of pages in the (clamped) active range. */
		size_pages = end_pfn - early_node_map[i].start_pfn;
		/* Free this memory, which simply clears the corresponding bitmap bits. */
		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
				PFN_PHYS(early_node_map[i].start_pfn),
				size_pages << PAGE_SHIFT);
	}
}

/**
 * free_bootmem_node - mark a page range as usable
 * @pgdat: node the range resides on
 * @physaddr: starting address of the range
 * @size: size of the range in bytes
 *
 * Partial pages will be considered reserved and left as they are.
 *
 * The range must reside completely on the specified node.
 */
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
			      unsigned long size)
{
	unsigned long start, end;

	/* kmemleak bookkeeping; per the original note, a config-controlled debugging aid. */
	kmemleak_free_part(__va(physaddr), size);

	start = PFN_UP(physaddr);	/* round the start up to a page boundary */
	end = PFN_DOWN(physaddr + size);	/* round the end down to a page boundary */

	/* Clear the bits of these pages (reserve argument is 0): they are now unallocated. */
	mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
}

/*
 * mark_bootmem_node - reserve or free the PFN range [start, end) on one node.
 * @reserve: non-zero to reserve the range, zero to free it
 * @flags: passed on to __reserve()
 *
 * Returns the result of __reserve() when reserving, 0 when freeing.
 */
static int __init mark_bootmem_node(bootmem_data_t *bdata,
				unsigned long start, unsigned long end,
				int reserve, int flags)
{
	unsigned long sidx, eidx;

	bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
		bdata - bootmem_node_data, start, end, reserve, flags);

	BUG_ON(start < bdata->node_min_pfn);
	BUG_ON(end > bdata->node_low_pfn);
	/* Both values are offsets from the node's lowest page frame, i.e. bitmap indices. */
	sidx = start - bdata->node_min_pfn;
	eidx = end - bdata->node_min_pfn;

	if (reserve)	/* reserving was requested */
		return __reserve(bdata, sidx, eidx, flags);
	else	/* otherwise clear the corresponding map bits */
		__free(bdata, sidx, eidx);
	return 0;
}

/*
 * Reserve operation of the bootmem allocator: set the bits for the
 * indices [sidx, eidx).  With BOOTMEM_EXCLUSIVE in flags, hitting a bit
 * that is already set undoes the bits set so far and returns -EBUSY;
 * otherwise the double reservation is only logged.
 */
static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
			unsigned long eidx, int flags)
{
	unsigned long idx;
	int exclusive = flags & BOOTMEM_EXCLUSIVE;

	bdebug("nid=%td start=%lx end=%lx flags=%x\n",
		bdata - bootmem_node_data,
		sidx + bdata->node_min_pfn,
		eidx + bdata->node_min_pfn,
		flags);
	/* Mark the consecutive pages as reserved. */
	for (idx = sidx; idx < eidx; idx++)
		if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
			if (exclusive) {
				__free(bdata, sidx, idx);
				return -EBUSY;
			}
			bdebug("silent double reserve of PFN %lx\n",
				idx + bdata->node_min_pfn);
		}
	return 0;
}

/*
 * Free operation of the bootmem allocator: clear the bits for the
 * indices [sidx, eidx).  Hitting a bit that is already clear is a BUG().
 */
static void __init __free(bootmem_data_t *bdata,
			unsigned long sidx, unsigned long eidx)
{
	unsigned long idx;

	bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
		sidx + bdata->node_min_pfn,
		eidx + bdata->node_min_pfn);

	if (bdata->hint_idx > sidx)
		bdata->hint_idx = sidx;	/* pull the allocation search hint back to the freed range */

	for (idx = sidx; idx < eidx; idx++)	/* clear the corresponding bits */
		if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
			BUG();
}

/*
 * early_res_to_bootmem - reserve in bootmem every part of the early_res
 * table that overlaps the byte range [start, end).
 */
void __init early_res_to_bootmem(u64 start, u64 end)
{
	int i, count;
	u64 final_start, final_end;

	count = 0;
	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
		count++;	/* count the reserved ranges in use */

	printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
			 count, start, end);
	for (i = 0; i < count; i++) {
		struct early_res *r = &early_res[i];
		printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
			r->start, r->end, r->name);
		/* Intersect the reservation with [start, end). */
		final_start = max(start, r->start);
		final_end = min(end, r->end);
		if (final_start >= final_end) {
			/* No overlap: nothing to reserve for this entry. */
			printk(KERN_CONT "\n");
			continue;
		}
		printk(KERN_CONT " ==> [%010llx - %010llx]\n",
			final_start, final_end);
		/* Mark the overlapping range as reserved. */
		reserve_bootmem_generic(final_start, final_end - final_start,
				BOOTMEM_DEFAULT);
	}
}
上面的保留指定区间reserve_bootmem_generic()函数最终调用如下函数
/**
 * reserve_bootmem - mark a page range as reserved
 * @addr: starting address of the range
 * @size: size of the range in bytes
 * @flags: reservation flags (see linux/bootmem.h)
 *
 * Partial pages will be reserved.
 *
 * The range must be contiguous but may span node boundaries.
 */
int __init reserve_bootmem(unsigned long addr, unsigned long size,
			    int flags)
{
	unsigned long start, end;

	start = PFN_DOWN(addr);	/* round the start down to a page boundary */
	end = PFN_UP(addr + size);	/* round the end up to a page boundary */

	return mark_bootmem(start, end, 1, flags);
}

/*
 * mark_bootmem - reserve or free the PFN range [start, end), which may
 * span several nodes.
 *
 * Returns 0 on success or the error from mark_bootmem_node(); reaching
 * the end of the list without having covered the range is a BUG().
 */
static int __init mark_bootmem(unsigned long start, unsigned long end,
				int reserve, int flags)
{
	unsigned long pos;
	bootmem_data_t *bdata;

	pos = start;
	/* Walk bdata_list to find the node(s) that contain the range. */
	list_for_each_entry(bdata, &bdata_list, list) {
		int err;
		unsigned long max;

		if (pos < bdata->node_min_pfn ||
		    pos >= bdata->node_low_pfn) {
			/* Once marking has begun, the following nodes must be contiguous. */
			BUG_ON(pos != start);
			continue;
		}

		max = min(bdata->node_low_pfn, end);
		/* Reserve (or free) the part of the range that lies on this node. */
		err = mark_bootmem_node(bdata, pos, max, reserve, flags);
		if (reserve && err) {
			/* On failure, call ourselves to free [start, pos) again. */
			mark_bootmem(start, pos, 0, 0);
			return err;
		}

		if (max == end)
			return 0;
		pos = bdata->node_low_pfn;
	}
	BUG();
}
三、内存的分配和释放介绍了上面的初始化流程,对于分配和释放就简单了,分配就是将分配器映射位图中对应的位置1,释放过程相反。
/*
 * alloc_bootmem_core - allocate @size bytes from one node's bootmem bitmap.
 * @bdata: node to allocate from
 * @size: number of bytes wanted; must be non-zero
 * @align: required alignment in bytes; must be zero or a power of two
 * @goal: preferred start address in bytes; used as the search start only
 *        if it falls strictly inside the node's PFN range
 * @limit: upper address bound in bytes, 0 for no limit
 *
 * Returns the virtual address of the zeroed region, or NULL if the node
 * has no bitmap, the usable range is empty, or no free block fits.
 */
static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
					unsigned long size, unsigned long align,
					unsigned long goal, unsigned long limit)
{
	unsigned long fallback = 0;
	unsigned long min, max, start, sidx, midx, step;

	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
		align, goal, limit);

	BUG_ON(!size);
	BUG_ON(align & (align - 1));
	BUG_ON(limit && goal + size > limit);

	/* Without a bitmap nothing can be allocated from this node. */
	if (!bdata->node_bootmem_map)
		return NULL;

	min = bdata->node_min_pfn;
	max = bdata->node_low_pfn;

	/* From here on goal and limit are page frame numbers. */
	goal >>= PAGE_SHIFT;
	limit >>= PAGE_SHIFT;

	if (limit && max > limit)
		max = limit;
	if (max <= min)
		return NULL;

	/* step is the alignment expressed in pages (at least one page). */
	step = max(align >> PAGE_SHIFT, 1UL);

	/* Pick the PFN at which the search starts. */
	if (goal && min < goal && goal < max)
		start = ALIGN(goal, step);
	else
		start = ALIGN(min, step);

	/* Turn the search range into bitmap indices. */
	sidx = start - bdata->node_min_pfn;
	midx = max - bdata->node_min_pfn;

	/*
	 * If the hint left by the previous allocation lies beyond the
	 * requested start, search from the hint first.  Should that pass
	 * fail, the search is retried from the originally requested start.
	 */
	if (bdata->hint_idx > sidx) {
		/*
		 * Handle the valid case of sidx being zero and still
		 * catch the fallback below.
		 */
		fallback = sidx + 1;
		/* Start at the previous allocation's position, aligned to step. */
		sidx = align_idx(bdata, bdata->hint_idx, step);
	}

	while (1) {
		int merge;
		void *region;
		unsigned long eidx, i, start_off, end_off;
find_block:
		/* Find the next clear (free) bit. */
		sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
		sidx = align_idx(bdata, sidx, step);
		eidx = sidx + PFN_UP(size);	/* end index of the candidate block */

		/* The candidate runs past the end of the search range: give up this pass. */
		if (sidx >= midx || eidx > midx)
			break;

		/* Check that every page of the candidate block is free. */
		for (i = sidx; i < eidx; i++)
			if (test_bit(i, bdata->node_bootmem_map)) {
				/* Not free: skip past the busy page and search again. */
				sidx = align_idx(bdata, i, step);
				if (sidx == i)
					sidx += step;
				goto find_block;
			}

		/*
		 * If the previous allocation ended in the middle of a page
		 * and the block found starts on the page right after it,
		 * begin inside that partially used page; otherwise begin at
		 * the start of the block found.
		 */
		if (bdata->last_end_off & (PAGE_SIZE - 1) &&
				PFN_DOWN(bdata->last_end_off) + 1 == sidx)
			start_off = align_off(bdata, bdata->last_end_off, align);
		else
			start_off = PFN_PHYS(sidx);

		/*
		 * merge == 1 means the region begins in the page before sidx,
		 * i.e. it shares the last, already reserved page of the
		 * previous allocation.  That page is skipped when reserving.
		 */
		merge = PFN_DOWN(start_off) < sidx;
		end_off = start_off + size;

		/* Remember where this allocation ends for the next call. */
		bdata->last_end_off = end_off;
		bdata->hint_idx = PFN_UP(end_off);

		/*
		 * Reserve the area now:
		 */
		/* Set the bits of the newly used pages; they must all be free. */
		if (__reserve(bdata, PFN_DOWN(start_off) + merge,
				PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
			BUG();

		/* Virtual address of the start of the region. */
		region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
				start_off);
		/* The caller gets zeroed memory. */
		memset(region, 0, size);
		/*
		 * The min_count is set to 0 so that bootmem allocated blocks
		 * are never reported as leaks.
		 */
		kmemleak_alloc(region, size, 0, 0);
		return region;
	}

	if (fallback) {
		/* Retry once from the originally requested start. */
		sidx = align_idx(bdata, fallback - 1, step);
		fallback = 0;
		goto find_block;
	}

	return NULL;
}