bootmem是内核启动时使用的物理内存分配器,根据e820map中的可用内存来初始化bootmem可用内存;
bootmem启动之前分配的内存放入early_res预留内存区间中,初始化bootmem时将已经分配出去的内存在bootmem系统中标识为已分配
bootmem管理的是页帧,而e820map和early_res管理的是区间
e820map注册进bootmem时会对区间做页对齐操作;区间起始地址roundup,区间终止地址rounddown,见e820_register_active_regions->e820_find_active_region
early_res注入bootmem时会对区间做页对齐操作;区间起始地址rounddown,区间终止地址roundup,见early_res_to_bootmem->reserve_bootmem_generic->reserve_bootmem
mm/bootmem.c:
1 /* 2 * bootmem - A boot-time physical memory allocator and configurator 3 * 4 * Copyright (C) 1999 Ingo Molnar 5 * 1999 Kanoj Sarcar, SGI 6 * 2008 Johannes Weiner 7 * 8 * Access to this subsystem has to be serialized externally (which is true 9 * for the boot process anyway). 10 */
I、bootmem数据结构
include/linux/bootmem.h:
26 /* 27 * node_bootmem_map is a map pointer - the bits represent all physical 28 * memory pages (including holes) on the node. 29 */ 30 typedef struct bootmem_data { 31 unsigned long node_min_pfn; 32 unsigned long node_low_pfn; 33 void *node_bootmem_map; 34 unsigned long last_end_off; 35 unsigned long hint_idx; 36 struct list_head list; 37 } bootmem_data_t;
mm/bootmem.c:
35 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 36 37 static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
bootmem_node_data通过list组成链表,链表头为bdata_list;每个NUMA node对应一个链表节点。x86使用UMA内存模型时,只有一个节点。
node_min_pfn:起始页
node_low_pfn:终止页(不包含该页,即节点最后一页之后的第一个页帧号)
node_bootmem_map:页帧位图
last_end_off:上次分配的内存块末尾相对于节点起始地址的字节偏移;若该值不是页对齐的,说明最后一页尚有剩余空间,下次分配可以紧接其后
hint_idx:分配内存起始位置索引,分配内存时从第hint_idx开始查找空闲位图块;如果没找到才从头开始查找
list:组成链表
bootmem使用位图表示页帧的使用情况,bit-1表示保留(不可用)内存,bit-0表示可用内存
bdata_list是按node_min_pfn大小递增的有序链表
II、bootmem启动
1.节点bootmem初始化
bootmem初始化时将位图所有位置1,表示所有的页都已经保留;由setup_arch显式地清空来表示可用内存,如setup_arch->initmem_init->setup_bootmem_allocator->setup_node_bootmem->free_bootmem_with_active_regions将可用内存页帧位置0
90 /* 91 * Called once to set up the allocator itself. 92 */ 93 static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, 94 unsigned long mapstart, unsigned long start, unsigned long end) 95 { 96 unsigned long mapsize; 97 98 mminit_validate_memmodel_limits(&start, &end); 99 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); 100 bdata->node_min_pfn = start; 101 bdata->node_low_pfn = end; 102 link_bootmem(bdata); 103 104 /* 105 * Initially all pages are reserved - setup_arch() has to 106 * register free RAM areas explicitly. 107 */ 108 mapsize = bootmap_bytes(end - start); 109 memset(bdata->node_bootmem_map, 0xff, mapsize); 110 111 bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", 112 bdata - bootmem_node_data, start, mapstart, end, mapsize); 113 114 return mapsize; 115 } 116 117 /** 118 * init_bootmem_node - register a node as boot memory 119 * @pgdat: node to register 120 * @freepfn: pfn where the bitmap for this node is to be placed 121 * @startpfn: first pfn on the node 122 * @endpfn: first pfn after the node 123 * 124 * Returns the number of bytes needed to hold the bitmap for this node. 125 */ 126 unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, 127 unsigned long startpfn, unsigned long endpfn) 128 { 129 return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); 130 }
2.e820map可用内存注入到bootmem中
当节点bootmem初始化完成后,所有页帧标识为已用;将e820map中可用内存注入到bootmem中;e820map先放入active_regions,再从active_regions注入到bootmem中
a.e820map->active_regions
setup_arch->initmem_init->e820_register_active_regions
1154 /* 1155 * Finds an active region in the address range from start_pfn to last_pfn and 1156 * returns its range in ei_startpfn and ei_endpfn for the e820 entry. 1157 */ 1158 int __init e820_find_active_region(const struct e820entry *ei, 1159 unsigned long start_pfn, 1160 unsigned long last_pfn, 1161 unsigned long *ei_startpfn, 1162 unsigned long *ei_endpfn) 1163 { 1164 u64 align = PAGE_SIZE; 1165 1166 *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT; 1167 *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT; 1168 1169 /* Skip map entries smaller than a page */ 1170 if (*ei_startpfn >= *ei_endpfn) 1171 return 0; 1172 1173 /* Skip if map is outside the node */ 1174 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || 1175 *ei_startpfn >= last_pfn) 1176 return 0; 1177 1178 /* Check for overlaps */ 1179 if (*ei_startpfn < start_pfn) 1180 *ei_startpfn = start_pfn; 1181 if (*ei_endpfn > last_pfn) 1182 *ei_endpfn = last_pfn; 1183 1184 return 1; 1185 } 1186 1187 /* Walk the e820 map and register active regions within a node */ 1188 void __init e820_register_active_regions(int nid, unsigned long start_pfn, 1189 unsigned long last_pfn) 1190 { 1191 unsigned long ei_startpfn; 1192 unsigned long ei_endpfn; 1193 int i; 1194 1195 for (i = 0; i < e820.nr_map; i++) 1196 if (e820_find_active_region(&e820.map[i], 1197 start_pfn, last_pfn, 1198 &ei_startpfn, &ei_endpfn)) 1199 add_active_range(nid, ei_startpfn, ei_endpfn);
b.active_regions->bootmem
setup_arch->initmem_init->setup_bootmem_allocator->setup_node_bootmem->free_bootmem_with_active_regions
3410 /** 3411 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 3412 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 3413 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 3414 * 3415 * If an architecture guarantees that all ranges registered with 3416 * add_active_ranges() contain no holes and may be freed, this 3417 * this function may be used instead of calling free_bootmem() manually. 3418 */ 3419 void __init free_bootmem_with_active_regions(int nid, 3420 unsigned long max_low_pfn) 3421 { 3422 int i; 3423 3424 for_each_active_range_index_in_nid(i, nid) { 3425 unsigned long size_pages = 0; 3426 unsigned long end_pfn = early_node_map[i].end_pfn; 3427 3428 if (early_node_map[i].start_pfn >= max_low_pfn) 3429 continue; 3430 3431 if (end_pfn > max_low_pfn) 3432 end_pfn = max_low_pfn; 3433 3434 size_pages = end_pfn - early_node_map[i].start_pfn; 3435 free_bootmem_node(NODE_DATA(early_node_map[i].nid), 3436 PFN_PHYS(early_node_map[i].start_pfn), 3437 size_pages << PAGE_SHIFT); 3438 } 3439 }}
3.early_res预留内存注入到bootmem中
将bootmem启动之前分配的内存放入early_res中,bootmem初始化时将预留的内存在bootmem中标识为1,表示已经分配。
setup_arch->initmem_init->setup_bootmem_allocator->setup_node_bootmem->early_res_to_bootmem
arch/x86/kernel/e820.c:
917 void __init early_res_to_bootmem(u64 start, u64 end) 918 { 919 int i, count; 920 u64 final_start, final_end; 921 922 count = 0; 923 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) 924 count++; 925 926 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", 927 count, start, end); 928 for (i = 0; i < count; i++) { 929 struct early_res *r = &early_res[i]; 930 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, 931 r->start, r->end, r->name); 932 final_start = max(start, r->start); 933 final_end = min(end, r->end); 934 if (final_start >= final_end) { 935 printk(KERN_CONT "\n"); 936 continue; 937 } 938 printk(KERN_CONT " ==> [%010llx - %010llx]\n", 939 final_start, final_end); 940 reserve_bootmem_generic(final_start, final_end - final_start, 941 BOOTMEM_DEFAULT); 942 } 943 }
III、bootmem内存分配
1、alloc_bootmem_core是bootmem分配内存的核心函数;alloc_bootmem、alloc_bootmem_pages、alloc_bootmem_node等都是对它的封装
434 static void * __init alloc_bootmem_core(struct bootmem_data *bdata, 435 unsigned long size, unsigned long align, 436 unsigned long goal, unsigned long limit) 437 { 438 unsigned long fallback = 0; 439 unsigned long min, max, start, sidx, midx, step; 440 441 bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", 442 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, 443 align, goal, limit); 444 445 BUG_ON(!size); 446 BUG_ON(align & (align - 1)); 447 BUG_ON(limit && goal + size > limit); 448 449 if (!bdata->node_bootmem_map) 450 return NULL; 451 452 min = bdata->node_min_pfn; 453 max = bdata->node_low_pfn; 454 455 goal >>= PAGE_SHIFT; 456 limit >>= PAGE_SHIFT; 457 458 if (limit && max > limit) 459 max = limit; 460 if (max <= min) 461 return NULL; 462 463 step = max(align >> PAGE_SHIFT, 1UL); 464 465 if (goal && min < goal && goal < max) 466 start = ALIGN(goal, step); 467 else 468 start = ALIGN(min, step); 469 470 sidx = start - bdata->node_min_pfn; 471 midx = max - bdata->node_min_pfn; 472 473 if (bdata->hint_idx > sidx) { 474 /* 475 * Handle the valid case of sidx being zero and still 476 * catch the fallback below. 
477 */ 478 fallback = sidx + 1; 479 sidx = align_idx(bdata, bdata->hint_idx, step); 480 } 481 482 while (1) { 483 int merge; 484 void *region; 485 unsigned long eidx, i, start_off, end_off; 486 find_block: 487 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); 488 sidx = align_idx(bdata, sidx, step); 489 eidx = sidx + PFN_UP(size); 490 491 if (sidx >= midx || eidx > midx) 492 break; 493 494 for (i = sidx; i < eidx; i++) 495 if (test_bit(i, bdata->node_bootmem_map)) { 496 sidx = align_idx(bdata, i, step); 497 if (sidx == i) 498 sidx += step; 499 goto find_block; 500 } 501 502 if (bdata->last_end_off & (PAGE_SIZE - 1) && 503 PFN_DOWN(bdata->last_end_off) + 1 == sidx) 504 start_off = align_off(bdata, bdata->last_end_off, align); 505 else 506 start_off = PFN_PHYS(sidx); 507 508 merge = PFN_DOWN(start_off) < sidx; 509 end_off = start_off + size; 510 511 bdata->last_end_off = end_off; 512 bdata->hint_idx = PFN_UP(end_off); 513 514 /* 515 * Reserve the area now: 516 */ 517 if (__reserve(bdata, PFN_DOWN(start_off) + merge, 518 PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) 519 BUG(); 520 521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + 522 start_off); 523 memset(region, 0, size); 524 /* 525 * The min_count is set to 0 so that bootmem allocated blocks 526 * are never reported as leaks. 527 */ 528 kmemleak_alloc(region, size, 0, 0); 529 return region; 530 } 531 532 if (fallback) { 533 sidx = align_idx(bdata, fallback - 1, step); 534 fallback = 0; 535 goto find_block; 536 } 537 538 return NULL; 539 }
a、首先从hint_idx开始查找空闲内存块,如果没有找到从头开始(goal与node_min_pfn的较大值)
b、查找PFN_UP(size)个连续空闲的页帧;如果其中第i页已被占用,则从第i页按step对齐后的位置(若对齐后仍为i,则再后移step)重新开始查找。
c、如果last_end_off不是页对齐的,且恰好落在所找到空闲块的前一页内,则返回地址从last_end_off(按align对齐后)开始,与上次分配共用该页;否则返回地址为空闲块起始页地址
d、保留页帧
e、申请内存块清空为全0
IV、bootmem内存回收
347 /** 348 * free_bootmem - mark a page range as usable 349 * @addr: starting address of the range 350 * @size: size of the range in bytes 351 * 352 * Partial pages will be considered reserved and left as they are. 353 * 354 * The range must be contiguous but may span node boundaries. 355 */ 356 void __init free_bootmem(unsigned long addr, unsigned long size) 357 { 358 unsigned long start, end; 359 360 kmemleak_free_part(__va(addr), size); 361 362 start = PFN_UP(addr); 363 end = PFN_DOWN(addr + size); 364 365 mark_bootmem(start, end, 0, 0); 366 }
将起始地址向上页对齐,终止地址向下页对齐后,将这些页帧标为可用,将页帧释放
V、bootmem销毁
在伙伴系统启动后,将bootmem中的空闲内存释放到buddy系统中;在buddy系统启用后不再使用bootmem
146 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 147 { 148 int aligned; 149 struct page *page; 150 unsigned long start, end, pages, count = 0; 151 152 if (!bdata->node_bootmem_map) 153 return 0; 154 155 start = bdata->node_min_pfn; 156 end = bdata->node_low_pfn; 157 158 /* 159 * If the start is aligned to the machines wordsize, we might 160 * be able to free pages in bulks of that order. 161 */ 162 aligned = !(start & (BITS_PER_LONG - 1)); 163 164 bdebug("nid=%td start=%lx end=%lx aligned=%d\n", 165 bdata - bootmem_node_data, start, end, aligned); 166 167 while (start < end) { 168 unsigned long *map, idx, vec; 169 170 map = bdata->node_bootmem_map; 171 idx = start - bdata->node_min_pfn; 172 vec = ~map[idx / BITS_PER_LONG]; 173 174 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { 175 int order = ilog2(BITS_PER_LONG); 176 177 __free_pages_bootmem(pfn_to_page(start), order); 178 count += BITS_PER_LONG; 179 } else { 180 unsigned long off = 0; 181 182 while (vec && off < BITS_PER_LONG) { 183 if (vec & 1) { 184 page = pfn_to_page(start + off); 185 __free_pages_bootmem(page, 0); 186 count++; 187 } 188 vec >>= 1; 189 off++; 190 } 191 } 192 start += BITS_PER_LONG; 193 } 194 195 page = virt_to_page(bdata->node_bootmem_map); 196 pages = bdata->node_low_pfn - bdata->node_min_pfn; 197 pages = bootmem_bootmap_pages(pages); 198 count += pages; 199 while (pages--) 200 __free_pages_bootmem(page++, 0); 201 202 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); 203 204 return count; 205 } 219 /** 220 * free_all_bootmem - release free pages to the buddy allocator 221 * 222 * Returns the number of pages actually released. 223 */ 224 unsigned long __init free_all_bootmem(void) 225 { 226 return free_all_bootmem_core(NODE_DATA(0)->bdata); 227 }
1.将bootmem中的空闲内存释放到buddy系统中;如果bootmem起始页帧号按BITS_PER_LONG对齐,且当前位图字对应的BITS_PER_LONG个页全部空闲(并且start + BITS_PER_LONG < end),则以order=ilog2(BITS_PER_LONG)批量释放到buddy系统,否则一页一页地释放到buddy系统
2.将bootmem用于标识内存使用情况的位图内存释放到buddy系统中