linux内存管理bootmem

bootmem是内核启动时使用的物理内存分配器,根据e820map中的可用内存来初始化bootmem可用内存;
bootmem启动之前分配的内存放入early_res预留内存区间中,初始化bootmem时将已经分配出去的内存在bootmem系统中标识为已分配

bootmem管理的是页帧,而e820map和early_res管理的是区间
e820map注册进bootmem时会对区间做页对齐操作;区间起始地址roundup,区间终止地址rounddown(对齐后不足一页的区间被跳过),见e820_register_active_regions->e820_find_active_region
early_res注入bootmem时会对区间做页对齐操作;区间起始地址rounddown,区间终止地址roundup,见early_res_to_bootmem->reserve_bootmem_generic->reserve_bootmem

mm/bootmem.c:

  1 /*
  2  *  bootmem - A boot-time physical memory allocator and configurator
  3  *
  4  *  Copyright (C) 1999 Ingo Molnar
  5  *                1999 Kanoj Sarcar, SGI
  6  *                2008 Johannes Weiner
  7  *
  8  * Access to this subsystem has to be serialized externally (which is true
  9  * for the boot process anyway).
 10  */


I、bootmem数据结构
include/linux/bootmem.h:

 26 /*
 27  * node_bootmem_map is a map pointer - the bits represent all physical 
 28  * memory pages (including holes) on the node.
 29  */
 30 typedef struct bootmem_data {
 31         unsigned long node_min_pfn;
 32         unsigned long node_low_pfn;
 33         void *node_bootmem_map;
 34         unsigned long last_end_off;
 35         unsigned long hint_idx;
 36         struct list_head list;
 37 } bootmem_data_t;



mm/bootmem.c:

 35 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
 36 
 37 static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);

bootmem_node_data中的各个元素通过list成员组成链表,链表头为bdata_list;每个NUMA node对应一个链表节点。如果x86使用UMA内存模型,则只有一个节点。
node_min_pfn:起始页
node_low_pfn:终止页帧号(不包含该页,即节点最后一页之后的第一个页帧号)
node_bootmem_map:页帧位图
last_end_off:上次分配的内存结束位置,相对于节点起始地址的字节偏移;用于判断下次分配能否接着上次未用完的最后一页继续分配
hint_idx:分配内存起始位置索引,分配内存时从第hint_idx开始查找空闲位图块;如果没找到才从头开始查找
list:组成链表

bootmem使用位图表示页帧的使用情况:某位为1表示对应页帧已保留(不可用),为0表示对应页帧可用
bdata_list是按node_min_pfn大小递增的有序链表

II、bootmem启动
1.节点bootmem初始化
bootmem初始化时将位图所有位置1,表示所有的页都已经保留;之后由setup_arch显式地清零相应的位来登记可用内存,如setup_arch->initmem_init->setup_bootmem_allocator->setup_node_bootmem->free_bootmem_with_active_regions将可用内存页帧对应的位置0

 90 /*
 91  * Called once to set up the allocator itself.
 92  */
 93 static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
 94         unsigned long mapstart, unsigned long start, unsigned long end)
 95 {
 96         unsigned long mapsize;
 97 
 98         mminit_validate_memmodel_limits(&start, &end);
 99         bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
100         bdata->node_min_pfn = start;
101         bdata->node_low_pfn = end;
102         link_bootmem(bdata);
103 
104         /*
105          * Initially all pages are reserved - setup_arch() has to
106          * register free RAM areas explicitly.
107          */
108         mapsize = bootmap_bytes(end - start);
109         memset(bdata->node_bootmem_map, 0xff, mapsize);
110 
111         bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
112                 bdata - bootmem_node_data, start, mapstart, end, mapsize);
113 
114         return mapsize;
115 }
116 
117 /**
118  * init_bootmem_node - register a node as boot memory
119  * @pgdat: node to register
120  * @freepfn: pfn where the bitmap for this node is to be placed
121  * @startpfn: first pfn on the node
122  * @endpfn: first pfn after the node
123  *
124  * Returns the number of bytes needed to hold the bitmap for this node.
125  */
126 unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
127                                 unsigned long startpfn, unsigned long endpfn)
128 {			
129         return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
130 }



2.e820map可用内存注入到bootmem中
当节点bootmem初始化完成后,所有页帧标识为已用;将e820map中可用内存注入到bootmem中;e820map先放入active_regions,再从active_regions注入到bootmem中
a.e820map->active_regions
setup_arch->initmem_init->e820_register_active_regions

1154 /*
1155  * Finds an active region in the address range from start_pfn to last_pfn and
1156  * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
1157  */
1158 int __init e820_find_active_region(const struct e820entry *ei,
1159                                   unsigned long start_pfn,
1160                                   unsigned long last_pfn,
1161                                   unsigned long *ei_startpfn,
1162                                   unsigned long *ei_endpfn)
1163 {
1164         u64 align = PAGE_SIZE;
1165 
1166         *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
1167         *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
1168 
1169         /* Skip map entries smaller than a page */
1170         if (*ei_startpfn >= *ei_endpfn)
1171                 return 0;
1172 
1173         /* Skip if map is outside the node */
1174         if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
1175                                     *ei_startpfn >= last_pfn)
1176                 return 0;
1177 
1178         /* Check for overlaps */
1179         if (*ei_startpfn < start_pfn)
1180                 *ei_startpfn = start_pfn;
1181         if (*ei_endpfn > last_pfn)
1182                 *ei_endpfn = last_pfn;
1183 
1184         return 1;
1185 }
1186 
1187 /* Walk the e820 map and register active regions within a node */
1188 void __init e820_register_active_regions(int nid, unsigned long start_pfn,
1189                                          unsigned long last_pfn)
1190 {
1191         unsigned long ei_startpfn;
1192         unsigned long ei_endpfn;
1193         int i;
1194 
1195         for (i = 0; i < e820.nr_map; i++)
1196                 if (e820_find_active_region(&e820.map[i],
1197                                             start_pfn, last_pfn,
1198                                             &ei_startpfn, &ei_endpfn))
1199                         add_active_range(nid, ei_startpfn, ei_endpfn);
1200 }



b.active_regions->bootmem
setup_arch->initmem_init->setup_bootmem_allocator->setup_node_bootmem->free_bootmem_with_active_regions

3410 /**
3411  * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
3412  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
3413  * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
3414  *
3415  * If an architecture guarantees that all ranges registered with
3416  * add_active_ranges() contain no holes and may be freed, this
3417  * this function may be used instead of calling free_bootmem() manually.
3418  */
3419 void __init free_bootmem_with_active_regions(int nid,
3420                                                 unsigned long max_low_pfn)
3421 {
3422         int i;
3423 
3424         for_each_active_range_index_in_nid(i, nid) {
3425                 unsigned long size_pages = 0;
3426                 unsigned long end_pfn = early_node_map[i].end_pfn;
3427 
3428                 if (early_node_map[i].start_pfn >= max_low_pfn)
3429                         continue;
3430 
3431                 if (end_pfn > max_low_pfn)
3432                         end_pfn = max_low_pfn;
3433 
3434                 size_pages = end_pfn - early_node_map[i].start_pfn;
3435                 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
3436                                 PFN_PHYS(early_node_map[i].start_pfn),
3437                                 size_pages << PAGE_SHIFT);
3438         }
3439 }

 

3.early_res预留内存注入到bootmem中

将bootmem启动之前分配的内存放入early_res中,bootmem初始化时将预留的内存在bootmem中标识为1,表示已经分配。

setup_arch->initmem_init->setup_bootmem_allocator->setup_node_bootmem->early_res_to_bootmem

arch/x86/kernel/e820.c:

 917 void __init early_res_to_bootmem(u64 start, u64 end)
 918 {
 919         int i, count;
 920         u64 final_start, final_end;
 921 
 922         count  = 0;
 923         for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
 924                 count++;
 925 
 926         printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
 927                          count, start, end);
 928         for (i = 0; i < count; i++) {
 929                 struct early_res *r = &early_res[i];
 930                 printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
 931                         r->start, r->end, r->name);
 932                 final_start = max(start, r->start);
 933                 final_end = min(end, r->end);
 934                 if (final_start >= final_end) {
 935                         printk(KERN_CONT "\n");
 936                         continue;
 937                 }
 938                 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
 939                         final_start, final_end);
 940                 reserve_bootmem_generic(final_start, final_end - final_start,
 941                                 BOOTMEM_DEFAULT);
 942         }
 943 }



III、bootmem内存分配
1、alloc_bootmem_core是bootmem分配内存的核心函数;alloc_bootmem、alloc_bootmem_pages、alloc_bootmem_node等都是对它的封装

434 static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
435                                         unsigned long size, unsigned long align,
436                                         unsigned long goal, unsigned long limit)
437 {
438         unsigned long fallback = 0;
439         unsigned long min, max, start, sidx, midx, step;
440 
441         bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
442                 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
443                 align, goal, limit);
444 
445         BUG_ON(!size);
446         BUG_ON(align & (align - 1));
447         BUG_ON(limit && goal + size > limit);
448 
449         if (!bdata->node_bootmem_map)
450                 return NULL;
451 
452         min = bdata->node_min_pfn;
453         max = bdata->node_low_pfn;
454 
455         goal >>= PAGE_SHIFT;
456         limit >>= PAGE_SHIFT;
457 
458         if (limit && max > limit)
459                 max = limit;
460         if (max <= min)
461                 return NULL;
462 
463         step = max(align >> PAGE_SHIFT, 1UL);
464 
465         if (goal && min < goal && goal < max)
466                 start = ALIGN(goal, step);
467         else
468                 start = ALIGN(min, step);
469 
470         sidx = start - bdata->node_min_pfn;
471         midx = max - bdata->node_min_pfn;
472 
473         if (bdata->hint_idx > sidx) {
474                 /*
475                  * Handle the valid case of sidx being zero and still
476                  * catch the fallback below.
477                  */
478                 fallback = sidx + 1;
479                 sidx = align_idx(bdata, bdata->hint_idx, step);
480         }
481 
482         while (1) {
483                 int merge;
484                 void *region;
485                 unsigned long eidx, i, start_off, end_off;
486 find_block:
487                 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
488                 sidx = align_idx(bdata, sidx, step);
489                 eidx = sidx + PFN_UP(size);
490 
491                 if (sidx >= midx || eidx > midx)
492                         break;
493 
494                 for (i = sidx; i < eidx; i++)
495                         if (test_bit(i, bdata->node_bootmem_map)) {
496                                 sidx = align_idx(bdata, i, step);
497                                 if (sidx == i)
498                                         sidx += step;
499                                 goto find_block;
500                         }
501 
502                 if (bdata->last_end_off & (PAGE_SIZE - 1) &&
503                                 PFN_DOWN(bdata->last_end_off) + 1 == sidx)
504                         start_off = align_off(bdata, bdata->last_end_off, align);
505                 else
506                         start_off = PFN_PHYS(sidx);
507 
508                 merge = PFN_DOWN(start_off) < sidx;
509                 end_off = start_off + size;
510 
511                 bdata->last_end_off = end_off;
512                 bdata->hint_idx = PFN_UP(end_off);
513 
514                 /*
515                  * Reserve the area now:
516                  */
517                 if (__reserve(bdata, PFN_DOWN(start_off) + merge,
518                                 PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
519                         BUG();
520 
521                 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
522                                 start_off);
523                 memset(region, 0, size);
524                 /*
525                  * The min_count is set to 0 so that bootmem allocated blocks
526                  * are never reported as leaks.
527                  */
528                 kmemleak_alloc(region, size, 0, 0);
529                 return region;
530         }
531 
532         if (fallback) {
533                 sidx = align_idx(bdata, fallback - 1, step);
534                 fallback = 0;
535                 goto find_block;
536         }
537 
538         return NULL;
539 }

a、先根据goal和align计算起始索引sidx(goal落在节点范围内时从goal开始,否则从node_min_pfn开始);如果hint_idx大于sidx,则首先从hint_idx开始查找空闲内存块,没有找到时再退回到sidx重新查找一次
b、查找PFN_UP(size)个连续空闲的页帧;如果其中第i页已被占用,则把起始位置移到i按step对齐后的位置(若i本身已对齐,则再后移step页)继续查找。
c、如果last_end_off未页对齐,且它所在的页恰好是查找到的空闲内存块的前一页,则返回地址从last_end_off(按align对齐后)开始,以便接着使用上次分配未用完的那一页;否则返回地址为空闲块起始页地址
d、保留页帧
e、申请内存块清空为全0

 

IV、bootmem内存回收

347 /**
348  * free_bootmem - mark a page range as usable
349  * @addr: starting address of the range
350  * @size: size of the range in bytes
351  *
352  * Partial pages will be considered reserved and left as they are.
353  *
354  * The range must be contiguous but may span node boundaries.
355  */
356 void __init free_bootmem(unsigned long addr, unsigned long size)
357 {
358         unsigned long start, end;
359 
360         kmemleak_free_part(__va(addr), size);
361 
362         start = PFN_UP(addr);
363         end = PFN_DOWN(addr + size);
364 
365         mark_bootmem(start, end, 0, 0);
366 }

将起始地址向上页对齐、终止地址向下页对齐后,把这些完整的页帧在位图中标为可用,即释放这些页帧;两端不足一页的部分仍保持保留状态

 

V、bootmem销毁
在伙伴系统启动后,将bootmem中的空闲内存释放到buddy系统中;在buddy系统启用后不再使用bootmem

146 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
147 {
148         int aligned;
149         struct page *page;
150         unsigned long start, end, pages, count = 0;
151 
152         if (!bdata->node_bootmem_map)
153                 return 0;
154 
155         start = bdata->node_min_pfn;
156         end = bdata->node_low_pfn;
157 
158         /*
159          * If the start is aligned to the machines wordsize, we might
160          * be able to free pages in bulks of that order.
161          */
162         aligned = !(start & (BITS_PER_LONG - 1));
163 
164         bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
165                 bdata - bootmem_node_data, start, end, aligned);
166 
167         while (start < end) {
168                 unsigned long *map, idx, vec;
169 
170                 map = bdata->node_bootmem_map;
171                 idx = start - bdata->node_min_pfn;
172                 vec = ~map[idx / BITS_PER_LONG];
173 
174                 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
175                         int order = ilog2(BITS_PER_LONG);
176 
177                         __free_pages_bootmem(pfn_to_page(start), order);
178                         count += BITS_PER_LONG;
179                 } else {
180                         unsigned long off = 0;
181 
182                         while (vec && off < BITS_PER_LONG) {
183                                 if (vec & 1) {
184                                         page = pfn_to_page(start + off);
185                                         __free_pages_bootmem(page, 0);
186                                         count++;
187                                 }
188                                 vec >>= 1;
189                                 off++;
190                         }
191                 }
192                 start += BITS_PER_LONG;
193         }
194 
195         page = virt_to_page(bdata->node_bootmem_map);
196         pages = bdata->node_low_pfn - bdata->node_min_pfn;
197         pages = bootmem_bootmap_pages(pages);
198         count += pages;
199         while (pages--)
200                 __free_pages_bootmem(page++, 0);
201 
202         bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
203 
204         return count;
205 }

219 /**
220  * free_all_bootmem - release free pages to the buddy allocator
221  *
222  * Returns the number of pages actually released.
223  */
224 unsigned long __init free_all_bootmem(void)
225 {
226         return free_all_bootmem_core(NODE_DATA(0)->bdata);
227 }

1.将bootmem中的空闲内存释放到buddy系统中;如果bootmem起始页帧号按BITS_PER_LONG对齐,并且当前位图字对应的BITS_PER_LONG个页全部空闲且不是最后一组,则以order=ilog2(BITS_PER_LONG)批量注入到buddy系统,否则按位图逐页注入到buddy系统

2.将bootmem用于标识内存使用情况的位图内存释放到buddy系统中

你可能感兴趣的:(linux内存管理bootmem)