linux内核的内存管理分三个阶段。
1. 启动---->bootmem初始化完成为第一阶段。此阶段只能使用memblock_reserve函数分配内存。
此阶段结束标志为:init_bootmem_done = 1.
2. bootmem初始化完--->buddy完成前。结束标志为mem_init_done = 1.
3. 全部内存初始化完毕,可以用cache和buddy分配内存。
本文先分析第1阶段。
1. 初始化:head_fsl_booke.S->machine_init->early_init_devtree:
2. 内存原始数据由u-boot传入,在初始化完memblock_init后,用memblock_add_region加入原始内存数据,我的板子上配了2G内存,即:0x0000 0000->0x80000000,加完后的配置如下:
MEMBLOCK configuration:
rmo_size = 0x80000000
memory.size = 0x0
memory.cnt = 0x1
memory[0x0] 0x0000000000000000 - 0x000000007fffffff, 0x80000000 bytes
reserved.cnt = 0x1
reserved[0x0] 0x0000000000000000 - 0xffffffffffffffff, 0x0 bytes
3. memblock_reserve用来分配内存页。
以分配内核本身占用的内存为例:
/* Reserve MEMBLOCK regions used by kernel, initrd, dt, etc... */
memblock_reserve(PHYSICAL_START, __pa(klimit) - PHYSICAL_START);
分配完之后的内存配置为:
MEMBLOCK configuration:
rmo_size = 0x80000000
memory.size = 0x0
memory.cnt = 0x1
memory[0x0] 0x0000000000000000 - 0x000000007fffffff, 0x80000000 bytes
reserved.cnt = 0x1
reserved[0x0] 0x0000000000000000 - 0x00000000006b0fff, 0x6b1000 bytes
如果新分配的区域与已有的reserved区域相邻(地址连续),则会合并为一个区域。
几次分配后的配置如下:
MEMBLOCK configuration:
rmo_size = 0x80000000
memory.size = 0x80000000
memory.cnt = 0x1
memory[0x0] 0x0000000000000000 - 0x000000007fffffff, 0x80000000 bytes
reserved.cnt = 0x6
reserved[0x0] 0x0000000000000000 - 0x00000000006b0fff, 0x6b1000 bytes
reserved[0x1] 0x0000000000ffa000 - 0x0000000000ffcfff, 0x3000 bytes
reserved[0x2] 0x000000002fbc4000 - 0x000000002fbdefff, 0x1b000 bytes
reserved[0x3] 0x000000002fbdfa88 - 0x000000002ffff4cc, 0x41fa45 bytes
reserved[0x4] 0x000000002fbe4000 - 0x000000002ffff4cd, 0x41b4ce bytes
reserved[0x5] 0x000000007ffff000 - 0x000000007fffffff, 0x1000 bytes
2. bootmem初始化完--->buddy完成前。结束标志为mem_init_done = 1
start_kernel->setup_arch->do_init_bootmem:
179 #ifndef CONFIG_NEED_MULTIPLE_NODES
180 void __init do_init_bootmem(void)
181 {
182 unsigned long i;
183 unsigned long start, bootmap_pages;
184 unsigned long total_pages;
185 int boot_mapsize;
186
187 max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
188 total_pages = (memblock_end_of_DRAM() - memstart_addr) >> PAGE_SHIFT;
189 #ifdef CONFIG_HIGHMEM
190 total_pages = total_lowmem >> PAGE_SHIFT;
191 max_low_pfn = lowmem_end_addr >> PAGE_SHIFT;
192 #endif
此处的计算都是从第一阶段的memblock而来,到192行之后各变量值为:
max_low_pfn: 0x30000
max_pfn: 0x80000
total_pages: 0x30000
因为板子上含有highmem,所以total_pages和max_low_pfn都进行了调整,这些值只反映低端内存的情况。
194 /*
195 * Find an area to use for the bootmem bitmap. Calculate the size of
196 * bitmap required as (Total Memory) / PAGE_SIZE / BITS_PER_BYTE.
197 * Add 1 additional page in case the address isn't page-aligned.
198 */
199 bootmap_pages = bootmem_bootmap_pages(total_pages);
200
201 start = memblock_alloc(bootmap_pages << PAGE_SHIFT, PAGE_SIZE);
202
203 min_low_pfn = MEMORY_START >> PAGE_SHIFT;
204 boot_mapsize = init_bootmem_node(NODE_DATA(0), start >> PAGE_SHIFT, min_low_pfn, max_low_pfn);
199:计算管理内存页的位图(bitmap)所需要的页数,其中每页用1Bit表示,所以当有total_pages页时,位图大小(字节)为:
(Total Memory) / PAGE_SIZE / BITS_PER_BYTE。其中BITS_PER_BYTE=8
201:分配该BITMAP区域所占用的空间,用的还是第一阶段的内存分配方法,返回的是内存物理地址。
204:初始化node节点,初始化bitmap区为0xff,即占用状态,返回bitmap区大小。
通过计算可知道,bitmap空间占6页,即:0x30000页 / 8bit = 0x6000字节,0x6000字节 / 4K = 6页。
206 /* Add active regions with valid PFNs */
207 for (i = 0; i < memblock.memory.cnt; i++) {
208 unsigned long start_pfn, end_pfn;
209 start_pfn = memblock.memory.region[i].base >> PAGE_SHIFT;
210 end_pfn = start_pfn + memblock_size_pages(&memblock.memory, i);
211 add_active_range(0, start_pfn, end_pfn);
212 }
此循环只会执行一次,因为由第1阶段可知memblock.memory.cnt为1。
其中:start_pfn=0, end_pfn=0x80000,把这段内存pfn加入early_node_map[MAX_ACTIVE_REGIONS=32]中,如果有多个不连续的内存段,会加入到不同的early_node_map项中,最多32个段。early_node_map为临时使用,最终所占空间会被释放。
217 #ifdef CONFIG_HIGHMEM
218 free_bootmem_with_active_regions(0, lowmem_end_addr >> PAGE_SHIFT);
219
220 /* reserve the sections we're already using */
221 for (i = 0; i < memblock.reserved.cnt; i++) {
222 unsigned long addr = memblock.reserved.region[i].base +
223 memblock_size_bytes(&memblock.reserved, i) - 1;
224 if (addr < lowmem_end_addr)
225 reserve_bootmem(memblock.reserved.region[i].base,
226 memblock_size_bytes(&memblock.reserved, i),
227 BOOTMEM_DEFAULT);
228 else if (memblock.reserved.region[i].base < lowmem_end_addr) {
229 unsigned long adjusted_size = lowmem_end_addr -
230 memblock.reserved.region[i].base;
231 reserve_bootmem(memblock.reserved.region[i].base,
232 adjusted_size, BOOTMEM_DEFAULT);
233 }
234 }
235 #else
236 free_bootmem_with_active_regions(0, max_pfn);
237
238 /* reserve the sections we're already using */
239 for (i = 0; i < memblock.reserved.cnt; i++)
240 reserve_bootmem(memblock.reserved.region[i].base,
241 memblock_size_bytes(&memblock.reserved, i),
242 BOOTMEM_DEFAULT);
243
244 #endif
由于存在highmem,所以执行上半段。
218:释放early_node_map数组标识的lowmem的内存段,即把0~30000的PFN对应的bitmap区中相应位置0.其中每bit标识一页,LOG信息为:
bootmem::mark_bootmem_node nid=0 start=0 end=30000 reserve=0 flags=0
bootmem::__free nid=0 start=0 end=30000
221:开始的for循环是把已经分配的内存页标记为1,即已在使用。LOG信息如下:
bootmem::mark_bootmem_node nid=0 start=0 end=6b1 reserve=1 flags=0
bootmem::__reserve nid=0 start=0 end=6b1 flags=0
bootmem::mark_bootmem_node nid=0 start=ffa end=ffd reserve=1 flags=0
bootmem::__reserve nid=0 start=ffa end=ffd flags=0
bootmem::mark_bootmem_node nid=0 start=2fbc4 end=2fbdf reserve=1 flags=0
bootmem::__reserve nid=0 start=2fbc4 end=2fbdf flags=0
bootmem::mark_bootmem_node nid=0 start=2fbdf end=30000 reserve=1 flags=0
bootmem::__reserve nid=0 start=2fbdf end=30000 flags=0
bootmem::mark_bootmem_node nid=0 start=2fbe4 end=30000 reserve=1 flags=0
bootmem::__reserve nid=0 start=2fbe4 end=30000 flags=0
只标记了低端内存,高端内存未做标记,即:
reserved[0x5] 0x000000007ffff000 - 0x000000007fffffff, 0x1000 bytes
246 sparse_memory_present_with_active_regions(0);
247
248 init_bootmem_done = 1;
249 }
246:空函数
248:标识bootmem初始化完成,此后可以使用bootmem来分配内存页了。bootmem使用位图来分配页,每次分配都从头开始扫描位,找到满足条件的连续空页。
3. 全部内存初始化完毕,可以用cache和buddy分配内存。
相关常量定义:
26 #define NODE_DATA(nid) (node_data[nid])
782 extern struct pglist_data contig_page_data;
783 #define NODE_DATA(nid) (&contig_page_data)
784 #define NODE_MEM_MAP(nid) mem_map
4714 struct pglist_data __refdata contig_page_data = {
4716 .bdata = &bootmem_node_data[0]
4718 };
38 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;//MAX_NUMNODES=1(即1 << NODES_SHIFT,NODES_SHIFT=0)
31 typedef struct bootmem_data {
32 unsigned long node_min_pfn;
33 unsigned long node_low_pfn;
34 void *node_bootmem_map;
35 unsigned long last_end_off;
36 unsigned long hint_idx;
37 struct list_head list;
38 } bootmem_data_t;
609 typedef struct pglist_data {
610 struct zone node_zones[MAX_NR_ZONES];
611 struct zonelist node_zonelists[MAX_ZONELISTS];
612 int nr_zones;
613 #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
614 struct page *node_mem_map;
615 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
616 struct page_cgroup *node_page_cgroup;
617 #endif
618 #endif
619 #ifndef CONFIG_NO_BOOTMEM
620 struct bootmem_data *bdata;
621 #endif
622 #ifdef CONFIG_MEMORY_HOTPLUG
623 /*
624 * Must be held any time you expect node_start_pfn, node_present_pages
625 * or node_spanned_pages stay constant. Holding this will also
626 * guarantee that any pfn_valid() stays that way.
627 *
628 * Nests above zone->lock and zone->size_seqlock.
629 */
630 spinlock_t node_size_lock;
631 #endif
632 unsigned long node_start_pfn;
633 unsigned long node_present_pages; /* total number of physical pages */
634 unsigned long node_spanned_pages; /* total size of physical page
635 range, including holes */
636 int node_id;
637 wait_queue_head_t kswapd_wait;
638 struct task_struct *kswapd;
639 int kswapd_max_order;
640 } pg_data_t;
576 struct zonelist {
577 struct zonelist_cache *zlcache_ptr; // NULL or &zlcache = NULL
578 struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];//MAX_ZONES_PER_ZONELIST=4
579 #ifdef CONFIG_NUMA
580 struct zonelist_cache zlcache; // optional ...
581 #endif
582 };
554 struct zoneref {
555 struct zone *zone; /* Pointer to actual zone */
556 int zone_idx; /* zone_idx(zoneref->zone) */
557 };
内存初始化步骤:
1. start_kernel---->setup_arch->paging_init
|-->setup_per_cpu_areas
|-->build_all_zonelists
|-->mem_init
-->setup_per_cpu_pageset
start_kernel---->setup_arch->paging_init代码如下:
278-280:计算整个内存的大小,此处为:0x8000 0000
282-288: 映射临时映射空间,也称固定映射
290-296:映射高端内存PTE
307:最为重要,该函数清空bitmap区,把所有页设为保留。
|-->setup_per_cpu_areas
该函数只在SMP情况下才有实际内容;此处为非SMP情况,所以是空函数。
|-->build_all_zonelists
函数代码为:
函数中:__build_all_zonelists函数建立了zone链表,当一个zone不能满足当前分配时会根据此链表来找到下一个可以分配的zone。当有多个节点时,建立后的结果为:
b2->b1->b0->a2->a1->a0 (b=node1,a=node0,2=highmem, 1=normal, 0=dma)
|-->mem_init函数代码为:
532:为空函数
533:函数执行完毕后, mem_init_done = 1;即可以使用buddy系统来分配内存了。
534:用来初始化slab和cache,函数执行完后slab可以使用,可以分配少于1页的内存了。
535:空函数
536:vmalloc空间可以正常使用了。
整个内存的布局为:
Kernel virtual memory layout:
* 0xfffcf000..0xfffff000 : fixmap
* 0xff800000..0xffc00000 : highmem PTEs
* 0xff7d8000..0xff800000 : early ioremap
* 0xf1000000..0xff7d8000 : vmalloc & ioremap