Linux简化了分段机制,使得虚拟地址与线性地址总是一致,因此Linux的虚拟地址空间也为0~4G。Linux内核将这4G字节的空间分为两部分。将最高的1G字节(从虚拟地址0xC0000000到0xFFFFFFFF)供内核使用,称为“内核空间”。而将较低的3G字节(从虚拟地址0x00000000到0xBFFFFFFF)供各个进程使用,称为“用户空间“。因为每个进程可以通过系统调用进入内核,因此Linux内核由系统内的所有进程共享。于是,从具体进程的角度来看,每个进程可以拥有4G字节的虚拟空间。
Linux使用两级保护机制:0级供内核使用,3级供用户程序使用。每个进程有各自的私有用户空间(0~3G),这个空间对系统中的其他进程是不可见的。最高的1GB字节虚拟内核空间则为所有进程以及内核所共享。内核空间中存放的是内核代码和数据,而进程的用户空间中存放的是用户程序的代码和数据。不管是内核空间还是用户空间,它们都处于虚拟空间中。虽然内核空间占据了每个虚拟空间中的最高1GB字节,但映射到物理内存却总是从最低地址(0x00000000)开始。对内核空间来说,其地址映射是很简单的线性映射,0xC0000000就是物理地址与线性地址之间的位移量,在Linux代码中就叫做PAGE_OFFSET。
linux页表映射机制的建立分为两个阶段,第一个阶段是内核进入保护模式之前要先建立一个临时内核页表并开启分页功能,因为在进入保护模式后,内核继续初始化直到建立完整的内存映射机制之前,仍然需要用到页表来映射相应的内存地址。对x86 32位内核,这个工作在保护模式下的内核入口函数arch/x86/kernel/head_32.S:startup_32()中完成。第二阶段是建立完整的内存映射机制,在在setup_arch()--->arch/x86/mm/init.c:init_memory_mapping()中完成。
/* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. */ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; unsigned long ret = 0; unsigned long pos; struct map_range mr[NR_RANGE_MR]; int nr_range, i; int use_pse, use_gbpages; printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ use_pse = use_gbpages = 0; #else use_pse = cpu_has_pse; use_gbpages = direct_gbpages; #endif /* 定义了X86_PAE模式后进行调用 */ if (cpu_has_pse) <span style="white-space:pre"> </span>set_in_cr4(X86_CR4_PSE); /* 激活PSE(如果可用) */ if (cpu_has_pse) set_in_cr4(X86_CR4_PSE); /* 激活PGE(如果可用) */ if (cpu_has_pge) { set_in_cr4(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } /* page_size_mask在这里更新,在后面设置页表时用到 */ if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; if (use_pse) page_size_mask |= 1 << PG_LEVEL_2M; memset(mr, 0, sizeof(mr)); nr_range = 0; /* 作为初始页面帧号值,如果没有大内存页对齐 */ start_pfn = start >> PAGE_SHIFT; /* 在setup函数中调用时,这里为0 */ pos = start_pfn << PAGE_SHIFT; /* pos为0 */ #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory * because there are often fixed size MTRRs in there * and overlapping MTRRs into large pages can cause * slowdowns. */ if (pos == 0) /* end_pfn的大小为1k,也就是4M大小的内存 */ end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); else end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); #else /* CONFIG_X86_64 */ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); #endif if (end_pfn > (end >> PAGE_SHIFT)) end_pfn = end >> PAGE_SHIFT; if (start_pfn < end_pfn) { /* 4M空间将这个区间存放在mr数组中 */ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pos = end_pfn << PAGE_SHIFT; } /* 大内存页(2M)范围:对齐到PMD,换算成页面的多少 */ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); #ifdef CONFIG_X86_32 /* 这里的结束地址设置为调用的结束位页面数,也就是 所有的物理页面数 */ end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); #else /* CONFIG_X86_64 */ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); #endif if (start_pfn < end_pfn) { /* 将这段内存放入mr中,保存后面用到 */ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); /* 这里保证了运用PSE时为2M页面而不是PSE时, 仍然为4K页面(上面的按位或和这里的按位与) */ pos = end_pfn << PAGE_SHIFT; /* 更新pos */ } #ifdef CONFIG_X86_64 /* 大内存页(1G)范围 */ start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); pos = end_pfn << PAGE_SHIFT; } /* 尾部不是大内存页(1G)对齐 */ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); pos = end_pfn << PAGE_SHIFT; } #endif /* 尾部不是大内存页(2M)对齐 */ start_pfn = pos>>PAGE_SHIFT; end_pfn = end>>PAGE_SHIFT; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); /* 合并相同页面大小的连续的页面 */ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { unsigned long old_start; if (mr[i].end != mr[i+1].start || mr[i].page_size_mask != mr[i+1].page_size_mask) continue; /* move it */ old_start = mr[i].start; memmove(&mr[i], &mr[i+1], (nr_range - 1 - i) * sizeof(struct map_range)); mr[i--].start = old_start; nr_range--; } /* 打印相关信息 */ for (i = 0; i < nr_range; i++) printk(KERN_DEBUG " %010lx - %010lx page %s\n", mr[i].start, mr[i].end, (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); /* * 为内核直接映射的页表查找空间 * 以后我们应该在内存映射的本地节点分配这些页表。不幸的是目前这需要在 * 查找到节点之前来做 */ if (!after_bootmem) /*如果内存启动分配器没有建立,则直接从e820.map中找到合适的 连续内存,找到存放页表的空间首地址为e820_table_start */ find_early_table_space(end, use_pse, use_gbpages); #ifdef CONFIG_X86_32 for (i = 0; i < nr_range; i++) /* 对每个保存的区域设置页表映射 */ kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); ret = end; #else /* CONFIG_X86_64 */ for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); #endif #ifdef CONFIG_X86_32 /* 对高端内存固定区域建立映射 */ early_ioremap_page_table_range_init(); /* 放入CR3寄存器 */ load_cr3(swapper_pg_dir); #endif #ifdef CONFIG_X86_64 if (!after_bootmem && !start) { pud_t *pud; pmd_t *pmd; mmu_cr4_features = read_cr4(); /* * _brk_end cannot change anymore, but it and _end may be * located on different 2M pages. cleanup_highmap(), however, * can only consider _end when it runs, so destroy any * mappings beyond _brk_end here. */ pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); pmd = pmd_offset(pud, _brk_end - 1); while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) pmd_clear(pmd); } #endif __flush_tlb_all(); /* 刷新寄存器 */ /* 将分配给建立页表机制的内存空间保留 */ if (!after_bootmem && e820_table_end > e820_table_start) reserve_early(e820_table_start << PAGE_SHIFT, e820_table_end << PAGE_SHIFT, "PGTABLE"); if (!after_bootmem) early_memtest(start, end); return ret >> PAGE_SHIFT; }
map_range结构、save_mr(),以及find_early_table_space()的实现也都在arch/x86/mm/init.c中,如下:
struct map_range { unsigned long start; unsigned long end; unsigned page_size_mask; };
static int __meminit save_mr(struct map_range *mr, int nr_range, unsigned long start_pfn, unsigned long end_pfn, unsigned long page_size_mask) { if (start_pfn < end_pfn) { if (nr_range >= NR_RANGE_MR) panic("run out of range for init_memory_mapping\n"); mr[nr_range].start = start_pfn<<PAGE_SHIFT; mr[nr_range].end = end_pfn<<PAGE_SHIFT; mr[nr_range].page_size_mask = page_size_mask; nr_range++; } return nr_range; }
static void __init find_early_table_space(struct map_range *mr, unsigned long end, int use_pse, int use_gbpages) /* 查找页表需要的空间 */ { unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; phys_addr_t base; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);/* 计算需要用到多少pud,当没有pud存在的情况下pud=pgd */ if (use_gbpages) { unsigned long extra; extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; } else pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);/* 计算映射所有内存所要求的所有pmd的个数 */ if (use_pse) { unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); #ifdef CONFIG_X86_32 extra += PMD_SIZE; #endif /* The first 2/4M doesn't use large pages. */ if (mr->start < PMD_SIZE) extra += mr->end - mr->start; ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else<span style="white-space:pre"> /* 计算所需要的pte个数 */</span> ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); #ifdef CONFIG_X86_32 /* for fixmap /* 加上固定内存映射区的页表数量 */*/ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); #endif good_end = max_pfn_mapped << PAGE_SHIFT; base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); if (!base) panic("Cannot find space for the kernel page tables"); pgt_buf_start = base >> PAGE_SHIFT; pgt_buf_end = pgt_buf_start; pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", end - 1, pgt_buf_start << PAGE_SHIFT, (pgt_buf_top << PAGE_SHIFT) - 1); }find_early_table_space()先计算映射所需的pud, pmd, pte个数,对32位系统,页表存放的起始地址为0x7000。然后,调用find_e820_area()从e820.map中找到连续的足够大小的内存来存放用于映射的页表,并将页表起始地址的物理页面帧号保存到相关的全局变量中。
内核空间映射kernel_physical_mapping_init()分析对32位系统,该函数在arch/x86/mm/init_32.c中。它把低端区的所有max_low_pfn个物理内存页面映射到内核虚拟地址空间,映射页表从内核空间的起始地址处开始创建,即从PAGE_OFFSET(0xc0000000)开始的整个内核空间,直到物理内存映射完毕。此函数解释内核是如何建立页表
* This maps the physical memory to kernel virtual address space, a total * of max_low_pfn pages, by creating page tables starting from address * PAGE_OFFSET: */ unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { int use_pse = page_size_mask == (1<<PG_LEVEL_2M); unsigned long last_map_addr = end; unsigned long start_pfn, end_pfn; pgd_t *pgd_base = swapper_pg_dir; int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; pgd_t *pgd; pmd_t *pmd; pte_t *pte; unsigned pages_2m, pages_4k; int mapping_iter; start_pfn = start >> PAGE_SHIFT;* 得到要映射的起始地址和终止地址所在页在页帧号 */ end_pfn = end >> PAGE_SHIFT; /* * First iteration will setup identity mapping using large/small pages * based on use_pse, with other attributes same as set by * the early code in head_32.S * * Second iteration will setup the appropriate attributes (NX, GLOBAL..) * as desired for the kernel identity mapping. * * This two pass mechanism conforms to the TLB app note which says: * * "Software should not write to a paging-structure entry in a way * that would change, for any linear address, both the page size * and either the page frame or attributes." */ mapping_iter = 1; if (!cpu_has_pse) use_pse = 0; repeat: pages_2m = pages_4k = 0; pfn = start_pfn; pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); /* 返回页框在PGD表中的索引 */ pgd = pgd_base + pgd_idx; for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { pmd = one_md_table_init(pgd);/* 创建该pgd目录项指向的pmd表 */ if (pfn >= end_pfn) continue; #ifdef CONFIG_X86_PAE<span style="white-space:pre"> /* 三级映射需要设置pmd,因此得到页框在PMD表中的索引 * </span>/ pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); pmd += pmd_idx; #else pmd_idx = 0; /* 两级映射则无需设置 */ #endif for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn; pmd++, pmd_idx++) { unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; /* * Map with big pages if possible, otherwise * create normal page tables: */ if (use_pse) { unsigned int addr2; pgprot_t prot = PAGE_KERNEL_LARGE; /* * first pass will use the same initial * identity mapping attribute + _PAGE_PSE. */ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR | _PAGE_PSE); addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; if (is_kernel_text(addr) || is_kernel_text(addr2)) prot = PAGE_KERNEL_LARGE_EXEC; pages_2m++; if (mapping_iter == 1) set_pmd(pmd, pfn_pmd(pfn, init_prot)); else set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; continue; } pte = one_page_table_init(pmd /* 返回PMD中第一个PTE */ pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); /* PTE的索引 */ pte += pte_ofs; for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { pgprot_t prot = PAGE_KERNEL; /* * first pass will use the same initial * identity mapping attribute. */ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR); if (is_kernel_text(addr)) prot = PAGE_KERNEL_EXEC; pages_4k++; /* 没有PSE */ /* 设置页表,根据MAPPING_ITER变量的不同 对表设置不同的属性 */ if (mapping_iter == 1) { /* 第一次迭代,属性设置都一样 */ set_pte(pte, pfn_pte(pfn, init_prot)); last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE; } else set_pte(pte, pfn_pte(pfn, prot)); } } } if (mapping_iter == 1) { /* * update direct mapping page count only in the first * iteration. 只在第一次迭代中更新直接映射页的数量 */ update_page_count(PG_LEVEL_2M, pages_2m); update_page_count(PG_LEVEL_4K, pages_4k); /* * local global flush tlb, which will flush the previous * mappings present in both small and large page TLB's. */ __flush_tlb_all(); /* * Second iteration will set the actual desired PTE attributes. 第二次迭代将设置实际的PTE属性 */ mapping_iter = 2; goto repeat; } return last_map_addr; }
static pmd_t * __init one_md_table_init(pgd_t *pgd) { pud_t *pud; pmd_t *pmd_table; #ifdef CONFIG_X86_PAE /* 启用了PAE,需要三级映射,创建PMD表 */ if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_bootmem) pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); else pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); /* 设置PGD,将对应的PGD项设置为PMD表 */ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); return pmd_table; } #endif /* 非PAE模式:只需二级映射,直接返回原来pgd地址 */ pud = pud_offset(pgd, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; } static pte_t * __init one_page_table_init(pmd_t *pmd) { if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = NULL; if (after_bootmem) { #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif if (!page_table) page_table = (pte_t *)alloc_bootmem_pages(PAGE_SIZE); } else /* 如果启动分配器还没有建立,那么 从刚才分配建立的表中分配空间 */ page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); /* 设置PMD,将对应的PMD项设置为页表 */ set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } return pte_offset_kernel(pmd, 0); } static inline int is_kernel_text(unsigned long addr) { if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) return 1; return 0; }(1)函数开始定义了几个变量,pgd_base指向临时全局页表起始地址(即swapper_pg_dir)。pgd指向一个页表目录项开始的地址,pmd指向一个中间目录开始的地址,pte指向一个页表开始的地址,start_pfn为要映射的起始地址所在物理页框号,end_pfn为终止地址所在物理页框号。
void __init early_ioremap_page_table_range_init(void) { pgd_t *pgd_base = swapper_pg_dir; unsigned long vaddr, end; /* * 固定映射,只是创建页表结构,并不建立实际映射。实际映射将由set_fixmap()来完成: */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; /* 这里是对临时映射区域进行映射而为页表等分配了空间, 但是没有建立实际的映射 */ page_table_range_init(vaddr, end, pgd_base); /* 置变量after_paging_init为1,表示启动了分页机制 */ early_ioremap_reset(); } /* * This function initializes a certain range of kernel virtual memory * with new bootmem page tables, everywhere page tables are missing in * the given range. * * NOTE: The pagetables are allocated contiguous on the physical space * so we can cache the place of the first one and move around without * checking the pgd every time. */ static void __init page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) { int pgd_idx, pmd_idx; unsigned long vaddr; pgd_t *pgd; pmd_t *pmd; pte_t *pte = NULL; vaddr = start; pgd_idx = pgd_index(vaddr); pmd_idx = pmd_index(vaddr); pgd = pgd_base + pgd_idx; for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { pmd = one_md_table_init(pgd); pmd = pmd + pmd_index(vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { /* early fixmap可能对临时映射区中的页表项已经分配了页表, 为使页表分配的空间连续,需要对临时映射区的页表指定区间重新分配 */ /* 在这里已经对pte进行了分配和初始化 */ pte = page_table_kmap_check(one_page_table_init(pmd), pmd, vaddr, pte); vaddr += PMD_SIZE; } pmd_idx = 0; } }
可知:
(1)先计算出固定映射区的起始和终止地址,然后调用page_table_range_init(),用新的bootmem页表项初始化这段高端物理内存要映射到的内核虚拟地址空间,但并不建立实际的映射。最后用early_ioremap_reset()设置after_paging_init为1,表示启动分页机制。
(2)在函数page_table_range_init()中,先获取起址的pgd表项索引、pmd表项索引,然后类似地建立下一级pmd表,和最终的pte页表。在建立页表时需要调用page_table_kmap_check()进行检查,因为在前期可能对固定映射区已经分配了页表项,为使页表分配的空间连续,需要对固定映射区的页表指定区间重新分配。
在init_memory_mapping()中,内核设置好内核页表,并初始化完高端固定映射区后,紧接着调用load_cr3(swapper_pg_dir),将页全局目录表基址swapper_pg_dir送入控制寄存器cr3。每当重新设置cr3时, CPU就会将页面映射目录所在的页面装入CPU内部高速缓存中的TLB部分。现在内存中(实际上是高速缓存中)的映射目录变了,就要再让CPU装入一次。由于页面映射机制本来就是开启着的,所以从load_cr3这条指令执行完以后就扩大了系统空间中有映射区域的大小, 使整个映射覆盖到整个物理内存(高端内存除外)。实际上此时swapper_pg_dir中已经改变的目录项很可能还在高速缓存中,所以还要通过__flush_tlb_all()将高速缓存中的内容冲刷到内存中,这样才能保证内存中映射目录内容的一致性。
通过上述对init_memory_mapping()的剖析,我们可以清晰的看到,构建内核页表,无非就是向相应的表项写入下一级地址和属性。在内核空间保留着一部分内存专门用来存放内核页表。当cpu要进行寻址的时候,无论在内核空间,还是在用户空间,都会通过这个页表来进行映射。对于这个函数,内核把整个物理内存空间都映射完了,当用户空间的进程要使用物理内存时,岂不是不能做相应的映射了?其实不会,内核只是做了映射,映射不代表使用,这样做是内核为了方便管理内存而已。
arch/x86/kernel/setup.c:setup_arch()在用init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT)建立完内核页表之后,就会调用arch/x86/mm/init_32.c:initmem_init(0, max_pfn)启动bootmem内存分配器
在系统启动阶段,buddy系统和slab分配器建立之前,系统的每个节点都拥有自己的bootmem allocator来实现内存的分配,当启动阶段结束后,bootmem allocator将被销毁,而相应的空闲内存会提交给buddy系统来管理,因此bootmem allocator所存在的时间是短暂的,它的宗旨是简单;bootmem allocator的基本思想是在一个节点中建立一片位图区域,每一位对应该节点的低端内存的一个页框,通过一个bit来标记一个页的状态,实现页面的分配与回收。
初始化的函数是init_bootmem(),其和init_bootmem_node()一样,都是对init_bootmem_core()的封装,区别是前者只针对单节点系统,而后者指定了一个节点,在后面其他操作中都用到了类似的封装方法。
bootmem allocator的工作过程
1.bootmem allocator的初始化
2.bootmem allocator保留内存和释放内存
3.bootmem allocator分配内存
4.bootmem allocator的销毁
在setup_arch中通过initmem_init()-->setup_bootmem_allocator()-->setup_node_bootmem()-->init_bootmem_node()来建立节点中的bootmem allocator.
#ifndef CONFIG_NEED_MULTIPLE_NODES void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; /* 注册内存活动区 */ e820_register_active_regions(0, 0, highend_pfn); sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; /* 计算高端内存地址 */ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else e820_register_active_regions(0, 0, max_low_pfn); sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif #ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif __vmalloc_start_set = true; printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); setup_bootmem_allocator(); /* 启动内存分配器 */ } #endif /* !CONFIG_NEED_MULTIPLE_NODES */
static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, unsigned long mapstart, unsigned long start, unsigned long end) { unsigned long mapsize; mminit_validate_memmodel_limits(&start, &end); bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));/*存储位图起始地址的虚拟地址*/ bdata->node_min_pfn = start;/*节点中的起始页*/ bdata->node_low_pfn = end; /*节点中的终止页*/ link_bootmem(bdata);/*将该bdata按顺序链入bdata_list中*/ /* * Initially all pages are reserved - setup_arch() has to * register free RAM areas explicitly. */ mapsize = bootmap_bytes(end - start); memset(bdata->node_bootmem_map, 0xff, mapsize);/*将位图全部置1,保留所有页*/ bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", bdata - bootmem_node_data, start, mapstart, end, mapsize); return mapsize;/*返回位图大小*/ }
init_bootmem_core()--->link_bootmem(bdata),最终将bdata添加到全局的bdata_list链表中。当所有在线内存节点设置好后,bootmem内存分配器就初始化完毕。
mm/bootmem.c实现了完整的引导时物理内存分配器和配置器,包括内存节点初始化、内存分配、释放等各种操作。我们概述一下启动内存分配器的主要操作接口功能:
init_bootmem_node():注册一个节点以作为启动内存。核心操作由init_bootmem_core()完成,每调用它一次来设置自己的分配器。
link_bootmem():按顺序添加一个bdata到全局的bdata_list链表中。
free_all_bootmem_node():释放一个节点的可用页面给伙伴系统。核心操作由free_all_bootmem_core()完成。
free_bootmem_node():将指定节点上的一个页面范围标记为可用(即未分配)。
reserve_bootmem_node():将指定节点上的一个页面范围标记为保留。
__alloc_bootmem_node():为指定节点分配启动内存。核心操作由alloc_bootmem_core()完成。
__free():bootmem分配器的释放内存操作。
__reserve():bootmem分配器的保留内存操作。
alloc_bootmem_core():bootmem分配器的分配内存操作
/** * reserve_bootmem_node - mark a page range as reserved * @pgdat: node the range resides on * @physaddr: starting address of the range * @size: size of the range in bytes * @flags: reservation flags (see linux/bootmem.h) * * Partial pages will be reserved. * * The range must reside completely on the specified node. */ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { unsigned long start, end; start = PFN_DOWN(physaddr); /*获得起始页框*/ end = PFN_UP(physaddr + size);/*获得终止页框*/ return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); }
static int __init mark_bootmem_node(bootmem_data_t *bdata, unsigned long start, unsigned long end, int reserve, int flags) { unsigned long sidx, eidx; bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", bdata - bootmem_node_data, start, end, reserve, flags); /*条件判断*/ BUG_ON(start < bdata->node_min_pfn); BUG_ON(end > bdata->node_low_pfn); /*计算出start index,end index,即start和end相对于节点最小页框号的偏移量*/ sidx = start - bdata->node_min_pfn; eidx = end - bdata->node_min_pfn; if (reserve) /*如果选择保留页框*/ return __reserve(bdata, sidx, eidx, flags); else /*选择释放页框*/ __free(bdata, sidx, eidx); return 0; }
static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx, int flags) { unsigned long idx; int exclusive = flags & BOOTMEM_EXCLUSIVE; bdebug("nid=%td start=%lx end=%lx flags=%x\n", bdata - bootmem_node_data, sidx + bdata->node_min_pfn, eidx + bdata->node_min_pfn, flags); for (idx = sidx; idx < eidx; idx++)/*遍历sidx-->eidx的页框对应的位图区域*/ if (test_and_set_bit(idx, bdata->node_bootmem_map)) {/*把位图的相关位置1*/ if (exclusive) { __free(bdata, sidx, idx); return -EBUSY; } bdebug("silent double reserve of PFN %lx\n", idx + bdata->node_min_pfn); } return 0; }保留页面的关键操作就是调用test_and_set_bit()将位图的相关区域置1.
static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { unsigned long idx; bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, sidx + bdata->node_min_pfn, eidx + bdata->node_min_pfn); if (bdata->hint_idx > sidx) bdata->hint_idx = sidx;/*hint_idx指向最低的空闲页*/ for (idx = sidx; idx < eidx; idx++)/*遍历相关的位图区域*/ if (!test_and_clear_bit(idx, bdata->node_bootmem_map))//清零 BUG(); }__free()相较__reserve()多了一处对bdata->hint_idx的操作,这个地方是为了保证hint_idx指向最低的空闲页,因为在进行分配的时候,boot allocator是保证从最低的空闲页开始分配
bootmem allocator要考虑的一个问题就是内存碎片。设我们的页面大小为4KB,假如我们上一次分配内存的范围是从第4个页面开始到第8个页面的2KB处,而这次要求分配的起始地址处于第九个页面,如果从第九个页面开始分配的话,那么至少会产生2KB的内存碎片,这样无疑会产生大量的浪费。这也是为什么我们之前介绍的bootmem关键数据结构中引入last_end_off这个变量,它记录了上次分配的末端地址离页尾的偏移,在我们这个例子中该值为2KB,那么如果这次我们从第9个页面开始分配,我们就要考虑将这2KB整合到这次分配中去。
static void * __init alloc_bootmem_core(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { bootmem_data_t *bdata; void *region; region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); if (region) return region; list_for_each_entry(bdata, &bdata_list, list) { if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) continue; if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) break; region = alloc_bootmem_bdata(bdata, size, align, goal, limit); if (region) return region; } return NULL; }
static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { if (WARN_ON_ONCE(slab_is_available())) return kzalloc(size, GFP_NOWAIT); #ifdef CONFIG_HAVE_ARCH_BOOTMEM { bootmem_data_t *p_bdata; p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); if (p_bdata) return alloc_bootmem_bdata(p_bdata, size, align, goal, limit); } #endif return NULL;
static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, align, goal, limit); BUG_ON(!size); BUG_ON(align & (align - 1));/*检测对齐数是否为2的指数幂*/ BUG_ON(limit && goal + size > limit); /*如果limit不为0则检测goal+size是否超过limit*/ if (!bdata->node_bootmem_map) return NULL; min = bdata->node_min_pfn; max = bdata->node_low_pfn;/*得到该节点的最小最大低端内存页框号*/ /*将goal和limit从地址转化为页框号*/ goal >>= PAGE_SHIFT; limit >>= PAGE_SHIFT; if (limit && max > limit) max = limit; if (max <= min) return NULL; step = max(align >> PAGE_SHIFT, 1UL); /*设定步进,以页面为单位*/ if (goal && min < goal && goal < max) start = ALIGN(goal, step); else start = ALIGN(min, step); /*确定起始页框和最大页框的偏移量*/ sidx = start - bdata->node_min_pfn; midx = max - bdata->node_min_pfn; if (bdata->hint_idx > sidx) { /*/*sidx小于hint_idx的话则要下调至hint_idx对齐后的结果*/ * Handle the valid case of sidx being zero and still * catch the fallback below. */ fallback = sidx + 1; sidx = align_idx(bdata, bdata->hint_idx, step); } while (1) { int merge; void *region; unsigned long eidx, i, start_off, end_off; find_block:/*找到下一个0位作为起始地址 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); sidx = align_idx(bdata, sidx, step); /*按step进行对齐*/ eidx = sidx + PFN_UP(size); if (sidx >= midx || eidx > midx) break; for (i = sidx; i < eidx; i++) if (test_bit(i, bdata->node_bootmem_map)) {/*遇到了保留位, 则表明无法找到一块连续的空闲区域*/ sidx = align_idx(bdata, i, step); if (sidx == i) sidx += step; goto find_block; /*重新开始检索bitmap*/ } /*如果 1.上次分配的PAGE还有剩余的空间 2.PAGE_SIZE-1>0 3.上次分配的PAGE是在这次要求分配的PAGE的相邻并在前面*/ if (bdata->last_end_off & (PAGE_SIZE - 1) && PFN_DOWN(bdata->last_end_off) + 1 == sidx) start_off = align_off(bdata, bdata->last_end_off, align); else start_off = PFN_PHYS(sidx); /*不满足上述条件,则从要求的起始PAGE开始*/ merge = PFN_DOWN(start_off) < sidx;/*确定merge的值为0或1*/ end_off = start_off + size; /*重新确定last_end_off和hint_idx*/ bdata->last_end_off = end_off; bdata->hint_idx = PFN_UP(end_off); /* * Reserve the area now: */ if (__reserve(bdata, PFN_DOWN(start_off) + merge, PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) BUG(); /*得到起始地址的虚拟地址*/ region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + start_off); memset(region, 0, size); /* * The min_count is set to 0 so that bootmem allocated blocks * are never reported as leaks. */ kmemleak_alloc(region, size, 0, 0); return region; } if (fallback) { sidx = align_idx(bdata, fallback - 1, step); fallback = 0; goto find_block; } return NULL; }
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; struct page *page; unsigned long start, end, pages, count = 0; if (!bdata->node_bootmem_map)/*bitmap不存在,表示该节点已经释放*/ return 0; /*获得低端内存的起始页框和终止页框*/ start = bdata->node_min_pfn; end = bdata->node_low_pfn; /* * If the start is aligned to the machines wordsize, we might * be able to free pages in bulks of that order. */ aligned = !(start & (BITS_PER_LONG - 1));/*得到start是否为2的指数幂*/ bdebug("nid=%td start=%lx end=%lx aligned=%d\n", bdata - bootmem_node_data, start, end, aligned); /************************************* * 第一步:释放空闲页 * *************************************/ while (start < end) { unsigned long *map, idx, vec; map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; vec = ~map[idx / BITS_PER_LONG];/*将idx所处的long字段的位图部分进行取反*/ /*如果:1.起始地址是2的整数幂 2.该long字段的位图全为0,即空闲状态 3.start+BITS_PER_LONG未超过范围*/ if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { int order = ilog2(BITS_PER_LONG);/*得到Long的长度为2的多少次幂*/ __free_pages_bootmem(pfn_to_page(start), order);/*直接将整块内存释放*/ count += BITS_PER_LONG; } else {/*否则只能逐页释放*/ unsigned long off = 0; while (vec && off < BITS_PER_LONG) {/*判断该字段内的空闲页是否已经释放完*/ if (vec & 1) { /*vec的最低位为1,也就是说start+off对应的page为空闲*/ page = pfn_to_page(start + off); __free_pages_bootmem(page, 0); count++; } vec >>= 1; off++; } } start += BITS_PER_LONG; } /***************************** * 第二步:释放保存bitmap的页 * ******************************/ page = virt_to_page(bdata->node_bootmem_map);/*得到bitmap起始地址的所属页*/ pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages);/*得到bitmap的大小,以页为单位*/ count += pages; while (pages--)/*逐页释放*/ __free_pages_bootmem(page++, 0); bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); return count;/*返回释放的页框数*/ }
文章参考:http://blog.csdn.net/bullbat/article/details/7170571 http://blog.csdn.net/vanbreaker/article/details/7529414