内核版本:Linux 4.14
硬件平台:IMX6DL-SABRESD
start_kernel()
|----page_address_init()
|----setup_arch()
|----setup_machine_fdt()
| |----early_init_dt_scan_nodes()
| |----of_scan_flat_dt(early_init_dt_scan_memory, NULL)
| |----early_init_dt_scan_memory()
| |----early_init_dt_add_memory_arch()
| |----memblock_add()
|----early_mm_init()
|----setup_dma_zone()
|----paging_init()
|----mm_init_cpumask()
|----build_all_zonelists()
|----page_alloc_init()
|----vfs_caches_init_early()
|----mm_init()
|----kmem_cache_init_late()
|----kmemleak_init()
|----debug_objects_mem_init()
|----setup_per_cpu_pageset()
|----numa_policy_init()
|----anon_vma_init()
内核初始化中涉及内存的函数非常多,下面就一步一步来从总体上分析。
在32位的Linux系统中,虚拟地址一共是4GB。将整个虚拟地址划分为用户空间+内核空间,有以下3中划分方式:
choice
prompt "Memory split"
depends on MMU
default VMSPLIT_3G
help
Select the desired split between kernel and user memory.
If you are not absolutely sure what you are doing, leave this
option alone!
config VMSPLIT_3G
bool "3G/1G user/kernel split"
config VMSPLIT_3G_OPT
depends on !ARM_LPAE
bool "3G/1G user/kernel split (for full 1G low memory)"
config VMSPLIT_2G
bool "2G/2G user/kernel split"
config VMSPLIT_1G
bool "1G/3G user/kernel split"
endchoice
config PAGE_OFFSET
hex
default PHYS_OFFSET if !MMU
default 0x40000000 if VMSPLIT_1G
default 0x80000000 if VMSPLIT_2G
default 0xB0000000 if VMSPLIT_3G_OPT
default 0xC0000000
ARM 中采用 VMSPLIT_2G,所以用户空间与内核空间各 2G,分水岭为 0x80008000。
在 .config 文件中
CONFIG_VMSPLIT_2G=y
# CONFIG_VMSPLIT_1G is not set
CONFIG_PAGE_OFFSET=0x80000000
/* PAGE_OFFSET - the virtual address of the start of the kernel image */
#define PAGE_OFFSET UL(CONFIG_PAGE_OFFSET)
static inline phys_addr_t __virt_to_phys(unsigned long x)
{
return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
}
static inline unsigned long __phys_to_virt(phys_addr_t x)
{
return x - PHYS_OFFSET + PAGE_OFFSET;
}
所有的内存操作都是基于物理内存的,所以首先要获取物理内存的起始地址与大小。
通过DTS获取物理内存属性,然后解析并添加到 memblock 子系统中。
arch/arm/boot/dts/imx6qdl-sabresd.dtsi
memory: memory {
reg = <0x10000000 0x40000000>;---------------------------------PHYS_OFFSET size
};
根据第一节的总体流程,我们主要来看一下 early_init_dt_scan_memory() 这个函数:
int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
int depth, void *data)
{
const char *type = of_get_flat_dt_prop(node, "device_type", NULL); -----------------------------------------device_type = "memory"
const __be32 *reg, *endp;
int l;
bool hotpluggable;
/* We are scanning "memory" nodes only */
if (type == NULL) {
/*
* The longtrail doesn't have a device_type on the
* /memory node, so look for the node called /memory@0.
*/
if (!IS_ENABLED(CONFIG_PPC32) || depth != 1 || strcmp(uname, "memory@0") != 0)
return 0;
} else if (strcmp(type, "memory") != 0)
return 0;
reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l); -------------------------------------reg = <0x10000000 0x40000000>
if (reg == NULL)
reg = of_get_flat_dt_prop(node, "reg", &l);
if (reg == NULL)
return 0;
endp = reg + (l / sizeof(__be32));
hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);
pr_debug("memory scan node %s, reg size %d,\n", uname, l);
while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
u64 base, size;
base = dt_mem_next_cell(dt_root_addr_cells, ®); -----------------------------------------0x10000000
size = dt_mem_next_cell(dt_root_size_cells, ®); -------------------------------------------0x40000000
if (size == 0)
continue;
pr_debug(" - %llx , %llx\n", (unsigned long long)base,
(unsigned long long)size);
early_init_dt_add_memory_arch(base, size); --------------------------------------------------将解析出的 mem 加入 memblock子系统
if (!hotpluggable)
continue;
if (early_init_dt_mark_hotplug_memory_arch(base, size))
pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n",
base, base + size);
}
return 0;
}
根据解析出的 base/size,调用early_init_dt_add_memory_arch–>memblock_add–>memblock_add_range将解析出的物理内存加入到memblock子系统中。
所有的内存都有全局变量 memblock 中:
struct memblock_region {
phys_addr_t base;
phys_addr_t size;
unsigned long flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int nid;
#endif
};
struct memblock_type {
unsigned long cnt; /* number of regions */
unsigned long max; /* size of the allocated array */
phys_addr_t total_size; /* size of all regions */
struct memblock_region *regions;
char *name;
};
struct memblock {
bool bottom_up; /* is bottom up direction? */
phys_addr_t current_limit;
struct memblock_type memory;
struct memblock_type reserved;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
struct memblock_type physmem;
#endif
};
struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
.memory.cnt = 1, /* empty dummy entry */
.memory.max = INIT_MEMBLOCK_REGIONS,
.memory.name = "memory",
.reserved.regions = memblock_reserved_init_regions,
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
.reserved.name = "reserved",
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
.physmem.regions = memblock_physmem_init_regions,
.physmem.cnt = 1, /* empty dummy entry */
.physmem.max = INIT_PHYSMEM_REGIONS,
.physmem.name = "physmem",
#endif
.bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
memblock_add用于添加region到memblock.memory中;在内核初始化阶段很多地方(比如哈arm_memblock_init)使用memblock_reserve将region添加到memblock.reserved。
memblock_remove用于将一个region从memblock.memory中移除,memblock_free等用于将一个region从memblock.reserved中移除。
int __init_memblock memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
int nid, unsigned long flags)
{
bool insert = false;
phys_addr_t obase = base;
phys_addr_t end = base + memblock_cap_size(base, &size);
int idx, nr_new;
struct memblock_region *rgn;
if (!size)
return 0;
/* special case for empty array */
if (type->regions[0].size == 0) {
WARN_ON(type->cnt != 1 || type->total_size);
type->regions[0].base = base;
type->regions[0].size = size;
type->regions[0].flags = flags;
memblock_set_region_node(&type->regions[0], nid);
type->total_size = size;
return 0;
}
repeat:
/*
* The following is executed twice. Once with %false @insert and
* then with %true. The first counts the number of regions needed
* to accommodate the new area. The second actually inserts them.
*/
...
...
memblock
在内核启动阶段,也有内存管理的需求,但是此时伙伴系统并没有完成初始化。在早期内核中使用bootmem机制,作为内核初始化阶段的内存分配器。
后来使用memblock作为内核初始化阶段内存分配器,用于内存分配和释放。
CONFIG_NO_BOOTMEM用于决定是否使用bootmem,IMX6DL 使能,所以使用memblock作为初始化阶段的内存分配器。
因为bootmem和memblock两者API兼容,所以使用者无感。使用memblock的时候编译mm/nobootmem.c,调用memblock.c中的分配器接口。
由于没有打开CONFIG_ARM_LPAE,Linux页表采用两层映射。所以PGD->PUD->PMD->PTE中间的PUD/PMD被省略的,pmd_off_k的返回值实际就是pgd_offset_k。
linux-4.14/arch/arm/mm/mm.h
static inline pmd_t *pmd_off_k(unsigned long virt)
{
return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt);
}
linux-4.14/mm/init-mm.c
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
INIT_MM_CONTEXT(init_mm)
};
/* to find an entry in a page-table-directory */
#define pgd_index(addr) ((addr) >> PGDIR_SHIFT)
#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
prepare_page_table用于清空页表项,其实清空了三段地址页表项。
static inline void prepare_page_table(void)
{
unsigned long addr;
phys_addr_t end;
/*
* Clear out all the mappings below the kernel image.
*/
for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE) ----------------------清除 0 ~ MODULES_VADDR(PAGE_OFFSET(0X80000000)) 地址段一级页表
pmd_clear(pmd_off_k(addr));
#ifdef CONFIG_XIP_KERNEL
/* The XIP kernel is mapped in the module area -- skip over it */
addr = ((unsigned long)_exiprom + PMD_SIZE - 1) & PMD_MASK;
#endif
for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE) -------------------------------------清除 MODULES_VADDR ~ PAGE_OFFSET 地址段一级页表
pmd_clear(pmd_off_k(addr));
/*
* Find the end of the first block of lowmem.
*/
end = memblock.memory.regions[0].base + memblock.memory.regions[0].size;
if (end >= arm_lowmem_limit) -------------------------------------------------------------end = 0x50000000,arm_lowmem_limit = 0x50000000(具体查看下面代码)
end = arm_lowmem_limit;
/*
* Clear out all the kernel space mappings, except for the first
* memory bank, up to the vmalloc region.
*/
for (addr = __phys_to_virt(end); ---------------------------------------------------------end = 0x50000000,虚拟地址 0xb0000000。清除 0xc0000000 ~ VMALLOC_START 地址段一级页表。
addr < VMALLOC_START; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
}
#define VMALLOC_END 0xffffffffUL
#define VMALLOC_OFFSET (8*1024*1024)
static void * __initdata vmalloc_min =
(void *)(VMALLOC_END - (240 << 20) - VMALLOC_OFFSET);
vmalloc_limit = (u64)(uintptr_t)vmalloc_min - PAGE_OFFSET + PHYS_OFFSET;
phys_addr_t block_end = reg->base + reg->size;
lowmem_limit = min_t(u64,
vmalloc_limit,
block_end);
arm_lowmem_limit = lowmem_limit;
真正创建页表是在map_lowmem创建了两块区间映射区间一0x10000000 ~ 0x10f00000(0x80000000 ~ 0x80f00000)和区间二0x10f00000 ~ 0x50000000(0x80f00000 ~ 0xc0000000)。
区间一:具有读写执行权限,主要用于存放Kernel代码数据段,还包括swapper_pg_dir内容。
区间二:具有读写,不允许执行,是Normal Memory部分。
可以看出这两个区间虚拟到物理地址映射是线性映射,但是存在在末尾存在特殊两页不是线性映射。
static void __init map_lowmem(void)
{
struct memblock_region *reg;
phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE); --------------kernel_x_start = 0x10000000
phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); --------------------------kernel_x_end = 0x10f00000
/* Map all the lowmem memory banks. */
for_each_memblock(memory, reg) {
phys_addr_t start = reg->base;
phys_addr_t end = start + reg->size;
struct map_desc map;
if (memblock_is_nomap(reg))
continue;
if (end > arm_lowmem_limit)
end = arm_lowmem_limit;
if (start >= end)
break;
if (end < kernel_x_start) {
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = end - start;
map.type = MT_MEMORY_RWX;
create_mapping(&map);
} else if (start >= kernel_x_end) {
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = end - start;
map.type = MT_MEMORY_RW;
create_mapping(&map);
} else {
/* This better cover the entire kernel */
if (start < kernel_x_start) {
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = kernel_x_start - start;
map.type = MT_MEMORY_RW;
create_mapping(&map);
}
map.pfn = __phys_to_pfn(kernel_x_start);
map.virtual = __phys_to_virt(kernel_x_start);
map.length = kernel_x_end - kernel_x_start;
map.type = MT_MEMORY_RWX;
create_mapping(&map);------------------------------------创建虚拟地址 0x80000000 ~ 0x80f00000 到物理地址 0x10000000 ~ 0x10f00000 的映射关系,属性为 MT_MEMORY_RWX
if (kernel_x_end < end) {
map.pfn = __phys_to_pfn(kernel_x_end);
map.virtual = __phys_to_virt(kernel_x_end);
map.length = end - kernel_x_end;
map.type = MT_MEMORY_RW;
create_mapping(&map);---------------------------------创建虚拟地址 0x80f00000 ~ 0xc0000000 到物理地址 0x10f00000 ~ 0x50000000 的映射关系,属性为 MT_MEMORY_RW
}
}
}
}
还有一部分内存映射在 devicemaps_init 中进行的,对 vectors 进行映射:
MT_HIGH_VECTORS: 虚拟地址 0xffff0000 ~ 0xffff1000 (4KB),对应物理地址 0x4fffc000 ~ 0x4fffe000。
MT_LOW_VECTORS: 虚拟地址 0xffff1000 ~ 0xffff2000 (4KB),对应物理地址 0x4fffe000 ~ 0x50000000。
static void __init devicemaps_init(const struct machine_desc *mdesc)
{
struct map_desc map;
unsigned long addr;
void *vectors;
/*
* Allocate the vector page early.
*/
vectors = early_alloc(PAGE_SIZE * 2);
early_trap_init(vectors);
/*
* Clear page table except top pmd used by early fixmaps
*/
for (addr = VMALLOC_START; addr < (FIXADDR_TOP & PMD_MASK); addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
/*
* Map the kernel if it is XIP.
* It is always first in the modulearea.
*/
#ifdef CONFIG_XIP_KERNEL
map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK);
map.virtual = MODULES_VADDR;
map.length = ((unsigned long)_exiprom - map.virtual + ~SECTION_MASK) & SECTION_MASK;
map.type = MT_ROM;
create_mapping(&map);
#endif
/*
* Map the cache flushing regions.
*/
#ifdef FLUSH_BASE
map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS);
map.virtual = FLUSH_BASE;
map.length = SZ_1M;
map.type = MT_CACHECLEAN;
create_mapping(&map);
#endif
#ifdef FLUSH_BASE_MINICACHE
map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS + SZ_1M);
map.virtual = FLUSH_BASE_MINICACHE;
map.length = SZ_1M;
map.type = MT_MINICLEAN;
create_mapping(&map);
#endif
/*
* Create a mapping for the machine vectors at the high-vectors
* location (0xffff0000). If we aren't using high-vectors, also
* create a mapping at the low-vectors virtual address.
*/
map.pfn = __phys_to_pfn(virt_to_phys(vectors));
map.virtual = 0xffff0000;
map.length = PAGE_SIZE;
#ifdef CONFIG_KUSER_HELPERS
map.type = MT_HIGH_VECTORS;
#else
map.type = MT_LOW_VECTORS;
#endif
create_mapping(&map); ------------------------------------------ 将虚拟地址 0xffff0000 ~ 0xffff1000 映射到 0x4fffc000 ~ 0x4fffe000,属性为 MT_HIGH_VECTORS
if (!vectors_high()) {
map.virtual = 0;
map.length = PAGE_SIZE * 2;
map.type = MT_LOW_VECTORS;
create_mapping(&map);
}
/* Now create a kernel read-only mapping */
map.pfn += 1;
map.virtual = 0xffff0000 + PAGE_SIZE;
map.length = PAGE_SIZE;
map.type = MT_LOW_VECTORS;
create_mapping(&map); ------------------------------------------ 将虚拟地址 0xffff1000 ~ 0xffff2000 映射到 0x4fffe000 ~ 0x50000000,属性为 MT_LOW_VECTORS
/*
* Ask the machine support to map in the statically mapped devices.
*/
if (mdesc->map_io)
mdesc->map_io();
else
debug_ll_io_init();
fill_pmd_gaps();
/* Reserve fixed i/o space in VMALLOC region */
pci_reserve_io();
/*
* Finally flush the caches and tlb to ensure that we're in a
* consistent state wrt the writebuffer. This also ensures that
* any write-allocated cache lines in the vector page are written
* back. After this point, we can start to touch devices again.
*/
local_flush_tlb_all();
flush_cache_all();
/* Enable asynchronous aborts */
early_abt_enable();
}
内存管理将一个内存 Node 分成若干个 zone 进行管理:
enum zone_type {
#ifdef CONFIG_ZONE_DMA
/*
* ZONE_DMA is used when there are devices that are not able
* to do DMA to all of addressable memory (ZONE_NORMAL). Then we
* carve out the portion of memory that is needed for these devices.
* The range is arch specific.
*
* Some examples
*
* Architecture Limit
* ---------------------------
* parisc, ia64, sparc <4G
* s390 <2G
* arm Various
* alpha Unlimited or 0-16MB.
*
* i386, x86_64 and multiple other arches
* <16M.
*/
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
/*
* x86_64 needs two ZONE_DMAs because it supports devices that are
* only able to do DMA to the lower 16M but also 32 bit devices that
* can only do DMA areas below 4G.
*/
ZONE_DMA32,
#endif
/*
* Normal addressable memory is in ZONE_NORMAL. DMA operations can be
* performed on pages in ZONE_NORMAL if the DMA devices support
* transfers to all addressable memory.
*/
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
/*
* A memory area that is only addressable by the kernel through
* mapping portions into its own address space. This is for example
* used by i386 to allow the kernel to address the memory beyond
* 900MB. The kernel will set up special mappings (page
* table entries on i386) for each page that the kernel needs to
* access.
*/
ZONE_HIGHMEM,
#endif
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
__MAX_NR_ZONES
};
IMX6DL 未定义 CONFIG_ZONE_DMA、CONFIG_ZONE_DMA32、CONFIG_ZONE_DEVICE,zone_type 只有 ZONE_NORMAL、ZONE_HIGHMEM、__MAX_NR_ZONES。
zone 的初始化在 bootmem_init 中进行。通过 find_limits 找出物理内存开始帧号 min_low_pfn、结束帧号 max_low_pfn。 ZONE_NORMAL 区域的结束帧号 max_low_pfn。
void __init bootmem_init(void)
{
unsigned long min, max_low, max_high;
memblock_allow_resize();
max_low = max_high = 0;
find_limits(&min, &max_low, &max_high);
early_memtest((phys_addr_t)min << PAGE_SHIFT,
(phys_addr_t)max_low << PAGE_SHIFT);
/*
* Sparsemem tries to allocate bootmem in memory_present(),
* so must be done after the fixed reservations
*/
arm_memory_present();
/*
* sparse_init() needs the bootmem allocator up and running.
*/
sparse_init();
/*
* Now free the memory - free_area_init_node needs
* the sparse mem_map arrays initialized by sparse_init()
* for memmap_init_zone(), otherwise all PFNs are invalid.
*/
zone_sizes_init(min, max_low, max_high); ------------------ 从 min_low_pfn 到 max_low_pfn 是ZONE_NORMAL, max_low_pfn 到 max_pfn 是 ZONE_HIGHMEM
/*
* This doesn't seem to be used by the Linux memory manager any
* more, but is used by ll_rw_block. If we can get rid of it, we
* also get rid of some of the stuff above as well.
*/
min_low_pfn = min; -----------------------min_low_pfn = 0x10000 max_low_pfn = 0x50000 max_pfn = 0x50000
max_low_pfn = max_low;
max_pfn = max_high;
}