On 32-bit ARM Linux, two forms of page table mapping are used: section mapping and page mapping. Early in boot, all page table mappings are static: for a given block of physical memory, the virtual address it maps to is fixed in advance. When U-Boot first jumps into Linux, section mapping is used; the source is in arch/arm/kernel/head.S:
// the physical base of the page directory sits PG_DIR_SIZE below the kernel text
.macro pgtbl, rd, phys
add \rd, \phys, #TEXT_OFFSET - PG_DIR_SIZE
.endm
__create_page_tables:
pgtbl r4, r8 @ page table address: r4 = physical base where the table is placed
/*
* Clear the swapper page table
*/
mov r0, r4
mov r3, #0
add r6, r0, #PG_DIR_SIZE @ clear the page directory, which sits PG_DIR_SIZE below the kernel
1: str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
teq r0, r6
bne 1b
ldr r7, [r10, #PROCINFO_MM_MMUFLAGS] @ mm_mmuflags
/*
* Create identity mapping to cater for __enable_mmu.
* This identity mapping will be removed by paging_init().
*/
adr r0, __turn_mmu_on_loc
ldmia r0, {r3, r5, r6} @ identity-map the code that turns the MMU on
sub r0, r0, r3 @ virt->phys offset
add r5, r5, r0 @ phys __turn_mmu_on
add r6, r6, r0 @ phys __turn_mmu_on_end
mov r5, r5, lsr #SECTION_SHIFT
mov r6, r6, lsr #SECTION_SHIFT
1: orr r3, r7, r5, lsl #SECTION_SHIFT @ flags + kernel base
str r3, [r4, r5, lsl #PMD_ORDER] @ identity mapping
cmp r5, r6
addlo r5, r5, #1 @ next section
blo 1b
/*
* Map our RAM from the start to the end of the kernel .bss section.
*/
add r0, r4, #PAGE_OFFSET >> (SECTION_SHIFT - PMD_ORDER)
ldr r6, =(_end - 1)
orr r3, r8, r7 @ section-map the kernel image
add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ORDER)
1: str r3, [r0], #1 << PMD_ORDER
add r3, r3, #1 << SECTION_SHIFT
cmp r0, r6
bls 1b
/*
* Then map boot params address in r2 if specified.
* We map 2 sections in case the ATAGs/DTB crosses a section boundary.
*/
mov r0, r2, lsr #SECTION_SHIFT
movs r0, r0, lsl #SECTION_SHIFT @ section-map the DTB
subne r3, r0, r8
addne r3, r3, #PAGE_OFFSET
addne r3, r4, r3, lsr #(SECTION_SHIFT - PMD_ORDER)
orrne r6, r7, r0
strne r6, [r3], #1 << PMD_ORDER
addne r6, r6, #1 << SECTION_SHIFT
strne r6, [r3]
mov pc, lr
ENDPROC(__create_page_tables)
.ltorg
.align
__turn_mmu_on_loc:
.long .
.long __turn_mmu_on
.long __turn_mmu_on_end
The code above is fairly clear. To enable the MMU, the page table has to be built in advance. In early initialization Linux uses section mapping: the 4GB address space is divided into 4096 sections of 1MB each, and three regions are mapped (a short sketch of the section arithmetic follows this list):
1. The code that turns on the MMU needs special handling: its virtual address must equal its physical address (an identity mapping), so that execution continues correctly across the moment the MMU is switched on.
2. The kernel image must be section-mapped. Since the page table and the start of the code sit in the same 1MB region, the page table's address gets mapped as well.
3. The DTB is section-mapped.
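To make the section arithmetic concrete, here is a minimal user-space sketch, assuming SECTION_SHIFT = 20 and a made-up flags value (the real flags come from proc_info's mm_mmuflags); it is not kernel code, just the same computation the loops above perform:
#include <stdint.h>
#include <stdio.h>

#define SECTION_SHIFT 20                    /* 1MB per first-level entry */
#define SECTION_SIZE  (1UL << SECTION_SHIFT)
#define PMD_TYPE_SECT 0x2                   /* bits[1:0] = 10: section descriptor */

/* Build the first-level descriptor for one 1MB section. */
static uint32_t section_desc(uint32_t phys, uint32_t mmuflags)
{
    return (phys & ~(uint32_t)(SECTION_SIZE - 1)) | mmuflags | PMD_TYPE_SECT;
}

int main(void)
{
    uint32_t virt = 0xc0000000, phys = 0x30000000;
    uint32_t mmuflags = 0xc00;              /* illustrative AP bits only, not real proc_info flags */
    /* The table index is simply va >> 20; 4096 entries x 4 bytes = the 16KB table. */
    printf("pgd[%u] = 0x%08x\n", virt >> SECTION_SHIFT, section_desc(phys, mmuflags));
    return 0;
}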
The reserved physical regions after the system is up show how this memory is used:
reserved[0x0] [0x00000030104000-0x00000030107fff], 0x4000 bytes
reserved[0x1] [0x00000030108400-0x000000306487df], 0x5403e0 bytes
reserved[0x2] [0x00000033ffb000-0x00000033ffcfff], 0x2000 bytes
Region 0x0 is the page table: 4096 directory entries, exactly 16KB. Region 0x1 is the kernel image, and region 0x2 is the DTB's physical memory. That the mmu flags field in the CPU's proc_info contains PMD_TYPE_SECT also confirms these are section mappings.
Next, look at how start_kernel initializes the page tables a second time. The main code is in the paging_init function called from setup_arch:
void __init paging_init(struct machine_desc *mdesc)
{
void *zero_page;
memblock_set_current_limit(arm_lowmem_limit);
build_mem_type_table(); // builds the mem_types table for this ARM variant; it gives each memory type its own mapping and attribute bits
prepare_page_table(); // clears some page directory entries
map_lowmem(); // section-maps the RAM again
dma_contiguous_remap(); // DMA-related remappings; not dealt with here
devicemaps_init(mdesc); // includes mapping the vector page and the io space
kmap_init();
tcm_init();
top_pmd = pmd_off_k(0xffff0000);
/* allocate the zero page. */
zero_page = early_alloc(PAGE_SIZE);
bootmem_init();
empty_zero_page = virt_to_page(zero_page);
__flush_dcache_page(NULL, empty_zero_page);
}
First look at prepare_page_table:
static inline void prepare_page_table(void)
{
unsigned long addr;
phys_addr_t end;
/*
* Clear out all the mappings below the kernel image.
*/
for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
#ifdef CONFIG_XIP_KERNEL
/* The XIP kernel is mapped in the module area -- skip over it */
addr = ((unsigned long)_etext + PMD_SIZE - 1) & PMD_MASK;
#endif
for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
/*
* Find the end of the first block of lowmem.
*/
end = memblock.memory.regions[0].base + memblock.memory.regions[0].size;
if (end >= arm_lowmem_limit)
end = arm_lowmem_limit;
/*
* Clear out all the kernel space mappings, except for the first
* memory bank, up to the vmalloc region.
*/
for (addr = __phys_to_virt(end);
addr < VMALLOC_START; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
}
This function clears a number of page directory entries. Apart from the block used earlier for the identity mapping (starting at physical address 0x30000000), these entries should never have been used.
Then map_lowmem is called:
static void __init map_lowmem(void)
{
struct memblock_region *reg;
/* Map all the lowmem memory banks. */
for_each_memblock(memory, reg) {
phys_addr_t start = reg->base; // physical start address
phys_addr_t end = start + reg->size; // physical end address
struct map_desc map;
if (end > arm_lowmem_limit)
end = arm_lowmem_limit;
if (start >= end)
break;
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = end - start;
map.type = MT_MEMORY;
create_mapping(&map);
}
}
This function section-maps all of memory; here the physical range is 0x30000000 to 0x34000000. The kernel range was already section-mapped in the assembly, so refilling the same page directory entries should have no effect (if the values written were different, execution would certainly run off the rails!).
At this stage memblock manages physical memory; every early allocation of physical memory goes through it — a toy model of its bookkeeping is sketched below.
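This is a loose user-space sketch, not the real mm/memblock.c logic (which handles merging, alignment and retries); it only models the two region lists and the top-down allocation that produces reserved blocks like the ones in the log above:
#include <stdint.h>
#include <stdio.h>

struct region { uint32_t base, size; };

static struct region memory[8], reserved[8];
static int nr_memory, nr_reserved;

static void memblock_add(uint32_t base, uint32_t size)
{
    memory[nr_memory++] = (struct region){ base, size };
}

static void memblock_reserve(uint32_t base, uint32_t size)
{
    reserved[nr_reserved++] = (struct region){ base, size };
}

/* Allocate top-down from the end of the first bank, backing off below any
 * reserved region we collide with (single pass; good enough for a sketch). */
static uint32_t memblock_alloc(uint32_t size)
{
    uint32_t cand = memory[0].base + memory[0].size - size;
    for (int i = nr_reserved - 1; i >= 0; i--)
        if (cand < reserved[i].base + reserved[i].size &&
            reserved[i].base < cand + size)
            cand = reserved[i].base - size;
    memblock_reserve(cand, size);
    return cand;
}

int main(void)
{
    memblock_add(0x30000000, 0x04000000);   /* the 64MB bank seen above */
    memblock_reserve(0x30104000, 0x4000);   /* swapper_pg_dir, as in the log */
    printf("alloc -> 0x%08x\n", memblock_alloc(0x1000));  /* prints 0x33fff000 */
    return 0;
}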
The core is the create_mapping function:
static void __init create_mapping(struct map_desc *md)
{
unsigned long addr, length, end;
phys_addr_t phys;
const struct mem_type *type;
pgd_t *pgd;
if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) {
printk(KERN_WARNING "BUG: not creating mapping for 0x%08llx"
" at 0x%08lx in user region\n",
(long long)__pfn_to_phys((u64)md->pfn), md->virtual);
return;
}
if ((md->type == MT_DEVICE || md->type == MT_ROM) &&
md->virtual >= PAGE_OFFSET &&
(md->virtual < VMALLOC_START || md->virtual >= VMALLOC_END)) {
printk(KERN_WARNING "BUG: mapping for 0x%08llx"
" at 0x%08lx out of vmalloc space\n",
(long long)__pfn_to_phys((u64)md->pfn), md->virtual);
}
type = &mem_types[md->type];
#ifndef CONFIG_ARM_LPAE
/*
* Catch 36-bit addresses
*/
if (md->pfn >= 0x100000) {
create_36bit_mapping(md, type);
return;
}
#endif
addr = md->virtual & PAGE_MASK;
phys = __pfn_to_phys(md->pfn);
length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
if (type->prot_l1 == 0 && ((addr | phys | length) & ~SECTION_MASK)) {
printk(KERN_WARNING "BUG: map for 0x%08llx at 0x%08lx can not "
"be mapped using pages, ignoring.\n",
(long long)__pfn_to_phys(md->pfn), addr);
return;
}
pgd = pgd_offset_k(addr); // get the page directory entry for this virtual address
end = addr + length;
do {
unsigned long next = pgd_addr_end(addr, end);
alloc_init_pud(pgd, addr, next, phys, type);
phys += next - addr;
addr = next;
} while (pgd++, addr != end);
}
pgd_offset_k yields the virtual address of the page directory entry for a given virtual address:
#define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
The page directory root is init_mm's pgd field, which points at swapper_pg_dir; its virtual base address is 0xc0104000, physical 0x30104000 — the same page directory base used for the first mapping in head.S.
Linux's page table layout differs from what the MMU hardware expects. The hardware first-level index is 12 bits, i.e. 4096 entries, each mapping 1MB; Linux instead treats the directory as 2048 entries covering 2MB each, hence
#define PGDIR_SHIFT 21
so the directory index is only 11 bits, but each of these entries is 8 bytes. The details of this arrangement are covered in the posts below; too long to expand on here:
https://www.cnblogs.com/arnoldlu/p/8087022.html
https://blog.csdn.net/zhoutaopower/article/details/88940727
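A tiny sketch of the index arithmetic, assuming the values above (PGDIR_SHIFT 21 for Linux, 20 for the hardware); it just prints which hardware entries one pgd_t spans:
#include <stdint.h>
#include <stdio.h>

#define PGDIR_SHIFT   21   /* one Linux pgd_t covers 2MB */
#define SECTION_SHIFT 20   /* one hardware L1 entry covers 1MB */

int main(void)
{
    uint32_t va = 0xc0300000;
    uint32_t pgd_index = va >> PGDIR_SHIFT;   /* 0..2047, 8 bytes per entry */
    uint32_t hw_index  = va >> SECTION_SHIFT; /* 0..4095, 4 bytes per entry */
    /* A pgd_t is pmd_t[2]: its two 4-byte halves are exactly the hardware
     * entries (hw_index & ~1) and (hw_index & ~1) + 1. */
    printf("va 0x%08x: pgd[%u] -> hw entries %u and %u (va itself uses %u)\n",
           va, pgd_index, hw_index & ~1u, (hw_index & ~1u) + 1, hw_index);
    return 0;
}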
alloc_init_pud converts the pgd into a pud. 32-bit Linux uses only two levels of mapping, so here both pud and pmd are identical to the pgd; go straight to the implementation of alloc_init_pmd:
static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
unsigned long end, phys_addr_t phys,
const struct mem_type *type)
{
pmd_t *pmd = pmd_offset(pud, addr);
unsigned long next;
do {
/*
* With LPAE, we must loop over to map
* all the pmds for the given range.
*/
next = pmd_addr_end(addr, end);
/*
* Try a section mapping - addr, next and phys must all be
* aligned to a section boundary.
*/
if (type->prot_sect &&
((addr | next | phys) & ~SECTION_MASK) == 0) {
__map_init_section(pmd, addr, next, phys, type);
} else {
alloc_init_pte(pmd, addr, next,
__phys_to_pfn(phys), type);
}
phys += next - addr;
} while (pmd++, addr = next, addr != end);
}
Mappings of physical memory all go through __map_init_section, while alloc_init_pte implements second-level (page table) mapping; map_lowmem takes the __map_init_section path:
static void __init __map_init_section(pmd_t *pmd, unsigned long addr,
unsigned long end, phys_addr_t phys,
const struct mem_type *type)
{
pmd_t *p = pmd;
#ifndef CONFIG_ARM_LPAE
/*
* In classic MMU format, puds and pmds are folded in to
* the pgds. pmd_offset gives the PGD entry. PGDs refer to a
* group of L1 entries making up one logical pointer to
* an L2 table (2MB), where as PMDs refer to the individual
* L1 entries (1MB). Hence increment to get the correct
* offset for odd 1MB sections.
* (See arch/arm/include/asm/pgtable-2level.h)
*/
if (addr & SECTION_SIZE)
pmd++;
#endif
do {
*pmd = __pmd(phys | type->prot_sect);
phys += SECTION_SIZE;
} while (pmd++, addr += SECTION_SIZE, addr != end);
flush_pmd_entry(p);
}
Note that the pgd has already been converted into a pmd here. Look at the definitions:
typedef pmdval_t pmd_t;
typedef pmdval_t pgd_t[2];
So pgd++ advances the address by 8 bytes, while pmd++ advances it by 4. The pgd and next passed down from above are 0x200000 apart, exactly 2MB: an 8-byte pgd entry has a granularity of 2MB, and once cast to a pmd pointer the granularity becomes 1MB, which matches the hardware directory exactly. The loop fills each directory entry with the section's physical address plus the attribute bits, SECTION_SIZE being 1MB. This also shows that the kernel mapping is a linear mapping — the virtual address is just the physical address plus an offset — and, like the page table setup before start_kernel, it uses section mapping.
flush_pmd_entry presumably flushes the page table entry from the cache; we won't dig into it. That completes the section mapping of all of memory. Next, devicemaps_init:
static void __init devicemaps_init(struct machine_desc *mdesc)
{
struct map_desc map;
unsigned long addr;
void *vectors;
/*
* Allocate the vector page early.
*/
vectors = early_alloc(PAGE_SIZE);
/*
 * early_alloc allocates from memblock and adds the block to its
 * reserved regions, marking it as used; memblock manages physical
 * memory at this stage. The address returned is a virtual one:
 * since all of memory is section-mapped, it is just the physical
 * address plus the linear offset.
 */
early_trap_init(vectors); // initialize the exception vector page
for (addr = VMALLOC_START; addr; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
/*
* Map the kernel if it is XIP.
* It is always first in the modulearea.
*/
#ifdef CONFIG_XIP_KERNEL
map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK);
map.virtual = MODULES_VADDR;
map.length = ((unsigned long)_etext - map.virtual + ~SECTION_MASK) & SECTION_MASK;
map.type = MT_ROM;
create_mapping(&map);
#endif
/*
* Map the cache flushing regions.
*/
#ifdef FLUSH_BASE
map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS);
map.virtual = FLUSH_BASE;
map.length = SZ_1M;
map.type = MT_CACHECLEAN;
create_mapping(&map);
#endif
#ifdef FLUSH_BASE_MINICACHE
map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS + SZ_1M);
map.virtual = FLUSH_BASE_MINICACHE;
map.length = SZ_1M;
map.type = MT_MINICLEAN;
create_mapping(&map);
#endif
/*
* Create a mapping for the machine vectors at the high-vectors
* location (0xffff0000). If we aren't using high-vectors, also
* create a mapping at the low-vectors virtual address.
*/
map.pfn = __phys_to_pfn(virt_to_phys(vectors));
map.virtual = 0xffff0000;
map.length = PAGE_SIZE;
map.type = MT_HIGH_VECTORS;
create_mapping(&map); // map the exception vector table
if (!vectors_high()) {
map.virtual = 0;
map.type = MT_LOW_VECTORS;
create_mapping(&map);
}
/*
* Ask the machine support to map in the statically mapped devices.
*/
if (mdesc->map_io)
mdesc->map_io();
fill_pmd_gaps();
/* Reserve fixed i/o space in VMALLOC region */
pci_reserve_io();
/*
* Finally flush the caches and tlb to ensure that we're in a
* consistent state wrt the writebuffer. This also ensures that
* any write-allocated cache lines in the vector page are written
* back. After this point, we can start to touch devices again.
*/
local_flush_tlb_all();
flush_cache_all();
}
First, the initialization of the exception vector page:
void __init early_trap_init(void *vectors_base)
{
unsigned long vectors = (unsigned long)vectors_base;
extern char __stubs_start[], __stubs_end[];
extern char __vectors_start[], __vectors_end[];
extern char __kuser_helper_start[], __kuser_helper_end[];
int kuser_sz = __kuser_helper_end - __kuser_helper_start;
vectors_page = vectors_base;
/*
* Copy the vectors, stubs and kuser helpers (in entry-armv.S)
* into the vector page, mapped at 0xffff0000, and ensure these
* are visible to the instruction stream.
*/
// the exception vector table
memcpy((void *)vectors, __vectors_start, __vectors_end - __vectors_start);
// the exception handler stubs
memcpy((void *)vectors + 0x200, __stubs_start, __stubs_end - __stubs_start);
memcpy((void *)vectors + 0x1000 - kuser_sz, __kuser_helper_start, kuser_sz);
/*
* Do processor specific fixups for the kuser helpers
*/
kuser_get_tls_init(vectors);
/*
* Copy signal return handlers into the vector page, and
* set sigreturn to be a pointer to these.
*/
// copy the signal-return handlers
memcpy((void *)(vectors + KERN_SIGRETURN_CODE - CONFIG_VECTORS_BASE),
sigreturn_codes, sizeof(sigreturn_codes));
flush_icache_range(vectors, vectors + PAGE_SIZE);
// adjust the domain access permissions for the vector page
modify_domain(DOMAIN_USER, DOMAIN_CLIENT);
}
The function above takes the freshly allocated memory and fills in the exception vector table and the exception handlers.
Linux defines the vector table between __vectors_start and __vectors_end:
.globl __vectors_start
__vectors_start:
ARM( swi SYS_ERROR0 )
THUMB( svc #0 )
THUMB( nop )
W(b) vector_und + stubs_offset
W(ldr) pc, .LCvswi + stubs_offset
W(b) vector_pabt + stubs_offset
W(b) vector_dabt + stubs_offset
W(b) vector_addrexcptn + stubs_offset
W(b) vector_irq + stubs_offset
W(b) vector_fiq + stubs_offset
.globl __vectors_end
__vectors_end:
Then the vector table is mapped:
map.pfn = __phys_to_pfn(virt_to_phys(vectors));
map.virtual = 0xffff0000;
map.length = PAGE_SIZE;
map.type = MT_HIGH_VECTORS;
create_mapping(&map);
Evidently the page tables allow multiple mappings: one physical address can be mapped at several virtual addresses. When vectors was allocated earlier we got a linear address, covered by the section mapping; here the same physical page is given another virtual address through a page mapping. The vector table's virtual address is 0xffff0000. In addition, Linux can set the V bit of coprocessor CP15 register c1 to select whether exception vectors are fetched from address 0 or from 0xffff0000. Now look at how the page mapping is built: create_mapping looks up the pgd for the high address 0xffff0000 and ends up calling alloc_init_pte:
static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
unsigned long end, unsigned long pfn,
const struct mem_type *type)
{
pte_t *pte = early_pte_alloc(pmd, addr, type->prot_l1); // populate the page directory entry and return the address of the pte it refers to
do {
set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0); // fill in the pte
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
}
This happens in two steps: first the page directory entry is populated, then the page table entries are filled:
static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr, unsigned long prot)
{
if (pmd_none(*pmd)) {
pte_t *pte = early_alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
__pmd_populate(pmd, __pa(pte), prot);
}
BUG_ON(pmd_bad(*pmd));
return pte_offset_kernel(pmd, addr);
}
PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE is (512+512)*4, i.e. a 4KB allocation. As noted earlier, a pgd entry is 8 bytes and there are 2048 of them, each corresponding to 512 pte entries; here two blocks of 512 are allocated. The first half-page is for the Linux side, apparently to record software status bits, and the second half-page is for the MMU hardware. Having obtained the linear address of the pte table, it must be converted to a physical address and written into the pgd, which is what __pmd_populate does:
static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
pmdval_t prot)
{
pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot;
pmdp[0] = __pmd(pmdval);
#ifndef CONFIG_ARM_LPAE
pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));
#endif
flush_pmd_entry(pmdp);
}
Notice that the incoming pte physical address is first offset by 512*4: the first of the pair of directory entries is filled with the second half of the 4KB page, matching what was said above — only the hardware half goes into the page directory for the MMU. Offsetting a further 256 entries fills the next directory slot, so the directory entries addressing the whole 2MB are populated. The effect is the same as 1MB directories would give; the difference is that the pte allocation grabs 4KB in one go, all of which gets used rather than wasted.
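The layout constants behind this come from arch/arm/include/asm/pgtable-2level.h; here is a small sketch of the 4KB pte page they describe (classic, non-LPAE case):
#include <stdio.h>

/* From arch/arm/include/asm/pgtable-2level.h (classic, non-LPAE). */
#define PTRS_PER_PTE     512
#define PTE_HWTABLE_PTRS PTRS_PER_PTE
#define PTE_HWTABLE_OFF  (PTE_HWTABLE_PTRS * 4) /* 2048: start of the hw half */
#define PTE_HWTABLE_SIZE (PTRS_PER_PTE * 4)     /* 2048: two 256-entry hw tables */

int main(void)
{
    /* One early_pte_alloc() hands back a 4KB page laid out as:
     *   bytes 0    .. 2047: 512 Linux ptes (software bits), covering 2MB
     *   bytes 2048 .. 3071: 256 hw ptes -> written into pmdp[0] (first 1MB)
     *   bytes 3072 .. 4095: 256 hw ptes -> written into pmdp[1] (second 1MB)
     */
    printf("allocation size = %d\n", PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
    printf("pmdp[0] -> pte+%d, pmdp[1] -> pte+%d\n",
           PTE_HWTABLE_OFF, PTE_HWTABLE_OFF + 256 * 4);
    return 0;
}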
Next, pte_offset_kernel returns the linear address of the pte entry. Note that the value written into pmdp[0] above points at the pte table plus 512 entries (the hardware half), whereas pte_offset_kernel, after masking down to the page, returns the entry in the first (Linux) half — the assembly that sets the pte, shown further below, relies on exactly this:
#define pte_offset_kernel(pmd,addr) (pmd_page_vaddr(*(pmd)) + pte_index(addr))
Then the following call fills the physical address into the page table entry:
set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0);
It is really a macro whose implementation differs from processor to processor; here is the ARM920 version:
ENTRY(cpu_arm920_set_pte_ext)
#ifdef CONFIG_MMU
armv3_set_pte_ext
mov r0, r0
mcr p15, 0, r0, c7, c10, 1 @ clean D entry
mcr p15, 0, r0, c7, c10, 4 @ drain WB
#endif
mov pc, lr
.macro armv3_set_pte_ext wc_disable=1
str r1, [r0], #2048 @ linux version: store the Linux pte in the first half-page, then advance r0 by 2048 to the hardware half
eor r3, r1, #L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY
bic r2, r1, #PTE_SMALL_AP_MASK @ keep C, B bits
bic r2, r2, #PTE_TYPE_MASK
orr r2, r2, #PTE_TYPE_SMALL
tst r3, #L_PTE_USER @ user?
orrne r2, r2, #PTE_SMALL_AP_URO_SRW
tst r3, #L_PTE_RDONLY | L_PTE_DIRTY @ write and dirty?
orreq r2, r2, #PTE_SMALL_AP_UNO_SRW
tst r3, #L_PTE_PRESENT | L_PTE_YOUNG @ present and young?
movne r2, #0
.if \wc_disable
#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
tst r2, #PTE_CACHEABLE
bicne r2, r2, #PTE_BUFFERABLE
#endif
.endif
str r2, [r0] @ hardware version: store the physical address plus the hardware bits into the pte entry the MMU actually reads
.endm
So the pte is filled twice, and the copy at the 2048-byte offset is the one the MMU walks; with L_PTE_PRESENT set, the mapping is considered valid.
The Linux two-level mapping can be pictured as follows. Each time the system maps a virtual address it first takes a pgd entry; a pgd entry is 8 bytes with an 11-bit index, which the MMU cannot use directly, so when it is written it is split into pmd[0] and pmd[1], restoring the 12-bit, 1MB-per-entry layout the hardware expects. Then a pte page is allocated, 4KB at a time: the first 2KB carries the Linux bookkeeping for the second 2KB — an entry and its marker sit exactly 512*4 bytes apart — and the hardware 2KB is split into two 256-entry halves that are written into pmd[0] and pmd[1]. So each allocation fills two directory entries in the page directory, covering a 2MB range. A sketch of the resulting address walk follows.
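To tie it together, here is a hedged user-space sketch of the walk the MMU performs over these tables, using the ARMv4/v5 short-descriptor formats; for illustration the L2 table's virtual address is stored directly in the descriptor, so it only resolves correctly when built as a 32-bit binary:
#include <stdint.h>
#include <stdio.h>

/* Resolve va -> pa through a classic ARM 2-level table: an L1 entry is
 * either a 1MB section (type 10) or points to a 256-entry L2 table (type 01). */
static uint32_t walk(const uint32_t *l1, uint32_t va)
{
    uint32_t d = l1[va >> 20];                    /* 1MB per L1 entry */
    if ((d & 3) == 2)                             /* section descriptor */
        return (d & 0xfff00000) | (va & 0x000fffff);
    if ((d & 3) == 1) {                           /* coarse page table */
        const uint32_t *l2 = (const uint32_t *)(uintptr_t)(d & 0xfffffc00);
        uint32_t p = l2[(va >> 12) & 0xff];       /* 256 small-page entries */
        if ((p & 3) == 2)                         /* 4KB small page */
            return (p & 0xfffff000) | (va & 0xfff);
    }
    return ~0u;                                   /* fault */
}

int main(void)
{
    static uint32_t l1[4096];
    static uint32_t l2[256] __attribute__((aligned(1024)));
    l1[0xc00] = 0x30000000 | 2;                   /* 0xc0000000 -> 0x30000000, 1MB section */
    l1[0xfff] = (uint32_t)(uintptr_t)l2 | 1;      /* coarse table for 0xfff00000 */
    l2[0xf0]  = 0x33ffb000 | 2;                   /* vector page at 0xffff0000 */
    printf("0xc0012345 -> 0x%08x\n", walk(l1, 0xc0012345)); /* 0x30012345 */
    printf("0xffff0123 -> 0x%08x\n", walk(l1, 0xffff0123)); /* 0x33ffb123 */
    return 0;
}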
Back in devicemaps_init, the following is called:
if (mdesc->map_io)
mdesc->map_io();
If the machine in use defines map_io, it is invoked:
static void __init smdk2440_map_io(void)
{
s3c24xx_init_io(smdk2440_iodesc, ARRAY_SIZE(smdk2440_iodesc));
s3c24xx_init_clocks(12000000);
s3c24xx_init_uarts(smdk2440_uartcfgs, ARRAY_SIZE(smdk2440_uartcfgs));
samsung_set_timer_source(SAMSUNG_PWM3, SAMSUNG_PWM4);
}
先看s3c24xx_init_io:
void __init s3c24xx_init_io(struct map_desc *mach_desc, int size)
{
arm_pm_idle = s3c24xx_default_idle;
/* initialise the io descriptors we need for initialisation */
iotable_init(mach_desc, size); // create page table mappings for the io ports
iotable_init(s3c_iodesc, ARRAY_SIZE(s3c_iodesc));
if (cpu_architecture() >= CPU_ARCH_ARMv5) {
samsung_cpu_id = s3c24xx_read_idcode_v5();
} else {
samsung_cpu_id = s3c24xx_read_idcode_v4();
}
s3c24xx_init_cpu();
s3c_init_cpu(samsung_cpu_id, cpu_ids, ARRAY_SIZE(cpu_ids));
}
iotable_init is called first to create page table mappings for the io port registers:
void __init iotable_init(struct map_desc *io_desc, int nr)
{
struct map_desc *md;
struct vm_struct *vm;
struct static_vm *svm;
if (!nr)
return;
svm = early_alloc_aligned(sizeof(*svm) * nr, __alignof__(*svm));
for (md = io_desc; nr; md++, nr--) {
create_mapping(md);
vm = &svm->vm;
vm->addr = (void *)(md->virtual & PAGE_MASK);
vm->size = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
vm->phys_addr = __pfn_to_phys(md->pfn);
vm->flags = VM_IOREMAP | VM_ARM_STATIC_MAPPING;
vm->flags |= VM_ARM_MTYPE(md->type);
vm->caller = iotable_init;
add_static_vm_early(svm++);
}
}
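For reference, a hypothetical static io table of the kind a board file hands to iotable_init — a sketch only; the virtual and physical addresses below are made up, and struct map_desc comes from <asm/mach/map.h>:
static struct map_desc demo_iodesc[] __initdata = {
	{
		.virtual = (unsigned long)0xf5000000,  /* fixed VA drivers will use (assumed) */
		.pfn     = __phys_to_pfn(0x50000000),  /* hypothetical peripheral base */
		.length  = SZ_1M,
		.type    = MT_DEVICE,                  /* device memory, uncached */
	},
};
/* iotable_init(demo_iodesc, ARRAY_SIZE(demo_iodesc)); after this, register
 * reads/writes at 0xf5000000 reach the peripheral at 0x50000000. */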
This again calls create_mapping for each io address passed in, and registers the region via add_static_vm_early on the vmlist and static_vmlist lists. These two global lists appear to track virtual memory, marking the virtual ranges used by io mappings so that later allocations stay away from them (this is my guess — to be revisited). Once the static mappings exist, the processor can do io reads and writes through these virtual addresses. Then s3c_init_cpu(samsung_cpu_id, cpu_ids, ARRAY_SIZE(cpu_ids)) is called; cpu_ids is defined in
arch/arm/mach-s3c24xx/common.c:
static struct cpu_table cpu_ids[] __initdata = {
{
.idcode = 0x32410000,
.idmask = 0xffffffff,
.map_io = s3c2410_map_io,
.init_clocks = s3c2410_init_clocks,
.init_uarts = s3c2410_init_uarts,
.init = s3c2410_init,
.name = name_s3c2410
},
{
.idcode = 0x32410002,
.idmask = 0xffffffff,
.map_io = s3c2410_map_io,
.init_clocks = s3c2410_init_clocks,
.init_uarts = s3c2410_init_uarts,
.init = s3c2410a_init,
.name = name_s3c2410a
},
...... (more entries follow)
Many CPU types are defined here.
void __init s3c_init_cpu(unsigned long idcode,
struct cpu_table *cputab, unsigned int cputab_size)
{
cpu = s3c_lookup_cpu(idcode, cputab, cputab_size);
if (cpu == NULL) {
printk(KERN_ERR "Unknown CPU type 0x%08lx\n", idcode);
panic("Unknown S3C24XX CPU");
}
printk("CPU %s (id 0x%08lx)\n", cpu->name, idcode);
if (cpu->map_io == NULL || cpu->init == NULL) {
printk(KERN_ERR "CPU %s support not enabled\n", cpu->name);
panic("Unsupported Samsung CPU");
}
cpu->map_io();
}
The matching CPU is looked up in the table by id, and its map_io function is called:
void __init s3c244x_map_io(void)
{
/* register our io-tables */
iotable_init(s3c244x_iodesc, ARRAY_SIZE(s3c244x_iodesc));
/* rename any peripherals used differing from the s3c2410 */
s3c_device_sdi.name = "s3c2440-sdi";
s3c_device_i2c0.name = "s3c2440-i2c";
s3c_nand_setname("s3c2440-nand");
s3c_device_ts.name = "s3c2440-ts";
s3c_device_usbgadget.name = "s3c2440-usbgadget";
}
As you can see, this again just sets up io mapping page tables. Then the clocks are initialized; the call chain is:
smdk2440_map_io
------------>s3c24xx_init_clocks
---------------->s3c244x_init_clocks
void __init s3c244x_init_clocks(int xtal)
{
/* initialise the clocks here, to allow other things like the
* console to use them, and to add new ones after the initialisation
*/
s3c24xx_register_baseclocks(xtal); // sets up the base clock structures; the key one is clk_xtal, which receives the crystal frequency (the others appear to be left unfilled); each is registered on the kernel clock list via clkdev_add
s3c244x_setup_clocks(); // computes each clock rate from the crystal value and the divider registers (register access uses the virtual addresses set up by the static io mappings above) and fills in the clocks
s3c2410_baseclk_add(); // initializes the per-module clock structures and adds them to the kernel clock list via clkdev_add
}
smdk2440_map_io
--------> s3c24xx_init_uarts
--------->s3c24xx_init_uartdevs
The main job here is to take the configured uart parameters and fill in each uart in the structure below:
struct platform_device *s3c24xx_uart_src[4] = {
&s3c24xx_uart_device0,
&s3c24xx_uart_device1,
&s3c24xx_uart_device2,
&s3c24xx_uart_device3,
};
With a DTS-based implementation, these platform_device parameter structures could presumably be passed through the device tree; here the old approach is used. That concludes the machine-specific map io.
Finally there is the bootmem_init function:
void __init bootmem_init(void)
{
unsigned long min, max_low, max_high;
max_low = max_high = 0;
find_limits(&min, &max_low, &max_high);
arm_bootmem_init(min, max_low);
/*
* Sparsemem tries to allocate bootmem in memory_present(),
* so must be done after the fixed reservations
*/
arm_memory_present();
/*
* sparse_init() needs the bootmem allocator up and running.
*/
sparse_init();
/*
* Now free the memory - free_area_init_node needs
* the sparse mem_map arrays initialized by sparse_init()
* for memmap_init_zone(), otherwise all PFNs are invalid.
*/
arm_bootmem_free(min, max_low, max_high);
/*
* This doesn't seem to be used by the Linux memory manager any
* more, but is used by ll_rw_block. If we can get rid of it, we
* also get rid of some of the stuff above as well.
*
* Note: max_low_pfn and max_pfn reflect the number of _pages_ in
* the system, not the maximum PFN.
*/
max_low_pfn = max_low - PHYS_PFN_OFFSET;
max_pfn = max_high - PHYS_PFN_OFFSET;
}
From the material I have read, before the system is fully up Linux manages physical memory with either the memblock mechanism or the bootmem mechanism (older kernels used bootmem). We have already seen memory being allocated and managed through memblock, yet bootmem_init also initializes a bootmem-style bitmap, as well as the struct pages used by the buddy system. Why memory is managed with both memblock and bootmem I don't fully understand. arm_bootmem_init initializes the bootmem bitmap:
static void __init arm_bootmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	struct memblock_region *reg;
	unsigned int boot_pages;
	phys_addr_t bitmap;
	pg_data_t *pgdat;
	/*
	 * Allocate the bootmem bitmap page. This must be in a region
	 * of memory which has already been mapped.
	 */
	boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
				     __pfn_to_phys(end_pfn));
	pr_err("boot_pages=%x bitmap=%x \n", boot_pages, bitmap);
	/*
	 * Initialise the bootmem allocator, handing the
	 * memory banks over to bootmem.
	 */
	node_set_online(0);
	pgdat = NODE_DATA(0);
	init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);
	/* Free the lowmem regions from memblock into bootmem. */
	for_each_memblock(memory, reg) {
		unsigned long start = memblock_region_memory_base_pfn(reg);
		unsigned long end = memblock_region_memory_end_pfn(reg);
		if (end >= end_pfn)
			end = end_pfn;
		if (start >= end)
			break;
		pr_err("start=%lx end=%lx \n", start, end);
		free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
	}
	/* Reserve the lowmem memblock reserved regions in bootmem. */
	for_each_memblock(reserved, reg) {
		unsigned long start = memblock_region_reserved_base_pfn(reg);
		unsigned long end = memblock_region_reserved_end_pfn(reg);
		if (end >= end_pfn)
			end = end_pfn;
		if (start >= end)
			break;
		reserve_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT,
				BOOTMEM_DEFAULT);
	}
}
First the bitmap space is allocated — one bit in the bitmap represents one page — and it ends up assigned to pgdat->bdata->node_bootmem_map. Then the memory and reserved regions of memblock are walked, and pages already in use are marked in node_bootmem_map. Once node_bootmem_map is initialized, later allocations are made with alloc_bootmem_node_nopanic: allocation moves from memblock over to the node_bootmem_map bitmap. Next comes arm_bootmem_free, which performs the initial setup of the buddy system's pages and zones: a struct page is allocated for every page frame, the array is stored in pgdat->node_mem_map and mem_map, and the pages are then initialized.
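The bitmap sizing is one bit per page frame, rounded up to whole pages; a sketch of the arithmetic bootmem_bootmap_pages() performs, assuming 4KB pages:
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* One bit per pfn, rounded up to bytes, then to whole pages. */
static unsigned long bootmap_pages(unsigned long pages)
{
    unsigned long bytes = (pages + 7) / 8;
    return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
    /* 64MB of RAM = 16384 page frames -> 2048 bytes of bitmap -> 1 page. */
    printf("%lu\n", bootmap_pages((unsigned long)0x04000000 >> PAGE_SHIFT));
    return 0;
}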
arm_bootmem_free
-------->free_area_init_node
This computes the number of struct pages needed, calls alloc_node_mem_map to allocate space for all of them, and then initializes the zones and pages:
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
enum zone_type j;
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
spin_lock_init(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = freesize = size - zone_absent_pages_in_node(nid, j,
zholes_size);
/*
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
memmap_pages = calc_memmap_size(size, realsize);
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
/* Account for reserved pages */
if (j == 0 && freesize > dma_reserve) {
freesize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
nr_kernel_pages += freesize;
/* Charge for highmem memmap if there are enough kernel pages */
else if (nr_kernel_pages > memmap_pages * 2)
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;
zone->spanned_pages = size;
zone->present_pages = realsize;
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
/ 100;
zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
lruvec_init(&zone->lruvec);
if (!size)
continue;
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
}
}
memmap_init
--------->memmap_init_zone
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
struct zone *z;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
z = &NODE_DATA(nid)->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
* handed to this function. They do not
* exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
}
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
page_nid_reset_last(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
* kernel allocations are made. Later some blocks near
* the start are marked MIGRATE_RESERVE by
* setup_zone_migrate_reserve()
*
* bitmap is created for zone's valid pfn range. but memmap
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*/
if ((z->zone_start_pfn <= pfn)
&& (pfn < zone_end_pfn(z))
&& !(pfn & (pageblock_nr_pages - 1)))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
}
For every page frame, the corresponding struct page is taken from mem_map and initialized. This gives the buddy system its first-pass initialization; how the buddy system operates will be analyzed later. The memory layout after initialization is roughly as shown in the figure (not reproduced here).
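For the FLATMEM case, the pfn_to_page used above is plain array arithmetic over mem_map (see include/asm-generic/memory_model.h); SPARSEMEM, also initialized above via sparse_init, resolves through section tables instead:
#define __pfn_to_page(pfn)  (mem_map + ((pfn) - ARCH_PFN_OFFSET))
#define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + ARCH_PFN_OFFSET)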
A few good posts on these topics:
https://www.xuebuyuan.com/1636055.html
https://blog.csdn.net/gatieme/article/details/52403924