Google整Fuchsia代码整了好些年了,近期是有看到说Fuchsia可能会正式商用了,所以抽了空把Fuchsia代码下了下来,想从kernel起好好捋一捋代码,想从根本上理解其kernel部分的实现。
理解任何的系统,都是得从启动开始,先简单看了下Fuchsia平台上的启动部分。同样的,任何的系统启动都会分成芯片初始化–bootloader–kernel—应用程序,这样一个完整的过程。
而在X86平台上,因为BIOS的存在,所以芯片平台初始化的工作会由BIOS来完成,而且往往BIOS是闭源的,所以我们只能从bootloader开始。而fushia系统的bootloader逻辑也比较简单,由UEFI BIOS load起来,再通过UEFI接口找到启动分区,load kernel到内存,然后跳入kernel启动。
今天重点分析 kernel启动代码Start.S的代码逻辑:
// Copyright 2016 The Fuchsia Authors
// Copyright (c) 2009 Corey Tabaka
// Copyright (c) 2015 Intel Corporation
// Copyright (c) 2016 Travis Geiselbrecht
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include
#include
#include
#include
#include
#include
#define ADDR_OFFSET_MASK ((1 << ADDR_OFFSET)-1)
#define SHIFT_OFFSET(_s) ((_s) >> 3)
#define SHIFT_REMAIN(_s) ((_s) - (SHIFT_OFFSET(_s) << 3))
// Set a page table entry for the kernel module relocated 64-bit virtual
// address in 32-bit code. Clobbers the %ecx register.
.macro set_relocated_page_table_entry table, shift, value
// Extract 32-bit chunk of kernel_relocated_base containing the index bits
// for this page level shift.
mov PHYS(kernel_relocated_base + SHIFT_OFFSET(\shift)), %ecx
// Get the exact portion of the 32-bit value that is the index
shrl $SHIFT_REMAIN(\shift), %ecx
andl $ADDR_OFFSET_MASK, %ecx
// Get the address on the page table of index * 8 and set the value
shll $3, %ecx
addl $PHYS(\table), %ecx
movl \value, (%ecx)
.endm
// Clobbers %rax, %rdx.
.macro sample_ticks out
rdtsc
shl $32, %rdx
or %rdx, %rax
mov %rax, \out
.endm
// This section name is known specially to kernel.ld and gen-kaslr-fixups.sh.
// This code has relocations for absolute physical addresses, which do not get
// adjusted by the boot-time fixups (which this code calls at the end).
.section .text.boot, "ax", @progbits
.align 8
FUNCTION_LABEL(_start)
// As early as possible collect the time stamp.
sample_ticks %r15 //取当前时间
/* set up a temporary stack pointer */
mov $PHYS(_kstack_end), %rsp //设置栈指针,编译的时候,预留了栈段
// Save off the bootdata pointer in a register that won't get clobbered.
mov %rsi, %rbx
// The fixup code in image.S runs in 64-bit mode with paging enabled,
// so we can't run it too early. But it overlaps the bss, so we move
// it before zeroing the bss. We can't delay zeroing the bss because
// the page tables we're about to set up are themselves in bss.
// The first word after the kernel image (at __data_end in our view)
// gives the size of the following code. Copy it to _end.
mov PHYS(__data_end), %ecx
mov $PHYS(__data_end+4), %esi
mov $PHYS(_end), %edi
rep movsb // while (ecx-- > 0) *edi++ = *esi++;
// Now it's safe to zero the bss. //bss段内存清0
movl $PHYS(__bss_start), %edi
movl $PHYS(_end), %ecx
sub %edi, %ecx // Compute the length of the bss in bytes.
xor %eax, %eax
rep stosb // while (ecx-- > 0) *edi++ = al;
// _zbi_base is in bss, so now it's safe to set it.
mov %rbx, PHYS(_zbi_base) //rbx是启动时,bootloader给的第二个参数
mov %r15, PHYS(kernel_entry_ticks) //保存启动时间
/* give the boot allocator a chance to compute the physical address of the kernel */
call boot_alloc_init
// 在x86上,内存映射采用了四级映射表,一般分别命名是pml4, pdp,pd,跟PT,但是google这段代码,命名了一个PTE表,我开始一直认为PTE表是PT,但是捋下面这段代码的时候一直捋不通。后来才发现google的这个PTE表,应该是通俗一点的PD表,尼玛害死人了
.Lpaging_setup64:
/* initialize the default page tables */
/* Setting the First PML4E with a PDP table reference*/
//好理解,pdp表其实就是一个物理页,讲pdp这个物理页的地址放到pml4这个物理页上,因为一个物理页是4KB,每个物理页可以放512个64位的指针数组,比如一个pml4物理页就能放512个指向pdp页的地址指针。
movl $PHYS(pdp), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(pml4)
//同样,把PTE物理页的地址送到PDP表中,所以感觉PTE应该叫PD。。
/* Setting the First PDPTE with a Page table reference*/
movl $PHYS(pte), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(pdp)
//一个PTE页可以索引到2M*512 = 1GB的内存,而一个PTE页是由一个PDP页中的一个条目指向的。
//pdp_high是另外一个PDP表,由pml4表中某一个项的指针指过来
/* point the pml4e at the second high PDP (for -2GB mapping) */
movl $PHYS(pdp_high), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
set_relocated_page_table_entry pml4, PML4_SHIFT, %eax
//pdp_high里指向与上一个PDP表中同样的PTE表,没明白这是什么个操作
/* point the second pdp at the same low level page table */
movl $PHYS(pte), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
set_relocated_page_table_entry pdp_high, PDP_SHIFT, %eax
//更新从0开始的2M跳一次的地址到PTE表中,一个PTE物理页有512个(0x200)64位指针。
/* map the first 1GB in this table */
movl $PHYS(pte), %esi
movl $0x200, %ecx
xor %eax, %eax
//左移21位是以2M为间隔
0:
mov %eax, %ebx
shll $21, %ebx
orl $X86_KERNEL_PD_LP_FLAGS, %ebx
movl %ebx, (%esi)
addl $8,%esi
inc %eax
loop 0b
// 这个linear_map_pdp感觉是PD表,因为要索引64G的内存,索引需要64个物理页来作为PD表,一个PD表页可以索引1G内存
/* set up a linear map of the first 64GB at 0xffffff8000000000 */
movl $PHYS(linear_map_pdp), %esi
movl $32768, %ecx
xor %eax, %eax
/* loop across these page tables, incrementing the address by 2MB */
0:
mov %eax, %ebx
shll $21, %ebx
orl $X86_KERNEL_PD_LP_FLAGS, %ebx // lower word of the entry
movl %ebx, (%esi)
mov %eax, %ebx
shrl $11, %ebx // upper word of the entry
movl %ebx, 4(%esi)
addl $8,%esi
inc %eax
loop 0b
// 将linear_map_pdp中的64个PD页表分别指向pdp_high中的不停条目
/* point the high pdp at our linear mapping page tables */
movl $PHYS(pdp_high), %esi
movl $64, %ecx
movl $PHYS(linear_map_pdp), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
0:
movl %eax, (%esi)
add $8, %esi
addl $4096, %eax
loop 0b
/*
* Set PGE to enable global kernel pages
*/
mov %cr4, %rax
or $(X86_CR4_PGE), %rax
mov %rax, %cr4
//将pml4放进cr3中,物理内存与虚拟内存映射生效
/* load the physical pointer to the top level page table */
mov $PHYS(pml4), %rax
mov %rax, %cr3
// Load our new GDT by physical pointer.
// _temp_gdtr has it as a virtual pointer, so copy it and adjust.
movw PHYS(_temp_gdtr), %ax
movl PHYS(_temp_gdtr+2), %ecx
sub $PHYS_ADDR_DELTA, %ecx
movw %ax, -6(%esp)
movl %ecx, -4(%esp)
lgdt -6(%esp)
// 将high_entry函数的虚拟地址压入栈中,因为这个是函数执行的返回地址,函数返回后直接执行该地址,用这种方式让high_entry函数执行起来。
/* long jump to our code selector and the high address relocated */
push $CODE_64_SELECTOR
mov $PHYS(high_entry), %rax
addq PHYS(kernel_relocated_base), %rax
pushq %rax
lretq
// 在虚拟地址空间执行代码。设置GDT/IDT,最重要的是最终通过call lk_main函数跳入到C代码当中。
// This code runs at the final virtual address, so it should be pure PIC.
.text
high_entry:
/* zero our kernel segment data registers */
xor %eax, %eax
mov %eax, %ds
mov %eax, %es
mov %eax, %fs
mov %eax, %gs
mov %eax, %ss
/* load the high kernel stack */
lea _kstack_end(%rip), %rsp
// move_fixups_and_zero_bss copied the fixup code to _end.
// It expects %rdi to contain the actual runtime address of __code_start.
lea __code_start(%rip), %rdi
call _end
// The fixup code won't be used again, so the memory can be reused now.
/* reload the gdtr after relocations as it relies on relocated VAs */
lgdt _temp_gdtr(%rip)
// Set %gs.base to &bp_percpu. It's statically initialized
// with kernel_unsafe_sp set, so after this it's safe to call
// into C code that might use safe-stack and/or stack-protector.
lea bp_percpu(%rip), %rax
mov %rax, %rdx
shr $32, %rdx
mov $X86_MSR_IA32_GS_BASE, %ecx
wrmsr
/* set up the idt */
lea _idt_startup(%rip), %rdi
call idt_setup
call load_startup_idt
/* assign this core CPU# 0 and initialize its per cpu state */
xor %edi, %edi
call x86_init_percpu
// Fill the stack canary with a random value as early as possible.
// This isn't done in x86_init_percpu because the hw_rng_get_entropy
// call would make it eligible for stack-guard checking itself. But
// %gs is not set up yet in the prologue of the function, so it would
// crash if it tried to use the stack-guard.
call choose_stack_guard
// Move it into place.
mov %rcx, %gs:ZX_TLS_STACK_GUARD_OFFSET
// Don't leak that value to other code.
xor %ecx, %ecx
// configure the kernel base address
// TODO: dynamically figure this out once we allow the x86 kernel to be loaded anywhere
movl $PHYS_LOAD_ADDRESS, kernel_base_phys(%rip)
// Collect the time stamp of entering "normal" C++ code in virtual space.
sample_ticks kernel_virtual_entry_ticks(%rip)
/* call the main module */
call lk_main
0: /* just sit around waiting for interrupts */
hlt /* interrupts will unhalt the processor */
pause
jmp 0b /* so jump back to halt to conserve power */
.bss
.align 16
DATA(_kstack)
.skip 4096
DATA(_kstack_end)
// These symbols are used by image.S
.global IMAGE_ELF_ENTRY
IMAGE_ELF_ENTRY = _start
// This symbol is used by gdb python to know the base of the kernel module
.global KERNEL_BASE_ADDRESS
KERNEL_BASE_ADDRESS = KERNEL_BASE - KERNEL_LOAD_OFFSET
上面这段汇编代码的整体逻辑,_start函数被作为IMAGE_ELF_ENTRY了,也就是说fuchsia的kernl zircon被编译成了一个ELF文件,而这个ELF文件的入口地址为设置成了_start函数,
我们先看下在bootloader里是怎么找到这个入口的:
static int header_check(void* image, size_t sz, uint64_t* _entry, size_t* _flen, size_t* _klen) {
zbi_header_t* bd = image;
size_t flen, klen;
uint64_t entry;
if (!(bd->flags & ZBI_FLAG_VERSION)) {
printf("boot: v1 bootdata kernel no longer supported\n");
return -1;
}
zircon_kernel_t* kernel = image;
if ((sz < sizeof(zircon_kernel_t)) || (kernel->hdr_kernel.type != ZBI_TYPE_KERNEL_X64) ||
((kernel->hdr_kernel.flags & ZBI_FLAG_VERSION) == 0)) {
printf("boot: invalid zircon kernel header\n");
return -1;
}
flen = ZBI_ALIGN(kernel->hdr_file.length);
klen = ZBI_ALIGN(kernel->hdr_kernel.length);
entry = kernel->data_kernel.entry; //ELF 的入口地址,由编译器决定
if (flen > (sz - sizeof(zbi_header_t))) {
printf("boot: invalid zircon kernel header (bad flen)\n");
return -1;
}
if (klen > (sz - (sizeof(zbi_header_t) * 2))) {
printf("boot: invalid zircon kernel header (bad klen)\n");
return -1;
}
if (_entry) {
*_entry = entry;
}
if (_flen) {
*_flen = flen;
*_klen = klen;
}
return 0;
}
kernel->data_kernel.entry 这部分会是一个ELF格式的文件头,编译器在编译的时候会放置正确的入口地址放在kernel->data_kernel.entry处。
跳入Start.S的代码如下:
static void start_zircon(uint64_t entry, void* bootdata) {
#if __x86_64__
// ebx = 0, ebp = 0, edi = 0, esi = bootdata
__asm__ __volatile__(
"movl $0, %%ebp \n"
"cli \n"
"jmp *%[entry] \n" ::[entry] "a"(entry), //You jump。。。
[ bootdata ] "S"(bootdata), "b"(0), "D"(0));
#elif defined(__aarch64__)
__asm__(
"mov x0, %[zbi]\n" // Argument register.
"mov x29, xzr\n" // Clear FP.
"mov x30, xzr\n" // Clear LR.
"br %[entry]\n" ::[entry] "r"(entry),
[ zbi ] "r"(bootdata)
: "x0", "x29", "x30");
#else
#error "add code for other arches here"
#endif
__builtin_unreachable();
}
"jmp *%[entry] \n" ::[entry] "a"(entry), //
[ bootdata ] "S"(bootdata), "b"(0), "D"(0));
这个就根据入口地址直接跳进去了。
现在再回到Start.S代码中,比较头疼的时候构建虚拟地址与物理地址映射表的这部分:
.Lpaging_setup64:
/* initialize the default page tables */
/* Setting the First PML4E with a PDP table reference*/
movl $PHYS(pdp), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(pml4)
/* Setting the First PDPTE with a Page table reference*/
movl $PHYS(pte), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(pdp)
/* point the pml4e at the second high PDP (for -2GB mapping) */
movl $PHYS(pdp_high), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
set_relocated_page_table_entry pml4, PML4_SHIFT, %eax
/* point the second pdp at the same low level page table */
movl $PHYS(pte), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
set_relocated_page_table_entry pdp_high, PDP_SHIFT, %eax
/* map the first 1GB in this table */
movl $PHYS(pte), %esi
movl $0x200, %ecx
xor %eax, %eax
0:
mov %eax, %ebx
shll $21, %ebx
orl $X86_KERNEL_PD_LP_FLAGS, %ebx
movl %ebx, (%esi)
addl $8,%esi
inc %eax
loop 0b
/* set up a linear map of the first 64GB at 0xffffff8000000000 */
movl $PHYS(linear_map_pdp), %esi
movl $32768, %ecx
xor %eax, %eax
/* loop across these page tables, incrementing the address by 2MB */
0:
mov %eax, %ebx
shll $21, %ebx
orl $X86_KERNEL_PD_LP_FLAGS, %ebx // lower word of the entry
movl %ebx, (%esi)
mov %eax, %ebx
shrl $11, %ebx // upper word of the entry
movl %ebx, 4(%esi)
addl $8,%esi
inc %eax
loop 0b
/* point the high pdp at our linear mapping page tables */
movl $PHYS(pdp_high), %esi
movl $64, %ecx
movl $PHYS(linear_map_pdp), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
0:
movl %eax, (%esi)
add $8, %esi
addl $4096, %eax
loop 0b
最终虚拟内存与物理内存映射出来的图大概是这样:
movl $PHYS(pdp), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(pml4)
这是图中的1部分,
/* Setting the First PDPTE with a Page table reference*/
movl $PHYS(pte), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
movl %eax, PHYS(pdp)
这个是图中的3部分
/* point the pml4e at the second high PDP (for -2GB mapping) */
movl $PHYS(pdp_high), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
set_relocated_page_table_entry pml4, PML4_SHIFT, %eax
这个是PTE页到1G的地址空间的部分
/* map the first 1GB in this table */
movl $PHYS(pte), %esi
movl $0x200, %ecx
xor %eax, %eax
0:
mov %eax, %ebx
shll $21, %ebx
orl $X86_KERNEL_PD_LP_FLAGS, %ebx
movl %ebx, (%esi)
addl $8,%esi
inc %eax
loop 0b
这是图中的6的部分
/* set up a linear map of the first 64GB at 0xffffff8000000000 */
movl $PHYS(linear_map_pdp), %esi
movl $32768, %ecx
xor %eax, %eax
/* loop across these page tables, incrementing the address by 2MB */
0:
mov %eax, %ebx
shll $21, %ebx
orl $X86_KERNEL_PD_LP_FLAGS, %ebx // lower word of the entry
movl %ebx, (%esi)
mov %eax, %ebx
shrl $11, %ebx // upper word of the entry
movl %ebx, 4(%esi)
addl $8,%esi
inc %eax
loop 0b
这是图中的4与5的部分
/* point the high pdp at our linear mapping page tables */
movl $PHYS(pdp_high), %esi
movl $64, %ecx
movl $PHYS(linear_map_pdp), %eax
orl $X86_KERNEL_PD_FLAGS, %eax
0:
movl %eax, (%esi)
add $8, %esi
addl $4096, %eax
loop 0b
最后再简单归纳总结下Fuchsia zircon kernel在X86平台上的物理内存与虚拟内存的映射关系。
1)zircon kernel在编译的时候已经在连接脚本里给出了一个基于虚拟地址空间的地址:
SECTIONS
{
. = KERNEL_BASE;
PROVIDE_HIDDEN(__code_start = .);
而KERNEL_BASE定义:
./kernel/BUILD.gn:95: "KERNEL_BASE=$kernel_base"
而kernel_base则为:
# Virtual address where the kernel is mapped statically. This is the
# base of addresses that appear in the kernel symbol table. At runtime
# KASLR relocation processing adjusts addresses in memory from this base
# to the actual runtime virtual address.
if (current_cpu == "arm64") {
kernel_base = "0xffffffff00000000"
} else if (current_cpu == "x64") {
kernel_base = "0xffffffff80100000" # Has KERNEL_LOAD_OFFSET baked into it.
}
2)zircon kernel被装载进内存的时候,有一个装载的指定的物理地址:
kernel_zone_base = 0x100000;
kernel_zone_size = 6 * 1024 * 1024;
。。。。。。。。。。。。。。。
if (kernel_zone_size == 0) {
kernel_zone_size = 3 * 1024 * 1024;
efi_status status = gBS->AllocatePages(AllocateAddress, EfiLoaderData,
BYTES_TO_PAGES(kernel_zone_size), &kernel_zone_base);
if (status) {
printf("boot: cannot obtain %zu bytes for kernel @ %p\n", kernel_zone_size,
(void*)kernel_zone_base);
kernel_zone_size = 0;
}
}
3)也就是说zircon被装进内存的时候物理地址也确定了,虚拟地址也是被确定的,上面的Start.S这个文件,就是在X86平台上跟据这两个确定好了的地址建立一个PML4映射表,把定义好的虚拟地址与物理地址的映射关系给做好。