#define PAGE_SHIFT 12
THREAD_ORDER定义在linux/arch/x86/include/asm/下的page_32_types.h和page_64_types.h中,以下为32位系统下的定义。
#ifdef CONFIG_4KSTACKS
#define THREAD_ORDER 0
#else
#define THREAD_ORDER 1
#endif
在64位系统中,THREAD_ORDER定义为1。
/*
* PMD_SHIFT determines the size of the area a middle-level
* page table can map
*/
#define PMD_SHIFT 21
在64位系统中,PMD_SHIFT定义为21。
如此,在32位系统上,设置了4k栈的系统上,MIN_KERNEL_ALIGN_LG2的值为12,否则为13。
pad3: .word 0
cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
#added with boot protocol
#version 2.06
hardware_subarch: .long 0 # subarchitecture, added with 2.07
# default to 0 for normal x86 PC
hardware_subarch_data: .quad 0
payload_offset: .long ZO_input_data
payload_length: .long ZO_z_input_len
setup_data: .quad 0 # 64-bit physical pointer to
# single linked list of
# struct setup_data
pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
#define VO_INIT_SIZE (VO__end - VO__text)
#if ZO_INIT_SIZE > VO_INIT_SIZE
#define INIT_SIZE ZO_INIT_SIZE
#else
#define INIT_SIZE VO_INIT_SIZE
#endif
init_size: .long INIT_SIZE # kernel initialization size
下面来看看start_of_setup。
.section ".entrytext", "ax"
start_of_setup:
#ifdef SAFE_RESET_DISK_CONTROLLER
# Reset the disk controller.
movw $0x0000, %ax # Reset disk controller
movb $0x80, %dl # All disks
int $0x13
#endif
如果配置了需要安全重置磁盘控制器,那么首先做的事就是重置所有磁盘的控制器。
start_of_setup在最开始部分会将附加段寄存器%es设置为与数据段寄存器%ds相同的值。
# Force %es = %ds
movw %ds, %ax
movw %ax, %es
cld
# Apparently some ancient versions of LILO invoked the kernel with %ss != %ds,
# which happened to work by accident for the old code. Recalculate the stack
# pointer if %ss is invalid. Otherwise leave it alone, LOADLIN sets up the
# stack behind its own code, so we can't blindly put it directly past the heap.
movw %ss, %dx
cmpw %ax, %dx # %ds == %ss?
movw %sp, %dx
je 2f # -> assume %sp is reasonably set
# Invalid %ss, make up a new stack
movw $_end, %dx
testb $CAN_USE_HEAP, loadflags
jz 1f
movw heap_end_ptr, %dx
1: addw $STACK_SIZE, %dx
jnc 2f
xorw %dx, %dx # Prevent wraparound
2: # Now %dx should point to the end of our stack space
andw $~3, %dx # dword align (might as well...)
jnz 3f
movw $0xfffc, %dx # Make sure we're not zero
3: movw %ax, %ss
movzwl %dx, %esp # Clear upper half of %esp
sti # Now we should have a working stack
以上部分代码是用来初始化堆栈的,有了堆栈之后就能运行C代码了。
# We will have entered with %cs = %ds+0x20, normalize %cs so
# it is on par with the other segments.
pushw %ds
pushw $6f
lretw
6:
# Check signature at end of setup
cmpl $0x5a5aaa55, setup_sig
jne setup_bad
以上代码通过push、ret设置了代码段寄存器,接下来的cmp来检查setup末尾的签名,如果不为0x5a5aaa55那么说明setup是坏的。
接下来会清空bss段,bss段是未初始化的数据段。
# Zero the bss
movw $__bss_start, %di
movw $_end+3, %cx
xorl %eax, %eax
subw %di, %cx
shrw $2, %cx
rep; stosl
每次清空四个字节,所以cx右移了两位,而cx加3的目的是为了向上取整。
# Jump to C code (should not return)
calll main
在这里跳转到了C代码中的main函数,main是不返回的。
# Setup corrupt somehow...
setup_bad:
movl $setup_corrupt, %eax
calll puts
# Fall through...
# die: stop the machine by halting in an endless loop.
# Reached by falling through from setup_bad above, and exported with
# .globl so that it can also be called by name from other code.
.globl die
.type die, @function
die:
hlt
jmp die # loop back to hlt if execution ever resumes
.size die, .-die
.section ".initdata", "a"
setup_corrupt:
.byte 7
.string "No setup signature found...\n"
setup的最后一部分代码是出错时处理相关的。
三、
linux-2.6.34.13/arch/
x86/boot/
main.c
在main.c中完成了要在实模式中所做的工作,最后会进入保护模式。
/*
 * C entry point of the real-mode setup code; it is called from the
 * setup assembly after the stack and bss have been prepared.
 *
 * The steps below run in a fixed order: populate boot_params, set up
 * the heap, verify the CPU, query the BIOS for various information,
 * set the video mode, and finally hand over to go_to_protected_mode().
 * The function is not expected to return.
 */
void main(void)
{
/* First, copy the boot header into the "zeropage" */
copy_boot_params();
/* End of heap check */
init_heap();
/* Make sure we have all the proper CPU support */
if (validate_cpu()) {
puts("Unable to boot - please use a kernel appropriate "
"for your CPU.\n");
die();
}
/* Tell the BIOS what CPU mode we intend to run in. */
set_bios_mode();
/* Detect memory layout */
detect_memory();
/* Set keyboard repeat rate (why?) */
keyboard_set_repeat();
/* Query MCA information */
query_mca();
/* Query Intel SpeedStep (IST) information */
query_ist();
/* Query APM information */
#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
query_apm_bios();
#endif
/* Query EDD information */
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
query_edd();
#endif
/* Set the video mode */
set_video();
/* Parse command line for 'quiet' and pass it to decompressor. */
if (cmdline_find_option_bool("quiet"))
boot_params.hdr.loadflags |= QUIET_FLAG;
/* Do the last things and invoke protected mode */
go_to_protected_mode();
}
首先,在main中要做的事情是初始化boot_params,这是在copy_boot_params()中完成的,代码如下。
static void copy_boot_params(void)
{
struct old_cmdline {
u16 cl_magic;
u16 cl_offset;
};
const struct old_cmdline * const oldcmd =
(const struct old_cmdline *)OLD_CL_ADDRESS;
BUILD_BUG_ON(sizeof boot_params != 4096);
memcpy(&boot_params.hdr, &hdr, sizeof hdr);
这里将hdr拷贝到boot_params.hdr中,hdr是在header.S中定义的数据。注意,boot_params这个变量是全局变量,且未被初始化,所以位于bss段,它就位于__bss_start的开始位置。而在之后当启动保护模式的分页功能后,第一个页面就是从它开始的(注意,不是从0x0开始的喔)。所以内核注释称它为“zeropage”,即所谓的0号页面,足见boot_params的重要性。
if (!boot_params.hdr.cmd_line_ptr &&
oldcmd->cl_magic == OLD_CL_MAGIC) {
/* Old-style command line protocol. */
u16 cmdline_seg;
如果bootloader没有设置hdr中的cmd_line_ptr(值为0),并且旧式命令行结构中的魔数cl_magic等于OLD_CL_MAGIC,说明bootloader使用的是老式命令行协议,那么就将hdr的命令行指针指向老的命令行。
/* Figure out if the command line falls in the region
of memory that an old kernel would have copied up
to 0x90000... */
if (oldcmd->cl_offset < boot_params.hdr.setup_move_size)
cmdline_seg = ds();
else
cmdline_seg = 0x9000;
boot_params.hdr.cmd_line_ptr =
(cmdline_seg << 4) + oldcmd->cl_offset;
}
}
boot_params是未初始化的全局变量,编译器会将它放在bss段,而在进入main之前已经将bss段清零,所以在执行copy_boot_params()之前它是空的。上述代码初始化了boot_params。
接下来所做的工作是初始化堆,调用了init_heap(),代码如下。
/*
 * Determine where the setup heap ends.
 *
 * If the boot loader set CAN_USE_HEAP in hdr.loadflags, heap_end is
 * computed from hdr.heap_end_ptr and then clamped so that it does not
 * extend past %esp - STACK_SIZE.  Otherwise no heap is configured and
 * only a warning is printed.
 */
static void init_heap(void)
{
char *stack_end;
/* The heap may only be used when the boot loader has set the CAN_USE_HEAP bit in hdr.loadflags. */
if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
/* stack_end = %esp - STACK_SIZE, i.e. STACK_SIZE bytes below the current stack pointer. */
asm("leal %P1(%%esp),%0"
: "=r" (stack_end) : "i" (-STACK_SIZE));
/* heap_end = hdr.heap_end_ptr + 0x200.  NOTE(review): the 0x200 presumably undoes the -0x200 bias the boot protocol applies to heap_end_ptr rather than being a heap size -- confirm against the boot protocol documentation. */
heap_end = (char *)
((size_t)boot_params.hdr.heap_end_ptr + 0x200);
/* If the heap would overlap the stack area, shrink the heap. */
if (heap_end > stack_end)
heap_end = stack_end;
} else {
/* Boot protocol 2.00 only, no heap available */
puts("WARNING: Ancient bootloader, some functionality "
"may be limited!\n");
}
}
初始化了堆之后接着要检查CPU是否支持,如果内核要求的CPU等级高于当前CPU那么就终止。其中,对CPU进行检查的代码在cpucheck.c中。
/*
 * Check whether the CPU we are running on is good enough for this kernel.
 *
 * check_cpu() reports the detected CPU level, the level the kernel
 * requires, and a pointer to the words of missing feature bits (NULL
 * when nothing is missing).
 *
 * Returns 0 if the CPU is acceptable.  Otherwise prints a diagnostic
 * (either the level mismatch or the list of missing features) and
 * returns -1.
 */
int validate_cpu(void)
{
	u32 *err_flags;
	int cpu_level, req_level;
	const unsigned char *msg_strs;
	int word, bit;

	check_cpu(&cpu_level, &req_level, &err_flags);

	if (cpu_level < req_level) {
		printf("This kernel requires an %s CPU, ",
		       cpu_name(req_level));
		printf("but only detected an %s CPU.\n",
		       cpu_name(cpu_level));
		return -1;
	}

	/* Nothing missing: the CPU is fine. */
	if (!err_flags)
		return 0;

	puts("This kernel requires the following features "
	     "not present on the CPU:\n");

	/*
	 * x86_cap_strs is walked as a packed list of records of the form
	 * { word index, bit index, NUL-terminated name }, visited in step
	 * with the (word, bit) position being examined.
	 */
	msg_strs = (const unsigned char *)x86_cap_strs;

	for (word = 0; word < NCAPINTS; word++) {
		u32 missing = err_flags[word];

		for (bit = 0; bit < 32; bit++, missing >>= 1) {
			/* Current record lies before (word, bit): step past it. */
			if (msg_strs[0] < word ||
			    (msg_strs[0] == word && msg_strs[1] < bit)) {
				msg_strs += 2;
				while (*msg_strs++)
					;
			}

			if (!(missing & 1))
				continue;

			/* Print the feature name if we have one, else "word:bit". */
			if (msg_strs[0] == word &&
			    msg_strs[1] == bit &&
			    msg_strs[2])
				printf("%s ", msg_strs+2);
			else
				printf("%d:%d ", word, bit);
		}
	}

	putchar('\n');
	return -1;
}
紧接着设置bios模式,告诉BIOS我们想要让CPU运行在什么模式下。通过代码可以看出,在32位系统中什么都不做,而在64位系统中要通过int 0x15中断通知BIOS。
static void set_bios_mode(void)
{
#ifdef CONFIG_X86_64
struct biosregs ireg;
initregs(&ireg);
ireg.ax = 0xec00;
ireg.bx = 2;
intcall(0x15, &ireg, NULL);
#endif
}
然后要检查内存,detect_memory()函数代码非常简单,linux内核会依次调用detect_memory_e820()、detect_memory_e801()、detect_memory_88()来获得系统物理内存布局。注意三个函数都会被调用,只要其中任何一个成功,detect_memory()就返回0。
/*
 * Probe the physical memory layout with all three BIOS methods.
 *
 * Every probe is always run, in the order e820, e801, 88, because each
 * one records its own result.  Returns 0 if at least one probe
 * succeeded (e820 found one or more entries, or e801/88 returned 0),
 * and -1 if all of them failed.
 */
int detect_memory(void)
{
	int e820_ok = detect_memory_e820() > 0;
	int e801_ok = !detect_memory_e801();
	int int88_ok = !detect_memory_88();

	return (e820_ok || e801_ok || int88_ok) ? 0 : -1;
}
detect_memory_e820()、detect_memory_e801()、detect_memory_88()这3个函数内部其实都会通过intcall()调用bios中断以取得内存信息,该中断调用形式为int 0x15,调用前分别把AX寄存器设置为0xe820、0xe801,或者把AH寄存器设置为0x88,这里以e820为例说明。
由于历史原因,一些i/o设备也会占据一部分内存物理地址空间,因此系统可以使用的物理内存空间是不连续的,系统内存被分成了很多段,每个段的属性也是不一样的。int 0x15 查询物理内存时每次返回一个内存段的信息,因此要想返回系统中所有的物理内存,我们必须以迭代的方式去查询。detect_memory_e820()函数把int 0x15放到一个do-while循环里,每次得到的一个内存段放到struct e820entry里,而struct e820entry的结构正是e820返回结果的结构!而像其它启动时获得的结果一样,最终都会被放到boot_params里,e820被放到了 boot_params.e820_map。
/*
 * Build the e820 memory map by calling INT 0x15, AX=0xe820 repeatedly.
 *
 * Each successful call yields one entry in the static scratch buffer
 * "buf", which is then copied into boot_params.e820_map.  The loop ends
 * when the BIOS sets CF, when the returned EBX continuation value is 0,
 * or when the map array is full.  If the BIOS stops returning the SMAP
 * signature, the whole map is discarded.
 *
 * Returns the number of entries stored, which is also written to
 * boot_params.e820_entries (0 means failure or no entries).
 */
static int detect_memory_e820(void)
{
int count = 0;
struct biosregs ireg, oreg;
struct e820entry *desc = boot_params.e820_map;
static struct e820entry buf; /* static so it is zeroed */
initregs(&ireg);
ireg.ax = 0xe820;
ireg.cx = sizeof buf;
ireg.edx = SMAP;
ireg.di = (size_t)&buf;
/*
* Note: at least one BIOS is known which assumes that the
* buffer pointed to by one e820 call is the same one as
* the previous call, and only changes modified fields. Therefore,
* we use a temporary buffer and copy the results entry by entry.
*
* This routine deliberately does not try to account for
* ACPI 3+ extended attributes. This is because there are
* BIOSes in the field which report zero for the valid bit for
* all ranges, and we don't currently make any use of the
* other attribute bits. Revisit this if we see the extended
* attribute bits deployed in a meaningful way in the future.
*/
do {
intcall(0x15, &ireg, &oreg);
ireg.ebx = oreg.ebx; /* for next iteration... */
/* BIOSes which terminate the chain with CF = 1 as opposed
to %ebx = 0 don't always report the SMAP signature on
the final, failing, probe. */
if (oreg.eflags & X86_EFLAGS_CF)
break;
/* Some BIOSes stop returning SMAP in the middle of
the search loop. We don't know exactly how the BIOS
screwed up the map at that point, we might have a
partial map, the full map, or complete garbage, so
just return failure. */
if (oreg.eax != SMAP) {
count = 0;
break;
}
/* Copy the scratch entry into the next slot of the map. */
*desc++ = buf;
count++;
} while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
return boot_params.e820_entries = count;
}
detect_memory_e801()也是用于获取内存的布局。
/*
 * Query extended memory size with INT 0x15, AX=0xe801 and store it in
 * boot_params.alt_mem_k (in KB).
 *
 * The BIOS reports memory between 1 MB and 16 MB in KB (AX or CX) and
 * memory above 16 MB in 64 KB blocks (BX or DX).  If the CX/DX pair is
 * non-zero it is preferred and copied into AX/BX, so everything after
 * that point must read AX/BX only.
 *
 * Returns 0 on success, -1 if the BIOS signals an error (CF set) or
 * reports an impossible value for the 1-16 MB range.
 */
static int detect_memory_e801(void)
{
	struct biosregs ireg, oreg;

	initregs(&ireg);
	ireg.ax = 0xe801;
	intcall(0x15, &ireg, &oreg);

	if (oreg.eflags & X86_EFLAGS_CF)
		return -1;

	/* Do we really need to do this? */
	if (oreg.cx || oreg.dx) {
		oreg.ax = oreg.cx;
		oreg.bx = oreg.dx;
	}

	if (oreg.ax > 15*1024) {
		return -1;	/* Bogus! */
	} else if (oreg.ax == 15*1024) {
		/*
		 * Use the normalised BX here, not DX: a BIOS that reports
		 * only in AX/BX leaves CX = DX = 0, and reading DX would
		 * silently drop all memory above 16 MB.  (<< 6 converts
		 * 64 KB blocks to KB.)
		 */
		boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
	} else {
		/*
		 * This ignores memory above 16MB if we have a memory
		 * hole there. If someone actually finds a machine
		 * with a memory hole at 16MB and no support for
		 * 0E820h they should probably generate a fake e820
		 * map.
		 */
		boot_params.alt_mem_k = oreg.ax;
	}

	return 0;
}
detect_memory_88()同样是用于获取内存的布局。
/*
 * Query extended memory size with INT 0x15, AH=0x88.
 *
 * The returned AX is stored unconditionally in
 * boot_params.screen_info.ext_mem_k.  Returns 0 on success and -1 if
 * the BIOS set the carry flag.
 */
static int detect_memory_88(void)
{
struct biosregs ireg, oreg;
initregs(&ireg);
ireg.ah = 0x88;
intcall(0x15, &ireg, &oreg);
boot_params.screen_info.ext_mem_k = oreg.ax;
/* Negating the masked carry bit gives the return value directly. */
return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
}
接下来要设置键盘的重复率,但是貌似是可有可无的。在对keyboard_set_repeat的说明中,有这么一段注释“Set the keyboard repeat rate to maximum. Unclear why this is done here; this might be possible to kill off as stale code.”所以对这个操作的解释是有疑问的。
紧接着调用的query_mca()实际上是通过int 15h,ah=0c0h中断来获取MCA(Micro Channel Architecture)系统描述表,详情可以查阅该中断的说明。
/*
 * Fetch the system description table via INT 0x15, AH=0xc0 and copy it
 * into boot_params.sys_desc_table.
 *
 * The BIOS returns the table address in ES:BX.  The first 16-bit word
 * at that address is used as the length, and the copy is truncated to
 * the size of boot_params.sys_desc_table.
 *
 * Returns 0 on success, -1 if the BIOS set the carry flag.
 */
int query_mca(void)
{
struct biosregs ireg, oreg;
u16 len;
initregs(&ireg);
ireg.ah = 0xc0;
intcall(0x15, &ireg, &oreg);
if (oreg.eflags & X86_EFLAGS_CF)
return -1; /* No MCA present */
/* Point %fs at the segment returned by the BIOS so the table can be read. */
set_fs(oreg.es);
len = rdfs16(oreg.bx);
/* Never copy more than the destination can hold. */
if (len > sizeof(boot_params.sys_desc_table))
len = sizeof(boot_params.sys_desc_table);
copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
return 0;
}
再接下来的query_ist()中通过int 15h,ax=0e980h中断来获取Intel Speed Step信息。
/*
 * Query Intel SpeedStep (IST) information via INT 0x15, AX=0xe980 and
 * store the four returned registers (EAX, EBX, ECX, EDX) in
 * boot_params.ist_info.
 *
 * The call is skipped entirely when cpu.level is below 6.
 */
static void query_ist(void)
{
struct biosregs ireg, oreg;
/* Some older BIOSes apparently crash on this call, so filter
it from machines too old to have SpeedStep at all. */
if (cpu.level < 6)
return;
initregs(&ireg);
ireg.ax = 0xe980; /* IST Support */
ireg.edx = 0x47534943; /* Request value */
intcall(0x15, &ireg, &oreg);
boot_params.ist_info.signature = oreg.eax;
boot_params.ist_info.command = oreg.ebx;
boot_params.ist_info.event = oreg.ecx;
boot_params.ist_info.perf_level = oreg.edx;
}
根据配置,还需要获取APM信息或EDD信息,获取方法与IST类似。
最后在进入保护模式之前设置视频模式,set_video()在video.c中定义。
/*
 * Select and program the video mode requested in hdr.vid_mode.
 *
 * The current text-mode state is recorded and the screen saved first.
 * Mode setting is then retried until set_mode() succeeds, falling back
 * to the interactive menu (ASK_VGA) after every failure.  The mode that
 * finally worked is written back to hdr.vid_mode, and the screen is
 * restored if do_restore is set.
 */
void set_video(void)
{
	u16 mode = boot_params.hdr.vid_mode;
	int failed;

	RESET_HEAP();

	store_mode_params();
	save_screen();
	probe_cards(0);

	do {
		if (mode == ASK_VGA)
			mode = mode_menu();

		failed = set_mode(mode);
		if (failed) {
			printf("Undefined video mode number: %x\n", mode);
			mode = ASK_VGA;	/* ask the user on the next round */
		}
	} while (failed);

	boot_params.hdr.vid_mode = mode;
	vesa_store_edid();
	store_mode_params();

	if (do_restore)
		restore_screen();
}
根据hdr得到视频模式,存储到内部变量mode中。在header.S中设置的vid_mode值是SVGA_MODE。随后,调用store_mode_params()来设置boot_params的screen_info字段。
/*
 * Record the current text-mode parameters in boot_params.screen_info so
 * the kernel can use them later.
 *
 * Most values come from the BIOS.  The row/column counts can be
 * overridden by force_x / force_y, and a CGA adapter is always taken to
 * have 25 rows, because some very obscure BIOSes supply insane values.
 * Nothing is done in graphics mode.
 */
static void store_mode_params(void)
{
	u16 font_size;
	int cols, rows;

	/*
	 * In graphics mode the mode-setting driver (currently only
	 * video-vesa.c) is responsible for storing the parameters.
	 */
	if (graphic_mode)
		return;

	store_cursor_position();
	store_video_mode();

	/*
	 * Mode 0x07 is MDA, HGC, or VGA in monochrome mode (segment 0xb000);
	 * everything else is CGA, EGA, VGA and so forth (segment 0xb800).
	 */
	video_segment = (boot_params.screen_info.orig_video_mode == 0x07)
			? 0xb000 : 0xb800;

	set_fs(0);

	/* Font size, BIOS area */
	font_size = rdfs16(0x485);
	boot_params.screen_info.orig_video_points = font_size;

	cols = rdfs16(0x44a);
	rows = (adapter == ADAPTER_CGA) ? 25 : rdfs8(0x484)+1;

	/* Explicitly forced dimensions win over what the BIOS reported. */
	if (force_x)
		cols = force_x;
	if (force_y)
		rows = force_y;

	boot_params.screen_info.orig_video_cols  = cols;
	boot_params.screen_info.orig_video_lines = rows;
}
在store_mode_params()函数中调用了store_cursor_position和store_video_mode来获得光标位置和视频模式。
/*
 * Read the cursor state with INT 0x10, AH=0x03 and store it in
 * boot_params.screen_info: DL/DH become orig_x/orig_y, and
 * VIDEO_FLAGS_NOCURSOR is set when the cursor shape returned in CH/CL
 * indicates that no cursor is shown.
 */
static void store_cursor_position(void)
{
	struct biosregs ireg, oreg;
	int cursor_hidden;

	initregs(&ireg);
	ireg.ah = 0x03;
	intcall(0x10, &ireg, &oreg);

	boot_params.screen_info.orig_x = oreg.dl;
	boot_params.screen_info.orig_y = oreg.dh;

	/*
	 * The cursor counts as hidden when bit 5 of CH is set, or when
	 * the low five bits of CH are greater than those of CL.
	 */
	cursor_hidden = (oreg.ch & 0x20) ||
			((oreg.ch & 0x1f) > (oreg.cl & 0x1f));

	if (cursor_hidden)
		boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
}
/*
 * Read the current video mode and page with INT 0x10, AH=0x0f and store
 * them in boot_params.screen_info (orig_video_mode from AL with the top
 * bit masked off, orig_video_page from BH).
 */
static void store_video_mode(void)
{
struct biosregs ireg, oreg;
/* N.B.: the saving of the video page here is a bit silly,
since we pretty much assume page 0 everywhere. */
initregs(&ireg);
ireg.ah = 0x0f;
intcall(0x10, &ireg, &oreg);
/* Not all BIOSes are clean with respect to the top bit */
boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
boot_params.screen_info.orig_video_page = oreg.bh;
}
以上是store_cursor_position和store_video_mode,它们都是通过中断来获取信息的。
接下来调用save_screen来保存屏幕内容。
/*
 * Snapshot of the text screen taken before the video mode is changed.
 * The contents are kept on the heap; data stays NULL (static storage)
 * if the screen could not be saved.
 */
static struct saved_screen {
int x, y; /* saved screen size: columns, lines */
int curx, cury; /* saved cursor position */
u16 *data; /* saved screen contents, x*y 16-bit cells */
} saved;
/*
 * Save the current text screen into the heap.
 *
 * Screen size and cursor position are taken from
 * boot_params.screen_info.  The contents are then copied from offset 0
 * of video_segment into a freshly allocated heap buffer.  If the heap
 * does not have room for the screen plus 512 spare bytes, only the
 * size and cursor position are recorded and saved.data is left unset.
 */
static void save_screen(void)
{
/* Should be called after store_mode_params() */
saved.x = boot_params.screen_info.orig_video_cols;
saved.y = boot_params.screen_info.orig_video_lines;
saved.curx = boot_params.screen_info.orig_x;
saved.cury = boot_params.screen_info.orig_y;
if (!heap_free(saved.x*saved.y*sizeof(u16)+512))
return; /* Not enough heap to save the screen */
saved.data = GET_HEAP(u16, saved.x*saved.y);
/* Read the screen through %fs, pointed at the video segment. */
set_fs(video_segment);
copy_from_fs(saved.data, 0, saved.x*saved.y*sizeof(u16));
}
以上代码的具体工作是从video_segment读取数据然后保存到堆。
然后扫描整个显卡列表,video_cards和video_cards_end都是bootloader传递过来的显卡列表。
/*
 * Run the probe routine of every video driver whose "unsafe" flag
 * matches the argument, so that each one generates its mode list.
 *
 * unsafe: 0 or 1, selecting which class of drivers to probe.  Each
 * class is probed at most once; later calls with the same value return
 * immediately.  A driver without a probe routine gets nmodes = 0.
 */
void probe_cards(int unsafe)
{
	static u8 probed[2];	/* one "already done" flag per class */
	struct card_info *card;

	if (probed[unsafe])
		return;
	probed[unsafe] = 1;

	for (card = video_cards; card < video_cards_end; card++) {
		if (card->unsafe != unsafe)
			continue;

		card->nmodes = card->probe ? card->probe() : 0;
	}
}
如果bootloader设置hdr的vid_mode为ASK_VGA,就进行一些交互式的工作(调用mode_menu()让用户选择模式)。在header.S中定义的vid_mode是SVGA_MODE,也就是ASK_VGA。
然后调用vesa_store_edid()函数,它是对EDID的设置。EDID是一种VESA标准数据格式,其中包含有关监视器及其性能的参数,包括供应商信息、最大图像大小、颜色设置、厂商预设置、频率范围的限制以及显示器名和序列号的字符串。
接下来会再次执行store_mode_params()来保存数据,最后调用restore_screen()恢复屏幕内容。
/*
 * Write the screen contents saved by save_screen() back to video
 * memory and restore the cursor position.
 *
 * The new screen size (xs by ys) may differ from the saved one.  Each
 * row copies as many saved cells as fit, and the rest of the row, as
 * well as any rows beyond the saved height, is filled with blanks.
 * Nothing is done in graphics mode or when no screen was saved.
 */
static void restore_screen(void)
{
/* Should be called after store_mode_params() */
int xs = boot_params.screen_info.orig_video_cols;
int ys = boot_params.screen_info.orig_video_lines;
int y;
addr_t dst = 0;
u16 *src = saved.data;
struct biosregs ireg;
if (graphic_mode)
return; /* Can't restore onto a graphic mode */
if (!src)
return; /* No saved screen contents */
/* Restore screen contents */
set_fs(video_segment);
for (y = 0; y < ys; y++) {
int npad;
if (y < saved.y) {
/* Copy the smaller of the new and the saved row width. */
int copy = (xs < saved.x) ? xs : saved.x;
copy_to_fs(dst, src, copy*sizeof(u16));
dst += copy*sizeof(u16);
/* Always advance by a full saved row, even if it was truncated. */
src += saved.x;
npad = (xs < saved.x) ? 0 : xs-saved.x;
} else {
npad = xs;
}
/* Writes "npad" blank characters to
video_segment:dst and advances dst */
/* npad is halved; an odd count emits one 16-bit cell first, then the
rest is stored as 32-bit pairs.  Each cell written is 0x0720. */
asm volatile("pushw %%es ; "
"movw %2,%%es ; "
"shrw %%cx ; "
"jnc 1f ; "
"stosw \n\t"
"1: rep;stosl ; "
"popw %%es"
: "+D" (dst), "+c" (npad)
: "bdS" (video_segment),
"a" (0x07200720));
}
/* Restore cursor position */
/* Clamp the saved cursor so it stays inside the new screen. */
if (saved.curx >= xs)
saved.curx = xs-1;
if (saved.cury >= ys)
saved.cury = ys-1;
initregs(&ireg);
ireg.ah = 0x02; /* Set cursor position */
ireg.dh = saved.cury;
ireg.dl = saved.curx;
intcall(0x10, &ireg, NULL);
/* Re-read the cursor state so boot_params reflects what was just set. */
store_cursor_position();
}
到这里video就设置完毕了,进入保护模式前的准备工作就做好了。
四、
linux-2.6.34.13/arch/
x86/boot/
pm.c
进入保护模式的代码在boot/pm.c中,在main的最后调用了go_to_protected_mode(),这是一个不会返回的函数。
/*
 * Final real-mode step: prepare the machine and jump into protected
 * mode.
 *
 * In order: run the real-mode switch hook (which also disables
 * interrupts), enable the A20 gate (the boot is aborted through die()
 * on failure), reset the coprocessor, mask the PIC, load the IDT and
 * GDT, and jump to hdr.code32_start, passing the linear address of
 * boot_params.
 */
void go_to_protected_mode(void)
{
/* Hook before leaving real mode, also disables interrupts */
realmode_switch_hook();
/* Enable the A20 gate */
if (enable_a20()) {
puts("A20 gate not responding, unable to boot...\n");
die();
}
/* Reset coprocessor (IGNNE#) */
reset_coprocessor();
/* Mask all interrupts in the PIC */
mask_all_interrupts();
/* Actual transition to protected mode... */
setup_idt();
setup_gdt();
/* (ds() << 4) turns the real-mode offset of boot_params into a linear address. */
protected_mode_jump(boot_params.hdr.code32_start,
(u32)&boot_params + (ds() << 4));
}
在进入保护模式之前要先检查有没有hook代码,有则调用,没有则关闭中断、禁用不可屏蔽中断。
/*
 * Invoke the boot loader's hook before leaving real mode, if it
 * supplied one in hdr.realmode_swtch.  Without a hook, interrupts are
 * disabled here and NMI is turned off by writing 0x80 to port 0x70.
 */
static void realmode_switch_hook(void)
{
if (boot_params.hdr.realmode_swtch) {
/* Far call through the pointer stored in the header; the hook may clobber eax..edx. */
asm volatile("lcallw *%0"
: : "m" (boot_params.hdr.realmode_swtch)
: "eax", "ebx", "ecx", "edx");
} else {
asm volatile("cli");
outb(0x80, 0x70); /* Disable NMI */
io_delay();
}
}
然后打开a20地址线,如果打开失败则直接die掉。那么什么是a20地址线呢?在8086中是用SEG:OFFSET这样的模式来分段的,所以能表示的最大地址是FFFF:FFFF,也就是10FFEFh。可是在8086中只有20位的地址总线,所以只能寻址到1MB,如果试图访问超过1MB的地址时会怎么样呢?实际上系统不会发生异常,而是回卷(wrap)回去,重新从地址零开始寻址。可是到了80286时,真的可以访问超过1MB的地址,如果遇到同样的情况,系统不会再回卷寻址,这样就造成了向下不兼容。为了保证兼容性,IBM使用8042键盘控制器来控制第20个(从0开始数)地址位,这就是a20地址线,如果不被打开,第20个地址位将会总是为零。
下图就是关于实模式下A20禁用与使用的区别。
/*
 * Ask the BIOS to enable the A20 gate via INT 0x15, AX=0x2401.
 * The result of the call is not examined here.
 */
static void enable_a20_bios(void)
{
struct biosregs ireg;
initregs(&ireg);
ireg.ax = 0x2401;
intcall(0x15, &ireg, NULL);
}
/*
 * Enable the A20 gate through the 8042 keyboard controller.
 * empty_8042() is called before and after every write to ports
 * 0x64/0x60.
 */
static void enable_a20_kbc(void)
{
empty_8042();
outb(0xd1, 0x64); /* Command write */
empty_8042();
outb(0xdf, 0x60); /* A20 on */
empty_8042();
outb(0xff, 0x64); /* Null command, but UHCI wants it */
empty_8042();
}
/*
 * Enable the A20 gate through configuration port A (I/O port 0x92):
 * read the port, set bit 1, clear bit 0, and write the value back.
 */
static void enable_a20_fast(void)
{
u8 port_a;
port_a = inb(0x92); /* Configuration port A */
port_a |= 0x02; /* Enable A20 */
port_a &= ~0x01; /* Do not reset machine */
outb(port_a, 0x92);
}
打开a20地址线不止一种方法,在该版本内核中提供了三种方法(通过BIOS中断、通过8042键盘控制器、通过0x92快速端口)来打开它,从而尽可能避免打开失败。
紧接着重置数学协处理器,这里就是向端口0xf0和0xf1写一个0。
/*
 * Reset the math coprocessor by writing 0 to I/O ports 0xf0 and 0xf1,
 * with an I/O delay after each write.
 */
static void reset_coprocessor(void)
{
outb(0, 0xf0);
io_delay();
outb(0, 0xf1);
io_delay();
}
还要屏蔽PIC上的所有中断,这里也是通过向0xa1和0x21端口写数据完成的。
/*
 * Mask interrupts at both PICs by writing their mask registers
 * (ports 0xa1 and 0x21).  On the primary PIC the value 0xfb leaves only
 * the cascade line unmasked.
 */
static void mask_all_interrupts(void)
{
outb(0xff, 0xa1); /* Mask all interrupts on the secondary PIC */
io_delay();
outb(0xfb, 0x21); /* Mask all but cascade on the primary PIC */
io_delay();
}
进入保护模式之前最关键的动作是设置gdt和idt。
/*
 * Descriptor-table pointer used as the memory operand of lgdtl/lidtl.
 * It is packed so that the 32-bit address directly follows the 16-bit
 * length with no padding.
 */
struct gdt_ptr {
u16 len; /* table limit: size in bytes minus 1 */
u32 ptr; /* linear address of the table */
} __attribute__((packed));
static void setup_gdt(void)
{
/* There are machines which are known to not boot with the GDT
being 8-byte unaligned. Intel recommends 16 byte alignment. */
static const u64 boot_gdt[] __attribute__((aligned(16))) = {
/* CS: code, read/execute, 4 GB, base 0 */
[GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
这里的GDT_ENTRY(flags,base,limit)在asm/segment.h中定义,flags是标志位,base是基址,limit是段界限。
flags的各个位的代表内容如下:
第0-3位为TYPE(描述符类型),第4位为S(1表示数据段和代码段描述符,0表示系统段描述符和门描述符),第5、6位为DPL(段的特权等级),第7位为P(1表示段在内存中存在,0表示段在内存中不存在),第8-11位为段界限的16-19位,第12位为AVL(保留并且可以被操作系统使用),第13位为保留位(总是0),第14位为D/B,第15位为G(0表示段界限粒度为字节,1表示段界限粒度为4KB)。
从这里可以看出,CS段定义的flags为0xC09B,G位置1表示段界限粒度为4KB,段界限为0xFFFFF,总计可以寻址4GB。
/* DS: data, read/write, 4 GB, base 0 */
[GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
DS段定义的flags为0xC093,G位置1表示段界限粒度为4KB,段界限为0xFFFFF,总计可以寻址4GB。
/* TSS: 32-bit tss, 104 bytes, base 4096 */
/* We only have a TSS here to keep Intel VT happy;
we don't actually use it for anything. */
[GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
这里虽然定义了TSS段,但是按照注释TSS段应该没有被使用。
};
/* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
of the gdt_ptr contents. Thus, make it static so it will
stay in memory, at least long enough that we switch to the
proper kernel GDT. */
static struct gdt_ptr gdt;
这里将全局描述符表的长度、地址信息保存到gdt_ptr结构,最后调用lgdt指令设置GDT。
gdt.len = sizeof(boot_gdt)-1;
gdt.ptr = (u32)&boot_gdt + (ds() << 4);
asm volatile("lgdtl %0" : : "m" (gdt));
}
与GDT设置相比,IDT的设置在这个阶段就比较简单了。
/*
 * Load an empty interrupt descriptor table: lidtl is given a
 * descriptor whose length and address are both 0.
 */
static void setup_idt(void)
{
static const struct gdt_ptr null_idt = {0, 0};
asm volatile("lidtl %0" : : "m" (null_idt));
}
实际上,只是调用lidt指令设置一个空表。
设置完gdt、idt后,调用protected_mode_jump()跳转到code32_start, code32_start 在header.S中定义的值为0x100000,也可以由bootloader指定。
五、
linux-2.6.34.13/arch/
x86/boot/
pmjump.S
protected_mode_jump在pmjump.S中定义,是使用汇编编写的,它的工作是进入保护模式并跳转到code32_start。
最开始是16位汇编代码。
.text
.code16
/*
* void protected_mode_jump(u32 entrypoint, u32 bootparams);
*/
GLOBAL(protected_mode_jump)
movl %edx, %esi # Pointer to boot_params table
edx的内容为bootparams,这是因为内核中参数传递是fastcall类型的,优先通过寄存器传参,eax的值为entrypoint。
xorl %ebx, %ebx
movw %cs, %bx
shll $4, %ebx
addl %ebx, 2f
这几句代码的作用是计算并设置下文中32位jmp指令将要跳转到的地址:把实模式代码段%cs左移4位得到段基址,再加到标号2处保存的in_pm32偏移上,得到in_pm32的线性地址。
jmp 1f # Short jump to serialize on 386/486
1:
movw $__BOOT_DS, %cx
movw $__BOOT_TSS, %di
movl %cr0, %edx
orb $X86_CR0_PE, %dl # Protected mode
movl %edx, %cr0
上面三条指令设置了cr0寄存器的PE标识,这样CPU就进入保护模式工作。
接下来是一条32位指令,它的作用是跳转到in_pm32。
# Transition to 32-bit mode
.byte 0x66, 0xea # ljmpl opcode
2: .long in_pm32 # offset
.word __BOOT_CS # segment
ENDPROC(protected_mode_jump)
in_pm32的作用主要是设置寄存器和跳转到entrypoint。
# in_pm32: first 32-bit code executed after the far jump in
# protected_mode_jump.
#
# Register state set up by protected_mode_jump before the jump:
#   %ecx = __BOOT_DS (data segment selector)
#   %di  = __BOOT_TSS (task register selector)
#   %ebx = real-mode %cs << 4 (base of the setup code)
#   %esi = pointer to boot_params (left untouched here)
#   %eax = 32-bit entry point to jump to
#
# Loads all data segment registers with the flat data selector, rebases
# the stack pointer, loads TR and LDTR, clears the scratch registers and
# jumps to the entry point.
.code32
.section ".text32","ax"
GLOBAL(in_pm32)
# Set up data segments for flat 32-bit mode
movl %ecx, %ds
movl %ecx, %es
movl %ecx, %fs
movl %ecx, %gs
movl %ecx, %ss
# The 32-bit code sets up its own stack, but this way we do have
# a valid stack if some debugging hack wants to use it.
addl %ebx, %esp
# Set up TR to make Intel VT happy
ltr %di
# Clear registers to allow for future extensions to the
# 32-bit boot protocol
xorl %ecx, %ecx
xorl %edx, %edx
xorl %ebx, %ebx
xorl %ebp, %ebp
xorl %edi, %edi
# Set up LDTR to make Intel VT happy
# (%cx was zeroed just above, so LDTR is loaded with selector 0)
lldt %cx
jmpl *%eax # Jump to the 32-bit entrypoint
ENDPROC(in_pm32)
CPU在设置cr0的PE位并执行ljmpl之后就已经工作在保护模式下了;随着最后一条jmp指令的执行,我们终于离开了setup代码,在保护模式下跳转到了32位入口点code32_start。