Given this layout, the boot process jumps from head.S into the kernel code proper, so the kernel must be loaded at a fixed address: the compressed kernel is loaded at 0x1000, while the complete (decompressed) kernel is loaded at 0x100000.
linux-2.6.25/arch/x86/boot/setup.ld
/*
 * setup.ld
 *
 * Linker script for the i386 setup code
 */
OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
ENTRY(_start)

SECTIONS
{
        . = 0;
        .bstext         : { *(.bstext) }
        .bsdata         : { *(.bsdata) }

        . = 497;
        .header         : { *(.header) }
        .inittext       : { *(.inittext) }
        .initdata       : { *(.initdata) }
        .text           : { *(.text*) }

        . = ALIGN(16);
        .rodata         : { *(.rodata*) }

        .videocards     : {
                video_cards = .;
                *(.videocards)
                video_cards_end = .;
        }

        . = ALIGN(16);
        .data           : { *(.data*) }

        .signature      : {
                setup_sig = .;
                LONG(0x5a5aaa55)
        }

        . = ALIGN(16);
        .bss            :
        {
                __bss_start = .;
                *(.bss)
                __bss_end = .;
        }
        . = ALIGN(16);
        _end = .;

        /DISCARD/ : { *(.note*) }

        . = ASSERT(_end <= 0x8000, "Setup too big!");
        . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
}
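The second ASSERT pins down the boot-protocol contract: the setup header must sit at byte 497 (0x1f1) of the image, right after the legacy boot sector. A hypothetical standalone check of that offset (the struct name and fields here are made up for illustration):

#include <stdio.h>
#include <stddef.h>

/* hypothetical layout mirroring the first sector of the bzImage */
struct boot_sector {
        unsigned char bs_code[497];     /* .bstext/.bsdata: legacy boot sector */
        unsigned char setup_header[];   /* hdr: struct setup_header starts here */
};

int main(void)
{
        /* prints 0x1f1, matching the linker ASSERT above */
        printf("hdr offset = 0x%zx\n", offsetof(struct boot_sector, setup_header));
        return 0;
}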
linux-2.6.25/arch/x86/boot/header.S
# Explicitly enter this as bytes, or the assembler
# tries to generate a 3-byte jump here, which causes
# everything else to push off to the wrong offset.
        .byte   0xeb            # short (2-byte) jump
        .byte   start_of_setup-1f
# If SAFE_RESET_DISK_CONTROLLER is defined, reset the disk controller
#ifdef SAFE_RESET_DISK_CONTROLLER
# Reset the disk controller.
        movw    $0x0000, %ax            # Reset disk controller
        movb    $0x80, %dl              # All disks
        int     $0x13
#endif
# Force %es = %ds
movw %ds, %ax
movw %ax, %es
# Apparently some ancient versions of LILO invoked the kernel with %ss != %ds,
# which happened to work by accident for the old code. Recalculate the stack
# pointer if %ss is invalid. Otherwise leave it alone, LOADLIN sets up the
# stack behind its own code, so we can't blindly put it directly past the heap.
        movw    %ss, %dx
        cmpw    %ax, %dx        # %ds == %ss?
        movw    %sp, %dx
        je      2f              # -> assume %sp is reasonably set

        # Invalid %ss, make up a new stack
        movw    $_end, %dx
        testb   $CAN_USE_HEAP, loadflags
        jz      1f
        movw    heap_end_ptr, %dx
1:      addw    $STACK_SIZE, %dx
        jnc     2f
        xorw    %dx, %dx        # Prevent wraparound

2:      # Now %dx should point to the end of our stack space
        andw    $~3, %dx        # dword align (might as well...)
        jnz     3f
        movw    $0xfffc, %dx    # Make sure we're not zero
3:      movw    %ax, %ss
movzwl %dx, %esp # Clear upper half of %esp
sti # Now we should have a working stack
# We will have entered with %cs = %ds+0x20, normalize %cs so
# it is on par with the other segments.
        pushw   %ds
        pushw   $6f
        lretw
6:

# Check signature at end of setup
        cmpl    $0x5a5aaa55, setup_sig
        jne     setup_bad

# Zero the bss
        movw    $__bss_start, %di
        movw    $_end+3, %cx
        xorl    %eax, %eax
        subw    %di, %cx
        shrw    $2, %cx
        rep; stosl

# Jump to C code (should not return)
        calll   main
Here, once the stack is set up, `calll main` jumps into the C function main(), which initializes part of the hardware environment. Note that up to this point the CPU is still running in real mode.
linux-2.6.25/arch/x86/boot/main.c
void main(void)
{
        /* First, copy the boot header into the "zeropage" */
        copy_boot_params();

        /* Make sure we have all the proper CPU support */
        if (validate_cpu()) {
                puts("Unable to boot - please use a kernel appropriate "
                     "for your CPU.\n");
                die();
        }

        /* Tell the BIOS what CPU mode we intend to run in. */
        set_bios_mode();

#ifdef CONFIG_X86_VOYAGER
        query_voyager();
#endif

        /* Query Intel SpeedStep (IST) information */
        query_ist();

        /* Query APM information */
#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
        query_apm_bios();
#endif

        /* Query EDD information */
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
        query_edd();
#endif

        /* Do the last things and invoke protected mode */
        go_to_protected_mode();
}
static void copy_boot_params(void)
{
        BUILD_BUG_ON(sizeof boot_params != 4096);
        memcpy(&boot_params.hdr, &hdr, sizeof hdr);
        ...
}
linux-2.6.25/arch/x86/boot/pm.c
void go_to_protected_mode(void)
{
        /* Hook before leaving real mode, also disables interrupts */
        realmode_switch_hook();

        /* Move the kernel/setup to their final resting places */
        move_kernel_around();

        /* Enable the A20 gate */
        if (enable_a20()) {
                puts("A20 gate not responding, unable to boot...\n");
                die();
        }

        /* Reset coprocessor (IGNNE#) */
        reset_coprocessor();

        /* Mask all interrupts in the PIC */
        mask_all_interrupts();

        /* Actual transition to protected mode... */
        setup_idt();
        setup_gdt();
        protected_mode_jump(boot_params.hdr.code32_start,
                            (u32)&boot_params + (ds() << 4));
}
static void setup_idt(void)
{
        static const struct gdt_ptr null_idt = {0, 0};
        asm volatile("lidtl %0" : : "m" (null_idt));
}
static void setup_gdt(void)
{
        /* There are machines which are known to not boot with the GDT
           being 8-byte unaligned. Intel recommends 16 byte alignment. */
        static const u64 boot_gdt[] __attribute__((aligned(16))) = {
                /* CS: code, read/execute, 4 GB, base 0 */
                [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
                /* DS: data, read/write, 4 GB, base 0 */
                [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
                /* TSS: 32-bit tss, 104 bytes, base 4096 */
                /* We only have a TSS here to keep Intel VT happy;
                   we don't actually use it for anything. */
                [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
        };
        /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
           of the gdt_ptr contents. Thus, make it static so it will
           stay in memory, at least long enough that we switch to the
           proper kernel GDT. */
        static struct gdt_ptr gdt;

        gdt.len = sizeof(boot_gdt)-1;
        gdt.ptr = (u32)&boot_gdt + (ds() << 4);
        asm volatile("lgdtl %0" : : "m" (gdt));
}
Here the GDT is initialized with three entries: GDT_ENTRY_BOOT_CS, GDT_ENTRY_BOOT_DS, and GDT_ENTRY_BOOT_TSS. GDT_ENTRY_BOOT_CS and GDT_ENTRY_BOOT_DS both have a base address of zero and a 4 GB segment limit. GDT_ENTRY_BOOT_TSS is never actually used; it exists only to keep Intel VT happy.
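For reference, GDT_ENTRY packs the flags, base, and limit arguments into the standard x86 8-byte segment descriptor. A minimal standalone sketch of that packing (the helper name gdt_entry is ours; the kernel uses the GDT_ENTRY macro):

#include <stdio.h>
#include <stdint.h>

/* pack flags/base/limit into an x86 segment descriptor (a sketch) */
static uint64_t gdt_entry(uint16_t flags, uint32_t base, uint32_t limit)
{
        return ((uint64_t)(base  & 0xff000000) << 32) | /* base 31..24 */
               ((uint64_t)(flags & 0x0000f0ff) << 40) | /* access byte + G/D bits */
               ((uint64_t)(limit & 0x000f0000) << 32) | /* limit 19..16 */
               ((uint64_t)(base  & 0x00ffffff) << 16) | /* base 23..0 */
               ((uint64_t)(limit & 0x0000ffff));        /* limit 15..0 */
}

int main(void)
{
        /* 0xc09b = G=1 (4K granularity), D=1 (32-bit), present, code read/execute;
         * with limit 0xfffff this covers the full 4 GB from base 0 */
        printf("BOOT_CS = %016llx\n",
               (unsigned long long)gdt_entry(0xc09b, 0, 0xfffff));
        /* prints 00cf9b000000ffff */
        return 0;
}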
linux-2.6.25/arch/x86/boot/pmjump.S
/*
 * void protected_mode_jump(u32 entrypoint, u32 bootparams);
 */
protected_mode_jump:
        movl    %edx, %esi              # Pointer to boot_params table

        xorl    %ebx, %ebx
        movw    %cs, %bx
        shll    $4, %ebx
        addl    %ebx, 2f                # Patch the far-jump offset at 2: below

        movw    $__BOOT_DS, %cx
        movw    $__BOOT_TSS, %di

        movl    %cr0, %edx
        orb     $X86_CR0_PE, %dl        # Protected mode
        movl    %edx, %cr0
        jmp     1f                      # Short jump to serialize on 386/486
1:

        # Transition to 32-bit mode
        .byte   0x66, 0xea              # ljmpl opcode
2:      .long   in_pm32                 # offset
        .word   __BOOT_CS               # segment

        .size   protected_mode_jump, .-protected_mode_jump
        .code32
        .type   in_pm32, @function
in_pm32:
        # Set up data segments for flat 32-bit mode
        movl    %ecx, %ds
        movl    %ecx, %es
        movl    %ecx, %fs
        movl    %ecx, %gs
        movl    %ecx, %ss
        # The 32-bit code sets up its own stack, but this way we do have
        # a valid stack if some debugging hack wants to use it.
        addl    %ebx, %esp

        # Set up TR to make Intel VT happy
        ltr     %di

        # Clear registers to allow for future extensions to the
        # 32-bit boot protocol
        xorl    %ecx, %ecx
        xorl    %edx, %edx
        xorl    %ebx, %ebx
        xorl    %ebp, %ebp
        xorl    %edi, %edi

        # Set up LDTR to make Intel VT happy
        lldt    %cx

        jmpl    *%eax                   # Jump to the 32-bit entrypoint
linux-2.6.25/arch/x86/kernel/vmlinux_32.lds.S
#define LOAD_OFFSET __PAGE_OFFSET

#include <asm-generic/vmlinux.lds.h>
#include <asm/thread_info.h>
#include <asm/page.h>
#include <asm/cache.h>
#include <asm/boot.h>

OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
ENTRY(phys_startup_32)
jiffies = jiffies_64;

PHDRS {
        text PT_LOAD FLAGS(5);  /* R_E */
        data PT_LOAD FLAGS(7);  /* RWE */
        note PT_NOTE FLAGS(0);  /* ___ */
}
SECTIONS
{
        . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
        phys_startup_32 = startup_32 - LOAD_OFFSET;

        .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
                _text = .;      /* Text and read-only data */
                *(.text.head)
        } :text = 0x9090
        ...
All the SECTIONS start at LOAD_OFFSET + LOAD_PHYSICAL_ADDR. LOAD_OFFSET is the familiar PAGE_OFFSET, and LOAD_PHYSICAL_ADDR is 0x100000 in the non-compressed case. This is where the kernel's linear-to-physical address translation comes from.
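In other words, lowmem virtual and physical addresses differ by a constant. A minimal sketch of the translation, assuming the default CONFIG_PAGE_OFFSET of 0xC0000000:

#define PAGE_OFFSET     0xC0000000UL    /* default CONFIG_PAGE_OFFSET */

/* the lowmem translation behind the kernel's __pa()/__va() helpers */
#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET)              /* virtual -> physical */
#define __va(x) ((void *)((unsigned long)(x) + PAGE_OFFSET))    /* physical -> virtual */

/* e.g. code linked at LOAD_OFFSET + LOAD_PHYSICAL_ADDR runs at virtual
 * 0xC0100000, which maps to physical 0x00100000. */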
linux-2.6.25/arch/x86/kernel/head_32.S
LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)

/*
 * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
 * pagetables from above the 16MB DMA limit, so we'll have to set
 * up pagetables 16MB more (worst-case):
 */
#ifdef CONFIG_DEBUG_PAGEALLOC
LOW_PAGES = LOW_PAGES + 0x1000000
#endif

#if PTRS_PER_PMD > 1
PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
#else
PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
#endif
BOOTBITMAP_SIZE = LOW_PAGES / 8
INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
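A worked example of these sizes, assuming 4 KB pages (PAGE_SHIFT = 12), a non-PAE kernel (PTRS_PER_PGD = 1024), and ALLOCATOR_SLOP = 4 as in head_32.S:

#include <stdio.h>

int main(void)
{
        unsigned long low_pages = 1UL << (32 - 12);          /* 2^20 frames cover 4 GB */
        unsigned long page_table_size = low_pages / 1024;    /* 1024 page-table pages */
        unsigned long bootbitmap_size = low_pages / 8;       /* 128 KB bootmem bitmap */
        unsigned long init_map_beyond_end =
                bootbitmap_size + (page_table_size + 4) * 4096;
        /* ~4.1 MB must be mapped beyond the kernel's own page tables */
        printf("INIT_MAP_BEYOND_END = %lu bytes\n", init_map_beyond_end);
        return 0;
}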
/*
 * Clear BSS first so that there are no surprises...
 */
        ...
        movl pa(boot_params) + NEW_CL_POINTER,%esi
        andl %esi,%esi
        jz 1f                   # No command line
page_pde_offset = (__PAGE_OFFSET >> 20);
        ...
10:
        leal PDE_ATTR(%edi),%ecx                /* Create PDE entry */
        movl %ecx,(%edx)                        /* Store identity PDE entry */
        movl %ecx,page_pde_offset(%edx)         /* Store kernel PDE entry */
        addl $4,%edx
        movl $1024, %ecx
11:
        stosl                   # store %eax at (%edi); %edi walks through pg0
        addl $0x1000,%eax
        loop 11b
        /*
         * End condition: we must map up to and including INIT_MAP_BEYOND_END
         * bytes beyond the end of our own page tables; the +0x007 is
         * the attribute bits
         */
        leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
        cmpl %ebp,%eax
        jb 10b
        /* Do early initialization of the fixmap area */
        movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
        movl %eax,pa(swapper_pg_dir+0xffc)
        ...
        /* Enable paging */
        movl $pa(swapper_pg_dir),%eax
        movl %eax,%cr3          /* set the page table pointer.. */
        movl %cr0,%eax
        orl $X86_CR0_PG,%eax
        movl %eax,%cr0          /* ..and set paging (PG) bit */
        ljmp $__BOOT_CS,$1f     /* Clear prefetch and normalize %eip */
void __init setup_arch(char **cmdline_p)
{
        ...
        /* adjust the e820 map obtained from the BIOS, then copy it into e820 */
        ...
        find_max_pfn();
        ...
}
void __init early_ioremap_init(void)
{
        pmd_t *pmd;

        if (early_ioremap_debug)
                printk(KERN_INFO "early_ioremap_init()\n");

        pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
        memset(bm_pte, 0, sizeof(bm_pte));
        /* here the page table for the FIX_BTMAP_BEGIN range is pinned to bm_pte */
        pmd_populate_kernel(&init_mm, pmd, bm_pte);

        /*
         * The boot-ioremap range spans multiple pmds, for which
         * we are not prepared:
         */
        if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
                WARN_ON(1);
                printk(KERN_WARNING "pmd %p != %p\n",
                       pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
                printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
                       fix_to_virt(FIX_BTMAP_BEGIN));
                printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END):   %08lx\n",
                       fix_to_virt(FIX_BTMAP_END));
                printk(KERN_WARNING "FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
                printk(KERN_WARNING "FIX_BTMAP_BEGIN:     %d\n",
                       FIX_BTMAP_BEGIN);
        }
}
struct e820entry {
        __u64 addr;     /* start of memory segment */
        __u64 size;     /* size of memory segment */
        __u32 type;     /* type of memory segment */
} __attribute__((packed));
void __init print_memory_map(char *who)
{
        int i;

        for (i = 0; i < e820.nr_map; i++) {
                printk(" %s: %016Lx - %016Lx ", who,
                        e820.map[i].addr,
                        e820.map[i].addr + e820.map[i].size);
                switch (e820.map[i].type) {
                case E820_RAM:      printk("(usable)\n"); break;
                case E820_RESERVED: printk("(reserved)\n"); break;
                case E820_ACPI:     printk("(ACPI data)\n"); break;
                case E820_NVS:      printk("(ACPI NVS)\n"); break;
                default: printk("type %u\n", e820.map[i].type); break;
                }
        }
}
void __init find_max_pfn(void)
{
        int i;

        max_pfn = 0;
        for (i = 0; i < e820.nr_map; i++) {
                unsigned long start, end;
                if (e820.map[i].type != E820_RAM)
                        continue;
                start = PFN_UP(e820.map[i].addr);
                end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
                if (start >= end)
                        continue;
                if (end > max_pfn)
                        max_pfn = end;
                memory_present(0, start, end);
        }
}
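For reference, the PFN helpers used above come from include/linux/pfn.h: PFN_UP rounds a physical address up to the next whole page frame, PFN_DOWN rounds it down, and PFN_PHYS converts a frame number back to a physical address:

#define PFN_UP(x)       (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)
#define PFN_PHYS(x)     ((x) << PAGE_SHIFT)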
static unsigned long __init setup_memory(void)
{
        ...
#ifdef CONFIG_HIGHMEM
        printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
                pages_to_mb(highend_pfn - highstart_pfn));
        high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
        high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
        printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
                pages_to_mb(max_low_pfn));

        setup_bootmem_allocator();

        return max_low_pfn;
}
void __init setup_bootmem_allocator(void)
{
        unsigned long bootmap_size;
        /*
         * Initialize the boot-time allocator (with low memory only):
         */
        bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);

        register_bootmem_low_pages(max_low_pfn);

        /*
         * Reserve the bootmem bitmap itself as well. We do this in two
         * steps (first step was init_bootmem()) because this catches
         * the (very unlikely) case of us accidentally initializing the
         * bootmem allocator with an invalid RAM area.
         */
        reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
                         bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
                         BOOTMEM_DEFAULT);

        /*
         * reserve physical page 0 - it's a special BIOS page on many boxes,
         * enabling clean reboots, SMP operation, laptop functions.
         */
        reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);

        /* reserve EBDA region, it's a 4K region */
        reserve_ebda_region();

        /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
           PCI prefetch into it (errata #56). Usually the page is reserved anyways,
           unless you have no PS/2 mouse plugged in. */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
            boot_cpu_data.x86 == 6)
                reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);

#ifdef CONFIG_SMP
        /*
         * But first pinch a few for the stack/trampoline stuff
         * FIXME: Don't need the extra page at 4K, but need to fix
         * trampoline before removing it. (see the GDT stuff)
         */
        reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
#endif
#ifdef CONFIG_ACPI_SLEEP
        /* Reserve low memory region for sleep support. */
        acpi_reserve_bootmem();
#endif
#ifdef CONFIG_X86_FIND_SMP_CONFIG
        /*
         * Find and reserve possible boot-time SMP configuration:
         */
        find_smp_config();
#endif
#ifdef CONFIG_BLK_DEV_INITRD
        reserve_initrd();
#endif
}
unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
{
        max_low_pfn = pages;
        min_low_pfn = start;
        return init_bootmem_core(NODE_DATA(0), start, 0, pages);
}

/* non-NUMA: NODE_DATA() always refers to the single contiguous node */
#define NODE_DATA(nid)          (&contig_page_data)

static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
        unsigned long mapstart, unsigned long start, unsigned long end)
{
        bootmem_data_t *bdata = pgdat->bdata;
        unsigned long mapsize;

        bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
        bdata->node_boot_start = PFN_PHYS(start);
        bdata->node_low_pfn = end;
        link_bootmem(bdata);

        /*
         * Initially all pages are reserved - setup_arch() has to
         * register free RAM areas explicitly.
         */
        mapsize = get_mapsize(bdata);
        memset(bdata->node_bootmem_map, 0xff, mapsize);

        return mapsize;
}
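The bitmap uses one bit per page frame below max_low_pfn, so setting it all to 0xff marks every page reserved. For reference, get_mapsize() in mm/bootmem.c computes the bitmap size essentially like this:

static unsigned long __init get_mapsize(bootmem_data_t *bdata)
{
        unsigned long mapsize;
        unsigned long start = PFN_DOWN(bdata->node_boot_start);
        unsigned long end = bdata->node_low_pfn;

        mapsize = ((end - start) + 7) / 8;      /* one bit per page frame */
        return ALIGN(mapsize, sizeof(long));
}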
void __init register_bootmem_low_pages(unsigned long max_low_pfn)
{
        int i;

        for (i = 0; i < e820.nr_map; i++) {
                unsigned long curr_pfn, last_pfn, size;
                /*
                 * Reserve usable memory
                 */
                if (e820.map[i].type != E820_RAM)
                        continue;
                /*
                 * We are rounding up the start address of usable memory:
                 */
                curr_pfn = PFN_UP(e820.map[i].addr);
                if (curr_pfn >= max_low_pfn)
                        continue;
                /*
                 * ... and at the end of the usable range downwards:
                 */
                last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
                if (last_pfn > max_low_pfn)
                        last_pfn = max_low_pfn;
                if (last_pfn <= curr_pfn)
                        continue;

                size = last_pfn - curr_pfn;
                free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
        }
}
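The rounding keeps partial pages reserved. A hypothetical e820 entry makes the effect concrete:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PFN_UP(x)       (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)

int main(void)
{
        /* hypothetical e820 RAM entry covering [0x1234, 0x6234) */
        unsigned long addr = 0x1234, size = 0x5000;
        /* only whole pages are freed; partial pages at both ends stay reserved */
        printf("free pfns [%lu, %lu)\n", PFN_UP(addr), PFN_DOWN(addr + size));
        /* prints: free pfns [2, 6) */
        return 0;
}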
void __init paging_init(void)
{
#ifdef CONFIG_X86_PAE
        set_nx();
        if (nx_enabled)
                printk(KERN_INFO "NX (Execute Disable) protection: active\n");
#endif
        pagetable_init();

        load_cr3(swapper_pg_dir);

        __flush_tlb_all();

        kmap_init();
}
void __init zone_sizes_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];
        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] =
                virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
        max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
        max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
        add_active_range(0, 0, highend_pfn);
#else
        add_active_range(0, 0, max_low_pfn);
#endif

        free_area_init_nodes(max_zone_pfns);
}
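On a typical i386 box these limits produce the classic three-zone split. A sketch with assumed values (16 MB DMA limit, 896 MB of lowmem):

/*
 * max_zone_pfns[ZONE_DMA]     = 16 MB  >> PAGE_SHIFT = 0x1000  (pfn 4096)
 * max_zone_pfns[ZONE_NORMAL]  = 896 MB >> PAGE_SHIFT = 0x38000 (pfn 229376)
 * max_zone_pfns[ZONE_HIGHMEM] = highend_pfn, everything above 896 MB
 */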
void __init add_active_range(unsigned int nid, unsigned long start_pfn,
                             unsigned long end_pfn)
{
        ...
        early_node_map[i].nid = nid;
        early_node_map[i].start_pfn = start_pfn;
        early_node_map[i].end_pfn = end_pfn;
        nr_nodemap_entries = i + 1;
}
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
        unsigned long nid;
        enum zone_type i;

        /* Sort early_node_map as initialisation assumes it is sorted */
        sort_node_map();

        /* The following builds arch_zone_lowest_possible_pfn[i] ~
           arch_zone_highest_possible_pfn[i], the boundaries of each zone */
        memset(arch_zone_lowest_possible_pfn, 0,
                                sizeof(arch_zone_lowest_possible_pfn));
        memset(arch_zone_highest_possible_pfn, 0,
                                sizeof(arch_zone_highest_possible_pfn));
        arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
        arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
        for (i = 1; i < MAX_NR_ZONES; i++) {
                if (i == ZONE_MOVABLE)
                        continue;
                arch_zone_lowest_possible_pfn[i] =
                        arch_zone_highest_possible_pfn[i-1];
                arch_zone_highest_possible_pfn[i] =
                        max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
        }
        arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
        arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;

        /* Find the PFNs that ZONE_MOVABLE begins at in each node */
        memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
        find_zone_movable_pfns_for_nodes(zone_movable_pfn);

        /* Print out the PFNs ZONE_MOVABLE begins at in each node */
        printk("Movable zone start PFN for each node\n");
        for (i = 0; i < MAX_NUMNODES; i++) {
                if (zone_movable_pfn[i])
                        printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
        }

        /* Print out the early_node_map[] */
        printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
        for (i = 0; i < nr_nodemap_entries; i++)
                printk("  %3d: %8lu -> %8lu\n", early_node_map[i].nid,
                                                early_node_map[i].start_pfn,
                                                early_node_map[i].end_pfn);

        /* Initialise every node */
        for_each_online_node(nid) {
                pg_data_t *pgdat = NODE_DATA(nid);
                free_area_init_node(nid, pgdat, NULL,
                                find_min_pfn_for_node(nid), NULL);

                /* Any memory on that node? */
                if (pgdat->node_present_pages)
                        node_set_state(nid, N_HIGH_MEMORY);
        }
}
void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat,
                unsigned long *zones_size, unsigned long node_start_pfn,
                unsigned long *zholes_size)
{
        pgdat->node_id = nid;
        pgdat->node_start_pfn = node_start_pfn;
        calculate_node_totalpages(pgdat, zones_size, zholes_size);

        alloc_node_mem_map(pgdat);

        free_area_init_core(pgdat, zones_size, zholes_size);
}
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
        /* Skip empty nodes */
        if (!pgdat->node_spanned_pages)
                return;

#ifdef CONFIG_FLAT_NODE_MEM_MAP
        /* ia64 gets its own node_mem_map, before this, without bootmem */
        if (!pgdat->node_mem_map) {
                unsigned long size, start, end;
                struct page *map;

                /*
                 * The zone's endpoints aren't required to be MAX_ORDER
                 * aligned but the node_mem_map endpoints must be in order
                 * for the buddy allocator to function correctly.
                 */
                start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
                end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
                end = ALIGN(end, MAX_ORDER_NR_PAGES);
                size = (end - start) * sizeof(struct page);
                map = alloc_remap(pgdat->node_id, size);
                if (!map)
                        map = alloc_bootmem_node(pgdat, size);
                pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
        }
#ifndef CONFIG_NEED_MULTIPLE_NODES
        /*
         * With no DISCONTIG, the global mem_map is just set as node 0's
         */
        if (pgdat == NODE_DATA(0)) {
                mem_map = NODE_DATA(0)->node_mem_map;
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
                if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
                        mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
        }
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
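node_mem_map is the array of struct page descriptors for the node, carved out of the bootmem pool here. A rough size estimate, assuming about 32 bytes per struct page on i386:

#include <stdio.h>

int main(void)
{
        unsigned long spanned_pages = 0x38000;  /* 896 MB of lowmem, 4 KB pages */
        unsigned long sizeof_page = 32;         /* approx. sizeof(struct page) on i386 */
        printf("node_mem_map: ~%lu KB\n", spanned_pages * sizeof_page / 1024);
        /* ~7168 KB, i.e. roughly 7 MB taken from bootmem */
        return 0;
}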
#define memmap_init(size, nid, zone, start_pfn) \
        memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn, enum memmap_context context)
{
        struct page *page;
        unsigned long end_pfn = start_pfn + size;
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                /*
                 * There can be holes in boot-time mem_map[]s
                 * handed to this function. They do not
                 * exist on hotplugged memory.
                 */
                if (context == MEMMAP_EARLY) {
                        if (!early_pfn_valid(pfn))
                                continue;
                        if (!early_pfn_in_nid(pfn, nid))
                                continue;
                }
                page = pfn_to_page(pfn);
                set_page_links(page, zone, nid, pfn);
                init_page_count(page);
                reset_page_mapcount(page);
                SetPageReserved(page);

                /*
                 * Mark the block movable so that blocks are reserved for
                 * movable at startup. This will force kernel allocations
                 * to reserve their blocks rather than leaking throughout
                 * the address space during boot when many long-lived
                 * kernel allocations are made. Later some blocks near
                 * the start are marked MIGRATE_RESERVE by
                 * setup_zone_migrate_reserve()
                 */
                if ((pfn & (pageblock_nr_pages-1)))
                        set_pageblock_migratetype(page, MIGRATE_MOVABLE);

                INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
                /* The shift won't overflow because ZONE_NORMAL is below 4G. */
                if (!is_highmem_idx(zone))
                        set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
        }
}
void build_all_zonelists(void)
__build_all_zonelists() -> build_zonelists():
static void build_zonelists(pg_data_t *pgdat)
{
        enum zone_type i, j;

        for (i = 0; i < MAX_NR_ZONES; i++) {
                struct zonelist *zonelist;

                zonelist = pgdat->node_zonelists + i;
                j = build_zonelists_node(pgdat, zonelist, 0, i);
                ...
        }
}

static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
                                int nr_zones, enum zone_type zone_type)
{
        struct zone *zone;

        zone_type++;
        do {
                zone_type--;
                zone = pgdat->node_zones + zone_type;
                if (populated_zone(zone)) {
                        zonelist->zones[nr_zones++] = zone;
                        check_highest_zone(zone_type);
                }
        } while (zone_type);
        return nr_zones;
}
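The descending do/while means each zonelist runs from the requested zone down to ZONE_DMA, skipping unpopulated zones. A sketch of the resulting fallback order on an i386 box with highmem:

/*
 * zonelist for ZONE_HIGHMEM requests: HIGHMEM -> NORMAL -> DMA
 * zonelist for ZONE_NORMAL  requests: NORMAL  -> DMA
 * zonelist for ZONE_DMA     requests: DMA
 */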
void __init mem_init(void)
set_highmem_pages_init(bad_ppro): releases the highmem pages to the buddy system. The pages managed by bootmem are all pages the kernel can address directly (lowmem).
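For reference, set_highmem_pages_init() walks every highmem pfn and frees each page through a helper that, in 2.6.25's mm/init_32.c, looks essentially like this:

static void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
{
        if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
                ClearPageReserved(page);
                init_page_count(page);
                __free_page(page);      /* hand the page to the buddy allocator */
                totalhigh_pages++;
        } else
                SetPageReserved(page);
}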