Analysis of compressed/head.S

linux/arch/arm/boot/compressed/head.S

This is the first piece of code ARM Linux runs. It is a fairly self-contained wrapper: its only job is to decompress the kernel and then jump the PC to the first instruction of the kernel proper (vmlinux).
The bootloader passes three parameters to Linux; the kernel itself uses the second and third. The second parameter is the architecture (machine) ID, and the third is the address of the tag list. Each ARM machine supported by Linux must have a unique architecture ID. The tag list is the parameter list the bootloader hands to Linux (see "booting arm linux.pdf" for a detailed explanation).
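For reference, a bootloader hands control to the zImage with r0 = 0, r1 = machine ID and r2 = the tag-list address. A minimal C sketch of that handoff, modelled loosely on what U-Boot does (all names here are illustrative, not taken from this file):

    /* Sketch of the bootloader -> zImage handoff; names are illustrative.
     * The kernel expects r0 = 0, r1 = machine (architecture) ID, r2 = tag list. */
    typedef void (*kernel_entry_t)(int zero, int machine_id, unsigned long atags_addr);

    static void boot_linux(unsigned long zimage_addr, int machine_id,
                           unsigned long atags_addr)
    {
            kernel_entry_t entry = (kernel_entry_t)zimage_addr;

            /* caches/MMU off and interrupts disabled is the usual entry state */
            entry(0, machine_id, atags_addr);       /* does not return */
    }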
// program entry point
   .section ".start", #alloc, #execinstr
/*
* sort out different calling conventions
*/
   .align
start:
   .type start,#function
   .rept 8 // repeat the following instruction 8 times, i.e. leave room for the exception vector table
   mov r0, r0 // effectively a NOP
   .endr

   b 1f
   .word 0x016f2818   @ Magic numbers to help the loader
   .word start    @ absolute load/run zImage address
   .word _edata    @ zImage end address
1:   mov r7, r1    @ save architecture ID
   mov r8, r2    @ save atags pointer

#ifndef __ARM_ARCH_2__
   /*
   * Booting from Angel - need to enter SVC mode and disable
   * FIQs/IRQs (numeric definitions from angel arm.h source).
   * We only do this if we were in user mode on entry.
   */
   mrs r2, cpsr   @ get current mode
   tst r2, #3    @ not user?
   bne not_angel
   mov r0, #0x17   @ angel_SWIreason_EnterSVC
   swi 0x123456   @ angel_SWI_ARM
not_angel:
   mrs r2, cpsr   @ turn off interrupts to
   orr r2, r2, #0xc0   @ prevent angel from running
   msr cpsr_c, r2
#else
   teqp pc, #0x0c000003   @ turn off interrupts
#endif

The code must make sure it is running in SVC mode. If it finds itself in user mode, it was started from the Angel debug monitor, so it issues the angel_SWIreason_EnterSVC SWI (0x123456) to switch to SVC mode; otherwise it skips straight to not_angel. It then sets the I and F bits in CPSR to disable IRQ and FIQ.

   /*
   * Note that some cache flushing and other stuff may
   * be needed here - is there an Angel SWI call for this?
   */

   /*
   * some architecture specific code can be inserted
   * by the linker here, but it should preserve r7, r8, and r9.
   */

Read in the address table. Because this code must be able to execute at any address, i.e. it is position-independent code (PIC), an offset has to be applied. The meaning of each table entry is listed below (at LC0).
The initial values in the GOT are filled in by the linker, which cannot know where the code will actually run. If the current run-time address differs from the link-time address, the GOT has to be fixed up as well.
   .text
   adr r0, LC0
   ldmia r0, {r1, r2, r3, r4, r5, r6, ip, sp}
   subs r0, r0, r1   @ calculate the delta offset

       @ if delta is zero, we are
   beq not_relocated   @ running at the address we
       @ were linked at.

   /*
   * We're running at a different address. We need to fix
   * up various pointers:
   *   r5 - zImage base address
   *   r6 - GOT start
   *   ip - GOT end
   */
   add r5, r5, r0
   add r6, r6, r0
   add ip, ip, r0

   /*
   * If we're running fully PIC === CONFIG_ZBOOT_ROM = n,
   * we need to fix up pointers into the BSS region.
   *   r2 - BSS start
   *   r3 - BSS end
   *   sp - stack pointer
   */
   add r2, r2, r0
   add r3, r3, r0
   add sp, sp, r0

Fix up the GOT (Global Offset Table): every entry is adjusted by the run-time offset (a C rendering of this loop follows the assembly).
   /*
   * Relocate all entries in the GOT table.
   */
1:   ldr r1, [r6, #0]   @ relocate entries in the GOT
   add r1, r1, r0   @ table. This fixes up the
   str r1, [r6], #4   @ C references.
   cmp r6, ip
   blo 1b
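In C terms, the loop above is simply the following (a sketch; got_start, got_end and delta stand for r6, ip and r0):

    /* Sketch of the GOT fixup: add the load-time delta to every entry
     * so that C code can dereference its global pointers correctly. */
    static void relocate_got(unsigned long *got_start, unsigned long *got_end,
                             unsigned long delta)
    {
            unsigned long *p;

            for (p = got_start; p < got_end; p++)
                    *p += delta;    /* delta = run-time address - link-time address */
    }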

Clear the BSS section, which every ARM program (indeed every C runtime) has to do before entering C code.

not_relocated: mov r0, #0
1:   str r0, [r2], #4   @ clear bss
   str r0, [r2], #4
   str r0, [r2], #4
   str r0, [r2], #4
   cmp r2, r3
   blo 1b

As the comment below says, the C runtime environment is now set up. Next the cache and MMU are turned on. Why bother in a mere decompressor? Speed. Then why enable the MMU at all, and only with a flat mapping? Again speed: without the MMU only the I-cache can be enabled, because without memory management there is no way to keep the D-cache away from the I/O regions, where it must never be enabled.

   /*
   * The C runtime environment should now be setup
   * sufficiently. Turn the cache on, set up some
   * pointers, and start decompressing.
   */
   bl cache_on
Should you follow the call into cache_on? If you only care about the overall flow, knowing that the cache gets turned on here is enough. Following it in is rewarding, though; that is probably why, despite its size, people keep studying Linux line by line. Conversely, a grasp of the kernel as a whole matters even more, or you end up like the blind men and the elephant. And anyone aiming to master ARM assembly can profitably read every assembly file in Linux, since the kernel exercises most of the ARM architecture.

   mov r1, sp    @ malloc space above stack
   add r2, sp, #0x10000 @ 64k max

Understanding these addresses takes some care; the document "About TEXTADDR, ZTEXTADDR, PAGE_OFFSET etc..." explains them clearly. The code below makes sure the decompression target does not overlap the currently running image. The two instructions above reserve the 64 KB above the stack as the decompressor's malloc area.
/*
* Check to see if we will overwrite ourselves.
*   r4 = final kernel address (the physical address where the kernel will finally run)
*   r5 = start of this image (the start address of this decompressor)
*   r2 = end of malloc space (and therefore this image)
* We basically want:
*   r4 >= r2 -> OK
*   r4 + image length <= r5 -> OK
*/
   cmp r4, r2
   bhs wont_overwrite
   add r0, r4, #4096*1024 @ 4MB largest kernel size
   cmp r0, r5
   bls wont_overwrite

If the regions would overlap, the kernel has to be decompressed just past the malloc area instead. decompress_kernel, which does the actual decompression, is written in C and is architecture-independent.

   mov r5, r2    @ decompress after malloc space
   mov r0, r5
   mov r3, r7
   bl decompress_kernel

After decompression the kernel is still not at its final address (there was not enough room), so it must be copied there. That copy may overwrite the code currently executing, so the code that still needs to run, the relocation routine, is first moved somewhere safe: right after the decompressed kernel.

   add r0, r0, #127
   bic r0, r0, #127   @ align the kernel length
/*
* r0     = decompressed kernel length
* r1-r3 = unused
* r4     = kernel execution address
* r5     = decompressed kernel start
* r6     = processor ID
* r7     = architecture ID
* r8     = atags pointer
* r9-r14 = corrupted
*/
   add r1, r5, r0   @ end of decompressed kernel
   adr r2, reloc_start
   ldr r3, LC1
   add r3, r2, r3
1:   ldmia r2!, {r9 - r14}   @ copy relocation code
   stmia r1!, {r9 - r14}
   ldmia r2!, {r9 - r14}
   stmia r1!, {r9 - r14}
   cmp r2, r3
   blo 1b

   bl cache_clean_flush // code has been moved around, so the cache must be cleaned and flushed first
   add pc, r5, r0   @ call relocation code

decompress_kernel takes four arguments: the address to decompress the kernel to, the start and end addresses of the malloc area, and the architecture ID. It returns the length of the decompressed kernel.
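Its prototype in arch/arm/boot/compressed/misc.c looks roughly like this (a sketch based on 2.6-era kernels; exact names and types may differ between versions):

    /* Sketch of the C decompressor entry point (misc.c); r0-r3 map to the
     * arguments in order, and the return value is the decompressed length. */
    unsigned long decompress_kernel(unsigned long output_start,  /* r0: where vmlinux goes   */
                                    unsigned long free_mem_ptr,  /* r1: start of malloc area */
                                    unsigned long free_mem_end,  /* r2: end of malloc area   */
                                    int arch_id);                /* r3: architecture ID      */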

/*
* We're not in danger of overwriting ourselves. Do this the simple way.
*
* r4     = kernel execution address
* r7     = architecture ID
*/
wont_overwrite: mov r0, r4
   mov r3, r7
   bl decompress_kernel
   b call_kernel

When there is no risk of overwriting ourselves, things are simple: decompress the kernel straight to its final address and branch to call_kernel, which is analysed below.

   .type LC0, #object
LC0:   .word LC0    @ r1
   .word __bss_start   @ r2
   .word _end    @ r3
   .word zreladdr   @ r4
   .word _start    @ r5
   .word _got_start   @ r6
   .word _got_end   @ ip
   .word user_stack+4096   @ sp
LC1:   .word reloc_end - reloc_start
   .size LC0, . - LC0

This is the address table mentioned above; it defines the addresses of several symbols. LC0 itself is defined here, zreladdr is defined in the Makefile of the current directory, and the remaining symbols are defined in the linker script.

Now for the cache and MMU code. It shows how the kernel developers use assembly to identify each ARM processor at run time, so that a single binary works across them all.
/*
* Turn on the cache. We need to setup some page tables so that we
* can have both the I and D caches on.
*
* We place the page tables 16k down from the kernel execution address,
* and we hope that nothing else is using it. If we're using it, we
* will go pop!
*
* On entry,
* r4 = kernel execution address
* r6 = processor ID
* r7 = architecture number
* r8 = atags pointer
* r9 = run-time address of "start" (???)
* On exit,
* r1, r2, r3, r9, r10, r12 corrupted
* This routine must preserve:
* r4, r5, r6, r7, r8
*/
   .align 5
cache_on: mov r3, #8    @ cache_on function
   b call_cache_fn

This involves a fair amount of MMU, cache, write-buffer and TLB handling through coprocessor (CP15) programming. I will not walk through every instruction; the ARM manuals cover them, and once you know how these units work the code is not hard to follow (the book 《ARM嵌入式系统开发》 gives a good explanation). Because the steps that follow involve a lot of time-consuming copying and decompression, enabling the cache is worthwhile, and using the D-cache requires configuring the MMU. To keep things simple only first-level section mappings are created, with virtual addresses equal to physical addresses (a flat 1:1 mapping); the descriptor encoding is sketched in C after the loop below.

__setup_mmu: sub r3, r4, #16384   @ Page directory size
   bic r3, r3, #0xff   @ Align the pointer
   bic r3, r3, #0x3f00
/*
* Initialise the page tables, turning on the cacheable and bufferable
* bits for the RAM area only.
*/
   mov r0, r3
   mov r9, r0, lsr #18
   mov r9, r9, lsl #18   @ start of RAM
   add r10, r9, #0x10000000 @ a reasonable RAM size
   mov r1, #0x12
   orr r1, r1, #3 << 10
   add r2, r3, #16384
1:   cmp r1, r9    @ if virt > start of RAM
   orrhs r1, r1, #0x0c   @ set cacheable, bufferable
   cmp r1, r10    @ if virt > end of RAM
   bichs r1, r1, #0x0c   @ clear cacheable, bufferable
   str r1, [r0], #4   @ 1:1 mapping
   add r1, r1, #1048576
   teq r0, r2
   bne 1b
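The magic constants used above encode ARMv4/v5 first-level section descriptors. A C sketch of the same encoding (illustrative only; see the ARM Architecture Reference Manual for the authoritative bit layout):

    /* Sketch of the section descriptor built in __setup_mmu.
     * 0x12    : bits[1:0] = 0b10 (section) plus bit 4, which must be 1 on ARMv4/v5
     * 3 << 10 : AP = 0b11, read/write at every privilege level
     * 0x0c    : C | B, cacheable and bufferable (set only for the RAM area)
     */
    static unsigned long section_desc(unsigned long phys_mb, int cacheable)
    {
            unsigned long desc = (phys_mb << 20)  /* section base address, 1 MB aligned */
                               | (3u << 10)       /* AP = 11: full access               */
                               | 0x12;            /* section type + "must be 1" bit     */

            if (cacheable)
                    desc |= 0x0c;                 /* C + B bits                         */
            return desc;
    }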

As the comment below explains, if we happen to be running from flash, another 2 MB covering the current execution address is also mapped as cacheable. If we are in fact running from RAM this only duplicates work already done above, which is harmless.

/*
* If ever we are running from Flash, then we surely want the cache
* to be enabled also for our execution instance... We map 2MB of it
* so there is no map overlap problem for up to 1 MB compressed kernel.
* If the execution is in RAM then we would only be duplicating the above.
*/
   mov r1, #0x1e
   orr r1, r1, #3 << 10
   mov r2, pc, lsr #20
   orr r1, r1, r2, lsl #20
   add r0, r3, r2, lsl #2
   str r1, [r0], #4
   add r1, r1, #1048576
   str r1, [r0]
   mov pc, lr

__armv4_cache_on:
   mov r12, lr
   bl __setup_mmu
   mov r0, #0
   mcr p15, 0, r0, c7, c10, 4 @ drain write buffer
   mcr p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
   mrc p15, 0, r0, c1, c0, 0 @ read control reg
   orr r0, r0, #0x5000   @ I-cache enable, RR cache replacement
   orr r0, r0, #0x0030
   bl __common_cache_on
   mov r0, #0
   mcr p15, 0, r0, c8, c7, 0 @ flush I,D TLBs
   mov pc, r12

__common_cache_on:
#ifndef DEBUG
   orr r0, r0, #0x000d   @ Write buffer, mmu
#endif
   mov r1, #-1
   mcr p15, 0, r3, c2, c0, 0 @ load page table pointer
   mcr p15, 0, r1, c3, c0, 0 @ load domain access control
   mcr p15, 0, r0, c1, c0, 0 @ load control register
   mov pc, lr

/*
* All code following this line is relocatable. It is relocated by
* the above code to the end of the decompressed kernel image and
* executed there. During this time, we have no stacks.
*
* r0     = decompressed kernel length
* r1-r3 = unused
* r4     = kernel execution address
* r5     = decompressed kernel start
* r6     = processor ID
* r7     = architecture ID
* r8     = atags pointer
* r9-r14 = corrupted
*/

The code below is the relocation routine that runs when there was not enough room to decompress in place; the reason was explained above.

   .align 5
reloc_start: add r9, r5, r0
   debug_reloc_start
   mov r1, r4
1:
   .rept 4
   ldmia r5!, {r0, r2, r3, r10 - r14} @ relocate kernel
   stmia r1!, {r0, r2, r3, r10 - r14}
   .endr

   cmp r5, r9
   blo 1b
   debug_reloc_end

This is the last routine; by now all of the real work is done. It cleans and turns off the cache, then jumps to the real kernel entry point with r0 = 0, r1 = the architecture ID and r2 = the atags pointer.

call_kernel: bl cache_clean_flush
   bl cache_off
   mov r0, #0    @ must be zero
   mov r1, r7    @ restore architecture number
   mov r2, r8    @ restore atags pointer
   mov pc, r4    @ call kernel

/*
* Here follow the relocatable cache support functions for the
* various processors. This is a generic hook for locating an
* entry and jumping to an instruction at the specified offset
* from the start of the block. Please note this is all position
* independent code.
*
* r1 = corrupted
* r2 = corrupted
* r3 = block offset
* r6 = corrupted
* r12 = corrupted
*/

The routine below walks the proc_types table to identify the current processor and then jumps to the handler at the offset passed in r3 (cache on, off or flush). It reads the processor ID from CP15 register c0; consult the ARM manuals if anything is unclear. A C rendering of the lookup follows the assembly.

call_cache_fn: adr r12, proc_types
   mrc p15, 0, r6, c0, c0 @ get processor ID
1:   ldr r1, [r12, #0]   @ get value
   ldr r2, [r12, #4]   @ get mask
   eor r1, r1, r6   @ (real ^ match)
   tst r1, r2    @       & mask
   addeq pc, r12, r3   @ call cache function
   add r12, r12, #4*5
   b 1b
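In C, the matching logic amounts to the following (a sketch; in the real table each entry is an ID word, a mask word and three branch instructions, and the code jumps into the entry at the offset held in r3):

    /* Sketch of the call_cache_fn lookup. */
    struct proc_type {
            unsigned long match;            /* expected CPU ID bits              */
            unsigned long mask;             /* which ID bits to compare          */
            /* the real table entry continues with three branch instructions     */
    };

    static const struct proc_type *find_proc(const struct proc_type *table,
                                             unsigned long cpu_id)
    {
            for (;; table++) {
                    if (((cpu_id ^ table->match) & table->mask) == 0)
                            return table;   /* the catch-all entry (mask 0) always matches */
            }
    }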

/*
* Table for cache operations. This is basically:
*   - CPU ID match
*   - CPU ID mask
*   - 'cache on' method instruction
*   - 'cache off' method instruction
*   - 'cache flush' method instruction
*
* We match an entry using: ((real_id ^ match) & mask) == 0
*
* Writethrough caches generally only need 'on' and 'off'
* methods. Writeback caches _must_ have the flush method
* defined.
*/
   .type proc_types,#object
proc_types:
   .word 0x41560600   @ ARM6/610
   .word 0xffffffe0
   b __arm6_cache_off @ works, but slow
   b __arm6_cache_off
   mov pc, lr
@   b __arm6_cache_on   @ untested
@   b __arm6_cache_off
@   b __armv3_cache_flush

   .word 0x00000000   @ old ARM ID
   .word 0x0000f000
   mov pc, lr
   mov pc, lr
   mov pc, lr

   .word 0x41007000   @ ARM7/710
   .word 0xfff8fe00
   b __arm7_cache_off
   b __arm7_cache_off
   mov pc, lr

   .word 0x41807200   @ ARM720T (writethrough)
   .word 0xffffff00
   b __armv4_cache_on
   b __armv4_cache_off
   mov pc, lr

   .word 0x00007000   @ ARM7 IDs
   .word 0x0000f000
   mov pc, lr
   mov pc, lr
   mov pc, lr

   @ Everything from here on will be the new ID system.

   .word 0x4401a100   @ sa110 / sa1100
   .word 0xffffffe0
   b __armv4_cache_on
   b __armv4_cache_off
   b __armv4_cache_flush

   .word 0x6901b110   @ sa1110
   .word 0xfffffff0
   b __armv4_cache_on
   b __armv4_cache_off
   b __armv4_cache_flush

   @ These match on the architecture ID

   .word 0x00020000   @ ARMv4T
   .word 0x000f0000
   b __armv4_cache_on
   b __armv4_cache_off
   b __armv4_cache_flush

   .word 0x00050000   @ ARMv5TE
   .word 0x000f0000
   b __armv4_cache_on
   b __armv4_cache_off
   b __armv4_cache_flush

   .word 0x00060000   @ ARMv5TEJ
   .word 0x000f0000
   b __armv4_cache_on
   b __armv4_cache_off
   b __armv4_cache_flush

   .word 0x00070000   @ ARMv6
   .word 0x000f0000
   b __armv4_cache_on
   b __armv4_cache_off
   b __armv6_cache_flush

   .word 0    @ unrecognised type
   .word 0
   mov pc, lr
   mov pc, lr
   mov pc, lr

   .size proc_types, . - proc_types

/*
* Turn off the Cache and MMU. ARMv3 does not support
* reading the control register, but ARMv4 does.
*
* On entry, r6 = processor ID
* On exit,   r0, r1, r2, r3, r12 corrupted
* This routine must preserve: r4, r6, r7
*/
   .align 5
cache_off: mov r3, #12    @ cache_off function
   b call_cache_fn

// code omitted

Finally, 4 KB of space is reserved here to be used as the stack.

reloc_end:

   .align
   .section ".stack", "w"
user_stack: .space 4096
