linux 多核启动

转至http://blog.chinaunix.net/uid-27411029-id-3480919.html


Linux kernel启动的过程概览
init/main.c:start_kernel()
    |
   \|/
init/main.c:rest_init
{
……
kernel_thread(kernel_init, NULL, CLONES_FS | CLONE_SIGHAND)
……
cpu_idle()
}
    |
   \|/
init/main.c:kernel_init//从上面代码可以看出,kernel_init是一个内核线程 
    |
   \|/
init/main.c:init_post  //会在最后调用启动脚本
{
……
823         /*
824          * We try each of these until one succeeds.
825          *
826          * The Bourne shell can be used instead of init if we are
827          * trying to recover a really broken machine.
828          */
829         if (execute_command) {
830                 run_init_process(execute_command);
831                 printk(KERN_WARNING "Failed to execute %s.  Attempting "
832                                         "defaults...\n", execute_command);
833         }
834         run_init_process("/sbin/init");
835         run_init_process("/etc/init");
836         run_init_process("/bin/init");
837         run_init_process("/bin/sh");
838
839         panic("No init found.  Try passing init= option to kernel.");
……
}


我们再来看看内核启动多核的详细过程。

init/main.c:start_kernel()
    |
   \|/
init/main.c:rest_init
{
……
kernel_thread(kernel_init, NULL, CLONES_FS | CLONE_SIGHAND)
……
}
    |
   \|/
kernel_init    
    |
   \|/
/* called by boot processor to activate the rest */
init/main.c: smp_init()
{
……
for_each_present_cpu(cpu) {
          if (num_onlien_cpus() >= setup_max_cpus)
               break;
          if ( !cpu_online(cpu))     
               cpu_up(cpu);
}
/* Any cleanup work */
printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
smp_cpu_done(setup_max_cpus);
……
}
--------------------------------------------------------------
cpu_up = native_cpu_up是一个回调函数。
注册地方是在:arch/x86/kernel/smp.c

struct smp_ops smp_ops = {
   ……
  .cpu_up = native_cpu_up,
   ……
}
--------------------------------------------------------------
    |
   \|/
arch/x86/kernel/smpboot.c:native_cpu_up(unsigned int cpu)
    |
   \|/
arch/x86/kernel/smpboot.c: do_boot_cpu(int apicid, int cpu)
    |
   \|/
wakeup_secondary_cpu_via_init(apicid, start_ip)


在启动多核的过程中有两个bitmap很重要,一个是cpu_callin_mask,另一个是cpu_callout_mask。
cpu_callin_mask代表某个cpu是否已经启动,它的某个bit被与之对应的cpu在启动后置位,标记已经启动。
cpu_callout_mask在do_boot_cpu中被置位,并在检查到对应cpu已经启动后重新清零。

我们下面来详细看看do_boot_cpu(int apicid, int cpu)与wakeup_secondary_cpu_via_init(apicid, start_ip)

C代码   收藏代码
  1. /* 
  2.  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 
  3.  * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 
  4.  * Returns zero if CPU booted OK, else error code from 
  5.  * ->wakeup_secondary_cpu. 
  6.  */  
  7. static int __cpuinit do_boot_cpu(int apicid, int cpu)  
  8. {  
  9.     unsigned long boot_error = 0;  
  10.     unsigned long start_ip;  
  11.     int timeout;  
  12.     struct create_idle c_idle = {  
  13.         .cpu    = cpu,  
  14.         .done   = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),  
  15.     };  
  16.   
  17.     INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);  
  18.   
  19.     alternatives_smp_switch(1);  
  20.   
  21.     c_idle.idle = get_idle_for_cpu(cpu);  
  22.   
  23.     /* 
  24.      * We can't use kernel_thread since we must avoid to 
  25.      * reschedule the child. 
  26.      */  
  27.     if (c_idle.idle) {  
  28.         c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)  
  29.             (THREAD_SIZE +  task_stack_page(c_idle.idle))) - 1);  
  30.         init_idle(c_idle.idle, cpu);  
  31.         goto do_rest;  
  32.     }  
  33.   
  34.     if (!keventd_up() || current_is_keventd())  
  35.         c_idle.work.func(&c_idle.work);  
  36.     else {  
  37.         schedule_work(&c_idle.work);  
  38.         wait_for_completion(&c_idle.done);  
  39.     }  
  40.   
  41.     if (IS_ERR(c_idle.idle)) {  
  42.         printk("failed fork for CPU %d\n", cpu);  
  43.         destroy_work_on_stack(&c_idle.work);  
  44.         return PTR_ERR(c_idle.idle);  
  45.     }  
  46.   
  47.     set_idle_for_cpu(cpu, c_idle.idle);  
  48. do_rest:  
  49.     per_cpu(current_task, cpu) = c_idle.idle;  
  50. #ifdef CONFIG_X86_32  
  51.     /* Stack for startup_32 can be just as for start_secondary onwards */  
  52.     irq_ctx_init(cpu);  
  53. #else  
  54.     clear_tsk_thread_flag(c_idle.idle, TIF_FORK);  
  55.     initial_gs = per_cpu_offset(cpu);  
  56.     per_cpu(kernel_stack, cpu) =  
  57.         (unsigned long)task_stack_page(c_idle.idle) -  
  58.         KERNEL_STACK_OFFSET + THREAD_SIZE;  
  59. #endif  
  60.     early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);  
  61.     initial_code = (unsigned long)start_secondary;  
  62.     stack_start.sp = (void *) c_idle.idle->thread.sp;  
  63.   
  64.     /* start_ip had better be page-aligned! */  
  65.     start_ip = setup_trampoline();  
  66.   
  67.     /* So we see what's up */  
  68.     announce_cpu(cpu, apicid);  
  69.   
  70.     /* 
  71.      * This grunge runs the startup process for 
  72.      * the targeted processor. 
  73.      */  
  74.   
  75.     atomic_set(&init_deasserted, 0);  
  76.   
  77.     if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {  
  78.   
  79.         pr_debug("Setting warm reset code and vector.\n");  
  80.   
  81.         smpboot_setup_warm_reset_vector(start_ip);  
  82.         /* 
  83.          * Be paranoid about clearing APIC errors. 
  84.         */  
  85.         if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {  
  86.             apic_write(APIC_ESR, 0);  
  87.             apic_read(APIC_ESR);  
  88.         }  
  89.     }  
  90.   
  91.     /* 
  92.      * Kick the secondary CPU. Use the method in the APIC driver 
  93.      * if it's defined - or use an INIT boot APIC message otherwise: 
  94.      */  
  95.     if (apic->wakeup_secondary_cpu)  
  96.         boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);  
  97.     else  
  98.         boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);  
  99.   
  100.     if (!boot_error) {  
  101.         /* 
  102.          * allow APs to start initializing. 
  103.          */  
  104.         pr_debug("Before Callout %d.\n", cpu);  
  105.         cpumask_set_cpu(cpu, cpu_callout_mask);  
  106.         pr_debug("After Callout %d.\n", cpu);  
  107.   
  108.         /* 
  109.          * Wait 5s total for a response 
  110.          */  
  111.         for (timeout = 0; timeout < 50000; timeout++) {  
  112.             if (cpumask_test_cpu(cpu, cpu_callin_mask))  
  113.                 break;  /* It has booted */  
  114.             udelay(100);  
  115.         }  
  116.   
  117.         if (cpumask_test_cpu(cpu, cpu_callin_mask))  
  118.             pr_debug("CPU%d: has booted.\n", cpu);  
  119.         else {  
  120.             boot_error = 1;  
  121.             if (*((volatile unsigned char *)trampoline_base)  
  122.                     == 0xA5)  
  123.                 /* trampoline started but...? */  
  124.                 pr_err("CPU%d: Stuck ??\n", cpu);  
  125.             else  
  126.                 /* trampoline code not run */  
  127.                 pr_err("CPU%d: Not responding.\n", cpu);  
  128.             if (apic->inquire_remote_apic)  
  129.                 apic->inquire_remote_apic(apicid);  
  130.         }  
  131.     }  
  132.   
  133.     if (boot_error) {  
  134.         /* Try to put things back the way they were before ... */  
  135.         numa_remove_cpu(cpu); /* was set by numa_add_cpu */  
  136.   
  137.         /* was set by do_boot_cpu() */  
  138.         cpumask_clear_cpu(cpu, cpu_callout_mask);  
  139.   
  140.         /* was set by cpu_init() */  
  141.         cpumask_clear_cpu(cpu, cpu_initialized_mask);  
  142.   
  143.         set_cpu_present(cpu, false);  
  144.         per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;  
  145.     }  
  146.   
  147.     /* mark "stuck" area as not stuck */  
  148.     *((volatile unsigned long *)trampoline_base) = 0;  
  149.   
  150.     if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {  
  151.         /* 
  152.          * Cleanup possible dangling ends... 
  153.          */  
  154.         smpboot_restore_warm_reset_vector();  
  155.     }  
  156.   
  157.     destroy_work_on_stack(&c_idle.work);  
  158.     return boot_error;  
  159. }  



C代码   收藏代码
  1. /* 
  2.  * Currently trivial. Write the real->protected mode 
  3.  * bootstrap into the page concerned. The caller 
  4.  * has made sure it's suitably aligned. 
  5.  */  
  6. unsigned long __trampinit setup_trampoline(void)  
  7. {  
  8.         memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);  
  9.         return virt_to_phys(trampoline_base);  
  10. }  


可以从上面代码中看出do_boot_cpu会为编号为apicid的AP设定好它将要使用的stack以及它将要执行的代码start_eip,在完成这些后,通过发送IPI序列来启动AP,
并会将cpu_callout_mask的代表相应AP的位清零。


C代码   收藏代码
  1. static int __cpuinit  
  2. wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)  
  3. {  
  4.     unsigned long send_status, accept_status = 0;  
  5.     int maxlvt, num_starts, j;  
  6.   
  7.     maxlvt = lapic_get_maxlvt();  
  8.   
  9.     /* 
  10.      * Be paranoid about clearing APIC errors. 
  11.      */  
  12.     if (APIC_INTEGRATED(apic_version[phys_apicid])) {  
  13.         if (maxlvt > 3)      /* Due to the Pentium erratum 3AP.  */  
  14.             apic_write(APIC_ESR, 0);  
  15.         apic_read(APIC_ESR);  
  16.     }  
  17.   
  18.     pr_debug("Asserting INIT.\n");  
  19.   
  20.     /* 
  21.      * Turn INIT on target chip 
  22.      */  
  23.     /* 
  24.      * Send IPI 
  25.      */  
  26.     apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,  
  27.                phys_apicid);  
  28.   
  29.     pr_debug("Waiting for send to finish...\n");  
  30.     send_status = safe_apic_wait_icr_idle();  
  31.   
  32.     mdelay(10);  
  33.   
  34.     pr_debug("Deasserting INIT.\n");  
  35.   
  36.     /* Target chip */  
  37.     /* Send IPI */  
  38.     apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);  
  39.   
  40.     pr_debug("Waiting for send to finish...\n");  
  41.     send_status = safe_apic_wait_icr_idle();  
  42.   
  43.     mb();  
  44.     atomic_set(&init_deasserted, 1);  
  45.   
  46.     /* 
  47.      * Should we send STARTUP IPIs ? 
  48.      * 
  49.      * Determine this based on the APIC version. 
  50.      * If we don't have an integrated APIC, don't send the STARTUP IPIs. 
  51.      */  
  52.     if (APIC_INTEGRATED(apic_version[phys_apicid]))  
  53.         num_starts = 2;  
  54.     else  
  55.         num_starts = 0;  
  56.   
  57.     /* 
  58.      * Paravirt / VMI wants a startup IPI hook here to set up the 
  59.      * target processor state. 
  60.      */  
  61.     startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,  
  62.              (unsigned long)stack_start.sp);  
  63.   
  64.     /* 
  65.      * Run STARTUP IPI loop. 
  66.      */  
  67.     pr_debug("#startup loops: %d.\n", num_starts);  
  68.   
  69.     for (j = 1; j <= num_starts; j++) {  
  70.         pr_debug("Sending STARTUP #%d.\n", j);  
  71.         if (maxlvt > 3)      /* Due to the Pentium erratum 3AP.  */  
  72.             apic_write(APIC_ESR, 0);  
  73.         apic_read(APIC_ESR);  
  74.         pr_debug("After apic_write.\n");  
  75.   
  76.         /* 
  77.          * STARTUP IPI 
  78.          */  
  79.   
  80.         /* Target chip */  
  81.         /* Boot on the stack */  
  82.         /* Kick the second */  
  83.         apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),  
  84.                    phys_apicid);  
  85.   
  86.         /* 
  87.          * Give the other CPU some time to accept the IPI. 
  88.          */  
  89.         udelay(300);  
  90.   
  91.         pr_debug("Startup point 1.\n");  
  92.   
  93.         pr_debug("Waiting for send to finish...\n");  
  94.         send_status = safe_apic_wait_icr_idle();  
  95.   
  96.         /* 
  97.          * Give the other CPU some time to accept the IPI. 
  98.          */  
  99.         udelay(200);  
  100.         if (maxlvt > 3)      /* Due to the Pentium erratum 3AP.  */  
  101.             apic_write(APIC_ESR, 0);  
  102.         accept_status = (apic_read(APIC_ESR) & 0xEF);  
  103.         if (send_status || accept_status)  
  104.             break;  
  105.     }  
  106.     pr_debug("After Startup.\n");  
  107.   
  108.     if (send_status)  
  109.         printk(KERN_ERR "APIC never delivered???\n");  
  110.     if (accept_status)  
  111.         printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);  
  112.   
  113.     return (send_status | accept_status);  
  114. }  



一段wakeup_secondary_cpu_via_init执行的log
C代码   收藏代码
  1. 656 CPU17: has booted.  
  2. 657 WP output: cpu :18  
  3. 658 ------native_cpu_up cpu:18, apicid:18----------  
  4. 659 ------------in 3 do_boot_cpu------- #18  
  5. 660 Asserting INIT.  
  6. 661 Waiting for send to finish...  
  7. 662 Deasserting INIT.  
  8. 663 Waiting for send to finish...  
  9. 664 #startup loops: 2.  
  10. 665 Sending STARTUP #1.  
  11. 666 After apic_write.  
  12. 667 Startup point 1.  
  13. 668 Waiting for send to finish...  
  14. 669 Sending STARTUP #2.  
  15. 670 After apic_write.  
  16. 671 Startup point 1.  
  17. 672 Waiting for send to finish...  
  18. 673 in the cpu_init())  
  19. 674 After Startup.  
  20. 675 Before Callout 18.  
  21. 676 After Callout 18.  
  22. 677 cpu is: 12  
  23. 678 in the enable_x2apic()  
  24. 679 ------in x2apic_phys_get_apic_id-----  
  25. 680 CPU#18 (phys ID: 18) waiting for CALLOUT  
  26. 681 CALLIN, before setup_local_APIC().  
  27. 682 ------3------  
  28. 683 Stack at about ffff88021f953f44  
  29. 684 ------in x2apic_phys_get_apic_id-----  
  30. 685 CPU18: has booted.  

wakeup_secondary_cpu_via_init是与硬件相关的代码,它的主要作用是通过发送INIT-INIT-Startup IPI序列来将AP从halted的状态唤醒并让它开始执行代码start_eip所指向的代码。
Startup IPI会有一个域来指定需要执行代码的地址:apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);
如果想彻底搞清楚一段代码,请去看Intel文档。


start_secondary是AP会执行的代码,这段代码通过smp_callin来将设定cpu_callin_mask来告诉BSP它已经启动。start_secondary最后是idle循环。
C代码   收藏代码
  1. /* 
  2.  * Activate a secondary processor. 
  3.  */  
  4. notrace static void __cpuinit start_secondary(void *unused)  
  5. {  
  6.     /* 
  7.      * Don't put *anything* before cpu_init(), SMP booting is too 
  8.      * fragile that we want to limit the things done here to the 
  9.      * most necessary things. 
  10.      */  
  11.     vmi_bringup();  
  12.     cpu_init();  
  13.     preempt_disable();  
  14.     smp_callin();  
  15.   
  16.     /* otherwise gcc will move up smp_processor_id before the cpu_init */  
  17.     barrier();  
  18.     /* 
  19.      * Check TSC synchronization with the BP: 
  20.      */  
  21.     check_tsc_sync_target();  
  22.   
  23.     if (nmi_watchdog == NMI_IO_APIC) {  
  24.         disable_8259A_irq(0);  
  25.         enable_NMI_through_LVT0();  
  26.         enable_8259A_irq(0);  
  27.     }  
  28.   
  29. #ifdef CONFIG_X86_32  
  30.     while (low_mappings)  
  31.         cpu_relax();  
  32.     __flush_tlb_all();  
  33. #endif  
  34.   
  35.     /* This must be done before setting cpu_online_mask */  
  36.     set_cpu_sibling_map(raw_smp_processor_id());  
  37.     wmb();  
  38.   
  39.     /* 
  40.      * We need to hold call_lock, so there is no inconsistency 
  41.      * between the time smp_call_function() determines number of 
  42.      * IPI recipients, and the time when the determination is made 
  43.      * for which cpus receive the IPI. Holding this 
  44.      * lock helps us to not include this cpu in a currently in progress 
  45.      * smp_call_function(). 
  46.      * 
  47.      * We need to hold vector_lock so there the set of online cpus 
  48.      * does not change while we are assigning vectors to cpus.  Holding 
  49.      * this lock ensures we don't half assign or remove an irq from a cpu. 
  50.      */  
  51.     ipi_call_lock();  
  52.     lock_vector_lock();  
  53.     __setup_vector_irq(smp_processor_id());  
  54.     set_cpu_online(smp_processor_id(), true);  
  55.     unlock_vector_lock();  
  56.     ipi_call_unlock();  
  57.     per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;  
  58.   
  59.     /* enable local interrupts */  
  60.     local_irq_enable();  
  61.   
  62.     x86_cpuinit.setup_percpu_clockev();  
  63.   
  64.     wmb();  
  65.     cpu_idle();  



[置顶] ARM多核处理器启动过程分析

转至:http://blog.csdn.net/qianlong4526888/article/details/27695173


linux 多核启动_第1张图片



说明:

该流程图按照代码执行时间顺序划分为4部分:

1.     Bootloader在图片上半部,最先启动;

2.     Kernel在图片下半部,由bootloader引导启动;

3.CPU0执行流程在图片左半部,bootloader代码会进行判断,先行启动CPU0;

4.  Secondary CPUs在图片右半部,由CPU唤醒

 

具体启动流程如下:

1.     在bootloader启动时,会判断执行代码的是否为CPU0,如果不是,则执行wfe等待CPU0发出sev指令唤醒。如果是CPU0,则继续进行初始化工作。

 

         mrs  x4,mpidr_el1

         tst    x4,#15             //testwether the current cpu is CPU0, ie. mpidr_el1=15

         b.eq 2f

/*

 * Secondary CPUs

 */

1: wfe

ldr x4, mbox               

cbz x4, 1b        //if x4==0(ie. The value in address of mbox is 0) dead loop,or jump to x4

br x4 // branch to thegiven address

 

2:……        //UART initialisation (38400 8N1)

 

以上mbox的地址在Makefile中写定,是0x8000fff8,该地址处初始状态内容为全0。上面代码判断,若mbox地址处内容为0,则死循环;如果不为0则直接跳转到该地址所包含内容处执行。

 

2.     在dts中,对cpu-release-addr进行赋值,将其地址设为0x8000fff8。即只要往该地址写入相应的值,例如地址A,并且发送sev指令,就能将次级CPU唤醒,并跳转到A地址处执行。

cpu-release-addr = <0x0 0x8000fff8>; 

 

3. 内核中smp_prepare_cpus 函数对0x8000fff8地址处内容进行了赋值,其值为函数secondary_holding_pen 的地址:

release_addr = __va(cpu_release_addr[cpu]);

release_addr[0] = (void*)__pa(secondary_holding_pen);//write function address to mbox

 

以上代码执行完后发送sev指令,唤醒其他次级CPU执行secondary_holding_pen函数:

/*

 * Send an event to wake up the secondaries.

 */

sev();

 

4. secondary cpu 执行secondary_holding_pen()函数时都会去判断当前CPU的ID,并与secondary_holding_pen_release变量做比对,如果相等,则执行进一步初始化,否则执行WFE等待;

secondary_holding_pen_release变量的修改过程由CPU0调用smp_init()函数进行。该函数首先为相应CPU绑定一个idle线程,然后修改secondary_holding_pen_release的值(其值即CPU0欲唤醒的CPU的ID),最后发送sev指令,唤醒相应CPU执行idle线程。

 

secondary_holding_pen()函数代码如下:

         /*

          * This provides a"holding pen" for platforms to hold all secondary

          * cores are helduntil we're ready for them to initialise.

          */

ENTRY(secondary_holding_pen)

         bl      el2_setup                          // Drop to EL1

         mrs  x0, mpidr_el1

         and  x0, x0, #15                        // CPU number

         adr   x1, 1b

         ldp   x2, x3, [x1]

         sub   x1, x1, x2

         add  x3, x3, x1

pen: ldr    x4, [x3]

         cmp x4,x0

         b.eq secondary_startup

         wfe

         b       pen

ENDPROC(secondary_holding_pen)

 

 

附录:

内核中启动secondary cpus函数调用过程大致如下:

start_kernel èrest_initèkernel_inièkernel_init_freeable èsmp_init()  kernel/smp.c line 649, 由CPU0激活剩余的处理器

cpu_upè_cpu_up()è__cpu_up ()èboot_secondary ()èwrite_pen_release该函数中有一句:secondary_holding_pen_release = val; 然后发送sev指令,激活剩余处理器。

 

 

 

 

linux SMP多核启动分析   

 

startup_32:

    cld       //决定内存地址的增长方向DF = xx ,STD对立

    cli       //禁止中断

    movl   $(KERNEL_DS),%eax

    mov    %ax,%ds

    mov    %ax,%es

    mov    %ax,%fs

mov    %ax,%gs

 

#ifdef __SMP__

    orw    %bx,%bx         # What state are we in BX=1 for SMP

# 0 for boot

jz  2f                      # Initial boot

 

//根据bx值指示是主cpu(bx=0)还是次cpu(bx=1)

//然后会有不同的执行路径

/*

 *  We are trampolining an SMP processor

 *//这里是其他次cpu执行路径

 

    mov    %ax,%ss

    xorl    %eax,%eax          # Back to 0

    mov    %cx,%ax           # SP low 16 bits

    movl   %eax,%esp

    pushl   0                   # Clear NT

    popfl

    ljmp $(KERNEL_CS), $0x100000    # Into C and sanity

 

2://这里是主cpu的执行路径

#endif

 

    lss SYMBOL_NAME(stack_start),%esp

    xorl %eax,%eax

1:  incl %eax       # check that A20 really IS enabled

    movl %eax,0x000000  # loop forever if it isn't

    cmpl %eax,0x100000

    je 1b

/*

 * Initialize eflags.  Some BIOS's leave bits like NT set.  This would

 * confuse the debugger if this code is traced.

 * XXX - best to initialize before switching to protected mode.

 */

    pushl $0

    popfl

/*

 * Clear BSS

 */

    xorl %eax,%eax

    movl $ SYMBOL_NAME(_edata),%edi

    movl $ SYMBOL_NAME(_end),%ecx

    subl %edi,%ecx

    cld

    rep

    stosb

/*

 * Do the decompression, and jump to the new kernel..

 */

    subl $16,%esp   # place for structure on the stack

    pushl %esp  # address of structure as first arg

    call SYMBOL_NAME(decompress_kernel)

    orl  %eax,%eax

    jnz  3f

    xorl %ebx,%ebx

    ljmp $(KERNEL_CS), $0x100000

 

 

 

 

ljmp $(KERNEL_CS), $0x100000

这个其实就是跳到start_kernel函数。

 

asmlinkage void start_kernel(void)

{

    char * command_line;

 

/*

 *  This little check will move.

 */

 

#ifdef __SMP__

    static int first_cpu=1;

    //这个不是函数局部变量,是函数静态变量,主cpu执行这个函数时复位为1,其他cpu0,因为主cpu总是第一个执行这个函数的。

    if(!first_cpu)

        start_secondary();

//对于

    first_cpu=0;

   

#endif 

/*

 * Interrupts are still disabled. Do necessary setups, then

 * enable them

 */

    setup_arch(&command_line, &memory_start, &memory_end);

    memory_start = paging_init(memory_start,memory_end);

    trap_init();

    init_IRQ();

    sched_init();

    time_init();

    parse_options(command_line);

#ifdef CONFIG_MODULES

    init_modules();

#endif

#ifdef CONFIG_PROFILE

    if (!prof_shift)

#ifdef CONFIG_PROFILE_SHIFT

        prof_shift = CONFIG_PROFILE_SHIFT;

#else

        prof_shift = 2;

#endif

#endif

    if (prof_shift) {

        prof_buffer = (unsigned int *) memory_start;

        /* only text is profiled */

        prof_len = (unsigned long) &_etext - (unsigned long) &_stext;

        prof_len >>= prof_shift;

        memory_start += prof_len * sizeof(unsigned int);

    }

    memory_start = console_init(memory_start,memory_end);

#ifdef CONFIG_PCI

    memory_start = pci_init(memory_start,memory_end);

#endif

    memory_start = kmalloc_init(memory_start,memory_end);

    sti();

    calibrate_delay();

    memory_start = inode_init(memory_start,memory_end);

    memory_start = file_table_init(memory_start,memory_end);

    memory_start = name_cache_init(memory_start,memory_end);

#ifdef CONFIG_BLK_DEV_INITRD

    if (initrd_start && initrd_start < memory_start) {

        printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "

            "disabling it.\n",initrd_start,memory_start);

        initrd_start = 0;

    }

#endif

    mem_init(memory_start,memory_end);

    buffer_init();

    sock_init();

#if defined(CONFIG_SYSVIPC) || defined(CONFIG_KERNELD)

    ipc_init();

#endif

    dquot_init();

    arch_syms_export();

    sti();

    check_bugs();

 

    printk(linux_banner);

#ifdef __SMP__

    smp_init();

#endif

    sysctl_init();

    /*

     *  We count on the initial thread going ok

     *  Like idlers init is an unlocked kernel thread, which will

     *  make syscalls (and thus be locked).

     */

    kernel_thread(init, NULL, 0);

/*

 * task[0] is meant to be used as an "idle" task: it may not sleep, but

 * it might do some general things like count free pages or it could be

 * used to implement a reasonable LRU algorithm for the paging routines:

 * anything that can be useful, but shouldn't take time from the real

 * processes.

 *

 * Right now task[0] just does a infinite idle loop.

 */

    cpu_idle(NULL);

}

 

 

 

asmlinkage void start_secondary(void)

{

    trap_init();

    init_IRQ();

//初始化自己的irq

    smp_callin();

//这个等待主cpu给大家发送开始信号

    cpu_idle(NULL);

//这个是ide进程。

}

void smp_callin(void)

{

    extern void calibrate_delay(void);

    int cpuid=GET_APIC_ID(apic_read(APIC_ID));

    unsigned long l;

   

    /*

     *  Activate our APIC

     */

     

    SMP_PRINTK(("CALLIN %d\n",smp_processor_id()));

    l=apic_read(APIC_SPIV);

    l|=(1<<8);      /* Enable */

    apic_write(APIC_SPIV,l);

    sti();

    /*

     *  Get our bogomips.

     */

    calibrate_delay();

    /*

     *  Save our processor parameters

     */

    smp_store_cpu_info(cpuid);

    /*

     *  Allow the master to continue.

     */

    set_bit(cpuid, (unsigned long *)&cpu_callin_map[0]);

    /*

     *  Until we are ready for SMP scheduling

     */

    load_ldt(0);

/*  printk("Testing faulting...\n");

    *(long *)0=1;       OOPS... */

    local_flush_tlb();

    while(!smp_commenced);

//这个可以看成是自旋锁,等待主cpusmp_commenced信号即开始信号。

    if (cpu_number_map[cpuid] == -1)

        while(1);

    local_flush_tlb();

    SMP_PRINTK(("Commenced..\n"));

   

    load_TR(cpu_number_map[cpuid]);

/*  while(1);*/

}

int cpu_idle(void *unused)

{

    for(;;)

        idle();

}

cpu给各次cpu发开始信号是在init函数中调用smp_begin函数:

static void smp_begin(){

smp_threads_ready=1;

smp_commence();

//这个会通过IPI给各个次cpu发送相关中断来通信

}

每个cpu有一个current指针。

刚开始的时候由主cpu赋值为init_task;

在主cpu调用 sched_init赋值。

void sched_init(void)

{

    /*

     *  We have to do a little magic to get the first

     *  process right in SMP mode.

     */

    int cpu=smp_processor_id();//这个为0,因为是主cpu才调用。

#ifndef __SMP__

    current_set[cpu]=&init_task;

#else

    init_task.processor=cpu;

//这个是将init_task标志为主cpu在运行。

    for(cpu = 0; cpu < NR_CPUS; cpu++)

        current_set[cpu] = &init_task;

#endif

    init_bh(TIMER_BH, timer_bh);

    init_bh(TQUEUE_BH, tqueue_bh);

    init_bh(IMMEDIATE_BH, immediate_bh);

}

同时这些还会在 smp_init丰富。

static void smp_init(void)

{

    int i, j;

    smp_boot_cpus();

   

    /*

     *  Create the slave init tasks as sharing pid 0.

     *

     *  This should only happen if we have virtual CPU numbers

     *  higher than 0.

     */

 

    for (i=1; i

    {

        struct task_struct *n, *p;

 

        j = cpu_logical_map[i];

        /*

         *  We use kernel_thread for the idlers which are

         *  unlocked tasks running in kernel space.

         */

        kernel_thread(cpu_idle, NULL, CLONE_PID);

//这个其实就是创建线程然后这个线程体现在task[i]上了,因为创建的时候的task_struct就是从taski]取的。

        /*

         *  Don't assume linear processor numbering

         */

        current_set[j]=task[i];

        current_set[j]->processor=j;

        cli();

        n = task[i]->next_run;

        p = task[i]->prev_run;

        nr_running--;

        n->prev_run = p;

        p->next_run = n;

        task[i]->next_run = task[i]->prev_run = task[i];

        sti();

    }

}

上面执行完后就给每个cpu加了一个idle任务。

 

然后kernel_thread(init, NULL, 0)创建的init任务。

 

//每个cpu在时间中断时都可能调用这个共同的函数。

asmlinkage void schedule(void)

{

    int c;

    struct task_struct * p;

    struct task_struct * prev, * next;

    unsigned long timeout = 0;

    int this_cpu=smp_processor_id();

//获取cpu_id;

/* check alarm, wake up any interruptible tasks that have got a signal */

 

    if (intr_count)

        goto scheduling_in_interrupt;

 

    if (bh_active & bh_mask) {

        intr_count = 1;

        do_bottom_half();

        intr_count = 0;

    }

 

    run_task_queue(&tq_scheduler);

 

    need_resched = 0;

    prev = current;

    cli();

    /* move an exhausted RR process to be last.. */

    if (!prev->counter && prev->policy == SCHED_RR) {

        prev->counter = prev->priority;

        move_last_runqueue(prev);

    }

    switch (prev->state) {

        case TASK_INTERRUPTIBLE:

            if (prev->signal & ~prev->blocked)

                goto makerunnable;

            timeout = prev->timeout;

            if (timeout && (timeout <= jiffies)) {

                prev->timeout = 0;

                timeout = 0;

        makerunnable:

                prev->state = TASK_RUNNING;

                break;

            }

        default:

            del_from_runqueue(prev);

        case TASK_RUNNING:

    }

    p = init_task.next_run;

//获取进程双向链表的一个节点。

    sti();

   

#ifdef __SMP__

    /*

     *  This is safe as we do not permit re-entry of schedule()

     */

    prev->processor = NO_PROC_ID;

#define idle_task (task[cpu_number_map[this_cpu]])

#else

#define idle_task (&init_task)

#endif 

 

/*

 * Note! there may appear new tasks on the run-queue during this, as

 * interrupts are enabled. However, they will be put on front of the

 * list, so our list starting at "p" is essentially fixed.

 */

/* this is the scheduler proper: */

    c = -1000;

    next = idle_task;

    while (p != &init_task) {

//p初始值为init_task.next_run

//当回到init_task时说明已经查找为所有的了。

        int weight = goodness(p, prev, this_cpu);

        if (weight > c)

            c = weight, next = p;

        p = p->next_run;

    }

//这个是查找所有的task,找出最合适的task来调度。

    /* if all runnable processes have "counter == 0", re-calculate counters */

    if (!c) {

        for_each_task(p)

            p->counter = (p->counter >> 1) + p->priority;

    }

#ifdef __SMP__

    /*

     *  Allocate process to CPU

     */

     

     next->processor = this_cpu;

//将这个将要被执行的processor标识为这个cpu

     next->last_processor = this_cpu;

#endif 

#ifdef __SMP_PROF__

    /* mark processor running an idle thread */

    if (0==next->pid)

        set_bit(this_cpu,&smp_idle_map);

    else

        clear_bit(this_cpu,&smp_idle_map);

#endif

    if (prev != next) {

        struct timer_list timer;

 

        kstat.context_swtch++;

        if (timeout) {

            init_timer(&timer);

            timer.expires = timeout;

            timer.data = (unsigned long) prev;

            timer.function = process_timeout;

            add_timer(&timer);

        }

        get_mmu_context(next);

        switch_to(prev,next);

        if (timeout)

            del_timer(&timer);

    }

    return;

 

scheduling_in_interrupt:

    printk("Aiee: scheduling in interrupt %p\n",

        __builtin_return_address(0));

}

上面需要注意的是current变量,在单核中肯定就是一个变量,在多核中肯定是各个cpu有自己的current

其定义如下:

#define current (0+current_set[smp_processor_id()]

smpcurrentcurrent_set数组中的一个元素,是指具体一个cpu的当前进程。

从上面可以看出一个cpu是从全局task找一个task来运行,每个cpu有一个idle_task,这个task的编号是固定的。

 

所有的task可以通过init_task来找到,因为创建新进程(内核线程)的时候,会将新建的挂到链表上。

init_task是静态挂在这上面的。

附上task_struct:

struct task_struct {

/* these are hardcoded - don't touch */

    volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */

    long counter;

    long priority;

    unsigned long signal;

    unsigned long blocked;  /* bitmap of masked signals */

    unsigned long flags;    /* per process flags, defined below */

    int errno;

    long debugreg[8];  /* Hardware debugging registers */

    struct exec_domain *exec_domain;

/* various fields */

    struct linux_binfmt *binfmt;

    struct task_struct *next_task, *prev_task;

    struct task_struct *next_run,  *prev_run;

    unsigned long saved_kernel_stack;

    unsigned long kernel_stack_page;

    int exit_code, exit_signal;

    /* ??? */

    unsigned long personality;

    int dumpable:1;

    int did_exec:1;

    /* shouldn't this be pid_t? */

    int pid;

    int pgrp;

    int tty_old_pgrp;

    int session;

    /* boolean value for session group leader */

    int leader;

    int groups[NGROUPS];

    /*

     * pointers to (original) parent process, youngest child, younger sibling,

     * older sibling, respectively.  (p->father can be replaced with

     * p->p_pptr->pid)

     */

    struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;

    struct wait_queue *wait_chldexit;   /* for wait4() */

    unsigned short uid,euid,suid,fsuid;

    unsigned short gid,egid,sgid,fsgid;

    unsigned long timeout, policy, rt_priority;

    unsigned long it_real_value, it_prof_value, it_virt_value;

    unsigned long it_real_incr, it_prof_incr, it_virt_incr;

    struct timer_list real_timer;

    long utime, stime, cutime, cstime, start_time;

/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */

    unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;

    int swappable:1;

    unsigned long swap_address;

    unsigned long old_maj_flt;  /* old value of maj_flt */

    unsigned long dec_flt;      /* page fault count of the last time */

    unsigned long swap_cnt;     /* number of pages to swap on next pass */

/* limits */

    struct rlimit rlim[RLIM_NLIMITS];

    unsigned short used_math;

    char comm[16];

/* file system info */

    int link_count;

    struct tty_struct *tty; /* NULL if no tty */

/* ipc stuff */

    struct sem_undo *semundo;

    struct sem_queue *semsleeping;

/* ldt for this task - used by Wine.  If NULL, default_ldt is used */

    struct desc_struct *ldt;

/* tss for this task */

    struct thread_struct tss;

/* filesystem information */

    struct fs_struct *fs;

/* open file information */

    struct files_struct *files;

/* memory management info */

    struct mm_struct *mm;

/* signal handlers */

    struct signal_struct *sig;

#ifdef __SMP__

    int processor;

    int last_processor;

    int lock_depth;     /* Lock depth. We can context switch in and out of holding a syscall kernel lock... */ 

#endif 

};

 

故这个p = init_task.next_run;

p可以获取到所有在就绪状态的task;

 

你可能感兴趣的:(LINUX)