Reposted from: http://blog.csdn.net/qianlong4526888/article/details/27695173
Notes:
The flow chart is divided into four parts, ordered by execution time:
1. The bootloader is in the upper half of the figure and runs first;
2. The kernel is in the lower half and is brought up by the bootloader;
3. The CPU0 path is on the left: the bootloader checks which CPU it is running on and lets CPU0 go first;
4. The secondary CPUs are on the right and are woken up by CPU0.
The boot flow in detail:
1. When the bootloader starts, it checks whether the code is running on CPU0. If not, the CPU executes WFE and waits until CPU0 issues a SEV to wake it. If it is CPU0, it carries on with the initialisation work.
mrs x4, mpidr_el1
tst x4, #15 // test whether the current CPU is CPU0, i.e. the low affinity bits of mpidr_el1 are all 0
b.eq 2f
/*
 * Secondary CPUs
 */
1: wfe
ldr x4, mbox
cbz x4, 1b // if x4 == 0 (the word at mbox is still 0), keep waiting; otherwise fall through
br x4 // branch to the given address
2: ...... // UART initialisation (38400 8N1)
The address of mbox is fixed in the Makefile as 0x8000fff8, and the memory there initially holds all zeros. The code above checks the word at mbox: while it is 0 the secondary CPU keeps looping; once it becomes non-zero, the CPU branches to the address it contains and continues execution there.
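For readers who prefer C, here is a minimal sketch (not the actual boot-wrapper code) of what the secondary-CPU loop above does; the mailbox address and the WFE/SEV pairing are from the text, while the function and type names are made up for illustration:

typedef void (*entry_fn)(void);

static void secondary_wait(void)
{
    /* mailbox address fixed in the Makefile */
    volatile unsigned long *mbox = (unsigned long *)0x8000fff8;

    while (*mbox == 0)          /* nothing has been published yet */
        asm volatile("wfe");    /* sleep until someone issues SEV */

    ((entry_fn)*mbox)();        /* jump to the released address */
}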
2. In the dts, cpu-release-addr is set to 0x8000fff8. That means whoever writes a value, say an address A, to that location and then issues a SEV will wake a secondary CPU and make it jump to A:
cpu-release-addr = <0x0 0x8000fff8>;
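A sketch of how the kernel picks this property up, modelled on the early arm64 SMP code; the function name and error handling here are illustrative, while of_property_read_u64() and the cpu_release_addr[] array (declared here as u64 for simplicity) match the snippet quoted in step 3:

static u64 cpu_release_addr[NR_CPUS];   /* filled in from the device tree */

static int read_release_addr(struct device_node *dn, unsigned int cpu)
{
    /* read the 64-bit "cpu-release-addr" property, i.e. 0x8000fff8 above */
    if (of_property_read_u64(dn, "cpu-release-addr", &cpu_release_addr[cpu])) {
        pr_err("CPU %d: missing or invalid cpu-release-addr property\n", cpu);
        return -ENODEV;
    }
    return 0;
}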
3. In the kernel, smp_prepare_cpus() writes to the location 0x8000fff8; the value it stores is the address of the function secondary_holding_pen:
release_addr = __va(cpu_release_addr[cpu]);
release_addr[0] = (void *)__pa(secondary_holding_pen); /* write the function's physical address to the mbox */
After the code above has run, a SEV instruction is issued to wake the other secondary CPUs, which then execute secondary_holding_pen():
/*
* Send an event to wake up the secondaries.
*/
sev();
4. Every secondary CPU that executes secondary_holding_pen() reads its own CPU ID and compares it with the variable secondary_holding_pen_release. If they match, that CPU continues with further initialisation; otherwise it executes WFE and keeps waiting.
secondary_holding_pen_release is updated by CPU0 from smp_init(): for each CPU to be brought up, CPU0 first binds an idle thread to it, then writes that CPU's ID into secondary_holding_pen_release, and finally issues a SEV, waking that CPU to run its idle thread.
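A rough sketch of this CPU0 side, pieced together from the call chain in the appendix (boot_secondary() / write_pen_release()); details differ between kernel versions, so treat it as illustrative rather than the exact source:

static void write_pen_release(u64 val)
{
    secondary_holding_pen_release = val;
    /* make the store visible to a CPU that may not be cache-coherent yet */
    __flush_dcache_area((void *)&secondary_holding_pen_release,
                        sizeof(secondary_holding_pen_release));
}

static int boot_secondary(unsigned int cpu, struct task_struct *idle)
{
    write_pen_release(cpu_logical_map(cpu)); /* the hardware ID the pen compares against */
    sev();                                   /* wake every CPU sitting in WFE */
    return 0;
}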
The code of secondary_holding_pen() is as follows:
/*
 * This provides a "holding pen" for platforms to hold all secondary
 * cores until we're ready for them to initialise.
 */
ENTRY(secondary_holding_pen)
bl el2_setup // Drop to EL1
mrs x0, mpidr_el1
and x0, x0, #15 // CPU number
adr x1, 1b // label 1 (not shown here) holds the link-time addresses of itself and of secondary_holding_pen_release
ldp x2, x3, [x1]
sub x1, x1, x2 // x1 = run-time minus link-time offset
add x3, x3, x1 // x3 = run-time address of secondary_holding_pen_release
pen: ldr x4, [x3] // read the release value written by CPU0
cmp x4, x0
b.eq secondary_startup // it matches our CPU number: go on with initialisation
wfe // otherwise sleep until the next SEV
b pen
ENDPROC(secondary_holding_pen)
Appendix:
The call chain in the kernel that starts the secondary CPUs is roughly:
start_kernel -> rest_init -> kernel_init -> kernel_init_freeable -> smp_init() (kernel/smp.c, line 649), where CPU0 brings up the remaining processors;
cpu_up -> _cpu_up() -> __cpu_up() -> boot_secondary() -> write_pen_release(), which contains the line secondary_holding_pen_release = val; a SEV is then issued to wake the remaining processors.
Linux SMP multicore boot analysis
startup_32:
cld # clear the direction flag (DF = 0) so string instructions move addresses upward; the opposite of std
cli # disable interrupts
movl $(KERNEL_DS),%eax
mov %ax,%ds
mov %ax,%es
mov %ax,%fs
mov %ax,%gs
#ifdef __SMP__
orw %bx,%bx # What state are we in BX=1 for SMP
# 0 for boot
jz 2f # Initial boot
# BX tells us whether this is the boot CPU (BX = 0) or a secondary CPU (BX = 1);
# the two take different paths from here.
/*
* We are trampolining an SMP processor
 */
# this is the path taken by the secondary CPUs
mov %ax,%ss
xorl %eax,%eax # Back to 0
mov %cx,%ax # SP low 16 bits
movl %eax,%esp
pushl 0 # Clear NT
popfl
ljmp $(KERNEL_CS), $0x100000 # Into C and sanity
2:	# this is the boot CPU's path
#endif
lss SYMBOL_NAME(stack_start),%esp
xorl %eax,%eax
1: incl %eax # check that A20 really IS enabled
movl %eax,0x000000 # loop forever if it isn't
cmpl %eax,0x100000
je 1b
/*
* Initialize eflags. Some BIOS's leave bits like NT set. This would
* confuse the debugger if this code is traced.
* XXX - best to initialize before switching to protected mode.
*/
pushl $0
popfl
/*
* Clear BSS
*/
xorl %eax,%eax
movl $ SYMBOL_NAME(_edata),%edi
movl $ SYMBOL_NAME(_end),%ecx
subl %edi,%ecx
cld
rep
stosb
/*
* Do the decompression, and jump to the new kernel..
*/
subl $16,%esp # place for structure on the stack
pushl %esp # address of structure as first arg
call SYMBOL_NAME(decompress_kernel)
orl %eax,%eax
jnz 3f
xorl %ebx,%ebx
ljmp $(KERNEL_CS), $0x100000
This is in effect the jump into the decompressed kernel, which ends up in start_kernel().
asmlinkage void start_kernel(void)
{
char * command_line;
/*
* This little check will move.
*/
#ifdef __SMP__
static int first_cpu=1;
/* first_cpu is a function-scope static, not a local: it is 1 only for the
 * boot CPU, which always reaches start_kernel() first; every CPU that
 * arrives later sees 0 and diverts into start_secondary(). */
if(!first_cpu)
start_secondary();
first_cpu=0;
#endif
/*
* Interrupts are still disabled. Do necessary setups, then
* enable them
*/
setup_arch(&command_line, &memory_start, &memory_end);
memory_start = paging_init(memory_start,memory_end);
trap_init();
init_IRQ();
sched_init();
time_init();
parse_options(command_line);
#ifdef CONFIG_MODULES
init_modules();
#endif
#ifdef CONFIG_PROFILE
if (!prof_shift)
#ifdef CONFIG_PROFILE_SHIFT
prof_shift = CONFIG_PROFILE_SHIFT;
#else
prof_shift = 2;
#endif
#endif
if (prof_shift) {
prof_buffer = (unsigned int *) memory_start;
/* only text is profiled */
prof_len = (unsigned long) &_etext - (unsigned long) &_stext;
prof_len >>= prof_shift;
memory_start += prof_len * sizeof(unsigned int);
}
memory_start = console_init(memory_start,memory_end);
#ifdef CONFIG_PCI
memory_start = pci_init(memory_start,memory_end);
#endif
memory_start = kmalloc_init(memory_start,memory_end);
sti();
calibrate_delay();
memory_start = inode_init(memory_start,memory_end);
memory_start = file_table_init(memory_start,memory_end);
memory_start = name_cache_init(memory_start,memory_end);
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start && initrd_start < memory_start) {
printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
"disabling it.\n",initrd_start,memory_start);
initrd_start = 0;
}
#endif
mem_init(memory_start,memory_end);
buffer_init();
sock_init();
#if defined(CONFIG_SYSVIPC) || defined(CONFIG_KERNELD)
ipc_init();
#endif
dquot_init();
arch_syms_export();
sti();
check_bugs();
printk(linux_banner);
#ifdef __SMP__
smp_init();
#endif
sysctl_init();
/*
* We count on the initial thread going ok
* Like idlers init is an unlocked kernel thread, which will
* make syscalls (and thus be locked).
*/
kernel_thread(init, NULL, 0);
/*
* task[0] is meant to be used as an "idle" task: it may not sleep, but
* it might do some general things like count free pages or it could be
* used to implement a reasonable LRU algorithm for the paging routines:
* anything that can be useful, but shouldn't take time from the real
* processes.
*
* Right now task[0] just does a infinite idle loop.
*/
cpu_idle(NULL);
}
asmlinkage void start_secondary(void)
{
trap_init();
init_IRQ();
/* set up this CPU's own IRQs */
smp_callin();
/* wait here for the boot CPU to send the start signal */
cpu_idle(NULL);
/* and become this CPU's idle process */
}
void smp_callin(void)
{
extern void calibrate_delay(void);
int cpuid=GET_APIC_ID(apic_read(APIC_ID));
unsigned long l;
/*
* Activate our APIC
*/
SMP_PRINTK(("CALLIN %d\n",smp_processor_id()));
l=apic_read(APIC_SPIV);
l|=(1<<8); /* Enable */
apic_write(APIC_SPIV,l);
sti();
/*
* Get our bogomips.
*/
calibrate_delay();
/*
* Save our processor parameters
*/
smp_store_cpu_info(cpuid);
/*
* Allow the master to continue.
*/
set_bit(cpuid, (unsigned long *)&cpu_callin_map[0]);
/*
* Until we are ready for SMP scheduling
*/
load_ldt(0);
/* printk("Testing faulting...\n");
*(long *)0=1; OOPS... */
local_flush_tlb();
while(!smp_commenced);
/* effectively a spin wait: loop until the boot CPU sets smp_commenced, i.e. the start signal */
if (cpu_number_map[cpuid] == -1)
while(1);
local_flush_tlb();
SMP_PRINTK(("Commenced..\n"));
load_TR(cpu_number_map[cpuid]);
/* while(1);*/
}
int cpu_idle(void *unused)
{
for(;;)
idle();
}
The boot CPU sends the start signal to the secondary CPUs by calling smp_begin() from the init kernel thread:
static void smp_begin(void)
{
smp_threads_ready=1;
smp_commence();
/* this releases the secondary CPUs that are spinning on smp_commenced in smp_callin() */
}
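For reference, smp_commence() in this kernel generation amounts to little more than setting that flag (a sketch of the 2.0-era arch/i386/kernel/smp.c version, comments paraphrased):

void smp_commence(void)
{
    /* lets the spin loops in smp_callin() fall through */
    smp_commenced = 1;
}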
Each CPU has its own current pointer.
At the very beginning the boot CPU points it at init_task;
this assignment is done in sched_init(), which the boot CPU calls.
void sched_init(void)
{
/*
* We have to do a little magic to get the first
* process right in SMP mode.
*/
int cpu=smp_processor_id(); /* this is 0, since only the boot CPU calls sched_init() */
#ifndef __SMP__
current_set[cpu]=&init_task;
#else
init_task.processor=cpu;
/* mark init_task as currently running on the boot CPU */
for(cpu = 0; cpu < NR_CPUS; cpu++)
current_set[cpu] = &init_task;
#endif
init_bh(TIMER_BH, timer_bh);
init_bh(TQUEUE_BH, tqueue_bh);
init_bh(IMMEDIATE_BH, immediate_bh);
}
These entries are then filled in further by smp_init():
static void smp_init(void)
{
int i, j;
smp_boot_cpus();
/*
* Create the slave init tasks as sharing pid 0.
*
* This should only happen if we have virtual CPU numbers
* higher than 0.
*/
for (i=1; i<smp_num_cpus; i++)
{
struct task_struct *n, *p;
j = cpu_logical_map[i];
/*
* We use kernel_thread for the idlers which are
* unlocked tasks running in kernel space.
*/
kernel_thread(cpu_idle, NULL, CLONE_PID);
/* this just creates a kernel thread, and that thread shows up as task[i], because the task_struct used at creation time is taken from the task[i] slot */
/*
* Don't assume linear processor numbering
*/
current_set[j]=task[i];
current_set[j]->processor=j;
cli();
n = task[i]->next_run;
p = task[i]->prev_run;
nr_running--;
n->prev_run = p;
p->next_run = n;
task[i]->next_run = task[i]->prev_run = task[i];
sti();
}
}
After the loop above has run, every CPU has been given an idle task.
The init task is then created by kernel_thread(init, NULL, 0).
// every CPU may call this shared function from its timer interrupt:
asmlinkage void schedule(void)
{
int c;
struct task_struct * p;
struct task_struct * prev, * next;
unsigned long timeout = 0;
int this_cpu=smp_processor_id();
// get this CPU's id
/* check alarm, wake up any interruptible tasks that have got a signal */
if (intr_count)
goto scheduling_in_interrupt;
if (bh_active & bh_mask) {
intr_count = 1;
do_bottom_half();
intr_count = 0;
}
run_task_queue(&tq_scheduler);
need_resched = 0;
prev = current;
cli();
/* move an exhausted RR process to be last.. */
if (!prev->counter && prev->policy == SCHED_RR) {
prev->counter = prev->priority;
move_last_runqueue(prev);
}
switch (prev->state) {
case TASK_INTERRUPTIBLE:
if (prev->signal & ~prev->blocked)
goto makerunnable;
timeout = prev->timeout;
if (timeout && (timeout <= jiffies)) {
prev->timeout = 0;
timeout = 0;
makerunnable:
prev->state = TASK_RUNNING;
break;
}
default:
del_from_runqueue(prev);
case TASK_RUNNING:
}
p = init_task.next_run;
// take a node from the run queue (the doubly-linked list of runnable tasks)
sti();
#ifdef __SMP__
/*
* This is safe as we do not permit re-entry of schedule()
*/
prev->processor = NO_PROC_ID;
#define idle_task (task[cpu_number_map[this_cpu]])
#else
#define idle_task (&init_task)
#endif
/*
* Note! there may appear new tasks on the run-queue during this, as
* interrupts are enabled. However, they will be put on front of the
* list, so our list starting at "p" is essentially fixed.
*/
/* this is the scheduler proper: */
c = -1000;
next = idle_task;
while (p != &init_task) {
// p starts at init_task.next_run;
// once we are back at init_task we have examined every runnable task
int weight = goodness(p, prev, this_cpu);
if (weight > c)
c = weight, next = p;
p = p->next_run;
}
// this scans all the runnable tasks and picks the most suitable one to schedule next
/* if all runnable processes have "counter == 0", re-calculate counters */
if (!c) {
for_each_task(p)
p->counter = (p->counter >> 1) + p->priority;
}
#ifdef __SMP__
/*
* Allocate process to CPU
*/
next->processor = this_cpu;
// mark the task that is about to run as belonging to this CPU
next->last_processor = this_cpu;
#endif
#ifdef __SMP_PROF__
/* mark processor running an idle thread */
if (0==next->pid)
set_bit(this_cpu,&smp_idle_map);
else
clear_bit(this_cpu,&smp_idle_map);
#endif
if (prev != next) {
struct timer_list timer;
kstat.context_swtch++;
if (timeout) {
init_timer(&timer);
timer.expires = timeout;
timer.data = (unsigned long) prev;
timer.function = process_timeout;
add_timer(&timer);
}
get_mmu_context(next);
switch_to(prev,next);
if (timeout)
del_timer(&timer);
}
return;
scheduling_in_interrupt:
printk("Aiee: scheduling in interrupt %p\n",
__builtin_return_address(0));
}
Note the current variable: on a uniprocessor kernel it is a single variable, but under SMP each CPU must have its own current.
It is defined as:
#define current (0+current_set[smp_processor_id()])
Under SMP, current is one element of the current_set array, i.e. the current process of one particular CPU.
From the above, a CPU picks the task it runs from the global task list, and every CPU has an idle_task whose slot number is fixed.
All tasks can be reached from init_task, because whenever a new process (or kernel thread) is created it is linked onto this list,
and init_task itself is placed on it statically.
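A minimal sketch of that linkage: every task_struct sits on a circular, doubly-linked list through the next_task/prev_task fields shown below, anchored at the statically initialised init_task, and for_each_task() (the macro already used in schedule() above) simply walks that ring; count_tasks() is only an illustration:

#define for_each_task(p) \
    for (p = &init_task ; (p = p->next_task) != &init_task ; )

/* example: count every task in the system, init_task included */
static int count_tasks(void)
{
    struct task_struct *p;
    int n = 1;          /* init_task itself */

    for_each_task(p)
        n++;
    return n;
}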
Appendix: the task_struct definition:
struct task_struct {
/* these are hardcoded - don't touch */
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
long counter;
long priority;
unsigned long signal;
unsigned long blocked; /* bitmap of masked signals */
unsigned long flags; /* per process flags, defined below */
int errno;
long debugreg[8]; /* Hardware debugging registers */
struct exec_domain *exec_domain;
/* various fields */
struct linux_binfmt *binfmt;
struct task_struct *next_task, *prev_task;
struct task_struct *next_run, *prev_run;
unsigned long saved_kernel_stack;
unsigned long kernel_stack_page;
int exit_code, exit_signal;
/* ??? */
unsigned long personality;
int dumpable:1;
int did_exec:1;
/* shouldn't this be pid_t? */
int pid;
int pgrp;
int tty_old_pgrp;
int session;
/* boolean value for session group leader */
int leader;
int groups[NGROUPS];
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->p_pptr->pid)
*/
struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
struct wait_queue *wait_chldexit; /* for wait4() */
unsigned short uid,euid,suid,fsuid;
unsigned short gid,egid,sgid,fsgid;
unsigned long timeout, policy, rt_priority;
unsigned long it_real_value, it_prof_value, it_virt_value;
unsigned long it_real_incr, it_prof_incr, it_virt_incr;
struct timer_list real_timer;
long utime, stime, cutime, cstime, start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
int swappable:1;
unsigned long swap_address;
unsigned long old_maj_flt; /* old value of maj_flt */
unsigned long dec_flt; /* page fault count of the last time */
unsigned long swap_cnt; /* number of pages to swap on next pass */
/* limits */
struct rlimit rlim[RLIM_NLIMITS];
unsigned short used_math;
char comm[16];
/* file system info */
int link_count;
struct tty_struct *tty; /* NULL if no tty */
/* ipc stuff */
struct sem_undo *semundo;
struct sem_queue *semsleeping;
/* ldt for this task - used by Wine. If NULL, default_ldt is used */
struct desc_struct *ldt;
/* tss for this task */
struct thread_struct tss;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* memory management info */
struct mm_struct *mm;
/* signal handlers */
struct signal_struct *sig;
#ifdef __SMP__
int processor;
int last_processor;
int lock_depth; /* Lock depth. We can context switch in and out of holding a syscall kernel lock... */
#endif
};
So, starting from p = init_task.next_run,
p can reach every task that is currently in the runnable (ready) state.