Back in start_kernel, line 563 calls mm_init_owner, which points the owner field of init_mm back at init_task. This is arguably the simplest function we have met since entering start_kernel. Moving on, setup_command_line is also simple:
static void __init setup_command_line(char *command_line)
{
	saved_command_line = alloc_bootmem(strlen(boot_command_line) + 1);
	static_command_line = alloc_bootmem(strlen(command_line) + 1);
	strcpy(saved_command_line, boot_command_line);
	strcpy(static_command_line, command_line);
}
It copies the command_line that was filled in earlier by setup_arch() into the memory pointed to by the global variables saved_command_line and static_command_line. That memory is allocated with alloc_bootmem, inside the memory-management environment we have just finished setting up.
Moving on, line 565 calls setup_nr_cpu_ids(). In the multi-CPU case this invokes, from the same file:
static void __init setup_nr_cpu_ids(void)
{
	nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1;
}
nr_cpu_ids is a special value: on a uniprocessor build it is simply 1, while on SMP it is a global variable, set here by find_last_bit, which on the x86 architecture essentially boils down to the bsr assembly instruction. Let me briefly introduce this instruction; I may get some details wrong, so corrections from experts are welcome. CPUs from the 386 onward provide the instruction pair BSF/BSR, forward/reverse bit scan. The usage is BSF dest, src, and it affects the ZF flag. The instruction scans the source operand for a set bit: if one is found, ZF is cleared and the index of that bit is loaded into the destination operand; if the source contains no set bit (i.e. it is zero), ZF is set and the destination is left undefined. BSF scans the bits forward (from bit 0 up to bit N); BSR scans in the opposite direction (from bit N down to bit 0).
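To make this concrete, here is a minimal user-space sketch of my own (not kernel code; highest_set_bit is an invented name) that uses bsr through GCC inline assembly to do what find_last_bit effectively does for a single word:

#include <stdio.h>

/* Return the index of the highest set bit in x, or -1 if x == 0.
 * bsr sets ZF when the source is zero and leaves the destination
 * undefined in that case, so we must test for zero first. */
static int highest_set_bit(unsigned long x)
{
	unsigned long index;

	if (x == 0)
		return -1;
	__asm__ ("bsr %1, %0" : "=r" (index) : "rm" (x));
	return (int)index;
}

int main(void)
{
	/* A cpu_possible_mask of 0b1011 means CPUs 0, 1 and 3 exist,
	 * so nr_cpu_ids = highest set bit + 1 = 4. */
	unsigned long mask = 0xb;

	printf("highest set bit: %d\n", highest_set_bit(mask));
	printf("nr_cpu_ids would be: %d\n", highest_set_bit(mask) + 1);
	return 0;
}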
Moving on, line 566: setup_per_cpu_areas, from arch/x86/kernel/setup_percpu.c. This function sets up the SMP per-CPU storage areas, i.e. it reserves space for each CPU's per_cpu variables. The function is fairly involved, so I will just list it in full here; readers interested in SMP can try to dig into it:
void __init setup_per_cpu_areas(void)
{
	unsigned int cpu;
	unsigned long delta;
	int rc;

	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	/*
	 * Allocate percpu area.  Embedding allocator is our favorite;
	 * however, on NUMA configurations, it can result in very
	 * sparse unit mapping and vmalloc area isn't spacious enough
	 * on 32bit.  Use page in that case.
	 */
#ifdef CONFIG_X86_32
	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	rc = -EINVAL;
	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
		const size_t dyn_size = PERCPU_MODULE_RESERVE +
			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;

		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					    dyn_size, atom_size,
					    pcpu_cpu_distance,
					    pcpu_fc_alloc, pcpu_fc_free);
		if (rc < 0)
			pr_warning("%s allocator failed (%d), falling back to page size\n",
				   pcpu_fc_names[pcpu_chosen_fc], rc);
	}
	if (rc < 0)
		rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					   pcpu_fc_alloc, pcpu_fc_free,
					   pcpup_populate_pte);
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);

	/* alrighty, percpu areas up and running */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
		per_cpu(cpu_number, cpu) = cpu;
		setup_percpu_segment(cpu);
		setup_stack_canary_segment(cpu);
		/*
		 * Copy data used in early init routines from the
		 * initial arrays to the per cpu data areas.  These
		 * arrays then become expendable and the *_early_ptr's
		 * are zeroed indicating that the static arrays are
		 * gone.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		per_cpu(x86_cpu_to_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
		per_cpu(irq_stack_ptr, cpu) =
			per_cpu(irq_stack_union.irq_stack, cpu) +
			IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
#endif
#endif
		/*
		 * Up to this point, the boot CPU has been using .data.init
		 * area.  Reload any changed state for the boot CPU.
		 */
		if (cpu == boot_cpu_id)
			switch_to_new_gdt(cpu);
	}

	/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
	/*
	 * make sure boot cpu node_number is right, when boot cpu is on the
	 * node that doesn't have mem installed
	 */
	per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
#endif

	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

	/* Setup cpu initialized, callin, callout masks */
	setup_cpu_local_masks();
}
In this function, a private data area is allocated for each CPU, and the contents of the .data.percpu section are copied into it, one copy per CPU. Because the data has been relocated from __per_cpu_start into each CPU's private area, the variables there can no longer be accessed at their original addresses. For example, to access per_cpu__runqueues you can no longer use the address of per_cpu__runqueues directly; the address must be adjusted by an offset, namely the distance between that CPU's private data area and __per_cpu_start. For CPU i, that distance is __per_cpu_offset[i].
This makes it easy to compute the new address of any variable in the private area: for per_cpu__runqueues, for instance, the new address becomes per_cpu__runqueues + __per_cpu_offset[i].
After this treatment, the .data.percpu section itself can be released once the system has initialized. Why release it? Go look at arch/x86/kernel/vmlinux.lds: the entire .data.percpu section lies between __init_begin and __init_end, which means the memory it occupies is freed after the system has booted.
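To see the arithmetic in isolation, here is a minimal user-space sketch of my own (PER_CPU, pcpu_template and the other names are invented, not kernel symbols): one master copy of the variables standing in for .data.percpu, one private copy per CPU, and every access redirected by adding that CPU's offset:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NCPUS 4

/* Stand-in for the .data.percpu template that starts at __per_cpu_start. */
static struct { long runqueue_len; long ticks; } pcpu_template;

static char *pcpu_area[NCPUS];	/* each CPU's private copy */
static long pcpu_offset[NCPUS];	/* pcpu_area[i] relative to the template */

/* Analogue of per_cpu(var, cpu): take the template address of the
 * variable and shift it by the CPU's offset into its private copy. */
#define PER_CPU(var, cpu) \
	(*(typeof(&(var)))((char *)&(var) + pcpu_offset[cpu]))

int main(void)
{
	for (int i = 0; i < NCPUS; i++) {
		pcpu_area[i] = malloc(sizeof(pcpu_template));
		memcpy(pcpu_area[i], &pcpu_template, sizeof(pcpu_template));
		pcpu_offset[i] = pcpu_area[i] - (char *)&pcpu_template;
	}
	PER_CPU(pcpu_template.ticks, 2) = 42;	/* touches only CPU 2's copy */
	printf("cpu2 ticks=%ld, cpu0 ticks=%ld\n",
	       PER_CPU(pcpu_template.ticks, 2),
	       PER_CPU(pcpu_template.ticks, 0));
	return 0;
}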
Moving on, line 567 of start_kernel calls smp_prepare_boot_cpu, from arch/x86/include/asm/smp.h:
static inline void smp_prepare_boot_cpu(void)
{
smp_ops.smp_prepare_boot_cpu();
}
The global variable smp_ops is itself a struct smp_ops, initialized in arch/x86/kernel/smp.c as:
struct smp_ops smp_ops = {
	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu,
	.smp_prepare_cpus	= native_smp_prepare_cpus,
	.smp_cpus_done		= native_smp_cpus_done,

	.smp_send_stop		= native_smp_send_stop,
	.smp_send_reschedule	= native_smp_send_reschedule,

	.cpu_up			= native_cpu_up,
	.cpu_die		= native_cpu_die,
	.cpu_disable		= native_cpu_disable,
	.play_dead		= native_play_dead,

	.send_call_func_ipi	= native_send_call_func_ipi,
	.send_call_func_single_ipi = native_send_call_func_single_ipi,
};
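This table of function pointers is the usual kernel "ops" pattern: callers always go through the table, so an alternative port (a paravirtualized kernel, say) can install different implementations without touching any call site. A minimal sketch of the pattern in plain C (all names here are my own, not kernel symbols):

#include <stdio.h>

struct smp_ops_demo {
	void (*smp_prepare_boot_cpu)(void);
};

static void native_prepare(void)   { printf("native: prepare boot CPU\n"); }
static void paravirt_prepare(void) { printf("paravirt: prepare boot CPU\n"); }

/* Default to the native implementation, as arch/x86/kernel/smp.c does. */
static struct smp_ops_demo ops = { .smp_prepare_boot_cpu = native_prepare };

int main(void)
{
	ops.smp_prepare_boot_cpu();			/* dispatches to native */
	ops.smp_prepare_boot_cpu = paravirt_prepare;	/* a hypervisor port would do this */
	ops.smp_prepare_boot_cpu();			/* same call site, new target */
	return 0;
}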
So the smp_prepare_boot_cpu call on line 567 ends up in native_smp_prepare_boot_cpu. That function in turn calls switch_to_new_gdt, passing it the number of the current CPU:
void switch_to_new_gdt(int cpu)
{
	struct desc_ptr gdt_descr;

	gdt_descr.address = (long)get_cpu_gdt_table(cpu);
	gdt_descr.size = GDT_SIZE - 1;
	load_gdt(&gdt_descr);
	/* Reload the per-cpu base */
	load_percpu_segment(cpu);
}
load_gdt should be familiar by now: it executes the lgdt instruction to load the GDT. But since the system started booting we have already loaded the GDT three times, so why load it yet again here? The key is the get_cpu_gdt_table function, from arch/x86/include/asm/desc.h:
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
return per_cpu(gdt_page, cpu).gdt;
}
Now we come to the heart of SMP, the per_cpu macro. Let's look at it closely:
#define per_cpu(var, cpu) \
	(*SHIFT_PERCPU_PTR(&(var), per_cpu_offset(cpu)))
The two arguments passed to the macro here are gdt_page and cpu. You remember gdt_page: we covered it in the earlier post "初始化GDT" (http://blog.csdn.net/yunsongice/archive/2010/12/31/6110703.aspx); it contains 32 eight-byte segment descriptors. And cpu is the id of the current CPU that was just passed in. Expanding the macro, we get:
*SHIFT_PERCPU_PTR(&(gdt_page), per_cpu_offset(cpu))
#define SHIFT_PERCPU_PTR(__p, __offset)	({				\
	__verify_pcpu_ptr((__p));					\
	RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)); \
})
__verify_pcpu_ptr is just a compile-time sanity check on the pointer's type and emits no code, so we can ignore it. Continuing the expansion, we get:
RELOC_HIDE((typeof(*(&(gdt_page))) __kernel __force *)(&(gdt_page)), (per_cpu_offset(cpu)))
#define RELOC_HIDE(ptr, off)					\
  ({ unsigned long __ptr;					\
     __asm__ ("" : "=r"(__ptr) : "0"(ptr));			\
     (typeof(ptr)) (__ptr + (off)); })
So per_cpu(gdt_page, cpu) effectively expands to the variable per_cpu__gdt_page accessed through the address:
&per_cpu__gdt_page + __per_cpu_offset[cpu]
and the whole expression is an lvalue, which means it can be assigned to. This is exactly the kind of "new address in the given CPU's private data area" that we computed for per_cpu__runqueues above.
Because each per-cpu variable has its own offset within the area, and each CPU's private area has its own base address, per_cpu(var, cpu) reaches a different variable for every combination of the two. If this concept is still unclear, please consult the post "每CPU变量" (http://blog.csdn.net/yunsongice/archive/2010/05/18/5605239.aspx).
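To round this off, the RELOC_HIDE trick itself can be tried outside the kernel. The sketch below copies the macro verbatim into a user-space program (gdt_page_demo and pcpu_offset_demo are invented stand-ins): the empty asm launders the pointer through a register, so the compiler can no longer prove where it came from and must perform the addition literally, and the result remains an lvalue:

#include <stdio.h>

/* User-space copy of the kernel's RELOC_HIDE: the empty asm statement
 * ties __ptr to the input pointer's register, hiding the pointer's
 * origin from the optimizer before the offset is added. */
#define RELOC_HIDE(ptr, off)					\
  ({ unsigned long __ptr;					\
     __asm__ ("" : "=r"(__ptr) : "0"(ptr));			\
     (typeof(ptr)) (__ptr + (off)); })

static long gdt_page_demo = 100;	/* stand-in for per_cpu__gdt_page */
static long pcpu_offset_demo = 0;	/* stand-in for __per_cpu_offset[cpu] */

int main(void)
{
	/* With offset 0 this just reaches gdt_page_demo itself, but the
	 * same expression with a real per-cpu offset would land in that
	 * CPU's private copy. Note the result is assignable. */
	long *p = RELOC_HIDE(&gdt_page_demo, pcpu_offset_demo);

	*p = 200;
	printf("value via RELOC_HIDE: %ld\n", *p);
	return 0;
}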