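此文档基于linux3.0.9内核(注:文中代码包含kernel_init_freeable()、load_default_modules()、N_MEMORY等较新版本才有的内容,与3.0.9的init/main.c并不一致,版本号疑为3.9.0之误,请核对)。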
在kernel启动最后会调用rest_init,内核启动即将完成,下面分析其流程:
该函数的主要功能是创建内核线程kernel_init(即init任务,pid为1)和kthreadd线程,之后当前的启动线程调用cpu_idle()进入idle循环。
1.static noinline void__init_refok rest_init(void)
2.{
3.int pid;
内核RCU锁机制调度启动(新版本取消大内核锁),此机制以后分析;
4.rcu_scheduler_starting();
5./*
6.We need to spawn init first sothat it obtains pid 1, however
7.the init task will end upwanting to create kthreads, which, if
8.we schedule it before we createkthreadd, will OOPS.
9.*/
10.创建init线程,见下面分析;
11.kernel_thread(kernel_init, NULL, CLONE_FS |CLONE_SIGHAND);
12.numa_default_policy();
13.pid = kernel_thread(kthreadd,NULL, CLONE_FS | CLONE_FILES);
14.rcu_read_lock();
15.kthreadd_task =find_task_by_pid_ns(pid, &init_pid_ns);
16.rcu_read_unlock();
17.complete(&kthreadd_done);
18./*
19.The boot idle thread mustexecute schedule()
20.at least once to get thingsmoving:
21.*/
22.init_idle_bootup_task(current);
23.schedule_preempt_disabled();
24./* Call into cpu_idle withpreempt disabled */
25.cpu_idle();
26.}
kernel_init线程流程:
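以前的内核使用大内核锁(BKL)来同步这两个线程;这里改用completion机制进行线程间同步:kernel_init先在kthreadd_done上等待,rest_init设置好kthreadd_task之后调用complete(&kthreadd_done)将其唤醒。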
static int __refkernel_init(void *unused)
{
kernel_init_freeable();
/* need to finish all async __init codebefore freeing the memory */
async_synchronize_full();
释放所有init.* 段中的内存。
free_initmem();
mark_rodata_ro();
设置系统状态为运行状态
system_state = SYSTEM_RUNNING;
设定NUMA系统的内存访问策略为默认
numa_default_policy();
flush_delayed_fput();
如果ramdisk_execute_command有指定的init程序,就执行它。
if (ramdisk_execute_command) {
if(!run_init_process(ramdisk_execute_command))
return 0;
printk(KERN_WARNING "Failedto execute %s\n",
ramdisk_execute_command);
}
/*
*We try each of these until one succeeds.
*
*The Bourne shell can be used instead of init if we are
*trying to recover a really broken machine.
*/
如果execute_command有指定的init程序,就执行它
if (execute_command) {
if(!run_init_process(execute_command))
return 0;
printk(KERN_WARNING "Failedto execute %s.Attempting "
"defaults...\n",execute_command);
}
if(!run_init_process("/sbin/init") ||
!run_init_process("/etc/init") ||
!run_init_process("/bin/init") ||
!run_init_process("/bin/sh"))
return 0;
panic("No init found.Try passing init= option to kernel. "
"See Linux Documentation/init.txt for guidance.");
}
static noinline void__init kernel_init_freeable(void)
{
/*
*Wait until kthreadd is all set-up.
*/
等待kthreadd线程执行完成,阻塞
wait_for_completion(&kthreadd_done);
/* Now the scheduler is fully set up andcan do blocking allocations */
gfp_allowed_mask = __GFP_BITS_MASK;
/*
*init can allocate pages on any node
*/
set_mems_allowed(node_states[N_MEMORY]);
/*
*init can run on any cpu.
*/
set_cpus_allowed_ptr(current,cpu_all_mask);
cad_pid = task_pid(current);
smp_prepare_cpus(setup_max_cpus);
do_pre_smp_initcalls();
lockup_detector_init();
这里为SMP初始化,并启动SMP调度
smp_init();
sched_init_smp();
到此,与构架相关的部分已经初始化完成了,do_basic_setup函数主要是初始化设备驱动,完成其他驱动程序(直接编译进内核的模块)的初始化。内核中大部分的启动数据输出(都是各设备的驱动模块输出)都是这里产生的。这个函数非常重要,下面分析
do_basic_setup();
复制两次标准输入(0)的文件描述符(它是上面打开的/dev/console,也就是系统控制台):
一个作为标准输出(1)
一个作为标准出错(2)
现在标准输入、标准输出、标准出错都是/dev/console了。
这个console在内核启动参数中可以配置为某个串口(ttySn、ttyOn等等),也可以是虚拟控制台(tty0)。所以我们就在串口或者显示器上看到了之后的系统登录提示。
/* Open the /dev/console on the rootfs,this should never fail */
if (sys_open((const char __user *)"/dev/console", O_RDWR, 0) < 0)
printk(KERN_WARNING "Warning:unable to open an initial console.\n");
(void) sys_dup(0);
(void) sys_dup(0);
/*
*check if there is an early userspace init.If yes, let it do all
*the work
*/
if (!ramdisk_execute_command)
ramdisk_execute_command ="/init";
if (sys_access((const char __user *)ramdisk_execute_command, 0) != 0) {
ramdisk_execute_command = NULL;
prepare_namespace();
}
/*
*Ok, we have completed the initial bootup, and
*we're essentially up and running. Get rid of the
*initmem segments and start the user-mode stuff..
*/
/* rootfs is available now, try loadingdefault modules */
load_default_modules();
}
do_basic_setup()函数分析:
static void__init do_basic_setup(void)
{
cpuset_init_smp();
usermodehelper_init();
shmem_init();
driver_init();
init_irq_proc();
do_ctors();
usermodehelper_enable();
do_initcalls();
}
void __initdriver_init(void)
{
/* These are the core pieces */
devtmpfs_init();
创建节点devicesdev dev/char /dev/block;
devices_init();
创建节点bus 并常见/devices/system
buses_init();
创建节点class
classes_init();
创建节点firmware
firmware_init();
创建节点hypervisor
hypervisor_init();
/* These are also core pieces, but mustcome after the
*core core pieces.
*/
初始化平台设备
platform_bus_init();
/sys/devices/system/cpu
cpu_dev_init();
/sys/devices/system/memory
memory_dev_init();
}
do_initcalls()函数按照优先级(initcall级别0~7)依次调用各子系统的初始化函数。
staticinitcall_t *initcall_levels[] __initdata = {
__initcall0_start,
__initcall1_start,
__initcall2_start,
__initcall3_start,
__initcall4_start,
__initcall5_start,
__initcall6_start,
__initcall7_start,
__initcall_end,
};
/*
 * Keep these in sync with initcalls in include/linux/init.h
 *
 * Name of each initcall level, indexed by level number (0..7). The name is
 * handed to parse_args() in do_initcall_level().
 */
static char *initcall_level_names[] __initdata = {
	"early",
	"core",
	"postcore",
	"arch",
	"subsys",
	"fs",
	"device",
	"late",
};
static void__init do_initcalls(void)
{
int level;
for (level = 0; level <ARRAY_SIZE(initcall_levels) - 1; level++)
do_initcall_level(level);
}
static void__init do_initcall_level(int level)
{
extern const struct kernel_param__start___param[], __stop___param[];
initcall_t *fn;
strcpy(static_command_line,saved_command_line);
parse_args(initcall_level_names[level],
static_command_line, __start___param,
__stop___param - __start___param,
level, level,
&repair_env_string);
for (fn = initcall_levels[level]; fn <initcall_levels[level+1]; fn++)
do_one_initcall(*fn);
}
int__init_or_module do_one_initcall(initcall_t fn)
{
int count = preempt_count();
int ret;
if (initcall_debug)
ret = do_one_initcall_debug(fn);
else
ret =fn();
msgbuf[0] = 0;
if (ret && ret != -ENODEV&& initcall_debug)
sprintf(msgbuf, "error code%d ", ret);
if (preempt_count() != count) {
strlcat(msgbuf, "preemptionimbalance ", sizeof(msgbuf));
preempt_count() = count;
}
if (irqs_disabled()) {
strlcat(msgbuf, "disabledinterrupts ", sizeof(msgbuf));
local_irq_enable();
}
if (msgbuf[0]) {
printk("initcall %pF returnedwith %s\n", fn, msgbuf);
}
return ret;
}