ARM多核启动分析

1.概述

本文主要是记录学习Linux的多核启动的过程,对学习过程进行总结,以便进行后续回顾。

平台:ARM Vexpress

内核版本:linux-4.9

2.smp_operations初始化

系统启动过程中,Linux kernel提供了smp boot实现的框架,要实现smp boot,先要填充好smp_operations这个结构体,smp_operations结构体定义如下所示:

/*
 * Table of SMP callbacks. Every member is compiled in only when CONFIG_SMP
 * is defined; without it the struct has no members.
 */
struct smp_operations {
#ifdef CONFIG_SMP
	/*
	 * Setup the set of possible CPUs (via set_cpu_possible)
	 */
	void (*smp_init_cpus)(void);
	/*
	 * Initialize cpu_possible map, and enable coherency
	 */
	void (*smp_prepare_cpus)(unsigned int max_cpus);
	
	/*
	 * Perform platform specific initialisation of the specified CPU.
	 */
	void (*smp_secondary_init)(unsigned int cpu);
	/*
	 * Boot a secondary CPU, and assign it the specified idle task.
	 * This also gives us the initial stack to use for this CPU.
	 */
	int  (*smp_boot_secondary)(unsigned int cpu, struct task_struct *idle);
#ifdef CONFIG_HOTPLUG_CPU
	/* CPU hotplug callbacks; present only with CONFIG_HOTPLUG_CPU. */
	int  (*cpu_kill)(unsigned int cpu);
	void (*cpu_die)(unsigned int cpu);
	bool  (*cpu_can_disable)(unsigned int cpu);
	int  (*cpu_disable)(unsigned int cpu);
#endif
#endif
};

2.1.smp_operations初始化流程

start_kernel()->setup_arch()
在该函数中,通过以下代码初始化smp_operations结构:

 #ifdef CONFIG_SMP
/* Select the SMP operations table, then run the SMP CPU setup steps. */
if (is_smp()) {
	/*
	 * A fallback table is chosen only when the machine descriptor has no
	 * smp_init hook or the hook returned false: the PSCI ops if
	 * psci_smp_available(), otherwise mdesc->smp when it is set.
	 */
	if (!mdesc->smp_init || !mdesc->smp_init()) {
		if (psci_smp_available())
			smp_set_ops(&psci_smp_ops);
		else if (mdesc->smp)
			smp_set_ops(mdesc->smp);
	}
	smp_init_cpus();
	smp_build_mpidr_hash();
}
#endif

其中mdesc(机器描述符)是在arch/arm/mach-vexpress/v2m.c中通过DT_MACHINE_START宏来定义并初始化的,如下所示。

    /*
     * Machine descriptor named "ARM-Versatile Express".
     * .smp holds the address of vexpress_smp_dt_ops and .smp_init the
     * address of vexpress_smp_init_ops (via the smp_ops()/smp_init_ops()
     * macros).
     */
    DT_MACHINE_START(VEXPRESS_DT, "ARM-Versatile Express")
	.dt_compat	= v2m_dt_match,
	.l2c_aux_val	= 0x00400000,
	.l2c_aux_mask	= 0xfe0fffff,
	.smp		= smp_ops(vexpress_smp_dt_ops),
	.smp_init	= smp_init_ops(vexpress_smp_init_ops),
	MACHINE_END

由于mdesc->smp_init非空,所以会调用mdesc->smp_init(),即vexpress_smp_init_ops();该函数在本平台上返回false(原因见下文),因此会进入if分支继续选择smp_operations。该函数实现如下:

    /*
     * Decide whether the MCPM SMP operations should be installed.
     *
     * Returns true after calling mcpm_smp_set_ops() when CONFIG_MCPM is
     * enabled and an available "arm,cci-400" node is found; returns false
     * in every other case (including when CONFIG_MCPM is not defined).
     */
    bool __init vexpress_smp_init_ops(void)
	{
	#ifdef CONFIG_MCPM
		/*
		 * The best way to detect a multi-cluster configuration at the moment
		 * is to look for the presence of a CCI in the system.
		 * Override the default vexpress_smp_ops if so.
		 */
		struct device_node *node;
		node = of_find_compatible_node(NULL, NULL, "arm,cci-400");
		if (node && of_device_is_available(node)) {
			mcpm_smp_set_ops();
			return true;
		}
	#endif
		return false;
	}

在内核的配置中打开了CONFIG_MCPM的配置项(不太明白为什么打开该配置,难道ARM Vexpress是multi-cluster?),但dts中并没有compatible="arm,cci-400"的节点,因此该函数返回false。又由于dts中也没有配置psci节点,因此在初始化smp_operations结构时,最终会调用smp_set_ops(mdesc->smp)对smp_operations结构进行初始化,其代码如下:

    /*
     * Install an SMP operations table.
     *
     * Copies the structure pointed to by @ops into smp_ops by value.
     * A NULL @ops leaves smp_ops unchanged.
     */
    void __init smp_set_ops(const struct smp_operations *ops)
	{
		if (ops)
			smp_ops = *ops;
	};

由上可见,该函数只是把mdesc->smp所指向的结构体内容拷贝给了smp_ops(是结构体赋值,而不是指针赋值)。smp_ops是一个静态全局变量,定义在arch/arm/kernel/smp.c中,如下所示:

    /* SMP operations table in effect; smp_set_ops() copies the chosen ops into it. */
    static struct smp_operations smp_ops __ro_after_init;

而mdesc->smp是通过smp_ops()宏来赋值的(注意:这个宏与上面的变量smp_ops同名,但两者不是同一个东西),该宏的定义如下:

    /*
     * With CONFIG_SMP both macros expand to the address of their argument.
     * Without it they expand to a NULL pointer cast to the matching type
     * (struct smp_operations * and bool (*)(void) respectively).
     */
    #ifdef CONFIG_SMP
	#define smp_ops(ops) (&(ops))
	#define smp_init_ops(ops) (&(ops))
	#else
	#define smp_ops(ops) (struct smp_operations *)NULL
	#define smp_init_ops(ops) (bool (*)(void))NULL
	#endif

因此,mdesc->smp指向的就是vexpress_smp_dt_ops,变量smp_ops的内容即来自该结构,该结构的定义如下:

    /*
     * SMP callbacks referenced by the VEXPRESS_DT machine descriptor.
     * smp_init_cpus is not set here; cpu_die is set only when
     * CONFIG_HOTPLUG_CPU is defined.
     */
    const struct smp_operations vexpress_smp_dt_ops __initconst = {
	.smp_prepare_cpus	= vexpress_smp_dt_prepare_cpus,
	.smp_secondary_init	= versatile_secondary_init,
	.smp_boot_secondary	= versatile_boot_secondary,
	#ifdef CONFIG_HOTPLUG_CPU
	.cpu_die		= vexpress_cpu_die,
	#endif
	};

2.2.多核启动

多核的启动函数调用流程主要如下所示:

start_kernel()->rest_init()->kernel_init()->kernel_init_freeable()->smp_init()

在smp_init()中,会通过for_each_present_cpu遍历每一个present的cpu,并调用cpu_up()将其唤醒(当online的cpu个数达到setup_max_cpus时停止),代码如下:

    /*
     * Called by boot processor to activate the rest.
     *
     * Calls idle_threads_init() and cpuhp_threads_init(), then walks the
     * present CPUs and calls cpu_up() on each one that is not already
     * online, stopping once setup_max_cpus CPUs are online. Finishes with
     * smp_announce() and smp_cpus_done().
     */
	void __init smp_init(void)
	{
		unsigned int cpu;
	
		idle_threads_init();
		cpuhp_threads_init();
	
		/* FIXME: This should be done in userspace --RR */
		for_each_present_cpu(cpu) {
			if (num_online_cpus() >= setup_max_cpus)
				break;
			/* Only CPUs that are not yet online need to be brought up. */
			if (!cpu_online(cpu))
				cpu_up(cpu);
		}
	
		/* Any cleanup work */
		smp_announce();
		smp_cpus_done(setup_max_cpus);
	}

其中cpu_up()的调用流程如下:

cpu_up()->do_cpu_up()->_cpu_up()->cpuhp_up_callbacks()->cpuhp_invoke_callback()

cpu_up()调用do_cpu_up()时主要传两个参数,一个cpuid,一个cpu状态,如下所示:

    /*
     * Bring @cpu up with CPUHP_ONLINE as the target state.
     * Returns whatever do_cpu_up() returns.
     */
    int cpu_up(unsigned int cpu)
	{
		return do_cpu_up(cpu, CPUHP_ONLINE);
	}
	EXPORT_SYMBOL_GPL(cpu_up);

而在_cpu_up()中,会通过min宏将传入的目标状态target与CPUHP_BRINGUP_CPU进行比较,取两者中较小的一个作为本次要到达的状态,代码如下:

    /* Requires cpu_add_remove_lock to be held */
	/*
	 * Move @cpu towards hotplug state @target.
	 *
	 * Returns 0 when the CPU is already at or beyond @target or when the
	 * callbacks succeed; -EINVAL when the CPU is not present; otherwise the
	 * error from idle_thread_get(), cpuhp_kick_ap_work() or
	 * cpuhp_up_callbacks(). @tasks_frozen is stored in cpuhp_tasks_frozen.
	 * The whole operation runs between cpu_hotplug_begin() and
	 * cpu_hotplug_done().
	 */
	static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
	{
		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
		struct task_struct *idle;
		int ret = 0;
	
		cpu_hotplug_begin();
	
		if (!cpu_present(cpu)) {
			ret = -EINVAL;
			goto out;
		}
	
		/*
		 * The caller of do_cpu_up might have raced with another
		 * caller. Ignore it for now.
		 */
		if (st->state >= target)
			goto out;
	
		if (st->state == CPUHP_OFFLINE) {
			/* Let it fail before we try to bring the cpu up */
			idle = idle_thread_get(cpu);
			if (IS_ERR(idle)) {
				ret = PTR_ERR(idle);
				goto out;
			}
		}
	
		cpuhp_tasks_frozen = tasks_frozen;
	
		st->target = target;
		/*
		 * If the current CPU state is in the range of the AP hotplug thread,
		 * then we need to kick the thread once more.
		 */
		if (st->state > CPUHP_BRINGUP_CPU) {
			ret = cpuhp_kick_ap_work(cpu);
			/*
			 * The AP side has done the error rollback already. Just
			 * return the error code..
			 */
			if (ret)
				goto out;
		}
	
		/*
		 * Try to reach the target state. We max out on the BP at
		 * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
		 * responsible for bringing it up to the target state.
		 */
		target = min((int)target, CPUHP_BRINGUP_CPU);
		ret = cpuhp_up_callbacks(cpu, st, target);
	out:
		cpu_hotplug_done();
		return ret;
	}

而CPUHP_BRINGUP_CPU这些值是在cpuhotplug.h的枚举类型cpuhp_state中定义的,列举几个如下所示:

    /*
     * CPU hotplug states (excerpt). Only the first few enumerators are
     * listed; the "..." line stands for the remaining ones, which are
     * omitted here.
     */
    enum cpuhp_state {
	CPUHP_OFFLINE,
	CPUHP_CREATE_THREADS,
	CPUHP_PERF_PREPARE,
	CPUHP_PERF_X86_PREPARE,
	CPUHP_PERF_X86_UNCORE_PREP,
	...
	}

在该枚举中(上面省略的部分),CPUHP_ONLINE是最后一个成员,其值是最大的,因此min()取到的是CPUHP_BRINGUP_CPU,即_cpu_up()调用cpuhp_up_callbacks()时传入的target为CPUHP_BRINGUP_CPU。

进入到cpuhp_up_callbacks()后,由于st->state是0(CPUHP_OFFLINE),小于传下来的target,因此会通过一个while循环将st->state逐步递增,直到到达target(即CPUHP_BRINGUP_CPU),并对途经的每一个状态调用cpuhp_invoke_callback(),来完成启动该cpu之前的一些准备工作,代码如下所示:

	/*
	 * Step st->state up to @target one state at a time, invoking the
	 * bringup callback of each state reached.
	 *
	 * On the first failing callback, st->target is set to the state the
	 * CPU had on entry, undo_cpu_up() is called and the loop stops.
	 * Returns 0 on success, otherwise the failing callback's return value.
	 */
	static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
			      enum cpuhp_state target)
	{
		enum cpuhp_state prev_state = st->state;
		int ret = 0;
	
		while (st->state < target) {
			st->state++;
			ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
			if (ret) {
				st->target = prev_state;
				undo_cpu_up(cpu, st);
				break;
			}
		}
		return ret;
	}

进入到cpuhp_invoke_callback()函数后,首先根据传下来的st->state通过cpuhp_get_step()函数从全局数组cpuhp_bp_states[]中拿到相应的struct cpuhp_step结构变量,因此这里会遍历调用cpuhp_bp_states数组元素里的回调函数,代码如下:

    /*
     * Invoke the callback registered for hotplug @state on @cpu.
     *
     * @bringup selects the startup callback (true) or the teardown
     * callback (false). For a single-instance step the callback is called
     * once. For a multi-instance step it is called for @node only when
     * @node is non-NULL, otherwise for every node on step->list; if one of
     * those calls fails, the opposite callback is run for the instances
     * that had already succeeded.
     *
     * Returns 0 when no callback is set or all calls succeed, otherwise
     * the failing callback's return value.
     */
    static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
				 bool bringup, struct hlist_node *node)
	{
		struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
		struct cpuhp_step *step = cpuhp_get_step(state);
		int (*cbm)(unsigned int cpu, struct hlist_node *node);
		int (*cb)(unsigned int cpu);
		int ret, cnt;
	
		if (!step->multi_instance) {
			cb = bringup ? step->startup.single : step->teardown.single;
			if (!cb)
				return 0;
			trace_cpuhp_enter(cpu, st->target, state, cb);
			ret = cb(cpu);
			trace_cpuhp_exit(cpu, st->state, state, ret);
			return ret;
		}
		cbm = bringup ? step->startup.multi : step->teardown.multi;
		if (!cbm)
			return 0;
	
		/* Single invocation for instance add/remove */
		if (node) {
			trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
			ret = cbm(cpu, node);
			trace_cpuhp_exit(cpu, st->state, state, ret);
			return ret;
		}
	
		/* State transition. Invoke on all instances */
		cnt = 0;
		hlist_for_each(node, &step->list) {
			trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
			ret = cbm(cpu, node);
			trace_cpuhp_exit(cpu, st->state, state, ret);
			if (ret)
				goto err;
			/* Count successful instances so the rollback knows where to stop. */
			cnt++;
		}
		return 0;
	err:
		/* Rollback the instances if one failed */
		cbm = !bringup ? step->startup.multi : step->teardown.multi;
		if (!cbm)
			return ret;
	
		/* Walk the list from the head again, undoing only the first cnt nodes. */
		hlist_for_each(node, &step->list) {
			if (!cnt--)
				break;
			cbm(cpu, node);
		}
		return ret;
	}

其中cpuhp_bp_states是定义在kernel/cpu.c中,列举几个如下所示:

    /* Boot processor state steps */
	/*
	 * Excerpt: indexed by enum cpuhp_state; each entry names the step and
	 * gives its startup/teardown callbacks (NULL means no callback for that
	 * direction). The "..." lines stand for entries omitted here. Without
	 * CONFIG_SMP only CPUHP_OFFLINE and an empty CPUHP_BRINGUP_CPU entry
	 * are defined.
	 */
	static struct cpuhp_step cpuhp_bp_states[] = {
		[CPUHP_OFFLINE] = {
			.name			= "offline",
			.startup.single		= NULL,
			.teardown.single	= NULL,
		},
	#ifdef CONFIG_SMP
		[CPUHP_CREATE_THREADS]= {
			.name			= "threads:prepare",
			.startup.single		= smpboot_create_threads,
			.teardown.single	= NULL,
			.cant_stop		= true,
		},
		[CPUHP_PERF_PREPARE] = {
			.name			= "perf:prepare",
			.startup.single		= perf_event_init_cpu,
			.teardown.single	= perf_event_exit_cpu,
		},
		[CPUHP_WORKQUEUE_PREP] = {
			.name			= "workqueue:prepare",
			.startup.single		= workqueue_prepare_cpu,
			.teardown.single	= NULL,
		},
		[CPUHP_HRTIMERS_PREPARE] = {
			.name			= "hrtimers:prepare",
			.startup.single		= hrtimers_prepare_cpu,
			.teardown.single	= hrtimers_dead_cpu,
		},
		...
		[CPUHP_TIMERS_DEAD] = {
		.name			= "timers:dead",
		.startup.single		= NULL,
		.teardown.single	= timers_dead_cpu,
		},
		/* Kicks the plugged cpu into life */
		[CPUHP_BRINGUP_CPU] = {
			.name			= "cpu:bringup",
			.startup.single		= bringup_cpu,
			.teardown.single	= NULL,
			.cant_stop		= true,
		},
		...
		[CPUHP_TEARDOWN_CPU] = {
		.name			= "cpu:teardown",
		.startup.single		= NULL,
		.teardown.single	= takedown_cpu,
		.cant_stop		= true,
	},
	#else
		[CPUHP_BRINGUP_CPU] = { },
	#endif
	};

由上分析,当遍历到CPUHP_BRINGUP_CPU元素时,便会调用到bringup_cpu()函数来启动对应的cpu,该函数的主要调用流程如下:

bringup_cpu()->__cpu_up()-> smp_ops.smp_boot_secondary()

看到smp_ops.smp_boot_secondary这个回调会不会感觉到很熟悉,没错,这就是之前初始化好的,定义在arch/arm/kernel/smp.c中的全局变量smp_ops,因此该回调函数的实现如下:

	/*
	 * Release secondary CPU @cpu and wait for it to respond.
	 *
	 * While holding boot_lock: writes cpu_logical_map(cpu) via
	 * write_pen_release(), sends a wakeup IPI to @cpu, then polls
	 * pen_release every 10us for up to 1 * HZ jiffies until it reads -1.
	 * @idle is not used in this function.
	 *
	 * Returns 0 if pen_release is -1 after the lock is dropped, -ENOSYS
	 * otherwise.
	 */
	int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
	{
		unsigned long timeout;
	
		/*
		 * Set synchronisation state between this boot processor
		 * and the secondary one
		 */
		spin_lock(&boot_lock);
	
		/*
		 * This is really belt and braces; we hold unintended secondary
		 * CPUs in the holding pen until we're ready for them.  However,
		 * since we haven't sent them a soft interrupt, they shouldn't
		 * be there.
		 */
		write_pen_release(cpu_logical_map(cpu));
	
		/*
		 * Send the secondary CPU a soft interrupt, thereby causing
		 * the boot monitor to read the system wide flags register,
		 * and branch to the address found there.
		 */
		arch_send_wakeup_ipi_mask(cpumask_of(cpu));
	
		timeout = jiffies + (1 * HZ);
		while (time_before(jiffies, timeout)) {
			smp_rmb();
			if (pen_release == -1)
				break;
	
			udelay(10);
		}
	
		/*
		 * now the secondary core is starting up let it run its
		 * calibrations, then wait for it to finish
		 */
		spin_unlock(&boot_lock);
	
		return pen_release != -1 ? -ENOSYS : 0;
	}

分析到这,剩下的启动流程就是与硬件相关了,因此不再深究硬件细节的具体实现,后续可以补充进来

你可能感兴趣的:(linux)