linux内核current宏介绍

1.概述

本文主要介绍linux current宏在arm和arm64上的实现

内核版本:Linux 5.3

2.current在arm和arm64上的实现

在linux 内核中,有一个current宏,通过该宏,可以获取当前进程的task_struct数据结构。在arm上,就是利用了内核栈特性来实现的,记录如下:

首先,如果在arch/xxx/include/asm/目录下没有定义该宏,则使用include/asm-generic/current.h文件中的通用定义,如下所示:

	/* SPDX-License-Identifier: GPL-2.0 */
	#ifndef __ASM_GENERIC_CURRENT_H
	#define __ASM_GENERIC_CURRENT_H

	#include <linux/thread_info.h>

	#define get_current() (current_thread_info()->task)
	#define current get_current()

	#endif /* __ASM_GENERIC_CURRENT_H */

在arm32架构中,current就是定义在上述文件的,而在arm64中,在打上以下这个补丁之前,实现也是如arm32一样,但打上以下补丁后,current就定义在arch/arm64/include/asm/current.h这个文件中。

补丁信息如下所示:

From c02433dd6de32f042cf3ffe476746b1115b8c096 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 3 Nov 2016 20:23:13 +0000
Subject: [PATCH] arm64: split thread_info from task stack

This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.

Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.

This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.

Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).

Both secondary entry and idle are updated to stash the sp and task
pointer separately.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig                   |  1 +
 arch/arm64/include/asm/Kbuild        |  1 -
 arch/arm64/include/asm/current.h     | 22 +++++++++++++++++++
 arch/arm64/include/asm/smp.h         |  1 +
 arch/arm64/include/asm/thread_info.h | 24 ---------------------
 arch/arm64/kernel/asm-offsets.c      |  8 ++++---
 arch/arm64/kernel/entry.S            | 41 ++++++++++++++++++------------------
 arch/arm64/kernel/head.S             | 11 +++++-----
 arch/arm64/kernel/process.c          | 16 ++++++++++++++
 arch/arm64/kernel/smp.c              |  2 ++
 10 files changed, 73 insertions(+), 54 deletions(-)
 create mode 100644 arch/arm64/include/asm/current.h

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 77a807a..0b8227f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -109,6 +109,7 @@ config ARM64
 	select POWER_SUPPLY
 	select SPARSE_IRQ
 	select SYSCTL_EXCEPTION_TRACE
+	select THREAD_INFO_IN_TASK
 	help
 	  ARM 64-bit (AArch64) Linux support.

diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 44e1d7f..28196b1 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -1,7 +1,6 @@
 generic-y += bugs.h
 generic-y += clkdev.h
 generic-y += cputime.h
-generic-y += current.h
 generic-y += delay.h
 generic-y += div64.h
 generic-y += dma.h
diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h
new file mode 100644
index 00000000..f2bcbe2
--- /dev/null
+++ b/arch/arm64/include/asm/current.h
@@ -0,0 +1,22 @@
+#ifndef __ASM_CURRENT_H
+#define __ASM_CURRENT_H
+
+#include <linux/compiler.h>
+
+#include <asm/sysreg.h>
+
+#ifndef __ASSEMBLY__
+
+struct task_struct;
+
+static __always_inline struct task_struct *get_current(void)
+{
+	return (struct task_struct *)read_sysreg(sp_el0);
+}
+
+#define current get_current()
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_CURRENT_H */
+
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 968b08d..a62db95 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -82,6 +82,7 @@ asmlinkage void secondary_start_kernel(void);
  */
 struct secondary_data {
 	void *stack;
+	struct task_struct *task;
 	long status;
 };

diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index bce0f07..c17ad4d 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -47,41 +47,17 @@ typedef unsigned long mm_segment_t;
 struct thread_info {
 	unsigned long		flags;		/* low level flags */
 	mm_segment_t		addr_limit;	/* address limit */
-	struct task_struct	*task;		/* main task structure */
 	int			preempt_count;	/* 0 => preemptable, <0 => bug */
-	int			cpu;		/* cpu */
 };

 #define INIT_THREAD_INFO(tsk)						\
 {									\
-	.task		= &tsk,						\
-	.flags		= 0,						\
 	.preempt_count	= INIT_PREEMPT_COUNT,				\
 	.addr_limit	= KERNEL_DS,					\
 }

 #define init_stack		(init_thread_union.stack)

-/*
- * how to get the thread information struct from C
- */
-static inline struct thread_info *current_thread_info(void) __attribute_const__;
-
-/*
- * struct thread_info can be accessed directly via sp_el0.
- *
- * We don't use read_sysreg() as we want the compiler to cache the value where
- * possible.
- */
-static inline struct thread_info *current_thread_info(void)
-{
-	unsigned long sp_el0;
-
-	asm ("mrs %0, sp_el0" : "=r" (sp_el0));
-
-	return (struct thread_info *)sp_el0;
-}
-
 #define thread_saved_pc(tsk)	\
 	((unsigned long)(tsk->thread.cpu_context.pc))
 #define thread_saved_sp(tsk)	\
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index d30b232..c2dc9fa 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -36,9 +36,10 @@ int main(void)
 {
   DEFINE(TSK_ACTIVE_MM,		offsetof(struct task_struct, active_mm));
   BLANK();
-  DEFINE(TI_FLAGS,		offsetof(struct thread_info, flags));
-  DEFINE(TI_PREEMPT,		offsetof(struct thread_info, preempt_count));
-  DEFINE(TI_ADDR_LIMIT,		offsetof(struct thread_info, addr_limit));
+  DEFINE(TSK_TI_FLAGS,		offsetof(struct task_struct, thread_info.flags));
+  DEFINE(TSK_TI_PREEMPT,	offsetof(struct task_struct, thread_info.preempt_count));
+  DEFINE(TSK_TI_ADDR_LIMIT,	offsetof(struct task_struct, thread_info.addr_limit));
+  DEFINE(TSK_STACK,		offsetof(struct task_struct, stack));
   BLANK();
   DEFINE(THREAD_CPU_CONTEXT,	offsetof(struct task_struct, thread.cpu_context));
   BLANK();
@@ -121,6 +122,7 @@ int main(void)
   DEFINE(TZ_DSTTIME,		offsetof(struct timezone, tz_dsttime));
   BLANK();
   DEFINE(CPU_BOOT_STACK,	offsetof(struct secondary_data, stack));
+  DEFINE(CPU_BOOT_TASK,		offsetof(struct secondary_data, task));
   BLANK();
 #ifdef CONFIG_KVM_ARM_HOST
   DEFINE(VCPU_CONTEXT,		offsetof(struct kvm_vcpu, arch.ctxt));
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2d4c83b..6349a83 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -90,9 +90,8 @@

 	.if	\el == 0
 	mrs	x21, sp_el0
-	mov	tsk, sp
-	and	tsk, tsk, #~(THREAD_SIZE - 1)	// Ensure MDSCR_EL1.SS is clear,
-	ldr	x19, [tsk, #TI_FLAGS]		// since we can unmask debug
+	ldr_this_cpu	tsk, __entry_task, x20	// Ensure MDSCR_EL1.SS is clear,
+	ldr	x19, [tsk, #TSK_TI_FLAGS]	// since we can unmask debug
 	disable_step_tsk x19, x20		// exceptions when scheduling.

 	mov	x29, xzr			// fp pointed to user-space
@@ -100,10 +99,10 @@
 	add	x21, sp, #S_FRAME_SIZE
 	get_thread_info tsk
 	/* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
-	ldr	x20, [tsk, #TI_ADDR_LIMIT]
+	ldr	x20, [tsk, #TSK_TI_ADDR_LIMIT]
 	str	x20, [sp, #S_ORIG_ADDR_LIMIT]
 	mov	x20, #TASK_SIZE_64
-	str	x20, [tsk, #TI_ADDR_LIMIT]
+	str	x20, [tsk, #TSK_TI_ADDR_LIMIT]
 	/* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
 	.endif /* \el == 0 */
 	mrs	x22, elr_el1
@@ -139,7 +138,7 @@
 	.if	\el != 0
 	/* Restore the task's original addr_limit. */
 	ldr	x20, [sp, #S_ORIG_ADDR_LIMIT]
-	str	x20, [tsk, #TI_ADDR_LIMIT]
+	str	x20, [tsk, #TSK_TI_ADDR_LIMIT]

 	/* No need to restore UAO, it will be restored from SPSR_EL1 */
 	.endif
@@ -192,13 +191,14 @@ alternative_else_nop_endif
 	mov	x19, sp			// preserve the original sp

 	/*
-	 * Compare sp with the current thread_info, if the top
-	 * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and
-	 * should switch to the irq stack.
+	 * Compare sp with the base of the task stack.
+	 * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
+	 * and should switch to the irq stack.
 	 */
-	and	x25, x19, #~(THREAD_SIZE - 1)
-	cmp	x25, tsk
-	b.ne	9998f
+	ldr	x25, [tsk, TSK_STACK]
+	eor	x25, x25, x19
+	and	x25, x25, #~(THREAD_SIZE - 1)
+	cbnz	x25, 9998f

 	adr_this_cpu x25, irq_stack, x26
 	mov	x26, #IRQ_STACK_START_SP
@@ -427,9 +427,9 @@ el1_irq:
 	irq_handler

 #ifdef CONFIG_PREEMPT
-	ldr	w24, [tsk, #TI_PREEMPT]		// get preempt count
+	ldr	w24, [tsk, #TSK_TI_PREEMPT]	// get preempt count
 	cbnz	w24, 1f				// preempt count != 0
-	ldr	x0, [tsk, #TI_FLAGS]		// get flags
+	ldr	x0, [tsk, #TSK_TI_FLAGS]	// get flags
 	tbz	x0, #TIF_NEED_RESCHED, 1f	// needs rescheduling?
 	bl	el1_preempt
 1:
@@ -444,7 +444,7 @@ ENDPROC(el1_irq)
 el1_preempt:
 	mov	x24, lr
 1:	bl	preempt_schedule_irq		// irq en/disable is done inside
-	ldr	x0, [tsk, #TI_FLAGS]		// get new tasks TI_FLAGS
+	ldr	x0, [tsk, #TSK_TI_FLAGS]	// get new tasks TI_FLAGS
 	tbnz	x0, #TIF_NEED_RESCHED, 1b	// needs rescheduling?
 	ret	x24
 #endif
@@ -674,8 +674,7 @@ ENTRY(cpu_switch_to)
 	ldp	x29, x9, [x8], #16
 	ldr	lr, [x8]
 	mov	sp, x9
-	and	x9, x9, #~(THREAD_SIZE - 1)
-	msr	sp_el0, x9
+	msr	sp_el0, x1
 	ret
 ENDPROC(cpu_switch_to)

@@ -686,7 +685,7 @@ ENDPROC(cpu_switch_to)
 ret_fast_syscall:
 	disable_irq				// disable interrupts
 	str	x0, [sp, #S_X0]			// returned x0
-	ldr	x1, [tsk, #TI_FLAGS]		// re-check for syscall tracing
+	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing
 	and	x2, x1, #_TIF_SYSCALL_WORK
 	cbnz	x2, ret_fast_syscall_trace
 	and	x2, x1, #_TIF_WORK_MASK
@@ -706,14 +705,14 @@ work_pending:
 #ifdef CONFIG_TRACE_IRQFLAGS
 	bl	trace_hardirqs_on		// enabled while in userspace
 #endif
-	ldr	x1, [tsk, #TI_FLAGS]		// re-check for single-step
+	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for single-step
 	b	finish_ret_to_user
 /*
  * "slow" syscall return path.
  */
 ret_to_user:
 	disable_irq				// disable interrupts
-	ldr	x1, [tsk, #TI_FLAGS]
+	ldr	x1, [tsk, #TSK_TI_FLAGS]
 	and	x2, x1, #_TIF_WORK_MASK
 	cbnz	x2, work_pending
 finish_ret_to_user:
@@ -746,7 +745,7 @@ el0_svc_naked:					// compat entry point
 	enable_dbg_and_irq
 	ct_user_exit 1

-	ldr	x16, [tsk, #TI_FLAGS]		// check for syscall hooks
+	ldr	x16, [tsk, #TSK_TI_FLAGS]	// check for syscall hooks
 	tst	x16, #_TIF_SYSCALL_WORK
 	b.ne	__sys_trace
 	cmp     scno, sc_nr                     // check upper syscall limit
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 332e331..eaafb25 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -428,7 +428,8 @@ ENDPROC(__create_page_tables)
 __primary_switched:
 	adrp	x4, init_thread_union
 	add	sp, x4, #THREAD_SIZE
-	msr	sp_el0, x4			// Save thread_info
+	adr_l	x5, init_task
+	msr	sp_el0, x5			// Save thread_info

 	adr_l	x8, vectors			// load VBAR_EL1 with virtual
 	msr	vbar_el1, x8			// vector table address
@@ -699,10 +700,10 @@ __secondary_switched:
 	isb

 	adr_l	x0, secondary_data
-	ldr	x0, [x0, #CPU_BOOT_STACK]	// get secondary_data.stack
-	mov	sp, x0
-	and	x0, x0, #~(THREAD_SIZE - 1)
-	msr	sp_el0, x0			// save thread_info
+	ldr	x1, [x0, #CPU_BOOT_STACK]	// get secondary_data.stack
+	mov	sp, x1
+	ldr	x2, [x0, #CPU_BOOT_TASK]
+	msr	sp_el0, x2
 	mov	x29, #0
 	b	secondary_start_kernel
 ENDPROC(__secondary_switched)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index ec7b9c0..a98b743 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -45,6 +45,7 @@
 #include <linux/personality.h>
 #include <linux/notifier.h>
 #include <trace/events/power.h>
+#include <linux/percpu.h>

 #include <asm/alternative.h>
 #include <asm/compat.h>
@@ -322,6 +323,20 @@ void uao_thread_switch(struct task_struct *next)
 }

 /*
+ * We store our current task in sp_el0, which is clobbered by userspace. Keep a
+ * shadow copy so that we can restore this upon entry from userspace.
+ *
+ * This is *only* for exception entry from EL0, and is not valid until we
+ * __switch_to() a user task.
+ */
+DEFINE_PER_CPU(struct task_struct *, __entry_task);
+
+static void entry_task_switch(struct task_struct *next)
+{
+	__this_cpu_write(__entry_task, next);
+}
+
+/*
  * Thread switching.
  */
 struct task_struct *__switch_to(struct task_struct *prev,
@@ -333,6 +348,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	tls_thread_switch(next);
 	hw_breakpoint_thread_switch(next);
 	contextidr_thread_switch(next);
+	entry_task_switch(next);
 	uao_thread_switch(next);

 	/*
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 6f42c68..cb87234 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -149,6 +149,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 	 * We need to tell the secondary core where to find its stack and the
 	 * page tables.
 	 */
+	secondary_data.task = idle;
 	secondary_data.stack = task_stack_page(idle) + THREAD_START_SP;
 	update_cpu_boot_status(CPU_MMU_OFF);
 	__flush_dcache_area(&secondary_data, sizeof(secondary_data));
@@ -173,6 +174,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 		pr_err("CPU%u: failed to boot: %d\n", cpu, ret);
 	}

+	secondary_data.task = NULL;
 	secondary_data.stack = NULL;
 	status = READ_ONCE(secondary_data.status);
 	if (ret && status) {
--
2.7.4

打上以上补丁后,arm64上current的实现如下:

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_CURRENT_H
#define __ASM_CURRENT_H

#include <linux/compiler.h>

#ifndef __ASSEMBLY__

struct task_struct;

/*
 * We don't use read_sysreg() as we want the compiler to cache the value where
 * possible.
 */
static __always_inline struct task_struct *get_current(void)
{
	unsigned long sp_el0;

	asm ("mrs %0, sp_el0" : "=r" (sp_el0));

	return (struct task_struct *)sp_el0;
}

#define current get_current()

#endif /* __ASSEMBLY__ */

#endif /* __ASM_CURRENT_H */

3 current在arm32通过sp找到进程task_struct结构的实现

在arm32 cpu上,当内核发生panic时,我们通常可以在串口上看到如下打印:

/ # echo c > /proc/sysrq-trigger
sysrq: SysRq : Trigger a crash
Unhandled fault: page domain fault (0x81b) at 0x00000000
pgd = be484000
[00000000] *pgd=9e478831, *pte=00000000, *ppte=00000000
Internal error: : 81b [#1] SMP ARM
Modules linked in:
CPU: 0 PID: 739 Comm: sh Tainted: G    B           4.9.191-rc1+ #9
Hardware name: ARM-Versatile Express
task: be9f2200 task.stack: be7ca000
PC is at sysrq_handle_crash+0x30/0x38
LR is at arm_heavy_mb+0x2c/0x48
pc : [<8044e580>]    lr : [<801171b8>]    psr: 60000013
sp : be7cbe58  ip : be7cbe48  fp : be7cbe6c
r10: 00000004  r9 : 00000000  r8 : 80b2f4c4
r7 : 00000000  r6 : 00000063  r5 : 00000008  r4 : 00000001
r3 : 00000000  r2 : 00000730  r1 : 00000000  r0 : 00000063
Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
Control: 10c5387d  Table: 9e48406a  DAC: 00000051
Process sh (pid: 739, stack limit = 0xbe7ca210)
Stack: (0xbe7cbe58 to 0xbe7cc000)
be40:                                                       80b0d680 00000008
be60: be7cbe9c be7cbe70 8044eb2c 8044e55c 00000055 00000002 00000000 00000000
be80: be90f9c0 00000000 00000000 00000004 be7cbeb4 be7cbea0 8044f090 8044ea84
bea0: 8044f02c 00000000 be7cbed4 be7cbeb8 802a2418 8044f038 be9a7e00 802a23ac
bec0: be7cbf78 00000002 be7cbf44 be7cbed8 8024439c 802a23b8 00000017 801166fc
bee0: 000a55b8 be7cbfb0 001bb888 7ed6ffbe be7cbfac be7cbf00 8010136c 80116708
bf00: be9f21f8 be7cbf50 be7cbf4c be7cbf18 80126644 8024525c 8024276c fffffff6
bf20: 00000002 be9a7e00 00218ab8 be7cbf78 00000000 00000000 be7cbf74 be7cbf48
bf40: 8024529c 8024436c 80126d10 80265780 be9a7e00 be9a7e00 00218ab8 00000002
bf60: 00000000 00000000 be7cbfa4 be7cbf78 80246188 802451f4 00000000 00000000
bf80: 00214b24 00000001 00218ab8 00000004 801080e4 be7ca000 00000000 be7cbfa8
bfa0: 80107f00 80246138 00214b24 00000001 00000001 00218ab8 00000002 00000000
bfc0: 00214b24 00000001 00218ab8 00000004 00000001 00218ab8 00000020 00217ab8
bfe0: 00000000 7ed6f47c 000135d8 00143f1c 60000010 00000001 00000000 00000000
[<8044e580>] (sysrq_handle_crash) from [<8044eb2c>] (__handle_sysrq+0xb4/0x184)
[<8044eb2c>] (__handle_sysrq) from [<8044f090>] (write_sysrq_trigger+0x64/0x74)
[<8044f090>] (write_sysrq_trigger) from [<802a2418>] (proc_reg_write+0x6c/0x94)
[<802a2418>] (proc_reg_write) from [<8024439c>] (__vfs_write+0x3c/0x128)
[<8024439c>] (__vfs_write) from [<8024529c>] (vfs_write+0xb4/0x178)
[<8024529c>] (vfs_write) from [<80246188>] (SyS_write+0x5c/0xd0)
[<80246188>] (SyS_write) from [<80107f00>] (ret_fast_syscall+0x0/0x1c)
Code: e5834000 f57ff04e ebf32303 e3a03000 (e5c34000)
---[ end trace a875aa28f6df8b07 ]---
Kernel panic - not syncing: Fatal exception

在panic时,串口上会打印出当前进程task_struct的地址(task)、进程内核栈的地址(task.stack),以及pc寄存器、sp寄存器、lr寄存器等信息。其中进程task_struct的地址,就是通过sp寄存器的值算出来的(先由sp得到thread_info,再通过thread_info中的task指针得到task_struct),计算的源码实现分析如下:

首先,因为每个进程都会分配相应的内核栈,因此把sp寄存器的值按内核栈大小向下对齐,就能获取到当前进程的thread_info结构的地址。以上面的panic日志为例,sp = 0xbe7cbe58,按8KB(0x2000)向下对齐得到0xbe7ca000,正好就是日志中task.stack的值。源码如下:

源码路径:arch/arm/include/asm/thread_info.h


#define THREAD_SIZE_ORDER	1
#define THREAD_SIZE		(PAGE_SIZE << THREAD_SIZE_ORDER)
#define THREAD_START_SP		(THREAD_SIZE - 8)

/*
 * how to get the current stack pointer in C
 */
register unsigned long current_stack_pointer asm ("sp");

/*
 * how to get the thread information struct from C
 */
static inline struct thread_info *current_thread_info(void) __attribute_const__;

static inline struct thread_info *current_thread_info(void)
{
	return (struct thread_info *)
		(current_stack_pointer & ~(THREAD_SIZE - 1));
}

PAGE_SIZE定义如下:arch/arm/include/asm/page.h

/* PAGE_SHIFT determines the page size */
#define PAGE_SHIFT		12
#define PAGE_SIZE		(_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK		(~((1 << PAGE_SHIFT) - 1))

由上,可以计算出:PAGE_SIZE = 1 << 12 = 4KB,THREAD_SIZE = PAGE_SIZE << THREAD_SIZE_ORDER = 4KB << 1 = 8KB,即在arm32 linux系统中,没有特殊配置的情况下,内核栈大小是8KB。

struct thread_info和内核栈通过union thread_union这个数据结构进行关联,thread_union这个类型描述了整个内核栈stack[],该数据结构如下:

union thread_union定义在:include/linux/sched.h

union thread_union {
#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
	struct task_struct task;
#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
	struct thread_info thread_info;
#endif
	unsigned long stack[THREAD_SIZE/sizeof(long)];
};

在arm32架构中,目前是不支持CONFIG_THREAD_INFO_IN_TASK,当通过sp寄存器进行一个内核栈大小的对齐操作,就可以得到struct thread_info的地址,而thread_info就保存了特定体系结构的汇编代码段需要访问的那部分进程的数据,我们在thread_info中嵌入指向task_struct的指针, 则我们可以很方便的通过thread_info来查找task_struct,arm32的thread_info定义如下:

thread_info定义在:arch/arm/include/asm/thread_info.h

/*
 * low level task data that entry.S needs immediate access to.
 * __switch_to() assumes cpu_context follows immediately after cpu_domain.
 */
struct thread_info {
	unsigned long		flags;		/* low level flags */
	int			preempt_count;	/* 0 => preemptable, <0 => bug */
	mm_segment_t		addr_limit;	/* address limit */
	struct task_struct	*task;		/* main task structure */
	__u32			cpu;		/* cpu */
	__u32			cpu_domain;	/* cpu domain */
#ifdef CONFIG_STACKPROTECTOR_PER_TASK
	unsigned long		stack_canary;
#endif
	struct cpu_context_save	cpu_context;	/* cpu context */
	__u32			syscall;	/* syscall number */
	__u8			used_cp[16];	/* thread used copro */
	unsigned long		tp_value[2];	/* TLS registers */
#ifdef CONFIG_CRUNCH
	struct crunch_state	crunchstate;
#endif
	union fp_state		fpstate __attribute__((aligned(8)));
	union vfp_state		vfpstate;
#ifdef CONFIG_ARM_THUMBEE
	unsigned long		thumbee_state;	/* ThumbEE Handler Base register */
#endif
};

arm32中,linux内核栈的结构图如下所示:
linux内核current宏介绍_第1张图片

4 current在arm64通过sp找到进程task_struct结构的实现

当打上c02433dd6de32f042cf3ffe476746b1115b8c096这个补丁时,arm64 current宏的实现如上文所述,此时,进入内核时,通过sp_el0寄存器拿到struct task_struct的地址。
sp_el0中保存的struct task_struct地址,是在从用户态(例如通过系统调用)进入内核态时写入的:内核态使用的栈指针是sp_el1,sp_el0在内核态刚好没有用到,因此用它来保存struct task_struct的地址。下面分析一下arm64系统调用的实现:

首先,从glibc的源码确认aarch64是如何陷入内核态(切换异常等级)的,源码如下所示:

陷入内核态的指令:sysdeps/unix/sysv/linux/aarch64/sysdep.h

# define DO_CALL(syscall_name, args)		\
    mov x8, SYS_ify (syscall_name);		\
    svc 0

从代码可以确认,把系统调用号保存到x8寄存器,并且通过svc 0这条指令陷入到内核态,svc指令含义如下图所示:
linux内核current宏介绍_第2张图片

通过svc指令从el0陷入到内核态el1之后,会跳转到内核建立的异常向量表中,内核异常向量表如下所示:



/*
 * EL0 mode handlers.
 */
	.align	6
el0_sync:
	kernel_entry 0
	mrs	x25, esr_el1			// read the syndrome register
	lsr	x24, x25, #ESR_ELx_EC_SHIFT	// exception class
	cmp	x24, #ESR_ELx_EC_SVC64		// SVC in 64-bit state
	b.eq	el0_svc
	cmp	x24, #ESR_ELx_EC_DABT_LOW	// data abort in EL0
	b.eq	el0_da
	cmp	x24, #ESR_ELx_EC_IABT_LOW	// instruction abort in EL0
	b.eq	el0_ia
	cmp	x24, #ESR_ELx_EC_FP_ASIMD	// FP/ASIMD access
	b.eq	el0_fpsimd_acc
	cmp	x24, #ESR_ELx_EC_SVE		// SVE access
	b.eq	el0_sve_acc
	cmp	x24, #ESR_ELx_EC_FP_EXC64	// FP/ASIMD exception
	b.eq	el0_fpsimd_exc
	cmp	x24, #ESR_ELx_EC_SYS64		// configurable trap
	ccmp	x24, #ESR_ELx_EC_WFx, #4, ne
	b.eq	el0_sys
	cmp	x24, #ESR_ELx_EC_SP_ALIGN	// stack alignment exception
	b.eq	el0_sp
	cmp	x24, #ESR_ELx_EC_PC_ALIGN	// pc alignment exception
	b.eq	el0_pc
	cmp	x24, #ESR_ELx_EC_UNKNOWN	// unknown exception in EL0
	b.eq	el0_undef
	cmp	x24, #ESR_ELx_EC_BREAKPT_LOW	// debug exception in EL0
	b.ge	el0_dbg
	b	el0_inv

#ifdef CONFIG_COMPAT
	.align	6
el0_sync_compat:
	kernel_entry 0, 32
	mrs	x25, esr_el1			// read the syndrome register
	lsr	x24, x25, #ESR_ELx_EC_SHIFT	// exception class
	cmp	x24, #ESR_ELx_EC_SVC32		// SVC in 32-bit state
	b.eq	el0_svc_compat
	cmp	x24, #ESR_ELx_EC_DABT_LOW	// data abort in EL0
	b.eq	el0_da
	cmp	x24, #ESR_ELx_EC_IABT_LOW	// instruction abort in EL0
	b.eq	el0_ia
	cmp	x24, #ESR_ELx_EC_FP_ASIMD	// FP/ASIMD access
	b.eq	el0_fpsimd_acc
	cmp	x24, #ESR_ELx_EC_FP_EXC32	// FP/ASIMD exception
	b.eq	el0_fpsimd_exc
	cmp	x24, #ESR_ELx_EC_PC_ALIGN	// pc alignment exception
	b.eq	el0_pc
	cmp	x24, #ESR_ELx_EC_UNKNOWN	// unknown exception in EL0
	b.eq	el0_undef
	cmp	x24, #ESR_ELx_EC_CP15_32	// CP15 MRC/MCR trap
	b.eq	el0_cp15
	cmp	x24, #ESR_ELx_EC_CP15_64	// CP15 MRRC/MCRR trap
	b.eq	el0_cp15
	cmp	x24, #ESR_ELx_EC_CP14_MR	// CP14 MRC/MCR trap
	b.eq	el0_undef
	cmp	x24, #ESR_ELx_EC_CP14_LS	// CP14 LDC/STC trap
	b.eq	el0_undef
	cmp	x24, #ESR_ELx_EC_CP14_64	// CP14 MRRC/MCRR trap
	b.eq	el0_undef
	cmp	x24, #ESR_ELx_EC_BREAKPT_LOW	// debug exception in EL0
	b.ge	el0_dbg
	b	el0_inv
el0_svc_compat:
	mov	x0, sp
	bl	el0_svc_compat_handler
	b	ret_to_user

	.align	6
el0_irq_compat:
	kernel_entry 0, 32
	b	el0_irq_naked

el0_error_compat:
	kernel_entry 0, 32
	b	el0_error_naked

el0_cp15:
	/*
	 * Trapped CP15 (MRC, MCR, MRRC, MCRR) instructions
	 */
	enable_daif
	ct_user_exit
	mov	x0, x25
	mov	x1, sp
	bl	do_cp15instr
	b	ret_to_user
#endif

/*
 * SVC handler.
 */
	.align	6
el0_svc:
	gic_prio_kentry_setup tmp=x1
	mov	x0, sp
	bl	el0_svc_handler
	b	ret_to_user
ENDPROC(el0_svc)

/*
 * Exception vectors.
 */
	.pushsection ".entry.text", "ax"

	.align	11
ENTRY(vectors)
	kernel_ventry	1, sync_invalid			// Synchronous EL1t
	kernel_ventry	1, irq_invalid			// IRQ EL1t
	kernel_ventry	1, fiq_invalid			// FIQ EL1t
	kernel_ventry	1, error_invalid		// Error EL1t

	kernel_ventry	1, sync				// Synchronous EL1h
	kernel_ventry	1, irq				// IRQ EL1h
	kernel_ventry	1, fiq_invalid			// FIQ EL1h
	kernel_ventry	1, error			// Error EL1h

	kernel_ventry	0, sync				// Synchronous 64-bit EL0
	kernel_ventry	0, irq				// IRQ 64-bit EL0
	kernel_ventry	0, fiq_invalid			// FIQ 64-bit EL0
	kernel_ventry	0, error			// Error 64-bit EL0

#ifdef CONFIG_COMPAT
	kernel_ventry	0, sync_compat, 32		// Synchronous 32-bit EL0
	kernel_ventry	0, irq_compat, 32		// IRQ 32-bit EL0
	kernel_ventry	0, fiq_invalid_compat, 32	// FIQ 32-bit EL0
	kernel_ventry	0, error_compat, 32		// Error 32-bit EL0
#else
	kernel_ventry	0, sync_invalid, 32		// Synchronous 32-bit EL0
	kernel_ventry	0, irq_invalid, 32		// IRQ 32-bit EL0
	kernel_ventry	0, fiq_invalid, 32		// FIQ 32-bit EL0
	kernel_ventry	0, error_invalid, 32		// Error 32-bit EL0
#endif
END(vectors)

对应arm64的异常向量表如下:
linux内核current宏介绍_第3张图片

从异常向量表我们知道以下几点:

  • 实际上有四张表,每张表有四个异常入口,分别对应同步异常,IRQ,FIQ和出错异常。
  • 如果发生异常并不会导致exception level切换,并且使用的栈指针是SP_EL0,那么使用第一张异常向量表。
  • 如果发生异常并不会导致exception level切换,并且使用的栈指针是SP_EL1/2/3,那么使用第二张异常向量表。
  • 如果发生异常会导致exception level切换,并且比目的exception level低一级的exception level运行在AARCH64模式,那么使用第三张异常向量表。
  • 如果发生异常会导致exception level切换,并且比目的exception level低一级的exception level运行在AARCH32模式,那么使用第四张异常向量表。
    另外我们还可以看到的一点是,每一个异常入口不再仅仅占用4bytes的空间,而是占用0x80 bytes空间,也就是说,每一个异常入口可以放置多条指令,而不仅仅是一条跳转指令。

从linux实现的异常向量表来看,通过svc指令产生的异常属于Synchronous异常,根据陷入el1之前el0的执行状态是AARCH64还是AARCH32,分别跳转到el0_sync或el0_sync_compat/el0_sync_invalid入口:如果陷入el1之前的执行状态是AARCH32,当没有打开CONFIG_COMPAT时,会跳转到el0_sync_invalid入口,否则会跳转到el0_sync_compat入口。当跳转到el0_sync/el0_sync_compat时,第一句执行的便是kernel_entry这个宏对应的代码,把用户态的sp_el0等寄存器的值保存到栈中,并把sp_el0设置为当前进程task_struct的地址,就是在这里实现的,代码如下所示:

.macro	kernel_entry, el, regsize = 64
.if	\regsize == 32
mov	w0, w0				// zero upper 32 bits of x0
.endif
stp	x0, x1, [sp, #16 * 0]
stp	x2, x3, [sp, #16 * 1]
stp	x4, x5, [sp, #16 * 2]
stp	x6, x7, [sp, #16 * 3]
stp	x8, x9, [sp, #16 * 4]
stp	x10, x11, [sp, #16 * 5]
stp	x12, x13, [sp, #16 * 6]
stp	x14, x15, [sp, #16 * 7]
stp	x16, x17, [sp, #16 * 8]
stp	x18, x19, [sp, #16 * 9]
stp	x20, x21, [sp, #16 * 10]
stp	x22, x23, [sp, #16 * 11]
stp	x24, x25, [sp, #16 * 12]
stp	x26, x27, [sp, #16 * 13]
stp	x28, x29, [sp, #16 * 14]

.if	\el == 0
clear_gp_regs
mrs	x21, sp_el0
ldr_this_cpu	tsk, __entry_task, x20	// Ensure MDSCR_EL1.SS is clear,
ldr	x19, [tsk, #TSK_TI_FLAGS]	// since we can unmask debug
disable_step_tsk x19, x20		// exceptions when scheduling.

apply_ssbd 1, x22, x23

.else
add	x21, sp, #S_FRAME_SIZE
get_current_task tsk
/* Save the task's original addr_limit and set USER_DS */
ldr	x20, [tsk, #TSK_TI_ADDR_LIMIT]
str	x20, [sp, #S_ORIG_ADDR_LIMIT]
mov	x20, #USER_DS
str	x20, [tsk, #TSK_TI_ADDR_LIMIT]
/* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
.endif /* \el == 0 */
mrs	x22, elr_el1
mrs	x23, spsr_el1
stp	lr, x21, [sp, #S_LR]

/*
 * In order to be able to dump the contents of struct pt_regs at the
 * time the exception was taken (in case we attempt to walk the call
 * stack later), chain it together with the stack frames.
 */
.if \el == 0
stp	xzr, xzr, [sp, #S_STACKFRAME]
.else
stp	x29, x22, [sp, #S_STACKFRAME]
.endif
add	x29, sp, #S_STACKFRAME

#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
 * Set the TTBR0 PAN bit in SPSR. When the exception is taken from
 * EL0, there is no need to check the state of TTBR0_EL1 since
 * accesses are always enabled.
 * Note that the meaning of this bit differs from the ARMv8.1 PAN
 * feature as all TTBR0_EL1 accesses are disabled, not just those to
 * user mappings.
 */
alternative_if ARM64_HAS_PAN
b	1f				// skip TTBR0 PAN
alternative_else_nop_endif

.if	\el != 0
mrs	x21, ttbr0_el1
tst	x21, #TTBR_ASID_MASK		// Check for the reserved ASID
orr	x23, x23, #PSR_PAN_BIT		// Set the emulated PAN in the saved SPSR
b.eq	1f				// TTBR0 access already disabled
and	x23, x23, #~PSR_PAN_BIT		// Clear the emulated PAN in the saved SPSR
.endif

__uaccess_ttbr0_disable x21
1:
#endif

stp	x22, x23, [sp, #S_PC]

/* Not in a syscall by default (el0_svc overwrites for real syscall) */
.if	\el == 0
mov	w21, #NO_SYSCALL
str	w21, [sp, #S_SYSCALLNO]
.endif

/*
 * Set sp_el0 to current thread_info.
 */
.if	\el == 0
msr	sp_el0, tsk
.endif

/* Save pmr */
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
mrs_s	x20, SYS_ICC_PMR_EL1
str	x20, [sp, #S_PMR_SAVE]
alternative_else_nop_endif

/*
 * Registers that may be useful after this macro is invoked:
 *
 * x20 - ICC_PMR_EL1
 * x21 - aborted SP
 * x22 - aborted PC
 * x23 - aborted PSTATE
*/
.endm

当异常处理完毕、准备返回用户态时,都会跳转到ret_to_user进行上下文环境的恢复工作,恢复sp_el0寄存器(用户态栈指针)的操作在其最后调用的kernel_exit宏中,如下所示:

/*
 * "slow" syscall return path.
 */
ret_to_user:
	disable_daif
	gic_prio_kentry_setup tmp=x3
	ldr	x1, [tsk, #TSK_TI_FLAGS]
	and	x2, x1, #_TIF_WORK_MASK
	cbnz	x2, work_pending
finish_ret_to_user:
	enable_step_tsk x1, x2
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
	bl	stackleak_erase
#endif
	kernel_exit 0
ENDPROC(ret_to_user)

.macro	kernel_exit, el
.if	\el != 0
disable_daif

/* Restore the task's original addr_limit. */
ldr	x20, [sp, #S_ORIG_ADDR_LIMIT]
str	x20, [tsk, #TSK_TI_ADDR_LIMIT]

/* No need to restore UAO, it will be restored from SPSR_EL1 */
.endif

/* Restore pmr */
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
ldr	x20, [sp, #S_PMR_SAVE]
msr_s	SYS_ICC_PMR_EL1, x20
/* Ensure priority change is seen by redistributor */
dsb	sy
alternative_else_nop_endif

ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
.if	\el == 0
ct_user_enter
.endif

#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
 * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
 * PAN bit checking.
 */
alternative_if ARM64_HAS_PAN
b	2f				// skip TTBR0 PAN
alternative_else_nop_endif

.if	\el != 0
tbnz	x22, #22, 1f			// Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
.endif

__uaccess_ttbr0_enable x0, x1

.if	\el == 0
/*
 * Enable errata workarounds only if returning to user. The only
 * workaround currently required for TTBR0_EL1 changes are for the
 * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
 * corruption).
 */
bl	post_ttbr_update_workaround
.endif
1:
.if	\el != 0
and	x22, x22, #~PSR_PAN_BIT		// ARMv8.0 CPUs do not understand this bit
.endif
2:
#endif

.if	\el == 0
ldr	x23, [sp, #S_SP]		// load return stack pointer
msr	sp_el0, x23
tst	x22, #PSR_MODE32_BIT		// native task?
b.eq	3f

#ifdef CONFIG_ARM64_ERRATUM_845719
alternative_if ARM64_WORKAROUND_845719
#ifdef CONFIG_PID_IN_CONTEXTIDR
mrs	x29, contextidr_el1
msr	contextidr_el1, x29
#else
msr contextidr_el1, xzr
#endif
alternative_else_nop_endif
#endif
3:
#ifdef CONFIG_ARM64_ERRATUM_1418040
alternative_if_not ARM64_WORKAROUND_1418040
b	4f
alternative_else_nop_endif
/*
 * if (x22.mode32 == cntkctl_el1.el0vcten)
 *     cntkctl_el1.el0vcten = ~cntkctl_el1.el0vcten
 */
mrs	x1, cntkctl_el1
eon	x0, x1, x22, lsr #3
tbz	x0, #1, 4f
eor	x1, x1, #2	// ARCH_TIMER_USR_VCT_ACCESS_EN
msr	cntkctl_el1, x1
4:
#endif
apply_ssbd 0, x0, x1
.endif

msr	elr_el1, x21			// set up the return data
msr	spsr_el1, x22
ldp	x0, x1, [sp, #16 * 0]
ldp	x2, x3, [sp, #16 * 1]
ldp	x4, x5, [sp, #16 * 2]
ldp	x6, x7, [sp, #16 * 3]
ldp	x8, x9, [sp, #16 * 4]
ldp	x10, x11, [sp, #16 * 5]
ldp	x12, x13, [sp, #16 * 6]
ldp	x14, x15, [sp, #16 * 7]
ldp	x16, x17, [sp, #16 * 8]
ldp	x18, x19, [sp, #16 * 9]
ldp	x20, x21, [sp, #16 * 10]
ldp	x22, x23, [sp, #16 * 11]
ldp	x24, x25, [sp, #16 * 12]
ldp	x26, x27, [sp, #16 * 13]
ldp	x28, x29, [sp, #16 * 14]
ldr	lr, [sp, #S_LR]
add	sp, sp, #S_FRAME_SIZE		// restore sp

.if	\el == 0
alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
bne	5f
msr	far_el1, x30
tramp_alias	x30, tramp_exit_native
br	x30
5:
tramp_alias	x30, tramp_exit_compat
br	x30
#endif
.else
eret
.endif
sb
.endm

你可能感兴趣的:(linux)