本文主要介绍linux current宏在arm和arm64上的实现
内核版本:Linux 5.3
在linux 内核中,有一个current宏,通过该宏,可以获取当前进程的task_struct数据结构。在arm上,就是利用了内核栈特性来实现的,记录如下:
首先,如果没有在arch/xxx/include/asm/目录下没有定义该宏,则该宏定义在include/asm-generic/current.h文件中,如下所示:
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_CURRENT_H
#define __ASM_GENERIC_CURRENT_H
#include
#define get_current() (current_thread_info()->task)
#define current get_current()
#endif /* __ASM_GENERIC_CURRENT_H */
在arm32架构中,current就是定义在上述文件的,而在arm64中,在打上以下这个补丁之前,实现也是如arm32一样,但打上以下补丁后,current就定义在arch/arm64/include/asm/current.h这个文件中。
补丁信息如下所示:
From c02433dd6de32f042cf3ffe476746b1115b8c096 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 3 Nov 2016 20:23:13 +0000
Subject: [PATCH] arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
arch/arm64/Kconfig | 1 +
arch/arm64/include/asm/Kbuild | 1 -
arch/arm64/include/asm/current.h | 22 +++++++++++++++++++
arch/arm64/include/asm/smp.h | 1 +
arch/arm64/include/asm/thread_info.h | 24 ---------------------
arch/arm64/kernel/asm-offsets.c | 8 ++++---
arch/arm64/kernel/entry.S | 41 ++++++++++++++++++------------------
arch/arm64/kernel/head.S | 11 +++++-----
arch/arm64/kernel/process.c | 16 ++++++++++++++
arch/arm64/kernel/smp.c | 2 ++
10 files changed, 73 insertions(+), 54 deletions(-)
create mode 100644 arch/arm64/include/asm/current.h
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 77a807a..0b8227f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -109,6 +109,7 @@ config ARM64
select POWER_SUPPLY
select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
+ select THREAD_INFO_IN_TASK
help
ARM 64-bit (AArch64) Linux support.
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 44e1d7f..28196b1 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -1,7 +1,6 @@
generic-y += bugs.h
generic-y += clkdev.h
generic-y += cputime.h
-generic-y += current.h
generic-y += delay.h
generic-y += div64.h
generic-y += dma.h
diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h
new file mode 100644
index 00000000..f2bcbe2
--- /dev/null
+++ b/arch/arm64/include/asm/current.h
@@ -0,0 +1,22 @@
+#ifndef __ASM_CURRENT_H
+#define __ASM_CURRENT_H
+
+#include <linux/compiler.h>
+
+#include <asm/sysreg.h>
+
+#ifndef __ASSEMBLY__
+
+struct task_struct;
+
+static __always_inline struct task_struct *get_current(void)
+{
+ return (struct task_struct *)read_sysreg(sp_el0);
+}
+
+#define current get_current()
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_CURRENT_H */
+
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 968b08d..a62db95 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -82,6 +82,7 @@ asmlinkage void secondary_start_kernel(void);
*/
struct secondary_data {
void *stack;
+ struct task_struct *task;
long status;
};
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index bce0f07..c17ad4d 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -47,41 +47,17 @@ typedef unsigned long mm_segment_t;
struct thread_info {
unsigned long flags; /* low level flags */
mm_segment_t addr_limit; /* address limit */
- struct task_struct *task; /* main task structure */
int preempt_count; /* 0 => preemptable, <0 => bug */
- int cpu; /* cpu */
};
#define INIT_THREAD_INFO(tsk) \
{ \
- .task = &tsk, \
- .flags = 0, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
}
#define init_stack (init_thread_union.stack)
-/*
- * how to get the thread information struct from C
- */
-static inline struct thread_info *current_thread_info(void) __attribute_const__;
-
-/*
- * struct thread_info can be accessed directly via sp_el0.
- *
- * We don't use read_sysreg() as we want the compiler to cache the value where
- * possible.
- */
-static inline struct thread_info *current_thread_info(void)
-{
- unsigned long sp_el0;
-
- asm ("mrs %0, sp_el0" : "=r" (sp_el0));
-
- return (struct thread_info *)sp_el0;
-}
-
#define thread_saved_pc(tsk) \
((unsigned long)(tsk->thread.cpu_context.pc))
#define thread_saved_sp(tsk) \
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index d30b232..c2dc9fa 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -36,9 +36,10 @@ int main(void)
{
DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
BLANK();
- DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
- DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
- DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
+ DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
+ DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
+ DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit));
+ DEFINE(TSK_STACK, offsetof(struct task_struct, stack));
BLANK();
DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context));
BLANK();
@@ -121,6 +122,7 @@ int main(void)
DEFINE(TZ_DSTTIME, offsetof(struct timezone, tz_dsttime));
BLANK();
DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack));
+ DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task));
BLANK();
#ifdef CONFIG_KVM_ARM_HOST
DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt));
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2d4c83b..6349a83 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -90,9 +90,8 @@
.if \el == 0
mrs x21, sp_el0
- mov tsk, sp
- and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear,
- ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug
+ ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear,
+ ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug
disable_step_tsk x19, x20 // exceptions when scheduling.
mov x29, xzr // fp pointed to user-space
@@ -100,10 +99,10 @@
add x21, sp, #S_FRAME_SIZE
get_thread_info tsk
/* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
- ldr x20, [tsk, #TI_ADDR_LIMIT]
+ ldr x20, [tsk, #TSK_TI_ADDR_LIMIT]
str x20, [sp, #S_ORIG_ADDR_LIMIT]
mov x20, #TASK_SIZE_64
- str x20, [tsk, #TI_ADDR_LIMIT]
+ str x20, [tsk, #TSK_TI_ADDR_LIMIT]
/* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
.endif /* \el == 0 */
mrs x22, elr_el1
@@ -139,7 +138,7 @@
.if \el != 0
/* Restore the task's original addr_limit. */
ldr x20, [sp, #S_ORIG_ADDR_LIMIT]
- str x20, [tsk, #TI_ADDR_LIMIT]
+ str x20, [tsk, #TSK_TI_ADDR_LIMIT]
/* No need to restore UAO, it will be restored from SPSR_EL1 */
.endif
@@ -192,13 +191,14 @@ alternative_else_nop_endif
mov x19, sp // preserve the original sp
/*
- * Compare sp with the current thread_info, if the top
- * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and
- * should switch to the irq stack.
+ * Compare sp with the base of the task stack.
+ * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
+ * and should switch to the irq stack.
*/
- and x25, x19, #~(THREAD_SIZE - 1)
- cmp x25, tsk
- b.ne 9998f
+ ldr x25, [tsk, TSK_STACK]
+ eor x25, x25, x19
+ and x25, x25, #~(THREAD_SIZE - 1)
+ cbnz x25, 9998f
adr_this_cpu x25, irq_stack, x26
mov x26, #IRQ_STACK_START_SP
@@ -427,9 +427,9 @@ el1_irq:
irq_handler
#ifdef CONFIG_PREEMPT
- ldr w24, [tsk, #TI_PREEMPT] // get preempt count
+ ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count
cbnz w24, 1f // preempt count != 0
- ldr x0, [tsk, #TI_FLAGS] // get flags
+ ldr x0, [tsk, #TSK_TI_FLAGS] // get flags
tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
bl el1_preempt
1:
@@ -444,7 +444,7 @@ ENDPROC(el1_irq)
el1_preempt:
mov x24, lr
1: bl preempt_schedule_irq // irq en/disable is done inside
- ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS
+ ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS
tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
ret x24
#endif
@@ -674,8 +674,7 @@ ENTRY(cpu_switch_to)
ldp x29, x9, [x8], #16
ldr lr, [x8]
mov sp, x9
- and x9, x9, #~(THREAD_SIZE - 1)
- msr sp_el0, x9
+ msr sp_el0, x1
ret
ENDPROC(cpu_switch_to)
@@ -686,7 +685,7 @@ ENDPROC(cpu_switch_to)
ret_fast_syscall:
disable_irq // disable interrupts
str x0, [sp, #S_X0] // returned x0
- ldr x1, [tsk, #TI_FLAGS] // re-check for syscall tracing
+ ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
and x2, x1, #_TIF_SYSCALL_WORK
cbnz x2, ret_fast_syscall_trace
and x2, x1, #_TIF_WORK_MASK
@@ -706,14 +705,14 @@ work_pending:
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_on // enabled while in userspace
#endif
- ldr x1, [tsk, #TI_FLAGS] // re-check for single-step
+ ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step
b finish_ret_to_user
/*
* "slow" syscall return path.
*/
ret_to_user:
disable_irq // disable interrupts
- ldr x1, [tsk, #TI_FLAGS]
+ ldr x1, [tsk, #TSK_TI_FLAGS]
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
finish_ret_to_user:
@@ -746,7 +745,7 @@ el0_svc_naked: // compat entry point
enable_dbg_and_irq
ct_user_exit 1
- ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks
+ ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks
tst x16, #_TIF_SYSCALL_WORK
b.ne __sys_trace
cmp scno, sc_nr // check upper syscall limit
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 332e331..eaafb25 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -428,7 +428,8 @@ ENDPROC(__create_page_tables)
__primary_switched:
adrp x4, init_thread_union
add sp, x4, #THREAD_SIZE
- msr sp_el0, x4 // Save thread_info
+ adr_l x5, init_task
+ msr sp_el0, x5 // Save thread_info
adr_l x8, vectors // load VBAR_EL1 with virtual
msr vbar_el1, x8 // vector table address
@@ -699,10 +700,10 @@ __secondary_switched:
isb
adr_l x0, secondary_data
- ldr x0, [x0, #CPU_BOOT_STACK] // get secondary_data.stack
- mov sp, x0
- and x0, x0, #~(THREAD_SIZE - 1)
- msr sp_el0, x0 // save thread_info
+ ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack
+ mov sp, x1
+ ldr x2, [x0, #CPU_BOOT_TASK]
+ msr sp_el0, x2
mov x29, #0
b secondary_start_kernel
ENDPROC(__secondary_switched)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index ec7b9c0..a98b743 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -45,6 +45,7 @@
#include
#include
#include
+#include <linux/percpu.h>
#include
#include
@@ -322,6 +323,20 @@ void uao_thread_switch(struct task_struct *next)
}
/*
+ * We store our current task in sp_el0, which is clobbered by userspace. Keep a
+ * shadow copy so that we can restore this upon entry from userspace.
+ *
+ * This is *only* for exception entry from EL0, and is not valid until we
+ * __switch_to() a user task.
+ */
+DEFINE_PER_CPU(struct task_struct *, __entry_task);
+
+static void entry_task_switch(struct task_struct *next)
+{
+ __this_cpu_write(__entry_task, next);
+}
+
+/*
* Thread switching.
*/
struct task_struct *__switch_to(struct task_struct *prev,
@@ -333,6 +348,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
tls_thread_switch(next);
hw_breakpoint_thread_switch(next);
contextidr_thread_switch(next);
+ entry_task_switch(next);
uao_thread_switch(next);
/*
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 6f42c68..cb87234 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -149,6 +149,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
* We need to tell the secondary core where to find its stack and the
* page tables.
*/
+ secondary_data.task = idle;
secondary_data.stack = task_stack_page(idle) + THREAD_START_SP;
update_cpu_boot_status(CPU_MMU_OFF);
__flush_dcache_area(&secondary_data, sizeof(secondary_data));
@@ -173,6 +174,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
pr_err("CPU%u: failed to boot: %d\n", cpu, ret);
}
+ secondary_data.task = NULL;
secondary_data.stack = NULL;
status = READ_ONCE(secondary_data.status);
if (ret && status) {
--
2.7.4
打上以上补丁后,arm64上current的实现如下:
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_CURRENT_H
#define __ASM_CURRENT_H
#include
#ifndef __ASSEMBLY__
struct task_struct;
/*
* We don't use read_sysreg() as we want the compiler to cache the value where
* possible.
*/
static __always_inline struct task_struct *get_current(void)
{
unsigned long sp_el0;
asm ("mrs %0, sp_el0" : "=r" (sp_el0));
return (struct task_struct *)sp_el0;
}
#define current get_current()
#endif /* __ASSEMBLY__ */
#endif /* __ASM_CURRENT_H */
在arm32 cpu上,当内核发生panic时,我们通常可以在串口上看到如下打印:
/ # echo c > /proc/sysrq-trigger
sysrq: SysRq : Trigger a crash
Unhandled fault: page domain fault (0x81b) at 0x00000000
pgd = be484000
[00000000] *pgd=9e478831, *pte=00000000, *ppte=00000000
Internal error: : 81b [#1] SMP ARM
Modules linked in:
CPU: 0 PID: 739 Comm: sh Tainted: G B 4.9.191-rc1+ #9
Hardware name: ARM-Versatile Express
task: be9f2200 task.stack: be7ca000
PC is at sysrq_handle_crash+0x30/0x38
LR is at arm_heavy_mb+0x2c/0x48
pc : [<8044e580>] lr : [<801171b8>] psr: 60000013
sp : be7cbe58 ip : be7cbe48 fp : be7cbe6c
r10: 00000004 r9 : 00000000 r8 : 80b2f4c4
r7 : 00000000 r6 : 00000063 r5 : 00000008 r4 : 00000001
r3 : 00000000 r2 : 00000730 r1 : 00000000 r0 : 00000063
Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none
Control: 10c5387d Table: 9e48406a DAC: 00000051
Process sh (pid: 739, stack limit = 0xbe7ca210)
Stack: (0xbe7cbe58 to 0xbe7cc000)
be40: 80b0d680 00000008
be60: be7cbe9c be7cbe70 8044eb2c 8044e55c 00000055 00000002 00000000 00000000
be80: be90f9c0 00000000 00000000 00000004 be7cbeb4 be7cbea0 8044f090 8044ea84
bea0: 8044f02c 00000000 be7cbed4 be7cbeb8 802a2418 8044f038 be9a7e00 802a23ac
bec0: be7cbf78 00000002 be7cbf44 be7cbed8 8024439c 802a23b8 00000017 801166fc
bee0: 000a55b8 be7cbfb0 001bb888 7ed6ffbe be7cbfac be7cbf00 8010136c 80116708
bf00: be9f21f8 be7cbf50 be7cbf4c be7cbf18 80126644 8024525c 8024276c fffffff6
bf20: 00000002 be9a7e00 00218ab8 be7cbf78 00000000 00000000 be7cbf74 be7cbf48
bf40: 8024529c 8024436c 80126d10 80265780 be9a7e00 be9a7e00 00218ab8 00000002
bf60: 00000000 00000000 be7cbfa4 be7cbf78 80246188 802451f4 00000000 00000000
bf80: 00214b24 00000001 00218ab8 00000004 801080e4 be7ca000 00000000 be7cbfa8
bfa0: 80107f00 80246138 00214b24 00000001 00000001 00218ab8 00000002 00000000
bfc0: 00214b24 00000001 00218ab8 00000004 00000001 00218ab8 00000020 00217ab8
bfe0: 00000000 7ed6f47c 000135d8 00143f1c 60000010 00000001 00000000 00000000
[<8044e580>] (sysrq_handle_crash) from [<8044eb2c>] (__handle_sysrq+0xb4/0x184)
[<8044eb2c>] (__handle_sysrq) from [<8044f090>] (write_sysrq_trigger+0x64/0x74)
[<8044f090>] (write_sysrq_trigger) from [<802a2418>] (proc_reg_write+0x6c/0x94)
[<802a2418>] (proc_reg_write) from [<8024439c>] (__vfs_write+0x3c/0x128)
[<8024439c>] (__vfs_write) from [<8024529c>] (vfs_write+0xb4/0x178)
[<8024529c>] (vfs_write) from [<80246188>] (SyS_write+0x5c/0xd0)
[<80246188>] (SyS_write) from [<80107f00>] (ret_fast_syscall+0x0/0x1c)
Code: e5834000 f57ff04e ebf32303 e3a03000 (e5c34000)
---[ end trace a875aa28f6df8b07 ]---
Kernel panic - not syncing: Fatal exception
在panic时,我们可以看到串口上有打印panic时,指向进程task_struct的地址,进程内核栈的地址,pc寄存器,sp寄存器,lr寄存器等信息。关于进程task_struct的地址,就是通过sp寄存器的值算出来的,计算的源码实现分析如下:
首先,因为每个进程,都会分配相应的内核栈,因此根据sp寄存器的值,进行一个内核栈大小的对齐,就能获取到当前进程的thread_info结构信息,如下:
源码路径:arch/arm/include/asm/thread_info.h
#define THREAD_SIZE_ORDER 1
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define THREAD_START_SP (THREAD_SIZE - 8)
/*
* how to get the current stack pointer in C
*/
register unsigned long current_stack_pointer asm ("sp");
/*
* how to get the thread information struct from C
*/
static inline struct thread_info *current_thread_info(void) __attribute_const__;
static inline struct thread_info *current_thread_info(void)
{
return (struct thread_info *)
(current_stack_pointer & ~(THREAD_SIZE - 1));
}
PAGE_SIZE定义如下:arch/arm/include/asm/page.h
/* PAGE_SHIFT determines the page size */
#define PAGE_SHIFT 12
#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK (~((1 << PAGE_SHIFT) - 1))
由上,可以计算出,在arm32 linux系统中,没有特殊配置下,内核栈大小是8k。
struct thread_info和内核栈通过union thread_union这个数据结构进行关联,thread_union这个类型描述了整个内核栈statck[],该数据结构如下:
union thread_union定义在:include/linux/sched.h
union thread_union {
#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
struct task_struct task;
#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
#endif
unsigned long stack[THREAD_SIZE/sizeof(long)];
};
在arm32架构中,目前是不支持CONFIG_THREAD_INFO_IN_TASK,当通过sp寄存器进行一个内核栈大小的对齐操作,就可以得到struct thread_info的地址,而thread_info就保存了特定体系结构的汇编代码段需要访问的那部分进程的数据,我们在thread_info中嵌入指向task_struct的指针, 则我们可以很方便的通过thread_info来查找task_struct,arm32的thread_info定义如下:
thread_info定义在:arch/arm/include/asm/thread_info.h
/*
* low level task data that entry.S needs immediate access to.
* __switch_to() assumes cpu_context follows immediately after cpu_domain.
*/
struct thread_info {
unsigned long flags; /* low level flags */
int preempt_count; /* 0 => preemptable, <0 => bug */
mm_segment_t addr_limit; /* address limit */
struct task_struct *task; /* main task structure */
__u32 cpu; /* cpu */
__u32 cpu_domain; /* cpu domain */
#ifdef CONFIG_STACKPROTECTOR_PER_TASK
unsigned long stack_canary;
#endif
struct cpu_context_save cpu_context; /* cpu context */
__u32 syscall; /* syscall number */
__u8 used_cp[16]; /* thread used copro */
unsigned long tp_value[2]; /* TLS registers */
#ifdef CONFIG_CRUNCH
struct crunch_state crunchstate;
#endif
union fp_state fpstate __attribute__((aligned(8)));
union vfp_state vfpstate;
#ifdef CONFIG_ARM_THUMBEE
unsigned long thumbee_state; /* ThumbEE Handler Base register */
#endif
};
当打上c02433dd6de32f042cf3ffe476746b1115b8c096这个补丁时,arm64 current宏的实现如上文所述,此时,进入内核时,通过sp_el0寄存器拿到struct task_struct的地址。
sp_el0保存的struct task_struct的地址应该是在用户态通过系统调用进入内核态时,保存起来的,因为内核态使用的sp寄存器应该是sp_el1,sp_el0在内核态刚好没用到,因此用来保存
struct task_struct的地址,分析一下arm64系统调用的实现,如下:
首先,从glibc的源码确认aarch64是如何陷入内核态(切换异常等级)的,源码如下所示:
陷入内核态的指令:sysdeps/unix/sysv/linux/aarch64/sysdep.h
# define DO_CALL(syscall_name, args) \
mov x8, SYS_ify (syscall_name); \
svc 0
从代码可以确认,把系统调用号保存到x8寄存器,并且通过svc 0这条指令陷入到内核态,svc指令含义如下图所示:
通过svc指令从el0陷入到内核态el1之后,会跳转到内核建立的异常向量表中,内核异常向量表如下所示:
/*
* EL0 mode handlers.
*/
.align 6
el0_sync:
kernel_entry 0
mrs x25, esr_el1 // read the syndrome register
lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state
b.eq el0_svc
cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0
b.eq el0_da
cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
b.eq el0_ia
cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access
b.eq el0_fpsimd_acc
cmp x24, #ESR_ELx_EC_SVE // SVE access
b.eq el0_sve_acc
cmp x24, #ESR_ELx_EC_FP_EXC64 // FP/ASIMD exception
b.eq el0_fpsimd_exc
cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
ccmp x24, #ESR_ELx_EC_WFx, #4, ne
b.eq el0_sys
cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception
b.eq el0_sp
cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
b.eq el0_pc
cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0
b.eq el0_undef
cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0
b.ge el0_dbg
b el0_inv
#ifdef CONFIG_COMPAT
.align 6
el0_sync_compat:
kernel_entry 0, 32
mrs x25, esr_el1 // read the syndrome register
lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
cmp x24, #ESR_ELx_EC_SVC32 // SVC in 32-bit state
b.eq el0_svc_compat
cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0
b.eq el0_da
cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
b.eq el0_ia
cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access
b.eq el0_fpsimd_acc
cmp x24, #ESR_ELx_EC_FP_EXC32 // FP/ASIMD exception
b.eq el0_fpsimd_exc
cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
b.eq el0_pc
cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0
b.eq el0_undef
cmp x24, #ESR_ELx_EC_CP15_32 // CP15 MRC/MCR trap
b.eq el0_cp15
cmp x24, #ESR_ELx_EC_CP15_64 // CP15 MRRC/MCRR trap
b.eq el0_cp15
cmp x24, #ESR_ELx_EC_CP14_MR // CP14 MRC/MCR trap
b.eq el0_undef
cmp x24, #ESR_ELx_EC_CP14_LS // CP14 LDC/STC trap
b.eq el0_undef
cmp x24, #ESR_ELx_EC_CP14_64 // CP14 MRRC/MCRR trap
b.eq el0_undef
cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0
b.ge el0_dbg
b el0_inv
el0_svc_compat:
mov x0, sp
bl el0_svc_compat_handler
b ret_to_user
.align 6
el0_irq_compat:
kernel_entry 0, 32
b el0_irq_naked
el0_error_compat:
kernel_entry 0, 32
b el0_error_naked
el0_cp15:
/*
* Trapped CP15 (MRC, MCR, MRRC, MCRR) instructions
*/
enable_daif
ct_user_exit
mov x0, x25
mov x1, sp
bl do_cp15instr
b ret_to_user
#endif
/*
* SVC handler.
*/
.align 6
el0_svc:
gic_prio_kentry_setup tmp=x1
mov x0, sp
bl el0_svc_handler
b ret_to_user
ENDPROC(el0_svc)
/*
* Exception vectors.
*/
.pushsection ".entry.text", "ax"
.align 11
ENTRY(vectors)
kernel_ventry 1, sync_invalid // Synchronous EL1t
kernel_ventry 1, irq_invalid // IRQ EL1t
kernel_ventry 1, fiq_invalid // FIQ EL1t
kernel_ventry 1, error_invalid // Error EL1t
kernel_ventry 1, sync // Synchronous EL1h
kernel_ventry 1, irq // IRQ EL1h
kernel_ventry 1, fiq_invalid // FIQ EL1h
kernel_ventry 1, error // Error EL1h
kernel_ventry 0, sync // Synchronous 64-bit EL0
kernel_ventry 0, irq // IRQ 64-bit EL0
kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0
kernel_ventry 0, error // Error 64-bit EL0
#ifdef CONFIG_COMPAT
kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0
kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0
kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0
kernel_ventry 0, error_compat, 32 // Error 32-bit EL0
#else
kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0
kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0
kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0
kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0
#endif
END(vectors)
从异常向量表我们知道以下几点:
从linux实现的异常向量表来看,通过svc指令产生的异常属于Synchronous异常,根据陷入el1 Synchronous异常之前的异常模式是AARCH32模式还是AARCH64模式,跳转到el0_sync/el0_sync_compat/el0_sync_invalid入口,当没有打开CONFIG_COMPAT,如果陷入el1 Synchronous异常之前的异常模式是AARCH32,那就会跳转el0_sync_invalid入口,否置会跳转到el0_sync_compat入口。当跳转到el0_sync/el0_sync_compat时,第一句指向的便是kernel_entry这个宏对应的代码,保存el0_sp这些寄存器的值到栈中,就是在这里实现的,代码如下所示:
.macro kernel_entry, el, regsize = 64
.if \regsize == 32
mov w0, w0 // zero upper 32 bits of x0
.endif
stp x0, x1, [sp, #16 * 0]
stp x2, x3, [sp, #16 * 1]
stp x4, x5, [sp, #16 * 2]
stp x6, x7, [sp, #16 * 3]
stp x8, x9, [sp, #16 * 4]
stp x10, x11, [sp, #16 * 5]
stp x12, x13, [sp, #16 * 6]
stp x14, x15, [sp, #16 * 7]
stp x16, x17, [sp, #16 * 8]
stp x18, x19, [sp, #16 * 9]
stp x20, x21, [sp, #16 * 10]
stp x22, x23, [sp, #16 * 11]
stp x24, x25, [sp, #16 * 12]
stp x26, x27, [sp, #16 * 13]
stp x28, x29, [sp, #16 * 14]
.if \el == 0
clear_gp_regs
mrs x21, sp_el0
ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear,
ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug
disable_step_tsk x19, x20 // exceptions when scheduling.
apply_ssbd 1, x22, x23
.else
add x21, sp, #S_FRAME_SIZE
get_current_task tsk
/* Save the task's original addr_limit and set USER_DS */
ldr x20, [tsk, #TSK_TI_ADDR_LIMIT]
str x20, [sp, #S_ORIG_ADDR_LIMIT]
mov x20, #USER_DS
str x20, [tsk, #TSK_TI_ADDR_LIMIT]
/* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
.endif /* \el == 0 */
mrs x22, elr_el1
mrs x23, spsr_el1
stp lr, x21, [sp, #S_LR]
/*
* In order to be able to dump the contents of struct pt_regs at the
* time the exception was taken (in case we attempt to walk the call
* stack later), chain it together with the stack frames.
*/
.if \el == 0
stp xzr, xzr, [sp, #S_STACKFRAME]
.else
stp x29, x22, [sp, #S_STACKFRAME]
.endif
add x29, sp, #S_STACKFRAME
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
* Set the TTBR0 PAN bit in SPSR. When the exception is taken from
* EL0, there is no need to check the state of TTBR0_EL1 since
* accesses are always enabled.
* Note that the meaning of this bit differs from the ARMv8.1 PAN
* feature as all TTBR0_EL1 accesses are disabled, not just those to
* user mappings.
*/
alternative_if ARM64_HAS_PAN
b 1f // skip TTBR0 PAN
alternative_else_nop_endif
.if \el != 0
mrs x21, ttbr0_el1
tst x21, #TTBR_ASID_MASK // Check for the reserved ASID
orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR
b.eq 1f // TTBR0 access already disabled
and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR
.endif
__uaccess_ttbr0_disable x21
1:
#endif
stp x22, x23, [sp, #S_PC]
/* Not in a syscall by default (el0_svc overwrites for real syscall) */
.if \el == 0
mov w21, #NO_SYSCALL
str w21, [sp, #S_SYSCALLNO]
.endif
/*
* Set sp_el0 to current thread_info.
*/
.if \el == 0
msr sp_el0, tsk
.endif
/* Save pmr */
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
mrs_s x20, SYS_ICC_PMR_EL1
str x20, [sp, #S_PMR_SAVE]
alternative_else_nop_endif
/*
* Registers that may be useful after this macro is invoked:
*
* x20 - ICC_PMR_EL1
* x21 - aborted SP
* x22 - aborted PC
* x23 - aborted PSTATE
*/
.endm
当异常处理完毕时,都会跳转到ret_to_user函数进行上下文环境的恢复工作,恢复el0_sp寄存器的操作也在该函数中,如下所示:
/*
* "slow" syscall return path.
*/
ret_to_user:
disable_daif
gic_prio_kentry_setup tmp=x3
ldr x1, [tsk, #TSK_TI_FLAGS]
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
finish_ret_to_user:
enable_step_tsk x1, x2
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
bl stackleak_erase
#endif
kernel_exit 0
ENDPROC(ret_to_user)
.macro kernel_exit, el
.if \el != 0
disable_daif
/* Restore the task's original addr_limit. */
ldr x20, [sp, #S_ORIG_ADDR_LIMIT]
str x20, [tsk, #TSK_TI_ADDR_LIMIT]
/* No need to restore UAO, it will be restored from SPSR_EL1 */
.endif
/* Restore pmr */
alternative_if ARM64_HAS_IRQ_PRIO_MASKING
ldr x20, [sp, #S_PMR_SAVE]
msr_s SYS_ICC_PMR_EL1, x20
/* Ensure priority change is seen by redistributor */
dsb sy
alternative_else_nop_endif
ldp x21, x22, [sp, #S_PC] // load ELR, SPSR
.if \el == 0
ct_user_enter
.endif
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
* Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
* PAN bit checking.
*/
alternative_if ARM64_HAS_PAN
b 2f // skip TTBR0 PAN
alternative_else_nop_endif
.if \el != 0
tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
.endif
__uaccess_ttbr0_enable x0, x1
.if \el == 0
/*
* Enable errata workarounds only if returning to user. The only
* workaround currently required for TTBR0_EL1 changes are for the
* Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
* corruption).
*/
bl post_ttbr_update_workaround
.endif
1:
.if \el != 0
and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit
.endif
2:
#endif
.if \el == 0
ldr x23, [sp, #S_SP] // load return stack pointer
msr sp_el0, x23
tst x22, #PSR_MODE32_BIT // native task?
b.eq 3f
#ifdef CONFIG_ARM64_ERRATUM_845719
alternative_if ARM64_WORKAROUND_845719
#ifdef CONFIG_PID_IN_CONTEXTIDR
mrs x29, contextidr_el1
msr contextidr_el1, x29
#else
msr contextidr_el1, xzr
#endif
alternative_else_nop_endif
#endif
3:
#ifdef CONFIG_ARM64_ERRATUM_1418040
alternative_if_not ARM64_WORKAROUND_1418040
b 4f
alternative_else_nop_endif
/*
* if (x22.mode32 == cntkctl_el1.el0vcten)
* cntkctl_el1.el0vcten = ~cntkctl_el1.el0vcten
*/
mrs x1, cntkctl_el1
eon x0, x1, x22, lsr #3
tbz x0, #1, 4f
eor x1, x1, #2 // ARCH_TIMER_USR_VCT_ACCESS_EN
msr cntkctl_el1, x1
4:
#endif
apply_ssbd 0, x0, x1
.endif
msr elr_el1, x21 // set up the return data
msr spsr_el1, x22
ldp x0, x1, [sp, #16 * 0]
ldp x2, x3, [sp, #16 * 1]
ldp x4, x5, [sp, #16 * 2]
ldp x6, x7, [sp, #16 * 3]
ldp x8, x9, [sp, #16 * 4]
ldp x10, x11, [sp, #16 * 5]
ldp x12, x13, [sp, #16 * 6]
ldp x14, x15, [sp, #16 * 7]
ldp x16, x17, [sp, #16 * 8]
ldp x18, x19, [sp, #16 * 9]
ldp x20, x21, [sp, #16 * 10]
ldp x22, x23, [sp, #16 * 11]
ldp x24, x25, [sp, #16 * 12]
ldp x26, x27, [sp, #16 * 13]
ldp x28, x29, [sp, #16 * 14]
ldr lr, [sp, #S_LR]
add sp, sp, #S_FRAME_SIZE // restore sp
.if \el == 0
alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
bne 5f
msr far_el1, x30
tramp_alias x30, tramp_exit_native
br x30
5:
tramp_alias x30, tramp_exit_compat
br x30
#endif
.else
eret
.endif
sb
.endm