进程和线程是操作系统中基础且重要的机制,从源码角度理解进程和线程的区别,对于理解操作系统的基本原理非常有帮助;同时进程和线程的创建又是通过系统调用实现的,两者结合起来理解可以融会贯通。
问题:
arm32系统调用通过swi(svc)指令触发,系统调用号存放在r7寄存器中,进程和线程创建的基本流程总结如下:
3.1 线程创建pthread_create
代码路径:bionic/libc/bionic/pthread_create.cpp
// pthread_create (excerpt): allocates the bookkeeping structure and stack for a
// new thread, records the entry routine and its argument, then starts the thread
// with clone(). Lines consisting of "..." mark code elided from this excerpt.
//
// Returns the non-zero result of __allocate_thread() if allocation fails;
// the excerpt's final path returns 0.
__BIONIC_WEAK_FOR_NATIVE_BRIDGE
int pthread_create(pthread_t* thread_out, pthread_attr_t const* attr,
                   void* (*start_routine)(void*), void* arg) {
  ErrnoRestorer errno_restorer;
  pthread_attr_t thread_attr;
  ...
  pthread_internal_t* thread = NULL;
  void* child_stack = NULL;
  // Fills in both `thread` and `child_stack` on success.
  int result = __allocate_thread(&thread_attr, &thread, &child_stack);
  if (result != 0) {
    return result;
  }
  // The handshake lock is taken before the thread exists.
  // NOTE(review): presumably released in the elided code once the creator has
  // finished setting the thread up -- confirm against the full source.
  thread->startup_handshake_lock.init(false);
  thread->startup_handshake_lock.lock();
  thread->start_routine = start_routine;
  thread->start_routine_arg = arg;
  thread->set_cached_pid(getpid());
  // Unlike fork(), every major resource is shared with the creator (CLONE_VM,
  // CLONE_FS, CLONE_FILES, CLONE_SIGHAND) and the new task joins the creator's
  // thread group (CLONE_THREAD); see clone(2) for the individual flags.
  int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
      CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID;
  // The target type of the cast had been lost from the excerpt; clone() takes
  // the TLS pointer as a void*.
  void* tls = reinterpret_cast<void*>(thread->tls);
  // Entry point is __pthread_start with `thread` as its argument; thread->tid
  // is passed both as the parent-tid and the child-tid location.
  int rc = clone(__pthread_start, child_stack, flags, thread, &(thread->tid), tls, &(thread->tid));
  ...
  return 0;
}
3.2 进程创建fork
// fork(): runs the atfork "prepare" handlers, duplicates the calling process
// through clone() with no entry function and no separate child stack, then runs
// the "child" handlers in the new process or the "parent" handlers otherwise.
// Returns whatever clone() returned.
__BIONIC_WEAK_FOR_NATIVE_BRIDGE
int fork() {
  __bionic_atfork_run_prepare();

  pthread_internal_t* current = __get_thread();
  const int pid = clone(nullptr,
                        nullptr,
                        (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD),
                        nullptr,
                        nullptr,
                        nullptr,
                        &(current->tid));

  if (pid != 0) {
    // Any non-zero result (including a failed clone) takes the parent path.
    __bionic_atfork_run_parent();
    return pid;
  }

  // In the child the kernel has already written the new tid into current->tid,
  // but the cached pid is not updated by clone(), so refresh it here.
  current->set_cached_pid(gettid());
  __bionic_atfork_run_child();
  return pid;
}
进程创建比线程创建相对简单:调用clone时,第一个参数(入口函数fn)和第二个参数(child_stack)都为nullptr,flags只指定了CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD。与pthread_create相比,这里没有指定CLONE_VM、CLONE_FILES等共享标志,因此子进程不与父进程共享地址空间等资源。
3.3 clone函数
// clone() wrapper (excerpt; "..." marks elided code, which is where parent_tid,
// new_tls and child_tid come from -- presumably extracted from the variadic
// arguments; confirm against the full source).
//
// fn          entry function for the child, or nullptr (as fork() passes).
// child_stack stack for the child, or nullptr.
// flags       clone flags.
// arg         argument handed to fn.
// Returns the result of the underlying clone call.
__BIONIC_WEAK_FOR_NATIVE_BRIDGE
int clone(int (*fn)(void*), void* child_stack, int flags, void* arg, ...) {
...
// Actually do the clone.
int clone_result;
if (fn != nullptr) {
// An entry function was supplied: use the assembly helper, which also receives
// fn and arg so the child can call fn(arg).
clone_result = __bionic_clone(flags, child_stack, parent_tid, new_tls, child_tid, fn, arg);
} else {
// No entry function: issue the raw system call; parent and child both simply
// return from it.
#if defined(__x86_64__) // sys_clone's last two arguments are flipped on x86-64.
clone_result = syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, new_tls);
#else
clone_result = syscall(__NR_clone, flags, child_stack, parent_tid, new_tls, child_tid);
#endif
}
...
return clone_result;
}
3.4 __bionic_clone和syscall函数
__bionic_clone函数:
# __bionic_clone(flags, child_stack, parent_tid, new_tls, child_tid, fn, arg)
# Issues the clone system call. The parent returns the syscall result (or goes
# through __set_errno_internal on error); the child never returns here and
# instead jumps to __start_thread(fn, arg).
# On entry r0-r3 hold the first four arguments (r1 = child_stack) and the
# remaining three (child_tid, fn, arg) are on the caller's stack.
ENTRY_PRIVATE(__bionic_clone)
# Remember the incoming sp: the push below moves sp, and ip is used afterwards
# to reach the stack-passed arguments.
mov ip, sp
# save registers to parent stack
stmfd sp!, {r4, r5, r6, r7}
.cfi_def_cfa_offset 16
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
.cfi_rel_offset r6, 8
.cfi_rel_offset r7, 12
# load extra parameters
# (r4 = child_tid, r5 = fn, r6 = arg)
ldmfd ip, {r4, r5, r6}
# Push 'fn' and 'arg' onto the child stack.
# r1 is decremented (writeback), so the child's initial sp points at them.
stmdb r1!, {r5, r6}
# Make the system call.
ldr r7, =__NR_clone
swi #0
# Are we the child?
# movs sets the Z flag when r0 == 0, which is the value the child sees.
movs r0, r0
beq .L_bc_child
# In the parent, reload saved registers then either return or set errno.
ldmfd sp!, {r4, r5, r6, r7}
# Return directly unless r0 lies in the error range (-MAX_ERRNO..-1);
# otherwise negate it and tail-call __set_errno_internal.
cmn r0, #(MAX_ERRNO + 1)
bxls lr
neg r0, r0
b __set_errno_internal
.L_bc_child:
# Setting lr to 0 will make the unwinder stop at __start_thread.
mov lr, #0
# Call __start_thread with the 'fn' and 'arg' we stored on the child stack.
# This works because the child's sp is the child stack at this point
# (copy_thread sets ARM_sp = stack_start).
pop {r0, r1} @ pop fn and arg off the thread stack into r0 and r1, i.e. pass them as the two arguments of __start_thread
b __start_thread
END(__bionic_clone)
注意:由于系统调用会从用户态切换到内核态,而用户态和内核态使用的栈不同,所以无法使用栈传参,只能使用寄存器传参;arm32上系统调用可以使用r0-r6这7个寄存器传参。
问题:child的返回值为什么是0?
/*
 * copy_thread (excerpt; "..." marks elided code): prepares the saved register
 * state of the new task p.
 *
 * clone_flags: flags of the clone request (not used in the visible part).
 * stack_start: user stack pointer requested for the child, or 0.
 * stk_sz:      stored in cpu_context.r4 for kernel threads.
 * p:           the task being created.
 */
int
copy_thread(unsigned long clone_flags, unsigned long stack_start,
unsigned long stk_sz, struct task_struct *p)
{
struct thread_info *thread = task_thread_info(p);
struct pt_regs *childregs = task_pt_regs(p);
/* Start from an all-zero saved kernel context for the new task. */
memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));
#ifdef CONFIG_CPU_USE_DOMAINS
/*
* Copy the initial value of the domain access control register
* from the current thread: thread->addr_limit will have been
* copied from the current thread via setup_thread_stack() in
* kernel/fork.c
*/
thread->cpu_domain = get_domain();
#endif
// Not a kernel thread (ordinary user task)
if (likely(!(p->flags & PF_KTHREAD))) {
/* The child starts from a copy of the caller's saved registers... */
*childregs = *current_pt_regs();
/* ...except r0, so the child observes 0 as the system call's return value. */
childregs->ARM_r0 = 0;
/* If a stack was supplied, the child's sp is pointed at it. */
if (stack_start)
childregs->ARM_sp = stack_start;
} else {
// Kernel thread
memset(childregs, 0, sizeof(struct pt_regs));
thread->cpu_context.r4 = stk_sz;
thread->cpu_context.r5 = stack_start;
childregs->ARM_cpsr = SVC_MODE;
}
...
}
childregs->ARM_sp = stack_start设置了线程的sp寄存器。同时可以看到childregs->ARM_r0 = 0,这就是子进程为什么返回值是0的原因。
问题: pop {r0, r1} @从线程栈中取fn和arg参数放入r0,r1,怎么实现的?
pop能将fn和arg值出栈写入r0和r1寄存器的前提是,从系统调用返回执行到L_bc_child这个分支时,cpu的sp寄存器指向了线程栈,这是哪里设置的呢?copy_thread的childregs->ARM_sp = stack_start,图示如下:
syscall函数:
@ syscall(number, ...): generic system call stub. Moves the syscall number into
@ r7 and shifts the remaining C arguments down into r0-r6 before trapping.
ENTRY(syscall)
mov ip, sp @ stash sp in ip, because the push below modifies sp
stmfd sp!, {r4, r5, r6, r7} @ push r4-r7 onto the stack
.cfi_def_cfa_offset 16
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
.cfi_rel_offset r6, 8
.cfi_rel_offset r7, 12
mov r7, r0 @ the syscall number goes into r7
mov r0, r1 @ first syscall argument is passed in r0
mov r1, r2 @ second syscall argument is passed in r1
mov r2, r3 @ third syscall argument is passed in r2
ldmfd ip, {r3, r4, r5, r6} @ load the 5th-8th arguments of syscall() from the caller's stack into r3-r6
swi #0 @ r0-r7 are all set up now; enter the system call
ldmfd sp!, {r4, r5, r6, r7}
.cfi_def_cfa_offset 0
@ Return directly unless r0 lies in the error range (-MAX_ERRNO..-1);
@ otherwise negate it and tail-call __set_errno_internal.
cmn r0, #(MAX_ERRNO + 1)
bxls lr
neg r0, r0
b __set_errno_internal
END(syscall)
调用swi指令cpu会切换模式,由于两个模式之间使用的sp不同,所以系统调用参数只能使用寄存器。不同的cpu架构使用的寄存器不同:
目前默认的内核配置是arm/EABI,即使用r7传递系统调用号、r0-r6传递参数。
3.5 中断向量表
swi调用进入系统调用,进入vector_swi代码流程:
@ vector_swi: kernel entry for the swi instruction. Saves the caller's registers
@ into a PT_REGS_SIZE frame on the stack, obtains the system call number in
@ scno, and dispatches through sys_call_table with invoke_syscall.
@ scno, tbl, tsk, why, saved_psr and saved_pc are register aliases defined
@ outside this excerpt.
.align 5
ENTRY(vector_swi)
@ Reserve the register frame and fill it in: r0-r12, then the caller's sp/lr,
@ pc, psr and the original r0.
sub sp, sp, #PT_REGS_SIZE
stmia sp, {r0 - r12} @ Calling r0 - r12
ARM( add r8, sp, #S_PC )
ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr
THUMB( mov r8, sp )
THUMB( store_user_sp_lr r8, r10, S_SP ) @ calling sp, lr
mrs saved_psr, spsr @ called from non-FIQ mode, so ok.
TRACE( mov saved_pc, lr )
str saved_pc, [sp, #S_PC] @ Save calling PC
str saved_psr, [sp, #S_PSR] @ Save CPSR
str r0, [sp, #S_OLD_R0] @ Save OLD_R0
zero_fp
alignment_trap r10, ip, __cr_alignment
asm_trace_hardirqs_on save=0
enable_irq_notrace
ct_user_exit save=0
/*
* Get the system call number.
*/
#if defined(CONFIG_OABI_COMPAT)
/*
* If we have CONFIG_OABI_COMPAT then we need to look at the swi
* value to determine if it is an EABI or an old ABI call.
*/
@ NOTE(review): only the endianness fix-up of this branch is shown; the code
@ that loads r10 appears to be elided from the excerpt -- confirm upstream.
ARM_BE8(rev r10, r10) @ little endian instruction
#elif defined(CONFIG_AEABI)
/*
* Pure EABI user space always put syscall number into scno (r7).
*/
#elif defined(CONFIG_ARM_THUMB)
/* Legacy ABI only, possibly thumb mode. */
tst saved_psr, #PSR_T_BIT @ this is SPSR from save_user_regs
addne scno, r7, #__NR_SYSCALL_BASE @ put OS number in
USER( ldreq scno, [saved_pc, #-4] )
#else
/* Legacy ABI only. */
USER( ldr scno, [saved_pc, #-4] ) @ get SWI instruction
#endif
/* saved_psr and saved_pc are now dead */
uaccess_disable tbl
adr tbl, sys_call_table @ load syscall table pointer
get_thread_info tsk
/*
* Reload the registers that may have been corrupted on entry to
* the syscall assembly (by tracing or context tracking.)
*/
TRACE( ldmia sp, {r0 - r3} )
local_restart:
ldr r10, [tsk, #TI_FLAGS] @ check for syscall tracing
stmdb sp!, {r4, r5} @ push fifth and sixth args
tst r10, #_TIF_SYSCALL_WORK @ are we tracing syscalls?
bne __sys_trace
@ Normal path: jump to the handler from sys_call_table, with lr set so that the
@ handler returns to ret_fast_syscall.
invoke_syscall tbl, scno, r10, ret_fast_syscall
@ Execution continues here only when invoke_syscall did not branch, i.e. the
@ number was not below NR_syscalls.
add r1, sp, #S_OFF
2: cmp scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)
eor r0, scno, #__NR_SYSCALL_BASE @ put OS number back
bcs arm_syscall
mov why, #0 @ no longer a real syscall
b sys_ni_syscall @ not private func
ENDPROC(vector_swi)
核心逻辑:
invoke_syscall函数:
@ invoke_syscall: bounds-check the system call number and jump to its handler.
@   table  - register holding the address of the syscall table
@   nr     - register holding the syscall number
@   tmp    - scratch register
@   ret    - label the handler should return to (loaded into lr)
@   reload - when non-zero, re-read r0-r6 from the saved register frame first
@ If nr is not below NR_syscalls no jump is made and execution falls through
@ to the code following the macro.
.macro invoke_syscall, table, nr, tmp, ret, reload=0
mov \tmp, \nr
cmp \tmp, #NR_syscalls @ check upper syscall limit
@ Out of range: force the index to 0. The flags from the cmp above are left
@ intact and still gate the conditional (cc) instructions below.
movcs \tmp, #0
@ NOTE(review): csdb is presumably a speculation barrier protecting the bounds
@ check above -- confirm against the ARM architecture documentation.
csdb
badr lr, \ret @ return address
.if \reload
add r1, sp, #S_R0 + S_OFF @ pointer to regs
ldmccia r1, {r0 - r6} @ reload r0-r6
stmccia sp, {r4, r5} @ update stack arguments
.endif
@ Each table entry is 4 bytes (index shifted left by 2); loading it into pc
@ transfers control to the handler.
ldrcc pc, [\table, \tmp, lsl #2] @ call sys_* routine
.endm
ret_fast_syscall:
@ ret_fast_syscall: where a system call handler returns to (vector_swi passes
@ this label to invoke_syscall, which loads it into lr). Stores the result and
@ takes the no_work_pending path when no work flags are set.
@ The excerpt ends at the branch; the path taken when work IS pending is not
@ shown here.
ret_fast_syscall:
UNWIND(.fnstart )
UNWIND(.cantunwind )
@ Pre-indexed store with writeback: besides saving r0 into the register frame,
@ sp itself is advanced by S_R0 + S_OFF.
str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
disable_irq_notrace @ disable interrupts
@ Call addr_limit_check_failed if the task's address limit is not TASK_SIZE.
ldr r2, [tsk, #TI_ADDR_LIMIT]
cmp r2, #TASK_SIZE
blne addr_limit_check_failed
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
beq no_work_pending
UNWIND(.fnend )
进入no_work_pending调用restore_user_regs恢复硬件上下文。
clone系统调用的实现中,最重要的是调用copy_process来为子进程或者线程创建task_struct数据结构。重点在于copy_process的实现,它主要负责复制父进程的各种资源,上面介绍的copy_thread就是clone系统调用流程中调用的一个函数;后续会有文章更仔细地分析copy_process的实现,本文不再展开。
父进程(执行clone的原进程)返回路径:
系统调用执行完后继续执行:父进程(执行clone的进程)按正常的系统调用返回路径ret_fast_syscall返回,而子进程(或者线程)的返回路径是ret_from_fork,因为copy_thread中设置了thread->cpu_context.pc = (unsigned long)ret_from_fork;这样线程被调度时,将thread_info->cpu_context中的值恢复到cpu寄存器之后,线程就从ret_from_fork开始执行并返回,图示流程参见3.4小节中的图片。
子进程(线程)第一次执行:
在进程切换时,switch_to()函数会完成进程硬件上下文的切换,即把下一个进程(next进程)的cpu_context数据结构中保存的内容恢复到处理器的寄存器中,从而完成进程的切换。此时,处理器开始运行next进程。根据PC寄存器的值,处理器会从ret_from_fork汇编函数开始执行,新进程的执行过程如图所示。为什么要把新建线程的首次执行入口设置成ret_from_fork,而不是像父进程一样直接走ret_fast_syscall?主要原因是还有一种情况——内核线程——需要特殊处理,所以返回的时候要判断是否是内核线程,只有非内核线程才返回用户空间。
父进程和新线程返回用户空间之后,回到__bionic_clone中:
// pid_t __bionic_clone(int flags, void* child_stack, pid_t* parent_tid, void* tls, pid_t* child_tid, int (*fn)(void*), void* arg);
# Both the parent and the new thread resume here right after the swi below.
# On entry r0-r3 hold flags, child_stack, parent_tid and tls; child_tid, fn and
# arg are on the caller's stack.
ENTRY_PRIVATE(__bionic_clone)
# Remember the incoming sp so the stack-passed arguments can still be reached
# after the push below.
mov ip, sp
# save registers to parent stack
stmfd sp!, {r4, r5, r6, r7}
.cfi_def_cfa_offset 16
.cfi_rel_offset r4, 0
.cfi_rel_offset r5, 4
.cfi_rel_offset r6, 8
.cfi_rel_offset r7, 12
# load extra parameters
# (r4 = child_tid, r5 = fn, r6 = arg)
ldmfd ip, {r4, r5, r6}
# Push 'fn' and 'arg' onto the child stack.
# r1 is decremented (writeback), so the child's initial sp points at them.
stmdb r1!, {r5, r6}
# Make the system call.
ldr r7, =__NR_clone
swi #0
# Are we the child?
# r0 == 0 identifies the child (copy_thread sets the child's saved r0 to 0).
movs r0, r0
beq .L_bc_child
# In the parent, reload saved registers then either return or set errno.
ldmfd sp!, {r4, r5, r6, r7}
# Return directly unless r0 lies in the error range (-MAX_ERRNO..-1);
# otherwise negate it and tail-call __set_errno_internal.
cmn r0, #(MAX_ERRNO + 1)
bxls lr
neg r0, r0
b __set_errno_internal
.L_bc_child:
# Setting lr to 0 will make the unwinder stop at __start_thread.
mov lr, #0
# Call __start_thread with the 'fn' and 'arg' we stored on the child stack.
# sp is the child stack here, so the pop yields r0 = fn and r1 = arg.
pop {r0, r1}
b __start_thread
END(__bionic_clone)
此时如果r0寄存器的值是0,代表当前是子线程,执行.L_bc_child分支,将fn和arg参数弹出,调用__start_thread函数;这个函数会调用传给clone的入口函数fn(pthread_create中传入的是__pthread_start):
// Reached from the child branch of the __bionic_clone assembly: runs the
// thread's entry function and exits with its result. Never returns normally.
//   fn  - entry function that was passed to clone().
//   arg - argument forwarded to fn.
extern "C" __LIBC_HIDDEN__ void __start_thread(int (*fn)(void*), void* arg) {
  BIONIC_STOP_UNWIND;

  pthread_internal_t* current = __get_thread();
  // A tid of -1 means the tid has not been recorded yet; ask the kernel for it.
  const bool tid_missing = (current != nullptr) && (current->tid == -1);
  if (tid_missing) {
    current->tid = syscall(__NR_gettid);
  }

  // Whatever the entry function returns becomes the exit status.
  __exit(fn(arg));
}