这篇文章来学习一下线程的创建过程。
线程不是一个完全由内核实现的机制,它是由内核态和用户态合作完成的。
pthread_create 不是一个系统调用,是 glibc 库的一个函数,位于 nptl/pthread_create.c 中:
int __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg)
{
......
}
versioned_symbol (libpthread, __pthread_create_2_1, pthread_create, GLIBC_2_1);
那这个函数到底做了什么呢?
const struct pthread_attr *iattr = (struct pthread_attr *) attr;
struct pthread_attr default_attr;
if (iattr == NULL)
{
......
iattr = &default_attr;
}
struct pthread *pd = NULL;
int err = ALLOCATE_STACK (iattr, &pd);
# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
ALLOCATE_STACK_PARMS)
{
struct pthread *pd;
size_t size;
size_t pagesize_m1 = __getpagesize () - 1;
......
size = attr->stacksize;
......
/* Allocate some anonymous memory. If possible use the cache. */
size_t guardsize;
void *mem;
const int prot = (PROT_READ | PROT_WRITE
| ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
/* Adjust the stack size for alignment. */
size &= ~__static_tls_align_m1;
/* Make sure the size of the stack is enough for the guard and
eventually the thread descriptor. */
guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
size += guardsize;
pd = get_cached_stack (&size, &mem);
if (pd == NULL)
{
/* If a guard page is required, avoid committing memory by first
allocate with PROT_NONE and then reserve with required permission
excluding the guard page. */
mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
/* Place the thread descriptor at the end of the stack. */
#if TLS_TCB_AT_TP
pd = (struct pthread *) ((char *) mem + size) - 1;
#elif TLS_DTV_AT_TP
pd = (struct pthread *) ((((uintptr_t) mem + size - __static_tls_size) & ~__static_tls_align_m1) - TLS_PRE_TCB_SIZE);
#endif
/* Now mprotect the required region excluding the guard area. */
char *guard = guard_position (mem, size, guardsize, pd, pagesize_m1);
setup_stack_prot (mem, size, guard, guardsize, prot);
pd->stackblock = mem;
pd->stackblock_size = size;
pd->guardsize = guardsize;
pd->specific[0] = pd->specific_1stblock;
/* And add to the list of stacks in use. */
stack_list_add (&pd->list, &stack_used);
}
*pdp = pd;
void *stacktop;
# if TLS_TCB_AT_TP
/* The stack begins before the TCB and the static TLS block. */
stacktop = ((char *) (pd + 1) - __static_tls_size);
# elif TLS_DTV_AT_TP
stacktop = (char *) (pd - 1);
# endif
*stack = stacktop;
......
}
搞定了用户态栈的问题,其实用户态的事情基本搞定了一半。
资料直通车:Linux内核源码技术学习路线+视频教程内核源码
学习直通车:Linux内核源码内存调优文件系统进程管理设备驱动/网络协议栈
接下来,我们接着 pthread_create 看。其实有了用户态的栈,接着需要解决的就是用户态的程序从哪里开始运行的问题
pd->start_routine = start_routine;
pd->arg = arg;
pd->schedpolicy = self->schedpolicy;
pd->schedparam = self->schedparam;
/* Pass the descriptor to the caller. */
*newthread = (pthread_t) pd;
atomic_increment (&__nptl_nthreads);
retval = create_thread (pd, iattr, &stopped_start, STACK_VARIABLES_ARGS, &thread_ran);
start_routine 就是我们给线程的函数,start_routine 、start_routine 的参数 arg,以及调度策略都要赋值给 pthread。
接下来 __nptl_nthreads 加 1,说明又多了一个线程。
真正创建线程的是调用 create_thread 函数,这个函数定义如下:
static int
create_thread (struct pthread *pd, const struct pthread_attr *attr,
bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
{
const int clone_flags = (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SYSVSEM | CLONE_SIGHAND | CLONE_THREAD | CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | 0);
ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS, clone_flags, pd, &pd->tid, tp, &pd->tid);
/* It's started now, so if we fail below, we'll have to cancel it
and let it clean itself up. */
*thread_ran = true;
}
这里面有很长的 clone_flags 需要特别关注一下。
然后就是 ARCH_CLONE ,其实就是调用 __clone(如果对于汇编不太熟悉也没关系,重点看注释)
define ARCH_CLONE __clone
/* The userland implementation is:
int clone (int (*fn)(void *arg), void *child_stack, int flags, void *arg),
the kernel entry is:
int clone (long flags, void *child_stack).
The parameters are passed in register and on the stack from userland:
rdi: fn
rsi: child_stack
rdx: flags
rcx: arg
r8d: TID field in parent
r9d: thread pointer
%esp+8: TID field in child
The kernel expects:
rax: system call number
rdi: flags
rsi: child_stack
rdx: TID field in parent
r10: TID field in child
r8: thread pointer */
.text
ENTRY (__clone)
movq $-EINVAL,%rax
......
/* Insert the argument onto the new stack. */
subq $16,%rsi
movq %rcx,8(%rsi)
/* Save the function pointer. It will be popped off in the
child in the ebx frobbing below. */
movq %rdi,0(%rsi)
/* Do the system call. */
movq %rdx, %rdi
movq %r8, %rdx
movq %r9, %r8
mov 8(%rsp), %R10_LP
movl $SYS_ify(clone),%eax
......
syscall
......
PSEUDO_END (__clone)
我们能看到最后调用了 syscall,这一点 clone 和其他系统调用几乎一模一样,但是也有一些不一样的地方:
接下来我们就要进入内核了。内核里面对于 clone 系统调用的定义是这样的:
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
{
return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}
可以看到,调用了 _do_fork,先前我们已经看过了一遍它的主要逻辑,现在我们重点关注几个区别。
第一个是上面复杂的标识位设定,我们来看都影响了什么。
static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
struct files_struct *oldf, *newf;
oldf = current->files;
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
}
newf = dup_fd(oldf, &error);
tsk->files = newf;
out:
return error;
}
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
fs->users++;
return 0;
}
tsk->fs = copy_fs_struct(fs);
return 0;
}
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
if (clone_flags & CLONE_SIGHAND) {
atomic_inc(¤t->sighand->count);
return 0;
}
sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
atomic_set(&sig->count, 1);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
return 0;
}
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
if (clone_flags & CLONE_THREAD)
return 0;
sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
tsk->signal = sig;
init_sigpending(&sig->shared_pending);
......
}
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm;
oldmm = current->mm;
if (clone_flags & CLONE_VM) {
mmget(oldmm);
mm = oldmm;
goto good_mm;
}
mm = dup_mm(tsk);
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
return 0;
}
第二个是对于亲缘关系的影响,毕竟我们要识别多个线程是不是属于一个进程。
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
p->exit_signal = -1;
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
if (clone_flags & CLONE_PARENT)
p->exit_signal = current->group_leader->exit_signal;
else
p->exit_signal = (clone_flags & CSIGNAL);
p->group_leader = p;
p->tgid = p->pid;
}
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
}
从上面可以看出,使用了 CLONE_THREAD 标识位之后,使得亲缘关系有了一定的变化。
第三,对信号的处理,如何保证发给进程的信号虽然可以被一个线程处理,但是影响范围应该是整个进程的。比如,kill一个进程,则所有线程都要被干掉。如果 pthread_kill 一个线程,那只有那个线程才能够收到。
init_sigpending(&p->pending);
init_sigpending(&sig->shared_pending);
至此,clone 在内核的调用完毕,要返回系统调用,回到用户态。
根据 clone 的第一个参数,回到用户态也不是直接运行我们指定的那个函数,而是一个通用的 start_thread,这是所有线程在用户态的统一入口
#define START_THREAD_DEFN \
static int __attribute__ ((noreturn)) start_thread (void *arg)
START_THREAD_DEFN
{
struct pthread *pd = START_THREAD_SELF;
/* Run the code the user provided. */
THREAD_SETMEM (pd, result, pd->start_routine (pd->arg));
/* Call destructors for the thread_local TLS variables. */
/* Run the destructor for the thread-local data. */
__nptl_deallocate_tsd ();
if (__glibc_unlikely (atomic_decrement_and_test (&__nptl_nthreads)))
/* This was the last thread. */
exit (0);
__free_tcb (pd);
__exit_thread ();
}
在 start_thread 入口函数中,才真正的调用用户提供的函数,在用户的函数执行完毕之后,会释放这个线程相关的数据。比如,线程本地数据 thread_local ,线程数目也减少 1。如果这是最后一个线程了,就直接退出进程,另外 __free_tcb 用于释放 pthread
void
internal_function
__free_tcb (struct pthread *pd)
{
......
__deallocate_stack (pd);
}
void
internal_function
__deallocate_stack (struct pthread *pd)
{
/* Remove the thread from the list of threads with user defined
stacks. */
stack_list_del (&pd->list);
/* Not much to do. Just free the mmap()ed memory. Note that we do
not reset the 'used' flag in the 'tid' field. This is done by
the kernel. If no thread has been created yet this field is
still zero. */
if (__glibc_likely (! pd->user_stack))
(void) queue_stack (pd);
}
__free_tcb 会调用 __deallocate_stack 来释放整个线程栈,这个线程栈要从当前使用线程栈的列表 stack_used 中拿下来,放到缓存的线程栈列表 stack_cache 中。
至此,整个线程的生命周期结束。
下表对比了创建进程和创建线程在用户态和内核态的不同。