上章讲到线程,现在对线程创建的代码流程分析下。来一步一步揭开她神秘的面纱
linux内核创建线程函数 kernel_thread(),最终会调用do_fork().
前面谈到线程也是用task_struct结构表示它拥有的信息,只是是共享进程的资源。
根据clone_flags标志,来调用clone()创建"线程",表示共享内存、共享文件系统访问计数、共享文件描述符表,以及共享信号处理方式。
kernel_thread定义在/arch/kernel/process.c
int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
{
struct pt_regs regs;
memset(®s, 0, sizeof(regs)); //把该结构的变量全部清0
regs.ebx = (unsigned long) fn; /* ebx指向函数地址 */
regs.edx = (unsigned long) arg; /* edx指向参数地址 */
regs.xds = __USER_DS;
regs.xes = __USER_DS;
regs.xfs = __KERNEL_PERCPU;
regs.orig_eax = -1;
regs.eip = (unsigned long) kernel_thread_helper; /*eip指向回调函数*/
regs.xcs = __KERNEL_CS | get_kernel_rpl();
regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
/* 利用do_fork来产生一个新的线程,共享父进程地址空间,并且不允许调试子进程 */
return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL);
}
其中__USER_DS,__KERNEL_PERCPU,__KERNEL_CS都是一些宏定义。在/linux/include/asm-i386/segment.h
extern void kernel_thread_helper(void); /* 定义成全局变量 */
__asm__(".section .text\n"
".align 4\n"
"kernel_thread_helper:\n\t"
"movl %edx,%eax\n\t"
"pushl %edx\n\t" /* edx指向参数,压入堆栈 */
"call *%ebx\n\t" /* ebx指向函数地址,执行函数 */
"pushl %eax\n\t"
"call do_exit\n" /* 结束线程 */
".previous");
在kernel_thread中调用了do_fork,让我们揭开do_fork()的面纱.
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
...
...
p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
...
...
}
接着分析do_fork(),copy_proces()是它的核心函数。重点分析一下:
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
struct pid *pid)
{
int retval;
struct task_struct *p = NULL;
//clone_flags参数的有效性判断
//不能同时定义CLONE_NEWNS,CLONE_FS
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
//如果定义CLONE_THREAD,则必须要定义CLONE_SIGHAND
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);
//如果定义CLONE_SIGHAND,则必须要定义CLONE_VM
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
retval = -ENOMEM;
//从父进程中复制出一个task
p = dup_task_struct(current);
if (!p)
goto fork_out;
rt_mutex_init_task(p);
#ifdef CONFIG_TRACE_IRQFLAGS
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
//如果用户的进程总数超过了限制
if (atomic_read(&p->user->processes) >=
p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
p->user != current->nsproxy->user_ns->root_user)
goto bad_fork_free;
}
//更新进程用户的相关计数
atomic_inc(&p->user->__count);
atomic_inc(&p->user->processes);
get_group_info(p->group_info);
//当前进程数是否大于系统规定的最大进程数
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
//加载进程的相关执行模块
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
if (p->binfmt && !try_module_get(p->binfmt->module))
goto bad_fork_cleanup_put_domain;
//子进程还在进行初始化,没有execve
p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
//copy父进程的所有标志,除了PF_SUPERPRIV(超级权限)
//置子进程的PF_FORKNOEXEC标志,表示正在被FORK
copy_flags(clone_flags, p);
//赋值子进程的pid
p->pid = pid_nr(pid);
retval = -EFAULT;
if (clone_flags & CLONE_PARENT_SETTID)
if (put_user(p->pid, parent_tidptr))
goto bad_fork_cleanup_delays_binfmt;
//初始化子进程的几个链表
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
//父进程的TIF_SIGPENDING被复制进了子进程,这个标志表示有末处理的信号
//这个标志子进程是不需要的
clear_tsk_thread_flag(p, TIF_SIGPENDING);
init_sigpending(&p->pending);
//初始化子进程的time
p->utime = cputime_zero;
p->stime = cputime_zero;
p->prev_utime = cputime_zero;
……
……
//tgid = pid
p->tgid = p->pid;
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;
//copy父进程的其它资源.比例打开的文件,信号,VM等等
if ((retval = security_task_alloc(p)))
goto bad_fork_cleanup_policy;
if ((retval = audit_alloc(p)))
goto bad_fork_cleanup_security;
/* copy all the process information */
if ((retval = copy_semundo(clone_flags, p)))
goto bad_fork_cleanup_audit;
if ((retval = copy_files(clone_flags, p)))
goto bad_fork_cleanup_semundo;
if ((retval = copy_fs(clone_flags, p)))
goto bad_fork_cleanup_files;
if ((retval = copy_sighand(clone_flags, p)))
goto bad_fork_cleanup_fs;
if ((retval = copy_signal(clone_flags, p)))
goto bad_fork_cleanup_sighand;
if ((retval = copy_mm(clone_flags, p)))
goto bad_fork_cleanup_signal;
if ((retval = copy_keys(clone_flags, p)))
goto bad_fork_cleanup_mm;
if ((retval = copy_namespaces(clone_flags, p)))
goto bad_fork_cleanup_keys;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_namespaces;
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
p->compat_robust_list = NULL;
#endif
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;
/*
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
p->sas_ss_sp = p->sas_ss_size = 0;
/*
* Syscall tracing should be turned off in the child regardless
* of CLONE_PTRACE.
*/
clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
p->parent_exec_id = p->self_exec_id;
/* ok, now we should be set up.. */
//exit_signal: 子进程退出时给父进程发送的信号
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
//pdeath_signal:进程退出时.给其下的子进程发送的信号
p->pdeath_signal = 0;
p->exit_state = 0;
……
……
if (likely(p->pid)) {
add_parent(p);
if (unlikely(p->ptrace & PT_PTRACED))
__ptrace_link(p, current->parent);
if (thread_group_leader(p)) {
p->signal->tty = current->signal->tty;
p->signal->pgrp = process_group(current);
set_signal_session(p->signal, process_session(current));
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail_rcu(&p->tasks, &init_task.tasks);
__get_cpu_var(process_counts)++;
}
attach_pid(p, PIDTYPE_PID, pid);
//当前进程数递增
nr_threads++;
}
//被fork的进程数计数递增
total_forks++;
spin_unlock(¤t->sighand->siglock);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
return p;
……
……
}
参考:深入理解linux内核
到这里为止,进程的运行内间已经设置好了。但子进程的怎么返回到用户空间呢?这是在copy_process()—> copy_thread()中完成的。
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
unsigned long unused,
struct task_struct * p, struct pt_regs * regs)
{
struct pt_regs * childregs;
struct task_struct *tsk;
int err;
//子进程的内核堆栈起点
childregs = task_pt_regs(p);
//将父进程的regs参数赋值到子进程的内核堆栈
//regs参数:里面存放的是父进程陷入内核后的各寄存器的值
*childregs = *regs;
//eax:返回值. 将其设为0,子进程返回到用户空间后,它的返回值是0
childregs->eax = 0;
//esp:子进程的用户堆栈指针位置
childregs->esp = esp;
//子进程内核堆栈位置
p->thread.esp = (unsigned long) childregs;
//子进程内核堆栈指针位置
p->thread.esp0 = (unsigned long) (childregs+1);
//子进程要执行的下一条指令.对应子进程从系统空间返回用户空间
p->thread.eip = (unsigned long) ret_from_fork;
savesegment(gs,p->thread.gs);
tsk = current;
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
/*
* Set a new TLS for the child thread?
*/
if (clone_flags & CLONE_SETTLS) {
struct desc_struct *desc;
struct user_desc info;
int idx;
err = -EFAULT;
if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
goto out;
err = -EINVAL;
if (LDT_empty(&info))
goto out;
idx = info.entry_number;
if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
goto out;
desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
desc->a = LDT_entry_a(&info);
desc->b = LDT_entry_b(&info);
}
err = 0;
out:
if (err && p->thread.io_bitmap_ptr) {
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
return err;
}
在这里把ret_from_fork的地址赋值给p->thread.eip,p->thread.eip表示当进程下一次调度时的指令开始地址,
所以当线程创建后被调度时,是从ret_from_fork地址处开始的.
到这里说明,新的线程已产生了.
ENTRY(ret_from_fork)
pushl %eax
call schedule_tail
GET_THREAD_INFO(%ebp)
popl %eax
jmp syscall_exit
syscall_exit:
...
work_resched:
call schedule
...
当他从ret_from_fork退出时,会从堆栈中弹出原来保存的eip,而ip指向kernel_thread_helper,
至此kernel_thread_helper被调用,他就能够运行我们的指定的函数了do_exit().
从内核空间返回到用户空间。