接着上一文,我们看一下do_fork()函数:
/*
 * do_fork() - create a new task as described by clone_flags.
 *
 * Allocates a pid, has copy_process() build the child, then either wakes the
 * child up or leaves it in TASK_STOPPED, and finally performs the ptrace and
 * vfork notifications for the parent.
 *
 * clone_flags is inspected here (CLONE_PTRACE, CLONE_VFORK, CLONE_STOPPED);
 * stack_start, regs, stack_size, parent_tidptr and child_tidptr are only
 * forwarded to copy_process().
 *
 * Returns the child's pid number on success, -EAGAIN when no pid could be
 * allocated, or PTR_ERR() of copy_process()'s result when it failed.
 */
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
/* Will point at the child's task_struct once copy_process() has built it. */
struct task_struct *p;
int trace = 0;
/* Allocate the child's pid before anything else. */
struct pid *pid = alloc_pid();
long nr;
if (!pid)
return -EAGAIN;
nr = pid->nr;
/* The caller being ptraced is the unlikely case. */
if (unlikely(current->ptrace)) {
trace = fork_traceflag (clone_flags);
if (trace)
clone_flags |= CLONE_PTRACE;
}
p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
/*
 * Do this prior waking up the new thread - the thread pointer
 * might get invalid after that point, if the thread exits quickly.
 */
if (!IS_ERR(p)) {
struct completion vfork;
/* For vfork, hook up the completion the parent waits on further down. */
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
}
if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
/*
 * We'll start up with an immediate SIGSTOP.
 */
sigaddset(&p->pending.signal, SIGSTOP);
set_tsk_thread_flag(p, TIF_SIGPENDING);
}
/* Either make the child runnable now, or leave it stopped as requested. */
if (!(clone_flags & CLONE_STOPPED))
wake_up_new_task(p, clone_flags);
else
p->state = TASK_STOPPED;
/* Report the fork event, carrying the new pid, via ptrace_notify(). */
if (unlikely (trace)) {
current->ptrace_message = nr;
ptrace_notify ((trace << 8) | SIGTRAP);
}
/* vfork: block the parent until the completion set up above is signalled. */
if (clone_flags & CLONE_VFORK) {
wait_for_completion(&vfork);
if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
current->ptrace_message = nr;
ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
}
}
} else {
/* copy_process() failed: give the pid back and return its error code. */
free_pid(pid);
nr = PTR_ERR(p);
}
return nr;
}
从代码可以看出,当copy_process()函数执行完成之后,do_fork()有意选择让子进程首先执行。因为子进程一般会马上调用exec()函数,这样可以避免写时拷贝的额外开销;如果让父进程先执行的话,它有可能会开始向地址空间中写入。
(二):线程的实现
Linux实现线程的机制比较独特。从内核的角度看,它并没有线程的概念。Linux把所有的线程都当作进程来实现。内核并没有准备特别的调度算法或定义特别的数据结构来表征线程。相反,线程仅仅被视为一个与其他进程共享某些资源的进程。每个线程都拥有唯一隶属于自己的task_struct,所以在内核中,它看起来就像是一个普通的进程。
1:线程的创建
线程的创建和进程的创建类似,只不过是在调用clone()时传递一些参数标志来指明需要共享的资源。
传递给clone()的参数标志决定了新创建进程的行为方式和父子进程之间共享的资源种类。下面列举了在linux/sched.h文件中定义的参数标志。
/*
 * cloning flags
 */
// Low byte of the flags: the signal to be sent when the child exits.
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
// Parent and child share the address space.
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
// Parent and child share filesystem information.
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
// Parent and child share the open files.
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
// Parent and child share signal handlers and blocked signals.
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
// Tracing continues on the child as well.
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
// vfork() semantics: the parent sleeps until the child wakes it up.
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
// The child gets the same parent as the calling process.
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
// Parent and child are placed in the same thread group.
#define CLONE_THREAD 0x00010000 /* Same thread group? */
// Create a new namespace for the child.
#define CLONE_NEWNS 0x00020000 /* New namespace group? */
// Parent and child share System V SEM_UNDO semantics.
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
// Create a new TLS for the child.
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
// Set the TID in the parent.
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
// Clear the TID in the child.
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
// Unused; the flag is ignored.
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
// Prevents a tracing process from forcing CLONE_PTRACE onto this child.
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
// Set the TID in the child.
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
// The new task starts out in the TASK_STOPPED state.
#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
(三):进程终结
当一个进程终结的时候,内核必须释放它所占有的资源,并把这一消息告知其父进程。一般来说,进程的析构是由自身引起的,它发生在进程调用exit()系统调用时。进程终结的大部分工作都要靠do_exit()(定义于kernel/exit.c)来完成。
下面我们来看一下:
/*
 * do_exit() - tear down the current task.
 * @code: exit code; stored in tsk->exit_code and handed to the ptrace and
 *        accounting hooks.
 *
 * Panics when called from interrupt context, for the idle task (pid 0) or
 * for child_reaper. Otherwise it marks the task PF_EXITING, releases the
 * resources the task holds through the exit_*() helpers, notifies the parent
 * via exit_notify(), sets PF_DEAD and calls schedule(). The function never
 * returns to its caller.
 */
fastcall NORET_TYPE void do_exit(long code)
{
	struct task_struct *tsk = current;
	struct taskstats *tidstats;
	int group_dead;
	unsigned int mycpu;

	profile_task_exit(tsk);

	WARN_ON(atomic_read(&tsk->fs_excl));

	/* Contexts/tasks that must never exit. */
	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");
	if (unlikely(!tsk->pid))
		panic("Attempted to kill the idle task!");
	if (unlikely(tsk == child_reaper))
		panic("Attempted to kill init!");

	/* Let a tracer that asked for exit events see the exit code first. */
	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
		current->ptrace_message = code;
		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
	}

	/*
	 * We're taking recursive faults here in do_exit. Safest is to just
	 * leave this task alone and wait for reboot.
	 */
	if (unlikely(tsk->flags & PF_EXITING)) {
		/* The message must end in '\n' so the log record is terminated. */
		printk(KERN_ALERT
			"Fixing recursive fault but reboot is needed!\n");
		if (tsk->io_context)
			exit_io_context();
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule();
	}

	/* Flag the task as exiting (PF_EXITING is a flag bit, not a state). */
	tsk->flags |= PF_EXITING;

	if (unlikely(in_atomic()))
		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
				current->comm, current->pid,
				preempt_count());

	taskstats_exit_alloc(&tidstats, &mycpu);

	/*
	 * Accounting update for the exiting task.
	 * NOTE(review): the original annotation describes this as emitting BSD
	 * process-accounting data when that feature is enabled - confirm
	 * against acct_update_integrals().
	 */
	acct_update_integrals(tsk);
	if (tsk->mm) {
		update_hiwater_rss(tsk->mm);
		update_hiwater_vm(tsk->mm);
	}

	/* group_dead is true when this was the last live task of the group. */
	group_dead = atomic_dec_and_test(&tsk->signal->live);
	if (group_dead) {
		hrtimer_cancel(&tsk->signal->real_timer);
		exit_itimers(tsk->signal);
	}
	acct_collect(code, group_dead);
	if (unlikely(tsk->robust_list))
		exit_robust_list(tsk);
#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
	if (unlikely(tsk->compat_robust_list))
		compat_exit_robust_list(tsk);
#endif
	if (unlikely(tsk->audit_context))
		audit_free(tsk);
	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
	taskstats_exit_free(tidstats);

	/*
	 * Drop the task's address space.
	 * NOTE(review): per the original annotation the mm_struct is freed
	 * completely only when no other task shares it - confirm in exit_mm().
	 */
	exit_mm(tsk);

	if (group_dead)
		acct_process();

	/*
	 * NOTE(review): per the original annotation, exit_sem() takes the task
	 * off any IPC semaphore queue it is waiting on, and the two calls after
	 * it drop the references on the file table and the filesystem data.
	 */
	exit_sem(tsk);
	__exit_files(tsk);
	__exit_fs(tsk);
	exit_namespace(tsk);
	exit_thread();
	cpuset_exit(tsk);
	exit_keys(tsk);

	if (group_dead && tsk->signal->leader)
		disassociate_ctty(1);

	module_put(task_thread_info(tsk)->exec_domain->module);
	if (tsk->binfmt)
		module_put(tsk->binfmt->module);

	/* Record the exit code supplied by the caller. */
	tsk->exit_code = code;
	proc_exit_connector(tsk);

	/*
	 * NOTE(review): per the original annotation, exit_notify() signals the
	 * parent, re-parents this task's children (to another thread of the
	 * group or to init) and moves the task to EXIT_ZOMBIE - confirm in
	 * exit_notify().
	 */
	exit_notify(tsk);
#ifdef CONFIG_NUMA
	mpol_free(tsk->mempolicy);
	tsk->mempolicy = NULL;
#endif
	/*
	 * This must happen late, after the PID is not
	 * hashed anymore:
	 */
	if (unlikely(!list_empty(&tsk->pi_state_list)))
		exit_pi_state_list(tsk);
	if (unlikely(current->pi_state_cache))
		kfree(current->pi_state_cache);
	/*
	 * Make sure we are holding no locks:
	 */
	debug_check_no_locks_held(tsk);

	if (tsk->io_context)
		exit_io_context();

	if (tsk->splice_pipe)
		__free_pipe_info(tsk->splice_pipe);

	/* PF_DEAD causes final put_task_struct after we schedule. */
	preempt_disable();
	BUG_ON(tsk->flags & PF_DEAD);
	tsk->flags |= PF_DEAD;

	/* Switch to another task; control is not expected to come back here. */
	schedule();
	BUG();
	/* do_exit() never returns. */
	/* Avoid "noreturn function does return". */
	for (;;) ;
}
当do_exit()执行完成之后,进程被设置为EXIT_ZOMBIE状态,此时其占有的一些资源已经被释放了,它也不会再被调度运行了。它存在的唯一目的就是向它的父进程提供信息。父进程检索到这些信息,或者通知内核那是无关的信息之后,由该进程所持有的剩余内存才被释放,归还给系统使用。
1:删除进程描述符
wait()这一族函数都是通过唯一的一个系统调用wait4()来实现的。它的标准动作是挂起调用它的进程,直到其中的一个子进程退出,此时函数会返回该子进程的pid。
当最终需要释放进程描述符的时候,release_task()函数会被调用,以下是它完成的工作。
1:它调用__exit_signal(),该函数会调用__unhash_process(),后者又调用detach_pid()从pidhash上删除该进程,同时也从任务列表中删除该进程。
2:__exit_signal()释放当前僵死进程所使用的所有剩余资源,并进行最终统计和记录。
3:如果这个进程是线程组的最后一个进程,并且领头进程已经死掉,那么release_task()就要通知僵死的领头进程的父进程。
4:release_task()调用put_task_struct()释放进程内核栈和thread_info结构所占的页,并释放task_struct所占的slab高速缓存。
2:孤儿进程
如果父进程在子进程之前退出,必须有机制来保证子进程能找到一个新的父亲,否则这些成为孤儿的进程就会在退出时永远处于僵死状态,白白耗费内存。这个问题的解决办法是在退出进程所在的线程组中给子进程找一个线程作为父亲,如果找不到,就让init进程作为它们的父亲。do_exit()会调用exit_notify(),exit_notify()又会调用forget_original_parent(),然后调用find_new_reaper()函数,现在我们看一下这几个函数。
首先看一下forget_original_parent()函数
/*
 * When we die, we re-parent all our children.
 * Try to give them to another thread in our thread
 * group, and if no such member exists, give it to
 * the global child reaper process (ie "init")
 *
 * father:     the exiting task whose children are handed over.
 * to_release: list that collects ptraced zombie children whose exit_signal
 *             is -1 (linked through their ptrace_list member).
 */
static void
forget_original_parent(struct task_struct *father, struct list_head *to_release)
{
struct task_struct *p, *reaper = father;
struct list_head *_p, *_n;
/*
 * Walk the thread group starting after father until a thread with a zero
 * exit_state is found; if the walk comes back round to father, fall back
 * to child_reaper.
 */
do {
reaper = next_thread(reaper);
if (reaper == father) {
reaper = child_reaper;
break;
}
} while (reaper->exit_state);
/*
 * There are only two places where our children can be:
 *
 * - in our child list
 * - in our ptraced child list
 *
 * Search them and reparent children.
 */
list_for_each_safe(_p, _n, &father->children) {
int ptrace;
p = list_entry(_p, struct task_struct, sibling);
/* Sample p->ptrace now; it is tested again after the branch below. */
ptrace = p->ptrace;
/* if father isn't the real parent, then ptrace must be enabled */
BUG_ON(father != p->real_parent && !ptrace);
if (father == p->real_parent) {
/* reparent with a reaper, real father it's us */
choose_new_parent(p, reaper);
reparent_thread(p, father, 0);
} else {
/* reparent ptraced task to its real parent */
__ptrace_unlink (p);
if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
thread_group_empty(p))
do_notify_parent(p, p->exit_signal);
}
/*
 * if the ptraced child is a zombie with exit_signal == -1
 * we must collect it before we exit, or it will remain
 * zombie forever since we prevented it from self-reap itself
 * while it was being traced by us, to be able to see it in wait4.
 */
if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
list_add(&p->ptrace_list, to_release);
}
/* Second pass: every task on father's ptrace_children list gets the reaper too. */
list_for_each_safe(_p, _n, &father->ptrace_children) {
p = list_entry(_p, struct task_struct, ptrace_list);
choose_new_parent(p, reaper);
reparent_thread(p, father, 1);
}
}
从代码来看,首先是从当前进程所在的线程组中找到可以作为父进程的线程:
/*
 * Excerpt from forget_original_parent(): reaper starts out equal to father.
 * Step through the thread group until a thread with a zero exit_state is
 * found; if the walk returns to father, use child_reaper instead.
 */
do {
reaper = next_thread(reaper);
if (reaper == father) {
reaper = child_reaper;
break;
}
} while (reaper->exit_state);
接着,从children链表和ptrace children链表中,为每一个孩子进程寻找新的父进程。
/*
 * Excerpt from forget_original_parent(): visit every entry on
 * father->children. The excerpt stops before the end of the loop body;
 * the complete loop is in the full listing of the function.
 */
list_for_each_safe(_p, _n, &father->children) {
int ptrace;
p = list_entry(_p, struct task_struct, sibling);
ptrace = p->ptrace;
/* if father isn't the real parent, then ptrace must be enabled */
BUG_ON(father != p->real_parent && !ptrace);
if (father == p->real_parent) {
/* reparent with a reaper, real father it's us */
choose_new_parent(p, reaper);
reparent_thread(p, father, 0);
} else {
/* reparent ptraced task to its real parent */
__ptrace_unlink (p);
/* Notify the parent only for a zombie whose exit_signal is not -1 and whose thread group is empty. */
if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
thread_group_empty(p))
do_notify_parent(p, p->exit_signal);
}
现在我们看一下choose_new_parent()函数。需要注意的是,在本文所列的这个内核版本中,寻找新父进程的工作(在较新的内核中由find_new_reaper()函数完成)是由上面的do-while循环完成的,choose_new_parent()只负责把找到的reaper设置为子进程的父进程:
static inline void
choose_new_parent(struct task_struct *p, struct task_struct *reaper)
{
	/*
	 * Sanity check: a task must never be handed to itself, nor to a
	 * reaper whose exit_state is already set.
	 */
	BUG_ON(p == reaper || reaper->exit_state != 0);

	/* Record the reaper as the task's real parent. */
	p->real_parent = reaper;
}
这个函数相对比较简单,直接将p的真实父进程设置为找到的reaper。
一旦系统为进程成功地找到和设置了新的父进程,就不会再有出现驻留僵死进程的危险了。init进程会例行调用wait()函数来检查其子进程,清除所有与其相关的僵死进程。