内核源码:linux-2.6.38.8.tar.bz2
目标平台:ARM体系结构
进程终止时,一般是调用exit库函数(无论是程序员显式调用还是编译器自动地把exit库函数插入到main函数的最后一条语句之后)来释放进程所拥有的资源。
$ man 3 exit void exit(int status); $ man 2 exit_group void exit_group(int status); $ man 3 pthread_exit void pthread_exit(void *retval); $ man 2 _exit void _exit(int status);
库函数exit使用系统调用exit_group来终止整个线程组,库函数pthread_exit使用系统调用_exit来终止某一个线程。
这两个系统调用(exit_group和_exit)在Linux内核中的入口点函数分别为sys_exit_group和sys_exit。
/* linux-2.6.38.8/kernel/exit.c */ SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; }
do_group_exit函数会杀死属于当前进程所在线程组的所有进程。它接受进程终止代号作为参数,进程终止代号可能是系统调用exit_group(正常结束)指定的一个值,也可能是内核提供的一个错误码(异常结束)。
NORET_TYPE void do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; BUG_ON(exit_code & 0x80); /* core dumps don't get here */ if (signal_group_exit(sig)) //检查current->signal->flags的SIGNAL_GROUP_EXIT标志是否置位,或者current->signal->group_exit_task是否不为NULL。 exit_code = sig->group_exit_code; //group_exit_code存放的是线程组终止代码 else if (!thread_group_empty(current)) { //检查线程组链表是否不为空。 struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); //遍历整个线程组链表,并杀死其中除当前线程之外的每个线程。 } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ }
进程终止所要完成的任务都是由do_exit函数来处理。
/* linux-2.6.38.8/kernel/exit.c */ NORET_TYPE void do_exit(long code)
1、触发task_exit_nb通知链实例的处理函数
profile_task_exit(tsk); /* linux-2.6.38.8/drivers/oprofile/buffer_sync.c */ static struct notifier_block task_exit_nb = { .notifier_call = task_exit_notify, };
2、检查current->fs_excl是否为0,不为0时也不会终止后续代码的执行
WARN_ON(atomic_read(&tsk->fs_excl)); /* linux-2.6.38.8/include/asm-generic/bug.h */ #ifndef WARN_ON #define WARN_ON(condition) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ __WARN(); /* 输出警告信息的位置(哪个文件的哪行)*/ \ unlikely(__ret_warn_on); \ }) #endif
3、检查执行上下文,不合法时直接panic
if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!");
中断上下文不能执行do_exit函数,也不能终止PID为0的进程。
4、设定进程可以使用的虚拟地址的上限(用户空间)
set_fs(USER_DS); /* linux-2.6.38.8/arch/arm/include/asm/uaccess.h */ #define USER_DS TASK_SIZE #define TASK_SIZE (UL(CONFIG_PAGE_OFFSET) - UL(0x01000000)) static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER); }
5、current->flags的PF_EXITING标志表示进程正在被删除。
if (unlikely(tsk->flags & PF_EXITING)) {//检查PF_EXITING标志是否已被设置(即do_exit被递归调用),如果设置了则执行大括号里的代码 printk(KERN_ALERT "Fixing recursive fault but reboot is needed!\n"); tsk->flags |= PF_EXITPIDONE; set_current_state(TASK_UNINTERRUPTIBLE); //设置进程状态为不可中断的等待状态 schedule(); //调度其它进程 }
6、如果当前进程是中断线程(current->irqaction不为NULL),则设置current->irqaction->flags的IRQTF_DIED标志,表示该中断线程即将终止,防止它再被唤醒
exit_irq_thread(); /* linux-2.6.38.8/kernel/irq/manage.c */ void exit_irq_thread(void) { struct task_struct *tsk = current; if (!tsk->irqaction) return; printk(KERN_ERR "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); /* * Set the THREAD DIED flag to prevent further wakeups of the * soon to be gone threaded handler. */ set_bit(IRQTF_DIED, &tsk->irqaction->flags); }
7、设置PF_EXITING标志
exit_signals(tsk); /* sets PF_EXITING */ /* linux-2.6.38.8/kernel/signal.c */ void exit_signals(struct task_struct *tsk) { int group_stop = 0; struct task_struct *t; if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { //检查线程组链表是否为空,或者是否要终止整个线程组 tsk->flags |= PF_EXITING; return; } spin_lock_irq(&tsk->sighand->siglock); /* * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ tsk->flags |= PF_EXITING; if (!signal_pending(tsk)) //signal_pending函数用于检查当前进程是否有非阻塞的挂起信号,如果有则返回1,否则返回0 goto out; /* It could be that __group_complete_signal() choose us to * notify about group-wide signal. Another thread should be * woken now to take the signal since we will not. */ for (t = tsk; (t = next_thread(t)) != tsk; ) //检查线程组中的其他进程 if (!signal_pending(t) && !(t->flags & PF_EXITING)) //如果没有设置TIF_SIGPENDING标志,而且也没有设置PF_EXITING标志 recalc_sigpending_and_wake(t); //则设置TIF_SIGPENDING标志,表示有挂起信号 if (unlikely(tsk->signal->group_stop_count) && !--tsk->signal->group_stop_count) { //线程组正在进行组停止(group stop),且当前线程是最后一个尚未停止的线程 tsk->signal->flags = SIGNAL_STOP_STOPPED; group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); } out: spin_unlock_irq(&tsk->sighand->siglock); if (unlikely(group_stop)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(tsk, group_stop); read_unlock(&tasklist_lock); } }
8)、内存屏障,用于确保在它之后的操作开始执行之前,它之前的操作已经完成
smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); //一直等待,直到current->pi_lock自旋锁被其持有者释放(只等待,并不获取该锁) /* linux-2.6.38.8/arch/arm/include/asm/system.h */ #define smp_mb() barrier() //!CONFIG_SMP /* linux-2.6.38.8/include/linux/compiler-gcc.h */ #define barrier() __asm__ __volatile__("": : :"memory")
9)、获取current->mm->rss_stat.count[member]计数
acct_update_integrals(tsk); void acct_update_integrals(struct task_struct *tsk) { if (likely(tsk->mm)) { cputime_t time, dtime; struct timeval value; unsigned long flags; u64 delta; local_irq_save(flags); time = tsk->stime + tsk->utime; dtime = cputime_sub(time, tsk->acct_timexpd); jiffies_to_timeval(cputime_to_jiffies(dtime), &value); delta = value.tv_sec; delta = delta * USEC_PER_SEC + value.tv_usec; if (delta == 0) goto out; tsk->acct_timexpd = time; tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); //统计分配给进程的页框数(MM_FILEPAGES和MM_ANONPAGES两种类型的页框) tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; //total_vm用来表示进程地址空间的大小(页数) out: local_irq_restore(flags); } } /* linux-2.6.38.8/include/linux/mm.h */ static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + get_mm_counter(mm, MM_ANONPAGES); } static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) //!USE_SPLIT_PTLOCKS { return mm->rss_stat.count[member]; }
然后,把当前进程缓存的task->rss_stat.count[]计数累加到mm->rss_stat中,并把task->rss_stat.count[]清零(被清零的是进程自己的缓存计数,而不是mm中的计数)。
/* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk, tsk->mm); /* linux-2.6.38.8/mm/memory.c */ void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) { __sync_task_rss_stat(task, mm); } static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) { //共有三类,MM_FILEPAGES、MM_ANONPAGES和MM_SWAPENTS if (task->rss_stat.count[i]) { add_mm_counter(mm, i, task->rss_stat.count[i]); task->rss_stat.count[i] = 0; } } task->rss_stat.events = 0; }
10)、清除定时器
group_dead = atomic_dec_and_test(&tsk->signal->live); //live用来表示线程组中活动进程的数量 if (group_dead) { //当没有活动的进程时 hrtimer_cancel(&tsk->signal->real_timer); //取消高精度定时器 exit_itimers(tsk->signal); //删除POSIX.1b类型的定时器 if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); //获取进程所拥有的最大页框数 }
11)、收集进程会计信息
acct_collect(code, group_dead);
12)、审计
if (group_dead) tty_audit_exit(); //记录审计事件 if (unlikely(tsk->audit_context)) audit_free(tsk); //释放struct audit_context结构体
13)、输出taskstats信息
tsk->exit_code = code; //设置终止代码 taskstats_exit(tsk, group_dead);
14)、释放线性区描述符和页表
exit_mm(tsk); /* linux-2.6.38.8/kernel/exit.c */ static void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; mm_release(tsk, mm); //其中会唤醒tsk->vfork_done,让父进程开始执行,用于vfork时 if (!mm) return; /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state * and clearing tsk->mm. The core-inducing thread * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ down_read(&mm->mmap_sem); core_state = mm->core_state; if (core_state) { //内存转储 struct core_thread self; up_read(&mm->mmap_sem); self.task = tsk; self.next = xchg(&core_state->dumper.next, &self); /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; schedule(); } __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); //递增mm->mm_count计数,确保内存描述符暂时不会被删除,当要把正在被终止的进程从本地CPU撤销时,才由finish_task_switch函数来释放内存描述符。 BUG_ON(mm != tsk->active_mm); /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; //设置进程描述符的mm字段为NULL。 up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); //使处理器处于懒惰TLB模式,ARM体系结构不支持。 /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); //清除TIF_FREEZE标志,避免该进程在退出过程中被冻结。 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); //当mm->mm_users为0(即没有任何进程使用它)时,释放线性区描述符和页表,但这时还不会释放内存描述符 }
15)、输出进程会计信息
if (group_dead) acct_process(); trace_sched_process_exit(tsk); //用于跟踪,定义在linux-2.6.38.8/include/trace/events/sched.h文件中
16)、遍历current->sysvsem.undo_list链表,并清除进程所涉及的每个IPC信号量的操作痕迹
exit_sem(tsk);
17)、释放文件对象相关资源
exit_files(tsk); /* linux-2.6.38.8/kernel/exit.c */ void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; //把进程描述符的files字段设为NULL。 task_unlock(tsk); put_files_struct(files); } } void put_files_struct(struct files_struct *files) { struct fdtable *fdt; if (atomic_dec_and_test(&files->count)) { //当共享该表的进程数目为0时 close_files(files); //执行进程终止时应该执行的文件操作相关函数,如release /* * Free the fd and fdset arrays if we expanded them. * If the fdtable was embedded, pass files for freeing * at the end of the RCU grace period. Otherwise, * you can free files immediately. */ rcu_read_lock(); fdt = files_fdtable(files); if (fdt != &files->fdtab) kmem_cache_free(files_cachep, files); //释放struct files_struct结构体所用内存 free_fdtable(fdt); rcu_read_unlock(); } }
18)、释放struct fs_struct结构体
exit_fs(tsk); /* linux-2.6.38.8/fs/fs_struct.c */ void exit_fs(struct task_struct *tsk) { struct fs_struct *fs = tsk->fs; if (fs) { int kill; task_lock(tsk); spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); tsk->fs = NULL; //设置进程描述符的fs字段为NULL kill = !--fs->users; //fs->users表示共享这个表的进程个数 write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); task_unlock(tsk); if (kill) //当为0时 free_fs_struct(fs); //释放结构体所用内存 } }
19)、检查有多少未使用的进程内核栈
check_stack_usage(); /* linux-2.6.38.8/kernel/exit.c */ #ifdef CONFIG_DEBUG_STACK_USAGE static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { printk(KERN_WARNING "%s used greatest stack depth: %lu bytes " "left\n", current->comm, free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else static inline void check_stack_usage(void) {} #endif
20)、触发thread_notify_head链表中所有通知链实例的处理函数,用于处理struct thread_info结构体
exit_thread(); /* linux-2.6.38.8/arch/arm/kernel/process.c */ void exit_thread(void) { thread_notify(THREAD_NOTIFY_EXIT, current_thread_info()); }
21)、Performance Event功能相关资源的释放
perf_event_exit_task(tsk); /* linux-2.6.38.8/kernel/perf_event.c */ void perf_event_exit_task(struct task_struct *child) { struct perf_event *event, *tmp; int ctxn; mutex_lock(&child->perf_event_mutex); list_for_each_entry_safe(event, tmp, &child->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); /* * Ensure the list deletion is visible before we clear * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ smp_wmb(); event->owner = NULL; } mutex_unlock(&child->perf_event_mutex); for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); }
22)、释放Control Groups相关的资源
cgroup_exit(tsk, 1); /* linux-2.6.38.8/kernel/cgroup.c */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { int i; struct css_set *cg; if (run_callbacks && need_forkexit_callback) { /* * modular subsystems can't use callbacks, so no need to lock * the subsys array */ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->exit) ss->exit(ss, tsk); } } /* * Unlink from the css_set task list if necessary. * Optimistically check cg_list before taking * css_set_lock */ if (!list_empty(&tsk->cg_list)) { write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) list_del_init(&tsk->cg_list); write_unlock(&css_set_lock); } /* Reassign the task to the init_css_set. */ task_lock(tsk); cg = tsk->cgroups; tsk->cgroups = &init_css_set; task_unlock(tsk); if (cg) put_css_set_taskexit(cg); }
23)、脱离控制终端
if (group_dead) disassociate_ctty(1);
24)、执行域
module_put(task_thread_info(tsk)->exec_domain->module);
25)、进程事件连接器(通过它来报告进程fork、exec、exit以及进程用户ID与组ID的变化)
proc_exit_connector(tsk); /* linux-2.6.38.8/drivers/connector/cn_proc.c */ void proc_exit_connector(struct task_struct *task) { struct cn_msg *msg; struct proc_event *ev; __u8 buffer[CN_PROC_MSG_SIZE]; struct timespec ts; if (atomic_read(&proc_event_num_listeners) < 1) return; msg = (struct cn_msg*)buffer; ev = (struct proc_event*)msg->data; get_seq(&msg->seq, &ev->cpu); ktime_get_ts(&ts); /* get high res monotonic timestamp */ put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); ev->what = PROC_EVENT_EXIT; ev->event_data.exit.process_pid = task->pid; ev->event_data.exit.process_tgid = task->tgid; ev->event_data.exit.exit_code = task->exit_code; ev->event_data.exit.exit_signal = task->exit_signal; memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); msg->ack = 0; /* not used */ msg->len = sizeof(*ev); cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); }
参考文档:linux-2.6.38.8/Documentation/connector/connector.txt
http://www.ibm.com/developerworks/cn/linux/l-connector/
26)、注销断点
ptrace_put_breakpoints(tsk); /* linux-2.6.38.8/kernel/ptrace.c */ void ptrace_put_breakpoints(struct task_struct *tsk) { if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) flush_ptrace_hw_breakpoint(tsk); } /* linux-2.6.38.8/arch/arm/kernel/ptrace.c */ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { int i; struct thread_struct *t = &tsk->thread; for (i = 0; i < ARM_MAX_HBP_SLOTS; i++) { if (t->debug.hbp[i]) { unregister_hw_breakpoint(t->debug.hbp[i]); t->debug.hbp[i] = NULL; } } }
27)、更新所有子进程的父进程
exit_notify(tsk, group_dead); /* linux-2.6.38.8/kernel/exit.c */ static void exit_notify(struct task_struct *tsk, int group_dead) { int signal; void *cookie; /* * This does two things: * * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ forget_original_parent(tsk); //将子进程的父进程重新设置为线程组中的其他线程或init进程 exit_task_namespaces(tsk); //当使用计数(current->nsproxy->count)为0时,释放命名空间(current->nsproxy) write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); /* Let father know we died * * Thread signals are configurable, but you aren't going to use * that to send signals to arbitary processes. * That stops right now. * * If the parent exec id doesn't match the exec id we saved * when we started then we know the parent has changed security * domain. * * If our self_exec id doesn't match our parent_exec_id then * we have changed execution domain as these two values started * the same after a fork. */ if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && //task_detached函数用于判断tsk->exit_signal是否等于-1 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || tsk->self_exec_id != tsk->parent_exec_id)) tsk->exit_signal = SIGCHLD; //设置SIGCHLD信号 signal = tracehook_notify_death(tsk, &cookie, group_dead); //判断当前进程是否被跟踪 if (signal >= 0) signal = do_notify_parent(tsk, signal); //告知父进程当前进程死亡 tsk->exit_state = signal == DEATH_REAP ? 
EXIT_DEAD : EXIT_ZOMBIE; //当tsk->exit_signal不等于-1,或进程正在被跟踪,则设置tsk->exit_state为EXIT_ZOMBIE /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); tracehook_report_death(tsk, signal, cookie, group_dead); /* If the process is dead, release it - nobody will wait for it */ if (signal == DEATH_REAP) //如果tsk->exit_state为EXIT_DEAD状态 release_task(tsk); //则调用release_task函数回收进程的其他数据结构所占用的内存 }
28)、用于NUMA,当引用计数为0时,释放struct mempolicy结构体所占用的内存
#ifdef CONFIG_NUMA task_lock(tsk); mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; task_unlock(tsk); #endif
29)、释放struct futex_pi_state结构体所占用的内存
#ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif
30)、释放struct io_context结构体所占用的内存
if (tsk->io_context) exit_io_context(tsk); /* linux-2.6.38.8/block/blk-ioc.c */ void exit_io_context(struct task_struct *task) { struct io_context *ioc; task_lock(task); ioc = task->io_context; task->io_context = NULL; task_unlock(task); if (atomic_dec_and_test(&ioc->nr_tasks)) cfq_exit(ioc); put_io_context(ioc); }
31)、释放与进程描述符splice_pipe字段相关的资源
if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); /* linux-2.6.38.8/fs/pipe.c */ void __free_pipe_info(struct pipe_inode_info *pipe) { int i; for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) buf->ops->release(pipe, buf); } if (pipe->tmp_page) __free_page(pipe->tmp_page); kfree(pipe->bufs); kfree(pipe); }
32)、调度其它进程
tsk->state = TASK_DEAD; //调度程序忽略处于TASK_DEAD状态的进程 schedule();
在调用do_exit函数之后,尽管进程已经不能再被调度,但系统还是保留了它的进程描述符,这样做是为了让系统有办法在进程终止后仍能获得它的信息。在父进程获得已终止子进程的信息后,子进程的task_struct结构体才被释放(包括此进程的内核栈)。