NORET_TYPE void do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
profile_task_exit(tsk);
WARN_ON(atomic_read(&tsk->fs_excl)); // warn if the task is still marked as holding filesystem exclusive resources
WARN_ON(blk_needs_flush_plug(tsk)); // the plug field (on-stack block plugging) must be NULL, or its request list must be empty
if (unlikely(in_interrupt())) // do_exit() must never be called from interrupt context, so check for that
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid)) // pid 0 is the idle task, which must never exit; an exiting init is caught elsewhere
panic("Attempted to kill the idle task!");
/*
* If do_exit is called because this process oopsed, it's possible
* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
* continuing. Amongst other possible reasons, this is to prevent
* mm_release()->clear_child_tid() from writing to a user-controlled
* kernel address.
*/
set_fs(USER_DS); // resets the thread's kernel address-space limit (addr_limit) to USER_DS; despite the name, this is not the x86 %fs segment register
tracehook_report_exit(&code); // possibly stop for a ptrace event notification
validate_creds_for_do_exit(tsk); // sanity-check credentials on entry to do_exit(); the credential mechanism is not our concern here
/*
* We're taking recursive faults here in do_exit. Safest is to just
* leave this task alone and wait for reboot.
*/
if (unlikely(tsk->flags & PF_EXITING)) {
printk(KERN_ALERT
"Fixing recursive fault but reboot is needed!\n");
/*
* We can do this unlocked here. The futex code uses
* this flag just to verify whether the pi state
* cleanup has been done or not. In the worst case it
* loops once more. We pretend that the cleanup was
* done as there is no way to return. Either the
* OWNER_DIED bit is set by now or we push the blocked
* task into the wait for ever nirwana as well.
*/
tsk->flags |= PF_EXITPIDONE; // pretend the pi-state cleanup is done (see the comment above), then sleep forever
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}
exit_irq_thread();
/*
* Set the THREAD DIED flag to prevent further wakeups of the
* soon to be gone threaded handler.
*/
exit_signals(tsk); /* sets PF_EXITING */ /* signal handling is discussed in a later chapter */
/*
* tsk->flags are checked in the futex code to protect against
* an exiting task cleaning up the robust pi futexes.
*/
smp_mb();
raw_spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic()))
printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
preempt_count());
acct_update_integrals(tsk); // update mm integral fields in task_struct; in the source this mostly updates the time-based accounting fields
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
sync_mm_rss(tsk, tsk->mm); // fold the task's cached per-thread RSS counters into the mm before statistics are gathered
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
}
acct_collect(code, group_dead); //collect accounting information into pacct_struct.
if (group_dead)
tty_audit_exit();
if (unlikely(tsk->audit_context))
audit_free(tsk);
tsk->exit_code = code;
taskstats_exit(tsk, group_dead); //Send pid data out on exit
exit_mm(tsk); // detach from the mm; if no other user remains, release it
if (group_dead)
acct_process();
trace_sched_process_exit(tsk);
exit_sem(tsk); // release the System V semaphore undo state; for details see 《情景分析》 p. 339
exit_files(tsk); // release the file-descriptor table. Note that this only drops references to the struct file objects; any flushing or writeback happens when the last reference to a file is dropped, not here
/*
*void exit_files(struct task_struct *tsk)
*{
* struct files_struct * files = tsk->files;
*
* if (files) {
* task_lock(tsk);
* tsk->files = NULL;
* task_unlock(tsk);
* put_files_struct(files);
* }
*}
*/
exit_fs(tsk); // analogous to exit_files(), but for the fs_struct (root and current directory)
check_stack_usage();
exit_thread(); // release per-thread resources referenced from task_struct's thread_struct (e.g. the I/O bitmap)
/*
 * Free current thread data structures etc..
 *
*void exit_thread(void)
*{
* struct task_struct *me = current;
* struct thread_struct *t = &me->thread;
* unsigned long *bp = t->io_bitmap_ptr;
*
* if (bp) {
* struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); // Linux does not use the TSS the way the x86 architecture intended; see 《情景分析》 p. 265
*
* t->io_bitmap_ptr = NULL;
* clear_thread_flag(TIF_IO_BITMAP);
* Careful, clear this in the TSS too:
* memset(tss->io_bitmap, 0xff, t->io_bitmap_max); // set every bit in the TSS I/O bitmap; a set bit means the port is NOT accessible from user space
* t->io_bitmap_max = 0;
* put_cpu();
* kfree(bp); // free the bitmap itself
* }
*}
*/
/*
* Flush inherited counters to the parent - before the parent
* gets woken up by child-exit notifications.
*
* because of cgroup mode, must be called before cgroup_exit()
*/
perf_event_exit_task(tsk); //When a child task exits, feed back event values to parent events.
cgroup_exit(tsk, 1);
if (group_dead)
disassociate_ctty(1);
module_put(task_thread_info(tsk)->exec_domain->module);
proc_exit_connector(tsk);
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
ptrace_put_breakpoints(tsk);
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
task_lock(tsk);
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
task_unlock(tsk);
#endif
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
kfree(current->pi_state_cache);
#endif
/*
* Make sure we are holding no locks:
*/
debug_check_no_locks_held(tsk);
/*
* We can do this unlocked here. The futex code uses this flag
* just to verify whether the pi state cleanup has been done
* or not. In the worst case it loops once more.
*/
tsk->flags |= PF_EXITPIDONE;
if (tsk->io_context)
exit_io_context(tsk);
if (tsk->splice_pipe)
__free_pipe_info(tsk->splice_pipe);
validate_creds_for_do_exit(tsk);
preempt_disable();
exit_rcu();
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
schedule(); // final reschedule: the state is TASK_DEAD, so the scheduler will never pick this task again; this is how do_exit() never returns
BUG();
/* Avoid "noreturn function does return". */
for (;;)
cpu_relax(); /* For when BUG is null */
}
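Before following the individual helpers, it is worth seeing the user-visible contract of do_exit(): the code passed in is stored in tsk->exit_code and later reported to the parent (sys_exit actually calls do_exit((error_code & 0xff) << 8)). A minimal user-space illustration, plain POSIX rather than kernel code:

/* Illustration: the exit code set in do_exit() surfaces in the parent via wait(). */
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();
    if (pid == 0)
        _exit(42);                /* child: sys_exit -> do_exit((42 & 0xff) << 8) */

    int status;
    waitpid(pid, &status, 0);     /* parent retrieves the status published at exit */
    if (WIFEXITED(status))
        printf("child exited with status %d\n", WEXITSTATUS(status)); /* prints 42 */
    return 0;
}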
We have now reached the first key function of do_exit(): exit_mm(tsk).
/*
* Turn us into a lazy TLB process if we
* aren't already..
*/
static void exit_mm(struct task_struct * tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
mm_release(tsk, mm);
if (!mm)
return;
/*
* Serialize with any possible pending coredump.
* We must hold mmap_sem around checking core_state
* and clearing tsk->mm. The core-inducing thread
* will increment ->nr_threads for each thread in the
* group with ->mm != NULL.
*/
down_read(&mm->mmap_sem);
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
up_read(&mm->mmap_sem);
self.task = tsk;
self.next = xchg(&core_state->dumper.next, &self);
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
*/
if (atomic_dec_and_test(&core_state->nr_threads))
complete(&core_state->startup);
for (;;) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
schedule();
}
__set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
}
atomic_inc(&mm->mm_count);
BUG_ON(mm != tsk->active_mm);
/* more a memory barrier than a real lock */
task_lock(tsk);
tsk->mm = NULL;
up_read(&mm->mmap_sem);
enter_lazy_tlb(mm, current);
/*
*enter_lazy_tlb notifies the underlying architecture that exchanging the userspace portion of the virtual
*address space is not required. This speeds up the context switch and is known as the lazy TLB technique.
*/
/* We don't want this task to be frozen prematurely */
clear_freeze_flag(tsk); // make sure the freezer does not freeze this exiting task prematurely
if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
atomic_dec(&mm->oom_disable_count);
task_unlock(tsk);
mm_update_next_owner(mm); // A task is exiting. If it owned this mm, find a new owner for the mm.
mmput(mm);
}
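The coredump synchronization above is built on struct completion: the exiting thread calls complete(&core_state->startup) while the dump-inducing thread sleeps in wait_for_completion(). The same primitive backs vfork_done in mm_release() below. As a rough user-space analogue, here is a toy completion built on pthreads (an illustrative sketch, not the kernel implementation):

/* Toy user-space analogue of the kernel's struct completion. */
#include <pthread.h>

struct completion {
    pthread_mutex_t lock;
    pthread_cond_t  cond;
    int             done;
};

static void init_completion(struct completion *c)
{
    pthread_mutex_init(&c->lock, NULL);
    pthread_cond_init(&c->cond, NULL);
    c->done = 0;
}

static void complete(struct completion *c)   /* cf. complete(&core_state->startup) */
{
    pthread_mutex_lock(&c->lock);
    c->done = 1;
    pthread_cond_signal(&c->cond);
    pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
    pthread_mutex_lock(&c->lock);
    while (!c->done)                         /* no lost wakeups, no spurious returns */
        pthread_cond_wait(&c->cond, &c->lock);
    pthread_mutex_unlock(&c->lock);
}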
The first key function inside exit_mm() is mm_release(); for more on its role, see 《情景分析》 p. 342.
/* Please note the differences between mmput and mm_release.
* mmput is called whenever we stop holding onto a mm_struct,
* error success whatever.
*
* mm_release is called after a mm_struct has been removed
* from the current process.
*
* This difference is important for error handling, when we
* only half set up a mm_struct for a new process and need to restore
* the old one. Because we mmput the new mm_struct before
* restoring the old one. . .
* Eric Biederman 10 January 1998
*/
void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
struct completion *vfork_done = tsk->vfork_done;
/* Get rid of any futexes when releasing the mm */
#ifdef CONFIG_FUTEX
if (unlikely(tsk->robust_list)) {
exit_robust_list(tsk);
tsk->robust_list = NULL;
}
#ifdef CONFIG_COMPAT
if (unlikely(tsk->compat_robust_list)) {
compat_exit_robust_list(tsk);
tsk->compat_robust_list = NULL;
}
#endif
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
#endif
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
/* notify parent sleeping on vfork() */
if (vfork_done) {
tsk->vfork_done = NULL;
complete(vfork_done); // wake the parent sleeping in vfork()
}
/*
* If we're exiting normally, clear a user-space tid field if
* requested. We leave this alone when dying by signal, to leave
* the value intact in a core dump, and to save the unnecessary
* trouble otherwise. Userland only wants this done for a sys_exit.
*/
if (tsk->clear_child_tid) {
if (!(tsk->flags & PF_SIGNALED) &&
atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
*/
put_user(0, tsk->clear_child_tid);
sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
1, NULL, NULL, 0);
}
tsk->clear_child_tid = NULL;
}
}
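The clear_child_tid block above is the kernel half of the CLONE_CHILD_CLEARTID protocol that pthread_join() is built on: when the child exits, the kernel zeroes the registered TID word and does a FUTEX_WAKE on it. A minimal sketch of the user-space half follows (Linux-specific; error handling omitted; the stack size and child function are arbitrary choices of mine):

/* Sketch of the user-space side of CLONE_CHILD_CLEARTID. On child exit,
 * mm_release() does put_user(0, clear_child_tid) plus a FUTEX_WAKE,
 * which is exactly what ends the wait loop below. */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

static int child_fn(void *arg)
{
    return 0;                     /* child exits; the kernel clears ctid and wakes us */
}

int main(void)
{
    static int ctid = 1;          /* stays nonzero until the child is fully gone */
    char *stack = malloc(64 * 1024);

    clone(child_fn, stack + 64 * 1024,
          CLONE_VM | CLONE_CHILD_CLEARTID | SIGCHLD, NULL,
          NULL, NULL, &ctid);     /* registers &ctid as tsk->clear_child_tid */

    while (ctid != 0)             /* a "join": sleep until the kernel zeroes ctid */
        syscall(SYS_futex, &ctid, FUTEX_WAIT, ctid, NULL, NULL, 0);

    printf("child fully exited\n");
    return 0;
}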
The last operation in exit_mm(), mmput(), is the core of the whole function:
/*
* Decrement the use count and release all resources for an mm.
*/
void mmput(struct mm_struct *mm)
{
might_sleep();
if (atomic_dec_and_test(&mm->mm_users)) { // drop one mm_users reference; continue only if it was the last
exit_aio(mm);
/* exit_aio: called when the last user of mm goes away. At this point,
* there is no way for any new requests to be submited or any of the
* io_* syscalls to be called on the context. However, there may be
* outstanding requests which hold references to the context; as they
* go away, they will call put_ioctx and release any pinned memory
* associated with the request (held via struct page * references).
*/
ksm_exit(mm); // KSM teardown; not our concern here
khugepaged_exit(mm); /* must run before exit_mmap; not our concern here */
exit_mmap(mm);
set_mm_exe_file(mm, NULL); // drop the mm's reference to its executable file (nothing is written back)
if (!list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
list_del(&mm->mmlist); // unlink this mm from the global mmlist
spin_unlock(&mmlist_lock);
}
put_swap_token(mm); // release the swap token if this mm holds it
if (mm->binfmt)
module_put(mm->binfmt->module); // drop the reference on the binfmt module that handles this executable format, allowing a pending unload of that module to proceed
mmdrop(mm); // the crucial final step, detailed below
}
}
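One detail worth spelling out here: mm_struct carries two reference counters. mm_users counts users of the address space and is what mmput() drops; mm_count counts references to the mm_struct itself (all mm_users together hold exactly one), is taken by exit_mm() via atomic_inc(&mm->mm_count) for the lazy-TLB period, and is dropped by mmdrop(). A toy model of the scheme (simplified sketch, not the real code):

/* Toy model of the two-counter scheme protecting mm_struct. */
#include <stdatomic.h>
#include <stdlib.h>

struct toy_mm {
    atomic_int mm_users;    /* users of the address space */
    atomic_int mm_count;    /* references to the structure itself */
};

static void toy_mmdrop(struct toy_mm *mm)
{
    if (atomic_fetch_sub(&mm->mm_count, 1) == 1)
        free(mm);                     /* cf. __mmdrop(): free pgd, then the mm */
}

static void toy_mmput(struct toy_mm *mm)
{
    if (atomic_fetch_sub(&mm->mm_users, 1) == 1) {
        /* last user: tear down the mappings (cf. exit_mmap()) ... */
        toy_mmdrop(mm);               /* ... then drop the mm_count ref held for all users */
    }
}

This split is why the dead task's page tables can be torn down by mmput() while the mm_struct itself lives on as tsk->active_mm until finish_task_switch() performs the matching mmdrop() after the final schedule().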
Let us first look at the first key function inside mmput(): exit_mmap().
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
unsigned long end;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
if (mm->locked_vm) {
vma = mm->mmap;
while (vma) { // walk every VMA and undo any mlock
if (vma->vm_flags & VM_LOCKED)
munlock_vma_pages_all(vma); // munlock all pages in the vma range
vma = vma->vm_next;
}
}
arch_exit_mmap(mm);
vma = mm->mmap;
if (!vma) /* Can happen if dup_mmap() received an OOM */
return;
lru_add_drain();
flush_cache_mm(mm);
tlb_gather_mmu(&tlb, mm, 1);
/* tlb_gather_mmu
* Called to initialize an (on-stack) mmu_gather structure for page-table
* tear-down from mm.
*/
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); // the central call here; detailed below
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); // free the page tables themselves
tlb_finish_mmu(&tlb, 0, end);
/*
* Walk the list again, actually closing and freeing it,
* with preemption enabled, without holding any MM locks.
*/
while (vma)
vma = remove_vma(vma);// Close a vm structure and free it, returning the next.
BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
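The tlb_gather_mmu()/tlb_finish_mmu() pair brackets a batching structure: instead of flushing the TLB and freeing each page one at a time, pages are gathered and the expensive flush is amortized over a whole batch. A toy sketch of that gather-then-flush pattern (hypothetical names, no real TLB work):

/* Toy gather-then-flush batching in the spirit of mmu_gather. */
#include <stdlib.h>

#define GATHER_BATCH 64

struct toy_gather {
    void  *pages[GATHER_BATCH];
    size_t nr;
};

static void toy_flush(struct toy_gather *g)
{
    /* the kernel would flush the TLB once here, then free every gathered page */
    for (size_t i = 0; i < g->nr; i++)
        free(g->pages[i]);
    g->nr = 0;
}

static void toy_remove_page(struct toy_gather *g, void *page)
{
    g->pages[g->nr++] = page;     /* the cheap part: just remember the page */
    if (g->nr == GATHER_BATCH)
        toy_flush(g);             /* the expensive part, amortized per batch */
}

/* callers end with one final toy_flush(), just as tlb_finish_mmu() does */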
The unmap_vmas() function unmaps a range of memory covered by a list of VMAs:
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
* @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
* @details: details of nonlinear truncation or shared cache invalidation
*
* Returns the end address of the unmapping (restart addr if interrupted).
*
* Unmap all pages in the vma list.
*
* We aim to not hold locks for too long (for scheduling latency reasons).
* So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
* return the ending mmu_gather to the caller.
*
* Only addresses between `start' and `end' will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
*
* unmap_vmas() assumes that the caller will flush the whole unmapped address
* range after unmap_vmas() returns. So the only responsibility here is to
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
unsigned long unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
unsigned long start = start_addr;
struct mm_struct *mm = vma->vm_mm;
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long end;
start = max(vma->vm_start, start_addr);
if (start >= vma->vm_end)
continue;
end = min(vma->vm_end, end_addr);
if (end <= vma->vm_start)
continue;
if (vma->vm_flags & VM_ACCOUNT)
*nr_accounted += (end - start) >> PAGE_SHIFT; // count the pages being unmapped from this VM_ACCOUNT vma
if (unlikely(is_pfn_mapping(vma)))
untrack_pfn_vma(vma, 0, 0);
while (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
* It is undesirable to test vma->vm_file as it
* should be non-null for valid hugetlb area.
* However, vm_file will be NULL in the error
* cleanup path of do_mmap_pgoff. When
* hugetlbfs ->mmap method fails,
* do_mmap_pgoff() nullifies vma->vm_file
* before calling this function to clean up.
* Since no pte has actually been setup, it is
* safe to do nothing in this case.
*/
if (vma->vm_file)
unmap_hugepage_range(vma, start, end, NULL);
start = end;
} else
start = unmap_page_range(tlb, vma, start, end, details);
// the actual page unmapping; an important function in its own right, covered in the memory-management chapters
}
}
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
return start; /* which is now the end (or restart) address */
}
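The heart of this loop is clamping the requested [start_addr, end_addr) range against each VMA's [vm_start, vm_end): exit_mmap() passes 0 and -1 so that every VMA is covered, but the same loop also serves partial unmaps such as munmap(). A self-contained toy version of the clamping (illustrative types and names):

/* Toy version of the per-VMA range clamping in unmap_vmas(). */
#include <stdio.h>

struct toy_vma {
    unsigned long vm_start, vm_end;
    struct toy_vma *vm_next;
};

static void toy_unmap_vmas(struct toy_vma *vma,
                           unsigned long start_addr, unsigned long end_addr)
{
    for (; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
        unsigned long start = vma->vm_start > start_addr ? vma->vm_start : start_addr;
        unsigned long end   = vma->vm_end   < end_addr   ? vma->vm_end   : end_addr;
        if (start >= end)
            continue;             /* no overlap with this VMA */
        printf("unmap [%#lx, %#lx)\n", start, end);
    }
}

int main(void)
{
    struct toy_vma b = { 0x6000, 0x8000, NULL };
    struct toy_vma a = { 0x1000, 0x3000, &b };
    toy_unmap_vmas(&a, 0x2000, 0x7000); /* prints [0x2000,0x3000) and [0x6000,0x7000) */
    return 0;
}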
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
* mmput. Free the page directory and the mm.
*/
void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
mm_free_pgd(mm); // free the page global directory
destroy_context(mm); // free the arch-specific MMU context (on x86 this releases the LDT)
mmu_notifier_mm_destroy(mm);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
VM_BUG_ON(mm->pmd_huge_pte);
#endif
free_mm(mm); // finally free the mm_struct itself
}
We have now completed the whole exit_mm() path.
To summarize what exit_mm() mainly does:
(1) First, mm_release(). This matters chiefly for processes created with vfork(): such a child blocks its parent until it exits (or execs), at which point it completes vfork_done to wake the parent.
(2) The final call, mmput(), is the core of the function: it decrements the user count and, once it reaches zero, releases all of the mm's resources: pages, page directory, page tables, and finally the mm_struct itself.
Note, however, that the task_struct itself is not freed here; there are two main reasons for this, see 《情景分析》 p. 342.
When releasing resources there is a simple rule of thumb: if the task holds a pointer to a data structure or buffer that the kernel allocated for it at creation time or during its lifetime, and that pointer is the only path to that object, then the object must be freed here, or the kernel leaks memory.
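The exit_files()/exit_fs() code quoted earlier shows the standard way the exit path honors this rule without holding locks while freeing: detach the pointer under the task lock, then drop the reference outside it. A distilled sketch of that idiom (toy names, not kernel code):

/* The detach-then-put idiom from exit_files()/exit_fs(), distilled. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct resource { atomic_int count; /* ... payload ... */ };

struct toy_task {
    pthread_mutex_t  alloc_lock;  /* cf. task_lock() */
    struct resource *files;
};

static void put_resource(struct resource *r)
{
    if (atomic_fetch_sub(&r->count, 1) == 1)
        free(r);                  /* last reference: really free it */
}

static void toy_exit_files(struct toy_task *tsk)
{
    struct resource *files = tsk->files;

    if (files) {
        pthread_mutex_lock(&tsk->alloc_lock);
        tsk->files = NULL;        /* no path to the object through tsk anymore */
        pthread_mutex_unlock(&tsk->alloc_lock);
        put_resource(files);      /* potentially expensive release, no locks held */
    }
}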