The fork system call is one of the more involved parts of the kernel, because the process descriptor struct task_struct references every data structure the process needs to run: the virtual address space, filesystem information, open files, signal handlers, System V IPC objects, namespaces, the I/O context, and so on. The fork system call ultimately ends up in do_fork, which handles the request:
/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long do_fork(unsigned long clone_flags,   /* low byte: the signal sent to the parent when the child terminates (normally SIGCHLD); the remaining bits are flags customizing the child */
          unsigned long stack_start,      /* start of the user-space stack for the new process (for a plain fork, the parent's current stack pointer) */
          struct pt_regs *regs,           /* pointer to the general-purpose register values saved on the kernel stack on entry from user mode */
          unsigned long stack_size,       /* unused */
          int __user *parent_tidptr,      /* if CLONE_PARENT_SETTID is set in clone_flags, the child's pid is written to this user-space variable of the parent */
          int __user *child_tidptr)       /* if CLONE_CHILD_SETTID is set in clone_flags, the child's pid is written to this variable in the child's user space */
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Do some preliminary argument and permissions checking before we
     * actually start allocating stuff
     */
    if (clone_flags & CLONE_NEWUSER) {
        /* flag conflict: creating a new user namespace is incompatible with creating a new thread */
        if (clone_flags & CLONE_THREAD)
            return -EINVAL;
        /* hopefully this check will go away when userns support is
         * complete
         */
        if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
                !capable(CAP_SETGID))   /* permission check: does the caller have the capabilities needed to create a new user namespace? */
            return -EPERM;
    }

    /*
     * Determine whether and which event to report to ptracer. When
     * called from kernel_thread or CLONE_UNTRACED is explicitly
     * requested, no event is reported; otherwise, report if the event
     * for the type of forking is enabled.
     */
    if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
        /* choose the ptrace event to report, but only if we were called from user space and
         * CLONE_UNTRACED is not set; user_mode() checks the privilege level in the low two bits
         * of the saved cs register to decide whether fork was invoked from user space */
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }

    p = copy_process(clone_flags, stack_start, regs, stack_size,
             child_tidptr, NULL, trace);   /* duplicate all of the parent's data structures */
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        struct completion vfork;

        trace_sched_process_fork(current, p);

        nr = task_pid_vnr(p);

        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);   /* copy the child's pid to the parent's user-space variable parent_tidptr */

        if (clone_flags & CLONE_VFORK) {
            /* for vfork, the parent must not be scheduled until the child exits or calls execve,
             * so the parent has to wait; a completion is used for this, and below the parent
             * sleeps on it until the child signals completion */
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        audit_finish_fork(p);   /* initialize audit-related data */

        /*
         * We set PF_STARTING at creation in case tracing wants to
         * use this to distinguish a fully live task from one that
         * hasn't finished SIGSTOP raising yet. Now we clear it
         * and set the child going.
         */
        p->flags &= ~PF_STARTING;

        wake_up_new_task(p);   /* wake the new task: put it on a runqueue and mark it TASK_RUNNING */

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
            ptrace_event(trace, nr);

        if (clone_flags & CLONE_VFORK) {
            freezer_do_not_count();
            wait_for_completion(&vfork);   /* with CLONE_VFORK the parent sleeps here until the child wakes it */
            freezer_count();
            ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
        }
    } else {
        nr = PTR_ERR(p);
    }
    return nr;
}
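The CLONE_VFORK branch above is what backs the vfork() library call: the parent blocks on the completion until the child either calls execve or exits. A minimal user-space sketch of that behaviour, assuming an ordinary POSIX environment (this is illustrative glue code, not kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
    pid_t pid = vfork();   /* ends up in do_fork with CLONE_VFORK | CLONE_VM */
    if (pid == 0) {
        /* child runs first; the parent sleeps on vfork_done until we exec or _exit */
        execlp("echo", "echo", "child ran first", (char *)NULL);
        _exit(127);        /* only reached if exec fails */
    }
    /* parent: wait_for_completion(&vfork) has already returned once we get here */
    printf("parent resumes after the child's execve\n");
    waitpid(pid, NULL, 0);
    return 0;
}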
Let us start with the simpler part, wake_up_new_task. By the time wake_up_new_task runs, copy_process has already done the scheduler-related setup.
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
    unsigned long flags;
    struct rq *rq;

    raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
    /*
     * Fork balancing, do it here and not earlier because:
     *  - cpus_allowed can change in the fork path
     *  - any previously selected cpu might disappear through hotplug
     */
    set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));   /* select a CPU for the task; the cpu is recorded in its thread_info, which lives on the process's kernel stack */
#endif

    rq = __task_rq_lock(p);
    activate_task(rq, p, 0);               /* put the task on the runqueue */
    p->on_rq = 1;                          /* mark the task as being on a runqueue */
    trace_sched_wakeup_new(p, true);
    check_preempt_curr(rq, p, WF_FORK);    /* compare priorities: may the new task preempt the currently running one? */
#ifdef CONFIG_SMP
    if (p->sched_class->task_woken)
        p->sched_class->task_woken(rq, p);
#endif
    task_rq_unlock(rq, p, &flags);
}

As you can see, do_fork leaves all of the hard work to copy_process, which duplicates the various data structures referenced by task_struct. Let us walk through copy_process:
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
*
* It copies the registers, and all the appropriate
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
 */ // the parameters have the same meaning as in do_fork
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
int trace)
{
int retval;
struct task_struct *p;
int cgroup_callbacks_done = 0;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) /* flag conflict: a new mount namespace (CLONE_NEWNS) cannot be combined with sharing filesystem information (CLONE_FS) */
return ERR_PTR(-EINVAL);
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) /* threads must share signal handlers */
return ERR_PTR(-EINVAL);
/*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) /* an installed signal handler can only run in the address space it was installed in, so sharing signal handlers requires sharing the virtual address space */
return ERR_PTR(-EINVAL);
/*
* Siblings of global init remain as zombies on exit since they are
* not reaped by their parent (swapper). To solve this and to avoid
* multi-rooted process trees, prevent global and container-inits
* from creating siblings.
*/
if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
retval = -ENOMEM;
p = dup_task_struct(current); /* duplicate the task_struct itself, but none of the structures it points to; since p starts out as a verbatim copy of current it inherits current's normal_prio and the other scheduler-related fields (scheduling class, load_weight used by CFS), which sched_fork() adjusts later */
if (!p)
goto fork_out;
ftrace_graph_init_task(p);
rt_mutex_init_task(p); // initialize the RT-mutex (priority inheritance) state
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
retval = -EAGAIN;
if (atomic_read(&p->real_cred->user->processes) >=
task_rlimit(p, RLIMIT_NPROC)) {
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
p->real_cred->user != INIT_USER)
goto bad_fork_free;
}
current->flags &= ~PF_NPROC_EXCEEDED;
retval = copy_creds(p, clone_flags);
if (retval < 0)
goto bad_fork_free;
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
retval = -EAGAIN;
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
if (!try_module_get(task_thread_info(p)->exec_domain->module))
goto bad_fork_cleanup_count;
/* initialize assorted members of p */
p->did_exec = 0;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
init_sigpending(&p->pending);
p->utime = cputime_zero;
p->stime = cputime_zero;
p->gtime = cputime_zero;
p->utimescaled = cputime_zero;
p->stimescaled = cputime_zero;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;
#endif
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
p->default_timer_slack_ns = current->timer_slack_ns;
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
posix_cpu_timers_init(p);
do_posix_clock_monotonic_gettime(&p->start_time);
p->real_start_time = p->start_time;
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
if (clone_flags & CLONE_THREAD)
threadgroup_fork_read_lock(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
goto bad_fork_cleanup_cgroup;
}
mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
p->hardirqs_enabled = 1;
#else
p->hardirqs_enabled = 0;
#endif
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
p->hardirq_disable_event = 0;
p->softirqs_enabled = 1;
p->softirq_enable_ip = _THIS_IP_;
p->softirq_enable_event = 0;
p->softirq_disable_ip = 0;
p->softirq_disable_event = 0;
p->hardirq_context = 0;
p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
p->lockdep_recursion = 0;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
p->memcg_batch.do_batch = 0;
p->memcg_batch.memcg = NULL;
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_policy;
/* copy all the process information */
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_files(clone_flags, p); // copy the table of open file descriptors
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p); // copy the filesystem information (root, cwd, umask)
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p); // copy the signal handlers
if (retval)
goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p); // copy the signal_struct (shared signal state)
if (retval)
goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p); // copy the virtual address space
if (retval)
goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p); // copy the namespaces
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) { // pid is NULL when called from do_fork
retval = -ENOMEM;
pid = alloc_pid(p->nsproxy->pid_ns); // allocate a new pid in p's pid namespace
if (!pid)
goto bad_fork_cleanup_io;
}
/* set p's id-related fields: pid (the thread id) and tgid */
p->pid = pid_nr(pid);
p->tgid = p->pid;
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
......
}
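The pid/tgid assignments above are what make every thread of a process report the same getpid(): a thread created with CLONE_THREAD keeps its own pid but inherits the creator's tgid, and getpid() returns the tgid. A small sketch of the visible effect, assuming a POSIX threads environment (it uses the raw gettid syscall, since older glibc versions have no wrapper):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

static void *worker(void *arg)
{
    /* same tgid as the main thread (getpid()), but a distinct pid (gettid()) */
    printf("thread: getpid()=%d gettid()=%ld\n",
           getpid(), (long)syscall(SYS_gettid));
    return NULL;
}

int main(void)
{
    pthread_t t;

    printf("main:   getpid()=%d gettid()=%ld\n",
           getpid(), (long)syscall(SYS_gettid));
    pthread_create(&t, NULL, worker, NULL);   /* clone() with CLONE_THREAD | CLONE_VM | CLONE_SIGHAND | ... */
    pthread_join(t, NULL);
    return 0;
}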
The body of copy_process thus boils down to copying a handful of data structures, and the most involved of these is the virtual address space, since it touches memory management, page-fault handling and so on. Let us see exactly which parts of the virtual address space get copied. Since copy_mm delegates most of its work to dup_mm, we go straight to that function:
/*
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 */
struct mm_struct *dup_mm(struct task_struct *tsk)
{
    struct mm_struct *mm, *oldmm = current->mm;
    int err;

    if (!oldmm)
        return NULL;

    mm = allocate_mm();   /* allocate a new mm_struct instance */
    if (!mm)
        goto fail_nomem;

    memcpy(mm, oldmm, sizeof(*mm));   /* start with a byte-for-byte copy of the parent's mm_struct */
    mm_init_cpumask(mm);

    /* Initializing for Swap token stuff */
    mm->token_priority = 0;
    mm->last_interval = 0;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    mm->pmd_huge_pte = NULL;
#endif

    if (!mm_init(mm, tsk))   /* initialize a few mm_struct members; most importantly, allocate one page for the pgd (the top-level page table) */
        goto fail_nomem;

    if (init_new_context(tsk, mm))
        goto fail_nocontext;

    dup_mm_exe_file(oldmm, mm);   /* share the executable file: after fork the child continues where the parent left off, so simply taking a reference on the parent's exe_file is enough */

    err = dup_mmap(mm, oldmm);    /* copy the user-space page tables, file mappings and anonymous mappings */
    if (err)
        goto free_pt;

    mm->hiwater_rss = get_mm_rss(mm);   /* high-water mark of resident page frames */
    mm->hiwater_vm = mm->total_vm;      /* high-water mark of the virtual address space size */

    if (mm->binfmt && !try_module_get(mm->binfmt->module))
        goto free_pt;

    return mm;

free_pt:
    /* don't put binfmt in mmput, we haven't got module yet */
    mm->binfmt = NULL;
    mmput(mm);

fail_nomem:
    return NULL;

fail_nocontext:
    /*
     * If init_new_context() failed, we cannot use mmput() to free the mm
     * because it calls destroy_context()
     */
    mm_free_pgd(mm);
    free_mm(mm);
    return NULL;
}

The actual copying of the virtual address space happens in dup_mmap, which duplicates the parent's file mappings and anonymous mappings.
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
    struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
    struct rb_node **rb_link, *rb_parent;
    int retval;
    unsigned long charge;
    struct mempolicy *pol;

    down_write(&oldmm->mmap_sem);
    flush_cache_dup_mm(oldmm);
    /*
     * Not linked in yet - no deadlock potential:
     */
    down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

    mm->locked_vm = 0;
    mm->mmap = NULL;
    mm->mmap_cache = NULL;
    mm->free_area_cache = oldmm->mmap_base;
    mm->cached_hole_size = ~0UL;
    mm->map_count = 0;
    cpumask_clear(mm_cpumask(mm));
    mm->mm_rb = RB_ROOT;
    rb_link = &mm->mm_rb.rb_node;
    rb_parent = NULL;
    pprev = &mm->mmap;
    retval = ksm_fork(mm, oldmm);
    if (retval)
        goto out;
    retval = khugepaged_fork(mm, oldmm);
    if (retval)
        goto out;

    prev = NULL;
    for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {   /* walk the parent's list of vm_area_structs and copy every entry */
        struct file *file;

        if (mpnt->vm_flags & VM_DONTCOPY) {   /* skip regions flagged as not to be copied */
            long pages = vma_pages(mpnt);
            mm->total_vm -= pages;
            vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages);
            continue;
        }
        charge = 0;
        if (mpnt->vm_flags & VM_ACCOUNT) {
            unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
            if (security_vm_enough_memory(len))
                goto fail_nomem;
            charge = len;
        }
        tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);   /* allocate a new vm_area_struct instance */
        if (!tmp)
            goto fail_nomem;
        *tmp = *mpnt;
        INIT_LIST_HEAD(&tmp->anon_vma_chain);
        pol = mpol_dup(vma_policy(mpnt));
        retval = PTR_ERR(pol);
        if (IS_ERR(pol))
            goto fail_nomem_policy;
        vma_set_policy(tmp, pol);
        tmp->vm_mm = mm;                 /* point the new vma at the child's mm_struct */
        if (anon_vma_fork(tmp, mpnt))    /* set up the anon_vma (reverse-mapping) structures for this vma */
            goto fail_nomem_anon_vma_fork;
        tmp->vm_flags &= ~VM_LOCKED;
        tmp->vm_next = tmp->vm_prev = NULL;
        file = tmp->vm_file;
        if (file) {   /* a non-NULL file means this vm_area_struct is a file mapping */
            struct inode *inode = file->f_path.dentry->d_inode;
            struct address_space *mapping = file->f_mapping;   /* the address_space of the mapped file */

            get_file(file);   /* take a reference on the file */
            if (tmp->vm_flags & VM_DENYWRITE)
                atomic_dec(&inode->i_writecount);
            mutex_lock(&mapping->i_mmap_mutex);
            if (tmp->vm_flags & VM_SHARED)
                mapping->i_mmap_writable++;   /* count shared writable mappings */
            flush_dcache_mmap_lock(mapping);
            /* insert tmp into the share list, just after mpnt */
            vma_prio_tree_add(tmp, mpnt);   /* add the vma to the file's priority tree, used for reverse mapping */
            flush_dcache_mmap_unlock(mapping);
            mutex_unlock(&mapping->i_mmap_mutex);
        }

        /*
         * Clear hugetlb-related page reserves for children. This only
         * affects MAP_PRIVATE mappings. Faults generated by the child
         * are not guaranteed to succeed, even if read-only
         */
        if (is_vm_hugetlb_page(tmp))
            reset_vma_resv_huge_pages(tmp);

        /*
         * Link in the new vma and copy the page table entries.
         */
        /* add the new vm_area_struct to the mmap list of the child's mm_struct */
        *pprev = tmp;
        pprev = &tmp->vm_next;
        tmp->vm_prev = prev;
        prev = tmp;

        __vma_link_rb(mm, tmp, rb_link, rb_parent);   /* insert the vm_area_struct into the red-black tree */
        rb_link = &tmp->vm_rb.rb_right;
        rb_parent = &tmp->vm_rb;

        mm->map_count++;
        retval = copy_page_range(mm, oldmm, mpnt);   /* copy the page-table entries covering this vma into the new mm_struct */

        if (tmp->vm_ops && tmp->vm_ops->open)
            tmp->vm_ops->open(tmp);

        if (retval)
            goto out;
    }
    /* a new mm has just been created */
    arch_dup_mmap(oldmm, mm);
    retval = 0;
out:
    up_write(&mm->mmap_sem);
    flush_tlb_mm(oldmm);
    up_write(&oldmm->mmap_sem);
    return retval;
fail_nomem_anon_vma_fork:
    mpol_put(pol);
fail_nomem_policy:
    kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
    retval = -ENOMEM;
    vm_unacct_memory(charge);
    goto out;
}
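Note that copy_page_range copies page-table entries, not page contents: for private writable mappings both parent and child end up pointing at the same physical pages, write-protected, and the real copy is deferred to the first write fault (copy-on-write). A minimal user-space sketch of the visible effect, assuming an ordinary POSIX environment:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

int value = 42;   /* private data mapping, duplicated copy-on-write by dup_mmap/copy_page_range */

int main(void)
{
    pid_t pid = fork();
    if (pid == 0) {
        value = 100;                           /* write fault: the child gets its own copy of the page */
        printf("child:  value=%d\n", value);   /* 100 */
        _exit(0);
    }
    waitpid(pid, NULL, 0);
    printf("parent: value=%d\n", value);       /* still 42: the parent's page is untouched */
    return 0;
}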