linux fork系统调用

fork系统调用是内核中相当麻烦的一部分,由于进程数据结构struct task_struct包含了进程运行所需的所有的数据结构,包括虚拟地址空间,文件系统,打开的文件,信号处理程序,sys v实例,命名空间和IO上下文等。fork系统调用最后调用do_fork函数处理请求:

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long do_fork(unsigned long clone_flags,//低字节表示子进程结束发送给父进程的退出代码,高3字节包含各种flag定制子进程
	      unsigned long stack_start, //父进程的用户空间栈的地址
	      struct pt_regs *regs, //通用寄存器的值的指针,从用户态切换到内核态时保存在内核栈中
	      unsigned long stack_size, //未使用
	      int __user *parent_tidptr, /*当clone_flags的高字节中设置CLONE_PARENT_SETTID,则将子进程的pid写入这个父进程的用户空间变量*/
	      int __user *child_tidptr) /*当clone_flags的高字节中设置CLONE_CHILD_SETTID,则将子进程的pid写入这个父进程的用户空间变量*/

{
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Do some preliminary argument and permissions checking before we
	 * actually start allocating stuff
	 */
	if (clone_flags & CLONE_NEWUSER) { /*检查标志冲突,建立新的命名空间就不能是设置创建新线程标志,两者冲突*/
		if (clone_flags & CLONE_THREAD)
			return -EINVAL;
		/* hopefully this check will go away when userns support is
		 * complete
		 */
		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
				!capable(CAP_SETGID))/*检查权限,检查是否有创建新命名空间的权限*/
			return -EPERM;
	}

	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED))/*设置试调器的选项,如果是在用户空间,且没有设置不被跟踪,则设置试调器选项,user_mode通过测试代码段寄存器的cs低两位是否是USER级别来判断是否是用户空间调用fork*/ {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	p = copy_process(clone_flags, stack_start, regs, stack_size,
			 child_tidptr, NULL, trace); //复制父进程的所有数据结构
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;

		trace_sched_process_fork(current, p);

		nr = task_pid_vnr(p);

		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr); //将子进程的pid复制给用户空间变量parent_tidptr

		if (clone_flags & CLONE_VFORK) { /*如果是vfork调用,则父进程在子进程退出或者调用execve之前不能被调度执行,因此父进程必须等待,这里使用完成量,下面可以看到,父进程等待子进程执行完成*/
			p->vfork_done = &vfork;
			init_completion(&vfork);
		}

		audit_finish_fork(p); //初始化审计相关数据

		/*
		 * We set PF_STARTING at creation in case tracing wants to
		 * use this to distinguish a fully live task from one that
		 * hasn't finished SIGSTOP raising yet.  Now we clear it
		 * and set the child going.
		 */
		p->flags &= ~PF_STARTING;

		wake_up_new_task(p); //唤醒新进程,将新产生的进程加入到就绪队列并设置为TASK_RUNNING状态

		/* forking complete and child started to run, tell ptracer */
		if (unlikely(trace))
			ptrace_event(trace, nr);

		if (clone_flags & CLONE_VFORK) {
			freezer_do_not_count();
			wait_for_completion(&vfork);/*如果设置了CLONE_VFORK,则父进程进入等待状态,在子进程的等待队列上等待子进程唤醒自己*/
			freezer_count();
			ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
		}
	} else {
		nr = PTR_ERR(p);
	}
	return nr;
}

先看看比较容易实现的wake_new_task操作,在wake_new_task之前,copy_process已经进行了调度器相关的设置。

/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_allowed can change in the fork path
	 *  - any previously selected cpu might disappear through hotplug
	 */
	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); /* 设置进程的的cpu,在task_struct->thread_info中设置,thread_info的数据结构在进程内核栈顶。*/
#endif

	rq = __task_rq_lock(p);
	activate_task(rq, p, 0); //将进程加入到就绪队列
	p->on_rq = 1; //设置标志,表示进程在就绪队列上
	trace_sched_wakeup_new(p, true); 
	check_preempt_curr(rq, p, WF_FORK); //比较进程的优先级,新进程是否可以抢占之前运行的进程
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);
#endif
	task_rq_unlock(rq, p, &flags);
}

可以看到,do_fork所有的麻烦的事情都放在了copy_process中,copy_process完成各种task_struct中的数据结构的复制操作。下面随便看一下copy_process操作

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *///参数都和do_fork中的含义相同
static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					struct pt_regs *regs,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace)
{
	int retval;
	struct task_struct *p;
	int cgroup_callbacks_done = 0;

	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))/*检查标志冲突,新命名空间不能和共享文件系统兼容*/
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))/*线程必须共享信号处理程序*/
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))/*由于装载的信号处理程序只能在当前虚拟内存空间中运行,所以如果设置了共享信号处理程序就必须共享虚拟内存空间*/
		return ERR_PTR(-EINVAL);

	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
	if ((clone_flags & CLONE_PARENT) &&
				current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current); /*复制task_struct结构,不包括其他的数据结构,并且新的task_struct实例p继承了current的normal_prio,并设置了p的调度器相关状态,如调度类、load_weight(用于CFS)*/
	if (!p)
		goto fork_out;

	ftrace_graph_init_task(p);

	rt_mutex_init_task(p); //初始化实时互斥锁

#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = -EAGAIN;
	if (atomic_read(&p->real_cred->user->processes) >=
			task_rlimit(p, RLIMIT_NPROC)) {
		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
		    p->real_cred->user != INIT_USER)
			goto bad_fork_free;
	}
	current->flags &= ~PF_NPROC_EXCEEDED;

	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	retval = -EAGAIN;
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (!try_module_get(task_thread_info(p)->exec_domain->module))
		goto bad_fork_cleanup_count;
	/*初始化p的相关成员*/
	p->did_exec = 0;
	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	copy_flags(clone_flags, p);
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	rcu_copy_process(p);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	init_sigpending(&p->pending);

	p->utime = cputime_zero;
	p->stime = cputime_zero;
	p->gtime = cputime_zero;
	p->utimescaled = cputime_zero;
	p->stimescaled = cputime_zero;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
	p->prev_utime = cputime_zero;
	p->prev_stime = cputime_zero;
#endif
#if defined(SPLIT_RSS_COUNTING)
	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

	p->default_timer_slack_ns = current->timer_slack_ns;

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	posix_cpu_timers_init(p);

	do_posix_clock_monotonic_gettime(&p->start_time);
	p->real_start_time = p->start_time;
	monotonic_to_bootbased(&p->real_start_time);
	p->io_context = NULL;
	p->audit_context = NULL;
	if (clone_flags & CLONE_THREAD)
		threadgroup_fork_read_lock(current);
	cgroup_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_dup(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup_cgroup;
	}
	mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
	p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	p->hardirqs_enabled = 1;
#else
	p->hardirqs_enabled = 0;
#endif
	p->hardirq_enable_ip = 0;
	p->hardirq_enable_event = 0;
	p->hardirq_disable_ip = _THIS_IP_;
	p->hardirq_disable_event = 0;
	p->softirqs_enabled = 1;
	p->softirq_enable_ip = _THIS_IP_;
	p->softirq_enable_event = 0;
	p->softirq_disable_ip = 0;
	p->softirq_disable_event = 0;
	p->hardirq_context = 0;
	p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
	p->lockdep_depth = 0; /* no locks held yet */
	p->curr_chain_key = 0;
	p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
	p->memcg_batch.do_batch = 0;
	p->memcg_batch.memcg = NULL;
#endif

	/* Perform scheduler related setup. Assign this task to a CPU. */
	sched_fork(p);

	retval = perf_event_init_task(p);
	if (retval)
		goto bad_fork_cleanup_policy;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_policy;
	/* copy all the process information */
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_files(clone_flags, p); //复制打开的文件描述符
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);  //复制文件系统
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p); //复制信号处理程序
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p); //复制信号集
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p); //复制虚拟内存空间
	if (retval)
		goto bad_fork_cleanup_signal;
	retval = copy_namespaces(clone_flags, p); //复制命名空间
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
	if (retval)
		goto bad_fork_cleanup_io;

	if (pid != &init_struct_pid) {//pid为null
		retval = -ENOMEM;
		pid = alloc_pid(p->nsproxy->pid_ns); //在p的命名空间中分配一个新的pid
		if (!pid)
			goto bad_fork_cleanup_io;
	}
/*设置p的id相关值,pid,tid,tgid*/
	p->pid = pid_nr(pid);
	p->tgid = p->pid;
	if (clone_flags & CLONE_THREAD)
		p->tgid = current->tgid;

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。
}
关注一下copy_process的主体操作,复制几个数据结构,其中最麻烦应该是复制虚拟地址空间了,由于牵涉到内存管理,缺页异常等。看一下到底复制了虚拟地址空间的哪些东西。由于copy_mm的主要操作都在dup_mm中,直接看一下dup_mm这个函数的具体实现:

/*
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 */
struct mm_struct *dup_mm(struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm = current->mm;
	int err;

	if (!oldmm)
		return NULL;

	mm = allocate_mm(); //在内存中分配一个mm_struct的实例
	if (!mm)
		goto fail_nomem;

	memcpy(mm, oldmm, sizeof(*mm)); //将父进程的mm_struct全部复制过来
	mm_init_cpumask(mm);

	/* Initializing for Swap token stuff */
	mm->token_priority = 0;
	mm->last_interval = 0;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	mm->pmd_huge_pte = NULL;
#endif

	if (!mm_init(mm, tsk)) //初始化mm_struct中几个成员,其中最重要的就是给pdg分配一页,用于全局页表
		goto fail_nomem;

	if (init_new_context(tsk, mm))
		goto fail_nocontext;

	dup_mm_exe_file(oldmm, mm); /*进程二进制执行文件共享一下,由于fork调用之后,新进程在父进程执行的地方继续执行,因此简单增加父进程的的exe_file的引用计数就可以了*/

	err = dup_mmap(mm, oldmm); /* 复制用户空间页表,文件映射,匿名摄影等*/
	if (err)
		goto free_pt;

	mm->hiwater_rss = get_mm_rss(mm);  //进程的最大页帧的水印
	mm->hiwater_vm = mm->total_vm; //进程的虚拟内存空间的大小

	if (mm->binfmt && !try_module_get(mm->binfmt->module))
		goto free_pt;

	return mm;

free_pt:
	/* don't put binfmt in mmput, we haven't got module yet */
	mm->binfmt = NULL;
	mmput(mm);

fail_nomem:
	return NULL;

fail_nocontext:
	/*
	 * If init_new_context() failed, we cannot use mmput() to free the mm
	 * because it calls destroy_context()
	 */
	mm_free_pgd(mm);
	free_mm(mm);
	return NULL;
}
真正的虚拟内存空间的复制在dup_mmap中,复制了父进程的文件映射和匿名映射。

static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
	struct mempolicy *pol;

	down_write(&oldmm->mmap_sem);
	flush_cache_dup_mm(oldmm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

	mm->locked_vm = 0;
	mm->mmap = NULL;
	mm->mmap_cache = NULL;
	mm->free_area_cache = oldmm->mmap_base;
	mm->cached_hole_size = ~0UL;
	mm->map_count = 0;
	cpumask_clear(mm_cpumask(mm));
	mm->mm_rb = RB_ROOT;
	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
	retval = ksm_fork(mm, oldmm);
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
	if (retval)
		goto out;

	prev = NULL;
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {//遍历父进程的vm_area_struct的链表,复制所有项
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {//不让复制的标志则跳过
			long pages = vma_pages(mpnt);
			mm->total_vm -= pages;
			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
								-pages);
			continue;
		}
		charge = 0;
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
			if (security_vm_enough_memory(len))
				goto fail_nomem;
			charge = len;
		}
		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); //分配一个新的vm_area_struct实例
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		INIT_LIST_HEAD(&tmp->anon_vma_chain);
		pol = mpol_dup(vma_policy(mpnt));
		retval = PTR_ERR(pol);
		if (IS_ERR(pol))
			goto fail_nomem_policy;
		vma_set_policy(tmp, pol);
		tmp->vm_mm = mm; //设置其mm_struct
		if (anon_vma_fork(tmp, mpnt)) //创建当前vm_area_struct的匿名映射
			goto fail_nomem_anon_vma_fork;
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_next = tmp->vm_prev = NULL;
		file = tmp->vm_file; 
		if (file) { //file不为空表示vm_area_struct对应一个文件映射
			struct inode *inode = file->f_path.dentry->d_inode;
			struct address_space *mapping = file->f_mapping; //文件映射的地址空间

			get_file(file); //增加计数
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);
			mutex_lock(&mapping->i_mmap_mutex);
			if (tmp->vm_flags & VM_SHARED)
				mapping->i_mmap_writable++; //如果是共享映射则增加计数
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			vma_prio_tree_add(tmp, mpnt); //将其增加到文件地址空间的优先树上面,用于逆向映射
			flush_dcache_mmap_unlock(mapping);
			mutex_unlock(&mapping->i_mmap_mutex);
		}

		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

		/*
		 * Link in the new vma and copy the page table entries.
		 *///将新的vm_area_struct实例添加到子进程mm_struct的mmap链表中
		*pprev = tmp;
		pprev = &tmp->vm_next;
		tmp->vm_prev = prev;
		prev = tmp;

		__vma_link_rb(mm, tmp, rb_link, rb_parent); //将vm_area_struct实例添加到红黑树上
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		retval = copy_page_range(mm, oldmm, mpnt); //复制vm_area_struct地址空间部分的页表项到新的mm_struct

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	/* a new mm has just been created */
	arch_dup_mmap(oldmm, mm);
	retval = 0;
out:
	up_write(&mm->mmap_sem);
	flush_tlb_mm(oldmm);
	up_write(&oldmm->mmap_sem);
	return retval;
fail_nomem_anon_vma_fork:
	mpol_put(pol);
fail_nomem_policy:
	kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}



你可能感兴趣的:(linux fork系统调用)