Linux内核创建一个新进程的过程

作者:王鹤楼
原创作品转载请注明出处 《Linux内核分析》MOOC课程
http://mooc.study.163.com/course/USTC-1000029000

操作系统的三大功能

进程管理
内存管理
文件系统

task_struct

用来描述进程的数据结构,可以理解为进程的属性。进程状态、进程调度信息、各种标识符、进程通信有关信息、时间和定时器信息、进程链接信息、文件系统信息、虚拟内存信息、页面管理信息、对称多处理器和处理器相关的环境等,该数据结构被定义为task_struct

进程控制块 PCB

是操作系统核心中一种数据结构,主要表示进程状态。

进程状态

在实验楼里进行如下操作,由于进行到一半环境太卡,没有充分截图
执行操作:
rm -rf menu
然后克隆一份新的代码:
git clone https://github.com/mengning/menu.git
cd menu
mv fork_test.c test.c
make rootfs
出现菜单后执行fork命令,可以看出打印信息,已经创建了子进程

qemu -kernel linux-3.18.6/arch/x86/boot/bzImage -initrd rootfs.img -s -S

gdb
target remote:1234
file linux-3.18.6/vmlinux
b sys_clone
b do_fork
b dup_task_struct
b copy_process
b copy_thread
b ret_from_fork
c //开始执行

fork函数

调用一次返回两次,父进程中返加子进程的pid,子进程中返回0

//fork
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
    return do_fork(SIGCHLD, 0, 0, NULL, NULL);
#else
    /* can not support in nommu mode */
    return -EINVAL;
#endif
}
#endif

//vfork
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
    return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
            0, NULL, NULL);
}
#endif

//clone
#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int, tls_val,
         int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
         int __user *, parent_tidptr,
         int __user *, child_tidptr,
         int, tls_val)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
        int, stack_size,
        int __user *, parent_tidptr,
        int __user *, child_tidptr,
        int, tls_val)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int __user *, child_tidptr,
         int, tls_val)
#endif
{
    return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}
#endif

从以上代码中可以看出fork,vfork,clone最终都是调用 do_fork来创建新进程

do_fork

long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
        //创建进程描述符指针
        struct task_struct *p;

        //……

        //复制进程描述符,copy_process()的返回值是一个 task_struct 指针。
        p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace);

        if (!IS_ERR(p)) {
            struct completion vfork;
            struct pid *pid;

            trace_sched_process_fork(current, p);

            //得到新创建的进程描述符中的pid
            pid = get_task_pid(p, PIDTYPE_PID);
            nr = pid_vnr(pid);

            if (clone_flags & CLONE_PARENT_SETTID)
                put_user(nr, parent_tidptr);

            //如果调用的 vfork()方法,初始化 vfork 完成处理信息。
            if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
                init_completion(&vfork);
                get_task_struct(p);
            }

            //将子进程加入到调度器中,为其分配 CPU,准备执行
            wake_up_new_task(p);

            //fork 完成,子进程即将开始运行
            if (unlikely(trace))
                ptrace_event_pid(trace, pid);

            //如果是 vfork,将父进程加入至等待队列,等待子进程完成
            if (clone_flags & CLONE_VFORK) {
                if (!wait_for_vfork_done(p, &vfork))
                    ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
            }

            put_pid(pid);
        } else {
            nr = PTR_ERR(p);
        }
        return nr;
}

do_fork 流程

调用 copy_process 为子进程复制出一份进程信息
如果是 vfork 初始化完成处理信息
调用 wake_up_new_task 将子进程加入调度器,为之分配 CPU
如果是 vfork,父进程等待子进程完成 exec 替换自己的地址空间

进入到copy_process函数

static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace)
{
    int retval;
    //创建进程描述符指针
    struct task_struct *p;

    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */
    if ((clone_flags & CLONE_PARENT) &&
                current->signal->flags & SIGNAL_UNKILLABLE)
        return ERR_PTR(-EINVAL);

    /* * If the new process will be in a different pid namespace * don't allow the creation of threads. */
    if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
        (task_active_pid_ns(current) != current->nsproxy->pid_ns))
        return ERR_PTR(-EINVAL);

    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

    retval = -ENOMEM;
    //复制当前的 task_struct
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;

    ftrace_graph_init_task(p);
    get_seccomp_filter(p);

     //初始化互斥变量 
    rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    retval = -EAGAIN;
    //检查进程数是否超过限制,由操作系统定义
    if (atomic_read(&p->real_cred->user->processes) >=
            task_rlimit(p, RLIMIT_NPROC)) {
        if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
            p->real_cred->user != INIT_USER)
            goto bad_fork_free;
    }
    current->flags &= ~PF_NPROC_EXCEEDED;

    retval = copy_creds(p, clone_flags);
    if (retval < 0)
        goto bad_fork_free;

    /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */
    retval = -EAGAIN;
    //检查进程数是否超过 max_threads 由内存大小决定
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    if (!try_module_get(task_thread_info(p)->exec_domain->module))
        goto bad_fork_cleanup_count;

    p->did_exec = 0;
    delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
    copy_flags(clone_flags, p);
    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    rcu_copy_process(p);
    p->vfork_done = NULL;
    //初始化自旋锁
    spin_lock_init(&p->alloc_lock);
    //初始化挂起信号
    init_sigpending(&p->pending);

    p->utime = p->stime = p->gtime = 0;
    p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
    p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    seqlock_init(&p->vtime_seqlock);
    p->vtime_snap = 0;
    p->vtime_snap_whence = VTIME_SLEEPING;
#endif

#if defined(SPLIT_RSS_COUNTING)
    memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

    p->default_timer_slack_ns = current->timer_slack_ns;

    task_io_accounting_init(&p->ioac);
    acct_clear_integrals(p);

    //初始化CPU定时器
    posix_cpu_timers_init(p);

    do_posix_clock_monotonic_gettime(&p->start_time);
    p->real_start_time = p->start_time;
    monotonic_to_bootbased(&p->real_start_time);
    p->io_context = NULL;
    p->audit_context = NULL;
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_begin(current);
    cgroup_fork(p);
#ifdef CONFIG_NUMA
    p->mempolicy = mpol_dup(p->mempolicy);
    if (IS_ERR(p->mempolicy)) {
        retval = PTR_ERR(p->mempolicy);
        p->mempolicy = NULL;
        goto bad_fork_cleanup_cgroup;
    }
    mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
    p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
    p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
    seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    p->irq_events = 0;
    p->hardirqs_enabled = 0;
    p->hardirq_enable_ip = 0;
    p->hardirq_enable_event = 0;
    p->hardirq_disable_ip = _THIS_IP_;
    p->hardirq_disable_event = 0;
    p->softirqs_enabled = 1;
    p->softirq_enable_ip = _THIS_IP_;
    p->softirq_enable_event = 0;
    p->softirq_disable_ip = 0;
    p->softirq_disable_event = 0;
    p->hardirq_context = 0;
    p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
    p->lockdep_depth = 0; /* no locks held yet */
    p->curr_chain_key = 0;
    p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_MEMCG
    p->memcg_batch.do_batch = 0;
    p->memcg_batch.memcg = NULL;
#endif

    /* Perform scheduler related setup. Assign this task to a CPU. */
     //初始化进程数据结构,并把进程状态设置为 TASK_RUNNING
    sched_fork(p);

    retval = perf_event_init_task(p);
     //复制所有进程信息,包括文件系统、信号处理函数、信号、内存管理等
    if (retval)
        goto bad_fork_cleanup_policy;
    retval = audit_alloc(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    /* copy all the process information */
    retval = copy_semundo(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_audit;
    retval = copy_files(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_semundo;
    retval = copy_fs(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_files;
    retval = copy_sighand(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_fs;
    retval = copy_signal(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_sighand;
    retval = copy_mm(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_signal;
    retval = copy_namespaces(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_mm;
    retval = copy_io(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_namespaces;
    //初始化子进程内核栈
    retval = copy_thread(clone_flags, stack_start, stack_size, p);
    if (retval)
        goto bad_fork_cleanup_io;

    if (pid != &init_struct_pid) {
        retval = -ENOMEM;
         //为新进程分配新的 pid
        pid = alloc_pid(p->nsproxy->pid_ns);
        if (!pid)
            goto bad_fork_cleanup_io;
    }

     //设置子进程 pid 
    p->pid = pid_nr(pid);
    p->tgid = p->pid;
    if (clone_flags & CLONE_THREAD)
        p->tgid = current->tgid;

    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /* * Clear TID on mm_release()? */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
    p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;
#endif
    uprobe_copy_process(p);
    /* * sigaltstack should be cleared when sharing the same VM */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        p->sas_ss_sp = p->sas_ss_size = 0;

    /* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */
    user_disable_single_step(p);
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
    clear_all_latency_tracing(p);

    /* ok, now we should be set up.. */
    if (clone_flags & CLONE_THREAD)
        p->exit_signal = -1;
    else if (clone_flags & CLONE_PARENT)
        p->exit_signal = current->group_leader->exit_signal;
    else
        p->exit_signal = (clone_flags & CSIGNAL);

    p->pdeath_signal = 0;
    p->exit_state = 0;

    p->nr_dirtied = 0;
    p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
    p->dirty_paused_when = 0;

    /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. */
    p->group_leader = p;
    INIT_LIST_HEAD(&p->thread_group);
    p->task_works = NULL;

    /* Need tasklist lock for parent etc handling! */
    write_lock_irq(&tasklist_lock);

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
        p->real_parent = current->real_parent;
        p->parent_exec_id = current->parent_exec_id;
    } else {
        p->real_parent = current;
        p->parent_exec_id = current->self_exec_id;
    }

    spin_lock(&current->sighand->siglock);

    /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the * fork. Restart if a signal comes in before we add the new process to * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). */
    recalc_sigpending();
    if (signal_pending(current)) {
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        retval = -ERESTARTNOINTR;
        goto bad_fork_free_pid;
    }

    if (clone_flags & CLONE_THREAD) {
        current->signal->nr_threads++;
        atomic_inc(&current->signal->live);
        atomic_inc(&current->signal->sigcnt);
        p->group_leader = current->group_leader;
        list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
    }

    if (likely(p->pid)) {
        ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

        if (thread_group_leader(p)) {
            if (is_child_reaper(pid)) {
                ns_of_pid(pid)->child_reaper = p;
                p->signal->flags |= SIGNAL_UNKILLABLE;
            }

            p->signal->leader_pid = pid;
            p->signal->tty = tty_kref_get(current->signal->tty);
            attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
            attach_pid(p, PIDTYPE_SID, task_session(current));
            list_add_tail(&p->sibling, &p->real_parent->children);
            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            __this_cpu_inc(process_counts);
        }
        attach_pid(p, PIDTYPE_PID, pid);
        nr_threads++;
    }

    total_forks++;
    spin_unlock(&current->sighand->siglock);
    write_unlock_irq(&tasklist_lock);
    proc_fork_connector(p);
    cgroup_post_fork(p);
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_end(current);
    perf_event_fork(p);

    trace_task_newtask(p, clone_flags);

    return p;

bad_fork_free_pid:
    if (pid != &init_struct_pid)
        free_pid(pid);
bad_fork_cleanup_io:
    if (p->io_context)
        exit_io_context(p);
bad_fork_cleanup_namespaces:
    exit_task_namespaces(p);
bad_fork_cleanup_mm:
    if (p->mm)
        mmput(p->mm);
bad_fork_cleanup_signal:
    if (!(clone_flags & CLONE_THREAD))
        free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
    __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
    exit_fs(p); /* blocking */
bad_fork_cleanup_files:
    exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
    exit_sem(p);
bad_fork_cleanup_audit:
    audit_free(p);
bad_fork_cleanup_policy:
    perf_event_free_task(p);
#ifdef CONFIG_NUMA
    mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
    if (clone_flags & CLONE_THREAD)
        threadgroup_change_end(current);
    cgroup_exit(p, 0);
    delayacct_tsk_free(p);
    module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
    atomic_dec(&p->cred->user->processes);
    exit_creds(p);
bad_fork_free:
    free_task(p);
fork_out:
    return ERR_PTR(retval);
}

copy_thread函数

int copy_thread(unsigned long clone_flags, unsigned long sp,
    unsigned long arg, struct task_struct *p)
{
    //获取寄存器信息
    struct pt_regs *childregs = task_pt_regs(p);
    struct task_struct *tsk;
    int err;

    p->thread.sp = (unsigned long) childregs;
    p->thread.sp0 = (unsigned long) (childregs+1);
    memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

    if (unlikely(p->flags & PF_KTHREAD)) {
        //内核线程
        memset(childregs, 0, sizeof(struct pt_regs));
        p->thread.ip = (unsigned long) ret_from_kernel_thread;
        task_user_gs(p) = __KERNEL_STACK_CANARY;
        childregs->ds = __USER_DS;
        childregs->es = __USER_DS;
        childregs->fs = __KERNEL_PERCPU;
        childregs->bx = sp; /* function */
        childregs->bp = arg;
        childregs->orig_ax = -1;
        childregs->cs = __KERNEL_CS | get_kernel_rpl();
        childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
        p->thread.io_bitmap_ptr = NULL;
        return 0;
    }

    //将当前寄存器信息复制给子进程
    *childregs = *current_pt_regs();

    //子进程 eax 置 0,因此fork 在子进程返回0
    childregs->ax = 0;
    if (sp)
        childregs->sp = sp;

    //子进程ip 设置为ret_from_fork,因此子进程从ret_from_fork开始执行
    p->thread.ip = (unsigned long) ret_from_fork;

    //……

    return err;
}

ret_from_fork

ENTRY(ret_from_fork)
    CFI_STARTPROC
    pushl_cfi %eax
    call schedule_tail
    GET_THREAD_INFO(%ebp)
    popl_cfi %eax
    pushl_cfi $0x0202       # Reset kernel eflags
    popfl_cfi
    jmp syscall_exit
    CFI_ENDPROC
END(ret_from_fork)

总结

dup_task_struct中为子进程分配了新的堆栈
调用了sched_fork,将其置为TASK_RUNNING
copy_thread中将父进程的寄存器上下文复制给子进程,保证了父子进程的堆栈信息是一致的
将ret_from_fork的地址设置为eip寄存器的值
最终子进程从ret_from_fork开始执行

你可能感兴趣的:(linux,kernel)