Analysis of common process-related helper functions (ARM/ARM64)

register unsigned long current_stack_pointer asm ("sp");//accesses to current_stack_pointer read the sp register directly.

static inline struct thread_info *current_thread_info(void)
{
    return (struct thread_info *)
        (current_stack_pointer & ~(THREAD_SIZE - 1));//The kernel stack is THREAD_SIZE-aligned and sp always points somewhere inside it, so masking off the low bits of sp yields the stack base, where thread_info lives. Example: sp = 0xffffefff with an 8 KiB stack: THREAD_SIZE - 1 = 0x1fff, so ~(THREAD_SIZE - 1) = 0xffffe000, and 0xffffefff & 0xffffe000 = 0xffffe000 -- the bottom of the stack, i.e. the thread_info address.
}
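
To make the masking concrete, here is a minimal user-space sketch of the same arithmetic (assuming an 8 KiB, 8 KiB-aligned stack; this is illustration, not kernel code):

#include <stdio.h>

#define THREAD_SIZE 0x2000UL    /* 8 KiB, matches the stack alignment */

int main(void)
{
    unsigned long sp = 0xffffefffUL;              /* pretend stack pointer */
    unsigned long base = sp & ~(THREAD_SIZE - 1); /* clear the low 13 bits */

    printf("sp   = 0x%lx\n", sp);                 /* 0xffffefff */
    printf("base = 0x%lx\n", base);               /* 0xffffe000: thread_info */
    return 0;
}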

union thread_union {
#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
    struct task_struct task;
#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
    struct thread_info thread_info;
#endif
    unsigned long stack[THREAD_SIZE/sizeof(long)];
};//one THREAD_SIZE block holding task_struct/thread_info (config-dependent) overlaid with the kernel stack
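
thread_info sits at the lowest address of this block while the stack pointer starts at the top and grows down toward it, which is exactly what makes the masking in current_thread_info() work. A simplified stand-in (placeholder field types, not the real kernel definitions):

#include <stdio.h>
#include <stddef.h>

#define THREAD_SIZE 0x2000UL            /* 8 KiB */

struct thread_info { unsigned long flags; int preempt_count; }; /* placeholder */

union thread_union {
    struct thread_info thread_info;     /* occupies the lowest addresses */
    unsigned long stack[THREAD_SIZE / sizeof(long)]; /* sp starts at the top */
};

int main(void)
{
    printf("sizeof(union thread_union) = %zu\n", sizeof(union thread_union)); /* 8192 */
    printf("thread_info offset         = %zu\n",
           offsetof(union thread_union, thread_info));                        /* 0 */
    return 0;
}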
struct rlimit {
    __kernel_ulong_t    rlim_cur;
    __kernel_ulong_t    rlim_max;
};//per-resource limit: rlim_cur is the soft limit, rlim_max the hard limit
The resource types that can be limited:

#define RLIMIT_CPU      0   /* CPU time in sec */
#define RLIMIT_FSIZE        1   /* Maximum filesize */
#define RLIMIT_DATA     2   /* max data size */
#define RLIMIT_STACK        3   /* max stack size */
#define RLIMIT_CORE     4   /* max core file size */

#ifndef RLIMIT_RSS
# define RLIMIT_RSS     5   /* max resident set size */
#endif

#ifndef RLIMIT_NPROC
# define RLIMIT_NPROC       6   /* max number of processes */
#endif
    
#ifndef RLIMIT_NOFILE
# define RLIMIT_NOFILE      7   /* max number of open files */
#endif

#ifndef RLIMIT_MEMLOCK
# define RLIMIT_MEMLOCK     8   /* max locked-in-memory address space */
#endif
#ifndef RLIMIT_AS
# define RLIMIT_AS      9   /* address space limit */
#endif
#define RLIMIT_LOCKS        10  /* maximum file locks held */
#define RLIMIT_SIGPENDING   11  /* max number of pending signals */
#define RLIMIT_MSGQUEUE     12  /* maximum bytes in POSIX mqueues */
#define RLIMIT_NICE     13  /* max nice prio allowed to raise to
                       0-39 for nice level 19 .. -20 */
#define RLIMIT_RTPRIO       14  /* maximum realtime priority */
#define RLIMIT_RTTIME       15  /* timeout for RT tasks in us */
#define RLIM_NLIMITS        16  
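
From user space these limits show up through getrlimit(2)/setrlimit(2): rlim_cur is the enforced soft limit, rlim_max the hard ceiling an unprivileged task may raise it to. A quick sketch:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
    struct rlimit rl;

    getrlimit(RLIMIT_NOFILE, &rl);      /* max number of open files */
    printf("NOFILE cur=%llu max=%llu\n",
           (unsigned long long)rl.rlim_cur, (unsigned long long)rl.rlim_max);

    rl.rlim_cur = rl.rlim_max;          /* raise soft limit up to the hard limit */
    setrlimit(RLIMIT_NOFILE, &rl);
    return 0;
}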

Now let's analyze dup_task_struct:

int tsk_fork_get_node(struct task_struct *tsk)
{   
#ifdef CONFIG_NUMA
    if (tsk == kthreadd_task)
        return tsk->pref_node_fork;
#endif
    return NUMA_NO_NODE;
}//i.e. ordinary tasks get no preferred node; only children of kthreadd inherit pref_node_fork
 

static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
    struct task_struct *tsk;
    unsigned long *stack;
    struct vm_struct *stack_vm_area;
    int err;

    if (node == NUMA_NO_NODE)//do_fork, for example, passes NUMA_NO_NODE here
        node = tsk_fork_get_node(orig);//pick the NUMA node to allocate on
    tsk = alloc_task_struct_node(node);//kmem_cache_alloc_node allocates the task_struct with GFP_KERNEL
    if (!tsk)
        return NULL;//allocation failed

    stack = alloc_thread_stack_node(tsk, node);//allocate the kernel stack; with CONFIG_VMAP_STACK it comes from the vmalloc area
    if (!stack)
        goto free_tsk;

    stack_vm_area = task_stack_vm_area(tsk);//returns tsk->stack_vm_area

    err = arch_dup_task_struct(tsk, orig);//arch-specific copy

//on arm64: *dst = *src; dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); -- the struct copy clobbers the stack and stack_vm_area fields set above.

    /*
     * arch_dup_task_struct() clobbers the stack-related fields.  Make
     * sure they're properly initialized before using any stack-related
     * functions again.
     */
    tsk->stack = stack;//restore the stack pointer field
#ifdef CONFIG_VMAP_STACK
    tsk->stack_vm_area = stack_vm_area;//and the vm_struct backing the stack
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
    atomic_set(&tsk->stack_refcount, 1);//stack refcount starts at 1
#endif

    if (err)
        goto free_stack;

    err = scs_prepare(tsk, node);//set up the SCS (shadow call stack) for the new task
    if (err)
        goto free_stack;

#ifdef CONFIG_SECCOMP
    /*
     * We must handle setting up seccomp filters once we're under
     * the sighand lock in case orig has changed between now and
     * then. Until then, filter must be NULL to avoid messing up
     * the usage counts on the error path calling free_task.
     */
    tsk->seccomp.filter = NULL;//seccomp sandboxing; the real setup happens later under the sighand lock
#endif

    setup_thread_stack(tsk, orig);//copy thread_info wholesale, then point its task field at the new task

/*
 * *task_thread_info(p) = *task_thread_info(org); -- copy current's thread_info
 * task_thread_info(p)->task = p;                 -- rebind it to the new task
 */
    clear_user_return_notifier(tsk);// clear_tsk_thread_flag(p, TIF_USER_RETURN_NOTIFY);
    clear_tsk_need_resched(tsk);// clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);

    set_task_stack_end_magic(tsk);// stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; //plant the magic word at the far end of the stack as an overflow canary
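
/* Aside: the canary idea in miniature. The word at the stack's far end is set
   to STACK_END_MAGIC (0x57AC6E9D); the scheduler later treats any other value
   there as proof of a stack overflow. A simplified sketch of that check:

#define STACK_END_MAGIC 0x57AC6E9DUL

static inline int stack_end_corrupted(unsigned long *stack_end)
{
    return *stack_end != STACK_END_MAGIC;  // canary overwritten -> overflow
}
*/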

#ifdef CONFIG_STACKPROTECTOR
    tsk->stack_canary = get_random_canary();
#endif

    /*
     * One for us, one for whoever does the "release_task()" (usually
     * parent)
     */
    atomic_set(&tsk->usage, 2);

//the rest below is straightforward field initialization
#ifdef CONFIG_BLK_DEV_IO_TRACE
    tsk->btrace_seq = 0;
#endif
    tsk->splice_pipe = NULL;
    tsk->task_frag.page = NULL;
    tsk->wake_q.next = NULL;

    account_kernel_stack(tsk, 1);//account the new stack pages

    kcov_task_init(tsk);

#ifdef CONFIG_FAULT_INJECTION
    tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
    tsk->throttle_queue = NULL;
    tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_MEMCG
    tsk->active_memcg = NULL;
#endif
    return tsk;

free_stack:
    free_thread_stack(tsk);
free_tsk:
    free_task_struct(tsk);
    return NULL;
}

Analysis of cpufreq_task_times_init:

void cpufreq_task_times_init(struct task_struct *p)
{
    unsigned long flags;

    spin_lock_irqsave(&task_time_in_state_lock, flags);
    p->time_in_state = NULL;//per-frequency time accounting for cpufreq; reset here
    spin_unlock_irqrestore(&task_time_in_state_lock, flags);
    p->max_state = 0;//also cpufreq accounting; its meaning becomes clear in cpufreq_task_times_alloc below
}
 

Analysis of copy_process:

static __latent_entropy struct task_struct *copy_process(
                    unsigned long clone_flags,//clone flags
                    unsigned long stack_start,//stack start address
                    unsigned long stack_size,//stack size
                    int __user *parent_tidptr,//user-space slot for the parent to read the child's pid
                    int __user *child_tidptr,//user-space slot for the child's tid
                    struct pid *pid,//pid struct
                    int trace,//whether the child is ptraced
                    unsigned long tls,//tls
                    int node)//NUMA node; callers like do_fork just pass NUMA_NO_NODE
{
    int pidfd = -1, retval;
    struct task_struct *p;
    struct multiprocess_signals delayed;

    /*
     * Don't allow sharing the root directory with processes in a different
     * namespace
     */
    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))

           return ERR_PTR(-EINVAL);

    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    /*
     * Siblings of global init remain as zombies on exit since they are
     * not reaped by their parent (swapper). To solve this and to avoid
     * multi-rooted process trees, prevent global and container-inits
     * from creating siblings.
     */
    if ((clone_flags & CLONE_PARENT) &&
                current->signal->flags & SIGNAL_UNKILLABLE)
        return ERR_PTR(-EINVAL);

/*
     * If the new process will be in a different pid or user namespace
     * do not allow it to share a thread group with the forking task.
     */
    if (clone_flags & CLONE_THREAD) {
        if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
            (task_active_pid_ns(current) !=
                current->nsproxy->pid_ns_for_children))
            return ERR_PTR(-EINVAL);
    }

    if (clone_flags & CLONE_PIDFD) {
        /*
         * - CLONE_PARENT_SETTID is useless for pidfds and also
         *   parent_tidptr is used to return pidfds.
         * - CLONE_DETACHED is blocked so that we can potentially
         *   reuse it later for CLONE_PIDFD.
         * - CLONE_THREAD is blocked until someone really needs it.
         */
        if (clone_flags &
            (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
            return ERR_PTR(-EINVAL);
    }

/* Everything above is flag sanity checking; the in-code comments explain each case. */

    sigemptyset(&delayed.signal);//clear delayed's signal set
    INIT_HLIST_NODE(&delayed.node);//init the hlist node

    spin_lock_irq(&current->sighand->siglock);
    if (!(clone_flags & CLONE_THREAD))
        hlist_add_head(&delayed.node, &current->signal->multiprocess);
    recalc_sigpending();//recompute TIF_SIGPENDING; clear it if nothing is pending
    spin_unlock_irq(&current->sighand->siglock);
    retval = -ERESTARTNOINTR;
    if (signal_pending(current))//bail out if a signal is pending (checks TIF_SIGPENDING)
        goto fork_out;

    retval = -ENOMEM;
    p = dup_task_struct(current, node);//duplicate current's task_struct and kernel stack
    if (!p)
        goto fork_out;

    cpufreq_task_times_init(p);//analyzed above; trivial

    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;

/* CLONE_CHILD_SETTID selects set_child_tid and CLONE_CHILD_CLEARTID selects clear_child_tid; both point at child_tidptr in user space. A user-space sketch follows. */
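
/* Aside: a hypothetical user-space sketch of CLONE_CHILD_SETTID. The kernel
   stores the new task's TID at child_tidptr; without CLONE_VM that store lands
   in the child's (COW) copy of memory, so only the child observes it:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static pid_t child_tid;            // filled in by the kernel (child's view)

static int child_fn(void *arg)
{
    printf("child: my tid = %d\n", child_tid);
    return 0;
}

int main(void)
{
    char *stack = malloc(64 * 1024);
    pid_t pid = clone(child_fn, stack + 64 * 1024,   // stack grows down
                      CLONE_CHILD_SETTID | SIGCHLD, NULL,
                      NULL, NULL, &child_tid);       // parent_tid, tls, child_tid

    waitpid(pid, NULL, 0);
    free(stack);
    return 0;
}
*/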

    ftrace_graph_init_task(p);//ftrace init, not covered here

    rt_mutex_init_task(p);//init the task's rt_mutex/PI state
    
#ifdef CONFIG_PROVE_LOCKING
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif  
    retval = -EAGAIN;
    if (atomic_read(&p->real_cred->user->processes) >=
            task_rlimit(p, RLIMIT_NPROC)) {
        if (p->real_cred->user != INIT_USER &&
            !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
            goto bad_fork_free;
    }//enforce the per-user RLIMIT_NPROC limit on process count
    current->flags &= ~PF_NPROC_EXCEEDED;
        
    retval = copy_creds(p, clone_flags);//copy credentials (security)
    if (retval < 0)
        goto bad_fork_free;

    retval = -EAGAIN;
    if (nr_threads >= max_threads)//global thread-count limit
        goto bad_fork_cleanup_count;
    
    delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
    p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
    p->flags |= PF_FORKNOEXEC;
    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    rcu_copy_process(p);//init the RCU-related fields
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);
    
    init_sigpending(&p->pending);//init the pending-signal queue, shown below

/*
static inline void init_sigpending(struct sigpending *sig)
{
    sigemptyset(&sig->signal);
    INIT_LIST_HEAD(&sig->list);
}
*/

    p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
    p->utimescaled = p->stimescaled = 0;
#endif
    prev_cputime_init(&p->prev_cputime);//init the prev cputime snapshots
    

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    seqcount_init(&p->vtime.seqcount);
    p->vtime.starttime = 0;
    p->vtime.state = VTIME_INACTIVE;
#endif

#if defined(SPLIT_RSS_COUNTING)
    memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
    
    p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
    p->psi_flags = 0;
#endif

    task_io_accounting_init(&p->ioac);//zero the I/O accounting counters
    acct_clear_integrals(p);//clear the task's mm accounting integrals

    posix_cpu_timers_init(p);//shown below

/*

static void posix_cpu_timers_init(struct task_struct *tsk)
{
    // From struct task_cputime -- collected CPU time counts:
    //   @utime: time spent in user mode, in nanoseconds
    //   @stime: time spent in kernel mode, in nanoseconds
    //   @sum_exec_runtime: total time spent on the CPU, in nanoseconds
    // #define virt_exp  utime
    // #define prof_exp  stime
    // #define sched_exp sum_exec_runtime
    tsk->cputime_expires.prof_exp = 0;//expiry thresholds for the three posix cpu clocks
    tsk->cputime_expires.virt_exp = 0;
    tsk->cputime_expires.sched_exp = 0;
    INIT_LIST_HEAD(&tsk->cpu_timers[0]);
    INIT_LIST_HEAD(&tsk->cpu_timers[1]);
    INIT_LIST_HEAD(&tsk->cpu_timers[2]);
}

*/

    p->io_context = NULL;
    audit_set_context(p, NULL);//audit context init: task->audit_context = NULL
    cgroup_fork(p);//init cgroup bookkeeping
#ifdef CONFIG_NUMA
    p->mempolicy = mpol_dup(p->mempolicy);
    if (IS_ERR(p->mempolicy)) {
        retval = PTR_ERR(p->mempolicy);
        p->mempolicy = NULL;
        goto bad_fork_cleanup_threadgroup_lock;
    }
#endif

#ifdef CONFIG_CPUSETS
    p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
    p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
    seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
    p->irq_events = 0;
    p->hardirqs_enabled = 0;
    p->hardirq_enable_ip = 0;
    p->hardirq_enable_event = 0;
    p->hardirq_disable_ip = _THIS_IP_;
    p->hardirq_disable_event = 0;
    p->softirqs_enabled = 1;
    p->softirq_enable_ip = _THIS_IP_;
    p->softirq_enable_event = 0;
    p->softirq_disable_ip = 0;
    p->softirq_disable_event = 0;
    p->hardirq_context = 0;
    p->softirq_context = 0;
#endif

  p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
    p->lockdep_depth = 0; /* no locks held yet */
    p->curr_chain_key = 0;
    p->lockdep_recursion = 0;
    lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
    p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
    p->sequential_io    = 0;
    p->sequential_io_avg    = 0;
#endif
//---------- below is the resource-related initialization; this is the important part

    retval = sched_fork(clone_flags, p);//scheduler-related init for the new task
    if (retval)
        goto bad_fork_cleanup_policy;

    retval = perf_event_init_task(p);//init perf event data structures
    if (retval)
        goto bad_fork_cleanup_policy;
    retval = audit_alloc(p);//audit_alloc_context allocates a context (task->audit_context = ctx) and sets set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT)
    if (retval)
        goto bad_fork_cleanup_perf;
    /* copy all the process information */
    shm_init_task(p);//SysV shm init
    retval = security_task_alloc(p, clone_flags);//invoke the LSM task_alloc hooks
    if (retval)
        goto bad_fork_cleanup_audit;
    retval = copy_semundo(clone_flags, p);//SysV semaphore undo list
    if (retval)
        goto bad_fork_cleanup_security;
    retval = copy_files(clone_flags, p);//copy or share the open-file table, see below

/*

static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
    struct files_struct *oldf, *newf;
    int error = 0;

    /*
     * A background process may not have any files ...
     */
    oldf = current->files;
    if (!oldf)
        goto out;

    if (clone_flags & CLONE_FILES) {
        atomic_inc(&oldf->count);//with CLONE_FILES the parent's file table is shared: just take a reference and skip the copy below
        goto out;
    }

    newf = dup_fd(oldf, &error);
    if (!newf)
        goto out;

    tsk->files = newf;
    error = 0;
out:
    return error;
}
 

*/
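
/* Aside: the user-visible difference between the two paths. fork() takes the
   dup_fd() branch (a private copy of the table), so a close() in the child
   does not affect the parent. A minimal sketch, not from the kernel source:

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/wait.h>

int main(void)
{
    int fd = open("/dev/null", O_RDONLY);
    char c;

    if (fork() == 0) {   // child got its own copy of the fd table
        close(fd);       // closes only the child's entry
        _exit(0);
    }
    wait(NULL);
    printf("parent read -> %zd (fd still open)\n", read(fd, &c, 1));
    close(fd);
    return 0;
}

   With CLONE_FILES (e.g. threads) the same close() would be visible to both. */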
    if (retval)
        goto bad_fork_cleanup_semundo;
    retval = copy_fs(clone_flags, p);//copy the fs_struct (cwd, root, umask), see below

/*

static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
    struct fs_struct *fs = current->fs;
    if (clone_flags & CLONE_FS) {//CLONE_FS: share the parent's fs_struct
        /* tsk->fs is already what we want */
        spin_lock(&fs->lock);
        if (fs->in_exec) {
            spin_unlock(&fs->lock);
            return -EAGAIN;
        }
        fs->users++;
        spin_unlock(&fs->lock);
        return 0;
    }
    tsk->fs = copy_fs_struct(fs);
    if (!tsk->fs)
        return -ENOMEM;
    return 0;
}
 

*/
    if (retval)
        goto bad_fork_cleanup_files;
    retval = copy_sighand(clone_flags, p);//copy the signal-handler table, see below

/*

static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
    struct sighand_struct *sig;

    if (clone_flags & CLONE_SIGHAND) {//sharing requested
        atomic_inc(&current->sighand->count);//just take a reference
        return 0;
    }
    sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
    rcu_assign_pointer(tsk->sighand, sig);
    if (!sig)
        return -ENOMEM;

    atomic_set(&sig->count, 1);
    spin_lock_irq(&current->sighand->siglock);
    memcpy(sig->action, current->sighand->action, sizeof(sig->action));//not shared, but the child still starts with a copy of the parent's actions in its own structure
    spin_unlock_irq(&current->sighand->siglock);
    return 0;
}
 

*/
    if (retval)
        goto bad_fork_cleanup_fs;

    retval = copy_signal(clone_flags, p);//allocate and init the signal_struct
    if (retval)
        goto bad_fork_cleanup_sighand;
    retval = copy_mm(clone_flags, p);//copy or share the mm, see below

/*

if (clone_flags & CLONE_VM) {//CLONE_VM: share the parent's address space
        mmget(oldmm);
        mm = oldmm;
        goto good_mm;
    }
    retval = -ENOMEM;
    mm = dup_mm(tsk);//analyzed next
    if (!mm)
        goto fail_nomem;

*/

/*

 mm = allocate_mm();
    if (!mm)
        goto fail_nomem;

    memcpy(mm, oldmm, sizeof(*mm));//start from a byte copy of the parent's mm_struct

    if (!mm_init(mm, tsk, mm->user_ns))
        goto fail_nomem;

    err = dup_mmap(mm, oldmm);//duplicate the VMAs as COW mappings; not expanded here
    if (err)
        goto free_pt;

    mm->hiwater_rss = get_mm_rss(mm);
    mm->hiwater_vm = mm->total_vm;

*/
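
/* Aside: the user-visible effect of dup_mm(): after fork() the child owns a
   COW copy of the address space, so its writes never leak into the parent.
   A minimal sketch:

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
    int x = 1;

    if (fork() == 0) {
        x = 42;                     // COW fault gives the child a private page
        _exit(0);
    }
    wait(NULL);
    printf("parent: x = %d\n", x);  // still 1
    return 0;
}
*/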

    if (retval)
        goto bad_fork_cleanup_signal;
    retval = copy_namespaces(clone_flags, p);//see below

/*

if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 
                  CLONE_NEWPID | CLONE_NEWNET |
                  CLONE_NEWCGROUP)))) {
        get_nsproxy(old_ns);
        return 0;
    }   
        
    if (!ns_capable(user_ns, CAP_SYS_ADMIN))
        return -EPERM;
        
    /*
     * CLONE_NEWIPC must detach from the undolist: after switching
     * to a new ipc namespace, the semaphore arrays from the old
     * namespace are unreachable.  In clone parlance, CLONE_SYSVSEM
     * means share undolist with parent, so we must forbid using
     * it along with CLONE_NEWIPC.
     */
    if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
        (CLONE_NEWIPC | CLONE_SYSVSEM))
        return -EINVAL;

    new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);//allocate a fresh set of namespaces

*/
    if (retval)
        goto bad_fork_cleanup_mm;
    retval = copy_io(clone_flags, p);//block-I/O context, see below

/*

static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
    struct io_context *ioc = current->io_context;
    struct io_context *new_ioc;

    if (!ioc)
        return 0;
    /*
     * Share io context with parent, if CLONE_IO is set
     */
    if (clone_flags & CLONE_IO) {//CLONE_IO: share the parent's io_context
        ioc_task_link(ioc);
        tsk->io_context = ioc;
    } else if (ioprio_valid(ioc->ioprio)) {
        new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);//see below
        if (unlikely(!new_ioc))
            return -ENOMEM;

        new_ioc->ioprio = ioc->ioprio;
        put_io_context(new_ioc);
    }
#endif
    return 0;
}

struct io_context *get_task_io_context(struct task_struct *task,
                       gfp_t gfp_flags, int node)
{   
    struct io_context *ioc;

    might_sleep_if(gfpflags_allow_blocking(gfp_flags));

    do {
        task_lock(task);
        ioc = task->io_context;
        if (likely(ioc)) {//already present, nothing to create
            get_io_context(ioc);//take a reference on it
            task_unlock(task);
            return ioc;
        }
        task_unlock(task);
    } while (!create_task_io_context(task, gfp_flags, node));//otherwise create one, then loop around to take the reference
        
    return NULL;
}   
EXPORT_SYMBOL(get_task_io_context);
 

int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
{
    struct io_context *ioc;
    int ret;

    ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
                    node);//allocate the io_context
    if (unlikely(!ioc))
        return -ENOMEM;

    /* initialize */
    atomic_long_set(&ioc->refcount, 1);
    atomic_set(&ioc->nr_tasks, 1);
    atomic_set(&ioc->active_ref, 1);//set up the refcounts
    spin_lock_init(&ioc->lock);
    INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
    INIT_HLIST_HEAD(&ioc->icq_list);
    INIT_WORK(&ioc->release_work, ioc_release_fn);

    /*
     * Try to install.  ioc shouldn't be installed if someone else
     * already did or @task, which isn't %current, is exiting.  Note
     * that we need to allow ioc creation on exiting %current as exit
     * path may issue IOs from e.g. exit_files().  The exit path is
     * responsible for not issuing IO after exit_io_context().
     */
    task_lock(task);
    if (!task->io_context &&
        (task == current || !(task->flags & PF_EXITING)))
        task->io_context = ioc;
    else
        kmem_cache_free(iocontext_cachep, ioc);

    ret = task->io_context ? 0 : -EBUSY;

    task_unlock(task);


    return ret;
}
 

*/
    if (retval)
        goto bad_fork_cleanup_namespaces;
    retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);//arch-specific register/TLS setup; covered at the end of this article
    if (retval)
        goto bad_fork_cleanup_io;

    if (pid != &init_struct_pid) {
        pid = alloc_pid(p->nsproxy->pid_ns_for_children);//allocate a struct pid in the child's pid namespace
        if (IS_ERR(pid)) {
            retval = PTR_ERR(pid);
            goto bad_fork_cleanup_thread;
        }
    }
        


    /*
     * This has to happen after we've potentially unshared the file
     * descriptor table (so that the pidfd doesn't leak into the child
     * if the fd table isn't shared).
     */
    if (clone_flags & CLONE_PIDFD) {
        retval = pidfd_create(pid);//create a pidfd referring to the new process
        if (retval < 0)
            goto bad_fork_free_pid;

        pidfd = retval;
        retval = put_user(pidfd, parent_tidptr);
        if (retval)
            goto bad_fork_put_pidfd;
    }

#ifdef CONFIG_BLOCK
    p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;
#endif
    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        sas_ss_reset(p);

    /*
     * Syscall tracing and stepping should be turned off in the
     * child regardless of CLONE_PTRACE.
     */
    user_disable_single_step(p);
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif

 clear_all_latency_tracing(p);

    /* ok, now we should be set up.. */
    p->pid = pid_nr(pid);
    if (clone_flags & CLONE_THREAD) {
        p->group_leader = current->group_leader;
        p->tgid = current->tgid;
    } else {
        p->group_leader = p;
        p->tgid = p->pid;
    }

    p->nr_dirtied = 0;
    p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
    p->dirty_paused_when = 0;

    p->pdeath_signal = 0;
    INIT_LIST_HEAD(&p->thread_group);
    p->task_works = NULL;

    cgroup_threadgroup_change_begin(current);
    /*
     * Ensure that the cgroup subsystem policies allow the new process to be
     * forked. It should be noted the the new process's css_set can be changed
     * between here and cgroup_post_fork() if an organisation operation is in
     * progress.
     */

 retval = cgroup_can_fork(p);
    if (retval)
        goto bad_fork_cgroup_threadgroup_change_end;

    /*
     * From this point on we must avoid any synchronous user-space
     * communication until we take the tasklist-lock. In particular, we do
     * not want user-space to be able to predict the process start-time by
     * stalling fork(2) after we recorded the start_time but before it is
     * visible to the system.
     */

    p->start_time = ktime_get_ns();
    p->real_start_time = ktime_get_boot_ns();

    /*
     * Make it visible to the rest of the system, but dont wake it up yet.
     * Need tasklist lock for parent etc handling!
     */
    write_lock_irq(&tasklist_lock);
//********************** the interesting part: parent linkage

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {//these flags make the new task share current's parent, i.e. become a sibling of current (sketch below)
        p->real_parent = current->real_parent;
        p->parent_exec_id = current->parent_exec_id;
        if (clone_flags & CLONE_THREAD)
            p->exit_signal = -1;
        else
            p->exit_signal = current->group_leader->exit_signal;
    } else {//otherwise it becomes a child of current
        p->real_parent = current;
        p->parent_exec_id = current->self_exec_id;
        p->exit_signal = (clone_flags & CSIGNAL);
    }
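
/* Aside: a hypothetical user-space sketch of the sibling effect. A middle
   process clones with CLONE_PARENT, so the new task's getppid() reports the
   original main process, not the cloner:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

static int sibling_fn(void *arg)
{
    printf("sibling: ppid=%d\n", getppid());  // prints main's pid
    return 0;
}

int main(void)
{
    printf("main: pid=%d\n", getpid());
    if (fork() == 0) {                        // middle process
        char *stack = malloc(64 * 1024);
        clone(sibling_fn, stack + 64 * 1024,
              CLONE_PARENT | SIGCHLD, NULL);  // new task's parent = main
        sleep(1);                             // give the sibling time to print
        _exit(0);
    }
    wait(NULL);                               // reap the middle process
    wait(NULL);                               // reap the CLONE_PARENT sibling
    return 0;
}
*/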

//*************** the rest is mostly plain field initialization

    klp_copy_process(p);

    spin_lock(&current->sighand->siglock);

    /*
     * Copy seccomp details explicitly here, in case they were changed
     * before holding sighand lock.
     */
    copy_seccomp(p);

    rseq_fork(p, clone_flags);

    /* Don't start children in a dying pid namespace */
    if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
        retval = -ENOMEM;
        goto bad_fork_cancel_cgroup;
    }
 


    /* Let kill terminate clone/fork in the middle */
    if (fatal_signal_pending(current)) {
        retval = -EINTR;
        goto bad_fork_cancel_cgroup;
    }

//wire the task into the process tree and the pid hashes

    init_task_pid_links(p);
    if (likely(p->pid)) {
        ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

        init_task_pid(p, PIDTYPE_PID, pid);
        if (thread_group_leader(p)) {
            init_task_pid(p, PIDTYPE_TGID, pid);
            init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
            init_task_pid(p, PIDTYPE_SID, task_session(current));

            if (is_child_reaper(pid)) {
                ns_of_pid(pid)->child_reaper = p;
                p->signal->flags |= SIGNAL_UNKILLABLE;
            }
            p->signal->shared_pending.signal = delayed.signal;
            p->signal->tty = tty_kref_get(current->signal->tty);
            /*
             * Inherit has_child_subreaper flag under the same
             * tasklist_lock with adding child to the process tree
             * for propagate_has_child_subreaper optimization.
             */
            p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
                             p->real_parent->signal->is_child_subreaper;
            list_add_tail(&p->sibling, &p->real_parent->children);
            list_add_tail_rcu(&p->tasks, &init_task.tasks);
            attach_pid(p, PIDTYPE_TGID);
            attach_pid(p, PIDTYPE_PGID);
            attach_pid(p, PIDTYPE_SID);
            __this_cpu_inc(process_counts);
        } else {

            current->signal->nr_threads++;
            atomic_inc(&current->signal->live);
            atomic_inc(&current->signal->sigcnt);
            task_join_group_stop(p);
            list_add_tail_rcu(&p->thread_group,
                      &p->group_leader->thread_group);
            list_add_tail_rcu(&p->thread_node,
                      &p->signal->thread_head);
        }
        attach_pid(p, PIDTYPE_PID);
        nr_threads++;
    }

//********* bump the counters and update tracepoints

    total_forks++;
    hlist_del_init(&delayed.node);
    spin_unlock(&current->sighand->siglock);
    syscall_tracepoint_update(p);
    write_unlock_irq(&tasklist_lock);

    proc_fork_connector(p);
    cgroup_post_fork(p);
    cgroup_threadgroup_change_end(current);
    perf_event_fork(p);

    trace_task_newtask(p, clone_flags);
    uprobe_copy_process(p, clone_flags);

    copy_oom_score_adj(clone_flags, p);

    return p;

//************

error handling paths omitted ...

}

//*****************_*_*_*_*_*_*_*_*_*_*

Analysis of _do_fork:

long _do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr,
          unsigned long tls)
{
    struct completion vfork;
    struct pid *pid;
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Determine whether and which event to report to ptracer.  When
     * called from kernel_thread or CLONE_UNTRACED is explicitly
     * requested, no event is reported; otherwise, report if the event
     * for the type of forking is enabled.
     */
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

//decide which ptrace event to report

        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }
 

    p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
             child_tidptr, NULL, trace, tls, NUMA_NO_NODE);//duplicate the task as analyzed above
    add_latent_entropy();

    if (IS_ERR(p))
        return PTR_ERR(p);

    cpufreq_task_times_alloc(p);//allocate the time_in_state array, shown below
/*

void cpufreq_task_times_alloc(struct task_struct *p)
{   
    void *temp;
    unsigned long flags;
    unsigned int max_state = READ_ONCE(next_offset);
    
    /* We use one array to avoid multiple allocs per task */
    temp = kcalloc(max_state, sizeof(p->time_in_state[0]), GFP_ATOMIC);
    if (!temp)
        return;

    spin_lock_irqsave(&task_time_in_state_lock, flags);
    p->time_in_state = temp;
    spin_unlock_irqrestore(&task_time_in_state_lock, flags);
    p->max_state = max_state;
}
 

*/

    trace_sched_process_fork(current, p);

    pid = get_task_pid(p, PIDTYPE_PID);
    nr = pid_vnr(pid);//pid number as seen from the caller's pid namespace
    
    if (clone_flags & CLONE_PARENT_SETTID)
        put_user(nr, parent_tidptr);

    if (clone_flags & CLONE_VFORK) {
        p->vfork_done = &vfork;
        init_completion(&vfork);//init the vfork completion
        get_task_struct(p);//take a reference on the child
    }

    wake_up_new_task(p);//wake the task for its first run, shown below

/*

void wake_up_new_task(struct task_struct *p)
{
    struct rq_flags rf;
    struct rq *rq;

    raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
    p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
    /*
     * Fork balancing, do it here and not earlier because:
     *  - cpus_allowed can change in the fork path
     *  - any previously selected CPU might disappear through hotplug
     *
     * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
     * as we're not fully set-up yet.
     */
    p->recent_used_cpu = task_cpu(p);
    rseq_migrate(p);
    __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));//pick and set the CPU to run on
#endif
    rq = __task_rq_lock(p, &rf);
    update_rq_clock(rq);
    post_init_entity_util_avg(&p->se);

    activate_task(rq, p, ENQUEUE_NOCLOCK);//activate the task, i.e. put it on the runqueue
    p->on_rq = TASK_ON_RQ_QUEUED;//mark the task as queued on a runqueue
    trace_sched_wakeup_new(p);
    check_preempt_curr(rq, p, WF_FORK);//check whether the new task should preempt the current one
#ifdef CONFIG_SMP
    if (p->sched_class->task_woken) {
        /*
         * Nothing relies on rq->lock after this, so its fine to
         * drop it.
         */
        rq_unpin_lock(rq, &rf);
        p->sched_class->task_woken(rq, p);
        rq_repin_lock(rq, &rf);
    }
#endif
    task_rq_unlock(rq, p, &rf);
}        

*/

    /* forking complete and child started to run, tell ptracer */
    if (unlikely(trace))
        ptrace_event_pid(trace, pid);
    
    if (clone_flags & CLONE_VFORK) {
        if (!wait_for_vfork_done(p, &vfork))//block until the child execs or exits
            ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
    }

    put_pid(pid);
    return nr;
 

}
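
The CLONE_VFORK handling above is what user-space vfork() relies on: the parent parks in wait_for_vfork_done() until the child execs or exits. A minimal user-space sketch:

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = vfork();

    if (pid == 0) {
        /* child runs first; the parent is parked on the vfork completion */
        execl("/bin/true", "true", (char *)NULL);
        _exit(127);                 /* only reached if exec fails */
    }
    printf("parent resumes after the child execs, pid=%d\n", pid);
    return 0;
}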

//-------------------------------------------------------------

do_fork and kernel_thread:

long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    return _do_fork(clone_flags, stack_start, stack_size,
            parent_tidptr, child_tidptr, 0);
}
#endif

/*
 * Create a kernel thread.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
    return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
        (unsigned long)arg, NULL, NULL, 0);
}
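
kernel_thread() itself is kernel-internal and not exported to modules; driver code normally goes through the kthread API (kthread_run and friends), which funnels into the same fork path. A minimal, hypothetical module sketch (the demo_* names are made up):

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
    while (!kthread_should_stop()) {    /* loop until kthread_stop() is called */
        pr_info("demo worker alive\n");
        ssleep(1);
    }
    return 0;
}

static int __init demo_init(void)
{
    worker = kthread_run(worker_fn, NULL, "demo-worker"); /* create + wake */
    return PTR_ERR_OR_ZERO(worker);
}

static void __exit demo_exit(void)
{
    kthread_stop(worker);               /* ask the thread to exit and wait */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");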


//--------------------------------- a supplementary function: copy_thread_tls (arm64)

struct pt_regs {
    union {
        struct user_pt_regs user_regs;
        struct {
            u64 regs[31];
            u64 sp;
            u64 pc;
            u64 pstate;
        };
    };
    u64 orig_x0;
#ifdef __AARCH64EB__
    u32 unused2;
    s32 syscallno;
#else
    s32 syscallno;
    u32 unused2;
#endif

    u64 orig_addr_limit;
    u64 unused; // maintain 16 byte alignment
    u64 stackframe[2];
};

struct thread_struct {
    struct cpu_context  cpu_context;    /* cpu context */

    /*
     * Whitelisted fields for hardened usercopy:
     * Maintainers must ensure manually that this contains no
     * implicit padding.
     */
    struct {
        unsigned long   tp_value;   /* TLS register */
        unsigned long   tp2_value;
        struct user_fpsimd_state fpsimd_state;
    } uw;

    unsigned int        fpsimd_cpu;
    void            *sve_state; /* SVE registers, if any */
    unsigned int        sve_vl;     /* SVE vector length */
    unsigned int        sve_vl_onexec;  /* SVE vl after next exec */
    unsigned long       fault_address;  /* fault info */
    unsigned long       fault_code; /* ESR_EL1 value */
    struct debug_info   debug;      /* debugging */
};
 

static inline int copy_thread_tls(
        unsigned long clone_flags, unsigned long sp, unsigned long arg,
        struct task_struct *p, unsigned long tls)
{
    return copy_thread(clone_flags, sp, arg, p);//generic fallback: simply calls copy_thread; the TLS value is recovered from the saved registers there
}
 

int copy_thread(unsigned long clone_flags, unsigned long stack_start,
        unsigned long stk_sz, struct task_struct *p)
{   
    struct pt_regs *childregs = task_pt_regs(p);
    
    memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));//zero cpu_context

    /*
     * In case p was allocated the same task_struct pointer as some
     * other recently-exited task, make sure p is disassociated from
     * any cpu that may have run that now-exited task recently.
     * Otherwise we could erroneously skip reloading the FPSIMD
     * registers for p.
     */ 
    fpsimd_flush_task_state(p);//reset thread.fpsimd_cpu to NR_CPUS so stale FPSIMD state is never reused

    if (likely(!(p->flags & PF_KTHREAD))) {//user task, not a kernel thread
        *childregs = *current_pt_regs();//start from the parent's saved pt_regs
        childregs->regs[0] = 0;//x0 = 0: fork() returns 0 in the child

        /*
         * Read the current TLS pointer from tpidr_el0 as it may be
         * out-of-sync with the saved value.
         */
        *task_user_tls(p) = read_sysreg(tpidr_el0);

        if (stack_start) {
            if (is_compat_thread(task_thread_info(p)))
                childregs->compat_sp = stack_start;
            else
                childregs->sp = stack_start;
        }//use the caller-supplied user stack pointer for the child

        /*
         * If a TLS pointer was passed to clone (4th argument), use it
         * for the new thread.
         */
        if (clone_flags & CLONE_SETTLS)
            p->thread.uw.tp_value = childregs->regs[3];
    } else {//kernel thread
        memset(childregs, 0, sizeof(struct pt_regs));//zero the regs
        childregs->pstate = PSR_MODE_EL1h;
        if (IS_ENABLED(CONFIG_ARM64_UAO) &&
            cpus_have_const_cap(ARM64_HAS_UAO))
            childregs->pstate |= PSR_UAO_BIT;

        if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
            set_ssbs_bit(childregs);

        p->thread.cpu_context.x19 = stack_start;//for kernel threads, stack_start actually carries the thread fn (see kernel_thread)
        p->thread.cpu_context.x20 = stk_sz;//and stk_sz carries its argument
    }
    p->thread.cpu_context.pc = (unsigned long)ret_from_fork;//first instruction after the context switch
    p->thread.cpu_context.sp = (unsigned long)childregs;//kernel sp: pt_regs sits at the top of the new stack

    ptrace_hw_copy_thread(p);

    return 0;
}

arm64:

ENTRY(ret_from_fork)
    bl  schedule_tail   //finish up after the context switch (./kernel/sched/core.c +3368)
    cbz x19, 1f             // not a kernel thread: x19 holds the kernel-thread fn set by copy_thread; non-zero means kernel thread, zero means user task, so branch to 1
    mov x0, x20 //x20 carries the fn argument
    blr x19  // call fn: this is where a kernel thread starts running
1:  get_thread_info tsk
    b   ret_to_user  //./arch/arm64/kernel/entry.S +921
ENDPROC(ret_from_fork)
NOKPROBE(ret_from_fork)


 
