kernel在启动初期并没有“进程”这个概念,如果不涉及多任务并发、调度,kernel可以一直以一个控制流运行。本篇从内核初始化时的0进程开始分析,延伸到多进程的创建。
内核中的所有进程都存在依赖关系,进程有父进程、子进程、兄弟进程。0号进程为所有进程的祖先进程,又称为idle进程、swapper进程。
如前文所述,内核在初始化初期没有进程的概念,零号进程是内核从无到有创建出来的,当然,0进程的task_struct是采用静态分配的方式,见附录INIT_TASK。
除此以外,init_thread_union指定了0进程的thread_info和stack,这些是在编译阶段就已经确定了的。
在系统初始化阶段(start_kernel),内核引入了进程的概念,指定当前初始化的进程为0号进程,之后所有的进程都是0号进程的子孙进程。
随后,在0号进程的基础上创建init进程(PID为1),0 进程会被设置成idle进程,加入到运行队列中。当CPU上没有可调度进程时,调度器才会选择0号进程运行,该进程重复执行halt指令。
init进程的创建本质上和其他内核线程的创建方式一致,都是调用了kernel_thread:
382 kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
OK,这样就创建了一个线程,init进程调用kernel_init,该线程调用根文件系统下的/sbin/init,进一步完成初始化。
前文简述了kernel中第一个进程的创建,其他内核线程的创建都是基于0号进程的copy,区别在于共享进程的哪些内容。
kernel_thread是对do_fork的封装:
1649 /*
1650 * Create a kernel thread.
1651 */
1652 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
1653 {
1654 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
1655 (unsigned long)arg, NULL, NULL);
1656 }
1657
创建一个进程,do_fork传入的flag参数至关重要,影响了待创建进程与当前进程共享哪些进程组成部分。列举几个经常遇到的flag:
- CLONE_VM,共享内存描述符和页表
- CLONE_FS,共享文件系统
- CLONE_FILES,共享打开的文件表
- CLONE_SIGHAND,共享信号处理函数表和阻塞、挂起的信号;如果这个flag被设置,那么CLONE_VM也必须被设置
- CLONE_VFORK,vfork系统调用时被设置
- CLONE_PARENT,新创建的进程与当前进程拥有相同的父进程
- CLONE_THREAD,将创建的子进程插入到父进程所处的线程组中,强制子进程共享父进程的信号描述符。如果这个flag被设置,就必须设置CLONE_SIGHAND(CLONE_SIGHAND设置,CLONE_VM就必须设置)
- CLONE_UNTRACED,禁止追踪内核线程
- CLONE_NEWNS, 新进程将运行在新的mount namespace,即自己的文件系统视图(its own view of the mounted filesystems)
1637 /*
1638 * Ok, this is the main fork-routine.
1639 *
1640 * It copies the process, and if successful kick-starts
1641 * it and waits for it to finish using the VM if required.
1642 */
1643 long do_fork(unsigned long clone_flags,
1644 unsigned long stack_start,
1645 unsigned long stack_size,
1646 int __user *parent_tidptr,
1647 int __user *child_tidptr)
1648 {
1649 struct task_struct *p;
1650 int trace = 0;
1651 long nr;
1652
1653 /*
1654 * Determine whether and which event to report to ptracer. When
1655 * called from kernel_thread or CLONE_UNTRACED is explicitly
1656 * requested, no event is reported; otherwise, report if the event
1657 * for the type of forking is enabled.
1658 */
1659 if (!(clone_flags & CLONE_UNTRACED)) { ------1
1660 if (clone_flags & CLONE_VFORK)
1661 trace = PTRACE_EVENT_VFORK;
1662 else if ((clone_flags & CSIGNAL) != SIGCHLD)
1663 trace = PTRACE_EVENT_CLONE;
1664 else
1665 trace = PTRACE_EVENT_FORK;
1666
1667 if (likely(!ptrace_event_enabled(current, trace)))
1668 trace = 0;
1669 }
1670
1671 p = copy_process(clone_flags, stack_start, stack_size, ------2
1672 child_tidptr, NULL, trace);
1673 /*
1674 * Do this prior waking up the new thread - the thread pointer
1675 * might get invalid after that point, if the thread exits quickly.
1676 */
1677 if (!IS_ERR(p)) {
1678 struct completion vfork;
1679 struct pid *pid;
1680
1681 trace_sched_process_fork(current, p);
1682
1683 pid = get_task_pid(p, PIDTYPE_PID); -------3
1684 nr = pid_vnr(pid);
1685
1686 if (clone_flags & CLONE_PARENT_SETTID) -------4
1687 put_user(nr, parent_tidptr);
1688
1689 if (clone_flags & CLONE_VFORK) { -------5
1690 p->vfork_done = &vfork;
1691 init_completion(&vfork);
1692 get_task_struct(p);
1693 }
1694
1695 wake_up_new_task(p); --------6
1696
1697 /* forking complete and child started to run, tell ptracer */
1698 if (unlikely(trace))
1699 ptrace_event_pid(trace, pid);
1700
1701 if (clone_flags & CLONE_VFORK) {
1702 if (!wait_for_vfork_done(p, &vfork))
1703 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
1704 }
1705
1706 put_pid(pid); --------7
1707 } else {
1708 nr = PTR_ERR(p);
1709 }
1710 return nr;
1711 }
1189 /*
1190 * This creates a new process as a copy of the old one,
1191 * but does not actually start it yet.
1192 *
1193 * It copies the registers, and all the appropriate
1194 * parts of the process environment (as per the clone
1195 * flags). The actual kick-off is left to the caller.
1196 */
生成一个新的进程,作为old进程的拷贝,但是并没有立刻运行。根据clone标志位,复制寄存器和进程环境下对应的部分。何时运行新进程取决于caller(即old进程)。
1197 static struct task_struct *copy_process(unsigned long clone_flags,
1198 unsigned long stack_start,
1199 unsigned long stack_size,
1200 int __user *child_tidptr,
1201 struct pid *pid,
1202 int trace)
1203 {
1204 int retval;
1205 struct task_struct *p;
1206
1207 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) ----1
1208 return ERR_PTR(-EINVAL);
1209
1210 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) ----2
1211 return ERR_PTR(-EINVAL);
1212
1213 /*
1214 * Thread groups must share signals as well, and detached threads
1215 * can only be started up within the thread group.
1216 */
1217 if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) ----3
1218 return ERR_PTR(-EINVAL);
1219
1220 /*
1221 * Shared signal handlers imply shared VM. By way of the above,
1222 * thread groups also imply shared VM. Blocking this case allows
1223 * for various simplifications in other code.
1224 */
1225 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) -----4
1226 return ERR_PTR(-EINVAL);
1227
1228 /*
1229 * Siblings of global init remain as zombies on exit since they are
1230 * not reaped by their parent (swapper). To solve this and to avoid
1231 * multi-rooted process trees, prevent global and container-inits
1232 * from creating siblings.
1233 */
1234 if ((clone_flags & CLONE_PARENT) && -----5
1235 current->signal->flags & SIGNAL_UNKILLABLE)
1236 return ERR_PTR(-EINVAL);
1237
1238 /*
1239 * If the new process will be in a different pid or user namespace
1240 * do not allow it to share a thread group or signal handlers or
1241 * parent with the forking task.
1242 */
1243 if (clone_flags & CLONE_SIGHAND) { ------6
1244 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1245 (task_active_pid_ns(current) !=
1246 current->nsproxy->pid_ns_for_children))
1247 return ERR_PTR(-EINVAL);
1248 }
1249
进程的credentials关系到进程的执行权限。检查该进程是否超出了用户进程的最大线程限制RLIMIT_NPROC。
检查通过以后copy_creds,如果是设置了CLONE_THREAD,并且p->cred->thread_keyring为0,那么就共享,否则拷贝。因为对credentials不甚了解,只是简单分析一下流程。
初始化task_struct中的delays成员:
1522 #ifdef CONFIG_TASK_DELAY_ACCT
1523 struct task_delay_info *delays;
1524 #endif
进程的delay主要有以下几种情况:等待CPU(进程可运行但尚未被调度)、等待同步块I/O完成、等待页面换入(swap in)、等待内存回收。
调度器会根据该成员合理调度进程。在后续进程调度的分析中会详细说明。
1103 static void copy_flags(unsigned long clone_flags, struct task_struct *p)
1104 {
1105 unsigned long new_flags = p->flags;
1106
1107 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
1108 new_flags |= PF_FORKNOEXEC;
1109 p->flags = new_flags;
1110 }
在前面copy task_struct中,已经将旧进程的task_struct全部copy给新进程,此时再对其中的flag做一些处理。
清除PF_SUPERPRIV(该标志表示进程曾经使用过超级用户权限,新进程尚未使用过,因此需要清除)。清除PF_WQ_WORKER,如果新进程是worker,那么在
worker_thread中会被重新设置。设置PF_FORKNOEXEC,表示进程已被fork但尚未执行exec。
进程审计上下文相关,TODO
如果设置了CLONE_SYSVSEM,那么新旧进程共享sem undo list,否则设置 tsk->sysvsem.undo_list为NULL
如果设置了CLONE_FILES,新旧进程共享打开的文件,仅仅将旧进程的files->count加1, 否则创建files_struct并拷贝旧进程的files,赋值给新进程。
如果设置了CLONE_FS,新旧进程共享文件系统,仅仅将旧进程的 fs->users加1,否则拷贝旧进程的fs_struct并赋值给新进程
如果设置了CLONE_SIGHAND,新旧进程共享信号处理函数,仅仅将旧进程的 sighand->count加1,否则拷贝旧进程的 current->sighand->action并赋值给新进程
如果设置了CLONE_THREAD,线程组中的所有线程共享信号,所以直接退出;否则系统为新进程分配tsk->signal并对其初始化
如果设置了CLONE_VM,新旧进程共享内存描述符(指向进程地址空间),将新进程的mm指向旧进程的mm即可;否则系统为新进程分配mm,并拷贝旧进程的mm对其赋值。对于没有设置CLONE_VM的情况,涉及到COW,在另一篇文章中有详细分析(TODO)
如果设置了CLONE_NEWNS、CLONE_NEWUTS、CLONE_NEWIPC、CLONE_NEWPID、CLONE_NEWNET其中之一,则创建新的namespace并用旧进程的namespace初始化;否则线程组中的所有线程共享mount namespace,old namespace引用计数加1后退出
如果设置了CLONE_IO,新旧进程共享io context,将旧进程的io_context赋值给新进程;否则系统为新进程分配tsk->io_context并用旧进程的io context初始化
task_struct的thread_struct,在进程切换时保存了硬件上下文
copy_thread初始化新进程的栈:
如果是kernel thread:
167 /*
168 * INIT_TASK is used to set up the first task table, touch at
169 * your own risk!. Base=0, limit=0x1fffff (=2MB)
170 */
171 #define INIT_TASK(tsk) \
172 { \
173 .state = 0, \
174 .stack = &init_thread_info, \
175 .usage = ATOMIC_INIT(2), \
176 .flags = PF_KTHREAD, \
177 .prio = MAX_PRIO-20, \
178 .static_prio = MAX_PRIO-20, \
179 .normal_prio = MAX_PRIO-20, \
180 .policy = SCHED_NORMAL, \
181 .cpus_allowed = CPU_MASK_ALL, \
182 .nr_cpus_allowed= NR_CPUS, \
183 .mm = NULL, \
184 .active_mm = &init_mm, \
185 .se = { \
186 .group_node = LIST_HEAD_INIT(tsk.se.group_node), \
187 }, \
188 .rt = { \
189 .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
190 .time_slice = RR_TIMESLICE, \
191 }, \
192 .tasks = LIST_HEAD_INIT(tsk.tasks), \
193 INIT_PUSHABLE_TASKS(tsk) \
194 INIT_CGROUP_SCHED(tsk) \
195 .ptraced = LIST_HEAD_INIT(tsk.ptraced), \
196 .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
197 .real_parent = &tsk, \
198 .parent = &tsk, \
199 .children = LIST_HEAD_INIT(tsk.children), \
200 .sibling = LIST_HEAD_INIT(tsk.sibling), \
201 .group_leader = &tsk, \
202 RCU_POINTER_INITIALIZER(real_cred, &init_cred), \
203 RCU_POINTER_INITIALIZER(cred, &init_cred), \
204 .comm = INIT_TASK_COMM, \
205 .thread = INIT_THREAD, \
206 .fs = &init_fs, \
207 .files = &init_files, \
208 .signal = &init_signals, \
209 .sighand = &init_sighand, \
210 .nsproxy = &init_nsproxy, \
211 .pending = { \
212 .list = LIST_HEAD_INIT(tsk.pending.list), \
213 .signal = {{0}}}, \
214 .blocked = {{0}}, \
215 .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \
216 .journal_info = NULL, \
217 .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
218 .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
219 .timer_slack_ns = 50000, /* 50 usec default slack */ \
220 .pids = { \
221 [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
222 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
223 [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \
224 }, \
225 .thread_group = LIST_HEAD_INIT(tsk.thread_group), \
226 .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \
227 INIT_IDS \
228 INIT_PERF_EVENTS(tsk) \
229 INIT_TRACE_IRQFLAGS \
230 INIT_LOCKDEP \
231 INIT_FTRACE_GRAPH \
232 INIT_TRACE_RECURSION \
233 INIT_TASK_RCU_PREEMPT(tsk) \
234 INIT_CPUSET_SEQ(tsk) \
235 INIT_RT_MUTEXES(tsk) \
236 INIT_VTIME(tsk) \
237 }
At every process switch, the hardware context of the process being replaced must be
saved somewhere. It cannot be saved on the TSS, as in the original Intel design,
because Linux uses a single TSS for each processor, instead of one for every process.
Thus, each process descriptor includes a field called thread of type thread_struct, in
which the kernel saves the hardware context whenever the process is being switched
out. As we'll see later, this data structure includes fields for most of the CPU registers, except the general-purpose registers such as eax, ebx, etc., which are stored in
the Kernel Mode stack.
在每次进程切换时,被换出进程的硬件上下文必须保存在某个地方。它不能像英特尔最初的设计那样保存在TSS上,因为Linux对每个处理器使用单个TSS,而不是对每个进程使用一个TSS。
因此,每个进程描述符都包含一个名为thread、类型为thread_struct的字段,每当进程被切换出去时,内核就把硬件上下文保存在其中。我们将在后面看到,这个数据结构包含大多数CPU寄存器对应的字段,但eax、ebx等通用寄存器除外,它们保存在内核态栈中。
460 struct thread_struct {
461 /* Cached TLS descriptors: */
462 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
463 unsigned long sp0;
464 unsigned long sp;
465 #ifdef CONFIG_X86_32
466 unsigned long sysenter_cs;
467 #else
468 unsigned long usersp; /* Copy from PDA */
469 unsigned short es;
470 unsigned short ds;
471 unsigned short fsindex;
472 unsigned short gsindex;
473 #endif
474 #ifdef CONFIG_X86_32
475 unsigned long ip;
476 #endif
477 #ifdef CONFIG_X86_64
478 unsigned long fs;
479 #endif
480 unsigned long gs;
481 /* Save middle states of ptrace breakpoints */
482 struct perf_event *ptrace_bps[HBP_NUM];
483 /* Debug status used for traps, single steps, etc... */
484 unsigned long debugreg6;
485 /* Keep track of the exact dr7 value set by the user */
486 unsigned long ptrace_dr7;
487 /* Fault info: */
488 unsigned long cr2;
489 unsigned long trap_nr;
490 unsigned long error_code;
491 /* floating point and extended processor state */
492 struct fpu fpu;
493 #ifdef CONFIG_X86_32
494 /* Virtual 86 mode info */
495 struct vm86_struct __user *vm86_info;
496 unsigned long screen_bitmap;
497 unsigned long v86flags;
498 unsigned long v86mask;
499 unsigned long saved_sp0;
500 unsigned int saved_fs;
501 unsigned int saved_gs;
502 #endif
503 /* IO permissions: */
504 unsigned long *io_bitmap_ptr;
505 unsigned long iopl;
506 /* Max allowed port in the bitmap, in bytes: */
507 unsigned io_bitmap_max;
508 /*
509 * fpu_counter contains the number of consecutive context switches
510 * that the FPU is used. If this is over a threshold, the lazy fpu
511 * saving becomes unlazy to save the trap. This is an unsigned char
512 * so that after 256 times the counter wraps and the behavior turns
513 * lazy again; this to deal with bursty apps that only use FPU for
514 * a short time
515 */
516 unsigned char fpu_counter;
517 };