大家都知道容器是通过namespace和cgroup技术来创建的,但是具体代码是如何控制namespace和cgroup的呢?下面就以runc源码为例子简单介绍下。
func (p *initProcess) start() error {
defer p.parentPipe.Close()
err := p.cmd.Start()
//开始执行初始化进程命令
p.process.ops = p
p.childPipe.Close()
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
}
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}//通过进程号对进程使用cgroup
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
}
}
defer func() {
if err != nil {
// TODO: should not be the responsibility to call here
p.manager.Destroy()
if p.intelRdtManager != nil {
p.intelRdtManager.Destroy()
}
}
}()
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
//前面创建bootstrapData(命名空间映射参数)从parentPipe传出去(init进程会从childPipe接收到这些数据,
reverse出写入的内容,在C语言实现的nsexec.c 中控制clone函数的参数,最终实现namespace相关的配置)
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
}//对进程使用namesapce结束。
fds, err := getPipeFds(p.pid())
if err != nil {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
}
p.setExternalDescriptors(fds)
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
}
//进行网络相关配置,主要遍历网络配置,对各种网络根据相应的策略添加veth对,最终都是调用linux ip link 命令实现
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
}
var (
sentRun bool
sentResume bool
)
......
}
func (m *Manager) Apply(pid int) (err error) {
if m.Cgroups == nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
var c = m.Cgroups
d, err := getCgroupData(m.Cgroups, pid)
if err != nil {
return err
}
m.Paths = make(map[string]string)
if c.Paths != nil {
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
if cgroups.IsNotFound(err) {
continue
}
return err
}
m.Paths[name] = path
}
return cgroups.EnterPid(m.Paths, pid)
}
for _, sys := range subsystems {
// 对每个subsystems 进行apply
p, err := d.path(sys.Name())
if err != nil {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
continue
}
return err
}
m.Paths[sys.Name()] = p
if err := sys.Apply(d); err != nil {
if os.IsPermission(err) && m.Cgroups.Path == "" {
delete(m.Paths, sys.Name())
continue
}
return err
}
}
return nil
}
func (s *CpuGroup) Apply(d *cgroupData) error {
// We always want to join the cpu group, to allow fair cpu scheduling
// on a container basis
path, err := d.path("cpu")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return s.ApplyDir(path, d.config, d.pid)
}
func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error {
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
if err := s.SetRtSched(path, cgroup); err != nil {
return err
}
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return err
}
return nil
}
func WriteCgroupProc(dir string, pid int) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s", CgroupProcesses)
}
// Dont attach any pid to the cgroup if -1 is specified as a pid
if pid != -1 {
if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
}
}
return nil
}
也就是将进程号写入到cgroup中cpu的文件夹下,在容器内新建子进程时,根据cgroup规则,子进程也就是子节点是父节点进程组的子集,并继承父节点属性,也就是说容器内的子进程会也受该cgroup约束
func (p *initProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return err
}
if !status.Success() {
p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return err
}
// Clean up the zombie parent process
firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
if err != nil {
return err
}
// Ignore the error in case the child has already been reaped for any reason
_, _ = firstChildProcess.Wait()
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process // 将cmd.Process对应的进程替换为该进程。
p.process.ops = p
return nil
}