【runc 源码分析】runc create / start 流程分析

命令行

COMMANDS:
     checkpoint  checkpoint a running container
     create      create a container
     delete      delete any resources held by the container often used with detached container
     events      display container events such as OOM notifications, cpu, memory, and IO usage statistics
     exec        execute new process inside the container
     init        initialize the namespaces and launch the process (do not call it outside of runc)
     kill        kill sends the specified signal (default: SIGTERM) to the container's init process
     list        lists containers started by runc with the given root
     pause       pause suspends all processes inside the container
     ps          ps displays the processes running inside a container
     restore     restore a container from a previous checkpoint
     resume      resumes all processes that have been previously paused
     run         create and run a container
     spec        create a new specification file
     start       executes the user defined process in a created container
     state       output the state of a container
     update      update container resource constraints
     help, h     Shows a list of commands or help for one command

GLOBAL OPTIONS:
   --debug             enable debug output for logging
   --log value         set the log file path where internal debug information is written (default: "/dev/null")
   --log-format value  set the format used by logs ('text' (default), or 'json') (default: "text")
   --root value        root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc")
   --criu value        path to the criu binary used for checkpoint and restore (default: "criu")
   --systemd-cgroup    enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234"
   --help, -h          show help
   --version, -v       print the version

 

前言

     容器标准包 bundle 与配置

  config.json: 基本配置文件,包括与宿主机独立的和应用相关的特定信息,如安全权限、环境变量和参数等。具体如下:

  • 容器版本
  • rootfs 路径及权限(ro / rw)
  • 各类文件挂载点及相应容器内挂载目录(必须与 state.json 一致)
  • 初始进程配置信息,包括是否绑定终端、工作目录、环境变量配置、可执行文件参数、uid、gid 以及额外需要加入的 hostname 等

  state.json: 运行时配置文件(运行时主机相关的信息,如内存限制、设备访问权限、挂载点等)

  rootfs:根文件系统目录,容器执行的环境依赖,如 /bin、/var、/lib、/dev、/usr 等目录及相应文件

 

一. runc create 命令分析

     路径 opencontainers/runc/create.go

  1.1 定义command结构

var createCommand = cli.Command{
	Name:  "create",
	Usage: "create a container",
	ArgsUsage: `<container-id>

Where "<container-id>" is your name for the instance of the container that you
are starting. The name you provide for the container instance must be unique on
your host.`,
	Description: `The create command creates an instance of a container for a bundle. The bundle
is a directory with a specification file named "` + specConfig + `" and a root
filesystem.

The specification file includes an args parameter. The args parameter is used
to specify command(s) that get run when the container is started. To change the
command(s) that get executed on start, edit the args parameter of the spec. See
"runc spec --help" for more explanation.`,

  1.2 定义runc create 命令行参数

    NAME:
       docker-runc create - create a container

    USAGE:
       docker-runc create [command options] <container-id>

	Flags: []cli.Flag{
		cli.StringFlag{
			Name:  "bundle, b",
			Value: "",
			Usage: `path to the root of the bundle directory, defaults to the current directory`,
		},
		cli.StringFlag{
			Name:  "console-socket",
			Value: "",
			Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
		},
		cli.StringFlag{
			Name:  "pid-file",
			Value: "",
			Usage: "specify the file to write the process id to",
		},
		cli.BoolFlag{
			Name:  "no-pivot",
			Usage: "do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk",
		},
		cli.BoolFlag{
			Name:  "no-new-keyring",
			Usage: "do not create a new session keyring for the container.  This will cause the container to inherit the calling processes session key",
		},
		cli.IntFlag{
			Name:  "preserve-fds",
			Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
		},
	},

  1.3 定义Action函数

	// Action implements "runc create": it validates the arguments, loads the
	// spec, and creates the container through startContainer using the
	// CT_ACT_CREATE action.
	Action: func(context *cli.Context) error {
		// Require exactly one positional argument; startContainer reads the
		// first argument as the container id.
		if err := checkArgs(context, 1, exactArgs); err != nil {
			return err
		}
		// NOTE(review): presumably normalizes the --pid-file path — confirm
		// against revisePidFile, which is not visible here.
		if err := revisePidFile(context); err != nil {
			return err
		}
		spec, err := setupSpec(context)
		if err != nil {
			return err
		}
		// criuOpts is nil here; in the visible runner code it is only consumed
		// by the CT_ACT_RESTORE branch.
		status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
		if err != nil {
			return err
		}
		// exit with the container's exit status so any external supervisor is
		// notified of the exit with the correct exit status.
		os.Exit(status)
		// Not reached at runtime: os.Exit terminates the process. The return
		// is only here so the function has a terminating statement.
		return nil
	},

    1.3.1 startContainer函数

      startContainer 函数中参数 action CT_ACT_CREATE 为创建,后面会用到。获得启动命令行的容器 id,createContainer 函数第2章节讲解,runner 结构体以及 run 函数内容多一些第三章节进行讲解 

// startContainer creates the container named by the first CLI argument from
// spec and hands it to a runner that performs action on spec.Process.
//
// It returns the runner's exit status and error. A missing container id yields
// (-1, errEmptyID); a failure while creating the container or setting up the
// notify socket yields -1 together with that error. criuOpts is stored on the
// runner unchanged.
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
	containerID := context.Args().First()
	if containerID == "" {
		return -1, errEmptyID
	}

	// newNotifySocket may return nil; every later use is guarded by haveSock.
	sock := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), containerID)
	haveSock := sock != nil
	if haveSock {
		sock.setupSpec(context, spec)
	}

	ctr, err := createContainer(context, containerID, spec)
	if err != nil {
		return -1, err
	}

	if haveSock {
		if err := sock.setupSocket(); err != nil {
			return -1, err
		}
	}

	// On-demand socket activation: when LISTEN_FDS is set, collect the
	// inherited descriptors so they can be passed to the container init process.
	inherited := []*os.File{}
	if os.Getenv("LISTEN_FDS") != "" {
		inherited = activation.Files(false)
	}

	launcher := &runner{
		enableSubreaper: !context.Bool("no-subreaper"),
		shouldDestroy:   true,
		container:       ctr,
		listenFDs:       inherited,
		notifySocket:    sock,
		consoleSocket:   context.String("console-socket"),
		detach:          context.Bool("detach"),
		pidFile:         context.String("pid-file"),
		preserveFDs:     context.Int("preserve-fds"),
		action:          action,
		criuOpts:        criuOpts,
		init:            true,
	}
	return launcher.run(spec.Process)
}

 

2. createContainer函数

     CreateLibcontainerConfig 根据 spec 生成创建容器所需的配置,例如 bundle 路径、namespace、capabilities、标签等;loadFactory 在 2.2 讲解,factory.Create 函数在 2.3 讲解

// createContainer converts spec into a libcontainer configuration and asks the
// factory loaded from the CLI context to create a container named id.
//
// The first error from the rootless-cgroup check, the spec conversion, or the
// factory load is returned with a nil container; otherwise the result of
// factory.Create is returned as-is.
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
	useRootlessCgroups, err := shouldUseRootlessCgroupManager(context)
	if err != nil {
		return nil, err
	}

	opts := &specconv.CreateOpts{
		CgroupName:       id,
		UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
		NoPivotRoot:      context.Bool("no-pivot"),
		NoNewKeyring:     context.Bool("no-new-keyring"),
		Spec:             spec,
		// Any effective uid other than 0 is treated as rootless.
		RootlessEUID:    os.Geteuid() != 0,
		RootlessCgroups: useRootlessCgroups,
	}
	cfg, err := specconv.CreateLibcontainerConfig(opts)
	if err != nil {
		return nil, err
	}

	f, err := loadFactory(context)
	if err != nil {
		return nil, err
	}
	return f.Create(id, cfg)
}

  2.1 CreateLibcontainerConfig函数

  路径libcontainer/specconv/spec_linux.go

  2.1.1 创建Config配置,主要为在容器环境运行进程

	config := &configs.Config{
		Rootfs:          rootfsPath,
		NoPivotRoot:     opts.NoPivotRoot,
		Readonlyfs:      spec.Root.Readonly,
		Hostname:        spec.Hostname,
		Labels:          append(labels, fmt.Sprintf("bundle=%s", cwd)),
		NoNewKeyring:    opts.NoNewKeyring,
		RootlessEUID:    opts.RootlessEUID,
		RootlessCgroups: opts.RootlessCgroups,
	}

	exists := false
	for _, m := range spec.Mounts {
		config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
	}

  2.1.2   createDevices创建device配置

     可以在容器中看到/dev

# ls
core  full    null  pts     shm     stdin   termination-log  urandom
fd    mqueue  ptmx  random  stderr  stdout  tty              zero

func createDevices(spec *specs.Spec, config *configs.Config) error {
	// add whitelisted devices
	config.Devices = []*configs.Device{
		{
			Type:     'c',
			Path:     "/dev/null",
			Major:    1,
			Minor:    3,
			FileMode: 0666,
			Uid:      0,
			Gid:      0,
		},
		{
			Type:     'c',
			Path:     "/dev/random",
			Major:    1,
			Minor:    8,
			FileMode: 0666,
			Uid:      0,
			Gid:      0,
		},
     。。。。。。
	
	}
	// merge in additional devices from the spec
	if spec.Linux != nil {
		。。。。。
			config.Devices = append(config.Devices, device)
		}
	}
	return nil
}

  2.1.3 createCgroupConfig函数

   配置cgroup,包括memory cpu blockio network

func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
	。。。

	// In rootless containers, any attempt to make cgroup changes is likely to fail.
	// libcontainer will validate this but ignores the error.
	c.Resources.AllowedDevices = allowedDevices
	if spec.Linux != nil {
		
		if r.Memory != nil {
			if r.Memory.Limit != nil {
				c.Resources.Memory = *r.Memory.Limit
			}
			if r.Memory.Reservation != nil {
				c.Resources.MemoryReservation = *r.Memory.Reservation
			}
			if r.Memory.Swap != nil {
				c.Resources.MemorySwap = *r.Memory.Swap
			}
			if r.Memory.Kernel != nil {
				c.Resources.KernelMemory = *r.Memory.Kernel
			}
			if r.Memory.KernelTCP != nil {
				c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
			}
			if r.Memory.Swappiness != nil {
				c.Resources.MemorySwappiness = r.Memory.Swappiness
			}
			if r.Memory.DisableOOMKiller != nil {
				c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
			}
		}
		。。。
		if r.Network != nil {
			if r.Network.ClassID != nil {
				c.Resources.NetClsClassid = *r.Network.ClassID
			}
			for _, m := range r.Network.Priorities {
				c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
					Interface: m.Name,
					Priority:  int64(m.Priority),
				})
			}
		}
	}
	// append the default allowed devices to the end of the list
	c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
	return c, nil
}

  2.2 loadFactory函数为容器返回配置化的实例 factory

// loadFactory returns the configured factory instance for execing containers.
func loadFactory(context *cli.Context) (libcontainer.Factory, error)

  2.2.1 cgroupManager 指向 Cgroupfs 函数

    cgroupManager := libcontainer.Cgroupfs:将工厂的 NewCgroupsManager 设置为返回 fs.Manager 的函数,fs.Manager 结构体实现了 cgroups.Manager 接口

// Cgroupfs is an option func for LinuxFactory. It installs a
// NewCgroupsManager constructor that wraps the given cgroup config and paths
// in an fs.Manager, i.e. the native cgroups filesystem implementation.
// It always returns nil.
func Cgroupfs(l *LinuxFactory) error {
	newManager := func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
		m := &fs.Manager{Cgroups: config, Paths: paths}
		return m
	}
	l.NewCgroupsManager = newManager
	return nil
}

  2.2.2 实例化libcontainer  

    libcontainer.New(abs, cgroupManager, intelRdtManager,
        libcontainer.CriuPath(context.GlobalString("criu")),
        libcontainer.NewuidmapPath(newuidmap),
       libcontainer.NewgidmapPath(newgidmap))
   libcontainer.New 返回一个 linux 系统实现的结构体,根据传入的参数配置结构体中的 NewCgroupsManager 和 CriuPath

// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
//
// When root is non-empty it is created with mode 0700 if it does not exist; a
// failure there is reported as a SystemError. The factory starts from these
// defaults: init binary "/proc/self/exe" invoked with {os.Args[0], "init"},
// criu binary "criu", and the cgroupfs-based cgroup manager. Each non-nil
// option is then applied in order, so options may override the defaults. The
// first option that returns an error aborts construction and that error is
// returned.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
	if root != "" {
		if err := os.MkdirAll(root, 0700); err != nil {
			return nil, newGenericError(err, SystemError)
		}
	}
	l := &LinuxFactory{
		Root:      root,
		InitPath:  "/proc/self/exe",
		InitArgs:  []string{os.Args[0], "init"},
		Validator: validate.New(),
		CriuPath:  "criu",
	}
	// Install the default cgroup manager before user options run. Its error
	// is checked like any other option's instead of being silently dropped.
	if err := Cgroupfs(l); err != nil {
		return nil, err
	}
	for _, opt := range options {
		// nil options are tolerated and skipped.
		if opt == nil {
			continue
		}
		if err := opt(l); err != nil {
			return nil, err
		}
	}
	return l, nil
}

   2.3 Create函数

      Create 做的事情比较简单,对配置进行一些验证工作,验证成功创建容器的根路径并设置权限

// Create validates id and config, creates the per-container directory under
// l.Root, and returns a linuxContainer whose state is stoppedState. No process
// is started here.
//
// Errors:
//   - ConfigInvalid when l.Root is empty or the validator rejects config;
//   - IdInUse when the container directory already exists;
//   - SystemError when stat (for a reason other than "not exist"), mkdir, or
//     chown of the container directory fails;
//   - the errors of validateID and securejoin.SecureJoin are returned as-is.
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
	if l.Root == "" {
		return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
	}
	if err := l.validateID(id); err != nil {
		return nil, err
	}
	if err := l.Validator.Validate(config); err != nil {
		return nil, newGenericError(err, ConfigInvalid)
	}
	// NOTE(review): SecureJoin presumably keeps the joined path inside l.Root
	// — confirm against the securejoin package.
	containerRoot, err := securejoin.SecureJoin(l.Root, id)
	if err != nil {
		return nil, err
	}
	// An existing path means the id is taken; any stat error other than
	// "does not exist" is reported as a system error.
	if _, err := os.Stat(containerRoot); err == nil {
		return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
	} else if !os.IsNotExist(err) {
		return nil, newGenericError(err, SystemError)
	}
	if err := os.MkdirAll(containerRoot, 0711); err != nil {
		return nil, newGenericError(err, SystemError)
	}
	// Hand the directory to the effective uid/gid of the calling process.
	if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
		return nil, newGenericError(err, SystemError)
	}
	c := &linuxContainer{
		id:            id,
		root:          containerRoot,
		config:        config,
		initPath:      l.InitPath,
		initArgs:      l.InitArgs,
		criuPath:      l.CriuPath,
		newuidmapPath: l.NewuidmapPath,
		newgidmapPath: l.NewgidmapPath,
		// The cgroup manager is built with nil paths at creation time.
		cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
	}
	// An Intel RDT manager is attached only when CAT or MBA is enabled.
	if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
		c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
	}
	c.state = &stoppedState{c: c}
	return c, nil
}

    2.3.1 成功会返回 linuxContainer 结构体,包括容器 ID、根路径、参数等,并将状态置为 stopped

// linuxContainer is the container object returned by LinuxFactory.Create.
//
// NOTE(review): LinuxFactory.Create and newInitProcess in this document also
// set or read initPath, newuidmapPath, newgidmapPath and intelRdtManager,
// which are not declared below — this excerpt looks like it comes from an
// older revision than those functions; confirm against the runc version used.
type linuxContainer struct {
       // id is the container id passed to Create.
       id                   string
       // root is the per-container directory created under the factory root.
       root                 string
       // config is the libcontainer configuration the container was created with.
       config               *configs.Config
       // cgroupManager is built by the factory's NewCgroupsManager.
       cgroupManager        cgroups.Manager
       // initArgs is copied from the factory's InitArgs.
       initArgs             []string
       // initProcess is assigned by newInitProcess.
       initProcess          parentProcess
       initProcessStartTime string
       // criuPath is copied from the factory's CriuPath.
       criuPath             string
       // m is held for the duration of Start.
       m                    sync.Mutex
       criuVersion          int
       // state is initialised to stoppedState by Create.
       state                containerState
       created              time.Time
}

 

二. Factory 分析

    2.1 Factory对象为容器创建和初始化工作提供了一组抽象接口

type Factory interface {
       Create(id string, config *configs.Config) (Container, error)

       Load(id string) (Container, error)
       StartInitialization() error
       Type() string
}
  • Create: 根据 id 和配置参数创建容器,返回 Container 对象(此时状态为 stopped,尚未启动任何进程)。
  • Load:  从容器为 id 目录下读取 state.json 来载入容器

    2.2 Linux 系统 factory 对象的结构体,实现了 Factory 接口

// LinuxFactory is the Linux implementation of Factory.
//
// NOTE(review): New and Create in this document also use InitPath,
// NewuidmapPath, NewgidmapPath and NewIntelRdtManager, which are not declared
// below — this excerpt looks like it comes from an older revision than those
// functions; confirm against the runc version used.
type LinuxFactory struct {
       // Root directory for the factory to store state.
       Root string

       // InitArgs are arguments for calling the init responsibilities for spawning
       // a container.
       InitArgs []string

       // CriuPath is the path to the criu binary used for checkpoint and restore of
       // containers.
       CriuPath string

       // Validator provides validation to container configurations.
       Validator validate.Validator

       // NewCgroupsManager returns an initialized cgroups manager for a single container.
       NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
}

 

三. runner 分析

    runner 结构体,运行容器的信息 

// runner carries everything needed to run one action against a container.
// It is populated by startContainer and driven through its run method.
type runner struct {
	// init is set to true by startContainer.
	init            bool
	// enableSubreaper is the negation of the "no-subreaper" flag and is
	// passed to newSignalHandler.
	enableSubreaper bool
	// shouldDestroy is set to true by startContainer.
	shouldDestroy   bool
	// detach mirrors the "detach" flag.
	detach          bool
	// listenFDs holds the socket-activation files collected when LISTEN_FDS
	// is set; run appends them to the process's ExtraFiles.
	listenFDs       []*os.File
	// preserveFDs mirrors the "preserve-fds" flag: the number of additional
	// descriptors appended to ExtraFiles after listenFDs.
	preserveFDs     int
	// pidFile mirrors the "pid-file" flag.
	pidFile         string
	// consoleSocket mirrors the "console-socket" flag and is passed to setupIO.
	consoleSocket   string
	// container is the container the action is performed on.
	container       libcontainer.Container
	// action selects between container.Start, Restore and Run.
	action          CtAct
	// notifySocket may be nil; it is passed to newSignalHandler.
	notifySocket    *notifySocket
	// criuOpts is passed to container.Restore for CT_ACT_RESTORE.
	criuOpts        *libcontainer.CriuOpts
}

    Process 结构体,启动容器内进程的信息

// Process contains information to start a specific application inside the container.
type Process struct {
	// Terminal creates an interactive terminal for the container.
	Terminal bool `json:"terminal,omitempty"`
	// ConsoleSize specifies the size of the console.
	ConsoleSize *Box `json:"consoleSize,omitempty"`
	// User specifies user information for the process.
	User User `json:"user"`
	// Args specifies the binary and arguments for the application to execute.
	Args []string `json:"args"`
	// Env populates the process environment for the process.
	Env []string `json:"env,omitempty"`
	// Cwd is the current working directory for the process and must be
	// relative to the container's root.
	Cwd string `json:"cwd"`
	// Capabilities are Linux capabilities that are kept for the process.
	Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"`
	// Rlimits specifies rlimit options to apply to the process.
	Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris"`
	// NoNewPrivileges controls whether additional privileges could be gained by processes in the container.
	NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"`
	// ApparmorProfile specifies the apparmor profile for the container.
	ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"`
	// Specify an oom_score_adj for the container.
	OOMScoreAdj *int `json:"oomScoreAdj,omitempty" platform:"linux"`
	// SelinuxLabel specifies the selinux context that the container process is run as.
	SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
}

   3.1 run 函数

func (r *runner) run(config *specs.Process) (int, error)

    3.1.1 newProcess 主要是填充 libcontainer.Process 结构体

      包括参数,环境变量,user 权限,工作目录,capabilities,资源限制等

// newProcess translates the spec-level process description p into a
// libcontainer.Process. init is copied into the result's Init field.
//
// Console size and capabilities are copied only when present in the spec.
// Additional group ids are rendered as base-10 strings. An error from
// converting any rlimit aborts the translation and is returned with a nil
// process.
func newProcess(p specs.Process, init bool) (*libcontainer.Process, error) {
	// TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
	userSpec := fmt.Sprintf("%d:%d", p.User.UID, p.User.GID)

	proc := &libcontainer.Process{
		Args:            p.Args,
		Env:             p.Env,
		User:            userSpec,
		Cwd:             p.Cwd,
		Label:           p.SelinuxLabel,
		NoNewPrivileges: &p.NoNewPrivileges,
		AppArmorProfile: p.ApparmorProfile,
		Init:            init,
	}

	if size := p.ConsoleSize; size != nil {
		proc.ConsoleWidth = uint16(size.Width)
		proc.ConsoleHeight = uint16(size.Height)
	}

	if caps := p.Capabilities; caps != nil {
		proc.Capabilities = &configs.Capabilities{}
		proc.Capabilities.Bounding = caps.Bounding
		proc.Capabilities.Effective = caps.Effective
		proc.Capabilities.Inheritable = caps.Inheritable
		proc.Capabilities.Permitted = caps.Permitted
		proc.Capabilities.Ambient = caps.Ambient
	}

	for _, g := range p.User.AdditionalGids {
		group := strconv.FormatUint(uint64(g), 10)
		proc.AdditionalGroups = append(proc.AdditionalGroups, group)
	}

	for _, specLimit := range p.Rlimits {
		converted, err := createLibContainerRlimit(specLimit)
		if err != nil {
			return nil, err
		}
		proc.Rlimits = append(proc.Rlimits, converted)
	}
	return proc, nil
}

    3.1.2 listen fd 加入 process 的环境变量和需要在新进程保持打开的文件列表中(ExtraFiles)

	if len(r.listenFDs) > 0 {
		process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
		process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
	}
	baseFd := 3 + len(process.ExtraFiles)
	for i := baseFd; i < baseFd+r.preserveFDs; i++ {
		process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
	}

    3.1.3 创建 signalHandler 处理 tty 和 signal

      setupIO 来进行 io 和 tty 相关配置,对于 create 就是 dup 将当前进程的 io,chown 用户/组权限

	// Setting up IO is a two stage process. We need to modify process to deal
	// with detaching containers, and then we get a tty after the container has
	// started.
	handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
	tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
	if err != nil {
		r.destroy()
		return -1, err
	}
	defer tty.Close()

    3.1.4 根据 action 为 create

        第四章继续分析 r.container.Start 函数

	switch r.action {
	case CT_ACT_CREATE:
		err = r.container.Start(process)
	case CT_ACT_RESTORE:
		err = r.container.Restore(process, r.criuOpts)
	case CT_ACT_RUN:
		err = r.container.Run(process)
	default:
		panic("Unknown action")
	}
	if err != nil {
		r.destroy()
		return -1, err
	}

    3.1.4.1 BaseContainer接口

// BaseContainer is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found. BaseContainer includes methods that are platform agnostic.
type BaseContainer interface {


	// Start a process inside the container. Returns error if process fails to
	// start. You can track process lifecycle with passed Process structure.
	//
	// errors:
	// ContainerNotExists - Container no longer exists,
	// ConfigInvalid - config is invalid,
	// ContainerPaused - Container is paused,
	// SystemError - System error.
	Start(process *Process) (err error)

}

  

四. container 分析

    Container对象主要包含了容器配置、控制、状态显示等功能,每一个 Container 进程内部都是线程安全的。由于 Container 可能被其他进程销毁,所以每个方法都会对容器是否存在进行检测。

    Container 接口

      路径: libcontainer/container_linux.go

  • ID():返回容器的 ID
  • Status(): 返回容器的当前状态
  • State(): 返回运行容器状态信息,包括容器ID,初始进程ID,初始进程启动时间,配置信息,cgroup 路径,namespace 路径
  • Config(): 返回当前容器的配置
  • Processes(): 返回容器内 PIDs
  • Stats(): 返回容器的统计信息
  • Start(): 在容器内启动一个进程
type Container interface {
       BaseContainer

       Checkpoint(criuOpts *CriuOpts) error
       Restore(process *Process, criuOpts *CriuOpts) error

       Pause() error
       Resume() error

       NotifyOOM() (<-chan struct{}, error)
       NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
type BaseContainer interface {
       ID() string

       Status() (Status, error)
       State() (*State, error)
       Config() configs.Config
       Processes() ([]int, error)
       Stats() (*Stats, error)
       
       Set(config configs.Config) error
       Start(process *Process) (err error)
       Run(process *Process) (err error)
       Destroy() error
       Signal(s os.Signal, all bool) error
       Exec() error
}

    State 结构

        表示一个运行中的容器状态信息:

// State represents a running container's state
type State struct {
       BaseState

       // Platform specific fields below here

       // Specifies if the container was started under the rootless mode.
       Rootless bool `json:"rootless"`

       // Path to all the cgroups setup for a container. Key is cgroup subsystem name
       // with the value as the path.
       CgroupPaths map[string]string `json:"cgroup_paths"`

       // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
       // with the value as the path.
       NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

       // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
       ExternalDescriptors []string `json:"external_descriptors,omitempty"`
}
// BaseState represents the platform agnostic pieces relating to a
// running container's state
type BaseState struct {
	// ID is the container ID.
	ID string `json:"id"`

	// InitProcessPid is the init process id in the parent namespace.
	InitProcessPid int `json:"init_process_pid"`

	// InitProcessStartTime is the init process start time in clock cycles since boot time.
	InitProcessStartTime uint64 `json:"init_process_start"`

	// Created is the creation time of the container in UTC, stored as a
	// time.Time value.
	Created time.Time `json:"created"`

	// Config is the container's configuration.
	Config configs.Config `json:"config"`
}

    4.1 Start函数

     linuxContainer实现了Container接口

     根据 create 流程,此前已将 status 置为 stopped 状态。Start 函数在 process.Init 为 true 时先创建 exec fifo,再调用 start 函数(start 失败时删除该 fifo);start 成功后容器状态变为 created。4.2 主要分析 start 函数

// Start launches process in the container while holding the container mutex.
//
// For an init process the exec fifo is created first; if the subsequent start
// fails, that fifo is removed again before the error is returned.
func (c *linuxContainer) Start(process *Process) error {
	c.m.Lock()
	defer c.m.Unlock()

	if process.Init {
		if fifoErr := c.createExecFifo(); fifoErr != nil {
			return fifoErr
		}
	}

	startErr := c.start(process)
	if startErr == nil {
		return nil
	}
	// Roll back the fifo that was created above for init processes.
	if process.Init {
		c.deleteExecFifo()
	}
	return startErr
}

    4.2 start 函数

func (c *linuxContainer) start(process *Process) error

    4.2.1 newParentProcess

  • 创建一对pipe,parentPipe和childPipe,作为 runc start 进程与容器内部 init 进程通信管道
  • 创建一个命令模版作为 Parent 进程启动的模板
  • newInitProcess 封装 initProcess。主要工作为添加初始化类型环境变量,将namespace、uid/gid 映射等信息使用 bootstrapData 封装为一个 io.Reader
// newParentProcess prepares the parent-side handle for process p: it creates
// the "init" socket pair, builds the command template around the child end,
// and wraps both in either a setns process (p.Init is false) or an init
// process (p.Init is true).
//
// Failures of the socket pair, the command template, or the exec fifo setup
// are returned as system errors with a descriptive cause.
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
	parentEnd, childEnd, err := utils.NewSockPair("init")
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new init pipe")
	}

	tmpl, err := c.commandTemplate(p, childEnd)
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new command template")
	}

	if p.Init {
		// The exec fifo is only wired up for init processes, never for
		// `runc exec`. Historically a dirfd was passed here that allowed a
		// container rootfs escape, and skipping it for `runc exec` avoided
		// that; the dirfd is gone, but `runc exec` has no need for the fifo,
		// so the split is kept to be safe.
		if err := c.includeExecFifo(tmpl); err != nil {
			return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
		}
		return c.newInitProcess(p, tmpl, parentEnd, childEnd)
	}
	return c.newSetnsProcess(p, tmpl, parentEnd, childEnd)
}

    4.2.1.1newInitProcess函数

      添加初始化类型环境变量,将 namespace、uid/gid 映射等信息使用 bootstrapData 函数封装为一个 io.Reader(数据采用 netlink 消息格式,netlink 本身用于内核与用户态进程之间的通信),返回 initProcess 结构体

// newInitProcess builds the initProcess for p around cmd and the given pipe
// ends, records it as the container's initProcess, and returns it.
//
// It tags cmd's environment with the standard init type, gathers the
// namespaces that are configured with an explicit path, and asks
// bootstrapData for the bootstrap payload; an error from bootstrapData is
// returned with a nil process.
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))

	// Only namespaces that carry a path are recorded here.
	nsPaths := make(map[configs.NamespaceType]string)
	for _, namespace := range c.config.Namespaces {
		if namespace.Path == "" {
			continue
		}
		nsPaths[namespace.Type] = namespace.Path
	}
	// A path for the PID namespace means the process shares that namespace.
	_, joinsPidns := nsPaths[configs.NEWPID]

	cloneFlags := c.config.Namespaces.CloneFlags()
	payload, err := c.bootstrapData(cloneFlags, nsPaths)
	if err != nil {
		return nil, err
	}

	ip := &initProcess{
		cmd:             cmd,
		childPipe:       childPipe,
		parentPipe:      parentPipe,
		manager:         c.cgroupManager,
		intelRdtManager: c.intelRdtManager,
		config:          c.newInitConfig(p),
		container:       c,
		process:         p,
		bootstrapData:   payload,
		sharePidns:      joinsPidns,
	}
	c.initProcess = ip
	return ip, nil
}

    4.3 parent.start 函数

        第五章节讲解,创建了新的进程。此时新的进程使用 /proc/self/exe 为执行入口,参数为 init;func init() 会在 main 函数调用之前执行,所以在新的进程中 func init() 会直接调用,而不会去执行main函数

func (p *initProcess) start() error

 

五. initProcess 分析

    initProcess 结构体

// initProcess is the parent-side handle for a container's init process. It is
// assembled by newInitProcess and driven by its start method.
type initProcess struct {
	// cmd is the command that start launches.
	cmd             *exec.Cmd
	// parentPipe and childPipe are the two ends of the "init" socket pair;
	// start closes childPipe right after launching cmd and closes parentPipe
	// when it returns.
	parentPipe      *os.File
	childPipe       *os.File
	// config is produced by the container's newInitConfig for the process.
	// NOTE(review): presumably what sendConfig transmits — confirm.
	config          *initConfig
	// manager is the container's cgroup manager; start calls Apply on it
	// with the new process's pid.
	manager         cgroups.Manager
	// intelRdtManager is copied from the container.
	intelRdtManager intelrdt.Manager
	// container is the owning container.
	container       *linuxContainer
	fds             []string
	// process is the Process being started; start sets its ops to this
	// initProcess.
	process         *Process
	// bootstrapData is the payload returned by the container's bootstrapData.
	bootstrapData   io.Reader
	// sharePidns is true when the PID namespace is configured with a path.
	sharePidns      bool
}

   5.1 start 函数

func (p *initProcess) start() error

    5.1.1  cmd 的内容如文末所示:Path 填充为 /proc/self/exe(即 runc 自身),参数字段 Args 为 init,表示对容器进行初始化,实际调用的是 runc init(详见 https://blog.csdn.net/zhonglinzhang/article/details/86502530 这篇文章的分析)

	defer p.parentPipe.Close()
	err := p.cmd.Start()
	p.process.ops = p
	p.childPipe.Close()
	if err != nil {
		p.process.ops = nil
		return newSystemErrorWithCause(err, "starting init process command")
	}

    5.1.2  Apply 设置进程 cgroup 进行限额

	// Do this before syncing with child so that no children can escape the
	// cgroup. We don't need to worry about not doing this and not being root
	// because we'd be using the rootless cgroup manager in that case.
	if err := p.manager.Apply(p.pid()); err != nil {
		return newSystemErrorWithCause(err, "applying cgroup configuration for process")
	}

    5.1.3  createNetworkInterfaces  如果没有指定网络只设置 loopback,如果指定网络还有 veth 类型。

         sendConfig 发送配置到 init process

	if err := p.createNetworkInterfaces(); err != nil {
		return newSystemErrorWithCause(err, "creating network interfaces")
	}
	if err := p.sendConfig(); err != nil {
		return newSystemErrorWithCause(err, "sending config to init process")
	}

      initProcess结构的start方法真正完成了容器进程的创建,并通过init管道协助其完成初始化工作。该方法首先调用p.cmd.Start()创建一个独立的进程,执行命令runc init。

 

示例:使用runc运行一个容器

     创建容器标准包,使用 bundle 模块实现,将 docker 镜像转换成容器标准包

           $ docker pull busybox

           $ mkdir rootfs

           $ docker export $(docker create busybox) | tar -C rootfs -xvf -

     创建配置文件

           $ runc spec

     运行容器 

           $ runc run busybox     

 

create 总结:

  • 创建容器的 linux factory,然后调用其 create 方法,这个比较简单,主要是对参数进行校验,创建根目录并设置 uid、gid 权限,返回创建容器的结构体
  • 封装一个 runner 结构体,调用 run 方法,将 config.json 中 process 填充到结构体,根据 action 为创建的调用 container.start(process)
  • 创建管道与容器内进行通信,一个 init process 结构体封装了命令,配置,cgroup,带有 namespace 的 netlink 请求 
  • 容器内执行 runc init 命令,配置 cgroup,创建网络接口,通过管道发送配置给容器内
  • 容器内从管道读取配置进行初始化配置

 

cmd 内容:

{
    Path: "/proc/self/exe",
    Args: [
        
    ]string{
        "/proc/self/exe",
        "init"

    },
    Env: [
        
    ]string{
        "_LIBCONTAINER_CONSOLE=3",
        "_LIBCONTAINER_INITPIPE=4",
        "_LIBCONTAINER_STATEDIR=5",
        "_LIBCONTAINER_INITTYPE=standard"
    },
    Dir: "/home/lin/project/src/github.com/opencontainers/runc/mycontainers/rootfs",
    Stdin: io.Reader(nil),
    Stdout: io.Writer(nil),
    Stderr: io.Writer(nil),
    ExtraFiles: [
        
    ]*os.File{
        (*os.File)(0xc42000e120),
        (*os.File)(0xc42000e140),
        (*os.File)(0xc42000e150)
    },
    SysProcAttr: (*syscall.SysProcAttr)(0xc4200981b0),
    Process: (*os.Process)(nil),
    ProcessState: (*os.ProcessState)(nil),
    ctx: context.Context(nil),
    lookPathErr: error(nil),
    finished: false,
    childFiles: [
        
    ]*os.File(nil),
    closeAfterStart: [
        
    ]io.Closer(nil),
    closeAfterWait: [
        
    ]io.Closer(nil),
    goroutine: [
        
    ]func()error(nil),
    errch: (chan error)(nil),
    waitDone: (chan struct{
        
    })(nil)
}

 

 

/var/run/runc/${container-id}/state.json

{
    "id": "container-bbbb",
    "init_process_pid": 3193,
    "init_process_start": 7331,
    "created": "2017-08-15T02:30:51.244343167Z",
    "config": {
        "no_pivot_root": false,
        "parent_death_signal": 0,
        "rootfs": "/home/lin/project/src/github.com/opencontainers/runc/mycontainers/rootfs",
        "readonlyfs": true,
        "rootPropagation": 278528,
        "mounts": [
            {
                "source": "proc",
                "destination": "/proc",
                "device": "proc",
                "flags": 0,
                "propagation_flags": null,
                "data": "",
                "relabel": "",
                "extensions": 0,
                "premount_cmds": null,
                "postmount_cmds": null
            },
            {
                "source": "tmpfs",
                "destination": "/dev",
                "device": "tmpfs",
                "flags": 16777218,
                "propagation_flags": null,
                "data": "mode=755,size=65536k",
                "relabel": "",
                "extensions": 0,
                "premount_cmds": null,
                "postmount_cmds": null
            },
            {
                "source": "devpts",
                "destination": "/dev/pts",
                "device": "devpts",
                "flags": 10,
                "propagation_flags": null,
                "data": "newinstance,ptmxmode=0666,mode=0620,gid=5",
                "relabel": "",
                "extensions": 0,
                "premount_cmds": null,
                "postmount_cmds": null
            },
            {
                "source": "shm",
                "destination": "/dev/shm",
                "device": "tmpfs",
                "flags": 14,
                "propagation_flags": null,
                "data": "mode=1777,size=65536k",
                "relabel": "",
                "extensions": 0,
                "premount_cmds": null,
                "postmount_cmds": null
            },
            {
                "source": "mqueue",
                "destination": "/dev/mqueue",
                "device": "mqueue",
                "flags": 14,
                "propagation_flags": null,
                "data": "",
                "relabel": "",
                "extensions": 0,
                "premount_cmds": null,
                "postmount_cmds": null
            },
            {
                "source": "sysfs",
                "destination": "/sys",
                "device": "sysfs",
                "flags": 15,
                "propagation_flags": null,
                "data": "",
                "relabel": "",
                "extensions": 0,
                "premount_cmds": null,
                "postmount_cmds": null
            },
            {
                "source": "cgroup",
                "destination": "/sys/fs/cgroup",
                "device": "cgroup",
                "flags": 2097167,
                "propagation_flags": null,
                "data": "",
                "relabel": "",
                "extensions": 0,
                "premount_cmds": null,
                "postmount_cmds": null
            }
        ],
        "devices": [
            {
                "type": 99,
                "path": "/dev/null",
                "major": 1,
                "minor": 3,
                "permissions": "",
                "file_mode": 438,
                "uid": 0,
                "gid": 0,
                "allow": false
            },
            {
                "type": 99,
                "path": "/dev/random",
                "major": 1,
                "minor": 8,
                "permissions": "",
                "file_mode": 438,
                "uid": 0,
                "gid": 0,
                "allow": false
            },
            {
                "type": 99,
                "path": "/dev/full",
                "major": 1,
                "minor": 7,
                "permissions": "",
                "file_mode": 438,
                "uid": 0,
                "gid": 0,
                "allow": false
            },
            {
                "type": 99,
                "path": "/dev/tty",
                "major": 5,
                "minor": 0,
                "permissions": "",
                "file_mode": 438,
                "uid": 0,
                "gid": 0,
                "allow": false
            },
            {
                "type": 99,
                "path": "/dev/zero",
                "major": 1,
                "minor": 5,
                "permissions": "",
                "file_mode": 438,
                "uid": 0,
                "gid": 0,
                "allow": false
            },
            {
                "type": 99,
                "path": "/dev/urandom",
                "major": 1,
                "minor": 9,
                "permissions": "",
                "file_mode": 438,
                "uid": 0,
                "gid": 0,
                "allow": false
            }
        ],
        "mount_label": "",
        "hostname": "runc",
        "namespaces": [
            {
                "type": "NEWPID",
                "path": ""
            },
            {
                "type": "NEWNET",
                "path": ""
            },
            {
                "type": "NEWIPC",
                "path": ""
            },
            {
                "type": "NEWUTS",
                "path": ""
            },
            {
                "type": "NEWNS",
                "path": ""
            }
        ],
        "capabilities": {
            "Bounding": [
                "CAP_AUDIT_WRITE",
                "CAP_KILL",
                "CAP_NET_BIND_SERVICE"
            ],
            "Effective": [
                "CAP_AUDIT_WRITE",
                "CAP_KILL",
                "CAP_NET_BIND_SERVICE"
            ],
            "Inheritable": [
                "CAP_AUDIT_WRITE",
                "CAP_KILL",
                "CAP_NET_BIND_SERVICE"
            ],
            "Permitted": [
                "CAP_AUDIT_WRITE",
                "CAP_KILL",
                "CAP_NET_BIND_SERVICE"
            ],
            "Ambient": [
                "CAP_AUDIT_WRITE",
                "CAP_KILL",
                "CAP_NET_BIND_SERVICE"
            ]
        },
        "networks": [
            {
                "type": "loopback",
                "name": "",
                "bridge": "",
                "mac_address": "",
                "address": "",
                "gateway": "",
                "ipv6_address": "",
                "ipv6_gateway": "",
                "mtu": 0,
                "txqueuelen": 0,
                "host_interface_name": "",
                "hairpin_mode": false
            }
        ],
        "routes": null,
        "cgroups": {
            "name": "container-bbbb",
            "path": "",
            "scope_prefix": "",
            "Paths": null,
            "allowed_devices": [
                {
                    "type": 99,
                    "path": "",
                    "major": -1,
                    "minor": -1,
                    "permissions": "m",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 98,
                    "path": "",
                    "major": -1,
                    "minor": -1,
                    "permissions": "m",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/null",
                    "major": 1,
                    "minor": 3,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/random",
                    "major": 1,
                    "minor": 8,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/full",
                    "major": 1,
                    "minor": 7,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/tty",
                    "major": 5,
                    "minor": 0,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/zero",
                    "major": 1,
                    "minor": 5,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/urandom",
                    "major": 1,
                    "minor": 9,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/console",
                    "major": 5,
                    "minor": 1,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "",
                    "major": 136,
                    "minor": -1,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "",
                    "major": 5,
                    "minor": 2,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "",
                    "major": 10,
                    "minor": 200,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                }
            ],
            "devices": [
                {
                    "type": 97,
                    "path": "",
                    "major": -1,
                    "minor": -1,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": false
                },
                {
                    "type": 99,
                    "path": "",
                    "major": -1,
                    "minor": -1,
                    "permissions": "m",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 98,
                    "path": "",
                    "major": -1,
                    "minor": -1,
                    "permissions": "m",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/null",
                    "major": 1,
                    "minor": 3,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/random",
                    "major": 1,
                    "minor": 8,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/full",
                    "major": 1,
                    "minor": 7,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/tty",
                    "major": 5,
                    "minor": 0,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/zero",
                    "major": 1,
                    "minor": 5,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/urandom",
                    "major": 1,
                    "minor": 9,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "/dev/console",
                    "major": 5,
                    "minor": 1,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "",
                    "major": 136,
                    "minor": -1,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "",
                    "major": 5,
                    "minor": 2,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                },
                {
                    "type": 99,
                    "path": "",
                    "major": 10,
                    "minor": 200,
                    "permissions": "rwm",
                    "file_mode": 0,
                    "uid": 0,
                    "gid": 0,
                    "allow": true
                }
            ],
            "memory": 0,
            "memory_reservation": 0,
            "memory_swap": 0,
            "kernel_memory": 0,
            "kernel_memory_tcp": 0,
            "cpu_shares": 0,
            "cpu_quota": 0,
            "cpu_period": 0,
            "cpu_rt_quota": 0,
            "cpu_rt_period": 0,
            "cpuset_cpus": "",
            "cpuset_mems": "",
            "pids_limit": 0,
            "blkio_weight": 0,
            "blkio_leaf_weight": 0,
            "blkio_weight_device": null,
            "blkio_throttle_read_bps_device": null,
            "blkio_throttle_write_bps_device": null,
            "blkio_throttle_read_iops_device": null,
            "blkio_throttle_write_iops_device": null,
            "freezer": "",
            "hugetlb_limit": null,
            "oom_kill_disable": false,
            "memory_swappiness": null,
            "net_prio_ifpriomap": null,
            "net_cls_classid_u": 0
        },
        "oom_score_adj": 0,
        "uid_mappings": null,
        "gid_mappings": null,
        "mask_paths": [
            "/proc/kcore",
            "/proc/latency_stats",
            "/proc/timer_list",
            "/proc/timer_stats",
            "/proc/sched_debug",
            "/sys/firmware"
        ],
        "readonly_paths": [
            "/proc/asound",
            "/proc/bus",
            "/proc/fs",
            "/proc/irq",
            "/proc/sys",
            "/proc/sysrq-trigger"
        ],
        "sysctl": null,
        "seccomp": null,
        "Hooks": {
            "poststart": null,
            "poststop": null,
            "prestart": null
        },
        "version": "1.0.0",
        "labels": [
            "bundle=/home/lin/project/src/github.com/opencontainers/runc/mycontainers"
        ],
        "no_new_keyring": false,
        "rootless": false
    },
    "rootless": false,
    "cgroup_paths": {
        "blkio": "/sys/fs/cgroup/blkio/user.slice/container-bbbb",
        "cpu": "/sys/fs/cgroup/cpu,cpuacct/user.slice/container-bbbb",
        "cpuacct": "/sys/fs/cgroup/cpu,cpuacct/user.slice/container-bbbb",
        "cpuset": "/sys/fs/cgroup/cpuset/container-bbbb",
        "devices": "/sys/fs/cgroup/devices/user.slice/container-bbbb",
        "freezer": "/sys/fs/cgroup/freezer/container-bbbb",
        "hugetlb": "/sys/fs/cgroup/hugetlb/container-bbbb",
        "memory": "/sys/fs/cgroup/memory/user.slice/container-bbbb",
        "name=systemd": "/sys/fs/cgroup/systemd/user.slice/user-1000.slice/session-c2.scope/container-bbbb",
        "net_cls": "/sys/fs/cgroup/net_cls,net_prio/container-bbbb",
        "net_prio": "/sys/fs/cgroup/net_cls,net_prio/container-bbbb",
        "perf_event": "/sys/fs/cgroup/perf_event/container-bbbb",
        "pids": "/sys/fs/cgroup/pids/user.slice/user-1000.slice/container-bbbb"
    },
    "namespace_paths": {
        "NEWIPC": "/proc/3193/ns/ipc",
        "NEWNET": "/proc/3193/ns/net",
        "NEWNS": "/proc/3193/ns/mnt",
        "NEWPID": "/proc/3193/ns/pid",
        "NEWUSER": "/proc/3193/ns/user",
        "NEWUTS": "/proc/3193/ns/uts"
    },
    "external_descriptors": [
        "/dev/null",
        "/dev/null",
        "/dev/null"
    ]
}

 

runc start -h

   runc start

 

runc start 命令分析

   路径 start.go,命令行内容,主要是 Action 定义的,如果只是创建的话状态为 created,则执行 container.Exec()

// Action implements `runc start`: it requires exactly one argument, loads the
// container, and calls Exec on it only when the container is in the Created
// state. Every other state is reported as an error.
Action: func(context *cli.Context) error {
	if err := checkArgs(context, 1, exactArgs); err != nil {
		return err
	}

	ctr, err := getContainer(context)
	if err != nil {
		return err
	}

	state, err := ctr.Status()
	if err != nil {
		return err
	}

	switch state {
	case libcontainer.Created:
		return ctr.Exec()
	case libcontainer.Stopped:
		return errors.New("cannot start a container that has stopped")
	case libcontainer.Running:
		return errors.New("cannot start an already running container")
	default:
		return fmt.Errorf("cannot start a container in the %s state\n", state)
	}
},

 

 

   路径 libcontainer/container_linux.go,主要读取 /var/run/runc/${container-id}/exec.fifo 的内容,读取成功后删除该文件,同时恢复在 Create 阶段被阻塞的初始化进程。

func (c *linuxContainer) Exec() error { c.m.Lock() defer c.m.Unlock() return c.exec() } func (c *linuxContainer) exec() error { path := filepath.Join(c.root, execFifoFilename) f, err := os.OpenFile(path, os.O_RDONLY, 0) if err != nil { return newSystemErrorWithCause(err, "open exec fifo for reading") } defer f.Close() data, err := ioutil.ReadAll(f) if err != nil { return err } if len(data) > 0 { os.Remove(path) return nil  } return fmt.Errorf("cannot start an already running container") }

 

 

 

你可能感兴趣的:(Docker)