COMMANDS:
checkpoint checkpoint a running container
create create a container
delete delete any resources held by the container often used with detached container
events display container events such as OOM notifications, cpu, memory, and IO usage statistics
exec execute new process inside the container
init initialize the namespaces and launch the process (do not call it outside of runc)
kill kill sends the specified signal (default: SIGTERM) to the container's init process
list lists containers started by runc with the given root
pause pause suspends all processes inside the container
ps ps displays the processes running inside a container
restore restore a container from a previous checkpoint
resume resumes all processes that have been previously paused
run create and run a container
spec create a new specification file
start executes the user defined process in a created container
state output the state of a container
update update container resource constraints
help, h Shows a list of commands or help for one command
GLOBAL OPTIONS:
--debug enable debug output for logging
--log value set the log file path where internal debug information is written (default: "/dev/null")
--log-format value set the format used by logs ('text' (default), or 'json') (default: "text")
--root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc")
--criu value path to the criu binary used for checkpoint and restore (default: "criu")
--systemd-cgroup enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234"
--help, -h show help
--version, -v print the version
config.json
: 基本配置文件,包括与宿主机独立的和应用相关的特定信息,如安全权限、环境变量和参数等。具体如下:
state.json
: 运行时配置文件(运行时主机相关的信息,如内存限制、设备访问权限、挂载点等)
rootfs
:根文件系统目录,容器执行的环境依赖,如 /bin、/var、/lib、/dev、/usr 等目录及相应文件
路径 opencontainers/runc/create.go
var createCommand = cli.Command{
Name: "create",
Usage: "create a container",
ArgsUsage: `
Where "" is your name for the instance of the container that you
are starting. The name you provide for the container instance must be unique on
your host.`,
Description: `The create command creates an instance of a container for a bundle. The bundle
is a directory with a specification file named "` + specConfig + `" and a root
filesystem.
The specification file includes an args parameter. The args parameter is used
to specify command(s) that get run when the container is started. To change the
command(s) that get executed on start, edit the args parameter of the spec. See
"runc spec --help" for more explanation.`,
NAME:
docker-runc create - create a container
USAGE:
docker-runc create [command options]
Flags: []cli.Flag{
cli.StringFlag{
Name: "bundle, b",
Value: "",
Usage: `path to the root of the bundle directory, defaults to the current directory`,
},
cli.StringFlag{
Name: "console-socket",
Value: "",
Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
},
cli.StringFlag{
Name: "pid-file",
Value: "",
Usage: "specify the file to write the process id to",
},
cli.BoolFlag{
Name: "no-pivot",
Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk",
},
cli.BoolFlag{
Name: "no-new-keyring",
Usage: "do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key",
},
cli.IntFlag{
Name: "preserve-fds",
Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
},
},
Action: func(context *cli.Context) error {
if err := checkArgs(context, 1, exactArgs); err != nil {
return err
}
if err := revisePidFile(context); err != nil {
return err
}
spec, err := setupSpec(context)
if err != nil {
return err
}
status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
if err != nil {
return err
}
// exit with the container's exit status so any external supervisor is
// notified of the exit with the correct exit status.
os.Exit(status)
return nil
},
1.3.1 startContainer函数
startContainer 函数中参数 action CT_ACT_CREATE 为创建,后面会用到。获得启动命令行的容器 id,createContainer 函数第2章节讲解,runner 结构体以及 run 函数内容多一些第三章节进行讲解
// startContainer creates the container named by the first CLI argument and
// drives it through a runner configured for the requested action.
//
// It returns the status produced by runner.run, or -1 together with an error
// when the ID is empty, the container cannot be created, or the notify socket
// cannot be set up.
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
	containerID := context.Args().First()
	if containerID == "" {
		return -1, errEmptyID
	}

	// The spec has to be adjusted for the notify socket before the container
	// is created from it.
	ns := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), containerID)
	if ns != nil {
		ns.setupSpec(context, spec)
	}

	ctr, err := createContainer(context, containerID, spec)
	if err != nil {
		return -1, err
	}

	if ns != nil {
		if err := ns.setupSocket(); err != nil {
			return -1, err
		}
	}

	// Socket activation: when LISTEN_FDS is set, pass the inherited file
	// descriptors on to the container init process.
	fds := []*os.File{}
	if os.Getenv("LISTEN_FDS") != "" {
		fds = activation.Files(false)
	}

	rn := &runner{
		enableSubreaper: !context.Bool("no-subreaper"),
		shouldDestroy:   true,
		container:       ctr,
		listenFDs:       fds,
		notifySocket:    ns,
		consoleSocket:   context.String("console-socket"),
		detach:          context.Bool("detach"),
		pidFile:         context.String("pid-file"),
		preserveFDs:     context.Int("preserve-fds"),
		action:          action,
		criuOpts:        criuOpts,
		init:            true,
	}
	return rn.run(spec.Process)
}
CreateLibcontainerConfig 根据 spec 生成创建容器所需的配置,例如 bundle 路径、namespace、capabilities、标签等;loadFactory 在 2.2 讲解,factory.Create 函数在 2.3 讲解
// createContainer converts the OCI spec into a libcontainer configuration and
// asks the factory returned by loadFactory to create the container with the
// given id.
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
	useRootlessCgroups, err := shouldUseRootlessCgroupManager(context)
	if err != nil {
		return nil, err
	}

	opts := &specconv.CreateOpts{
		CgroupName:       id,
		UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
		NoPivotRoot:      context.Bool("no-pivot"),
		NoNewKeyring:     context.Bool("no-new-keyring"),
		Spec:             spec,
		// A non-zero effective UID means we are running rootless.
		RootlessEUID:    os.Geteuid() != 0,
		RootlessCgroups: useRootlessCgroups,
	}
	cfg, err := specconv.CreateLibcontainerConfig(opts)
	if err != nil {
		return nil, err
	}

	f, err := loadFactory(context)
	if err != nil {
		return nil, err
	}
	return f.Create(id, cfg)
}
路径libcontainer/specconv/spec_linux.go
2.1.1 创建Config配置,主要为在容器环境运行进程
config := &configs.Config{
Rootfs: rootfsPath,
NoPivotRoot: opts.NoPivotRoot,
Readonlyfs: spec.Root.Readonly,
Hostname: spec.Hostname,
Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)),
NoNewKeyring: opts.NoNewKeyring,
RootlessEUID: opts.RootlessEUID,
RootlessCgroups: opts.RootlessCgroups,
}
exists := false
for _, m := range spec.Mounts {
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
}
2.1.2 createDevices创建device配置
可以在容器中看到/dev
# ls
core full null pts shm stdin termination-log urandom
fd mqueue ptmx random stderr stdout tty zero
func createDevices(spec *specs.Spec, config *configs.Config) error {
// add whitelisted devices
config.Devices = []*configs.Device{
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
。。。。。。
}
// merge in additional devices from the spec
if spec.Linux != nil {
。。。。。
config.Devices = append(config.Devices, device)
}
}
return nil
}
2.1.3 createCgroupConfig函数
配置cgroup,包括memory cpu blockio network
func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
。。。
// In rootless containers, any attempt to make cgroup changes is likely to fail.
// libcontainer will validate this but ignores the error.
c.Resources.AllowedDevices = allowedDevices
if spec.Linux != nil {
if r.Memory != nil {
if r.Memory.Limit != nil {
c.Resources.Memory = *r.Memory.Limit
}
if r.Memory.Reservation != nil {
c.Resources.MemoryReservation = *r.Memory.Reservation
}
if r.Memory.Swap != nil {
c.Resources.MemorySwap = *r.Memory.Swap
}
if r.Memory.Kernel != nil {
c.Resources.KernelMemory = *r.Memory.Kernel
}
if r.Memory.KernelTCP != nil {
c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
}
if r.Memory.Swappiness != nil {
c.Resources.MemorySwappiness = r.Memory.Swappiness
}
if r.Memory.DisableOOMKiller != nil {
c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
}
}
。。。
if r.Network != nil {
if r.Network.ClassID != nil {
c.Resources.NetClsClassid = *r.Network.ClassID
}
for _, m := range r.Network.Priorities {
c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
Interface: m.Name,
Priority: int64(m.Priority),
})
}
}
}
// append the default allowed devices to the end of the list
c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
return c, nil
}
// loadFactory returns the configured factory instance for execing containers.
func loadFactory(context *cli.Context) (libcontainer.Factory, error)
2.2.1 cgroupManager 指向 Cgroupfs 函数
配置 cgroup Manager 接口指向 fs 结构体 Manager 实现了其接口,cgroupManager := libcontainer.Cgroupfs
// Cgroupfs is a LinuxFactory option func. It makes the factory hand out
// cgroup managers backed by the native cgroups filesystem implementation
// (fs.Manager). It always returns nil.
func Cgroupfs(l *LinuxFactory) error {
	newManager := func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
		mgr := &fs.Manager{
			Cgroups: config,
			Paths:   paths,
		}
		return mgr
	}
	l.NewCgroupsManager = newManager
	return nil
}
2.2.2 实例化libcontainer
libcontainer.New(abs, cgroupManager, intelRdtManager,
libcontainer.CriuPath(context.GlobalString("criu")),
libcontainer.NewuidmapPath(newuidmap),
libcontainer.NewgidmapPath(newgidmap))
libcontainer.New 返回一个 linux 系统实现的结构体,根据传入的参数配置结构体中的 NewCgroupsManager 和 CriuPath
// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
//
// When root is non-empty it is created with mode 0700 if it does not exist.
// The factory starts out with the cgroupfs manager installed; the option
// funcs are applied afterwards, in order, so they may replace any default.
// Nil options are skipped. The first option that fails aborts construction
// and its error is returned.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
	if root != "" {
		if err := os.MkdirAll(root, 0700); err != nil {
			return nil, newGenericError(err, SystemError)
		}
	}
	l := &LinuxFactory{
		Root:      root,
		InitPath:  "/proc/self/exe",
		InitArgs:  []string{os.Args[0], "init"},
		Validator: validate.New(),
		CriuPath:  "criu",
	}
	// Install the default cgroup manager. Cgroupfs is an option func like any
	// other, so its error is checked rather than silently dropped.
	if err := Cgroupfs(l); err != nil {
		return nil, err
	}
	for _, opt := range options {
		if opt == nil {
			continue
		}
		if err := opt(l); err != nil {
			return nil, err
		}
	}
	return l, nil
}
Create 做的事情比较简单,对配置进行一些验证工作,验证成功创建容器的根路径并设置权限
// Create validates id and config, creates the per-container directory under
// the factory root and returns a container object in the stopped state.
//
// Errors:
//   - ConfigInvalid when the factory has no root or the config fails validation
//   - IdInUse when a directory for id already exists
//   - SystemError for filesystem failures
//
// Errors from validateID and SecureJoin are returned unchanged.
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
	if l.Root == "" {
		return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
	}
	if err := l.validateID(id); err != nil {
		return nil, err
	}
	if err := l.Validator.Validate(config); err != nil {
		return nil, newGenericError(err, ConfigInvalid)
	}

	stateDir, err := securejoin.SecureJoin(l.Root, id)
	if err != nil {
		return nil, err
	}

	// The directory must not exist yet; any stat failure other than
	// "not exist" is reported as a system error.
	switch _, statErr := os.Stat(stateDir); {
	case statErr == nil:
		return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
	case !os.IsNotExist(statErr):
		return nil, newGenericError(statErr, SystemError)
	}

	if err := os.MkdirAll(stateDir, 0711); err != nil {
		return nil, newGenericError(err, SystemError)
	}
	if err := os.Chown(stateDir, unix.Geteuid(), unix.Getegid()); err != nil {
		return nil, newGenericError(err, SystemError)
	}

	ctr := &linuxContainer{
		id:            id,
		root:          stateDir,
		config:        config,
		initPath:      l.InitPath,
		initArgs:      l.InitArgs,
		criuPath:      l.CriuPath,
		newuidmapPath: l.NewuidmapPath,
		newgidmapPath: l.NewgidmapPath,
		cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
	}
	// An Intel RDT manager is attached only when CAT or MBA is enabled.
	if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
		ctr.intelRdtManager = l.NewIntelRdtManager(config, id, "")
	}
	ctr.state = &stoppedState{c: ctr}
	return ctr, nil
}
2.3.1 成功会返回 linuxContainer 结构体,包括容器 ID、根路径、参数等,并将状态置为 stopped
// linuxContainer is the container object built by LinuxFactory.Create.
// NOTE(review): Create in this document also assigns initPath, newuidmapPath,
// newgidmapPath and intelRdtManager, which are not declared in this excerpt —
// the struct shown here appears to come from a different revision; confirm.
type linuxContainer struct {
// id is the container ID passed to LinuxFactory.Create.
id string
// root is the per-container directory; Create derives it by joining the
// factory Root with id.
root string
// config is the configuration passed to LinuxFactory.Create.
config *configs.Config
// cgroupManager is produced by the factory's NewCgroupsManager in Create and
// handed to the init process in newInitProcess.
cgroupManager cgroups.Manager
// initArgs is copied from LinuxFactory.InitArgs in Create.
initArgs []string
// initProcess is set by newInitProcess to the process it builds.
initProcess parentProcess
// NOTE(review): presumably the start time of the init process; it is not
// written anywhere in this excerpt — confirm.
initProcessStartTime string
// criuPath is copied from LinuxFactory.CriuPath in Create.
criuPath string
// m is held by Start and Exec for the duration of the call.
m sync.Mutex
// NOTE(review): presumably the detected CRIU version; not set in this excerpt.
criuVersion int
// state is initialised to a stoppedState by Create.
state containerState
// NOTE(review): presumably the container creation time; not set in this excerpt.
created time.Time
}
2.1 Factory对象为容器创建和初始化工作提供了一组抽象接口
// Factory abstracts the creation and initialisation of containers.
// LinuxFactory is the implementation returned by New.
type Factory interface {
// Create builds a new container for id from config. The Linux
// implementation validates both, creates the container's directory under
// the factory root and returns the container in the stopped state.
Create(id string, config *configs.Config) (Container, error)
// Load returns the container identified by id.
// NOTE(review): the surrounding text says it reads state.json from the
// container's directory; the implementation is not shown here — confirm.
Load(id string) (Container, error)
// NOTE(review): presumably the entry point executed by `runc init` inside the
// new process; the implementation is not shown here — confirm.
StartInitialization() error
// NOTE(review): presumably returns an identifier for the factory kind;
// the implementation is not shown here — confirm.
Type() string
}
Create:使用 id 和配置参数创建容器,返回一个运行的进程。
Load:从容器 id 目录下读取 state.json 来载入容器。
2.2 Linux 系统 factory 对象的结构体,实现了 Factory 接口
// LinuxFactory is the Linux implementation of Factory; New constructs it and
// applies option funcs such as Cgroupfs to it.
// NOTE(review): New and Create in this document also use InitPath,
// NewuidmapPath, NewgidmapPath and NewIntelRdtManager, which are not declared
// in this excerpt — confirm against the full source.
type LinuxFactory struct {
// Root directory for the factory to store state.
Root string
// InitArgs are arguments for calling the init responsibilities for spawning
// a container.
InitArgs []string
// CriuPath is the path to the criu binary used for checkpoint and restore of
// containers.
CriuPath string
// Validator provides validation to container configurations.
Validator validate.Validator
// NewCgroupsManager returns an initialized cgroups manager for a single container.
NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
}
// runner bundles the settings needed to drive one container action.
// startContainer populates it from the CLI context and then calls run.
type runner struct {
// init is set to true by startContainer and is passed to newProcess.
// NOTE(review): presumably marks the process as the container's init process.
init bool
// enableSubreaper is the negation of the "no-subreaper" flag; it is passed to
// newSignalHandler.
enableSubreaper bool
// shouldDestroy is set to true by startContainer.
// NOTE(review): presumably controls whether the container is destroyed on
// failure; its use is not shown here.
shouldDestroy bool
// detach holds the value of the "detach" flag.
detach bool
// listenFDs are the socket-activation files collected when LISTEN_FDS is set;
// they are appended to the process's ExtraFiles.
listenFDs []*os.File
// preserveFDs is the "preserve-fds" flag: the number of additional file
// descriptors to pass to the container.
preserveFDs int
// pidFile holds the value of the "pid-file" flag.
pidFile string
// consoleSocket holds the value of the "console-socket" flag; it is passed to
// setupIO.
consoleSocket string
// container is the container the action is applied to.
container libcontainer.Container
// action selects between Start (CT_ACT_CREATE), Restore (CT_ACT_RESTORE)
// and Run (CT_ACT_RUN).
action CtAct
// notifySocket is nil when newNotifySocket returned nil.
notifySocket *notifySocket
// criuOpts is forwarded to Restore when action is CT_ACT_RESTORE.
criuOpts *libcontainer.CriuOpts
}
// Process contains information to start a specific application inside the container.
// newProcess copies these fields into a libcontainer.Process.
type Process struct {
// Terminal creates an interactive terminal for the container.
Terminal bool `json:"terminal,omitempty"`
// ConsoleSize specifies the size of the console.
ConsoleSize *Box `json:"consoleSize,omitempty"`
// User specifies user information for the process.
User User `json:"user"`
// Args specifies the binary and arguments for the application to execute.
Args []string `json:"args"`
// Env populates the process environment for the process.
Env []string `json:"env,omitempty"`
// Cwd is the current working directory for the process and must be
// relative to the container's root.
Cwd string `json:"cwd"`
// Capabilities are Linux capabilities that are kept for the process.
Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"`
// Rlimits specifies rlimit options to apply to the process.
Rlimits []POSIXRlimit `json:"rlimits,omitempty" platform:"linux,solaris"`
// NoNewPrivileges controls whether additional privileges could be gained by processes in the container.
NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"`
// ApparmorProfile specifies the apparmor profile for the container.
ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"`
// Specify an oom_score_adj for the container.
OOMScoreAdj *int `json:"oomScoreAdj,omitempty" platform:"linux"`
// SelinuxLabel specifies the selinux context that the container process is run as.
SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
}
func (r *runner) run(config *specs.Process) (int, error)
3.1.1 newProcess 主要是填充 libcontainer.Process 结构体
包括参数、环境变量、user 权限、工作目录、capabilities、资源限制等
// newProcess translates an OCI spec process description into a libcontainer
// Process. The init flag is stored as-is on the result.
//
// It fails only when one of the spec's rlimits cannot be converted.
func newProcess(p specs.Process, init bool) (*libcontainer.Process, error) {
	proc := &libcontainer.Process{
		Args: p.Args,
		Env:  p.Env,
		// TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
		User:            fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
		Cwd:             p.Cwd,
		Label:           p.SelinuxLabel,
		NoNewPrivileges: &p.NoNewPrivileges,
		AppArmorProfile: p.ApparmorProfile,
		Init:            init,
	}

	if size := p.ConsoleSize; size != nil {
		proc.ConsoleWidth = uint16(size.Width)
		proc.ConsoleHeight = uint16(size.Height)
	}

	if caps := p.Capabilities; caps != nil {
		proc.Capabilities = &configs.Capabilities{
			Bounding:    caps.Bounding,
			Effective:   caps.Effective,
			Inheritable: caps.Inheritable,
			Permitted:   caps.Permitted,
			Ambient:     caps.Ambient,
		}
	}

	// Supplementary groups are carried as decimal strings.
	for i := range p.User.AdditionalGids {
		group := strconv.FormatUint(uint64(p.User.AdditionalGids[i]), 10)
		proc.AdditionalGroups = append(proc.AdditionalGroups, group)
	}

	for i := range p.Rlimits {
		limit, err := createLibContainerRlimit(p.Rlimits[i])
		if err != nil {
			return nil, err
		}
		proc.Rlimits = append(proc.Rlimits, limit)
	}
	return proc, nil
}
3.1.2 将 listen fd 加入 process 的环境变量,以及需要在新进程中保持打开的文件列表(ExtraFiles)中
if len(r.listenFDs) > 0 {
process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
}
baseFd := 3 + len(process.ExtraFiles)
for i := baseFd; i < baseFd+r.preserveFDs; i++ {
process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
}
3.1.3 创建 signalHandler 处理 tty 和 signal
setupIO 用来进行 io 和 tty 相关配置;对于 create,就是 dup 当前进程的 io,并 chown 为对应的用户/组权限
// Setting up IO is a two stage process. We need to modify process to deal
// with detaching containers, and then we get a tty after the container has
// started.
handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
if err != nil {
r.destroy()
return -1, err
}
defer tty.Close()
3.1.4 根据 action 为 create
第四章继续分析 r.container.Start 函数
switch r.action {
case CT_ACT_CREATE:
err = r.container.Start(process)
case CT_ACT_RESTORE:
err = r.container.Restore(process, r.criuOpts)
case CT_ACT_RUN:
err = r.container.Run(process)
default:
panic("Unknown action")
}
if err != nil {
r.destroy()
return -1, err
}
3.1.4.1 BaseContainer接口
// BaseContainer is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found. BaseContainer includes methods that are platform agnostic.
//
// NOTE: this excerpt lists only Start; a fuller method set for the same
// interface appears further down in this document.
type BaseContainer interface {
// Start a process inside the container. Returns error if process fails to
// start. You can track process lifecycle with passed Process structure.
//
// errors:
// ContainerNotExists - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// SystemError - System error.
Start(process *Process) (err error)
}
Container对象主要包含了容器配置、控制、状态显示等功能,每一个 Container 进程内部都是线程安全的。由于 Container 可能被其他进程销毁,所以每个方法都会对容器是否存在进行检测。
路径: libcontainer/container_linux.go
// Container extends BaseContainer with the operations listed below.
// linuxContainer is the implementation described in this document.
type Container interface {
BaseContainer
// NOTE(review): presumably checkpoints the running container using the given
// CRIU options; the implementation is not shown here — confirm.
Checkpoint(criuOpts *CriuOpts) error
// Restore is what the runner calls for CT_ACT_RESTORE, passing the process
// and the CRIU options.
Restore(process *Process, criuOpts *CriuOpts) error
// NOTE(review): presumably suspends all processes in the container — confirm.
Pause() error
// NOTE(review): presumably resumes a previously paused container — confirm.
Resume() error
// NOTE(review): presumably returns a channel signalled on OOM events — confirm.
NotifyOOM() (<-chan struct{}, error)
// NOTE(review): presumably returns a channel signalled when memory pressure
// reaches level — confirm.
NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
// BaseContainer is the set of container operations embedded by Container.
// Only Start, Run and Exec are exercised by code shown in this document; the
// notes on the other methods are assumptions to be confirmed.
type BaseContainer interface {
// NOTE(review): presumably returns the container ID — confirm.
ID() string
// Status reports the container's current status; the `runc start` action
// switches on it (Created, Stopped, Running).
Status() (Status, error)
// NOTE(review): presumably returns the State that is persisted as state.json — confirm.
State() (*State, error)
// NOTE(review): presumably returns the container's configuration — confirm.
Config() configs.Config
// NOTE(review): presumably returns the PIDs inside the container — confirm.
Processes() ([]int, error)
// NOTE(review): presumably returns resource usage statistics — confirm.
Stats() (*Stats, error)
// NOTE(review): presumably applies an updated configuration — confirm.
Set(config configs.Config) error
// Start is what the runner calls for CT_ACT_CREATE.
Start(process *Process) (err error)
// Run is what the runner calls for CT_ACT_RUN.
Run(process *Process) (err error)
// NOTE(review): presumably releases the container's resources — confirm.
Destroy() error
// NOTE(review): presumably sends s to the init process, or to every process
// when all is true — confirm.
Signal(s os.Signal, all bool) error
// Exec is what `runc start` calls for a container in the Created status; the
// Linux implementation reads from the container's exec fifo.
Exec() error
}
表示一个运行中的容器状态信息:
// State represents a running container's state
// A serialized example is the state.json sample shown later in this document
// (/var/run/runc/${container-id}/state.json).
type State struct {
BaseState
// Platform specific fields below here
// Specifies if the container was started under the rootless mode.
Rootless bool `json:"rootless"`
// Path to all the cgroups setup for a container. Key is cgroup subsystem name
// with the value as the path.
CgroupPaths map[string]string `json:"cgroup_paths"`
// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
// with the value as the path.
NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
ExternalDescriptors []string `json:"external_descriptors,omitempty"`
}
// BaseState represents the platform agnostic pieces relating to a
// running container's state
// It is embedded in State.
type BaseState struct {
// ID is the container ID.
ID string `json:"id"`
// InitProcessPid is the init process id in the parent namespace.
InitProcessPid int `json:"init_process_pid"`
// InitProcessStartTime is the init process start time in clock cycles since boot time.
// NOTE(review): linuxContainer in this document declares
// initProcessStartTime as a string while this field is uint64 — the two
// excerpts may come from different revisions; confirm.
InitProcessStartTime uint64 `json:"init_process_start"`
// Created is the unix timestamp for the creation time of the container in UTC
Created time.Time `json:"created"`
// Config is the container's configuration.
Config configs.Config `json:"config"`
}
linuxContainer实现了Container接口
根据 create 流程向前将 status 置为 stopped 状态,传入 start 函数第二个参数为 true 将状态设置为 created,主要 4.2 分析 start 函数
// Start launches process in the container while holding the container mutex.
//
// For an init process the exec fifo is created first; if starting then
// fails, the fifo is removed again before the error is returned.
func (c *linuxContainer) Start(process *Process) error {
	c.m.Lock()
	defer c.m.Unlock()

	if process.Init {
		if fifoErr := c.createExecFifo(); fifoErr != nil {
			return fifoErr
		}
	}

	startErr := c.start(process)
	if startErr == nil {
		return nil
	}
	// Roll back the fifo so a failed start leaves nothing behind.
	if process.Init {
		c.deleteExecFifo()
	}
	return startErr
}
func (c *linuxContainer) start(process *Process, isInit bool) error
4.2.1 newParentProcess
io.Reader
// newParentProcess prepares the parent-side handle for the process p.
//
// It opens the "init" socket pair, builds the command template around the
// child end, and then returns a setns process for a non-init process or an
// init process (with the exec fifo attached to the command) otherwise.
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
	parentSock, childSock, err := utils.NewSockPair("init")
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new init pipe")
	}

	initCmd, err := c.commandTemplate(p, childSock)
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new command template")
	}

	if !p.Init {
		return c.newSetnsProcess(p, initCmd, parentSock, childSock)
	}

	// The exec fifo is only wired up for the init process, never for
	// `runc exec`. Historically a dirfd was passed here, which allowed a
	// container rootfs escape; that is no longer done, but `runc exec` has
	// no need for the fifo, so it stays excluded to be safe.
	if err := c.includeExecFifo(initCmd); err != nil {
		return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
	}
	return c.newInitProcess(p, initCmd, parentSock, childSock)
}
4.2.1.1newInitProcess函数
添加初始化类型环境变量,将namespace、uid/gid 映射等信息使用 bootstrapData 函数封装为一个 io.Reader,使用的是 netlink 用于内核间的通信,返回 initProcess 结构体
// newInitProcess assembles the initProcess for the container's init process
// and records it on the container.
//
// It tags the command with the standard init type, gathers the namespaces
// that have an explicit path, and builds the bootstrap data from the
// namespace clone flags and those paths.
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))

	// Only namespaces with a path set are recorded here.
	nsPaths := map[configs.NamespaceType]string{}
	for _, namespace := range c.config.Namespaces {
		if namespace.Path == "" {
			continue
		}
		nsPaths[namespace.Type] = namespace.Path
	}
	// The PID namespace counts as shared when a path was given for it.
	_, pidnsShared := nsPaths[configs.NEWPID]

	bootstrap, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsPaths)
	if err != nil {
		return nil, err
	}

	ip := &initProcess{
		cmd:             cmd,
		childPipe:       childPipe,
		parentPipe:      parentPipe,
		manager:         c.cgroupManager,
		intelRdtManager: c.intelRdtManager,
		config:          c.newInitConfig(p),
		container:       c,
		process:         p,
		bootstrapData:   bootstrap,
		sharePidns:      pidnsShared,
	}
	c.initProcess = ip
	return ip, nil
}
第五章节讲解,创建了新的进程。而此时新的进程使用 /proc/self/exe 为执行入口,参数为 init,会在 main 函数调用之前执行,所以在新的进程中 func init() 会直接调用,而不会去执行 main 函数
func (p *initProcess) start() error
// initProcess is the parent-side handle for a container's init process.
// newInitProcess fills it in; its start method launches cmd and then
// configures the new process.
type initProcess struct {
// cmd is the command launched by start.
cmd *exec.Cmd
// parentPipe is the parent end of the "init" socket pair; start closes it on
// return.
parentPipe *os.File
// childPipe is the child end of the socket pair; start closes it right after
// cmd has been started.
childPipe *os.File
// config is the value returned by the container's newInitConfig.
config *initConfig
// manager is the container's cgroup manager; start calls Apply on it with the
// process PID.
manager cgroups.Manager
// intelRdtManager is copied from the container.
intelRdtManager intelrdt.Manager
// container is the owning container.
container *linuxContainer
// NOTE(review): presumably paths of the process's file descriptors; not set in
// this excerpt — confirm.
fds []string
// process is the libcontainer Process being started; start points its ops
// field at this initProcess.
process *Process
// bootstrapData is the reader produced by the container's bootstrapData.
bootstrapData io.Reader
// sharePidns is true when the configuration gives a path for the PID namespace.
sharePidns bool
}
func (p *initProcess) start() error
5.1.1 中 cmd 如最后命令所示,Path填充为 /proc/self/exe(本身 runC)。参数字段 Args 为 init,表示对容器进行初始化,调用的为 runc init (https://blog.csdn.net/zhonglinzhang/article/details/86502530这篇文章分析)
defer p.parentPipe.Close()
err := p.cmd.Start()
p.process.ops = p
p.childPipe.Close()
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
}
5.1.2 Apply 设置进程 cgroup 进行限额
// Do this before syncing with child so that no children can escape the
// cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
5.1.3 createNetworkInterfaces 如果没有指定网络只设置 loopback,如果指定网络还有 veth 类型。
sendConfig 发送配置到 init process
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
}
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
}
initProcess结构的start方法真正完成了容器进程的创建,并通过init管道协助其完成初始化工作。该方法首先调用p.cmd.Start()创建一个独立的进程,执行命令runc init。
创建容器标准包,使用 bundle 模块实现,将 docker 镜像转换成容器标准包
$ docker pull busybox
$ docker export $(docker create busybox) | tar -C rootfs -xvf -
创建配置文件
$ runc spec
运行容器
$ runc run busybox
create 总结:
cmd 内容:
{
Path: "/proc/self/exe",
Args: [
]string{
"/proc/self/exe",
"init"
},
Env: [
]string{
"_LIBCONTAINER_CONSOLE=3",
"_LIBCONTAINER_INITPIPE=4",
"_LIBCONTAINER_STATEDIR=5",
"_LIBCONTAINER_INITTYPE=standard"
},
Dir: "/home/lin/project/src/github.com/opencontainers/runc/mycontainers/rootfs",
Stdin: io.Reader(nil),
Stdout: io.Writer(nil),
Stderr: io.Writer(nil),
ExtraFiles: [
]*os.File{
(*os.File)(0xc42000e120),
(*os.File)(0xc42000e140),
(*os.File)(0xc42000e150)
},
SysProcAttr: (*syscall.SysProcAttr)(0xc4200981b0),
Process: (*os.Process)(nil),
ProcessState: (*os.ProcessState)(nil),
ctx: context.Context(nil),
lookPathErr: error(nil),
finished: false,
childFiles: [
]*os.File(nil),
closeAfterStart: [
]io.Closer(nil),
closeAfterWait: [
]io.Closer(nil),
goroutine: [
]func()error(nil),
errch: (chanerror)(nil),
waitDone: (chanstruct{
})(nil)
}
/var/run/runc/${container-id}/state.json
{
"id": "container-bbbb",
"init_process_pid": 3193,
"init_process_start": 7331,
"created": "2017-08-15T02:30:51.244343167Z",
"config": {
"no_pivot_root": false,
"parent_death_signal": 0,
"rootfs": "/home/lin/project/src/github.com/opencontainers/runc/mycontainers/rootfs",
"readonlyfs": true,
"rootPropagation": 278528,
"mounts": [
{
"source": "proc",
"destination": "/proc",
"device": "proc",
"flags": 0,
"propagation_flags": null,
"data": "",
"relabel": "",
"extensions": 0,
"premount_cmds": null,
"postmount_cmds": null
},
{
"source": "tmpfs",
"destination": "/dev",
"device": "tmpfs",
"flags": 16777218,
"propagation_flags": null,
"data": "mode=755,size=65536k",
"relabel": "",
"extensions": 0,
"premount_cmds": null,
"postmount_cmds": null
},
{
"source": "devpts",
"destination": "/dev/pts",
"device": "devpts",
"flags": 10,
"propagation_flags": null,
"data": "newinstance,ptmxmode=0666,mode=0620,gid=5",
"relabel": "",
"extensions": 0,
"premount_cmds": null,
"postmount_cmds": null
},
{
"source": "shm",
"destination": "/dev/shm",
"device": "tmpfs",
"flags": 14,
"propagation_flags": null,
"data": "mode=1777,size=65536k",
"relabel": "",
"extensions": 0,
"premount_cmds": null,
"postmount_cmds": null
},
{
"source": "mqueue",
"destination": "/dev/mqueue",
"device": "mqueue",
"flags": 14,
"propagation_flags": null,
"data": "",
"relabel": "",
"extensions": 0,
"premount_cmds": null,
"postmount_cmds": null
},
{
"source": "sysfs",
"destination": "/sys",
"device": "sysfs",
"flags": 15,
"propagation_flags": null,
"data": "",
"relabel": "",
"extensions": 0,
"premount_cmds": null,
"postmount_cmds": null
},
{
"source": "cgroup",
"destination": "/sys/fs/cgroup",
"device": "cgroup",
"flags": 2097167,
"propagation_flags": null,
"data": "",
"relabel": "",
"extensions": 0,
"premount_cmds": null,
"postmount_cmds": null
}
],
"devices": [
{
"type": 99,
"path": "/dev/null",
"major": 1,
"minor": 3,
"permissions": "",
"file_mode": 438,
"uid": 0,
"gid": 0,
"allow": false
},
{
"type": 99,
"path": "/dev/random",
"major": 1,
"minor": 8,
"permissions": "",
"file_mode": 438,
"uid": 0,
"gid": 0,
"allow": false
},
{
"type": 99,
"path": "/dev/full",
"major": 1,
"minor": 7,
"permissions": "",
"file_mode": 438,
"uid": 0,
"gid": 0,
"allow": false
},
{
"type": 99,
"path": "/dev/tty",
"major": 5,
"minor": 0,
"permissions": "",
"file_mode": 438,
"uid": 0,
"gid": 0,
"allow": false
},
{
"type": 99,
"path": "/dev/zero",
"major": 1,
"minor": 5,
"permissions": "",
"file_mode": 438,
"uid": 0,
"gid": 0,
"allow": false
},
{
"type": 99,
"path": "/dev/urandom",
"major": 1,
"minor": 9,
"permissions": "",
"file_mode": 438,
"uid": 0,
"gid": 0,
"allow": false
}
],
"mount_label": "",
"hostname": "runc",
"namespaces": [
{
"type": "NEWPID",
"path": ""
},
{
"type": "NEWNET",
"path": ""
},
{
"type": "NEWIPC",
"path": ""
},
{
"type": "NEWUTS",
"path": ""
},
{
"type": "NEWNS",
"path": ""
}
],
"capabilities": {
"Bounding": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"Effective": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"Inheritable": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"Permitted": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"Ambient": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
]
},
"networks": [
{
"type": "loopback",
"name": "",
"bridge": "",
"mac_address": "",
"address": "",
"gateway": "",
"ipv6_address": "",
"ipv6_gateway": "",
"mtu": 0,
"txqueuelen": 0,
"host_interface_name": "",
"hairpin_mode": false
}
],
"routes": null,
"cgroups": {
"name": "container-bbbb",
"path": "",
"scope_prefix": "",
"Paths": null,
"allowed_devices": [
{
"type": 99,
"path": "",
"major": -1,
"minor": -1,
"permissions": "m",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 98,
"path": "",
"major": -1,
"minor": -1,
"permissions": "m",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/null",
"major": 1,
"minor": 3,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/random",
"major": 1,
"minor": 8,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/full",
"major": 1,
"minor": 7,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/tty",
"major": 5,
"minor": 0,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/zero",
"major": 1,
"minor": 5,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/urandom",
"major": 1,
"minor": 9,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/console",
"major": 5,
"minor": 1,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "",
"major": 136,
"minor": -1,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "",
"major": 5,
"minor": 2,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "",
"major": 10,
"minor": 200,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
}
],
"devices": [
{
"type": 97,
"path": "",
"major": -1,
"minor": -1,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": false
},
{
"type": 99,
"path": "",
"major": -1,
"minor": -1,
"permissions": "m",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 98,
"path": "",
"major": -1,
"minor": -1,
"permissions": "m",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/null",
"major": 1,
"minor": 3,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/random",
"major": 1,
"minor": 8,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/full",
"major": 1,
"minor": 7,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/tty",
"major": 5,
"minor": 0,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/zero",
"major": 1,
"minor": 5,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/urandom",
"major": 1,
"minor": 9,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "/dev/console",
"major": 5,
"minor": 1,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "",
"major": 136,
"minor": -1,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "",
"major": 5,
"minor": 2,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
},
{
"type": 99,
"path": "",
"major": 10,
"minor": 200,
"permissions": "rwm",
"file_mode": 0,
"uid": 0,
"gid": 0,
"allow": true
}
],
"memory": 0,
"memory_reservation": 0,
"memory_swap": 0,
"kernel_memory": 0,
"kernel_memory_tcp": 0,
"cpu_shares": 0,
"cpu_quota": 0,
"cpu_period": 0,
"cpu_rt_quota": 0,
"cpu_rt_period": 0,
"cpuset_cpus": "",
"cpuset_mems": "",
"pids_limit": 0,
"blkio_weight": 0,
"blkio_leaf_weight": 0,
"blkio_weight_device": null,
"blkio_throttle_read_bps_device": null,
"blkio_throttle_write_bps_device": null,
"blkio_throttle_read_iops_device": null,
"blkio_throttle_write_iops_device": null,
"freezer": "",
"hugetlb_limit": null,
"oom_kill_disable": false,
"memory_swappiness": null,
"net_prio_ifpriomap": null,
"net_cls_classid_u": 0
},
"oom_score_adj": 0,
"uid_mappings": null,
"gid_mappings": null,
"mask_paths": [
"/proc/kcore",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/sys/firmware"
],
"readonly_paths": [
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger"
],
"sysctl": null,
"seccomp": null,
"Hooks": {
"poststart": null,
"poststop": null,
"prestart": null
},
"version": "1.0.0",
"labels": [
"bundle=/home/lin/project/src/github.com/opencontainers/runc/mycontainers"
],
"no_new_keyring": false,
"rootless": false
},
"rootless": false,
"cgroup_paths": {
"blkio": "/sys/fs/cgroup/blkio/user.slice/container-bbbb",
"cpu": "/sys/fs/cgroup/cpu,cpuacct/user.slice/container-bbbb",
"cpuacct": "/sys/fs/cgroup/cpu,cpuacct/user.slice/container-bbbb",
"cpuset": "/sys/fs/cgroup/cpuset/container-bbbb",
"devices": "/sys/fs/cgroup/devices/user.slice/container-bbbb",
"freezer": "/sys/fs/cgroup/freezer/container-bbbb",
"hugetlb": "/sys/fs/cgroup/hugetlb/container-bbbb",
"memory": "/sys/fs/cgroup/memory/user.slice/container-bbbb",
"name=systemd": "/sys/fs/cgroup/systemd/user.slice/user-1000.slice/session-c2.scope/container-bbbb",
"net_cls": "/sys/fs/cgroup/net_cls,net_prio/container-bbbb",
"net_prio": "/sys/fs/cgroup/net_cls,net_prio/container-bbbb",
"perf_event": "/sys/fs/cgroup/perf_event/container-bbbb",
"pids": "/sys/fs/cgroup/pids/user.slice/user-1000.slice/container-bbbb"
},
"namespace_paths": {
"NEWIPC": "/proc/3193/ns/ipc",
"NEWNET": "/proc/3193/ns/net",
"NEWNS": "/proc/3193/ns/mnt",
"NEWPID": "/proc/3193/ns/pid",
"NEWUSER": "/proc/3193/ns/user",
"NEWUTS": "/proc/3193/ns/uts"
},
"external_descriptors": [
"/dev/null",
"/dev/null",
"/dev/null"
]
}
runc start -h
runc start 命令分析
// Action implements `runc start`: it requires exactly one argument, loads the
// container and, when it is in the Created status, runs the user-defined
// process through Exec. Any other status is rejected with an error.
Action: func(context *cli.Context) error {
	if err := checkArgs(context, 1, exactArgs); err != nil {
		return err
	}

	ctr, err := getContainer(context)
	if err != nil {
		return err
	}

	current, err := ctr.Status()
	if err != nil {
		return err
	}

	switch current {
	case libcontainer.Created:
		return ctr.Exec()
	case libcontainer.Stopped:
		return errors.New("cannot start a container that has stopped")
	case libcontainer.Running:
		return errors.New("cannot start an already running container")
	default:
		return fmt.Errorf("cannot start a container in the %s state\n", current)
	}
},
func (c *linuxContainer) Exec() error { c.m.Lock() defer c.m.Unlock() return c.exec() } func (c *linuxContainer) exec() error { path := filepath.Join(c.root, execFifoFilename) f, err := os.OpenFile(path, os.O_RDONLY, 0) if err != nil { return newSystemErrorWithCause(err, "open exec fifo for reading") } defer f.Close() data, err := ioutil.ReadAll(f) if err != nil { return err } if len(data) > 0 { os.Remove(path) return nil } return fmt.Errorf("cannot start an already running container") }