承接上一篇“docker containerd 架构和源码简单分析”本文继续沿着docker run/create的流程简单分析一下docker RunC的源码。
本文依然结合docker1.12的框架对docker 框架中最底层的runc做简单的流程分析。
从docker的命令行开始,其架构如下:
本文分析的对象是RUNC, 而且仅仅分析Linux部分的代码,即默认在Linux环境下运行。
承接containerd最后调用的cmd.Start()–即docker-runc.该命令的一般有restore,run,create.
这里我们简单分析一下runc main()函数的流程:
func main() {
app := cli.NewApp()
app.Name = "runc"
app.Usage = usage
var v []string
if version != "" {
v = append(v, version)
}
if gitCommit != "" {
v = append(v, fmt.Sprintf("commit: %s", gitCommit))
}
v = append(v, fmt.Sprintf("spec: %s", specs.Version))
app.Version = strings.Join(v, "\n")
root := "/run/runc"
if os.Geteuid() != 0 {
runtimeDir := os.Getenv("XDG_RUNTIME_DIR")
if runtimeDir != "" {
root = runtimeDir + "/runc"
}
}
app.Flags = []cli.Flag{
……
……
}
app.Commands = []cli.Command{
checkpointCommand,
createCommand,
deleteCommand,
eventsCommand,
execCommand,
initCommand,
killCommand,
listCommand,
pauseCommand,
psCommand,
restoreCommand,
resumeCommand,
runCommand,
specCommand,
startCommand,
stateCommand,
updateCommand,
}
……
……
// If the command returns an error, cli takes upon itself to print
// the error on cli.ErrWriter and exit.
// Use our own writer here to ensure the log gets sent to the right location.
cli.ErrWriter = &FatalWriter{cli.ErrWriter}
if err := app.Run(os.Args); err != nil {
fatal(err)
}
}
从上面代码中看,就是初始化了一个APP,设置了APP都有那些command, 然后执行app.Run
接下来看一下app.Run()函数:
func (a *App) Run(arguments []string) (err error) {
a.Setup()
// parse flags
set := flagSet(a.Name, a.Flags)
set.SetOutput(ioutil.Discard)
err = set.Parse(arguments[1:])
……
……
/******* 核心部分 start ******/
args := context.Args()
if args.Present() {
name := args.First()
c := a.Command(name)
if c != nil {
return c.Run(context)
}
}
// Run default Action
err = HandleAction(a.Action, context)
/********核心部分代码 end *******/
HandleExitCoder(err)
return err
}
以上代码中先处理了一下arguments, set.Parse(arguments[1:]) 会先处理掉带“–”的所有参赛,因此到下面 name=args.First()获取到的就是create、restore、run等。接下来调用HandleAction(a.Action, context) 会调用到 create对应的cli.command的Action函数,我们先看一下cli.command 的createCommand函数的定义:
var createCommand = cli.Command{
Name: "create",
Usage: "create a container",
ArgsUsage: ……
Description: ……
Flags: []cli.Flag{
……
……
},
Action: func(context *cli.Context) error {
if err := checkArgs(context, 1, exactArgs); err != nil {
return err
}
if err := revisePidFile(context); err != nil {
return err
}
spec, err := setupSpec(context)
if err != nil {
return err
}
status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
if err != nil {
return err
}
// exit with the container's exit status so any external supervisor is
// notified of the exit with the correct exit status.
os.Exit(status)
return nil
},
}
上面的定义中最中心的函数就是Action Func(context *cli.Context)的函数,该函数对应的是createCommand的实际执行函数。从而进入startContainer函数。
上面说了这么多都还没有进入实际处理流程中呢。接下来才是实际处理流程。我们接着看源码:
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
id := context.Args().First()
if id == "" {
return -1, errEmptyID
}
notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id)
if notifySocket != nil {
notifySocket.setupSpec(context, spec)
}
container, err := createContainer(context, id, spec)
if err != nil {
return -1, err
}
if notifySocket != nil {
err := notifySocket.setupSocket()
if err != nil {
return -1, err
}
}
// Support on-demand socket activation by passing file descriptors into the container init process.
listenFDs := []*os.File{}
if os.Getenv("LISTEN_FDS") != "" {
listenFDs = activation.Files(false)
}
r := &runner{
enableSubreaper: !context.Bool("no-subreaper"),
shouldDestroy: true,
container: container,
listenFDs: listenFDs,
notifySocket: notifySocket,
consoleSocket: context.String("console-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
preserveFDs: context.Int("preserve-fds"),
action: action,
criuOpts: criuOpts,
}
return r.run(spec.Process)
}
上面的函数中createContainer创建了一个container的对象内附参数,另外创建了一个runner对象,最终指向runner的run函数,该函数是create, start, run三个命令的公共入口。
我们先开一下createContainer:
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
CgroupName: id,
UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
NoPivotRoot: context.Bool("no-pivot"),
NoNewKeyring: context.Bool("no-new-keyring"),
Spec: spec,
Rootless: isRootless(),
})
if err != nil {
return nil, err
}
factory, err := loadFactory(context)
if err != nil {
return nil, err
}
return factory.Create(id, config)
}
这里根据container ID,以及spec参数创建了以linux的container的Config信息。根据Linux容器框架获取factory并使用Factory的create函数创建Libcontainer对象–即linux container对象。 我们来看一下loadFactory(context):
// loadFactory returns the configured factory instance for execing containers.
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
root := context.GlobalString("root")
abs, err := filepath.Abs(root)
if err != nil {
return nil, err
}
// We default to cgroupfs, and can only use systemd if the system is a
// systemd box.
cgroupManager := libcontainer.Cgroupfs
if context.GlobalBool("systemd-cgroup") {
if systemd.UseSystemd() {
cgroupManager = libcontainer.SystemdCgroups
} else {
return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
}
}
intelRdtManager := libcontainer.IntelRdtFs
if !intelrdt.IsEnabled() {
intelRdtManager = nil
}
// We resolve the paths for {newuidmap,newgidmap} from the context of runc,
// to avoid doing a path lookup in the nsexec context. TODO: The binary
// names are not currently configurable.
newuidmap, err := exec.LookPath("newuidmap")
if err != nil {
newuidmap = ""
}
newgidmap, err := exec.LookPath("newgidmap")
if err != nil {
newgidmap = ""
}
return libcontainer.New(abs, cgroupManager, intelRdtManager,
libcontainer.CriuPath(context.GlobalString("criu")),
libcontainer.NewuidmapPath(newuidmap),
libcontainer.NewgidmapPath(newgidmap))
}
返回一个 linux 系统实现的结构体,根据传入的参数配置结构体中的 NewCgroupsManager 和 CriuPath。 最终返回的factory的创建信息如下:
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
if root != "" {
if err := os.MkdirAll(root, 0700); err != nil {
return nil, newGenericError(err, SystemError)
}
}
l := &LinuxFactory{
Root: root,
InitPath: "/proc/self/exe",
InitArgs: []string{os.Args[0], "init"},
Validator: validate.New(),
CriuPath: "criu",
}
Cgroupfs(l)
for _, opt := range options {
if opt == nil {
continue
}
if err := opt(l); err != nil {
return nil, err
}
}
return l, nil
}
该函数的内容非常重要,”/proc/self/exe”是最终容器init初始化的重要参数–即初始化是runc本身。Factory是一个interface,只要实现了其定义的接口即实现了其本身,看一下Factory定义的接口:
type Factory interface {
// Creates a new container with the given id and starts the initial process inside it.
Create(id string, config *configs.Config) (Container, error)
// Load takes an ID for an existing container and returns the container information
Load(id string) (Container, error)
// StartInitialization is an internal API to libcontainer used during the reexec of the
StartInitialization() error
// Type returns info string about factory type (e.g. lxc, libcontainer...)
Type() string
}
LinuxFactory实现了以上接口因此可以当做Facktory返回
接下来的factory.Create(id, config)事实上创建了容器的root路径并修改器权限等信息。
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
if err := l.validateID(id); err != nil {
return nil, err
}
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
}
containerRoot := filepath.Join(l.Root, id)
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
}
if err := os.MkdirAll(containerRoot, 0711); err != nil {
return nil, newGenericError(err, SystemError)
}
if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
return nil, newGenericError(err, SystemError)
}
c := &linuxContainer{
id: id,
root: containerRoot,
config: config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
}
if intelrdt.IsEnabled() {
c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
}
c.state = &stoppedState{c: c}
return c, nil
}
创建了容器的根目录,修改了所有者,并创建了LinuxContainer对象, 返回该对象。另外我们来看一下LinuxContainer的结构体:
type linuxContainer struct {
id string //容器ID
root string //容器的跟路径
config *configs.Config //容器配置
cgroupManager cgroups.Manager
initArgs []string //初始化运行的买哪个函数跟参数
initProcess parentProcess
initProcessStartTime string
criuPath string
m sync.Mutex
criuVersion int
state containerState //一开始的状态设置为stopped
created time.Time
}
我们来看一下runner的结构体定义:
type runner struct {
enableSubreaper bool
shouldDestroy bool
detach bool
listenFDs []*os.File
preserveFDs int
pidFile string
consoleSocket string
container libcontainer.Container ***--***
action CtAct ***--***
notifySocket *notifySocket
criuOpts *libcontainer.CriuOpts
}
接下来运行runner
func (r *runner) run(config *specs.Process) (int, error) {
if err := r.checkTerminal(config); err != nil {
r.destroy()
return -1, err
}
process, err := newProcess(*config)
if err != nil {
r.destroy()
return -1, err
}
if len(r.listenFDs) > 0 {
//ListenFD加入process的环境变量
process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
}
baseFd := 3 + len(process.ExtraFiles)
for i := baseFd; i < baseFd+r.preserveFDs; i++ {
process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
}
rootuid, err := r.container.Config().HostRootUID()
if err != nil {
r.destroy()
return -1, err
}
rootgid, err := r.container.Config().HostRootGID()
if err != nil {
r.destroy()
return -1, err
}
var (
detach = r.detach || (r.action == CT_ACT_CREATE)
)
// Setting up IO is a two stage process. We need to modify process to deal
// with detaching containers, and then we get a tty after the container has
// started.
handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
//修改process的io, UID,GID
if err != nil {
r.destroy()
return -1, err
}
defer tty.Close()
switch r.action {
case CT_ACT_CREATE:
err = r.container.Start(process) //start container Process
case CT_ACT_RESTORE:
err = r.container.Restore(process, r.criuOpts)
case CT_ACT_RUN:
err = r.container.Run(process)
default:
panic("Unknown action")
}
if err != nil {
r.destroy()
return -1, err
}
if err := tty.waitConsole(); err != nil {
r.terminate(process)
r.destroy()
return -1, err
}
if err = tty.ClosePostStart(); err != nil {
r.terminate(process)
r.destroy()
return -1, err
}
if r.pidFile != "" {
if err = createPidFile(r.pidFile, process); err != nil {
r.terminate(process)
r.destroy()
return -1, err
}
}
status, err := handler.forward(process, tty, detach)
if err != nil {
r.terminate(process)
}
if detach {
return 0, nil
}
r.destroy()
return status, err
}
这个函数是非常重要的,是create,run, restore的公共入口。
1. 首先根据spec里面process的配置信息调用newProcess创建process对象。
2. 其次将listen fd加入process的环境变量和需要在新进程保持打开的文件列表中。
3. 调用setupIO来处理io和tty相关配置,对于create来说,这里就是修改当前前进程的io,chown用户/组权限。
4. 创建一个signalHandler来处理tty和signal。
5. 调用container.Start(process)来启动process进程–即进入container的阶段。
我们先看一下process的结构定义:
type Process struct {
// The command to be run followed by any arguments.
Args []string
// Env specifies the environment variables for the process.
Env []string
// User will set the uid and gid of the executing process running inside the container
// local to the container's user and group configuration.
User string
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []string
// Cwd will change the processes current working directory inside the container's rootfs.
Cwd string
// Stdin is a pointer to a reader which provides the standard input stream.
Stdin io.Reader
// Stdout is a pointer to a writer which receives the standard output stream.
Stdout io.Writer
// Stderr is a pointer to a writer which receives the standard error stream.
Stderr io.Writer
// ExtraFiles specifies additional open files to be inherited by the container
ExtraFiles []*os.File
// Initial sizings for the console
ConsoleWidth uint16
ConsoleHeight uint16
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
Capabilities *configs.Capabilities
// AppArmorProfile specifies the profile to apply to the process and is
// changed at the time the process is execed
AppArmorProfile string
// Label specifies the label to apply to the process. It is commonly used by selinux
Label string
// NoNewPrivileges controls whether processes can gain additional privileges.
NoNewPrivileges *bool
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []configs.Rlimit
// ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File
ops processOperations
}
其实是定义了linux 进程运行的各种参数。
接下来看一下process的创建代码:
func newProcess(p specs.Process) (*libcontainer.Process, error) {
lp := &libcontainer.Process{
Args: p.Args,
Env: p.Env,
// TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
User:fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
Cwd: p.Cwd,
Label: p.SelinuxLabel,
NoNewPrivileges: &p.NoNewPrivileges,
AppArmorProfile: p.ApparmorProfile,
}
if p.ConsoleSize != nil {
lp.ConsoleWidth = uint16(p.ConsoleSize.Width)
lp.ConsoleHeight = uint16(p.ConsoleSize.Height)
}
if p.Capabilities != nil {
lp.Capabilities = &configs.Capabilities{}
lp.Capabilities.Bounding = p.Capabilities.Bounding
lp.Capabilities.Effective = p.Capabilities.Effective
lp.Capabilities.Inheritable = p.Capabilities.Inheritable
lp.Capabilities.Permitted = p.Capabilities.Permitted
lp.Capabilities.Ambient = p.Capabilities.Ambient
}
for _, gid := range p.User.AdditionalGids {
lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10))
}
for _, rlimit := range p.Rlimits {
rl, err := createLibContainerRlimit(rlimit)
if err != nil {
return nil, err
}
lp.Rlimits = append(lp.Rlimits, rl)
}
return lp, nil
}
上面的函数中生成了libcontainer.Process,在run中处理一下listen、io、目录、UID、GID之后调用container的start函数进入container的处理阶段
我们先来看一下container的接口定义:
type BaseContainer interface {
// Returns the ID of the container
ID() string
// Returns the current status of the container.
Status() (Status, error)
// State returns the current container's state information.
State() (*State, error)
// Returns the current config of the container.
Config() configs.Config
// Returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
Processes() ([]int, error)
// Returns statistics for the container.
Stats() (*Stats, error)
// Set resources of container as configured
Set(config configs.Config) error
// Start a process inside the container. Returns error if process fails to
// start. You can track process lifecycle with passed Process structure.
Start(process *Process) (err error)
// Run immediately starts the process inside the container. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns.
Run(process *Process) (err error)
// Destroys the container, if its in a valid state, after killing any
// remaining running processes.
Destroy() error
// Signal sends the provided signal code to the container's initial process.
Signal(s os.Signal, all bool) error
// Exec signals the container to exec the users process at the end of the init.
Exec() error
}
type Container interface {
BaseContainer
// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
Checkpoint(criuOpts *CriuOpts) error
// Restore restores the checkpointed container to a running state using the criu(8) utility.
Restore(process *Process, criuOpts *CriuOpts) error
// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
// the execution of any user processes. Asynchronously, when the container finished being paused the
// state is changed to PAUSED.
Pause() error
// If the Container state is PAUSED, resumes the execution of any user processes in the
// Container before setting the Container state to RUNNING.
Resume() error
// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
NotifyOOM() (<-chan struct{}, error)
// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
以上的定义基本上是对一个容器的所有操作,主要包括了状态信息。
这里我们分析一下container的start函数:
func (c *linuxContainer) Start(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
if err := c.createExecFifo(); err != nil {
return err
}
}
if err := c.start(process, status == Stopped); err != nil {
if status == Stopped {
c.deleteExecFifo()
}
return err
}
return nil
}
这里主要是调用了currentStatus()获取容器当前的状态,之前LinuxFactory创建container的时候设置了容器的状态为stopped,接下来调用start函数设置了两个参数为true,我们接着分析该函数:
func (c *linuxContainer) start(process *Process, isInit bool) error {
parent, err := c.newParentProcess(process, isInit)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
}
if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped.
if err := ignoreTerminateErrors(parent.terminate()); err != nil {
logrus.Warn(err)
}
return newSystemErrorWithCause(err, "starting container process")
}
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
if isInit {
c.state = &createdState{
c: c,
}
state, err := c.updateState(parent)
if err != nil {
return err
}
c.initProcessStartTime = state.InitProcessStartTime
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Bundle: utils.SearchLabels(c.config.Labels, "bundle"),
}
for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
if err := ignoreTerminateErrors(parent.terminate()); err != nil {
logrus.Warn(err)
}
return newSystemErrorWithCausef(err, "running poststart hook %d", i)
}
}
}
} else {
c.state = &runningState{
c: c,
}
}
return nil
}
该函数中new了一个newParentProcess,执行start ParentProcess,并更新parentProcess的状态。
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := utils.NewSockPair("init")
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe")
}
cmd, err := c.commandTemplate(p, childPipe)
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
}
if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
}
// We only set up fifoFd if we're not doing a `runc exec`. The historic
// reason for this is that previously we would pass a dirfd that allowed
// for container rootfs escape (and not doing it in `runc exec` avoided
// that problem), but we no longer do that. However, there's no need to do
// this for `runc exec` so we just keep it this way to be safe.
if err := c.includeExecFifo(cmd); err != nil {
return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
}
return c.newInitProcess(p, cmd, parentPipe, childPipe)
}
该函数中创建了一对pipe作为runc Crete进程与接下来的initProcess–runc init(容器内部的init进程)直接通信用的, 然后又newInitProcess作为返回对象–仅仅是结构和数据初始化,其源码如下:
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
}
}
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
}
return &initProcess{
cmd: cmd,
childPipe: childPipe,
parentPipe: parentPipe,
manager: c.cgroupManager,
intelRdtManager: c.intelRdtManager,
config: c.newInitConfig(p),
container: c,
process: p,
bootstrapData: data,
sharePidns: sharePidns,
}, nil
}
接下来执行parentProcess.start—也就是initProcess.start():
func (p *initProcess) start() error {
defer p.parentPipe.Close()
err := p.cmd.Start()
p.process.ops = p
p.childPipe.Close()
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
}
// Do this before syncing with child so that no children can escape the
// cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
……
……
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid())
if err != nil {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
}
p.setExternalDescriptors(fds)
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
}
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
}
var (
sentRun bool
sentResume bool
)
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
……
……
})
if !sentRun {
return newSystemErrorWithCause(ierr, "container init")
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
}
if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "shutting down init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return ierr
}
return nil
}
其中err := p.cmd.Start()、p.manager.Apply(p.pid())、 p.createNetworkInterfaces();、p.sendConfig();是该函数的主体。
p.cmd的实体参数是上面说过的:“/proc/self/exe”–即runC自身,然后参数是”init”.
p.manager.Apply()设置了新进程的cgroup信息,p.createNetworkInterfaces()创建新的network pari,p.sendConfig发送配置到新进程中。
前面所有的流程都是runc create 容器的流程,这里新创举的容器即为容器内的进程,即容器内功init的进程,根据前面的配置信息其执行的是runc init 命令,我们来分析runc init。
首先这里用一张图来总结和展望一下linux Container创建容器的过程。
从上图可以看出linux 容器创建的基本流程是runc-create进程中先构建一个linuxFactory,然后根据该Factory的配置创建LinuxContainer, 解析来创建Container的process信息,在开始处理该process–即在runc-create进程中fork出一个进程–runc-init,然后由runc-create将runc-init的cgroup信息进行设置并发送给runc-init进程,该runc-init进程就是linux Container的1号进程。而container 本身只是一个逻辑概念~~一组Cgroup限制下的进程的组合体。
接下来分析runc init进程:
var initCommand = cli.Command{
Name: "init",
Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
Action: func(context *cli.Context) error {
factory, _ := libcontainer.New("")
if err := factory.StartInitialization(); err != nil {
// as the error is sent back to the parent there is no need to log
// or write it to stderr because the parent process will handle this
os.Exit(1)
}
panic("libcontainer: container init failed to exec")
},
}
这个函数很简单,直接指向factory.StartInitialization():
func (l *LinuxFactory) StartInitialization() (err error) {
……
……
……
i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
if err != nil {
return err
}
// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
return i.Init()
}
这个函数上面大部分代码没有在这里贴出来,其主要部分就是newContainerInit(it, pipe, consoleSocket, fifofd)跟i.Init()。
我们来看一下 newContainerInit(it, pipe, consoleSocket, fifofd):
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
}
if err := populateProcessEnvironment(config.Env); err != nil {
return nil, err
}
switch t {
case initSetns:
return &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
config: config,
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
consoleSocket: consoleSocket,
parentPid: unix.Getppid(),
config: config,
fifoFd: fifoFd,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)
}
根据initType返回initSetns结构或者initStandard,接下来分析i.Init():
func (l *linuxStandardInit) Init() error {
……
……
if err := setupNetwork(l.config); err != nil {
创建和设置网络配
return err
}
if err := setupRoute(l.config.Config); err != nil {
初始化路由配置
return err
}
label.Init()
设置seLinux Label
……
……
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
配置console
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
// Finish the rootfs setup.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
return err
}
}
if hostname := l.config.Config.Hostname; hostname != "" {
if err := unix.Sethostname([]byte(hostname)); err != nil {
return err
}
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil {
return err
}
}
设置Namespace, profile, 环境变量等
for _, path := range l.config.Config.ReadonlyPaths {
if err := readonlyPath(path); err != nil {
return err
}
}
配置只读文件信息
for _, path := range l.config.Config.MaskPaths {
if err := maskPath(path); err != nil {
return err
}
}
pdeath, err := system.GetParentDeathSignal()
if err != nil {
return err
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
}
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := pdeath.Restore(); err != nil {
return err
}
// Compare the parent from the initial start of the init process and make
// sure that it did not change. if the parent changes that means it died
// and we were reparented to something else so we should just kill ourself
// and not cause problems for someone else.
if unix.Getppid() != l.parentPid {
return unix.Kill(unix.Getpid(), unix.SIGKILL)
}
// Check for the arg before waiting to make sure it exists and it is
// returned as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// Close the pipe to signal that we have completed our init.
l.pipe.Close()
// Wait for the FIFO to be opened on the other side before exec-ing the
// user process. We open it through /proc/self/fd/$fd, because the fd that
// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
// re-open an O_PATH fd through /proc.
fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "open exec fifo")
}
if _, err := unix.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
}
// Close the O_PATH fifofd fd before exec because the kernel resets
// dumpable in the wrong order. This has been fixed in newer kernels, but
// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
// N.B. the core issue itself (passing dirfds to the host filesystem) has
// since been resolved.
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
unix.Close(l.fifoFd)
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
syscall.Exec真正执行用户指定的程序
return newSystemErrorWithCause(err, "exec user process")
}
return nil
}
至此容器创建完成,容器以及正常运行了。
1、http://blog.csdn.net/max_cong/article/details/60721500
2、http://blog.csdn.net/max_cong/article/details/54782434
3、http://blog.csdn.net/max_cong/article/details/60872323
4、http://blog.csdn.net/max_cong/article/details/60879164