docker 学习笔记2:runc 源码分析

2019-12-07  本文已影响0人  董泽润

TL;DR 这一篇从源码角度分析 runc 的实现,不了解的可以参考《什么是 runc》,代码逻辑很绕需要耐心。

程序入口

使用 github.com/urfave/cli 库作为命令行工具,看起来非常好用,main.go 里定义了 App,将所有命令行参数注册进来,然后 App.Run() 启动服务。

    // Action is the handler for this command: it validates the arguments,
    // loads the spec and starts the container with CT_ACT_RUN.
    Action: func(context *cli.Context) error {
        // Require exactly one positional argument (presumably the container id).
        if err := checkArgs(context, 1, exactArgs); err != nil {
            return err
        }
        if err := revisePidFile(context); err != nil {
            return err
        }
        spec, err := setupSpec(context)
        if err != nil {
            return err
        }
        status, err := startContainer(context, spec, CT_ACT_RUN, nil)
        if err == nil {
            // exit with the container's exit status so any external supervisor is
            // notified of the exit with the correct exit status.
            os.Exit(status)
        }
        // Only reached when startContainer failed; the error goes back to the caller.
        return err
    },

来看下 runCommand,createCommand 的内容基本一样:

唯一的区别在于启动时传入的参数 CT_ACT_RUN 或是 CT_ACT_CREATE. 由上一篇可知,runc run xxx 一口气启动容器并执行 cmd,但是 runc create xxx 只是先启动 runc init 并初始化,但是没有执行 cmd.

启动 Container

startContainer 是启动容器的入口,主要做:

1. createContainer

// createContainer translates the OCI spec into a libcontainer configuration
// and asks the factory returned by loadFactory to create the container
// identified by id.
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
    // Gather every conversion option up front so the call below stays short.
    opts := &specconv.CreateOpts{
        CgroupName:       id,
        UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
        NoPivotRoot:      context.Bool("no-pivot"),
        NoNewKeyring:     context.Bool("no-new-keyring"),
        Spec:             spec,
        Rootless:         isRootless(),
    }

    config, err := specconv.CreateLibcontainerConfig(opts)
    if err != nil {
        return nil, err
    }

    factory, err := loadFactory(context)
    if err != nil {
        return nil, err
    }

    return factory.Create(id, config)
}

这里面是个工厂方法,因为 runc 要支持 linux, windows 或其它操作系统

// loadFactory builds the libcontainer factory rooted at the absolute form of
// the global "root" flag. The cgroup manager defaults to cgroupfs; when the
// "systemd-cgroup" flag is set the systemd manager is used instead, and an
// error is returned if systemd support is not available.
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
    abs, err := filepath.Abs(context.GlobalString("root"))
    if err != nil {
        return nil, err
    }

    wantSystemd := context.GlobalBool("systemd-cgroup")
    // The availability probe only runs when the flag was actually passed.
    if wantSystemd && !systemd.UseSystemd() {
        return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
    }

    cgroupManager := libcontainer.Cgroupfs
    if wantSystemd {
        cgroupManager = libcontainer.SystemdCgroups
    }

    criuOpt := libcontainer.CriuPath(context.GlobalString("criu"))
    return libcontainer.New(abs, cgroupManager, criuOpt)
}

然后看 loadFactory 函数,如果传入了 --systemd-cgroup 参数并且系统支持 systemd,那么 cgroupManager 就用 libcontainer.SystemdCgroups(传了参数但系统不支持 systemd 时直接报错),否则使用 libcontainer.Cgroupfs

// New creates a Linux container factory whose state lives under root and
// then applies each of the supplied option funcs to it, in order. A non-empty
// root directory is created (mode 0700) if it does not exist yet. The first
// option that fails aborts construction and its error is returned.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
    if root != "" {
        err := os.MkdirAll(root, 0700)
        if err != nil {
            return nil, newGenericError(err, SystemError)
        }
    }

    factory := &LinuxFactory{
        Root:      root,
        InitArgs:  []string{"/proc/self/exe", "init"},
        Validator: validate.New(),
        CriuPath:  "criu",
    }

    // Cgroupfs is applied first, before any caller-supplied option; its
    // return value is not checked.
    Cgroupfs(factory)

    for i := range options {
        if err := options[i](factory); err != nil {
            return nil, err
        }
    }
    return factory, nil
}

这个 libcontainer.New 非常关键,可以看到 InitArgs 的内容,/proc/self/exe 就是当前进程二进制, 实际上最后执行的就是 runc init

// Create builds a linuxContainer for id from config, carrying over the
// factory's InitArgs and CriuPath and obtaining a cgroup manager from
// l.NewCgroupsManager. The container is returned in the stopped state.
//
// The beginning of the body is abridged ("......") in this excerpt; that
// omitted part is where containerRoot must be defined.
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
    ......
    c := &linuxContainer{
        id:            id,
        root:          containerRoot,
        config:        config,
        initArgs:      l.InitArgs,
        criuPath:      l.CriuPath,
        cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
    }
    // A freshly created container starts out stopped.
    c.state = &stoppedState{c: c}
    return c, nil
}

最后根据工厂方法,来生成我们的 container,要记住,linuxContainer.initArgs 就是 runc init,然后设置状态为 stoppedState

2. runner

startContainer 函数末尾,调用 r.run(spec.Process) 来运行 container,其中 spec.Process 就是 config.json 中要运行的内容。

// run converts the spec process into a libcontainer process, attaches the
// runner's listen and preserved file descriptors to it, and then dispatches
// on r.action to Start, Restore or Run the container. Parts of the body are
// abridged ("......") in this excerpt.
func (r *runner) run(config *specs.Process) (int, error) {
    process, err := newProcess(*config)
    if err != nil {
        // Conversion failed: tear down via the runner and report -1.
        r.destroy()
        return -1, err
    }
    if len(r.listenFDs) > 0 {
        // Hand the listen fds to the process and advertise their count via
        // LISTEN_FDS; LISTEN_PID is fixed to 1.
        process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
        process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
    }
    // Preserved fds are numbered right after the extra files already attached;
    // the leading 3 presumably accounts for stdin/stdout/stderr.
    baseFd := 3 + len(process.ExtraFiles)
    for i := baseFd; i < baseFd+r.preserveFDs; i++ {
        process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
    }
    ......
    // Pick the container entry point that matches the requested action.
    switch r.action {
    case CT_ACT_CREATE:
        err = r.container.Start(process)
    case CT_ACT_RESTORE:
        err = r.container.Restore(process, r.criuOpts)
    case CT_ACT_RUN:
        err = r.container.Run(process)
    default:
        panic("Unknown action")
    }
    ......
}
// newProcess builds a libcontainer Process from the spec process p: args,
// environment, user ("uid:gid"), working directory, security labels,
// capabilities, additional groups and rlimits are all carried over. It fails
// only when one of the rlimits cannot be converted.
func newProcess(p specs.Process) (*libcontainer.Process, error) {
    // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
    user := fmt.Sprintf("%d:%d", p.User.UID, p.User.GID)

    lp := &libcontainer.Process{
        Args:            p.Args,
        Env:             p.Env,
        User:            user,
        Cwd:             p.Cwd,
        Label:           p.SelinuxLabel,
        NoNewPrivileges: &p.NoNewPrivileges,
        AppArmorProfile: p.ApparmorProfile,
    }

    // Capabilities are optional in the spec; copy each set when present.
    if caps := p.Capabilities; caps != nil {
        lp.Capabilities = &configs.Capabilities{
            Bounding:    caps.Bounding,
            Effective:   caps.Effective,
            Inheritable: caps.Inheritable,
            Permitted:   caps.Permitted,
            Ambient:     caps.Ambient,
        }
    }

    // Additional group ids are stored in their decimal string form.
    for _, gid := range p.User.AdditionalGids {
        group := strconv.FormatUint(uint64(gid), 10)
        lp.AdditionalGroups = append(lp.AdditionalGroups, group)
    }

    for _, specLimit := range p.Rlimits {
        converted, err := createLibContainerRlimit(specLimit)
        if err != nil {
            return nil, err
        }
        lp.Rlimits = append(lp.Rlimits, converted)
    }

    return lp, nil
}

3. CT_ACT_RUN

调用 r.container.Run(process) 一口气运行容器的 cmd

// Run starts process in the container and, if the container was in the
// Stopped status before the call, follows up with exec(). The status is
// sampled under the container mutex before Start is invoked.
func (c *linuxContainer) Run(process *Process) error {
    c.m.Lock()
    status, err := c.currentStatus()
    // The lock is released before Start, which acquires it again itself.
    c.m.Unlock()
    if err != nil {
        return err
    }

    if err := c.Start(process); err != nil {
        return err
    }

    if status != Stopped {
        return nil
    }
    return c.exec()
}

// exec opens the exec fifo under the container root for reading and drains
// it. If any data was read the fifo file is removed and nil is returned; if
// nothing was read, the container is reported as already running.
func (c *linuxContainer) exec() error {
    fifoPath := filepath.Join(c.root, execFifoFilename)

    fifo, err := os.OpenFile(fifoPath, os.O_RDONLY, 0)
    if err != nil {
        return newSystemErrorWithCause(err, "open exec fifo for reading")
    }
    defer fifo.Close()

    payload, err := ioutil.ReadAll(fifo)
    if err != nil {
        return err
    }

    if len(payload) == 0 {
        return fmt.Errorf("cannot start an already running container")
    }

    // Removal is best effort; its result is not checked.
    os.Remove(fifoPath)
    return nil
}

Run 先获取当前容器状态,由前面代码可知此时是 stoppedState,调用 c.Start 去启动容器,最后调用 exec 运行容器进程,通过 exec 代码可知,只是读取了 execFifoFilename 文件内容而已。实际上这个 fifo file 就是用来同步的。

// Start launches process in the container while holding the container mutex.
// When the container is currently Stopped, the exec fifo is created first and
// removed again if the launch fails.
func (c *linuxContainer) Start(process *Process) error {
    c.m.Lock()
    defer c.m.Unlock()

    status, err := c.currentStatus()
    if err != nil {
        return err
    }

    // A stopped container means this process becomes its init process.
    isInit := status == Stopped
    if isInit {
        if err := c.createExecFifo(); err != nil {
            return err
        }
    }

    startErr := c.start(process, isInit)
    if startErr == nil {
        return nil
    }
    if isInit {
        // Undo the fifo created above.
        c.deleteExecFifo()
    }
    return startErr
}

继续看 Start 代码,判断当前状态是 Stopped,那么就创建管道文件 execFifoFilename,最后调用 start 去启动

// start creates the parent process for `process` and starts it. When isInit
// is true the container moves to the created state, its state is refreshed
// through updateState and the configured poststart hooks are run; otherwise
// the container is simply marked as running.
func (c *linuxContainer) start(process *Process, isInit bool) error {
    parent, err := c.newParentProcess(process, isInit)
    if err != nil {
        return newSystemErrorWithCause(err, "creating new parent process")
    }
    if err := parent.start(); err != nil {
        // terminate the process to ensure that it properly is reaped.
        // The err declared by this inner `if` is scoped to the terminate call
        // only; the error wrapped below is the one from parent.start().
        if err := parent.terminate(); err != nil {
            logrus.Warn(err)
        }
        return newSystemErrorWithCause(err, "starting container process")
    }
    // generate a timestamp indicating when the container was started
    c.created = time.Now().UTC()
    if isInit {
        c.state = &createdState{
            c: c,
        }
        state, err := c.updateState(parent)
        if err != nil {
            return err
        }
        c.initProcessStartTime = state.InitProcessStartTime

        if c.config.Hooks != nil {
            // State handed to every poststart hook.
            s := configs.HookState{
                Version: c.config.Version,
                ID:      c.id,
                Pid:     parent.pid(),
                Bundle:  utils.SearchLabels(c.config.Labels, "bundle"),
            }
            for i, hook := range c.config.Hooks.Poststart {
                if err := hook.Run(s); err != nil {
                    // A failing hook aborts the start: terminate the process
                    // (logging, not returning, any terminate error) and
                    // report which hook failed.
                    if err := parent.terminate(); err != nil {
                        logrus.Warn(err)
                    }
                    return newSystemErrorWithCausef(err, "running poststart hook %d", i)
                }
            }
        }
    } else {
        c.state = &runningState{
            c: c,
        }
    }
    return nil
}

调用 newParentProcess 创建 runc init,然后 start 启动,最后调用各种 hook

// newParentProcess creates the socket pair and command template for p and
// wraps them in a setns process (doInit == false) or an init process
// (doInit == true). Only the init variant gets the container state directory
// attached as an extra file, announced via _LIBCONTAINER_STATEDIR.
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
    parentPipe, childPipe, err := utils.NewSockPair("init")
    if err != nil {
        return nil, newSystemErrorWithCause(err, "creating new init pipe")
    }

    cmd, err := c.commandTemplate(p, childPipe)
    if err != nil {
        return nil, newSystemErrorWithCause(err, "creating new command template")
    }

    if !doInit {
        return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
    }

    // We only set up rootDir if we're not doing a `runc exec`. The reason for
    // this is to avoid cases where a racing, unprivileged process inside the
    // container can get access to the statedir file descriptor (which would
    // allow for container rootfs escape).
    rootDir, err := os.Open(c.root)
    if err != nil {
        return nil, err
    }
    cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)

    // The state dir was appended last, so its fd number is derived from the
    // current length of ExtraFiles.
    stateDirFd := stdioFdCount + len(cmd.ExtraFiles) - 1
    cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stateDirFd))

    return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
}
// commandTemplate prepares the exec.Cmd that re-executes c.initArgs for p.
// It wires p's stdio, uses the container rootfs as working directory and
// appends p's extra files, the optional console socket and childPipe to
// ExtraFiles. The fd numbers of the console socket and the pipe are exported
// through _LIBCONTAINER_CONSOLE and _LIBCONTAINER_INITPIPE.
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
    cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
    cmd.Dir = c.config.Rootfs
    cmd.Stdin = p.Stdin
    cmd.Stdout = p.Stdout
    cmd.Stderr = p.Stderr
    if cmd.SysProcAttr == nil {
        cmd.SysProcAttr = &syscall.SysProcAttr{}
    }

    cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)

    if p.ConsoleSocket != nil {
        cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
        // The socket is the most recently appended file.
        consoleFd := stdioFdCount + len(cmd.ExtraFiles) - 1
        cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", consoleFd))
    }

    cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
    // Likewise, the pipe is now the last extra file.
    pipeFd := stdioFdCount + len(cmd.ExtraFiles) - 1
    cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", pipeFd))

    return cmd, nil
}

再来看 commandTemplate, 注意这个 initArgs 就是 runc init,另外看 childPipe 也扔到了 cmd.ExtraFiles 中,用于通信

4. newInitProcess

// newInitProcess tags cmd as a standard init (_LIBCONTAINER_INITTYPE),
// produces the bootstrap data from the configured namespaces and bundles
// everything into an initProcess. sharePidns is set when the config supplies
// a path for the PID namespace.
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
    cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))

    // Keep only the namespaces that come with an explicit path.
    nsPaths := make(map[configs.NamespaceType]string)
    for _, ns := range c.config.Namespaces {
        if ns.Path == "" {
            continue
        }
        nsPaths[ns.Type] = ns.Path
    }
    _, sharePidns := nsPaths[configs.NEWPID]

    cloneFlags := c.config.Namespaces.CloneFlags()
    bootstrap, err := c.bootstrapData(cloneFlags, nsPaths)
    if err != nil {
        return nil, err
    }

    proc := &initProcess{
        cmd:           cmd,
        childPipe:     childPipe,
        parentPipe:    parentPipe,
        manager:       c.cgroupManager,
        config:        c.newInitConfig(p),
        container:     c,
        process:       p,
        bootstrapData: bootstrap,
        sharePidns:    sharePidns,
        rootDir:       rootDir,
    }
    return proc, nil
}

再来看下最核心的 initProcess.start 函数的实现

// start launches the init command, streams the bootstrap data to it over the
// parent end of the pipe, runs execSetns, records the process's pipe fds,
// applies the cgroup configuration, creates the network interfaces and sends
// the config to the init process. The remainder of the body is abridged
// ("......") in this excerpt.
func (p *initProcess) start() error {
    defer p.parentPipe.Close()
    err := p.cmd.Start()
    p.process.ops = p
    // Close this process's handles on the child end of the pipe and on the
    // root directory, whether or not Start succeeded.
    p.childPipe.Close()
    p.rootDir.Close()
    if err != nil {
        p.process.ops = nil
        return newSystemErrorWithCause(err, "starting init process command")
    }
    // Feed the bootstrap data to the started command through the pipe.
    if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
        return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
    }
    if err := p.execSetns(); err != nil {
        return newSystemErrorWithCause(err, "running exec setns process for init")
    }
    // Save the standard descriptor names before the container process
    // can potentially move them (e.g., via dup2()).  If we don't do this now,
    // we won't know at checkpoint time which file descriptor to look up.
    fds, err := getPipeFds(p.pid())
    if err != nil {
        return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
    }
    p.setExternalDescriptors(fds)
    // Do this before syncing with child so that no children can escape the
    // cgroup. We don't need to worry about not doing this and not being root
    // because we'd be using the rootless cgroup manager in that case.
    if err := p.manager.Apply(p.pid()); err != nil {
        return newSystemErrorWithCause(err, "applying cgroup configuration for process")
    }
    // NOTE(review): this closure reads the function-level err (declared by
    // p.cmd.Start above and reassigned by getPipeFds). The `if err := ...`
    // statements below declare their own shadowed err, so an error returned
    // from them does not by itself trigger Destroy here — confirm against the
    // abridged remainder of the function.
    defer func() {
        if err != nil {
            // TODO: should not be the responsibility to call here
            p.manager.Destroy()
        }
    }()
    if err := p.createNetworkInterfaces(); err != nil {
        return newSystemErrorWithCause(err, "creating network interfaces")
    }
    if err := p.sendConfig(); err != nil {
        return newSystemErrorWithCause(err, "sending config to init process")
    }
    ......
}

子进程 runc init

接下来看最重要的 runc init,负责启动容器,并运行 cmd

import (
    "os"
    "runtime"

    "github.com/opencontainers/runc/libcontainer"
    _ "github.com/opencontainers/runc/libcontainer/nsenter"
    "github.com/urfave/cli"
)

// init pins the process to a single OS thread with GOMAXPROCS set to 1, but
// only when the binary was invoked with "init" as its first argument.
func init() {
    if len(os.Args) < 2 || os.Args[1] != "init" {
        return
    }
    runtime.GOMAXPROCS(1)
    runtime.LockOSThread()
}

// initCommand is the "init" subcommand. Its action creates a factory with an
// empty root and calls StartInitialization on it; a returned error makes the
// process exit with status 1, and a nil return leads to a panic.
var initCommand = cli.Command{
    Name:  "init",
    Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
    Action: func(context *cli.Context) error {
        // NOTE(review): the error returned by libcontainer.New is discarded here.
        factory, _ := libcontainer.New("")
        if err := factory.StartInitialization(); err != nil {
            // as the error is sent back to the parent there is no need to log
            // or write it to stderr because the parent process will handle this
            os.Exit(1)
        }
        // StartInitialization returning nil is treated as a failure: presumably
        // a successful init replaces this process image and never returns.
        panic("libcontainer: container init failed to exec")
    },
}

runc/init.go 文件很短,但是内容超级多。

1. nsenter 两次 fork

nsenter.go 有一个 init 函数,它带有 constructor 属性,GNU C 会保证它在 main 函数前执行,可以理解为构造函数,两次 clone 创建进程的逻辑就在这里。

/*
#cgo CFLAGS: -Wall
extern void nsexec();
// The constructor attribute makes init() run automatically before main();
// all it does is hand control to nsexec(), which is defined elsewhere.
void __attribute__((constructor)) init(void) {
    nsexec();
}
*/
/*
 * nlconfig_t is the bootstrap configuration that nsexec() receives from the
 * parent process. Each variable-length field is paired with a *_len field.
 * NOTE(review): the per-field meanings below are inferred from the field
 * names; confirm against the code that fills the struct.
 */
struct nlconfig_t {
    char *data;               /* presumably the raw message buffer the pointers below refer into */
    uint32_t cloneflags;      /* clone flags for the child */
    char *uidmap;             /* user id mapping */
    size_t uidmap_len;
    char *gidmap;             /* group id mapping */
    size_t gidmap_len;
    char *namespaces;         /* namespaces to join */
    size_t namespaces_len;
    uint8_t is_setgroup;
    uint8_t is_rootless;
    char *oom_score_adj;      /* OOM score adjustment value */
    size_t oom_score_adj_len;
};

nsexec 函数太长太长了,简单来说,就是从 pipe 中拿到 runc create 或是 runc run 进程传过来的 bootstrapData,就是 nlconfig_t 配置文件,内容是 cloneflags,还有 uid, gid map 用于做 user ns 隔离的,还有 cgroup, oom 等配置,根据这些来限制容器。

nsexec() will first get the file descriptor number for the init pipe from the environment variable _LIBCONTAINER_INITPIPE (which was opened by the parent and kept open across the fork-exec of the nsexec() init process). The init pipe is used to read bootstrap data (namespace paths, clone flags, uid and gid mappings, and the console path) from the parent process. nsexec() will then call setns(2) to join the namespaces provided in the bootstrap data (if available), clone(2) a child process with the provided clone flags, update the user and group ID mappings, do some further miscellaneous setup steps, and then send the PID of the child process to the parent of the nsexec() "caller". Finally, the parent nsexec() will exit and the child nsexec() process will return to allow the Go runtime take over.
NOTE: We do both setns(2) and clone(2) even if we don't have any CLONE_NEW* clone flags because we must fork a new process in order to enter the PID namespace.

为啥要做这么多次 clone 呢,就是因为 PID namespace 的实现必须要求这么做。

2. StartInitialization 启动与运行容器

// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
//
// The file descriptor numbers for the init pipe, the state directory and the
// optional console socket are read from the _LIBCONTAINER_INITPIPE,
// _LIBCONTAINER_STATEDIR and _LIBCONTAINER_CONSOLE environment variables.
// err is a named result so that the deferred functions below can read and
// replace the value being returned.
func (l *LinuxFactory) StartInitialization() (err error) {
    var (
        pipefd, rootfd int
        consoleSocket  *os.File
        envInitPipe    = os.Getenv("_LIBCONTAINER_INITPIPE")
        envStateDir    = os.Getenv("_LIBCONTAINER_STATEDIR")
        envConsole     = os.Getenv("_LIBCONTAINER_CONSOLE")
    )

    // Get the INITPIPE.
    pipefd, err = strconv.Atoi(envInitPipe)
    if err != nil {
        return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
    }

    var (
        pipe = os.NewFile(uintptr(pipefd), "pipe")
        it   = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
    )
    // Registered first, so it runs last: the pipe stays open while the
    // deferred error report below writes to it.
    defer pipe.Close()

    // Only init processes have STATEDIR.
    rootfd = -1
    if it == initStandard {
        if rootfd, err = strconv.Atoi(envStateDir); err != nil {
            return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
        }
    }

    if envConsole != "" {
        console, err := strconv.Atoi(envConsole)
        if err != nil {
            return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
        }
        consoleSocket = os.NewFile(uintptr(console), "console-socket")
        defer consoleSocket.Close()
    }

    // clear the current process's environment to clean any libcontainer
    // specific env vars.
    os.Clearenv()

    // This deferred report runs on every return from this point on; per the
    // comment on the final return, a successful Init never returns.
    defer func() {
        // We have an error during the initialization of the container's init,
        // send it back to the parent process in the form of an initError.
        // NOTE(review): when a write fails, the original err (not werr) is
        // what gets printed to stderr.
        if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
            fmt.Fprintln(os.Stderr, err)
            return
        }
        if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
            fmt.Fprintln(os.Stderr, err)
            return
        }
    }()
    // Registered last, so it runs first: a panic is turned into err before
    // the report above sends it to the parent.
    defer func() {
        if e := recover(); e != nil {
            err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
        }
    }()

    i, err := newContainerInit(it, pipe, consoleSocket, rootfd)
    if err != nil {
        return err
    }

    // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
    return i.Init()
}
// newContainerInit decodes the init configuration from pipe as JSON, passes
// its Env to populateProcessEnvironment and returns the initer that matches
// t: linuxSetnsInit for initSetns, linuxStandardInit for initStandard. Any
// other init type yields an error.
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD int) (initer, error) {
    var cfg *initConfig
    decoder := json.NewDecoder(pipe)
    if err := decoder.Decode(&cfg); err != nil {
        return nil, err
    }

    if err := populateProcessEnvironment(cfg.Env); err != nil {
        return nil, err
    }

    if t == initSetns {
        return &linuxSetnsInit{
            pipe:          pipe,
            consoleSocket: consoleSocket,
            config:        cfg,
        }, nil
    }

    if t == initStandard {
        // The parent pid is recorded at construction time.
        return &linuxStandardInit{
            pipe:          pipe,
            consoleSocket: consoleSocket,
            parentPid:     unix.Getppid(),
            config:        cfg,
            stateDirFD:    stateDirFD,
        }, nil
    }

    return nil, fmt.Errorf("unknown init type %q", t)
}
// Init performs the in-container setup for a standard init process and then
// replaces the current process with the user's command via syscall.Exec.
// In order, it: optionally joins a new session keyring, sets up network and
// routes, prepares and finalizes the rootfs (new mount namespace only), sets
// up the console, hostname, AppArmor profile and process label, writes
// sysctls, applies readonly and masked paths, optionally sets no-new-privs,
// signals readiness to the parent, applies seccomp, finalizes the namespace,
// waits on the exec fifo and finally execs. On success it does not return.
func (l *linuxStandardInit) Init() error {
    if !l.config.Config.NoNewKeyring {
        ringname, keepperms, newperms := l.getSessionRingParams()

        // do not inherit the parent's session keyring
        sessKeyId, err := keys.JoinSessionKeyring(ringname)
        if err != nil {
            return err
        }
        // make session keyring searcheable
        if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
            return err
        }
    }

    if err := setupNetwork(l.config); err != nil {
        return err
    }
    if err := setupRoute(l.config.Config); err != nil {
        return err
    }

    label.Init()

    // prepareRootfs() can be executed only for a new mount namespace.
    if l.config.Config.Namespaces.Contains(configs.NEWNS) {
        if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
            return err
        }
    }

    // Set up the console. This has to be done *before* we finalize the rootfs,
    // but *after* we've given the user the chance to set up all of the mounts
    // they wanted.
    if l.config.CreateConsole {
        if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
            return err
        }
        if err := system.Setctty(); err != nil {
            return err
        }
    }

    // Finish the rootfs setup.
    if l.config.Config.Namespaces.Contains(configs.NEWNS) {
        if err := finalizeRootfs(l.config.Config); err != nil {
            return err
        }
    }

    // The hostname is only set when the config provides a non-empty one.
    if hostname := l.config.Config.Hostname; hostname != "" {
        if err := unix.Sethostname([]byte(hostname)); err != nil {
            return err
        }
    }
    if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
        return err
    }
    if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
        return err
    }

    for key, value := range l.config.Config.Sysctl {
        if err := writeSystemProperty(key, value); err != nil {
            return err
        }
    }
    for _, path := range l.config.Config.ReadonlyPaths {
        if err := readonlyPath(path); err != nil {
            return err
        }
    }
    for _, path := range l.config.Config.MaskPaths {
        if err := maskPath(path); err != nil {
            return err
        }
    }
    // Remember the parent death signal now; it is restored after
    // finalizeNamespace below.
    pdeath, err := system.GetParentDeathSignal()
    if err != nil {
        return err
    }
    if l.config.NoNewPrivileges {
        if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
            return err
        }
    }
    // Tell our parent that we're ready to Execv. This must be done before the
    // Seccomp rules have been applied, because we need to be able to read and
    // write to a socket.
    if err := syncParentReady(l.pipe); err != nil {
        return err
    }
    // Without NoNewPrivileges seccomp is a privileged operation, so we need to
    // do this before dropping capabilities; otherwise do it as late as possible
    // just before execve so as few syscalls take place after it as possible.
    if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
        if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
            return err
        }
    }
    if err := finalizeNamespace(l.config); err != nil {
        return err
    }
    // finalizeNamespace can change user/group which clears the parent death
    // signal, so we restore it here.
    if err := pdeath.Restore(); err != nil {
        return err
    }
    // compare the parent from the initial start of the init process and make sure that it did not change.
    // if the parent changes that means it died and we were reparented to something else so we should
    // just kill ourself and not cause problems for someone else.
    if unix.Getppid() != l.parentPid {
        return unix.Kill(unix.Getpid(), unix.SIGKILL)
    }
    // check for the arg before waiting to make sure it exists and it is returned
    // as a create time error.
    name, err := exec.LookPath(l.config.Args[0])
    if err != nil {
        return err
    }
    // close the pipe to signal that we have completed our init.
    l.pipe.Close()
    // wait for the fifo to be opened on the other side before
    // exec'ing the users process.
    // The fifo is looked up relative to the state directory fd and opened
    // with O_CLOEXEC; fd is not closed explicitly in this function.
    fd, err := unix.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|unix.O_CLOEXEC, 0)
    if err != nil {
        return newSystemErrorWithCause(err, "openat exec fifo")
    }
    if _, err := unix.Write(fd, []byte("0")); err != nil {
        return newSystemErrorWithCause(err, "write 0 exec fifo")
    }
    // With NoNewPrivileges set, seccomp is applied here, as late as possible
    // (the counterpart of the earlier !NoNewPrivileges branch).
    if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
        if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
            return newSystemErrorWithCause(err, "init seccomp")
        }
    }
    // close the statedir fd before exec because the kernel resets dumpable in the wrong order
    // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
    unix.Close(l.stateDirFD)
    // Exec the resolved binary with the complete argument vector (Args[0]
    // included) and the current environment.
    if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
        return newSystemErrorWithCause(err, "exec user process")
    }
    return nil
}

先看下用 runc run 启动时调用的 linuxStandardInit.Init 逻辑

小结

想不到 runc 代码量不大,但是逻辑这么复杂。下一步是继续看 docker 的其它组件并分析源码

上一篇 下一篇

猜你喜欢

热点阅读