docker 学习笔记2：runc 源码分析

2019-12-07 本文已影响0人董泽润

TL;DR 这一篇从源码角度分析 runc 的实现，不了解的可以参考《什么是 runc》，代码逻辑很绕需要耐心。

程序入口

使用 github.com/urfave/cli 库做为命令行工具，看起来非常好用，main.go 里定义了 App，将所有命令行参数注册进来，然后 App.Run() 启动服务。

    // Action validates the command line, normalises the pid-file path, loads
    // the runtime spec and starts the container with CT_ACT_RUN.
    Action: func(context *cli.Context) error {
        err := checkArgs(context, 1, exactArgs)
        if err != nil {
            return err
        }
        if err = revisePidFile(context); err != nil {
            return err
        }
        spec, err := setupSpec(context)
        if err != nil {
            return err
        }
        status, err := startContainer(context, spec, CT_ACT_RUN, nil)
        if err != nil {
            return err
        }
        // Propagate the container's exit status as our own so that an
        // external supervisor observes the correct value.
        os.Exit(status)
        return nil // not reached: os.Exit does not return
    },

来看下 runCommand 和 createCommand 内容基本一样：

检查命令行参数个数
修正 pidfile，将其转换为绝对路径
解析 config.json 文件，这个文件就是 runtime spec
最后调用 startContainer 启动容器

唯一的区别在于启动时传入的参数 CT_ACT_RUN 或是 CT_ACT_CREATE. 由上一篇可知，runc run xxx 一口气启动容器并执行 cmd，但是 runc create xxx 只是先启动 runc init 并初始化，但是没有执行 cmd.

启动 Container

startContainer 是启动容器的入口，主要做：

context.Args().First() 获取 id，实际就是容器的名字，比如 runc run xxx 中的 xxx
如果环境变量中有 NOTIFY_SOCKET，那就要初始化 notifySocket，这个忽略不看
createContainer 根据给定的 id, spec 调用工厂方法创建容器
封装了 runner 去真正的运行容器

1. createContainer

// createContainer converts the OCI runtime spec into a libcontainer
// configuration and asks the factory returned by loadFactory to create the
// container identified by id. The id doubles as the cgroup name.
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
    opts := &specconv.CreateOpts{
        CgroupName:       id,
        UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
        NoPivotRoot:      context.Bool("no-pivot"),
        NoNewKeyring:     context.Bool("no-new-keyring"),
        Spec:             spec,
        Rootless:         isRootless(),
    }
    libConfig, err := specconv.CreateLibcontainerConfig(opts)
    if err != nil {
        return nil, err
    }

    f, err := loadFactory(context)
    if err != nil {
        return nil, err
    }
    return f.Create(id, libConfig)
}

这里面是个工厂方法，因为 runc 要支持 linux, windows 或其它操作系统

// loadFactory builds the libcontainer factory rooted at the absolute form of
// the global "root" option. The cgroup manager defaults to cgroupfs; it is
// switched to systemd only when the global "systemd-cgroup" option is set and
// systemd.UseSystemd reports support. Setting the option without that support
// is an error.
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
    absRoot, err := filepath.Abs(context.GlobalString("root"))
    if err != nil {
        return nil, err
    }

    manager := libcontainer.Cgroupfs
    if context.GlobalBool("systemd-cgroup") {
        if !systemd.UseSystemd() {
            return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
        }
        manager = libcontainer.SystemdCgroups
    }

    criu := libcontainer.CriuPath(context.GlobalString("criu"))
    return libcontainer.New(absRoot, manager, criu)
}

然后看 loadFactory 函数，如果传入了 --systemd-cgroup 参数并且系统支持 systemd，那么 cgroupManager 就用 libcontainer.SystemdCgroups（传了该参数但系统不支持 systemd 时直接报错），否则使用 libcontainer.Cgroupfs

// New returns a Linux container factory rooted at root. A non-empty root is
// created with mode 0700 if it does not exist yet. The factory starts out with
// "/proc/self/exe init" as its init command, "criu" as the CRIU path and the
// Cgroupfs option applied; the supplied option funcs then run in order and the
// first one that fails aborts construction.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
    if root != "" {
        if mkErr := os.MkdirAll(root, 0700); mkErr != nil {
            return nil, newGenericError(mkErr, SystemError)
        }
    }

    factory := &LinuxFactory{
        Root:      root,
        InitArgs:  []string{"/proc/self/exe", "init"},
        Validator: validate.New(),
        CriuPath:  "criu",
    }
    // Default cgroup manager; an option below may replace it.
    Cgroupfs(factory)

    for i := range options {
        if optErr := options[i](factory); optErr != nil {
            return nil, optErr
        }
    }
    return factory, nil
}

这个 libcontainer.New 非常关键，可以看到 InitArgs 的内容，/proc/self/exe 就是当前进程二进制，实际上最后执行的就是 runc init

// Create builds the in-memory linuxContainer for id from config, copying the
// factory's init arguments and CRIU path into it and attaching a cgroup
// manager obtained from l.NewCgroupsManager. The returned container starts in
// the stopped state.
//
// The beginning of the function is elided in this excerpt ("......");
// containerRoot is defined in that elided part.
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
    ......
    c := &linuxContainer{
        id:            id,
        root:          containerRoot,
        config:        config,
        initArgs:      l.InitArgs,
        criuPath:      l.CriuPath,
        cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
    }
    // Nothing has been started yet, so the initial state is "stopped".
    c.state = &stoppedState{c: c}
    return c, nil
}

最后根据工厂方法，来生成我们的 container，要记住，linuxContainer.initArgs 就是 runc init，然后设置状态为 stoppedState

2. runner

startContainer 函数末尾，调用 r.run(spec.Process) 来运行 container，其中 spec.Process 就是 config.json 中要运行的内容。

// run converts the spec's process description into a libcontainer process,
// attaches any socket-activation and preserved file descriptors to it, and
// then dispatches on r.action to create, restore or run the container.
//
// Parts of the function are elided in this excerpt ("......"), including
// whatever produces the returned exit status.
func (r *runner) run(config *specs.Process) (int, error) {
    process, err := newProcess(*config)
    if err != nil {
        r.destroy()
        return -1, err
    }
    // Pass the listen fds on as extra files and advertise their count through
    // LISTEN_FDS, with LISTEN_PID fixed to 1.
    if len(r.listenFDs) > 0 {
        process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
        process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
    }
    // Wrap the next r.preserveFDs descriptor numbers of this process so they
    // are inherited as well. NOTE(review): the constant 3 presumably skips
    // stdin/stdout/stderr — confirm.
    baseFd := 3 + len(process.ExtraFiles)
    for i := baseFd; i < baseFd+r.preserveFDs; i++ {
        process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
    }
    ......
    // Create only starts the container, Run additionally lets the user
    // process execute, Restore goes through the CRIU options.
    switch r.action {
    case CT_ACT_CREATE:
        err = r.container.Start(process)
    case CT_ACT_RESTORE:
        err = r.container.Restore(process, r.criuOpts)
    case CT_ACT_RUN:
        err = r.container.Run(process)
    default:
        panic("Unknown action")
    }
    ......
}

newProcess 根据 config.json 中的 Process config 生成 process 结构体，这里面包含运行容器的 cmd
根据 LISTEN_FDS 和 preserveFDs，把当前已打开的那些文件传入到 process 中，容器会用到
根据 r.action 的不同，分别调用不同逻辑，CT_ACT_CREATE 只需创建容器就可以，而 CT_ACT_RUN 需要运行 cmd，CT_ACT_RESTORE 后面再看。

// newProcess builds a libcontainer Process from the spec's process section:
// arguments, environment, "uid:gid" user string, working directory, SELinux
// label, no-new-privileges flag and AppArmor profile are copied across, and
// capabilities, additional groups and rlimits are converted when present.
// It fails only if an rlimit cannot be converted.
func newProcess(p specs.Process) (*libcontainer.Process, error) {
    proc := &libcontainer.Process{
        Args: p.Args,
        Env:  p.Env,
        // TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
        User:            fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
        Cwd:             p.Cwd,
        Label:           p.SelinuxLabel,
        NoNewPrivileges: &p.NoNewPrivileges,
        AppArmorProfile: p.ApparmorProfile,
    }

    if caps := p.Capabilities; caps != nil {
        proc.Capabilities = &configs.Capabilities{
            Bounding:    caps.Bounding,
            Effective:   caps.Effective,
            Inheritable: caps.Inheritable,
            Permitted:   caps.Permitted,
            Ambient:     caps.Ambient,
        }
    }

    // Supplementary groups are carried as decimal strings.
    for i := range p.User.AdditionalGids {
        gid := uint64(p.User.AdditionalGids[i])
        proc.AdditionalGroups = append(proc.AdditionalGroups, strconv.FormatUint(gid, 10))
    }

    for i := range p.Rlimits {
        limit, err := createLibContainerRlimit(p.Rlimits[i])
        if err != nil {
            return nil, err
        }
        proc.Rlimits = append(proc.Rlimits, limit)
    }
    return proc, nil
}

3. CT_ACT_RUN

调用 r.container.Run(process) 一口气运行容器的 cmd

// Run starts process in the container and, if the container was stopped when
// Run was called, immediately releases the init process via exec so that the
// user command runs. The status is sampled under the container lock before
// Start is invoked.
func (c *linuxContainer) Run(process *Process) error {
    c.m.Lock()
    status, err := c.currentStatus()
    c.m.Unlock()
    if err != nil {
        return err
    }

    if err := c.Start(process); err != nil {
        return err
    }
    if status != Stopped {
        return nil
    }
    return c.exec()
}

// exec opens the container's exec fifo for reading and drains it. Receiving
// any data means the init process has been released, so the fifo is removed;
// reading nothing is reported as the container already running.
func (c *linuxContainer) exec() error {
    fifoPath := filepath.Join(c.root, execFifoFilename)
    fifo, err := os.OpenFile(fifoPath, os.O_RDONLY, 0)
    if err != nil {
        return newSystemErrorWithCause(err, "open exec fifo for reading")
    }
    defer fifo.Close()

    payload, err := ioutil.ReadAll(fifo)
    if err != nil {
        return err
    }
    if len(payload) == 0 {
        return fmt.Errorf("cannot start an already running container")
    }
    os.Remove(fifoPath)
    return nil
}

Run 先获取当前容器状态，由前面代码可知此时是 stoppedState，调用 c.Start 去启动容器，最后调用 exec 运行容器进程，通过 exec 代码可知，只是读取了 execFifoFilename 文件内容而已。实际上这个 fifo file 就是用来同步的。

// Start launches process in the container while holding the container lock.
// When the container is currently stopped the process becomes its init: the
// exec fifo is created first, and removed again if starting fails.
func (c *linuxContainer) Start(process *Process) error {
    c.m.Lock()
    defer c.m.Unlock()

    status, err := c.currentStatus()
    if err != nil {
        return err
    }

    isInit := status == Stopped
    if isInit {
        if err := c.createExecFifo(); err != nil {
            return err
        }
    }

    err = c.start(process, isInit)
    if err != nil && isInit {
        c.deleteExecFifo()
    }
    return err
}

继续看 Start 代码，判断当前状态是 Stopped，那么就创建管道文件 execFifoFilename，最后调用 start 去启动

// start creates the parent-side process wrapper for process and starts it.
// For a non-init process the container simply moves to the running state.
// For the init process the container moves to the created state, its state is
// updated, the init start time is recorded and the configured poststart hooks
// run in order. A failing start or hook terminates the process before the
// error is returned.
func (c *linuxContainer) start(process *Process, isInit bool) error {
    parent, err := c.newParentProcess(process, isInit)
    if err != nil {
        return newSystemErrorWithCause(err, "creating new parent process")
    }
    if err := parent.start(); err != nil {
        // terminate the process to ensure that it properly is reaped.
        if termErr := parent.terminate(); termErr != nil {
            logrus.Warn(termErr)
        }
        return newSystemErrorWithCause(err, "starting container process")
    }

    // generate a timestamp indicating when the container was started
    c.created = time.Now().UTC()

    if !isInit {
        c.state = &runningState{
            c: c,
        }
        return nil
    }

    c.state = &createdState{
        c: c,
    }
    state, err := c.updateState(parent)
    if err != nil {
        return err
    }
    c.initProcessStartTime = state.InitProcessStartTime

    hooks := c.config.Hooks
    if hooks == nil {
        return nil
    }
    hookState := configs.HookState{
        Version: c.config.Version,
        ID:      c.id,
        Pid:     parent.pid(),
        Bundle:  utils.SearchLabels(c.config.Labels, "bundle"),
    }
    for i, hook := range hooks.Poststart {
        if err := hook.Run(hookState); err != nil {
            if termErr := parent.terminate(); termErr != nil {
                logrus.Warn(termErr)
            }
            return newSystemErrorWithCausef(err, "running poststart hook %d", i)
        }
    }
    return nil
}

调用 newParentProcess 创建 runc init，然后 start 启动，最后调用各种 hook

// newParentProcess creates the "init" socket pair and the command template
// for the child, then wraps them either in an init process (doInit) or in a
// setns process. Only the init variant receives the container state
// directory, passed as an extra file whose descriptor number is published in
// _LIBCONTAINER_STATEDIR.
//
// Note that the descriptors opened here are not closed on the error paths of
// this function.
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
    parentPipe, childPipe, err := utils.NewSockPair("init")
    if err != nil {
        return nil, newSystemErrorWithCause(err, "creating new init pipe")
    }
    cmd, err := c.commandTemplate(p, childPipe)
    if err != nil {
        return nil, newSystemErrorWithCause(err, "creating new command template")
    }

    if doInit {
        // We only set up rootDir if we're not doing a `runc exec`. The reason for
        // this is to avoid cases where a racing, unprivileged process inside the
        // container can get access to the statedir file descriptor (which would
        // allow for container rootfs escape).
        rootDir, err := os.Open(c.root)
        if err != nil {
            return nil, err
        }
        cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
        // The file just appended is the last extra file; extra files are
        // numbered after the stdio descriptors in the child.
        stateDirFd := stdioFdCount + len(cmd.ExtraFiles) - 1
        cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stateDirFd))
        return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
    }
    return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
}

NewSockPair 生成管道，用于通信
commandTemplate 生成并返回一个 *exec.Cmd，对应的就是 runc init 进程
根据当前是否是初始化，来决定调用 newSetnsProcess 还是 newInitProcess，无论是 runc create 还是 runc run，此时都是调用 newInitProcess

// commandTemplate prepares the exec.Cmd that re-executes the container's init
// arguments. The command inherits the process's stdio and extra files, runs
// from the container rootfs, and receives the optional console socket and the
// child end of the init pipe as further extra files. The descriptor number of
// each is exported through _LIBCONTAINER_CONSOLE and _LIBCONTAINER_INITPIPE.
// The order of the appends determines those numbers.
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
    cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
    cmd.Stdin, cmd.Stdout, cmd.Stderr = p.Stdin, p.Stdout, p.Stderr
    cmd.Dir = c.config.Rootfs
    if cmd.SysProcAttr == nil {
        cmd.SysProcAttr = &syscall.SysProcAttr{}
    }

    cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)

    if p.ConsoleSocket != nil {
        cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
        consoleFd := stdioFdCount + len(cmd.ExtraFiles) - 1
        cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", consoleFd))
    }

    cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
    initPipeFd := stdioFdCount + len(cmd.ExtraFiles) - 1
    cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", initPipeFd))

    return cmd, nil
}

再来看 commandTemplate, 注意这个 initArgs 就是 runc init，另外看 childPipe 也扔到了 cmd.ExtraFiles 中，用于通信

4. newInitProcess

// newInitProcess marks cmd as a standard init through _LIBCONTAINER_INITTYPE,
// collects the namespaces that are joined by path, builds the bootstrap data
// from the clone flags and those paths, and returns the initProcess that ties
// the command, pipes, cgroup manager and init config together. sharePidns
// records whether a path was given for the PID namespace.
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
    cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))

    nsPaths := map[configs.NamespaceType]string{}
    for _, ns := range c.config.Namespaces {
        if ns.Path == "" {
            continue
        }
        nsPaths[ns.Type] = ns.Path
    }
    _, sharePidns := nsPaths[configs.NEWPID]

    bootstrap, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsPaths)
    if err != nil {
        return nil, err
    }

    proc := &initProcess{
        cmd:           cmd,
        childPipe:     childPipe,
        parentPipe:    parentPipe,
        manager:       c.cgroupManager,
        config:        c.newInitConfig(p),
        container:     c,
        process:       p,
        bootstrapData: bootstrap,
        sharePidns:    sharePidns,
        rootDir:       rootDir,
    }
    return proc, nil
}

将环境变量 _LIBCONTAINER_INITTYPE = initStandard 设置到待运行的 runc init cmd 中
根据 namespace 生成 bootstrapData，子进程 runc init 会用到

再来看下最核心的 initProcess.start 函数的实现

// start launches the prepared init command and drives the parent side of its
// bootstrap: it streams the bootstrap data over the parent pipe, runs
// execSetns, records the child's pipe descriptors, applies the cgroup
// manager to the child's pid, creates the network interfaces and sends the
// init config. The parent pipe is closed when start returns.
//
// The tail of the function is elided in this excerpt ("......").
func (p *initProcess) start() error {
    defer p.parentPipe.Close()
    err := p.cmd.Start()
    p.process.ops = p
    // The parent's copies of the child pipe end and the state directory are
    // closed right after Start, whether or not it succeeded.
    p.childPipe.Close()
    p.rootDir.Close()
    if err != nil {
        p.process.ops = nil
        return newSystemErrorWithCause(err, "starting init process command")
    }
    if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
        return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
    }
    if err := p.execSetns(); err != nil {
        return newSystemErrorWithCause(err, "running exec setns process for init")
    }
    // Save the standard descriptor names before the container process
    // can potentially move them (e.g., via dup2()).  If we don't do this now,
    // we won't know at checkpoint time which file descriptor to look up.
    fds, err := getPipeFds(p.pid())
    if err != nil {
        return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
    }
    p.setExternalDescriptors(fds)
    // Do this before syncing with child so that no children can escape the
    // cgroup. We don't need to worry about not doing this and not being root
    // because we'd be using the rootless cgroup manager in that case.
    if err := p.manager.Apply(p.pid()); err != nil {
        return newSystemErrorWithCause(err, "applying cgroup configuration for process")
    }
    // NOTE(review): this closure reads the function-level err declared by
    // p.cmd.Start() above. The failures visible below are returned through
    // `if err := ...` variables scoped to their own statement, so they would
    // not trigger Destroy here — confirm against the elided remainder.
    defer func() {
        if err != nil {
            // TODO: should not be the responsibility to call here
            p.manager.Destroy()
        }
    }()
    if err := p.createNetworkInterfaces(); err != nil {
        return newSystemErrorWithCause(err, "creating network interfaces")
    }
    if err := p.sendConfig(); err != nil {
        return newSystemErrorWithCause(err, "sending config to init process")
    }
    ......
}

p.cmd.Start 运行子进程，也就是 runc init，然后关掉父进程中不再需要的 childPipe 和 rootDir
通过 pipe 把 bootstrapData 发送到子进程，里面的内容是要隔离的 namespaces
调用 execSetns 设置 runc init 进程的 namespaces, 这里面细节非常多，源码只是等待子进程退出，但是 runc init 退出了谁运行容器呢？这里很绕，后面再讲。涉及到 runc init 进程的两次 fork 操作
调用 p.manager.Apply 设置 cgroup 限制资源
调用 createNetworkInterfaces 创建网络资源，
调用 sendConfig 发送容器启动的配置，到 runc init
最后是状态同步过程，不看了，麻烦

子进程 runc init

接下来看最重要的 runc init，负责启动容器，并运行 cmd

import (
    "os"
    "runtime"

    "github.com/opencontainers/runc/libcontainer"
    _ "github.com/opencontainers/runc/libcontainer/nsenter"
    "github.com/urfave/cli"
)

// init runs before main. When the binary was invoked with "init" as its first
// argument it limits the Go scheduler to a single processor and locks the
// calling goroutine to its current OS thread; for any other invocation it
// does nothing.
func init() {
    if len(os.Args) < 2 || os.Args[1] != "init" {
        return
    }
    runtime.GOMAXPROCS(1)
    runtime.LockOSThread()
}

// initCommand is the internal "init" subcommand. It hands control to the
// factory's StartInitialization, which is expected not to return on success.
var initCommand = cli.Command{
    Name:  "init",
    Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
    Action: func(context *cli.Context) error {
        // The error from New is deliberately discarded here.
        factory, _ := libcontainer.New("")
        err := factory.StartInitialization()
        if err == nil {
            // A nil error means initialization returned without exec'ing.
            panic("libcontainer: container init failed to exec")
        }
        // as the error is sent back to the parent there is no need to log
        // or write it to stderr because the parent process will handle this
        os.Exit(1)
        return nil // not reached: os.Exit does not return
    },
}

runc/init.go 文件很短，但是内容超级多。

引入并初始化 github.com/opencontainers/runc/libcontainer/nsenter 库，这里是两次 fork 并初始化 ns 等一系列的关键，参考 README.md
init() 函数里，将 GOMAXPROCS 设为 1，并且调用 LockOSThread 将 main goroutine 锁在了当前线程里。可以参考官方文档
工厂函数，调用 StartInitialization 启动容器，开始干活了

1. nsenter 两次 fork

nsenter.go 有一个 init 函数，它有一个 constructor 属性，GNU C 会保证它在 main 函数前执行，可以理解为构造函数，两次 clone 创建子进程的逻辑就在这里。

/*
#cgo CFLAGS: -Wall
// nsexec is declared here and defined in another translation unit.
extern void nsexec();
// init carries the constructor attribute, so it is run automatically at load
// time, before main; all it does is call nsexec().
void __attribute__((constructor)) init(void) {
    nsexec();
}
*/

/*
 * nlconfig_t groups the bootstrap settings used by nsexec().
 * NOTE(review): the per-field meanings below are read off the field names
 * only — confirm against the parsing code, which is not shown here.
 */
struct nlconfig_t {
    char *data;               /* presumably the raw bootstrap buffer */
    uint32_t cloneflags;      /* presumably the flags passed to clone(2) */
    char *uidmap;             /* uid mapping and its length */
    size_t uidmap_len;
    char *gidmap;             /* gid mapping and its length */
    size_t gidmap_len;
    char *namespaces;         /* presumably the namespace paths to join, and their length */
    size_t namespaces_len;
    uint8_t is_setgroup;      /* flag */
    uint8_t is_rootless;      /* flag */
    char *oom_score_adj;      /* oom_score_adj value and its length */
    size_t oom_score_adj_len;
};

nsexec 函数太长太长了，简单来说，就是从 pipe 中拿到 runc create 或是 runc run 进程传过来的 bootstrapData，就是 nlconfig_t 配置文件，内容是 cloneflags，还有 uid, gid map 用于做 user ns 隔离的，还有 cgroup, oom 等配置，根据这些来限制容器。

nsexec() will first get the file descriptor number for the init pipe from the environment variable _LIBCONTAINER_INITPIPE (which was opened by the parent and kept open across the fork-exec of the nsexec() init process). The init pipe is used to read bootstrap data (namespace paths, clone flags, uid and gid mappings, and the console path) from the parent process. nsexec() will then call setns(2) to join the namespaces provided in the bootstrap data (if available), clone(2) a child process with the provided clone flags, update the user and group ID mappings, do some further miscellaneous setup steps, and then send the PID of the child process to the parent of the nsexec() "caller". Finally, the parent nsexec() will exit and the child nsexec() process will return to allow the Go runtime take over.
NOTE: We do both setns(2) and clone(2) even if we don't have any CLONE_NEW* clone flags because we must fork a new process in order to enter the PID namespace.

为啥要做这么多次 clone 呢，就是因为 PID namespace 的实现必须要求这么做。

2. StartInitialization 启动与运行容器

// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
//
// The descriptor numbers are taken from the _LIBCONTAINER_INITPIPE,
// _LIBCONTAINER_STATEDIR (standard init only) and _LIBCONTAINER_CONSOLE
// (optional) environment variables, all of which are read before the
// environment is cleared. Whenever the function returns, the deferred
// functions report the failure, including a recovered panic, to the parent
// over the init pipe.
func (l *LinuxFactory) StartInitialization() (err error) {
    var (
        pipefd, rootfd int
        consoleSocket  *os.File
        envInitPipe    = os.Getenv("_LIBCONTAINER_INITPIPE")
        envStateDir    = os.Getenv("_LIBCONTAINER_STATEDIR")
        envConsole     = os.Getenv("_LIBCONTAINER_CONSOLE")
    )

    // Get the INITPIPE.
    pipefd, err = strconv.Atoi(envInitPipe)
    if err != nil {
        return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
    }

    var (
        pipe = os.NewFile(uintptr(pipefd), "pipe")
        it   = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
    )
    defer pipe.Close()

    // Only init processes have STATEDIR.
    rootfd = -1
    if it == initStandard {
        if rootfd, err = strconv.Atoi(envStateDir); err != nil {
            return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
        }
    }

    // The console socket is optional; it is wrapped only when the variable is set.
    if envConsole != "" {
        console, err := strconv.Atoi(envConsole)
        if err != nil {
            return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
        }
        consoleSocket = os.NewFile(uintptr(console), "console-socket")
        defer consoleSocket.Close()
    }

    // clear the current process's environment to clean any libcontainer
    // specific env vars.
    os.Clearenv()

    // Deferred functions run in reverse order of registration, so the
    // recover handler registered below runs before this one and may already
    // have stored a panic in the named result err.
    defer func() {
        // We have an error during the initialization of the container's init,
        // send it back to the parent process in the form of an initError.
        // If writing to the pipe fails, the initialization error err (not the
        // write error werr) is what gets printed to stderr.
        if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
            fmt.Fprintln(os.Stderr, err)
            return
        }
        if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
            fmt.Fprintln(os.Stderr, err)
            return
        }
    }()
    // Convert a panic during initialization into an ordinary error, with the
    // stack trace attached.
    defer func() {
        if e := recover(); e != nil {
            err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
        }
    }()

    i, err := newContainerInit(it, pipe, consoleSocket, rootfd)
    if err != nil {
        return err
    }

    // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
    return i.Init()
}

首先就是根据环境变量，初始化一些数据，比如 _LIBCONTAINER_INITPIPE 就是与父进程通信用的 pipe，_LIBCONTAINER_INITTYPE 获取当前启动类型
中间注册了一些 defer 函数用于将错误返回父进程，因为在后面会调用 exec 执行 cmd，理论上这些 defer 不会再执行
newContainerInit 根据启动类型创建对应的 initer，随后调用它的 Init 执行运行逻辑

// newContainerInit decodes the init configuration sent as JSON over pipe,
// populates the process environment from it, and returns the initer matching
// t: a linuxSetnsInit for initSetns or a linuxStandardInit for initStandard.
// The standard variant also records the current parent pid and the state
// directory descriptor. Any other type is an error.
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD int) (initer, error) {
    var config *initConfig
    decodeErr := json.NewDecoder(pipe).Decode(&config)
    if decodeErr != nil {
        return nil, decodeErr
    }
    envErr := populateProcessEnvironment(config.Env)
    if envErr != nil {
        return nil, envErr
    }

    switch t {
    case initSetns:
        setnsInit := &linuxSetnsInit{
            pipe:          pipe,
            consoleSocket: consoleSocket,
            config:        config,
        }
        return setnsInit, nil
    case initStandard:
        standardInit := &linuxStandardInit{
            pipe:          pipe,
            consoleSocket: consoleSocket,
            parentPid:     unix.Getppid(),
            config:        config,
            stateDirFD:    stateDirFD,
        }
        return standardInit, nil
    default:
        return nil, fmt.Errorf("unknown init type %q", t)
    }
}

首先从 pipe 中读取父进程传过来的配置文件 initConfig
根据当前启动类型，来决定是 linuxSetnsInit 还是 linuxStandardInit，因为我们是标准的创建逻辑，所以走 linuxStandardInit

// Init performs the in-container setup for a standard init process and then
// replaces the current process image with the user's command. In order it:
// joins a new session keyring (unless disabled), sets up network and routes,
// prepares and finalizes the rootfs when a new mount namespace is configured,
// sets up the console, hostname, AppArmor profile, process label, sysctls and
// read-only/masked paths, signals readiness to the parent, applies seccomp,
// finalizes the namespace, and finally writes to the exec fifo before calling
// syscall.Exec. It returns only on failure (or nil if Exec itself returns nil).
func (l *linuxStandardInit) Init() error {
    if !l.config.Config.NoNewKeyring {
        ringname, keepperms, newperms := l.getSessionRingParams()

        // do not inherit the parent's session keyring
        sessKeyId, err := keys.JoinSessionKeyring(ringname)
        if err != nil {
            return err
        }
        // make session keyring searchable
        if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
            return err
        }
    }

    if err := setupNetwork(l.config); err != nil {
        return err
    }
    if err := setupRoute(l.config.Config); err != nil {
        return err
    }

    label.Init()

    // prepareRootfs() can be executed only for a new mount namespace.
    if l.config.Config.Namespaces.Contains(configs.NEWNS) {
        if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
            return err
        }
    }

    // Set up the console. This has to be done *before* we finalize the rootfs,
    // but *after* we've given the user the chance to set up all of the mounts
    // they wanted.
    if l.config.CreateConsole {
        if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
            return err
        }
        if err := system.Setctty(); err != nil {
            return err
        }
    }

    // Finish the rootfs setup.
    if l.config.Config.Namespaces.Contains(configs.NEWNS) {
        if err := finalizeRootfs(l.config.Config); err != nil {
            return err
        }
    }

    if hostname := l.config.Config.Hostname; hostname != "" {
        if err := unix.Sethostname([]byte(hostname)); err != nil {
            return err
        }
    }
    if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
        return err
    }
    if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
        return err
    }

    for key, value := range l.config.Config.Sysctl {
        if err := writeSystemProperty(key, value); err != nil {
            return err
        }
    }
    for _, path := range l.config.Config.ReadonlyPaths {
        if err := readonlyPath(path); err != nil {
            return err
        }
    }
    for _, path := range l.config.Config.MaskPaths {
        if err := maskPath(path); err != nil {
            return err
        }
    }
    // Remember the parent-death signal now; it is restored after
    // finalizeNamespace below.
    pdeath, err := system.GetParentDeathSignal()
    if err != nil {
        return err
    }
    if l.config.NoNewPrivileges {
        if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
            return err
        }
    }
    // Tell our parent that we're ready to Execv. This must be done before the
    // Seccomp rules have been applied, because we need to be able to read and
    // write to a socket.
    if err := syncParentReady(l.pipe); err != nil {
        return err
    }
    // Without NoNewPrivileges seccomp is a privileged operation, so we need to
    // do this before dropping capabilities; otherwise do it as late as possible
    // just before execve so as few syscalls take place after it as possible.
    if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
        if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
            return err
        }
    }
    if err := finalizeNamespace(l.config); err != nil {
        return err
    }
    // finalizeNamespace can change user/group which clears the parent death
    // signal, so we restore it here.
    if err := pdeath.Restore(); err != nil {
        return err
    }
    // compare the parent from the initial start of the init process and make sure that it did not change.
    // if the parent changes that means it died and we were reparented to something else so we should
    // just kill ourself and not cause problems for someone else.
    if unix.Getppid() != l.parentPid {
        return unix.Kill(unix.Getpid(), unix.SIGKILL)
    }
    // check for the arg before waiting to make sure it exists and it is returned
    // as a create time error.
    name, err := exec.LookPath(l.config.Args[0])
    if err != nil {
        return err
    }
    // close the pipe to signal that we have completed our init.
    l.pipe.Close()
    // wait for the fifo to be opened on the other side before
    // exec'ing the users process.
    fd, err := unix.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|unix.O_CLOEXEC, 0)
    if err != nil {
        return newSystemErrorWithCause(err, "openat exec fifo")
    }
    if _, err := unix.Write(fd, []byte("0")); err != nil {
        return newSystemErrorWithCause(err, "write 0 exec fifo")
    }
    // With NoNewPrivileges set, seccomp was deferred to this point, as late
    // as possible before the exec (see the comment on the earlier branch).
    if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
        if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
            return newSystemErrorWithCause(err, "init seccomp")
        }
    }
    // close the statedir fd before exec because the kernel resets dumpable in the wrong order
    // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
    unix.Close(l.stateDirFD)
    // Replace this process with the user's command, keeping the current
    // environment; on success this call does not return.
    if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
        return newSystemErrorWithCause(err, "exec user process")
    }
    return nil
}

先看下用 runc run 启动时调用的 linuxStandardInit.Init 逻辑

Keyring 配置缓存的一些安全设置，具体干啥怎么用不看了
setupNetwork 根据配置设置网络接口，setupRoute 设置对应的路由
label.Init 设置 selinux, 工作这么多年一直关着的，都不知道干啥用
判断是否设置了 NEWNS，其实就是 mount ns，准备 rootfs
配置 console, tty, 主机名, sysctl, readonly 乱七八糟的等等
中间一大段 syncParentReady 与前面 fork 的父进程同步，然后父进程就死掉了，最后自己肯定被 runc run 接管了，所以才有 unix.Getppid() != l.parentPid 的判断逻辑
unix.Openat 打开 fifofile，然后写一个 []byte("0") 到管道文件中，并阻塞在这里，直到 runc run 进程将数据读走
最后的最后，syscall.Exec 运行容器的 cmd，由于是系统调用 Exec，当前 runc init 进程的代码段都被 cmd 覆盖，Init 函数也不会返回

小结

想不到 runc 代码量不大，但是逻辑这么复杂。下一步是继续看 docker 的其它组件并分析源码