runc 启动容器过程分析（附 CVE-2019-5736 实现过程）

环境

OCI runtime spec 地址：https://github.com/opencontainers/runtime-spec
runc 地址：https://github.com/opencontainers/runc/
Commit：f414f497b50a61750ea3af9fccf998a3db687cea
系统版本：Fedora Release 28
内核版本：4.17.9-200.fc28.x86_64

runc 介绍

runc 实现了 OCI 的容器标准，能够管理容器的生命周期。runc 的详细功能请参考帮助文档。

runc 不是基于 server 形式的，所以所有的配置和状态都会存储在本地文件系统中（以下均为使用 docker 时的默认路径）：

容器配置：/run/docker/libcontainerd/{container-id}/config.json
容器 init 进程的标准输入输出流：/run/docker/libcontainerd/{container-id}/{init-stdin,init-stdout,init-stderr}
容器状态信息：/run/runc/*/state.json

runc 创建容器时会将状态记录到 state.json 中，所有查询都是从 state.json 中取得容器基本信息，然后再从系统中获取容器实时状态。

docker 的调用链如下：

docker-client -> dockerd -> docker-containerd -> docker-containerd-shim -> runc（容器外） -> runc（容器内） -> container-entrypoint

runc 启动容器过程

runc 在被 docker-containerd-shim 调用时，参数中会指定容器的配置路径（即 config.json 的位置），同时容器的根路径也已经准备完毕，因此 runc 不会有跟镜像相关的概念。容器的启动过程分析直接从 runc run 开始，即 docker 调用链中的 runc（容器外）这个时间点。

runc（容器外）环境准备

读取 config.json（github.com/opencontainers/runc/run.go#65）：

// 读取 config.json
spec, err := setupSpec(context)
if err != nil {
	return err
}
// 启动容器
status, err := startContainer(context, spec, CT_ACT_RUN, nil)
if err == nil {
	os.Exit(status)
}
return err

startContainer 创建容器信息，并启动（github.com/opencontainers/runc/utils_linux.go#396）：

func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
    // 通过 spec 创建容器结构，在 createContainer 中将 spec 转换为了 runc 的 container config
	container, err := createContainer(context, id, spec)
	if err != nil {
		return -1, err
	}
    // 构建 runner 启动容器
	r := &runner{
		// 容器
		container:       container,
		// 即 CT_ACT_RUN
		action:          action,
		// 用于设置 process.Init 字段
		init:            true,
	}
	return r.run(spec.Process)
}

r.run() 启动容器（github.com/opencontainers/runc/utils_linux.go#268）：

func (r *runner) run(config *specs.Process) (int, error) {
	// 根据 config 构建容器进程，此处 r.init 为 true
	process, err := newProcess(*config, r.init)
	if err != nil {
		r.destroy()
		return -1, err
	}

    // 根据 action 调用 container 的对应方法
	switch r.action {
	case CT_ACT_CREATE:
		err = r.container.Start(process)
	case CT_ACT_RESTORE:
		err = r.container.Restore(process, r.criuOpts)
    case CT_ACT_RUN:
        // 此处调用的是这个方法
		err = r.container.Run(process)
	default:
		panic("Unknown action")
	}
}

container 是由 createContainer() 方法创建，根据创建链路 createContainer() -> loadFactory() -> libcontainer.New() 确认容器由 LinuxFactory.Create() 创建：

// github.com/opencontainers/runc/libcontainer/factory_linux.go#132
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
	l := &LinuxFactory{
        // 指向当前的 exe 程序，即 runc 本身
        InitPath:  "/proc/self/exe",
        // os.Args[0] 是当前 runc 的路径，本质上和 InitPath 是一样的，即 runc init
		InitArgs:  []string{os.Args[0], "init"},
	}
	return l, nil
}

// github.com/opencontainers/runc/libcontainer/factory_linux.go#189
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
    // 创建 linux 容器结构
	c := &linuxContainer{
        // 容器 ID
        id:            id,
        // 容器状态文件存放目录，默认是 /run/runc/{容器 id}/
        root:          containerRoot,
        // 容器配置
        config:        config,
        // 即 /proc/self/exe，就是 runc
        initPath:      l.InitPath,
        // 即 runc init
		initArgs:      l.InitArgs,
	}
	return c, nil
}

所以整个容器的启动逻辑在 linuxContainer.Run() 里，调用链是 linuxContainer.Run() -> linuxContainer.Start() -> linuxContainer.start()：

// github.com/opencontainers/runc/libcontainer/container_linux.go#334
func (c *linuxContainer) start(process *Process) error {
    // process 是容器的 entrypoint，此处创建的是 entrypoint 的父进程
	parent, err := c.newParentProcess(process)
	if err != nil {
		return newSystemErrorWithCause(err, "creating new parent process")
    }
    // 启动父进程
	if err := parent.start(); err != nil {
		// terminate the process to ensure that it properly is reaped.
		if err := ignoreTerminateErrors(parent.terminate()); err != nil {
			logrus.Warn(err)
		}
		return newSystemErrorWithCause(err, "starting container process")
	}
}

func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
    // 创建用于父子进程通信的 pipe
	parentPipe, childPipe, err := utils.NewSockPair("init")
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new init pipe")
    }
    // 创建父进程的 cmd
	cmd, err := c.commandTemplate(p, childPipe)
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new command template")
	}
	if !p.Init {
        // 由于 p.Init 为 true，所以不会执行到这里
		return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
	}

    // 返回标准 init 进程
	return c.newInitProcess(p, cmd, parentPipe, childPipe)
}

func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
    // 这里可以看到 cmd 就是 runc init
	cmd := exec.Command(c.initPath, c.initArgs[1:]...)
    cmd.Args[0] = c.initArgs[0]
    // 将设置给容器 entrypoint 的 std 流给了 runc init 命令，这些流最终会通过 runc init 传递给 entrypoint 
	cmd.Stdin = p.Stdin
	cmd.Stdout = p.Stdout
    cmd.Stderr = p.Stderr
    
    // 这个 childPipe 用于跟父进程通信（父进程就是当前这个 runc 进程）
    cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe)
    // 通过环境变量 _LIBCONTAINER_INITPIPE 把 fd 号传递给 runc init，由于 std 流会占用前三个 fd 编号（0，1，2）
    // 所以 fd 要加上 3（stdioFdCount）
    cmd.Env = append(cmd.Env,
		fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
	)
	return cmd, nil
}

func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
    // 这里通过环境变量 _LIBCONTAINER_INITTYPE 设置 init 类型为 standard（initStandard）
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
	nsMaps := make(map[configs.NamespaceType]string)
	for _, ns := range c.config.Namespaces {
		if ns.Path != "" {
			nsMaps[ns.Type] = ns.Path
		}
	}
    _, sharePidns := nsMaps[configs.NEWPID]
    // 构造 namespace 设置，然后序列化成字节数据
	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
	if err != nil {
		return nil, err
	}
	init := &initProcess{
		cmd:             cmd,
		childPipe:       childPipe,
		parentPipe:      parentPipe,
		manager:         c.cgroupManager,
        intelRdtManager: c.intelRdtManager,
        
		config:          c.newInitConfig(p),
		container:       c,
		process:         p,
		bootstrapData:   data,
		sharePidns:      sharePidns,
	}
	c.initProcess = init
	return init, nil
}

在 linuxContainer.start() 中，创建了一个命令是 runc init 的初始化进程（initProcess），并启动了该进程，这里是 runc（容器外）的最核心的逻辑：

// github.com/opencontainers/runc/libcontainer/process_linux.go#262
func (p *initProcess) start() error {
    defer p.parentPipe.Close()
    // 启动了 cmd，即启动了 runc init
	err := p.cmd.Start()
	p.process.ops = p
	p.childPipe.Close()
	if err != nil {
		p.process.ops = nil
		return newSystemErrorWithCause(err, "starting init process command")
	}

    // 将 bootstrapData 写入到 parent pipe 中，此时 runc init 可以从 child pipe 里读取到这个数据
	if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
		return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
    }
    
    // 从 parent pipe 中读取 runc init 写回的 PID 信息，这里拿到的是最终子进程（即孙 runc init）的 PID
    childPid, err := p.getChildPid()
	if err != nil {
		return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
	}

	// 如果子容器的配置中要求创建新的 CGROUP Namespace，那么这里还要向 parent pipe 写入一个字节的数据 0x80（createCgroupns）
	if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
		if _, err := p.parentPipe.Write([]byte{createCgroupns}); err != nil {
			return newSystemErrorWithCause(err, "sending synchronization value to init process")
		}
	}

	// 等待 runc init 退出
	if err := p.waitForChildExit(childPid); err != nil {
		return newSystemErrorWithCause(err, "waiting for our first child to exit")
	}
    
    // 向 parent pipe 中写入 container config，也就是把容器配置传递给了 runc init
    // 为什么 runc init 都退出了，还要往里面写配置？==》这个问题下面说到 runc init 的时候再解释
	if err := p.sendConfig(); err != nil {
		return newSystemErrorWithCause(err, "sending config to init process")
	}
	var (
		sentRun    bool
		sentResume bool
	)
    // 从 parent pipe 中读取来自 runc init 的同步消息
	ierr := parseSync(p.parentPipe, func(sync *syncT) error {
		...
		return nil
	})
	return nil
}

总结：

runc 被 docker-containerd-shim 调用后，从 config.json 中读取 container spec，并转换成内部 config
这个 runc 在外部运行，拥有 root 权限
runc 启动了一个子进程，runc init，然后通过 pipe 将 bootstrapData（含有 namespace 信息），0x80（NEWCGROUP），容器 config 传输给 runc init，并开始等待 runc init 的同步消息

runc（容器内）启动过程

严格来说，容器外的 runc 启动的 runc init 一开始仍然运行在容器外部，但是它会逐步地切换、限制自身的 namespace 来构建容器环境，因此这里直接算作容器内的 runc。

runc init 命令启动：

package main

import (
	"os"
	"runtime"

    "github.com/opencontainers/runc/libcontainer"
    // 这个包非常重要，是 runc init 启动的基石
	_ "github.com/opencontainers/runc/libcontainer/nsenter"
	"github.com/urfave/cli"
)

func init() {
	if len(os.Args) > 1 && os.Args[1] == "init" {
		runtime.GOMAXPROCS(1)
		runtime.LockOSThread()
	}
}

var initCommand = cli.Command{
	Name:  "init",
	Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
	Action: func(context *cli.Context) error {
        // 构造了一个空的 factory
        factory, _ := libcontainer.New("")
        // 初始化容器环境
		if err := factory.StartInitialization(); err != nil {
			os.Exit(1)
		}
		panic("libcontainer: container init failed to exec")
	},
}

由于 nsenter 包被匿名引入，而且利用了 GCC 构造器特性，导致 go 的代码最后才会执行，因此先看 nsenter 包的代码（github.com/opencontainers/runc/libcontainer/nsenter/nsenter.go）：

// +build linux,!gccgo

package nsenter

/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
	nsexec();
}
*/
import "C"

这个代码利用了 GCC 的 constructor 特性，init 会在 runtime.main()（不是 main.main()）函数之前执行，这样保证了启动时是单线程的，这一点很重要。因为 linux 不允许在多线程中通过 setns 设置 user namespace。

这个初始化函数调用了 nsexec()（github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c#540）：

void nsexec(void)
{
	int pipenum;
	jmp_buf env;
	int sync_child_pipe[2], sync_grandchild_pipe[2];
	struct nlconfig_t config = { 0 };

	// 从环境变量 _LIBCONTAINER_INITPIPE 中取得 child pipe 的 fd 编号
	pipenum = initpipe();
    if (pipenum == -1)
        // 由于正常启动的 runc 是没有这个环境变量的，所以这里会直接返回，然后就开始正常的执行 go 程序了
		return;

    // 确保当前的二进制文件是已经复制过的，用来规避 CVE-2019-5736 漏洞
    // ensure_cloned_binary 中使用了两种方法：
    // - 使用 memfd，将二进制文件写入 memfd，然后重启 runc
    // - 复制二进制文件到临时文件，然后重启 runc
	if (ensure_cloned_binary() < 0)
		bail("could not ensure we are a cloned binary");

	// 从 child pipe 中读取 namespace config
	nl_parse(pipenum, &config);

	// 设置 oom score，这个只能在特权模式下设置，所以在这里就要修改完成
	update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);

	// 设置不可 dump
	if (config.namespaces) {
		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
			bail("failed to set process as non-dumpable");
	}

	// 创建和子进程通信的 pipe，为什么有这个 pipe，下面解释
	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
		bail("failed to setup sync pipe between parent and child");

	// 创建和孙进程通信的 pipe，为什么有这个 pipe，下面解释
	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
        bail("failed to setup sync pipe between parent and grandchild");
    
    // setjmp 将当前执行位置的环境保存下来，用于多进程环境下的程序跳转
    // 第一次执行的时候 setjmp 返回 0，对应 JUMP_PARENT
	switch (setjmp(env)) {
	case JUMP_PARENT:{
			int len;
			pid_t child, first_child = -1;
			bool ready = false;

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);

            // clone_parent 创建了和当前进程完全一致的一个进程（子进程）
            // 在 clone_parent 中，通过 longjmp() 跳转到 env 保存的位置
            // 并且 setjmp 返回值为 JUMP_CHILD
            // 这样这个子进程就会根据 switch 执行到 JUMP_CHILD 分支
            // 而当前 runc init 和 子 runc init 之间通过上面创建的
            // sync_child_pipe 进行同步通信
			child = clone_parent(&env, JUMP_CHILD);
			if (child < 0)
				bail("unable to fork: child_func");

            // 通过 sync_child_pipe 循环读取来自子进程的消息
			while (!ready) {
				enum sync_t s;
				int ret;

				syncfd = sync_child_pipe[1];
				close(sync_child_pipe[0]);

				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with child: next state");

				switch (s) {
				case SYNC_ERR:
					/* We have to mirror the error code of the child. */
					if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
						bail("failed to sync with child: read(error code)");

					exit(ret);
				case SYNC_USERMAP_PLS:
					// 这里设置 user map，因为子进程修改自身的 user namespace 之后，就没有权限再设置 user map 了

					if (config.is_rootless_euid && !config.is_setgroup)
						update_setgroups(child, SETGROUPS_DENY);

					/* Set up mappings. */
					update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
					update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);

                    // 向子进程发送 SYNC_USERMAP_ACK，表示处理完成
					s = SYNC_USERMAP_ACK;
					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
						kill(child, SIGKILL);
						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
					}
					break;
				case SYNC_RECVPID_PLS:{
						first_child = child;
                        // 接收孙进程（还是 runc init）的 pid
						/* Get the init_func pid. */
						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
							kill(first_child, SIGKILL);
							bail("failed to sync with child: read(childpid)");
						}

						// 向子进程发送 SYNC_RECVPID_ACK，表示处理完成
						s = SYNC_RECVPID_ACK;
						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
							kill(first_child, SIGKILL);
							kill(child, SIGKILL);
							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
						}

                        // 通过容器外传进来的 child pipe 把子和孙进程 PID，写回去，然后让容器外的 runc 接管 PID
                        // 这个是因为 clone_parent 的时候参数传了 CLONE_PARENT，导致子孙的父进程都是容器外的那个 runc
                        // 所以当前进程无法接管这些 PID
						len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
						if (len < 0) {
							kill(child, SIGKILL);
							bail("unable to generate JSON for child pid");
						}
					}
					break;
                case SYNC_CHILD_READY:
                    // 子进程已经处理完了所有事情，退出循环
					ready = true;
					break;
				default:
					bail("unexpected sync value: %u", s);
				}
			}

            // 通过 sync_grandchild_pipe 循环读取来自孙进程的消息
			ready = false;
			while (!ready) {
				enum sync_t s;
				int ret;

				syncfd = sync_grandchild_pipe[1];
				close(sync_grandchild_pipe[0]);

				s = SYNC_GRANDCHILD;
				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
					kill(child, SIGKILL);
					bail("failed to sync with child: write(SYNC_GRANDCHILD)");
				}

				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with child: next state");

				switch (s) {
				case SYNC_ERR:
					if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
						bail("failed to sync with child: read(error code)");

					exit(ret);
                case SYNC_CHILD_READY:
                    // 等待孙进程准备完成
					ready = true;
					break;
				default:
					bail("unexpected sync value: %u", s);
				}
            }
            // 退出。很明显，当前 runc init 退出的时候，子 runc init 一定也退出了，但是孙 runc init 还没有退出
            // 这也是为什么容器外的 runc 等待子进程退出，却又向 pipe 里写数据的原因，因为孙 runc init 还在等着容器配置
            // 进程正常退出（不给 go 代码执行的机会）
			exit(0);
		}
	case JUMP_CHILD:{
			pid_t child;
			enum sync_t s;

			/* We're in a child and thus need to tell the parent if we die. */
			syncfd = sync_child_pipe[0];
			close(sync_child_pipe[1]);

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);

			// 通过 setns 加入现有的 namespace
			if (config.namespaces)
				join_namespaces(config.namespaces);

            // 如果 clone flag 里有 CLONE_NEWUSER，说明需要创建新的 user namespace，此处调用 unshare 进行了处理
			if (config.cloneflags & CLONE_NEWUSER) {
				if (unshare(CLONE_NEWUSER) < 0)
					bail("failed to unshare user namespace");
				config.cloneflags &= ~CLONE_NEWUSER;

				if (config.namespaces) {
					if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
						bail("failed to set process as dumpable");
                }
                
                // 等待父 runc init 配置 user map
				s = SYNC_USERMAP_PLS;
				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");

				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
				if (s != SYNC_USERMAP_ACK)
					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);

				if (config.namespaces) {
					if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
						bail("failed to set process as dumpable");
				}

				// 设置当前进程的 uid 为 0，即容器内的 root
				if (setresuid(0, 0, 0) < 0)
					bail("failed to become root in user namespace");
            }
            
			// unshare 其他需要新建的 namespace
			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
				bail("failed to unshare namespaces");

			// 创建孙进程，当前进程已经完成了 namespace 的设置，孙进程会继承这些设置
			child = clone_parent(&env, JUMP_INIT);
			if (child < 0)
				bail("unable to fork: init_func");

			// 将孙进程 PID 传给父 runc init
			s = SYNC_RECVPID_PLS;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
			}
			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: write(childpid)");
			}

			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
			}
			if (s != SYNC_RECVPID_ACK) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
			}

            // 发送 SYNC_CHILD_READY 给父 runc init
			s = SYNC_CHILD_READY;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
				kill(child, SIGKILL);
				bail("failed to sync with parent: write(SYNC_CHILD_READY)");
			}

            // 子 runc init 的工作到此结束，进程正常退出（不给 go 代码执行的机会）
			exit(0);
		}

	case JUMP_INIT:{
			// 孙 runc init 是真正启动容器 entrypoint 的进程，并且在启动之前，进行最后的环境准备工作
			enum sync_t s;

			/* We're in a child and thus need to tell the parent if we die. */
			syncfd = sync_grandchild_pipe[0];
			close(sync_grandchild_pipe[1]);
			close(sync_child_pipe[0]);
			close(sync_child_pipe[1]);

			/* For debugging. */
			prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);

			if (read(syncfd, &s, sizeof(s)) != sizeof(s))
				bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
			if (s != SYNC_GRANDCHILD)
				bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);

			if (setsid() < 0)
				bail("setsid failed");

			if (setuid(0) < 0)
				bail("setuid failed");

			if (setgid(0) < 0)
				bail("setgid failed");

			if (!config.is_rootless_euid && config.is_setgroup) {
				if (setgroups(0, NULL) < 0)
					bail("setgroups failed");
			}

			// 等待来自容器外 runc 的 child pipe 的关于 cgroup namespace 的消息 0x80（CREATECGROUPNS）
			if (config.cloneflags & CLONE_NEWCGROUP) {
				uint8_t value;
				if (read(pipenum, &value, sizeof(value)) != sizeof(value))
					bail("read synchronisation value failed");
				if (value == CREATECGROUPNS) {
					if (unshare(CLONE_NEWCGROUP) < 0)
						bail("failed to unshare cgroup namespace");
				} else
					bail("received unknown synchronisation value");
			}

            // 发送孙进程准备完成的消息给祖父 runc init
			s = SYNC_CHILD_READY;
			if (write(syncfd, &s, sizeof(s)) != sizeof(s))
				bail("failed to sync with patent: write(SYNC_CHILD_READY)");

			/* Close sync pipes. */
			close(sync_grandchild_pipe[0]);

			/* Free netlink data. */
			nl_free(&config);

            // 此时，父 / 祖父 runc init 都退出了（可能会有时差）
            // 但是当前进程是不能直接退出的，所以这里单纯的 return，然后开始执行 go 代码
			return;
		}
	default:
		bail("unexpected jump value");
	}

	/* Should never be reached. */
	bail("should never be reached");
}

在 namespace 初始化完成后，会通过调用链 LinuxFactory.StartInitialization() -> newContainerInit() 创建容器初始化结构 linuxStandardInit（github.com/opencontainers/runc/libcontainer/init_linux.go#47）：

func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
	var config *initConfig
    // 此处从 child pipe 中读取了 container config
	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
		return nil, err
	}
	if err := populateProcessEnvironment(config.Env); err != nil {
		return nil, err
    }
    // t 为 standard，来自于环境变量 _LIBCONTAINER_INITTYPE
	switch t {
	case initSetns:
		return &linuxSetnsInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			config:        config,
		}, nil
	case initStandard:
		return &linuxStandardInit{
			pipe:          pipe,
			consoleSocket: consoleSocket,
			parentPid:     unix.Getppid(),
			config:        config,
			fifoFd:        fifoFd,
		}, nil
	}
	return nil, fmt.Errorf("unknown init type %q", t)
}

然后执行 linuxStandardInit.Init()（github.com/opencontainers/runc/libcontainer/standard_init_linux.go#47）：

func (l *linuxStandardInit) Init() error {
    // 这里比较重要的是这个函数，此时各个 Namespace 虽然都设置完毕了，但是当前的进程的视角里根目录和容器外是一样的
    // 因此这个方法会挂载设备，bind mount，然后将当前根目录切换到容器的根目录下。
	if err := prepareRootfs(l.pipe, l.config); err != nil {
		return err
	}

	// 设置 root (/) 为只读
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := finalizeRootfs(l.config.Config); err != nil {
			return err
		}
	}

	// 在完成一系列容器内的环境准备之后，通过 execve 执行容器内的 entrypoint
	if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
		return newSystemErrorWithCause(err, "exec user process")
	}
	return nil
}

总结：

runc init 一共会有三个进程
- 第一个进程读取 bootstrapData，并完成第二个进程的 user map 的设置
- 第二个进程完成 namespace 的设置
- 第三个进程完成 CGROUP namespace 的设置，并读取了 0x80 的同步信息。最后进入 go 代码。go 代码读取 container config，进行容器内环境准备，最后执行容器的 entrypoint

CVE-2019-5736 过程分析

链接：https://seclists.org/oss-sec/2019/q1/119

通过构造一个恶意的容器，替换掉 runc 执行程序。runc 被再次执行时，恶意代码即可拿到 root 权限。

过程：

在 runc init 的最后一个阶段，runc 会加载容器的 entrypoint
我们伪造一个容器，它具备以下两个要素：
- entrypoint 链接到 /proc/self/exe
- 含有恶意代码的 libc.so（或者其他任意 so，只要会被 runc 加载就行）
当 runc init 最后通过 execve 启动 entrypoint 时，由于 entrypoint 指向了 /proc/self/exe，那么实际上就等于执行了 runc 自身
runc init 被替换，但是容器内的 runc 启动了，由于现在 rootfs 已经是容器的 rootfs 了，所以 so 会从容器内加载，这样就会加载到含有恶意代码的 libc.so
libc.so 的恶意代码在 constructor 里，所以一加载这个 so，这个代码就会执行。恶意代码通过 open 系统调用去只读形式打开 /proc/self/exe（只能以只读形式，因为 runc 在运行），这个时候就会有一个对应的 fd 保留下来
恶意代码这个时候通过 execve 去执行容器内的一个程序，这样不会导致 PID 发生变化，但是程序改变了，并且 fd 继续保留了下来
程序的工作就是找到 fd 编号，就在 /proc/self/fd/ 中，然后再以写的方式重新打开这个 fd（这个时候因为 runc 已经退出了，所以可以以写的方式打开）。然后写入包含恶意代码的 runc。
在下次宿主机上的 runc 再被执行时，这个恶意代码即可执行，并且拥有 runc 的权限，即 root 权限。