namespaces 学习笔记1：mount ns 源码实现

2019-10-08 本文已影响0人董泽润

TL;DR 最近想看 docker 相关的实现，自然涉及底层 namespace, 所以边做实验边看源码，感兴趣的先看耗子叔的文章

概览

Mount namespaces, 参数是 CLONE_NEWNS，由于是第一个 namespaces 实现，这个参数名比较特殊。用于隔离文件系统挂载
UTS namespaces, 参数是 CLONE_NEWUTS，隔离 hostname 和 domain name
IPC namespaces, 参数是 CLONE_NEWIPC，隔离 ipc, 信号量，共享内存等等
PID namespaces，参数是 CLONE_NEWPID，隔离 pid
NETWORK namespaces, 参数是 CLONE_NEWNET，隔离网络
USER namespaces, 参数是 CLONE_NEWUSER，隔离 uid, gid 等等

可以通过 /proc/pid/ns 来查看进程拥有哪些 ns, id 相同代表属于同一个 ns

root@iZhp36ik63t96xhzjh00ujZ:~# ls -l /proc/$$/ns
total 0
lrwxrwxrwx 1 root root 0 Oct  5 15:06 cgroup -> 'cgroup:[4026531835]'
lrwxrwxrwx 1 root root 0 Oct  5 15:06 ipc -> 'ipc:[4026531839]'
lrwxrwxrwx 1 root root 0 Oct  5 15:06 mnt -> 'mnt:[4026531840]'
lrwxrwxrwx 1 root root 0 Oct  5 15:06 net -> 'net:[4026531993]'
lrwxrwxrwx 1 root root 0 Oct  5 15:06 pid -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Oct  5 15:06 pid_for_children -> 'pid:[4026531836]'
lrwxrwxrwx 1 root root 0 Oct  5 15:06 user -> 'user:[4026531837]'
lrwxrwxrwx 1 root root 0 Oct  5 15:06 uts -> 'uts:[4026531838]'

测试案例

下面的代码大部分来自耗子叔的文章，唯一区别是调用 mount 将根变成私有

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mount.h>

/*
 * Stack handed to clone() for the child; size is 1 MiB.
 * The buffer is static so it stays valid for the whole lifetime of the child.
 */
#define STACK_SIZE (1024 * 1024)
static char container_stack[STACK_SIZE];

/* argv for the program exec'ed inside the container; NULL-terminated as execv requires. */
char* const container_args[] = {
    "/bin/bash",
    NULL
};

/*
 * Entry point of the cloned child (the "container").
 *
 * Recursively marks every mount under "/" as private, then replaces the
 * process image with the shell named in container_args so the isolated
 * resources can be inspected interactively.
 *
 * arg: unused.
 * Returns 1 if mount() or execv() fails; on success execv() never returns.
 */
int container_main(void* arg)
{
    (void)arg;

    printf("Container - inside the container!\n");

    /*
     * Without MS_PRIVATE the mounts may still be shared with the original
     * namespace, so a failure here would silently defeat the isolation this
     * demo is meant to show. Abort instead of starting the shell.
     */
    if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL) == -1) {
        perror("mount / as MS_REC|MS_PRIVATE");
        return 1;
    }

    /* Exec a shell so we can observe whether resources are isolated. */
    execv(container_args[0], container_args);

    /* Only reached when execv() failed; report why. */
    perror("execv");
    printf("Something's wrong!\n");
    return 1;
}

/*
 * Parent side of the demo: clones a child into a new mount namespace,
 * waits for it to exit, and reports the result.
 *
 * Returns 0 on success, 1 if clone() or waitpid() fails.
 */
int main()
{
    printf("Parent - start a container!\n");

    /*
     * clone() takes the child's entry function and a stack. The pointer
     * passed is the END of the buffer because the stack grows downwards.
     */
    int container_pid = clone(container_main, container_stack+STACK_SIZE, SIGCHLD|CLONE_NEWNS, NULL);
    if (container_pid == -1) {
        /*
         * Typical cause: CLONE_NEWNS needs CAP_SYS_ADMIN (run as root).
         * Without this check waitpid(-1, ...) would be called and the
         * program would falsely report that the container stopped.
         */
        perror("clone");
        return 1;
    }

    /* Wait for the child process to finish. */
    if (waitpid(container_pid, NULL, 0) == -1) {
        perror("waitpid");
        return 1;
    }

    printf("Parent - container stopped!\n");
    return 0;
}

运行后进入容器中，随便 mount 一个目录，我的例子是把 /dev/vda1 挂到 /mnt

~# mount /dev/vda1 /mnt

然后再执行 mount 查看当前挂载了哪些目录

~# mount
......
systemd-1 on /proc/sys/fs/binfmt_misc type autofs (rw,relatime,fd=25,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=12886)
/dev/vda1 on /mnt type ext4 (rw,relatime,errors=remount-ro,data=ordered)
~# ls -l /proc/$$/ns | grep mnt
lrwxrwxrwx 1 root root 0 Oct  6 15:36 mnt -> mnt:[4026532235]

再打开另一个终端执行 mount 发现并没有 /mnt 这一行，并且 mount ns id 也是不同的

~# mount
......
configfs on /sys/kernel/config type configfs (rw,relatime)
fusectl on /sys/fs/fuse/connections type fusectl (rw,relatime)
tmpfs on /run/user/0 type tmpfs (rw,nosuid,nodev,relatime,size=403944k,mode=700)
~# ls -l /proc/$$/ns | grep mnt
lrwxrwxrwx 1 root root 0 Oct  6 15:36 mnt -> mnt:[4026531840]

这个例子有一点小问题，就是 PID 没有隔离，/proc 目录还是和宿主机一样的，所以能看到其它无用进程，再次尝试 clone 时添加 CLONE_NEWPID

int container_pid = clone(container_main, container_stack+STACK_SIZE, SIGCHLD|CLONE_NEWNS|CLONE_NEWPID, NULL);

并且在 mount 后再添加重新挂载 /proc 目录

mount("proc", "/proc", "proc", MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL);

再次编译运行测试程序，并且发现 /bin/bash 的 pid 是 1

~# gcc clone.c && ./a.out
Parent - start a container!
Container - inside the container!
~# ps aux
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root         1  0.0  0.0  23324  3648 pts/2    S    16:15   0:00 /bin/bash
root        11  0.0  0.0  39084  3264 pts/2    R+   16:15   0:00 ps aux

核心结构体

做完测试，我们看下源码实现，先从核心结构体看起

struct task_struct {
    ......
    /* Namespaces: */
    struct nsproxy          *nsproxy;
    ......
}

每个进程结构体 task_struct 都有个 nsproxy 字段

/*
 * Bundle of per-namespace pointers for a task (kernel excerpt).
 * One pointer per namespace kind listed below.
 */
struct nsproxy {
    /* Reference count for this nsproxy (presumably what get_nsproxy() bumps when it is shared). */
    atomic_t count;
    struct uts_namespace *uts_ns;
    struct ipc_namespace *ipc_ns;
    struct mnt_namespace *mnt_ns;
    /* Note the name: this is the PID namespace for children, not a plain "pid_ns". */
    struct pid_namespace *pid_ns_for_children;
    struct net       *net_ns;
    struct cgroup_namespace *cgroup_ns;
};

可以看到，nsproxy 里有所有的不同 namespaces 的指针，count 字段用于引用计数

创建 ns

内核 _do_fork 时，会调用 copy_namespaces 生成 nsproxy

/*
 * Set up tsk->nsproxy for a task being created (abridged kernel excerpt;
 * "......" marks code omitted by the article).
 *
 * flags: clone flags; only the CLONE_NEW* bits tested below matter here.
 * tsk:   the task whose nsproxy is being set up.
 *
 * Returns 0 on success, or the error code carried by the ERR pointer that
 * create_new_namespaces() returned.
 */
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
    struct nsproxy *old_ns = tsk->nsproxy;
    struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
    struct nsproxy *new_ns;

    /*
     * Common case: no new namespace requested. Keep using the existing
     * nsproxy, taking a reference on it via get_nsproxy(), and return.
     */
    if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
                  CLONE_NEWPID | CLONE_NEWNET |
                  CLONE_NEWCGROUP)))) {
        get_nsproxy(old_ns);
        return 0;
    }
    ......
    /* At least one CLONE_NEW* flag is set: build a fresh nsproxy. */
    new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
    if (IS_ERR(new_ns))
        return  PTR_ERR(new_ns);

    tsk->nsproxy = new_ns;
    return 0;
}

首先判断 flags 里如果没有创建任何 ns 的参数，那么调用 get_nsproxy 将原 nsproxy->count 计数加一后直接返回。否则调用 create_new_namespaces 根据 flags 按需创建新的 ns

/*
 * Allocate a new nsproxy and fill in its namespaces (abridged kernel
 * excerpt; only the mount-namespace step is shown, the rest is elided and
 * the excerpt is not closed).
 *
 * Returns the new nsproxy, or ERR_PTR(-ENOMEM) when allocation fails.
 */
static struct nsproxy *create_new_namespaces(unsigned long flags,
    struct task_struct *tsk, struct user_namespace *user_ns,
    struct fs_struct *new_fs)
{
    struct nsproxy *new_nsp;
    int err;

    new_nsp = create_nsproxy();
    if (!new_nsp)
        return ERR_PTR(-ENOMEM);

    /* copy_mnt_ns() returns either a namespace pointer or an ERR pointer. */
    new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
    if (IS_ERR(new_nsp->mnt_ns)) {
        err = PTR_ERR(new_nsp->mnt_ns);
        goto out_ns;
    }
    ......
    return new_nsp;
    ......

这段代码比较好理解，首先创建 nsproxy 结构体，引用计数置为 1，再分别调用 copy_xxxx_ns 创建不同的 ns

mount ns 结构体

/* State of one mount namespace (kernel excerpt). */
struct mnt_namespace {
    /* Reference count (presumably what get_mnt_ns() increments when the namespace is shared). */
    atomic_t        count;
    /* Fields common to every namespace kind; see struct ns_common. */
    struct ns_common    ns;
    /* Root mount of this namespace's mount tree (set from copy_tree() in copy_mnt_ns()). */
    struct mount *  root;
    struct list_head    list;
    struct user_namespace   *user_ns;
    struct ucounts      *ucounts;
    u64         seq;    /* Sequence number to prevent loops */
    wait_queue_head_t poll;
    u64 event;
    unsigned int        mounts; /* # of mounts in the namespace */
    /* Added into `mounts` and reset to 0 by commit_tree(). */
    unsigned int        pending_mounts;
} __randomize_layout;

其中 count 是当前 mount namespaces 引用计数，ns_common 存储一些公用字段。root 是根目录挂载点

/* Fields embedded in every namespace structure (e.g. mnt_namespace.ns). */
struct ns_common {
    atomic_long_t stashed;
    /* Per-namespace-kind callback table; see struct proc_ns_operations. */
    const struct proc_ns_operations *ops;
    /* NOTE(review): presumably the id shown in /proc/<pid>/ns/* links — confirm. */
    unsigned int inum;
};

struct proc_ns_operations {
    const char *name;
    const char *real_ns_name;
    int type;
    struct ns_common *(*get)(struct task_struct *task);
    void (*put)(struct ns_common *ns);
    int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
    struct user_namespace *(*owner)(struct ns_common *ns);
    struct ns_common *(*get_parent)(struct ns_common *ns);
} __randomize_layout;

/*
 * Callback table for the mount namespace: name "mnt", type CLONE_NEWNS.
 * real_ns_name and get_parent are not set here.
 */
const struct proc_ns_operations mntns_operations = {
    .name       = "mnt",
    .type       = CLONE_NEWNS,
    .get        = mntns_get,
    .put        = mntns_put,
    .install    = mntns_install,
    .owner      = mntns_owner,
};

ns_common 结构体最关心的就是 proc_ns_operations 回调结构体，里面有不同 ns 的抽象操作。

copy_mnt_ns 创建

/*
 * Produce the mount namespace for a new task (kernel excerpt).
 *
 * flags:   clone flags; only CLONE_NEWNS is examined.
 * ns:      the current mount namespace (must not be NULL).
 * user_ns: user namespace passed to alloc_mnt_ns() for the new namespace.
 * new_fs:  fs_struct of the new task, may be NULL; its root/pwd mounts are
 *          redirected to the copied mounts.
 *
 * Returns `ns` itself (after get_mnt_ns()) when CLONE_NEWNS is not set;
 * otherwise a new namespace whose tree is a copy of ns->root, or an ERR
 * pointer if allocation or the tree copy fails.
 */
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
        struct user_namespace *user_ns, struct fs_struct *new_fs)
{
    struct mnt_namespace *new_ns;
    struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
    struct mount *p, *q;
    struct mount *old;
    struct mount *new;
    int copy_flags;

    BUG_ON(!ns);

    /* No new mount namespace requested: share the existing one. */
    if (likely(!(flags & CLONE_NEWNS))) {
        get_mnt_ns(ns);
        return ns;
    }

    old = ns->root;

    new_ns = alloc_mnt_ns(user_ns, false); // allocate the struct and initialise its basic fields
    if (IS_ERR(new_ns))
        return new_ns;

    namespace_lock();
    /* First pass: copy the tree topology */
    copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
    /* Crossing into a different user namespace adds CL_SHARED_TO_SLAVE. */
    if (user_ns != ns->user_ns)
        copy_flags |= CL_SHARED_TO_SLAVE;
    new = copy_tree(old, old->mnt.mnt_root, copy_flags);
    if (IS_ERR(new)) {
        /* Undo in reverse order: drop the lock, free the half-built namespace. */
        namespace_unlock();
        free_mnt_ns(new_ns);
        return ERR_CAST(new);
    }
    if (user_ns != ns->user_ns) {
        lock_mount_hash();
        lock_mnt_tree(new);
        unlock_mount_hash();
    }
    new_ns->root = new;
    list_add_tail(&new_ns->list, &new->mnt_list);

    /*
     * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
     * as belonging to new namespace.  We have already acquired a private
     * fs_struct, so tsk->fs->lock is not needed.
     */
    /* p walks the old tree and q the copied tree, in lock-step. */
    p = old;
    q = new;
    while (p) {
        q->mnt_ns = new_ns;
        new_ns->mounts++;
        if (new_fs) {
            /*
             * If the new task's root/pwd sits on old mount p, point it at
             * the copy q (taking a reference) and remember the old mount
             * so its reference can be dropped after unlocking.
             */
            if (&p->mnt == new_fs->root.mnt) {
                new_fs->root.mnt = mntget(&q->mnt);
                rootmnt = &p->mnt;
            }
            if (&p->mnt == new_fs->pwd.mnt) {
                new_fs->pwd.mnt = mntget(&q->mnt);
                pwdmnt = &p->mnt;
            }
        }
        p = next_mnt(p, old);
        q = next_mnt(q, new);
        if (!q)
            break;
        /*
         * Re-synchronise: advance p past old mounts whose mnt_root does
         * not match q's (presumably mounts copy_tree() left out).
         */
        while (p->mnt.mnt_root != q->mnt.mnt_root)
            p = next_mnt(p, old);
    }
    namespace_unlock();

    /* Drop the references on the old mounts that new_fs no longer points at. */
    if (rootmnt)
        mntput(rootmnt);
    if (pwdmnt)
        mntput(pwdmnt);

    return new_ns;
}

首先调用 alloc_mnt_ns 创建 mnt_namespace 结构体，并且初始化一些基本字段。比如 ns.ops 安装 mntns_operations 回调函数，计数 count 初始化为 1，初始化 List 等等
copy_tree 复制父进程的 root vfsmount 挂载拓扑，然后赋给 root 字段并把 root 连到 new_ns->list 尾部
最后遍历所有挂载点，将 new_ns->mounts 计数加一，判断并设置新进程的 pwd 和 root 挂载点

mount 命令与 mount ns 交互

简单来说，mount namespace 就是用来隔离挂载点的，不同 ns 的修改不会影响其它。暂时不考虑 Shared subtrees, 一般我们挂载文件系统时，用 mount 命令，来看一下系统调用的实现

/* mount(2) system-call entry point: forwards all five arguments unchanged to ksys_mount(). */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
        char __user *, type, unsigned long, flags, void __user *, data)
{
    return ksys_mount(dev_name, dir_name, type, flags, data);
}

/*
 * Kernel-side implementation behind the mount syscall (abridged excerpt).
 * The elided code prepares kernel_dev, kernel_type and options before
 * calling do_mount(); note that dir_name is passed through as a user pointer.
 */
int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
           unsigned long flags, void __user *data)
{
    ......
    ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
    ......
}

经过一些组装，最后调用 do_mount

/*
 * Resolve the mount point and dispatch to the handler selected by `flags`
 * (abridged kernel excerpt; flag validation and the computation of
 * mnt_flags/sb_flags are elided).
 *
 * Returns the error from user_path() if the mount point cannot be
 * resolved; otherwise `retval` holds the chosen handler's result (the
 * return statement itself is in the elided part).
 */
long do_mount(const char *dev_name, const char __user *dir_name,
        const char *type_page, unsigned long flags, void *data_page)
{
    struct path path;
    unsigned int mnt_flags = 0, sb_flags;
    int retval = 0;
    ......

    /* ... and get the mountpoint */
    retval = user_path(dir_name, &path);
    if (retval)
        return retval;
   ......

    /*
     * Dispatch order matters: the combined REMOUNT|BIND case is tested
     * before the individual MS_REMOUNT and MS_BIND cases.
     */
    if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
        retval = do_reconfigure_mnt(&path, mnt_flags);
    else if (flags & MS_REMOUNT)
        retval = do_remount(&path, flags, sb_flags, mnt_flags,
                    data_page);
    else if (flags & MS_BIND)
        /* Bind mount; only the MS_REC bit of flags is forwarded. */
        retval = do_loopback(&path, dev_name, flags & MS_REC);
    else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
        /* Change the propagation type of an existing mount. */
        retval = do_change_type(&path, flags);
    else if (flags & MS_MOVE)
        retval = do_move_mount_old(&path, dev_name);
    else
        /* None of the above: mount a new filesystem instance. */
        retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
                      dev_name, data_page);
    ......
}

这段代码中间忽略 flags 的判断与生成。首先是生成 struct path 结构体，即挂载点目录，这块代码有点复杂，不看了，path 结构体包含 dentry 目录项和 vfsmount 目标挂载点。有的时候我们是做 bind mount（MS_BIND，对应 do_loopback，注意它并不是挂载 loop 设备），有的是修改挂载传播类型（shared/private/slave/unbindable），有的是移动挂载点，有的是 remount，我们只看 do_new_mount 如何处理新挂载

/*
 * Create a new mount of filesystem type `fstype` at `path` (abridged
 * kernel excerpt).
 *
 * Returns -EINVAL when fstype is NULL, -ENODEV when the type is unknown,
 * the error from fs_context_for_mount() if it fails, otherwise the first
 * non-zero error from the configuration/mount steps, or 0 on success.
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
            int mnt_flags, const char *name, void *data)
{
    struct file_system_type *type;
    struct fs_context *fc;
    const char *subtype = NULL;
    int err = 0;

    if (!fstype)
        return -EINVAL;

    type = get_fs_type(fstype);
    if (!type)
        return -ENODEV;

    ......

    fc = fs_context_for_mount(type, sb_flags);
    /* The type reference is released right after the context is created, on both paths. */
    put_filesystem(type);
    if (IS_ERR(fc))
        return PTR_ERR(fc);

    /*
     * Each step below runs only if all previous steps succeeded, so `err`
     * ends up holding the first failure.
     */
    if (subtype)
        err = vfs_parse_fs_string(fc, "subtype",
                      subtype, strlen(subtype));
    if (!err && name)
        err = vfs_parse_fs_string(fc, "source", name, strlen(name));
    if (!err)
        err = parse_monolithic_mount_data(fc, data);
    if (!err)
        err = vfs_get_tree(fc);
    if (!err)
        err = do_new_mount_fc(fc, path, mnt_flags);

    /* The context is released whether or not the mount succeeded. */
    put_fs_context(fc);
    return err;
}

这里有个重要的结构体 fs_context, 用于封装创建或重新配置 superblock 的上下文。

get_fs_type 获取待挂载的文件系统类型，比如 proc, ext3, xfs 等等，用于多态处理
fs_context_for_mount 根据文件系统类型，来创建相应的 fs_context, 深入代码会发现，调用 init_fs_context 初始化，主要是设置回调结构体 legacy_fs_context_ops
如果有 subtype, 调用 vfs_parse_fs_string 初始化子类型
调用 vfs_parse_fs_string 初始化待挂载设备参数
parse_monolithic_mount_data 初始化待挂载的 kv 参数，暂时不看
vfs_get_tree 实际上调用具体文件系统的 mount 函数，比如 ext4_mount 生成待挂载设备的 superblock 和根目录项 dentry，并赋值给 fc->root
最后 do_new_mount_fc 做真正的挂载操作，并更新 mount namespace

/*
 * Turn a prepared fs_context into a mount and attach it at `mountpoint`
 * (abridged kernel excerpt).
 *
 * Returns the error from vfs_create_mount() if it fails, otherwise the
 * result of do_add_mount().
 */
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
               unsigned int mnt_flags)
{
    struct vfsmount *mnt;
    struct super_block *sb = fc->root->d_sb;
    int error;
    ......
    mnt = vfs_create_mount(fc);
    if (IS_ERR(mnt))
        return PTR_ERR(mnt);

    error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
    /* Attaching failed: drop the reference on the mount created above. */
    if (error < 0)
        mntput(mnt);
    return error;
}

调用 vfs_create_mount 创建待挂载设备的 vfsmount 及 mount 结构体，各种初始化，设置 superblock 及 root dentry
最后调用 do_add_mount 将待挂载设备的 vfsmount 挂到对应 path 的 mount namespace tree 下面

/*
 * Attach `newmnt` at `path` (heavily abridged kernel excerpt; validation,
 * the declaration of `err`, unlocking and the return are elided).
 */
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
    struct mountpoint *mp;
    struct mount *parent;

    /* Obtain the mountpoint object for `path`. */
    mp = lock_mount(path);
    ......
    /* The mount that `path` lives on becomes the parent of the new mount. */
    parent = real_mount(path->mnt);
    ......
    err = graft_tree(newmnt, parent, mp);
    ......
}

lock_mount 获取挂载点的 mountpoint
获取挂载点的 parent mount，注意，这个 parent 实际上就是在目标 mount namespace 下的
调用 graft_tree，将 newmount 设备挂载到 mp 上，这块最后工作的是 attach_recursive_mnt

/*
 * Attach `source_mnt` under `dest_mnt` at `dest_mp` (abridged kernel
 * excerpt; propagation handling and error paths are elided).
 *
 * parent_path: non-NULL when an existing mount is being moved, in which
 *              case source_mnt is first detached from it; NULL for a new
 *              mount.
 *
 * Returns 0 on the path shown.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
            struct mount *dest_mnt,
            struct mountpoint *dest_mp,
            struct path *parent_path)
{
    struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
    HLIST_HEAD(tree_list);
    /* Namespace of the destination mount. */
    struct mnt_namespace *ns = dest_mnt->mnt_ns;
    struct mountpoint *smp;
    struct mount *child, *p;
    struct hlist_node *n;
    int err;

    /* Preallocate a mountpoint in case the new mounts need
     * to be tucked under other mounts.
     */
    smp = get_mountpoint(source_mnt->mnt.mnt_root);
    ......
    if (parent_path) {
        /* Move: detach from the old location, attach at the new one. */
        detach_mnt(source_mnt, parent_path);
        attach_mnt(source_mnt, dest_mnt, dest_mp);
        touch_mnt_namespace(source_mnt->mnt_ns);
    } else {
        if (source_mnt->mnt_ns) {
            /* move from anon - the caller will destroy */
            list_del_init(&source_mnt->mnt_ns->list);
        }
        /* New mount: record parent/mountpoint, then commit_tree() assigns the namespace. */
        mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
        commit_tree(source_mnt);
    }

    ......
    return 0;
}

这个函数是将 source_mnt 挂载到 dest_mp，另外还处理了 MOVE 情况，也就是说如果 parent 不为空，那么要先从 parent 上 detach, 再 attach 到 dest_mp, 本次只看新挂载的情况，也不看 shared 的情况

source_mnt->mnt_ns 如果待挂载设备有 mnt_ns，那么清除掉，因为要设置成 dest_mp 的
调用 mnt_set_mountpoint，设置挂载相关字段。所谓挂载，就是待挂载设备 source_mnt.mnt_parent 字段指向父挂载 dest_mnt，source_mnt.mnt_mp 指向挂载点 dest_mp，并将 source_mnt 连到 dest_mp.m_list 链表上
调用 commit_tree 更新 source_mnt 的 mount namespace。遍历 mnt_list ，如果 source_mnt 还有 child mount, 递归更新 mount namespace

/*
 * Make `mnt`, and every mount linked on its mnt_list, a member of its
 * parent's mount namespace, then attach it to the parent (kernel excerpt).
 *
 * mnt->mnt_parent must already be set and must differ from mnt itself.
 */
static void commit_tree(struct mount *mnt)
{
    struct mount *parent = mnt->mnt_parent;
    struct mount *m;
    LIST_HEAD(head);
    /* Target namespace is the parent's namespace. */
    struct mnt_namespace *n = parent->mnt_ns;

    BUG_ON(parent == mnt);

    /*
     * Splice a temporary list head into mnt's mnt_list so that the loop
     * below visits mnt and every mount chained to it.
     */
    list_add_tail(&head, &mnt->mnt_list);
    list_for_each_entry(m, &head, mnt_list)
        m->mnt_ns = n;

    /* Append the whole chain to the end of the namespace's list. */
    list_splice(&head, n->list.prev);

    /* Fold the pending count into the namespace's mount count and reset it. */
    n->mounts += n->pending_mounts;
    n->pending_mounts = 0;

    __attach_mnt(mnt, parent);
    touch_mnt_namespace(n);
}

小结

这一块还需要大量 vfs 相关知识，以后还需要多理解多看。感觉 namespace 就像族谱一样的树形结构，docker 隔离时相当于重新定制了一份族谱，只有一个人，那么他就是祖先。