程序员

Linux Kernel Mount 流程分析

2019-05-10  本文已影响3人  jerryyyq

调用栈

入口

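入口有两个(原文此处的列表缺失;推测为 fs/namespace.c 中的 mount 系统调用 SYSCALL_DEFINE5(mount, ...) 与 fs/compat.c 中的 compat_sys_mount,待核实):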

之后,都会调用: fs/namespace.c 里的 do_mount 函数,之后的调用栈如下:

'fs/namespace.c' do_mount, to call -> do_new_mount
'fs/namespace.c' do_new_mount, to call -> vfs_kern_mount
'fs/namespace.c' vfs_kern_mount, to call -> mount_fs
'fs/super.c' mount_fs to call -> type->mount

nfs

此处的 type->mount 是个函数指针,各个文件系统类型(例如 nfs)分别声明并注册自己的 mount 实现。
nfs 的 mount 实现函数为:'nfs/super.c' nfs_fs_mount
后续调用栈如下:

nfs_fs_mount 一

'nfs/super.c' nfs_fs_mount, to call -> nfs23_validate_mount_data
'nfs/super.c' nfs_fs_mount, to call -> nfs_validate_text_mount_data。调用参数示例:(options = nolock,addr=192.168.0.120, args->nfs_server.export_path = (null), args->nfs_server.port = -1, args->nfs_server.protocol = 6, dev_name = 192.168.0.120:/srv/nfs2, flags = 0, version = 3)
'nfs/super.c' nfs_validate_text_mount_data, to call -> nfs_parse_mount_options
'nfs/super.c' nfs_parse_mount_options, to call -> nfs_verify_server_address

nfs_fs_mount 二

'nfs/super.c' nfs_fs_mount, to call -> get_nfs_version
'nfs/super.c' nfs_fs_mount, to call -> nfs_try_mount。
        调用参数示例:(flags = 32768, dev_name = 192.168.0.120:/srv/nfs2, mount_info->parsed->flags = 3146240, mount_info->parsed->version = 3)
'nfs/super.c' nfs_try_mount, to call -> nfs_try_mount_request
'nfs/super.c' nfs_try_mount_request, to call -> nfs_request_mount
'nfs/super.c' nfs_request_mount, to call -> nfs_mount
'nfs/mount_clnt.c' nfs_mount, to call -> rpc_create
'/net/sunrpc/clnt.c' rpc_create, to call -> rpc_create_xprt
'/net/sunrpc/clnt.c' rpc_create_xprt, to call -> rpc_ping
'/net/sunrpc/clnt.c' rpc_ping, to call -> rpc_call_sync
'/net/sunrpc/clnt.c' rpc_call_sync, to call -> rpc_run_task, task->tk_pid = 0 (task->tk_pid 系统从 1 开始分配,每创建一个新的 task 加 1)
'/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_call_start
'/net/sunrpc/clnt.c' rpc_call_start, to set : task->tk_action = call_start
'/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_execute
'/net/sunrpc/sched.c' rpc_execute, to call __rpc_execute, task->tk_pid = 1。
        在 __rpc_execute 函数中, 会循环调用 do_action(task), 直到 RPC_IS_QUEUED(task) 退出循环。 
'/net/sunrpc/sched.c' __rpc_execute, to call do_action(task)。
        do_action 是个函数指针,实际指向 task->tk_callback,当 task->tk_callback 为 NULL 时,do_action 指向 task->tk_action,
    '/net/sunrpc/clnt.c' call_start, set task->tk_action = call_reserve
    '/net/sunrpc/clnt.c' call_reserve, set task->tk_action = call_reserveresult, and to call -> xprt_reserve()
    '/net/sunrpc/clnt.c' call_reserveresult, set task->tk_action = call_refresh
    '/net/sunrpc/clnt.c' call_refresh, set task->tk_action = call_refreshresult, and to call -> rpcauth_refreshcred
    '/net/sunrpc/clnt.c' call_refreshresult, set task->tk_action = call_refresh, and to call -> rpcauth_uptodatecred, set task->tk_action = call_allocate
    '/net/sunrpc/clnt.c' call_allocate, set task->tk_action = call_bind, and to call -> xprt_inject_disconnect
    '/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_connect, set task->tk_action = call_bind_status, and to call -> xprt->ops->rpcbind。此处为函数指针,实现函数定义在 net/sunrpc/xprtsock.c:2712:static struct rpc_xprt_ops xs_tcp_ops {.rpcbind = rpcb_getport_async,} 
        '/net/sunrpc/rpcb_clnt.c' rpcb_getport_async。调用参数示例:(task->tk_pid = 1, task->tk_status = 0, servername = 192.168.0.120, 
                cl_prog = 100005, cl_vers = 3, prot = 6, xprt->bind_index = 0)
        '/net/sunrpc/rpcb_clnt.c' rpcb_getport_async, to call -> rpcb_create, and to call -> rpc_create。此处会生成一个新的 clnt 和 task。
                调用参数示例:(args = 0Xfffffffe17d1f598, proto = 6, nodename = localhost, hostname = 192.168.0.120, version = 2)
        '/net/sunrpc/clnt.c' rpc_create, to call -> rpc_create_xprt
        '/net/sunrpc/clnt.c' rpc_create_xprt。这次与上次不同,不会调用 rpc_ping
        '/net/sunrpc/rpcb_clnt.c' rpcb_getport_async, to call -> rpcb_call_async
        '/net/sunrpc/rpcb_clnt.c' rpcb_call_async, to call -> rpc_run_task
        '/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_call_start    
            '/net/sunrpc/clnt.c' rpc_call_start, set task->tk_action = call_start
        '/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_execute。调用参数示例:is_async = 1, task->tk_pid = 2 
        '/net/sunrpc/rpcb_clnt.c' rpcb_getport_async, to call -> rpc_put_task
        '/net/sunrpc/sched.c' rpc_put_task, to call queue_work(q, &task->u.tk_work) 将 task 加入队列
        '/net/sunrpc/sched.c' rpc_execute, 异步 to call __rpc_execute, task->tk_pid = 2
            '/net/sunrpc/clnt.c' call_start, set task->tk_action = call_reserve
            '/net/sunrpc/clnt.c' call_reserve, set task->tk_action = call_reserveresult, and to call -> xprt_reserve()
            '/net/sunrpc/clnt.c' call_reserveresult, set task->tk_action = call_reserveresult
            '/net/sunrpc/clnt.c' call_reserveresult, set task->tk_action = call_refresh
            '/net/sunrpc/clnt.c' call_refresh set task->tk_action = call_refreshresult, and to call -> rpcauth_refreshcred
            '/net/sunrpc/clnt.c' call_refreshresult, set task->tk_action = call_refresh, and to call -> rpcauth_uptodatecred
            '/net/sunrpc/clnt.c' call_refreshresult, set task->tk_action = call_allocate
            '/net/sunrpc/clnt.c' call_allocate, set task->tk_action = call_bind, and to call -> xprt_inject_disconnect
            '/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_connect
            '/net/sunrpc/clnt.c' call_connect, set task->tk_action = call_transmit, and to call -> xprt_connected
            '/net/sunrpc/clnt.c' call_connect, set task->tk_action = call_connect_status, and to call -> xprt_connect
                '/net/sunrpc/xprt.c' xprt_connect, to call -> xprt_connected
                '/net/sunrpc/xprt.c' xprt_connect, to call -> xprt->ops->connect。此函数指针对应 xs_connect
                '/net/sunrpc/xprtsock.c' xs_connect, to call -> queue_delayed_work
                '/net/sunrpc/xprtsock.c' queue_delayed_work, to call -> xs_tcp_setup_socket
                '/net/sunrpc/xprtsock.c' xs_tcp_setup_socket, to call -> xs_create_sock
                '/net/sunrpc/xprtsock.c' xs_create_sock, to call -> __sock_create。调用参数示例: family = 2, type = 1, protocol = 6
                '/net/socket.c' __sock_create, to call -> security_socket_create。调用参数示例: family = 2, type = 1, protocol = 6, kern = 1。
                    '/security/security.c' security_socket_create, to call -> call_int_hook(socket_create ...
                    '/security/selinux/hooks.c' selinux_socket_create
                '/net/socket.c' __sock_create, to call -> sock_alloc, and to call -> pf->create。此函数指针指向 inet_create
                    '/net/ipv4/af_inet.c' inet_create, to call -> current_has_network
                        '/net/ipv4/af_inet.c' current_has_network。此函数会判断权限,如果没有权限会返回 0,那么 inet_create 会返回失败
            '/net/sunrpc/clnt.c' call_connect_status, to call -> rpc_exit

    '/net/sunrpc/clnt.c' call_bind_status, task->tk_pid = 1
    '/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_connect
    '/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_bind_status, and to call -> xprt->ops->rpcbind
        '/net/sunrpc/rpcb_clnt.c' rpcb_getport_async。调用参数示例: task->tk_pid = 1, xprt->bind_index = 1
    '/net/sunrpc/clnt.c' call_bind_status, to call -> rpc_exit, task->tk_pid = 1
'/net/sunrpc/sched.c' __rpc_execute, to call -> rpc_release_task
'/net/sunrpc/clnt.c' rpc_create_xprt, to call -> rpc_shutdown_client        

相关代码说明

/*
 * do_new_mount - create a fresh mount of filesystem @fstype and attach it at @path.
 * @path:      where the new mount is to be attached
 * @fstype:    filesystem type name; -EINVAL if NULL, -ENODEV if unknown
 * @flags:     MS_* flags handed to vfs_kern_mount()
 * @mnt_flags: MNT_* flags handed to do_add_mount()
 * @name:      passed through to vfs_kern_mount()
 * @data:      passed through to vfs_kern_mount()
 *
 * Returns 0 on success or a negative errno.  The reference obtained from
 * get_fs_type() is dropped with put_filesystem() on every path.
 */
static int do_new_mount(struct path *path, const char *fstype, int flags,
            int mnt_flags, const char *name, void *data)
{
    struct file_system_type *fs;
    struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
    struct vfsmount *new_mnt;
    int ret;

    if (!fstype)
        return -EINVAL;

    /* Look the filesystem type up by its name. */
    fs = get_fs_type(fstype);
    if (!fs)
        return -ENODEV;

    /*
     * NOTE(review): trace line printed at KERN_ERR on every mount; it looks
     * like debugging instrumentation -- consider a lower log level.
     */
    printk(KERN_ERR "fs type:%s\n", fs->name);

    if (user_ns != &init_user_ns) {
        /* Outside the initial user namespace the type must opt in. */
        if (!(fs->fs_flags & FS_USERNS_MOUNT)) {
            put_filesystem(fs);
            return -EPERM;
        }

        /*
         * Device nodes on such mounts are allowed only for types that
         * set FS_USERNS_DEV_MOUNT; otherwise force nodev and lock it.
         */
        if (!(fs->fs_flags & FS_USERNS_DEV_MOUNT)) {
            flags |= MS_NODEV;
            mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
        }
    }

    /*
     * Allocate the mount and run the filesystem-specific mount method
     * (vfs_kern_mount() -> mount_fs() -> type->mount()).
     */
    new_mnt = vfs_kern_mount(fs, flags, name, data);
    if (!IS_ERR(new_mnt)) {
        /* Type declares subtypes and the superblock has none recorded yet. */
        if ((fs->fs_flags & FS_HAS_SUBTYPE) && !new_mnt->mnt_sb->s_subtype)
            new_mnt = fs_set_subtype(new_mnt, fstype);
    }

    put_filesystem(fs);
    if (IS_ERR(new_mnt))
        return PTR_ERR(new_mnt);

    /* Hook the new mount into the mount tree; drop it again on failure. */
    ret = do_add_mount(real_mount(new_mnt), path, mnt_flags);
    if (ret)
        mntput(new_mnt);
    return ret;
}

/*
 * vfs_kern_mount - allocate a struct mount and mount a filesystem into it.
 * @type:  filesystem type; ERR_PTR(-ENODEV) is returned if NULL
 * @flags: MS_* flags; MS_KERNMOUNT marks the mount MNT_INTERNAL
 * @name:  passed to alloc_vfsmnt() and mount_fs()
 * @data:  passed to mount_fs()
 *
 * Returns the embedded struct vfsmount of the new mount, or an ERR_PTR().
 */
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
    struct dentry *fs_root;
    struct mount *new_mnt;

    if (!type)
        return ERR_PTR(-ENODEV);

    /* Allocate and initialise the struct mount. */
    new_mnt = alloc_vfsmnt(name);
    if (!new_mnt)
        return ERR_PTR(-ENOMEM);

    if (flags & MS_KERNMOUNT)
        new_mnt->mnt.mnt_flags = MNT_INTERNAL;

    /* Run the filesystem-specific mount method. */
    fs_root = mount_fs(type, flags, name, data);
    if (IS_ERR(fs_root)) {
        free_vfsmnt(new_mnt);
        return ERR_CAST(fs_root);
    }

    /*
     * Fill in the mount: it starts out as its own parent, with its own
     * root as mountpoint.
     */
    new_mnt->mnt.mnt_root = fs_root;
    new_mnt->mnt.mnt_sb = fs_root->d_sb;
    new_mnt->mnt_mountpoint = new_mnt->mnt.mnt_root;
    new_mnt->mnt_parent = new_mnt;

    /* Link the mount into the superblock's s_mounts list. */
    br_write_lock(&vfsmount_lock);
    list_add_tail(&new_mnt->mnt_instance, &fs_root->d_sb->s_mounts);
    br_write_unlock(&vfsmount_lock);

    return &new_mnt->mnt;
}

/*
 * mount_fs - call a filesystem's ->mount() method and run the security hooks
 * around it.
 * @type:  filesystem type whose ->mount() is invoked
 * @flags: mount flags, passed to ->mount() and security_sb_kern_mount()
 * @name:  passed through unchanged to ->mount()
 * @data:  mount data; additionally copied with security_sb_copy_data()
 *         unless the type sets FS_BINARY_MOUNTDATA
 *
 * Returns the dentry handed back by ->mount() on success, or an ERR_PTR()
 * carrying the error of the step that failed.
 */
struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
    struct dentry *root;
    struct super_block *sb;
    char *secdata = NULL;
    int error = -ENOMEM; /* reported if alloc_secdata() fails */
 
    /* Only non-binary mount data is handed to the security module. */
    if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
        secdata = alloc_secdata();
        if (!secdata)
            goto out;
 
        error = security_sb_copy_data(data, secdata);
        if (error)
            goto out_free_secdata;
    }
    /*
     * Filesystem-specific mount method; for ext4, for example, this is the
     * .mount member of the ext4_fs_type registered at initialisation.
     */
    root = type->mount(type, flags, name, data); /* root dentry of the mounted fs, or ERR_PTR */
    if (IS_ERR(root)) {
        error = PTR_ERR(root);
        goto out_free_secdata;
    }
    sb = root->d_sb;
    BUG_ON(!sb);
    WARN_ON(!sb->s_bdi);
    WARN_ON(sb->s_bdi == &default_backing_dev_info);
    sb->s_flags |= MS_BORN;
 
    error = security_sb_kern_mount(sb, flags, secdata);
    if (error)
        goto out_sb;
 
    /*
     * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
     * but s_maxbytes was an unsigned long long for many releases. Throw
     * this warning for a little while to try and catch filesystems that
     * violate this rule.
     */
    WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
        "negative value (%lld)\n", type->name, sb->s_maxbytes);
 
    /*
     * Success: release the write side of s_umount.
     * NOTE(review): presumably ->mount() returned with it held -- not
     * visible in this excerpt.
     */
    up_write(&sb->s_umount);
    free_secdata(secdata);
    return root;
out_sb:
    /* Security hook refused the mount: drop the root dentry and the superblock. */
    dput(root);
    deactivate_locked_super(sb);
out_free_secdata:
    free_secdata(secdata);
out:
    return ERR_PTR(error);
}

/*
 * do_add_mount - attach a newly created mount at @path.
 * @newmnt:    the newly created mount instance
 * @path:      the location to mount on
 * @mnt_flags: MNT_* flags for the new mount (MNT_SHARED, MNT_WRITE_HOLD and
 *             MNT_INTERNAL are masked off first)
 *
 * Returns 0 on success or a negative errno.
 */
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
    struct mountpoint *mp;
    struct mount *owner;
    int ret;

    mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);

    /*
     * More than plain locking: if filesystems are already stacked on @path,
     * lock_mount() advances @path to the root of the most recently mounted
     * one, and that is the mountpoint the new filesystem goes onto.
     */
    mp = lock_mount(path);
    if (IS_ERR(mp))
        return PTR_ERR(mp);

    /* The mount that the mountpoint belongs to. */
    owner = real_mount(path->mnt);

    if (unlikely(!check_mnt(owner))) {
        /*
         * Acceptable only for automounts done in a private namespace,
         * and then only while the mountpoint's mount is still alive.
         */
        if (!(mnt_flags & MNT_SHRINKABLE) || !owner->mnt_ns) {
            ret = -EINVAL;
            goto unlock;
        }
    }

    /* Refuse the same filesystem on the same mount point. */
    if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
        path->mnt->mnt_root == path->dentry) {
        ret = -EBUSY;
        goto unlock;
    }

    /* The root of the new mount must not be a symlink. */
    if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) {
        ret = -EINVAL;
        goto unlock;
    }

    newmnt->mnt.mnt_flags = mnt_flags;
    /* Graft the new mount into the mount tree. */
    ret = graft_tree(newmnt, owner, mp);

unlock:
    unlock_mount(mp);
    return ret;
}
 
/*
 * lock_mount - take the locks needed to mount on @path and return its mountpoint.
 *
 * If something is already mounted on @path, @path is advanced to the root of
 * that mount and the lookup is repeated, so the mountpoint finally returned
 * belongs to the topmost mount stacked on the original location.
 *
 * On success the inode mutex of the final dentry and namespace_lock() are
 * still held on return (neither is released on that path).  Returns
 * ERR_PTR(-ENOENT) when cant_mount() rejects the dentry, or the error pointer
 * from new_mountpoint(); both locks are dropped before an error is returned.
 */
static struct mountpoint *lock_mount(struct path *path)
{
    struct vfsmount *mnt;
    struct dentry *dentry = path->dentry;
retry:
    mutex_lock(&dentry->d_inode->i_mutex);
    if (unlikely(cant_mount(dentry))) {
        mutex_unlock(&dentry->d_inode->i_mutex);
        return ERR_PTR(-ENOENT);
    }
    namespace_lock();
    mnt = lookup_mnt(path);
    if (likely(!mnt)) { /* nothing mounted on this dentry: create a new mountpoint and return it */
        struct mountpoint *mp = new_mountpoint(dentry);
        if (IS_ERR(mp)) {
            namespace_unlock();
            mutex_unlock(&dentry->d_inode->i_mutex);
            return mp;
        }
        return mp;
    }
    /* Something is mounted here: drop the locks and the reference on the old path. */
    namespace_unlock();
    mutex_unlock(&path->dentry->d_inode->i_mutex);
    path_put(path);

    /*
     * lookup_mnt() did not return NULL, i.e. it found a child filesystem
     * mounted on this location (e.g. on /mnt).  Store that child mount in
     * path->mnt.
     */
    path->mnt = mnt;

    /* The dentry to examine next is the root of that child mount. */
    dentry = path->dentry = dget(mnt->mnt_root);
    /*
     * Go back to lookup_mnt() with the updated path to see whether yet
     * another filesystem is mounted on top.  The result is a chain such as
     * p->C1->C2->C3; seen globally, a later mount covers the earlier ones.
     */
    goto retry;
}
 
/*
 * __lookup_mnt - find the child mount sitting on a given <mount, dentry> pair.
 * @mnt:    the mount that owns the mountpoint
 * @dentry: the mountpoint dentry
 * @dir:    traversal direction through the hash chain: non-zero follows
 *          ->next, zero follows ->prev
 *
 * Called during path-name lookup.  Returns the first mount on the chain
 * whose parent is @mnt and whose mountpoint is @dentry, or NULL when the
 * whole chain has been walked without a match.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
                  int dir)
{
    struct list_head *bucket = mount_hashtable + hash(mnt, dentry);
    struct list_head *pos;
    struct mount *cand;

    for (pos = dir ? bucket->next : bucket->prev;
         pos != bucket; /* back at the head: full circle, nothing found */
         pos = dir ? pos->next : pos->prev) {
        /* mnt_hash is the link that chains mounts into mount_hashtable. */
        cand = list_entry(pos, struct mount, mnt_hash);
        if (&cand->mnt_parent->mnt != mnt)
            continue;
        /* cand is a child of @mnt; accept it if it is mounted on @dentry. */
        if (cand->mnt_mountpoint == dentry)
            return cand;
    }
    return NULL;
}

type->mount 在 nfs 中的定义(fs/nfs/super.c:292):
/*
 * Filesystem type descriptor for "nfs".  mount_fs() reaches nfs_fs_mount()
 * through the .mount pointer below (type->mount).
 */
struct file_system_type nfs_fs_type = {
    .owner      = THIS_MODULE,
    .name       = "nfs",
    .mount      = nfs_fs_mount,     /* invoked as type->mount() by mount_fs() */
    .kill_sb    = nfs_kill_super,
    /* FS_BINARY_MOUNTDATA makes mount_fs() skip security_sb_copy_data() */
    .fs_flags   = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
};


/*
 * nfs_fs_mount - ->mount() method of nfs_fs_type.
 * @fs_type:  filesystem type, passed to nfs_validate_mount_data()
 * @flags:    mount flags, passed to the version-specific try_mount()
 * @dev_name: device name string, passed to validation and to try_mount()
 * @raw_data: mount data as received, binary or text
 *
 * Validates the mount data, selects the NFS version module and delegates to
 * its rpc_ops->try_mount().  Returns that call's dentry, or an ERR_PTR() if
 * an earlier step fails.  The parsed mount data and the file handle
 * allocated here are freed on every path before returning.
 */
struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
    int flags, const char *dev_name, void *raw_data)
{
    struct nfs_mount_info mount_info = {
        .fill_super = nfs_fill_super,
        .set_security = nfs_set_sb_security,
    };
    struct nfs_subversion *nfs_mod;
    struct dentry *result;
    int status;

    mount_info.parsed = nfs_alloc_parsed_mount_data();
    mount_info.mntfh = nfs_alloc_fhandle();
    if (!mount_info.parsed || !mount_info.mntfh) {
        result = ERR_PTR(-ENOMEM);
        goto out;
    }

    /*
     * Validate the mount data; NFS_TEXT_DATA means it has to go through
     * the text-option validator as well.
     */
    status = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed,
                     mount_info.mntfh, dev_name);
    if (status == NFS_TEXT_DATA)
        status = nfs_validate_text_mount_data(raw_data, mount_info.parsed,
                              dev_name);
    if (status < 0) {
        result = ERR_PTR(status);
        goto out;
    }

    nfs_mod = get_nfs_version(mount_info.parsed->version);
    if (IS_ERR(nfs_mod)) {
        result = ERR_CAST(nfs_mod);
        goto out;
    }

    /* Try the mount itself: this is the entry to the main mount logic. */
    result = nfs_mod->rpc_ops->try_mount(flags, dev_name, &mount_info, nfs_mod);
    put_nfs_version(nfs_mod);

out:
    nfs_free_parsed_mount_data(mount_info.parsed);
    nfs_free_fhandle(mount_info.mntfh);
    return result;
}

nfs_mod->rpc_ops->try_mount 在 nfs3 中的定义(fs/nfs/nfs3proc.c:926):
/*
 * NFS version 3 operations table.  nfs_fs_mount() calls
 * nfs_mod->rpc_ops->try_mount(); with this table that resolves to
 * nfs_try_mount().
 */
const struct nfs_rpc_ops nfs_v3_clientops = {
    .version    = 3,            /* protocol version */
    .dentry_ops = &nfs_dentry_operations,
    .dir_inode_ops  = &nfs3_dir_inode_operations,
    .file_inode_ops = &nfs3_file_inode_operations,
    .file_ops   = &nfs_file_operations,
    .getroot    = nfs3_proc_get_root,
    .submount   = nfs_submount,
    .try_mount  = nfs_try_mount,    /* mount entry point used by nfs_fs_mount() */
    .getattr    = nfs3_proc_getattr,
    .setattr    = nfs3_proc_setattr,
    .lookup     = nfs3_proc_lookup,
    .access     = nfs3_proc_access,
    .readlink   = nfs3_proc_readlink,
    .create     = nfs3_proc_create,
    .remove     = nfs3_proc_remove,
    .unlink_setup   = nfs3_proc_unlink_setup,
    .unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
    .unlink_done    = nfs3_proc_unlink_done,
    .rename_setup   = nfs3_proc_rename_setup,
    .rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
    .rename_done    = nfs3_proc_rename_done,
    .link       = nfs3_proc_link,
    .symlink    = nfs3_proc_symlink,
    .mkdir      = nfs3_proc_mkdir,
    .rmdir      = nfs3_proc_rmdir,
    .readdir    = nfs3_proc_readdir,
    .mknod      = nfs3_proc_mknod,
    .statfs     = nfs3_proc_statfs,
    .fsinfo     = nfs3_proc_fsinfo,
    .pathconf   = nfs3_proc_pathconf,
    .decode_dirent  = nfs3_decode_dirent,
    .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare,
    .read_setup = nfs3_proc_read_setup,
    .read_done  = nfs3_read_done,
    .write_setup    = nfs3_proc_write_setup,
    .write_done = nfs3_write_done,
    .commit_setup   = nfs3_proc_commit_setup,
    .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
    .commit_done    = nfs3_commit_done,
    .lock       = nfs3_proc_lock,
    .clear_acl_cache = forget_all_cached_acls,
    .close_context  = nfs_close_context,
    .have_delegation = nfs3_have_delegation,
    .return_delegation = nfs3_return_delegation,
    .alloc_client   = nfs_alloc_client,
    .init_client    = nfs_init_client,
    .free_client    = nfs_free_client,
    .create_server  = nfs3_create_server,   /* used by nfs_try_mount() when need_mount is not set */
    .clone_server   = nfs3_clone_server,
};

/*
 * nfs_try_mount - obtain an nfs_server for the parsed mount data and mount it.
 * @flags:      mount flags, passed to nfs_fs_mount_common()
 * @dev_name:   device name string, passed to nfs_fs_mount_common()
 * @mount_info: parsed mount data and callbacks
 * @nfs_mod:    NFS version module supplying rpc_ops
 *
 * Returns the dentry from nfs_fs_mount_common(), or an ERR_PTR() if no
 * server could be obtained.
 */
struct dentry *nfs_try_mount(int flags, const char *dev_name,
                 struct nfs_mount_info *mount_info,
                 struct nfs_subversion *nfs_mod)
{
    struct nfs_server *server;

    if (!mount_info->parsed->need_mount) {
        server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
    } else {
        /*
         * First step of the mount.  NOTE(review): per the call trace
         * earlier in this article this path goes through
         * nfs_request_mount() -> nfs_mount(); its body is not shown here.
         */
        server = nfs_try_mount_request(mount_info, nfs_mod);
    }

    if (!IS_ERR(server))
        return nfs_fs_mount_common(server, flags, dev_name, mount_info, nfs_mod);

    return ERR_CAST(server);
}
上一篇下一篇

猜你喜欢

热点阅读