Linux Kernel Mount 流程分析
2019-05-10 本文已影响3人
jerryyyq
调用栈
入口
入口地址有两个:
- 系统自带的 mount 命令会调用 /fs/namespace.c 中的 SYSCALL_DEFINE5
- busybox mount 命令会调用 /fs/compat.c 中的 COMPAT_SYSCALL_DEFINE5
之后,都会调用: fs/namespace.c 里的 do_mount 函数,之后的调用栈如下:
'fs/namespace.c' do_mount, to call -> do_new_mount
'fs/namespace.c' do_new_mount, to call -> vfs_kern_mount
'fs/namespace.c' vfs_kern_mount, to call -> mount_fs
'fs/super.c' mount_fs to call -> type->mount
nfs
此处的 type->mount 是个函数指针,各文件系统类型(例如 nfs)分别定义并注册自己的 mount 实现。
nfs 的 mount 实现函数为:'nfs/super.c' nfs_fs_mount
后续调用栈如下:
nfs_fs_mount 一
'nfs/super.c' nfs_fs_mount, to call -> nfs23_validate_mount_data
'nfs/super.c' nfs_fs_mount, to call -> nfs_validate_text_mount_data。调用参数示例:(options = nolock,addr=192.168.0.120, args->nfs_server.export_path = (null), args->nfs_server.port = -1, args->nfs_server.protocol = 6, dev_name = 192.168.0.120:/srv/nfs2, flags = 0, version = 3)
'nfs/super.c' nfs_validate_text_mount_data, to call -> nfs_parse_mount_options
'nfs/super.c' nfs_parse_mount_options, to call -> nfs_verify_server_address
nfs_fs_mount 二
'nfs/super.c' nfs_fs_mount, to call -> get_nfs_version
'nfs/super.c' nfs_fs_mount, to call -> nfs_try_mount。
调用参数示例:(flags = 32768, dev_name = 192.168.0.120:/srv/nfs2, mount_info->parsed->flags = 3146240, mount_info->parsed->version = 3)
'nfs/super.c' nfs_try_mount, to call -> nfs_try_mount_request
'nfs/super.c' nfs_try_mount_request, to call -> nfs_request_mount
'nfs/super.c' nfs_request_mount, to call -> nfs_mount
'nfs/mount_clnt.c' nfs_mount, to call -> rpc_create
'/net/sunrpc/clnt.c' rpc_create, to call -> rpc_create_xprt
'/net/sunrpc/clnt.c' rpc_create_xprt, to call -> rpc_ping
'/net/sunrpc/clnt.c' rpc_ping, to call -> rpc_call_sync
'/net/sunrpc/clnt.c' rpc_call_sync, to call -> rpc_run_task, task->tk_pid = 0 (task->tk_pid 系统从 1 开始分配,每创建一个新的 task 加 1)
'/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_call_start
'/net/sunrpc/clnt.c' rpc_call_start, to set : task->tk_action = call_start
'/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_execute
'/net/sunrpc/sched.c' rpc_execute, to call __rpc_execute, task->tk_pid = 1。
在 __rpc_execute 函数中, 会循环调用 do_action(task), 直到 RPC_IS_QUEUED(task) 退出循环。
'/net/sunrpc/sched.c' __rpc_execute, to call do_action(task)。
do_action 是个函数指针,优先指向 task->tk_callback;当 task->tk_callback 为 NULL 时,do_action 指向 task->tk_action。
'/net/sunrpc/clnt.c' call_start, set task->tk_action = call_reserve
'/net/sunrpc/clnt.c' call_reserve, set task->tk_action = call_reserveresult, and to call -> xprt_reserve()
'/net/sunrpc/clnt.c' call_reserveresult, set task->tk_action = call_refresh
'/net/sunrpc/clnt.c' call_refresh, set task->tk_action = call_refreshresult, and to call -> rpcauth_refreshcred
'/net/sunrpc/clnt.c' call_refreshresult, set task->tk_action = call_refresh, and to call -> rpcauth_uptodatecred, set task->tk_action = call_allocate
'/net/sunrpc/clnt.c' call_allocate, set task->tk_action = call_bind, and to call -> xprt_inject_disconnect
'/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_connect, set task->tk_action = call_bind_status, and to call -> xprt->ops->rpcbind。此处为函数指针,实现函数定义在 net/sunrpc/xprtsock.c:2712:static struct rpc_xprt_ops xs_tcp_ops {.rpcbind = rpcb_getport_async,}
'/net/sunrpc/rpcb_clnt.c' rpcb_getport_async。调用参数示例:(task->tk_pid = 1, task->tk_status = 0, servername = 192.168.0.120,
cl_prog = 100005, cl_vers = 3, prot = 6, xprt->bind_index = 0)
'/net/sunrpc/rpcb_clnt.c' rpcb_getport_async, to call -> rpcb_create, and to call -> rpc_create。此处会生成一个新的 clnt 和 task。
调用参数示例:(args = 0Xfffffffe17d1f598, proto = 6, nodename = localhost, hostname = 192.168.0.120, version = 2)
'/net/sunrpc/clnt.c' rpc_create, to call -> rpc_create_xprt
'/net/sunrpc/clnt.c' rpc_create_xprt。这次与上次不同,不会调用 rpc_ping
'/net/sunrpc/rpcb_clnt.c' rpcb_getport_async, to call -> rpcb_call_async
'/net/sunrpc/rpcb_clnt.c' rpcb_call_async, to call -> rpc_run_task
'/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_call_start
'/net/sunrpc/clnt.c' rpc_call_start, set task->tk_action = call_start
'/net/sunrpc/clnt.c' rpc_run_task, to call -> rpc_execute。调用参数示例:is_async = 1, task->tk_pid = 2
'/net/sunrpc/clnt.c' rpcb_call_async, to call -> rpc_put_task
'/net/sunrpc/sched.c' rpc_put_task, to call queue_work(q, &task->u.tk_work) 将 task 加入队列
'/net/sunrpc/sched.c' rpc_execute, 异步 to call __rpc_execute, task->tk_pid = 2
'/net/sunrpc/clnt.c' call_start, set task->tk_action = call_reserve
'/net/sunrpc/clnt.c' call_reserve, set task->tk_action = call_reserveresult, and to call -> xprt_reserve()
'/net/sunrpc/clnt.c' call_reserveresult, set task->tk_action = call_reserveresult
'/net/sunrpc/clnt.c' call_reserveresult, set task->tk_action = call_refresh
'/net/sunrpc/clnt.c' call_refresh set task->tk_action = call_refreshresult, and to call -> rpcauth_refreshcred
'/net/sunrpc/clnt.c' call_refreshresult, set task->tk_action = call_refresh, and to call -> rpcauth_uptodatecred
'/net/sunrpc/clnt.c' call_refreshresult, set task->tk_action = call_allocate
'/net/sunrpc/clnt.c' call_allocate, set task->tk_action = call_bind, and to call -> xprt_inject_disconnect
'/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_connect
'/net/sunrpc/clnt.c' call_connect, set task->tk_action = call_transmit, and to call -> xprt_connected
'/net/sunrpc/clnt.c' call_connect, set task->tk_action = call_connect_status, and to call -> xprt_connect
'/net/sunrpc/xprt.c' xprt_connect, to call -> xprt_connected
'/net/sunrpc/xprt.c' xprt_connect, to call -> xprt->ops->connect。此函数指针对应 xs_connect
'/net/sunrpc/xprtsock.c' xs_connect, to call -> queue_delayed_work
'/net/sunrpc/xprtsock.c' queue_delayed_work, to call -> xs_tcp_setup_socket
'/net/sunrpc/xprtsock.c' xs_tcp_setup_socket, to call -> xs_create_sock
'/net/sunrpc/xprtsock.c' xs_create_sock, to call -> __sock_create。调用参数示例: family = 2, type = 1, protocol = 6
'/net/socket.c' __sock_create, to call -> security_socket_create。调用参数示例: family = 2, type = 1, protocol = 6, kern = 1。
'/security/security.c' security_socket_create, to call -> call_int_hook(socket_create, ...)
'/security/selinux/hooks.c' selinux_socket_create
'/net/socket.c' __sock_create, to call -> sock_alloc, and to call -> pf->create。此函数指针指向 inet_create
'/net/ipv4/af_inet.c' inet_create, to call -> current_has_network
'/net/ipv4/af_inet.c' current_has_network。此函数会判断权限,如果没有权限会返回 0,那么 inet_create 会返回失败
'/net/sunrpc/clnt.c' call_connect_status, to call -> rpc_exit
'/net/sunrpc/clnt.c' call_bind_status, task->tk_pid = 1
'/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_connect
'/net/sunrpc/clnt.c' call_bind, set task->tk_action = call_bind_status, and to call -> xprt->ops->rpcbind
'/net/sunrpc/rpcb_clnt.c' rpcb_getport_async。调用参数示例: task->tk_pid = 1, xprt->bind_index = 1
'/net/sunrpc/clnt.c' call_bind_status, to call -> rpc_exit, task->tk_pid = 1
'/net/sunrpc/sched.c' __rpc_execute, to call -> rpc_release_task
'/net/sunrpc/clnt.c' rpc_create_xprt, to call -> rpc_shutdown_client
相关代码说明
static int do_new_mount(struct path *path, const char *fstype, int flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
struct vfsmount *mnt;
int err;
if (!fstype)
return -EINVAL;
// 通过 name 获取文件系统类型
type = get_fs_type(fstype);
if (!type)
return -ENODEV;
printk(KERN_ERR "fs type:%s\n",type->name);
if (user_ns != &init_user_ns) {
if (!(type->fs_flags & FS_USERNS_MOUNT)) {
put_filesystem(type);
return -EPERM;
}
/* Only in special cases allow devices from mounts
* created outside the initial user namespace.
*/
if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
flags |= MS_NODEV;
mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
}
}
// 获取 struct mount 结构,调用特定文件系统 mount 函数,主要填充 super block 数据
mnt = vfs_kern_mount(type, flags, name, data);
// 有子文件系统
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
mnt = fs_set_subtype(mnt, fstype);
put_filesystem(type);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
// 将 mount 加入到全局文件树中
err = do_add_mount(real_mount(mnt), path, mnt_flags);
if (err)
mntput(mnt);
return err;
}
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
struct mount *mnt;
struct dentry *root;
if (!type)
return ERR_PTR(-ENODEV);
// 分配并初始化 struct mount 结构
mnt = alloc_vfsmnt(name);
if (!mnt)
return ERR_PTR(-ENOMEM);
if (flags & MS_KERNMOUNT)
mnt->mnt.mnt_flags = MNT_INTERNAL;
// 调用具体文件系统的 mount 函数
root = mount_fs(type, flags, name, data);
if (IS_ERR(root)) {
free_vfsmnt(mnt);
return ERR_CAST(root);
}
// 初始化 mnt 变量,并将 mnt 加入超级块 s_mounts 链表中
mnt->mnt.mnt_root = root;
mnt->mnt.mnt_sb = root->d_sb;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
br_write_lock(&vfsmount_lock);
list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
br_write_unlock(&vfsmount_lock);
return &mnt->mnt;
}
/*
 * mount_fs - call a filesystem type's ->mount() callback and finish setup.
 *
 * Calls type->mount(type, flags, name, data) and returns the dentry it
 * produced, or an ERR_PTR() carrying a negative errno on failure.
 *
 * When data is non-NULL and the type does not set FS_BINARY_MOUNTDATA, a
 * secdata buffer is allocated, filled by security_sb_copy_data() and later
 * handed to security_sb_kern_mount(). free_secdata(secdata) runs on the
 * success path and on every error path reached after the allocation.
 */
struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
struct dentry *root;
struct super_block *sb;
char *secdata = NULL;
// Default error for the allocation failure below.
int error = -ENOMEM;
if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
secdata = alloc_secdata();
if (!secdata)
goto out;
error = security_sb_copy_data(data, secdata);
if (error)
goto out_free_secdata;
}
// Call the filesystem-specific mount function; for ext4, for example, this is
// the mount member of the ext4_fs_type registered at system initialization.
root = type->mount(type, flags, name, data); // returns the dentry resulting from the mount
if (IS_ERR(root)) {
error = PTR_ERR(root);
goto out_free_secdata;
}
sb = root->d_sb;
BUG_ON(!sb);
WARN_ON(!sb->s_bdi);
WARN_ON(sb->s_bdi == &default_backing_dev_info);
sb->s_flags |= MS_BORN;
error = security_sb_kern_mount(sb, flags, secdata);
if (error)
goto out_sb;
/*
* filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
* but s_maxbytes was an unsigned long long for many releases. Throw
* this warning for a little while to try and catch filesystems that
* violate this rule.
*/
WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
"negative value (%lld)\n", type->name, sb->s_maxbytes);
// NOTE(review): s_umount is released here but not taken in this function;
// presumably ->mount() returns with it held for write -- confirm.
up_write(&sb->s_umount);
free_secdata(secdata);
return root;
// Error after ->mount() succeeded: drop the root dentry and the super block.
out_sb:
dput(root);
deactivate_locked_super(sb);
out_free_secdata:
free_secdata(secdata);
out:
return ERR_PTR(error);
}
/*
 * do_add_mount - attach a freshly created mount at a path.
 *
 * newmnt: the newly created mount instance; path: the path to mount on.
 *
 * Returns 0 on success or a negative errno: the error from lock_mount(),
 * -EINVAL or -EBUSY from the checks below, or the result of graft_tree().
 * Every exit after a successful lock_mount() goes through unlock_mount().
 */
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
struct mountpoint *mp;
struct mount *parent;
int err;
// Clear MNT_SHARED, MNT_WRITE_HOLD and MNT_INTERNAL from the requested flags.
mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
// This is more than just taking a lock: if several filesystems are already mounted
// on path, it finds the root of the most recently mounted one, which is the
// mountpoint the new filesystem must actually be mounted on.
mp = lock_mount(path);
if (IS_ERR(mp))
return PTR_ERR(mp);
parent = real_mount(path->mnt); // mount structure that the mount point belongs to
err = -EINVAL;
if (unlikely(!check_mnt(parent))) {
/* that's acceptable only for automounts done in private ns */
if (!(mnt_flags & MNT_SHRINKABLE))
goto unlock;
/* ... and for those we'd better have mountpoint still alive */
if (!parent->mnt_ns)
goto unlock;
}
/* Refuse the same filesystem on the same mount point */
err = -EBUSY;
// Same super block as the mount at path, and path is that mount's root.
if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
path->mnt->mnt_root == path->dentry)
goto unlock;
// The root of the new mount must not be a symbolic link.
err = -EINVAL;
if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
goto unlock;
newmnt->mnt.mnt_flags = mnt_flags;
// Add newmnt to the global filesystem tree.
err = graft_tree(newmnt, parent, mp);
unlock:
unlock_mount(mp);
return err;
}
/*
 * lock_mount - lock the location a new mount will be attached to and return
 * its mountpoint.
 *
 * If something is already mounted at path, path is updated in place to the
 * root of that mount and the lookup is retried, so on return path refers to
 * the topmost mount at that location.
 *
 * Returns the mountpoint on success, ERR_PTR(-ENOENT) when cant_mount()
 * rejects the dentry, or the error from new_mountpoint(). On the success
 * return the dentry's i_mutex and the namespace lock are still held (no
 * unlock on that path); on the error returns both have been released.
 */
static struct mountpoint *lock_mount(struct path *path)
{
struct vfsmount *mnt;
struct dentry *dentry = path->dentry;
retry:
mutex_lock(&dentry->d_inode->i_mutex);
if (unlikely(cant_mount(dentry))) {
mutex_unlock(&dentry->d_inode->i_mutex);
return ERR_PTR(-ENOENT);
}
namespace_lock();
mnt = lookup_mnt(path);
if (likely(!mnt)) { // nothing is mounted on dentry: create a new mountpoint and return it
struct mountpoint *mp = new_mountpoint(dentry);
if (IS_ERR(mp)) {
namespace_unlock();
mutex_unlock(&dentry->d_inode->i_mutex);
return mp;
}
return mp;
}
// Something is mounted here: drop both locks and the old path reference
// before stepping into the child mount.
namespace_unlock();
mutex_unlock(&path->dentry->d_inode->i_mutex);
path_put(path);
// lookup_mnt did not return NULL, so it found a child filesystem mounted at this
// location (e.g. on /mnt). From here on:
// store the child filesystem's mount structure in path->mnt
path->mnt = mnt;
// since a filesystem was already mounted on this dentry, the new dentry becomes
// the root of that child mount, i.e. the candidate mount point
dentry = path->dentry = dget(mnt->mnt_root);
// Go back to the lookup_mnt call with the updated path to check for further
// stacked child filesystems. The resulting chain looks like p->C1->C2->C3:
// globally, a later mount covers the filesystems mounted before it.
goto retry;
}
/*
 * __lookup_mnt - find the child mount attached at a <mount, dentry> pair.
 *
 * @mnt:    mount instance that the mount point belongs to
 * @dentry: dentry of the mount point inside @mnt
 * @dir:    walk direction through the hash chain: non-zero follows ->next,
 *          zero follows ->prev
 *
 * Used during path-name lookup: given a parent <mount, dentry> pair, return
 * the mount instance of the filesystem mounted on it, or NULL when the
 * whole chain has been walked without a match.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
			   int dir)
{
	struct list_head *bucket = mount_hashtable + hash(mnt, dentry);
	struct list_head *pos;

	for (pos = dir ? bucket->next : bucket->prev;
	     pos != bucket;		/* back at the head: full circle, no match */
	     pos = dir ? pos->next : pos->prev) {
		/* mnt_hash is the link that chains a mount into mount_hashtable. */
		struct mount *child = list_entry(pos, struct mount, mnt_hash);

		/* child is mounted on exactly this <mnt, dentry> pair. */
		if (&child->mnt_parent->mnt == mnt &&
		    child->mnt_mountpoint == dentry)
			return child;
	}
	return NULL;
}
type->mount 在 nfs 中的定义(fs/nfs/super.c:292):
/*
 * Filesystem type descriptor for "nfs". Its .mount member is the function
 * that mount_fs() reaches through type->mount.
 */
struct file_system_type nfs_fs_type = {
	.name		= "nfs",
	.owner		= THIS_MODULE,
	.fs_flags	= FS_BINARY_MOUNTDATA | FS_RENAME_DOES_D_MOVE,
	.mount		= nfs_fs_mount,
	.kill_sb	= nfs_kill_super,
};
struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data)
{
struct nfs_mount_info mount_info = {
.fill_super = nfs_fill_super,
.set_security = nfs_set_sb_security,
};
struct dentry *mntroot = ERR_PTR(-ENOMEM);
struct nfs_subversion *nfs_mod;
int error;
mount_info.parsed = nfs_alloc_parsed_mount_data();
mount_info.mntfh = nfs_alloc_fhandle();
if (mount_info.parsed == NULL || mount_info.mntfh == NULL)
goto out;
/* Validate the mount data */
// 验证参数是否有效
error = nfs_validate_mount_data(fs_type, raw_data, mount_info.parsed, mount_info.mntfh, dev_name);
if (error == NFS_TEXT_DATA)
error = nfs_validate_text_mount_data(raw_data, mount_info.parsed, dev_name);
if (error < 0) {
mntroot = ERR_PTR(error);
goto out;
}
nfs_mod = get_nfs_version(mount_info.parsed->version);
if (IS_ERR(nfs_mod)) {
mntroot = ERR_CAST(nfs_mod);
goto out;
}
// 尝试开始 mount。这个是 mount 主功能实现入口
mntroot = nfs_mod->rpc_ops->try_mount(flags, dev_name, &mount_info, nfs_mod);
put_nfs_version(nfs_mod);
out:
nfs_free_parsed_mount_data(mount_info.parsed);
nfs_free_fhandle(mount_info.mntfh);
return mntroot;
}
nfs_mod->rpc_ops->try_mount 在 nfs3 中的定义(fs/nfs/nfs3proc.c:926):
/*
 * Operations table for NFS protocol version 3 (.version = 3).
 * The member relevant to the mount flow is .try_mount, which is set to
 * nfs_try_mount and is what nfs_fs_mount() calls through
 * nfs_mod->rpc_ops->try_mount.
 */
const struct nfs_rpc_ops nfs_v3_clientops = {
.version = 3, /* protocol version */
.dentry_ops = &nfs_dentry_operations,
.dir_inode_ops = &nfs3_dir_inode_operations,
.file_inode_ops = &nfs3_file_inode_operations,
.file_ops = &nfs_file_operations,
.getroot = nfs3_proc_get_root,
.submount = nfs_submount,
/* Mount entry point used by nfs_fs_mount(). */
.try_mount = nfs_try_mount,
.getattr = nfs3_proc_getattr,
.setattr = nfs3_proc_setattr,
.lookup = nfs3_proc_lookup,
.access = nfs3_proc_access,
.readlink = nfs3_proc_readlink,
.create = nfs3_proc_create,
.remove = nfs3_proc_remove,
.unlink_setup = nfs3_proc_unlink_setup,
.unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
.unlink_done = nfs3_proc_unlink_done,
.rename_setup = nfs3_proc_rename_setup,
.rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
.rename_done = nfs3_proc_rename_done,
.link = nfs3_proc_link,
.symlink = nfs3_proc_symlink,
.mkdir = nfs3_proc_mkdir,
.rmdir = nfs3_proc_rmdir,
.readdir = nfs3_proc_readdir,
.mknod = nfs3_proc_mknod,
.statfs = nfs3_proc_statfs,
.fsinfo = nfs3_proc_fsinfo,
.pathconf = nfs3_proc_pathconf,
.decode_dirent = nfs3_decode_dirent,
.pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare,
.read_setup = nfs3_proc_read_setup,
.read_done = nfs3_read_done,
.write_setup = nfs3_proc_write_setup,
.write_done = nfs3_write_done,
.commit_setup = nfs3_proc_commit_setup,
.commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
.commit_done = nfs3_commit_done,
.lock = nfs3_proc_lock,
.clear_acl_cache = forget_all_cached_acls,
.close_context = nfs_close_context,
.have_delegation = nfs3_have_delegation,
.return_delegation = nfs3_return_delegation,
.alloc_client = nfs_alloc_client,
.init_client = nfs_init_client,
.free_client = nfs_free_client,
/* Used by nfs_try_mount() when parsed->need_mount is not set. */
.create_server = nfs3_create_server,
.clone_server = nfs3_clone_server,
};
/*
 * nfs_try_mount - obtain an nfs_server for this mount and complete the
 * mount through nfs_fs_mount_common().
 *
 * When the parsed options have need_mount set, the server comes from
 * nfs_try_mount_request() (first mount step: create the client, check
 * permissions, and so on); otherwise it comes from the version's
 * create_server operation.
 *
 * Returns the result of nfs_fs_mount_common(), or the server error
 * converted with ERR_CAST().
 */
struct dentry *nfs_try_mount(int flags, const char *dev_name,
			     struct nfs_mount_info *mount_info,
			     struct nfs_subversion *nfs_mod)
{
	struct nfs_server *srv;

	srv = mount_info->parsed->need_mount
		? nfs_try_mount_request(mount_info, nfs_mod)
		: nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);

	if (!IS_ERR(srv))
		return nfs_fs_mount_common(srv, flags, dev_name, mount_info, nfs_mod);

	return ERR_CAST(srv);
}