From 8654df4e2ac9704905198d63845554c2ddf6a93f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 16:06:06 -0500 Subject: [PATCH 01/30] mnt: Refactor fs_fully_visible into mount_too_revealing Replace the call of fs_fully_visible in do_new_mount from before the new superblock is allocated with a call of mount_too_revealing after the superblock is allocated. This winds up being a much better location for maintainability of the code. The first change this enables is the replacement of FS_USERNS_VISIBLE with SB_I_USERNS_VISIBLE. Moving the flag from struct filesystem_type to sb_iflags on the superblock. Unfortunately mount_too_revealing fundamentally needs to touch mnt_flags adding several MNT_LOCKED_XXX flags at the appropriate times. If the mnt_flags did not need to be touched the code could be easily moved into the filesystem specific mount code. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 38 +++++++++++++++++++++++++------------- fs/proc/inode.c | 1 + fs/proc/root.c | 2 +- fs/sysfs/mount.c | 4 ++-- include/linux/fs.h | 4 +++- 5 files changed, 32 insertions(+), 17 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 783004af57077a..1a69aa78697583 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2375,7 +2375,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) return err; } -static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags); +static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); /* * create a new mount for userspace and request it to be added into the @@ -2408,12 +2408,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, flags |= MS_NODEV; mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } - if (type->fs_flags & FS_USERNS_VISIBLE) { - if (!fs_fully_visible(type, &mnt_flags)) { - put_filesystem(type); - return -EPERM; - } - } } mnt = vfs_kern_mount(type, flags, name, data); @@ -2425,6 +2419,11 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); + if (mount_too_revealing(mnt, &mnt_flags)) { + mntput(mnt); + return -EPERM; + } + err = do_add_mount(real_mount(mnt), path, mnt_flags); if (err) mntput(mnt); @@ -3216,22 +3215,19 @@ bool current_chrooted(void) return chrooted; } -static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) +static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, + int *new_mnt_flags) { - struct mnt_namespace *ns = current->nsproxy->mnt_ns; int new_flags = *new_mnt_flags; struct mount *mnt; bool visible = false; - if (unlikely(!ns)) - return false; - down_read(&namespace_sem); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != type) + if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type) continue; /* This mount is not fully visible if it's root directory @@ -3298,6 +3294,22 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) return visible; } +static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + unsigned long s_iflags; + + if (ns->user_ns == &init_user_ns) + return false; + + /* Can this filesystem be too revealing? */ + s_iflags = mnt->mnt_sb->s_iflags; + if (!(s_iflags & SB_I_USERNS_VISIBLE)) + return false; + + return !mnt_already_visible(ns, mnt, new_mnt_flags); +} + static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 42305ddcbaa001..78fa452d65ed18 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -462,6 +462,7 @@ int proc_fill_super(struct super_block *s) struct inode *root_inode; int ret; + s->s_iflags |= SB_I_USERNS_VISIBLE; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; diff --git a/fs/proc/root.c b/fs/proc/root.c index 55bc7d6c8aacae..a1b2860fec62d7 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -158,7 +158,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index f3db82071cfbd5..f31e36994dfb4e 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -42,7 +42,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); else if (new_sb) /* Userspace would break if executables appear on sysfs */ - root->d_sb->s_iflags |= SB_I_NOEXEC; + root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC; return root; } @@ -59,7 +59,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/include/linux/fs.h b/include/linux/fs.h index dd288148a6b15f..71988dd3af95b1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1328,6 +1328,9 @@ struct mm_struct; #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ +/* sb->s_iflags to limit user namespace mounts */ +#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ + /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ @@ -2011,7 +2014,6 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ -#define FS_USERNS_VISIBLE 32 /* FS must already be visible */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); From b236017acffa73d52eac9427f42d8993067d20fb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 May 2016 12:26:41 -0500 Subject: [PATCH 02/30] ipc: Initialize ipc_namespace->user_ns early. Allow the ipc namespace initialization code to depend on ns->user_ns being set during initialization. In particular this allows mq_init_ns to use ns->user_ns for permission checks and initializating s_user_ns while the the mq filesystem is being mounted. Acked-by: Seth Forshee Suggested-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- ipc/namespace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ipc/namespace.c b/ipc/namespace.c index 068caf18d56509..04cb07eb81f1c1 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -34,8 +34,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, ns->ns.ops = &ipcns_operations; atomic_set(&ns->count, 1); + ns->user_ns = get_user_ns(user_ns); + err = mq_init_ns(ns); if (err) { + put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); return ERR_PTR(err); @@ -46,8 +49,6 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, msg_init_ns(ns); shm_init_ns(ns); - ns->user_ns = get_user_ns(user_ns); - return ns; } From d91ee87d8d85a0808c01787e8b4a6b48f2ba487b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 23 May 2016 14:51:59 -0500 Subject: [PATCH 03/30] vfs: Pass data, ns, and ns->userns to mount_ns Today what is normally called data (the mount options) is not passed to fill_super through mount_ns. Pass the mount options and the namespace separately to mount_ns so that filesystems such as proc that have mount options, can use mount_ns. Pass the user namespace to mount_ns so that the standard permission check that verifies the mounter has permissions over the namespace can be performed in mount_ns instead of in each filesystems .mount method. Thus removing the duplication between mqueuefs and proc in terms of permission checks. The extra permission check does not currently affect the rpc_pipefs filesystem and the nfsd filesystem as those filesystems do not currently allow unprivileged mounts. Without unpvileged mounts it is guaranteed that the caller has already passed capable(CAP_SYS_ADMIN) which guarantees extra permission check will pass. Update rpc_pipefs and the nfsd filesystem to ensure that the network namespace reference is always taken in fill_super and always put in kill_sb so that the logic is simpler and so that errors originating inside of fill_super do not cause a network namespace leak. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/nfsd/nfsctl.c | 13 ++++--------- fs/super.c | 13 ++++++++++--- include/linux/fs.h | 5 +++-- ipc/mqueue.c | 19 ++++++++----------- net/sunrpc/rpc_pipe.c | 8 ++++---- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 9690cb4dd5887b..5a6ae2522266bf 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1154,20 +1154,15 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) #endif /* last one */ {""} }; - struct net *net = data; - int ret; - - ret = simple_fill_super(sb, 0x6e667364, nfsd_files); - if (ret) - return ret; - sb->s_fs_info = get_net(net); - return 0; + get_net(sb->s_fs_info); + return simple_fill_super(sb, 0x6e667364, nfsd_files); } static struct dentry *nfsd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super); + struct net *net = current->nsproxy->net_ns; + return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super); } static void nfsd_umount(struct super_block *sb) diff --git a/fs/super.c b/fs/super.c index d78b9847e6cbc6..fd65667832e568 100644 --- a/fs/super.c +++ b/fs/super.c @@ -918,12 +918,19 @@ static int ns_set_super(struct super_block *sb, void *data) return set_anon_super(sb, NULL); } -struct dentry *mount_ns(struct file_system_type *fs_type, int flags, - void *data, int (*fill_super)(struct super_block *, void *, int)) +struct dentry *mount_ns(struct file_system_type *fs_type, + int flags, void *data, void *ns, struct user_namespace *user_ns, + int (*fill_super)(struct super_block *, void *, int)) { struct super_block *sb; - sb = sget(fs_type, ns_test_super, ns_set_super, flags, data); + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN + * over the namespace. + */ + if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns); if (IS_ERR(sb)) return ERR_CAST(sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index 71988dd3af95b1..1ce006a24f497e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2034,8 +2034,9 @@ struct file_system_type { #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) -extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, - void *data, int (*fill_super)(struct super_block *, void *, int)); +extern struct dentry *mount_ns(struct file_system_type *fs_type, + int flags, void *data, void *ns, struct user_namespace *user_ns, + int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index ade739f67f1df6..60d97082f4dcff 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -305,7 +305,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, static int mqueue_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct ipc_namespace *ns = data; + struct ipc_namespace *ns = sb->s_fs_info; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -326,17 +326,14 @@ static struct dentry *mqueue_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - if (!(flags & MS_KERNMOUNT)) { - struct ipc_namespace *ns = current->nsproxy->ipc_ns; - /* Don't allow mounting unless the caller has CAP_SYS_ADMIN - * over the ipc namespace. - */ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - data = ns; + struct ipc_namespace *ns; + if (flags & MS_KERNMOUNT) { + ns = data; + data = NULL; + } else { + ns = current->nsproxy->ipc_ns; } - return mount_ns(fs_type, flags, data, mqueue_fill_super); + return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super); } static void init_once(void *foo) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index fc48eca21fd2ed..84f98cbe31c3cf 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1386,7 +1386,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct dentry *root, *gssd_dentry; - struct net *net = data; + struct net *net = get_net(sb->s_fs_info); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1419,7 +1419,6 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb); if (err) goto err_depopulate; - sb->s_fs_info = get_net(net); mutex_unlock(&sn->pipefs_sb_lock); return 0; @@ -1448,7 +1447,8 @@ static struct dentry * rpc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_ns(fs_type, flags, current->nsproxy->net_ns, rpc_fill_super); + struct net *net = current->nsproxy->net_ns; + return mount_ns(fs_type, flags, data, net, net->user_ns, rpc_fill_super); } static void rpc_kill_sb(struct super_block *sb) @@ -1468,9 +1468,9 @@ static void rpc_kill_sb(struct super_block *sb) RPC_PIPEFS_UMOUNT, sb); mutex_unlock(&sn->pipefs_sb_lock); - put_net(net); out: kill_litter_super(sb); + put_net(net); } static struct file_system_type rpc_pipe_fs_type = { From e94591d0d90c13166cb6eb54ce5f96ed13d81b55 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 15:32:10 -0500 Subject: [PATCH 04/30] proc: Convert proc_mount to use mount_ns. Move the call of get_pid_ns, the call of proc_parse_options, and the setting of s_iflags into proc_fill_super so that mount_ns can be used. Convert proc_mount to call mount_ns and remove the now unnecessary code. Acked-by: Seth Forshee Reviewed-by: Djalal Harouni Signed-off-by: "Eric W. Biederman" --- fs/proc/inode.c | 9 ++++++-- fs/proc/internal.h | 3 ++- fs/proc/root.c | 52 ++++------------------------------------------ 3 files changed, 13 insertions(+), 51 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 78fa452d65ed18..f4817efb25a6a6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -457,12 +457,17 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) return inode; } -int proc_fill_super(struct super_block *s) +int proc_fill_super(struct super_block *s, void *data, int silent) { + struct pid_namespace *ns = get_pid_ns(s->s_fs_info); struct inode *root_inode; int ret; - s->s_iflags |= SB_I_USERNS_VISIBLE; + if (!proc_parse_options(data, ns)) + return -EINVAL; + + /* User space would break if executables appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa2781095bd15f..7931c558c19250 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -212,7 +212,7 @@ extern const struct inode_operations proc_pid_link_inode_operations; extern void proc_init_inodecache(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -extern int proc_fill_super(struct super_block *); +extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); /* @@ -268,6 +268,7 @@ static inline void proc_tty_init(void) {} * root.c */ extern struct proc_dir_entry proc_root; +extern int proc_parse_options(char *options, struct pid_namespace *pid); extern void proc_self_init(void); extern int proc_remount(struct super_block *, int *, char *); diff --git a/fs/proc/root.c b/fs/proc/root.c index a1b2860fec62d7..8d3e484055a6b3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -23,21 +23,6 @@ #include "internal.h" -static int proc_test_super(struct super_block *sb, void *data) -{ - return sb->s_fs_info == data; -} - -static int proc_set_super(struct super_block *sb, void *data) -{ - int err = set_anon_super(sb, NULL); - if (!err) { - struct pid_namespace *ns = (struct pid_namespace *)data; - sb->s_fs_info = get_pid_ns(ns); - } - return err; -} - enum { Opt_gid, Opt_hidepid, Opt_err, }; @@ -48,7 +33,7 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; -static int proc_parse_options(char *options, struct pid_namespace *pid) +int proc_parse_options(char *options, struct pid_namespace *pid) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -100,45 +85,16 @@ int proc_remount(struct super_block *sb, int *flags, char *data) static struct dentry *proc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - int err; - struct super_block *sb; struct pid_namespace *ns; - char *options; if (flags & MS_KERNMOUNT) { - ns = (struct pid_namespace *)data; - options = NULL; + ns = data; + data = NULL; } else { ns = task_active_pid_ns(current); - options = data; - - /* Does the mounter have privilege over the pid namespace? */ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - } - - sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); - if (IS_ERR(sb)) - return ERR_CAST(sb); - - if (!proc_parse_options(options, ns)) { - deactivate_locked_super(sb); - return ERR_PTR(-EINVAL); - } - - if (!sb->s_root) { - err = proc_fill_super(sb); - if (err) { - deactivate_locked_super(sb); - return ERR_PTR(err); - } - - sb->s_flags |= MS_ACTIVE; - /* User space would break if executables appear on proc */ - sb->s_iflags |= SB_I_NOEXEC; } - return dget(sb->s_root); + return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super); } static void proc_kill_sb(struct super_block *sb) From 6e4eab577a0cae15b3da9b888cff16fe57981b3e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 24 May 2016 09:29:01 -0500 Subject: [PATCH 05/30] fs: Add user namespace member to struct super_block Start marking filesystems with a user namespace owner, s_user_ns. In this change this is only used for permission checks of who may mount a filesystem. Ultimately s_user_ns will be used for translating ids and checking capabilities for filesystems mounted from user namespaces. The default policy for setting s_user_ns is implemented in sget(), which arranges for s_user_ns to be set to current_user_ns() and to ensure that the mounter of the filesystem has CAP_SYS_ADMIN in that user_ns. The guts of sget are split out into another function sget_userns(). The function sget_userns calls alloc_super with the specified user namespace or it verifies the existing superblock that was found has the expected user namespace, and fails with EBUSY when it is not. This failing prevents users with the wrong privileges mounting a filesystem. The reason for the split of sget_userns from sget is that in some cases such as mount_ns and kernfs_mount_ns a different policy for permission checking of mounts and setting s_user_ns is necessary, and the existence of sget_userns() allows those policies to be implemented. The helper mount_ns is expected to be used for filesystems such as proc and mqueuefs which present per namespace information. The function mount_ns is modified to call sget_userns instead of sget to ensure the user namespace owner of the namespace whose information is presented by the filesystem is used on the superblock. For sysfs and cgroup the appropriate permission checks are already in place, and kernfs_mount_ns is modified to call sget_userns so that the init_user_ns is the only user namespace used. For the cgroup filesystem cgroup namespace mounts are bind mounts of a subset of the full cgroup filesystem and as such s_user_ns must be the same for all of them as there is only a single superblock. Mounts of sysfs that vary based on the network namespace could in principle change s_user_ns but it keeps the analysis and implementation of kernfs simpler if that is not supported, and at present there appear to be no benefits from supporting a different s_user_ns on any sysfs mount. Getting the details of setting s_user_ns correct has been a long process. Thanks to Pavel Tikhorirorv who spotted a leak in sget_userns. Thanks to Seth Forshee who has kept the work alive. Thanks-to: Seth Forshee Thanks-to: Pavel Tikhomirov Acked-by: Seth Forshee Signed-off-by: Eric W. Biederman --- fs/kernfs/mount.c | 3 ++- fs/super.c | 52 ++++++++++++++++++++++++++++++++++++++++------ include/linux/fs.h | 12 +++++++++++ 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 63534f5f9073c3..d90d574c15a2f5 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -241,7 +241,8 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, info->root = root; info->ns = ns; - sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); + sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, + &init_user_ns, info); if (IS_ERR(sb) || sb->s_fs_info != info) kfree(info); if (IS_ERR(sb)) diff --git a/fs/super.c b/fs/super.c index fd65667832e568..874c7e3ebb8f18 100644 --- a/fs/super.c +++ b/fs/super.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "internal.h" @@ -165,6 +166,7 @@ static void destroy_super(struct super_block *s) list_lru_destroy(&s->s_inode_lru); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); + put_user_ns(s->s_user_ns); kfree(s->s_subtype); kfree(s->s_options); call_rcu(&s->rcu, destroy_super_rcu); @@ -174,11 +176,13 @@ static void destroy_super(struct super_block *s) * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags + * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ -static struct super_block *alloc_super(struct file_system_type *type, int flags) +static struct super_block *alloc_super(struct file_system_type *type, int flags, + struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; @@ -188,6 +192,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) return NULL; INIT_LIST_HEAD(&s->s_mounts); + s->s_user_ns = get_user_ns(user_ns); if (security_sb_alloc(s)) goto fail; @@ -443,17 +448,18 @@ void generic_shutdown_super(struct super_block *sb) EXPORT_SYMBOL(generic_shutdown_super); /** - * sget - find or create a superblock + * sget_userns - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags + * @user_ns: User namespace for the super_block * @data: argument to each of them */ -struct super_block *sget(struct file_system_type *type, +struct super_block *sget_userns(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), - int flags, + int flags, struct user_namespace *user_ns, void *data) { struct super_block *s = NULL; @@ -466,6 +472,14 @@ struct super_block *sget(struct file_system_type *type, hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; + if (user_ns != old->s_user_ns) { + spin_unlock(&sb_lock); + if (s) { + up_write(&s->s_umount); + destroy_super(s); + } + return ERR_PTR(-EBUSY); + } if (!grab_super(old)) goto retry; if (s) { @@ -478,7 +492,7 @@ struct super_block *sget(struct file_system_type *type, } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags); + s = alloc_super(type, flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -501,6 +515,31 @@ struct super_block *sget(struct file_system_type *type, return s; } +EXPORT_SYMBOL(sget_userns); + +/** + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @flags: mount flags + * @data: argument to each of them + */ +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + int flags, + void *data) +{ + struct user_namespace *user_ns = current_user_ns(); + + /* Ensure the requestor has permissions over the target filesystem */ + if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + return sget_userns(type, test, set, flags, user_ns, data); +} + EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) @@ -930,7 +969,8 @@ struct dentry *mount_ns(struct file_system_type *fs_type, if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - sb = sget(fs_type, ns_test_super, ns_set_super, flags, ns); + sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags, + user_ns, ns); if (IS_ERR(sb)) return ERR_CAST(sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ce006a24f497e..9eef64f23a755f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1432,6 +1432,13 @@ struct super_block { struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; + /* + * Owning user namespace and default context in which to + * interpret filesystem uids, gids, quotas, device nodes, + * xattrs and security labels. + */ + struct user_namespace *s_user_ns; + /* * Keep the lru lists last in the structure so they always sit on their * own individual cachelines. @@ -2056,6 +2063,11 @@ void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); +struct super_block *sget_userns(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + int flags, struct user_namespace *user_ns, + void *data); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), From a001e74cef34d95ede6535ef521011c612657a3a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 6 Jun 2016 15:48:04 -0500 Subject: [PATCH 06/30] mnt: Move the FS_USERNS_MOUNT check into sget_userns Allowing a filesystem to be mounted by other than root in the initial user namespace is a filesystem property not a mount namespace property and as such should be checked in filesystem specific code. Move the FS_USERNS_MOUNT test into super.c:sget_userns(). Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 4 ---- fs/super.c | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 1a69aa78697583..2e13f6cfe5dfd8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2397,10 +2397,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, return -ENODEV; if (user_ns != &init_user_ns) { - if (!(type->fs_flags & FS_USERNS_MOUNT)) { - put_filesystem(type); - return -EPERM; - } /* Only in special cases allow devices from mounts * created outside the initial user namespace. */ diff --git a/fs/super.c b/fs/super.c index 874c7e3ebb8f18..78790ada71915d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -466,6 +466,10 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; + if (!(flags & MS_KERNMOUNT) && + !(type->fs_flags & FS_USERNS_MOUNT) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); retry: spin_lock(&sb_lock); if (test) { From 29a517c232d21a717aecea29838aeb07131f6196 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Jun 2016 13:03:05 -0500 Subject: [PATCH 07/30] kernfs: The cgroup filesystem also benefits from SB_I_NOEXEC The cgroup filesystem is in the same boat as sysfs. No one ever permits executables of any kind on the cgroup filesystem, and there is no reasonable future case to support executables in the future. Therefore move the setting of SB_I_NOEXEC which makes the code proof against future mistakes of accidentally creating executables from sysfs to kernfs itself. Making the code simpler and covering the sysfs, cgroup, and cgroup2 filesystems. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/kernfs/mount.c | 2 ++ fs/sysfs/mount.c | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d90d574c15a2f5..1443df670260a1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -152,6 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) struct dentry *root; info->sb = sb; + /* Userspace would break if executables appear on sysfs */ + sb->s_iflags |= SB_I_NOEXEC; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = magic; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index f31e36994dfb4e..20b8f82e115b64 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -41,8 +41,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); else if (new_sb) - /* Userspace would break if executables appear on sysfs */ - root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC; + root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; return root; } From 3ee690143c3c99f6c0e83f08ff17556890bc6027 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 May 2016 15:42:21 -0500 Subject: [PATCH 08/30] ipc/mqueue: The mqueue filesystem should never contain executables Set SB_I_NOEXEC on mqueuefs to ensure small implementation mistakes do not result in executable on mqueuefs by accident. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- ipc/mqueue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 60d97082f4dcff..5bdd50de7d058a 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -307,6 +307,7 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode; struct ipc_namespace *ns = sb->s_fs_info; + sb->s_iflags |= SB_I_NOEXEC; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = MQUEUE_MAGIC; From a2982cc922c3068783eb9a1f77a5626a1ec36a1f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 15:34:02 -0500 Subject: [PATCH 09/30] vfs: Generalize filesystem nodev handling. Introduce a function may_open_dev that tests MNT_NODEV and a new superblock flab SB_I_NODEV. Use this new function in all of the places where MNT_NODEV was previously tested. Add the new SB_I_NODEV s_iflag to proc, sysfs, and mqueuefs as those filesystems should never support device nodes, and a simple superblock flags makes that very hard to get wrong. With SB_I_NODEV set if any device nodes somehow manage to show up on on a filesystem those device nodes will be unopenable. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/block_dev.c | 2 +- fs/kernfs/mount.c | 4 ++-- fs/namei.c | 8 +++++++- fs/proc/inode.c | 4 ++-- include/linux/fs.h | 2 ++ ipc/mqueue.c | 2 +- 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 71ccab1d22c613..30b8d568203a0a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1857,7 +1857,7 @@ struct block_device *lookup_bdev(const char *pathname) if (!S_ISBLK(inode->i_mode)) goto fail; error = -EACCES; - if (path.mnt->mnt_flags & MNT_NODEV) + if (!may_open_dev(&path)) goto fail; error = -ENOMEM; bdev = bd_acquire(inode); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 1443df670260a1..b3d73ad52b22ae 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -152,8 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) struct dentry *root; info->sb = sb; - /* Userspace would break if executables appear on sysfs */ - sb->s_iflags |= SB_I_NOEXEC; + /* Userspace would break if executables or devices appear on sysfs */ + sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = magic; diff --git a/fs/namei.c b/fs/namei.c index 6a82fb7e21273c..757a32725d92e6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2881,6 +2881,12 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } EXPORT_SYMBOL(vfs_create); +bool may_open_dev(const struct path *path) +{ + return !(path->mnt->mnt_flags & MNT_NODEV) && + !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); +} + static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; @@ -2899,7 +2905,7 @@ static int may_open(struct path *path, int acc_mode, int flag) break; case S_IFBLK: case S_IFCHR: - if (path->mnt->mnt_flags & MNT_NODEV) + if (!may_open_dev(path)) return -EACCES; /*FALLTHRU*/ case S_IFIFO: diff --git a/fs/proc/inode.c b/fs/proc/inode.c index f4817efb25a6a6..a5b2c33745b7be 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -466,8 +466,8 @@ int proc_fill_super(struct super_block *s, void *data, int silent) if (!proc_parse_options(data, ns)) return -EINVAL; - /* User space would break if executables appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC; + /* User space would break if executables or devices appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eef64f23a755f..e05983170d2301 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1327,6 +1327,7 @@ struct mm_struct; /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ +#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ @@ -1602,6 +1603,7 @@ extern int vfs_whiteout(struct inode *, struct dentry *); */ extern void inode_init_owner(struct inode *inode, const struct inode *dir, umode_t mode); +extern bool may_open_dev(const struct path *path); /* * VFS FS_IOC_FIEMAP helper definitions. */ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 5bdd50de7d058a..0b13ace266f2d0 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -307,7 +307,7 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode; struct ipc_namespace *ns = sb->s_fs_info; - sb->s_iflags |= SB_I_NOEXEC; + sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = MQUEUE_MAGIC; From a1935c1738af53249a02290ff7c10e8a6e650a16 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Jun 2016 06:59:49 -0500 Subject: [PATCH 10/30] mnt: Simplify mount_too_revealing Verify all filesystems that we check in mount_too_revealing set SB_I_NOEXEC and SB_I_NODEV in sb->s_iflags. That is true for today and it should remain true in the future. Remove the now unnecessary checks from mnt_already_visibile that ensure MNT_LOCK_NOSUID, MNT_LOCK_NOEXEC, and MNT_LOCK_NODEV are preserved. Making the code shorter and easier to read. Relying on SB_I_NOEXEC and SB_I_NODEV instead of the user visible MNT_NOSUID, MNT_NOEXEC, and MNT_NODEV ensures the many current systems where proc and sysfs are mounted with "nosuid, nodev, noexec" and several slightly buggy container applications don't bother to set those flags continue to work. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 2e13f6cfe5dfd8..b1da7f8182c481 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3232,12 +3232,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; - /* Read the mount flags and filter out flags that - * may safely be ignored. - */ + /* A local view of the mount flags */ mnt_flags = mnt->mnt.mnt_flags; - if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC) - mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC); /* Don't miss readonly hidden in the superblock flags */ if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY) @@ -3249,15 +3245,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; - if ((mnt_flags & MNT_LOCK_NODEV) && - !(new_flags & MNT_NODEV)) - continue; - if ((mnt_flags & MNT_LOCK_NOSUID) && - !(new_flags & MNT_NOSUID)) - continue; - if ((mnt_flags & MNT_LOCK_NOEXEC) && - !(new_flags & MNT_NOEXEC)) - continue; if ((mnt_flags & MNT_LOCK_ATIME) && ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; @@ -3277,9 +3264,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, } /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ - MNT_LOCK_NODEV | \ - MNT_LOCK_NOSUID | \ - MNT_LOCK_NOEXEC | \ MNT_LOCK_ATIME); visible = true; goto found; @@ -3292,6 +3276,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) { + const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; unsigned long s_iflags; @@ -3303,6 +3288,12 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; + if ((s_iflags & required_iflags) != required_iflags) { + WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", + required_iflags); + return true; + } + return !mnt_already_visible(ns, mnt, new_mnt_flags); } From 67690f937c38bbab1d94cb45f6a32e61612834ae Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 18 May 2016 13:50:06 -0500 Subject: [PATCH 11/30] userns: Remove implicit MNT_NODEV fragility. Replace the implict setting of MNT_NODEV on mounts that happen with just user namespace permissions with an implicit setting of SB_I_NODEV in s_iflags. The visibility of the implicit MNT_NODEV has caused problems in the past. With this change the fragile case where an implicit MNT_NODEV needs to be preserved in do_remount is removed. Using SB_I_NODEV is much less fragile as s_iflags are set during the original mount and never changed. In do_new_mount with the implicit setting of MNT_NODEV gone, the only code that can affect mnt_flags is fs_fully_visible so simplify the if statement and reduce the indentation of the code to make that clear. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 19 +------------------ fs/super.c | 3 +++ 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index b1da7f8182c481..9786a38d168188 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2185,13 +2185,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, } if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) { - /* Was the nodev implicitly added in mount? */ - if ((mnt->mnt_ns->user_ns != &init_user_ns) && - !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) { - mnt_flags |= MNT_NODEV; - } else { - return -EPERM; - } + return -EPERM; } if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) { @@ -2385,7 +2379,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct vfsmount *mnt; int err; @@ -2396,16 +2389,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, if (!type) return -ENODEV; - if (user_ns != &init_user_ns) { - /* Only in special cases allow devices from mounts - * created outside the initial user namespace. - */ - if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { - flags |= MS_NODEV; - mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; - } - } - mnt = vfs_kern_mount(type, flags, name, data); if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && !mnt->mnt_sb->s_subtype) diff --git a/fs/super.c b/fs/super.c index 78790ada71915d..25cdceed2ad37e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -206,6 +206,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; + if ((s->s_user_ns != &init_user_ns) && + !(type->fs_flags & FS_USERNS_DEV_MOUNT)) + s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); mutex_init(&s->s_sync_lock); From cc50a07a247e17db76b1f0b0ca06652556e04fa3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Jun 2016 15:44:48 -0500 Subject: [PATCH 12/30] userns: Remove the now unnecessary FS_USERNS_DEV_MOUNT flag Now that SB_I_NODEV controls the nodev behavior devpts can just clear this flag during mount. Simplifying the code and making it easier to audit how the code works. While still preserving the invariant that s_iflags is only modified during mount. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/devpts/inode.c | 3 ++- fs/super.c | 3 +-- include/linux/fs.h | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 37c134a132c7db..d116453b027663 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -396,6 +396,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; + s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; @@ -480,7 +481,7 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, - .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; /* diff --git a/fs/super.c b/fs/super.c index 25cdceed2ad37e..37813bf479cfee 100644 --- a/fs/super.c +++ b/fs/super.c @@ -206,8 +206,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; - if ((s->s_user_ns != &init_user_ns) && - !(type->fs_flags & FS_USERNS_DEV_MOUNT)) + if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); diff --git a/include/linux/fs.h b/include/linux/fs.h index e05983170d2301..375e37f42cdf6e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2022,7 +2022,6 @@ struct file_system_type { #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ -#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); From d07b846f6200454c50d791796edb82660192513d Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Wed, 23 Sep 2015 15:16:04 -0500 Subject: [PATCH 13/30] fs: Limit file caps to the user namespace of the super block Capability sets attached to files must be ignored except in the user namespaces where the mounter is privileged, i.e. s_user_ns and its descendants. Otherwise a vector exists for gaining privileges in namespaces where a user is not already privileged. Add a new helper function, current_in_user_ns(), to test whether a user namespace is the same as or a descendant of another namespace. Use this helper to determine whether a file's capability set should be applied to the caps constructed during exec. --EWB Replaced in_userns with the simpler current_in_userns. Acked-by: Serge Hallyn Signed-off-by: Seth Forshee Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 6 ++++++ kernel/user_namespace.c | 14 ++++++++++++++ security/commoncap.c | 2 ++ 3 files changed, 22 insertions(+) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 8297e5b341d863..9217169c64cb80 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -72,6 +72,7 @@ extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); +extern bool current_in_userns(const struct user_namespace *target_ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) @@ -100,6 +101,11 @@ static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } + +static inline bool current_in_userns(const struct user_namespace *target_ns) +{ + return true; +} #endif #endif /* _LINUX_USER_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9bafc211930c79..68f5942127590e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -938,6 +938,20 @@ bool userns_may_setgroups(const struct user_namespace *ns) return allowed; } +/* + * Returns true if @ns is the same namespace as or a descendant of + * @target_ns. + */ +bool current_in_userns(const struct user_namespace *target_ns) +{ + struct user_namespace *ns; + for (ns = current_user_ns(); ns; ns = ns->parent) { + if (ns == target_ns) + return true; + } + return false; +} + static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); diff --git a/security/commoncap.c b/security/commoncap.c index e7fadde737f41c..e109e6dac85870 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -455,6 +455,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) return 0; + if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns)) + return 0; rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); if (rc < 0) { From 380cf5ba6b0a0b307f4afb62b186ca801defb203 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 23 Jun 2016 16:41:05 -0500 Subject: [PATCH 14/30] fs: Treat foreign mounts as nosuid If a process gets access to a mount from a different user namespace, that process should not be able to take advantage of setuid files or selinux entrypoints from that filesystem. Prevent this by treating mounts from other mount namespaces and those not owned by current_user_ns() or an ancestor as nosuid. This will make it safer to allow more complex filesystems to be mounted in non-root user namespaces. This does not remove the need for MNT_LOCK_NOSUID. The setuid, setgid, and file capability bits can no longer be abused if code in a user namespace were to clear nosuid on an untrusted filesystem, but this patch, by itself, is insufficient to protect the system from abuse of files that, when execed, would increase MAC privilege. As a more concrete explanation, any task that can manipulate a vfsmount associated with a given user namespace already has capabilities in that namespace and all of its descendents. If they can cause a malicious setuid, setgid, or file-caps executable to appear in that mount, then that executable will only allow them to elevate privileges in exactly the set of namespaces in which they are already privileges. On the other hand, if they can cause a malicious executable to appear with a dangerous MAC label, running it could change the caller's security context in a way that should not have been possible, even inside the namespace in which the task is confined. As a hardening measure, this would have made CVE-2014-5207 much more difficult to exploit. Signed-off-by: Andy Lutomirski Signed-off-by: Seth Forshee Acked-by: James Morris Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/exec.c | 2 +- fs/namespace.c | 13 +++++++++++++ include/linux/mount.h | 1 + security/commoncap.c | 8 +++++++- security/selinux/hooks.c | 2 +- 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 887c1c955df826..ca239fc86d8d0a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1411,7 +1411,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) bprm->cred->euid = current_euid(); bprm->cred->egid = current_egid(); - if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) + if (!mnt_may_suid(bprm->file->f_path.mnt)) return; if (task_no_new_privs(current)) diff --git a/fs/namespace.c b/fs/namespace.c index 9786a38d168188..aabe8e397fc3f4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3280,6 +3280,19 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return !mnt_already_visible(ns, mnt, new_mnt_flags); } +bool mnt_may_suid(struct vfsmount *mnt) +{ + /* + * Foreign mounts (accessed via fchdir or through /proc + * symlinks) are always treated as if they are nosuid. This + * prevents namespaces from trusting potentially unsafe + * suid/sgid bits, file caps, or security labels that originate + * in other namespaces. + */ + return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && + current_in_userns(mnt->mnt_sb->s_user_ns); +} + static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; diff --git a/include/linux/mount.h b/include/linux/mount.h index f822c3c1137771..54a594d49733b6 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -81,6 +81,7 @@ extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); +extern bool mnt_may_suid(struct vfsmount *mnt); struct path; extern struct vfsmount *clone_private_mount(struct path *path); diff --git a/security/commoncap.c b/security/commoncap.c index e109e6dac85870..14540bd7856182 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -453,8 +453,14 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c if (!file_caps_enabled) return 0; - if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) + if (!mnt_may_suid(bprm->file->f_path.mnt)) return 0; + + /* + * This check is redundant with mnt_may_suid() but is kept to make + * explicit that capability bits are limited to s_user_ns and its + * descendants. + */ if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns)) return 0; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index a86d537eb79b14..15541756eb0774 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2259,7 +2259,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, const struct task_security_struct *new_tsec) { int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS); - int nosuid = (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID); + int nosuid = !mnt_may_suid(bprm->file->f_path.mnt); int rc; if (!nnp && !nosuid) From 9f50eda2a9277e0bc51d8ca5dd2ec1d0e73601bc Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Wed, 23 Sep 2015 15:16:06 -0500 Subject: [PATCH 15/30] Smack: Add support for unprivileged mounts from user namespaces Security labels from unprivileged mounts cannot be trusted. Ideally for these mounts we would assign the objects in the filesystem the same label as the inode for the backing device passed to mount. Unfortunately it's currently impossible to determine which inode this is from the LSM mount hooks, so we settle for the label of the process doing the mount. This label is assigned to s_root, and also to smk_default to ensure that new inodes receive this label. The transmute property is also set on s_root to make this behavior more explicit, even though it is technically not necessary. If a filesystem has existing security labels, access to inodes is permitted if the label is the same as smk_root, otherwise access is denied. The SMACK64EXEC xattr is completely ignored. Explicit setting of security labels continues to require CAP_MAC_ADMIN in init_user_ns. Altogether, this ensures that filesystem objects are not accessible to subjects which cannot already access the backing store, that MAC is not violated for any objects in the fileystem which are already labeled, and that a user cannot use an unprivileged mount to gain elevated MAC privileges. sysfs, tmpfs, and ramfs are already mountable from user namespaces and support security labels. We can't rule out the possibility that these filesystems may already be used in mounts from user namespaces with security lables set from the init namespace, so failing to trust lables in these filesystems may introduce regressions. It is safe to trust labels from these filesystems, since the unprivileged user does not control the backing store and thus cannot supply security labels, so an explicit exception is made to trust labels from these filesystems. Signed-off-by: Seth Forshee Acked-by: Casey Schaufler Signed-off-by: Eric W. Biederman --- security/smack/smack.h | 8 +++++++- security/smack/smack_lsm.c | 41 ++++++++++++++++++++++++++++---------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/security/smack/smack.h b/security/smack/smack.h index 6c91156ae22567..26e58f1804b10b 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -90,9 +90,15 @@ struct superblock_smack { struct smack_known *smk_floor; struct smack_known *smk_hat; struct smack_known *smk_default; - int smk_initialized; + int smk_flags; }; +/* + * Superblock flags + */ +#define SMK_SB_INITIALIZED 0x01 +#define SMK_SB_UNTRUSTED 0x02 + struct socket_smack { struct smack_known *smk_out; /* outbound label */ struct smack_known *smk_in; /* inbound label */ diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 6777295f4b2b75..a799b7af11afcd 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -549,7 +549,7 @@ static int smack_sb_alloc_security(struct super_block *sb) sbsp->smk_floor = &smack_known_floor; sbsp->smk_hat = &smack_known_hat; /* - * smk_initialized will be zero from kzalloc. + * SMK_SB_INITIALIZED will be zero from kzalloc. */ sb->s_security = sbsp; @@ -766,10 +766,10 @@ static int smack_set_mnt_opts(struct super_block *sb, int num_opts = opts->num_mnt_opts; int transmute = 0; - if (sp->smk_initialized) + if (sp->smk_flags & SMK_SB_INITIALIZED) return 0; - sp->smk_initialized = 1; + sp->smk_flags |= SMK_SB_INITIALIZED; for (i = 0; i < num_opts; i++) { switch (opts->mnt_opts_flags[i]) { @@ -821,6 +821,17 @@ static int smack_set_mnt_opts(struct super_block *sb, skp = smk_of_current(); sp->smk_root = skp; sp->smk_default = skp; + /* + * For a handful of fs types with no user-controlled + * backing store it's okay to trust security labels + * in the filesystem. The rest are untrusted. + */ + if (sb->s_user_ns != &init_user_ns && + sb->s_magic != SYSFS_MAGIC && sb->s_magic != TMPFS_MAGIC && + sb->s_magic != RAMFS_MAGIC) { + transmute = 1; + sp->smk_flags |= SMK_SB_UNTRUSTED; + } } /* @@ -1203,6 +1214,7 @@ static int smack_inode_rename(struct inode *old_inode, */ static int smack_inode_permission(struct inode *inode, int mask) { + struct superblock_smack *sbsp = inode->i_sb->s_security; struct smk_audit_info ad; int no_block = mask & MAY_NOT_BLOCK; int rc; @@ -1214,6 +1226,11 @@ static int smack_inode_permission(struct inode *inode, int mask) if (mask == 0) return 0; + if (sbsp->smk_flags & SMK_SB_UNTRUSTED) { + if (smk_of_inode(inode) != sbsp->smk_root) + return -EACCES; + } + /* May be droppable after audit */ if (no_block) return -ECHILD; @@ -3529,14 +3546,16 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) if (rc >= 0) transflag = SMK_INODE_TRANSMUTE; } - /* - * Don't let the exec or mmap label be "*" or "@". - */ - skp = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp); - if (IS_ERR(skp) || skp == &smack_known_star || - skp == &smack_known_web) - skp = NULL; - isp->smk_task = skp; + if (!(sbsp->smk_flags & SMK_SB_UNTRUSTED)) { + /* + * Don't let the exec or mmap label be "*" or "@". + */ + skp = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp); + if (IS_ERR(skp) || skp == &smack_known_star || + skp == &smack_known_web) + skp = NULL; + isp->smk_task = skp; + } skp = smk_fetch(XATTR_NAME_SMACKMMAP, inode, dp); if (IS_ERR(skp) || skp == &smack_known_star || From 809c02e091a8272bc8586a5d606565bc900f3467 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 26 Apr 2016 14:36:22 -0500 Subject: [PATCH 16/30] Smack: Handle labels consistently in untrusted mounts The SMACK64, SMACK64EXEC, and SMACK64MMAP labels are all handled differently in untrusted mounts. This is confusing and potentically problematic. Change this to handle them all the same way that SMACK64 is currently handled; that is, read the label from disk and check it at use time. For SMACK64 and SMACK64MMAP access is denied if the label does not match smk_root. To be consistent with suid, a SMACK64EXEC label which does not match smk_root will still allow execution of the file but will not run with the label supplied in the xattr. Signed-off-by: Seth Forshee Acked-by: Casey Schaufler Signed-off-by: Eric W. Biederman --- security/smack/smack_lsm.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index a799b7af11afcd..b75634dbf53ba2 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -919,6 +919,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) struct inode *inode = file_inode(bprm->file); struct task_smack *bsp = bprm->cred->security; struct inode_smack *isp; + struct superblock_smack *sbsp; int rc; if (bprm->cred_prepared) @@ -928,6 +929,11 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) if (isp->smk_task == NULL || isp->smk_task == bsp->smk_task) return 0; + sbsp = inode->i_sb->s_security; + if ((sbsp->smk_flags & SMK_SB_UNTRUSTED) && + isp->smk_task != sbsp->smk_root) + return 0; + if (bprm->unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) { struct task_struct *tracer; rc = 0; @@ -1725,6 +1731,7 @@ static int smack_mmap_file(struct file *file, struct task_smack *tsp; struct smack_known *okp; struct inode_smack *isp; + struct superblock_smack *sbsp; int may; int mmay; int tmay; @@ -1736,6 +1743,10 @@ static int smack_mmap_file(struct file *file, isp = file_inode(file)->i_security; if (isp->smk_mmap == NULL) return 0; + sbsp = file_inode(file)->i_sb->s_security; + if (sbsp->smk_flags & SMK_SB_UNTRUSTED && + isp->smk_mmap != sbsp->smk_root) + return -EACCES; mkp = isp->smk_mmap; tsp = current_security(); @@ -3546,16 +3557,14 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode) if (rc >= 0) transflag = SMK_INODE_TRANSMUTE; } - if (!(sbsp->smk_flags & SMK_SB_UNTRUSTED)) { - /* - * Don't let the exec or mmap label be "*" or "@". - */ - skp = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp); - if (IS_ERR(skp) || skp == &smack_known_star || - skp == &smack_known_web) - skp = NULL; - isp->smk_task = skp; - } + /* + * Don't let the exec or mmap label be "*" or "@". + */ + skp = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp); + if (IS_ERR(skp) || skp == &smack_known_star || + skp == &smack_known_web) + skp = NULL; + isp->smk_task = skp; skp = smk_fetch(XATTR_NAME_SMACKMMAP, inode, dp); if (IS_ERR(skp) || skp == &smack_known_star || From aad82892af261b9903cc11c55be3ecf5f0b0b4f8 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 26 Apr 2016 14:36:20 -0500 Subject: [PATCH 17/30] selinux: Add support for unprivileged mounts from user namespaces Security labels from unprivileged mounts in user namespaces must be ignored. Force superblocks from user namespaces whose labeling behavior is to use xattrs to use mountpoint labeling instead. For the mountpoint label, default to converting the current task context into a form suitable for file objects, but also allow the policy writer to specify a different label through policy transition rules. Pieced together from code snippets provided by Stephen Smalley. Signed-off-by: Seth Forshee Acked-by: Stephen Smalley Acked-by: James Morris Signed-off-by: Eric W. Biederman --- security/selinux/hooks.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 15541756eb0774..19be9d39c7424e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -830,6 +830,28 @@ static int selinux_set_mnt_opts(struct super_block *sb, goto out; } } + + /* + * If this is a user namespace mount, no contexts are allowed + * on the command line and security labels must be ignored. + */ + if (sb->s_user_ns != &init_user_ns) { + if (context_sid || fscontext_sid || rootcontext_sid || + defcontext_sid) { + rc = -EACCES; + goto out; + } + if (sbsec->behavior == SECURITY_FS_USE_XATTR) { + sbsec->behavior = SECURITY_FS_USE_MNTPOINT; + rc = security_transition_sid(current_sid(), current_sid(), + SECCLASS_FILE, NULL, + &sbsec->mntpoint_sid); + if (rc) + goto out; + } + goto out_set_opts; + } + /* sets the context of the superblock for the fs being mounted. */ if (fscontext_sid) { rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred); @@ -898,6 +920,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, sbsec->def_sid = defcontext_sid; } +out_set_opts: rc = sb_finish_set_opts(sb); out: mutex_unlock(&sbsec->lock); From a475acf01f79e89a1a5845733e10108d80f77188 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 26 Apr 2016 14:36:25 -0500 Subject: [PATCH 18/30] fs: Refuse uid/gid changes which don't map into s_user_ns Add checks to notify_change to verify that uid and gid changes will map into the superblock's user namespace. If they do not fail with -EOVERFLOW. This is mandatory so that fileystems don't have to even think of dealing with ia_uid and ia_gid that --EWB Moved the test from inode_change_ok to notify_change Signed-off-by: Seth Forshee Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/attr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/attr.c b/fs/attr.c index 25b24d0f6c8810..dd723578ddce7c 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -255,6 +255,17 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) return 0; + /* + * Verify that uid/gid changes are valid in the target + * namespace of the superblock. + */ + if (ia_valid & ATTR_UID && + !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid)) + return -EOVERFLOW; + if (ia_valid & ATTR_GID && + !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) + return -EOVERFLOW; + error = security_inode_setattr(dentry, attr); if (error) return error; From 37b11804ed1725dc2ea97be2236150210a69e9d5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 27 Jun 2016 15:52:48 -0500 Subject: [PATCH 19/30] userns: Handle -1 in k[ug]id_has_mapping when !CONFIG_USER_NS Refuse to admit any user namespace has a mapping of the INVALID_UID and the INVALID_GID when !CONFIG_USER_NS. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- include/linux/uidgid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index 03835522dfcb68..25e9d92163408c 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -177,12 +177,12 @@ static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid) static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { - return true; + return uid_valid(uid); } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { - return true; + return gid_valid(gid); } #endif /* CONFIG_USER_NS */ From 0d4d717f25834134bb6f43284f84c8ccee5bbf2a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 27 Jun 2016 16:04:06 -0500 Subject: [PATCH 20/30] vfs: Verify acls are valid within superblock's s_user_ns. Update posix_acl_valid to verify that an acl is within a user namespace. Update the callers of posix_acl_valid to pass in an appropriate user namespace. For posix_acl_xattr_set and v9fs_xattr_set_acl pass in inode->i_sb->s_user_ns to posix_acl_valid. For md_unpack_acl pass in &init_user_ns as no inode or superblock is in sight. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- drivers/staging/lustre/lustre/mdc/mdc_request.c | 2 +- fs/9p/acl.c | 2 +- fs/posix_acl.c | 8 ++++---- include/linux/posix_acl.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c index 86b7445365f43b..40cf57fad58197 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -434,7 +434,7 @@ static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md) return rc; } - rc = posix_acl_valid(acl); + rc = posix_acl_valid(&init_user_ns, acl); if (rc) { CERROR("validate acl: %d\n", rc); posix_acl_release(acl); diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 0576eaeb60b952..5b6a1743ea17bd 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -266,7 +266,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { - retval = posix_acl_valid(acl); + retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (retval) goto err_out; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8a4a266beff39d..647c28180675b3 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -205,7 +205,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. */ int -posix_acl_valid(const struct posix_acl *acl) +posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; @@ -225,7 +225,7 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_USER: if (state != ACL_USER) return -EINVAL; - if (!uid_valid(pa->e_uid)) + if (!kuid_has_mapping(user_ns, pa->e_uid)) return -EINVAL; needs_mask = 1; break; @@ -240,7 +240,7 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_GROUP: if (state != ACL_GROUP) return -EINVAL; - if (!gid_valid(pa->e_gid)) + if (!kgid_has_mapping(user_ns, pa->e_gid)) return -EINVAL; needs_mask = 1; break; @@ -845,7 +845,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler, return PTR_ERR(acl); if (acl) { - ret = posix_acl_valid(acl); + ret = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (ret) goto out; } diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 5b5a80cc592658..1af6438fde3e40 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -81,7 +81,7 @@ posix_acl_release(struct posix_acl *acl) extern void posix_acl_init(struct posix_acl *, int); extern struct posix_acl *posix_acl_alloc(int, gfp_t); -extern int posix_acl_valid(const struct posix_acl *); +extern int posix_acl_valid(struct user_namespace *, const struct posix_acl *); extern int posix_acl_permission(struct inode *, const struct posix_acl *, int); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); From 2d7f9e2ad35e4e7a3086231f19bfab33c6a8a64a Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 26 Apr 2016 14:36:23 -0500 Subject: [PATCH 21/30] fs: Check for invalid i_uid in may_follow_link() Filesystem uids which don't map into a user namespace may result in inode->i_uid being INVALID_UID. A symlink and its parent could have different owners in the filesystem can both get mapped to INVALID_UID, which may result in following a symlink when this would not have otherwise been permitted when protected symlinks are enabled. Signed-off-by: Seth Forshee Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 757a32725d92e6..8701bd9a527032 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -901,6 +901,7 @@ static inline int may_follow_link(struct nameidata *nd) { const struct inode *inode; const struct inode *parent; + kuid_t puid; if (!sysctl_protected_symlinks) return 0; @@ -916,7 +917,8 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if parent directory and link owner match. */ - if (uid_eq(parent->i_uid, inode->i_uid)) + puid = parent->i_uid; + if (uid_valid(puid) && uid_eq(puid, inode->i_uid)) return 0; if (nd->flags & LOOKUP_RCU) From 5f65e5ca286126a60f62c8421b77c2018a482b8a Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 26 Apr 2016 14:36:24 -0500 Subject: [PATCH 22/30] cred: Reject inodes with invalid ids in set_create_file_as() Using INVALID_[UG]ID for the LSM file creation context doesn't make sense, so return an error if the inode passed to set_create_file_as() has an invalid id. Signed-off-by: Seth Forshee Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/cred.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cred.c b/kernel/cred.c index 0c0cd8a6228527..5f264fb5737dcd 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx); */ int set_create_files_as(struct cred *new, struct inode *inode) { + if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); From 0bd23d09b874e53bd1a2fe2296030aa2720d7b08 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 29 Jun 2016 14:54:46 -0500 Subject: [PATCH 23/30] vfs: Don't modify inodes with a uid or gid unknown to the vfs When a filesystem outside of init_user_ns is mounted it could have uids and gids stored in it that do not map to init_user_ns. The plan is to allow those filesystems to set i_uid to INVALID_UID and i_gid to INVALID_GID for unmapped uids and gids and then to handle that strange case in the vfs to ensure there is consistent robust handling of the weirdness. Upon a careful review of the vfs and filesystems about the only case where there is any possibility of confusion or trouble is when the inode is written back to disk. In that case filesystems typically read the inode->i_uid and inode->i_gid and write them to disk even when just an inode timestamp is being updated. Which leads to a rule that is very simple to implement and understand inodes whose i_uid or i_gid is not valid may not be written. In dealing with access times this means treat those inodes as if the inode flag S_NOATIME was set. Reads of the inodes appear safe and useful, but any write or modification is disallowed. The only inode write that is allowed is a chown that sets the uid and gid on the inode to valid values. After such a chown the inode is normal and may be treated as such. Denying all writes to inodes with uids or gids unknown to the vfs also prevents several oddball cases where corruption would have occurred because the vfs does not have complete information. One problem case that is prevented is attempting to use the gid of a directory for new inodes where the directories sgid bit is set but the directories gid is not mapped. Another problem case avoided is attempting to update the evm hash after setxattr, removexattr, and setattr. As the evm hash includeds the inode->i_uid or inode->i_gid not knowning the uid or gid prevents a correct evm hash from being computed. evm hash verification also fails when i_uid or i_gid is unknown but that is essentially harmless as it does not cause filesystem corruption. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/attr.c | 8 ++++++++ fs/inode.c | 7 +++++++ fs/namei.c | 26 +++++++++++++++++++++----- fs/xattr.c | 7 +++++++ include/linux/fs.h | 5 +++++ 5 files changed, 48 insertions(+), 5 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index dd723578ddce7c..42bb42bb3c72c2 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -266,6 +266,14 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) return -EOVERFLOW; + /* Don't allow modifications of files with invalid uids or + * gids unless those uids & gids are being made valid. + */ + if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid)) + return -EOVERFLOW; + if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid)) + return -EOVERFLOW; + error = security_inode_setattr(dentry, attr); if (error) return error; diff --git a/fs/inode.c b/fs/inode.c index 4ccbc21b30ce6f..c0ebb97fb0856a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1617,6 +1617,13 @@ bool atime_needs_update(const struct path *path, struct inode *inode) if (inode->i_flags & S_NOATIME) return false; + + /* Atime updates will likely cause i_uid and i_gid to be written + * back improprely if their true value is unknown to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return false; + if (IS_NOATIME(inode)) return false; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) diff --git a/fs/namei.c b/fs/namei.c index 8701bd9a527032..840201c4c29062 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -410,6 +410,14 @@ int __inode_permission(struct inode *inode, int mask) */ if (IS_IMMUTABLE(inode)) return -EACCES; + + /* + * Updating mtime will likely cause i_uid and i_gid to be + * written back improperly if their true value is unknown + * to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EACCES; } retval = do_inode_permission(inode, mask); @@ -2759,10 +2767,11 @@ EXPORT_SYMBOL(__check_sticky); * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. - * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. - * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. - * 9. We can't remove a root or mountpoint. - * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * 7. If the victim has an unknown uid or gid we can't change the inode. + * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 10. We can't remove a root or mountpoint. + * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) @@ -2784,7 +2793,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) return -EPERM; if (check_sticky(dir, inode) || IS_APPEND(inode) || - IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -4190,6 +4199,13 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; + /* + * Updating the link count will likely cause i_uid and i_gid to + * be writen back improperly if their true value is unknown to + * the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) diff --git a/fs/xattr.c b/fs/xattr.c index 4beafc43daa58b..c243905835abd2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -38,6 +38,13 @@ xattr_permission(struct inode *inode, const char *name, int mask) if (mask & MAY_WRITE) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; + /* + * Updating an xattr will likely cause i_uid and i_gid + * to be writen back improperly if their true value is + * unknown to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EPERM; } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 375e37f42cdf6e..cb25ceb6d1efcf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1874,6 +1874,11 @@ struct super_operations { #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) +static inline bool HAS_UNMAPPED_ID(struct inode *inode) +{ + return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid); +} + /* * Inode state bits. Protected by inode->i_lock * From 036d523641c66bef713042894a17f4335f199e49 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 1 Jul 2016 12:52:06 -0500 Subject: [PATCH 24/30] vfs: Don't create inodes with a uid or gid unknown to the vfs It is expected that filesystems can not represent uids and gids from outside of their user namespace. Keep things simple by not even trying to create filesystem nodes with non-sense uids and gids. Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namei.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 840201c4c29062..629823f19a6abd 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2814,16 +2814,22 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) - * 3. We should have write and exec permissions on dir - * 4. We can't do it if dir is immutable (done in permission()) + * 3. We can't do it if the fs can't represent the fsuid or fsgid. + * 4. We should have write and exec permissions on dir + * 5. We can't do it if dir is immutable (done in permission()) */ static inline int may_create(struct inode *dir, struct dentry *child) { + struct user_namespace *s_user_ns; audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; + s_user_ns = dir->i_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) + return -EOVERFLOW; return inode_permission(dir, MAY_WRITE | MAY_EXEC); } From d49d37624a1931c2f3b5d0cbe95bd5181cbdc279 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 30 Jun 2016 16:31:01 -0500 Subject: [PATCH 25/30] quota: Ensure qids map to the filesystem Introduce the helper qid_has_mapping and use it to ensure that the quota system only considers qids that map to the filesystems s_user_ns. In practice for quota supporting filesystems today this is the exact same check as qid_valid. As only 0xffffffff aka (qid_t)-1 does not map into init_user_ns. Replace the qid_valid calls with qid_has_mapping as values come in from userspace. This is harmless today and it prepares the quota system to work on filesystems with quotas but mounted by unprivileged users. Call qid_has_mapping from dqget. This ensures the passed in qid has a prepresentation on the underlying filesystem. Previously this was unnecessary as filesystesm never had qids that could not map. With the introduction of filesystems outside of s_user_ns this will not remain true. All of this ensures the quota code never has to deal with qids that don't map to the underlying filesystem. Cc: Jan Kara Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/quota/dquot.c | 3 +++ fs/quota/quota.c | 12 ++++++------ include/linux/quota.h | 10 ++++++++++ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ff21980d0119a5..74706b6aa74747 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -841,6 +841,9 @@ struct dquot *dqget(struct super_block *sb, struct kqid qid) unsigned int hashent = hashfn(sb, qid); struct dquot *dquot, *empty = NULL; + if (!qid_has_mapping(sb->s_user_ns, qid)) + return ERR_PTR(-EINVAL); + if (!sb_has_quota_active(sb, qid.type)) return ERR_PTR(-ESRCH); we_slept: diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 0f10ee9892ce3a..73f6f4cf0a2107 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -211,7 +211,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); if (ret) @@ -237,7 +237,7 @@ static int quota_getnextquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq); if (ret) @@ -288,7 +288,7 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; copy_from_if_dqblk(&fdq, &idq); return sb->s_qcop->set_dqblk(sb, qid, &fdq); @@ -581,7 +581,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; /* Are we actually setting timer / warning limits for all users? */ if (from_kqid(&init_user_ns, qid) == 0 && @@ -642,7 +642,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); if (ret) @@ -669,7 +669,7 @@ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq); if (ret) diff --git a/include/linux/quota.h b/include/linux/quota.h index 9dfb6bce8c9eb0..1db16ee39b318f 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -179,6 +179,16 @@ static inline struct kqid make_kqid_projid(kprojid_t projid) return kqid; } +/** + * qid_has_mapping - Report if a qid maps into a user namespace. + * @ns: The user namespace to see if a value maps into. + * @qid: The kernel internal quota identifier to test. + */ +static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid) +{ + return from_kqid(ns, qid) != (qid_t) -1; +} + extern spinlock_t dq_data_lock; From cfd4c70a18c4e806aaac2f483153dae01e0ace1c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 5 Jul 2016 11:10:57 -0500 Subject: [PATCH 26/30] quota: Handle quota data stored in s_user_ns in quota_setxquota In Q_XSETQLIMIT use sb->s_user_ns to detect when we are dealing with the filesystems notion of id 0. Cc: Jan Kara Acked-by: Seth Forshee Inspired-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/quota/quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 73f6f4cf0a2107..35df08ee9c97da 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -584,7 +584,7 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; /* Are we actually setting timer / warning limits for all users? */ - if (from_kqid(&init_user_ns, qid) == 0 && + if (from_kqid(sb->s_user_ns, qid) == 0 && fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) { struct qc_info qinfo; int ret; From 5c0048280babd579fa9e5f0e787122b06aee3f3b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 5 Jul 2016 10:41:57 -0500 Subject: [PATCH 27/30] dquot: For now explicitly don't support filesystems outside of init_user_ns Mostly supporting filesystems outside of init_user_ns is s/&init_usre_ns/dquot->dq_sb->s_user_ns/. An actual need for supporting quotas on filesystems outside of s_user_ns is quite a ways away and to be done responsibily needs an audit on what can happen with hostile quota files. Until that audit is complete don't attempt to support quota files on filesystems outside of s_user_ns. Cc: Jan Kara Acked-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/quota/dquot.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 74706b6aa74747..87197d13cc7651 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2271,6 +2271,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, error = -EINVAL; goto out_fmt; } + /* Filesystems outside of init_user_ns not yet supported */ + if (sb->s_user_ns != &init_user_ns) { + error = -EINVAL; + goto out_fmt; + } /* Usage always has to be set... */ if (!(flags & DQUOT_USAGE_ENABLED)) { error = -EINVAL; From 0b3c9761d1e405514a551ed24d3ea89aea26ce14 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Thu, 5 Feb 2015 10:44:50 -0600 Subject: [PATCH 28/30] evm: Translate user/group ids relative to s_user_ns when computing HMAC The EVM HMAC should be calculated using the on disk user and group ids, so the k[ug]ids in the inode must be translated relative to the s_user_ns of the inode's super block. Signed-off-by: Seth Forshee Signed-off-by: Eric W. Biederman --- security/integrity/evm/evm_crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 30b6b7d0429fc9..11c1d30bd705a9 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -151,8 +151,8 @@ static void hmac_add_misc(struct shash_desc *desc, struct inode *inode, memset(&hmac_misc, 0, sizeof(hmac_misc)); hmac_misc.ino = inode->i_ino; hmac_misc.generation = inode->i_generation; - hmac_misc.uid = from_kuid(&init_user_ns, inode->i_uid); - hmac_misc.gid = from_kgid(&init_user_ns, inode->i_gid); + hmac_misc.uid = from_kuid(inode->i_sb->s_user_ns, inode->i_uid); + hmac_misc.gid = from_kgid(inode->i_sb->s_user_ns, inode->i_gid); hmac_misc.mode = inode->i_mode; crypto_shash_update(desc, (const u8 *)&hmac_misc, sizeof(hmac_misc)); if (evm_hmac_attrs & EVM_ATTR_FSUUID) From 81754357770ebd900801231e7bc8d151ddc00498 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Thu, 11 Dec 2014 10:15:45 -0600 Subject: [PATCH 29/30] fs: Update i_[ug]id_(read|write) to translate relative to s_user_ns For filesystems mounted from a user namespace on-disk ids should be translated relative to s_users_ns rather than init_user_ns. When an id in the filesystem doesn't exist in s_user_ns the associated id in the inode will be set to INVALID_[UG]ID, which turns these into de facto "nobody" ids. This actually maps pretty well into the way most code already works, and those places where it didn't were fixed in previous patches. Moving forward vfs code needs to be careful to handle instances where ids in inodes may be invalid. Signed-off-by: Seth Forshee Signed-off-by: Eric W. Biederman --- include/linux/fs.h | 50 +++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index cb25ceb6d1efcf..8aa9b72e0bc5a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -831,31 +831,6 @@ static inline void i_size_write(struct inode *inode, loff_t i_size) #endif } -/* Helper functions so that in most cases filesystems will - * not need to deal directly with kuid_t and kgid_t and can - * instead deal with the raw numeric values that are stored - * in the filesystem. - */ -static inline uid_t i_uid_read(const struct inode *inode) -{ - return from_kuid(&init_user_ns, inode->i_uid); -} - -static inline gid_t i_gid_read(const struct inode *inode) -{ - return from_kgid(&init_user_ns, inode->i_gid); -} - -static inline void i_uid_write(struct inode *inode, uid_t uid) -{ - inode->i_uid = make_kuid(&init_user_ns, uid); -} - -static inline void i_gid_write(struct inode *inode, gid_t gid) -{ - inode->i_gid = make_kgid(&init_user_ns, gid); -} - static inline unsigned iminor(const struct inode *inode) { return MINOR(inode->i_rdev); @@ -1461,6 +1436,31 @@ struct super_block { struct list_head s_inodes; /* all inodes */ }; +/* Helper functions so that in most cases filesystems will + * not need to deal directly with kuid_t and kgid_t and can + * instead deal with the raw numeric values that are stored + * in the filesystem. + */ +static inline uid_t i_uid_read(const struct inode *inode) +{ + return from_kuid(inode->i_sb->s_user_ns, inode->i_uid); +} + +static inline gid_t i_gid_read(const struct inode *inode) +{ + return from_kgid(inode->i_sb->s_user_ns, inode->i_gid); +} + +static inline void i_uid_write(struct inode *inode, uid_t uid) +{ + inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid); +} + +static inline void i_gid_write(struct inode *inode, gid_t gid) +{ + inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); +} + extern struct timespec current_fs_time(struct super_block *sb); /* From aeaa4a79ff6a5ed912b7362f206cf8576fca538b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 23 Jul 2016 11:20:44 -0500 Subject: [PATCH 30/30] fs: Call d_automount with the filesystems creds Seth Forshee reported a mount regression in nfs autmounts with "fs: Add user namespace member to struct super_block". It turns out that the assumption that current->cred is something reasonable during mount while necessary to improve support of unprivileged mounts is wrong in the automount path. To fix the existing filesystems override current->cred with the init_cred before calling d_automount and restore current->cred after d_automount completes. To support unprivileged mounts would require a more nuanced cred selection, so fail on unprivileged mounts for the time being. As none of the filesystems that currently set FS_USERNS_MOUNT implement d_automount this check is only good for preventing future problems. Fixes: 6e4eab577a0c ("fs: Add user namespace member to struct super_block") Tested-by: Seth Forshee Signed-off-by: "Eric W. Biederman" --- fs/namei.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 629823f19a6abd..ef573df3297fbf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1099,6 +1100,7 @@ static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; + const struct cred *old_cred; int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) @@ -1120,11 +1122,16 @@ static int follow_automount(struct path *path, struct nameidata *nd, path->dentry->d_inode) return -EISDIR; + if (path->dentry->d_sb->s_user_ns != &init_user_ns) + return -EACCES; + nd->total_link_count++; if (nd->total_link_count >= 40) return -ELOOP; + old_cred = override_creds(&init_cred); mnt = path->dentry->d_op->d_automount(path); + revert_creds(old_cred); if (IS_ERR(mnt)) { /* * The filesystem is allowed to return -EISDIR here to indicate