Skip to content

Commit

Permalink
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel…
Browse files Browse the repository at this point in the history
…/git/ebiederm/user-namespace

Pull user namespace updates from Eric Biederman:
 "This finishes up the changes to ensure proc and sysfs do not start
  implementing executable files, as the there are application today that
  are only secure because such files do not exist.

  It akso fixes a long standing misfeature of /proc/<pid>/mountinfo that
  did not show the proper source for files bind mounted from
  /proc/<pid>/ns/*.

  It also straightens out the handling of clone flags related to user
  namespaces, fixing an unnecessary failure of unshare(CLONE_NEWUSER)
  when files such as /proc/<pid>/environ are read while <pid> is calling
  unshare.  This winds up fixing a minor bug in unshare flag handling
  that dates back to the first version of unshare in the kernel.

  Finally, this fixes a minor regression caused by the introduction of
  sysfs_create_mount_point, which broke someone's in house application,
  by restoring the size of /sys/fs/cgroup to 0 bytes.  Apparently that
  application uses the directory size to determine if a tmpfs is mounted
  on /sys/fs/cgroup.

  The bind mount escape fixes are present in Al Viros for-next branch.
  and I expect them to come from there.  The bind mount escape is the
  last of the user namespace related security bugs that I am aware of"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
  fs: Set the size of empty dirs to 0.
  userns,pidns: Force thread group sharing, not signal handler sharing.
  unshare: Unsharing a thread does not require unsharing a vm
  nsfs: Add a show_path method to fix mountinfo
  mnt: fs_fully_visible enforce noexec and nosuid  if !SB_I_NOEXEC
  vfs: Commit to never having exectuables on proc and sysfs.
  • Loading branch information
torvalds committed Sep 1, 2015
2 parents e713c80 + 4b75de8 commit 73b6fa8
Show file tree
Hide file tree
Showing 14 changed files with 83 additions and 34 deletions.
10 changes: 8 additions & 2 deletions fs/exec.c
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,12 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
module_put(fmt->module);
}

bool path_noexec(const struct path *path)
{
return (path->mnt->mnt_flags & MNT_NOEXEC) ||
(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}

#ifdef CONFIG_USELIB
/*
* Note that a shared library must be both readable and executable due to
Expand Down Expand Up @@ -132,7 +138,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
goto exit;

error = -EACCES;
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
if (path_noexec(&file->f_path))
goto exit;

fsnotify_open(file);
Expand Down Expand Up @@ -777,7 +783,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
if (!S_ISREG(file_inode(file)->i_mode))
goto exit;

if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
if (path_noexec(&file->f_path))
goto exit;

err = deny_write_access(file);
Expand Down
2 changes: 1 addition & 1 deletion fs/libfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1185,7 +1185,7 @@ void make_empty_dir_inode(struct inode *inode)
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_rdev = 0;
inode->i_size = 2;
inode->i_size = 0;
inode->i_blkbits = PAGE_SHIFT;
inode->i_blocks = 0;

Expand Down
33 changes: 25 additions & 8 deletions fs/namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -3218,6 +3218,8 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
down_read(&namespace_sem);
list_for_each_entry(mnt, &ns->list, mnt_list) {
struct mount *child;
int mnt_flags;

if (mnt->mnt.mnt_sb->s_type != type)
continue;

Expand All @@ -3227,17 +3229,30 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
continue;

/* Read the mount flags and filter out flags that
* may safely be ignored.
*/
mnt_flags = mnt->mnt.mnt_flags;
if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);

/* Verify the mount flags are equal to or more permissive
* than the proposed new mount.
*/
if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
if ((mnt_flags & MNT_LOCK_READONLY) &&
!(new_flags & MNT_READONLY))
continue;
if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
if ((mnt_flags & MNT_LOCK_NODEV) &&
!(new_flags & MNT_NODEV))
continue;
if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
if ((mnt_flags & MNT_LOCK_NOSUID) &&
!(new_flags & MNT_NOSUID))
continue;
if ((mnt_flags & MNT_LOCK_NOEXEC) &&
!(new_flags & MNT_NOEXEC))
continue;
if ((mnt_flags & MNT_LOCK_ATIME) &&
((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
continue;

/* This mount is not fully visible if there are any
Expand All @@ -3247,16 +3262,18 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
struct inode *inode = child->mnt_mountpoint->d_inode;
/* Only worry about locked mounts */
if (!(mnt->mnt.mnt_flags & MNT_LOCKED))
if (!(mnt_flags & MNT_LOCKED))
continue;
/* Is the directory permanetly empty? */
if (!is_empty_dir_inode(inode))
goto next;
}
/* Preserve the locked attributes */
*new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \
MNT_LOCK_NODEV | \
MNT_LOCK_ATIME);
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
MNT_LOCK_NODEV | \
MNT_LOCK_NOSUID | \
MNT_LOCK_NOEXEC | \
MNT_LOCK_ATIME);
visible = true;
goto found;
next: ;
Expand Down
10 changes: 10 additions & 0 deletions fs/nsfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>

static struct vfsmount *nsfs_mnt;

Expand Down Expand Up @@ -136,9 +137,18 @@ struct file *proc_ns_fget(int fd)
return ERR_PTR(-EINVAL);
}

static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = dentry->d_fsdata;

return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
}

static const struct super_operations nsfs_ops = {
.statfs = simple_statfs,
.evict_inode = nsfs_evict,
.show_path = nsfs_show_path,
};
static struct dentry *nsfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
Expand Down
2 changes: 1 addition & 1 deletion fs/open.c
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
* with the "noexec" flag.
*/
res = -EACCES;
if (path.mnt->mnt_flags & MNT_NOEXEC)
if (path_noexec(&path))
goto out_path_release;
}

Expand Down
2 changes: 2 additions & 0 deletions fs/proc/root.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
}

sb->s_flags |= MS_ACTIVE;
/* User space would break if executables appear on proc */
sb->s_iflags |= SB_I_NOEXEC;
}

return dget(sb->s_root);
Expand Down
4 changes: 4 additions & 0 deletions fs/sysfs/mount.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
SYSFS_MAGIC, &new_sb, ns);
if (IS_ERR(root) || !new_sb)
kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
else if (new_sb)
/* Userspace would break if executables appear on sysfs */
root->d_sb->s_iflags |= SB_I_NOEXEC;

return root;
}

Expand Down
3 changes: 3 additions & 0 deletions include/linux/fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -1260,6 +1260,7 @@ struct mm_struct;

/* sb->s_iflags */
#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */

/* Possible states of 'frozen' field */
enum {
Expand Down Expand Up @@ -3041,4 +3042,6 @@ static inline bool dir_relax(struct inode *inode)
return !IS_DEADDIR(inode);
}

extern bool path_noexec(const struct path *path);

#endif /* _LINUX_FS_H */
36 changes: 22 additions & 14 deletions kernel/fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -1280,10 +1280,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,

/*
* If the new process will be in a different pid or user namespace
* do not allow it to share a thread group or signal handlers or
* parent with the forking task.
* do not allow it to share a thread group with the forking task.
*/
if (clone_flags & CLONE_SIGHAND) {
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) !=
current->nsproxy->pid_ns_for_children))
Expand Down Expand Up @@ -1872,13 +1871,21 @@ static int check_unshare_flags(unsigned long unshare_flags)
CLONE_NEWUSER|CLONE_NEWPID))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing to
* unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
* needs to unshare vm.
* Not implemented, but pretend it works if there is nothing
* to unshare. Note that unsharing the address space or the
* signal handlers also need to unshare the signal queues (aka
* CLONE_THREAD).
*/
if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
/* FIXME: get_task_mm() increments ->mm_users */
if (atomic_read(&current->mm->mm_users) > 1)
if (!thread_group_empty(current))
return -EINVAL;
}
if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
if (atomic_read(&current->sighand->count) > 1)
return -EINVAL;
}
if (unshare_flags & CLONE_VM) {
if (!current_is_single_threaded())
return -EINVAL;
}

Expand Down Expand Up @@ -1942,20 +1949,21 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
int err;

/*
* If unsharing a user namespace must also unshare the thread.
* If unsharing a user namespace must also unshare the thread group
* and unshare the filesystem root and working directories.
*/
if (unshare_flags & CLONE_NEWUSER)
unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
* If unsharing a thread from a thread group, must also unshare vm.
*/
if (unshare_flags & CLONE_THREAD)
unshare_flags |= CLONE_VM;
/*
* If unsharing vm, must also unshare signal handlers.
*/
if (unshare_flags & CLONE_VM)
unshare_flags |= CLONE_SIGHAND;
/*
* If unsharing a signal handlers, must also unshare the signal queues.
*/
if (unshare_flags & CLONE_SIGHAND)
unshare_flags |= CLONE_THREAD;
/*
* If unsharing namespace, must also unshare filesystem information.
*/
Expand Down
3 changes: 1 addition & 2 deletions kernel/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
* overall picture.
*/
err = -EACCES;
if (!S_ISREG(inode->i_mode) ||
exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
goto exit;

err = inode_permission(inode, MAY_EXEC);
Expand Down
4 changes: 2 additions & 2 deletions kernel/user_namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -976,8 +976,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
if (user_ns == current_user_ns())
return -EINVAL;

/* Threaded processes may not enter a different user namespace */
if (atomic_read(&current->mm->mm_users) > 1)
/* Tasks that share a thread group must share a user namespace */
if (!thread_group_empty(current))
return -EINVAL;

if (current->fs->users != 1)
Expand Down
4 changes: 2 additions & 2 deletions mm/mmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -1268,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
* mounted, in which case we dont add PROT_EXEC.)
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
if (!(file && path_noexec(&file->f_path)))
prot |= PROT_EXEC;

if (!(flags & MAP_FIXED))
Expand Down Expand Up @@ -1337,7 +1337,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (path_noexec(&file->f_path)) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
Expand Down
2 changes: 1 addition & 1 deletion mm/nommu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1035,7 +1035,7 @@ static int validate_mmap_request(struct file *file,

/* handle executable mappings and implied executable
* mappings */
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (path_noexec(&file->f_path)) {
if (prot & PROT_EXEC)
return -EPERM;
} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
Expand Down
2 changes: 1 addition & 1 deletion security/security.c
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,7 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
* ditto if it's not on noexec mount, except that on !MMU we need
* NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case
*/
if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) {
if (!path_noexec(&file->f_path)) {
#ifndef CONFIG_MMU
if (file->f_op->mmap_capabilities) {
unsigned caps = file->f_op->mmap_capabilities(file);
Expand Down

0 comments on commit 73b6fa8

Please sign in to comment.