diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 965d381abd7..25db92a8e1c 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -1094,7 +1094,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private) LOAD_INT(c), LOAD_FRAC(c), count_active_contexts(), atomic_read(&nr_spu_contexts), - current->nsproxy->pid_ns->last_pid); + task_active_pid_ns(current)->last_pid); return 0; } diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 49e3b49e552..4bd82ac0210 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -123,7 +123,7 @@ void mconsole_log(struct mc_request *req) void mconsole_proc(struct mc_request *req) { - struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; + struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt; char *buf; int len; struct file *file; diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 4a36e9ab8cf..2d12e8a1f82 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "binder.h" #include "binder_trace.h" @@ -2320,7 +2321,7 @@ retry: if (t->from) { struct task_struct *sender = t->from->proc->tsk; tr.sender_pid = task_tgid_nr_ns(sender, - current->nsproxy->pid_ns); + task_active_pid_ns(current)); } else { tr.sender_pid = 0; } diff --git a/fs/attr.c b/fs/attr.c index cce7df53b69..1449adb14ef 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -49,14 +49,15 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && (!uid_eq(current_fsuid(), inode->i_uid) || - !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN)) + !uid_eq(attr->ia_uid, inode->i_uid)) && + !inode_capable(inode, CAP_CHOWN)) return -EPERM; /* Make sure caller can chgrp. 
*/ if ((ia_valid & ATTR_GID) && (!uid_eq(current_fsuid(), inode->i_uid) || (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) && - !capable(CAP_CHOWN)) + !inode_capable(inode, CAP_CHOWN)) return -EPERM; /* Make sure a caller can chmod. */ @@ -65,7 +66,8 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr) return -EPERM; /* Also check the setgid bit! */ if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : - inode->i_gid) && !capable(CAP_FSETID)) + inode->i_gid) && + !inode_capable(inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } @@ -157,7 +159,8 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!in_group_p(inode->i_gid) && + !inode_capable(inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 908e1845541..b785e770795 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -74,8 +74,8 @@ struct autofs_info { unsigned long last_used; atomic_t count; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ @@ -89,8 +89,8 @@ struct autofs_wait_queue { struct qstr name; u32 dev; u64 ino; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; pid_t pid; pid_t tgid; /* This is for status reporting upon return */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index a16214109d3..9f68a37bb2b 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -437,8 +437,8 @@ static int autofs_dev_ioctl_requester(struct file *fp, err = 0; autofs4_expire_wait(path.dentry); spin_lock(&sbi->fs_lock); - param->requester.uid = ino->uid; - param->requester.gid = ino->gid; + param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); + param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); spin_unlock(&sbi->fs_lock); 
} path_put(&path); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 8a4fed8ead3..b104726e2d0 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) void autofs4_clean_ino(struct autofs_info *ino) { - ino->uid = 0; - ino->gid = 0; + ino->uid = GLOBAL_ROOT_UID; + ino->gid = GLOBAL_ROOT_GID; ino->last_used = jiffies; } @@ -79,10 +79,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) return 0; seq_printf(m, ",fd=%d", sbi->pipefd); - if (root_inode->i_uid != 0) - seq_printf(m, ",uid=%u", root_inode->i_uid); - if (root_inode->i_gid != 0) - seq_printf(m, ",gid=%u", root_inode->i_gid); + if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, root_inode->i_uid)); + if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, root_inode->i_gid)); seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); @@ -126,7 +128,7 @@ static const match_table_t tokens = { {Opt_err, NULL} }; -static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, +static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) { char *p; @@ -159,12 +161,16 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, case Opt_uid: if (match_int(args, &option)) return 1; - *uid = option; + *uid = make_kuid(current_user_ns(), option); + if (!uid_valid(*uid)) + return 1; break; case Opt_gid: if (match_int(args, &option)) return 1; - *gid = option; + *gid = make_kgid(current_user_ns(), option); + if (!gid_valid(*gid)) + return 1; break; case Opt_pgrp: if (match_int(args, &option)) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index dce436e595c..03bc1d347d8 100644 --- 
a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -154,6 +154,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, case autofs_ptype_expire_direct: { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; + struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; pktsz = sizeof(*packet); @@ -163,8 +164,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, packet->name[wq->name.len] = '\0'; packet->dev = wq->dev; packet->ino = wq->ino; - packet->uid = wq->uid; - packet->gid = wq->gid; + packet->uid = from_kuid_munged(user_ns, wq->uid); + packet->gid = from_kgid_munged(user_ns, wq->gid); packet->pid = wq->pid; packet->tgid = wq->tgid; break; diff --git a/fs/exec.c b/fs/exec.c index 721a2992951..b71b08ce712 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1266,14 +1266,13 @@ int prepare_binprm(struct linux_binprm *bprm) bprm->cred->egid = current_egid(); if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && - !current->no_new_privs) { + !current->no_new_privs && + kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && + kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { /* Set-uid? */ if (mode & S_ISUID) { - if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid)) - return -EPERM; bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = inode->i_uid; - } /* Set-gid? */ @@ -1283,8 +1282,6 @@ int prepare_binprm(struct linux_binprm *bprm) * executable. 
*/ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { - if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) - return -EPERM; bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = inode->i_gid; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8c23fa7a91e..c16335315e5 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -92,8 +92,8 @@ static void __fuse_put_request(struct fuse_req *req) static void fuse_req_init_context(struct fuse_req *req) { - req->in.h.uid = current_fsuid(); - req->in.h.gid = current_fsgid(); + req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); req->in.h.pid = current->pid; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 324bc085053..b7c09f9eb40 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -818,8 +818,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; - stat->uid = attr->uid; - stat->gid = attr->gid; + stat->uid = make_kuid(&init_user_ns, attr->uid); + stat->gid = make_kgid(&init_user_ns, attr->gid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; @@ -1007,12 +1007,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) rcu_read_lock(); ret = 0; cred = __task_cred(task); - if (cred->euid == fc->user_id && - cred->suid == fc->user_id && - cred->uid == fc->user_id && - cred->egid == fc->group_id && - cred->sgid == fc->group_id && - cred->gid == fc->group_id) + if (uid_eq(cred->euid, fc->user_id) && + uid_eq(cred->suid, fc->user_id) && + uid_eq(cred->uid, fc->user_id) && + gid_eq(cred->egid, fc->group_id) && + gid_eq(cred->sgid, fc->group_id) && + gid_eq(cred->gid, fc->group_id)) ret = 1; rcu_read_unlock(); @@ -1306,9 +1306,9 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) if (ivalid & ATTR_MODE) arg->valid 
|= FATTR_MODE, arg->mode = iattr->ia_mode; if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid; + arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid); if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid; + arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid); if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e24dd74e306..e105a53fc72 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -333,10 +333,10 @@ struct fuse_conn { atomic_t count; /** The user id for this mount */ - uid_t user_id; + kuid_t user_id; /** The group id for this mount */ - gid_t group_id; + kgid_t group_id; /** The fuse mount flags for this mount */ unsigned flags; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f0eda124cff..73ca6b72bea 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -60,8 +60,8 @@ MODULE_PARM_DESC(max_user_congthresh, struct fuse_mount_data { int fd; unsigned rootmode; - unsigned user_id; - unsigned group_id; + kuid_t user_id; + kgid_t group_id; unsigned fd_present:1; unsigned rootmode_present:1; unsigned user_id_present:1; @@ -164,8 +164,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); set_nlink(inode, attr->nlink); - inode->i_uid = attr->uid; - inode->i_gid = attr->gid; + inode->i_uid = make_kuid(&init_user_ns, attr->uid); + inode->i_gid = make_kgid(&init_user_ns, attr->gid); inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; @@ -492,14 +492,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) case OPT_USER_ID: if (match_int(&args[0], &value)) return 0; - d->user_id = value; + d->user_id = make_kuid(current_user_ns(), value); + if 
(!uid_valid(d->user_id)) + return 0; d->user_id_present = 1; break; case OPT_GROUP_ID: if (match_int(&args[0], &value)) return 0; - d->group_id = value; + d->group_id = make_kgid(current_user_ns(), value); + if (!gid_valid(d->group_id)) + return 0; d->group_id_present = 1; break; @@ -540,8 +544,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - seq_printf(m, ",user_id=%u", fc->user_id); - seq_printf(m, ",group_id=%u", fc->group_id); + seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); if (fc->flags & FUSE_DEFAULT_PERMISSIONS) seq_puts(m, ",default_permissions"); if (fc->flags & FUSE_ALLOW_OTHER) @@ -989,7 +993,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) goto err; - if (file->f_op != &fuse_dev_operations) + if ((file->f_op != &fuse_dev_operations) || + (file->f_cred->user_ns != &init_user_ns)) goto err_fput; fc = kmalloc(sizeof(*fc), GFP_KERNEL); diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 78f21f8dc2e..43b315f2002 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) struct vfsmount *proc_mnt; int err = -ENOENT; - proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); + proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); if (IS_ERR(proc_mnt)) goto out; diff --git a/fs/mount.h b/fs/mount.h index 4f291f9de64..cd500798040 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -4,8 +4,11 @@ struct mnt_namespace { atomic_t count; + unsigned int proc_inum; struct mount * root; struct list_head list; + struct user_namespace *user_ns; + u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; int event; }; diff --git a/fs/namespace.c b/fs/namespace.c index 24960626bb6..c1bbe86f492 100644 --- 
a/fs/namespace.c +++ b/fs/namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,7 @@ #include /* get_fs_root et.al. */ #include /* fsnotify_vfsmount_delete */ #include +#include #include "pnode.h" #include "internal.h" @@ -784,7 +786,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (!mnt) return ERR_PTR(-ENOMEM); - if (flag & (CL_SLAVE | CL_PRIVATE)) + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; @@ -805,7 +807,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, list_add_tail(&mnt->mnt_instance, &sb->s_mounts); br_write_unlock(&vfsmount_lock); - if (flag & CL_SLAVE) { + if ((flag & CL_SLAVE) || + ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; CLEAR_MNT_SHARED(mnt); @@ -1266,7 +1269,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; retval = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) goto dput_and_out; retval = do_umount(mnt, flags); @@ -1292,7 +1295,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static int mount_is_safe(struct path *path) { - if (capable(CAP_SYS_ADMIN)) + if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) return 0; return -EPERM; #ifdef notyet @@ -1308,6 +1311,26 @@ static int mount_is_safe(struct path *path) #endif } +static bool mnt_ns_loop(struct path *path) +{ + /* Could bind mounting the mount namespace inode cause a + * mount namespace loop? 
+ */ + struct inode *inode = path->dentry->d_inode; + struct proc_inode *ei; + struct mnt_namespace *mnt_ns; + + if (!proc_ns_inode(inode)) + return false; + + ei = PROC_I(inode); + if (ei->ns_ops != &mntns_operations) + return false; + + mnt_ns = ei->ns; + return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; +} + struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { @@ -1610,7 +1633,7 @@ static int do_change_type(struct path *path, int flag) int type; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (path->dentry != path->mnt->mnt_root) @@ -1655,6 +1678,10 @@ static int do_loopback(struct path *path, const char *old_name, if (err) return err; + err = -EINVAL; + if (mnt_ns_loop(&old_path)) + goto out; + err = lock_mount(path); if (err) goto out; @@ -1770,7 +1797,7 @@ static int do_move_mount(struct path *path, const char *old_name) struct mount *p; struct mount *old; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (!old_name || !*old_name) return -EINVAL; @@ -1857,21 +1884,6 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) return ERR_PTR(err); } -static struct vfsmount * -do_kern_mount(const char *fstype, int flags, const char *name, void *data) -{ - struct file_system_type *type = get_fs_type(fstype); - struct vfsmount *mnt; - if (!type) - return ERR_PTR(-ENODEV); - mnt = vfs_kern_mount(type, flags, name, data); - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && - !mnt->mnt_sb->s_subtype) - mnt = fs_set_subtype(mnt, fstype); - put_filesystem(type); - return mnt; -} - /* * add a mount into a namespace's mount tree */ @@ -1917,20 +1929,46 @@ unlock: * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct path *path, const char *type, int flags, +static int do_new_mount(struct 
path *path, const char *fstype, int flags, int mnt_flags, const char *name, void *data) { + struct file_system_type *type; + struct user_namespace *user_ns; struct vfsmount *mnt; int err; - if (!type) + if (!fstype) return -EINVAL; /* we need capabilities... */ - if (!capable(CAP_SYS_ADMIN)) + user_ns = real_mount(path->mnt)->mnt_ns->user_ns; + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - mnt = do_kern_mount(type, flags, name, data); + type = get_fs_type(fstype); + if (!type) + return -ENODEV; + + if (user_ns != &init_user_ns) { + if (!(type->fs_flags & FS_USERNS_MOUNT)) { + put_filesystem(type); + return -EPERM; + } + /* Only in special cases allow devices from mounts + * created outside the initial user namespace. + */ + if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { + flags |= MS_NODEV; + mnt_flags |= MNT_NODEV; + } + } + + mnt = vfs_kern_mount(type, flags, name, data); + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && + !mnt->mnt_sb->s_subtype) + mnt = fs_set_subtype(mnt, fstype); + + put_filesystem(type); if (IS_ERR(mnt)) return PTR_ERR(mnt); @@ -2261,18 +2299,42 @@ dput_out: return retval; } -static struct mnt_namespace *alloc_mnt_ns(void) +static void free_mnt_ns(struct mnt_namespace *ns) +{ + proc_free_inum(ns->proc_inum); + put_user_ns(ns->user_ns); + kfree(ns); +} + +/* + * Assign a sequence number so we can detect when we attempt to bind + * mount a reference to an older mount namespace into the current + * mount namespace, preventing reference counting loops. A 64bit + * number incrementing at 10Ghz will take 12,427 years to wrap which + * is effectively never, so we can ignore the possibility. 
+ */ +static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); + +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; + int ret; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); + ret = proc_alloc_inum(&new_ns->proc_inum); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); new_ns->event = 0; + new_ns->user_ns = get_user_ns(user_ns); return new_ns; } @@ -2281,24 +2343,28 @@ static struct mnt_namespace *alloc_mnt_ns(void) * copied from the namespace of the passed in task structure. */ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, - struct fs_struct *fs) + struct user_namespace *user_ns, struct fs_struct *fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; struct mount *p, *q; struct mount *old = mnt_ns->root; struct mount *new; + int copy_flags; - new_ns = alloc_mnt_ns(); + new_ns = alloc_mnt_ns(user_ns); if (IS_ERR(new_ns)) return new_ns; down_write(&namespace_sem); /* First pass: copy the tree topology */ - new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); + copy_flags = CL_COPY_ALL | CL_EXPIRE; + if (user_ns != mnt_ns->user_ns) + copy_flags |= CL_SHARED_TO_SLAVE; + new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { up_write(&namespace_sem); - kfree(new_ns); + free_mnt_ns(new_ns); return ERR_CAST(new); } new_ns->root = new; @@ -2339,7 +2405,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, } struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, - struct fs_struct *new_fs) + struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; @@ -2349,7 +2415,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct 
mnt_namespace *ns, if (!(flags & CLONE_NEWNS)) return ns; - new_ns = dup_mnt_ns(ns, new_fs); + new_ns = dup_mnt_ns(ns, user_ns, new_fs); put_mnt_ns(ns); return new_ns; @@ -2361,7 +2427,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, */ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) { - struct mnt_namespace *new_ns = alloc_mnt_ns(); + struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); if (!IS_ERR(new_ns)) { struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; @@ -2501,7 +2567,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, struct mount *new_mnt, *root_mnt; int error; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; error = user_path_dir(new_root, &new); @@ -2583,8 +2649,13 @@ static void __init init_mount_tree(void) struct vfsmount *mnt; struct mnt_namespace *ns; struct path root; + struct file_system_type *type; - mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); + type = get_fs_type("rootfs"); + if (!type) + panic("Can't find rootfs type"); + mnt = vfs_kern_mount(type, 0, "rootfs", NULL); + put_filesystem(type); if (IS_ERR(mnt)) panic("Can't create rootfs"); @@ -2647,7 +2718,7 @@ void put_mnt_ns(struct mnt_namespace *ns) br_write_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); - kfree(ns); + free_mnt_ns(ns); } struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) @@ -2681,3 +2752,71 @@ bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); } + +static void *mntns_get(struct task_struct *task) +{ + struct mnt_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->mnt_ns; + get_mnt_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void mntns_put(void *ns) +{ + put_mnt_ns(ns); +} + +static int mntns_install(struct nsproxy *nsproxy, void *ns) +{ + struct 
fs_struct *fs = current->fs; + struct mnt_namespace *mnt_ns = ns; + struct path root; + + if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || + !nsown_capable(CAP_SYS_CHROOT)) + return -EPERM; + + if (fs->users != 1) + return -EINVAL; + + get_mnt_ns(mnt_ns); + put_mnt_ns(nsproxy->mnt_ns); + nsproxy->mnt_ns = mnt_ns; + + /* Find the root */ + root.mnt = &mnt_ns->root->mnt; + root.dentry = mnt_ns->root->mnt.mnt_root; + path_get(&root); + while(d_mountpoint(root.dentry) && follow_down_one(&root)) + ; + + /* Update the pwd and root */ + set_fs_pwd(fs, &root); + set_fs_root(fs, &root); + + path_put(&root); + return 0; +} + +static unsigned int mntns_inum(void *ns) +{ + struct mnt_namespace *mnt_ns = ns; + return mnt_ns->proc_inum; +} + +const struct proc_ns_operations mntns_operations = { + .name = "mnt", + .type = CLONE_NEWNS, + .get = mntns_get, + .put = mntns_put, + .install = mntns_install, + .inum = mntns_inum, +}; diff --git a/fs/open.c b/fs/open.c index 59071f55bf7..182d8667b7b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -435,7 +435,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) goto dput_and_out; error = -EPERM; - if (!capable(CAP_SYS_CHROOT)) + if (!nsown_capable(CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) diff --git a/fs/pnode.h b/fs/pnode.h index 65c60979d54..19b853a3445 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -22,6 +22,7 @@ #define CL_COPY_ALL 0x04 #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 +#define CL_SHARED_TO_SLAVE 0x20 static inline void set_mnt_shared(struct mount *mnt) { diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 99349efbbc2..981b0560193 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += uptime.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o +proc-y += self.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 
d3696708fc1..d66248a1919 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk) static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { - struct user_namespace *user_ns = current_user_ns(); + struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; struct fdtable *fdt = NULL; diff --git a/fs/proc/base.c b/fs/proc/base.c index aa63d25157b..5a5a0be40e4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2345,146 +2345,6 @@ static const struct file_operations proc_coredump_filter_operations = { }; #endif -/* - * /proc/self: - */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return -ENOENT; - sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); -} - -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (tgid) { - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d", tgid); - } - nd_set_link(nd, name); - return NULL; -} - -static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); -} - -static const struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, - .put_link = proc_self_put_link, -}; - -/* - * proc base - * - * These are the directory entries in the root directory of /proc - * that properly belong to the /proc filesystem, as they describe - * describe 
something that is process related. - */ -static const struct pid_entry proc_base_stuff[] = { - NOD("self", S_IFLNK|S_IRWXUGO, - &proc_self_inode_operations, NULL, {}), -}; - -static struct dentry *proc_base_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - const struct pid_entry *p = ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error; - - /* Allocate the inode */ - error = ERR_PTR(-ENOMEM); - inode = new_inode(dir->i_sb); - if (!inode) - goto out; - - /* Initialize the inode */ - ei = PROC_I(inode); - inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - - /* - * grab the reference to the task. - */ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) - goto out_iput; - - inode->i_mode = p->mode; - if (S_ISDIR(inode->i_mode)) - set_nlink(inode, 2); - if (S_ISLNK(inode->i_mode)) - inode->i_size = 64; - if (p->iop) - inode->i_op = p->iop; - if (p->fop) - inode->i_fop = p->fop; - ei->op = p->op; - d_add(dentry, inode); - error = NULL; -out: - return error; -out_iput: - iput(inode); - goto out; -} - -static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) -{ - struct dentry *error; - struct task_struct *task = get_proc_task(dir); - const struct pid_entry *p, *last; - - error = ERR_PTR(-ENOENT); - - if (!task) - goto out_no_task; - - /* Lookup the directory entry */ - last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; - for (p = proc_base_stuff; p <= last; p++) { - if (p->len != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, p->name, p->len)) - break; - } - if (p > last) - goto out; - - error = proc_base_instantiate(dir, dentry, task, p); - -out: - put_task_struct(task); -out_no_task: - return error; -} - -static int proc_base_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) -{ - return proc_fill_cache(filp, dirent, 
filldir, p->name, p->len, - proc_base_instantiate, task, p); -} - #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { @@ -2839,10 +2699,6 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, @@ -2876,15 +2732,11 @@ out: struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result; + struct dentry *result = NULL; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; - result = proc_base_lookup(dir, dentry); - if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) - goto out; - tgid = name_to_int(dentry); if (tgid == ~0U) goto out; @@ -2947,7 +2799,7 @@ retry: return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY) static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct tgid_iter iter) @@ -2967,25 +2819,12 @@ static int fake_filldir(void *buf, const char *name, int namelen, /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int nr; - struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) - goto out_no_task; - nr = filp->f_pos - FIRST_PROCESS_ENTRY; - - reaper = get_proc_task(filp->f_path.dentry->d_inode); - if (!reaper) - goto out_no_task; - - for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { - const struct pid_entry *p = &proc_base_stuff[nr]; - if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) - goto out; - } + goto out; ns = filp->f_dentry->d_sb->s_fs_info; iter.task = NULL; @@ 
-3006,8 +2845,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) } filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; out: - put_task_struct(reaper); -out_no_task: return 0; } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 0d80cef4cfb..7b3ae3cc0ef 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. */ -static unsigned int get_inode_number(void) +int proc_alloc_inum(unsigned int *inum) { unsigned int i; int error; retry: - if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) - return 0; + if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) + return -ENOMEM; spin_lock(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); @@ -365,18 +365,19 @@ retry: if (error == -EAGAIN) goto retry; else if (error) - return 0; + return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, i); spin_unlock(&proc_inum_lock); - return 0; + return -ENOSPC; } - return PROC_DYNAMIC_FIRST + i; + *inum = PROC_DYNAMIC_FIRST + i; + return 0; } -static void release_inode_number(unsigned int inum) +void proc_free_inum(unsigned int inum) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); @@ -554,13 +555,12 @@ static const struct inode_operations proc_dir_inode_operations = { static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { - unsigned int i; struct proc_dir_entry *tmp; + int ret; - i = get_inode_number(); - if (i == 0) - return -EAGAIN; - dp->low_ino = i; + ret = proc_alloc_inum(&dp->low_ino); + if (ret) + return ret; if (S_ISDIR(dp->mode)) { if (dp->proc_iops == NULL) { @@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data); static void free_proc_entry(struct proc_dir_entry *de) { - release_inode_number(de->low_ino); + proc_free_inum(de->low_ino); if (S_ISLNK(de->mode)) 
kfree(de->data); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3b22bbdee9e..439ae688650 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode) struct proc_dir_entry *de; struct ctl_table_header *head; const struct proc_ns_operations *ns_ops; + void *ns; truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); @@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode) } /* Release any associated namespace */ ns_ops = PROC_I(inode)->ns_ops; - if (ns_ops && ns_ops->put) - ns_ops->put(PROC_I(inode)->ns); + ns = PROC_I(inode)->ns; + if (ns_ops && ns) + ns_ops->put(ns); } static struct kmem_cache * proc_inode_cachep; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 43973b084ab..252544c0520 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -15,6 +15,7 @@ struct ctl_table_header; struct mempolicy; extern struct proc_dir_entry proc_root; +extern void proc_self_init(void); #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void sysctl_head_put(struct ctl_table_header *head); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index b178ed733c3..b7a47196c8c 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" @@ -24,12 +25,168 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_IPC_NS &ipcns_operations, #endif +#ifdef CONFIG_PID_NS + &pidns_operations, +#endif +#ifdef CONFIG_USER_NS + &userns_operations, +#endif + &mntns_operations, }; static const struct file_operations ns_file_operations = { .llseek = no_llseek, }; +static const struct inode_operations ns_inode_operations = { + .setattr = proc_setattr, +}; + +static int ns_delete_dentry(const struct dentry *dentry) +{ + /* Don't cache namespace inodes when not in use */ + return 1; +} + +static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = 
dentry->d_inode; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + ns_ops->name, inode->i_ino); +} + +const struct dentry_operations ns_dentry_operations = +{ + .d_delete = ns_delete_dentry, + .d_dname = ns_dname, +}; + +static struct dentry *proc_ns_get_dentry(struct super_block *sb, + struct task_struct *task, const struct proc_ns_operations *ns_ops) +{ + struct dentry *dentry, *result; + struct inode *inode; + struct proc_inode *ei; + struct qstr qname = { .name = "", }; + void *ns; + + ns = ns_ops->get(task); + if (!ns) + return ERR_PTR(-ENOENT); + + dentry = d_alloc_pseudo(sb, &qname); + if (!dentry) { + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + inode = iget_locked(sb, ns_ops->inum(ns)); + if (!inode) { + dput(dentry); + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + ei = PROC_I(inode); + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &ns_inode_operations; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + unlock_new_inode(inode); + } else { + ns_ops->put(ns); + } + + d_set_d_op(dentry, &ns_dentry_operations); + result = d_instantiate_unique(dentry, inode); + if (result) { + dput(dentry); + dentry = result; + } + + return dentry; +} + +static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct proc_inode *ei = PROC_I(inode); + struct task_struct *task; + struct dentry *ns_dentry; + void *error = ERR_PTR(-EACCES); + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); + if (IS_ERR(ns_dentry)) { + error = ERR_CAST(ns_dentry); + goto out_put_task; + } + + dput(nd->path.dentry); + nd->path.dentry = 
ns_dentry; + error = NULL; + +out_put_task: + put_task_struct(task); +out: + return error; +} + +static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + struct proc_inode *ei = PROC_I(inode); + const struct proc_ns_operations *ns_ops = ei->ns_ops; + struct task_struct *task; + void *ns; + char name[50]; + int len = -EACCES; + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + len = -ENOENT; + ns = ns_ops->get(task); + if (!ns) + goto out_put_task; + + snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); + len = strlen(name); + + if (len > buflen) + len = buflen; + if (copy_to_user(buffer, name, len)) + len = -EFAULT; + + ns_ops->put(ns); +out_put_task: + put_task_struct(task); +out: + return len; +} + +static const struct inode_operations proc_ns_link_inode_operations = { + .readlink = proc_ns_readlink, + .follow_link = proc_ns_follow_link, + .setattr = proc_setattr, +}; + static struct dentry *proc_ns_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -37,21 +194,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); - void *ns; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; - ns = ns_ops->get(task); - if (!ns) - goto out_iput; - ei = PROC_I(inode); - inode->i_mode = S_IFREG|S_IRUSR; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + inode->i_mode = S_IFLNK|S_IRWXUGO; + inode->i_op = &proc_ns_link_inode_operations; + ei->ns_ops = ns_ops; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); @@ -60,9 +211,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, error = NULL; out: return error; -out_iput: - iput(inode); - goto out; } static int proc_ns_fill_cache(struct file 
*filp, void *dirent, @@ -89,10 +237,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent, if (!task) goto out_no_task; - ret = -EPERM; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - ret = 0; i = filp->f_pos; switch (i) { @@ -152,10 +296,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!task) goto out_no_task; - error = ERR_PTR(-EPERM); - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - last = &ns_entries[ARRAY_SIZE(ns_entries)]; for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) @@ -163,7 +303,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } - error = ERR_PTR(-ENOENT); if (entry == last) goto out; @@ -198,3 +337,7 @@ out_invalid: return ERR_PTR(-EINVAL); } +bool proc_ns_inode(struct inode *inode) +{ + return inode->i_fop == &ns_file_operations; +} diff --git a/fs/proc/root.c b/fs/proc/root.c index 9889a92d2e0..c6e9fac26ba 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -100,14 +100,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, int err; struct super_block *sb; struct pid_namespace *ns; - struct proc_inode *ei; char *options; if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; options = NULL; } else { - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); options = data; } @@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - ei = PROC_I(sb->s_root->d_inode); - if (!ei->pid) { - rcu_read_lock(); - ei->pid = get_pid(find_pid_ns(1, ns)); - rcu_read_unlock(); - } - return dget(sb->s_root); } @@ -153,6 +145,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -163,12 +156,8 @@ void __init proc_root_init(void) err = 
register_filesystem(&proc_fs_type); if (err) return; - err = pid_ns_prepare_proc(&init_pid_ns); - if (err) { - unregister_filesystem(&proc_fs_type); - return; - } + proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); diff --git a/fs/proc/self.c b/fs/proc/self.c new file mode 100644 index 00000000000..aa5cc3bff14 --- /dev/null +++ b/fs/proc/self.c @@ -0,0 +1,59 @@ +#include +#include +#include + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, +}; + +void __init proc_self_init(void) +{ + struct proc_dir_entry *proc_self_symlink; + mode_t mode; + + mode = S_IFLNK | S_IRWXUGO; + proc_self_symlink = proc_create("self", mode, NULL, NULL ); + proc_self_symlink->proc_iops = &proc_self_inode_operations; +} diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 71eb7e25392..db940a9be04 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -149,6 +149,7 @@ static struct 
file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/include/linux/cred.h b/include/linux/cred.h index 0142aacb70b..abb2cd50f6b 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -344,10 +344,8 @@ static inline void put_cred(const struct cred *_cred) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) -#define task_user_ns(task) (task_cred_xxx((task), user_ns)) #else #define current_user_ns() (&init_user_ns) -#define task_user_ns(task) (&init_user_ns) #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 408fb1e77a0..035521b4652 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1810,6 +1810,8 @@ struct file_system_type { #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 +#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ +#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. 
*/ struct dentry *(*mount) (struct file_system_type *, int, diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 5499c92a915..fe771978e87 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -67,6 +67,8 @@ struct ipc_namespace { /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; + + unsigned int proc_inum; }; extern struct ipc_namespace init_ipc_ns; @@ -133,7 +135,8 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #if defined(CONFIG_IPC_NS) extern struct ipc_namespace *copy_ipcs(unsigned long flags, - struct task_struct *tsk); + struct user_namespace *user_ns, struct ipc_namespace *ns); + static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) @@ -144,12 +147,12 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) return ERR_PTR(-EINVAL); - return tsk->nsproxy->ipc_ns; + return ns; } static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 5a8e3903d77..12b2ab51032 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -4,9 +4,10 @@ struct mnt_namespace; struct fs_struct; +struct user_namespace; extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, - struct fs_struct *); + struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); extern const struct file_operations proc_mounts_operations; diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index cc37a55ad00..10e5947491c 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -67,7 +67,7 @@ void exit_task_namespaces(struct 
task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, - struct fs_struct *); + struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 65e3e87eacc..bf285999273 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -21,6 +21,7 @@ struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; + int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -31,9 +32,12 @@ struct pid_namespace { #ifdef CONFIG_BSD_PROCESS_ACCT struct bsd_acct_struct *bacct; #endif + struct user_namespace *user_ns; + struct work_struct proc_work; kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ + unsigned int proc_inum; }; extern struct pid_namespace init_pid_ns; @@ -46,7 +50,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } -extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns); +extern struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); @@ -59,8 +64,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } -static inline struct pid_namespace * -copy_pid_ns(unsigned long flags, struct pid_namespace *ns) +static inline struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) ns = ERR_PTR(-EINVAL); diff --git a/include/linux/proc_fs.h 
b/include/linux/proc_fs.h index 3fd2e871ff1..2e24018b7ce 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -28,7 +28,11 @@ struct mm_struct; */ enum { - PROC_ROOT_INO = 1, + PROC_ROOT_INO = 1, + PROC_IPC_INIT_INO = 0xEFFFFFFFU, + PROC_UTS_INIT_INO = 0xEFFFFFFEU, + PROC_USER_INIT_INO = 0xEFFFFFFDU, + PROC_PID_INIT_INO = 0xEFFFFFFCU, }; /* @@ -174,7 +178,10 @@ extern struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, struct proc_dir_entry *parent); extern struct file *proc_ns_fget(int fd); +extern bool proc_ns_inode(struct inode *inode); +extern int proc_alloc_inum(unsigned int *pino); +extern void proc_free_inum(unsigned int inum); #else #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; }) @@ -229,6 +236,19 @@ static inline struct file *proc_ns_fget(int fd) return ERR_PTR(-EINVAL); } +static inline bool proc_ns_inode(struct inode *inode) +{ + return false; +} + +static inline int proc_alloc_inum(unsigned int *inum) +{ + *inum = 1; + return 0; +} +static inline void proc_free_inum(unsigned int inum) +{ +} #endif /* CONFIG_PROC_FS */ #if !defined(CONFIG_PROC_KCORE) @@ -247,10 +267,14 @@ struct proc_ns_operations { void *(*get)(struct task_struct *task); void (*put)(void *ns); int (*install)(struct nsproxy *nsproxy, void *ns); + unsigned int (*inum)(void *ns); }; extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations mntns_operations; union proc_op { int (*proc_get_link)(struct dentry *, struct path *); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 95142cae446..b9bd2e6c73c 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -25,6 +25,7 @@ struct user_namespace { 
struct user_namespace *parent; kuid_t owner; kgid_t group; + unsigned int proc_inum; }; extern struct user_namespace init_user_ns; @@ -39,6 +40,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) } extern int create_user_ns(struct cred *new); +extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void free_user_ns(struct kref *kref); static inline void put_user_ns(struct user_namespace *ns) @@ -66,6 +68,14 @@ static inline int create_user_ns(struct cred *new) return -EINVAL; } +static inline int unshare_userns(unsigned long unshare_flags, + struct cred **new_cred) +{ + if (unshare_flags & CLONE_NEWUSER) + return -EINVAL; + return 0; +} + static inline void put_user_ns(struct user_namespace *ns) { } diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 2b345206722..239e27733d6 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -23,6 +23,7 @@ struct uts_namespace { struct kref kref; struct new_utsname name; struct user_namespace *user_ns; + unsigned int proc_inum; }; extern struct uts_namespace init_uts_ns; @@ -33,7 +34,7 @@ static inline void get_uts_ns(struct uts_namespace *ns) } extern struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk); + struct user_namespace *user_ns, struct uts_namespace *old_ns); extern void free_uts_ns(struct kref *kref); static inline void put_uts_ns(struct uts_namespace *ns) @@ -50,12 +51,12 @@ static inline void put_uts_ns(struct uts_namespace *ns) } static inline struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct uts_namespace *old_ns) { if (flags & CLONE_NEWUTS) return ERR_PTR(-EINVAL); - return tsk->nsproxy->uts_ns; + return old_ns; } #endif diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index c5a43f56b79..de644bcd861 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -56,6 +56,8 @@ struct 
net { struct user_namespace *user_ns; /* Owning user namespace */ + unsigned int proc_inum; + struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; diff --git a/init/Kconfig b/init/Kconfig index 1a207efca59..675d8a2326c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1069,11 +1069,9 @@ config UIDGID_CONVERTED # Filesystems depends on 9P_FS = n depends on AFS_FS = n - depends on AUTOFS4_FS = n depends on CEPH_FS = n depends on CIFS = n depends on CODA_FS = n - depends on FUSE_FS = n depends on GFS2_FS = n depends on NCP_FS = n depends on NFSD = n diff --git a/init/main.c b/init/main.c index 63ae904a99a..baf1f0f5c46 100644 --- a/init/main.c +++ b/init/main.c @@ -812,7 +812,6 @@ static int __ref kernel_init(void *unused) system_state = SYSTEM_RUNNING; numa_default_policy(); - current->signal->flags |= SIGNAL_UNKILLABLE; flush_delayed_fput(); if (ramdisk_execute_command) { diff --git a/init/version.c b/init/version.c index 86fe0ccb997..58170f18912 100644 --- a/init/version.c +++ b/init/version.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef CONFIG_KALLSYMS #define version(a) Version_ ## a @@ -34,6 +35,7 @@ struct uts_namespace init_uts_ns = { .domainname = UTS_DOMAINNAME, }, .user_ns = &init_user_ns, + .proc_inum = PROC_UTS_INIT_INO, }; EXPORT_SYMBOL_GPL(init_uts_ns); diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 26143d377c9..6471f1bdae9 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "util.h" @@ -30,6 +31,7 @@ DEFINE_SPINLOCK(mq_lock); struct ipc_namespace init_ipc_ns = { .count = ATOMIC_INIT(1), .user_ns = &init_user_ns, + .proc_inum = PROC_IPC_INIT_INO, }; atomic_t nr_ipc_ns = ATOMIC_INIT(1); diff --git a/ipc/namespace.c b/ipc/namespace.c index f362298c5ce..cf3386a51de 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -16,7 +16,7 @@ #include "util.h" -static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, +static struct ipc_namespace 
*create_ipc_ns(struct user_namespace *user_ns, struct ipc_namespace *old_ns) { struct ipc_namespace *ns; @@ -26,9 +26,16 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, if (ns == NULL) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + atomic_set(&ns->count, 1); err = mq_init_ns(ns); if (err) { + proc_free_inum(ns->proc_inum); kfree(ns); return ERR_PTR(err); } @@ -46,19 +53,17 @@ static struct ipc_namespace *create_ipc_ns(struct task_struct *tsk, ipcns_notify(IPCNS_CREATED); register_ipcns_notifier(ns); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); + ns->user_ns = get_user_ns(user_ns); return ns; } struct ipc_namespace *copy_ipcs(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct ipc_namespace *ns) { - struct ipc_namespace *ns = tsk->nsproxy->ipc_ns; - if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); - return create_ipc_ns(tsk, ns); + return create_ipc_ns(user_ns, ns); } /* @@ -113,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) */ ipcns_notify(IPCNS_REMOVED); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -161,8 +167,12 @@ static void ipcns_put(void *ns) return put_ipc_ns(ns); } -static int ipcns_install(struct nsproxy *nsproxy, void *ns) +static int ipcns_install(struct nsproxy *nsproxy, void *new) { + struct ipc_namespace *ns = new; + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + /* Ditch state from the old ipc namespace */ exit_sem(current); put_ipc_ns(nsproxy->ipc_ns); @@ -170,10 +180,18 @@ static int ipcns_install(struct nsproxy *nsproxy, void *ns) return 0; } +static unsigned int ipcns_inum(void *vp) +{ + struct ipc_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, + .inum = ipcns_inum, }; diff --git 
a/kernel/cgroup.c b/kernel/cgroup.c index f34c41bfaa3..9915ffe0137 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3409,7 +3409,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = current->nsproxy->pid_ns; + struct pid_namespace *ns = task_active_pid_ns(current); /* * We can't drop the pidlist_mutex before taking the l->mutex in case diff --git a/kernel/events/core.c b/kernel/events/core.c index f9ff5493171..301079d06f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->parent = parent_event; - event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; diff --git a/kernel/exit.c b/kernel/exit.c index 50d2e93c36e..b4df2193721 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); - /* - * If we are the last child process in a pid namespace to be - * reaped, notify the reaper sleeping zap_pid_ns_processes(). 
- */ - if (IS_ENABLED(CONFIG_PID_NS)) { - struct task_struct *parent = p->real_parent; - - if ((task_active_pid_ns(parent)->child_reaper == parent) && - list_empty(&parent->children) && - (parent->flags & PF_EXITING)) - wake_up_process(parent); - } } list_del_rcu(&p->thread_group); } diff --git a/kernel/fork.c b/kernel/fork.c index 115d6c2e4cc..c36c4e301ef 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1044,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); - if (clone_flags & CLONE_NEWPID) - sig->flags |= SIGNAL_UNKILLABLE; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -1438,8 +1436,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { - if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; + if (is_child_reaper(pid)) { + ns_of_pid(pid)->child_reaper = p; + p->signal->flags |= SIGNAL_UNKILLABLE; + } p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); @@ -1473,8 +1473,6 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - if (unlikely(clone_flags & CLONE_NEWPID)) - pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) @@ -1554,15 +1552,9 @@ long do_fork(unsigned long clone_flags, * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ - if (clone_flags & CLONE_NEWUSER) { - if (clone_flags & CLONE_THREAD) + if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { + if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; - /* hopefully this check will go away when userns support is - * complete - */ - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || - !capable(CAP_SETGID)) - return -EPERM; 
} /* @@ -1724,7 +1716,8 @@ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| + CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1791,19 +1784,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; + struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; - + /* + * If unsharing a user namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWUSER) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a pid namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWPID) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a thread from a thread group, must also unshare vm. + */ + if (unshare_flags & CLONE_THREAD) + unshare_flags |= CLONE_VM; + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (unshare_flags & CLONE_VM) + unshare_flags |= CLONE_SIGHAND; /* * If unsharing namespace, must also unshare filesystem information. 
*/ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1817,11 +1831,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_cred, new_fs); + if (err) + goto bad_unshare_cleanup_cred; - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1854,11 +1872,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } task_unlock(current); + + if (new_cred) { + /* Install the new user namespace */ + commit_creds(new_cred); + new_cred = NULL; + } } if (new_nsproxy) put_nsproxy(new_nsproxy); +bad_unshare_cleanup_cred: + if (new_cred) + put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 7e1c3de1ce4..78e2ecb2016 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void) * leave it to the caller to do proper locking and attach it to task. 
*/ static struct nsproxy *create_new_namespaces(unsigned long flags, - struct task_struct *tsk, struct fs_struct *new_fs) + struct task_struct *tsk, struct user_namespace *user_ns, + struct fs_struct *new_fs) { struct nsproxy *new_nsp; int err; @@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk); + new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk); + new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; } - new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns); + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); goto out_net; @@ -122,6 +123,7 @@ out_ns: int copy_namespaces(unsigned long flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; + struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; int err = 0; @@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) CLONE_NEWPID | CLONE_NEWNET))) return 0; - if (!capable(CAP_SYS_ADMIN)) { + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { err = -EPERM; goto out; } @@ -151,7 +153,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto 
out; } - new_ns = create_new_namespaces(flags, tsk, tsk->fs); + new_ns = create_new_namespaces(flags, tsk, + task_cred_xxx(tsk, user_ns), tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); goto out; @@ -183,19 +186,21 @@ void free_nsproxy(struct nsproxy *ns) * On success, returns the new nsproxy. */ int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) + struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { + struct user_namespace *user_ns; int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET))) + CLONE_NEWNET | CLONE_NEWPID))) return 0; - if (!capable(CAP_SYS_ADMIN)) + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, - new_fs ? new_fs : current->fs); + *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, + new_fs ? new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); goto out; @@ -241,9 +246,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) struct file *file; int err; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - file = proc_ns_fget(fd); if (IS_ERR(file)) return PTR_ERR(file); @@ -254,7 +256,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) if (nstype && (ops->type != nstype)) goto out; - new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); if (IS_ERR(new_nsproxy)) { err = PTR_ERR(new_nsproxy); goto out; diff --git a/kernel/pid.c b/kernel/pid.c index fd996c1ed9f..3e2cf8100ac 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -78,6 +79,8 @@ struct pid_namespace init_pid_ns = { .last_pid = 0, .level = 0, .child_reaper = &init_task, + .user_ns = &init_user_ns, + 
.proc_inum = PROC_PID_INIT_INO, }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -269,8 +272,24 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); + for (i = 0; i <= pid->level; i++) { + struct upid *upid = pid->numbers + i; + struct pid_namespace *ns = upid->ns; + hlist_del_rcu(&upid->pid_chain); + switch(--ns->nr_hashed) { + case 1: + /* When all that is left in the pid namespace + * is the reaper wake up the reaper. The reaper + * may be sleeping in zap_pid_ns_processes(). + */ + wake_up_process(ns->child_reaper); + break; + case 0: + ns->nr_hashed = -1; + schedule_work(&ns->proc_work); + break; + } + } spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -292,6 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; + pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -302,22 +322,32 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } + if (unlikely(is_child_reaper(pid))) { + if (pid_ns_prepare_proc(ns)) + goto out_free; + } + get_pid_ns(ns); - pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) + if (ns->nr_hashed < 0) + goto out_unlock; + for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + upid->ns->nr_hashed++; + } spin_unlock_irq(&pidmap_lock); out: return pid; +out_unlock: + spin_unlock(&pidmap_lock); out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); @@ -344,7 +374,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { - return find_pid_ns(nr, current->nsproxy->pid_ns); + return find_pid_ns(nr, task_active_pid_ns(current)); } 
EXPORT_SYMBOL_GPL(find_vpid); @@ -428,7 +458,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) @@ -483,7 +513,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { - return pid_nr_ns(pid, current->nsproxy->pid_ns); + return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); @@ -494,7 +524,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; @@ -569,6 +599,7 @@ void __init pidmap_init(void) /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); + init_pid_ns.nr_hashed = 1; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7b07cc0dfb7..560da0dab23 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -71,10 +72,17 @@ err_alloc: return NULL; } +static void proc_cleanup_work(struct work_struct *work) +{ + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); + pid_ns_release_proc(ns); +} + /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 -static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) +static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, + struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; @@ -99,9 +107,15 @@ 
static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p if (ns->pid_cachep == NULL) goto out_free_map; + err = proc_alloc_inum(&ns->proc_inum); + if (err) + goto out_free_map; + kref_init(&ns->kref); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); + ns->user_ns = get_user_ns(user_ns); + INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -109,14 +123,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - return ns; -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: @@ -129,18 +137,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; + proc_free_inum(ns->proc_inum); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); + put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) +struct pid_namespace *copy_pid_ns(unsigned long flags, + struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & (CLONE_THREAD|CLONE_PARENT)) + if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); - return create_pid_namespace(old_ns); + return create_pid_namespace(user_ns, old_ns); } static void free_pid_ns(struct kref *kref) @@ -211,22 +222,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see __unhash_process(). + * Make sure they all go away, see free_pid(). 
*/ for (;;) { - bool need_wait = false; - - read_lock(&tasklist_lock); - if (!list_empty(&current->children)) { - __set_current_state(TASK_UNINTERRUPTIBLE); - need_wait = true; - } - read_unlock(&tasklist_lock); - - if (!need_wait) + set_current_state(TASK_UNINTERRUPTIBLE); + if (pid_ns->nr_hashed == 1) break; schedule(); } + __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; @@ -239,9 +243,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) static int pid_ns_ctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; - if (write && !capable(CAP_SYS_ADMIN)) + if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -250,7 +255,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = &current->nsproxy->pid_ns->last_pid; + tmp.data = &pid_ns->last_pid; return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } @@ -299,6 +304,67 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } +static void *pidns_get(struct task_struct *task) +{ + struct pid_namespace *ns; + + rcu_read_lock(); + ns = get_pid_ns(task_active_pid_ns(task)); + rcu_read_unlock(); + + return ns; +} + +static void pidns_put(void *ns) +{ + put_pid_ns(ns); +} + +static int pidns_install(struct nsproxy *nsproxy, void *ns) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *ancestor, *new = ns; + + if (!ns_capable(new->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Only allow entering the current active pid namespace + * or a child of the current active pid namespace. + * + * This is required for fork to return a usable pid value and + * this maintains the property that processes and their + * children can not escape their current pid namespace. 
+ */ + if (new->level < active->level) + return -EINVAL; + + ancestor = new; + while (ancestor->level > active->level) + ancestor = ancestor->parent; + if (ancestor != active) + return -EINVAL; + + put_pid_ns(nsproxy->pid_ns); + nsproxy->pid_ns = get_pid_ns(new); + return 0; +} + +static unsigned int pidns_inum(void *ns) +{ + struct pid_namespace *pid_ns = ns; + return pid_ns->proc_inum; +} + +const struct proc_ns_operations pidns_operations = { + .name = "pid", + .type = CLONE_NEWPID, + .get = pidns_get, + .put = pidns_put, + .install = pidns_install, + .inum = pidns_inum, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1f5e55dda95..7b09b88862c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -215,8 +215,12 @@ ok: smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) + rcu_read_lock(); + if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); return security_ptrace_access_check(task, mode); } @@ -280,8 +284,10 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) flags |= PT_PTRACE_CAP; + rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1fb82104bf..257002c13bb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4097,8 +4097,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) - goto out_unlock; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + 
rcu_read_unlock(); + goto out_unlock; + } + rcu_read_unlock(); + } retval = security_task_setscheduler(p); if (retval) diff --git a/kernel/signal.c b/kernel/signal.c index a49c7f36ceb..580a91e6347 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1753,7 +1753,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 65bdcf198d4..5a638445050 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, goto out_putname; } - mnt = current->nsproxy->pid_ns->proc_mnt; + mnt = task_active_pid_ns(current)->proc_mnt; file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) diff --git a/kernel/user.c b/kernel/user.c index 750acffbe9e..33acb5e53a5 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * userns count is 1 for root user, 1 for init_uts_ns, @@ -51,6 +52,7 @@ struct user_namespace init_user_ns = { }, .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, + .proc_inum = PROC_USER_INIT_INO, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 456a6b9fba3..f5975ccf934 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,24 @@ static struct kmem_cache *user_ns_cachep __read_mostly; static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); +static void set_cred_user_ns(struct cred *cred, struct user_namespace 
*user_ns) +{ + /* Start with the same capabilities as init but useless for doing + * anything as the capabilities are bound to the new user namespace. + */ + cred->securebits = SECUREBITS_DEFAULT; + cred->cap_inheritable = CAP_EMPTY_SET; + cred->cap_permitted = CAP_FULL_SET; + cred->cap_effective = CAP_FULL_SET; + cred->cap_bset = CAP_FULL_SET; +#ifdef CONFIG_KEYS + key_put(cred->request_key_auth); + cred->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ + cred->user_ns = user_ns; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -39,6 +58,7 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; + int ret; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who @@ -52,38 +72,45 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; + ret = proc_alloc_inum(&ns->proc_inum); + if (ret) { + kmem_cache_free(user_ns_cachep, ns); + return ret; + } + kref_init(&ns->kref); + /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->owner = owner; ns->group = group; - /* Start with the same capabilities as init but useless for doing - * anything as the capabilities are bound to the new user namespace. - */ - new->securebits = SECUREBITS_DEFAULT; - new->cap_inheritable = CAP_EMPTY_SET; - new->cap_permitted = CAP_FULL_SET; - new->cap_effective = CAP_FULL_SET; - new->cap_bset = CAP_FULL_SET; -#ifdef CONFIG_KEYS - key_put(new->request_key_auth); - new->request_key_auth = NULL; -#endif - /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - - /* Leave the new->user_ns reference with the new user namespace. */ - /* Leave the reference to our user_ns with the new cred. 
*/ - new->user_ns = ns; + set_cred_user_ns(new, ns); return 0; } +int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) +{ + struct cred *cred; + + if (!(unshare_flags & CLONE_NEWUSER)) + return 0; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + *new_cred = cred; + return create_user_ns(cred); +} + void free_user_ns(struct kref *kref) { struct user_namespace *parent, *ns = container_of(kref, struct user_namespace, kref); parent = ns->parent; + proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); put_user_ns(parent); } @@ -372,7 +399,7 @@ static int uid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; uid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -393,7 +420,7 @@ static int gid_m_show(struct seq_file *seq, void *v) struct user_namespace *lower_ns; gid_t lower; - lower_ns = current_user_ns(); + lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; @@ -669,10 +696,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } @@ -681,10 +712,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; + struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; + if ((seq_ns != ns) && (seq_ns != ns->parent)) + return -EPERM; + return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } @@ -709,6 +744,21 @@ ssize_t proc_projid_map_write(struct 
file *file, const char __user *buf, size_t static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { + /* Allow mapping to your own filesystem ids */ + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + u32 id = new_map->extent[0].lower_first; + if (cap_setid == CAP_SETUID) { + kuid_t uid = make_kuid(ns->parent, id); + if (uid_eq(uid, current_fsuid())) + return true; + } + else if (cap_setid == CAP_SETGID) { + kgid_t gid = make_kgid(ns->parent, id); + if (gid_eq(gid, current_fsgid())) + return true; + } + } + /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; @@ -722,6 +772,65 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, return false; } +static void *userns_get(struct task_struct *task) +{ + struct user_namespace *user_ns; + + rcu_read_lock(); + user_ns = get_user_ns(__task_cred(task)->user_ns); + rcu_read_unlock(); + + return user_ns; +} + +static void userns_put(void *ns) +{ + put_user_ns(ns); +} + +static int userns_install(struct nsproxy *nsproxy, void *ns) +{ + struct user_namespace *user_ns = ns; + struct cred *cred; + + /* Don't allow gaining capabilities by reentering + * the same user namespace. 
+ */ if (user_ns == current_user_ns()) + return -EINVAL; + + /* Threaded processes may not enter a different user namespace */ + if (atomic_read(&current->mm->mm_users) > 1) + return -EINVAL; + + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + put_user_ns(cred->user_ns); + set_cred_user_ns(cred, get_user_ns(user_ns)); + + return commit_creds(cred); +} + +static unsigned int userns_inum(void *ns) +{ + struct user_namespace *user_ns = ns; + return user_ns->proc_inum; +} + +const struct proc_ns_operations userns_operations = { + .name = "user", + .type = CLONE_NEWUSER, + .get = userns_get, + .put = userns_put, + .install = userns_install, + .inum = userns_inum, +}; + static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); diff --git a/kernel/utsname.c b/kernel/utsname.c index 679d97a5d3f..f6336d51d64 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -32,18 +32,25 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, +static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + int err; ns = create_uts_ns(); if (!ns) return ERR_PTR(-ENOMEM); + err = proc_alloc_inum(&ns->proc_inum); + if (err) { + kfree(ns); + return ERR_PTR(err); + } + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); + ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; } @@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, * versa. 
*/ struct uts_namespace *copy_utsname(unsigned long flags, - struct task_struct *tsk) + struct user_namespace *user_ns, struct uts_namespace *old_ns) { - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(tsk, old_ns); + new_ns = clone_uts_ns(user_ns, old_ns); put_uts_ns(old_ns); return new_ns; @@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); + proc_free_inum(ns->proc_inum); kfree(ns); } @@ -102,19 +109,31 @@ static void utsns_put(void *ns) put_uts_ns(ns); } -static int utsns_install(struct nsproxy *nsproxy, void *ns) +static int utsns_install(struct nsproxy *nsproxy, void *new) { + struct uts_namespace *ns = new; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + get_uts_ns(ns); put_uts_ns(nsproxy->uts_ns); nsproxy->uts_ns = ns; return 0; } +static unsigned int utsns_inum(void *vp) +{ + struct uts_namespace *ns = vp; + + return ns->proc_inum; +} + const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, + .inum = utsns_inum, }; - diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6456439cbbd..2e9a3132b8d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -381,6 +381,21 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +static __net_init int net_ns_net_init(struct net *net) +{ + return proc_alloc_inum(&net->proc_inum); +} + +static __net_exit void net_ns_net_exit(struct net *net) +{ + proc_free_inum(net->proc_inum); +} + +static struct pernet_operations __net_initdata net_ns_ops = { + .init = net_ns_net_init, + .exit = net_ns_net_exit, +}; + static int __init net_ns_init(void) { struct net_generic *ng; @@ -412,6 
+427,8 @@ static int __init net_ns_init(void) mutex_unlock(&net_mutex); + register_pernet_subsys(&net_ns_ops); + return 0; } @@ -630,16 +647,28 @@ static void netns_put(void *ns) static int netns_install(struct nsproxy *nsproxy, void *ns) { + struct net *net = ns; + + if (!ns_capable(net->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + put_net(nsproxy->net_ns); - nsproxy->net_ns = get_net(ns); + nsproxy->net_ns = get_net(net); return 0; } +static unsigned int netns_inum(void *ns) +{ + struct net *net = ns; + return net->proc_inum; +} + const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, + .inum = netns_inum, }; #endif diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 2663145d119..23414b93771 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -298,14 +298,18 @@ int yama_ptrace_access_check(struct task_struct *child, /* No additional restrictions. */ break; case YAMA_SCOPE_RELATIONAL: + rcu_read_lock(); if (!task_is_descendant(current, child) && !ptracer_exception_found(current, child) && - !ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) + !ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; + rcu_read_unlock(); break; case YAMA_SCOPE_CAPABILITY: - if (!ns_capable(task_user_ns(child), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; + rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: default: @@ -343,8 +347,10 @@ int yama_ptrace_traceme(struct task_struct *parent) /* Only disallow PTRACE_TRACEME on more aggressive settings. */ switch (ptrace_scope) { case YAMA_SCOPE_CAPABILITY: - if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE)) + rcu_read_lock(); + if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE)) rc = -EPERM; + rcu_read_unlock(); break; case YAMA_SCOPE_NO_ATTACH: rc = -EPERM;