From 475d0094293b51353e342d1198377967dbc48169 Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Wed, 11 Jul 2012 15:16:37 +1000 Subject: of: Improve prom_update_property() function prom_update_property() currently fails if the property doesn't actually exist yet which isn't what we want. Change to add-or-update instead of update-only, then we can remove a lot duplicated lines. Suggested-by: Grant Likely Signed-off-by: Dong Aisheng Acked-by: Rob Herring Signed-off-by: Benjamin Herrenschmidt --- fs/proc/proc_devtree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 927cbd115e5..df7dd08d439 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -101,6 +101,11 @@ void proc_device_tree_update_prop(struct proc_dir_entry *pde, { struct proc_dir_entry *ent; + if (!oldprop) { + proc_device_tree_add_prop(pde, newprop); + return; + } + for (ent = pde->subdir; ent != NULL; ent = ent->next) if (ent->data == oldprop) break; -- cgit v1.2.3 From 0b728e1911cbe6e24020727c3870628b9653f32a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Jun 2012 16:03:43 -0400 Subject: stop passing nameidata * to ->d_revalidate() Just the lookup flags. Die, bastard, die... Signed-off-by: Al Viro --- fs/proc/base.c | 22 +++++++++++----------- fs/proc/internal.h | 2 +- fs/proc/namespaces.c | 2 +- fs/proc/proc_sysctl.c | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 437195f204e..bf749cca4cc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1601,13 +1601,13 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) * made this apply to all per process world readable and executable * directories. */ -int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; const struct cred *cred; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; @@ -1781,7 +1781,7 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) return proc_fd_info(dentry->d_inode, path, NULL); } -static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) +static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; @@ -1789,7 +1789,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) struct files_struct *files; const struct cred *cred; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; @@ -1868,7 +1868,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, NULL)) + if (tid_fd_revalidate(dentry, 0)) error = NULL; out: @@ -2003,7 +2003,7 @@ static int dname_to_vma_addr(struct dentry *dentry, return 0; } -static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) +static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; bool exact_vma_exists = false; @@ -2013,7 +2013,7 @@ static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *inode; int status = 0; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; if (!capable(CAP_SYS_ADMIN)) { @@ -2371,7 +2371,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, NULL)) + if (tid_fd_revalidate(dentry, 0)) error = NULL; out: @@ -2430,7 +2430,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) + if (pid_revalidate(dentry, 0)) error = NULL; out: return error; @@ -3237,7 +3237,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) + if (pid_revalidate(dentry, 0)) error = NULL; out: return error; @@ -3508,7 +3508,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) + if (pid_revalidate(dentry, 0)) error = NULL; out: return error; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index eca4aca5b6e..e0c2a48dab7 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -142,7 +142,7 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *, int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, const char *name, int len, instantiate_t instantiate, struct task_struct *task, const void *ptr); -int pid_revalidate(struct dentry *dentry, struct nameidata *nd); +int pid_revalidate(struct dentry *dentry, unsigned int flags); struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); extern const struct dentry_operations pid_dentry_operations; int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 0d9e23a39e4..40ceb40f985 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -56,7 +56,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) + if (pid_revalidate(dentry, 0)) error = NULL; out: return error; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 3476bca8f7a..fda69fa3909 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -794,9 +794,9 @@ static const struct inode_operations proc_sys_dir_operations = { .getattr = proc_sys_getattr, }; -static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) +static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) { - if (nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; return !PROC_I(dentry->d_inode)->sysctl->unregistering; } -- cgit v1.2.3 From 00cd8dd3bf95f2cc8435b4cac01d9995635c6d0b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Jun 2012 17:13:09 -0400 Subject: stop passing nameidata to ->lookup() Just the flags; only NFS cares even about that, but there are legitimate uses for such argument. And getting rid of that completely would require splitting ->lookup() into a couple of methods (at least), so let's leave that alone for now... Signed-off-by: Al Viro --- fs/proc/base.c | 18 ++++++++++-------- fs/proc/generic.c | 2 +- fs/proc/internal.h | 4 ++-- fs/proc/namespaces.c | 2 +- fs/proc/proc_net.c | 2 +- fs/proc/proc_sysctl.c | 2 +- fs/proc/root.c | 7 +++---- 7 files changed, 19 insertions(+), 18 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index bf749cca4cc..8eaa5ea1c61 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1956,7 +1956,7 @@ out_no_task: } static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); } @@ -2145,7 +2145,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, } static struct dentry *proc_map_files_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; @@ -2380,7 +2380,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, static struct dentry *proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } @@ -2630,7 +2630,7 @@ static const struct file_operations proc_attr_dir_operations = { }; static struct dentry *proc_attr_dir_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); @@ -3114,7 +3114,8 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = default_llseek, }; -static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ return proc_pident_lookup(dir, dentry, tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } @@ -3243,7 +3244,7 @@ out: return error; } -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *result; struct task_struct *task; @@ -3470,7 +3471,8 @@ static int proc_tid_base_readdir(struct file * filp, tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); } -static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ return proc_pident_lookup(dir, dentry, tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); } @@ -3514,7 +3516,7 @@ out: return error; } -static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2edf34f2eb6..b3647fe6a60 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -446,7 +446,7 @@ out_unlock: } struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { return proc_lookup_de(PDE(dir), dir, dentry); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index e0c2a48dab7..e1167a1c912 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -106,7 +106,7 @@ void pde_users_dec(struct proc_dir_entry *pde); extern spinlock_t proc_subdir_lock; -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int); int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); unsigned long task_vsize(struct mm_struct *); unsigned long task_statm(struct mm_struct *, @@ -132,7 +132,7 @@ int proc_remount(struct super_block *sb, int *flags, char *data); * of the /proc/ subdirectories. */ int proc_readdir(struct file *, void *, filldir_t); -struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); +struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 40ceb40f985..b178ed733c3 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -140,7 +140,7 @@ const struct file_operations proc_ns_dir_operations = { }; static struct dentry *proc_ns_dir_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { struct dentry *error; struct task_struct *task = get_proc_task(dir); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 06e1cc17caf..fe72cd073de 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -119,7 +119,7 @@ static struct net *get_proc_task_net(struct inode *dir) } static struct dentry *proc_tgid_net_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) + struct dentry *dentry, unsigned int flags) { struct dentry *de; struct net *net; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fda69fa3909..dfafeb2b05a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -433,7 +433,7 @@ static struct ctl_table_header *grab_header(struct inode *inode) } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index 7c30fce037c..568b20290c7 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -200,13 +200,12 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct return 0; } -static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_lookup(dir, dentry, nd)) { + if (!proc_lookup(dir, dentry, flags)) return NULL; - } - return proc_pid_lookup(dir, dentry, nd); + return proc_pid_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file * filp, -- cgit v1.2.3 From 408ef013cc9e2f94a14f7ccbbe52ddfb18437a99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Jun 2012 10:47:03 -0400 Subject: fs: move path_put on failure out of ->follow_link Currently the non-nd_set_link based versions of ->follow_link are expected to do a path_put(&nd->path) on failure. This calling convention is unexpected, undocumented and doesn't match what the nd_set_link-based instances do. Move the path_put out of the only non-nd_set_link based ->follow_link instance into the caller. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/proc/base.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 8eaa5ea1c61..3bd5ac1ff01 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1427,16 +1427,20 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; + struct path path; int error = -EACCES; - /* We don't need a base pointer in the /proc filesystem */ - path_put(&nd->path); - /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); + error = PROC_I(inode)->op.proc_get_link(dentry, &path); + if (error) + goto out; + + path_put(&nd->path); + nd->path = path; + return NULL; out: return ERR_PTR(error); } -- cgit v1.2.3 From b5fb63c18315c5510c1d0636179c057e0c761c77 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Jun 2012 10:47:04 -0400 Subject: fs: add nd_jump_link Add a helper that abstracts out the jump to an already parsed struct path from ->follow_link operation from procfs. Not only does this clean up the code by moving the two sides of this game into a single helper, but it also prepares for making struct nameidata private to namei.c Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/proc/base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3bd5ac1ff01..2772208338f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1438,8 +1438,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (error) goto out; - path_put(&nd->path); - nd->path = path; + nd_jump_link(nd, &path); return NULL; out: return ERR_PTR(error); -- cgit v1.2.3 From 9249e17fe094d853d1ef7475dd559a2cc7e23d42 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Jun 2012 12:55:37 +0100 Subject: VFS: Pass mount flags to sget() Pass mount flags to sget() so that it can use them in initialising a new superblock before the set function is called. They could also be passed to the compare function. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/proc/root.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/root.c b/fs/proc/root.c index 568b20290c7..9a2d9fd7cad 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -111,7 +111,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, options = data; } - sb = sget(fs_type, proc_test_super, proc_set_super, ns); + sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); if (IS_ERR(sb)) return ERR_CAST(sb); @@ -121,7 +121,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } if (!sb->s_root) { - sb->s_flags = flags; err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); -- cgit v1.2.3 From e8905ec27e2f4ea1b9f7e03df68a060b3ae6fca8 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 30 Jul 2012 14:42:26 -0700 Subject: proc: environ_read() make sure offset points to environment address range Currently the following offset and environment address range check in environ_read() of /proc//environ is buggy: int this_len = mm->env_end - (mm->env_start + src); if (this_len <= 0) break; Large or negative offsets on /proc//environ converted to 'unsigned long' may pass this check since '(mm->env_start + src)' can overflow and 'this_len' will be positive. This can turn /proc//environ to act like /proc//mem since (mm->env_start + src) will point and read from another VMA. There are two fixes here plus some code cleaning: 1) Fix the overflow by checking if the offset that was converted to unsigned long will always point to the [mm->env_start, mm->env_end] address range. 2) Remove the truncation that was made to the result of the check, storing the result in 'int this_len' will alter its value and we can not depend on it. For kernels that have commit b409e578d ("proc: clean up /proc//environ handling") which adds the appropriate ptrace check and saves the 'mm' at ->open() time, this is not a security issue. This patch is taken from the grsecurity patch since it was just made available. Signed-off-by: Djalal Harouni Cc: Oleg Nesterov Cc: Brad Spengler Acked-by: Kees Cook Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 2772208338f..39ee093b5e9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -827,15 +827,16 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; while (count > 0) { - int this_len, retval, max_len; + size_t this_len, max_len; + int retval; - this_len = mm->env_end - (mm->env_start + src); - - if (this_len <= 0) + if (src >= (mm->env_end - mm->env_start)) break; - max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - this_len = (this_len > max_len) ? max_len : this_len; + this_len = mm->env_end - (mm->env_start + src); + + max_len = min_t(size_t, PAGE_SIZE, count); + this_len = min(max_len, this_len); retval = access_remote_vm(mm, (mm->env_start + src), page, this_len, 0); -- cgit v1.2.3 From bc452b4b65bd589083a7a7ba4f14f85dfc8454fa Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Mon, 30 Jul 2012 14:42:28 -0700 Subject: proc: do not allow negative offsets on /proc//environ __mem_open() which is called by both /proc//environ and /proc//mem ->open() handlers will allow the use of negative offsets. /proc//mem has negative offsets but not /proc//environ. Clean this by moving the 'force FMODE_UNSIGNED_OFFSET flag' to mem_open() to allow negative offsets only on /proc//mem. Signed-off-by: Djalal Harouni Cc: Oleg Nesterov Cc: Brad Spengler Acked-by: Kees Cook Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 39ee093b5e9..1b6c84cbdb7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -695,8 +695,6 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) mmput(mm); } - /* OK to pass negative loff_t, we can catch out-of-range */ - file->f_mode |= FMODE_UNSIGNED_OFFSET; file->private_data = mm; return 0; @@ -704,7 +702,12 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) static int mem_open(struct inode *inode, struct file *file) { - return __mem_open(inode, file, PTRACE_MODE_ATTACH); + int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); + + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; + + return ret; } static ssize_t mem_rw(struct file *file, char __user *buf, -- cgit v1.2.3 From 6bf6104573482570f7103d3e5ddf9574db43a363 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Thu, 13 Sep 2012 15:03:37 -0700 Subject: fs/proc: fix potential unregister_sysctl_table hang The unregister_sysctl_table() function hangs if all references to its ctl_table_header structure are not dropped. This can happen sometimes because of a leak in proc_sys_lookup(): proc_sys_lookup() gets a reference to the table via lookup_entry(), but it does not release it when a subsequent call to sysctl_follow_link() fails. This patch fixes this leak by making sure the reference is always dropped on return. See also commit 076c3eed2c31 ("sysctl: Rewrite proc_sys_lookup introducing find_entry and lookup_entry") which reorganized this code in 3.4. Tested in Linux 3.4.4. Signed-off-by: Francesco Ruggeri Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dfafeb2b05a..eb7cc91b725 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -462,9 +462,6 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); - if (h) - sysctl_head_finish(h); - if (!inode) goto out; @@ -473,6 +470,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, d_add(dentry, inode); out: + if (h) + sysctl_head_finish(h); sysctl_head_finish(head); return err; } -- cgit v1.2.3 From e1760bd5ffae8cb98cffb030ee8e631eba28f3d8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Sep 2012 22:39:43 -0700 Subject: userns: Convert the audit loginuid to be a kuid Always store audit loginuids in type kuid_t. Print loginuids by converting them into uids in the appropriate user namespace, and then printing the resulting uid. Modify audit_get_loginuid to return a kuid_t. Modify audit_set_loginuid to take a kuid_t. Modify /proc//loginuid on read to convert the loginuid into the user namespace of the opener of the file. Modify /proc//loginud on write to convert the loginuid rom the user namespace of the opener of the file. Cc: Al Viro Cc: Eric Paris Cc: Paul Moore ? Cc: David Miller Signed-off-by: Eric W. Biederman --- fs/proc/base.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1b6c84cbdb7..138cff4b05d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1089,7 +1089,8 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf, if (!task) return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", - audit_get_loginuid(task)); + from_kuid(file->f_cred->user_ns, + audit_get_loginuid(task))); put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -1101,6 +1102,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, char *page, *tmp; ssize_t length; uid_t loginuid; + kuid_t kloginuid; rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { @@ -1130,7 +1132,13 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(loginuid); + kloginuid = make_kuid(file->f_cred->user_ns, loginuid); + if (!uid_valid(kloginuid)) { + length = -EINVAL; + goto out_free_page; + } + + length = audit_set_loginuid(kloginuid); if (likely(length == 0)) length = count; -- cgit v1.2.3 From f76d207a66c3a53defea67e7d36c3eb1b7d6d61d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 30 Aug 2012 01:24:05 -0700 Subject: userns: Add kprojid_t and associated infrastructure in projid.h Implement kprojid_t a cousin of the kuid_t and kgid_t. The per user namespace mapping of project id values can be set with /proc//projid_map. A full compliment of helpers is provided: make_kprojid, from_kprojid, from_kprojid_munged, kporjid_has_mapping, projid_valid, projid_eq, projid_eq, projid_lt. Project identifiers are part of the generic disk quota interface, although it appears only xfs implements project identifiers currently. The xfs code allows anyone who has permission to set the project identifier on a file to use any project identifier so when setting up the user namespace project identifier mappings I do not require a capability. Cc: Dave Chinner Cc: Jan Kara Signed-off-by: "Eric W. Biederman" --- fs/proc/base.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 138cff4b05d..acd1960c28a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2991,6 +2991,11 @@ static int proc_gid_map_open(struct inode *inode, struct file *file) return proc_id_map_open(inode, file, &proc_gid_seq_operations); } +static int proc_projid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_projid_seq_operations); +} + static const struct file_operations proc_uid_map_operations = { .open = proc_uid_map_open, .write = proc_uid_map_write, @@ -3006,6 +3011,14 @@ static const struct file_operations proc_gid_map_operations = { .llseek = seq_lseek, .release = proc_id_map_release, }; + +static const struct file_operations proc_projid_map_operations = { + .open = proc_projid_map_open, + .write = proc_projid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, @@ -3113,6 +3126,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), #endif }; @@ -3476,6 +3490,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), #endif }; -- cgit v1.2.3 From faf60af17f8da87e1c87a6be527344791025ce9e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 23 Aug 2012 14:43:24 +0400 Subject: procfs: Move /proc/pid/fd[info] handling code to fd.[ch] This patch prepares the ground for further extension of /proc/pid/fd[info] handling code by moving fdinfo handling code into fs/proc/fd.c. I think such move makes both fs/proc/base.c and fs/proc/fd.c easier to read. Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov CC: Al Viro CC: Alexey Dobriyan CC: Andrew Morton CC: James Bottomley CC: "Aneesh Kumar K.V" CC: Alexey Dobriyan CC: Matthew Helsley CC: "J. Bruce Fields" CC: "Aneesh Kumar K.V" Signed-off-by: Al Viro --- fs/proc/Makefile | 2 +- fs/proc/base.c | 388 +---------------------------------------------------- fs/proc/fd.c | 351 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/proc/fd.h | 14 ++ fs/proc/internal.h | 48 +++++++ 5 files changed, 416 insertions(+), 387 deletions(-) create mode 100644 fs/proc/fd.c create mode 100644 fs/proc/fd.h (limited to 'fs/proc') diff --git a/fs/proc/Makefile b/fs/proc/Makefile index c1c72933592..99349efbbc2 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -8,7 +8,7 @@ proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := mmu.o task_mmu.o proc-y += inode.o root.o base.o generic.o array.o \ - proc_tty.o + proc_tty.o fd.o proc-y += cmdline.o proc-y += consoles.o proc-y += cpuinfo.o diff --git a/fs/proc/base.c b/fs/proc/base.c index 1b6c84cbdb7..b55c3bb298e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -90,6 +90,7 @@ #endif #include #include "internal.h" +#include "fd.h" /* NOTE: * Implementing inode permission operations in /proc is almost @@ -136,8 +137,6 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = show } ) -static int proc_fd_permission(struct inode *inode, int mask); - /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. @@ -1492,7 +1491,7 @@ out: return error; } -static const struct inode_operations proc_pid_link_inode_operations = { +const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, @@ -1501,21 +1500,6 @@ static const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ -static int task_dumpable(struct task_struct *task) -{ - int dumpable = 0; - struct mm_struct *mm; - - task_lock(task); - mm = task->mm; - if (mm) - dumpable = get_dumpable(mm); - task_unlock(task); - if(dumpable == 1) - return 1; - return 0; -} - struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) { struct inode * inode; @@ -1641,15 +1625,6 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) return 0; } -static int pid_delete_dentry(const struct dentry * dentry) -{ - /* Is the task we represent dead? - * If so, then don't put the dentry on the lru list, - * kill it immediately. - */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; -} - const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, @@ -1712,289 +1687,6 @@ end_instantiate: return filldir(dirent, name, len, filp->f_pos, ino, type); } -static unsigned name_to_int(struct dentry *dentry) -{ - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - -#define PROC_FDINFO_MAX 64 - -static int proc_fd_info(struct inode *inode, struct path *path, char *info) -{ - struct task_struct *task = get_proc_task(inode); - struct files_struct *files = NULL; - struct file *file; - int fd = proc_fd(inode); - - if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - if (files) { - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - unsigned int f_flags; - struct fdtable *fdt; - - fdt = files_fdtable(files); - f_flags = file->f_flags & ~O_CLOEXEC; - if (close_on_exec(fd, fdt)) - f_flags |= O_CLOEXEC; - - if (path) { - *path = file->f_path; - path_get(&file->f_path); - } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - f_flags); - spin_unlock(&files->file_lock); - put_files_struct(files); - return 0; - } - spin_unlock(&files->file_lock); - put_files_struct(files); - } - return -ENOENT; -} - -static int proc_fd_link(struct dentry *dentry, struct path *path) -{ - return proc_fd_info(dentry->d_inode, path, NULL); -} - -static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct inode *inode; - struct task_struct *task; - int fd; - struct files_struct *files; - const struct cred *cred; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - inode = dentry->d_inode; - task = get_proc_task(inode); - fd = proc_fd(inode); - - if (task) { - files = get_files_struct(task); - if (files) { - struct file *file; - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - unsigned f_mode = file->f_mode; - - rcu_read_unlock(); - put_files_struct(files); - - if (task_dumpable(task)) { - rcu_read_lock(); - cred = __task_cred(task); - inode->i_uid = cred->euid; - inode->i_gid = cred->egid; - rcu_read_unlock(); - } else { - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - } - - if (S_ISLNK(inode->i_mode)) { - unsigned i_mode = S_IFLNK; - if (f_mode & FMODE_READ) - i_mode |= S_IRUSR | S_IXUSR; - if (f_mode & FMODE_WRITE) - i_mode |= S_IWUSR | S_IXUSR; - inode->i_mode = i_mode; - } - - security_task_to_inode(task, inode); - put_task_struct(task); - return 1; - } - rcu_read_unlock(); - put_files_struct(files); - } - put_task_struct(task); - } - d_drop(dentry); - return 0; -} - -static const struct dentry_operations tid_fd_dentry_operations = -{ - .d_revalidate = tid_fd_revalidate, - .d_delete = pid_delete_dentry, -}; - -static struct dentry *proc_fd_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - unsigned fd = (unsigned long)ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); - - inode = proc_pid_make_inode(dir->i_sb, task); - if (!inode) - goto out; - ei = PROC_I(inode); - ei->fd = fd; - - inode->i_mode = S_IFLNK; - inode->i_op = &proc_pid_link_inode_operations; - inode->i_size = 64; - ei->op.proc_get_link = proc_fd_link; - d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, 0)) - error = NULL; - - out: - return error; -} - -static struct dentry *proc_lookupfd_common(struct inode *dir, - struct dentry *dentry, - instantiate_t instantiate) -{ - struct task_struct *task = get_proc_task(dir); - unsigned fd = name_to_int(dentry); - struct dentry *result = ERR_PTR(-ENOENT); - - if (!task) - goto out_no_task; - if (fd == ~0U) - goto out; - - result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); -out: - put_task_struct(task); -out_no_task: - return result; -} - -static int proc_readfd_common(struct file * filp, void * dirent, - filldir_t filldir, instantiate_t instantiate) -{ - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *p = get_proc_task(inode); - unsigned int fd, ino; - int retval; - struct files_struct * files; - - retval = -ENOENT; - if (!p) - goto out_no_task; - retval = 0; - - fd = filp->f_pos; - switch (fd) { - case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - default: - files = get_files_struct(p); - if (!files) - goto out; - rcu_read_lock(); - for (fd = filp->f_pos-2; - fd < files_fdtable(files)->max_fds; - fd++, filp->f_pos++) { - char name[PROC_NUMBUF]; - int len; - int rv; - - if (!fcheck_files(files, fd)) - continue; - rcu_read_unlock(); - - len = snprintf(name, sizeof(name), "%d", fd); - rv = proc_fill_cache(filp, dirent, filldir, - name, len, instantiate, p, - (void *)(unsigned long)fd); - if (rv < 0) - goto out_fd_loop; - rcu_read_lock(); - } - rcu_read_unlock(); -out_fd_loop: - put_files_struct(files); - } -out: - put_task_struct(p); -out_no_task: - return retval; -} - -static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); -} - -static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) -{ - return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); -} - -static ssize_t proc_fdinfo_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - char tmp[PROC_FDINFO_MAX]; - int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp); - if (!err) - err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp)); - return err; -} - -static const struct file_operations proc_fdinfo_file_operations = { - .open = nonseekable_open, - .read = proc_fdinfo_read, - .llseek = no_llseek, -}; - -static const struct file_operations proc_fd_operations = { - .read = generic_read_dir, - .readdir = proc_readfd, - .llseek = default_llseek, -}; - #ifdef CONFIG_CHECKPOINT_RESTORE /* @@ -2337,82 +2029,6 @@ static const struct file_operations proc_map_files_operations = { #endif /* CONFIG_CHECKPOINT_RESTORE */ -/* - * /proc/pid/fd needs a special permission handler so that a process can still - * access /proc/self/fd after it has executed a setuid(). - */ -static int proc_fd_permission(struct inode *inode, int mask) -{ - int rv = generic_permission(inode, mask); - if (rv == 0) - return 0; - if (task_pid(current) == proc_pid(inode)) - rv = 0; - return rv; -} - -/* - * proc directories can do almost nothing.. - */ -static const struct inode_operations proc_fd_inode_operations = { - .lookup = proc_lookupfd, - .permission = proc_fd_permission, - .setattr = proc_setattr, -}; - -static struct dentry *proc_fdinfo_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - unsigned fd = (unsigned long)ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error = ERR_PTR(-ENOENT); - - inode = proc_pid_make_inode(dir->i_sb, task); - if (!inode) - goto out; - ei = PROC_I(inode); - ei->fd = fd; - inode->i_mode = S_IFREG | S_IRUSR; - inode->i_fop = &proc_fdinfo_file_operations; - d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (tid_fd_revalidate(dentry, 0)) - error = NULL; - - out: - return error; -} - -static struct dentry *proc_lookupfdinfo(struct inode *dir, - struct dentry *dentry, - unsigned int flags) -{ - return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); -} - -static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) -{ - return proc_readfd_common(filp, dirent, filldir, - proc_fdinfo_instantiate); -} - -static const struct file_operations proc_fdinfo_operations = { - .read = generic_read_dir, - .readdir = proc_readfdinfo, - .llseek = default_llseek, -}; - -/* - * proc directories can do almost nothing.. - */ -static const struct inode_operations proc_fdinfo_inode_operations = { - .lookup = proc_lookupfdinfo, - .setattr = proc_setattr, -}; - - static struct dentry *proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { diff --git a/fs/proc/fd.c b/fs/proc/fd.c new file mode 100644 index 00000000000..1b0f932b4bd --- /dev/null +++ b/fs/proc/fd.c @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" +#include "fd.h" + +#define PROC_FDINFO_MAX 64 + +static int proc_fd_info(struct inode *inode, struct path *path, char *info) +{ + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; + int fd = proc_fd(inode); + struct file *file; + + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + if (files) { + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (close_on_exec(fd, fdt)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); + } + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + return -ENOENT; +} + +static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct files_struct *files; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int fd; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + fd = proc_fd(inode); + + if (task) { + files = get_files_struct(task); + if (files) { + struct file *file; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + unsigned f_mode = file->f_mode; + + rcu_read_unlock(); + put_files_struct(files); + + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + } + + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } + + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + rcu_read_unlock(); + put_files_struct(files); + } + put_task_struct(task); + } + + d_drop(dentry); + return 0; +} + +static const struct dentry_operations tid_fd_dentry_operations = { + .d_revalidate = tid_fd_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_fd_link(struct dentry *dentry, struct path *path) +{ + return proc_fd_info(dentry->d_inode, path, NULL); +} + +static struct dentry * +proc_fd_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + unsigned fd = (unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + ei->fd = fd; + + inode->i_mode = S_IFLNK; + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + + ei->op.proc_get_link = proc_fd_link; + + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, 0)) + error = NULL; + out: + return error; +} + +static struct dentry *proc_lookupfd_common(struct inode *dir, + struct dentry *dentry, + instantiate_t instantiate) +{ + struct task_struct *task = get_proc_task(dir); + struct dentry *result = ERR_PTR(-ENOENT); + unsigned fd = name_to_int(dentry); + + if (!task) + goto out_no_task; + if (fd == ~0U) + goto out; + + result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); +out: + put_task_struct(task); +out_no_task: + return result; +} + +static int proc_readfd_common(struct file * filp, void * dirent, + filldir_t filldir, instantiate_t instantiate) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + struct files_struct *files; + unsigned int fd, ino; + int retval; + + retval = -ENOENT; + if (!p) + goto out_no_task; + retval = 0; + + fd = filp->f_pos; + switch (fd) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + default: + files = get_files_struct(p); + if (!files) + goto out; + rcu_read_lock(); + for (fd = filp->f_pos - 2; + fd < files_fdtable(files)->max_fds; + fd++, filp->f_pos++) { + char name[PROC_NUMBUF]; + int len; + int rv; + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + len = snprintf(name, sizeof(name), "%d", fd); + rv = proc_fill_cache(filp, dirent, filldir, + name, len, instantiate, p, + (void *)(unsigned long)fd); + if (rv < 0) + goto out_fd_loop; + rcu_read_lock(); + } + rcu_read_unlock(); +out_fd_loop: + put_files_struct(files); + } +out: + put_task_struct(p); +out_no_task: + return retval; +} + +static ssize_t proc_fdinfo_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + char tmp[PROC_FDINFO_MAX]; + int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp); + if (!err) + err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp)); + return err; +} + +static const struct file_operations proc_fdinfo_file_operations = { + .open = nonseekable_open, + .read = proc_fdinfo_read, + .llseek = no_llseek, +}; + +static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) +{ + return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); +} + +const struct file_operations proc_fd_operations = { + .read = generic_read_dir, + .readdir = proc_readfd, + .llseek = default_llseek, +}; + +static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); +} + +/* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). + */ +int proc_fd_permission(struct inode *inode, int mask) +{ + int rv = generic_permission(inode, mask); + if (rv == 0) + return 0; + if (task_pid(current) == proc_pid(inode)) + rv = 0; + return rv; +} + +const struct inode_operations proc_fd_inode_operations = { + .lookup = proc_lookupfd, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static struct dentry * +proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + unsigned fd = (unsigned long)ptr; + struct proc_inode *ei; + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + ei->fd = fd; + + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_fop = &proc_fdinfo_file_operations; + + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, 0)) + error = NULL; + out: + return error; +} + +static struct dentry * +proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); +} + +static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) +{ + return proc_readfd_common(filp, dirent, filldir, + proc_fdinfo_instantiate); +} + +const struct inode_operations proc_fdinfo_inode_operations = { + .lookup = proc_lookupfdinfo, + .setattr = proc_setattr, +}; + +const struct file_operations proc_fdinfo_operations = { + .read = generic_read_dir, + .readdir = proc_readfdinfo, + .llseek = default_llseek, +}; diff --git a/fs/proc/fd.h b/fs/proc/fd.h new file mode 100644 index 00000000000..cbb1d47deda --- /dev/null +++ b/fs/proc/fd.h @@ -0,0 +1,14 @@ +#ifndef __PROCFS_FD_H__ +#define __PROCFS_FD_H__ + +#include + +extern const struct file_operations proc_fd_operations; +extern const struct inode_operations proc_fd_inode_operations; + +extern const struct file_operations proc_fdinfo_operations; +extern const struct inode_operations proc_fdinfo_inode_operations; + +extern int proc_fd_permission(struct inode *inode, int mask); + +#endif /* __PROCFS_FD_H__ */ diff --git a/fs/proc/internal.h b/fs/proc/internal.h index e1167a1c912..67925a7bd8c 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ +#include #include struct ctl_table_header; @@ -65,6 +66,7 @@ extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; +extern const struct inode_operations proc_pid_link_inode_operations; struct proc_maps_private { struct pid *pid; @@ -91,6 +93,52 @@ static inline int proc_fd(struct inode *inode) return PROC_I(inode)->fd; } +static inline int task_dumpable(struct task_struct *task) +{ + int dumpable = 0; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) + dumpable = get_dumpable(mm); + task_unlock(task); + if(dumpable == 1) + return 1; + return 0; +} + +static inline int pid_delete_dentry(const struct dentry * dentry) +{ + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + +static inline unsigned name_to_int(struct dentry *dentry) +{ + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} + struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, struct dentry *dentry); int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, -- cgit v1.2.3 From ddd3e0771bc7b869c550687c204e21f0155d5496 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 26 Aug 2012 18:28:20 +0400 Subject: procfs: Convert /proc/pid/fdinfo/ handling routines to seq-file v2 This patch converts /proc/pid/fdinfo/ handling routines to seq-file which is needed to extend seq operations and plug in auxiliary fdinfo provides from subsystems like eventfd/eventpoll/fsnotify. Note the proc_fd_link no longer call for proc_fd_info, simply because the guts of proc_fd_info() got merged into ->show() of that seq_file Signed-off-by: Al Viro --- fs/proc/fd.c | 112 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 64 insertions(+), 48 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 1b0f932b4bd..9cef449c0f7 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -6,61 +6,68 @@ #include #include #include +#include +#include #include #include "internal.h" #include "fd.h" -#define PROC_FDINFO_MAX 64 - -static int proc_fd_info(struct inode *inode, struct path *path, char *info) +static int seq_show(struct seq_file *m, void *v) { - struct task_struct *task = get_proc_task(inode); struct files_struct *files = NULL; - int fd = proc_fd(inode); - struct file *file; + int f_flags = 0, ret = -ENOENT; + struct file *file = NULL; + struct task_struct *task; + + task = get_proc_task(m->private); + if (!task) + return -ENOENT; + + files = get_files_struct(task); + put_task_struct(task); - if (task) { - files = get_files_struct(task); - put_task_struct(task); - } if (files) { - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ + int fd = proc_fd(m->private); + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { - unsigned int f_flags; - struct fdtable *fdt; + struct fdtable *fdt = files_fdtable(files); - fdt = files_fdtable(files); f_flags = file->f_flags & ~O_CLOEXEC; if (close_on_exec(fd, fdt)) f_flags |= O_CLOEXEC; - if (path) { - *path = file->f_path; - path_get(&file->f_path); - } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - f_flags); - spin_unlock(&files->file_lock); - put_files_struct(files); - return 0; + get_file(file); + ret = 0; } spin_unlock(&files->file_lock); put_files_struct(files); } - return -ENOENT; + + if (!ret) { + seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", + (long long)file->f_pos, f_flags); + fput(file); + } + + return ret; } +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + return single_open(file, seq_show, inode); +} + +static const struct file_operations proc_fdinfo_file_operations = { + .open = seq_fdinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) { struct files_struct *files; @@ -130,7 +137,32 @@ static const struct dentry_operations tid_fd_dentry_operations = { static int proc_fd_link(struct dentry *dentry, struct path *path) { - return proc_fd_info(dentry->d_inode, path, NULL); + struct files_struct *files = NULL; + struct task_struct *task; + int ret = -ENOENT; + + task = get_proc_task(dentry->d_inode); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + + if (files) { + int fd = proc_fd(dentry->d_inode); + struct file *fd_file; + + spin_lock(&files->file_lock); + fd_file = fcheck_files(files, fd); + if (fd_file) { + *path = fd_file->f_path; + path_get(&fd_file->f_path); + ret = 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + + return ret; } static struct dentry * @@ -245,22 +277,6 @@ out_no_task: return retval; } -static ssize_t proc_fdinfo_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - char tmp[PROC_FDINFO_MAX]; - int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp); - if (!err) - err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp)); - return err; -} - -static const struct file_operations proc_fdinfo_file_operations = { - .open = nonseekable_open, - .read = proc_fdinfo_read, - .llseek = no_llseek, -}; - static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) { return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); -- cgit v1.2.3 From c6f3d81115989e274c42a852222b80d2e14ced6f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Aug 2012 11:01:04 -0400 Subject: don't leak O_CLOEXEC into ->f_flags Signed-off-by: Al Viro --- fs/proc/fd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 9cef449c0f7..f28a875f877 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -36,7 +36,7 @@ static int seq_show(struct seq_file *m, void *v) if (file) { struct fdtable *fdt = files_fdtable(files); - f_flags = file->f_flags & ~O_CLOEXEC; + f_flags = file->f_flags; if (close_on_exec(fd, fdt)) f_flags |= O_CLOEXEC; -- cgit v1.2.3 From cb0942b81249798e15c3f04eee2946ef543e8115 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Aug 2012 14:48:26 -0400 Subject: make get_file() return its argument simplifies a bunch of callers... Signed-off-by: Al Viro --- fs/proc/base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index b55c3bb298e..f1e8438d21b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1979,8 +1979,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) if (++pos <= filp->f_pos) continue; - get_file(vma->vm_file); - info.file = vma->vm_file; + info.file = get_file(vma->vm_file); info.len = snprintf(info.name, sizeof(info.name), "%lx-%lx", vma->vm_start, vma->vm_end); -- cgit v1.2.3 From 7b540d0646ce122f0ba4520412be91e530719742 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Aug 2012 14:55:26 -0400 Subject: proc_map_files_readdir(): don't bother with grabbing files all we need is their ->f_mode, so just collect _that_ Signed-off-by: Al Viro --- fs/proc/base.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index f1e8438d21b..df18db61d6d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1805,7 +1805,7 @@ out: } struct map_files_info { - struct file *file; + fmode_t mode; unsigned long len; unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; @@ -1814,13 +1814,10 @@ static struct dentry * proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - const struct file *file = ptr; + fmode_t mode = (fmode_t)(unsigned long)ptr; struct proc_inode *ei; struct inode *inode; - if (!file) - return ERR_PTR(-ENOENT); - inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) return ERR_PTR(-ENOENT); @@ -1832,9 +1829,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, inode->i_size = 64; inode->i_mode = S_IFLNK; - if (file->f_mode & FMODE_READ) + if (mode & FMODE_READ) inode->i_mode |= S_IRUSR; - if (file->f_mode & FMODE_WRITE) + if (mode & FMODE_WRITE) inode->i_mode |= S_IWUSR; d_set_d_op(dentry, &tid_map_files_dentry_operations); @@ -1878,7 +1875,8 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!vma) goto out_no_vma; - result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); + result = proc_map_files_instantiate(dir, dentry, task, + (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: up_read(&mm->mmap_sem); @@ -1979,7 +1977,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) if (++pos <= filp->f_pos) continue; - info.file = get_file(vma->vm_file); + info.mode = vma->vm_file->f_mode; info.len = snprintf(info.name, sizeof(info.name), "%lx-%lx", vma->vm_start, vma->vm_end); @@ -1994,19 +1992,11 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) ret = proc_fill_cache(filp, dirent, filldir, p->name, p->len, proc_map_files_instantiate, - task, p->file); + task, + (void *)(unsigned long)p->mode); if (ret) break; filp->f_pos++; - fput(p->file); - } - for (; i < nr_files; i++) { - /* - * In case of error don't forget - * to put rest of file refs. - */ - p = flex_array_get(fa, i); - fput(p->file); } if (fa) flex_array_free(fa); -- cgit v1.2.3 From 0f4cfb2e4e7a7e4e97a3e90e2ba1062f07fb2cb1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 4 Oct 2012 17:15:27 -0700 Subject: coredump: use SUID_DUMPABLE_ENABLED rather than hardcoded 1 Cosmetic. Change setup_new_exec() and task_dumpable() to use SUID_DUMPABLE_ENABLED for /bin/grep. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 67925a7bd8c..cceaab07ad5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -103,7 +103,7 @@ static inline int task_dumpable(struct task_struct *task) if (mm) dumpable = get_dumpable(mm); task_unlock(task); - if(dumpable == 1) + if (dumpable == SUID_DUMPABLE_ENABLED) return 1; return 0; } -- cgit v1.2.3 From 620727506dc6da0562fa4f6950dedb8a51bd8237 Mon Sep 17 00:00:00 2001 From: yan Date: Thu, 4 Oct 2012 17:15:38 -0700 Subject: proc: return -ENOMEM when inode allocation failed If proc_get_inode() returns NULL then presumably it encountered memory exhaustion. proc_lookup_de() should return -ENOMEM in this case, not -EINVAL. Signed-off-by: yan Cc: Ryan Mallon Cc: Cong Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index b3647fe6a60..9e8f6316430 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -427,7 +427,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { pde_get(de); spin_unlock(&proc_subdir_lock); - error = -EINVAL; + error = -ENOMEM; inode = proc_get_inode(dir->i_sb, de); goto out_unlock; } -- cgit v1.2.3 From 0e06936057674ef3f2bd0b398a32ddc512642992 Mon Sep 17 00:00:00 2001 From: yan Date: Thu, 4 Oct 2012 17:15:41 -0700 Subject: proc: no need to initialize proc_inode->fd in proc_get_inode() proc_get_inode() obtains the inode via a call to iget_locked(). iget_locked() calls alloc_inode() which will call proc_alloc_inode() which clears proc_inode.fd, so there is no need to clear this field in proc_get_inode(). If iget_locked() instead found the inode via find_inode_fast(), that inode will not have I_NEW set so this change has no effect. Signed-off-by: yan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7ac817b64a7..3b22bbdee9e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -450,7 +450,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) return NULL; if (inode->i_state & I_NEW) { inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - PROC_I(inode)->fd = 0; PROC_I(inode)->pde = de; if (de->mode) { -- cgit v1.2.3 From 17baa2a2c40f2f0330d25819b589a515d36b2d40 Mon Sep 17 00:00:00 2001 From: yan Date: Thu, 4 Oct 2012 17:15:43 -0700 Subject: proc: use kzalloc instead of kmalloc and memset Part of the memory will be written twice after this change, but that should be negligible. [akpm@linux-foundation.org: fix __proc_create() coding-style issues, remove unneeded zero-initialisations] Signed-off-by: yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 9e8f6316430..0d80cef4cfb 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -605,7 +605,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, unsigned int len; /* make sure name is valid */ - if (!name || !strlen(name)) goto out; + if (!name || !strlen(name)) + goto out; if (xlate_proc_name(name, parent, &fn) != 0) goto out; @@ -616,20 +617,18 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, len = strlen(fn); - ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); - if (!ent) goto out; + ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); + if (!ent) + goto out; - memset(ent, 0, sizeof(struct proc_dir_entry)); memcpy(ent->name, fn, len + 1); ent->namelen = len; ent->mode = mode; ent->nlink = nlink; atomic_set(&ent->count, 1); - ent->pde_users = 0; spin_lock_init(&ent->pde_unload_lock); - ent->pde_unload_completion = NULL; INIT_LIST_HEAD(&ent->pde_openers); - out: +out: return ent; } -- cgit v1.2.3 From ab4a1f247003ad146c1a1067b2ba043922d0dff2 Mon Sep 17 00:00:00 2001 From: Prasad Joshi Date: Thu, 4 Oct 2012 17:15:45 -0700 Subject: proc_sysctl.c: use BUG_ON instead of BUG The use of if (!head) BUG(); can be replaced with the single line BUG_ON(!head). Signed-off-by: Prasad Joshi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index eb7cc91b725..dcd56f84db7 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -266,8 +266,7 @@ void sysctl_head_put(struct ctl_table_header *head) static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) { - if (!head) - BUG(); + BUG_ON(!head); spin_lock(&sysctl_lock); if (!use_table(head)) head = ERR_PTR(-ENOENT); -- cgit v1.2.3 From 9fb88442105f5fce96ea343927500d6307c8c550 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Thu, 4 Oct 2012 17:15:46 -0700 Subject: fs/proc/root.c: use NULL instead of 0 for pointer This cleanup also fixes the following sparse warning: fs/proc/root.c:64:45: warning: Using plain integer as NULL pointer Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/root.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/root.c b/fs/proc/root.c index 9a2d9fd7cad..9889a92d2e0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -61,7 +61,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) if (!*p) continue; - args[0].to = args[0].from = 0; + args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); switch (token) { case Opt_gid: -- cgit v1.2.3 From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Mon, 8 Oct 2012 16:29:02 -0700 Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA, currently it lost original meaning but still has some effects: | effect | alternative flags -+------------------------+--------------------------------------------- 1| account as reserved_vm | VM_IO 2| skip in core dump | VM_IO, VM_DONTDUMP 3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP 4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP This patch removes reserved_vm counter from mm_struct. Seems like nobody cares about it, it does not exported into userspace directly, it only reduces total_vm showed in proc. Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP. remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP. remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP. [akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup] Signed-off-by: Konstantin Khlebnikov Cc: Alexander Viro Cc: Carsten Otte Cc: Chris Metcalf Cc: Cyrill Gorcunov Cc: Eric Paris Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Morris Cc: Jason Baron Cc: Kentaro Takeda Cc: Matt Helsley Cc: Nick Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Robert Richter Cc: Suresh Siddha Cc: Tetsuo Handa Cc: Venkatesh Pallipadi Acked-by: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4540b8f76f1..79827ce03e3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPTE:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), - (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), -- cgit v1.2.3 From 01dc52ebdf472f77cca623ca693ca24cfc0f1bbe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 Oct 2012 16:29:30 -0700 Subject: oom: remove deprecated oom_adj The deprecated /proc//oom_adj is scheduled for removal this month. Signed-off-by: Davidlohr Bueso Acked-by: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 117 +-------------------------------------------------------- 1 file changed, 1 insertion(+), 116 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index d295af99367..ef5c84be66f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -873,111 +873,6 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; -static ssize_t oom_adjust_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - char buffer[PROC_NUMBUF]; - size_t len; - int oom_adjust = OOM_DISABLE; - unsigned long flags; - - if (!task) - return -ESRCH; - - if (lock_task_sighand(task, &flags)) { - oom_adjust = task->signal->oom_adj; - unlock_task_sighand(task, &flags); - } - - put_task_struct(task); - - len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); - - return simple_read_from_buffer(buf, count, ppos, buffer, len); -} - -static ssize_t oom_adjust_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task; - char buffer[PROC_NUMBUF]; - int oom_adjust; - unsigned long flags; - int err; - - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) { - err = -EFAULT; - goto out; - } - - err = kstrtoint(strstrip(buffer), 0, &oom_adjust); - if (err) - goto out; - if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && - oom_adjust != OOM_DISABLE) { - err = -EINVAL; - goto out; - } - - task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - - if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - /* - * Warn that /proc/pid/oom_adj is deprecated, see - * Documentation/feature-removal-schedule.txt. - */ - printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", - current->comm, task_pid_nr(current), task_pid_nr(task), - task_pid_nr(task)); - task->signal->oom_adj = oom_adjust; - /* - * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum - * value is always attainable. - */ - if (task->signal->oom_adj == OOM_ADJUST_MAX) - task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; - else - task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / - -OOM_DISABLE; - trace_oom_score_adj_update(task); -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); -out: - return err < 0 ? err : count; -} - -static const struct file_operations proc_oom_adjust_operations = { - .read = oom_adjust_read, - .write = oom_adjust_write, - .llseek = generic_file_llseek, -}; - static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -1051,15 +946,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; trace_oom_score_adj_update(task); - /* - * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is - * always attainable. - */ - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - task->signal->oom_adj = OOM_DISABLE; - else - task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / - OOM_SCORE_ADJ_MAX; + err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -2710,7 +2597,6 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -3077,7 +2963,6 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), -- cgit v1.2.3 From 4c199a93a2d36b277a9fd209a0f2793f8460a215 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:32 -0700 Subject: rbtree: empty nodes have no color Empty nodes have no color. We can make use of this property to simplify the code emitted by the RB_EMPTY_NODE and RB_CLEAR_NODE macros. Also, we can get rid of the rb_init_node function which had been introduced by commit 88d19cf37952 ("timers: Add rb_init_node() to allow for stack allocated rb nodes") to avoid some issue with the empty node's color not being initialized. I'm not sure what the RB_EMPTY_NODE checks in rb_prev() / rb_next() are doing there, though. axboe introduced them in commit 10fd48f2376d ("rbtree: fixed reversed RB_EMPTY_NODE and rb_next/prev"). The way I see it, the 'empty node' abstraction is only used by rbtree users to flag nodes that they haven't inserted in any rbtree, so asking the predecessor or successor of such nodes doesn't make any sense. One final rb_init_node() caller was recently added in sysctl code to implement faster sysctl name lookups. This code doesn't make use of RB_EMPTY_NODE at all, and from what I could see it only called rb_init_node() under the mistaken assumption that such initialization was required before node insertion. [sfr@canb.auug.org.au: fix net/ceph/osd_client.c build] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Cc: John Stultz Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dcd56f84db7..fddc5072963 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -168,10 +168,8 @@ static void init_header(struct ctl_table_header *head, head->node = node; if (node) { struct ctl_table *entry; - for (entry = table; entry->procname; entry++, node++) { - rb_init_node(&node->node); + for (entry = table; entry->procname; entry++, node++) node->header = head; - } } } -- cgit v1.2.3 From ea5272f5c94fb2ee62f4f15a5b88eef6184cd506 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:35 -0700 Subject: rbtree: fix incorrect rbtree node insertion in fs/proc/proc_sysctl.c The recently added code to use rbtrees in sysctl did not follow the proper rbtree interface on insertion - it was calling rb_link_node() which inserts a new node into the binary tree, but missed the call to rb_insert_color() which properly balances the rbtree and establishes all expected rbtree invariants. I found out about this only because faulty commit also used rb_init_node(), which I am removing within this patchset. But I think it's an easy mistake to make, and it makes me wonder if we should change the rbtree API so that insertions would be done with a single rb_insert() call (even if its implementation could still inline the rb_link_node() part and call a private __rb_insert_color function to do the rebalancing). Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fddc5072963..a781bdf0669 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) } rb_link_node(node, parent, p); + rb_insert_color(node, &head->parent->root); return 0; } -- cgit v1.2.3 From 7a71932d5676b7410ab64d149bad8bde6b0d8632 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 8 Oct 2012 16:33:47 -0700 Subject: kpageflags: fix wrong KPF_THP on non-huge compound pages KPF_THP can be set on non-huge compound pages (like slab pages or pages allocated by drivers with __GFP_COMP) because PageTransCompound only checks PG_head and PG_tail. Obviously this is a bug and breaks user space applications which look for thp via /proc/kpageflags. This patch rules out setting KPF_THP wrongly by additionally checking PageLRU on the head pages. Signed-off-by: Naoya Horiguchi Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Reviewed-by: Fengguang Wu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/page.c b/fs/proc/page.c index 7fcd0d60a96..b8730d9ebae 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_COMPOUND_TAIL; if (PageHuge(page)) u |= 1 << KPF_HUGE; - else if (PageTransCompound(page)) + /* + * PageTransCompound can be true for non-huge compound pages (slab + * pages or pages allocated by drivers with __GFP_COMP) because it + * just checks PG_head/PG_tail, so we need to check PageLRU to make + * sure a given page is a thp, not a non-huge compound page. + */ + else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) u |= 1 << KPF_THP; /* -- cgit v1.2.3 From 7386cdbf2f57ea8cff3c9fde93f206e58b9fe13f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 10 Oct 2012 11:51:09 +0530 Subject: nohz: Fix idle ticks in cpu summary line of /proc/stat Git commit 09a1d34f8535ecf9 "nohz: Make idle/iowait counter update conditional" introduced a bug in regard to cpu hotplug. The effect is that the number of idle ticks in the cpu summary line in /proc/stat is still counting ticks for offline cpus. Reproduction is easy, just start a workload that keeps all cpus busy, switch off one or more cpus and then watch the idle field in top. On a dual-core with one cpu 100% busy and one offline cpu you will get something like this: %Cpu(s): 48.7 us, 1.3 sy, 0.0 ni, 50.0 id, 0.0 wa, 0.0 hi, 0.0 si, %0.0 st The problem is that an offline cpu still has ts->idle_active == 1. To fix this we should make sure that the cpu is online when calling get_cpu_idle_time_us and get_cpu_iowait_time_us. [Srivatsa: Rebased to current mainline] Reported-by: Martin Schwidefsky Signed-off-by: Michal Hocko Reviewed-by: Srivatsa S. Bhat Signed-off-by: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/20121010061820.8999.57245.stgit@srivatsabhat.in.ibm.com Cc: deepthi@linux.vnet.ibm.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- fs/proc/stat.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 64c3b317236..e296572c73e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -45,10 +45,13 @@ static cputime64_t get_iowait_time(int cpu) static u64 get_idle_time(int cpu) { - u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL); + u64 idle, idle_time = -1ULL; + + if (cpu_online(cpu)) + idle_time = get_cpu_idle_time_us(cpu, NULL); if (idle_time == -1ULL) - /* !NO_HZ so we can rely on cpustat.idle */ + /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; else idle = usecs_to_cputime64(idle_time); @@ -58,10 +61,13 @@ static u64 get_idle_time(int cpu) static u64 get_iowait_time(int cpu) { - u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL); + u64 iowait, iowait_time = -1ULL; + + if (cpu_online(cpu)) + iowait_time = get_cpu_iowait_time_us(cpu, NULL); if (iowait_time == -1ULL) - /* !NO_HZ so we can rely on cpustat.iowait */ + /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; else iowait = usecs_to_cputime64(iowait_time); -- cgit v1.2.3 From f81700bd831efcd12eb7f0e66b24b16c2ad00a32 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Oct 2012 16:43:21 -0400 Subject: procfs: don't need a PATH_MAX allocation to hold a string representation of an int Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/proc/base.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index ef5c84be66f..144a96732dd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2258,7 +2258,8 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) pid_t tgid = task_tgid_nr_ns(current, ns); char *name = ERR_PTR(-ENOENT); if (tgid) { - name = __getname(); + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); if (!name) name = ERR_PTR(-ENOMEM); else @@ -2273,7 +2274,7 @@ static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, { char *s = nd_get_link(nd); if (!IS_ERR(s)) - __putname(s); + kfree(s); } static const struct inode_operations proc_self_inode_operations = { -- cgit v1.2.3 From 32f8516a8c733d281faa9f6666b509035246505c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Oct 2012 17:31:23 -0700 Subject: mm, mempolicy: fix printing stack contents in numa_maps When reading /proc/pid/numa_maps, it's possible to return the contents of the stack where the mempolicy string should be printed if the policy gets freed from beneath us. This happens because mpol_to_str() may return an error the stack-allocated buffer is then printed without ever being stored. There are two possible error conditions in mpol_to_str(): - if the buffer allocated is insufficient for the string to be stored, and - if the mempolicy has an invalid mode. The first error condition is not triggered in any of the callers to mpol_to_str(): at least 50 bytes is always allocated on the stack and this is sufficient for the string to be written. A future patch should convert this into BUILD_BUG_ON() since we know the maximum strlen possible, but that's not -rc material. The second error condition is possible if a race occurs in dropping a reference to a task's mempolicy causing it to be freed during the read(). The slab poison value is then used for the mode and mpol_to_str() returns -EINVAL. This race is only possible because get_vma_policy() believes that mm->mmap_sem protects task->mempolicy, which isn't true. The exit path does not hold mm->mmap_sem when dropping the reference or setting task->mempolicy to NULL: it uses task_lock(task) instead. Thus, it's required for the caller of a task mempolicy to hold task_lock(task) while grabbing the mempolicy and reading it. Callers with a vma policy store their mempolicy earlier and can simply increment the reference count so it's guaranteed not to be freed. Reported-by: Dave Jones Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 79827ce03e3..14df8806ff2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1158,6 +1158,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; + struct task_struct *task = proc_priv->task; struct mm_struct *mm = vma->vm_mm; struct mm_walk walk = {}; struct mempolicy *pol; @@ -1177,9 +1178,11 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.private = md; walk.mm = mm; - pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); + task_lock(task); + pol = get_vma_policy(task, vma, vma->vm_start); mpol_to_str(buffer, sizeof(buffer), pol, 0); mpol_cond_put(pol); + task_unlock(task); seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1189,7 +1192,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_printf(m, " heap"); } else { - pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid); + pid_t tid = vm_is_stack(task, vma, is_pid); if (tid != 0) { /* * Thread stack in /proc/PID/task/TID/maps or -- cgit v1.2.3 From 9e7814404b77c3e8920bee4277162bf3a7460505 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 19 Oct 2012 17:00:55 +0900 Subject: hold task->mempolicy while numa_maps scans. /proc//numa_maps scans vma and show mempolicy under mmap_sem. It sometimes accesses task->mempolicy which can be freed without mmap_sem and numa_maps can show some garbage while scanning. This patch tries to take reference count of task->mempolicy at reading numa_maps before calling get_vma_policy(). By this, task->mempolicy will not be freed until numa_maps reaches its end. V2->v3 - updated comments to be more verbose. - removed task_lock() in numa_maps code. V1->V2 - access task->mempolicy only once and remember it. Becase kernel/exit.c can overwrite it. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: David Rientjes Acked-by: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- fs/proc/internal.h | 4 ++++ fs/proc/task_mmu.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index cceaab07ad5..43973b084ab 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -12,6 +12,7 @@ #include #include struct ctl_table_header; +struct mempolicy; extern struct proc_dir_entry proc_root; #ifdef CONFIG_PROC_SYSCTL @@ -74,6 +75,9 @@ struct proc_maps_private { #ifdef CONFIG_MMU struct vm_area_struct *tail_vma; #endif +#ifdef CONFIG_NUMA + struct mempolicy *task_mempolicy; +#endif }; void proc_init_inodecache(void); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 14df8806ff2..90c63f9392a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -90,10 +90,55 @@ static void pad_len_spaces(struct seq_file *m, int len) seq_printf(m, "%*c", len, ' '); } +#ifdef CONFIG_NUMA +/* + * These functions are for numa_maps but called in generic **maps seq_file + * ->start(), ->stop() ops. + * + * numa_maps scans all vmas under mmap_sem and checks their mempolicy. + * Each mempolicy object is controlled by reference counting. The problem here + * is how to avoid accessing dead mempolicy object. + * + * Because we're holding mmap_sem while reading seq_file, it's safe to access + * each vma's mempolicy, no vma objects will never drop refs to mempolicy. + * + * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy + * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). + * So, without task_lock(), we cannot trust get_vma_policy() because we cannot + * gurantee the task never exits under us. But taking task_lock() around + * get_vma_plicy() causes lock order problem. + * + * To access task->mempolicy without lock, we hold a reference count of an + * object pointed by task->mempolicy and remember it. This will guarantee + * that task->mempolicy points to an alive object or NULL in numa_maps accesses. + */ +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ + struct task_struct *task = priv->task; + + task_lock(task); + priv->task_mempolicy = task->mempolicy; + mpol_get(priv->task_mempolicy); + task_unlock(task); +} +static void release_task_mempolicy(struct proc_maps_private *priv) +{ + mpol_put(priv->task_mempolicy); +} +#else +static void hold_task_mempolicy(struct proc_maps_private *priv) +{ +} +static void release_task_mempolicy(struct proc_maps_private *priv) +{ +} +#endif + static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { if (vma && vma != priv->tail_vma) { struct mm_struct *mm = vma->vm_mm; + release_task_mempolicy(priv); up_read(&mm->mmap_sem); mmput(mm); } @@ -132,7 +177,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) tail_vma = get_gate_vma(priv->task->mm); priv->tail_vma = tail_vma; - + hold_task_mempolicy(priv); /* Start with last addr hint */ vma = find_vma(mm, last_addr); if (last_addr && vma) { @@ -159,6 +204,7 @@ out: if (vma) return vma; + release_task_mempolicy(priv); /* End of vmas has been reached */ m->version = (tail_vma != NULL)? 0: -1UL; up_read(&mm->mmap_sem); @@ -1178,11 +1224,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.private = md; walk.mm = mm; - task_lock(task); pol = get_vma_policy(task, vma, vma->vm_start); mpol_to_str(buffer, sizeof(buffer), pol, 0); mpol_cond_put(pol); - task_unlock(task); seq_printf(m, "%08lx %s", vma->vm_start, buffer); -- cgit v1.2.3 From 5258f386ea4e8454bc801fb443e8a4217da1947c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 28 Oct 2012 12:19:23 -0700 Subject: sched/autogroup: Fix crash on reboot when autogroup is disabled Due to these two commits: 8323f26ce342 sched: Fix race in task_group() 800d4d30c8f2 sched, autogroup: Stop going ahead if autogroup is disabled ... autogroup scheduling's dynamic knobs are wrecked. With both patches applied, all you have to do to crash a box is disable autogroup during boot up, then reboot.. boom, NULL pointer dereference due to 800d4d30 not allowing autogroup to move things, and 8323f26ce making that the only way to switch runqueues. Remove most of the (dysfunctional) knobs and turn the remaining sched_autogroup_enabled knob readonly. If the user fiddles with cgroups hereafter, once tasks are moved, autogroup won't mess with them again unless they call setsid(). No knobs, no glitz, nada, just a cute little thing folks can turn on if they don't want to muck about with cgroups and/or systemd. Signed-off-by: Mike Galbraith Cc: Xiaotian Feng Cc: Peter Zijlstra Cc: Xiaotian Feng Cc: Linus Torvalds Cc: Andrew Morton Cc: Oleg Nesterov Cc: # v3.6 Link: http://lkml.kernel.org/r/1351451963.4999.8.camel@maggy.simpson.net Signed-off-by: Ingo Molnar --- fs/proc/base.c | 78 ---------------------------------------------------------- 1 file changed, 78 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1b6c84cbdb7..bb1d9623bad 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1271,81 +1271,6 @@ static const struct file_operations proc_pid_sched_operations = { #endif -#ifdef CONFIG_SCHED_AUTOGROUP -/* - * Print out autogroup related information: - */ -static int sched_autogroup_show(struct seq_file *m, void *v) -{ - struct inode *inode = m->private; - struct task_struct *p; - - p = get_proc_task(inode); - if (!p) - return -ESRCH; - proc_sched_autogroup_show_task(p, m); - - put_task_struct(p); - - return 0; -} - -static ssize_t -sched_autogroup_write(struct file *file, const char __user *buf, - size_t count, loff_t *offset) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct task_struct *p; - char buffer[PROC_NUMBUF]; - int nice; - int err; - - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; - - err = kstrtoint(strstrip(buffer), 0, &nice); - if (err < 0) - return err; - - p = get_proc_task(inode); - if (!p) - return -ESRCH; - - err = proc_sched_autogroup_set_nice(p, nice); - if (err) - count = err; - - put_task_struct(p); - - return count; -} - -static int sched_autogroup_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = single_open(filp, sched_autogroup_show, NULL); - if (!ret) { - struct seq_file *m = filp->private_data; - - m->private = inode; - } - return ret; -} - -static const struct file_operations proc_pid_sched_autogroup_operations = { - .open = sched_autogroup_open, - .read = seq_read, - .write = sched_autogroup_write, - .llseek = seq_lseek, - .release = single_release, -}; - -#endif /* CONFIG_SCHED_AUTOGROUP */ - static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -3035,9 +2960,6 @@ static const struct pid_entry tgid_base_stuff[] = { INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif -#ifdef CONFIG_SCHED_AUTOGROUP - REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK -- cgit v1.2.3 From fa0cbbf145aabbf29c6f28f8a11935c0b0fd86fc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 12 Nov 2012 17:53:04 -0800 Subject: mm, oom: reintroduce /proc/pid/oom_adj This is mostly a revert of 01dc52ebdf47 ("oom: remove deprecated oom_adj") from Davidlohr Bueso. It reintroduces /proc/pid/oom_adj for backwards compatibility with earlier kernels. It simply scales the value linearly when /proc/pid/oom_score_adj is written. The major difference is that its scheduled removal is no longer included in Documentation/feature-removal-schedule.txt. We do warn users with a single printk, though, to suggest the more powerful and supported /proc/pid/oom_score_adj interface. Reported-by: Artem S. Tashkinov Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- fs/proc/base.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 144a96732dd..3c231adf845 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -873,6 +873,113 @@ static const struct file_operations proc_environ_operations = { .release = mem_release, }; +static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_adj = OOM_ADJUST_MIN; + size_t len; + unsigned long flags; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adj; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adj); + if (err) + goto out; + if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && + oom_adj != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (oom_adj == OOM_ADJUST_MAX) + oom_adj = OOM_SCORE_ADJ_MAX; + else + oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + + task->signal->oom_score_adj = oom_adj; + trace_oom_score_adj_update(task); +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_adj_operations = { + .read = oom_adj_read, + .write = oom_adj_write, + .llseek = generic_file_llseek, +}; + static ssize_t oom_score_adj_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -2598,6 +2705,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), @@ -2964,6 +3072,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), -- cgit v1.2.3 From 73f7ef435934e952c1d70d83d69921ea5d1f6bd4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 16 Nov 2012 03:02:58 +0000 Subject: sysctl: Pass useful parameters to sysctl permissions - Current is implicitly avaiable so passing current->nsproxy isn't useful. - The ctl_table_header is needed to find how the sysctl table is connected to the rest of sysctl. - ctl_table_root is avaiable in the ctl_table_header so no need to it. With these changes it becomes possible to write a version of net_sysctl_permission that takes into account the network namespace of the sysctl table, an important feature in extending the user namespace. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- fs/proc/proc_sysctl.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a781bdf0669..701580ddfcc 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -378,12 +378,13 @@ static int test_perm(int mode, int op) return -EACCES; } -static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) +static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) { + struct ctl_table_root *root = head->root; int mode; if (root->permissions) - mode = root->permissions(root, current->nsproxy, table); + mode = root->permissions(head, table); else mode = table->mode; @@ -491,7 +492,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, * and won't be until we finish. */ error = -EPERM; - if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) + if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) goto out; /* if that can happen at all, it should be -EINVAL, not -EISDIR */ @@ -717,7 +718,7 @@ static int proc_sys_permission(struct inode *inode, int mask) if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? -EACCES : 0; else /* Use the permissions on the sysctl table entry */ - error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK); + error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); sysctl_head_finish(head); return error; -- cgit v1.2.3 From e656d8a6f7fdf7612d2f5771f0ddfca9487f59d9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 10 Jul 2010 14:52:49 -0700 Subject: procfs: Use the proc generic infrastructure for proc/self. I had visions at one point of splitting proc into two filesystems. If that had happened proc/self being the the part of proc that actually deals with pids would have been a nice cleanup. As it is proc/self requires a lot of unnecessary infrastructure for a single file. The only user visible change is that a mounted /proc for a pid namespace that is dead now shows a broken proc symlink, instead of being completely invisible. I don't think anyone will notice or care. Signed-off-by: Eric W. Biederman --- fs/proc/Makefile | 1 + fs/proc/base.c | 154 +---------------------------------------------------- fs/proc/internal.h | 1 + fs/proc/root.c | 1 + fs/proc/self.c | 59 ++++++++++++++++++++ 5 files changed, 64 insertions(+), 152 deletions(-) create mode 100644 fs/proc/self.c (limited to 'fs/proc') diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 99349efbbc2..981b0560193 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += uptime.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o +proc-y += self.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/fs/proc/base.c b/fs/proc/base.c index 144a96732dd..cbe454e94af 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2237,146 +2237,6 @@ static const struct file_operations proc_coredump_filter_operations = { }; #endif -/* - * /proc/self: - */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return -ENOENT; - sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); -} - -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (tgid) { - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d", tgid); - } - nd_set_link(nd, name); - return NULL; -} - -static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); -} - -static const struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, - .put_link = proc_self_put_link, -}; - -/* - * proc base - * - * These are the directory entries in the root directory of /proc - * that properly belong to the /proc filesystem, as they describe - * describe something that is process related. - */ -static const struct pid_entry proc_base_stuff[] = { - NOD("self", S_IFLNK|S_IRWXUGO, - &proc_self_inode_operations, NULL, {}), -}; - -static struct dentry *proc_base_instantiate(struct inode *dir, - struct dentry *dentry, struct task_struct *task, const void *ptr) -{ - const struct pid_entry *p = ptr; - struct inode *inode; - struct proc_inode *ei; - struct dentry *error; - - /* Allocate the inode */ - error = ERR_PTR(-ENOMEM); - inode = new_inode(dir->i_sb); - if (!inode) - goto out; - - /* Initialize the inode */ - ei = PROC_I(inode); - inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - - /* - * grab the reference to the task. - */ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) - goto out_iput; - - inode->i_mode = p->mode; - if (S_ISDIR(inode->i_mode)) - set_nlink(inode, 2); - if (S_ISLNK(inode->i_mode)) - inode->i_size = 64; - if (p->iop) - inode->i_op = p->iop; - if (p->fop) - inode->i_fop = p->fop; - ei->op = p->op; - d_add(dentry, inode); - error = NULL; -out: - return error; -out_iput: - iput(inode); - goto out; -} - -static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) -{ - struct dentry *error; - struct task_struct *task = get_proc_task(dir); - const struct pid_entry *p, *last; - - error = ERR_PTR(-ENOENT); - - if (!task) - goto out_no_task; - - /* Lookup the directory entry */ - last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; - for (p = proc_base_stuff; p <= last; p++) { - if (p->len != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, p->name, p->len)) - break; - } - if (p > last) - goto out; - - error = proc_base_instantiate(dir, dentry, task, p); - -out: - put_task_struct(task); -out_no_task: - return error; -} - -static int proc_base_fill_cache(struct file *filp, void *dirent, - filldir_t filldir, struct task_struct *task, const struct pid_entry *p) -{ - return proc_fill_cache(filp, dirent, filldir, p->name, p->len, - proc_base_instantiate, task, p); -} - #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, char *buffer, int whole) { @@ -2767,15 +2627,11 @@ out: struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - struct dentry *result; + struct dentry *result = NULL; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; - result = proc_base_lookup(dir, dentry); - if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) - goto out; - tgid = name_to_int(dentry); if (tgid == ~0U) goto out; @@ -2838,7 +2694,7 @@ retry: return iter; } -#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY) static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, struct tgid_iter iter) @@ -2872,12 +2728,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) if (!reaper) goto out_no_task; - for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { - const struct pid_entry *p = &proc_base_stuff[nr]; - if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) - goto out; - } - ns = filp->f_dentry->d_sb->s_fs_info; iter.task = NULL; iter.tgid = filp->f_pos - TGID_OFFSET; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 43973b084ab..252544c0520 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -15,6 +15,7 @@ struct ctl_table_header; struct mempolicy; extern struct proc_dir_entry proc_root; +extern void proc_self_init(void); #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void sysctl_head_put(struct ctl_table_header *head); diff --git a/fs/proc/root.c b/fs/proc/root.c index 9889a92d2e0..5da984959ed 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -169,6 +169,7 @@ void __init proc_root_init(void) return; } + proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); diff --git a/fs/proc/self.c b/fs/proc/self.c new file mode 100644 index 00000000000..aa5cc3bff14 --- /dev/null +++ b/fs/proc/self.c @@ -0,0 +1,59 @@ +#include +#include +#include + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, +}; + +void __init proc_self_init(void) +{ + struct proc_dir_entry *proc_self_symlink; + mode_t mode; + + mode = S_IFLNK | S_IRWXUGO; + proc_self_symlink = proc_create("self", mode, NULL, NULL ); + proc_self_symlink->proc_iops = &proc_self_inode_operations; +} -- cgit v1.2.3 From ae06c7c83fc6e97ba247a261921c101960f3d28f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 10 Jul 2010 15:23:34 -0700 Subject: procfs: Don't cache a pid in the root inode. Now that we have s_fs_info pointing to our pid namespace the original reason for the proc root inode having a struct pid is gone. Caching a pid in the root inode has led to some complicated code. Now that we don't need the struct pid, just remove it. Signed-off-by: Eric W. Biederman --- fs/proc/base.c | 11 +---------- fs/proc/root.c | 8 -------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index cbe454e94af..6177fc238fd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2714,19 +2714,12 @@ static int fake_filldir(void *buf, const char *name, int namelen, /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int nr; - struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) - goto out_no_task; - nr = filp->f_pos - FIRST_PROCESS_ENTRY; - - reaper = get_proc_task(filp->f_path.dentry->d_inode); - if (!reaper) - goto out_no_task; + goto out; ns = filp->f_dentry->d_sb->s_fs_info; iter.task = NULL; @@ -2747,8 +2740,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) } filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; out: - put_task_struct(reaper); -out_no_task: return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 5da984959ed..13ef6247e7a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -100,7 +100,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, int err; struct super_block *sb; struct pid_namespace *ns; - struct proc_inode *ei; char *options; if (flags & MS_KERNMOUNT) { @@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - ei = PROC_I(sb->s_root->d_inode); - if (!ei->pid) { - rcu_read_lock(); - ei->pid = get_pid(find_pid_ns(1, ns)); - rcu_read_unlock(); - } - return dget(sb->s_root); } -- cgit v1.2.3 From 17cf22c33e1f1b5e435469c84e43872579497653 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 14:51:53 -0800 Subject: pidns: Use task_active_pid_ns where appropriate The expressions tsk->nsproxy->pid_ns and task_active_pid_ns aka ns_of_pid(task_pid(tsk)) should have the same number of cache line misses with the practical difference that ns_of_pid(task_pid(tsk)) is released later in a processes life. Furthermore by using task_active_pid_ns it becomes trivial to write an unshare implementation for the the pid namespace. So I have used task_active_pid_ns everywhere I can. In fork since the pid has not yet been attached to the process I use ns_of_pid, to achieve the same effect. Signed-off-by: Eric W. Biederman --- fs/proc/root.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/root.c b/fs/proc/root.c index 13ef6247e7a..fc1609321a7 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -106,7 +106,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = (struct pid_namespace *)data; options = NULL; } else { - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); options = data; } -- cgit v1.2.3 From 0a01f2cc390e10633a54f72c608cc3fe19a50c3d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2012 10:33:47 -0700 Subject: pidns: Make the pidns proc mount/umount logic obvious. Track the number of pids in the proc hash table. When the number of pids goes to 0 schedule work to unmount the kernel mount of proc. Move the mount of proc into alloc_pid when we allocate the pid for init. Remove the surprising calls of pid_ns_release proc in fork and proc_flush_task. Those code paths really shouldn't know about proc namespace implementation details and people have demonstrated several times that finding and understanding those code paths is difficult and non-obvious. Because of the call path detach pid is alwasy called with the rtnl_lock held free_pid is not allowed to sleep, so the work to unmounting proc is moved to a work queue. This has the side benefit of not blocking the entire world waiting for the unnecessary rcu_barrier in deactivate_locked_super. In the process of making the code clear and obvious this fixes a bug reported by Gao feng where we would leak a mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns succeeded and copy_net_ns failed. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. Biederman" --- fs/proc/base.c | 4 ---- fs/proc/root.c | 5 ----- 2 files changed, 9 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6177fc238fd..7621dc51cff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, diff --git a/fs/proc/root.c b/fs/proc/root.c index fc1609321a7..f2f251158d3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -155,11 +155,6 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - err = pid_ns_prepare_proc(&init_pid_ns); - if (err) { - unregister_filesystem(&proc_fs_type); - return; - } proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); -- cgit v1.2.3 From 57e8391d327609cbf12d843259c968b9e5c1838f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:17:03 -0800 Subject: pidns: Add setns support - Pid namespaces are designed to be inescapable so verify that the passed in pid namespace is a child of the currently active pid namespace or the currently active pid namespace itself. Allowing the currently active pid namespace is important so the effects of an earlier setns can be cancelled. Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index b178ed733c3..85ca047e35f 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -24,6 +24,9 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_IPC_NS &ipcns_operations, #endif +#ifdef CONFIG_PID_NS + &pidns_operations, +#endif }; static const struct file_operations ns_file_operations = { -- cgit v1.2.3 From 8823c079ba7136dc1948d6f6dcb5f8022bde438e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:49:36 -0800 Subject: vfs: Add setns support for the mount namespace setns support for the mount namespace is a little tricky as an arbitrary decision must be made about what to set fs->root and fs->pwd to, as there is no expectation of a relationship between the two mount namespaces. Therefore I arbitrarily find the root mount point, and follow every mount on top of it to find the top of the mount stack. Then I set fs->root and fs->pwd to that location. The topmost root of the mount stack seems like a reasonable place to be. Bind mount support for the mount namespace inodes has the possibility of creating circular dependencies between mount namespaces. Circular dependencies can result in loops that prevent mount namespaces from every being freed. I avoid creating those circular dependencies by adding a sequence number to the mount namespace and require all bind mounts be of a younger mount namespace into an older mount namespace. Add a helper function proc_ns_inode so it is possible to detect when we are attempting to bind mound a namespace inode. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 85ca047e35f..2a17fd9ae6a 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -27,6 +27,7 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_PID_NS &pidns_operations, #endif + &mntns_operations, }; static const struct file_operations ns_file_operations = { @@ -201,3 +202,7 @@ out_invalid: return ERR_PTR(-EINVAL); } +bool proc_ns_inode(struct inode *inode) +{ + return inode->i_fop == &ns_file_operations; +} -- cgit v1.2.3 From cde1975bc242f3e1072bde623ef378e547b73f91 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 06:24:06 -0700 Subject: userns: Implent proc namespace operations This allows entering a user namespace, and the ability to store a reference to a user namespace with a bind mount. Addition of missing userns_ns_put in userns_install from Gao feng Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/proc/namespaces.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 2a17fd9ae6a..030250c27d7 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" @@ -26,6 +27,9 @@ static const struct proc_ns_operations *ns_entries[] = { #endif #ifdef CONFIG_PID_NS &pidns_operations, +#endif +#ifdef CONFIG_USER_NS + &userns_operations, #endif &mntns_operations, }; -- cgit v1.2.3 From e9f238c3041e2582a710e75910c8cbf2a98e51b2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 11 Aug 2012 12:38:26 -0700 Subject: procfs: Print task uids and gids in the userns that opened the proc file Instead of using current_userns() use the userns of the opener of the file so that if the file is passed between processes the contents of the file do not change. Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index c1c207c36ca..55443426561 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk) static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { - struct user_namespace *user_ns = current_user_ns(); + struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; struct fdtable *fdt = NULL; -- cgit v1.2.3 From 4f326c0064b20b78b8041f4d2f6fe188a1129f18 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 27 Jul 2012 05:56:48 -0700 Subject: userns: Allow unprivilged mounts of proc and sysfs - The context in which proc and sysfs are mounted have no effect on the the uid/gid of their files so no conversion is needed except allowing the mount. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/proc/root.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/proc') diff --git a/fs/proc/root.c b/fs/proc/root.c index f2f251158d3..c6e9fac26ba 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -145,6 +145,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) -- cgit v1.2.3 From 33d6dce607573b5fd7a43168e0d91221b3ca532b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 17 Jun 2011 13:33:20 -0700 Subject: proc: Generalize proc inode allocation Generalize the proc inode allocation so that it can be used without having to having to create a proc_dir_entry. This will allow namespace file descriptors to remain light weight entitities but still have the same inode number when the backing namespace is the same. Acked-by: Serge E. Hallyn Signed-off-by: Eric W. Biederman --- fs/proc/generic.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 0d80cef4cfb..7b3ae3cc0ef 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. */ -static unsigned int get_inode_number(void) +int proc_alloc_inum(unsigned int *inum) { unsigned int i; int error; retry: - if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) - return 0; + if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) + return -ENOMEM; spin_lock(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); @@ -365,18 +365,19 @@ retry: if (error == -EAGAIN) goto retry; else if (error) - return 0; + return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, i); spin_unlock(&proc_inum_lock); - return 0; + return -ENOSPC; } - return PROC_DYNAMIC_FIRST + i; + *inum = PROC_DYNAMIC_FIRST + i; + return 0; } -static void release_inode_number(unsigned int inum) +void proc_free_inum(unsigned int inum) { spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); @@ -554,13 +555,12 @@ static const struct inode_operations proc_dir_inode_operations = { static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { - unsigned int i; struct proc_dir_entry *tmp; + int ret; - i = get_inode_number(); - if (i == 0) - return -EAGAIN; - dp->low_ino = i; + ret = proc_alloc_inum(&dp->low_ino); + if (ret) + return ret; if (S_ISDIR(dp->mode)) { if (dp->proc_iops == NULL) { @@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data); static void free_proc_entry(struct proc_dir_entry *de) { - release_inode_number(de->low_ino); + proc_free_inum(de->low_ino); if (S_ISLNK(de->mode)) kfree(de->data); -- cgit v1.2.3 From bf056bfa80596a5d14b26b17276a56a0dcb080e5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 18 Jun 2011 17:48:18 -0700 Subject: proc: Fix the namespace inode permission checks. Change the proc namespace files into symlinks so that we won't cache the dentries for the namespace files which can bypass the ptrace_may_access checks. To support the symlinks create an additional namespace inode with it's own set of operations distinct from the proc pid inode and dentry methods as those no longer make sense. Signed-off-by: Eric W. Biederman --- fs/proc/inode.c | 6 +- fs/proc/namespaces.c | 169 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 152 insertions(+), 23 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3b22bbdee9e..439ae688650 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode) struct proc_dir_entry *de; struct ctl_table_header *head; const struct proc_ns_operations *ns_ops; + void *ns; truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); @@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode) } /* Release any associated namespace */ ns_ops = PROC_I(inode)->ns_ops; - if (ns_ops && ns_ops->put) - ns_ops->put(PROC_I(inode)->ns); + ns = PROC_I(inode)->ns; + if (ns_ops && ns) + ns_ops->put(ns); } static struct kmem_cache * proc_inode_cachep; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 030250c27d7..7a6d8d69cdb 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -38,6 +38,151 @@ static const struct file_operations ns_file_operations = { .llseek = no_llseek, }; +static const struct inode_operations ns_inode_operations = { + .setattr = proc_setattr, +}; + +static int ns_delete_dentry(const struct dentry *dentry) +{ + /* Don't cache namespace inodes when not in use */ + return 1; +} + +static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; + + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + ns_ops->name, inode->i_ino); +} + +const struct dentry_operations ns_dentry_operations = +{ + .d_delete = ns_delete_dentry, + .d_dname = ns_dname, +}; + +static struct dentry *proc_ns_get_dentry(struct super_block *sb, + struct task_struct *task, const struct proc_ns_operations *ns_ops) +{ + struct dentry *dentry, *result; + struct inode *inode; + struct proc_inode *ei; + struct qstr qname = { .name = "", }; + void *ns; + + ns = ns_ops->get(task); + if (!ns) + return ERR_PTR(-ENOENT); + + dentry = d_alloc_pseudo(sb, &qname); + if (!dentry) { + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + inode = new_inode(sb); + if (!inode) { + dput(dentry); + ns_ops->put(ns); + return ERR_PTR(-ENOMEM); + } + + ei = PROC_I(inode); + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &ns_inode_operations; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + + d_set_d_op(dentry, &ns_dentry_operations); + result = d_instantiate_unique(dentry, inode); + if (result) { + dput(dentry); + dentry = result; + } + + return dentry; +} + +static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct proc_inode *ei = PROC_I(inode); + struct task_struct *task; + struct dentry *ns_dentry; + void *error = ERR_PTR(-EACCES); + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops); + if (IS_ERR(ns_dentry)) { + error = ERR_CAST(ns_dentry); + goto out_put_task; + } + + dput(nd->path.dentry); + nd->path.dentry = ns_dentry; + error = NULL; + +out_put_task: + put_task_struct(task); +out: + return error; +} + +static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + struct proc_inode *ei = PROC_I(inode); + const struct proc_ns_operations *ns_ops = ei->ns_ops; + struct task_struct *task; + void *ns; + char name[50]; + int len = -EACCES; + + task = get_proc_task(inode); + if (!task) + goto out; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + len = -ENOENT; + ns = ns_ops->get(task); + if (!ns) + goto out_put_task; + + snprintf(name, sizeof(name), "%s", ns_ops->name); + len = strlen(name); + + if (len > buflen) + len = buflen; + if (copy_to_user(buffer, ns_ops->name, len)) + len = -EFAULT; + + ns_ops->put(ns); +out_put_task: + put_task_struct(task); +out: + return len; +} + +static const struct inode_operations proc_ns_link_inode_operations = { + .readlink = proc_ns_readlink, + .follow_link = proc_ns_follow_link, + .setattr = proc_setattr, +}; + static struct dentry *proc_ns_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -45,21 +190,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); - void *ns; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; - ns = ns_ops->get(task); - if (!ns) - goto out_iput; - ei = PROC_I(inode); - inode->i_mode = S_IFREG|S_IRUSR; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + inode->i_mode = S_IFLNK|S_IRWXUGO; + inode->i_op = &proc_ns_link_inode_operations; + ei->ns_ops = ns_ops; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); @@ -68,9 +207,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, error = NULL; out: return error; -out_iput: - iput(inode); - goto out; } static int proc_ns_fill_cache(struct file *filp, void *dirent, @@ -97,10 +233,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent, if (!task) goto out_no_task; - ret = -EPERM; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - ret = 0; i = filp->f_pos; switch (i) { @@ -160,10 +292,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!task) goto out_no_task; - error = ERR_PTR(-EPERM); - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - last = &ns_entries[ARRAY_SIZE(ns_entries)]; for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) @@ -171,7 +299,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } - error = ERR_PTR(-ENOENT); if (entry == last) goto out; -- cgit v1.2.3 From 98f842e675f96ffac96e6c50315790912b2812be Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Jun 2011 10:21:48 -0700 Subject: proc: Usable inode numbers for the namespace file descriptors. Assign a unique proc inode to each namespace, and use that inode number to ensure we only allocate at most one proc inode for every namespace in proc. A single proc inode per namespace allows userspace to test to see if two processes are in the same namespace. This has been a long requested feature and only blocked because a naive implementation would put the id in a global space and would ultimately require having a namespace for the names of namespaces, making migration and certain virtualization tricks impossible. We still don't have per superblock inode numbers for proc, which appears necessary for application unaware checkpoint/restart and migrations (if the application is using namespace file descriptors) but that is now allowd by the design if it becomes important. I have preallocated the ipc and uts initial proc inode numbers so their structures can be statically initialized. Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 7a6d8d69cdb..b7a47196c8c 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -82,7 +82,7 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, return ERR_PTR(-ENOMEM); } - inode = new_inode(sb); + inode = iget_locked(sb, ns_ops->inum(ns)); if (!inode) { dput(dentry); ns_ops->put(ns); @@ -90,13 +90,17 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb, } ei = PROC_I(inode); - inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_op = &ns_inode_operations; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - ei->ns_ops = ns_ops; - ei->ns = ns; + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &ns_inode_operations; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + unlock_new_inode(inode); + } else { + ns_ops->put(ns); + } d_set_d_op(dentry, &ns_dentry_operations); result = d_instantiate_unique(dentry, inode); @@ -162,12 +166,12 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!ns) goto out_put_task; - snprintf(name, sizeof(name), "%s", ns_ops->name); + snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); len = strlen(name); if (len > buflen) len = buflen; - if (copy_to_user(buffer, ns_ops->name, len)) + if (copy_to_user(buffer, name, len)) len = -EFAULT; ns_ops->put(ns); -- cgit v1.2.3 From 05f564849d49499ced97913a0914b5950577d07d Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Mon, 26 Nov 2012 16:29:42 -0800 Subject: proc: check vma->vm_file before dereferencing Commit 7b540d0646ce ("proc_map_files_readdir(): don't bother with grabbing files") switched proc_map_files_readdir() to use @f_mode directly instead of grabbing @file reference, but same time the test for @vm_file presence was lost leading to nil dereference. The patch brings the test back. The all proc_map_files feature is CONFIG_CHECKPOINT_RESTORE wrapped (which is set to 'n' by default) so the bug doesn't affect regular kernels. The regression is 3.7-rc1 only as far as I can tell. [gorcunov@openvz.org: provided changelog] Signed-off-by: Stanislav Kinsbursky Acked-by: Cyrill Gorcunov Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3c231adf845..9e28356a959 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1877,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!vma) goto out_no_vma; - result = proc_map_files_instantiate(dir, dentry, task, - (void *)(unsigned long)vma->vm_file->f_mode); + if (vma->vm_file) + result = proc_map_files_instantiate(dir, dentry, task, + (void *)(unsigned long)vma->vm_file->f_mode); out_no_vma: up_read(&mm->mmap_sem); -- cgit v1.2.3 From e80d0a1ae8bb8fee0edd37427836f108b30f596b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Nov 2012 16:26:44 +0100 Subject: cputime: Rename thread_group_times to thread_group_cputime_adjusted We have thread_group_cputime() and thread_group_times(). The naming doesn't provide enough information about the difference between these two APIs. To lower the confusion, rename thread_group_times() to thread_group_cputime_adjusted(). This name better suggests that it's a version of thread_group_cputime() that does some stabilization on the raw cputime values. ie here: scale on top of CFS runtime stats and bound lower value for monotonicity. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- fs/proc/array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index c1c207c36ca..d3696708fc1 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += sig->min_flt; maj_flt += sig->maj_flt; - thread_group_times(task, &utime, &stime); + thread_group_cputime_adjusted(task, &utime, &stime); gtime += sig->gtime; } @@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; - task_times(task, &utime, &stime); + task_cputime_adjusted(task, &utime, &stime); gtime = task->gtime; } -- cgit v1.2.3 From c1ad41f1f7270c1956da13fa8fd59d8d5929d56e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Dec 2012 10:23:45 +0100 Subject: Revert "sched/autogroup: Fix crash on reboot when autogroup is disabled" This reverts commit 5258f386ea4e8454bc801fb443e8a4217da1947c, because the underlying autogroups bug got fixed upstream in a better way, via: fd8ef11730f1 Revert "sched, autogroup: Stop going ahead if autogroup is disabled" Cc: Mike Galbraith Cc: Yong Zhang Cc: Peter Zijlstra Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- fs/proc/base.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 587631e1cd0..9e28356a959 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1272,6 +1272,81 @@ static const struct file_operations proc_pid_sched_operations = { #endif +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = proc_sched_autogroup_set_nice(p, nice); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -2582,6 +2657,9 @@ static const struct pid_entry tgid_base_stuff[] = { INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK -- cgit v1.2.3 From a9c58b907dbc6821533dfc295b63caf111ff1f16 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:54 -0800 Subject: mm, oom: change type of oom_score_adj to short The maximum oom_score_adj is 1000 and the minimum oom_score_adj is -1000, so this range can be represented by the signed short type with no functional change. The extra space this frees up in struct signal_struct will be used for per-thread oom kill flags in the next patch. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Anton Vorontsov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e28356a959..aa63d25157b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -985,7 +985,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, { struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char buffer[PROC_NUMBUF]; - int oom_score_adj = OOM_SCORE_ADJ_MIN; + short oom_score_adj = OOM_SCORE_ADJ_MIN; unsigned long flags; size_t len; @@ -996,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, unlock_task_sighand(task, &flags); } put_task_struct(task); - len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } @@ -1043,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj_min && + if ((short)oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; } - task->signal->oom_score_adj = oom_score_adj; + task->signal->oom_score_adj = (short)oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) - task->signal->oom_score_adj_min = oom_score_adj; + task->signal->oom_score_adj_min = (short)oom_score_adj; trace_oom_score_adj_update(task); err_sighand: -- cgit v1.2.3 From e180377f1ae48b3cbc559c9875d9b038f7f000c6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 12 Dec 2012 13:50:59 -0800 Subject: thp: change split_huge_page_pmd() interface Pass vma instead of mm and add address parameter. In most cases we already have vma on the stack. We provides split_huge_page_pmd_mm() for few cases when we have mm, but not vma. This change is preparation to huge zero pmd splitting implementation. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 90c63f9392a..291a0d15a0b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - split_huge_page_pmd(walk->mm, pmd); + split_huge_page_pmd(vma, addr, pmd); if (pmd_trans_unstable(pmd)) return 0; -- cgit v1.2.3 From 4ff1b2c29326a2a3e130b46f69b7ab0e853d09d8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:25 -0800 Subject: procfs: use N_MEMORY instead N_HIGH_MEMORY N_HIGH_MEMORY stands for the nodes that has normal or high memory. N_MEMORY stands for the nodes that has any memory. The code here need to handle with the nodes which have memory, we should use N_MEMORY instead. Signed-off-by: Lai Jiangshan Acked-by: Hillf Danton Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Lin Feng Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 2 +- fs/proc/task_mmu.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 86c67eee439..e96d4f18ca3 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -249,7 +249,7 @@ static int kcore_update_ram(void) /* Not inialized....update now */ /* find out "max pfn" */ end_pfn = 0; - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { unsigned long node_end; node_end = NODE_DATA(nid)->node_start_pfn + NODE_DATA(nid)->node_spanned_pages; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 291a0d15a0b..48775628abb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1126,7 +1126,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return NULL; nid = page_to_nid(page); - if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; @@ -1279,7 +1279,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (md->writeback) seq_printf(m, " writeback=%lu", md->writeback); - for_each_node_state(n, N_HIGH_MEMORY) + for_each_node_state(n, N_MEMORY) if (md->node[n]) seq_printf(m, " N%d=%lu", n, md->node[n]); out: -- cgit v1.2.3 From f9a00e8738c209d95493cf97d3a82ab2655892e5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Dec 2012 16:01:25 -0800 Subject: procfs: use kbasename() [yongjun_wei@trendmicro.com.cn: remove duplicated include] Signed-off-by: Andy Shevchenko Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_devtree.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index df7dd08d439..de20ec480fa 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -195,11 +195,7 @@ void proc_device_tree_add_node(struct device_node *np, set_node_proc_entry(np, de); for (child = NULL; (child = of_get_next_child(np, child));) { /* Use everything after the last slash, or the full name */ - p = strrchr(child->full_name, '/'); - if (!p) - p = child->full_name; - else - ++p; + p = kbasename(child->full_name); if (duplicate_name(de, p)) p = fixup_name(np, de, p); -- cgit v1.2.3 From 7b9a7ec565505699f503b4fcf61500dceb36e744 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Mon, 17 Dec 2012 16:03:10 -0800 Subject: proc: don't show nonexistent capabilities Without this patch it is really hard to interpret a bounding set, if CAP_LAST_CAP is unknown for a current kernel. Non-existant capabilities can not be deleted from a bounding set with help of prctl. E.g.: Here are two examples without/with this patch. CapBnd: ffffffe0fdecffff CapBnd: 00000000fdecffff I suggest to hide non-existent capabilities. Here is two reasons. * It's logically and easier for using. * It helps to checkpoint-restore capabilities of tasks, because tasks can be restored on another kernel, where CAP_LAST_CAP is bigger. Signed-off-by: Andrew Vagin Cc: Andrew G. Morgan Reviewed-by: Serge E. Hallyn Cc: Pavel Emelyanov Reviewed-by: Kees Cook Cc: KAMEZAWA Hiroyuki Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index d3696708fc1..377a37366dd 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -308,6 +308,10 @@ static void render_cap_t(struct seq_file *m, const char *header, seq_putc(m, '\n'); } +/* Remove non-existent capabilities */ +#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \ + CAP_TO_MASK(CAP_LAST_CAP + 1) - 1) + static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; @@ -321,6 +325,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_bset = cred->cap_bset; rcu_read_unlock(); + NORM_CAPS(cap_inheritable); + NORM_CAPS(cap_permitted); + NORM_CAPS(cap_effective); + NORM_CAPS(cap_bset); + render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); -- cgit v1.2.3 From 834f82e2aa9a8ede94b17b656329f850c1471514 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:03:13 -0800 Subject: procfs: add VmFlags field in smaps output During c/r sessions we've found that there is no way at the moment to fetch some VMA associated flags, such as mlock() and madvise(). This leads us to a problem -- we don't know if we should call for mlock() and/or madvise() after restore on the vma area we're bringing back to life. This patch intorduces a new field into "smaps" output called VmFlags, where all set flags associated with the particular VMA is shown as two letter mnemonics. [ Strictly speaking for c/r we only need mlock/madvise bits but it has been said that providing just a few flags looks somehow inconsistent. So all flags are here now. ] This feature is made available on CONFIG_CHECKPOINT_RESTORE=n kernels, as other applications may start to use these fields. The data is encoded in a somewhat awkward two letters mnemonic form, to encourage userspace to be prepared for fields being added or removed in the future. [a.p.zijlstra@chello.nl: props to use for_each_set_bit] [sfr@canb.auug.org.au: props to use array instead of struct] [akpm@linux-foundation.org: overall redesign and simplification] [akpm@linux-foundation.org: remove unneeded braces per sfr, avoid using bloaty for_each_set_bit()] Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 48775628abb..448455b7fd9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -526,6 +526,57 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } +static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) +{ + /* + * Don't forget to update Documentation/ on changes. + */ + static const char mnemonics[BITS_PER_LONG][2] = { + /* + * In case if we meet a flag we don't know about. + */ + [0 ... (BITS_PER_LONG-1)] = "??", + + [ilog2(VM_READ)] = "rd", + [ilog2(VM_WRITE)] = "wr", + [ilog2(VM_EXEC)] = "ex", + [ilog2(VM_SHARED)] = "sh", + [ilog2(VM_MAYREAD)] = "mr", + [ilog2(VM_MAYWRITE)] = "mw", + [ilog2(VM_MAYEXEC)] = "me", + [ilog2(VM_MAYSHARE)] = "ms", + [ilog2(VM_GROWSDOWN)] = "gd", + [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_DENYWRITE)] = "dw", + [ilog2(VM_LOCKED)] = "lo", + [ilog2(VM_IO)] = "io", + [ilog2(VM_SEQ_READ)] = "sr", + [ilog2(VM_RAND_READ)] = "rr", + [ilog2(VM_DONTCOPY)] = "dc", + [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_ACCOUNT)] = "ac", + [ilog2(VM_NORESERVE)] = "nr", + [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_NONLINEAR)] = "nl", + [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_DONTDUMP)] = "dd", + [ilog2(VM_MIXEDMAP)] = "mm", + [ilog2(VM_HUGEPAGE)] = "hg", + [ilog2(VM_NOHUGEPAGE)] = "nh", + [ilog2(VM_MERGEABLE)] = "mg", + }; + size_t i; + + seq_puts(m, "VmFlags: "); + for (i = 0; i < BITS_PER_LONG; i++) { + if (vma->vm_flags & (1UL << i)) { + seq_printf(m, "%c%c ", + mnemonics[i][0], mnemonics[i][1]); + } + } + seq_putc(m, '\n'); +} + static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; @@ -581,6 +632,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) seq_printf(m, "Nonlinear: %8lu kB\n", mss.nonlinear >> 10); + show_smap_vma_flags(m, vma); + if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task->mm)) ? vma->vm_start : 0; -- cgit v1.2.3 From 2f4b3bf6b2318cfaa177ec5a802f4d8d6afbd816 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 17 Dec 2012 16:03:14 -0800 Subject: /proc/pid/status: add "Seccomp" field It is currently impossible to examine the state of seccomp for a given process. While attaching with gdb and attempting "call prctl(PR_GET_SECCOMP,...)" will work with some situations, it is not reliable. If the process is in seccomp mode 1, this query will kill the process (prctl not allowed), if the process is in mode 2 with prctl not allowed, it will similarly be killed, and in weird cases, if prctl is filtered to return errno 0, it can look like seccomp is disabled. When reviewing the state of running processes, there should be a way to externally examine the seccomp mode. ("Did this build of Chrome end up using seccomp?" "Did my distro ship ssh with seccomp enabled?") This adds the "Seccomp" line to /proc/$pid/status. Signed-off-by: Kees Cook Reviewed-by: Cyrill Gorcunov Cc: Andrea Arcangeli Cc: James Morris Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 377a37366dd..077235ffb38 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -336,6 +336,13 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) render_cap_t(m, "CapBnd:\t", &cap_bset); } +static inline void task_seccomp(struct seq_file *m, struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP + seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode); +#endif +} + static inline void task_context_switch_counts(struct seq_file *m, struct task_struct *p) { @@ -369,6 +376,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, } task_sig(m, task); task_cap(m, task); + task_seccomp(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); -- cgit v1.2.3 From 8d238027b87e654be552eabdf492042a34c5c300 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 17 Dec 2012 16:03:17 -0800 Subject: proc: pid/status: show all supplementary groups We display a list of supplementary group for each process in /proc//status. However, we show only the first 32 groups, not all of them. Although this is rare, but sometimes processes do have more than 32 supplementary groups, and this kernel limitation breaks user-space apps that rely on the group list in /proc//status. Number 32 comes from the internal NGROUPS_SMALL macro which defines the length for the internal kernel "small" groups buffer. There is no apparent reason to limit to this value. This patch removes the 32 groups printing limit. The Linux kernel limits the amount of supplementary groups by NGROUPS_MAX, which is currently set to 65536. And this is the maximum count of groups we may possibly print. Signed-off-by: Artem Bityutskiy Acked-by: Serge E. Hallyn Acked-by: Kees Cook Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 077235ffb38..439544fec38 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -212,7 +212,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, group_info = cred->group_info; task_unlock(p); - for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) + for (g = 0; g < group_info->ngroups; g++) seq_printf(m, "%d ", from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); -- cgit v1.2.3 From 55985dd72ab27b47530dcc8bdddd28b69f4abe8b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:04:55 -0800 Subject: procfs: add ability to plug in auxiliary fdinfo providers This patch brings ability to print out auxiliary data associated with file in procfs interface /proc/pid/fdinfo/fd. In particular further patches make eventfd, evenpoll, signalfd and fsnotify to print additional information complete enough to restore these objects after checkpoint. To simplify the code we add show_fdinfo callback inside struct file_operations (as Al and Pavel are proposing). Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: Andrey Vagin Cc: Al Viro Cc: Alexey Dobriyan Cc: James Bottomley Cc: "Aneesh Kumar K.V" Cc: Alexey Dobriyan Cc: Matthew Helsley Cc: "J. Bruce Fields" Cc: "Aneesh Kumar K.V" Cc: Tvrtko Ursulin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/fd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index f28a875f877..d7a4a28ef63 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -50,6 +50,8 @@ static int seq_show(struct seq_file *m, void *v) if (!ret) { seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", (long long)file->f_pos, f_flags); + if (file->f_op->show_fdinfo) + ret = file->f_op->show_fdinfo(m, file); fput(file); } -- cgit v1.2.3 From 138d22b58696c506799f8de759804083ff9effae Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 17 Dec 2012 16:05:02 -0800 Subject: fs, epoll: add procfs fdinfo helper This allows us to print out eventpoll target file descriptor, events and data, the /proc/pid/fdinfo/fd consists of | pos: 0 | flags: 02 | tfd: 5 events: 1d data: ffffffffffffffff enabled: 1 [avagin@: fix for unitialized ret variable] Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: Andrey Vagin Cc: Al Viro Cc: Alexey Dobriyan Cc: James Bottomley Cc: "Aneesh Kumar K.V" Cc: Alexey Dobriyan Cc: Matthew Helsley Cc: "J. Bruce Fields" Cc: "Aneesh Kumar K.V" Cc: Tvrtko Ursulin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 439544fec38..060a56a9127 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -220,7 +220,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_putc(m, '\n'); } -static void render_sigset_t(struct seq_file *m, const char *header, +void render_sigset_t(struct seq_file *m, const char *header, sigset_t *set) { int i; -- cgit v1.2.3 From 46f69557103e11fb963ae5c98b7777e90493241b Mon Sep 17 00:00:00 2001 From: Marco Stornelli Date: Sat, 15 Dec 2012 11:48:48 +0100 Subject: procfs: drop vmtruncate Removed vmtruncate Signed-off-by: Marco Stornelli Signed-off-by: Al Viro --- fs/proc/base.c | 7 ------- fs/proc/generic.c | 9 +-------- fs/proc/proc_sysctl.c | 7 ------- 3 files changed, 1 insertion(+), 22 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 5a5a0be40e4..9b43ff77a51 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -542,13 +542,6 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - setattr_copy(inode, attr); mark_inode_dirty(inode); return 0; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 7b3ae3cc0ef..2e4ed13b9ee 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -261,16 +261,9 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) if (error) return error; - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, iattr->ia_size); - if (error) - return error; - } - setattr_copy(inode, iattr); mark_inode_dirty(inode); - + de->uid = inode->i_uid; de->gid = inode->i_gid; de->mode = inode->i_mode; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 701580ddfcc..1827d88ad58 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -736,13 +736,6 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) if (error) return error; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); - if (error) - return error; - } - setattr_copy(inode, attr); mark_inode_dirty(inode); return 0; -- cgit v1.2.3 From ee297209bf0a25c6717b7c063e76795142d32f37 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 20 Dec 2012 15:05:44 -0800 Subject: proc: fix inconsistent lock state Lockdep found an inconsistent lock state when rcu is processing delayed work in softirq. Currently, kernel is using spin_lock/spin_unlock to protect proc_inum_ida, but proc_free_inum is called by rcu in softirq context. Use spin_lock_bh/spin_unlock_bh fix following lockdep warning. ================================= [ INFO: inconsistent lock state ] 3.7.0 #36 Not tainted --------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. swapper/1/0 [HC0[0]:SC1[1]:HE1:SE0] takes: (proc_inum_lock){+.?...}, at: proc_free_inum+0x1c/0x50 {SOFTIRQ-ON-W} state was registered at: __lock_acquire+0x8ae/0xca0 lock_acquire+0x199/0x200 _raw_spin_lock+0x41/0x50 proc_alloc_inum+0x4c/0xd0 alloc_mnt_ns+0x49/0xc0 create_mnt_ns+0x25/0x70 mnt_init+0x161/0x1c7 vfs_caches_init+0x107/0x11a start_kernel+0x348/0x38c x86_64_start_reservations+0x131/0x136 x86_64_start_kernel+0x103/0x112 irq event stamp: 2993422 hardirqs last enabled at (2993422): _raw_spin_unlock_irqrestore+0x55/0x80 hardirqs last disabled at (2993421): _raw_spin_lock_irqsave+0x29/0x70 softirqs last enabled at (2993394): _local_bh_enable+0x13/0x20 softirqs last disabled at (2993395): call_softirq+0x1c/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(proc_inum_lock); lock(proc_inum_lock); *** DEADLOCK *** no locks held by swapper/1/0. stack backtrace: Pid: 0, comm: swapper/1 Not tainted 3.7.0 #36 Call Trace: [] ? vprintk_emit+0x471/0x510 print_usage_bug+0x2a5/0x2c0 mark_lock+0x33b/0x5e0 __lock_acquire+0x813/0xca0 lock_acquire+0x199/0x200 _raw_spin_lock+0x41/0x50 proc_free_inum+0x1c/0x50 free_pid_ns+0x1c/0x50 put_pid_ns+0x2e/0x50 put_pid+0x4a/0x60 delayed_put_pid+0x12/0x20 rcu_process_callbacks+0x462/0x790 __do_softirq+0x1b4/0x3b0 call_softirq+0x1c/0x30 do_softirq+0x59/0xd0 irq_exit+0x54/0xd0 smp_apic_timer_interrupt+0x95/0xa3 apic_timer_interrupt+0x72/0x80 cpuidle_enter_tk+0x10/0x20 cpuidle_enter_state+0x17/0x50 cpuidle_idle_call+0x287/0x520 cpu_idle+0xba/0x130 start_secondary+0x2b3/0x2bc Signed-off-by: Xiaotian Feng Cc: Al Viro Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 7b3ae3cc0ef..e659a0ff1da 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -359,18 +359,18 @@ retry: if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) return -ENOMEM; - spin_lock(&proc_inum_lock); + spin_lock_bh(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); - spin_unlock(&proc_inum_lock); + spin_unlock_bh(&proc_inum_lock); if (error == -EAGAIN) goto retry; else if (error) return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { - spin_lock(&proc_inum_lock); + spin_lock_bh(&proc_inum_lock); ida_remove(&proc_inum_ida, i); - spin_unlock(&proc_inum_lock); + spin_unlock_bh(&proc_inum_lock); return -ENOSPC; } *inum = PROC_DYNAMIC_FIRST + i; @@ -379,9 +379,9 @@ retry: void proc_free_inum(unsigned int inum) { - spin_lock(&proc_inum_lock); + spin_lock_bh(&proc_inum_lock); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); - spin_unlock(&proc_inum_lock); + spin_unlock_bh(&proc_inum_lock); } static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) -- cgit v1.2.3 From dfb2ea45becb198beeb75350d0b7b7ad9076a38f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 21 Dec 2012 20:38:00 -0800 Subject: proc: Allow proc_free_inum to be called from any context While testing the pid namespace code I hit this nasty warning. [ 176.262617] ------------[ cut here ]------------ [ 176.263388] WARNING: at /home/eric/projects/linux/linux-userns-devel/kernel/softirq.c:160 local_bh_enable_ip+0x7a/0xa0() [ 176.265145] Hardware name: Bochs [ 176.265677] Modules linked in: [ 176.266341] Pid: 742, comm: bash Not tainted 3.7.0userns+ #18 [ 176.266564] Call Trace: [ 176.266564] [] warn_slowpath_common+0x7f/0xc0 [ 176.266564] [] warn_slowpath_null+0x1a/0x20 [ 176.266564] [] local_bh_enable_ip+0x7a/0xa0 [ 176.266564] [] _raw_spin_unlock_bh+0x19/0x20 [ 176.266564] [] proc_free_inum+0x3a/0x50 [ 176.266564] [] free_pid_ns+0x1c/0x80 [ 176.266564] [] put_pid_ns+0x35/0x50 [ 176.266564] [] put_pid+0x4a/0x60 [ 176.266564] [] tty_ioctl+0x717/0xc10 [ 176.266564] [] ? wait_consider_task+0x855/0xb90 [ 176.266564] [] ? default_spin_lock_flags+0x9/0x10 [ 176.266564] [] ? remove_wait_queue+0x5a/0x70 [ 176.266564] [] do_vfs_ioctl+0x98/0x550 [ 176.266564] [] ? recalc_sigpending+0x1f/0x60 [ 176.266564] [] ? __set_task_blocked+0x37/0x80 [ 176.266564] [] ? sys_wait4+0xab/0xf0 [ 176.266564] [] sys_ioctl+0x91/0xb0 [ 176.266564] [] ? task_stopped_code+0x50/0x50 [ 176.266564] [] system_call_fastpath+0x16/0x1b [ 176.266564] ---[ end trace 387af88219ad6143 ]--- It turns out that spin_unlock_bh(proc_inum_lock) is not safe when put_pid is called with another spinlock held and irqs disabled. For now take the easy path and use spin_lock_irqsave(proc_inum_lock) in proc_free_inum and spin_loc_irq in proc_alloc_inum(proc_inum_lock). Signed-off-by: "Eric W. Biederman" --- fs/proc/generic.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e064f562b1f..76ddae83daa 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -352,18 +352,18 @@ retry: if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) return -ENOMEM; - spin_lock_bh(&proc_inum_lock); + spin_lock_irq(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); - spin_unlock_bh(&proc_inum_lock); + spin_unlock_irq(&proc_inum_lock); if (error == -EAGAIN) goto retry; else if (error) return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { - spin_lock_bh(&proc_inum_lock); + spin_lock_irq(&proc_inum_lock); ida_remove(&proc_inum_ida, i); - spin_unlock_bh(&proc_inum_lock); + spin_unlock_irq(&proc_inum_lock); return -ENOSPC; } *inum = PROC_DYNAMIC_FIRST + i; @@ -372,9 +372,10 @@ retry: void proc_free_inum(unsigned int inum) { - spin_lock_bh(&proc_inum_lock); + unsigned long flags; + spin_lock_irqsave(&proc_inum_lock, flags); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); - spin_unlock_bh(&proc_inum_lock); + spin_unlock_irqrestore(&proc_inum_lock, flags); } static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) -- cgit v1.2.3 From a7a88b23737095e6c18a20c5d4eef9e25ec5b829 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Jan 2013 02:04:23 -0800 Subject: mempolicy: remove arg from mpol_parse_str, mpol_to_str Remove the unused argument (formerly no_context) from mpol_parse_str() and from mpol_to_str(). Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 448455b7fd9..ca5ce7f9f80 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1278,7 +1278,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.mm = mm; pol = get_vma_policy(task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); -- cgit v1.2.3