From 475d0094293b51353e342d1198377967dbc48169 Mon Sep 17 00:00:00 2001
From: Dong Aisheng <dong.aisheng@linaro.org>
Date: Wed, 11 Jul 2012 15:16:37 +1000
Subject: of: Improve prom_update_property() function

prom_update_property() currently fails if the property doesn't
actually exist yet which isn't what we want. Change to add-or-update
instead of update-only, then we can remove a lot duplicated lines.

Suggested-by: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Dong Aisheng <dong.aisheng@linaro.org>
Acked-by: Rob Herring <rob.herring@calxeda.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 fs/proc/proc_devtree.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 927cbd115e5..df7dd08d439 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -101,6 +101,11 @@ void proc_device_tree_update_prop(struct proc_dir_entry *pde,
 {
 	struct proc_dir_entry *ent;
 
+	if (!oldprop) {
+		proc_device_tree_add_prop(pde, newprop);
+		return;
+	}
+
 	for (ent = pde->subdir; ent != NULL; ent = ent->next)
 		if (ent->data == oldprop)
 			break;
-- 
cgit v1.2.3


From 0b728e1911cbe6e24020727c3870628b9653f32a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 10 Jun 2012 16:03:43 -0400
Subject: stop passing nameidata * to ->d_revalidate()

Just the lookup flags.  Die, bastard, die...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c        | 22 +++++++++++-----------
 fs/proc/internal.h    |  2 +-
 fs/proc/namespaces.c  |  2 +-
 fs/proc/proc_sysctl.c |  4 ++--
 4 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 437195f204e..bf749cca4cc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1601,13 +1601,13 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
  * made this apply to all per process world readable and executable
  * directories.
  */
-int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
+int pid_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct inode *inode;
 	struct task_struct *task;
 	const struct cred *cred;
 
-	if (nd && nd->flags & LOOKUP_RCU)
+	if (flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	inode = dentry->d_inode;
@@ -1781,7 +1781,7 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
 	return proc_fd_info(dentry->d_inode, path, NULL);
 }
 
-static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct inode *inode;
 	struct task_struct *task;
@@ -1789,7 +1789,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct files_struct *files;
 	const struct cred *cred;
 
-	if (nd && nd->flags & LOOKUP_RCU)
+	if (flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	inode = dentry->d_inode;
@@ -1868,7 +1868,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
 	d_set_d_op(dentry, &tid_fd_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
-	if (tid_fd_revalidate(dentry, NULL))
+	if (tid_fd_revalidate(dentry, 0))
 		error = NULL;
 
  out:
@@ -2003,7 +2003,7 @@ static int dname_to_vma_addr(struct dentry *dentry,
 	return 0;
 }
 
-static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	unsigned long vm_start, vm_end;
 	bool exact_vma_exists = false;
@@ -2013,7 +2013,7 @@ static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct inode *inode;
 	int status = 0;
 
-	if (nd && nd->flags & LOOKUP_RCU)
+	if (flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	if (!capable(CAP_SYS_ADMIN)) {
@@ -2371,7 +2371,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
 	d_set_d_op(dentry, &tid_fd_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
-	if (tid_fd_revalidate(dentry, NULL))
+	if (tid_fd_revalidate(dentry, 0))
 		error = NULL;
 
  out:
@@ -2430,7 +2430,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
 	d_set_d_op(dentry, &pid_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
-	if (pid_revalidate(dentry, NULL))
+	if (pid_revalidate(dentry, 0))
 		error = NULL;
 out:
 	return error;
@@ -3237,7 +3237,7 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
 
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
-	if (pid_revalidate(dentry, NULL))
+	if (pid_revalidate(dentry, 0))
 		error = NULL;
 out:
 	return error;
@@ -3508,7 +3508,7 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
 
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
-	if (pid_revalidate(dentry, NULL))
+	if (pid_revalidate(dentry, 0))
 		error = NULL;
 out:
 	return error;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index eca4aca5b6e..e0c2a48dab7 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -142,7 +142,7 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
 int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 	const char *name, int len,
 	instantiate_t instantiate, struct task_struct *task, const void *ptr);
-int pid_revalidate(struct dentry *dentry, struct nameidata *nd);
+int pid_revalidate(struct dentry *dentry, unsigned int flags);
 struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task);
 extern const struct dentry_operations pid_dentry_operations;
 int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 0d9e23a39e4..40ceb40f985 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -56,7 +56,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
 	d_set_d_op(dentry, &pid_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
-	if (pid_revalidate(dentry, NULL))
+	if (pid_revalidate(dentry, 0))
 		error = NULL;
 out:
 	return error;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 3476bca8f7a..fda69fa3909 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -794,9 +794,9 @@ static const struct inode_operations proc_sys_dir_operations = {
 	.getattr	= proc_sys_getattr,
 };
 
-static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
 {
-	if (nd->flags & LOOKUP_RCU)
+	if (flags & LOOKUP_RCU)
 		return -ECHILD;
 	return !PROC_I(dentry->d_inode)->sysctl->unregistering;
 }
-- 
cgit v1.2.3


From 00cd8dd3bf95f2cc8435b4cac01d9995635c6d0b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 10 Jun 2012 17:13:09 -0400
Subject: stop passing nameidata to ->lookup()

Just the flags; only NFS cares even about that, but there are
legitimate uses for such argument.  And getting rid of that
completely would require splitting ->lookup() into a couple
of methods (at least), so let's leave that alone for now...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c        | 18 ++++++++++--------
 fs/proc/generic.c     |  2 +-
 fs/proc/internal.h    |  4 ++--
 fs/proc/namespaces.c  |  2 +-
 fs/proc/proc_net.c    |  2 +-
 fs/proc/proc_sysctl.c |  2 +-
 fs/proc/root.c        |  7 +++----
 7 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index bf749cca4cc..8eaa5ea1c61 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1956,7 +1956,7 @@ out_no_task:
 }
 
 static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
-				    struct nameidata *nd)
+				    unsigned int flags)
 {
 	return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
 }
@@ -2145,7 +2145,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
 }
 
 static struct dentry *proc_map_files_lookup(struct inode *dir,
-		struct dentry *dentry, struct nameidata *nd)
+		struct dentry *dentry, unsigned int flags)
 {
 	unsigned long vm_start, vm_end;
 	struct vm_area_struct *vma;
@@ -2380,7 +2380,7 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
 
 static struct dentry *proc_lookupfdinfo(struct inode *dir,
 					struct dentry *dentry,
-					struct nameidata *nd)
+					unsigned int flags)
 {
 	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
 }
@@ -2630,7 +2630,7 @@ static const struct file_operations proc_attr_dir_operations = {
 };
 
 static struct dentry *proc_attr_dir_lookup(struct inode *dir,
-				struct dentry *dentry, struct nameidata *nd)
+				struct dentry *dentry, unsigned int flags)
 {
 	return proc_pident_lookup(dir, dentry,
 				  attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
@@ -3114,7 +3114,8 @@ static const struct file_operations proc_tgid_base_operations = {
 	.llseek		= default_llseek,
 };
 
-static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
+static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
 	return proc_pident_lookup(dir, dentry,
 				  tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
 }
@@ -3243,7 +3244,7 @@ out:
 	return error;
 }
 
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
 {
 	struct dentry *result;
 	struct task_struct *task;
@@ -3470,7 +3471,8 @@ static int proc_tid_base_readdir(struct file * filp,
 				   tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
 }
 
-static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
+static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
 	return proc_pident_lookup(dir, dentry,
 				  tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
 }
@@ -3514,7 +3516,7 @@ out:
 	return error;
 }
 
-static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
 {
 	struct dentry *result = ERR_PTR(-ENOENT);
 	struct task_struct *task;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 2edf34f2eb6..b3647fe6a60 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -446,7 +446,7 @@ out_unlock:
 }
 
 struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
-		struct nameidata *nd)
+		unsigned int flags)
 {
 	return proc_lookup_de(PDE(dir), dir, dentry);
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index e0c2a48dab7..e1167a1c912 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -106,7 +106,7 @@ void pde_users_dec(struct proc_dir_entry *pde);
 
 extern spinlock_t proc_subdir_lock;
 
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
+struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int);
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
 unsigned long task_vsize(struct mm_struct *);
 unsigned long task_statm(struct mm_struct *,
@@ -132,7 +132,7 @@ int proc_remount(struct super_block *sb, int *flags, char *data);
  * of the /proc/<pid> subdirectories.
  */
 int proc_readdir(struct file *, void *, filldir_t);
-struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
+struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
 
 
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 40ceb40f985..b178ed733c3 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -140,7 +140,7 @@ const struct file_operations proc_ns_dir_operations = {
 };
 
 static struct dentry *proc_ns_dir_lookup(struct inode *dir,
-				struct dentry *dentry, struct nameidata *nd)
+				struct dentry *dentry, unsigned int flags)
 {
 	struct dentry *error;
 	struct task_struct *task = get_proc_task(dir);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 06e1cc17caf..fe72cd073de 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -119,7 +119,7 @@ static struct net *get_proc_task_net(struct inode *dir)
 }
 
 static struct dentry *proc_tgid_net_lookup(struct inode *dir,
-		struct dentry *dentry, struct nameidata *nd)
+		struct dentry *dentry, unsigned int flags)
 {
 	struct dentry *de;
 	struct net *net;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index fda69fa3909..dfafeb2b05a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -433,7 +433,7 @@ static struct ctl_table_header *grab_header(struct inode *inode)
 }
 
 static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
-					struct nameidata *nd)
+					unsigned int flags)
 {
 	struct ctl_table_header *head = grab_header(dir);
 	struct ctl_table_header *h = NULL;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 7c30fce037c..568b20290c7 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -200,13 +200,12 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 	return 0;
 }
 
-static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
 {
-	if (!proc_lookup(dir, dentry, nd)) {
+	if (!proc_lookup(dir, dentry, flags))
 		return NULL;
-	}
 	
-	return proc_pid_lookup(dir, dentry, nd);
+	return proc_pid_lookup(dir, dentry, flags);
 }
 
 static int proc_root_readdir(struct file * filp,
-- 
cgit v1.2.3


From 408ef013cc9e2f94a14f7ccbbe52ddfb18437a99 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 18 Jun 2012 10:47:03 -0400
Subject: fs: move path_put on failure out of ->follow_link

Currently the non-nd_set_link based versions of ->follow_link are expected
to do a path_put(&nd->path) on failure.  This calling convention is unexpected,
undocumented and doesn't match what the nd_set_link-based instances do.

Move the path_put out of the only non-nd_set_link based ->follow_link
instance into the caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8eaa5ea1c61..3bd5ac1ff01 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1427,16 +1427,20 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
+	struct path path;
 	int error = -EACCES;
 
-	/* We don't need a base pointer in the /proc filesystem */
-	path_put(&nd->path);
-
 	/* Are we allowed to snoop on the tasks file descriptors? */
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
+	if (error)
+		goto out;
+
+	path_put(&nd->path);
+	nd->path = path;
+	return NULL;
 out:
 	return ERR_PTR(error);
 }
-- 
cgit v1.2.3


From b5fb63c18315c5510c1d0636179c057e0c761c77 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 18 Jun 2012 10:47:04 -0400
Subject: fs: add nd_jump_link

Add a helper that abstracts out the jump to an already parsed struct path
from ->follow_link operation from procfs.  Not only does this clean up
the code by moving the two sides of this game into a single helper, but
it also prepares for making struct nameidata private to namei.c

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3bd5ac1ff01..2772208338f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1438,8 +1438,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (error)
 		goto out;
 
-	path_put(&nd->path);
-	nd->path = path;
+	nd_jump_link(nd, &path);
 	return NULL;
 out:
 	return ERR_PTR(error);
-- 
cgit v1.2.3


From 9249e17fe094d853d1ef7475dd559a2cc7e23d42 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 25 Jun 2012 12:55:37 +0100
Subject: VFS: Pass mount flags to sget()

Pass mount flags to sget() so that it can use them in initialising a new
superblock before the set function is called.  They could also be passed to the
compare function.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/root.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 568b20290c7..9a2d9fd7cad 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -111,7 +111,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		options = data;
 	}
 
-	sb = sget(fs_type, proc_test_super, proc_set_super, ns);
+	sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
 	if (IS_ERR(sb))
 		return ERR_CAST(sb);
 
@@ -121,7 +121,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	}
 
 	if (!sb->s_root) {
-		sb->s_flags = flags;
 		err = proc_fill_super(sb);
 		if (err) {
 			deactivate_locked_super(sb);
-- 
cgit v1.2.3


From e8905ec27e2f4ea1b9f7e03df68a060b3ae6fca8 Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Mon, 30 Jul 2012 14:42:26 -0700
Subject: proc: environ_read() make sure offset points to environment address
 range

Currently the following offset and environment address range check in
environ_read() of /proc/<pid>/environ is buggy:

  int this_len = mm->env_end - (mm->env_start + src);
  if (this_len <= 0)
    break;

Large or negative offsets on /proc/<pid>/environ converted to 'unsigned
long' may pass this check since '(mm->env_start + src)' can overflow and
'this_len' will be positive.

This can turn /proc/<pid>/environ to act like /proc/<pid>/mem since
(mm->env_start + src) will point and read from another VMA.

There are two fixes here plus some code cleaning:

1) Fix the overflow by checking if the offset that was converted to
   unsigned long will always point to the [mm->env_start, mm->env_end]
   address range.

2) Remove the truncation that was made to the result of the check,
   storing the result in 'int this_len' will alter its value and we can
   not depend on it.

For kernels that have commit b409e578d ("proc: clean up
/proc/<pid>/environ handling") which adds the appropriate ptrace check and
saves the 'mm' at ->open() time, this is not a security issue.

This patch is taken from the grsecurity patch since it was just made
available.

Signed-off-by: Djalal Harouni <tixxdz@opendz.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Brad Spengler <spender@grsecurity.net>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2772208338f..39ee093b5e9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -827,15 +827,16 @@ static ssize_t environ_read(struct file *file, char __user *buf,
 	if (!atomic_inc_not_zero(&mm->mm_users))
 		goto free;
 	while (count > 0) {
-		int this_len, retval, max_len;
+		size_t this_len, max_len;
+		int retval;
 
-		this_len = mm->env_end - (mm->env_start + src);
-
-		if (this_len <= 0)
+		if (src >= (mm->env_end - mm->env_start))
 			break;
 
-		max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
-		this_len = (this_len > max_len) ? max_len : this_len;
+		this_len = mm->env_end - (mm->env_start + src);
+
+		max_len = min_t(size_t, PAGE_SIZE, count);
+		this_len = min(max_len, this_len);
 
 		retval = access_remote_vm(mm, (mm->env_start + src),
 			page, this_len, 0);
-- 
cgit v1.2.3


From bc452b4b65bd589083a7a7ba4f14f85dfc8454fa Mon Sep 17 00:00:00 2001
From: Djalal Harouni <tixxdz@opendz.org>
Date: Mon, 30 Jul 2012 14:42:28 -0700
Subject: proc: do not allow negative offsets on /proc/<pid>/environ

__mem_open() which is called by both /proc/<pid>/environ and
/proc/<pid>/mem ->open() handlers will allow the use of negative offsets.
/proc/<pid>/mem has negative offsets but not /proc/<pid>/environ.

Clean this by moving the 'force FMODE_UNSIGNED_OFFSET flag' to mem_open()
to allow negative offsets only on /proc/<pid>/mem.

Signed-off-by: Djalal Harouni <tixxdz@opendz.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Brad Spengler <spender@grsecurity.net>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 39ee093b5e9..1b6c84cbdb7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -695,8 +695,6 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
 		mmput(mm);
 	}
 
-	/* OK to pass negative loff_t, we can catch out-of-range */
-	file->f_mode |= FMODE_UNSIGNED_OFFSET;
 	file->private_data = mm;
 
 	return 0;
@@ -704,7 +702,12 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
 
 static int mem_open(struct inode *inode, struct file *file)
 {
-	return __mem_open(inode, file, PTRACE_MODE_ATTACH);
+	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
+
+	/* OK to pass negative loff_t, we can catch out-of-range */
+	file->f_mode |= FMODE_UNSIGNED_OFFSET;
+
+	return ret;
 }
 
 static ssize_t mem_rw(struct file *file, char __user *buf,
-- 
cgit v1.2.3


From 6bf6104573482570f7103d3e5ddf9574db43a363 Mon Sep 17 00:00:00 2001
From: Francesco Ruggeri <fruggeri@aristanetworks.com>
Date: Thu, 13 Sep 2012 15:03:37 -0700
Subject: fs/proc: fix potential unregister_sysctl_table hang

The unregister_sysctl_table() function hangs if all references to its
ctl_table_header structure are not dropped.

This can happen sometimes because of a leak in proc_sys_lookup():
proc_sys_lookup() gets a reference to the table via lookup_entry(), but
it does not release it when a subsequent call to sysctl_follow_link()
fails.

This patch fixes this leak by making sure the reference is always
dropped on return.

See also commit 076c3eed2c31 ("sysctl: Rewrite proc_sys_lookup
introducing find_entry and lookup_entry") which reorganized this code in
3.4.

Tested in Linux 3.4.4.

Signed-off-by: Francesco Ruggeri <fruggeri@aristanetworks.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_sysctl.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index dfafeb2b05a..eb7cc91b725 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -462,9 +462,6 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 
 	err = ERR_PTR(-ENOMEM);
 	inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
-	if (h)
-		sysctl_head_finish(h);
-
 	if (!inode)
 		goto out;
 
@@ -473,6 +470,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 	d_add(dentry, inode);
 
 out:
+	if (h)
+		sysctl_head_finish(h);
 	sysctl_head_finish(head);
 	return err;
 }
-- 
cgit v1.2.3


From e1760bd5ffae8cb98cffb030ee8e631eba28f3d8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 10 Sep 2012 22:39:43 -0700
Subject: userns: Convert the audit loginuid  to be a kuid

Always store audit loginuids in type kuid_t.

Print loginuids by converting them into uids in the appropriate user
namespace, and then printing the resulting uid.

Modify audit_get_loginuid to return a kuid_t.

Modify audit_set_loginuid to take a kuid_t.

Modify /proc/<pid>/loginuid on read to convert the loginuid into the
user namespace of the opener of the file.

Modify /proc/<pid>/loginud on write to convert the loginuid
rom the user namespace of the opener of the file.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric Paris <eparis@redhat.com>
Cc: Paul Moore <paul@paul-moore.com> ?
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/base.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b6c84cbdb7..138cff4b05d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1089,7 +1089,8 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
 	if (!task)
 		return -ESRCH;
 	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
-				audit_get_loginuid(task));
+			   from_kuid(file->f_cred->user_ns,
+				     audit_get_loginuid(task)));
 	put_task_struct(task);
 	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
 }
@@ -1101,6 +1102,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	char *page, *tmp;
 	ssize_t length;
 	uid_t loginuid;
+	kuid_t kloginuid;
 
 	rcu_read_lock();
 	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1130,7 +1132,13 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 		goto out_free_page;
 
 	}
-	length = audit_set_loginuid(loginuid);
+	kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
+	if (!uid_valid(kloginuid)) {
+		length = -EINVAL;
+		goto out_free_page;
+	}
+
+	length = audit_set_loginuid(kloginuid);
 	if (likely(length == 0))
 		length = count;
 
-- 
cgit v1.2.3


From f76d207a66c3a53defea67e7d36c3eb1b7d6d61d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 30 Aug 2012 01:24:05 -0700
Subject: userns: Add kprojid_t and associated infrastructure in projid.h

Implement kprojid_t a cousin of the kuid_t and kgid_t.

The per user namespace mapping of project id values can be set with
/proc/<pid>/projid_map.

A full compliment of helpers is provided: make_kprojid, from_kprojid,
from_kprojid_munged, kporjid_has_mapping, projid_valid, projid_eq,
projid_eq, projid_lt.

Project identifiers are part of the generic disk quota interface,
although it appears only xfs implements project identifiers currently.

The xfs code allows anyone who has permission to set the project
identifier on a file to use any project identifier so when
setting up the user namespace project identifier mappings I do
not require a capability.

Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/base.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 138cff4b05d..acd1960c28a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2991,6 +2991,11 @@ static int proc_gid_map_open(struct inode *inode, struct file *file)
 	return proc_id_map_open(inode, file, &proc_gid_seq_operations);
 }
 
+static int proc_projid_map_open(struct inode *inode, struct file *file)
+{
+	return proc_id_map_open(inode, file, &proc_projid_seq_operations);
+}
+
 static const struct file_operations proc_uid_map_operations = {
 	.open		= proc_uid_map_open,
 	.write		= proc_uid_map_write,
@@ -3006,6 +3011,14 @@ static const struct file_operations proc_gid_map_operations = {
 	.llseek		= seq_lseek,
 	.release	= proc_id_map_release,
 };
+
+static const struct file_operations proc_projid_map_operations = {
+	.open		= proc_projid_map_open,
+	.write		= proc_projid_map_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= proc_id_map_release,
+};
 #endif /* CONFIG_USER_NS */
 
 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
@@ -3113,6 +3126,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_USER_NS
 	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
 	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
+	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
 #endif
 };
 
@@ -3476,6 +3490,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_USER_NS
 	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
 	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
+	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
 #endif
 };
 
-- 
cgit v1.2.3


From faf60af17f8da87e1c87a6be527344791025ce9e Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 23 Aug 2012 14:43:24 +0400
Subject: procfs: Move /proc/pid/fd[info] handling code to fd.[ch]

This patch prepares the ground for further extension of
/proc/pid/fd[info] handling code by moving fdinfo handling
code into fs/proc/fd.c.

I think such move makes both fs/proc/base.c and fs/proc/fd.c
easier to read.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: James Bottomley <jbottomley@parallels.com>
CC: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Matthew Helsley <matt.helsley@gmail.com>
CC: "J. Bruce Fields" <bfields@fieldses.org>
CC: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/Makefile   |   2 +-
 fs/proc/base.c     | 388 +----------------------------------------------------
 fs/proc/fd.c       | 351 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/fd.h       |  14 ++
 fs/proc/internal.h |  48 +++++++
 5 files changed, 416 insertions(+), 387 deletions(-)
 create mode 100644 fs/proc/fd.c
 create mode 100644 fs/proc/fd.h

(limited to 'fs/proc')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index c1c72933592..99349efbbc2 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -8,7 +8,7 @@ proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
-		proc_tty.o
+		proc_tty.o fd.o
 proc-y	+= cmdline.o
 proc-y	+= consoles.o
 proc-y	+= cpuinfo.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b6c84cbdb7..b55c3bb298e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -90,6 +90,7 @@
 #endif
 #include <trace/events/oom.h>
 #include "internal.h"
+#include "fd.h"
 
 /* NOTE:
  *	Implementing inode permission operations in /proc is almost
@@ -136,8 +137,6 @@ struct pid_entry {
 		NULL, &proc_single_file_operations,	\
 		{ .proc_show = show } )
 
-static int proc_fd_permission(struct inode *inode, int mask);
-
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
@@ -1492,7 +1491,7 @@ out:
 	return error;
 }
 
-static const struct inode_operations proc_pid_link_inode_operations = {
+const struct inode_operations proc_pid_link_inode_operations = {
 	.readlink	= proc_pid_readlink,
 	.follow_link	= proc_pid_follow_link,
 	.setattr	= proc_setattr,
@@ -1501,21 +1500,6 @@ static const struct inode_operations proc_pid_link_inode_operations = {
 
 /* building an inode */
 
-static int task_dumpable(struct task_struct *task)
-{
-	int dumpable = 0;
-	struct mm_struct *mm;
-
-	task_lock(task);
-	mm = task->mm;
-	if (mm)
-		dumpable = get_dumpable(mm);
-	task_unlock(task);
-	if(dumpable == 1)
-		return 1;
-	return 0;
-}
-
 struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
 {
 	struct inode * inode;
@@ -1641,15 +1625,6 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
 	return 0;
 }
 
-static int pid_delete_dentry(const struct dentry * dentry)
-{
-	/* Is the task we represent dead?
-	 * If so, then don't put the dentry on the lru list,
-	 * kill it immediately.
-	 */
-	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
-}
-
 const struct dentry_operations pid_dentry_operations =
 {
 	.d_revalidate	= pid_revalidate,
@@ -1712,289 +1687,6 @@ end_instantiate:
 	return filldir(dirent, name, len, filp->f_pos, ino, type);
 }
 
-static unsigned name_to_int(struct dentry *dentry)
-{
-	const char *name = dentry->d_name.name;
-	int len = dentry->d_name.len;
-	unsigned n = 0;
-
-	if (len > 1 && *name == '0')
-		goto out;
-	while (len-- > 0) {
-		unsigned c = *name++ - '0';
-		if (c > 9)
-			goto out;
-		if (n >= (~0U-9)/10)
-			goto out;
-		n *= 10;
-		n += c;
-	}
-	return n;
-out:
-	return ~0U;
-}
-
-#define PROC_FDINFO_MAX 64
-
-static int proc_fd_info(struct inode *inode, struct path *path, char *info)
-{
-	struct task_struct *task = get_proc_task(inode);
-	struct files_struct *files = NULL;
-	struct file *file;
-	int fd = proc_fd(inode);
-
-	if (task) {
-		files = get_files_struct(task);
-		put_task_struct(task);
-	}
-	if (files) {
-		/*
-		 * We are not taking a ref to the file structure, so we must
-		 * hold ->file_lock.
-		 */
-		spin_lock(&files->file_lock);
-		file = fcheck_files(files, fd);
-		if (file) {
-			unsigned int f_flags;
-			struct fdtable *fdt;
-
-			fdt = files_fdtable(files);
-			f_flags = file->f_flags & ~O_CLOEXEC;
-			if (close_on_exec(fd, fdt))
-				f_flags |= O_CLOEXEC;
-
-			if (path) {
-				*path = file->f_path;
-				path_get(&file->f_path);
-			}
-			if (info)
-				snprintf(info, PROC_FDINFO_MAX,
-					 "pos:\t%lli\n"
-					 "flags:\t0%o\n",
-					 (long long) file->f_pos,
-					 f_flags);
-			spin_unlock(&files->file_lock);
-			put_files_struct(files);
-			return 0;
-		}
-		spin_unlock(&files->file_lock);
-		put_files_struct(files);
-	}
-	return -ENOENT;
-}
-
-static int proc_fd_link(struct dentry *dentry, struct path *path)
-{
-	return proc_fd_info(dentry->d_inode, path, NULL);
-}
-
-static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	struct inode *inode;
-	struct task_struct *task;
-	int fd;
-	struct files_struct *files;
-	const struct cred *cred;
-
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	inode = dentry->d_inode;
-	task = get_proc_task(inode);
-	fd = proc_fd(inode);
-
-	if (task) {
-		files = get_files_struct(task);
-		if (files) {
-			struct file *file;
-			rcu_read_lock();
-			file = fcheck_files(files, fd);
-			if (file) {
-				unsigned f_mode = file->f_mode;
-
-				rcu_read_unlock();
-				put_files_struct(files);
-
-				if (task_dumpable(task)) {
-					rcu_read_lock();
-					cred = __task_cred(task);
-					inode->i_uid = cred->euid;
-					inode->i_gid = cred->egid;
-					rcu_read_unlock();
-				} else {
-					inode->i_uid = GLOBAL_ROOT_UID;
-					inode->i_gid = GLOBAL_ROOT_GID;
-				}
-
-				if (S_ISLNK(inode->i_mode)) {
-					unsigned i_mode = S_IFLNK;
-					if (f_mode & FMODE_READ)
-						i_mode |= S_IRUSR | S_IXUSR;
-					if (f_mode & FMODE_WRITE)
-						i_mode |= S_IWUSR | S_IXUSR;
-					inode->i_mode = i_mode;
-				}
-
-				security_task_to_inode(task, inode);
-				put_task_struct(task);
-				return 1;
-			}
-			rcu_read_unlock();
-			put_files_struct(files);
-		}
-		put_task_struct(task);
-	}
-	d_drop(dentry);
-	return 0;
-}
-
-static const struct dentry_operations tid_fd_dentry_operations =
-{
-	.d_revalidate	= tid_fd_revalidate,
-	.d_delete	= pid_delete_dentry,
-};
-
-static struct dentry *proc_fd_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, const void *ptr)
-{
-	unsigned fd = (unsigned long)ptr;
- 	struct inode *inode;
- 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-ENOENT);
-
-	inode = proc_pid_make_inode(dir->i_sb, task);
-	if (!inode)
-		goto out;
-	ei = PROC_I(inode);
-	ei->fd = fd;
-
-	inode->i_mode = S_IFLNK;
-	inode->i_op = &proc_pid_link_inode_operations;
-	inode->i_size = 64;
-	ei->op.proc_get_link = proc_fd_link;
-	d_set_d_op(dentry, &tid_fd_dentry_operations);
-	d_add(dentry, inode);
-	/* Close the race of the process dying before we return the dentry */
-	if (tid_fd_revalidate(dentry, 0))
-		error = NULL;
-
- out:
-	return error;
-}
-
-static struct dentry *proc_lookupfd_common(struct inode *dir,
-					   struct dentry *dentry,
-					   instantiate_t instantiate)
-{
-	struct task_struct *task = get_proc_task(dir);
-	unsigned fd = name_to_int(dentry);
-	struct dentry *result = ERR_PTR(-ENOENT);
-
-	if (!task)
-		goto out_no_task;
-	if (fd == ~0U)
-		goto out;
-
-	result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
-out:
-	put_task_struct(task);
-out_no_task:
-	return result;
-}
-
-static int proc_readfd_common(struct file * filp, void * dirent,
-			      filldir_t filldir, instantiate_t instantiate)
-{
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = get_proc_task(inode);
-	unsigned int fd, ino;
-	int retval;
-	struct files_struct * files;
-
-	retval = -ENOENT;
-	if (!p)
-		goto out_no_task;
-	retval = 0;
-
-	fd = filp->f_pos;
-	switch (fd) {
-		case 0:
-			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
-				goto out;
-			filp->f_pos++;
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
-				goto out;
-			filp->f_pos++;
-		default:
-			files = get_files_struct(p);
-			if (!files)
-				goto out;
-			rcu_read_lock();
-			for (fd = filp->f_pos-2;
-			     fd < files_fdtable(files)->max_fds;
-			     fd++, filp->f_pos++) {
-				char name[PROC_NUMBUF];
-				int len;
-				int rv;
-
-				if (!fcheck_files(files, fd))
-					continue;
-				rcu_read_unlock();
-
-				len = snprintf(name, sizeof(name), "%d", fd);
-				rv = proc_fill_cache(filp, dirent, filldir,
-						     name, len, instantiate, p,
-						     (void *)(unsigned long)fd);
-				if (rv < 0)
-					goto out_fd_loop;
-				rcu_read_lock();
-			}
-			rcu_read_unlock();
-out_fd_loop:
-			put_files_struct(files);
-	}
-out:
-	put_task_struct(p);
-out_no_task:
-	return retval;
-}
-
-static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
-				    unsigned int flags)
-{
-	return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
-}
-
-static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
-{
-	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
-}
-
-static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
-				      size_t len, loff_t *ppos)
-{
-	char tmp[PROC_FDINFO_MAX];
-	int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
-	if (!err)
-		err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
-	return err;
-}
-
-static const struct file_operations proc_fdinfo_file_operations = {
-	.open           = nonseekable_open,
-	.read		= proc_fdinfo_read,
-	.llseek		= no_llseek,
-};
-
-static const struct file_operations proc_fd_operations = {
-	.read		= generic_read_dir,
-	.readdir	= proc_readfd,
-	.llseek		= default_llseek,
-};
-
 #ifdef CONFIG_CHECKPOINT_RESTORE
 
 /*
@@ -2337,82 +2029,6 @@ static const struct file_operations proc_map_files_operations = {
 
 #endif /* CONFIG_CHECKPOINT_RESTORE */
 
-/*
- * /proc/pid/fd needs a special permission handler so that a process can still
- * access /proc/self/fd after it has executed a setuid().
- */
-static int proc_fd_permission(struct inode *inode, int mask)
-{
-	int rv = generic_permission(inode, mask);
-	if (rv == 0)
-		return 0;
-	if (task_pid(current) == proc_pid(inode))
-		rv = 0;
-	return rv;
-}
-
-/*
- * proc directories can do almost nothing..
- */
-static const struct inode_operations proc_fd_inode_operations = {
-	.lookup		= proc_lookupfd,
-	.permission	= proc_fd_permission,
-	.setattr	= proc_setattr,
-};
-
-static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, const void *ptr)
-{
-	unsigned fd = (unsigned long)ptr;
- 	struct inode *inode;
- 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-ENOENT);
-
-	inode = proc_pid_make_inode(dir->i_sb, task);
-	if (!inode)
-		goto out;
-	ei = PROC_I(inode);
-	ei->fd = fd;
-	inode->i_mode = S_IFREG | S_IRUSR;
-	inode->i_fop = &proc_fdinfo_file_operations;
-	d_set_d_op(dentry, &tid_fd_dentry_operations);
-	d_add(dentry, inode);
-	/* Close the race of the process dying before we return the dentry */
-	if (tid_fd_revalidate(dentry, 0))
-		error = NULL;
-
- out:
-	return error;
-}
-
-static struct dentry *proc_lookupfdinfo(struct inode *dir,
-					struct dentry *dentry,
-					unsigned int flags)
-{
-	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
-}
-
-static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
-{
-	return proc_readfd_common(filp, dirent, filldir,
-				  proc_fdinfo_instantiate);
-}
-
-static const struct file_operations proc_fdinfo_operations = {
-	.read		= generic_read_dir,
-	.readdir	= proc_readfdinfo,
-	.llseek		= default_llseek,
-};
-
-/*
- * proc directories can do almost nothing..
- */
-static const struct inode_operations proc_fdinfo_inode_operations = {
-	.lookup		= proc_lookupfdinfo,
-	.setattr	= proc_setattr,
-};
-
-
 static struct dentry *proc_pident_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
new file mode 100644
index 00000000000..1b0f932b4bd
--- /dev/null
+++ b/fs/proc/fd.c
@@ -0,0 +1,351 @@
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/dcache.h>
+#include <linux/path.h>
+#include <linux/fdtable.h>
+#include <linux/namei.h>
+#include <linux/pid.h>
+#include <linux/security.h>
+
+#include <linux/proc_fs.h>
+
+#include "internal.h"
+#include "fd.h"
+
+#define PROC_FDINFO_MAX 64
+
+static int proc_fd_info(struct inode *inode, struct path *path, char *info)
+{
+	struct task_struct *task = get_proc_task(inode);
+	struct files_struct *files = NULL;
+	int fd = proc_fd(inode);
+	struct file *file;
+
+	if (task) {
+		files = get_files_struct(task);
+		put_task_struct(task);
+	}
+	if (files) {
+		/*
+		 * We are not taking a ref to the file structure, so we must
+		 * hold ->file_lock.
+		 */
+		spin_lock(&files->file_lock);
+		file = fcheck_files(files, fd);
+		if (file) {
+			unsigned int f_flags;
+			struct fdtable *fdt;
+
+			fdt = files_fdtable(files);
+			f_flags = file->f_flags & ~O_CLOEXEC;
+			if (close_on_exec(fd, fdt))
+				f_flags |= O_CLOEXEC;
+
+			if (path) {
+				*path = file->f_path;
+				path_get(&file->f_path);
+			}
+			if (info)
+				snprintf(info, PROC_FDINFO_MAX,
+					 "pos:\t%lli\n"
+					 "flags:\t0%o\n",
+					 (long long) file->f_pos,
+					 f_flags);
+			spin_unlock(&files->file_lock);
+			put_files_struct(files);
+			return 0;
+		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	}
+	return -ENOENT;
+}
+
+static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct files_struct *files;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct inode *inode;
+	int fd;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	fd = proc_fd(inode);
+
+	if (task) {
+		files = get_files_struct(task);
+		if (files) {
+			struct file *file;
+
+			rcu_read_lock();
+			file = fcheck_files(files, fd);
+			if (file) {
+				unsigned f_mode = file->f_mode;
+
+				rcu_read_unlock();
+				put_files_struct(files);
+
+				if (task_dumpable(task)) {
+					rcu_read_lock();
+					cred = __task_cred(task);
+					inode->i_uid = cred->euid;
+					inode->i_gid = cred->egid;
+					rcu_read_unlock();
+				} else {
+					inode->i_uid = GLOBAL_ROOT_UID;
+					inode->i_gid = GLOBAL_ROOT_GID;
+				}
+
+				if (S_ISLNK(inode->i_mode)) {
+					unsigned i_mode = S_IFLNK;
+					if (f_mode & FMODE_READ)
+						i_mode |= S_IRUSR | S_IXUSR;
+					if (f_mode & FMODE_WRITE)
+						i_mode |= S_IWUSR | S_IXUSR;
+					inode->i_mode = i_mode;
+				}
+
+				security_task_to_inode(task, inode);
+				put_task_struct(task);
+				return 1;
+			}
+			rcu_read_unlock();
+			put_files_struct(files);
+		}
+		put_task_struct(task);
+	}
+
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_fd_dentry_operations = {
+	.d_revalidate	= tid_fd_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_fd_link(struct dentry *dentry, struct path *path)
+{
+	return proc_fd_info(dentry->d_inode, path, NULL);
+}
+
+static struct dentry *
+proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
+		    struct task_struct *task, const void *ptr)
+{
+	struct dentry *error = ERR_PTR(-ENOENT);
+	unsigned fd = (unsigned long)ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	ei = PROC_I(inode);
+	ei->fd = fd;
+
+	inode->i_mode = S_IFLNK;
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+
+	ei->op.proc_get_link = proc_fd_link;
+
+	d_set_d_op(dentry, &tid_fd_dentry_operations);
+	d_add(dentry, inode);
+
+	/* Close the race of the process dying before we return the dentry */
+	if (tid_fd_revalidate(dentry, 0))
+		error = NULL;
+ out:
+	return error;
+}
+
+static struct dentry *proc_lookupfd_common(struct inode *dir,
+					   struct dentry *dentry,
+					   instantiate_t instantiate)
+{
+	struct task_struct *task = get_proc_task(dir);
+	struct dentry *result = ERR_PTR(-ENOENT);
+	unsigned fd = name_to_int(dentry);
+
+	if (!task)
+		goto out_no_task;
+	if (fd == ~0U)
+		goto out;
+
+	result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
+out:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static int proc_readfd_common(struct file * filp, void * dirent,
+			      filldir_t filldir, instantiate_t instantiate)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *p = get_proc_task(inode);
+	struct files_struct *files;
+	unsigned int fd, ino;
+	int retval;
+
+	retval = -ENOENT;
+	if (!p)
+		goto out_no_task;
+	retval = 0;
+
+	fd = filp->f_pos;
+	switch (fd) {
+		case 0:
+			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+				goto out;
+			filp->f_pos++;
+		case 1:
+			ino = parent_ino(dentry);
+			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+				goto out;
+			filp->f_pos++;
+		default:
+			files = get_files_struct(p);
+			if (!files)
+				goto out;
+			rcu_read_lock();
+			for (fd = filp->f_pos - 2;
+			     fd < files_fdtable(files)->max_fds;
+			     fd++, filp->f_pos++) {
+				char name[PROC_NUMBUF];
+				int len;
+				int rv;
+
+				if (!fcheck_files(files, fd))
+					continue;
+				rcu_read_unlock();
+
+				len = snprintf(name, sizeof(name), "%d", fd);
+				rv = proc_fill_cache(filp, dirent, filldir,
+						     name, len, instantiate, p,
+						     (void *)(unsigned long)fd);
+				if (rv < 0)
+					goto out_fd_loop;
+				rcu_read_lock();
+			}
+			rcu_read_unlock();
+out_fd_loop:
+			put_files_struct(files);
+	}
+out:
+	put_task_struct(p);
+out_no_task:
+	return retval;
+}
+
+static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
+				size_t len, loff_t *ppos)
+{
+	char tmp[PROC_FDINFO_MAX];
+	int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
+	if (!err)
+		err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
+	return err;
+}
+
+static const struct file_operations proc_fdinfo_file_operations = {
+	.open           = nonseekable_open,
+	.read		= proc_fdinfo_read,
+	.llseek		= no_llseek,
+};
+
+static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
+{
+	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
+}
+
+const struct file_operations proc_fd_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_readfd,
+	.llseek		= default_llseek,
+};
+
+static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
+				    unsigned int flags)
+{
+	return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
+}
+
+/*
+ * /proc/pid/fd needs a special permission handler so that a process can still
+ * access /proc/self/fd after it has executed a setuid().
+ */
+int proc_fd_permission(struct inode *inode, int mask)
+{
+	int rv = generic_permission(inode, mask);
+	if (rv == 0)
+		return 0;
+	if (task_pid(current) == proc_pid(inode))
+		rv = 0;
+	return rv;
+}
+
+const struct inode_operations proc_fd_inode_operations = {
+	.lookup		= proc_lookupfd,
+	.permission	= proc_fd_permission,
+	.setattr	= proc_setattr,
+};
+
+static struct dentry *
+proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
+			struct task_struct *task, const void *ptr)
+{
+	struct dentry *error = ERR_PTR(-ENOENT);
+	unsigned fd = (unsigned long)ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	ei = PROC_I(inode);
+	ei->fd = fd;
+
+	inode->i_mode = S_IFREG | S_IRUSR;
+	inode->i_fop = &proc_fdinfo_file_operations;
+
+	d_set_d_op(dentry, &tid_fd_dentry_operations);
+	d_add(dentry, inode);
+
+	/* Close the race of the process dying before we return the dentry */
+	if (tid_fd_revalidate(dentry, 0))
+		error = NULL;
+ out:
+	return error;
+}
+
+static struct dentry *
+proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
+}
+
+static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
+{
+	return proc_readfd_common(filp, dirent, filldir,
+				  proc_fdinfo_instantiate);
+}
+
+const struct inode_operations proc_fdinfo_inode_operations = {
+	.lookup		= proc_lookupfdinfo,
+	.setattr	= proc_setattr,
+};
+
+const struct file_operations proc_fdinfo_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_readfdinfo,
+	.llseek		= default_llseek,
+};
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
new file mode 100644
index 00000000000..cbb1d47deda
--- /dev/null
+++ b/fs/proc/fd.h
@@ -0,0 +1,14 @@
+#ifndef __PROCFS_FD_H__
+#define __PROCFS_FD_H__
+
+#include <linux/fs.h>
+
+extern const struct file_operations proc_fd_operations;
+extern const struct inode_operations proc_fd_inode_operations;
+
+extern const struct file_operations proc_fdinfo_operations;
+extern const struct inode_operations proc_fdinfo_inode_operations;
+
+extern int proc_fd_permission(struct inode *inode, int mask);
+
+#endif /* __PROCFS_FD_H__ */
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index e1167a1c912..67925a7bd8c 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/sched.h>
 #include <linux/proc_fs.h>
 struct  ctl_table_header;
 
@@ -65,6 +66,7 @@ extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
 extern const struct file_operations proc_net_operations;
 extern const struct inode_operations proc_net_inode_operations;
+extern const struct inode_operations proc_pid_link_inode_operations;
 
 struct proc_maps_private {
 	struct pid *pid;
@@ -91,6 +93,52 @@ static inline int proc_fd(struct inode *inode)
 	return PROC_I(inode)->fd;
 }
 
+static inline int task_dumpable(struct task_struct *task)
+{
+	int dumpable = 0;
+	struct mm_struct *mm;
+
+	task_lock(task);
+	mm = task->mm;
+	if (mm)
+		dumpable = get_dumpable(mm);
+	task_unlock(task);
+	if(dumpable == 1)
+		return 1;
+	return 0;
+}
+
+static inline int pid_delete_dentry(const struct dentry * dentry)
+{
+	/* Is the task we represent dead?
+	 * If so, then don't put the dentry on the lru list,
+	 * kill it immediately.
+	 */
+	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
+}
+
+static inline unsigned name_to_int(struct dentry *dentry)
+{
+	const char *name = dentry->d_name.name;
+	int len = dentry->d_name.len;
+	unsigned n = 0;
+
+	if (len > 1 && *name == '0')
+		goto out;
+	while (len-- > 0) {
+		unsigned c = *name++ - '0';
+		if (c > 9)
+			goto out;
+		if (n >= (~0U-9)/10)
+			goto out;
+		n *= 10;
+		n += c;
+	}
+	return n;
+out:
+	return ~0U;
+}
+
 struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
 		struct dentry *dentry);
 int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
-- 
cgit v1.2.3


From ddd3e0771bc7b869c550687c204e21f0155d5496 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 26 Aug 2012 18:28:20 +0400
Subject: procfs: Convert /proc/pid/fdinfo/ handling routines to seq-file v2

This patch converts /proc/pid/fdinfo/ handling routines to seq-file which
is needed to extend seq operations and plug in auxiliary fdinfo provides
from subsystems like eventfd/eventpoll/fsnotify.

Note the proc_fd_link no longer call for proc_fd_info, simply because
the guts of proc_fd_info() got merged into ->show() of that seq_file

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/fd.c | 112 ++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 64 insertions(+), 48 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 1b0f932b4bd..9cef449c0f7 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -6,61 +6,68 @@
 #include <linux/namei.h>
 #include <linux/pid.h>
 #include <linux/security.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
 
 #include <linux/proc_fs.h>
 
 #include "internal.h"
 #include "fd.h"
 
-#define PROC_FDINFO_MAX 64
-
-static int proc_fd_info(struct inode *inode, struct path *path, char *info)
+static int seq_show(struct seq_file *m, void *v)
 {
-	struct task_struct *task = get_proc_task(inode);
 	struct files_struct *files = NULL;
-	int fd = proc_fd(inode);
-	struct file *file;
+	int f_flags = 0, ret = -ENOENT;
+	struct file *file = NULL;
+	struct task_struct *task;
+
+	task = get_proc_task(m->private);
+	if (!task)
+		return -ENOENT;
+
+	files = get_files_struct(task);
+	put_task_struct(task);
 
-	if (task) {
-		files = get_files_struct(task);
-		put_task_struct(task);
-	}
 	if (files) {
-		/*
-		 * We are not taking a ref to the file structure, so we must
-		 * hold ->file_lock.
-		 */
+		int fd = proc_fd(m->private);
+
 		spin_lock(&files->file_lock);
 		file = fcheck_files(files, fd);
 		if (file) {
-			unsigned int f_flags;
-			struct fdtable *fdt;
+			struct fdtable *fdt = files_fdtable(files);
 
-			fdt = files_fdtable(files);
 			f_flags = file->f_flags & ~O_CLOEXEC;
 			if (close_on_exec(fd, fdt))
 				f_flags |= O_CLOEXEC;
 
-			if (path) {
-				*path = file->f_path;
-				path_get(&file->f_path);
-			}
-			if (info)
-				snprintf(info, PROC_FDINFO_MAX,
-					 "pos:\t%lli\n"
-					 "flags:\t0%o\n",
-					 (long long) file->f_pos,
-					 f_flags);
-			spin_unlock(&files->file_lock);
-			put_files_struct(files);
-			return 0;
+			get_file(file);
+			ret = 0;
 		}
 		spin_unlock(&files->file_lock);
 		put_files_struct(files);
 	}
-	return -ENOENT;
+
+	if (!ret) {
+                seq_printf(m, "pos:\t%lli\nflags:\t0%o\n",
+			   (long long)file->f_pos, f_flags);
+		fput(file);
+	}
+
+	return ret;
 }
 
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, seq_show, inode);
+}
+
+static const struct file_operations proc_fdinfo_file_operations = {
+	.open		= seq_fdinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct files_struct *files;
@@ -130,7 +137,32 @@ static const struct dentry_operations tid_fd_dentry_operations = {
 
 static int proc_fd_link(struct dentry *dentry, struct path *path)
 {
-	return proc_fd_info(dentry->d_inode, path, NULL);
+	struct files_struct *files = NULL;
+	struct task_struct *task;
+	int ret = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (task) {
+		files = get_files_struct(task);
+		put_task_struct(task);
+	}
+
+	if (files) {
+		int fd = proc_fd(dentry->d_inode);
+		struct file *fd_file;
+
+		spin_lock(&files->file_lock);
+		fd_file = fcheck_files(files, fd);
+		if (fd_file) {
+			*path = fd_file->f_path;
+			path_get(&fd_file->f_path);
+			ret = 0;
+		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	}
+
+	return ret;
 }
 
 static struct dentry *
@@ -245,22 +277,6 @@ out_no_task:
 	return retval;
 }
 
-static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
-				size_t len, loff_t *ppos)
-{
-	char tmp[PROC_FDINFO_MAX];
-	int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
-	if (!err)
-		err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
-	return err;
-}
-
-static const struct file_operations proc_fdinfo_file_operations = {
-	.open           = nonseekable_open,
-	.read		= proc_fdinfo_read,
-	.llseek		= no_llseek,
-};
-
 static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
 {
 	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
-- 
cgit v1.2.3


From c6f3d81115989e274c42a852222b80d2e14ced6f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 11:01:04 -0400
Subject: don't leak O_CLOEXEC into ->f_flags

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/fd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 9cef449c0f7..f28a875f877 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -36,7 +36,7 @@ static int seq_show(struct seq_file *m, void *v)
 		if (file) {
 			struct fdtable *fdt = files_fdtable(files);
 
-			f_flags = file->f_flags & ~O_CLOEXEC;
+			f_flags = file->f_flags;
 			if (close_on_exec(fd, fdt))
 				f_flags |= O_CLOEXEC;
 
-- 
cgit v1.2.3


From cb0942b81249798e15c3f04eee2946ef543e8115 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 14:48:26 -0400
Subject: make get_file() return its argument

simplifies a bunch of callers...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index b55c3bb298e..f1e8438d21b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1979,8 +1979,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				if (++pos <= filp->f_pos)
 					continue;
 
-				get_file(vma->vm_file);
-				info.file = vma->vm_file;
+				info.file = get_file(vma->vm_file);
 				info.len = snprintf(info.name,
 						sizeof(info.name), "%lx-%lx",
 						vma->vm_start, vma->vm_end);
-- 
cgit v1.2.3


From 7b540d0646ce122f0ba4520412be91e530719742 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 14:55:26 -0400
Subject: proc_map_files_readdir(): don't bother with grabbing files

all we need is their ->f_mode, so just collect _that_

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f1e8438d21b..df18db61d6d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1805,7 +1805,7 @@ out:
 }
 
 struct map_files_info {
-	struct file	*file;
+	fmode_t		mode;
 	unsigned long	len;
 	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
 };
@@ -1814,13 +1814,10 @@ static struct dentry *
 proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
 			   struct task_struct *task, const void *ptr)
 {
-	const struct file *file = ptr;
+	fmode_t mode = (fmode_t)(unsigned long)ptr;
 	struct proc_inode *ei;
 	struct inode *inode;
 
-	if (!file)
-		return ERR_PTR(-ENOENT);
-
 	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
@@ -1832,9 +1829,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
 	inode->i_size = 64;
 	inode->i_mode = S_IFLNK;
 
-	if (file->f_mode & FMODE_READ)
+	if (mode & FMODE_READ)
 		inode->i_mode |= S_IRUSR;
-	if (file->f_mode & FMODE_WRITE)
+	if (mode & FMODE_WRITE)
 		inode->i_mode |= S_IWUSR;
 
 	d_set_d_op(dentry, &tid_map_files_dentry_operations);
@@ -1878,7 +1875,8 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
 	if (!vma)
 		goto out_no_vma;
 
-	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+	result = proc_map_files_instantiate(dir, dentry, task,
+			(void *)(unsigned long)vma->vm_file->f_mode);
 
 out_no_vma:
 	up_read(&mm->mmap_sem);
@@ -1979,7 +1977,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				if (++pos <= filp->f_pos)
 					continue;
 
-				info.file = get_file(vma->vm_file);
+				info.mode = vma->vm_file->f_mode;
 				info.len = snprintf(info.name,
 						sizeof(info.name), "%lx-%lx",
 						vma->vm_start, vma->vm_end);
@@ -1994,19 +1992,11 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			ret = proc_fill_cache(filp, dirent, filldir,
 					      p->name, p->len,
 					      proc_map_files_instantiate,
-					      task, p->file);
+					      task,
+					      (void *)(unsigned long)p->mode);
 			if (ret)
 				break;
 			filp->f_pos++;
-			fput(p->file);
-		}
-		for (; i < nr_files; i++) {
-			/*
-			 * In case of error don't forget
-			 * to put rest of file refs.
-			 */
-			p = flex_array_get(fa, i);
-			fput(p->file);
 		}
 		if (fa)
 			flex_array_free(fa);
-- 
cgit v1.2.3


From 0f4cfb2e4e7a7e4e97a3e90e2ba1062f07fb2cb1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 4 Oct 2012 17:15:27 -0700
Subject: coredump: use SUID_DUMPABLE_ENABLED rather than hardcoded 1

Cosmetic. Change setup_new_exec() and task_dumpable() to use
SUID_DUMPABLE_ENABLED for /bin/grep.

[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 67925a7bd8c..cceaab07ad5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -103,7 +103,7 @@ static inline int task_dumpable(struct task_struct *task)
 	if (mm)
 		dumpable = get_dumpable(mm);
 	task_unlock(task);
-	if(dumpable == 1)
+	if (dumpable == SUID_DUMPABLE_ENABLED)
 		return 1;
 	return 0;
 }
-- 
cgit v1.2.3


From 620727506dc6da0562fa4f6950dedb8a51bd8237 Mon Sep 17 00:00:00 2001
From: yan <clouds.yan@gmail.com>
Date: Thu, 4 Oct 2012 17:15:38 -0700
Subject: proc: return -ENOMEM when inode allocation failed

If proc_get_inode() returns NULL then presumably it encountered memory
exhaustion.  proc_lookup_de() should return -ENOMEM in this case, not
-EINVAL.

Signed-off-by: yan <clouds.yan@gmail.com>
Cc: Ryan Mallon <rmallon@gmail.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index b3647fe6a60..9e8f6316430 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -427,7 +427,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 		if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
 			pde_get(de);
 			spin_unlock(&proc_subdir_lock);
-			error = -EINVAL;
+			error = -ENOMEM;
 			inode = proc_get_inode(dir->i_sb, de);
 			goto out_unlock;
 		}
-- 
cgit v1.2.3


From 0e06936057674ef3f2bd0b398a32ddc512642992 Mon Sep 17 00:00:00 2001
From: yan <clouds.yan@gmail.com>
Date: Thu, 4 Oct 2012 17:15:41 -0700
Subject: proc: no need to initialize proc_inode->fd in proc_get_inode()

proc_get_inode() obtains the inode via a call to iget_locked().
iget_locked() calls alloc_inode() which will call proc_alloc_inode() which
clears proc_inode.fd, so there is no need to clear this field in
proc_get_inode().

If iget_locked() instead found the inode via find_inode_fast(), that inode
will not have I_NEW set so this change has no effect.

Signed-off-by: yan <clouds.yan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 7ac817b64a7..3b22bbdee9e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -450,7 +450,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 		return NULL;
 	if (inode->i_state & I_NEW) {
 		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-		PROC_I(inode)->fd = 0;
 		PROC_I(inode)->pde = de;
 
 		if (de->mode) {
-- 
cgit v1.2.3


From 17baa2a2c40f2f0330d25819b589a515d36b2d40 Mon Sep 17 00:00:00 2001
From: yan <clouds.yan@gmail.com>
Date: Thu, 4 Oct 2012 17:15:43 -0700
Subject: proc: use kzalloc instead of kmalloc and memset

Part of the memory will be written twice after this change, but that
should be negligible.

[akpm@linux-foundation.org: fix __proc_create() coding-style issues, remove unneeded zero-initialisations]
Signed-off-by: yan <clouds.yan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 9e8f6316430..0d80cef4cfb 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -605,7 +605,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	unsigned int len;
 
 	/* make sure name is valid */
-	if (!name || !strlen(name)) goto out;
+	if (!name || !strlen(name))
+		goto out;
 
 	if (xlate_proc_name(name, parent, &fn) != 0)
 		goto out;
@@ -616,20 +617,18 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 
 	len = strlen(fn);
 
-	ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
-	if (!ent) goto out;
+	ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
+	if (!ent)
+		goto out;
 
-	memset(ent, 0, sizeof(struct proc_dir_entry));
 	memcpy(ent->name, fn, len + 1);
 	ent->namelen = len;
 	ent->mode = mode;
 	ent->nlink = nlink;
 	atomic_set(&ent->count, 1);
-	ent->pde_users = 0;
 	spin_lock_init(&ent->pde_unload_lock);
-	ent->pde_unload_completion = NULL;
 	INIT_LIST_HEAD(&ent->pde_openers);
- out:
+out:
 	return ent;
 }
 
-- 
cgit v1.2.3


From ab4a1f247003ad146c1a1067b2ba043922d0dff2 Mon Sep 17 00:00:00 2001
From: Prasad Joshi <prasadjoshi.linux@gmail.com>
Date: Thu, 4 Oct 2012 17:15:45 -0700
Subject: proc_sysctl.c: use BUG_ON instead of BUG

The use of if (!head) BUG(); can be replaced with the single line
BUG_ON(!head).

Signed-off-by: Prasad Joshi <prasadjoshi.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_sysctl.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index eb7cc91b725..dcd56f84db7 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -266,8 +266,7 @@ void sysctl_head_put(struct ctl_table_header *head)
 
 static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
 {
-	if (!head)
-		BUG();
+	BUG_ON(!head);
 	spin_lock(&sysctl_lock);
 	if (!use_table(head))
 		head = ERR_PTR(-ENOENT);
-- 
cgit v1.2.3


From 9fb88442105f5fce96ea343927500d6307c8c550 Mon Sep 17 00:00:00 2001
From: Sachin Kamat <sachin.kamat@linaro.org>
Date: Thu, 4 Oct 2012 17:15:46 -0700
Subject: fs/proc/root.c: use NULL instead of 0 for pointer

This cleanup also fixes the following sparse warning:

  fs/proc/root.c:64:45: warning: Using plain integer as NULL pointer

Signed-off-by: Sachin Kamat <sachin.kamat@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/root.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9a2d9fd7cad..9889a92d2e0 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -61,7 +61,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
 		if (!*p)
 			continue;
 
-		args[0].to = args[0].from = 0;
+		args[0].to = args[0].from = NULL;
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_gid:
-- 
cgit v1.2.3


From 314e51b9851b4f4e8ab302243ff5a6fc6147f379 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon, 8 Oct 2012 16:29:02 -0700
Subject: mm: kill vma flag VM_RESERVED and mm->reserved_vm counter

A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA,
currently it lost original meaning but still has some effects:

 | effect                 | alternative flags
-+------------------------+---------------------------------------------
1| account as reserved_vm | VM_IO
2| skip in core dump      | VM_IO, VM_DONTDUMP
3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP
4| do not mlock           | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP

This patch removes reserved_vm counter from mm_struct.  Seems like nobody
cares about it, it does not exported into userspace directly, it only
reduces total_vm showed in proc.

Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP.

remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP.
remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP.

[akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup]
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4540b8f76f1..79827ce03e3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -54,7 +54,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmPTE:\t%8lu kB\n"
 		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
-		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+		total_vm << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
 		mm->pinned_vm << (PAGE_SHIFT-10),
 		hiwater_rss << (PAGE_SHIFT-10),
-- 
cgit v1.2.3


From 01dc52ebdf472f77cca623ca693ca24cfc0f1bbe Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Mon, 8 Oct 2012 16:29:30 -0700
Subject: oom: remove deprecated oom_adj

The deprecated /proc/<pid>/oom_adj is scheduled for removal this month.

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 117 +--------------------------------------------------------
 1 file changed, 1 insertion(+), 116 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d295af99367..ef5c84be66f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -873,111 +873,6 @@ static const struct file_operations proc_environ_operations = {
 	.release	= mem_release,
 };
 
-static ssize_t oom_adjust_read(struct file *file, char __user *buf,
-				size_t count, loff_t *ppos)
-{
-	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	char buffer[PROC_NUMBUF];
-	size_t len;
-	int oom_adjust = OOM_DISABLE;
-	unsigned long flags;
-
-	if (!task)
-		return -ESRCH;
-
-	if (lock_task_sighand(task, &flags)) {
-		oom_adjust = task->signal->oom_adj;
-		unlock_task_sighand(task, &flags);
-	}
-
-	put_task_struct(task);
-
-	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
-
-	return simple_read_from_buffer(buf, count, ppos, buffer, len);
-}
-
-static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
-				size_t count, loff_t *ppos)
-{
-	struct task_struct *task;
-	char buffer[PROC_NUMBUF];
-	int oom_adjust;
-	unsigned long flags;
-	int err;
-
-	memset(buffer, 0, sizeof(buffer));
-	if (count > sizeof(buffer) - 1)
-		count = sizeof(buffer) - 1;
-	if (copy_from_user(buffer, buf, count)) {
-		err = -EFAULT;
-		goto out;
-	}
-
-	err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
-	if (err)
-		goto out;
-	if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
-	     oom_adjust != OOM_DISABLE) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	task = get_proc_task(file->f_path.dentry->d_inode);
-	if (!task) {
-		err = -ESRCH;
-		goto out;
-	}
-
-	task_lock(task);
-	if (!task->mm) {
-		err = -EINVAL;
-		goto err_task_lock;
-	}
-
-	if (!lock_task_sighand(task, &flags)) {
-		err = -ESRCH;
-		goto err_task_lock;
-	}
-
-	if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
-		err = -EACCES;
-		goto err_sighand;
-	}
-
-	/*
-	 * Warn that /proc/pid/oom_adj is deprecated, see
-	 * Documentation/feature-removal-schedule.txt.
-	 */
-	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
-		  current->comm, task_pid_nr(current), task_pid_nr(task),
-		  task_pid_nr(task));
-	task->signal->oom_adj = oom_adjust;
-	/*
-	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
-	 * value is always attainable.
-	 */
-	if (task->signal->oom_adj == OOM_ADJUST_MAX)
-		task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
-	else
-		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
-								-OOM_DISABLE;
-	trace_oom_score_adj_update(task);
-err_sighand:
-	unlock_task_sighand(task, &flags);
-err_task_lock:
-	task_unlock(task);
-	put_task_struct(task);
-out:
-	return err < 0 ? err : count;
-}
-
-static const struct file_operations proc_oom_adjust_operations = {
-	.read		= oom_adjust_read,
-	.write		= oom_adjust_write,
-	.llseek		= generic_file_llseek,
-};
-
 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 					size_t count, loff_t *ppos)
 {
@@ -1051,15 +946,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
 		task->signal->oom_score_adj_min = oom_score_adj;
 	trace_oom_score_adj_update(task);
-	/*
-	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
-	 * always attainable.
-	 */
-	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-		task->signal->oom_adj = OOM_DISABLE;
-	else
-		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
-							OOM_SCORE_ADJ_MAX;
+
 err_sighand:
 	unlock_task_sighand(task, &flags);
 err_task_lock:
@@ -2710,7 +2597,6 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
 	INF("oom_score",  S_IRUGO, proc_oom_score),
-	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -3077,7 +2963,6 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
 	INF("oom_score", S_IRUGO, proc_oom_score),
-	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
-- 
cgit v1.2.3


From 4c199a93a2d36b277a9fd209a0f2793f8460a215 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:30:32 -0700
Subject: rbtree: empty nodes have no color

Empty nodes have no color.  We can make use of this property to simplify
the code emitted by the RB_EMPTY_NODE and RB_CLEAR_NODE macros.  Also,
we can get rid of the rb_init_node function which had been introduced by
commit 88d19cf37952 ("timers: Add rb_init_node() to allow for stack
allocated rb nodes") to avoid some issue with the empty node's color not
being initialized.

I'm not sure what the RB_EMPTY_NODE checks in rb_prev() / rb_next() are
doing there, though.  axboe introduced them in commit 10fd48f2376d
("rbtree: fixed reversed RB_EMPTY_NODE and rb_next/prev").  The way I
see it, the 'empty node' abstraction is only used by rbtree users to
flag nodes that they haven't inserted in any rbtree, so asking the
predecessor or successor of such nodes doesn't make any sense.

One final rb_init_node() caller was recently added in sysctl code to
implement faster sysctl name lookups.  This code doesn't make use of
RB_EMPTY_NODE at all, and from what I could see it only called
rb_init_node() under the mistaken assumption that such initialization was
required before node insertion.

[sfr@canb.auug.org.au: fix net/ceph/osd_client.c build]
Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: John Stultz <john.stultz@linaro.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_sysctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index dcd56f84db7..fddc5072963 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -168,10 +168,8 @@ static void init_header(struct ctl_table_header *head,
 	head->node = node;
 	if (node) {
 		struct ctl_table *entry;
-		for (entry = table; entry->procname; entry++, node++) {
-			rb_init_node(&node->node);
+		for (entry = table; entry->procname; entry++, node++)
 			node->header = head;
-		}
 	}
 }
 
-- 
cgit v1.2.3


From ea5272f5c94fb2ee62f4f15a5b88eef6184cd506 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:30:35 -0700
Subject: rbtree: fix incorrect rbtree node insertion in fs/proc/proc_sysctl.c

The recently added code to use rbtrees in sysctl did not follow the proper
rbtree interface on insertion - it was calling rb_link_node() which
inserts a new node into the binary tree, but missed the call to
rb_insert_color() which properly balances the rbtree and establishes all
expected rbtree invariants.

I found out about this only because faulty commit also used
rb_init_node(), which I am removing within this patchset.  But I think
it's an easy mistake to make, and it makes me wonder if we should change
the rbtree API so that insertions would be done with a single rb_insert()
call (even if its implementation could still inline the rb_link_node()
part and call a private __rb_insert_color function to do the rebalancing).

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_sysctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/proc')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index fddc5072963..a781bdf0669 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -142,6 +142,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
 	}
 
 	rb_link_node(node, parent, p);
+	rb_insert_color(node, &head->parent->root);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 7a71932d5676b7410ab64d149bad8bde6b0d8632 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Mon, 8 Oct 2012 16:33:47 -0700
Subject: kpageflags: fix wrong KPF_THP on non-huge compound pages

KPF_THP can be set on non-huge compound pages (like slab pages or pages
allocated by drivers with __GFP_COMP) because PageTransCompound only
checks PG_head and PG_tail.  Obviously this is a bug and breaks user space
applications which look for thp via /proc/kpageflags.

This patch rules out setting KPF_THP wrongly by additionally checking
PageLRU on the head pages.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Fengguang Wu <fengguang.wu@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/page.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7fcd0d60a96..b8730d9ebae 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,7 +115,13 @@ u64 stable_page_flags(struct page *page)
 		u |= 1 << KPF_COMPOUND_TAIL;
 	if (PageHuge(page))
 		u |= 1 << KPF_HUGE;
-	else if (PageTransCompound(page))
+	/*
+	 * PageTransCompound can be true for non-huge compound pages (slab
+	 * pages or pages allocated by drivers with __GFP_COMP) because it
+	 * just checks PG_head/PG_tail, so we need to check PageLRU to make
+	 * sure a given page is a thp, not a non-huge compound page.
+	 */
+	else if (PageTransCompound(page) && PageLRU(compound_trans_head(page)))
 		u |= 1 << KPF_THP;
 
 	/*
-- 
cgit v1.2.3


From 7386cdbf2f57ea8cff3c9fde93f206e58b9fe13f Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Wed, 10 Oct 2012 11:51:09 +0530
Subject: nohz: Fix idle ticks in cpu summary line of /proc/stat

Git commit 09a1d34f8535ecf9 "nohz: Make idle/iowait counter update
conditional" introduced a bug in regard to cpu hotplug. The effect is
that the number of idle ticks in the cpu summary line in /proc/stat is
still counting ticks for offline cpus.

Reproduction is easy, just start a workload that keeps all cpus busy,
switch off one or more cpus and then watch the idle field in top.
On a dual-core with one cpu 100% busy and one offline cpu you will get
something like this:

%Cpu(s): 48.7 us,  1.3 sy,  0.0 ni, 50.0 id,  0.0 wa,  0.0 hi,  0.0 si,
%0.0 st

The problem is that an offline cpu still has ts->idle_active == 1.
To fix this we should make sure that the cpu is online when calling
get_cpu_idle_time_us and get_cpu_iowait_time_us.

[Srivatsa: Rebased to current mainline]

Reported-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/20121010061820.8999.57245.stgit@srivatsabhat.in.ibm.com
Cc: deepthi@linux.vnet.ibm.com
Cc: stable@vger.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/proc/stat.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 64c3b317236..e296572c73e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -45,10 +45,13 @@ static cputime64_t get_iowait_time(int cpu)
 
 static u64 get_idle_time(int cpu)
 {
-	u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL);
+	u64 idle, idle_time = -1ULL;
+
+	if (cpu_online(cpu))
+		idle_time = get_cpu_idle_time_us(cpu, NULL);
 
 	if (idle_time == -1ULL)
-		/* !NO_HZ so we can rely on cpustat.idle */
+		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
 		idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
 	else
 		idle = usecs_to_cputime64(idle_time);
@@ -58,10 +61,13 @@ static u64 get_idle_time(int cpu)
 
 static u64 get_iowait_time(int cpu)
 {
-	u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL);
+	u64 iowait, iowait_time = -1ULL;
+
+	if (cpu_online(cpu))
+		iowait_time = get_cpu_iowait_time_us(cpu, NULL);
 
 	if (iowait_time == -1ULL)
-		/* !NO_HZ so we can rely on cpustat.iowait */
+		/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
 		iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
 	else
 		iowait = usecs_to_cputime64(iowait_time);
-- 
cgit v1.2.3


From f81700bd831efcd12eb7f0e66b24b16c2ad00a32 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Oct 2012 16:43:21 -0400
Subject: procfs: don't need a PATH_MAX allocation to hold a string
 representation of an int

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ef5c84be66f..144a96732dd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2258,7 +2258,8 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
 	pid_t tgid = task_tgid_nr_ns(current, ns);
 	char *name = ERR_PTR(-ENOENT);
 	if (tgid) {
-		name = __getname();
+		/* 11 for max length of signed int in decimal + NULL term */
+		name = kmalloc(12, GFP_KERNEL);
 		if (!name)
 			name = ERR_PTR(-ENOMEM);
 		else
@@ -2273,7 +2274,7 @@ static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
 {
 	char *s = nd_get_link(nd);
 	if (!IS_ERR(s))
-		__putname(s);
+		kfree(s);
 }
 
 static const struct inode_operations proc_self_inode_operations = {
-- 
cgit v1.2.3


From 32f8516a8c733d281faa9f6666b509035246505c Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 16 Oct 2012 17:31:23 -0700
Subject: mm, mempolicy: fix printing stack contents in numa_maps

When reading /proc/pid/numa_maps, it's possible to return the contents of
the stack where the mempolicy string should be printed if the policy gets
freed from beneath us.

This happens because mpol_to_str() may return an error the
stack-allocated buffer is then printed without ever being stored.

There are two possible error conditions in mpol_to_str():

 - if the buffer allocated is insufficient for the string to be stored,
   and

 - if the mempolicy has an invalid mode.

The first error condition is not triggered in any of the callers to
mpol_to_str(): at least 50 bytes is always allocated on the stack and this
is sufficient for the string to be written.  A future patch should convert
this into BUILD_BUG_ON() since we know the maximum strlen possible, but
that's not -rc material.

The second error condition is possible if a race occurs in dropping a
reference to a task's mempolicy causing it to be freed during the read().
The slab poison value is then used for the mode and mpol_to_str() returns
-EINVAL.

This race is only possible because get_vma_policy() believes that
mm->mmap_sem protects task->mempolicy, which isn't true.  The exit path
does not hold mm->mmap_sem when dropping the reference or setting
task->mempolicy to NULL: it uses task_lock(task) instead.

Thus, it's required for the caller of a task mempolicy to hold
task_lock(task) while grabbing the mempolicy and reading it.  Callers with
a vma policy store their mempolicy earlier and can simply increment the
reference count so it's guaranteed not to be freed.

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 79827ce03e3..14df8806ff2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1158,6 +1158,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md = &numa_priv->md;
 	struct file *file = vma->vm_file;
+	struct task_struct *task = proc_priv->task;
 	struct mm_struct *mm = vma->vm_mm;
 	struct mm_walk walk = {};
 	struct mempolicy *pol;
@@ -1177,9 +1178,11 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	walk.private = md;
 	walk.mm = mm;
 
-	pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
+	task_lock(task);
+	pol = get_vma_policy(task, vma, vma->vm_start);
 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
 	mpol_cond_put(pol);
+	task_unlock(task);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
@@ -1189,7 +1192,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_printf(m, " heap");
 	} else {
-		pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid);
+		pid_t tid = vm_is_stack(task, vma, is_pid);
 		if (tid != 0) {
 			/*
 			 * Thread stack in /proc/PID/task/TID/maps or
-- 
cgit v1.2.3


From 9e7814404b77c3e8920bee4277162bf3a7460505 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Fri, 19 Oct 2012 17:00:55 +0900
Subject: hold task->mempolicy while numa_maps scans.

  /proc/<pid>/numa_maps scans vma and show mempolicy under
  mmap_sem. It sometimes accesses task->mempolicy which can
  be freed without mmap_sem and numa_maps can show some
  garbage while scanning.

This patch tries to take reference count of task->mempolicy at reading
numa_maps before calling get_vma_policy(). By this, task->mempolicy
will not be freed until numa_maps reaches its end.

V2->v3
  -  updated comments to be more verbose.
  -  removed task_lock() in numa_maps code.
V1->V2
  -  access task->mempolicy only once and remember it.  Becase kernel/exit.c
     can overwrite it.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/internal.h |  4 ++++
 fs/proc/task_mmu.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 51 insertions(+), 3 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cceaab07ad5..43973b084ab 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -12,6 +12,7 @@
 #include <linux/sched.h>
 #include <linux/proc_fs.h>
 struct  ctl_table_header;
+struct  mempolicy;
 
 extern struct proc_dir_entry proc_root;
 #ifdef CONFIG_PROC_SYSCTL
@@ -74,6 +75,9 @@ struct proc_maps_private {
 #ifdef CONFIG_MMU
 	struct vm_area_struct *tail_vma;
 #endif
+#ifdef CONFIG_NUMA
+	struct mempolicy *task_mempolicy;
+#endif
 };
 
 void proc_init_inodecache(void);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 14df8806ff2..90c63f9392a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -90,10 +90,55 @@ static void pad_len_spaces(struct seq_file *m, int len)
 	seq_printf(m, "%*c", len, ' ');
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * These functions are for numa_maps but called in generic **maps seq_file
+ * ->start(), ->stop() ops.
+ *
+ * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
+ * Each mempolicy object is controlled by reference counting. The problem here
+ * is how to avoid accessing dead mempolicy object.
+ *
+ * Because we're holding mmap_sem while reading seq_file, it's safe to access
+ * each vma's mempolicy, no vma objects will never drop refs to mempolicy.
+ *
+ * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
+ * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
+ * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
+ * gurantee the task never exits under us. But taking task_lock() around
+ * get_vma_plicy() causes lock order problem.
+ *
+ * To access task->mempolicy without lock, we hold a reference count of an
+ * object pointed by task->mempolicy and remember it. This will guarantee
+ * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
+ */
+static void hold_task_mempolicy(struct proc_maps_private *priv)
+{
+	struct task_struct *task = priv->task;
+
+	task_lock(task);
+	priv->task_mempolicy = task->mempolicy;
+	mpol_get(priv->task_mempolicy);
+	task_unlock(task);
+}
+static void release_task_mempolicy(struct proc_maps_private *priv)
+{
+	mpol_put(priv->task_mempolicy);
+}
+#else
+static void hold_task_mempolicy(struct proc_maps_private *priv)
+{
+}
+static void release_task_mempolicy(struct proc_maps_private *priv)
+{
+}
+#endif
+
 static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
 {
 	if (vma && vma != priv->tail_vma) {
 		struct mm_struct *mm = vma->vm_mm;
+		release_task_mempolicy(priv);
 		up_read(&mm->mmap_sem);
 		mmput(mm);
 	}
@@ -132,7 +177,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 
 	tail_vma = get_gate_vma(priv->task->mm);
 	priv->tail_vma = tail_vma;
-
+	hold_task_mempolicy(priv);
 	/* Start with last addr hint */
 	vma = find_vma(mm, last_addr);
 	if (last_addr && vma) {
@@ -159,6 +204,7 @@ out:
 	if (vma)
 		return vma;
 
+	release_task_mempolicy(priv);
 	/* End of vmas has been reached */
 	m->version = (tail_vma != NULL)? 0: -1UL;
 	up_read(&mm->mmap_sem);
@@ -1178,11 +1224,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	walk.private = md;
 	walk.mm = mm;
 
-	task_lock(task);
 	pol = get_vma_policy(task, vma, vma->vm_start);
 	mpol_to_str(buffer, sizeof(buffer), pol, 0);
 	mpol_cond_put(pol);
-	task_unlock(task);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
-- 
cgit v1.2.3


From 5258f386ea4e8454bc801fb443e8a4217da1947c Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sun, 28 Oct 2012 12:19:23 -0700
Subject: sched/autogroup: Fix crash on reboot when autogroup is disabled

Due to these two commits:

  8323f26ce342 sched: Fix race in task_group()
  800d4d30c8f2 sched, autogroup: Stop going ahead if autogroup is disabled

... autogroup scheduling's dynamic knobs are wrecked.

With both patches applied, all you have to do to crash a box is
disable autogroup during boot up, then reboot.. boom, NULL pointer
dereference due to 800d4d30 not allowing autogroup to move things,
and 8323f26ce making that the only way to switch runqueues.

Remove most of the (dysfunctional) knobs and turn the remaining
sched_autogroup_enabled knob readonly.

If the user fiddles with cgroups hereafter, once tasks
are moved, autogroup won't mess with them again unless
they call setsid().

No knobs, no glitz, nada, just a cute little thing folks can
turn on if they don't want to muck about with cgroups and/or
systemd.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Xiaotian Feng <xtfeng@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Xiaotian Feng <dannyfeng@tencent.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: <stable@vger.kernel.org> # v3.6
Link: http://lkml.kernel.org/r/1351451963.4999.8.camel@maggy.simpson.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/proc/base.c | 78 ----------------------------------------------------------
 1 file changed, 78 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b6c84cbdb7..bb1d9623bad 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1271,81 +1271,6 @@ static const struct file_operations proc_pid_sched_operations = {
 
 #endif
 
-#ifdef CONFIG_SCHED_AUTOGROUP
-/*
- * Print out autogroup related information:
- */
-static int sched_autogroup_show(struct seq_file *m, void *v)
-{
-	struct inode *inode = m->private;
-	struct task_struct *p;
-
-	p = get_proc_task(inode);
-	if (!p)
-		return -ESRCH;
-	proc_sched_autogroup_show_task(p, m);
-
-	put_task_struct(p);
-
-	return 0;
-}
-
-static ssize_t
-sched_autogroup_write(struct file *file, const char __user *buf,
-	    size_t count, loff_t *offset)
-{
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct task_struct *p;
-	char buffer[PROC_NUMBUF];
-	int nice;
-	int err;
-
-	memset(buffer, 0, sizeof(buffer));
-	if (count > sizeof(buffer) - 1)
-		count = sizeof(buffer) - 1;
-	if (copy_from_user(buffer, buf, count))
-		return -EFAULT;
-
-	err = kstrtoint(strstrip(buffer), 0, &nice);
-	if (err < 0)
-		return err;
-
-	p = get_proc_task(inode);
-	if (!p)
-		return -ESRCH;
-
-	err = proc_sched_autogroup_set_nice(p, nice);
-	if (err)
-		count = err;
-
-	put_task_struct(p);
-
-	return count;
-}
-
-static int sched_autogroup_open(struct inode *inode, struct file *filp)
-{
-	int ret;
-
-	ret = single_open(filp, sched_autogroup_show, NULL);
-	if (!ret) {
-		struct seq_file *m = filp->private_data;
-
-		m->private = inode;
-	}
-	return ret;
-}
-
-static const struct file_operations proc_pid_sched_autogroup_operations = {
-	.open		= sched_autogroup_open,
-	.read		= seq_read,
-	.write		= sched_autogroup_write,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
-
 static ssize_t comm_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *offset)
 {
@@ -3035,9 +2960,6 @@ static const struct pid_entry tgid_base_stuff[] = {
 	INF("limits",	  S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
-#ifdef CONFIG_SCHED_AUTOGROUP
-	REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
 #endif
 	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-- 
cgit v1.2.3


From fa0cbbf145aabbf29c6f28f8a11935c0b0fd86fc Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 12 Nov 2012 17:53:04 -0800
Subject: mm, oom: reintroduce /proc/pid/oom_adj

This is mostly a revert of 01dc52ebdf47 ("oom: remove deprecated oom_adj")
from Davidlohr Bueso.

It reintroduces /proc/pid/oom_adj for backwards compatibility with earlier
kernels.  It simply scales the value linearly when /proc/pid/oom_score_adj
is written.

The major difference is that its scheduled removal is no longer included
in Documentation/feature-removal-schedule.txt.  We do warn users with a
single printk, though, to suggest the more powerful and supported
/proc/pid/oom_score_adj interface.

Reported-by: Artem S. Tashkinov <t.artem@lycos.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 144a96732dd..3c231adf845 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -873,6 +873,113 @@ static const struct file_operations proc_environ_operations = {
 	.release	= mem_release,
 };
 
+static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	char buffer[PROC_NUMBUF];
+	int oom_adj = OOM_ADJUST_MIN;
+	size_t len;
+	unsigned long flags;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+			oom_adj = OOM_ADJUST_MAX;
+		else
+			oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+				  OOM_SCORE_ADJ_MAX;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_adj_write(struct file *file, const char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	int oom_adj;
+	unsigned long flags;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = kstrtoint(strstrip(buffer), 0, &oom_adj);
+	if (err)
+		goto out;
+	if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
+	     oom_adj != OOM_DISABLE) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task) {
+		err = -ESRCH;
+		goto out;
+	}
+
+	task_lock(task);
+	if (!task->mm) {
+		err = -EINVAL;
+		goto err_task_lock;
+	}
+
+	if (!lock_task_sighand(task, &flags)) {
+		err = -ESRCH;
+		goto err_task_lock;
+	}
+
+	/*
+	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+	 * value is always attainable.
+	 */
+	if (oom_adj == OOM_ADJUST_MAX)
+		oom_adj = OOM_SCORE_ADJ_MAX;
+	else
+		oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+
+	if (oom_adj < task->signal->oom_score_adj &&
+	    !capable(CAP_SYS_RESOURCE)) {
+		err = -EACCES;
+		goto err_sighand;
+	}
+
+	/*
+	 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+	 * /proc/pid/oom_score_adj instead.
+	 */
+	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+		  current->comm, task_pid_nr(current), task_pid_nr(task),
+		  task_pid_nr(task));
+
+	task->signal->oom_score_adj = oom_adj;
+	trace_oom_score_adj_update(task);
+err_sighand:
+	unlock_task_sighand(task, &flags);
+err_task_lock:
+	task_unlock(task);
+	put_task_struct(task);
+out:
+	return err < 0 ? err : count;
+}
+
+static const struct file_operations proc_oom_adj_operations = {
+	.read		= oom_adj_read,
+	.write		= oom_adj_write,
+	.llseek		= generic_file_llseek,
+};
+
 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 					size_t count, loff_t *ppos)
 {
@@ -2598,6 +2705,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
 	INF("oom_score",  S_IRUGO, proc_oom_score),
+	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -2964,6 +3072,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
 	INF("oom_score", S_IRUGO, proc_oom_score),
+	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
-- 
cgit v1.2.3


From 73f7ef435934e952c1d70d83d69921ea5d1f6bd4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 16 Nov 2012 03:02:58 +0000
Subject: sysctl: Pass useful parameters to sysctl permissions

- Current is implicitly avaiable so passing current->nsproxy isn't useful.
- The ctl_table_header is needed to find how the sysctl table is connected
  to the rest of sysctl.
- ctl_table_root is avaiable in the ctl_table_header so no need to it.

With these changes it becomes possible to write a version of
net_sysctl_permission that takes into account the network namespace of
the sysctl table, an important feature in extending the user namespace.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_sysctl.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index a781bdf0669..701580ddfcc 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -378,12 +378,13 @@ static int test_perm(int mode, int op)
 	return -EACCES;
 }
 
-static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
+static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
 {
+	struct ctl_table_root *root = head->root;
 	int mode;
 
 	if (root->permissions)
-		mode = root->permissions(root, current->nsproxy, table);
+		mode = root->permissions(head, table);
 	else
 		mode = table->mode;
 
@@ -491,7 +492,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 	 * and won't be until we finish.
 	 */
 	error = -EPERM;
-	if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
+	if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
 		goto out;
 
 	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
@@ -717,7 +718,7 @@ static int proc_sys_permission(struct inode *inode, int mask)
 	if (!table) /* global root - r-xr-xr-x */
 		error = mask & MAY_WRITE ? -EACCES : 0;
 	else /* Use the permissions on the sysctl table entry */
-		error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK);
+		error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
 
 	sysctl_head_finish(head);
 	return error;
-- 
cgit v1.2.3


From e656d8a6f7fdf7612d2f5771f0ddfca9487f59d9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 10 Jul 2010 14:52:49 -0700
Subject: procfs: Use the proc generic infrastructure for proc/self.

I had visions at one point of splitting proc into two filesystems.  If
that had happened proc/self being the the part of proc that actually deals
with pids would have been a nice cleanup.  As it is proc/self requires
a lot of unnecessary infrastructure for a single file.

The only user visible change is that a mounted /proc for a pid namespace
that is dead now shows a broken proc symlink, instead of being completely
invisible.  I don't think anyone will notice or care.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/Makefile   |   1 +
 fs/proc/base.c     | 154 +----------------------------------------------------
 fs/proc/internal.h |   1 +
 fs/proc/root.c     |   1 +
 fs/proc/self.c     |  59 ++++++++++++++++++++
 5 files changed, 64 insertions(+), 152 deletions(-)
 create mode 100644 fs/proc/self.c

(limited to 'fs/proc')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 99349efbbc2..981b0560193 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -21,6 +21,7 @@ proc-y	+= uptime.o
 proc-y	+= version.o
 proc-y	+= softirqs.o
 proc-y	+= namespaces.o
+proc-y	+= self.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 144a96732dd..cbe454e94af 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2237,146 +2237,6 @@ static const struct file_operations proc_coredump_filter_operations = {
 };
 #endif
 
-/*
- * /proc/self:
- */
-static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
-			      int buflen)
-{
-	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
-	pid_t tgid = task_tgid_nr_ns(current, ns);
-	char tmp[PROC_NUMBUF];
-	if (!tgid)
-		return -ENOENT;
-	sprintf(tmp, "%d", tgid);
-	return vfs_readlink(dentry,buffer,buflen,tmp);
-}
-
-static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
-	pid_t tgid = task_tgid_nr_ns(current, ns);
-	char *name = ERR_PTR(-ENOENT);
-	if (tgid) {
-		/* 11 for max length of signed int in decimal + NULL term */
-		name = kmalloc(12, GFP_KERNEL);
-		if (!name)
-			name = ERR_PTR(-ENOMEM);
-		else
-			sprintf(name, "%d", tgid);
-	}
-	nd_set_link(nd, name);
-	return NULL;
-}
-
-static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
-				void *cookie)
-{
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		kfree(s);
-}
-
-static const struct inode_operations proc_self_inode_operations = {
-	.readlink	= proc_self_readlink,
-	.follow_link	= proc_self_follow_link,
-	.put_link	= proc_self_put_link,
-};
-
-/*
- * proc base
- *
- * These are the directory entries in the root directory of /proc
- * that properly belong to the /proc filesystem, as they describe
- * describe something that is process related.
- */
-static const struct pid_entry proc_base_stuff[] = {
-	NOD("self", S_IFLNK|S_IRWXUGO,
-		&proc_self_inode_operations, NULL, {}),
-};
-
-static struct dentry *proc_base_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, const void *ptr)
-{
-	const struct pid_entry *p = ptr;
-	struct inode *inode;
-	struct proc_inode *ei;
-	struct dentry *error;
-
-	/* Allocate the inode */
-	error = ERR_PTR(-ENOMEM);
-	inode = new_inode(dir->i_sb);
-	if (!inode)
-		goto out;
-
-	/* Initialize the inode */
-	ei = PROC_I(inode);
-	inode->i_ino = get_next_ino();
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-
-	/*
-	 * grab the reference to the task.
-	 */
-	ei->pid = get_task_pid(task, PIDTYPE_PID);
-	if (!ei->pid)
-		goto out_iput;
-
-	inode->i_mode = p->mode;
-	if (S_ISDIR(inode->i_mode))
-		set_nlink(inode, 2);
-	if (S_ISLNK(inode->i_mode))
-		inode->i_size = 64;
-	if (p->iop)
-		inode->i_op = p->iop;
-	if (p->fop)
-		inode->i_fop = p->fop;
-	ei->op = p->op;
-	d_add(dentry, inode);
-	error = NULL;
-out:
-	return error;
-out_iput:
-	iput(inode);
-	goto out;
-}
-
-static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
-{
-	struct dentry *error;
-	struct task_struct *task = get_proc_task(dir);
-	const struct pid_entry *p, *last;
-
-	error = ERR_PTR(-ENOENT);
-
-	if (!task)
-		goto out_no_task;
-
-	/* Lookup the directory entry */
-	last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
-	for (p = proc_base_stuff; p <= last; p++) {
-		if (p->len != dentry->d_name.len)
-			continue;
-		if (!memcmp(dentry->d_name.name, p->name, p->len))
-			break;
-	}
-	if (p > last)
-		goto out;
-
-	error = proc_base_instantiate(dir, dentry, task, p);
-
-out:
-	put_task_struct(task);
-out_no_task:
-	return error;
-}
-
-static int proc_base_fill_cache(struct file *filp, void *dirent,
-	filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
-{
-	return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
-				proc_base_instantiate, task, p);
-}
-
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 {
@@ -2767,15 +2627,11 @@ out:
 
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
 {
-	struct dentry *result;
+	struct dentry *result = NULL;
 	struct task_struct *task;
 	unsigned tgid;
 	struct pid_namespace *ns;
 
-	result = proc_base_lookup(dir, dentry);
-	if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
-		goto out;
-
 	tgid = name_to_int(dentry);
 	if (tgid == ~0U)
 		goto out;
@@ -2838,7 +2694,7 @@ retry:
 	return iter;
 }
 
-#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY)
 
 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 	struct tgid_iter iter)
@@ -2872,12 +2728,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	if (!reaper)
 		goto out_no_task;
 
-	for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
-		const struct pid_entry *p = &proc_base_stuff[nr];
-		if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
-			goto out;
-	}
-
 	ns = filp->f_dentry->d_sb->s_fs_info;
 	iter.task = NULL;
 	iter.tgid = filp->f_pos - TGID_OFFSET;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 43973b084ab..252544c0520 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -15,6 +15,7 @@ struct  ctl_table_header;
 struct  mempolicy;
 
 extern struct proc_dir_entry proc_root;
+extern void proc_self_init(void);
 #ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
 extern void sysctl_head_put(struct ctl_table_header *head);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9889a92d2e0..5da984959ed 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -169,6 +169,7 @@ void __init proc_root_init(void)
 		return;
 	}
 
+	proc_self_init();
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
diff --git a/fs/proc/self.c b/fs/proc/self.c
new file mode 100644
index 00000000000..aa5cc3bff14
--- /dev/null
+++ b/fs/proc/self.c
@@ -0,0 +1,59 @@
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+
+/*
+ * /proc/self:
+ */
+static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
+			      int buflen)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	char tmp[PROC_NUMBUF];
+	if (!tgid)
+		return -ENOENT;
+	sprintf(tmp, "%d", tgid);
+	return vfs_readlink(dentry,buffer,buflen,tmp);
+}
+
+static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	pid_t tgid = task_tgid_nr_ns(current, ns);
+	char *name = ERR_PTR(-ENOENT);
+	if (tgid) {
+		/* 11 for max length of signed int in decimal + NULL term */
+		name = kmalloc(12, GFP_KERNEL);
+		if (!name)
+			name = ERR_PTR(-ENOMEM);
+		else
+			sprintf(name, "%d", tgid);
+	}
+	nd_set_link(nd, name);
+	return NULL;
+}
+
+static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		kfree(s);
+}
+
+static const struct inode_operations proc_self_inode_operations = {
+	.readlink	= proc_self_readlink,
+	.follow_link	= proc_self_follow_link,
+	.put_link	= proc_self_put_link,
+};
+
+void __init proc_self_init(void)
+{
+	struct proc_dir_entry *proc_self_symlink;
+	mode_t mode;
+
+	mode = S_IFLNK | S_IRWXUGO;
+	proc_self_symlink = proc_create("self", mode, NULL, NULL );
+	proc_self_symlink->proc_iops = &proc_self_inode_operations;
+}
-- 
cgit v1.2.3


From ae06c7c83fc6e97ba247a261921c101960f3d28f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 10 Jul 2010 15:23:34 -0700
Subject: procfs: Don't cache a pid in the root inode.

Now that we have s_fs_info pointing to our pid namespace
the original reason for the proc root inode having a struct
pid is gone.

Caching a pid in the root inode has led to some complicated
code.  Now that we don't need the struct pid, just remove it.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/base.c | 11 +----------
 fs/proc/root.c |  8 --------
 2 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index cbe454e94af..6177fc238fd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2714,19 +2714,12 @@ static int fake_filldir(void *buf, const char *name, int namelen,
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-	unsigned int nr;
-	struct task_struct *reaper;
 	struct tgid_iter iter;
 	struct pid_namespace *ns;
 	filldir_t __filldir;
 
 	if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
-		goto out_no_task;
-	nr = filp->f_pos - FIRST_PROCESS_ENTRY;
-
-	reaper = get_proc_task(filp->f_path.dentry->d_inode);
-	if (!reaper)
-		goto out_no_task;
+		goto out;
 
 	ns = filp->f_dentry->d_sb->s_fs_info;
 	iter.task = NULL;
@@ -2747,8 +2740,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	}
 	filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
 out:
-	put_task_struct(reaper);
-out_no_task:
 	return 0;
 }
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5da984959ed..13ef6247e7a 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -100,7 +100,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	int err;
 	struct super_block *sb;
 	struct pid_namespace *ns;
-	struct proc_inode *ei;
 	char *options;
 
 	if (flags & MS_KERNMOUNT) {
@@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		sb->s_flags |= MS_ACTIVE;
 	}
 
-	ei = PROC_I(sb->s_root->d_inode);
-	if (!ei->pid) {
-		rcu_read_lock();
-		ei->pid = get_pid(find_pid_ns(1, ns));
-		rcu_read_unlock();
-	}
-
 	return dget(sb->s_root);
 }
 
-- 
cgit v1.2.3


From 17cf22c33e1f1b5e435469c84e43872579497653 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 2 Mar 2010 14:51:53 -0800
Subject: pidns: Use task_active_pid_ns where appropriate

The expressions tsk->nsproxy->pid_ns and task_active_pid_ns
aka ns_of_pid(task_pid(tsk)) should have the same number of
cache line misses with the practical difference that
ns_of_pid(task_pid(tsk)) is released later in a processes life.

Furthermore by using task_active_pid_ns it becomes trivial
to write an unshare implementation for the the pid namespace.

So I have used task_active_pid_ns everywhere I can.

In fork since the pid has not yet been attached to the
process I use ns_of_pid, to achieve the same effect.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/root.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 13ef6247e7a..fc1609321a7 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -106,7 +106,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		ns = (struct pid_namespace *)data;
 		options = NULL;
 	} else {
-		ns = current->nsproxy->pid_ns;
+		ns = task_active_pid_ns(current);
 		options = data;
 	}
 
-- 
cgit v1.2.3


From 0a01f2cc390e10633a54f72c608cc3fe19a50c3d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 1 Aug 2012 10:33:47 -0700
Subject: pidns: Make the pidns proc mount/umount logic obvious.

Track the number of pids in the proc hash table.  When the number of
pids goes to 0 schedule work to unmount the kernel mount of proc.

Move the mount of proc into alloc_pid when we allocate the pid for
init.

Remove the surprising calls of pid_ns_release proc in fork and
proc_flush_task.  Those code paths really shouldn't know about proc
namespace implementation details and people have demonstrated several
times that finding and understanding those code paths is difficult and
non-obvious.

Because of the call path detach pid is alwasy called with the
rtnl_lock held free_pid is not allowed to sleep, so the work to
unmounting proc is moved to a work queue.  This has the side benefit
of not blocking the entire world waiting for the unnecessary
rcu_barrier in deactivate_locked_super.

In the process of making the code clear and obvious this fixes a bug
reported by Gao feng <gaofeng@cn.fujitsu.com> where we would leak a
mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns
succeeded and copy_net_ns failed.

Acked-by: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/base.c | 4 ----
 fs/proc/root.c | 5 -----
 2 files changed, 9 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6177fc238fd..7621dc51cff 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task)
 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
 					tgid->numbers[i].nr);
 	}
-
-	upid = &pid->numbers[pid->level];
-	if (upid->nr == 1)
-		pid_ns_release_proc(upid->ns);
 }
 
 static struct dentry *proc_pid_instantiate(struct inode *dir,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index fc1609321a7..f2f251158d3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -155,11 +155,6 @@ void __init proc_root_init(void)
 	err = register_filesystem(&proc_fs_type);
 	if (err)
 		return;
-	err = pid_ns_prepare_proc(&init_pid_ns);
-	if (err) {
-		unregister_filesystem(&proc_fs_type);
-		return;
-	}
 
 	proc_self_init();
 	proc_symlink("mounts", NULL, "self/mounts");
-- 
cgit v1.2.3


From 57e8391d327609cbf12d843259c968b9e5c1838f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 7 Mar 2010 18:17:03 -0800
Subject: pidns: Add setns support

- Pid namespaces are designed to be inescapable so verify that the
  passed in pid namespace is a child of the currently active
  pid namespace or the currently active pid namespace itself.

  Allowing the currently active pid namespace is important so
  the effects of an earlier setns can be cancelled.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/namespaces.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index b178ed733c3..85ca047e35f 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -24,6 +24,9 @@ static const struct proc_ns_operations *ns_entries[] = {
 #ifdef CONFIG_IPC_NS
 	&ipcns_operations,
 #endif
+#ifdef CONFIG_PID_NS
+	&pidns_operations,
+#endif
 };
 
 static const struct file_operations ns_file_operations = {
-- 
cgit v1.2.3


From 8823c079ba7136dc1948d6f6dcb5f8022bde438e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 7 Mar 2010 18:49:36 -0800
Subject: vfs: Add setns support for the mount namespace

setns support for the mount namespace is a little tricky as an
arbitrary decision must be made about what to set fs->root and
fs->pwd to, as there is no expectation of a relationship between
the two mount namespaces.  Therefore I arbitrarily find the root
mount point, and follow every mount on top of it to find the top
of the mount stack.  Then I set fs->root and fs->pwd to that
location.  The topmost root of the mount stack seems like a
reasonable place to be.

Bind mount support for the mount namespace inodes has the
possibility of creating circular dependencies between mount
namespaces.  Circular dependencies can result in loops that
prevent mount namespaces from every being freed.  I avoid
creating those circular dependencies by adding a sequence number
to the mount namespace and require all bind mounts be of a
younger mount namespace into an older mount namespace.

Add a helper function proc_ns_inode so it is possible to
detect when we are attempting to bind mound a namespace inode.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/namespaces.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 85ca047e35f..2a17fd9ae6a 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -27,6 +27,7 @@ static const struct proc_ns_operations *ns_entries[] = {
 #ifdef CONFIG_PID_NS
 	&pidns_operations,
 #endif
+	&mntns_operations,
 };
 
 static const struct file_operations ns_file_operations = {
@@ -201,3 +202,7 @@ out_invalid:
 	return ERR_PTR(-EINVAL);
 }
 
+bool proc_ns_inode(struct inode *inode)
+{
+	return inode->i_fop == &ns_file_operations;
+}
-- 
cgit v1.2.3


From cde1975bc242f3e1072bde623ef378e547b73f91 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 26 Jul 2012 06:24:06 -0700
Subject: userns: Implent proc namespace operations

This allows entering a user namespace, and the ability
to store a reference to a user namespace with a bind
mount.

Addition of missing userns_ns_put in userns_install
from Gao feng <gaofeng@cn.fujitsu.com>

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/namespaces.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 2a17fd9ae6a..030250c27d7 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -11,6 +11,7 @@
 #include <net/net_namespace.h>
 #include <linux/ipc_namespace.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include "internal.h"
 
 
@@ -26,6 +27,9 @@ static const struct proc_ns_operations *ns_entries[] = {
 #endif
 #ifdef CONFIG_PID_NS
 	&pidns_operations,
+#endif
+#ifdef CONFIG_USER_NS
+	&userns_operations,
 #endif
 	&mntns_operations,
 };
-- 
cgit v1.2.3


From e9f238c3041e2582a710e75910c8cbf2a98e51b2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 11 Aug 2012 12:38:26 -0700
Subject: procfs: Print task uids and gids in the userns that opened the proc
 file

Instead of using current_userns() use the userns of the opener
of the file so that if the file is passed between processes
the contents of the file do not change.

Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/array.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index c1c207c36ca..55443426561 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk)
 static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *p)
 {
-	struct user_namespace *user_ns = current_user_ns();
+	struct user_namespace *user_ns = seq_user_ns(m);
 	struct group_info *group_info;
 	int g;
 	struct fdtable *fdt = NULL;
-- 
cgit v1.2.3


From 4f326c0064b20b78b8041f4d2f6fe188a1129f18 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 27 Jul 2012 05:56:48 -0700
Subject: userns: Allow unprivilged mounts of proc and sysfs

- The context in which proc and sysfs are mounted have no
  effect on the the uid/gid of their files so no conversion is
  needed except allowing the mount.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/root.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/proc')

diff --git a/fs/proc/root.c b/fs/proc/root.c
index f2f251158d3..c6e9fac26ba 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -145,6 +145,7 @@ static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 void __init proc_root_init(void)
-- 
cgit v1.2.3


From 33d6dce607573b5fd7a43168e0d91221b3ca532b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 17 Jun 2011 13:33:20 -0700
Subject: proc: Generalize proc inode allocation

Generalize the proc inode allocation so that it can be
used without having to having to create a proc_dir_entry.

This will allow namespace file descriptors to remain light
weight entitities but still have the same inode number
when the backing namespace is the same.

Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/generic.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 0d80cef4cfb..7b3ae3cc0ef 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -350,14 +350,14 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
  * Return an inode number between PROC_DYNAMIC_FIRST and
  * 0xffffffff, or zero on failure.
  */
-static unsigned int get_inode_number(void)
+int proc_alloc_inum(unsigned int *inum)
 {
 	unsigned int i;
 	int error;
 
 retry:
-	if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0)
-		return 0;
+	if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
+		return -ENOMEM;
 
 	spin_lock(&proc_inum_lock);
 	error = ida_get_new(&proc_inum_ida, &i);
@@ -365,18 +365,19 @@ retry:
 	if (error == -EAGAIN)
 		goto retry;
 	else if (error)
-		return 0;
+		return error;
 
 	if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
 		spin_lock(&proc_inum_lock);
 		ida_remove(&proc_inum_ida, i);
 		spin_unlock(&proc_inum_lock);
-		return 0;
+		return -ENOSPC;
 	}
-	return PROC_DYNAMIC_FIRST + i;
+	*inum = PROC_DYNAMIC_FIRST + i;
+	return 0;
 }
 
-static void release_inode_number(unsigned int inum)
+void proc_free_inum(unsigned int inum)
 {
 	spin_lock(&proc_inum_lock);
 	ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
@@ -554,13 +555,12 @@ static const struct inode_operations proc_dir_inode_operations = {
 
 static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
 {
-	unsigned int i;
 	struct proc_dir_entry *tmp;
+	int ret;
 	
-	i = get_inode_number();
-	if (i == 0)
-		return -EAGAIN;
-	dp->low_ino = i;
+	ret = proc_alloc_inum(&dp->low_ino);
+	if (ret)
+		return ret;
 
 	if (S_ISDIR(dp->mode)) {
 		if (dp->proc_iops == NULL) {
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(proc_create_data);
 
 static void free_proc_entry(struct proc_dir_entry *de)
 {
-	release_inode_number(de->low_ino);
+	proc_free_inum(de->low_ino);
 
 	if (S_ISLNK(de->mode))
 		kfree(de->data);
-- 
cgit v1.2.3


From bf056bfa80596a5d14b26b17276a56a0dcb080e5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 18 Jun 2011 17:48:18 -0700
Subject: proc: Fix the namespace inode permission checks.

Change the proc namespace files into symlinks so that
we won't cache the dentries for the namespace files
which can bypass the ptrace_may_access checks.

To support the symlinks create an additional namespace
inode with it's own set of operations distinct from the
proc pid inode and dentry methods as those no longer
make sense.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/inode.c      |   6 +-
 fs/proc/namespaces.c | 169 ++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 152 insertions(+), 23 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3b22bbdee9e..439ae688650 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode)
 	struct proc_dir_entry *de;
 	struct ctl_table_header *head;
 	const struct proc_ns_operations *ns_ops;
+	void *ns;
 
 	truncate_inode_pages(&inode->i_data, 0);
 	clear_inode(inode);
@@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode)
 	}
 	/* Release any associated namespace */
 	ns_ops = PROC_I(inode)->ns_ops;
-	if (ns_ops && ns_ops->put)
-		ns_ops->put(PROC_I(inode)->ns);
+	ns = PROC_I(inode)->ns;
+	if (ns_ops && ns)
+		ns_ops->put(ns);
 }
 
 static struct kmem_cache * proc_inode_cachep;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 030250c27d7..7a6d8d69cdb 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -38,6 +38,151 @@ static const struct file_operations ns_file_operations = {
 	.llseek		= no_llseek,
 };
 
+static const struct inode_operations ns_inode_operations = {
+	.setattr	= proc_setattr,
+};
+
+static int ns_delete_dentry(const struct dentry *dentry)
+{
+	/* Don't cache namespace inodes when not in use */
+	return 1;
+}
+
+static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct inode *inode = dentry->d_inode;
+	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
+
+	return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
+		ns_ops->name, inode->i_ino);
+}
+
+const struct dentry_operations ns_dentry_operations =
+{
+	.d_delete	= ns_delete_dentry,
+	.d_dname	= ns_dname,
+};
+
+static struct dentry *proc_ns_get_dentry(struct super_block *sb,
+	struct task_struct *task, const struct proc_ns_operations *ns_ops)
+{
+	struct dentry *dentry, *result;
+	struct inode *inode;
+	struct proc_inode *ei;
+	struct qstr qname = { .name = "", };
+	void *ns;
+
+	ns = ns_ops->get(task);
+	if (!ns)
+		return ERR_PTR(-ENOENT);
+
+	dentry = d_alloc_pseudo(sb, &qname);
+	if (!dentry) {
+		ns_ops->put(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	inode = new_inode(sb);
+	if (!inode) {
+		dput(dentry);
+		ns_ops->put(ns);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ei = PROC_I(inode);
+	inode->i_ino = get_next_ino();
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	inode->i_op = &ns_inode_operations;
+	inode->i_mode = S_IFREG | S_IRUGO;
+	inode->i_fop = &ns_file_operations;
+	ei->ns_ops = ns_ops;
+	ei->ns = ns;
+
+	d_set_d_op(dentry, &ns_dentry_operations);
+	result = d_instantiate_unique(dentry, inode);
+	if (result) {
+		dput(dentry);
+		dentry = result;
+	}
+
+	return dentry;
+}
+
+static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct proc_inode *ei = PROC_I(inode);
+	struct task_struct *task;
+	struct dentry *ns_dentry;
+	void *error = ERR_PTR(-EACCES);
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_put_task;
+
+	ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops);
+	if (IS_ERR(ns_dentry)) {
+		error = ERR_CAST(ns_dentry);
+		goto out_put_task;
+	}
+
+	dput(nd->path.dentry);
+	nd->path.dentry = ns_dentry;
+	error = NULL;
+
+out_put_task:
+	put_task_struct(task);
+out:
+	return error;
+}
+
+static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+{
+	struct inode *inode = dentry->d_inode;
+	struct proc_inode *ei = PROC_I(inode);
+	const struct proc_ns_operations *ns_ops = ei->ns_ops;
+	struct task_struct *task;
+	void *ns;
+	char name[50];
+	int len = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_put_task;
+
+	len = -ENOENT;
+	ns = ns_ops->get(task);
+	if (!ns)
+		goto out_put_task;
+
+	snprintf(name, sizeof(name), "%s", ns_ops->name);
+	len = strlen(name);
+
+	if (len > buflen)
+		len = buflen;
+	if (copy_to_user(buffer, ns_ops->name, len))
+		len = -EFAULT;
+
+	ns_ops->put(ns);
+out_put_task:
+	put_task_struct(task);
+out:
+	return len;
+}
+
+static const struct inode_operations proc_ns_link_inode_operations = {
+	.readlink	= proc_ns_readlink,
+	.follow_link	= proc_ns_follow_link,
+	.setattr	= proc_setattr,
+};
+
 static struct dentry *proc_ns_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
@@ -45,21 +190,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
 	struct inode *inode;
 	struct proc_inode *ei;
 	struct dentry *error = ERR_PTR(-ENOENT);
-	void *ns;
 
 	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
 		goto out;
 
-	ns = ns_ops->get(task);
-	if (!ns)
-		goto out_iput;
-
 	ei = PROC_I(inode);
-	inode->i_mode = S_IFREG|S_IRUSR;
-	inode->i_fop  = &ns_file_operations;
-	ei->ns_ops    = ns_ops;
-	ei->ns	      = ns;
+	inode->i_mode = S_IFLNK|S_IRWXUGO;
+	inode->i_op = &proc_ns_link_inode_operations;
+	ei->ns_ops = ns_ops;
 
 	d_set_d_op(dentry, &pid_dentry_operations);
 	d_add(dentry, inode);
@@ -68,9 +207,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
 		error = NULL;
 out:
 	return error;
-out_iput:
-	iput(inode);
-	goto out;
 }
 
 static int proc_ns_fill_cache(struct file *filp, void *dirent,
@@ -97,10 +233,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent,
 	if (!task)
 		goto out_no_task;
 
-	ret = -EPERM;
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto out;
-
 	ret = 0;
 	i = filp->f_pos;
 	switch (i) {
@@ -160,10 +292,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 	if (!task)
 		goto out_no_task;
 
-	error = ERR_PTR(-EPERM);
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto out;
-
 	last = &ns_entries[ARRAY_SIZE(ns_entries)];
 	for (entry = ns_entries; entry < last; entry++) {
 		if (strlen((*entry)->name) != len)
@@ -171,7 +299,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 		if (!memcmp(dentry->d_name.name, (*entry)->name, len))
 			break;
 	}
-	error = ERR_PTR(-ENOENT);
 	if (entry == last)
 		goto out;
 
-- 
cgit v1.2.3


From 98f842e675f96ffac96e6c50315790912b2812be Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 15 Jun 2011 10:21:48 -0700
Subject: proc: Usable inode numbers for the namespace file descriptors.

Assign a unique proc inode to each namespace, and use that
inode number to ensure we only allocate at most one proc
inode for every namespace in proc.

A single proc inode per namespace allows userspace to test
to see if two processes are in the same namespace.

This has been a long requested feature and only blocked because
a naive implementation would put the id in a global space and
would ultimately require having a namespace for the names of
namespaces, making migration and certain virtualization tricks
impossible.

We still don't have per superblock inode numbers for proc, which
appears necessary for application unaware checkpoint/restart and
migrations (if the application is using namespace file descriptors)
but that is now allowd by the design if it becomes important.

I have preallocated the ipc and uts initial proc inode numbers so
their structures can be statically initialized.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/namespaces.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 7a6d8d69cdb..b7a47196c8c 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -82,7 +82,7 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	inode = new_inode(sb);
+	inode = iget_locked(sb, ns_ops->inum(ns));
 	if (!inode) {
 		dput(dentry);
 		ns_ops->put(ns);
@@ -90,13 +90,17 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb,
 	}
 
 	ei = PROC_I(inode);
-	inode->i_ino = get_next_ino();
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_op = &ns_inode_operations;
-	inode->i_mode = S_IFREG | S_IRUGO;
-	inode->i_fop = &ns_file_operations;
-	ei->ns_ops = ns_ops;
-	ei->ns = ns;
+	if (inode->i_state & I_NEW) {
+		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		inode->i_op = &ns_inode_operations;
+		inode->i_mode = S_IFREG | S_IRUGO;
+		inode->i_fop = &ns_file_operations;
+		ei->ns_ops = ns_ops;
+		ei->ns = ns;
+		unlock_new_inode(inode);
+	} else {
+		ns_ops->put(ns);
+	}
 
 	d_set_d_op(dentry, &ns_dentry_operations);
 	result = d_instantiate_unique(dentry, inode);
@@ -162,12 +166,12 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
 	if (!ns)
 		goto out_put_task;
 
-	snprintf(name, sizeof(name), "%s", ns_ops->name);
+	snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
 	len = strlen(name);
 
 	if (len > buflen)
 		len = buflen;
-	if (copy_to_user(buffer, ns_ops->name, len))
+	if (copy_to_user(buffer, name, len))
 		len = -EFAULT;
 
 	ns_ops->put(ns);
-- 
cgit v1.2.3


From 05f564849d49499ced97913a0914b5950577d07d Mon Sep 17 00:00:00 2001
From: Stanislav Kinsbursky <skinsbursky@parallels.com>
Date: Mon, 26 Nov 2012 16:29:42 -0800
Subject: proc: check vma->vm_file before dereferencing

Commit 7b540d0646ce ("proc_map_files_readdir(): don't bother with
grabbing files") switched proc_map_files_readdir() to use @f_mode
directly instead of grabbing @file reference, but same time the test for
@vm_file presence was lost leading to nil dereference.  The patch brings
the test back.

The all proc_map_files feature is CONFIG_CHECKPOINT_RESTORE wrapped
(which is set to 'n' by default) so the bug doesn't affect regular
kernels.

The regression is 3.7-rc1 only as far as I can tell.

[gorcunov@openvz.org: provided changelog]
Signed-off-by: Stanislav Kinsbursky <skinsbursky@parallels.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3c231adf845..9e28356a959 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1877,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
 	if (!vma)
 		goto out_no_vma;
 
-	result = proc_map_files_instantiate(dir, dentry, task,
-			(void *)(unsigned long)vma->vm_file->f_mode);
+	if (vma->vm_file)
+		result = proc_map_files_instantiate(dir, dentry, task,
+				(void *)(unsigned long)vma->vm_file->f_mode);
 
 out_no_vma:
 	up_read(&mm->mmap_sem);
-- 
cgit v1.2.3


From e80d0a1ae8bb8fee0edd37427836f108b30f596b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 21 Nov 2012 16:26:44 +0100
Subject: cputime: Rename thread_group_times to thread_group_cputime_adjusted

We have thread_group_cputime() and thread_group_times(). The naming
doesn't provide enough information about the difference between
these two APIs.

To lower the confusion, rename thread_group_times() to
thread_group_cputime_adjusted(). This name better suggests that
it's a version of thread_group_cputime() that does some stabilization
on the raw cputime values. ie here: scale on top of CFS runtime
stats and bound lower value for monotonicity.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 fs/proc/array.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index c1c207c36ca..d3696708fc1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			thread_group_times(task, &utime, &stime);
+			thread_group_cputime_adjusted(task, &utime, &stime);
 			gtime += sig->gtime;
 		}
 
@@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	if (!whole) {
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		task_times(task, &utime, &stime);
+		task_cputime_adjusted(task, &utime, &stime);
 		gtime = task->gtime;
 	}
 
-- 
cgit v1.2.3


From c1ad41f1f7270c1956da13fa8fd59d8d5929d56e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 11 Dec 2012 10:23:45 +0100
Subject: Revert "sched/autogroup: Fix crash on reboot when autogroup is
 disabled"

This reverts commit 5258f386ea4e8454bc801fb443e8a4217da1947c,
because the underlying autogroups bug got fixed upstream in
a better way, via:

  fd8ef11730f1 Revert "sched, autogroup: Stop going ahead if autogroup is disabled"

Cc: Mike Galbraith <efault@gmx.de>
Cc: Yong Zhang <yong.zhang0@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/proc/base.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 587631e1cd0..9e28356a959 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1272,6 +1272,81 @@ static const struct file_operations proc_pid_sched_operations = {
 
 #endif
 
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * Print out autogroup related information:
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+	struct inode *inode = m->private;
+	struct task_struct *p;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+	proc_sched_autogroup_show_task(p, m);
+
+	put_task_struct(p);
+
+	return 0;
+}
+
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+	    size_t count, loff_t *offset)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct task_struct *p;
+	char buffer[PROC_NUMBUF];
+	int nice;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+
+	err = kstrtoint(strstrip(buffer), 0, &nice);
+	if (err < 0)
+		return err;
+
+	p = get_proc_task(inode);
+	if (!p)
+		return -ESRCH;
+
+	err = proc_sched_autogroup_set_nice(p, nice);
+	if (err)
+		count = err;
+
+	put_task_struct(p);
+
+	return count;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+
+	ret = single_open(filp, sched_autogroup_show, NULL);
+	if (!ret) {
+		struct seq_file *m = filp->private_data;
+
+		m->private = inode;
+	}
+	return ret;
+}
+
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+	.open		= sched_autogroup_open,
+	.read		= seq_read,
+	.write		= sched_autogroup_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
+
 static ssize_t comm_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *offset)
 {
@@ -2582,6 +2657,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 	INF("limits",	  S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+	REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
 #endif
 	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-- 
cgit v1.2.3


From a9c58b907dbc6821533dfc295b63caf111ff1f16 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 11 Dec 2012 16:02:54 -0800
Subject: mm, oom: change type of oom_score_adj to short

The maximum oom_score_adj is 1000 and the minimum oom_score_adj is -1000,
so this range can be represented by the signed short type with no
functional change.  The extra space this frees up in struct signal_struct
will be used for per-thread oom kill flags in the next patch.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: Anton Vorontsov <anton.vorontsov@linaro.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9e28356a959..aa63d25157b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -985,7 +985,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 {
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 	char buffer[PROC_NUMBUF];
-	int oom_score_adj = OOM_SCORE_ADJ_MIN;
+	short oom_score_adj = OOM_SCORE_ADJ_MIN;
 	unsigned long flags;
 	size_t len;
 
@@ -996,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 		unlock_task_sighand(task, &flags);
 	}
 	put_task_struct(task);
-	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+	len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
 	return simple_read_from_buffer(buf, count, ppos, buffer, len);
 }
 
@@ -1043,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		goto err_task_lock;
 	}
 
-	if (oom_score_adj < task->signal->oom_score_adj_min &&
+	if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
 			!capable(CAP_SYS_RESOURCE)) {
 		err = -EACCES;
 		goto err_sighand;
 	}
 
-	task->signal->oom_score_adj = oom_score_adj;
+	task->signal->oom_score_adj = (short)oom_score_adj;
 	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
-		task->signal->oom_score_adj_min = oom_score_adj;
+		task->signal->oom_score_adj_min = (short)oom_score_adj;
 	trace_oom_score_adj_update(task);
 
 err_sighand:
-- 
cgit v1.2.3


From e180377f1ae48b3cbc559c9875d9b038f7f000c6 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 12 Dec 2012 13:50:59 -0800
Subject: thp: change split_huge_page_pmd() interface

Pass vma instead of mm and add address parameter.

In most cases we already have vma on the stack. We provides
split_huge_page_pmd_mm() for few cases when we have mm, but not vma.

This change is preparation to huge zero pmd splitting implementation.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9392a..291a0d15a0b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	spinlock_t *ptl;
 	struct page *page;
 
-	split_huge_page_pmd(walk->mm, pmd);
+	split_huge_page_pmd(vma, addr, pmd);
 	if (pmd_trans_unstable(pmd))
 		return 0;
 
-- 
cgit v1.2.3


From 4ff1b2c29326a2a3e130b46f69b7ab0e853d09d8 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 12 Dec 2012 13:51:25 -0800
Subject: procfs: use N_MEMORY instead N_HIGH_MEMORY

N_HIGH_MEMORY stands for the nodes that has normal or high memory.
N_MEMORY stands for the nodes that has any memory.

The code here need to handle with the nodes which have memory, we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Lin Feng <linfeng@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c    | 2 +-
 fs/proc/task_mmu.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 86c67eee439..e96d4f18ca3 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -249,7 +249,7 @@ static int kcore_update_ram(void)
 	/* Not inialized....update now */
 	/* find out "max pfn" */
 	end_pfn = 0;
-	for_each_node_state(nid, N_HIGH_MEMORY) {
+	for_each_node_state(nid, N_MEMORY) {
 		unsigned long node_end;
 		node_end  = NODE_DATA(nid)->node_start_pfn +
 			NODE_DATA(nid)->node_spanned_pages;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 291a0d15a0b..48775628abb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1126,7 +1126,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 		return NULL;
 
 	nid = page_to_nid(page);
-	if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+	if (!node_isset(nid, node_states[N_MEMORY]))
 		return NULL;
 
 	return page;
@@ -1279,7 +1279,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	if (md->writeback)
 		seq_printf(m, " writeback=%lu", md->writeback);
 
-	for_each_node_state(n, N_HIGH_MEMORY)
+	for_each_node_state(n, N_MEMORY)
 		if (md->node[n])
 			seq_printf(m, " N%d=%lu", n, md->node[n]);
 out:
-- 
cgit v1.2.3


From f9a00e8738c209d95493cf97d3a82ab2655892e5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 17 Dec 2012 16:01:25 -0800
Subject: procfs: use kbasename()

[yongjun_wei@trendmicro.com.cn: remove duplicated include]
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_devtree.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index df7dd08d439..de20ec480fa 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -195,11 +195,7 @@ void proc_device_tree_add_node(struct device_node *np,
 	set_node_proc_entry(np, de);
 	for (child = NULL; (child = of_get_next_child(np, child));) {
 		/* Use everything after the last slash, or the full name */
-		p = strrchr(child->full_name, '/');
-		if (!p)
-			p = child->full_name;
-		else
-			++p;
+		p = kbasename(child->full_name);
 
 		if (duplicate_name(de, p))
 			p = fixup_name(np, de, p);
-- 
cgit v1.2.3


From 7b9a7ec565505699f503b4fcf61500dceb36e744 Mon Sep 17 00:00:00 2001
From: Andrew Vagin <avagin@openvz.org>
Date: Mon, 17 Dec 2012 16:03:10 -0800
Subject: proc: don't show nonexistent capabilities

Without this patch it is really hard to interpret a bounding set, if
CAP_LAST_CAP is unknown for a current kernel.

Non-existant capabilities can not be deleted from a bounding set with help
of prctl.

E.g.: Here are two examples without/with this patch.

  CapBnd:	ffffffe0fdecffff
  CapBnd:	00000000fdecffff

I suggest to hide non-existent capabilities. Here is two reasons.
* It's logically and easier for using.
* It helps to checkpoint-restore capabilities of tasks, because tasks
can be restored on another kernel, where CAP_LAST_CAP is bigger.

Signed-off-by: Andrew Vagin <avagin@openvz.org>
Cc: Andrew G. Morgan <morgan@kernel.org>
Reviewed-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index d3696708fc1..377a37366dd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,6 +308,10 @@ static void render_cap_t(struct seq_file *m, const char *header,
 	seq_putc(m, '\n');
 }
 
+/* Remove non-existent capabilities */
+#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \
+				CAP_TO_MASK(CAP_LAST_CAP + 1) - 1)
+
 static inline void task_cap(struct seq_file *m, struct task_struct *p)
 {
 	const struct cred *cred;
@@ -321,6 +325,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
 	cap_bset	= cred->cap_bset;
 	rcu_read_unlock();
 
+	NORM_CAPS(cap_inheritable);
+	NORM_CAPS(cap_permitted);
+	NORM_CAPS(cap_effective);
+	NORM_CAPS(cap_bset);
+
 	render_cap_t(m, "CapInh:\t", &cap_inheritable);
 	render_cap_t(m, "CapPrm:\t", &cap_permitted);
 	render_cap_t(m, "CapEff:\t", &cap_effective);
-- 
cgit v1.2.3


From 834f82e2aa9a8ede94b17b656329f850c1471514 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon, 17 Dec 2012 16:03:13 -0800
Subject: procfs: add VmFlags field in smaps output

During c/r sessions we've found that there is no way at the moment to
fetch some VMA associated flags, such as mlock() and madvise().

This leads us to a problem -- we don't know if we should call for mlock()
and/or madvise() after restore on the vma area we're bringing back to
life.

This patch intorduces a new field into "smaps" output called VmFlags,
where all set flags associated with the particular VMA is shown as two
letter mnemonics.

[ Strictly speaking for c/r we only need mlock/madvise bits but it has been
  said that providing just a few flags looks somehow inconsistent.  So all
  flags are here now. ]

This feature is made available on CONFIG_CHECKPOINT_RESTORE=n kernels, as
other applications may start to use these fields.

The data is encoded in a somewhat awkward two letters mnemonic form, to
encourage userspace to be prepared for fields being added or removed in
the future.

[a.p.zijlstra@chello.nl: props to use for_each_set_bit]
[sfr@canb.auug.org.au: props to use array instead of struct]
[akpm@linux-foundation.org: overall redesign and simplification]
[akpm@linux-foundation.org: remove unneeded braces per sfr, avoid using bloaty for_each_set_bit()]
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 48775628abb..448455b7fd9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -526,6 +526,57 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	return 0;
 }
 
+static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
+{
+	/*
+	 * Don't forget to update Documentation/ on changes.
+	 */
+	static const char mnemonics[BITS_PER_LONG][2] = {
+		/*
+		 * In case if we meet a flag we don't know about.
+		 */
+		[0 ... (BITS_PER_LONG-1)] = "??",
+
+		[ilog2(VM_READ)]	= "rd",
+		[ilog2(VM_WRITE)]	= "wr",
+		[ilog2(VM_EXEC)]	= "ex",
+		[ilog2(VM_SHARED)]	= "sh",
+		[ilog2(VM_MAYREAD)]	= "mr",
+		[ilog2(VM_MAYWRITE)]	= "mw",
+		[ilog2(VM_MAYEXEC)]	= "me",
+		[ilog2(VM_MAYSHARE)]	= "ms",
+		[ilog2(VM_GROWSDOWN)]	= "gd",
+		[ilog2(VM_PFNMAP)]	= "pf",
+		[ilog2(VM_DENYWRITE)]	= "dw",
+		[ilog2(VM_LOCKED)]	= "lo",
+		[ilog2(VM_IO)]		= "io",
+		[ilog2(VM_SEQ_READ)]	= "sr",
+		[ilog2(VM_RAND_READ)]	= "rr",
+		[ilog2(VM_DONTCOPY)]	= "dc",
+		[ilog2(VM_DONTEXPAND)]	= "de",
+		[ilog2(VM_ACCOUNT)]	= "ac",
+		[ilog2(VM_NORESERVE)]	= "nr",
+		[ilog2(VM_HUGETLB)]	= "ht",
+		[ilog2(VM_NONLINEAR)]	= "nl",
+		[ilog2(VM_ARCH_1)]	= "ar",
+		[ilog2(VM_DONTDUMP)]	= "dd",
+		[ilog2(VM_MIXEDMAP)]	= "mm",
+		[ilog2(VM_HUGEPAGE)]	= "hg",
+		[ilog2(VM_NOHUGEPAGE)]	= "nh",
+		[ilog2(VM_MERGEABLE)]	= "mg",
+	};
+	size_t i;
+
+	seq_puts(m, "VmFlags: ");
+	for (i = 0; i < BITS_PER_LONG; i++) {
+		if (vma->vm_flags & (1UL << i)) {
+			seq_printf(m, "%c%c ",
+				   mnemonics[i][0], mnemonics[i][1]);
+		}
+	}
+	seq_putc(m, '\n');
+}
+
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
 	struct proc_maps_private *priv = m->private;
@@ -581,6 +632,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		seq_printf(m, "Nonlinear:      %8lu kB\n",
 				mss.nonlinear >> 10);
 
+	show_smap_vma_flags(m, vma);
+
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task->mm))
 			? vma->vm_start : 0;
-- 
cgit v1.2.3


From 2f4b3bf6b2318cfaa177ec5a802f4d8d6afbd816 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 17 Dec 2012 16:03:14 -0800
Subject: /proc/pid/status: add "Seccomp" field

It is currently impossible to examine the state of seccomp for a given
process.  While attaching with gdb and attempting "call
prctl(PR_GET_SECCOMP,...)" will work with some situations, it is not
reliable.  If the process is in seccomp mode 1, this query will kill the
process (prctl not allowed), if the process is in mode 2 with prctl not
allowed, it will similarly be killed, and in weird cases, if prctl is
filtered to return errno 0, it can look like seccomp is disabled.

When reviewing the state of running processes, there should be a way to
externally examine the seccomp mode.  ("Did this build of Chrome end up
using seccomp?" "Did my distro ship ssh with seccomp enabled?")

This adds the "Seccomp" line to /proc/$pid/status.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: James Morris <jmorris@namei.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 377a37366dd..077235ffb38 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -336,6 +336,13 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
 	render_cap_t(m, "CapBnd:\t", &cap_bset);
 }
 
+static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
+{
+#ifdef CONFIG_SECCOMP
+	seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
+#endif
+}
+
 static inline void task_context_switch_counts(struct seq_file *m,
 						struct task_struct *p)
 {
@@ -369,6 +376,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	}
 	task_sig(m, task);
 	task_cap(m, task);
+	task_seccomp(m, task);
 	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
-- 
cgit v1.2.3


From 8d238027b87e654be552eabdf492042a34c5c300 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Date: Mon, 17 Dec 2012 16:03:17 -0800
Subject: proc: pid/status: show all supplementary groups

We display a list of supplementary group for each process in
/proc/<pid>/status.  However, we show only the first 32 groups, not all of
them.

Although this is rare, but sometimes processes do have more than 32
supplementary groups, and this kernel limitation breaks user-space apps
that rely on the group list in /proc/<pid>/status.

Number 32 comes from the internal NGROUPS_SMALL macro which defines the
length for the internal kernel "small" groups buffer.  There is no
apparent reason to limit to this value.

This patch removes the 32 groups printing limit.

The Linux kernel limits the amount of supplementary groups by NGROUPS_MAX,
which is currently set to 65536.  And this is the maximum count of groups
we may possibly print.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 077235ffb38..439544fec38 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -212,7 +212,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	group_info = cred->group_info;
 	task_unlock(p);
 
-	for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
+	for (g = 0; g < group_info->ngroups; g++)
 		seq_printf(m, "%d ",
 			   from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
 	put_cred(cred);
-- 
cgit v1.2.3


From 55985dd72ab27b47530dcc8bdddd28b69f4abe8b Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon, 17 Dec 2012 16:04:55 -0800
Subject: procfs: add ability to plug in auxiliary fdinfo providers

This patch brings ability to print out auxiliary data associated with
file in procfs interface /proc/pid/fdinfo/fd.

In particular further patches make eventfd, evenpoll, signalfd and
fsnotify to print additional information complete enough to restore
these objects after checkpoint.

To simplify the code we add show_fdinfo callback inside struct
file_operations (as Al and Pavel are proposing).

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: James Bottomley <jbottomley@parallels.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matthew Helsley <matt.helsley@gmail.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/fd.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/proc')

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index f28a875f877..d7a4a28ef63 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -50,6 +50,8 @@ static int seq_show(struct seq_file *m, void *v)
 	if (!ret) {
                 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n",
 			   (long long)file->f_pos, f_flags);
+		if (file->f_op->show_fdinfo)
+			ret = file->f_op->show_fdinfo(m, file);
 		fput(file);
 	}
 
-- 
cgit v1.2.3


From 138d22b58696c506799f8de759804083ff9effae Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon, 17 Dec 2012 16:05:02 -0800
Subject: fs, epoll: add procfs fdinfo helper

This allows us to print out eventpoll target file descriptor, events and
data, the /proc/pid/fdinfo/fd consists of

 | pos:	0
 | flags:	02
 | tfd:        5 events:       1d data: ffffffffffffffff enabled: 1

[avagin@: fix for unitialized ret variable]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: James Bottomley <jbottomley@parallels.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Matthew Helsley <matt.helsley@gmail.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@onelan.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 439544fec38..060a56a9127 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -220,7 +220,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	seq_putc(m, '\n');
 }
 
-static void render_sigset_t(struct seq_file *m, const char *header,
+void render_sigset_t(struct seq_file *m, const char *header,
 				sigset_t *set)
 {
 	int i;
-- 
cgit v1.2.3


From 46f69557103e11fb963ae5c98b7777e90493241b Mon Sep 17 00:00:00 2001
From: Marco Stornelli <marco.stornelli@gmail.com>
Date: Sat, 15 Dec 2012 11:48:48 +0100
Subject: procfs: drop vmtruncate

Removed vmtruncate

Signed-off-by: Marco Stornelli <marco.stornelli@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c        | 7 -------
 fs/proc/generic.c     | 9 +--------
 fs/proc/proc_sysctl.c | 7 -------
 3 files changed, 1 insertion(+), 22 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5a5a0be40e4..9b43ff77a51 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -542,13 +542,6 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		return error;
 
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		error = vmtruncate(inode, attr->ia_size);
-		if (error)
-			return error;
-	}
-
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 7b3ae3cc0ef..2e4ed13b9ee 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -261,16 +261,9 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	if (error)
 		return error;
 
-	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(inode)) {
-		error = vmtruncate(inode, iattr->ia_size);
-		if (error)
-			return error;
-	}
-
 	setattr_copy(inode, iattr);
 	mark_inode_dirty(inode);
-	
+
 	de->uid = inode->i_uid;
 	de->gid = inode->i_gid;
 	de->mode = inode->i_mode;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 701580ddfcc..1827d88ad58 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -736,13 +736,6 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		return error;
 
-	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		error = vmtruncate(inode, attr->ia_size);
-		if (error)
-			return error;
-	}
-
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
-- 
cgit v1.2.3


From ee297209bf0a25c6717b7c063e76795142d32f37 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <xtfeng@gmail.com>
Date: Thu, 20 Dec 2012 15:05:44 -0800
Subject: proc: fix inconsistent lock state

Lockdep found an inconsistent lock state when rcu is processing delayed
work in softirq.  Currently, kernel is using spin_lock/spin_unlock to
protect proc_inum_ida, but proc_free_inum is called by rcu in softirq
context.

Use spin_lock_bh/spin_unlock_bh fix following lockdep warning.

  =================================
  [ INFO: inconsistent lock state ]
  3.7.0 #36 Not tainted
  ---------------------------------
  inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
  swapper/1/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
   (proc_inum_lock){+.?...}, at: proc_free_inum+0x1c/0x50
  {SOFTIRQ-ON-W} state was registered at:
     __lock_acquire+0x8ae/0xca0
     lock_acquire+0x199/0x200
     _raw_spin_lock+0x41/0x50
     proc_alloc_inum+0x4c/0xd0
     alloc_mnt_ns+0x49/0xc0
     create_mnt_ns+0x25/0x70
     mnt_init+0x161/0x1c7
     vfs_caches_init+0x107/0x11a
     start_kernel+0x348/0x38c
     x86_64_start_reservations+0x131/0x136
     x86_64_start_kernel+0x103/0x112
  irq event stamp: 2993422
  hardirqs last  enabled at (2993422):  _raw_spin_unlock_irqrestore+0x55/0x80
  hardirqs last disabled at (2993421):  _raw_spin_lock_irqsave+0x29/0x70
  softirqs last  enabled at (2993394):  _local_bh_enable+0x13/0x20
  softirqs last disabled at (2993395):  call_softirq+0x1c/0x30

  other info that might help us debug this:
   Possible unsafe locking scenario:

         CPU0
         ----
    lock(proc_inum_lock);
    <Interrupt>
      lock(proc_inum_lock);

   *** DEADLOCK ***

  no locks held by swapper/1/0.

  stack backtrace:
  Pid: 0, comm: swapper/1 Not tainted 3.7.0 #36
  Call Trace:
   <IRQ>  [<ffffffff810a40f1>] ? vprintk_emit+0x471/0x510
    print_usage_bug+0x2a5/0x2c0
    mark_lock+0x33b/0x5e0
    __lock_acquire+0x813/0xca0
    lock_acquire+0x199/0x200
    _raw_spin_lock+0x41/0x50
    proc_free_inum+0x1c/0x50
    free_pid_ns+0x1c/0x50
    put_pid_ns+0x2e/0x50
    put_pid+0x4a/0x60
    delayed_put_pid+0x12/0x20
    rcu_process_callbacks+0x462/0x790
    __do_softirq+0x1b4/0x3b0
    call_softirq+0x1c/0x30
    do_softirq+0x59/0xd0
    irq_exit+0x54/0xd0
    smp_apic_timer_interrupt+0x95/0xa3
    apic_timer_interrupt+0x72/0x80
    cpuidle_enter_tk+0x10/0x20
    cpuidle_enter_state+0x17/0x50
    cpuidle_idle_call+0x287/0x520
    cpu_idle+0xba/0x130
    start_secondary+0x2b3/0x2bc

Signed-off-by: Xiaotian Feng <dannyfeng@tencent.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 7b3ae3cc0ef..e659a0ff1da 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -359,18 +359,18 @@ retry:
 	if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
 		return -ENOMEM;
 
-	spin_lock(&proc_inum_lock);
+	spin_lock_bh(&proc_inum_lock);
 	error = ida_get_new(&proc_inum_ida, &i);
-	spin_unlock(&proc_inum_lock);
+	spin_unlock_bh(&proc_inum_lock);
 	if (error == -EAGAIN)
 		goto retry;
 	else if (error)
 		return error;
 
 	if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
-		spin_lock(&proc_inum_lock);
+		spin_lock_bh(&proc_inum_lock);
 		ida_remove(&proc_inum_ida, i);
-		spin_unlock(&proc_inum_lock);
+		spin_unlock_bh(&proc_inum_lock);
 		return -ENOSPC;
 	}
 	*inum = PROC_DYNAMIC_FIRST + i;
@@ -379,9 +379,9 @@ retry:
 
 void proc_free_inum(unsigned int inum)
 {
-	spin_lock(&proc_inum_lock);
+	spin_lock_bh(&proc_inum_lock);
 	ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
-	spin_unlock(&proc_inum_lock);
+	spin_unlock_bh(&proc_inum_lock);
 }
 
 static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
-- 
cgit v1.2.3


From dfb2ea45becb198beeb75350d0b7b7ad9076a38f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 21 Dec 2012 20:38:00 -0800
Subject: proc: Allow proc_free_inum to be called from any context

While testing the pid namespace code I hit this nasty warning.

[  176.262617] ------------[ cut here ]------------
[  176.263388] WARNING: at /home/eric/projects/linux/linux-userns-devel/kernel/softirq.c:160 local_bh_enable_ip+0x7a/0xa0()
[  176.265145] Hardware name: Bochs
[  176.265677] Modules linked in:
[  176.266341] Pid: 742, comm: bash Not tainted 3.7.0userns+ #18
[  176.266564] Call Trace:
[  176.266564]  [<ffffffff810a539f>] warn_slowpath_common+0x7f/0xc0
[  176.266564]  [<ffffffff810a53fa>] warn_slowpath_null+0x1a/0x20
[  176.266564]  [<ffffffff810ad9ea>] local_bh_enable_ip+0x7a/0xa0
[  176.266564]  [<ffffffff819308c9>] _raw_spin_unlock_bh+0x19/0x20
[  176.266564]  [<ffffffff8123dbda>] proc_free_inum+0x3a/0x50
[  176.266564]  [<ffffffff8111d0dc>] free_pid_ns+0x1c/0x80
[  176.266564]  [<ffffffff8111d195>] put_pid_ns+0x35/0x50
[  176.266564]  [<ffffffff810c608a>] put_pid+0x4a/0x60
[  176.266564]  [<ffffffff8146b177>] tty_ioctl+0x717/0xc10
[  176.266564]  [<ffffffff810aa4d5>] ? wait_consider_task+0x855/0xb90
[  176.266564]  [<ffffffff81086bf9>] ? default_spin_lock_flags+0x9/0x10
[  176.266564]  [<ffffffff810cab0a>] ? remove_wait_queue+0x5a/0x70
[  176.266564]  [<ffffffff811e37e8>] do_vfs_ioctl+0x98/0x550
[  176.266564]  [<ffffffff810b8a0f>] ? recalc_sigpending+0x1f/0x60
[  176.266564]  [<ffffffff810b9127>] ? __set_task_blocked+0x37/0x80
[  176.266564]  [<ffffffff810ab95b>] ? sys_wait4+0xab/0xf0
[  176.266564]  [<ffffffff811e3d31>] sys_ioctl+0x91/0xb0
[  176.266564]  [<ffffffff810a95f0>] ? task_stopped_code+0x50/0x50
[  176.266564]  [<ffffffff81939199>] system_call_fastpath+0x16/0x1b
[  176.266564] ---[ end trace 387af88219ad6143 ]---

It turns out that spin_unlock_bh(proc_inum_lock) is not safe when
put_pid is called with another spinlock held and irqs disabled.

For now take the easy path and use spin_lock_irqsave(proc_inum_lock)
in proc_free_inum and spin_loc_irq in proc_alloc_inum(proc_inum_lock).

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/generic.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs/proc')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e064f562b1f..76ddae83daa 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -352,18 +352,18 @@ retry:
 	if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
 		return -ENOMEM;
 
-	spin_lock_bh(&proc_inum_lock);
+	spin_lock_irq(&proc_inum_lock);
 	error = ida_get_new(&proc_inum_ida, &i);
-	spin_unlock_bh(&proc_inum_lock);
+	spin_unlock_irq(&proc_inum_lock);
 	if (error == -EAGAIN)
 		goto retry;
 	else if (error)
 		return error;
 
 	if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
-		spin_lock_bh(&proc_inum_lock);
+		spin_lock_irq(&proc_inum_lock);
 		ida_remove(&proc_inum_ida, i);
-		spin_unlock_bh(&proc_inum_lock);
+		spin_unlock_irq(&proc_inum_lock);
 		return -ENOSPC;
 	}
 	*inum = PROC_DYNAMIC_FIRST + i;
@@ -372,9 +372,10 @@ retry:
 
 void proc_free_inum(unsigned int inum)
 {
-	spin_lock_bh(&proc_inum_lock);
+	unsigned long flags;
+	spin_lock_irqsave(&proc_inum_lock, flags);
 	ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
-	spin_unlock_bh(&proc_inum_lock);
+	spin_unlock_irqrestore(&proc_inum_lock, flags);
 }
 
 static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
-- 
cgit v1.2.3


From a7a88b23737095e6c18a20c5d4eef9e25ec5b829 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Wed, 2 Jan 2013 02:04:23 -0800
Subject: mempolicy: remove arg from mpol_parse_str, mpol_to_str

Remove the unused argument (formerly no_context) from mpol_parse_str()
and from mpol_to_str().

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/proc')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 448455b7fd9..ca5ce7f9f80 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1278,7 +1278,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	walk.mm = mm;
 
 	pol = get_vma_policy(task, vma, vma->vm_start);
-	mpol_to_str(buffer, sizeof(buffer), pol, 0);
+	mpol_to_str(buffer, sizeof(buffer), pol);
 	mpol_cond_put(pol);
 
 	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
-- 
cgit v1.2.3