aboutsummaryrefslogtreecommitdiffstats
path: root/fs/proc
diff options
context:
space:
mode:
authorPatrick McHardy <kaber@trash.net>2012-08-08 21:03:47 +0200
committerPatrick McHardy <kaber@trash.net>2012-08-08 21:03:47 +0200
commitd53b4ed072d9779cdf53582c46436dec06d0961f (patch)
treeac95ecab33e31cd79aae69c475e8348adac51230 /fs/proc
parent5d4dff7f1011a81a693a9c7b1f6a0b9c842eb60c (diff)
parent28a33cbc24e4256c143dce96c7d93bf423229f92 (diff)
Merge tag 'v3.5' of 192.168.0.154:/repos/git/linux-2.6
Conflicts: drivers/Kconfig Signed-off-by: Patrick McHardy <kaber@trash.net>
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/array.c280
-rw-r--r--fs/proc/base.c997
-rw-r--r--fs/proc/generic.c8
-rw-r--r--fs/proc/inode.c38
-rw-r--r--fs/proc/internal.h16
-rw-r--r--fs/proc/kcore.c8
-rw-r--r--fs/proc/namespaces.c9
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/proc_net.c2
-rw-r--r--fs/proc/proc_sysctl.c1276
-rw-r--r--fs/proc/root.c71
-rw-r--r--fs/proc/stat.c151
-rw-r--r--fs/proc/task_mmu.c429
-rw-r--r--fs/proc/task_nommu.c71
-rw-r--r--fs/proc/uptime.c11
-rw-r--r--fs/proc/vmcore.c23
16 files changed, 2659 insertions, 733 deletions
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3a1dafd228d..c1c207c36ca 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,6 +81,7 @@
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
+#include <linux/user_namespace.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -161,6 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk)
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
+ struct user_namespace *user_ns = current_user_ns();
struct group_info *group_info;
int g;
struct fdtable *fdt = NULL;
@@ -189,8 +191,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
task_tgid_nr_ns(p, ns),
pid_nr_ns(pid, ns),
ppid, tpid,
- cred->uid, cred->euid, cred->suid, cred->fsuid,
- cred->gid, cred->egid, cred->sgid, cred->fsgid);
+ from_kuid_munged(user_ns, cred->uid),
+ from_kuid_munged(user_ns, cred->euid),
+ from_kuid_munged(user_ns, cred->suid),
+ from_kuid_munged(user_ns, cred->fsuid),
+ from_kgid_munged(user_ns, cred->gid),
+ from_kgid_munged(user_ns, cred->egid),
+ from_kgid_munged(user_ns, cred->sgid),
+ from_kgid_munged(user_ns, cred->fsgid));
task_lock(p);
if (p->files)
@@ -205,7 +213,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
task_unlock(p);
for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
- seq_printf(m, "%d ", GROUP_AT(group_info, g));
+ seq_printf(m, "%d ",
+ from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
put_cred(cred);
seq_putc(m, '\n');
@@ -361,7 +370,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task, int whole)
{
unsigned long vsize, eip, esp, wchan = ~0UL;
- long priority, nice;
+ int priority, nice;
int tty_pgrp = -1, tty_nr = 0;
sigset_t sigign, sigcatch;
char state;
@@ -380,7 +389,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
state = *get_task_state(task);
vsize = eip = esp = 0;
- permitted = ptrace_may_access(task, PTRACE_MODE_READ);
+ permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
mm = get_task_mm(task);
if (mm) {
vsize = task_vsize(mm);
@@ -394,8 +403,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
sigemptyset(&sigign);
sigemptyset(&sigcatch);
- cutime = cstime = utime = stime = cputime_zero;
- cgtime = gtime = cputime_zero;
+ cutime = cstime = utime = stime = 0;
+ cgtime = gtime = 0;
if (lock_task_sighand(task, &flags)) {
struct signal_struct *sig = task->signal;
@@ -423,14 +432,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
- gtime = cputime_add(gtime, t->gtime);
+ gtime += t->gtime;
t = next_thread(t);
} while (t != task);
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
thread_group_times(task, &utime, &stime);
- gtime = cputime_add(gtime, sig->gtime);
+ gtime += sig->gtime;
}
sid = task_session_nr_ns(task, ns);
@@ -462,56 +471,70 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(start_time);
- seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
-%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
- pid_nr_ns(pid, ns),
- tcomm,
- state,
- ppid,
- pgid,
- sid,
- tty_nr,
- tty_pgrp,
- task->flags,
- min_flt,
- cmin_flt,
- maj_flt,
- cmaj_flt,
- cputime_to_clock_t(utime),
- cputime_to_clock_t(stime),
- cputime_to_clock_t(cutime),
- cputime_to_clock_t(cstime),
- priority,
- nice,
- num_threads,
- start_time,
- vsize,
- mm ? get_mm_rss(mm) : 0,
- rsslim,
- mm ? (permitted ? mm->start_code : 1) : 0,
- mm ? (permitted ? mm->end_code : 1) : 0,
- (permitted && mm) ? mm->start_stack : 0,
- esp,
- eip,
- /* The signal information here is obsolete.
- * It must be decimal for Linux 2.0 compatibility.
- * Use /proc/#/status for real-time signals.
- */
- task->pending.signal.sig[0] & 0x7fffffffUL,
- task->blocked.sig[0] & 0x7fffffffUL,
- sigign .sig[0] & 0x7fffffffUL,
- sigcatch .sig[0] & 0x7fffffffUL,
- wchan,
- 0UL,
- 0UL,
- task->exit_signal,
- task_cpu(task),
- task->rt_priority,
- task->policy,
- (unsigned long long)delayacct_blkio_ticks(task),
- cputime_to_clock_t(gtime),
- cputime_to_clock_t(cgtime));
+ seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
+ seq_put_decimal_ll(m, ' ', ppid);
+ seq_put_decimal_ll(m, ' ', pgid);
+ seq_put_decimal_ll(m, ' ', sid);
+ seq_put_decimal_ll(m, ' ', tty_nr);
+ seq_put_decimal_ll(m, ' ', tty_pgrp);
+ seq_put_decimal_ull(m, ' ', task->flags);
+ seq_put_decimal_ull(m, ' ', min_flt);
+ seq_put_decimal_ull(m, ' ', cmin_flt);
+ seq_put_decimal_ull(m, ' ', maj_flt);
+ seq_put_decimal_ull(m, ' ', cmaj_flt);
+ seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
+ seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
+ seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
+ seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
+ seq_put_decimal_ll(m, ' ', priority);
+ seq_put_decimal_ll(m, ' ', nice);
+ seq_put_decimal_ll(m, ' ', num_threads);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, ' ', start_time);
+ seq_put_decimal_ull(m, ' ', vsize);
+ seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0);
+ seq_put_decimal_ull(m, ' ', rsslim);
+ seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
+ seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
+ seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
+ seq_put_decimal_ull(m, ' ', esp);
+ seq_put_decimal_ull(m, ' ', eip);
+ /* The signal information here is obsolete.
+ * It must be decimal for Linux 2.0 compatibility.
+ * Use /proc/#/status for real-time signals.
+ */
+ seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
+ seq_put_decimal_ull(m, ' ', wchan);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ll(m, ' ', task->exit_signal);
+ seq_put_decimal_ll(m, ' ', task_cpu(task));
+ seq_put_decimal_ull(m, ' ', task->rt_priority);
+ seq_put_decimal_ull(m, ' ', task->policy);
+ seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
+ seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
+ seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
+
+ if (mm && permitted) {
+ seq_put_decimal_ull(m, ' ', mm->start_data);
+ seq_put_decimal_ull(m, ' ', mm->end_data);
+ seq_put_decimal_ull(m, ' ', mm->start_brk);
+ seq_put_decimal_ull(m, ' ', mm->arg_start);
+ seq_put_decimal_ull(m, ' ', mm->arg_end);
+ seq_put_decimal_ull(m, ' ', mm->env_start);
+ seq_put_decimal_ull(m, ' ', mm->env_end);
+ } else
+ seq_printf(m, " 0 0 0 0 0 0 0");
+
+ if (permitted)
+ seq_put_decimal_ll(m, ' ', task->exit_code);
+ else
+ seq_put_decimal_ll(m, ' ', 0);
+
+ seq_putc(m, '\n');
if (mm)
mmput(mm);
return 0;
@@ -539,8 +562,143 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
size = task_statm(mm, &shared, &text, &data, &resident);
mmput(mm);
}
- seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
- size, resident, shared, text, data);
+ /*
+ * For quick read, open code by putting numbers directly
+ * expected format is
+ * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+ * size, resident, shared, text, data);
+ */
+ seq_put_decimal_ull(m, 0, size);
+ seq_put_decimal_ull(m, ' ', resident);
+ seq_put_decimal_ull(m, ' ', shared);
+ seq_put_decimal_ull(m, ' ', text);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_put_decimal_ull(m, ' ', data);
+ seq_put_decimal_ull(m, ' ', 0);
+ seq_putc(m, '\n');
return 0;
}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static struct pid *
+get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
+{
+ struct task_struct *start, *task;
+ struct pid *pid = NULL;
+
+ read_lock(&tasklist_lock);
+
+ start = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (!start)
+ goto out;
+
+ /*
+ * Lets try to continue searching first, this gives
+ * us significant speedup on children-rich processes.
+ */
+ if (pid_prev) {
+ task = pid_task(pid_prev, PIDTYPE_PID);
+ if (task && task->real_parent == start &&
+ !(list_empty(&task->sibling))) {
+ if (list_is_last(&task->sibling, &start->children))
+ goto out;
+ task = list_first_entry(&task->sibling,
+ struct task_struct, sibling);
+ pid = get_pid(task_pid(task));
+ goto out;
+ }
+ }
+
+ /*
+ * Slow search case.
+ *
+ * We might miss some children here if children
+ * are exited while we were not holding the lock,
+ * but it was never promised to be accurate that
+ * much.
+ *
+ * "Just suppose that the parent sleeps, but N children
+ * exit after we printed their tids. Now the slow paths
+ * skips N extra children, we miss N tasks." (c)
+ *
+ * So one need to stop or freeze the leader and all
+ * its children to get a precise result.
+ */
+ list_for_each_entry(task, &start->children, sibling) {
+ if (pos-- == 0) {
+ pid = get_pid(task_pid(task));
+ break;
+ }
+ }
+
+out:
+ read_unlock(&tasklist_lock);
+ return pid;
+}
+
+static int children_seq_show(struct seq_file *seq, void *v)
+{
+ struct inode *inode = seq->private;
+ pid_t pid;
+
+ pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
+ return seq_printf(seq, "%d ", pid);
+}
+
+static void *children_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ return get_children_pid(seq->private, NULL, *pos);
+}
+
+static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct pid *pid;
+
+ pid = get_children_pid(seq->private, v, *pos + 1);
+ put_pid(v);
+
+ ++*pos;
+ return pid;
+}
+
+static void children_seq_stop(struct seq_file *seq, void *v)
+{
+ put_pid(v);
+}
+
+static const struct seq_operations children_seq_ops = {
+ .start = children_seq_start,
+ .next = children_seq_next,
+ .stop = children_seq_stop,
+ .show = children_seq_show,
+};
+
+static int children_seq_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &children_seq_ops);
+ if (ret)
+ return ret;
+
+ m = file->private_data;
+ m->private = inode;
+
+ return ret;
+}
+
+int children_seq_release(struct inode *inode, struct file *file)
+{
+ seq_release(inode, file);
+ return 0;
+}
+
+const struct file_operations proc_tid_children_operations = {
+ .open = children_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = children_seq_release,
+};
+#endif /* CONFIG_CHECKPOINT_RESTORE */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 851ba3dcdc2..437195f204e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,11 +81,14 @@
#include <linux/oom.h>
#include <linux/elf.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
+#include <linux/flex_array.h>
#ifdef CONFIG_HARDWALL
#include <asm/hardwall.h>
#endif
+#include <trace/events/oom.h>
#include "internal.h"
/* NOTE:
@@ -101,7 +104,7 @@
struct pid_entry {
char *name;
int len;
- mode_t mode;
+ umode_t mode;
const struct inode_operations *iop;
const struct file_operations *fop;
union proc_op op;
@@ -133,6 +136,8 @@ struct pid_entry {
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
+static int proc_fd_permission(struct inode *inode, int mask);
+
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
@@ -165,9 +170,9 @@ static int get_task_root(struct task_struct *task, struct path *root)
return result;
}
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(dentry->d_inode);
int result = -ENOENT;
if (task) {
@@ -182,9 +187,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
return result;
}
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(dentry->d_inode);
int result = -ENOENT;
if (task) {
@@ -194,84 +199,6 @@ static int proc_root_link(struct inode *inode, struct path *path)
return result;
}
-static struct mm_struct *__check_mem_permission(struct task_struct *task)
-{
- struct mm_struct *mm;
-
- mm = get_task_mm(task);
- if (!mm)
- return ERR_PTR(-EINVAL);
-
- /*
- * A task can always look at itself, in case it chooses
- * to use system calls instead of load instructions.
- */
- if (task == current)
- return mm;
-
- /*
- * If current is actively ptrace'ing, and would also be
- * permitted to freshly attach with ptrace now, permit it.
- */
- if (task_is_stopped_or_traced(task)) {
- int match;
- rcu_read_lock();
- match = (ptrace_parent(task) == current);
- rcu_read_unlock();
- if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
- return mm;
- }
-
- /*
- * No one else is allowed.
- */
- mmput(mm);
- return ERR_PTR(-EPERM);
-}
-
-/*
- * If current may access user memory in @task return a reference to the
- * corresponding mm, otherwise ERR_PTR.
- */
-static struct mm_struct *check_mem_permission(struct task_struct *task)
-{
- struct mm_struct *mm;
- int err;
-
- /*
- * Avoid racing if task exec's as we might get a new mm but validate
- * against old credentials.
- */
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
- if (err)
- return ERR_PTR(err);
-
- mm = __check_mem_permission(task);
- mutex_unlock(&task->signal->cred_guard_mutex);
-
- return mm;
-}
-
-struct mm_struct *mm_for_maps(struct task_struct *task)
-{
- struct mm_struct *mm;
- int err;
-
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
- if (err)
- return ERR_PTR(err);
-
- mm = get_task_mm(task);
- if (mm && mm != current->mm &&
- !ptrace_may_access(task, PTRACE_MODE_READ)) {
- mmput(mm);
- mm = ERR_PTR(-EACCES);
- }
- mutex_unlock(&task->signal->cred_guard_mutex);
-
- return mm;
-}
-
static int proc_pid_cmdline(struct task_struct *task, char * buffer)
{
int res = 0;
@@ -311,7 +238,7 @@ out:
static int proc_pid_auxv(struct task_struct *task, char *buffer)
{
- struct mm_struct *mm = mm_for_maps(task);
+ struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
int res = PTR_ERR(mm);
if (mm && !IS_ERR(mm)) {
unsigned int nwords = 0;
@@ -479,12 +406,13 @@ static const struct file_operations proc_lstats_operations = {
static int proc_oom_score(struct task_struct *task, char *buffer)
{
+ unsigned long totalpages = totalram_pages + total_swap_pages;
unsigned long points = 0;
read_lock(&tasklist_lock);
if (pid_alive(task))
- points = oom_badness(task, NULL, NULL,
- totalram_pages + total_swap_pages);
+ points = oom_badness(task, NULL, NULL, totalpages) *
+ 1000 / totalpages;
read_unlock(&tasklist_lock);
return sprintf(buffer, "%lu\n", points);
}
@@ -627,122 +555,54 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
return 0;
}
-static const struct inode_operations proc_def_inode_operations = {
- .setattr = proc_setattr,
-};
-
-static int mounts_open_common(struct inode *inode, struct file *file,
- const struct seq_operations *op)
+/*
+ * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+ * or euid/egid (for hide_pid_min=2)?
+ */
+static bool has_pid_permissions(struct pid_namespace *pid,
+ struct task_struct *task,
+ int hide_pid_min)
{
- struct task_struct *task = get_proc_task(inode);
- struct nsproxy *nsp;
- struct mnt_namespace *ns = NULL;
- struct path root;
- struct proc_mounts *p;
- int ret = -EINVAL;
-
- if (task) {
- rcu_read_lock();
- nsp = task_nsproxy(task);
- if (nsp) {
- ns = nsp->mnt_ns;
- if (ns)
- get_mnt_ns(ns);
- }
- rcu_read_unlock();
- if (ns && get_task_root(task, &root) == 0)
- ret = 0;
- put_task_struct(task);
- }
-
- if (!ns)
- goto err;
- if (ret)
- goto err_put_ns;
-
- ret = -ENOMEM;
- p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
- if (!p)
- goto err_put_path;
-
- file->private_data = &p->m;
- ret = seq_open(file, op);
- if (ret)
- goto err_free;
-
- p->m.private = p;
- p->ns = ns;
- p->root = root;
- p->m.poll_event = ns->event;
-
- return 0;
-
- err_free:
- kfree(p);
- err_put_path:
- path_put(&root);
- err_put_ns:
- put_mnt_ns(ns);
- err:
- return ret;
+ if (pid->hide_pid < hide_pid_min)
+ return true;
+ if (in_group_p(pid->pid_gid))
+ return true;
+ return ptrace_may_access(task, PTRACE_MODE_READ);
}
-static int mounts_release(struct inode *inode, struct file *file)
-{
- struct proc_mounts *p = file->private_data;
- path_put(&p->root);
- put_mnt_ns(p->ns);
- return seq_release(inode, file);
-}
-static unsigned mounts_poll(struct file *file, poll_table *wait)
+static int proc_pid_permission(struct inode *inode, int mask)
{
- struct proc_mounts *p = file->private_data;
- unsigned res = POLLIN | POLLRDNORM;
-
- poll_wait(file, &p->ns->poll, wait);
- if (mnt_had_events(p))
- res |= POLLERR | POLLPRI;
-
- return res;
-}
+ struct pid_namespace *pid = inode->i_sb->s_fs_info;
+ struct task_struct *task;
+ bool has_perms;
-static int mounts_open(struct inode *inode, struct file *file)
-{
- return mounts_open_common(inode, file, &mounts_op);
-}
+ task = get_proc_task(inode);
+ if (!task)
+ return -ESRCH;
+ has_perms = has_pid_permissions(pid, task, 1);
+ put_task_struct(task);
-static const struct file_operations proc_mounts_operations = {
- .open = mounts_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = mounts_release,
- .poll = mounts_poll,
-};
+ if (!has_perms) {
+ if (pid->hide_pid == 2) {
+ /*
+ * Let's make getdents(), stat(), and open()
+ * consistent with each other. If a process
+ * may not stat() a file, it shouldn't be seen
+ * in procfs at all.
+ */
+ return -ENOENT;
+ }
-static int mountinfo_open(struct inode *inode, struct file *file)
-{
- return mounts_open_common(inode, file, &mountinfo_op);
+ return -EPERM;
+ }
+ return generic_permission(inode, mask);
}
-static const struct file_operations proc_mountinfo_operations = {
- .open = mountinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = mounts_release,
- .poll = mounts_poll,
-};
-static int mountstats_open(struct inode *inode, struct file *file)
-{
- return mounts_open_common(inode, file, &mountstats_op);
-}
-static const struct file_operations proc_mountstats_operations = {
- .open = mountstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = mounts_release,
+static const struct inode_operations proc_def_inode_operations = {
+ .setattr = proc_setattr,
};
#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */
@@ -814,135 +674,103 @@ static const struct file_operations proc_single_file_operations = {
.release = single_release,
};
-static int mem_open(struct inode* inode, struct file* file)
-{
- file->private_data = (void*)((long)current->self_exec_id);
- /* OK to pass negative loff_t, we can catch out-of-range */
- file->f_mode |= FMODE_UNSIGNED_OFFSET;
- return 0;
-}
-
-static ssize_t mem_read(struct file * file, char __user * buf,
- size_t count, loff_t *ppos)
+static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- char *page;
- unsigned long src = *ppos;
- int ret = -ESRCH;
struct mm_struct *mm;
if (!task)
- goto out_no_task;
+ return -ESRCH;
- ret = -ENOMEM;
- page = (char *)__get_free_page(GFP_TEMPORARY);
- if (!page)
- goto out;
+ mm = mm_access(task, mode);
+ put_task_struct(task);
- mm = check_mem_permission(task);
- ret = PTR_ERR(mm);
if (IS_ERR(mm))
- goto out_free;
+ return PTR_ERR(mm);
- ret = -EIO;
-
- if (file->private_data != (void*)((long)current->self_exec_id))
- goto out_put;
-
- ret = 0;
-
- while (count > 0) {
- int this_len, retval;
+ if (mm) {
+ /* ensure this mm_struct can't be freed */
+ atomic_inc(&mm->mm_count);
+ /* but do not pin its memory */
+ mmput(mm);
+ }
- this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- retval = access_remote_vm(mm, src, page, this_len, 0);
- if (!retval) {
- if (!ret)
- ret = -EIO;
- break;
- }
+ /* OK to pass negative loff_t, we can catch out-of-range */
+ file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ file->private_data = mm;
- if (copy_to_user(buf, page, retval)) {
- ret = -EFAULT;
- break;
- }
-
- ret += retval;
- src += retval;
- buf += retval;
- count -= retval;
- }
- *ppos = src;
+ return 0;
+}
-out_put:
- mmput(mm);
-out_free:
- free_page((unsigned long) page);
-out:
- put_task_struct(task);
-out_no_task:
- return ret;
+static int mem_open(struct inode *inode, struct file *file)
+{
+ return __mem_open(inode, file, PTRACE_MODE_ATTACH);
}
-static ssize_t mem_write(struct file * file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t mem_rw(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, int write)
{
- int copied;
+ struct mm_struct *mm = file->private_data;
+ unsigned long addr = *ppos;
+ ssize_t copied;
char *page;
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- unsigned long dst = *ppos;
- struct mm_struct *mm;
- copied = -ESRCH;
- if (!task)
- goto out_no_task;
+ if (!mm)
+ return 0;
- copied = -ENOMEM;
page = (char *)__get_free_page(GFP_TEMPORARY);
if (!page)
- goto out_task;
-
- mm = check_mem_permission(task);
- copied = PTR_ERR(mm);
- if (IS_ERR(mm))
- goto out_free;
-
- copied = -EIO;
- if (file->private_data != (void *)((long)current->self_exec_id))
- goto out_mm;
+ return -ENOMEM;
copied = 0;
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ goto free;
+
while (count > 0) {
- int this_len, retval;
+ int this_len = min_t(int, count, PAGE_SIZE);
- this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- if (copy_from_user(page, buf, this_len)) {
+ if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
break;
}
- retval = access_remote_vm(mm, dst, page, this_len, 1);
- if (!retval) {
+
+ this_len = access_remote_vm(mm, addr, page, this_len, write);
+ if (!this_len) {
if (!copied)
copied = -EIO;
break;
}
- copied += retval;
- buf += retval;
- dst += retval;
- count -= retval;
+
+ if (!write && copy_to_user(buf, page, this_len)) {
+ copied = -EFAULT;
+ break;
+ }
+
+ buf += this_len;
+ addr += this_len;
+ copied += this_len;
+ count -= this_len;
}
- *ppos = dst;
+ *ppos = addr;
-out_mm:
mmput(mm);
-out_free:
+free:
free_page((unsigned long) page);
-out_task:
- put_task_struct(task);
-out_no_task:
return copied;
}
+static ssize_t mem_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, buf, count, ppos, 0);
+}
+
+static ssize_t mem_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, (char __user*)buf, count, ppos, 1);
+}
+
loff_t mem_lseek(struct file *file, loff_t offset, int orig)
{
switch (orig) {
@@ -959,37 +787,45 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
return file->f_pos;
}
+static int mem_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
static const struct file_operations proc_mem_operations = {
.llseek = mem_lseek,
.read = mem_read,
.write = mem_write,
.open = mem_open,
+ .release = mem_release,
};
+static int environ_open(struct inode *inode, struct file *file)
+{
+ return __mem_open(inode, file, PTRACE_MODE_READ);
+}
+
static ssize_t environ_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
char *page;
unsigned long src = *ppos;
- int ret = -ESRCH;
- struct mm_struct *mm;
+ int ret = 0;
+ struct mm_struct *mm = file->private_data;
- if (!task)
- goto out_no_task;
+ if (!mm)
+ return 0;
- ret = -ENOMEM;
page = (char *)__get_free_page(GFP_TEMPORARY);
if (!page)
- goto out;
-
-
- mm = mm_for_maps(task);
- ret = PTR_ERR(mm);
- if (!mm || IS_ERR(mm))
- goto out_free;
+ return -ENOMEM;
ret = 0;
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ goto free;
while (count > 0) {
int this_len, retval, max_len;
@@ -1001,7 +837,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
this_len = (this_len > max_len) ? max_len : this_len;
- retval = access_process_vm(task, (mm->env_start + src),
+ retval = access_remote_vm(mm, (mm->env_start + src),
page, this_len, 0);
if (retval <= 0) {
@@ -1020,19 +856,18 @@ static ssize_t environ_read(struct file *file, char __user *buf,
count -= retval;
}
*ppos = src;
-
mmput(mm);
-out_free:
+
+free:
free_page((unsigned long) page);
-out:
- put_task_struct(task);
-out_no_task:
return ret;
}
static const struct file_operations proc_environ_operations = {
+ .open = environ_open,
.read = environ_read,
.llseek = generic_file_llseek,
+ .release = mem_release,
};
static ssize_t oom_adjust_read(struct file *file, char __user *buf,
@@ -1124,6 +959,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
else
task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
-OOM_DISABLE;
+ trace_oom_score_adj_update(task);
err_sighand:
unlock_task_sighand(task, &flags);
err_task_lock:
@@ -1211,6 +1047,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
task->signal->oom_score_adj = oom_score_adj;
if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
task->signal->oom_score_adj_min = oom_score_adj;
+ trace_oom_score_adj_update(task);
/*
* Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
* always attainable.
@@ -1261,9 +1098,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
ssize_t length;
uid_t loginuid;
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
-
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
@@ -1292,7 +1126,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
goto out_free_page;
}
- length = audit_set_loginuid(current, loginuid);
+ length = audit_set_loginuid(loginuid);
if (likely(length == 0))
length = count;
@@ -1476,8 +1310,7 @@ sched_autogroup_write(struct file *file, const char __user *buf,
if (!p)
return -ESRCH;
- err = nice;
- err = proc_sched_autogroup_set_nice(p, &err);
+ err = proc_sched_autogroup_set_nice(p, nice);
if (err)
count = err;
@@ -1567,13 +1400,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
.release = single_release,
};
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
struct task_struct *task;
struct mm_struct *mm;
struct file *exe_file;
- task = get_proc_task(inode);
+ task = get_proc_task(dentry->d_inode);
if (!task)
return -ENOENT;
mm = get_task_mm(task);
@@ -1603,7 +1436,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
if (!proc_fd_access_allowed(inode))
goto out;
- error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+ error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
out:
return ERR_PTR(error);
}
@@ -1642,7 +1475,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
if (!proc_fd_access_allowed(inode))
goto out;
- error = PROC_I(inode)->op.proc_get_link(inode, &path);
+ error = PROC_I(inode)->op.proc_get_link(dentry, &path);
if (error)
goto out;
@@ -1723,14 +1556,23 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
struct inode *inode = dentry->d_inode;
struct task_struct *task;
const struct cred *cred;
+ struct pid_namespace *pid = dentry->d_sb->s_fs_info;
generic_fillattr(inode, stat);
rcu_read_lock();
- stat->uid = 0;
- stat->gid = 0;
+ stat->uid = GLOBAL_ROOT_UID;
+ stat->gid = GLOBAL_ROOT_GID;
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
+ if (!has_pid_permissions(pid, task, 2)) {
+ rcu_read_unlock();
+ /*
+ * This doesn't prevent learning whether PID exists,
+ * it only makes getattr() consistent with readdir().
+ */
+ return -ENOENT;
+ }
if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
task_dumpable(task)) {
cred = __task_cred(task);
@@ -1780,8 +1622,8 @@ int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
inode->i_gid = cred->egid;
rcu_read_unlock();
} else {
- inode->i_uid = 0;
- inode->i_gid = 0;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
}
inode->i_mode &= ~(S_ISUID | S_ISGID);
security_task_to_inode(task, inode);
@@ -1911,7 +1753,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
fdt = files_fdtable(files);
f_flags = file->f_flags & ~O_CLOEXEC;
- if (FD_ISSET(fd, fdt->close_on_exec))
+ if (close_on_exec(fd, fdt))
f_flags |= O_CLOEXEC;
if (path) {
@@ -1934,9 +1776,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
return -ENOENT;
}
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
{
- return proc_fd_info(inode, path, NULL);
+ return proc_fd_info(dentry->d_inode, path, NULL);
}
static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
@@ -1957,10 +1799,15 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
if (task) {
files = get_files_struct(task);
if (files) {
+ struct file *file;
rcu_read_lock();
- if (fcheck_files(files, fd)) {
+ file = fcheck_files(files, fd);
+ if (file) {
+ unsigned f_mode = file->f_mode;
+
rcu_read_unlock();
put_files_struct(files);
+
if (task_dumpable(task)) {
rcu_read_lock();
cred = __task_cred(task);
@@ -1968,10 +1815,19 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
inode->i_gid = cred->egid;
rcu_read_unlock();
} else {
- inode->i_uid = 0;
- inode->i_gid = 0;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ }
+
+ if (S_ISLNK(inode->i_mode)) {
+ unsigned i_mode = S_IFLNK;
+ if (f_mode & FMODE_READ)
+ i_mode |= S_IRUSR | S_IXUSR;
+ if (f_mode & FMODE_WRITE)
+ i_mode |= S_IWUSR | S_IXUSR;
+ inode->i_mode = i_mode;
}
- inode->i_mode &= ~(S_ISUID | S_ISGID);
+
security_task_to_inode(task, inode);
put_task_struct(task);
return 1;
@@ -1994,9 +1850,7 @@ static const struct dentry_operations tid_fd_dentry_operations =
static struct dentry *proc_fd_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
{
- unsigned fd = *(const unsigned *)ptr;
- struct file *file;
- struct files_struct *files;
+ unsigned fd = (unsigned long)ptr;
struct inode *inode;
struct proc_inode *ei;
struct dentry *error = ERR_PTR(-ENOENT);
@@ -2006,26 +1860,8 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
goto out;
ei = PROC_I(inode);
ei->fd = fd;
- files = get_files_struct(task);
- if (!files)
- goto out_iput;
- inode->i_mode = S_IFLNK;
-
- /*
- * We are not taking a ref to the file structure, so we must
- * hold ->file_lock.
- */
- spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
- if (!file)
- goto out_unlock;
- if (file->f_mode & FMODE_READ)
- inode->i_mode |= S_IRUSR | S_IXUSR;
- if (file->f_mode & FMODE_WRITE)
- inode->i_mode |= S_IWUSR | S_IXUSR;
- spin_unlock(&files->file_lock);
- put_files_struct(files);
+ inode->i_mode = S_IFLNK;
inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
ei->op.proc_get_link = proc_fd_link;
@@ -2037,12 +1873,6 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
out:
return error;
-out_unlock:
- spin_unlock(&files->file_lock);
- put_files_struct(files);
-out_iput:
- iput(inode);
- goto out;
}
static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -2058,7 +1888,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
if (fd == ~0U)
goto out;
- result = instantiate(dir, dentry, task, &fd);
+ result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
out:
put_task_struct(task);
out_no_task:
@@ -2101,21 +1931,22 @@ static int proc_readfd_common(struct file * filp, void * dirent,
fd++, filp->f_pos++) {
char name[PROC_NUMBUF];
int len;
+ int rv;
if (!fcheck_files(files, fd))
continue;
rcu_read_unlock();
len = snprintf(name, sizeof(name), "%d", fd);
- if (proc_fill_cache(filp, dirent, filldir,
- name, len, instantiate,
- p, &fd) < 0) {
- rcu_read_lock();
- break;
- }
+ rv = proc_fill_cache(filp, dirent, filldir,
+ name, len, instantiate, p,
+ (void *)(unsigned long)fd);
+ if (rv < 0)
+ goto out_fd_loop;
rcu_read_lock();
}
rcu_read_unlock();
+out_fd_loop:
put_files_struct(files);
}
out:
@@ -2157,6 +1988,348 @@ static const struct file_operations proc_fd_operations = {
.llseek = default_llseek,
};
+#ifdef CONFIG_CHECKPOINT_RESTORE
+
+/*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+ unsigned long *start, unsigned long *end)
+{
+ if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ unsigned long vm_start, vm_end;
+ bool exact_vma_exists = false;
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ const struct cred *cred;
+ struct inode *inode;
+ int status = 0;
+
+ if (nd && nd->flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ status = -EACCES;
+ goto out_notask;
+ }
+
+ inode = dentry->d_inode;
+ task = get_proc_task(inode);
+ if (!task)
+ goto out_notask;
+
+ mm = mm_access(task, PTRACE_MODE_READ);
+ if (IS_ERR_OR_NULL(mm))
+ goto out;
+
+ if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+ down_read(&mm->mmap_sem);
+ exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+ up_read(&mm->mmap_sem);
+ }
+
+ mmput(mm);
+
+ if (exact_vma_exists) {
+ if (task_dumpable(task)) {
+ rcu_read_lock();
+ cred = __task_cred(task);
+ inode->i_uid = cred->euid;
+ inode->i_gid = cred->egid;
+ rcu_read_unlock();
+ } else {
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ }
+ security_task_to_inode(task, inode);
+ status = 1;
+ }
+
+out:
+ put_task_struct(task);
+
+out_notask:
+ if (status <= 0)
+ d_drop(dentry);
+
+ return status;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+ .d_revalidate = map_files_d_revalidate,
+ .d_delete = pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ int rc;
+
+ rc = -ENOENT;
+ task = get_proc_task(dentry->d_inode);
+ if (!task)
+ goto out;
+
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ if (!mm)
+ goto out;
+
+ rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+ if (rc)
+ goto out_mmput;
+
+ down_read(&mm->mmap_sem);
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (vma && vma->vm_file) {
+ *path = vma->vm_file->f_path;
+ path_get(path);
+ rc = 0;
+ }
+ up_read(&mm->mmap_sem);
+
+out_mmput:
+ mmput(mm);
+out:
+ return rc;
+}
+
+struct map_files_info {
+ struct file *file;
+ unsigned long len;
+ unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+ struct task_struct *task, const void *ptr)
+{
+ const struct file *file = ptr;
+ struct proc_inode *ei;
+ struct inode *inode;
+
+ if (!file)
+ return ERR_PTR(-ENOENT);
+
+ inode = proc_pid_make_inode(dir->i_sb, task);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+
+ ei = PROC_I(inode);
+ ei->op.proc_get_link = proc_map_files_get_link;
+
+ inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_size = 64;
+ inode->i_mode = S_IFLNK;
+
+ if (file->f_mode & FMODE_READ)
+ inode->i_mode |= S_IRUSR;
+ if (file->f_mode & FMODE_WRITE)
+ inode->i_mode |= S_IWUSR;
+
+ d_set_d_op(dentry, &tid_map_files_dentry_operations);
+ d_add(dentry, inode);
+
+ return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+ struct dentry *dentry, struct nameidata *nd)
+{
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct dentry *result;
+ struct mm_struct *mm;
+
+ result = ERR_PTR(-EACCES);
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ result = ERR_PTR(-ENOENT);
+ task = get_proc_task(dir);
+ if (!task)
+ goto out;
+
+ result = ERR_PTR(-EACCES);
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out_put_task;
+
+ result = ERR_PTR(-ENOENT);
+ if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+ goto out_put_task;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_put_task;
+
+ down_read(&mm->mmap_sem);
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (!vma)
+ goto out_no_vma;
+
+ result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+out_put_task:
+ put_task_struct(task);
+out:
+ return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+ .lookup = proc_map_files_lookup,
+ .permission = proc_fd_permission,
+ .setattr = proc_setattr,
+};
+
+static int
+proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ ino_t ino;
+ int ret;
+
+ ret = -EACCES;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ ret = -ENOENT;
+ task = get_proc_task(inode);
+ if (!task)
+ goto out;
+
+ ret = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out_put_task;
+
+ ret = 0;
+ switch (filp->f_pos) {
+ case 0:
+ ino = inode->i_ino;
+ if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+ goto out_put_task;
+ filp->f_pos++;
+ case 1:
+ ino = parent_ino(dentry);
+ if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+ goto out_put_task;
+ filp->f_pos++;
+ default:
+ {
+ unsigned long nr_files, pos, i;
+ struct flex_array *fa = NULL;
+ struct map_files_info info;
+ struct map_files_info *p;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_put_task;
+ down_read(&mm->mmap_sem);
+
+ nr_files = 0;
+
+ /*
+ * We need two passes here:
+ *
+ * 1) Collect vmas of mapped files with mmap_sem taken
+ * 2) Release mmap_sem and instantiate entries
+ *
+ * otherwise we get lockdep complained, since filldir()
+ * routine might require mmap_sem taken in might_fault().
+ */
+
+ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ if (vma->vm_file && ++pos > filp->f_pos)
+ nr_files++;
+ }
+
+ if (nr_files) {
+ fa = flex_array_alloc(sizeof(info), nr_files,
+ GFP_KERNEL);
+ if (!fa || flex_array_prealloc(fa, 0, nr_files,
+ GFP_KERNEL)) {
+ ret = -ENOMEM;
+ if (fa)
+ flex_array_free(fa);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ goto out_put_task;
+ }
+ for (i = 0, vma = mm->mmap, pos = 2; vma;
+ vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= filp->f_pos)
+ continue;
+
+ get_file(vma->vm_file);
+ info.file = vma->vm_file;
+ info.len = snprintf(info.name,
+ sizeof(info.name), "%lx-%lx",
+ vma->vm_start, vma->vm_end);
+ if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+ BUG();
+ }
+ }
+ up_read(&mm->mmap_sem);
+
+ for (i = 0; i < nr_files; i++) {
+ p = flex_array_get(fa, i);
+ ret = proc_fill_cache(filp, dirent, filldir,
+ p->name, p->len,
+ proc_map_files_instantiate,
+ task, p->file);
+ if (ret)
+ break;
+ filp->f_pos++;
+ fput(p->file);
+ }
+ for (; i < nr_files; i++) {
+ /*
+ * In case of error don't forget
+ * to put rest of file refs.
+ */
+ p = flex_array_get(fa, i);
+ fput(p->file);
+ }
+ if (fa)
+ flex_array_free(fa);
+ mmput(mm);
+ }
+ }
+
+out_put_task:
+ put_task_struct(task);
+out:
+ return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+ .read = generic_read_dir,
+ .readdir = proc_map_files_readdir,
+ .llseek = default_llseek,
+};
+
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
/*
* /proc/pid/fd needs a special permission handler so that a process can still
* access /proc/self/fd after it has executed a setuid().
@@ -2183,7 +2356,7 @@ static const struct inode_operations proc_fd_inode_operations = {
static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
{
- unsigned fd = *(unsigned *)ptr;
+ unsigned fd = (unsigned long)ptr;
struct inode *inode;
struct proc_inode *ei;
struct dentry *error = ERR_PTR(-ENOENT);
@@ -2752,6 +2925,74 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
}
#endif /* CONFIG_TASK_IO_ACCOUNTING */
+#ifdef CONFIG_USER_NS
+static int proc_id_map_open(struct inode *inode, struct file *file,
+ struct seq_operations *seq_ops)
+{
+ struct user_namespace *ns = NULL;
+ struct task_struct *task;
+ struct seq_file *seq;
+ int ret = -EINVAL;
+
+ task = get_proc_task(inode);
+ if (task) {
+ rcu_read_lock();
+ ns = get_user_ns(task_cred_xxx(task, user_ns));
+ rcu_read_unlock();
+ put_task_struct(task);
+ }
+ if (!ns)
+ goto err;
+
+ ret = seq_open(file, seq_ops);
+ if (ret)
+ goto err_put_ns;
+
+ seq = file->private_data;
+ seq->private = ns;
+
+ return 0;
+err_put_ns:
+ put_user_ns(ns);
+err:
+ return ret;
+}
+
+static int proc_id_map_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct user_namespace *ns = seq->private;
+ put_user_ns(ns);
+ return seq_release(inode, file);
+}
+
+static int proc_uid_map_open(struct inode *inode, struct file *file)
+{
+ return proc_id_map_open(inode, file, &proc_uid_seq_operations);
+}
+
+static int proc_gid_map_open(struct inode *inode, struct file *file)
+{
+ return proc_id_map_open(inode, file, &proc_gid_seq_operations);
+}
+
+static const struct file_operations proc_uid_map_operations = {
+ .open = proc_uid_map_open,
+ .write = proc_uid_map_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_id_map_release,
+};
+
+static const struct file_operations proc_gid_map_operations = {
+ .open = proc_gid_map_open,
+ .write = proc_gid_map_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_id_map_release,
+};
+#endif /* CONFIG_USER_NS */
+
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -2772,6 +3013,9 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
@@ -2795,9 +3039,9 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
- REG("maps", S_IRUGO, proc_maps_operations),
+ REG("maps", S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
- REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
+ REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
@@ -2808,7 +3052,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
- REG("smaps", S_IRUGO, proc_smaps_operations),
+ REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("pagemap", S_IRUGO, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -2851,6 +3095,10 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_HARDWALL
INF("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
+#ifdef CONFIG_USER_NS
+ REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
+ REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
+#endif
};
static int proc_tgid_base_readdir(struct file * filp,
@@ -2875,6 +3123,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.lookup = proc_tgid_base_lookup,
.getattr = pid_getattr,
.setattr = proc_setattr,
+ .permission = proc_pid_permission,
};
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
@@ -3078,6 +3327,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
proc_pid_instantiate, iter.task, NULL);
}
+static int fake_filldir(void *buf, const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned d_type)
+{
+ return 0;
+}
+
/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
@@ -3085,6 +3340,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
struct task_struct *reaper;
struct tgid_iter iter;
struct pid_namespace *ns;
+ filldir_t __filldir;
if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
goto out_no_task;
@@ -3106,8 +3362,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
for (iter = next_tgid(ns, iter);
iter.task;
iter.tgid += 1, iter = next_tgid(ns, iter)) {
+ if (has_pid_permissions(ns, iter.task, 2))
+ __filldir = filldir;
+ else
+ __filldir = fake_filldir;
+
filp->f_pos = iter.tgid + TGID_OFFSET;
- if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
+ if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
put_task_struct(iter.task);
goto out;
}
@@ -3141,9 +3402,12 @@ static const struct pid_entry tid_base_stuff[] = {
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
- REG("maps", S_IRUGO, proc_maps_operations),
+ REG("maps", S_IRUGO, proc_tid_maps_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ REG("children", S_IRUGO, proc_tid_children_operations),
+#endif
#ifdef CONFIG_NUMA
- REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
+ REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
#endif
REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
LNK("cwd", proc_cwd_link),
@@ -3153,7 +3417,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
- REG("smaps", S_IRUGO, proc_smaps_operations),
+ REG("smaps", S_IRUGO, proc_tid_smaps_operations),
REG("pagemap", S_IRUGO, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -3193,6 +3457,10 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_HARDWALL
INF("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
+#ifdef CONFIG_USER_NS
+ REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
+ REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
+#endif
};
static int proc_tid_base_readdir(struct file * filp,
@@ -3442,6 +3710,7 @@ static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
.setattr = proc_setattr,
+ .permission = proc_pid_permission,
};
static const struct file_operations proc_task_operations = {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 10090d9c7ad..2edf34f2eb6 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -597,7 +597,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
const char *name,
- mode_t mode,
+ umode_t mode,
nlink_t nlink)
{
struct proc_dir_entry *ent = NULL;
@@ -659,7 +659,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
}
EXPORT_SYMBOL(proc_symlink);
-struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
+struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
struct proc_dir_entry *parent)
{
struct proc_dir_entry *ent;
@@ -699,7 +699,7 @@ struct proc_dir_entry *proc_mkdir(const char *name,
}
EXPORT_SYMBOL(proc_mkdir);
-struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
+struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode,
struct proc_dir_entry *parent)
{
struct proc_dir_entry *ent;
@@ -728,7 +728,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
}
EXPORT_SYMBOL(create_proc_entry);
-struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
+struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct file_operations *proc_fops,
void *data)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 7737c5468a4..7ac817b64a7 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -7,6 +7,7 @@
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
+#include <linux/pid_namespace.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
@@ -17,9 +18,10 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/seq_file.h>
#include <linux/slab.h>
+#include <linux/mount.h>
-#include <asm/system.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -31,7 +33,7 @@ static void proc_evict_inode(struct inode *inode)
const struct proc_ns_operations *ns_ops;
truncate_inode_pages(&inode->i_data, 0);
- end_writeback(inode);
+ clear_inode(inode);
/* Stop tracking associated processes */
put_pid(PROC_I(inode)->pid);
@@ -77,7 +79,6 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
static void proc_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
- INIT_LIST_HEAD(&inode->i_dentry);
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}
@@ -102,12 +103,27 @@ void __init proc_init_inodecache(void)
init_once);
}
+static int proc_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct pid_namespace *pid = sb->s_fs_info;
+
+ if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
+ seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
+ if (pid->hide_pid != 0)
+ seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+
+ return 0;
+}
+
static const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.destroy_inode = proc_destroy_inode,
.drop_inode = generic_delete_inode,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
+ .remount_fs = proc_remount,
+ .show_options = proc_show_options,
};
static void __pde_users_dec(struct proc_dir_entry *pde)
@@ -469,8 +485,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
int proc_fill_super(struct super_block *s)
{
- struct inode * root_inode;
-
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
@@ -479,19 +493,11 @@ int proc_fill_super(struct super_block *s)
s->s_time_gran = 1;
pde_get(&proc_root);
- root_inode = proc_get_inode(s, &proc_root);
- if (!root_inode)
- goto out_no_root;
- root_inode->i_uid = 0;
- root_inode->i_gid = 0;
- s->s_root = d_alloc_root(root_inode);
- if (!s->s_root)
- goto out_no_root;
- return 0;
+ s->s_root = d_make_root(proc_get_inode(s, &proc_root));
+ if (s->s_root)
+ return 0;
-out_no_root:
printk("proc_read_super: get root inode failed\n");
- iput(root_inode);
pde_put(&proc_root);
return -ENOMEM;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7838e5cfec1..eca4aca5b6e 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -10,12 +10,15 @@
*/
#include <linux/proc_fs.h>
+struct ctl_table_header;
extern struct proc_dir_entry proc_root;
#ifdef CONFIG_PROC_SYSCTL
extern int proc_sys_init(void);
+extern void sysctl_head_put(struct ctl_table_header *head);
#else
static inline void proc_sys_init(void) { }
+static inline void sysctl_head_put(struct ctl_table_header *head) { }
#endif
#ifdef CONFIG_NET
extern int proc_net_init(void);
@@ -28,8 +31,6 @@ struct vmalloc_info {
unsigned long largest_chunk;
};
-extern struct mm_struct *mm_for_maps(struct task_struct *);
-
#ifdef CONFIG_MMU
#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
extern void get_vmalloc_info(struct vmalloc_info *vmi);
@@ -53,9 +54,13 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
-extern const struct file_operations proc_maps_operations;
-extern const struct file_operations proc_numa_maps_operations;
-extern const struct file_operations proc_smaps_operations;
+extern const struct file_operations proc_tid_children_operations;
+extern const struct file_operations proc_pid_maps_operations;
+extern const struct file_operations proc_tid_maps_operations;
+extern const struct file_operations proc_pid_numa_maps_operations;
+extern const struct file_operations proc_tid_numa_maps_operations;
+extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_tid_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
extern const struct file_operations proc_net_operations;
@@ -117,6 +122,7 @@ void pde_put(struct proc_dir_entry *pde);
int proc_fill_super(struct super_block *);
struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+int proc_remount(struct super_block *sb, int *flags, char *data);
/*
* These are generic /proc routines that use the internal
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index d245cb23dd7..86c67eee439 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -157,7 +157,8 @@ static int kcore_update_ram(void)
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/* calculate vmemmap's address from given system ram pfn and register it */
-int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+static int
+get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
{
unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
unsigned long nr_pages = ent->size >> PAGE_SHIFT;
@@ -189,7 +190,8 @@ int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
}
#else
-int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
+static int
+get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
{
return 1;
}
@@ -513,7 +515,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
n = copy_to_user(buffer, (char *)start, tsz);
/*
- * We cannot distingush between fault on source
+ * We cannot distinguish between fault on source
* and fault on destination. When this happens
* we clear too and hope it will trigger the
* EFAULT again.
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index be177f702ac..0d9e23a39e4 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -9,7 +9,6 @@
#include <linux/file.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>
-#include <linux/mnt_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/pid_namespace.h>
#include "internal.h"
@@ -54,7 +53,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
ei->ns_ops = ns_ops;
ei->ns = ns;
- dentry->d_op = &pid_dentry_operations;
+ d_set_d_op(dentry, &pid_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
if (pid_revalidate(dentry, NULL))
@@ -157,15 +156,15 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out;
- last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
- for (entry = ns_entries; entry <= last; entry++) {
+ last = &ns_entries[ARRAY_SIZE(ns_entries)];
+ for (entry = ns_entries; entry < last; entry++) {
if (strlen((*entry)->name) != len)
continue;
if (!memcmp(dentry->d_name.name, (*entry)->name, len))
break;
}
error = ERR_PTR(-ENOENT);
- if (entry > last)
+ if (entry == last)
goto out;
error = proc_ns_instantiate(dir, dentry, task, *entry);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 6d8e6a9e93a..7fcd0d60a96 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,6 +115,8 @@ u64 stable_page_flags(struct page *page)
u |= 1 << KPF_COMPOUND_TAIL;
if (PageHuge(page))
u |= 1 << KPF_HUGE;
+ else if (PageTransCompound(page))
+ u |= 1 << KPF_THP;
/*
* Caveats on high order pages: page->_count will only be set
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index f738024ccc8..06e1cc17caf 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -179,7 +179,7 @@ const struct file_operations proc_net_operations = {
struct proc_dir_entry *proc_net_fops_create(struct net *net,
- const char *name, mode_t mode, const struct file_operations *fops)
+ const char *name, umode_t mode, const struct file_operations *fops)
{
return proc_create(name, mode, net->proc_net, fops);
}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index a6b62173d4c..3476bca8f7a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -6,7 +6,10 @@
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
+#include <linux/sched.h>
#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/module.h>
#include "internal.h"
static const struct dentry_operations proc_sys_dentry_operations;
@@ -24,6 +27,371 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll)
wake_up_interruptible(&poll->wait);
}
+static struct ctl_table root_table[] = {
+ {
+ .procname = "",
+ .mode = S_IFDIR|S_IRUGO|S_IXUGO,
+ },
+ { }
+};
+static struct ctl_table_root sysctl_table_root = {
+ .default_set.dir.header = {
+ {{.count = 1,
+ .nreg = 1,
+ .ctl_table = root_table }},
+ .ctl_table_arg = root_table,
+ .root = &sysctl_table_root,
+ .set = &sysctl_table_root.default_set,
+ },
+};
+
+static DEFINE_SPINLOCK(sysctl_lock);
+
+static void drop_sysctl_table(struct ctl_table_header *header);
+static int sysctl_follow_link(struct ctl_table_header **phead,
+ struct ctl_table **pentry, struct nsproxy *namespaces);
+static int insert_links(struct ctl_table_header *head);
+static void put_links(struct ctl_table_header *header);
+
+static void sysctl_print_dir(struct ctl_dir *dir)
+{
+ if (dir->header.parent)
+ sysctl_print_dir(dir->header.parent);
+ printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname);
+}
+
+static int namecmp(const char *name1, int len1, const char *name2, int len2)
+{
+ int minlen;
+ int cmp;
+
+ minlen = len1;
+ if (minlen > len2)
+ minlen = len2;
+
+ cmp = memcmp(name1, name2, minlen);
+ if (cmp == 0)
+ cmp = len1 - len2;
+ return cmp;
+}
+
+/* Called under sysctl_lock */
+static struct ctl_table *find_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir, const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+ struct rb_node *node = dir->root.rb_node;
+
+ while (node)
+ {
+ struct ctl_node *ctl_node;
+ const char *procname;
+ int cmp;
+
+ ctl_node = rb_entry(node, struct ctl_node, node);
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ procname = entry->procname;
+
+ cmp = namecmp(name, namelen, procname, strlen(procname));
+ if (cmp < 0)
+ node = node->rb_left;
+ else if (cmp > 0)
+ node = node->rb_right;
+ else {
+ *phead = head;
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
+{
+ struct rb_node *node = &head->node[entry - head->ctl_table].node;
+ struct rb_node **p = &head->parent->root.rb_node;
+ struct rb_node *parent = NULL;
+ const char *name = entry->procname;
+ int namelen = strlen(name);
+
+ while (*p) {
+ struct ctl_table_header *parent_head;
+ struct ctl_table *parent_entry;
+ struct ctl_node *parent_node;
+ const char *parent_name;
+ int cmp;
+
+ parent = *p;
+ parent_node = rb_entry(parent, struct ctl_node, node);
+ parent_head = parent_node->header;
+ parent_entry = &parent_head->ctl_table[parent_node - parent_head->node];
+ parent_name = parent_entry->procname;
+
+ cmp = namecmp(name, namelen, parent_name, strlen(parent_name));
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else {
+ printk(KERN_ERR "sysctl duplicate entry: ");
+ sysctl_print_dir(head->parent);
+ printk(KERN_CONT "/%s\n", entry->procname);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(node, parent, p);
+ return 0;
+}
+
+static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
+{
+ struct rb_node *node = &head->node[entry - head->ctl_table].node;
+
+ rb_erase(node, &head->parent->root);
+}
+
+static void init_header(struct ctl_table_header *head,
+ struct ctl_table_root *root, struct ctl_table_set *set,
+ struct ctl_node *node, struct ctl_table *table)
+{
+ head->ctl_table = table;
+ head->ctl_table_arg = table;
+ head->used = 0;
+ head->count = 1;
+ head->nreg = 1;
+ head->unregistering = NULL;
+ head->root = root;
+ head->set = set;
+ head->parent = NULL;
+ head->node = node;
+ if (node) {
+ struct ctl_table *entry;
+ for (entry = table; entry->procname; entry++, node++) {
+ rb_init_node(&node->node);
+ node->header = head;
+ }
+ }
+}
+
+static void erase_header(struct ctl_table_header *head)
+{
+ struct ctl_table *entry;
+ for (entry = head->ctl_table; entry->procname; entry++)
+ erase_entry(head, entry);
+}
+
+static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
+{
+ struct ctl_table *entry;
+ int err;
+
+ dir->header.nreg++;
+ header->parent = dir;
+ err = insert_links(header);
+ if (err)
+ goto fail_links;
+ for (entry = header->ctl_table; entry->procname; entry++) {
+ err = insert_entry(header, entry);
+ if (err)
+ goto fail;
+ }
+ return 0;
+fail:
+ erase_header(header);
+ put_links(header);
+fail_links:
+ header->parent = NULL;
+ drop_sysctl_table(&dir->header);
+ return err;
+}
+
+/* called under sysctl_lock */
+static int use_table(struct ctl_table_header *p)
+{
+ if (unlikely(p->unregistering))
+ return 0;
+ p->used++;
+ return 1;
+}
+
+/* called under sysctl_lock */
+static void unuse_table(struct ctl_table_header *p)
+{
+ if (!--p->used)
+ if (unlikely(p->unregistering))
+ complete(p->unregistering);
+}
+
+/* called under sysctl_lock, will reacquire if has to wait */
+static void start_unregistering(struct ctl_table_header *p)
+{
+ /*
+ * if p->used is 0, nobody will ever touch that entry again;
+ * we'll eliminate all paths to it before dropping sysctl_lock
+ */
+ if (unlikely(p->used)) {
+ struct completion wait;
+ init_completion(&wait);
+ p->unregistering = &wait;
+ spin_unlock(&sysctl_lock);
+ wait_for_completion(&wait);
+ spin_lock(&sysctl_lock);
+ } else {
+ /* anything non-NULL; we'll never dereference it */
+ p->unregistering = ERR_PTR(-EINVAL);
+ }
+ /*
+ * do not remove from the list until nobody holds it; walking the
+ * list in do_sysctl() relies on that.
+ */
+ erase_header(p);
+}
+
+static void sysctl_head_get(struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ head->count++;
+ spin_unlock(&sysctl_lock);
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ if (!--head->count)
+ kfree_rcu(head, rcu);
+ spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+ if (!head)
+ BUG();
+ spin_lock(&sysctl_lock);
+ if (!use_table(head))
+ head = ERR_PTR(-ENOENT);
+ spin_unlock(&sysctl_lock);
+ return head;
+}
+
+static void sysctl_head_finish(struct ctl_table_header *head)
+{
+ if (!head)
+ return;
+ spin_lock(&sysctl_lock);
+ unuse_table(head);
+ spin_unlock(&sysctl_lock);
+}
+
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+ struct ctl_table_set *set = &root->default_set;
+ if (root->lookup)
+ set = root->lookup(root, namespaces);
+ return set;
+}
+
+static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+
+ spin_lock(&sysctl_lock);
+ entry = find_entry(&head, dir, name, namelen);
+ if (entry && use_table(head))
+ *phead = head;
+ else
+ entry = NULL;
+ spin_unlock(&sysctl_lock);
+ return entry;
+}
+
+static struct ctl_node *first_usable_entry(struct rb_node *node)
+{
+ struct ctl_node *ctl_node;
+
+ for (;node; node = rb_next(node)) {
+ ctl_node = rb_entry(node, struct ctl_node, node);
+ if (use_table(ctl_node->header))
+ return ctl_node;
+ }
+ return NULL;
+}
+
+static void first_entry(struct ctl_dir *dir,
+ struct ctl_table_header **phead, struct ctl_table **pentry)
+{
+ struct ctl_table_header *head = NULL;
+ struct ctl_table *entry = NULL;
+ struct ctl_node *ctl_node;
+
+ spin_lock(&sysctl_lock);
+ ctl_node = first_usable_entry(rb_first(&dir->root));
+ spin_unlock(&sysctl_lock);
+ if (ctl_node) {
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ }
+ *phead = head;
+ *pentry = entry;
+}
+
+static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
+{
+ struct ctl_table_header *head = *phead;
+ struct ctl_table *entry = *pentry;
+ struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
+
+ spin_lock(&sysctl_lock);
+ unuse_table(head);
+
+ ctl_node = first_usable_entry(rb_next(&ctl_node->node));
+ spin_unlock(&sysctl_lock);
+ head = NULL;
+ if (ctl_node) {
+ head = ctl_node->header;
+ entry = &head->ctl_table[ctl_node - head->node];
+ }
+ *phead = head;
+ *pentry = entry;
+}
+
+void register_sysctl_root(struct ctl_table_root *root)
+{
+}
+
+/*
+ * sysctl_perm does NOT grant the superuser all rights automatically, because
+ * some sysctl variables are readonly even to root.
+ */
+
+static int test_perm(int mode, int op)
+{
+ if (uid_eq(current_euid(), GLOBAL_ROOT_UID))
+ mode >>= 6;
+ else if (in_egroup_p(GLOBAL_ROOT_GID))
+ mode >>= 3;
+ if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
+ return 0;
+ return -EACCES;
+}
+
+static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
+{
+ int mode;
+
+ if (root->permissions)
+ mode = root->permissions(root, current->nsproxy, table);
+ else
+ mode = table->mode;
+
+ return test_perm(mode, op);
+}
+
static struct inode *proc_sys_make_inode(struct super_block *sb,
struct ctl_table_header *head, struct ctl_table *table)
{
@@ -43,13 +411,12 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_mode = table->mode;
- if (!table->child) {
+ if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
inode->i_op = &proc_sys_inode_operations;
inode->i_fop = &proc_sys_file_operations;
} else {
inode->i_mode |= S_IFDIR;
- clear_nlink(inode);
inode->i_op = &proc_sys_dir_operations;
inode->i_fop = &proc_sys_dir_file_operations;
}
@@ -57,70 +424,42 @@ out:
return inode;
}
-static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
-{
- int len;
- for ( ; p->procname; p++) {
-
- if (!p->procname)
- continue;
-
- len = strlen(p->procname);
- if (len != name->len)
- continue;
-
- if (memcmp(p->procname, name->name, len) != 0)
- continue;
-
- /* I have a match */
- return p;
- }
- return NULL;
-}
-
static struct ctl_table_header *grab_header(struct inode *inode)
{
- if (PROC_I(inode)->sysctl)
- return sysctl_head_grab(PROC_I(inode)->sysctl);
- else
- return sysctl_head_next(NULL);
+ struct ctl_table_header *head = PROC_I(inode)->sysctl;
+ if (!head)
+ head = &sysctl_table_root.default_set.dir.header;
+ return sysctl_head_grab(head);
}
static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
struct ctl_table_header *head = grab_header(dir);
- struct ctl_table *table = PROC_I(dir)->sysctl_entry;
struct ctl_table_header *h = NULL;
struct qstr *name = &dentry->d_name;
struct ctl_table *p;
struct inode *inode;
struct dentry *err = ERR_PTR(-ENOENT);
+ struct ctl_dir *ctl_dir;
+ int ret;
if (IS_ERR(head))
return ERR_CAST(head);
- if (table && !table->child) {
- WARN_ON(1);
- goto out;
- }
-
- table = table ? table->child : head->ctl_table;
-
- p = find_in_table(table, name);
- if (!p) {
- for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
- if (h->attached_to != table)
- continue;
- p = find_in_table(h->attached_by, name);
- if (p)
- break;
- }
- }
+ ctl_dir = container_of(head, struct ctl_dir, header);
+ p = lookup_entry(&h, ctl_dir, name->name, name->len);
if (!p)
goto out;
+ if (S_ISLNK(p->mode)) {
+ ret = sysctl_follow_link(&h, &p, current->nsproxy);
+ err = ERR_PTR(ret);
+ if (ret)
+ goto out;
+ }
+
err = ERR_PTR(-ENOMEM);
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
if (h)
@@ -188,20 +527,32 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
static int proc_sys_open(struct inode *inode, struct file *filp)
{
+ struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ /* sysctl was unregistered */
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
if (table->poll)
filp->private_data = proc_sys_poll_event(table->poll);
+ sysctl_head_finish(head);
+
return 0;
}
static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
{
struct inode *inode = filp->f_path.dentry->d_inode;
+ struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
- unsigned long event = (unsigned long)filp->private_data;
unsigned int ret = DEFAULT_POLLMASK;
+ unsigned long event;
+
+ /* sysctl was unregistered */
+ if (IS_ERR(head))
+ return POLLERR | POLLHUP;
if (!table->proc_handler)
goto out;
@@ -209,6 +560,7 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
if (!table->poll)
goto out;
+ event = (unsigned long)filp->private_data;
poll_wait(filp, &table->poll->wait, wait);
if (event != atomic_read(&table->poll->event)) {
@@ -217,6 +569,8 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
}
out:
+ sysctl_head_finish(head);
+
return ret;
}
@@ -258,28 +612,45 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
}
+static int proc_sys_link_fill_cache(struct file *filp, void *dirent,
+ filldir_t filldir,
+ struct ctl_table_header *head,
+ struct ctl_table *table)
+{
+ int err, ret = 0;
+ head = sysctl_head_grab(head);
+
+ if (S_ISLNK(table->mode)) {
+ /* It is not an error if we can not follow the link ignore it */
+ err = sysctl_follow_link(&head, &table, current->nsproxy);
+ if (err)
+ goto out;
+ }
+
+ ret = proc_sys_fill_cache(filp, dirent, filldir, head, table);
+out:
+ sysctl_head_finish(head);
+ return ret;
+}
+
static int scan(struct ctl_table_header *head, ctl_table *table,
unsigned long *pos, struct file *file,
void *dirent, filldir_t filldir)
{
+ int res;
- for (; table->procname; table++, (*pos)++) {
- int res;
-
- /* Can't do anything without a proc name */
- if (!table->procname)
- continue;
-
- if (*pos < file->f_pos)
- continue;
+ if ((*pos)++ < file->f_pos)
+ return 0;
+ if (unlikely(S_ISLNK(table->mode)))
+ res = proc_sys_link_fill_cache(file, dirent, filldir, head, table);
+ else
res = proc_sys_fill_cache(file, dirent, filldir, head, table);
- if (res)
- return res;
- file->f_pos = *pos + 1;
- }
- return 0;
+ if (res == 0)
+ file->f_pos = *pos;
+
+ return res;
}
static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
@@ -287,20 +658,16 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
struct ctl_table_header *h = NULL;
+ struct ctl_table *entry;
+ struct ctl_dir *ctl_dir;
unsigned long pos;
int ret = -EINVAL;
if (IS_ERR(head))
return PTR_ERR(head);
- if (table && !table->child) {
- WARN_ON(1);
- goto out;
- }
-
- table = table ? table->child : head->ctl_table;
+ ctl_dir = container_of(head, struct ctl_dir, header);
ret = 0;
/* Avoid a switch here: arm builds fail with missing __cmpdi2 */
@@ -318,14 +685,8 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
pos = 2;
- ret = scan(head, table, &pos, filp, dirent, filldir);
- if (ret)
- goto out;
-
- for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
- if (h->attached_to != table)
- continue;
- ret = scan(h, h->attached_by, &pos, filp, dirent, filldir);
+ for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
+ ret = scan(h, entry, &pos, filp, dirent, filldir);
if (ret) {
sysctl_head_finish(h);
break;
@@ -445,6 +806,21 @@ static int proc_sys_delete(const struct dentry *dentry)
return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
}
+static int sysctl_is_seen(struct ctl_table_header *p)
+{
+ struct ctl_table_set *set = p->set;
+ int res;
+ spin_lock(&sysctl_lock);
+ if (p->unregistering)
+ res = 0;
+ else if (!set->is_seen)
+ res = 1;
+ else
+ res = set->is_seen(set);
+ spin_unlock(&sysctl_lock);
+ return res;
+}
+
static int proc_sys_compare(const struct dentry *parent,
const struct inode *pinode,
const struct dentry *dentry, const struct inode *inode,
@@ -470,6 +846,753 @@ static const struct dentry_operations proc_sys_dentry_operations = {
.d_compare = proc_sys_compare,
};
+static struct ctl_dir *find_subdir(struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry;
+
+ entry = find_entry(&head, dir, name, namelen);
+ if (!entry)
+ return ERR_PTR(-ENOENT);
+ if (!S_ISDIR(entry->mode))
+ return ERR_PTR(-ENOTDIR);
+ return container_of(head, struct ctl_dir, header);
+}
+
+static struct ctl_dir *new_dir(struct ctl_table_set *set,
+ const char *name, int namelen)
+{
+ struct ctl_table *table;
+ struct ctl_dir *new;
+ struct ctl_node *node;
+ char *new_name;
+
+ new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
+ sizeof(struct ctl_table)*2 + namelen + 1,
+ GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ node = (struct ctl_node *)(new + 1);
+ table = (struct ctl_table *)(node + 1);
+ new_name = (char *)(table + 2);
+ memcpy(new_name, name, namelen);
+ new_name[namelen] = '\0';
+ table[0].procname = new_name;
+ table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
+ init_header(&new->header, set->dir.header.root, set, node, table);
+
+ return new;
+}
+
+/**
+ * get_subdir - find or create a subdir with the specified name.
+ * @dir: Directory to create the subdirectory in
+ * @name: The name of the subdirectory to find or create
+ * @namelen: The length of name
+ *
+ * Takes a directory with an elevated reference count so we know that
+ * if we drop the lock the directory will not go away. Upon success
+ * the reference is moved from @dir to the returned subdirectory.
+ * Upon error an error code is returned and the reference on @dir is
+ * simply dropped.
+ */
+static struct ctl_dir *get_subdir(struct ctl_dir *dir,
+ const char *name, int namelen)
+{
+ struct ctl_table_set *set = dir->header.set;
+ struct ctl_dir *subdir, *new = NULL;
+ int err;
+
+ spin_lock(&sysctl_lock);
+ subdir = find_subdir(dir, name, namelen);
+ if (!IS_ERR(subdir))
+ goto found;
+ if (PTR_ERR(subdir) != -ENOENT)
+ goto failed;
+
+ spin_unlock(&sysctl_lock);
+ new = new_dir(set, name, namelen);
+ spin_lock(&sysctl_lock);
+ subdir = ERR_PTR(-ENOMEM);
+ if (!new)
+ goto failed;
+
+ /* Was the subdir added while we dropped the lock? */
+ subdir = find_subdir(dir, name, namelen);
+ if (!IS_ERR(subdir))
+ goto found;
+ if (PTR_ERR(subdir) != -ENOENT)
+ goto failed;
+
+ /* Nope. Use the our freshly made directory entry. */
+ err = insert_header(dir, &new->header);
+ subdir = ERR_PTR(err);
+ if (err)
+ goto failed;
+ subdir = new;
+found:
+ subdir->header.nreg++;
+failed:
+ if (unlikely(IS_ERR(subdir))) {
+ printk(KERN_ERR "sysctl could not get directory: ");
+ sysctl_print_dir(dir);
+ printk(KERN_CONT "/%*.*s %ld\n",
+ namelen, namelen, name, PTR_ERR(subdir));
+ }
+ drop_sysctl_table(&dir->header);
+ if (new)
+ drop_sysctl_table(&new->header);
+ spin_unlock(&sysctl_lock);
+ return subdir;
+}
+
+static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
+{
+ struct ctl_dir *parent;
+ const char *procname;
+ if (!dir->header.parent)
+ return &set->dir;
+ parent = xlate_dir(set, dir->header.parent);
+ if (IS_ERR(parent))
+ return parent;
+ procname = dir->header.ctl_table[0].procname;
+ return find_subdir(parent, procname, strlen(procname));
+}
+
+static int sysctl_follow_link(struct ctl_table_header **phead,
+ struct ctl_table **pentry, struct nsproxy *namespaces)
+{
+ struct ctl_table_header *head;
+ struct ctl_table_root *root;
+ struct ctl_table_set *set;
+ struct ctl_table *entry;
+ struct ctl_dir *dir;
+ int ret;
+
+ ret = 0;
+ spin_lock(&sysctl_lock);
+ root = (*pentry)->data;
+ set = lookup_header_set(root, namespaces);
+ dir = xlate_dir(set, (*phead)->parent);
+ if (IS_ERR(dir))
+ ret = PTR_ERR(dir);
+ else {
+ const char *procname = (*pentry)->procname;
+ head = NULL;
+ entry = find_entry(&head, dir, procname, strlen(procname));
+ ret = -ENOENT;
+ if (entry && use_table(head)) {
+ unuse_table(*phead);
+ *phead = head;
+ *pentry = entry;
+ ret = 0;
+ }
+ }
+
+ spin_unlock(&sysctl_lock);
+ return ret;
+}
+
+static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n",
+ path, table->procname, &vaf);
+
+ va_end(args);
+ return -EINVAL;
+}
+
+static int sysctl_check_table(const char *path, struct ctl_table *table)
+{
+ int err = 0;
+ for (; table->procname; table++) {
+ if (table->child)
+ err = sysctl_err(path, table, "Not a file");
+
+ if ((table->proc_handler == proc_dostring) ||
+ (table->proc_handler == proc_dointvec) ||
+ (table->proc_handler == proc_dointvec_minmax) ||
+ (table->proc_handler == proc_dointvec_jiffies) ||
+ (table->proc_handler == proc_dointvec_userhz_jiffies) ||
+ (table->proc_handler == proc_dointvec_ms_jiffies) ||
+ (table->proc_handler == proc_doulongvec_minmax) ||
+ (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
+ if (!table->data)
+ err = sysctl_err(path, table, "No data");
+ if (!table->maxlen)
+ err = sysctl_err(path, table, "No maxlen");
+ }
+ if (!table->proc_handler)
+ err = sysctl_err(path, table, "No proc_handler");
+
+ if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
+ err = sysctl_err(path, table, "bogus .mode 0%o",
+ table->mode);
+ }
+ return err;
+}
+
+static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
+ struct ctl_table_root *link_root)
+{
+ struct ctl_table *link_table, *entry, *link;
+ struct ctl_table_header *links;
+ struct ctl_node *node;
+ char *link_name;
+ int nr_entries, name_bytes;
+
+ name_bytes = 0;
+ nr_entries = 0;
+ for (entry = table; entry->procname; entry++) {
+ nr_entries++;
+ name_bytes += strlen(entry->procname) + 1;
+ }
+
+ links = kzalloc(sizeof(struct ctl_table_header) +
+ sizeof(struct ctl_node)*nr_entries +
+ sizeof(struct ctl_table)*(nr_entries + 1) +
+ name_bytes,
+ GFP_KERNEL);
+
+ if (!links)
+ return NULL;
+
+ node = (struct ctl_node *)(links + 1);
+ link_table = (struct ctl_table *)(node + nr_entries);
+ link_name = (char *)&link_table[nr_entries + 1];
+
+ for (link = link_table, entry = table; entry->procname; link++, entry++) {
+ int len = strlen(entry->procname) + 1;
+ memcpy(link_name, entry->procname, len);
+ link->procname = link_name;
+ link->mode = S_IFLNK|S_IRWXUGO;
+ link->data = link_root;
+ link_name += len;
+ }
+ init_header(links, dir->header.root, dir->header.set, node, link_table);
+ links->nreg = nr_entries;
+
+ return links;
+}
+
+static bool get_links(struct ctl_dir *dir,
+ struct ctl_table *table, struct ctl_table_root *link_root)
+{
+ struct ctl_table_header *head;
+ struct ctl_table *entry, *link;
+
+ /* Are there links available for every entry in table? */
+ for (entry = table; entry->procname; entry++) {
+ const char *procname = entry->procname;
+ link = find_entry(&head, dir, procname, strlen(procname));
+ if (!link)
+ return false;
+ if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
+ continue;
+ if (S_ISLNK(link->mode) && (link->data == link_root))
+ continue;
+ return false;
+ }
+
+ /* The checks passed. Increase the registration count on the links */
+ for (entry = table; entry->procname; entry++) {
+ const char *procname = entry->procname;
+ link = find_entry(&head, dir, procname, strlen(procname));
+ head->nreg++;
+ }
+ return true;
+}
+
+static int insert_links(struct ctl_table_header *head)
+{
+ struct ctl_table_set *root_set = &sysctl_table_root.default_set;
+ struct ctl_dir *core_parent = NULL;
+ struct ctl_table_header *links;
+ int err;
+
+ if (head->set == root_set)
+ return 0;
+
+ core_parent = xlate_dir(root_set, head->parent);
+ if (IS_ERR(core_parent))
+ return 0;
+
+ if (get_links(core_parent, head->ctl_table, head->root))
+ return 0;
+
+ core_parent->header.nreg++;
+ spin_unlock(&sysctl_lock);
+
+ links = new_links(core_parent, head->ctl_table, head->root);
+
+ spin_lock(&sysctl_lock);
+ err = -ENOMEM;
+ if (!links)
+ goto out;
+
+ err = 0;
+ if (get_links(core_parent, head->ctl_table, head->root)) {
+ kfree(links);
+ goto out;
+ }
+
+ err = insert_header(core_parent, links);
+ if (err)
+ kfree(links);
+out:
+ drop_sysctl_table(&core_parent->header);
+ return err;
+}
+
+/**
+ * __register_sysctl_table - register a leaf sysctl table
+ * @set: Sysctl tree to register on
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * The members of the &struct ctl_table structure are used as follows:
+ *
+ * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
+ * enter a sysctl file
+ *
+ * data - a pointer to data for use by proc_handler
+ *
+ * maxlen - the maximum size in bytes of the data
+ *
+ * mode - the file permissions for the /proc/sys file
+ *
+ * child - must be %NULL.
+ *
+ * proc_handler - the text handler routine (described below)
+ *
+ * extra1, extra2 - extra pointers usable by the proc handler routines
+ *
+ * Leaf nodes in the sysctl tree will be represented by a single file
+ * under /proc; non-leaf nodes will be represented by directories.
+ *
+ * There must be a proc_handler routine for any terminal nodes.
+ * Several default handlers are available to cover common cases -
+ *
+ * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
+ * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
+ * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
+ *
+ * It is the handler's job to read the input buffer from user memory
+ * and process it. The handler should return 0 on success.
+ *
+ * This routine returns %NULL on a failure to register, and a pointer
+ * to the table header on success.
+ */
+struct ctl_table_header *__register_sysctl_table(
+ struct ctl_table_set *set,
+ const char *path, struct ctl_table *table)
+{
+ struct ctl_table_root *root = set->dir.header.root;
+ struct ctl_table_header *header;
+ const char *name, *nextname;
+ struct ctl_dir *dir;
+ struct ctl_table *entry;
+ struct ctl_node *node;
+ int nr_entries = 0;
+
+ for (entry = table; entry->procname; entry++)
+ nr_entries++;
+
+ header = kzalloc(sizeof(struct ctl_table_header) +
+ sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
+ if (!header)
+ return NULL;
+
+ node = (struct ctl_node *)(header + 1);
+ init_header(header, root, set, node, table);
+ if (sysctl_check_table(path, table))
+ goto fail;
+
+ spin_lock(&sysctl_lock);
+ dir = &set->dir;
+ /* Reference moved down the diretory tree get_subdir */
+ dir->header.nreg++;
+ spin_unlock(&sysctl_lock);
+
+ /* Find the directory for the ctl_table */
+ for (name = path; name; name = nextname) {
+ int namelen;
+ nextname = strchr(name, '/');
+ if (nextname) {
+ namelen = nextname - name;
+ nextname++;
+ } else {
+ namelen = strlen(name);
+ }
+ if (namelen == 0)
+ continue;
+
+ dir = get_subdir(dir, name, namelen);
+ if (IS_ERR(dir))
+ goto fail;
+ }
+
+ spin_lock(&sysctl_lock);
+ if (insert_header(dir, header))
+ goto fail_put_dir_locked;
+
+ drop_sysctl_table(&dir->header);
+ spin_unlock(&sysctl_lock);
+
+ return header;
+
+fail_put_dir_locked:
+ drop_sysctl_table(&dir->header);
+ spin_unlock(&sysctl_lock);
+fail:
+ kfree(header);
+ dump_stack();
+ return NULL;
+}
+
+/**
+ * register_sysctl - register a sysctl table
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the table structure
+ *
+ * Register a sysctl table. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_table for more details.
+ */
+struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
+{
+ return __register_sysctl_table(&sysctl_table_root.default_set,
+ path, table);
+}
+EXPORT_SYMBOL(register_sysctl);
+
+static char *append_path(const char *path, char *pos, const char *name)
+{
+ int namelen;
+ namelen = strlen(name);
+ if (((pos - path) + namelen + 2) >= PATH_MAX)
+ return NULL;
+ memcpy(pos, name, namelen);
+ pos[namelen] = '/';
+ pos[namelen + 1] = '\0';
+ pos += namelen + 1;
+ return pos;
+}
+
+static int count_subheaders(struct ctl_table *table)
+{
+ int has_files = 0;
+ int nr_subheaders = 0;
+ struct ctl_table *entry;
+
+ /* special case: no directory and empty directory */
+ if (!table || !table->procname)
+ return 1;
+
+ for (entry = table; entry->procname; entry++) {
+ if (entry->child)
+ nr_subheaders += count_subheaders(entry->child);
+ else
+ has_files = 1;
+ }
+ return nr_subheaders + has_files;
+}
+
+static int register_leaf_sysctl_tables(const char *path, char *pos,
+ struct ctl_table_header ***subheader, struct ctl_table_set *set,
+ struct ctl_table *table)
+{
+ struct ctl_table *ctl_table_arg = NULL;
+ struct ctl_table *entry, *files;
+ int nr_files = 0;
+ int nr_dirs = 0;
+ int err = -ENOMEM;
+
+ for (entry = table; entry->procname; entry++) {
+ if (entry->child)
+ nr_dirs++;
+ else
+ nr_files++;
+ }
+
+ files = table;
+ /* If there are mixed files and directories we need a new table */
+ if (nr_dirs && nr_files) {
+ struct ctl_table *new;
+ files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1),
+ GFP_KERNEL);
+ if (!files)
+ goto out;
+
+ ctl_table_arg = files;
+ for (new = files, entry = table; entry->procname; entry++) {
+ if (entry->child)
+ continue;
+ *new = *entry;
+ new++;
+ }
+ }
+
+ /* Register everything except a directory full of subdirectories */
+ if (nr_files || !nr_dirs) {
+ struct ctl_table_header *header;
+ header = __register_sysctl_table(set, path, files);
+ if (!header) {
+ kfree(ctl_table_arg);
+ goto out;
+ }
+
+ /* Remember if we need to free the file table */
+ header->ctl_table_arg = ctl_table_arg;
+ **subheader = header;
+ (*subheader)++;
+ }
+
+ /* Recurse into the subdirectories. */
+ for (entry = table; entry->procname; entry++) {
+ char *child_pos;
+
+ if (!entry->child)
+ continue;
+
+ err = -ENAMETOOLONG;
+ child_pos = append_path(path, pos, entry->procname);
+ if (!child_pos)
+ goto out;
+
+ err = register_leaf_sysctl_tables(path, child_pos, subheader,
+ set, entry->child);
+ pos[0] = '\0';
+ if (err)
+ goto out;
+ }
+ err = 0;
+out:
+ /* On failure our caller will unregister all registered subheaders */
+ return err;
+}
+
+/**
+ * __register_sysctl_paths - register a sysctl table hierarchy
+ * @set: Sysctl tree to register on
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_table for more details.
+ */
+struct ctl_table_header *__register_sysctl_paths(
+ struct ctl_table_set *set,
+ const struct ctl_path *path, struct ctl_table *table)
+{
+ struct ctl_table *ctl_table_arg = table;
+ int nr_subheaders = count_subheaders(table);
+ struct ctl_table_header *header = NULL, **subheaders, **subheader;
+ const struct ctl_path *component;
+ char *new_path, *pos;
+
+ pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!new_path)
+ return NULL;
+
+ pos[0] = '\0';
+ for (component = path; component->procname; component++) {
+ pos = append_path(new_path, pos, component->procname);
+ if (!pos)
+ goto out;
+ }
+ while (table->procname && table->child && !table[1].procname) {
+ pos = append_path(new_path, pos, table->procname);
+ if (!pos)
+ goto out;
+ table = table->child;
+ }
+ if (nr_subheaders == 1) {
+ header = __register_sysctl_table(set, new_path, table);
+ if (header)
+ header->ctl_table_arg = ctl_table_arg;
+ } else {
+ header = kzalloc(sizeof(*header) +
+ sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
+ if (!header)
+ goto out;
+
+ subheaders = (struct ctl_table_header **) (header + 1);
+ subheader = subheaders;
+ header->ctl_table_arg = ctl_table_arg;
+
+ if (register_leaf_sysctl_tables(new_path, pos, &subheader,
+ set, table))
+ goto err_register_leaves;
+ }
+
+out:
+ kfree(new_path);
+ return header;
+
+err_register_leaves:
+ while (subheader > subheaders) {
+ struct ctl_table_header *subh = *(--subheader);
+ struct ctl_table *table = subh->ctl_table_arg;
+ unregister_sysctl_table(subh);
+ kfree(table);
+ }
+ kfree(header);
+ header = NULL;
+ goto out;
+}
+
+/**
+ * register_sysctl_table_path - register a sysctl table hierarchy
+ * @path: The path to the directory the sysctl table is in.
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See __register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+ struct ctl_table *table)
+{
+ return __register_sysctl_paths(&sysctl_table_root.default_set,
+ path, table);
+}
+EXPORT_SYMBOL(register_sysctl_paths);
+
+/**
+ * register_sysctl_table - register a sysctl table hierarchy
+ * @table: the top-level table structure
+ *
+ * Register a sysctl table hierarchy. @table should be a filled in ctl_table
+ * array. A completely 0 filled entry terminates the table.
+ *
+ * See register_sysctl_paths for more details.
+ */
+struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
+{
+ static const struct ctl_path null_path[] = { {} };
+
+ return register_sysctl_paths(null_path, table);
+}
+EXPORT_SYMBOL(register_sysctl_table);
+
+static void put_links(struct ctl_table_header *header)
+{
+ struct ctl_table_set *root_set = &sysctl_table_root.default_set;
+ struct ctl_table_root *root = header->root;
+ struct ctl_dir *parent = header->parent;
+ struct ctl_dir *core_parent;
+ struct ctl_table *entry;
+
+ if (header->set == root_set)
+ return;
+
+ core_parent = xlate_dir(root_set, parent);
+ if (IS_ERR(core_parent))
+ return;
+
+ for (entry = header->ctl_table; entry->procname; entry++) {
+ struct ctl_table_header *link_head;
+ struct ctl_table *link;
+ const char *name = entry->procname;
+
+ link = find_entry(&link_head, core_parent, name, strlen(name));
+ if (link &&
+ ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) ||
+ (S_ISLNK(link->mode) && (link->data == root)))) {
+ drop_sysctl_table(link_head);
+ }
+ else {
+ printk(KERN_ERR "sysctl link missing during unregister: ");
+ sysctl_print_dir(parent);
+ printk(KERN_CONT "/%s\n", name);
+ }
+ }
+}
+
+static void drop_sysctl_table(struct ctl_table_header *header)
+{
+ struct ctl_dir *parent = header->parent;
+
+ if (--header->nreg)
+ return;
+
+ put_links(header);
+ start_unregistering(header);
+ if (!--header->count)
+ kfree_rcu(header, rcu);
+
+ if (parent)
+ drop_sysctl_table(&parent->header);
+}
+
+/**
+ * unregister_sysctl_table - unregister a sysctl table hierarchy
+ * @header: the header returned from register_sysctl_table
+ *
+ * Unregisters the sysctl table and all children. proc entries may not
+ * actually be removed until they are no longer used by anyone.
+ */
+void unregister_sysctl_table(struct ctl_table_header * header)
+{
+ int nr_subheaders;
+ might_sleep();
+
+ if (header == NULL)
+ return;
+
+ nr_subheaders = count_subheaders(header->ctl_table_arg);
+ if (unlikely(nr_subheaders > 1)) {
+ struct ctl_table_header **subheaders;
+ int i;
+
+ subheaders = (struct ctl_table_header **)(header + 1);
+ for (i = nr_subheaders -1; i >= 0; i--) {
+ struct ctl_table_header *subh = subheaders[i];
+ struct ctl_table *table = subh->ctl_table_arg;
+ unregister_sysctl_table(subh);
+ kfree(table);
+ }
+ kfree(header);
+ return;
+ }
+
+ spin_lock(&sysctl_lock);
+ drop_sysctl_table(header);
+ spin_unlock(&sysctl_lock);
+}
+EXPORT_SYMBOL(unregister_sysctl_table);
+
+void setup_sysctl_set(struct ctl_table_set *set,
+ struct ctl_table_root *root,
+ int (*is_seen)(struct ctl_table_set *))
+{
+ memset(set, 0, sizeof(*set));
+ set->is_seen = is_seen;
+ init_header(&set->dir.header, root, set, NULL, root_table);
+}
+
+void retire_sysctl_set(struct ctl_table_set *set)
+{
+ WARN_ON(!RB_EMPTY_ROOT(&set->dir.root));
+}
+
int __init proc_sys_init(void)
{
struct proc_dir_entry *proc_sys_root;
@@ -478,5 +1601,6 @@ int __init proc_sys_init(void)
proc_sys_root->proc_iops = &proc_sys_dir_operations;
proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
- return 0;
+
+ return sysctl_init();
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 03102d97818..7c30fce037c 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -18,6 +18,7 @@
#include <linux/bitops.h>
#include <linux/mount.h>
#include <linux/pid_namespace.h>
+#include <linux/parser.h>
#include "internal.h"
@@ -36,6 +37,63 @@ static int proc_set_super(struct super_block *sb, void *data)
return err;
}
+enum {
+ Opt_gid, Opt_hidepid, Opt_err,
+};
+
+static const match_table_t tokens = {
+ {Opt_hidepid, "hidepid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_err, NULL},
+};
+
+static int proc_parse_options(char *options, struct pid_namespace *pid)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+
+ if (!options)
+ return 1;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ args[0].to = args[0].from = 0;
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return 0;
+ pid->pid_gid = make_kgid(current_user_ns(), option);
+ break;
+ case Opt_hidepid:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0 || option > 2) {
+ pr_err("proc: hidepid value must be between 0 and 2.\n");
+ return 0;
+ }
+ pid->hide_pid = option;
+ break;
+ default:
+ pr_err("proc: unrecognized mount option \"%s\" "
+ "or missing value\n", p);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+int proc_remount(struct super_block *sb, int *flags, char *data)
+{
+ struct pid_namespace *pid = sb->s_fs_info;
+ return !proc_parse_options(data, pid);
+}
+
static struct dentry *proc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
@@ -43,16 +101,25 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
struct super_block *sb;
struct pid_namespace *ns;
struct proc_inode *ei;
+ char *options;
- if (flags & MS_KERNMOUNT)
+ if (flags & MS_KERNMOUNT) {
ns = (struct pid_namespace *)data;
- else
+ options = NULL;
+ } else {
ns = current->nsproxy->pid_ns;
+ options = data;
+ }
sb = sget(fs_type, proc_test_super, proc_set_super, ns);
if (IS_ERR(sb))
return ERR_CAST(sb);
+ if (!proc_parse_options(options, ns)) {
+ deactivate_locked_super(sb);
+ return ERR_PTR(-EINVAL);
+ }
+
if (!sb->s_root) {
sb->s_flags = flags;
err = proc_fill_super(sb);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 0855e6f2039..64c3b317236 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -18,68 +18,87 @@
#ifndef arch_irq_stat
#define arch_irq_stat() 0
#endif
-#ifndef arch_idle_time
-#define arch_idle_time(cpu) 0
-#endif
+
+#ifdef arch_idle_time
static cputime64_t get_idle_time(int cpu)
{
- u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
cputime64_t idle;
- if (idle_time == -1ULL) {
+ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
+ idle += arch_idle_time(cpu);
+ return idle;
+}
+
+static cputime64_t get_iowait_time(int cpu)
+{
+ cputime64_t iowait;
+
+ iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ if (cpu_online(cpu) && nr_iowait_cpu(cpu))
+ iowait += arch_idle_time(cpu);
+ return iowait;
+}
+
+#else
+
+static u64 get_idle_time(int cpu)
+{
+ u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL);
+
+ if (idle_time == -1ULL)
/* !NO_HZ so we can rely on cpustat.idle */
- idle = kstat_cpu(cpu).cpustat.idle;
- idle = cputime64_add(idle, arch_idle_time(cpu));
- } else
+ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ else
idle = usecs_to_cputime64(idle_time);
return idle;
}
-static cputime64_t get_iowait_time(int cpu)
+static u64 get_iowait_time(int cpu)
{
- u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL);
- cputime64_t iowait;
+ u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL);
if (iowait_time == -1ULL)
/* !NO_HZ so we can rely on cpustat.iowait */
- iowait = kstat_cpu(cpu).cpustat.iowait;
+ iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
else
iowait = usecs_to_cputime64(iowait_time);
return iowait;
}
+#endif
+
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
unsigned long jif;
- cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
- cputime64_t guest, guest_nice;
+ u64 user, nice, system, idle, iowait, irq, softirq, steal;
+ u64 guest, guest_nice;
u64 sum = 0;
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec boottime;
user = nice = system = idle = iowait =
- irq = softirq = steal = cputime64_zero;
- guest = guest_nice = cputime64_zero;
+ irq = softirq = steal = 0;
+ guest = guest_nice = 0;
getboottime(&boottime);
jif = boottime.tv_sec;
for_each_possible_cpu(i) {
- user = cputime64_add(user, kstat_cpu(i).cpustat.user);
- nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
- system = cputime64_add(system, kstat_cpu(i).cpustat.system);
- idle = cputime64_add(idle, get_idle_time(i));
- iowait = cputime64_add(iowait, get_iowait_time(i));
- irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
- softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
- steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
- guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
- guest_nice = cputime64_add(guest_nice,
- kstat_cpu(i).cpustat.guest_nice);
+ user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
+ nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+ system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(i);
+ iowait += get_iowait_time(i);
+ irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+ softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+ steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+ guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+ guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
@@ -92,50 +111,49 @@ static int show_stat(struct seq_file *p, void *v)
}
sum += arch_irq_stat();
- seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu "
- "%llu\n",
- (unsigned long long)cputime64_to_clock_t(user),
- (unsigned long long)cputime64_to_clock_t(nice),
- (unsigned long long)cputime64_to_clock_t(system),
- (unsigned long long)cputime64_to_clock_t(idle),
- (unsigned long long)cputime64_to_clock_t(iowait),
- (unsigned long long)cputime64_to_clock_t(irq),
- (unsigned long long)cputime64_to_clock_t(softirq),
- (unsigned long long)cputime64_to_clock_t(steal),
- (unsigned long long)cputime64_to_clock_t(guest),
- (unsigned long long)cputime64_to_clock_t(guest_nice));
+ seq_puts(p, "cpu ");
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+ seq_putc(p, '\n');
+
for_each_online_cpu(i) {
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kstat_cpu(i).cpustat.user;
- nice = kstat_cpu(i).cpustat.nice;
- system = kstat_cpu(i).cpustat.system;
+ user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
+ nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+ system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
idle = get_idle_time(i);
iowait = get_iowait_time(i);
- irq = kstat_cpu(i).cpustat.irq;
- softirq = kstat_cpu(i).cpustat.softirq;
- steal = kstat_cpu(i).cpustat.steal;
- guest = kstat_cpu(i).cpustat.guest;
- guest_nice = kstat_cpu(i).cpustat.guest_nice;
- seq_printf(p,
- "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
- "%llu\n",
- i,
- (unsigned long long)cputime64_to_clock_t(user),
- (unsigned long long)cputime64_to_clock_t(nice),
- (unsigned long long)cputime64_to_clock_t(system),
- (unsigned long long)cputime64_to_clock_t(idle),
- (unsigned long long)cputime64_to_clock_t(iowait),
- (unsigned long long)cputime64_to_clock_t(irq),
- (unsigned long long)cputime64_to_clock_t(softirq),
- (unsigned long long)cputime64_to_clock_t(steal),
- (unsigned long long)cputime64_to_clock_t(guest),
- (unsigned long long)cputime64_to_clock_t(guest_nice));
+ irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+ softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+ steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+ guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+ guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ seq_printf(p, "cpu%d", i);
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest));
+ seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice));
+ seq_putc(p, '\n');
}
seq_printf(p, "intr %llu", (unsigned long long)sum);
/* sum again ? it could be updated? */
for_each_irq_nr(j)
- seq_printf(p, " %u", kstat_irqs(j));
+ seq_put_decimal_ull(p, ' ', kstat_irqs(j));
seq_printf(p,
"\nctxt %llu\n"
@@ -152,7 +170,7 @@ static int show_stat(struct seq_file *p, void *v)
seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
for (i = 0; i < NR_SOFTIRQS; i++)
- seq_printf(p, " %u", per_softirq_sums[i]);
+ seq_put_decimal_ull(p, ' ', per_softirq_sums[i]);
seq_putc(p, '\n');
return 0;
@@ -160,11 +178,14 @@ static int show_stat(struct seq_file *p, void *v)
static int stat_open(struct inode *inode, struct file *file)
{
- unsigned size = 4096 * (1 + num_possible_cpus() / 32);
+ unsigned size = 1024 + 128 * num_possible_cpus();
char *buf;
struct seq_file *m;
int res;
+ /* minimum size to display an interrupt count : 2 bytes */
+ size += 2 * nr_irqs;
+
/* don't ask for more than the kmalloc() max size */
if (size > KMALLOC_MAX_SIZE)
size = KMALLOC_MAX_SIZE;
@@ -176,7 +197,7 @@ static int stat_open(struct inode *inode, struct file *file)
if (!res) {
m = file->private_data;
m->buf = buf;
- m->size = size;
+ m->size = ksize(buf);
} else
kfree(buf);
return res;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e418c5abdb0..4540b8f76f1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -125,7 +125,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = mm_for_maps(priv->task);
+ mm = mm_access(priv->task, PTRACE_MODE_READ);
if (!mm || IS_ERR(mm))
return mm;
down_read(&mm->mmap_sem);
@@ -209,16 +209,20 @@ static int do_maps_open(struct inode *inode, struct file *file,
return ret;
}
-static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
+static void
+show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
+ struct proc_maps_private *priv = m->private;
+ struct task_struct *task = priv->task;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
int len;
+ const char *name = NULL;
if (file) {
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -252,36 +256,57 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
pad_len_spaces(m, len);
seq_path(m, &file->f_path, "\n");
- } else {
- const char *name = arch_vma_name(vma);
- if (!name) {
- if (mm) {
- if (vma->vm_start <= mm->brk &&
- vma->vm_end >= mm->start_brk) {
- name = "[heap]";
- } else if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
- name = "[stack]";
- }
+ goto done;
+ }
+
+ name = arch_vma_name(vma);
+ if (!name) {
+ pid_t tid;
+
+ if (!mm) {
+ name = "[vdso]";
+ goto done;
+ }
+
+ if (vma->vm_start <= mm->brk &&
+ vma->vm_end >= mm->start_brk) {
+ name = "[heap]";
+ goto done;
+ }
+
+ tid = vm_is_stack(task, vma, is_pid);
+
+ if (tid != 0) {
+ /*
+ * Thread stack in /proc/PID/task/TID/maps or
+ * the main process stack.
+ */
+ if (!is_pid || (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack)) {
+ name = "[stack]";
} else {
- name = "[vdso]";
+ /* Thread stack in /proc/PID/maps */
+ pad_len_spaces(m, len);
+ seq_printf(m, "[stack:%d]", tid);
}
}
- if (name) {
- pad_len_spaces(m, len);
- seq_puts(m, name);
- }
+ }
+
+done:
+ if (name) {
+ pad_len_spaces(m, len);
+ seq_puts(m, name);
}
seq_putc(m, '\n');
}
-static int show_map(struct seq_file *m, void *v)
+static int show_map(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- show_map_vma(m, vma);
+ show_map_vma(m, vma, is_pid);
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task->mm))
@@ -289,20 +314,49 @@ static int show_map(struct seq_file *m, void *v)
return 0;
}
+static int show_pid_map(struct seq_file *m, void *v)
+{
+ return show_map(m, v, 1);
+}
+
+static int show_tid_map(struct seq_file *m, void *v)
+{
+ return show_map(m, v, 0);
+}
+
static const struct seq_operations proc_pid_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_map
+ .show = show_pid_map
+};
+
+static const struct seq_operations proc_tid_maps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_map
};
-static int maps_open(struct inode *inode, struct file *file)
+static int pid_maps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_pid_maps_op);
}
-const struct file_operations proc_maps_operations = {
- .open = maps_open,
+static int tid_maps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_tid_maps_op);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+ .open = pid_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+const struct file_operations proc_tid_maps_operations = {
+ .open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
@@ -339,6 +393,7 @@ struct mem_size_stats {
unsigned long anonymous;
unsigned long anonymous_thp;
unsigned long swap;
+ unsigned long nonlinear;
u64 pss;
};
@@ -348,24 +403,33 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
- struct page *page;
+ pgoff_t pgoff = linear_page_index(vma, addr);
+ struct page *page = NULL;
int mapcount;
- if (is_swap_pte(ptent)) {
- mss->swap += ptent_size;
- return;
+ if (pte_present(ptent)) {
+ page = vm_normal_page(vma, addr, ptent);
+ } else if (is_swap_pte(ptent)) {
+ swp_entry_t swpent = pte_to_swp_entry(ptent);
+
+ if (!non_swap_entry(swpent))
+ mss->swap += ptent_size;
+ else if (is_migration_entry(swpent))
+ page = migration_entry_to_page(swpent);
+ } else if (pte_file(ptent)) {
+ if (pte_to_pgoff(ptent) != pgoff)
+ mss->nonlinear += ptent_size;
}
- if (!pte_present(ptent))
- return;
-
- page = vm_normal_page(vma, addr, ptent);
if (!page)
return;
if (PageAnon(page))
mss->anonymous += ptent_size;
+ if (page->index != pgoff)
+ mss->nonlinear += ptent_size;
+
mss->resident += ptent_size;
/* Accumulate the size in pages that have been accessed. */
if (pte_young(ptent) || PageReferenced(page))
@@ -394,21 +458,15 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
- spin_lock(&walk->mm->page_table_lock);
- if (pmd_trans_huge(*pmd)) {
- if (pmd_trans_splitting(*pmd)) {
- spin_unlock(&walk->mm->page_table_lock);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- smaps_pte_entry(*(pte_t *)pmd, addr,
- HPAGE_PMD_SIZE, walk);
- spin_unlock(&walk->mm->page_table_lock);
- mss->anonymous_thp += HPAGE_PMD_SIZE;
- return 0;
- }
- } else {
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
spin_unlock(&walk->mm->page_table_lock);
+ mss->anonymous_thp += HPAGE_PMD_SIZE;
+ return 0;
}
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
/*
* The mmap_sem held all the way back in m_start() is what
* keeps khugepaged out of here and from collapsing things
@@ -422,7 +480,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return 0;
}
-static int show_smap(struct seq_file *m, void *v)
+static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
@@ -440,7 +498,7 @@ static int show_smap(struct seq_file *m, void *v)
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
- show_map_vma(m, vma);
+ show_map_vma(m, vma, is_pid);
seq_printf(m,
"Size: %8lu kB\n"
@@ -473,26 +531,59 @@ static int show_smap(struct seq_file *m, void *v)
(vma->vm_flags & VM_LOCKED) ?
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
+ if (vma->vm_flags & VM_NONLINEAR)
+ seq_printf(m, "Nonlinear: %8lu kB\n",
+ mss.nonlinear >> 10);
+
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task->mm))
? vma->vm_start : 0;
return 0;
}
+static int show_pid_smap(struct seq_file *m, void *v)
+{
+ return show_smap(m, v, 1);
+}
+
+static int show_tid_smap(struct seq_file *m, void *v)
+{
+ return show_smap(m, v, 0);
+}
+
static const struct seq_operations proc_pid_smaps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_smap
+ .show = show_pid_smap
+};
+
+static const struct seq_operations proc_tid_smaps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_smap
};
-static int smaps_open(struct inode *inode, struct file *file)
+static int pid_smaps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_pid_smaps_op);
}
-const struct file_operations proc_smaps_operations = {
- .open = smaps_open,
+static int tid_smaps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_tid_smaps_op);
+}
+
+const struct file_operations proc_pid_smaps_operations = {
+ .open = pid_smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+const struct file_operations proc_tid_smaps_operations = {
+ .open = tid_smaps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
@@ -507,6 +598,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
split_huge_page_pmd(walk->mm, pmd);
+ if (pmd_trans_unstable(pmd))
+ return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -595,11 +688,18 @@ const struct file_operations proc_clear_refs_operations = {
.llseek = noop_llseek,
};
+typedef struct {
+ u64 pme;
+} pagemap_entry_t;
+
struct pagemapread {
int pos, len;
- u64 *buffer;
+ pagemap_entry_t *buffer;
};
+#define PAGEMAP_WALK_SIZE (PMD_SIZE)
+#define PAGEMAP_WALK_MASK (PMD_MASK)
+
#define PM_ENTRY_BYTES sizeof(u64)
#define PM_STATUS_BITS 3
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
@@ -614,13 +714,19 @@ struct pagemapread {
#define PM_PRESENT PM_STATUS(4LL)
#define PM_SWAP PM_STATUS(2LL)
+#define PM_FILE PM_STATUS(1LL)
#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT)
#define PM_END_OF_BUFFER 1
-static int add_to_pagemap(unsigned long addr, u64 pfn,
+static inline pagemap_entry_t make_pme(u64 val)
+{
+ return (pagemap_entry_t) { .pme = val };
+}
+
+static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
struct pagemapread *pm)
{
- pm->buffer[pm->pos++] = pfn;
+ pm->buffer[pm->pos++] = *pme;
if (pm->pos >= pm->len)
return PM_END_OF_BUFFER;
return 0;
@@ -632,31 +738,66 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
struct pagemapread *pm = walk->private;
unsigned long addr;
int err = 0;
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
- err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
+ err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
}
return err;
}
-static u64 swap_pte_to_pagemap_entry(pte_t pte)
+static void pte_to_pagemap_entry(pagemap_entry_t *pme,
+ struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- swp_entry_t e = pte_to_swp_entry(pte);
- return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
+ u64 frame, flags;
+ struct page *page = NULL;
+
+ if (pte_present(pte)) {
+ frame = pte_pfn(pte);
+ flags = PM_PRESENT;
+ page = vm_normal_page(vma, addr, pte);
+ } else if (is_swap_pte(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ frame = swp_type(entry) |
+ (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ flags = PM_SWAP;
+ if (is_migration_entry(entry))
+ page = migration_entry_to_page(entry);
+ } else {
+ *pme = make_pme(PM_NOT_PRESENT);
+ return;
+ }
+
+ if (page && !PageAnon(page))
+ flags |= PM_FILE;
+
+ *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags);
}
-static u64 pte_to_pagemap_entry(pte_t pte)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
+ pmd_t pmd, int offset)
+{
+ /*
+ * Currently pmd for thp is always present because thp can not be
+ * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
+ * This if-check is just to prepare for future implementation.
+ */
+ if (pmd_present(pmd))
+ *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
+ | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+ else
+ *pme = make_pme(PM_NOT_PRESENT);
+}
+#else
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
+ pmd_t pmd, int offset)
{
- u64 pme = 0;
- if (is_swap_pte(pte))
- pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
- | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
- else if (pte_present(pte))
- pme = PM_PFRAME(pte_pfn(pte))
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
- return pme;
}
+#endif
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -665,29 +806,46 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct pagemapread *pm = walk->private;
pte_t *pte;
int err = 0;
-
- split_huge_page_pmd(walk->mm, pmd);
+ pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
+ if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ for (; addr != end; addr += PAGE_SIZE) {
+ unsigned long offset;
+
+ offset = (addr & ~PAGEMAP_WALK_MASK) >>
+ PAGE_SHIFT;
+ thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
+ break;
+ }
+ spin_unlock(&walk->mm->page_table_lock);
+ return err;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
for (; addr != end; addr += PAGE_SIZE) {
- u64 pfn = PM_NOT_PRESENT;
/* check to see if we've left 'vma' behind
* and need a new, higher one */
- if (vma && (addr >= vma->vm_end))
+ if (vma && (addr >= vma->vm_end)) {
vma = find_vma(walk->mm, addr);
+ pme = make_pme(PM_NOT_PRESENT);
+ }
/* check that 'vma' actually covers this address,
* and that it isn't a huge page vma */
if (vma && (vma->vm_start <= addr) &&
!is_vm_hugetlb_page(vma)) {
pte = pte_offset_map(pmd, addr);
- pfn = pte_to_pagemap_entry(*pte);
+ pte_to_pagemap_entry(&pme, vma, addr, *pte);
/* unmap before userspace copy */
pte_unmap(pte);
}
- err = add_to_pagemap(addr, pfn, pm);
+ err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
}
@@ -698,13 +856,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+ pte_t pte, int offset)
{
- u64 pme = 0;
if (pte_present(pte))
- pme = PM_PFRAME(pte_pfn(pte) + offset)
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
- return pme;
+ *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
+ | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+ else
+ *pme = make_pme(PM_NOT_PRESENT);
}
/* This function walks within one hugetlb entry in the single call */
@@ -714,12 +873,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct pagemapread *pm = walk->private;
int err = 0;
- u64 pfn;
+ pagemap_entry_t pme;
for (; addr != end; addr += PAGE_SIZE) {
int offset = (addr & ~hmask) >> PAGE_SHIFT;
- pfn = huge_pte_to_pagemap_entry(*pte, offset);
- err = add_to_pagemap(addr, pfn, pm);
+ huge_pte_to_pagemap_entry(&pme, *pte, offset);
+ err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
}
@@ -736,11 +895,11 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* For each page in the address space, this file contains one 64-bit entry
* consisting of the following:
*
- * Bits 0-55 page frame number (PFN) if present
+ * Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
- * Bits 5-55 swap offset if swapped
+ * Bits 5-54 swap offset if swapped
* Bits 55-60 page shift (page size = 1<<page shift)
- * Bit 61 reserved for future use
+ * Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
*
@@ -754,8 +913,6 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* determine which areas of memory are actually mapped and llseek to
* skip over unmapped regions.
*/
-#define PAGEMAP_WALK_SIZE (PMD_SIZE)
-#define PAGEMAP_WALK_MASK (PMD_MASK)
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -788,7 +945,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!pm.buffer)
goto out_task;
- mm = mm_for_maps(task);
+ mm = mm_access(task, PTRACE_MODE_READ);
ret = PTR_ERR(mm);
if (!mm || IS_ERR(mm))
goto out_free;
@@ -938,26 +1095,21 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *pte;
md = walk->private;
- spin_lock(&walk->mm->page_table_lock);
- if (pmd_trans_huge(*pmd)) {
- if (pmd_trans_splitting(*pmd)) {
- spin_unlock(&walk->mm->page_table_lock);
- wait_split_huge_page(md->vma->anon_vma, pmd);
- } else {
- pte_t huge_pte = *(pte_t *)pmd;
- struct page *page;
-
- page = can_gather_numa_stats(huge_pte, md->vma, addr);
- if (page)
- gather_stats(page, md, pte_dirty(huge_pte),
- HPAGE_PMD_SIZE/PAGE_SIZE);
- spin_unlock(&walk->mm->page_table_lock);
- return 0;
- }
- } else {
+
+ if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+ pte_t huge_pte = *(pte_t *)pmd;
+ struct page *page;
+
+ page = can_gather_numa_stats(huge_pte, md->vma, addr);
+ if (page)
+ gather_stats(page, md, pte_dirty(huge_pte),
+ HPAGE_PMD_SIZE/PAGE_SIZE);
spin_unlock(&walk->mm->page_table_lock);
+ return 0;
}
+ if (pmd_trans_unstable(pmd))
+ return 0;
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
do {
struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
@@ -999,7 +1151,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
/*
* Display pages allocated per node and memory policy via /proc.
*/
-static int show_numa_map(struct seq_file *m, void *v)
+static int show_numa_map(struct seq_file *m, void *v, int is_pid)
{
struct numa_maps_private *numa_priv = m->private;
struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
@@ -1036,9 +1188,19 @@ static int show_numa_map(struct seq_file *m, void *v)
seq_path(m, &file->f_path, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_printf(m, " heap");
- } else if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
- seq_printf(m, " stack");
+ } else {
+ pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid);
+ if (tid != 0) {
+ /*
+ * Thread stack in /proc/PID/task/TID/maps or
+ * the main process stack.
+ */
+ if (!is_pid || (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack))
+ seq_printf(m, " stack");
+ else
+ seq_printf(m, " stack:%d", tid);
+ }
}
if (is_vm_hugetlb_page(vma))
@@ -1081,21 +1243,39 @@ out:
return 0;
}
+static int show_pid_numa_map(struct seq_file *m, void *v)
+{
+ return show_numa_map(m, v, 1);
+}
+
+static int show_tid_numa_map(struct seq_file *m, void *v)
+{
+ return show_numa_map(m, v, 0);
+}
+
static const struct seq_operations proc_pid_numa_maps_op = {
- .start = m_start,
- .next = m_next,
- .stop = m_stop,
- .show = show_numa_map,
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_pid_numa_map,
+};
+
+static const struct seq_operations proc_tid_numa_maps_op = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_numa_map,
};
-static int numa_maps_open(struct inode *inode, struct file *file)
+static int numa_maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
{
struct numa_maps_private *priv;
int ret = -ENOMEM;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (priv) {
priv->proc_maps.pid = proc_pid(inode);
- ret = seq_open(file, &proc_pid_numa_maps_op);
+ ret = seq_open(file, ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = priv;
@@ -1106,8 +1286,25 @@ static int numa_maps_open(struct inode *inode, struct file *file)
return ret;
}
-const struct file_operations proc_numa_maps_operations = {
- .open = numa_maps_open,
+static int pid_numa_maps_open(struct inode *inode, struct file *file)
+{
+ return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
+}
+
+static int tid_numa_maps_open(struct inode *inode, struct file *file)
+{
+ return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
+}
+
+const struct file_operations proc_pid_numa_maps_operations = {
+ .open = pid_numa_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+const struct file_operations proc_tid_numa_maps_operations = {
+ .open = tid_numa_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 980de547c07..1ccfa537f5f 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -134,9 +134,11 @@ static void pad_len_spaces(struct seq_file *m, int len)
/*
* display a single VMA to a sequenced file
*/
-static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
+ int is_pid)
{
struct mm_struct *mm = vma->vm_mm;
+ struct proc_maps_private *priv = m->private;
unsigned long ino = 0;
struct file *file;
dev_t dev = 0;
@@ -168,10 +170,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
pad_len_spaces(m, len);
seq_path(m, &file->f_path, "");
} else if (mm) {
- if (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack) {
+ pid_t tid = vm_is_stack(priv->task, vma, is_pid);
+
+ if (tid != 0) {
pad_len_spaces(m, len);
- seq_puts(m, "[stack]");
+ /*
+ * Thread stack in /proc/PID/task/TID/maps or
+ * the main process stack.
+ */
+ if (!is_pid || (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack))
+ seq_printf(m, "[stack]");
+ else
+ seq_printf(m, "[stack:%d]", tid);
}
}
@@ -182,11 +193,22 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
/*
* display mapping lines for a particular process's /proc/pid/maps
*/
-static int show_map(struct seq_file *m, void *_p)
+static int show_map(struct seq_file *m, void *_p, int is_pid)
{
struct rb_node *p = _p;
- return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
+ return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb),
+ is_pid);
+}
+
+static int show_pid_map(struct seq_file *m, void *_p)
+{
+ return show_map(m, _p, 1);
+}
+
+static int show_tid_map(struct seq_file *m, void *_p)
+{
+ return show_map(m, _p, 0);
}
static void *m_start(struct seq_file *m, loff_t *pos)
@@ -201,7 +223,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
if (!priv->task)
return ERR_PTR(-ESRCH);
- mm = mm_for_maps(priv->task);
+ mm = mm_access(priv->task, PTRACE_MODE_READ);
if (!mm || IS_ERR(mm)) {
put_task_struct(priv->task);
priv->task = NULL;
@@ -240,10 +262,18 @@ static const struct seq_operations proc_pid_maps_ops = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_map
+ .show = show_pid_map
+};
+
+static const struct seq_operations proc_tid_maps_ops = {
+ .start = m_start,
+ .next = m_next,
+ .stop = m_stop,
+ .show = show_tid_map
};
-static int maps_open(struct inode *inode, struct file *file)
+static int maps_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
{
struct proc_maps_private *priv;
int ret = -ENOMEM;
@@ -251,7 +281,7 @@ static int maps_open(struct inode *inode, struct file *file)
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (priv) {
priv->pid = proc_pid(inode);
- ret = seq_open(file, &proc_pid_maps_ops);
+ ret = seq_open(file, ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = priv;
@@ -262,8 +292,25 @@ static int maps_open(struct inode *inode, struct file *file)
return ret;
}
-const struct file_operations proc_maps_operations = {
- .open = maps_open,
+static int pid_maps_open(struct inode *inode, struct file *file)
+{
+ return maps_open(inode, file, &proc_pid_maps_ops);
+}
+
+static int tid_maps_open(struct inode *inode, struct file *file)
+{
+ return maps_open(inode, file, &proc_tid_maps_ops);
+}
+
+const struct file_operations proc_pid_maps_operations = {
+ .open = pid_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+const struct file_operations proc_tid_maps_operations = {
+ .open = tid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 766b1d45605..9610ac772d7 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -11,15 +11,20 @@ static int uptime_proc_show(struct seq_file *m, void *v)
{
struct timespec uptime;
struct timespec idle;
+ u64 idletime;
+ u64 nsec;
+ u32 rem;
int i;
- cputime_t idletime = cputime_zero;
+ idletime = 0;
for_each_possible_cpu(i)
- idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
+ idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
do_posix_clock_monotonic_gettime(&uptime);
monotonic_to_bootbased(&uptime);
- cputime_to_timespec(idletime, &idle);
+ nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
+ idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+ idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index b0f450a2bb7..0d5071d2998 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -700,3 +700,26 @@ static int __init vmcore_init(void)
return 0;
}
module_init(vmcore_init)
+
+/* Cleanup function for vmcore module. */
+void vmcore_cleanup(void)
+{
+ struct list_head *pos, *next;
+
+ if (proc_vmcore) {
+ remove_proc_entry(proc_vmcore->name, proc_vmcore->parent);
+ proc_vmcore = NULL;
+ }
+
+ /* clear the vmcore list. */
+ list_for_each_safe(pos, next, &vmcore_list) {
+ struct vmcore *m;
+
+ m = list_entry(pos, struct vmcore, list);
+ list_del(&m->list);
+ kfree(m);
+ }
+ kfree(elfcorebuf);
+ elfcorebuf = NULL;
+}
+EXPORT_SYMBOL_GPL(vmcore_cleanup);