From 3292beb340c76884427faa1f5d6085719477d889 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 28 Nov 2011 14:45:17 -0200 Subject: sched/accounting: Change cpustat fields to an array This patch changes fields in cpustat from a structure, to an u64 array. Math gets easier, and the code is more flexible. Signed-off-by: Glauber Costa Reviewed-by: KAMEZAWA Hiroyuki Cc: Linus Torvalds Cc: Andrew Morton Cc: Paul Tuner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1322498719-2255-2-git-send-email-glommer@parallels.com Signed-off-by: Ingo Molnar --- fs/proc/stat.c | 63 ++++++++++++++++++++++++++------------------------------ fs/proc/uptime.c | 4 ++-- 2 files changed, 31 insertions(+), 36 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 42b274da92c..8a6ab666e9f 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -22,29 +22,27 @@ #define arch_idle_time(cpu) 0 #endif -static cputime64_t get_idle_time(int cpu) +static u64 get_idle_time(int cpu) { - u64 idle_time = get_cpu_idle_time_us(cpu, NULL); - cputime64_t idle; + u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL); if (idle_time == -1ULL) { /* !NO_HZ so we can rely on cpustat.idle */ - idle = kstat_cpu(cpu).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(cpu)); + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle += arch_idle_time(cpu); } else idle = usecs_to_cputime(idle_time); return idle; } -static cputime64_t get_iowait_time(int cpu) +static u64 get_iowait_time(int cpu) { - u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); - cputime64_t iowait; + u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL); if (iowait_time == -1ULL) /* !NO_HZ so we can rely on cpustat.iowait */ - iowait = kstat_cpu(cpu).cpustat.iowait; + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; else iowait = usecs_to_cputime(iowait_time); @@ -55,33 +53,30 @@ static int show_stat(struct seq_file *p, void *v) { int i, j; unsigned long jif; - cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - cputime64_t guest, guest_nice; + u64 user, nice, system, idle, iowait, irq, softirq, steal; + u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; user = nice = system = idle = iowait = - irq = softirq = steal = cputime64_zero; - guest = guest_nice = cputime64_zero; + irq = softirq = steal = 0; + guest = guest_nice = 0; getboottime(&boottime); jif = boottime.tv_sec; for_each_possible_cpu(i) { - user = cputime64_add(user, kstat_cpu(i).cpustat.user); - nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); - system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, get_idle_time(i)); - iowait = cputime64_add(iowait, get_iowait_time(i)); - irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); - softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); - steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); - guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - guest_nice = cputime64_add(guest_nice, - kstat_cpu(i).cpustat.guest_nice); - sum += kstat_cpu_irqs_sum(i); - sum += arch_irq_stat_cpu(i); + user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(i); + iowait += get_iowait_time(i); + irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); @@ -106,16 +101,16 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kstat_cpu(i).cpustat.user; - nice = kstat_cpu(i).cpustat.nice; - system = kstat_cpu(i).cpustat.system; + user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; idle = get_idle_time(i); iowait = get_iowait_time(i); - irq = kstat_cpu(i).cpustat.irq; - softirq = kstat_cpu(i).cpustat.softirq; - steal = kstat_cpu(i).cpustat.steal; - guest = kstat_cpu(i).cpustat.guest; - guest_nice = kstat_cpu(i).cpustat.guest_nice; + irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " "%llu\n", diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 766b1d45605..0fb22e464e7 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -12,10 +12,10 @@ static int uptime_proc_show(struct seq_file *m, void *v) struct timespec uptime; struct timespec idle; int i; - cputime_t idletime = cputime_zero; + u64 idletime = 0; for_each_possible_cpu(i) - idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); + idletime += kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); -- cgit v1.2.3 From 648616343cdbe904c585a6c12e323d3b3c72e46f Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 15 Dec 2011 14:56:09 +0100 Subject: [S390] cputime: add sparse checking and cleanup Make cputime_t and cputime64_t nocast to enable sparse checking to detect incorrect use of cputime. Drop the cputime macros for simple scalar operations. The conversion macros are still needed. Signed-off-by: Martin Schwidefsky --- fs/proc/array.c | 8 ++++---- fs/proc/stat.c | 27 +++++++++++++-------------- fs/proc/uptime.c | 4 ++-- 3 files changed, 19 insertions(+), 20 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 3a1dafd228d..8c344f037bd 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -394,8 +394,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = cputime_zero; - cgtime = gtime = cputime_zero; + cutime = cstime = utime = stime = 0; + cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { struct signal_struct *sig = task->signal; @@ -423,14 +423,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime = cputime_add(gtime, t->gtime); + gtime += t->gtime; t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; thread_group_times(task, &utime, &stime); - gtime = cputime_add(gtime, sig->gtime); + gtime += sig->gtime; } sid = task_session_nr_ns(task, ns); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 2a30d67dd6b..714d5d131e7 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -30,7 +30,7 @@ static cputime64_t get_idle_time(int cpu) if (idle_time == -1ULL) { /* !NO_HZ so we can rely on cpustat.idle */ idle = kstat_cpu(cpu).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(cpu)); + idle += arch_idle_time(cpu); } else idle = nsecs_to_jiffies64(1000 * idle_time); @@ -63,23 +63,22 @@ static int show_stat(struct seq_file *p, void *v) struct timespec boottime; user = nice = system = idle = iowait = - irq = softirq = steal = cputime64_zero; - guest = guest_nice = cputime64_zero; + irq = softirq = steal = 0; + guest = guest_nice = 0; getboottime(&boottime); jif = boottime.tv_sec; for_each_possible_cpu(i) { - user = cputime64_add(user, kstat_cpu(i).cpustat.user); - nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); - system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, get_idle_time(i)); - iowait = cputime64_add(iowait, get_iowait_time(i)); - irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); - softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); - steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); - guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - guest_nice = cputime64_add(guest_nice, - kstat_cpu(i).cpustat.guest_nice); + user += kstat_cpu(i).cpustat.user; + nice += kstat_cpu(i).cpustat.nice; + system += kstat_cpu(i).cpustat.system; + idle += get_idle_time(i); + iowait += get_iowait_time(i); + irq += kstat_cpu(i).cpustat.irq; + softirq += kstat_cpu(i).cpustat.softirq; + steal += kstat_cpu(i).cpustat.steal; + guest += kstat_cpu(i).cpustat.guest; + guest_nice += kstat_cpu(i).cpustat.guest_nice; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 766b1d45605..ac5243657da 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -12,10 +12,10 @@ static int uptime_proc_show(struct seq_file *m, void *v) struct timespec uptime; struct timespec idle; int i; - cputime_t idletime = cputime_zero; + cputime_t idletime = 0; for_each_possible_cpu(i) - idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); + idletime += kstat_cpu(i).cpustat.idle; do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); -- cgit v1.2.3 From c3e0ef9a298e028a82ada28101ccd5cf64d209ee Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 15 Dec 2011 14:56:10 +0100 Subject: [S390] fix cputime overflow in uptime_proc_show For 32-bit architectures using standard jiffies the idletime calculation in uptime_proc_show will quickly overflow. It takes (2^32 / HZ) seconds of idle-time, or e.g. 12.45 days with no load on a quad-core with HZ=1000. Switch to 64-bit calculations. Cc: stable@vger.kernel.org Cc: Michael Abbott Signed-off-by: Martin Schwidefsky --- fs/proc/uptime.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index ac5243657da..ab515109fec 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -11,15 +11,20 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; struct timespec idle; + cputime64_t idletime; + u64 nsec; + u32 rem; int i; - cputime_t idletime = 0; + idletime = 0; for_each_possible_cpu(i) idletime += kstat_cpu(i).cpustat.idle; do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); - cputime_to_timespec(idletime, &idle); + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, (uptime.tv_nsec / (NSEC_PER_SEC / 100)), -- cgit v1.2.3 From 6b520e0565422966cdf1c3759bd73df77b0f248c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 12 Dec 2011 15:51:45 -0500 Subject: vfs: fix the stupidity with i_dentry in inode destructors Seeing that just about every destructor got that INIT_LIST_HEAD() copied into it, there is no point whatsoever keeping this INIT_LIST_HEAD in inode_init_once(); the cost of taking it into inode_init_always() will be negligible for pipes and sockets and negative for everything else. Not to mention the removal of boilerplate code from ->destroy_inode() instances... Signed-off-by: Al Viro --- fs/proc/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7737c5468a4..51a176622b8 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -77,7 +77,6 @@ static struct inode *proc_alloc_inode(struct super_block *sb) static void proc_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } -- cgit v1.2.3 From d161a13f974c72fd7ff0069d39a3ae57cb5694ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jul 2011 03:36:29 -0400 Subject: switch procfs to umode_t use both proc_dir_entry ->mode and populating functions Signed-off-by: Al Viro --- fs/proc/base.c | 2 +- fs/proc/generic.c | 8 ++++---- fs/proc/proc_net.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 851ba3dcdc2..65054d38ca2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -101,7 +101,7 @@ struct pid_entry { char *name; int len; - mode_t mode; + umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; union proc_op op; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 10090d9c7ad..2edf34f2eb6 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -597,7 +597,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, const char *name, - mode_t mode, + umode_t mode, nlink_t nlink) { struct proc_dir_entry *ent = NULL; @@ -659,7 +659,7 @@ struct proc_dir_entry *proc_symlink(const char *name, } EXPORT_SYMBOL(proc_symlink); -struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, +struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, struct proc_dir_entry *parent) { struct proc_dir_entry *ent; @@ -699,7 +699,7 @@ struct proc_dir_entry *proc_mkdir(const char *name, } EXPORT_SYMBOL(proc_mkdir); -struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, +struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode, struct proc_dir_entry *parent) { struct proc_dir_entry *ent; @@ -728,7 +728,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, } EXPORT_SYMBOL(create_proc_entry); -struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, +struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops, void *data) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index f738024ccc8..06e1cc17caf 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -179,7 +179,7 @@ const struct file_operations proc_net_operations = { struct proc_dir_entry *proc_net_fops_create(struct net *net, - const char *name, mode_t mode, const struct file_operations *fops) + const char *name, umode_t mode, const struct file_operations *fops) { return proc_create(name, mode, net->proc_net, fops); } -- cgit v1.2.3 From 0226f4923f6c9b40cfa1c1c1b19a6ac6b3924ead Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 6 Dec 2011 12:21:54 -0500 Subject: vfs: take /proc/*/mounts and friends to fs/proc_namespace.c rationale: that stuff is far tighter bound to fs/namespace.c than to the guts of procfs proper. Signed-off-by: Al Viro --- fs/proc/base.c | 114 --------------------------------------------------------- 1 file changed, 114 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 851ba3dcdc2..07446b55b7c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -631,120 +631,6 @@ static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; -static int mounts_open_common(struct inode *inode, struct file *file, - const struct seq_operations *op) -{ - struct task_struct *task = get_proc_task(inode); - struct nsproxy *nsp; - struct mnt_namespace *ns = NULL; - struct path root; - struct proc_mounts *p; - int ret = -EINVAL; - - if (task) { - rcu_read_lock(); - nsp = task_nsproxy(task); - if (nsp) { - ns = nsp->mnt_ns; - if (ns) - get_mnt_ns(ns); - } - rcu_read_unlock(); - if (ns && get_task_root(task, &root) == 0) - ret = 0; - put_task_struct(task); - } - - if (!ns) - goto err; - if (ret) - goto err_put_ns; - - ret = -ENOMEM; - p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); - if (!p) - goto err_put_path; - - file->private_data = &p->m; - ret = seq_open(file, op); - if (ret) - goto err_free; - - p->m.private = p; - p->ns = ns; - p->root = root; - p->m.poll_event = ns->event; - - return 0; - - err_free: - kfree(p); - err_put_path: - path_put(&root); - err_put_ns: - put_mnt_ns(ns); - err: - return ret; -} - -static int mounts_release(struct inode *inode, struct file *file) -{ - struct proc_mounts *p = file->private_data; - path_put(&p->root); - put_mnt_ns(p->ns); - return seq_release(inode, file); -} - -static unsigned mounts_poll(struct file *file, poll_table *wait) -{ - struct proc_mounts *p = file->private_data; - unsigned res = POLLIN | POLLRDNORM; - - poll_wait(file, &p->ns->poll, wait); - if (mnt_had_events(p)) - res |= POLLERR | POLLPRI; - - return res; -} - -static int mounts_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mounts_op); -} - -static const struct file_operations proc_mounts_operations = { - .open = mounts_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, - .poll = mounts_poll, -}; - -static int mountinfo_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mountinfo_op); -} - -static const struct file_operations proc_mountinfo_operations = { - .open = mountinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, - .poll = mounts_poll, -}; - -static int mountstats_open(struct inode *inode, struct file *file) -{ - return mounts_open_common(inode, file, &mountstats_op); -} - -static const struct file_operations proc_mountstats_operations = { - .open = mountstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mounts_release, -}; - #define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ static ssize_t proc_info_read(struct file * file, char __user * buf, -- cgit v1.2.3 From d10577a8d86a0c735488d66d32289a6d66bcfa20 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Dec 2011 13:06:11 -0500 Subject: vfs: trim includes a bit [folded fix for missing magic.h from Tetsuo Handa] Signed-off-by: Al Viro --- fs/proc/namespaces.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index be177f702ac..27da860115c 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include "internal.h" -- cgit v1.2.3 From 69f594a38967f4540ce7a29b3fd214e68a8330bd Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 12:25:15 -0500 Subject: ptrace: do not audit capability check when outputing /proc/pid/stat Reading /proc/pid/stat of another process checks if one has ptrace permissions on that process. If one does have permissions it outputs some data about the process which might have security and attack implications. If the current task does not have ptrace permissions the read still works, but those fields are filled with inocuous (0) values. Since this check and a subsequent denial is not a violation of the security policy we should not audit such denials. This can be quite useful to removing ptrace broadly across a system without flooding the logs when ps is run or something which harmlessly walks proc. Signed-off-by: Eric Paris Acked-by: Serge E. Hallyn --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 3a1dafd228d..ddffd7a88b9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ); + permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); -- cgit v1.2.3 From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:08:09 -0800 Subject: tracepoint: add tracepoints for debugging oom_score_adj oom_score_adj is used for guarding processes from OOM-Killer. One of problem is that it's inherited at fork(). When a daemon set oom_score_adj and make children, it's hard to know where the value is set. This patch adds some tracepoints useful for debugging. This patch adds 3 trace points. - creating new task - renaming a task (exec) - set oom_score_adj To debug, users need to enable some trace pointer. Maybe filtering is useful as # EVENT=/sys/kernel/debug/tracing/events/task/ # echo "oom_score_adj != 0" > $EVENT/task_newtask/filter # echo "oom_score_adj != 0" > $EVENT/task_rename/filter # echo 1 > $EVENT/enable # EVENT=/sys/kernel/debug/tracing/events/oom/ # echo 1 > $EVENT/enable output will be like this. # grep oom /sys/kernel/debug/tracing/trace bash-7699 [007] d..3 5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000 bash-7699 [007] ...1 5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000 ls-7729 [003] ...2 5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000 bash-7699 [002] ...1 5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000 grep-7730 [007] ...2 5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000 Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index a1dddda999f..1aab5fe05a1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #ifdef CONFIG_HARDWALL #include #endif +#include #include "internal.h" /* NOTE: @@ -1010,6 +1011,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + trace_oom_score_adj_update(task); err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1097,6 +1099,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. -- cgit v1.2.3 From 7773fbc54182a90cd248656619c7d33859e5f91d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 10 Jan 2012 15:11:20 -0800 Subject: procfs: make proc_get_link to use dentry instead of inode Prepare the ground for the next "map_files" patch which needs a name of a link file to analyse. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Tejun Heo Cc: Vasiliy Kulikov Cc: "Kirill A. Shutemov" Cc: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1aab5fe05a1..e31d95055c6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -166,9 +166,9 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } -static int proc_cwd_link(struct inode *inode, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -183,9 +183,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path) return result; } -static int proc_root_link(struct inode *inode, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -1456,13 +1456,13 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -static int proc_exe_link(struct inode *inode, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(inode); + task = get_proc_task(dentry->d_inode); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1492,7 +1492,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); + error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); out: return ERR_PTR(error); } @@ -1531,7 +1531,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &path); + error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; @@ -1823,9 +1823,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) return -ENOENT; } -static int proc_fd_link(struct inode *inode, struct path *path) +static int proc_fd_link(struct dentry *dentry, struct path *path) { - return proc_fd_info(inode, path, NULL); + return proc_fd_info(dentry->d_inode, path, NULL); } static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) -- cgit v1.2.3 From 640708a2cff7f81e246243b0073c66e6ece7e53e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 10 Jan 2012 15:11:23 -0800 Subject: procfs: introduce the /proc//map_files/ directory This one behaves similarly to the /proc//fd/ one - it contains symlinks one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end", the target is the file. Opening a symlink results in a file that point exactly to the same inode as them vma's one. For example the ls -l of some arbitrary /proc//map_files/ | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so This *helps* checkpointing process in three ways: 1. When dumping a task mappings we do know exact file that is mapped by particular region. We do this by opening /proc/$pid/map_files/$address symlink the way we do with file descriptors. 2. This also helps in determining which anonymous shared mappings are shared with each other by comparing the inodes of them. 3. When restoring a set of processes in case two of them has a mapping shared, we map the memory by the 1st one and then open its /proc/$pid/map_files/$address file and map it by the 2nd task. Using /proc/$pid/maps for this is quite inconvenient since it brings repeatable re-reading and reparsing for this text file which slows down restore procedure significantly. Also as being pointed in (3) it is a way easier to use top level shared mapping in children as /proc/$pid/map_files/$address when needed. [akpm@linux-foundation.org: coding-style fixes] [gorcunov@openvz.org: make map_files depend on CHECKPOINT_RESTORE] Signed-off-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Reviewed-by: Vasiliy Kulikov Reviewed-by: "Kirill A. Shutemov" Cc: Tejun Heo Cc: Alexey Dobriyan Cc: Al Viro Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 355 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 355 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index e31d95055c6..4d755fed3ec 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -83,6 +83,7 @@ #include #include #include +#include #ifdef CONFIG_HARDWALL #include #endif @@ -134,6 +135,8 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = show } ) +static int proc_fd_permission(struct inode *inode, int mask); + /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. @@ -2046,6 +2049,355 @@ static const struct file_operations proc_fd_operations = { .llseek = default_llseek, }; +#ifdef CONFIG_CHECKPOINT_RESTORE + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. + */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) +{ + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + return -EINVAL; + + return 0; +} + +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int status = 0; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + if (!capable(CAP_SYS_ADMIN)) { + status = -EACCES; + goto out_notask; + } + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (!task) + goto out_notask; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + mm = get_task_mm(task); + if (!mm) + goto out; + + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + down_read(&mm->mmap_sem); + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); + up_read(&mm->mmap_sem); + } + + mmput(mm); + + if (exact_vma_exists) { + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + status = 1; + } + +out: + put_task_struct(task); + +out_notask: + if (status <= 0) + d_drop(dentry); + + return status; +} + +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(dentry->d_inode); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + up_read(&mm->mmap_sem); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + struct file *file; + unsigned long len; + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ +}; + +static struct dentry * +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct file *file = ptr; + struct proc_inode *ei; + struct inode *inode; + + if (!file) + return ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->op.proc_get_link = proc_map_files_get_link; + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + inode->i_mode = S_IFLNK; + + if (file->f_mode & FMODE_READ) + inode->i_mode |= S_IRUSR; + if (file->f_mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); + d_add(dentry, inode); + + return NULL; +} + +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct dentry *result; + struct mm_struct *mm; + + result = ERR_PTR(-EACCES); + if (!capable(CAP_SYS_ADMIN)) + goto out; + + result = ERR_PTR(-ENOENT); + task = get_proc_task(dir); + if (!task) + goto out; + + result = ERR_PTR(-EACCES); + if (lock_trace(task)) + goto out_put_task; + + result = ERR_PTR(-ENOENT); + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_unlock; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); + +out_no_vma: + up_read(&mm->mmap_sem); + mmput(mm); +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return result; +} + +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + ino_t ino; + int ret; + + ret = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENOENT; + task = get_proc_task(inode); + if (!task) + goto out; + + ret = -EACCES; + if (lock_trace(task)) + goto out_put_task; + + ret = 0; + switch (filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + default: + { + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + down_read(&mm->mmap_sem); + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ + + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > filp->f_pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_unlock; + } + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= filp->f_pos) + continue; + + get_file(vma->vm_file); + info.file = vma->vm_file; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); + } + } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + ret = proc_fill_cache(filp, dirent, filldir, + p->name, p->len, + proc_map_files_instantiate, + task, p->file); + if (ret) + break; + filp->f_pos++; + fput(p->file); + } + for (; i < nr_files; i++) { + /* + * In case of error don't forget + * to put rest of file refs. + */ + p = flex_array_get(fa, i); + fput(p->file); + } + if (fa) + flex_array_free(fa); + mmput(mm); + } + } + +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return ret; +} + +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .readdir = proc_map_files_readdir, + .llseek = default_llseek, +}; + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /* * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). @@ -2661,6 +3013,9 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), +#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET -- cgit v1.2.3 From 97412950b10e64f347aec4a9b759395c2465adf6 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 10 Jan 2012 15:11:27 -0800 Subject: procfs: parse mount options Add support for procfs mount options. Actual mount options are coming in the next patches. Signed-off-by: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Al Viro Cc: Randy Dunlap Cc: "H. Peter Anvin" Cc: Greg KH Cc: Theodore Tso Cc: Alan Cox Cc: James Morris Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 10 ++++++++++ fs/proc/internal.h | 1 + fs/proc/root.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 64 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 51a176622b8..27c762f3487 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -17,7 +18,9 @@ #include #include #include +#include #include +#include #include #include @@ -101,12 +104,19 @@ void __init proc_init_inodecache(void) init_once); } +static int proc_show_options(struct seq_file *seq, struct dentry *root) +{ + return 0; +} + static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, + .remount_fs = proc_remount, + .show_options = proc_show_options, }; static void __pde_users_dec(struct proc_dir_entry *pde) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7838e5cfec1..292577531ad 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde); int proc_fill_super(struct super_block *); struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +int proc_remount(struct super_block *sb, int *flags, char *data); /* * These are generic /proc routines that use the internal diff --git a/fs/proc/root.c b/fs/proc/root.c index 03102d97818..6a8ac1d361a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" @@ -36,6 +37,48 @@ static int proc_set_super(struct super_block *sb, void *data) return err; } +enum { + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_err, NULL}, +}; + +static int proc_parse_options(char *options, struct pid_namespace *pid) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + pr_debug("proc: options = %s\n", options); + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + switch (token) { + default: + pr_err("proc: unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + + return 1; +} + +int proc_remount(struct super_block *sb, int *flags, char *data) +{ + struct pid_namespace *pid = sb->s_fs_info; + return !proc_parse_options(data, pid); +} + static struct dentry *proc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -43,11 +86,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, struct super_block *sb; struct pid_namespace *ns; struct proc_inode *ei; + char *options; - if (flags & MS_KERNMOUNT) + if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; - else + options = NULL; + } else { ns = current->nsproxy->pid_ns; + options = data; + } sb = sget(fs_type, proc_test_super, proc_set_super, ns); if (IS_ERR(sb)) @@ -55,6 +102,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (!sb->s_root) { sb->s_flags = flags; + if (!proc_parse_options(options, ns)) { + deactivate_locked_super(sb); + return ERR_PTR(-EINVAL); + } err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); -- cgit v1.2.3 From 0499680a42141d86417a8fbaa8c8db806bea1201 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 10 Jan 2012 15:11:31 -0800 Subject: procfs: add hidepid= and gid= mount options Add support for mount options to restrict access to /proc/PID/ directories. The default backward-compatible "relaxed" behaviour is left untouched. The first mount option is called "hidepid" and its value defines how much info about processes we want to be available for non-owners: hidepid=0 (default) means the old behavior - anybody may read all world-readable /proc/PID/* files. hidepid=1 means users may not access any /proc// directories, but their own. Sensitive files like cmdline, sched*, status are now protected against other users. As permission checking done in proc_pid_permission() and files' permissions are left untouched, programs expecting specific files' modes are not confused. hidepid=2 means hidepid=1 plus all /proc/PID/ will be invisible to other users. It doesn't mean that it hides whether a process exists (it can be learned by other means, e.g. by kill -0 $PID), but it hides process' euid and egid. It compicates intruder's task of gathering info about running processes, whether some daemon runs with elevated privileges, whether another user runs some sensitive program, whether other users run any program at all, etc. gid=XXX defines a group that will be able to gather all processes' info (as in hidepid=0 mode). This group should be used instead of putting nonroot user in sudoers file or something. However, untrusted users (like daemons, etc.) which are not supposed to monitor the tasks in the whole system should not be added to the group. hidepid=1 or higher is designed to restrict access to procfs files, which might reveal some sensitive private information like precise keystrokes timings: http://www.openwall.com/lists/oss-security/2011/11/05/3 hidepid=1/2 doesn't break monitoring userspace tools. ps, top, pgrep, and conky gracefully handle EPERM/ENOENT and behave as if the current user is the only user running processes. pstree shows the process subtree which contains "pstree" process. Note: the patch doesn't deal with setuid/setgid issues of keeping preopened descriptors of procfs files (like https://lkml.org/lkml/2011/2/7/368). We rely on that the leaked information like the scheduling counters of setuid apps doesn't threaten anybody's privacy - only the user started the setuid program may read the counters. Signed-off-by: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Al Viro Cc: Randy Dunlap Cc: "H. Peter Anvin" Cc: Greg KH Cc: Theodore Tso Cc: Alan Cox Cc: James Morris Cc: Oleg Nesterov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/proc/inode.c | 8 +++++++ fs/proc/root.c | 21 +++++++++++++++--- 3 files changed, 94 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 4d755fed3ec..8173dfd89cb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -631,6 +631,50 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) return 0; } +/* + * May current process learn task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? + */ +static bool has_pid_permissions(struct pid_namespace *pid, + struct task_struct *task, + int hide_pid_min) +{ + if (pid->hide_pid < hide_pid_min) + return true; + if (in_group_p(pid->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ); +} + + +static int proc_pid_permission(struct inode *inode, int mask) +{ + struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct task_struct *task; + bool has_perms; + + task = get_proc_task(inode); + has_perms = has_pid_permissions(pid, task, 1); + put_task_struct(task); + + if (!has_perms) { + if (pid->hide_pid == 2) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. + */ + return -ENOENT; + } + + return -EPERM; + } + return generic_permission(inode, mask); +} + + + static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; @@ -1615,6 +1659,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) struct inode *inode = dentry->d_inode; struct task_struct *task; const struct cred *cred; + struct pid_namespace *pid = dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -1623,6 +1668,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) stat->gid = 0; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { + if (!has_pid_permissions(pid, task, 2)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir(). + */ + return -ENOENT; + } if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { cred = __task_cred(task); @@ -3119,6 +3172,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) @@ -3322,6 +3376,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi proc_pid_instantiate, iter.task, NULL); } +static int fake_filldir(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned d_type) +{ + return 0; +} + /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { @@ -3329,6 +3389,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; + filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) goto out_no_task; @@ -3350,8 +3411,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { + if (has_pid_permissions(ns, iter.task, 2)) + __filldir = filldir; + else + __filldir = fake_filldir; + filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { put_task_struct(iter.task); goto out; } @@ -3686,6 +3752,7 @@ static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 27c762f3487..84fd3235a59 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -106,6 +106,14 @@ void __init proc_init_inodecache(void) static int proc_show_options(struct seq_file *seq, struct dentry *root) { + struct super_block *sb = root->d_sb; + struct pid_namespace *pid = sb->s_fs_info; + + if (pid->pid_gid) + seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (pid->hide_pid != 0) + seq_printf(seq, ",hidepid=%u", pid->hide_pid); + return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 6a8ac1d361a..46a15d8a29c 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,10 +38,12 @@ static int proc_set_super(struct super_block *sb, void *data) } enum { - Opt_err, + Opt_gid, Opt_hidepid, Opt_err, }; static const match_table_t tokens = { + {Opt_hidepid, "hidepid=%u"}, + {Opt_gid, "gid=%u"}, {Opt_err, NULL}, }; @@ -49,8 +51,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) { char *p; substring_t args[MAX_OPT_ARGS]; - - pr_debug("proc: options = %s\n", options); + int option; if (!options) return 1; @@ -63,6 +64,20 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) args[0].to = args[0].from = 0; token = match_token(p, tokens, args); switch (token) { + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + pid->pid_gid = option; + break; + case Opt_hidepid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 2) { + pr_err("proc: hidepid value must be between 0 and 2.\n"); + return 0; + } + pid->hide_pid = option; + break; default: pr_err("proc: unrecognized mount option \"%s\" " "or missing value\n", p); -- cgit v1.2.3 From a2ef990ab5a6705a356d146dd773a3b359787497 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 12 Jan 2012 17:17:08 -0800 Subject: proc: fix null pointer deref in proc_pid_permission() get_proc_task() can fail to search the task and return NULL, put_task_struct() will then bomb the kernel with following oops: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] proc_pid_permission+0x64/0xe0 PGD 112075067 PUD 112814067 PMD 0 Oops: 0002 [#1] PREEMPT SMP This is a regression introduced by commit 0499680a ("procfs: add hidepid= and gid= mount options"). The kernel should return -ESRCH if get_proc_task() failed. Signed-off-by: Xiaotian Feng Cc: Al Viro Cc: Vasiliy Kulikov Cc: Stephen Wilson Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 8173dfd89cb..5485a5388ec 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -654,6 +654,8 @@ static int proc_pid_permission(struct inode *inode, int mask) bool has_perms; task = get_proc_task(inode); + if (!task) + return -ESRCH; has_perms = has_pid_permissions(pid, task, 1); put_task_struct(task); -- cgit v1.2.3 From b3f7f573a20081910e34e99cbc91831f4f02f1ff Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 12 Jan 2012 17:20:53 -0800 Subject: c/r: procfs: add start_data, end_data, start_brk members to /proc/$pid/stat v4 The mm->start_code/end_code, mm->start_data/end_data, mm->start_brk are involved into calculation of program text/data segment sizes (which might be seen in /proc//statm) and into brk() call final address. For restore we need to know all these values. While mm->start_code/end_code already present in /proc/$pid/stat, the rest members are not, so this patch brings them in. The restore procedure of these members is addressed in another patch using prctl(). Signed-off-by: Cyrill Gorcunov Acked-by: Serge Hallyn Reviewed-by: Kees Cook Reviewed-by: KAMEZAWA Hiroyuki Cc: Alexey Dobriyan Cc: Tejun Heo Cc: Andrew Vagin Cc: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 8c344f037bd..9252ee3b71e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", pid_nr_ns(pid, ns), tcomm, state, @@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + cputime_to_clock_t(cgtime), + (mm && permitted) ? mm->start_data : 0, + (mm && permitted) ? mm->end_data : 0, + (mm && permitted) ? mm->start_brk : 0); if (mm) mmput(mm); return 0; -- cgit v1.2.3 From f7e6746ebae984ea67b0a1a1e23c7e6698240631 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 14 Jan 2012 00:01:01 +0000 Subject: sched/accounting, proc: Fix /proc/stat interrupts sum Commit 3292beb340c7688 ("sched/accounting: Change cpustat fields to an array") deleted the code which provides us with the sum of all interrupts in the system, causing vmstat to report zero interrupts occuring in the system. Fix this by restoring the code. Signed-off-by: Russell King Tested-by: Russell King # [on ARM] Tested-by: Tony Luck Tested-by: Steven Rostedt Cc: Glauber Costa Cc: KAMEZAWA Hiroyuki Cc: Linus Torvalds Cc: Andrew Morton Cc: Paul Tuner Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- fs/proc/stat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index d76ca6ae2b1..121f77cfef7 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -77,6 +77,8 @@ static int show_stat(struct seq_file *p, void *v) steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + sum += kstat_cpu_irqs_sum(i); + sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); -- cgit v1.2.3 From 0a300be6d5be8f66cd96609334710c268d0bfdce Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: remove task argument to audit_set_loginuid The function always deals with current. Don't expose an option pretending one can use it for something. You can't. Signed-off-by: Eric Paris --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 8173dfd89cb..e3cbebbabeb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1228,7 +1228,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(current, loginuid); + length = audit_set_loginuid(loginuid); if (likely(length == 0)) length = count; -- cgit v1.2.3 From 633b45454503489209b0d9a45f9e3cd1b852c614 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:08 -0500 Subject: audit: only allow tasks to set their loginuid if it is -1 At the moment we allow tasks to set their loginuid if they have CAP_AUDIT_CONTROL. In reality we want tasks to set the loginuid when they log in and it be impossible to ever reset. We had to make it mutable even after it was once set (with the CAP) because on update and admin might have to restart sshd. Now sshd would get his loginuid and the next user which logged in using ssh would not be able to set his loginuid. Systemd has changed how userspace works and allowed us to make the kernel work the way it should. With systemd users (even admins) are not supposed to restart services directly. The system will restart the service for them. Thus since systemd is going to loginuid==-1, sshd would get -1, and sshd would be allowed to set a new loginuid without special permissions. If an admin in this system were to manually start an sshd he is inserting himself into the system chain of trust and thus, logically, it's his loginuid that should be used! Since we have old systems I make this a Kconfig option. Signed-off-by: Eric Paris --- fs/proc/base.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index e3cbebbabeb..482df23036b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1197,9 +1197,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, ssize_t length; uid_t loginuid; - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - rcu_read_lock(); if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { rcu_read_unlock(); -- cgit v1.2.3 From e268337dfe26dfc7efd422a804dbb27977a3cccc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Jan 2012 15:21:19 -0800 Subject: proc: clean up and fix /proc//mem handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jüri Aedla reported that the /proc//mem handling really isn't very robust, and it also doesn't match the permission checking of any of the other related files. This changes it to do the permission checks at open time, and instead of tracking the process, it tracks the VM at the time of the open. That simplifies the code a lot, but does mean that if you hold the file descriptor open over an execve(), you'll continue to read from the _old_ VM. That is different from our previous behavior, but much simpler. If somebody actually finds a load where this matters, we'll need to revert this commit. I suspect that nobody will ever notice - because the process mapping addresses will also have changed as part of the execve. So you cannot actually usefully access the fd across a VM change simply because all the offsets for IO would have changed too. Reported-by: Jüri Aedla Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/proc/base.c | 145 ++++++++++++++++----------------------------------------- 1 file changed, 39 insertions(+), 106 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 5485a5388ec..662ddf2ec4f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -198,65 +198,7 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static struct mm_struct *__check_mem_permission(struct task_struct *task) -{ - struct mm_struct *mm; - - mm = get_task_mm(task); - if (!mm) - return ERR_PTR(-EINVAL); - - /* - * A task can always look at itself, in case it chooses - * to use system calls instead of load instructions. - */ - if (task == current) - return mm; - - /* - * If current is actively ptrace'ing, and would also be - * permitted to freshly attach with ptrace now, permit it. - */ - if (task_is_stopped_or_traced(task)) { - int match; - rcu_read_lock(); - match = (ptrace_parent(task) == current); - rcu_read_unlock(); - if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) - return mm; - } - - /* - * No one else is allowed. - */ - mmput(mm); - return ERR_PTR(-EPERM); -} - -/* - * If current may access user memory in @task return a reference to the - * corresponding mm, otherwise ERR_PTR. - */ -static struct mm_struct *check_mem_permission(struct task_struct *task) -{ - struct mm_struct *mm; - int err; - - /* - * Avoid racing if task exec's as we might get a new mm but validate - * against old credentials. - */ - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = __check_mem_permission(task); - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; -} - -struct mm_struct *mm_for_maps(struct task_struct *task) +static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; int err; @@ -267,7 +209,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, PTRACE_MODE_READ)) { + !ptrace_may_access(task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } @@ -276,6 +218,11 @@ struct mm_struct *mm_for_maps(struct task_struct *task) return mm; } +struct mm_struct *mm_for_maps(struct task_struct *task) +{ + return mm_access(task, PTRACE_MODE_READ); +} + static int proc_pid_cmdline(struct task_struct *task, char * buffer) { int res = 0; @@ -752,38 +699,39 @@ static const struct file_operations proc_single_file_operations = { static int mem_open(struct inode* inode, struct file* file) { - file->private_data = (void*)((long)current->self_exec_id); + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct mm_struct *mm; + + if (!task) + return -ESRCH; + + mm = mm_access(task, PTRACE_MODE_ATTACH); + put_task_struct(task); + + if (IS_ERR(mm)) + return PTR_ERR(mm); + /* OK to pass negative loff_t, we can catch out-of-range */ file->f_mode |= FMODE_UNSIGNED_OFFSET; + file->private_data = mm; + return 0; } static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + int ret; char *page; unsigned long src = *ppos; - int ret = -ESRCH; - struct mm_struct *mm; + struct mm_struct *mm = file->private_data; - if (!task) - goto out_no_task; + if (!mm) + return 0; - ret = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - goto out; - - mm = check_mem_permission(task); - ret = PTR_ERR(mm); - if (IS_ERR(mm)) - goto out_free; - - ret = -EIO; - - if (file->private_data != (void*)((long)current->self_exec_id)) - goto out_put; + return -ENOMEM; ret = 0; @@ -810,13 +758,7 @@ static ssize_t mem_read(struct file * file, char __user * buf, } *ppos = src; -out_put: - mmput(mm); -out_free: free_page((unsigned long) page); -out: - put_task_struct(task); -out_no_task: return ret; } @@ -825,27 +767,15 @@ static ssize_t mem_write(struct file * file, const char __user *buf, { int copied; char *page; - struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); unsigned long dst = *ppos; - struct mm_struct *mm; + struct mm_struct *mm = file->private_data; - copied = -ESRCH; - if (!task) - goto out_no_task; + if (!mm) + return 0; - copied = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - goto out_task; - - mm = check_mem_permission(task); - copied = PTR_ERR(mm); - if (IS_ERR(mm)) - goto out_free; - - copied = -EIO; - if (file->private_data != (void *)((long)current->self_exec_id)) - goto out_mm; + return -ENOMEM; copied = 0; while (count > 0) { @@ -869,13 +799,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf, } *ppos = dst; -out_mm: - mmput(mm); -out_free: free_page((unsigned long) page); -out_task: - put_task_struct(task); -out_no_task: return copied; } @@ -895,11 +819,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) return file->f_pos; } +static int mem_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + mmput(mm); + return 0; +} + static const struct file_operations proc_mem_operations = { .llseek = mem_lseek, .read = mem_read, .write = mem_write, .open = mem_open, + .release = mem_release, }; static ssize_t environ_read(struct file *file, char __user *buf, -- cgit v1.2.3 From 85e72aa5384b1a614563ad63257ded0e91d1a620 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 20 Jan 2012 14:34:09 -0800 Subject: proc: clear_refs: do not clear reserved pages /proc/pid/clear_refs is used to clear the Referenced and YOUNG bits for pages and corresponding page table entries of the task with PID pid, which includes any special mappings inserted into the page tables in order to provide things like vDSOs and user helper functions. On ARM this causes a problem because the vectors page is mapped as a global mapping and since ec706dab ("ARM: add a vma entry for the user accessible vector page"), a VMA is also inserted into each task for this page to aid unwinding through signals and syscall restarts. Since the vectors page is required for handling faults, clearing the YOUNG bit (and subsequently writing a faulting pte) means that we lose the vectors page *globally* and cannot fault it back in. This results in a system deadlock on the next exception. To see this problem in action, just run: $ echo 1 > /proc/self/clear_refs on an ARM platform (as any user) and watch your system hang. I think this has been the case since 2.6.37 This patch avoids clearing the aforementioned bits for reserved pages, therefore leaving the vectors page intact on ARM. Since reserved pages are not candidates for swap, this change should not have any impact on the usefulness of clear_refs. Signed-off-by: Will Deacon Reported-by: Moussa Ba Acked-by: Hugh Dickins Cc: David Rientjes Cc: Russell King Acked-by: Nicolas Pitre Cc: Matt Mackall Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e418c5abdb0..7dcd2a25049 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -518,6 +518,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!page) continue; + if (PageReserved(page)) + continue; + /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); ClearPageReferenced(page); -- cgit v1.2.3 From 36885d7b1121c779e4060d45472fe53a5b21e09f Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 10 Jun 2011 02:36:05 -0300 Subject: sysctl: remove impossible condition check Remove checks for conditions that will never happen. If procname is NULL the loop would already had bailed out, so there's no need to check it again. At the same time this also compacts the function find_in_table() by refactoring it to be easier to read. Signed-off-by: Lucas De Marchi Reviewed-by: Jesper Juhl Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a6b62173d4c..d82f4a8b4b8 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -59,17 +59,11 @@ out: static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) { - int len; for ( ; p->procname; p++) { - - if (!p->procname) - continue; - - len = strlen(p->procname); - if (len != name->len) + if (strlen(p->procname) != name->len) continue; - if (memcmp(p->procname, name->name, len) != 0) + if (memcmp(p->procname, name->name, name->len) != 0) continue; /* I have a match */ @@ -266,10 +260,6 @@ static int scan(struct ctl_table_header *head, ctl_table *table, for (; table->procname; table++, (*pos)++) { int res; - /* Can't do anything without a proc name */ - if (!table->procname) - continue; - if (*pos < file->f_pos) continue; -- cgit v1.2.3 From de4e83bd6b5e16d491ec068cd22801d5d063b07a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Jan 2012 03:34:20 -0800 Subject: sysctl: Register the base sysctl table like any other sysctl table. Simplify the code by treating the base sysctl table like any other sysctl table and register it with register_sysctl_table. To ensure this table is registered early enough to avoid problems call sysctl_init from proc_sys_init. Rename sysctl_net.c:sysctl_init() to net_sysctl_init() to avoid name conflicts now that kernel/sysctl.c:sysctl_init() is no longer static. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d82f4a8b4b8..9d29d28af57 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -468,5 +468,6 @@ int __init proc_sys_init(void) proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_fops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; - return 0; + + return sysctl_init(); } -- cgit v1.2.3 From 1f87f0b52b1d6581168cb80f86746bc4df918d01 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Jan 2012 04:07:15 -0800 Subject: sysctl: Move the implementation into fs/proc/proc_sysctl.c Move the core sysctl code from kernel/sysctl.c and kernel/sysctl_check.c into fs/proc/proc_sysctl.c. Currently sysctl maintenance is hampered by the sysctl implementation being split across 3 files with artificial layering between them. Consolidate the entire sysctl implementation into 1 file so that it is easier to see what is going on and hopefully allowing for simpler maintenance. For functions that are now only used in fs/proc/proc_sysctl.c remove their declarations from sysctl.h and make them static in fs/proc/proc_sysctl.c Signed-off-by: Eric W. Biederman --- fs/proc/internal.h | 3 + fs/proc/proc_sysctl.c | 622 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 625 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 292577531ad..3b5ecd960d6 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -10,12 +10,15 @@ */ #include +struct ctl_table_header; extern struct proc_dir_entry proc_root; #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } +static inline void sysctl_head_put(struct ctl_table_header *head) { } #endif #ifdef CONFIG_NET extern int proc_net_init(void); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 9d29d28af57..06e6f10ee8e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -24,6 +25,209 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll) wake_up_interruptible(&poll->wait); } +static struct ctl_table root_table[1]; +static struct ctl_table_root sysctl_table_root; +static struct ctl_table_header root_table_header = { + {{.count = 1, + .ctl_table = root_table, + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, + .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, +}; +static struct ctl_table_root sysctl_table_root = { + .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), + .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), +}; + +static DEFINE_SPINLOCK(sysctl_lock); + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); + } + /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + list_del_init(&p->ctl_entry); +} + +static void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + if (!head) + BUG(); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + +static void sysctl_head_finish(struct ctl_table_header *head) +{ + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + +static struct list_head * +lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = lookup_header_set(root, namespaces); + return &set->list; +} + +static struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, + struct ctl_table_header *prev) +{ + struct ctl_table_root *root; + struct list_head *header_list; + struct ctl_table_header *head; + struct list_head *tmp; + + spin_lock(&sysctl_lock); + if (prev) { + head = prev; + tmp = &prev->ctl_entry; + unuse_table(prev); + goto next; + } + tmp = &root_table_header.ctl_entry; + for (;;) { + head = list_entry(tmp, struct ctl_table_header, ctl_entry); + + if (!use_table(head)) + goto next; + spin_unlock(&sysctl_lock); + return head; + next: + root = head->root; + tmp = tmp->next; + header_list = lookup_header_list(root, namespaces); + if (tmp != header_list) + continue; + + do { + root = list_entry(root->root_list.next, + struct ctl_table_root, root_list); + if (root == &sysctl_table_root) + goto out; + header_list = lookup_header_list(root, namespaces); + } while (list_empty(header_list)); + tmp = header_list->next; + } +out: + spin_unlock(&sysctl_lock); + return NULL; +} + +static struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) +{ + return __sysctl_head_next(current->nsproxy, prev); +} + +void register_sysctl_root(struct ctl_table_root *root) +{ + spin_lock(&sysctl_lock); + list_add_tail(&root->root_list, &sysctl_table_root.root_list); + spin_unlock(&sysctl_lock); +} + +/* + * sysctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ + if (!current_euid()) + mode >>= 6; + else if (in_egroup_p(0)) + mode >>= 3; + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) +{ + int mode; + + if (root->permissions) + mode = root->permissions(root, current->nsproxy, table); + else + mode = table->mode; + + return test_perm(mode, op); +} + +static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) +{ + for (; table->procname; table++) { + table->parent = parent; + if (table->child) + sysctl_set_parent(table, table->child); + } +} + + static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -435,6 +639,21 @@ static int proc_sys_delete(const struct dentry *dentry) return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } +static int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + static int proc_sys_compare(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, @@ -460,6 +679,409 @@ static const struct dentry_operations proc_sys_dentry_operations = { .d_compare = proc_sys_compare, }; +static struct ctl_table *is_branch_in(struct ctl_table *branch, + struct ctl_table *table) +{ + struct ctl_table *p; + const char *s = branch->procname; + + /* branch should have named subdirectory as its first element */ + if (!s || !branch->child) + return NULL; + + /* ... and nothing else */ + if (branch[1].procname) + return NULL; + + /* table should contain subdirectory with the same name */ + for (p = table; p->procname; p++) { + if (!p->child) + continue; + if (p->procname && strcmp(p->procname, s) == 0) + return p; + } + return NULL; +} + +/* see if attaching q to p would be an improvement */ +static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) +{ + struct ctl_table *to = p->ctl_table, *by = q->ctl_table; + struct ctl_table *next; + int is_better = 0; + int not_in_parent = !p->attached_by; + + while ((next = is_branch_in(by, to)) != NULL) { + if (by == q->attached_by) + is_better = 1; + if (to == p->attached_by) + not_in_parent = 1; + by = by->child; + to = next->child; + } + + if (is_better && not_in_parent) { + q->attached_by = by; + q->attached_to = to; + q->parent = p; + } +} + +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK +static int sysctl_depth(struct ctl_table *table) +{ + struct ctl_table *tmp; + int depth; + + depth = 0; + for (tmp = table; tmp->parent; tmp = tmp->parent) + depth++; + + return depth; +} + +static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) +{ + int i; + + for (i = 0; table && i < n; i++) + table = table->parent; + + return table; +} + + +static void sysctl_print_path(struct ctl_table *table) +{ + struct ctl_table *tmp; + int depth, i; + depth = sysctl_depth(table); + if (table->procname) { + for (i = depth; i >= 0; i--) { + tmp = sysctl_parent(table, i); + printk("/%s", tmp->procname?tmp->procname:""); + } + } + printk(" "); +} + +static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, + struct ctl_table *table) +{ + struct ctl_table_header *head; + struct ctl_table *ref, *test; + int depth, cur_depth; + + depth = sysctl_depth(table); + + for (head = __sysctl_head_next(namespaces, NULL); head; + head = __sysctl_head_next(namespaces, head)) { + cur_depth = depth; + ref = head->ctl_table; +repeat: + test = sysctl_parent(table, cur_depth); + for (; ref->procname; ref++) { + int match = 0; + if (cur_depth && !ref->child) + continue; + + if (test->procname && ref->procname && + (strcmp(test->procname, ref->procname) == 0)) + match++; + + if (match) { + if (cur_depth != 0) { + cur_depth--; + ref = ref->child; + goto repeat; + } + goto out; + } + } + } + ref = NULL; +out: + sysctl_head_finish(head); + return ref; +} + +static void set_fail(const char **fail, struct ctl_table *table, const char *str) +{ + if (*fail) { + printk(KERN_ERR "sysctl table check failed: "); + sysctl_print_path(table); + printk(" %s\n", *fail); + dump_stack(); + } + *fail = str; +} + +static void sysctl_check_leaf(struct nsproxy *namespaces, + struct ctl_table *table, const char **fail) +{ + struct ctl_table *ref; + + ref = sysctl_check_lookup(namespaces, table); + if (ref && (ref != table)) + set_fail(fail, table, "Sysctl already exists"); +} + +static int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) +{ + int error = 0; + for (; table->procname; table++) { + const char *fail = NULL; + + if (table->parent) { + if (!table->parent->procname) + set_fail(&fail, table, "Parent without procname"); + } + if (table->child) { + if (table->data) + set_fail(&fail, table, "Directory with data?"); + if (table->maxlen) + set_fail(&fail, table, "Directory with maxlen?"); + if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) + set_fail(&fail, table, "Writable sysctl directory"); + if (table->proc_handler) + set_fail(&fail, table, "Directory with proc_handler"); + if (table->extra1) + set_fail(&fail, table, "Directory with extra1"); + if (table->extra2) + set_fail(&fail, table, "Directory with extra2"); + } else { + if ((table->proc_handler == proc_dostring) || + (table->proc_handler == proc_dointvec) || + (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_userhz_jiffies) || + (table->proc_handler == proc_dointvec_ms_jiffies) || + (table->proc_handler == proc_doulongvec_minmax) || + (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!table->data) + set_fail(&fail, table, "No data"); + if (!table->maxlen) + set_fail(&fail, table, "No maxlen"); + } +#ifdef CONFIG_PROC_SYSCTL + if (!table->proc_handler) + set_fail(&fail, table, "No proc_handler"); +#endif + sysctl_check_leaf(namespaces, table, &fail); + } + if (table->mode > 0777) + set_fail(&fail, table, "bogus .mode"); + if (fail) { + set_fail(&fail, table, NULL); + error = -EINVAL; + } + if (table->child) + error |= sysctl_check_table(namespaces, table->child); + } + return error; +} +#endif /* CONFIG_SYSCTL_SYSCALL_CHECK */ + +/** + * __register_sysctl_paths - register a sysctl hierarchy + * @root: List of sysctl headers to register on + * @namespaces: Data to compute which lists of sysctl entries are visible + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * The members of the &struct ctl_table structure are used as follows: + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file, and for sysctl(2) + * + * child - a pointer to the child sysctl table if this entry is a directory, or + * %NULL. + * + * proc_handler - the text handler routine (described below) + * + * de - for internal use by the sysctl routines + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * sysctl(2) can automatically manage read and write requests through + * the sysctl table. The data and maxlen fields of the ctl_table + * struct enable minimal validation of the values being written to be + * performed, and the mode field allows minimal authentication. + * + * There must be a proc_handler routine for any terminal nodes + * mirrored under /proc/sys (non-terminals are handled by a built-in + * directory handler). Several default handlers are available to + * cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_root *root, + struct nsproxy *namespaces, + const struct ctl_path *path, struct ctl_table *table) +{ + struct ctl_table_header *header; + struct ctl_table *new, **prevp; + unsigned int n, npath; + struct ctl_table_set *set; + + /* Count the path components */ + for (npath = 0; path[npath].procname; ++npath) + ; + + /* + * For each path component, allocate a 2-element ctl_table array. + * The first array element will be filled with the sysctl entry + * for this, the second will be the sentinel (procname == 0). + * + * We allocate everything in one go so that we don't have to + * worry about freeing additional memory in unregister_sysctl_table. + */ + header = kzalloc(sizeof(struct ctl_table_header) + + (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); + if (!header) + return NULL; + + new = (struct ctl_table *) (header + 1); + + /* Now connect the dots */ + prevp = &header->ctl_table; + for (n = 0; n < npath; ++n, ++path) { + /* Copy the procname */ + new->procname = path->procname; + new->mode = 0555; + + *prevp = new; + prevp = &new->child; + + new += 2; + } + *prevp = table; + header->ctl_table_arg = table; + + INIT_LIST_HEAD(&header->ctl_entry); + header->used = 0; + header->unregistering = NULL; + header->root = root; + sysctl_set_parent(NULL, header->ctl_table); + header->count = 1; +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK + if (sysctl_check_table(namespaces, header->ctl_table)) { + kfree(header); + return NULL; + } +#endif + spin_lock(&sysctl_lock); + header->set = lookup_header_set(root, namespaces); + header->attached_by = header->ctl_table; + header->attached_to = root_table; + header->parent = &root_table_header; + for (set = header->set; set; set = set->parent) { + struct ctl_table_header *p; + list_for_each_entry(p, &set->list, ctl_entry) { + if (p->unregistering) + continue; + try_attach(p, header); + } + } + header->parent->count++; + list_add_tail(&header->ctl_entry, &header->set->list); + spin_unlock(&sysctl_lock); + + return header; +} + +/** + * register_sysctl_table_path - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, + path, table); +} +EXPORT_SYMBOL(register_sysctl_paths); + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_paths(null_path, table); +} +EXPORT_SYMBOL(register_sysctl_table); + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + might_sleep(); + + if (header == NULL) + return; + + spin_lock(&sysctl_lock); + start_unregistering(header); + if (!--header->parent->count) { + WARN_ON(1); + kfree_rcu(header->parent, rcu); + } + if (!--header->count) + kfree_rcu(header, rcu); + spin_unlock(&sysctl_lock); +} +EXPORT_SYMBOL(unregister_sysctl_table); + +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ + INIT_LIST_HEAD(&p->list); + p->parent = parent ? parent : &sysctl_table_root.default_set; + p->is_seen = is_seen; +} + + int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; -- cgit v1.2.3 From a15e20982e2fbb06e85da584a0f150784042c17d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 8 Jan 2012 00:16:29 -0800 Subject: sysctl: Make the directories have nlink == 1 I goofed when I made sysctl directories have nlink == 0. nlink == 0 means the directory has been deleted. nlink == 1 meands a directory does not count subdirectories. Use the default nlink == 1 for sysctl directories. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 06e6f10ee8e..f6aa75111b4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -253,7 +253,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; - clear_nlink(inode); inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; } -- cgit v1.2.3 From 97324cd804b7b9fb6044e114329335db79810425 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 9 Jan 2012 22:19:13 -0800 Subject: sysctl: Implement retire_sysctl_set This adds a small helper retire_sysctl_set to remove the intimate knowledge about the how a sysctl_set is implemented from net/sysct_net.c Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f6aa75111b4..9d8223cd365 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1080,6 +1080,10 @@ void setup_sysctl_set(struct ctl_table_set *p, p->is_seen = is_seen; } +void retire_sysctl_set(struct ctl_table_set *set) +{ + WARN_ON(!list_empty(&set->list)); +} int __init proc_sys_init(void) { -- cgit v1.2.3 From bd295b56cfae85f2dd6c2b03951480c91e6d08f3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 22 Jan 2012 21:10:21 -0800 Subject: sysctl: Remove the unnecessary sysctl_set parent concept. In sysctl_net register the two networking roots in the proper order. In register_sysctl walk the sysctl sets in the reverse order of the sysctl roots. Remove parent from ctl_table_set and setup_sysctl_set as it is no longer needed. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 9d8223cd365..86d32a318e2 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -995,13 +995,20 @@ struct ctl_table_header *__register_sysctl_paths( header->attached_by = header->ctl_table; header->attached_to = root_table; header->parent = &root_table_header; - for (set = header->set; set; set = set->parent) { + set = header->set; + root = header->root; + for (;;) { struct ctl_table_header *p; list_for_each_entry(p, &set->list, ctl_entry) { if (p->unregistering) continue; try_attach(p, header); } + if (root == &sysctl_table_root) + break; + root = list_entry(root->root_list.prev, + struct ctl_table_root, root_list); + set = lookup_header_set(root, namespaces); } header->parent->count++; list_add_tail(&header->ctl_entry, &header->set->list); @@ -1072,11 +1079,9 @@ void unregister_sysctl_table(struct ctl_table_header * header) EXPORT_SYMBOL(unregister_sysctl_table); void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, int (*is_seen)(struct ctl_table_set *)) { INIT_LIST_HEAD(&p->list); - p->parent = parent ? parent : &sysctl_table_root.default_set; p->is_seen = is_seen; } -- cgit v1.2.3 From f05e53a7fbb28c951c0c8cf3963fa8019ae1d4d3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jan 2012 10:03:13 -0800 Subject: sysctl: Create local copies of directory names used in paths Creating local copies of directory names is a good idea for two reasons. - The dynamic names used by callers must be copied into new strings by the callers today to ensure the strings do not change between register and unregister of the sysctl table. - Sysctl directories have a potentially different lifetime than the time between register and unregister of any particular sysctl table. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 86d32a318e2..bcf60fb8dce 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -943,10 +943,12 @@ struct ctl_table_header *__register_sysctl_paths( struct ctl_table *new, **prevp; unsigned int n, npath; struct ctl_table_set *set; + size_t path_bytes = 0; + char *new_name; /* Count the path components */ for (npath = 0; path[npath].procname; ++npath) - ; + path_bytes += strlen(path[npath].procname) + 1; /* * For each path component, allocate a 2-element ctl_table array. @@ -956,24 +958,27 @@ struct ctl_table_header *__register_sysctl_paths( * We allocate everything in one go so that we don't have to * worry about freeing additional memory in unregister_sysctl_table. */ - header = kzalloc(sizeof(struct ctl_table_header) + + header = kzalloc(sizeof(struct ctl_table_header) + path_bytes + (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); if (!header) return NULL; new = (struct ctl_table *) (header + 1); + new_name = (char *)(new + (2 * npath)); /* Now connect the dots */ prevp = &header->ctl_table; for (n = 0; n < npath; ++n, ++path) { /* Copy the procname */ - new->procname = path->procname; + strcpy(new_name, path->procname); + new->procname = new_name; new->mode = 0555; *prevp = new; prevp = &new->child; new += 2; + new_name += strlen(new_name) + 1; } *prevp = table; header->ctl_table_arg = table; -- cgit v1.2.3 From 6e9d5164153ad6539edd31e7afb02a3e79124cad Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jan 2012 10:26:26 -0800 Subject: sysctl: Add support for register sysctl tables with a normal cstring path. Make __register_sysctl_table the core sysctl registration operation and make it take a char * string as path. Now that binary paths have been banished into the real of backwards compatibility in kernel/binary_sysctl.c where they can be safely ignored there is no longer a need to use struct ctl_path to represent path names when registering ctl_tables. Start the transition to using normal char * strings to represent pathnames when registering sysctl tables. Normal strings are easier to deal with both in the internal sysctl implementation and for programmers registering sysctl tables. __register_sysctl_paths is turned into a backwards compatibility wrapper that converts a ctl_path array into a normal char * string. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 94 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 10 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index bcf60fb8dce..5704ff0e889 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -882,7 +882,7 @@ static int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *tabl #endif /* CONFIG_SYSCTL_SYSCALL_CHECK */ /** - * __register_sysctl_paths - register a sysctl hierarchy + * __register_sysctl_table - register a sysctl table * @root: List of sysctl headers to register on * @namespaces: Data to compute which lists of sysctl entries are visible * @path: The path to the directory the sysctl table is in. @@ -934,21 +934,34 @@ static int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *tabl * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. */ -struct ctl_table_header *__register_sysctl_paths( +struct ctl_table_header *__register_sysctl_table( struct ctl_table_root *root, struct nsproxy *namespaces, - const struct ctl_path *path, struct ctl_table *table) + const char *path, struct ctl_table *table) { struct ctl_table_header *header; struct ctl_table *new, **prevp; - unsigned int n, npath; + const char *name, *nextname; + unsigned int npath = 0; struct ctl_table_set *set; size_t path_bytes = 0; char *new_name; /* Count the path components */ - for (npath = 0; path[npath].procname; ++npath) - path_bytes += strlen(path[npath].procname) + 1; + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + path_bytes += namelen + 1; + npath++; + } /* * For each path component, allocate a 2-element ctl_table array. @@ -968,9 +981,20 @@ struct ctl_table_header *__register_sysctl_paths( /* Now connect the dots */ prevp = &header->ctl_table; - for (n = 0; n < npath; ++n, ++path) { - /* Copy the procname */ - strcpy(new_name, path->procname); + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + memcpy(new_name, name, namelen); + new_name[namelen] = '\0'; + new->procname = new_name; new->mode = 0555; @@ -978,7 +1002,7 @@ struct ctl_table_header *__register_sysctl_paths( prevp = &new->child; new += 2; - new_name += strlen(new_name) + 1; + new_name += namelen + 1; } *prevp = table; header->ctl_table_arg = table; @@ -1022,6 +1046,56 @@ struct ctl_table_header *__register_sysctl_paths( return header; } +static char *append_path(const char *path, char *pos, const char *name) +{ + int namelen; + namelen = strlen(name); + if (((pos - path) + namelen + 2) >= PATH_MAX) + return NULL; + memcpy(pos, name, namelen); + pos[namelen] = '/'; + pos[namelen + 1] = '\0'; + pos += namelen + 1; + return pos; +} + +/** + * __register_sysctl_paths - register a sysctl table hierarchy + * @root: List of sysctl headers to register on + * @namespaces: Data to compute which lists of sysctl entries are visible + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_root *root, + struct nsproxy *namespaces, + const struct ctl_path *path, struct ctl_table *table) +{ + struct ctl_table_header *header = NULL; + const struct ctl_path *component; + char *new_path, *pos; + + pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!new_path) + return NULL; + + pos[0] = '\0'; + for (component = path; component->procname; component++) { + pos = append_path(new_path, pos, component->procname); + if (!pos) + goto out; + } + header = __register_sysctl_table(root, namespaces, new_path, table); +out: + kfree(new_path); + return header; +} + /** * register_sysctl_table_path - register a sysctl table hierarchy * @path: The path to the directory the sysctl table is in. -- cgit v1.2.3 From ec6a52668d0bbc6d648e978c327150254bf1ce7f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jan 2012 12:35:23 -0800 Subject: sysctl: Add ctl_table chains into cstring paths For any component of table passed to __register_sysctl_paths that actually serves as a path, add that to the cstring path that is passed to __register_sysctl_table. The result is that for most calls to __register_sysctl_paths we only pass a table to __register_sysctl_table that contains no child directories. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5704ff0e889..9b91deeeb56 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1076,6 +1076,7 @@ struct ctl_table_header *__register_sysctl_paths( struct nsproxy *namespaces, const struct ctl_path *path, struct ctl_table *table) { + struct ctl_table *ctl_table_arg = table; struct ctl_table_header *header = NULL; const struct ctl_path *component; char *new_path, *pos; @@ -1090,7 +1091,15 @@ struct ctl_table_header *__register_sysctl_paths( if (!pos) goto out; } + while (table->procname && table->child && !table[1].procname) { + pos = append_path(new_path, pos, table->procname); + if (!pos) + goto out; + table = table->child; + } header = __register_sysctl_table(root, namespaces, new_path, table); + if (header) + header->ctl_table_arg = ctl_table_arg; out: kfree(new_path); return header; -- cgit v1.2.3 From f728019bb72e655680c02ad1829323054a8e875f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 22 Jan 2012 18:22:05 -0800 Subject: sysctl: register only tables of sysctl files Split the registration of a complex ctl_table array which may have arbitrary numbers of directories (->child != NULL) and tables of files into a series of simpler registrations that only register tables of files. Graphically: register('dir', { + file-a + file-b + subdir1 + file-c + subdir2 + file-d + file-e }) is transformed into: wrapper->subheaders[0] = register('dir', {file1-a, file1-b}) wrapper->subheaders[1] = register('dir/subdir1', {file-c}) wrapper->subheaders[2] = register('dir/subdir2', {file-d, file-e}) return wrapper This guarantees that __register_sysctl_table will only see a simple ctl_table array with all entries having (->child == NULL). Care was taken to pass the original simple ctl_table arrays to __register_sysctl_table whenever possible. This change is derived from a similar patch written by Lucrian Grijincu. Inspired-by: Lucian Adrian Grijincu Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 165 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 147 insertions(+), 18 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 9b91deeeb56..6bab2ae9e39 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -882,7 +882,7 @@ static int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *tabl #endif /* CONFIG_SYSCTL_SYSCALL_CHECK */ /** - * __register_sysctl_table - register a sysctl table + * __register_sysctl_table - register a leaf sysctl table * @root: List of sysctl headers to register on * @namespaces: Data to compute which lists of sysctl entries are visible * @path: The path to the directory the sysctl table is in. @@ -900,29 +900,19 @@ static int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *tabl * * maxlen - the maximum size in bytes of the data * - * mode - the file permissions for the /proc/sys file, and for sysctl(2) + * mode - the file permissions for the /proc/sys file * - * child - a pointer to the child sysctl table if this entry is a directory, or - * %NULL. + * child - must be %NULL. * * proc_handler - the text handler routine (described below) * - * de - for internal use by the sysctl routines - * * extra1, extra2 - extra pointers usable by the proc handler routines * * Leaf nodes in the sysctl tree will be represented by a single file * under /proc; non-leaf nodes will be represented by directories. * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table. The data and maxlen fields of the ctl_table - * struct enable minimal validation of the values being written to be - * performed, and the mode field allows minimal authentication. - * - * There must be a proc_handler routine for any terminal nodes - * mirrored under /proc/sys (non-terminals are handled by a built-in - * directory handler). Several default handlers are available to - * cover common cases - + * There must be a proc_handler routine for any terminal nodes. + * Several default handlers are available to cover common cases - * * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), @@ -1059,6 +1049,100 @@ static char *append_path(const char *path, char *pos, const char *name) return pos; } +static int count_subheaders(struct ctl_table *table) +{ + int has_files = 0; + int nr_subheaders = 0; + struct ctl_table *entry; + + /* special case: no directory and empty directory */ + if (!table || !table->procname) + return 1; + + for (entry = table; entry->procname; entry++) { + if (entry->child) + nr_subheaders += count_subheaders(entry->child); + else + has_files = 1; + } + return nr_subheaders + has_files; +} + +static int register_leaf_sysctl_tables(const char *path, char *pos, + struct ctl_table_header ***subheader, + struct ctl_table_root *root, struct nsproxy *namespaces, + struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = NULL; + struct ctl_table *entry, *files; + int nr_files = 0; + int nr_dirs = 0; + int err = -ENOMEM; + + for (entry = table; entry->procname; entry++) { + if (entry->child) + nr_dirs++; + else + nr_files++; + } + + files = table; + /* If there are mixed files and directories we need a new table */ + if (nr_dirs && nr_files) { + struct ctl_table *new; + files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1), + GFP_KERNEL); + if (!files) + goto out; + + ctl_table_arg = files; + for (new = files, entry = table; entry->procname; entry++) { + if (entry->child) + continue; + *new = *entry; + new++; + } + } + + /* Register everything except a directory full of subdirectories */ + if (nr_files || !nr_dirs) { + struct ctl_table_header *header; + header = __register_sysctl_table(root, namespaces, path, files); + if (!header) { + kfree(ctl_table_arg); + goto out; + } + + /* Remember if we need to free the file table */ + header->ctl_table_arg = ctl_table_arg; + **subheader = header; + (*subheader)++; + } + + /* Recurse into the subdirectories. */ + for (entry = table; entry->procname; entry++) { + char *child_pos; + + if (!entry->child) + continue; + + err = -ENAMETOOLONG; + child_pos = append_path(path, pos, entry->procname); + if (!child_pos) + goto out; + + err = register_leaf_sysctl_tables(path, child_pos, subheader, + root, namespaces, entry->child); + pos[0] = '\0'; + if (err) + goto out; + } + err = 0; +out: + /* On failure our caller will unregister all registered subheaders */ + return err; +} + /** * __register_sysctl_paths - register a sysctl table hierarchy * @root: List of sysctl headers to register on @@ -1077,7 +1161,8 @@ struct ctl_table_header *__register_sysctl_paths( const struct ctl_path *path, struct ctl_table *table) { struct ctl_table *ctl_table_arg = table; - struct ctl_table_header *header = NULL; + int nr_subheaders = count_subheaders(table); + struct ctl_table_header *header = NULL, **subheaders, **subheader; const struct ctl_path *component; char *new_path, *pos; @@ -1097,12 +1182,39 @@ struct ctl_table_header *__register_sysctl_paths( goto out; table = table->child; } - header = __register_sysctl_table(root, namespaces, new_path, table); - if (header) + if (nr_subheaders == 1) { + header = __register_sysctl_table(root, namespaces, new_path, table); + if (header) + header->ctl_table_arg = ctl_table_arg; + } else { + header = kzalloc(sizeof(*header) + + sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); + if (!header) + goto out; + + subheaders = (struct ctl_table_header **) (header + 1); + subheader = subheaders; header->ctl_table_arg = ctl_table_arg; + + if (register_leaf_sysctl_tables(new_path, pos, &subheader, + root, namespaces, table)) + goto err_register_leaves; + } + out: kfree(new_path); return header; + +err_register_leaves: + while (subheader > subheaders) { + struct ctl_table_header *subh = *(--subheader); + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + header = NULL; + goto out; } /** @@ -1149,11 +1261,28 @@ EXPORT_SYMBOL(register_sysctl_table); */ void unregister_sysctl_table(struct ctl_table_header * header) { + int nr_subheaders; might_sleep(); if (header == NULL) return; + nr_subheaders = count_subheaders(header->ctl_table_arg); + if (unlikely(nr_subheaders > 1)) { + struct ctl_table_header **subheaders; + int i; + + subheaders = (struct ctl_table_header **)(header + 1); + for (i = nr_subheaders -1; i >= 0; i--) { + struct ctl_table_header *subh = subheaders[i]; + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + return; + } + spin_lock(&sysctl_lock); start_unregistering(header); if (!--header->parent->count) { -- cgit v1.2.3 From 7c60c48f58a78195acc1f71c9a9d01958c02ab89 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jan 2012 13:34:05 -0800 Subject: sysctl: Improve the sysctl sanity checks - Stop validating subdirectories now that we only register leaf tables - Cleanup and improve the duplicate filename check. * Run the duplicate filename check under the sysctl_lock to guarantee we never add duplicate names. * Reduce the duplicate filename check to nearly O(M*N) where M is the number of entries in tthe table we are registering and N is the number of entries in the directory before we got there. - Move the duplicate filename check into it's own function and call it directtly from __register_sysctl_table - Kill the config option as the sanity checks are now cheap enough the config option is unnecessary. The original reason for the config option was because we had a huge table used to verify the proc filename to binary sysctl mapping. That table has now evolved into the binary_sysctl translation layer and is no longer part of the sysctl_check code. - Tighten up the permission checks. Guarnateeing that files only have read or write permissions. - Removed redudant check for parents having a procname as now everything has a procname. - Generalize the backtrace logic so that we print a backtrace from any failure of __register_sysctl_table that was not caused by a memmory allocation failure. The backtrace allows us to track down who erroneously registered a sysctl table. Bechmark before (CONFIG_SYSCTL_CHECK=y): make-dummies 0 999 -> 12s rmmod dummy -> 0.08s Bechmark before (CONFIG_SYSCTL_CHECK=n): make-dummies 0 999 -> 0.7s rmmod dummy -> 0.06s make-dummies 0 99999 -> 1m13s rmmod dummy -> 0.38s Benchmark after: make-dummies 0 999 -> 0.65s rmmod dummy -> 0.055s make-dummies 0 9999 -> 1m10s rmmod dummy -> 0.39s The sysctl sanity checks now impose no measurable cost. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 222 +++++++++++++++++++------------------------------- 1 file changed, 86 insertions(+), 136 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 6bab2ae9e39..a492ff60e07 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -726,160 +726,106 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) } } -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK -static int sysctl_depth(struct ctl_table *table) +static int sysctl_check_table_dups(const char *path, struct ctl_table *old, + struct ctl_table *table) { - struct ctl_table *tmp; - int depth; - - depth = 0; - for (tmp = table; tmp->parent; tmp = tmp->parent) - depth++; + struct ctl_table *entry, *test; + int error = 0; - return depth; + for (entry = old; entry->procname; entry++) { + for (test = table; test->procname; test++) { + if (strcmp(entry->procname, test->procname) == 0) { + printk(KERN_ERR "sysctl duplicate entry: %s/%s\n", + path, test->procname); + error = -EEXIST; + } + } + } + return error; } -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) +static int sysctl_check_dups(struct nsproxy *namespaces, + struct ctl_table_header *header, + const char *path, struct ctl_table *table) { - int i; + struct ctl_table_root *root; + struct ctl_table_set *set; + struct ctl_table_header *dir_head, *head; + struct ctl_table *dir_table; + int error = 0; - for (i = 0; table && i < n; i++) - table = table->parent; + /* No dups if we are the only member of our directory */ + if (header->attached_by != table) + return 0; - return table; -} + dir_head = header->parent; + dir_table = header->attached_to; + error = sysctl_check_table_dups(path, dir_table, table); -static void sysctl_print_path(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth, i; - depth = sysctl_depth(table); - if (table->procname) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk("/%s", tmp->procname?tmp->procname:""); - } - } - printk(" "); -} + root = &sysctl_table_root; + do { + set = lookup_header_set(root, namespaces); -static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table_header *head; - struct ctl_table *ref, *test; - int depth, cur_depth; - - depth = sysctl_depth(table); - - for (head = __sysctl_head_next(namespaces, NULL); head; - head = __sysctl_head_next(namespaces, head)) { - cur_depth = depth; - ref = head->ctl_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->procname; ref++) { - int match = 0; - if (cur_depth && !ref->child) + list_for_each_entry(head, &set->list, ctl_entry) { + if (head->unregistering) continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } + if (head->attached_to != dir_table) + continue; + error = sysctl_check_table_dups(path, head->attached_by, + table); } - } - ref = NULL; -out: - sysctl_head_finish(head); - return ref; + root = list_entry(root->root_list.next, + struct ctl_table_root, root_list); + } while (root != &sysctl_table_root); + return error; } -static void set_fail(const char **fail, struct ctl_table *table, const char *str) +static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) { - if (*fail) { - printk(KERN_ERR "sysctl table check failed: "); - sysctl_print_path(table); - printk(" %s\n", *fail); - dump_stack(); - } - *fail = str; -} + struct va_format vaf; + va_list args; -static void sysctl_check_leaf(struct nsproxy *namespaces, - struct ctl_table *table, const char **fail) -{ - struct ctl_table *ref; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n", + path, table->procname, &vaf); - ref = sysctl_check_lookup(namespaces, table); - if (ref && (ref != table)) - set_fail(fail, table, "Sysctl already exists"); + va_end(args); + return -EINVAL; } -static int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) +static int sysctl_check_table(const char *path, struct ctl_table *table) { - int error = 0; + int err = 0; for (; table->procname; table++) { - const char *fail = NULL; - - if (table->parent) { - if (!table->parent->procname) - set_fail(&fail, table, "Parent without procname"); - } - if (table->child) { - if (table->data) - set_fail(&fail, table, "Directory with data?"); - if (table->maxlen) - set_fail(&fail, table, "Directory with maxlen?"); - if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) - set_fail(&fail, table, "Writable sysctl directory"); - if (table->proc_handler) - set_fail(&fail, table, "Directory with proc_handler"); - if (table->extra1) - set_fail(&fail, table, "Directory with extra1"); - if (table->extra2) - set_fail(&fail, table, "Directory with extra2"); - } else { - if ((table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - (table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - set_fail(&fail, table, "No data"); - if (!table->maxlen) - set_fail(&fail, table, "No maxlen"); - } -#ifdef CONFIG_PROC_SYSCTL - if (!table->proc_handler) - set_fail(&fail, table, "No proc_handler"); -#endif - sysctl_check_leaf(namespaces, table, &fail); - } - if (table->mode > 0777) - set_fail(&fail, table, "bogus .mode"); - if (fail) { - set_fail(&fail, table, NULL); - error = -EINVAL; - } if (table->child) - error |= sysctl_check_table(namespaces, table->child); + err = sysctl_err(path, table, "Not a file"); + + if ((table->proc_handler == proc_dostring) || + (table->proc_handler == proc_dointvec) || + (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_userhz_jiffies) || + (table->proc_handler == proc_dointvec_ms_jiffies) || + (table->proc_handler == proc_doulongvec_minmax) || + (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!table->data) + err = sysctl_err(path, table, "No data"); + if (!table->maxlen) + err = sysctl_err(path, table, "No maxlen"); + } + if (!table->proc_handler) + err = sysctl_err(path, table, "No proc_handler"); + + if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) + err = sysctl_err(path, table, "bogus .mode 0%o", + table->mode); } - return error; + return err; } -#endif /* CONFIG_SYSCTL_SYSCALL_CHECK */ /** * __register_sysctl_table - register a leaf sysctl table @@ -1003,12 +949,8 @@ struct ctl_table_header *__register_sysctl_table( header->root = root; sysctl_set_parent(NULL, header->ctl_table); header->count = 1; -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - if (sysctl_check_table(namespaces, header->ctl_table)) { - kfree(header); - return NULL; - } -#endif + if (sysctl_check_table(path, table)) + goto fail; spin_lock(&sysctl_lock); header->set = lookup_header_set(root, namespaces); header->attached_by = header->ctl_table; @@ -1029,11 +971,19 @@ struct ctl_table_header *__register_sysctl_table( struct ctl_table_root, root_list); set = lookup_header_set(root, namespaces); } + if (sysctl_check_dups(namespaces, header, path, table)) + goto fail_locked; header->parent->count++; list_add_tail(&header->ctl_entry, &header->set->list); spin_unlock(&sysctl_lock); return header; +fail_locked: + spin_unlock(&sysctl_lock); +fail: + kfree(header); + dump_stack(); + return NULL; } static char *append_path(const char *path, char *pos, const char *name) -- cgit v1.2.3 From 8d6ecfcc014332fd2fe933f64194160f0e3a6696 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Jan 2012 11:55:30 -0800 Subject: sysctl: Remove the now unused ctl_table parent field. While useful at one time for selinux and the sysctl sanity checks those users no longer use the parent field and we can safely remove it. Inspired-by: Lucian Adrian Grijincu Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a492ff60e07..e573f9b4f22 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -218,16 +218,6 @@ static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int return test_perm(mode, op); } -static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) -{ - for (; table->procname; table++) { - table->parent = parent; - if (table->child) - sysctl_set_parent(table, table->child); - } -} - - static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -947,10 +937,10 @@ struct ctl_table_header *__register_sysctl_table( header->used = 0; header->unregistering = NULL; header->root = root; - sysctl_set_parent(NULL, header->ctl_table); header->count = 1; if (sysctl_check_table(path, table)) goto fail; + spin_lock(&sysctl_lock); header->set = lookup_header_set(root, namespaces); header->attached_by = header->ctl_table; -- cgit v1.2.3 From 3cc3e04636d603778d921854b84ae7bd34a349a2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 7 Jan 2012 06:57:47 -0800 Subject: sysctl: A more obvious version of grab_header. Instead of relying on sysct_head_next(NULL) to magically return the right header for the root directory instead explicitly transform NULL into the root directories header. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index e573f9b4f22..15444850b3e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -267,10 +267,10 @@ static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) static struct ctl_table_header *grab_header(struct inode *inode) { - if (PROC_I(inode)->sysctl) - return sysctl_head_grab(PROC_I(inode)->sysctl); - else - return sysctl_head_next(NULL); + struct ctl_table_header *head = PROC_I(inode)->sysctl; + if (!head) + head = &root_table_header; + return sysctl_head_grab(head); } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, -- cgit v1.2.3 From 938aaa4f9249aa1519fd0db07fc72125de2df338 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 9 Jan 2012 17:24:30 -0800 Subject: sysctl: Initial support for auto-unregistering sysctl tables. Add nreg to ctl_table_header. When nreg drops to 0 the ctl_table_header will be unregistered. Factor out drop_sysctl_table from unregister_sysctl_table, and add the logic for decrementing nreg. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 15444850b3e..13faa48c467 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -29,8 +29,9 @@ static struct ctl_table root_table[1]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { {{.count = 1, - .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, + .nreg = 1, + .ctl_table = root_table, + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }; @@ -938,6 +939,7 @@ struct ctl_table_header *__register_sysctl_table( header->unregistering = NULL; header->root = root; header->count = 1; + header->nreg = 1; if (sysctl_check_table(path, table)) goto fail; @@ -1192,6 +1194,20 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) } EXPORT_SYMBOL(register_sysctl_table); +static void drop_sysctl_table(struct ctl_table_header *header) +{ + if (--header->nreg) + return; + + start_unregistering(header); + if (!--header->parent->count) { + WARN_ON(1); + kfree_rcu(header->parent, rcu); + } + if (!--header->count) + kfree_rcu(header, rcu); +} + /** * unregister_sysctl_table - unregister a sysctl table hierarchy * @header: the header returned from register_sysctl_table @@ -1224,13 +1240,7 @@ void unregister_sysctl_table(struct ctl_table_header * header) } spin_lock(&sysctl_lock); - start_unregistering(header); - if (!--header->parent->count) { - WARN_ON(1); - kfree_rcu(header->parent, rcu); - } - if (!--header->count) - kfree_rcu(header, rcu); + drop_sysctl_table(header); spin_unlock(&sysctl_lock); } EXPORT_SYMBOL(unregister_sysctl_table); -- cgit v1.2.3 From e0d045290a8454ecd7f63c78c10d412f35d6ef94 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 9 Jan 2012 22:36:41 -0800 Subject: sysctl: Factor out init_header from __register_sysctl_paths Factor out a routing to initialize the sysctl_table_header. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 13faa48c467..49799259b0f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -42,6 +42,21 @@ static struct ctl_table_root sysctl_table_root = { static DEFINE_SPINLOCK(sysctl_lock); +static void init_header(struct ctl_table_header *head, + struct ctl_table_root *root, struct ctl_table_set *set, + struct ctl_table *table) +{ + head->ctl_table_arg = table; + INIT_LIST_HEAD(&head->ctl_entry); + head->used = 0; + head->count = 1; + head->nreg = 1; + head->unregistering = NULL; + head->root = root; + head->set = set; + head->parent = NULL; +} + /* called under sysctl_lock */ static int use_table(struct ctl_table_header *p) { @@ -932,14 +947,8 @@ struct ctl_table_header *__register_sysctl_table( new_name += namelen + 1; } *prevp = table; - header->ctl_table_arg = table; - - INIT_LIST_HEAD(&header->ctl_entry); - header->used = 0; - header->unregistering = NULL; - header->root = root; - header->count = 1; - header->nreg = 1; + + init_header(header, root, NULL, table); if (sysctl_check_table(path, table)) goto fail; -- cgit v1.2.3 From 8425d6aaf0704b98480131ed339c208ffce12e44 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 9 Jan 2012 17:35:01 -0800 Subject: sysctl: Factor out insert_header and erase_header Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 49799259b0f..7e96a2681b6 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -57,6 +57,17 @@ static void init_header(struct ctl_table_header *head, head->parent = NULL; } +static void erase_header(struct ctl_table_header *head) +{ + list_del_init(&head->ctl_entry); +} + +static void insert_header(struct ctl_table_header *header) +{ + header->parent->count++; + list_add_tail(&header->ctl_entry, &header->set->list); +} + /* called under sysctl_lock */ static int use_table(struct ctl_table_header *p) { @@ -96,7 +107,7 @@ static void start_unregistering(struct ctl_table_header *p) * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. */ - list_del_init(&p->ctl_entry); + erase_header(p); } static void sysctl_head_get(struct ctl_table_header *head) @@ -974,8 +985,7 @@ struct ctl_table_header *__register_sysctl_table( } if (sysctl_check_dups(namespaces, header, path, table)) goto fail_locked; - header->parent->count++; - list_add_tail(&header->ctl_entry, &header->set->list); + insert_header(header); spin_unlock(&sysctl_lock); return header; -- cgit v1.2.3 From a194558e8698621a9ce7f2c6a720123e644af131 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jan 2012 17:51:48 -0800 Subject: sysctl: Normalize the root_table data structure. Every other directory has a .child member and we look at the .child for our entries. Do the same for the root_table. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 7e96a2681b6..88d1b06cc5c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -25,7 +25,14 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll) wake_up_interruptible(&poll->wait); } -static struct ctl_table root_table[1]; +static struct ctl_table root_table[] = { + { + .procname = "", + .mode = S_IRUGO|S_IXUGO, + .child = &root_table[1], + }, + { } +}; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { {{.count = 1, @@ -319,7 +326,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; } - table = table ? table->child : head->ctl_table; + table = table ? table->child : &head->ctl_table[1]; p = find_in_table(table, name); if (!p) { @@ -510,7 +517,7 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; } - table = table ? table->child : head->ctl_table; + table = table ? table->child : &head->ctl_table[1]; ret = 0; /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ @@ -966,7 +973,7 @@ struct ctl_table_header *__register_sysctl_table( spin_lock(&sysctl_lock); header->set = lookup_header_set(root, namespaces); header->attached_by = header->ctl_table; - header->attached_to = root_table; + header->attached_to = &root_table[1]; header->parent = &root_table_header; set = header->set; root = header->root; -- cgit v1.2.3 From 076c3eed2c31773200b082568957fd8852ae93d7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 9 Jan 2012 21:42:02 -0800 Subject: sysctl: Rewrite proc_sys_lookup introducing find_entry and lookup_entry. Replace the helpers that proc_sys_lookup uses with helpers that work in terms of an entire sysctl directory. This is worse for sysctl_lock hold times but it is much better for code clarity and the code cleanups to come. find_in_table is no longer needed so it is removed. find_entry a general helper to find entries in a directory is added. lookup_entry is a simple wrapper around find_entry that takes the sysctl_lock increases the use count if an entry is found and drops the sysctl_lock. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 102 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 26 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 88d1b06cc5c..3b63f298ce2 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -49,6 +49,55 @@ static struct ctl_table_root sysctl_table_root = { static DEFINE_SPINLOCK(sysctl_lock); +static int namecmp(const char *name1, int len1, const char *name2, int len2) +{ + int minlen; + int cmp; + + minlen = len1; + if (minlen > len2) + minlen = len2; + + cmp = memcmp(name1, name2, minlen); + if (cmp == 0) + cmp = len1 - len2; + return cmp; +} + +static struct ctl_table *find_entry(struct ctl_table_header **phead, + struct ctl_table_set *set, + struct ctl_table_header *dir_head, struct ctl_table *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + if (dir_head->set == set) { + for (entry = dir; entry->procname; entry++) { + const char *procname = entry->procname; + if (namecmp(procname, strlen(procname), name, namelen) == 0) { + *phead = dir_head; + return entry; + } + } + } + + list_for_each_entry(head, &set->list, ctl_entry) { + if (head->unregistering) + continue; + if (head->attached_to != dir) + continue; + for (entry = head->attached_by; entry->procname; entry++) { + const char *procname = entry->procname; + if (namecmp(procname, strlen(procname), name, namelen) == 0) { + *phead = head; + return entry; + } + } + } + return NULL; +} + static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, struct ctl_table *table) @@ -168,6 +217,32 @@ lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) return &set->list; } +static struct ctl_table *lookup_entry(struct ctl_table_header **phead, + struct ctl_table_header *dir_head, + struct ctl_table *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + struct ctl_table_root *root; + struct ctl_table_set *set; + + spin_lock(&sysctl_lock); + root = &sysctl_table_root; + do { + set = lookup_header_set(root, current->nsproxy); + entry = find_entry(&head, set, dir_head, dir, name, namelen); + if (entry && use_table(head)) + *phead = head; + else + entry = NULL; + root = list_entry(root->root_list.next, + struct ctl_table_root, root_list); + } while (!entry && root != &sysctl_table_root); + spin_unlock(&sysctl_lock); + return entry; +} + static struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, struct ctl_table_header *prev) { @@ -284,21 +359,6 @@ out: return inode; } -static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) -{ - for ( ; p->procname; p++) { - if (strlen(p->procname) != name->len) - continue; - - if (memcmp(p->procname, name->name, name->len) != 0) - continue; - - /* I have a match */ - return p; - } - return NULL; -} - static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; @@ -328,17 +388,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, table = table ? table->child : &head->ctl_table[1]; - p = find_in_table(table, name); - if (!p) { - for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { - if (h->attached_to != table) - continue; - p = find_in_table(h->attached_by, name); - if (p) - break; - } - } - + p = lookup_entry(&h, head, table, name->name, name->len); if (!p) goto out; -- cgit v1.2.3 From 6a75ce167c53b41f15088d3c2c7e51c89dc8798a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 18 Jan 2012 03:15:51 -0800 Subject: sysctl: Rewrite proc_sys_readdir in terms of first_entry and next_entry Replace sysctl_head_next with first_entry and next_entry. These new iterators operate at the level of sysctl table entries and filter out any sysctl tables that should not be shown. Utilizing two specialized functions instead of a single function removes conditionals for handling awkward special cases that only come up at the beginning of iteration, making the iterators easier to read and understand. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 98 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 36 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 3b63f298ce2..d9c3ae6afe4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -243,31 +243,25 @@ static struct ctl_table *lookup_entry(struct ctl_table_header **phead, return entry; } -static struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev) +static struct ctl_table_header *next_usable_entry(struct ctl_table *dir, + struct ctl_table_root *root, struct list_head *tmp) { - struct ctl_table_root *root; + struct nsproxy *namespaces = current->nsproxy; struct list_head *header_list; struct ctl_table_header *head; - struct list_head *tmp; - spin_lock(&sysctl_lock); - if (prev) { - head = prev; - tmp = &prev->ctl_entry; - unuse_table(prev); - goto next; - } - tmp = &root_table_header.ctl_entry; + goto next; for (;;) { head = list_entry(tmp, struct ctl_table_header, ctl_entry); + root = head->root; - if (!use_table(head)) + if (head->attached_to != dir || + !head->attached_by->procname || + !use_table(head)) goto next; - spin_unlock(&sysctl_lock); + return head; next: - root = head->root; tmp = tmp->next; header_list = lookup_header_list(root, namespaces); if (tmp != header_list) @@ -283,13 +277,53 @@ static struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, tmp = header_list->next; } out: - spin_unlock(&sysctl_lock); return NULL; } -static struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) +static void first_entry( + struct ctl_table_header *dir_head, struct ctl_table *dir, + struct ctl_table_header **phead, struct ctl_table **pentry) { - return __sysctl_head_next(current->nsproxy, prev); + struct ctl_table_header *head = dir_head; + struct ctl_table *entry = dir; + + spin_lock(&sysctl_lock); + if (entry->procname) { + use_table(head); + } else { + head = next_usable_entry(dir, &sysctl_table_root, + &sysctl_table_root.default_set.list); + if (head) + entry = head->attached_by; + } + spin_unlock(&sysctl_lock); + *phead = head; + *pentry = entry; +} + +static void next_entry(struct ctl_table *dir, + struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = *phead; + struct ctl_table *entry = *pentry; + + entry++; + if (!entry->procname) { + struct ctl_table_root *root = head->root; + struct list_head *tmp = &head->ctl_entry; + if (head->attached_to != dir) { + root = &sysctl_table_root; + tmp = &sysctl_table_root.default_set.list; + } + spin_lock(&sysctl_lock); + unuse_table(head); + head = next_usable_entry(dir, root, tmp); + spin_unlock(&sysctl_lock); + if (head) + entry = head->attached_by; + } + *phead = head; + *pentry = entry; } void register_sysctl_root(struct ctl_table_root *root) @@ -533,20 +567,17 @@ static int scan(struct ctl_table_header *head, ctl_table *table, unsigned long *pos, struct file *file, void *dirent, filldir_t filldir) { + int res; - for (; table->procname; table++, (*pos)++) { - int res; + if ((*pos)++ < file->f_pos) + return 0; - if (*pos < file->f_pos) - continue; + res = proc_sys_fill_cache(file, dirent, filldir, head, table); - res = proc_sys_fill_cache(file, dirent, filldir, head, table); - if (res) - return res; + if (res == 0) + file->f_pos = *pos; - file->f_pos = *pos + 1; - } - return 0; + return res; } static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) @@ -556,6 +587,7 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; struct ctl_table_header *h = NULL; + struct ctl_table *entry; unsigned long pos; int ret = -EINVAL; @@ -585,14 +617,8 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) } pos = 2; - ret = scan(head, table, &pos, filp, dirent, filldir); - if (ret) - goto out; - - for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { - if (h->attached_to != table) - continue; - ret = scan(h, h->attached_by, &pos, filp, dirent, filldir); + for (first_entry(head, table, &h, &entry); h; next_entry(table, &h, &entry)) { + ret = scan(h, entry, &pos, filp, dirent, filldir); if (ret) { sysctl_head_finish(h); break; -- cgit v1.2.3 From 9eb47c26f09e27506d343ef52e634b2a50ee21ef Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 22 Jan 2012 21:26:00 -0800 Subject: sysctl: Add a root pointer to ctl_table_set Add a ctl_table_root pointer to ctl_table set so it is easy to go from a ctl_table_set to a ctl_table_root. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d9c3ae6afe4..65c13dddcea 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -45,6 +45,7 @@ static struct ctl_table_header root_table_header = { static struct ctl_table_root sysctl_table_root = { .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), + .default_set.root = &sysctl_table_root, }; static DEFINE_SPINLOCK(sysctl_lock); @@ -1348,9 +1349,11 @@ void unregister_sysctl_table(struct ctl_table_header * header) EXPORT_SYMBOL(unregister_sysctl_table); void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)) { INIT_LIST_HEAD(&p->list); + p->root = root; p->is_seen = is_seen; } -- cgit v1.2.3 From 7ec66d06362da7684a4948c4c2bf1f8546425df4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 29 Dec 2011 08:24:29 -0800 Subject: sysctl: Stop requiring explicit management of sysctl directories Simplify the code and the sysctl semantics by autogenerating sysctl directories when a sysctl table is registered that needs the directories and autodeleting the directories when there are no more sysctl tables registered that need them. Autogenerating directories keeps sysctl tables from depending on each other, removing all of the arcane register/unregister ordering constraints and makes it impossible to get the order wrong when reigsering and unregistering sysctl tables. Autogenerating directories yields one unique entity that dentries can point to, retaining the current effective use of the dcache. Add struct ctl_dir as the type of these new autogenerated directories. The attached_by and attached_to fields in ctl_table_header are removed as they are no longer needed. The child field in ctl_table is no longer needed by the core of the sysctl code. ctl_table.child can be removed once all of the existing users have been updated. Benchmark before: make-dummies 0 999 -> 0.7s rmmod dummy -> 0.07s make-dummies 0 9999 -> 1m10s rmmod dummy -> 0.4s Benchmark after: make-dummies 0 999 -> 0.44s rmmod dummy -> 0.065s make-dummies 0 9999 -> 1m36s rmmod dummy -> 0.4s Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 342 +++++++++++++++++++++----------------------------- 1 file changed, 143 insertions(+), 199 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 65c13dddcea..3c0767d5a55 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -28,28 +28,31 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll) static struct ctl_table root_table[] = { { .procname = "", - .mode = S_IRUGO|S_IXUGO, - .child = &root_table[1], + .mode = S_IFDIR|S_IRUGO|S_IXUGO, }, { } }; static struct ctl_table_root sysctl_table_root; -static struct ctl_table_header root_table_header = { - {{.count = 1, - .nreg = 1, - .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, - .root = &sysctl_table_root, - .set = &sysctl_table_root.default_set, +static struct ctl_dir sysctl_root_dir = { + .header = { + {{.count = 1, + .nreg = 1, + .ctl_table = root_table, + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, + .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, + }, }; static struct ctl_table_root sysctl_table_root = { .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), + .default_set.list = LIST_HEAD_INIT(sysctl_root_dir.header.ctl_entry), .default_set.root = &sysctl_table_root, }; static DEFINE_SPINLOCK(sysctl_lock); +static void drop_sysctl_table(struct ctl_table_header *header); + static int namecmp(const char *name1, int len1, const char *name2, int len2) { int minlen; @@ -66,29 +69,18 @@ static int namecmp(const char *name1, int len1, const char *name2, int len2) } static struct ctl_table *find_entry(struct ctl_table_header **phead, - struct ctl_table_set *set, - struct ctl_table_header *dir_head, struct ctl_table *dir, + struct ctl_table_set *set, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; struct ctl_table *entry; - if (dir_head->set == set) { - for (entry = dir; entry->procname; entry++) { - const char *procname = entry->procname; - if (namecmp(procname, strlen(procname), name, namelen) == 0) { - *phead = dir_head; - return entry; - } - } - } - list_for_each_entry(head, &set->list, ctl_entry) { if (head->unregistering) continue; - if (head->attached_to != dir) + if (head->parent != dir) continue; - for (entry = head->attached_by; entry->procname; entry++) { + for (entry = head->ctl_table; entry->procname; entry++) { const char *procname = entry->procname; if (namecmp(procname, strlen(procname), name, namelen) == 0) { *phead = head; @@ -103,6 +95,7 @@ static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, struct ctl_table *table) { + head->ctl_table = table; head->ctl_table_arg = table; INIT_LIST_HEAD(&head->ctl_entry); head->used = 0; @@ -119,9 +112,10 @@ static void erase_header(struct ctl_table_header *head) list_del_init(&head->ctl_entry); } -static void insert_header(struct ctl_table_header *header) +static void insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { - header->parent->count++; + header->parent = dir; + header->parent->header.nreg++; list_add_tail(&header->ctl_entry, &header->set->list); } @@ -219,8 +213,7 @@ lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) } static struct ctl_table *lookup_entry(struct ctl_table_header **phead, - struct ctl_table_header *dir_head, - struct ctl_table *dir, + struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; @@ -232,7 +225,7 @@ static struct ctl_table *lookup_entry(struct ctl_table_header **phead, root = &sysctl_table_root; do { set = lookup_header_set(root, current->nsproxy); - entry = find_entry(&head, set, dir_head, dir, name, namelen); + entry = find_entry(&head, set, dir, name, namelen); if (entry && use_table(head)) *phead = head; else @@ -244,7 +237,7 @@ static struct ctl_table *lookup_entry(struct ctl_table_header **phead, return entry; } -static struct ctl_table_header *next_usable_entry(struct ctl_table *dir, +static struct ctl_table_header *next_usable_entry(struct ctl_dir *dir, struct ctl_table_root *root, struct list_head *tmp) { struct nsproxy *namespaces = current->nsproxy; @@ -256,8 +249,8 @@ static struct ctl_table_header *next_usable_entry(struct ctl_table *dir, head = list_entry(tmp, struct ctl_table_header, ctl_entry); root = head->root; - if (head->attached_to != dir || - !head->attached_by->procname || + if (head->parent != dir || + !head->ctl_table->procname || !use_table(head)) goto next; @@ -281,47 +274,35 @@ out: return NULL; } -static void first_entry( - struct ctl_table_header *dir_head, struct ctl_table *dir, +static void first_entry(struct ctl_dir *dir, struct ctl_table_header **phead, struct ctl_table **pentry) { - struct ctl_table_header *head = dir_head; - struct ctl_table *entry = dir; + struct ctl_table_header *head; + struct ctl_table *entry = NULL; spin_lock(&sysctl_lock); - if (entry->procname) { - use_table(head); - } else { - head = next_usable_entry(dir, &sysctl_table_root, - &sysctl_table_root.default_set.list); - if (head) - entry = head->attached_by; - } + head = next_usable_entry(dir, &sysctl_table_root, + &sysctl_table_root.default_set.list); spin_unlock(&sysctl_lock); + if (head) + entry = head->ctl_table; *phead = head; *pentry = entry; } -static void next_entry(struct ctl_table *dir, - struct ctl_table_header **phead, struct ctl_table **pentry) +static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) { struct ctl_table_header *head = *phead; struct ctl_table *entry = *pentry; entry++; if (!entry->procname) { - struct ctl_table_root *root = head->root; - struct list_head *tmp = &head->ctl_entry; - if (head->attached_to != dir) { - root = &sysctl_table_root; - tmp = &sysctl_table_root.default_set.list; - } spin_lock(&sysctl_lock); unuse_table(head); - head = next_usable_entry(dir, root, tmp); + head = next_usable_entry(head->parent, head->root, &head->ctl_entry); spin_unlock(&sysctl_lock); if (head) - entry = head->attached_by; + entry = head->ctl_table; } *phead = head; *pentry = entry; @@ -381,7 +362,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_mode = table->mode; - if (!table->child) { + if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; inode->i_op = &proc_sys_inode_operations; inode->i_fop = &proc_sys_file_operations; @@ -398,7 +379,7 @@ static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; if (!head) - head = &root_table_header; + head = &sysctl_root_dir.header; return sysctl_head_grab(head); } @@ -406,24 +387,19 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct ctl_table_header *head = grab_header(dir); - struct ctl_table *table = PROC_I(dir)->sysctl_entry; struct ctl_table_header *h = NULL; struct qstr *name = &dentry->d_name; struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); + struct ctl_dir *ctl_dir; if (IS_ERR(head)) return ERR_CAST(head); - if (table && !table->child) { - WARN_ON(1); - goto out; - } + ctl_dir = container_of(head, struct ctl_dir, header); - table = table ? table->child : &head->ctl_table[1]; - - p = lookup_entry(&h, head, table, name->name, name->len); + p = lookup_entry(&h, ctl_dir, name->name, name->len); if (!p) goto out; @@ -586,21 +562,16 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; struct ctl_table_header *head = grab_header(inode); - struct ctl_table *table = PROC_I(inode)->sysctl_entry; struct ctl_table_header *h = NULL; struct ctl_table *entry; + struct ctl_dir *ctl_dir; unsigned long pos; int ret = -EINVAL; if (IS_ERR(head)) return PTR_ERR(head); - if (table && !table->child) { - WARN_ON(1); - goto out; - } - - table = table ? table->child : &head->ctl_table[1]; + ctl_dir = container_of(head, struct ctl_dir, header); ret = 0; /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ @@ -618,7 +589,7 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) } pos = 2; - for (first_entry(head, table, &h, &entry); h; next_entry(table, &h, &entry)) { + for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { ret = scan(h, entry, &pos, filp, dirent, filldir); if (ret) { sysctl_head_finish(h); @@ -779,52 +750,86 @@ static const struct dentry_operations proc_sys_dentry_operations = { .d_compare = proc_sys_compare, }; -static struct ctl_table *is_branch_in(struct ctl_table *branch, - struct ctl_table *table) +static struct ctl_dir *find_subdir(struct ctl_table_set *set, struct ctl_dir *dir, + const char *name, int namelen) { - struct ctl_table *p; - const char *s = branch->procname; + struct ctl_table_header *head; + struct ctl_table *entry; - /* branch should have named subdirectory as its first element */ - if (!s || !branch->child) - return NULL; + entry = find_entry(&head, set, dir, name, namelen); + if (!entry) + return ERR_PTR(-ENOENT); + if (S_ISDIR(entry->mode)) + return container_of(head, struct ctl_dir, header); + return ERR_PTR(-ENOTDIR); +} + +static struct ctl_dir *new_dir(struct ctl_table_set *set, + const char *name, int namelen) +{ + struct ctl_table *table; + struct ctl_dir *new; + char *new_name; - /* ... and nothing else */ - if (branch[1].procname) + new = kzalloc(sizeof(*new) + sizeof(struct ctl_table)*2 + + namelen + 1, GFP_KERNEL); + if (!new) return NULL; - /* table should contain subdirectory with the same name */ - for (p = table; p->procname; p++) { - if (!p->child) - continue; - if (p->procname && strcmp(p->procname, s) == 0) - return p; - } - return NULL; + table = (struct ctl_table *)(new + 1); + new_name = (char *)(table + 2); + memcpy(new_name, name, namelen); + new_name[namelen] = '\0'; + table[0].procname = new_name; + table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; + init_header(&new->header, set->root, set, table); + + return new; } -/* see if attaching q to p would be an improvement */ -static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) +static struct ctl_dir *get_subdir(struct ctl_table_set *set, + struct ctl_dir *dir, const char *name, int namelen) { - struct ctl_table *to = p->ctl_table, *by = q->ctl_table; - struct ctl_table *next; - int is_better = 0; - int not_in_parent = !p->attached_by; - - while ((next = is_branch_in(by, to)) != NULL) { - if (by == q->attached_by) - is_better = 1; - if (to == p->attached_by) - not_in_parent = 1; - by = by->child; - to = next->child; - } + struct ctl_dir *subdir, *new = NULL; - if (is_better && not_in_parent) { - q->attached_by = by; - q->attached_to = to; - q->parent = p; + spin_lock(&sysctl_lock); + subdir = find_subdir(dir->header.set, dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if ((PTR_ERR(subdir) == -ENOENT) && set != dir->header.set) + subdir = find_subdir(set, dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + spin_unlock(&sysctl_lock); + new = new_dir(set, name, namelen); + spin_lock(&sysctl_lock); + subdir = ERR_PTR(-ENOMEM); + if (!new) + goto failed; + + subdir = find_subdir(set, dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + insert_header(dir, &new->header); + subdir = new; +found: + subdir->header.nreg++; +failed: + if (unlikely(IS_ERR(subdir))) { + printk(KERN_ERR "sysctl could not get directory: %*.*s %ld\n", + namelen, namelen, name, PTR_ERR(subdir)); } + drop_sysctl_table(&dir->header); + if (new) + drop_sysctl_table(&new->header); + spin_unlock(&sysctl_lock); + return subdir; } static int sysctl_check_table_dups(const char *path, struct ctl_table *old, @@ -846,24 +851,14 @@ static int sysctl_check_table_dups(const char *path, struct ctl_table *old, } static int sysctl_check_dups(struct nsproxy *namespaces, - struct ctl_table_header *header, + struct ctl_dir *dir, const char *path, struct ctl_table *table) { struct ctl_table_root *root; struct ctl_table_set *set; - struct ctl_table_header *dir_head, *head; - struct ctl_table *dir_table; + struct ctl_table_header *head; int error = 0; - /* No dups if we are the only member of our directory */ - if (header->attached_by != table) - return 0; - - dir_head = header->parent; - dir_table = header->attached_to; - - error = sysctl_check_table_dups(path, dir_table, table); - root = &sysctl_table_root; do { set = lookup_header_set(root, namespaces); @@ -871,9 +866,9 @@ static int sysctl_check_dups(struct nsproxy *namespaces, list_for_each_entry(head, &set->list, ctl_entry) { if (head->unregistering) continue; - if (head->attached_to != dir_table) + if (head->parent != dir) continue; - error = sysctl_check_table_dups(path, head->attached_by, + error = sysctl_check_table_dups(path, head->ctl_table, table); } root = list_entry(root->root_list.next, @@ -977,47 +972,25 @@ struct ctl_table_header *__register_sysctl_table( const char *path, struct ctl_table *table) { struct ctl_table_header *header; - struct ctl_table *new, **prevp; const char *name, *nextname; - unsigned int npath = 0; struct ctl_table_set *set; - size_t path_bytes = 0; - char *new_name; - - /* Count the path components */ - for (name = path; name; name = nextname) { - int namelen; - nextname = strchr(name, '/'); - if (nextname) { - namelen = nextname - name; - nextname++; - } else { - namelen = strlen(name); - } - if (namelen == 0) - continue; - path_bytes += namelen + 1; - npath++; - } + struct ctl_dir *dir; - /* - * For each path component, allocate a 2-element ctl_table array. - * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (procname == 0). - * - * We allocate everything in one go so that we don't have to - * worry about freeing additional memory in unregister_sysctl_table. - */ - header = kzalloc(sizeof(struct ctl_table_header) + path_bytes + - (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); + header = kzalloc(sizeof(struct ctl_table_header), GFP_KERNEL); if (!header) return NULL; - new = (struct ctl_table *) (header + 1); - new_name = (char *)(new + (2 * npath)); + init_header(header, root, NULL, table); + if (sysctl_check_table(path, table)) + goto fail; + + spin_lock(&sysctl_lock); + header->set = set = lookup_header_set(root, namespaces); + dir = &sysctl_root_dir; + dir->header.nreg++; + spin_unlock(&sysctl_lock); - /* Now connect the dots */ - prevp = &header->ctl_table; + /* Find the directory for the ctl_table */ for (name = path; name; name = nextname) { int namelen; nextname = strchr(name, '/'); @@ -1029,51 +1002,21 @@ struct ctl_table_header *__register_sysctl_table( } if (namelen == 0) continue; - memcpy(new_name, name, namelen); - new_name[namelen] = '\0'; - - new->procname = new_name; - new->mode = 0555; - - *prevp = new; - prevp = &new->child; - new += 2; - new_name += namelen + 1; + dir = get_subdir(set, dir, name, namelen); + if (IS_ERR(dir)) + goto fail; } - *prevp = table; - - init_header(header, root, NULL, table); - if (sysctl_check_table(path, table)) - goto fail; - spin_lock(&sysctl_lock); - header->set = lookup_header_set(root, namespaces); - header->attached_by = header->ctl_table; - header->attached_to = &root_table[1]; - header->parent = &root_table_header; - set = header->set; - root = header->root; - for (;;) { - struct ctl_table_header *p; - list_for_each_entry(p, &set->list, ctl_entry) { - if (p->unregistering) - continue; - try_attach(p, header); - } - if (root == &sysctl_table_root) - break; - root = list_entry(root->root_list.prev, - struct ctl_table_root, root_list); - set = lookup_header_set(root, namespaces); - } - if (sysctl_check_dups(namespaces, header, path, table)) - goto fail_locked; - insert_header(header); + if (sysctl_check_dups(namespaces, dir, path, table)) + goto fail_put_dir_locked; + insert_header(dir, header); + drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); return header; -fail_locked: +fail_put_dir_locked: + drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); fail: kfree(header); @@ -1299,16 +1242,17 @@ EXPORT_SYMBOL(register_sysctl_table); static void drop_sysctl_table(struct ctl_table_header *header) { + struct ctl_dir *parent = header->parent; + if (--header->nreg) return; start_unregistering(header); - if (!--header->parent->count) { - WARN_ON(1); - kfree_rcu(header->parent, rcu); - } if (!--header->count) kfree_rcu(header, rcu); + + if (parent) + drop_sysctl_table(&parent->header); } /** -- cgit v1.2.3 From 6980128fe1b834c92a85e556ca8198030f0d8d01 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jan 2012 20:09:45 -0800 Subject: sysctl: Add sysctl_print_dir and use it in get_subdir When there are errors it is very nice to know the full sysctl path. Add a simple function that computes the sysctl path and prints it out. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 3c0767d5a55..a78556514a8 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -53,6 +53,13 @@ static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); +static void sysctl_print_dir(struct ctl_dir *dir) +{ + if (dir->header.parent) + sysctl_print_dir(dir->header.parent); + printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname); +} + static int namecmp(const char *name1, int len1, const char *name2, int len2) { int minlen; @@ -822,7 +829,9 @@ found: subdir->header.nreg++; failed: if (unlikely(IS_ERR(subdir))) { - printk(KERN_ERR "sysctl could not get directory: %*.*s %ld\n", + printk(KERN_ERR "sysctl could not get directory: "); + sysctl_print_dir(dir); + printk(KERN_CONT "/%*.*s %ld\n", namelen, namelen, name, PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); -- cgit v1.2.3 From 0e47c99d7fe25e0f3907d9f3401079169d904891 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 7 Jan 2012 23:24:30 -0800 Subject: sysctl: Replace root_list with links between sysctl_table_sets. Piecing together directories by looking first in one directory tree, than in another directory tree and finally in a third directory tree makes it hard to verify that some directory entries are not multiply defined and makes it hard to create efficient implementations the sysctl filesystem. Replace the sysctl wide list of roots with autogenerated links from the core sysctl directory tree to the other sysctl directory trees. This simplifies sysctl directory reading and lookups as now only entries in a single sysctl directory tree need to be considered. Benchmark before: make-dummies 0 999 -> 0.44s rmmod dummy -> 0.065s make-dummies 0 9999 -> 1m36s rmmod dummy -> 0.4s Benchmark after: make-dummies 0 999 -> 0.63s rmmod dummy -> 0.12s make-dummies 0 9999 -> 2m35s rmmod dummy -> 18s The slowdown is caused by the lookups used in insert_headers and put_links to see if we need to add links or remove links. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 397 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 295 insertions(+), 102 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a78556514a8..ec54a57c469 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -32,26 +32,26 @@ static struct ctl_table root_table[] = { }, { } }; -static struct ctl_table_root sysctl_table_root; -static struct ctl_dir sysctl_root_dir = { - .header = { +static struct ctl_table_root sysctl_table_root = { + .default_set.list = LIST_HEAD_INIT(sysctl_table_root.default_set.dir.header.ctl_entry), + .default_set.dir.header = { {{.count = 1, .nreg = 1, .ctl_table = root_table, .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, + .ctl_table_arg = root_table, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }, }; -static struct ctl_table_root sysctl_table_root = { - .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .default_set.list = LIST_HEAD_INIT(sysctl_root_dir.header.ctl_entry), - .default_set.root = &sysctl_table_root, -}; static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry, struct nsproxy *namespaces); +static int insert_links(struct ctl_table_header *head); +static void put_links(struct ctl_table_header *header); static void sysctl_print_dir(struct ctl_dir *dir) { @@ -76,9 +76,9 @@ static int namecmp(const char *name1, int len1, const char *name2, int len2) } static struct ctl_table *find_entry(struct ctl_table_header **phead, - struct ctl_table_set *set, struct ctl_dir *dir, - const char *name, int namelen) + struct ctl_dir *dir, const char *name, int namelen) { + struct ctl_table_set *set = dir->header.set; struct ctl_table_header *head; struct ctl_table *entry; @@ -119,11 +119,21 @@ static void erase_header(struct ctl_table_header *head) list_del_init(&head->ctl_entry); } -static void insert_header(struct ctl_dir *dir, struct ctl_table_header *header) +static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { + int err; + + dir->header.nreg++; header->parent = dir; - header->parent->header.nreg++; + err = insert_links(header); + if (err) + goto fail_links; list_add_tail(&header->ctl_entry, &header->set->list); + return 0; +fail_links: + header->parent = NULL; + drop_sysctl_table(&dir->header); + return err; } /* called under sysctl_lock */ @@ -212,72 +222,39 @@ lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) return set; } -static struct list_head * -lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = lookup_header_set(root, namespaces); - return &set->list; -} - static struct ctl_table *lookup_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; struct ctl_table *entry; - struct ctl_table_root *root; - struct ctl_table_set *set; spin_lock(&sysctl_lock); - root = &sysctl_table_root; - do { - set = lookup_header_set(root, current->nsproxy); - entry = find_entry(&head, set, dir, name, namelen); - if (entry && use_table(head)) - *phead = head; - else - entry = NULL; - root = list_entry(root->root_list.next, - struct ctl_table_root, root_list); - } while (!entry && root != &sysctl_table_root); + entry = find_entry(&head, dir, name, namelen); + if (entry && use_table(head)) + *phead = head; + else + entry = NULL; spin_unlock(&sysctl_lock); return entry; } static struct ctl_table_header *next_usable_entry(struct ctl_dir *dir, - struct ctl_table_root *root, struct list_head *tmp) + struct list_head *tmp) { - struct nsproxy *namespaces = current->nsproxy; - struct list_head *header_list; + struct ctl_table_set *set = dir->header.set; struct ctl_table_header *head; - goto next; - for (;;) { + for (tmp = tmp->next; tmp != &set->list; tmp = tmp->next) { head = list_entry(tmp, struct ctl_table_header, ctl_entry); - root = head->root; if (head->parent != dir || !head->ctl_table->procname || !use_table(head)) - goto next; - - return head; - next: - tmp = tmp->next; - header_list = lookup_header_list(root, namespaces); - if (tmp != header_list) continue; - do { - root = list_entry(root->root_list.next, - struct ctl_table_root, root_list); - if (root == &sysctl_table_root) - goto out; - header_list = lookup_header_list(root, namespaces); - } while (list_empty(header_list)); - tmp = header_list->next; + return head; } -out: return NULL; } @@ -288,8 +265,7 @@ static void first_entry(struct ctl_dir *dir, struct ctl_table *entry = NULL; spin_lock(&sysctl_lock); - head = next_usable_entry(dir, &sysctl_table_root, - &sysctl_table_root.default_set.list); + head = next_usable_entry(dir, &dir->header.set->list); spin_unlock(&sysctl_lock); if (head) entry = head->ctl_table; @@ -306,7 +282,7 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr if (!entry->procname) { spin_lock(&sysctl_lock); unuse_table(head); - head = next_usable_entry(head->parent, head->root, &head->ctl_entry); + head = next_usable_entry(head->parent, &head->ctl_entry); spin_unlock(&sysctl_lock); if (head) entry = head->ctl_table; @@ -317,9 +293,6 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr void register_sysctl_root(struct ctl_table_root *root) { - spin_lock(&sysctl_lock); - list_add_tail(&root->root_list, &sysctl_table_root.root_list); - spin_unlock(&sysctl_lock); } /* @@ -386,7 +359,7 @@ static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; if (!head) - head = &sysctl_root_dir.header; + head = &sysctl_table_root.default_set.dir.header; return sysctl_head_grab(head); } @@ -400,6 +373,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); struct ctl_dir *ctl_dir; + int ret; if (IS_ERR(head)) return ERR_CAST(head); @@ -410,6 +384,11 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, if (!p) goto out; + ret = sysctl_follow_link(&h, &p, current->nsproxy); + err = ERR_PTR(ret); + if (ret) + goto out; + err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); if (h) @@ -547,6 +526,25 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent, return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); } +static int proc_sys_link_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, + struct ctl_table_header *head, + struct ctl_table *table) +{ + int err, ret = 0; + head = sysctl_head_grab(head); + + /* It is not an error if we can not follow the link ignore it */ + err = sysctl_follow_link(&head, &table, current->nsproxy); + if (err) + goto out; + + ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); +out: + sysctl_head_finish(head); + return ret; +} + static int scan(struct ctl_table_header *head, ctl_table *table, unsigned long *pos, struct file *file, void *dirent, filldir_t filldir) @@ -556,7 +554,10 @@ static int scan(struct ctl_table_header *head, ctl_table *table, if ((*pos)++ < file->f_pos) return 0; - res = proc_sys_fill_cache(file, dirent, filldir, head, table); + if (unlikely(S_ISLNK(table->mode))) + res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); + else + res = proc_sys_fill_cache(file, dirent, filldir, head, table); if (res == 0) file->f_pos = *pos; @@ -757,13 +758,13 @@ static const struct dentry_operations proc_sys_dentry_operations = { .d_compare = proc_sys_compare, }; -static struct ctl_dir *find_subdir(struct ctl_table_set *set, struct ctl_dir *dir, - const char *name, int namelen) +static struct ctl_dir *find_subdir(struct ctl_dir *dir, + const char *name, int namelen) { struct ctl_table_header *head; struct ctl_table *entry; - entry = find_entry(&head, set, dir, name, namelen); + entry = find_entry(&head, dir, name, namelen); if (!entry) return ERR_PTR(-ENOENT); if (S_ISDIR(entry->mode)) @@ -772,7 +773,7 @@ static struct ctl_dir *find_subdir(struct ctl_table_set *set, struct ctl_dir *di } static struct ctl_dir *new_dir(struct ctl_table_set *set, - const char *name, int namelen) + const char *name, int namelen) { struct ctl_table *table; struct ctl_dir *new; @@ -789,22 +790,19 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, new_name[namelen] = '\0'; table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; - init_header(&new->header, set->root, set, table); + init_header(&new->header, set->dir.header.root, set, table); return new; } -static struct ctl_dir *get_subdir(struct ctl_table_set *set, - struct ctl_dir *dir, const char *name, int namelen) +static struct ctl_dir *get_subdir(struct ctl_dir *dir, + const char *name, int namelen) { + struct ctl_table_set *set = dir->header.set; struct ctl_dir *subdir, *new = NULL; spin_lock(&sysctl_lock); - subdir = find_subdir(dir->header.set, dir, name, namelen); - if (!IS_ERR(subdir)) - goto found; - if ((PTR_ERR(subdir) == -ENOENT) && set != dir->header.set) - subdir = find_subdir(set, dir, name, namelen); + subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) @@ -817,13 +815,14 @@ static struct ctl_dir *get_subdir(struct ctl_table_set *set, if (!new) goto failed; - subdir = find_subdir(set, dir, name, namelen); + subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; - insert_header(dir, &new->header); + if (insert_header(dir, &new->header)) + goto failed; subdir = new; found: subdir->header.nreg++; @@ -841,6 +840,57 @@ failed: return subdir; } +static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) +{ + struct ctl_dir *parent; + const char *procname; + if (!dir->header.parent) + return &set->dir; + parent = xlate_dir(set, dir->header.parent); + if (IS_ERR(parent)) + return parent; + procname = dir->header.ctl_table[0].procname; + return find_subdir(parent, procname, strlen(procname)); +} + +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry, struct nsproxy *namespaces) +{ + struct ctl_table_header *head; + struct ctl_table_root *root; + struct ctl_table_set *set; + struct ctl_table *entry; + struct ctl_dir *dir; + int ret; + + /* Get out quickly if not a link */ + if (!S_ISLNK((*pentry)->mode)) + return 0; + + ret = 0; + spin_lock(&sysctl_lock); + root = (*pentry)->data; + set = lookup_header_set(root, namespaces); + dir = xlate_dir(set, (*phead)->parent); + if (IS_ERR(dir)) + ret = PTR_ERR(dir); + else { + const char *procname = (*pentry)->procname; + head = NULL; + entry = find_entry(&head, dir, procname, strlen(procname)); + ret = -ENOENT; + if (entry && use_table(head)) { + unuse_table(*phead); + *phead = head; + *pentry = entry; + ret = 0; + } + } + + spin_unlock(&sysctl_lock); + return ret; +} + static int sysctl_check_table_dups(const char *path, struct ctl_table *old, struct ctl_table *table) { @@ -859,30 +909,21 @@ static int sysctl_check_table_dups(const char *path, struct ctl_table *old, return error; } -static int sysctl_check_dups(struct nsproxy *namespaces, - struct ctl_dir *dir, +static int sysctl_check_dups(struct ctl_dir *dir, const char *path, struct ctl_table *table) { - struct ctl_table_root *root; struct ctl_table_set *set; struct ctl_table_header *head; int error = 0; - root = &sysctl_table_root; - do { - set = lookup_header_set(root, namespaces); - - list_for_each_entry(head, &set->list, ctl_entry) { - if (head->unregistering) - continue; - if (head->parent != dir) - continue; - error = sysctl_check_table_dups(path, head->ctl_table, - table); - } - root = list_entry(root->root_list.next, - struct ctl_table_root, root_list); - } while (root != &sysctl_table_root); + set = dir->header.set; + list_for_each_entry(head, &set->list, ctl_entry) { + if (head->unregistering) + continue; + if (head->parent != dir) + continue; + error = sysctl_check_table_dups(path, head->ctl_table, table); + } return error; } @@ -932,6 +973,115 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) return err; } +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, + struct ctl_table_root *link_root) +{ + struct ctl_table *link_table, *entry, *link; + struct ctl_table_header *links; + char *link_name; + int nr_entries, name_bytes; + + name_bytes = 0; + nr_entries = 0; + for (entry = table; entry->procname; entry++) { + nr_entries++; + name_bytes += strlen(entry->procname) + 1; + } + + links = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_table)*(nr_entries + 1) + + name_bytes, + GFP_KERNEL); + + if (!links) + return NULL; + + link_table = (struct ctl_table *)(links + 1); + link_name = (char *)&link_table[nr_entries + 1]; + + for (link = link_table, entry = table; entry->procname; link++, entry++) { + int len = strlen(entry->procname) + 1; + memcpy(link_name, entry->procname, len); + link->procname = link_name; + link->mode = S_IFLNK|S_IRWXUGO; + link->data = link_root; + link_name += len; + } + init_header(links, dir->header.root, dir->header.set, link_table); + links->nreg = nr_entries; + + return links; +} + +static bool get_links(struct ctl_dir *dir, + struct ctl_table *table, struct ctl_table_root *link_root) +{ + struct ctl_table_header *head; + struct ctl_table *entry, *link; + + /* Are there links available for every entry in table? */ + for (entry = table; entry->procname; entry++) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + if (!link) + return false; + if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) + continue; + if (S_ISLNK(link->mode) && (link->data == link_root)) + continue; + return false; + } + + /* The checks passed. Increase the registration count on the links */ + for (entry = table; entry->procname; entry++) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + head->nreg++; + } + return true; +} + +static int insert_links(struct ctl_table_header *head) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_dir *core_parent = NULL; + struct ctl_table_header *links; + int err; + + if (head->set == root_set) + return 0; + + core_parent = xlate_dir(root_set, head->parent); + if (IS_ERR(core_parent)) + return 0; + + if (get_links(core_parent, head->ctl_table, head->root)) + return 0; + + core_parent->header.nreg++; + spin_unlock(&sysctl_lock); + + links = new_links(core_parent, head->ctl_table, head->root); + + spin_lock(&sysctl_lock); + err = -ENOMEM; + if (!links) + goto out; + + err = 0; + if (get_links(core_parent, head->ctl_table, head->root)) { + kfree(links); + goto out; + } + + err = insert_header(core_parent, links); + if (err) + kfree(links); +out: + drop_sysctl_table(&core_parent->header); + return err; +} + /** * __register_sysctl_table - register a leaf sysctl table * @root: List of sysctl headers to register on @@ -980,6 +1130,7 @@ struct ctl_table_header *__register_sysctl_table( struct nsproxy *namespaces, const char *path, struct ctl_table *table) { + struct ctl_table_header *links = NULL; struct ctl_table_header *header; const char *name, *nextname; struct ctl_table_set *set; @@ -995,7 +1146,7 @@ struct ctl_table_header *__register_sysctl_table( spin_lock(&sysctl_lock); header->set = set = lookup_header_set(root, namespaces); - dir = &sysctl_root_dir; + dir = &set->dir; dir->header.nreg++; spin_unlock(&sysctl_lock); @@ -1012,22 +1163,28 @@ struct ctl_table_header *__register_sysctl_table( if (namelen == 0) continue; - dir = get_subdir(set, dir, name, namelen); + dir = get_subdir(dir, name, namelen); if (IS_ERR(dir)) goto fail; } + spin_lock(&sysctl_lock); - if (sysctl_check_dups(namespaces, dir, path, table)) + if (sysctl_check_dups(dir, path, table)) + goto fail_put_dir_locked; + + if (insert_header(dir, header)) goto fail_put_dir_locked; - insert_header(dir, header); + drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); return header; + fail_put_dir_locked: drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); fail: + kfree(links); kfree(header); dump_stack(); return NULL; @@ -1249,6 +1406,40 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) } EXPORT_SYMBOL(register_sysctl_table); +static void put_links(struct ctl_table_header *header) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_table_root *root = header->root; + struct ctl_dir *parent = header->parent; + struct ctl_dir *core_parent; + struct ctl_table *entry; + + if (header->set == root_set) + return; + + core_parent = xlate_dir(root_set, parent); + if (IS_ERR(core_parent)) + return; + + for (entry = header->ctl_table; entry->procname; entry++) { + struct ctl_table_header *link_head; + struct ctl_table *link; + const char *name = entry->procname; + + link = find_entry(&link_head, core_parent, name, strlen(name)); + if (link && + ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || + (S_ISLNK(link->mode) && (link->data == root)))) { + drop_sysctl_table(link_head); + } + else { + printk(KERN_ERR "sysctl link missing during unregister: "); + sysctl_print_dir(parent); + printk(KERN_CONT "/%s\n", name); + } + } +} + static void drop_sysctl_table(struct ctl_table_header *header) { struct ctl_dir *parent = header->parent; @@ -1256,6 +1447,7 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; + put_links(header); start_unregistering(header); if (!--header->count) kfree_rcu(header, rcu); @@ -1301,13 +1493,14 @@ void unregister_sysctl_table(struct ctl_table_header * header) } EXPORT_SYMBOL(unregister_sysctl_table); -void setup_sysctl_set(struct ctl_table_set *p, +void setup_sysctl_set(struct ctl_table_set *set, struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)) { - INIT_LIST_HEAD(&p->list); - p->root = root; - p->is_seen = is_seen; + memset(set, sizeof(*set), 0); + INIT_LIST_HEAD(&set->list); + set->is_seen = is_seen; + init_header(&set->dir.header, root, set, root_table); } void retire_sysctl_set(struct ctl_table_set *set) -- cgit v1.2.3 From 60a47a2e823cbe6b609346bffff61a00c0c76470 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 8 Jan 2012 00:02:37 -0800 Subject: sysctl: Modify __register_sysctl_paths to take a set instead of a root and an nsproxy An nsproxy argument here has always been awkard and now the nsproxy argument is completely unnecessary so remove it, replacing it with the set we want the registered tables to show up in. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ec54a57c469..e0d3e7e59cb 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1084,8 +1084,7 @@ out: /** * __register_sysctl_table - register a leaf sysctl table - * @root: List of sysctl headers to register on - * @namespaces: Data to compute which lists of sysctl entries are visible + * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. * @table: the top-level table structure * @@ -1126,26 +1125,24 @@ out: * to the table header on success. */ struct ctl_table_header *__register_sysctl_table( - struct ctl_table_root *root, - struct nsproxy *namespaces, + struct ctl_table_set *set, const char *path, struct ctl_table *table) { + struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *links = NULL; struct ctl_table_header *header; const char *name, *nextname; - struct ctl_table_set *set; struct ctl_dir *dir; header = kzalloc(sizeof(struct ctl_table_header), GFP_KERNEL); if (!header) return NULL; - init_header(header, root, NULL, table); + init_header(header, root, set, table); if (sysctl_check_table(path, table)) goto fail; spin_lock(&sysctl_lock); - header->set = set = lookup_header_set(root, namespaces); dir = &set->dir; dir->header.nreg++; spin_unlock(&sysctl_lock); @@ -1223,8 +1220,7 @@ static int count_subheaders(struct ctl_table *table) } static int register_leaf_sysctl_tables(const char *path, char *pos, - struct ctl_table_header ***subheader, - struct ctl_table_root *root, struct nsproxy *namespaces, + struct ctl_table_header ***subheader, struct ctl_table_set *set, struct ctl_table *table) { struct ctl_table *ctl_table_arg = NULL; @@ -1261,7 +1257,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos, /* Register everything except a directory full of subdirectories */ if (nr_files || !nr_dirs) { struct ctl_table_header *header; - header = __register_sysctl_table(root, namespaces, path, files); + header = __register_sysctl_table(set, path, files); if (!header) { kfree(ctl_table_arg); goto out; @@ -1286,7 +1282,7 @@ static int register_leaf_sysctl_tables(const char *path, char *pos, goto out; err = register_leaf_sysctl_tables(path, child_pos, subheader, - root, namespaces, entry->child); + set, entry->child); pos[0] = '\0'; if (err) goto out; @@ -1299,8 +1295,7 @@ out: /** * __register_sysctl_paths - register a sysctl table hierarchy - * @root: List of sysctl headers to register on - * @namespaces: Data to compute which lists of sysctl entries are visible + * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. * @table: the top-level table structure * @@ -1310,8 +1305,7 @@ out: * See __register_sysctl_table for more details. */ struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_root *root, - struct nsproxy *namespaces, + struct ctl_table_set *set, const struct ctl_path *path, struct ctl_table *table) { struct ctl_table *ctl_table_arg = table; @@ -1337,7 +1331,7 @@ struct ctl_table_header *__register_sysctl_paths( table = table->child; } if (nr_subheaders == 1) { - header = __register_sysctl_table(root, namespaces, new_path, table); + header = __register_sysctl_table(set, new_path, table); if (header) header->ctl_table_arg = ctl_table_arg; } else { @@ -1351,7 +1345,7 @@ struct ctl_table_header *__register_sysctl_paths( header->ctl_table_arg = ctl_table_arg; if (register_leaf_sysctl_tables(new_path, pos, &subheader, - root, namespaces, table)) + set, table)) goto err_register_leaves; } @@ -1384,7 +1378,7 @@ err_register_leaves: struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, struct ctl_table *table) { - return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, + return __register_sysctl_paths(&sysctl_table_root.default_set, path, table); } EXPORT_SYMBOL(register_sysctl_paths); -- cgit v1.2.3 From e54012cede6749528899f66a72312522a179d427 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 18 Jan 2012 22:57:15 -0800 Subject: sysctl: Move sysctl_check_dups into insert_header Simplify the callers of insert_header by removing explicit calls to check for duplicates and instead have insert_header do the work. This makes the code slightly more maintainable by enabling changes to data structures where the insertion of new entries without duplicate suppression is not possible. There is not always a convenient path string where insert_header is called so modify sysctl_check_dups to use sysctl_print_dir when printing the full path when a duplicate is discovered. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index e0d3e7e59cb..160d5781638 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -52,6 +52,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_table **pentry, struct nsproxy *namespaces); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); +static int sysctl_check_dups(struct ctl_dir *dir, struct ctl_table *table); static void sysctl_print_dir(struct ctl_dir *dir) { @@ -123,6 +124,10 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { int err; + err = sysctl_check_dups(dir, header->ctl_table); + if (err) + return err; + dir->header.nreg++; header->parent = dir; err = insert_links(header); @@ -891,7 +896,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead, return ret; } -static int sysctl_check_table_dups(const char *path, struct ctl_table *old, +static int sysctl_check_table_dups(struct ctl_dir *dir, struct ctl_table *old, struct ctl_table *table) { struct ctl_table *entry, *test; @@ -900,8 +905,9 @@ static int sysctl_check_table_dups(const char *path, struct ctl_table *old, for (entry = old; entry->procname; entry++) { for (test = table; test->procname; test++) { if (strcmp(entry->procname, test->procname) == 0) { - printk(KERN_ERR "sysctl duplicate entry: %s/%s\n", - path, test->procname); + printk(KERN_ERR "sysctl duplicate entry: "); + sysctl_print_dir(dir); + printk(KERN_CONT "/%s\n", test->procname); error = -EEXIST; } } @@ -909,8 +915,7 @@ static int sysctl_check_table_dups(const char *path, struct ctl_table *old, return error; } -static int sysctl_check_dups(struct ctl_dir *dir, - const char *path, struct ctl_table *table) +static int sysctl_check_dups(struct ctl_dir *dir, struct ctl_table *table) { struct ctl_table_set *set; struct ctl_table_header *head; @@ -922,7 +927,7 @@ static int sysctl_check_dups(struct ctl_dir *dir, continue; if (head->parent != dir) continue; - error = sysctl_check_table_dups(path, head->ctl_table, table); + error = sysctl_check_table_dups(dir, head->ctl_table, table); } return error; } @@ -1166,9 +1171,6 @@ struct ctl_table_header *__register_sysctl_table( } spin_lock(&sysctl_lock); - if (sysctl_check_dups(dir, path, table)) - goto fail_put_dir_locked; - if (insert_header(dir, header)) goto fail_put_dir_locked; -- cgit v1.2.3 From 9e3d47df35abd6430fed04fb40a76c7358b1e815 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 7 Jan 2012 23:45:12 -0800 Subject: sysctl: Make the header lists per directory. Slightly enhance efficiency and clarity of the code by making the header list per directory instead of per set. Benchmark before: make-dummies 0 999 -> 0.63s rmmod dummy -> 0.12s make-dummies 0 9999 -> 2m35s rmmod dummy -> 18s Benchmark after: make-dummies 0 999 -> 0.32s rmmod dummy -> 0.12s make-dummies 0 9999 -> 1m17s rmmod dummy -> 17s Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 160d5781638..e971ccccac4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -33,12 +33,12 @@ static struct ctl_table root_table[] = { { } }; static struct ctl_table_root sysctl_table_root = { - .default_set.list = LIST_HEAD_INIT(sysctl_table_root.default_set.dir.header.ctl_entry), + .default_set.dir.list = LIST_HEAD_INIT(sysctl_table_root.default_set.dir.list), .default_set.dir.header = { {{.count = 1, .nreg = 1, .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.dir.header.ctl_entry),}}, .ctl_table_arg = root_table, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, @@ -79,15 +79,12 @@ static int namecmp(const char *name1, int len1, const char *name2, int len2) static struct ctl_table *find_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { - struct ctl_table_set *set = dir->header.set; struct ctl_table_header *head; struct ctl_table *entry; - list_for_each_entry(head, &set->list, ctl_entry) { + list_for_each_entry(head, &dir->list, ctl_entry) { if (head->unregistering) continue; - if (head->parent != dir) - continue; for (entry = head->ctl_table; entry->procname; entry++) { const char *procname = entry->procname; if (namecmp(procname, strlen(procname), name, namelen) == 0) { @@ -133,7 +130,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) err = insert_links(header); if (err) goto fail_links; - list_add_tail(&header->ctl_entry, &header->set->list); + list_add_tail(&header->ctl_entry, &header->parent->list); return 0; fail_links: header->parent = NULL; @@ -247,14 +244,12 @@ static struct ctl_table *lookup_entry(struct ctl_table_header **phead, static struct ctl_table_header *next_usable_entry(struct ctl_dir *dir, struct list_head *tmp) { - struct ctl_table_set *set = dir->header.set; struct ctl_table_header *head; - for (tmp = tmp->next; tmp != &set->list; tmp = tmp->next) { + for (tmp = tmp->next; tmp != &dir->list; tmp = tmp->next) { head = list_entry(tmp, struct ctl_table_header, ctl_entry); - if (head->parent != dir || - !head->ctl_table->procname || + if (!head->ctl_table->procname || !use_table(head)) continue; @@ -270,7 +265,7 @@ static void first_entry(struct ctl_dir *dir, struct ctl_table *entry = NULL; spin_lock(&sysctl_lock); - head = next_usable_entry(dir, &dir->header.set->list); + head = next_usable_entry(dir, &dir->list); spin_unlock(&sysctl_lock); if (head) entry = head->ctl_table; @@ -793,6 +788,7 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, new_name = (char *)(table + 2); memcpy(new_name, name, namelen); new_name[namelen] = '\0'; + INIT_LIST_HEAD(&new->list); table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; init_header(&new->header, set->dir.header.root, set, table); @@ -917,12 +913,10 @@ static int sysctl_check_table_dups(struct ctl_dir *dir, struct ctl_table *old, static int sysctl_check_dups(struct ctl_dir *dir, struct ctl_table *table) { - struct ctl_table_set *set; struct ctl_table_header *head; int error = 0; - set = dir->header.set; - list_for_each_entry(head, &set->list, ctl_entry) { + list_for_each_entry(head, &dir->list, ctl_entry) { if (head->unregistering) continue; if (head->parent != dir) @@ -1494,14 +1488,14 @@ void setup_sysctl_set(struct ctl_table_set *set, int (*is_seen)(struct ctl_table_set *)) { memset(set, sizeof(*set), 0); - INIT_LIST_HEAD(&set->list); set->is_seen = is_seen; + INIT_LIST_HEAD(&set->dir.list); init_header(&set->dir.header, root, set, root_table); } void retire_sysctl_set(struct ctl_table_set *set) { - WARN_ON(!list_empty(&set->list)); + WARN_ON(!list_empty(&set->dir.list)); } int __init proc_sys_init(void) -- cgit v1.2.3 From ac13ac6f4c6c0504d2c927862216f4e422a2c0b5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 9 Jan 2012 17:24:30 -0800 Subject: sysctl: Index sysctl directories with rbtrees. One of the most important jobs of sysctl is to export network stack tunables. Several of those tunables are per network device. In several instances people are running with 1000+ network devices in there network stacks, which makes the simple per directory linked list in sysctl a scaling bottleneck. Replace O(N^2) sysctl insertion and lookup times with O(NlogN) by using an rbtree to index the sysctl directories. Benchmark before: make-dummies 0 999 -> 0.32s rmmod dummy -> 0.12s make-dummies 0 9999 -> 1m17s rmmod dummy -> 17s Benchmark after: make-dummies 0 999 -> 0.074s rmmod dummy -> 0.070s make-dummies 0 9999 -> 3.4s rmmod dummy -> 0.44s Benchmark after (without dev_snmp6): make-dummies 0 9999 -> 0.75s rmmod dummy -> 0.44s make-dummies 0 99999 -> 11s rmmod dummy -> 4.3s At 10,000 dummy devices the bottleneck becomes the time to add and remove the files under /proc/sys/net/dev_snmp6. I have commented out the code that adds and removes files under /proc/sys/net/dev_snmp6 and taken measurments of creating and destroying 100,000 dummies to verify the sysctl continues to scale. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 224 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 134 insertions(+), 90 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index e971ccccac4..05c393a5c53 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -33,12 +33,10 @@ static struct ctl_table root_table[] = { { } }; static struct ctl_table_root sysctl_table_root = { - .default_set.dir.list = LIST_HEAD_INIT(sysctl_table_root.default_set.dir.list), .default_set.dir.header = { {{.count = 1, .nreg = 1, - .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.dir.header.ctl_entry),}}, + .ctl_table = root_table }}, .ctl_table_arg = root_table, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, @@ -52,7 +50,6 @@ static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_table **pentry, struct nsproxy *namespaces); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); -static int sysctl_check_dups(struct ctl_dir *dir, struct ctl_table *table); static void sysctl_print_dir(struct ctl_dir *dir) { @@ -81,28 +78,83 @@ static struct ctl_table *find_entry(struct ctl_table_header **phead, { struct ctl_table_header *head; struct ctl_table *entry; + struct rb_node *node = dir->root.rb_node; - list_for_each_entry(head, &dir->list, ctl_entry) { - if (head->unregistering) - continue; - for (entry = head->ctl_table; entry->procname; entry++) { - const char *procname = entry->procname; - if (namecmp(procname, strlen(procname), name, namelen) == 0) { - *phead = head; - return entry; - } + while (node) + { + struct ctl_node *ctl_node; + const char *procname; + int cmp; + + ctl_node = rb_entry(node, struct ctl_node, node); + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + procname = entry->procname; + + cmp = namecmp(name, namelen, procname, strlen(procname)); + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else { + *phead = head; + return entry; } } return NULL; } +static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + struct rb_node **p = &head->parent->root.rb_node; + struct rb_node *parent = NULL; + const char *name = entry->procname; + int namelen = strlen(name); + + while (*p) { + struct ctl_table_header *parent_head; + struct ctl_table *parent_entry; + struct ctl_node *parent_node; + const char *parent_name; + int cmp; + + parent = *p; + parent_node = rb_entry(parent, struct ctl_node, node); + parent_head = parent_node->header; + parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; + parent_name = parent_entry->procname; + + cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + printk(KERN_ERR "sysctl duplicate entry: "); + sysctl_print_dir(head->parent); + printk(KERN_CONT "/%s\n", entry->procname); + return -EEXIST; + } + } + + rb_link_node(node, parent, p); + return 0; +} + +static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + + rb_erase(node, &head->parent->root); +} + static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, - struct ctl_table *table) + struct ctl_node *node, struct ctl_table *table) { head->ctl_table = table; head->ctl_table_arg = table; - INIT_LIST_HEAD(&head->ctl_entry); head->used = 0; head->count = 1; head->nreg = 1; @@ -110,28 +162,42 @@ static void init_header(struct ctl_table_header *head, head->root = root; head->set = set; head->parent = NULL; + head->node = node; + if (node) { + struct ctl_table *entry; + for (entry = table; entry->procname; entry++, node++) { + rb_init_node(&node->node); + node->header = head; + } + } } static void erase_header(struct ctl_table_header *head) { - list_del_init(&head->ctl_entry); + struct ctl_table *entry; + for (entry = head->ctl_table; entry->procname; entry++) + erase_entry(head, entry); } static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { + struct ctl_table *entry; int err; - err = sysctl_check_dups(dir, header->ctl_table); - if (err) - return err; - dir->header.nreg++; header->parent = dir; err = insert_links(header); if (err) goto fail_links; - list_add_tail(&header->ctl_entry, &header->parent->list); + for (entry = header->ctl_table; entry->procname; entry++) { + err = insert_entry(header, entry); + if (err) + goto fail; + } return 0; +fail: + erase_header(header); + put_links(header); fail_links: header->parent = NULL; drop_sysctl_table(&dir->header); @@ -241,19 +307,14 @@ static struct ctl_table *lookup_entry(struct ctl_table_header **phead, return entry; } -static struct ctl_table_header *next_usable_entry(struct ctl_dir *dir, - struct list_head *tmp) +static struct ctl_node *first_usable_entry(struct rb_node *node) { - struct ctl_table_header *head; - - for (tmp = tmp->next; tmp != &dir->list; tmp = tmp->next) { - head = list_entry(tmp, struct ctl_table_header, ctl_entry); + struct ctl_node *ctl_node; - if (!head->ctl_table->procname || - !use_table(head)) - continue; - - return head; + for (;node; node = rb_next(node)) { + ctl_node = rb_entry(node, struct ctl_node, node); + if (use_table(ctl_node->header)) + return ctl_node; } return NULL; } @@ -261,14 +322,17 @@ static struct ctl_table_header *next_usable_entry(struct ctl_dir *dir, static void first_entry(struct ctl_dir *dir, struct ctl_table_header **phead, struct ctl_table **pentry) { - struct ctl_table_header *head; + struct ctl_table_header *head = NULL; struct ctl_table *entry = NULL; + struct ctl_node *ctl_node; spin_lock(&sysctl_lock); - head = next_usable_entry(dir, &dir->list); + ctl_node = first_usable_entry(rb_first(&dir->root)); spin_unlock(&sysctl_lock); - if (head) - entry = head->ctl_table; + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } *phead = head; *pentry = entry; } @@ -277,15 +341,17 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr { struct ctl_table_header *head = *phead; struct ctl_table *entry = *pentry; + struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; - entry++; - if (!entry->procname) { - spin_lock(&sysctl_lock); - unuse_table(head); - head = next_usable_entry(head->parent, &head->ctl_entry); - spin_unlock(&sysctl_lock); - if (head) - entry = head->ctl_table; + spin_lock(&sysctl_lock); + unuse_table(head); + + ctl_node = first_usable_entry(rb_next(&ctl_node->node)); + spin_unlock(&sysctl_lock); + head = NULL; + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; @@ -777,21 +843,23 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, { struct ctl_table *table; struct ctl_dir *new; + struct ctl_node *node; char *new_name; - new = kzalloc(sizeof(*new) + sizeof(struct ctl_table)*2 + - namelen + 1, GFP_KERNEL); + new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + + sizeof(struct ctl_table)*2 + namelen + 1, + GFP_KERNEL); if (!new) return NULL; - table = (struct ctl_table *)(new + 1); + node = (struct ctl_node *)(new + 1); + table = (struct ctl_table *)(node + 1); new_name = (char *)(table + 2); memcpy(new_name, name, namelen); new_name[namelen] = '\0'; - INIT_LIST_HEAD(&new->list); table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; - init_header(&new->header, set->dir.header.root, set, table); + init_header(&new->header, set->dir.header.root, set, node, table); return new; } @@ -892,40 +960,6 @@ static int sysctl_follow_link(struct ctl_table_header **phead, return ret; } -static int sysctl_check_table_dups(struct ctl_dir *dir, struct ctl_table *old, - struct ctl_table *table) -{ - struct ctl_table *entry, *test; - int error = 0; - - for (entry = old; entry->procname; entry++) { - for (test = table; test->procname; test++) { - if (strcmp(entry->procname, test->procname) == 0) { - printk(KERN_ERR "sysctl duplicate entry: "); - sysctl_print_dir(dir); - printk(KERN_CONT "/%s\n", test->procname); - error = -EEXIST; - } - } - } - return error; -} - -static int sysctl_check_dups(struct ctl_dir *dir, struct ctl_table *table) -{ - struct ctl_table_header *head; - int error = 0; - - list_for_each_entry(head, &dir->list, ctl_entry) { - if (head->unregistering) - continue; - if (head->parent != dir) - continue; - error = sysctl_check_table_dups(dir, head->ctl_table, table); - } - return error; -} - static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) { struct va_format vaf; @@ -977,6 +1011,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table { struct ctl_table *link_table, *entry, *link; struct ctl_table_header *links; + struct ctl_node *node; char *link_name; int nr_entries, name_bytes; @@ -988,6 +1023,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table } links = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries + sizeof(struct ctl_table)*(nr_entries + 1) + name_bytes, GFP_KERNEL); @@ -995,7 +1031,8 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table if (!links) return NULL; - link_table = (struct ctl_table *)(links + 1); + node = (struct ctl_node *)(links + 1); + link_table = (struct ctl_table *)(node + nr_entries); link_name = (char *)&link_table[nr_entries + 1]; for (link = link_table, entry = table; entry->procname; link++, entry++) { @@ -1006,7 +1043,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table link->data = link_root; link_name += len; } - init_header(links, dir->header.root, dir->header.set, link_table); + init_header(links, dir->header.root, dir->header.set, node, link_table); links->nreg = nr_entries; return links; @@ -1132,12 +1169,20 @@ struct ctl_table_header *__register_sysctl_table( struct ctl_table_header *header; const char *name, *nextname; struct ctl_dir *dir; + struct ctl_table *entry; + struct ctl_node *node; + int nr_entries = 0; + + for (entry = table; entry->procname; entry++) + nr_entries++; - header = kzalloc(sizeof(struct ctl_table_header), GFP_KERNEL); + header = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); if (!header) return NULL; - init_header(header, root, set, table); + node = (struct ctl_node *)(header + 1); + init_header(header, root, set, node, table); if (sysctl_check_table(path, table)) goto fail; @@ -1489,13 +1534,12 @@ void setup_sysctl_set(struct ctl_table_set *set, { memset(set, sizeof(*set), 0); set->is_seen = is_seen; - INIT_LIST_HEAD(&set->dir.list); - init_header(&set->dir.header, root, set, root_table); + init_header(&set->dir.header, root, set, NULL, root_table); } void retire_sysctl_set(struct ctl_table_set *set) { - WARN_ON(!list_empty(&set->dir.list)); + WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); } int __init proc_sys_init(void) -- cgit v1.2.3 From fea478d4101a4285aa25c5bafaaf4cec35026fe0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jan 2012 21:47:03 -0800 Subject: sysctl: Add register_sysctl for normal sysctl users The plan is to convert all callers of register_sysctl_table and register_sysctl_paths to register_sysctl. The interface to register_sysctl is enough nicer this should make the callers a bit more readable. Additionally after the conversion the 230 lines of backwards compatibility can be removed. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 05c393a5c53..8dc7f0e46e7 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1228,6 +1228,23 @@ fail: return NULL; } +/** + * register_sysctl - register a sysctl table + * @path: The path to the directory the sysctl table is in. + * @table: the table structure + * + * Register a sysctl table. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +{ + return __register_sysctl_table(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl); + static char *append_path(const char *path, char *pos, const char *name) { int namelen; -- cgit v1.2.3 From 47981787092aecb87dc3cb2d478455dcfb77516a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 30 Jan 2012 16:39:59 +0300 Subject: sysctl: remove an unused variable "links" is never used, so we can remove it. Signed-off-by: Dan Carpenter Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 8dc7f0e46e7..1b1f5b8f4e0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1165,7 +1165,6 @@ struct ctl_table_header *__register_sysctl_table( const char *path, struct ctl_table *table) { struct ctl_table_root *root = set->dir.header.root; - struct ctl_table_header *links = NULL; struct ctl_table_header *header; const char *name, *nextname; struct ctl_dir *dir; @@ -1222,7 +1221,6 @@ fail_put_dir_locked: drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); fail: - kfree(links); kfree(header); dump_stack(); return NULL; -- cgit v1.2.3 From 1347440db6f76ec5ae0af8d8558387f571a5e1dd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 30 Jan 2012 16:40:29 +0300 Subject: sysctl: fix memset parameters in setup_sysctl_set() The current code is a nop. Signed-off-by: Dan Carpenter Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1b1f5b8f4e0..27e265ba1af 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1547,7 +1547,7 @@ void setup_sysctl_set(struct ctl_table_set *set, struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)) { - memset(set, sizeof(*set), 0); + memset(set, 0, sizeof(*set)); set->is_seen = is_seen; init_header(&set->dir.header, root, set, NULL, root_table); } -- cgit v1.2.3 From 71879d3cb3dd8f2dfdefb252775c1b3ea04a3dd4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Jan 2012 17:14:38 +0100 Subject: proc: mem_release() should check mm != NULL mem_release() can hit mm == NULL, add the necessary check. Cc: stable@kernel.org Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 9cde9edf9c4..c3617ea7830 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -822,8 +822,8 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) static int mem_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; - - mmput(mm); + if (mm) + mmput(mm); return 0; } -- cgit v1.2.3 From 572d34b946bae070debd42db1143034d9687e13f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Jan 2012 17:14:54 +0100 Subject: proc: unify mem_read() and mem_write() No functional changes, cleanup and preparation. mem_read() and mem_write() are very similar. Move this code into the new common helper, mem_rw(), which takes the additional "int write" argument. Cc: stable@kernel.org Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 90 +++++++++++++++++++++------------------------------------- 1 file changed, 32 insertions(+), 58 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index c3617ea7830..be190904168 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -718,57 +718,13 @@ static int mem_open(struct inode* inode, struct file* file) return 0; } -static ssize_t mem_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) +static ssize_t mem_rw(struct file *file, char __user *buf, + size_t count, loff_t *ppos, int write) { - int ret; - char *page; - unsigned long src = *ppos; struct mm_struct *mm = file->private_data; - - if (!mm) - return 0; - - page = (char *)__get_free_page(GFP_TEMPORARY); - if (!page) - return -ENOMEM; - - ret = 0; - - while (count > 0) { - int this_len, retval; - - this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - retval = access_remote_vm(mm, src, page, this_len, 0); - if (!retval) { - if (!ret) - ret = -EIO; - break; - } - - if (copy_to_user(buf, page, retval)) { - ret = -EFAULT; - break; - } - - ret += retval; - src += retval; - buf += retval; - count -= retval; - } - *ppos = src; - - free_page((unsigned long) page); - return ret; -} - -static ssize_t mem_write(struct file * file, const char __user *buf, - size_t count, loff_t *ppos) -{ - int copied; + unsigned long addr = *ppos; + ssize_t copied; char *page; - unsigned long dst = *ppos; - struct mm_struct *mm = file->private_data; if (!mm) return 0; @@ -779,30 +735,48 @@ static ssize_t mem_write(struct file * file, const char __user *buf, copied = 0; while (count > 0) { - int this_len, retval; + int this_len = min_t(int, count, PAGE_SIZE); - this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - if (copy_from_user(page, buf, this_len)) { + if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; break; } - retval = access_remote_vm(mm, dst, page, this_len, 1); - if (!retval) { + + this_len = access_remote_vm(mm, addr, page, this_len, write); + if (!this_len) { if (!copied) copied = -EIO; break; } - copied += retval; - buf += retval; - dst += retval; - count -= retval; + + if (!write && copy_to_user(buf, page, this_len)) { + copied = -EFAULT; + break; + } + + buf += this_len; + addr += this_len; + copied += this_len; + count -= this_len; } - *ppos = dst; + *ppos = addr; free_page((unsigned long) page); return copied; } +static ssize_t mem_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, buf, count, ppos, 0); +} + +static ssize_t mem_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, (char __user*)buf, count, ppos, 1); +} + loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { -- cgit v1.2.3 From 6d08f2c7139790c268820a2e590795cb8333181a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Jan 2012 17:15:11 +0100 Subject: proc: make sure mem_open() doesn't pin the target's memory Once /proc/pid/mem is opened, the memory can't be released until mem_release() even if its owner exits. Change mem_open() to do atomic_inc(mm_count) + mmput(), this only pins mm_struct. Change mem_rw() to do atomic_inc_not_zero(mm_count) before access_remote_vm(), this verifies that this mm is still alive. I am not sure what should mem_rw() return if atomic_inc_not_zero() fails. With this patch it returns zero to match the "mm == NULL" case, may be it should return -EINVAL like it did before e268337d. Perhaps it makes sense to add the additional fatal_signal_pending() check into the main loop, to ensure we do not hold this memory if the target task was oom-killed. Cc: stable@kernel.org Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index be190904168..d9512bd03e6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -711,6 +711,13 @@ static int mem_open(struct inode* inode, struct file* file) if (IS_ERR(mm)) return PTR_ERR(mm); + if (mm) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); + } + /* OK to pass negative loff_t, we can catch out-of-range */ file->f_mode |= FMODE_UNSIGNED_OFFSET; file->private_data = mm; @@ -734,6 +741,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf, return -ENOMEM; copied = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; + while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); @@ -761,6 +771,8 @@ static ssize_t mem_rw(struct file *file, char __user *buf, } *ppos = addr; + mmput(mm); +free: free_page((unsigned long) page); return copied; } @@ -797,7 +809,7 @@ static int mem_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) - mmput(mm); + mmdrop(mm); return 0; } -- cgit v1.2.3 From 51f72f4a0f92e4abde33a8bca0fac9667575d035 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 30 Jan 2012 20:09:33 -0800 Subject: sysctl: An easier to read version of find_subdir Suggested-by: Lucian Adrian Grijincu Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 27e265ba1af..ebe8b3076db 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -833,9 +833,9 @@ static struct ctl_dir *find_subdir(struct ctl_dir *dir, entry = find_entry(&head, dir, name, namelen); if (!entry) return ERR_PTR(-ENOENT); - if (S_ISDIR(entry->mode)) - return container_of(head, struct ctl_dir, header); - return ERR_PTR(-ENOTDIR); + if (!S_ISDIR(entry->mode)) + return ERR_PTR(-ENOTDIR); + return container_of(head, struct ctl_dir, header); } static struct ctl_dir *new_dir(struct ctl_table_set *set, -- cgit v1.2.3 From 0eb97f38d2bfaea289b44c5140a7b04e7b369bad Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 30 Jan 2012 20:37:51 -0800 Subject: sysctl: Correct error return from get_subdir When insert_header fails ensure we return the proper error value from get_subdir. In practice nothing cares, but there is no need to be sloppy. Reported-by: Lucian Adrian Grijincu Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ebe8b3076db..722ec116208 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -869,6 +869,7 @@ static struct ctl_dir *get_subdir(struct ctl_dir *dir, { struct ctl_table_set *set = dir->header.set; struct ctl_dir *subdir, *new = NULL; + int err; spin_lock(&sysctl_lock); subdir = find_subdir(dir, name, namelen); @@ -890,7 +891,9 @@ static struct ctl_dir *get_subdir(struct ctl_dir *dir, if (PTR_ERR(subdir) != -ENOENT) goto failed; - if (insert_header(dir, &new->header)) + err = insert_header(dir, &new->header); + subdir = ERR_PTR(err); + if (err) goto failed; subdir = new; found: -- cgit v1.2.3 From 60f126d93b210ae708e2a5bb4a3be2121831f2a0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 30 Jan 2012 21:23:52 -0800 Subject: sysctl: Comments to make the code clearer. Document get_subdir and that find_subdir alwasy takes a reference. Suggested-by: Lucian Adrian Grijincu Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 722ec116208..e5601dc2408 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -73,6 +73,7 @@ static int namecmp(const char *name1, int len1, const char *name2, int len2) return cmp; } +/* Called under sysctl_lock */ static struct ctl_table *find_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { @@ -864,6 +865,18 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, return new; } +/** + * get_subdir - find or create a subdir with the specified name. + * @dir: Directory to create the subdirectory in + * @name: The name of the subdirectory to find or create + * @namelen: The length of name + * + * Takes a directory with an elevated reference count so we know that + * if we drop the lock the directory will not go away. Upon success + * the reference is moved from @dir to the returned subdirectory. + * Upon error an error code is returned and the reference on @dir is + * simply dropped. + */ static struct ctl_dir *get_subdir(struct ctl_dir *dir, const char *name, int namelen) { @@ -885,12 +898,14 @@ static struct ctl_dir *get_subdir(struct ctl_dir *dir, if (!new) goto failed; + /* Was the subdir added while we dropped the lock? */ subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; + /* Nope. Use the our freshly made directory entry. */ err = insert_header(dir, &new->header); subdir = ERR_PTR(err); if (err) @@ -1190,6 +1205,7 @@ struct ctl_table_header *__register_sysctl_table( spin_lock(&sysctl_lock); dir = &set->dir; + /* Reference moved down the diretory tree get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); -- cgit v1.2.3 From 4e75732035d7e97e001bdf6e3149d3967c0221de Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 30 Jan 2012 21:24:59 -0800 Subject: sysctl: Don't call sysctl_follow_link unless we are a link. There are no functional changes. Just code motion to make it clear that we don't follow a link between sysctl roots unless the directory entry actually is a link. Suggested-by: Lucian Adrian Grijincu Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index e5601dc2408..a7708b7c957 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -451,10 +451,12 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, if (!p) goto out; - ret = sysctl_follow_link(&h, &p, current->nsproxy); - err = ERR_PTR(ret); - if (ret) - goto out; + if (S_ISLNK(p->mode)) { + ret = sysctl_follow_link(&h, &p, current->nsproxy); + err = ERR_PTR(ret); + if (ret) + goto out; + } err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); @@ -601,10 +603,12 @@ static int proc_sys_link_fill_cache(struct file *filp, void *dirent, int err, ret = 0; head = sysctl_head_grab(head); - /* It is not an error if we can not follow the link ignore it */ - err = sysctl_follow_link(&head, &table, current->nsproxy); - if (err) - goto out; + if (S_ISLNK(table->mode)) { + /* It is not an error if we can not follow the link ignore it */ + err = sysctl_follow_link(&head, &table, current->nsproxy); + if (err) + goto out; + } ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); out: @@ -950,10 +954,6 @@ static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_dir *dir; int ret; - /* Get out quickly if not a link */ - if (!S_ISLNK((*pentry)->mode)) - return 0; - ret = 0; spin_lock(&sysctl_lock); root = (*pentry)->data; -- cgit v1.2.3 From 8cdb878dcb359fd1137e9abdee9322f5e9bcfdf8 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Thu, 2 Feb 2012 11:34:09 +1030 Subject: Fix race in process_vm_rw_core This fixes the race in process_vm_core found by Oleg (see http://article.gmane.org/gmane.linux.kernel/1235667/ for details). This has been updated since I last sent it as the creation of the new mm_access() function did almost exactly the same thing as parts of the previous version of this patch did. In order to use mm_access() even when /proc isn't enabled, we move it to kernel/fork.c where other related process mm access functions already are. Signed-off-by: Chris Yeoh Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index d9512bd03e6..d4548dd49b0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -198,26 +198,6 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) -{ - struct mm_struct *mm; - int err; - - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { - mmput(mm); - mm = ERR_PTR(-EACCES); - } - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; -} - struct mm_struct *mm_for_maps(struct task_struct *task) { return mm_access(task, PTRACE_MODE_READ); -- cgit v1.2.3 From 4040153087478993cbf0809f444400a3c808074c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 13 Feb 2012 03:58:52 +0000 Subject: security: trim security.h Trim security.h Signed-off-by: Al Viro Signed-off-by: James Morris --- fs/proc/proc_sysctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a6b62173d4c..67bbf6e4e19 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -6,7 +6,9 @@ #include #include #include +#include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; -- cgit v1.2.3 From 1dce27c5aa6770e9d195f2bb7db1db3d4dde5591 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Feb 2012 17:49:42 +0000 Subject: Wrap accesses to the fd_sets in struct fdtable Wrap accesses to the fd_sets in struct fdtable (for recording open files and close-on-exec flags) so that we can move away from using fd_sets since we abuse the fd_set structs by not allocating the full-sized structure under normal circumstances and by non-core code looking at the internals of the fd_sets. The first abuse means that use of FD_ZERO() on these fd_sets is not permitted, since that cannot be told about their abnormal lengths. This introduces six wrapper functions for setting, clearing and testing close-on-exec flags and fd-is-open flags: void __set_close_on_exec(int fd, struct fdtable *fdt); void __clear_close_on_exec(int fd, struct fdtable *fdt); bool close_on_exec(int fd, const struct fdtable *fdt); void __set_open_fd(int fd, struct fdtable *fdt); void __clear_open_fd(int fd, struct fdtable *fdt); bool fd_is_open(int fd, const struct fdtable *fdt); Note that I've prepended '__' to the names of the set/clear functions because they require the caller to hold a lock to use them. Note also that I haven't added wrappers for looking behind the scenes at the the array. Possibly that should exist too. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120216174942.23314.1364.stgit@warthog.procyon.org.uk Signed-off-by: H. Peter Anvin Cc: Al Viro --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index d4548dd49b0..db6ab4b36a0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1754,7 +1754,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) fdt = files_fdtable(files); f_flags = file->f_flags & ~O_CLOEXEC; - if (FD_ISSET(fd, fdt->close_on_exec)) + if (close_on_exec(fd, fdt)) f_flags |= O_CLOEXEC; if (path) { -- cgit v1.2.3 From 162573937679ff36c9acd54268c047199dab564e Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Thu, 16 Feb 2012 01:15:00 +0000 Subject: fadump: Introduce cleanup routine to invalidate /proc/vmcore. With the firmware-assisted dump support we don't require a reboot when we are in second kernel after crash. The second kernel after crash is a normal kernel boot and has knowledge about entire system RAM with the page tables initialized for entire system RAM. Hence once the dump is saved to disk, we can just release the reserved memory area for general use and continue with second kernel as production kernel. Hence when we release the reserved memory that contains dump data, the '/proc/vmcore' will not be valid anymore. Hence this patch introduces a cleanup routine that invalidates and removes the /proc/vmcore file. This routine will be invoked before we release the reserved dump memory area. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Benjamin Herrenschmidt --- fs/proc/vmcore.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index b0f450a2bb7..0d5071d2998 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -700,3 +700,26 @@ static int __init vmcore_init(void) return 0; } module_init(vmcore_init) + +/* Cleanup function for vmcore module. */ +void vmcore_cleanup(void) +{ + struct list_head *pos, *next; + + if (proc_vmcore) { + remove_proc_entry(proc_vmcore->name, proc_vmcore->parent); + proc_vmcore = NULL; + } + + /* clear the vmcore list. */ + list_for_each_safe(pos, next, &vmcore_list) { + struct vmcore *m; + + m = list_entry(pos, struct vmcore, list); + list_del(&m->list); + kfree(m); + } + kfree(elfcorebuf); + elfcorebuf = NULL; +} +EXPORT_SYMBOL_GPL(vmcore_cleanup); -- cgit v1.2.3 From 2e5b5b3a1b7768c89fbfeca18e75f8ee377e924c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 23 Feb 2012 17:41:27 +0900 Subject: sched: Clean up parameter passing of proc_sched_autogroup_set_nice() Pass nice as a value to proc_sched_autogroup_set_nice(). No side effect is expected, and the variable err will be overwritten with the return value. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4F45FBB7.5090607@ct.jp.nec.com Signed-off-by: Ingo Molnar --- fs/proc/base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index d4548dd49b0..965d4bde3a3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1310,8 +1310,7 @@ sched_autogroup_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - err = nice; - err = proc_sched_autogroup_set_nice(p, &err); + err = proc_sched_autogroup_set_nice(p, nice); if (err) count = err; -- cgit v1.2.3 From f1f996b66cc3908a8f5ffccc2ff41840e92f3b10 Mon Sep 17 00:00:00 2001 From: Laura Vasilescu Date: Mon, 19 Mar 2012 15:41:15 +0200 Subject: kcore: fix spelling in read_kcore() comment Signed-off-by: Laura Vasilescu Signed-off-by: Jiri Kosina --- fs/proc/kcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d245cb23dd7..e5e69aff6c6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -513,7 +513,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) n = copy_to_user(buffer, (char *)start, tsz); /* - * We cannot distingush between fault on source + * We cannot distinguish between fault on source * and fault on destination. When this happens * we clear too and hope it will trigger the * EFAULT again. -- cgit v1.2.3 From 6b4231e2f92adbcf96fb2a3fa751d7ca0a61b21f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Feb 2012 21:56:08 -0500 Subject: procfs: clean proc_fill_super() up First of all, there's no need to zero ->i_uid/->i_gid on root inode - both had been set to zero already. Moreover, let's take the iput() on failure to the failure exit it belongs to... Signed-off-by: Al Viro --- fs/proc/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 84fd3235a59..a70af3a44f4 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -499,16 +499,15 @@ int proc_fill_super(struct super_block *s) root_inode = proc_get_inode(s, &proc_root); if (!root_inode) goto out_no_root; - root_inode->i_uid = 0; - root_inode->i_gid = 0; s->s_root = d_alloc_root(root_inode); - if (!s->s_root) + if (!s->s_root) { + iput(root_inode); goto out_no_root; + } return 0; out_no_root: printk("proc_read_super: get root inode failed\n"); - iput(root_inode); pde_put(&proc_root); return -ENOMEM; } -- cgit v1.2.3 From 48fde701aff662559b38d9a609574068f22d00fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 22:15:13 -0500 Subject: switch open-coded instances of d_make_root() to new helper Signed-off-by: Al Viro --- fs/proc/inode.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a70af3a44f4..8461a7b82fd 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -486,8 +486,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) int proc_fill_super(struct super_block *s) { - struct inode * root_inode; - s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; @@ -496,17 +494,10 @@ int proc_fill_super(struct super_block *s) s->s_time_gran = 1; pde_get(&proc_root); - root_inode = proc_get_inode(s, &proc_root); - if (!root_inode) - goto out_no_root; - s->s_root = d_alloc_root(root_inode); - if (!s->s_root) { - iput(root_inode); - goto out_no_root; - } - return 0; + s->s_root = d_make_root(proc_get_inode(s, &proc_root)); + if (s->s_root) + return 0; -out_no_root: printk("proc_read_super: get root inode failed\n"); pde_put(&proc_root); return -ENOMEM; -- cgit v1.2.3 From 1a5a9906d4e8d1976b701f889d8f35d54b928f25 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 21 Mar 2012 16:33:42 -0700 Subject: mm: thp: fix pmd_bad() triggering in code paths holding mmap_sem read mode In some cases it may happen that pmd_none_or_clear_bad() is called with the mmap_sem hold in read mode. In those cases the huge page faults can allocate hugepmds under pmd_none_or_clear_bad() and that can trigger a false positive from pmd_bad() that will not like to see a pmd materializing as trans huge. It's not khugepaged causing the problem, khugepaged holds the mmap_sem in write mode (and all those sites must hold the mmap_sem in read mode to prevent pagetables to go away from under them, during code review it seems vm86 mode on 32bit kernels requires that too unless it's restricted to 1 thread per process or UP builds). The race is only with the huge pagefaults that can convert a pmd_none() into a pmd_trans_huge(). Effectively all these pmd_none_or_clear_bad() sites running with mmap_sem in read mode are somewhat speculative with the page faults, and the result is always undefined when they run simultaneously. This is probably why it wasn't common to run into this. For example if the madvise(MADV_DONTNEED) runs zap_page_range() shortly before the page fault, the hugepage will not be zapped, if the page fault runs first it will be zapped. Altering pmd_bad() not to error out if it finds hugepmds won't be enough to fix this, because zap_pmd_range would then proceed to call zap_pte_range (which would be incorrect if the pmd become a pmd_trans_huge()). The simplest way to fix this is to read the pmd in the local stack (regardless of what we read, no need of actual CPU barriers, only compiler barrier needed), and be sure it is not changing under the code that computes its value. Even if the real pmd is changing under the value we hold on the stack, we don't care. If we actually end up in zap_pte_range it means the pmd was not none already and it was not huge, and it can't become huge from under us (khugepaged locking explained above). All we need is to enforce that there is no way anymore that in a code path like below, pmd_trans_huge can be false, but pmd_none_or_clear_bad can run into a hugepmd. The overhead of a barrier() is just a compiler tweak and should not be measurable (I only added it for THP builds). I don't exclude different compiler versions may have prevented the race too by caching the value of *pmd on the stack (that hasn't been verified, but it wouldn't be impossible considering pmd_none_or_clear_bad, pmd_bad, pmd_trans_huge, pmd_none are all inlines and there's no external function called in between pmd_trans_huge and pmd_none_or_clear_bad). if (pmd_trans_huge(*pmd)) { if (next-addr != HPAGE_PMD_SIZE) { VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) continue; /* fall through */ } if (pmd_none_or_clear_bad(pmd)) Because this race condition could be exercised without special privileges this was reported in CVE-2012-1179. The race was identified and fully explained by Ulrich who debugged it. I'm quoting his accurate explanation below, for reference. ====== start quote ======= mapcount 0 page_mapcount 1 kernel BUG at mm/huge_memory.c:1384! At some point prior to the panic, a "bad pmd ..." message similar to the following is logged on the console: mm/memory.c:145: bad pmd ffff8800376e1f98(80000000314000e7). The "bad pmd ..." message is logged by pmd_clear_bad() before it clears the page's PMD table entry. 143 void pmd_clear_bad(pmd_t *pmd) 144 { -> 145 pmd_ERROR(*pmd); 146 pmd_clear(pmd); 147 } After the PMD table entry has been cleared, there is an inconsistency between the actual number of PMD table entries that are mapping the page and the page's map count (_mapcount field in struct page). When the page is subsequently reclaimed, __split_huge_page() detects this inconsistency. 1381 if (mapcount != page_mapcount(page)) 1382 printk(KERN_ERR "mapcount %d page_mapcount %d\n", 1383 mapcount, page_mapcount(page)); -> 1384 BUG_ON(mapcount != page_mapcount(page)); The root cause of the problem is a race of two threads in a multithreaded process. Thread B incurs a page fault on a virtual address that has never been accessed (PMD entry is zero) while Thread A is executing an madvise() system call on a virtual address within the same 2 MB (huge page) range. virtual address space .---------------------. | | | | .-|---------------------| | | | | | |<-- B(fault) | | | 2 MB | |/////////////////////|-. huge < |/////////////////////| > A(range) page | |/////////////////////|-' | | | | | | '-|---------------------| | | | | '---------------------' - Thread A is executing an madvise(..., MADV_DONTNEED) system call on the virtual address range "A(range)" shown in the picture. sys_madvise // Acquire the semaphore in shared mode. down_read(¤t->mm->mmap_sem) ... madvise_vma switch (behavior) case MADV_DONTNEED: madvise_dontneed zap_page_range unmap_vmas unmap_page_range zap_pud_range zap_pmd_range // // Assume that this huge page has never been accessed. // I.e. content of the PMD entry is zero (not mapped). // if (pmd_trans_huge(*pmd)) { // We don't get here due to the above assumption. } // // Assume that Thread B incurred a page fault and .---------> // sneaks in here as shown below. | // | if (pmd_none_or_clear_bad(pmd)) | { | if (unlikely(pmd_bad(*pmd))) | pmd_clear_bad | { | pmd_ERROR | // Log "bad pmd ..." message here. | pmd_clear | // Clear the page's PMD entry. | // Thread B incremented the map count | // in page_add_new_anon_rmap(), but | // now the page is no longer mapped | // by a PMD entry (-> inconsistency). | } | } | v - Thread B is handling a page fault on virtual address "B(fault)" shown in the picture. ... do_page_fault __do_page_fault // Acquire the semaphore in shared mode. down_read_trylock(&mm->mmap_sem) ... handle_mm_fault if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) // We get here due to the above assumption (PMD entry is zero). do_huge_pmd_anonymous_page alloc_hugepage_vma // Allocate a new transparent huge page here. ... __do_huge_pmd_anonymous_page ... spin_lock(&mm->page_table_lock) ... page_add_new_anon_rmap // Here we increment the page's map count (starts at -1). atomic_set(&page->_mapcount, 0) set_pmd_at // Here we set the page's PMD entry which will be cleared // when Thread A calls pmd_clear_bad(). ... spin_unlock(&mm->page_table_lock) The mmap_sem does not prevent the race because both threads are acquiring it in shared mode (down_read). Thread B holds the page_table_lock while the page's map count and PMD table entry are updated. However, Thread A does not synchronize on that lock. ====== end quote ======= [akpm@linux-foundation.org: checkpatch fixes] Reported-by: Ulrich Obergfell Signed-off-by: Andrea Arcangeli Acked-by: Johannes Weiner Cc: Mel Gorman Cc: Hugh Dickins Cc: Dave Jones Acked-by: Larry Woodman Acked-by: Rik van Riel Cc: [2.6.38+] Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7dcd2a25049..3efa7253523 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -409,6 +409,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } else { spin_unlock(&walk->mm->page_table_lock); } + + if (pmd_trans_unstable(pmd)) + return 0; /* * The mmap_sem held all the way back in m_start() is what * keeps khugepaged out of here and from collapsing things @@ -507,6 +510,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -670,6 +675,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); @@ -961,6 +968,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, spin_unlock(&walk->mm->page_table_lock); } + if (pmd_trans_unstable(pmd)) + return 0; orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, md->vma, addr); -- cgit v1.2.3 From 5aaabe831eb527e0d9284f0745d830a755f70393 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Mar 2012 16:33:57 -0700 Subject: pagemap: avoid splitting thp when reading /proc/pid/pagemap Thp split is not necessary if we explicitly check whether pmds are mapping thps or not. This patch introduces this check and adds code to generate pagemap entries for pmds mapping thps, which results in less performance impact of pagemap on thp. Signed-off-by: Naoya Horiguchi Reviewed-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Wu Fengguang Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3efa7253523..95264c0ef30 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -608,6 +608,9 @@ struct pagemapread { u64 *buffer; }; +#define PAGEMAP_WALK_SIZE (PMD_SIZE) +#define PAGEMAP_WALK_MASK (PMD_MASK) + #define PM_ENTRY_BYTES sizeof(u64) #define PM_STATUS_BITS 3 #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) @@ -666,6 +669,27 @@ static u64 pte_to_pagemap_entry(pte_t pte) return pme; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static u64 thp_pmd_to_pagemap_entry(pmd_t pmd, int offset) +{ + u64 pme = 0; + /* + * Currently pmd for thp is always present because thp can not be + * swapped-out, migrated, or HWPOISONed (split in such cases instead.) + * This if-check is just to prepare for future implementation. + */ + if (pmd_present(pmd)) + pme = PM_PFRAME(pmd_pfn(pmd) + offset) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; + return pme; +} +#else +static inline u64 thp_pmd_to_pagemap_entry(pmd_t pmd, int offset) +{ + return 0; +} +#endif + static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -673,15 +697,37 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; + u64 pfn = PM_NOT_PRESENT; - split_huge_page_pmd(walk->mm, pmd); if (pmd_trans_unstable(pmd)) return 0; /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); + spin_lock(&walk->mm->page_table_lock); + if (pmd_trans_huge(*pmd)) { + if (pmd_trans_splitting(*pmd)) { + spin_unlock(&walk->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + for (; addr != end; addr += PAGE_SIZE) { + unsigned long offset; + + offset = (addr & ~PAGEMAP_WALK_MASK) >> + PAGE_SHIFT; + pfn = thp_pmd_to_pagemap_entry(*pmd, offset); + err = add_to_pagemap(addr, pfn, pm); + if (err) + break; + } + spin_unlock(&walk->mm->page_table_lock); + return err; + } + } else { + spin_unlock(&walk->mm->page_table_lock); + } + for (; addr != end; addr += PAGE_SIZE) { - u64 pfn = PM_NOT_PRESENT; /* check to see if we've left 'vma' behind * and need a new, higher one */ @@ -764,8 +810,6 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ -#define PAGEMAP_WALK_SIZE (PMD_SIZE) -#define PAGEMAP_WALK_MASK (PMD_MASK) static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { -- cgit v1.2.3 From 025c5b2451e42c9e8dfdecd6dc84956ce8f321b5 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Mar 2012 16:33:57 -0700 Subject: thp: optimize away unnecessary page table locking Currently when we check if we can handle thp as it is or we need to split it into regular sized pages, we hold page table lock prior to check whether a given pmd is mapping thp or not. Because of this, when it's not "huge pmd" we suffer from unnecessary lock/unlock overhead. To remove it, this patch introduces a optimized check function and replace several similar logics with it. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Naoya Horiguchi Cc: David Rientjes Cc: Andi Kleen Cc: Wu Fengguang Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Cc: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 73 +++++++++++++++++++----------------------------------- 1 file changed, 25 insertions(+), 48 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 95264c0ef30..328843de6e9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -394,20 +394,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - spin_lock(&walk->mm->page_table_lock); - if (pmd_trans_huge(*pmd)) { - if (pmd_trans_splitting(*pmd)) { - spin_unlock(&walk->mm->page_table_lock); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - smaps_pte_entry(*(pte_t *)pmd, addr, - HPAGE_PMD_SIZE, walk); - spin_unlock(&walk->mm->page_table_lock); - mss->anonymous_thp += HPAGE_PMD_SIZE; - return 0; - } - } else { + if (pmd_trans_huge_lock(pmd, vma) == 1) { + smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); spin_unlock(&walk->mm->page_table_lock); + mss->anonymous_thp += HPAGE_PMD_SIZE; + return 0; } if (pmd_trans_unstable(pmd)) @@ -705,26 +696,19 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); spin_lock(&walk->mm->page_table_lock); - if (pmd_trans_huge(*pmd)) { - if (pmd_trans_splitting(*pmd)) { - spin_unlock(&walk->mm->page_table_lock); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - for (; addr != end; addr += PAGE_SIZE) { - unsigned long offset; - - offset = (addr & ~PAGEMAP_WALK_MASK) >> - PAGE_SHIFT; - pfn = thp_pmd_to_pagemap_entry(*pmd, offset); - err = add_to_pagemap(addr, pfn, pm); - if (err) - break; - } - spin_unlock(&walk->mm->page_table_lock); - return err; + if (pmd_trans_huge_lock(pmd, vma) == 1) { + for (; addr != end; addr += PAGE_SIZE) { + unsigned long offset; + + offset = (addr & ~PAGEMAP_WALK_MASK) >> + PAGE_SHIFT; + pfn = thp_pmd_to_pagemap_entry(*pmd, offset); + err = add_to_pagemap(addr, pfn, pm); + if (err) + break; } - } else { spin_unlock(&walk->mm->page_table_lock); + return err; } for (; addr != end; addr += PAGE_SIZE) { @@ -992,24 +976,17 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *pte; md = walk->private; - spin_lock(&walk->mm->page_table_lock); - if (pmd_trans_huge(*pmd)) { - if (pmd_trans_splitting(*pmd)) { - spin_unlock(&walk->mm->page_table_lock); - wait_split_huge_page(md->vma->anon_vma, pmd); - } else { - pte_t huge_pte = *(pte_t *)pmd; - struct page *page; - - page = can_gather_numa_stats(huge_pte, md->vma, addr); - if (page) - gather_stats(page, md, pte_dirty(huge_pte), - HPAGE_PMD_SIZE/PAGE_SIZE); - spin_unlock(&walk->mm->page_table_lock); - return 0; - } - } else { + + if (pmd_trans_huge_lock(pmd, md->vma) == 1) { + pte_t huge_pte = *(pte_t *)pmd; + struct page *page; + + page = can_gather_numa_stats(huge_pte, md->vma, addr); + if (page) + gather_stats(page, md, pte_dirty(huge_pte), + HPAGE_PMD_SIZE/PAGE_SIZE); spin_unlock(&walk->mm->page_table_lock); + return 0; } if (pmd_trans_unstable(pmd)) -- cgit v1.2.3 From e873c49fbfdd595481976b915850e682441bcbec Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Mar 2012 16:33:58 -0700 Subject: pagemap: export KPF_THP This flag shows that a given page is a subpage of a transparent hugepage. It helps us debug and test the kernel by showing physical address of thp. Signed-off-by: Naoya Horiguchi Reviewed-by: Wu Fengguang Reviewed-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Cc: David Rientjes Cc: Andi Kleen Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/page.c b/fs/proc/page.c index 6d8e6a9e93a..7fcd0d60a96 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,6 +115,8 @@ u64 stable_page_flags(struct page *page) u |= 1 << KPF_COMPOUND_TAIL; if (PageHuge(page)) u |= 1 << KPF_HUGE; + else if (PageTransCompound(page)) + u |= 1 << KPF_THP; /* * Caveats on high order pages: page->_count will only be set -- cgit v1.2.3 From 092b50bacd1cdbffef2643b7a46f2a215407919c Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Mar 2012 16:33:59 -0700 Subject: pagemap: introduce data structure for pagemap entry Currently a local variable of pagemap entry in pagemap_pte_range() is named pfn and typed with u64, but it's not correct (pfn should be unsigned long.) This patch introduces special type for pagemap entries and replaces code with it. Signed-off-by: Naoya Horiguchi Cc: David Rientjes Cc: Andi Kleen Cc: Wu Fengguang Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 69 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 31 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 328843de6e9..c7e3a163295 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -594,9 +594,13 @@ const struct file_operations proc_clear_refs_operations = { .llseek = noop_llseek, }; +typedef struct { + u64 pme; +} pagemap_entry_t; + struct pagemapread { int pos, len; - u64 *buffer; + pagemap_entry_t *buffer; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -619,10 +623,15 @@ struct pagemapread { #define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) #define PM_END_OF_BUFFER 1 -static int add_to_pagemap(unsigned long addr, u64 pfn, +static inline pagemap_entry_t make_pme(u64 val) +{ + return (pagemap_entry_t) { .pme = val }; +} + +static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, struct pagemapread *pm) { - pm->buffer[pm->pos++] = pfn; + pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) return PM_END_OF_BUFFER; return 0; @@ -634,8 +643,10 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + for (addr = start; addr < end; addr += PAGE_SIZE) { - err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); + err = add_to_pagemap(addr, &pme, pm); if (err) break; } @@ -648,36 +659,33 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte) return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); } -static u64 pte_to_pagemap_entry(pte_t pte) +static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte) { - u64 pme = 0; if (is_swap_pte(pte)) - pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; + *pme = make_pme(PM_PFRAME(swap_pte_to_pagemap_entry(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP); else if (pte_present(pte)) - pme = PM_PFRAME(pte_pfn(pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; - return pme; + *pme = make_pme(PM_PFRAME(pte_pfn(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static u64 thp_pmd_to_pagemap_entry(pmd_t pmd, int offset) +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, + pmd_t pmd, int offset) { - u64 pme = 0; /* * Currently pmd for thp is always present because thp can not be * swapped-out, migrated, or HWPOISONed (split in such cases instead.) * This if-check is just to prepare for future implementation. */ if (pmd_present(pmd)) - pme = PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; - return pme; + *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); } #else -static inline u64 thp_pmd_to_pagemap_entry(pmd_t pmd, int offset) +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, + pmd_t pmd, int offset) { - return 0; } #endif @@ -688,7 +696,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; - u64 pfn = PM_NOT_PRESENT; + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); if (pmd_trans_unstable(pmd)) return 0; @@ -702,8 +710,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - pfn = thp_pmd_to_pagemap_entry(*pmd, offset); - err = add_to_pagemap(addr, pfn, pm); + thp_pmd_to_pagemap_entry(&pme, *pmd, offset); + err = add_to_pagemap(addr, &pme, pm); if (err) break; } @@ -723,11 +731,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pfn = pte_to_pagemap_entry(*pte); + pte_to_pagemap_entry(&pme, *pte); /* unmap before userspace copy */ pte_unmap(pte); } - err = add_to_pagemap(addr, pfn, pm); + err = add_to_pagemap(addr, &pme, pm); if (err) return err; } @@ -738,13 +746,12 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, + pte_t pte, int offset) { - u64 pme = 0; if (pte_present(pte)) - pme = PM_PFRAME(pte_pfn(pte) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT; - return pme; + *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); } /* This function walks within one hugetlb entry in the single call */ @@ -754,12 +761,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, { struct pagemapread *pm = walk->private; int err = 0; - u64 pfn; + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - pfn = huge_pte_to_pagemap_entry(*pte, offset); - err = add_to_pagemap(addr, pfn, pm); + huge_pte_to_pagemap_entry(&pme, *pte, offset); + err = add_to_pagemap(addr, &pme, pm); if (err) return err; } -- cgit v1.2.3 From b76437579d1344b612cf1851ae610c636cec7db0 Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Wed, 21 Mar 2012 16:34:04 -0700 Subject: procfs: mark thread stack correctly in proc//maps Stack for a new thread is mapped by userspace code and passed via sys_clone. This memory is currently seen as anonymous in /proc//maps, which makes it difficult to ascertain which mappings are being used for thread stacks. This patch uses the individual task stack pointers to determine which vmas are actually thread stacks. For a multithreaded program like the following: #include void *thread_main(void *foo) { while(1); } int main() { pthread_t t; pthread_create(&t, NULL, thread_main, NULL); pthread_join(t, NULL); } proc/PID/maps looks like the following: 00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out 00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out 019ef000-01a10000 rw-p 00000000 00:00 0 [heap] 7f8a44491000-7f8a44492000 ---p 00000000 00:00 0 7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0 7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0 7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0 7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0 7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0 7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack] 7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso] ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall] Here, one could guess that 7f8a44492000-7f8a44c92000 is a stack since the earlier vma that has no permissions (7f8a44e3d000-7f8a4503d000) but that is not always a reliable way to find out which vma is a thread stack. Also, /proc/PID/maps and /proc/PID/task/TID/maps has the same content. With this patch in place, /proc/PID/task/TID/maps are treated as 'maps as the task would see it' and hence, only the vma that that task uses as stack is marked as [stack]. All other 'stack' vmas are marked as anonymous memory. /proc/PID/maps acts as a thread group level view, where all thread stack vmas are marked as [stack:TID] where TID is the process ID of the task that uses that vma as stack, while the process stack is marked as [stack]. So /proc/PID/maps will look like this: 00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out 00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out 019ef000-01a10000 rw-p 00000000 00:00 0 [heap] 7f8a44491000-7f8a44492000 ---p 00000000 00:00 0 7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442] 7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0 7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0 7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0 7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0 7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0 7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack] 7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso] ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall] Thus marking all vmas that are used as stacks by the threads in the thread group along with the process stack. The task level maps will however like this: 00400000-00401000 r-xp 00000000 fd:0a 3671804 /home/siddhesh/a.out 00600000-00601000 rw-p 00000000 fd:0a 3671804 /home/siddhesh/a.out 019ef000-01a10000 rw-p 00000000 00:00 0 [heap] 7f8a44491000-7f8a44492000 ---p 00000000 00:00 0 7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack] 7f8a44c92000-7f8a44e3d000 r-xp 00000000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a44e3d000-7f8a4503d000 ---p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a4503d000-7f8a45041000 r--p 001ab000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45041000-7f8a45043000 rw-p 001af000 fd:00 2097482 /lib64/libc-2.14.90.so 7f8a45043000-7f8a45048000 rw-p 00000000 00:00 0 7f8a45048000-7f8a4505f000 r-xp 00000000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4505f000-7f8a4525e000 ---p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525e000-7f8a4525f000 r--p 00016000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a4525f000-7f8a45260000 rw-p 00017000 fd:00 2099938 /lib64/libpthread-2.14.90.so 7f8a45260000-7f8a45264000 rw-p 00000000 00:00 0 7f8a45264000-7f8a45286000 r-xp 00000000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45457000-7f8a4545a000 rw-p 00000000 00:00 0 7f8a45484000-7f8a45485000 rw-p 00000000 00:00 0 7f8a45485000-7f8a45486000 r--p 00021000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45486000-7f8a45487000 rw-p 00022000 fd:00 2097348 /lib64/ld-2.14.90.so 7f8a45487000-7f8a45488000 rw-p 00000000 00:00 0 7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 7fff627ff000-7fff62800000 r-xp 00000000 00:00 0 [vdso] ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall] where only the vma that is being used as a stack by *that* task is marked as [stack]. Analogous changes have been made to /proc/PID/smaps, /proc/PID/numa_maps, /proc/PID/task/TID/smaps and /proc/PID/task/TID/numa_maps. Relevant snippets from smaps and numa_maps: [siddhesh@localhost ~ ]$ pgrep a.out 1441 [siddhesh@localhost ~ ]$ cat /proc/1441/smaps | grep "\[stack" 7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack:1442] 7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack] [siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/smaps | grep "\[stack" 7f8a44492000-7f8a44c92000 rw-p 00000000 00:00 0 [stack] [siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/smaps | grep "\[stack" 7fff6273b000-7fff6275c000 rw-p 00000000 00:00 0 [stack] [siddhesh@localhost ~ ]$ cat /proc/1441/numa_maps | grep "stack" 7f8a44492000 default stack:1442 anon=2 dirty=2 N0=2 7fff6273a000 default stack anon=3 dirty=3 N0=3 [siddhesh@localhost ~ ]$ cat /proc/1441/task/1442/numa_maps | grep "stack" 7f8a44492000 default stack anon=2 dirty=2 N0=2 [siddhesh@localhost ~ ]$ cat /proc/1441/task/1441/numa_maps | grep "stack" 7fff6273a000 default stack anon=3 dirty=3 N0=3 [akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: fix build] Signed-off-by: Siddhesh Poyarekar Cc: KOSAKI Motohiro Cc: Alexander Viro Cc: Jamie Lokier Cc: Mike Frysinger Cc: Alexey Dobriyan Cc: Matt Mackall Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 12 +-- fs/proc/internal.h | 9 ++- fs/proc/task_mmu.c | 210 +++++++++++++++++++++++++++++++++++++++++---------- fs/proc/task_nommu.c | 69 ++++++++++++++--- 4 files changed, 239 insertions(+), 61 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 965d4bde3a3..3b42c1418f3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2989,9 +2989,9 @@ static const struct pid_entry tgid_base_stuff[] = { INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), - REG("maps", S_IRUGO, proc_maps_operations), + REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, proc_numa_maps_operations), + REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), @@ -3002,7 +3002,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("mountstats", S_IRUSR, proc_mountstats_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), - REG("smaps", S_IRUGO, proc_smaps_operations), + REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("pagemap", S_IRUGO, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY @@ -3348,9 +3348,9 @@ static const struct pid_entry tid_base_stuff[] = { INF("cmdline", S_IRUGO, proc_pid_cmdline), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), - REG("maps", S_IRUGO, proc_maps_operations), + REG("maps", S_IRUGO, proc_tid_maps_operations), #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, proc_numa_maps_operations), + REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), @@ -3360,7 +3360,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), - REG("smaps", S_IRUGO, proc_smaps_operations), + REG("smaps", S_IRUGO, proc_tid_smaps_operations), REG("pagemap", S_IRUGO, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 292577531ad..c44efe19798 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -53,9 +53,12 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); -extern const struct file_operations proc_maps_operations; -extern const struct file_operations proc_numa_maps_operations; -extern const struct file_operations proc_smaps_operations; +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_tid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_tid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_tid_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c7e3a163295..9694cc28351 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -209,16 +209,20 @@ static int do_maps_open(struct inode *inode, struct file *file, return ret; } -static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) +static void +show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; int len; + const char *name = NULL; if (file) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; @@ -252,36 +256,57 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) if (file) { pad_len_spaces(m, len); seq_path(m, &file->f_path, "\n"); - } else { - const char *name = arch_vma_name(vma); - if (!name) { - if (mm) { - if (vma->vm_start <= mm->brk && - vma->vm_end >= mm->start_brk) { - name = "[heap]"; - } else if (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack) { - name = "[stack]"; - } + goto done; + } + + name = arch_vma_name(vma); + if (!name) { + pid_t tid; + + if (!mm) { + name = "[vdso]"; + goto done; + } + + if (vma->vm_start <= mm->brk && + vma->vm_end >= mm->start_brk) { + name = "[heap]"; + goto done; + } + + tid = vm_is_stack(task, vma, is_pid); + + if (tid != 0) { + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) { + name = "[stack]"; } else { - name = "[vdso]"; + /* Thread stack in /proc/PID/maps */ + pad_len_spaces(m, len); + seq_printf(m, "[stack:%d]", tid); } } - if (name) { - pad_len_spaces(m, len); - seq_puts(m, name); - } + } + +done: + if (name) { + pad_len_spaces(m, len); + seq_puts(m, name); } seq_putc(m, '\n'); } -static int show_map(struct seq_file *m, void *v) +static int show_map(struct seq_file *m, void *v, int is_pid) { struct vm_area_struct *vma = v; struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; - show_map_vma(m, vma); + show_map_vma(m, vma, is_pid); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task->mm)) @@ -289,20 +314,49 @@ static int show_map(struct seq_file *m, void *v) return 0; } +static int show_pid_map(struct seq_file *m, void *v) +{ + return show_map(m, v, 1); +} + +static int show_tid_map(struct seq_file *m, void *v) +{ + return show_map(m, v, 0); +} + static const struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_map + .show = show_pid_map +}; + +static const struct seq_operations proc_tid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_map }; -static int maps_open(struct inode *inode, struct file *file) +static int pid_maps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_pid_maps_op); } -const struct file_operations proc_maps_operations = { - .open = maps_open, +static int tid_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_tid_maps_op); +} + +const struct file_operations proc_pid_maps_operations = { + .open = pid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +const struct file_operations proc_tid_maps_operations = { + .open = tid_maps_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, @@ -416,7 +470,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } -static int show_smap(struct seq_file *m, void *v) +static int show_smap(struct seq_file *m, void *v, int is_pid) { struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; @@ -434,7 +488,7 @@ static int show_smap(struct seq_file *m, void *v) if (vma->vm_mm && !is_vm_hugetlb_page(vma)) walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); - show_map_vma(m, vma); + show_map_vma(m, vma, is_pid); seq_printf(m, "Size: %8lu kB\n" @@ -473,20 +527,49 @@ static int show_smap(struct seq_file *m, void *v) return 0; } +static int show_pid_smap(struct seq_file *m, void *v) +{ + return show_smap(m, v, 1); +} + +static int show_tid_smap(struct seq_file *m, void *v) +{ + return show_smap(m, v, 0); +} + static const struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_smap + .show = show_pid_smap +}; + +static const struct seq_operations proc_tid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_smap }; -static int smaps_open(struct inode *inode, struct file *file) +static int pid_smaps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_pid_smaps_op); } -const struct file_operations proc_smaps_operations = { - .open = smaps_open, +static int tid_smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_tid_smaps_op); +} + +const struct file_operations proc_pid_smaps_operations = { + .open = pid_smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +const struct file_operations proc_tid_smaps_operations = { + .open = tid_smaps_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, @@ -1039,7 +1122,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, /* * Display pages allocated per node and memory policy via /proc. */ -static int show_numa_map(struct seq_file *m, void *v) +static int show_numa_map(struct seq_file *m, void *v, int is_pid) { struct numa_maps_private *numa_priv = m->private; struct proc_maps_private *proc_priv = &numa_priv->proc_maps; @@ -1076,9 +1159,19 @@ static int show_numa_map(struct seq_file *m, void *v) seq_path(m, &file->f_path, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_printf(m, " heap"); - } else if (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack) { - seq_printf(m, " stack"); + } else { + pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid); + if (tid != 0) { + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) + seq_printf(m, " stack"); + else + seq_printf(m, " stack:%d", tid); + } } if (is_vm_hugetlb_page(vma)) @@ -1121,21 +1214,39 @@ out: return 0; } +static int show_pid_numa_map(struct seq_file *m, void *v) +{ + return show_numa_map(m, v, 1); +} + +static int show_tid_numa_map(struct seq_file *m, void *v) +{ + return show_numa_map(m, v, 0); +} + static const struct seq_operations proc_pid_numa_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_numa_map, + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_pid_numa_map, }; -static int numa_maps_open(struct inode *inode, struct file *file) +static const struct seq_operations proc_tid_numa_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_numa_map, +}; + +static int numa_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) { struct numa_maps_private *priv; int ret = -ENOMEM; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv) { priv->proc_maps.pid = proc_pid(inode); - ret = seq_open(file, &proc_pid_numa_maps_op); + ret = seq_open(file, ops); if (!ret) { struct seq_file *m = file->private_data; m->private = priv; @@ -1146,8 +1257,25 @@ static int numa_maps_open(struct inode *inode, struct file *file) return ret; } -const struct file_operations proc_numa_maps_operations = { - .open = numa_maps_open, +static int pid_numa_maps_open(struct inode *inode, struct file *file) +{ + return numa_maps_open(inode, file, &proc_pid_numa_maps_op); +} + +static int tid_numa_maps_open(struct inode *inode, struct file *file) +{ + return numa_maps_open(inode, file, &proc_tid_numa_maps_op); +} + +const struct file_operations proc_pid_numa_maps_operations = { + .open = pid_numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +const struct file_operations proc_tid_numa_maps_operations = { + .open = tid_numa_maps_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 980de547c07..74fe164d1b2 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -134,9 +134,11 @@ static void pad_len_spaces(struct seq_file *m, int len) /* * display a single VMA to a sequenced file */ -static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, + int is_pid) { struct mm_struct *mm = vma->vm_mm; + struct proc_maps_private *priv = m->private; unsigned long ino = 0; struct file *file; dev_t dev = 0; @@ -168,10 +170,19 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) pad_len_spaces(m, len); seq_path(m, &file->f_path, ""); } else if (mm) { - if (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack) { + pid_t tid = vm_is_stack(priv->task, vma, is_pid); + + if (tid != 0) { pad_len_spaces(m, len); - seq_puts(m, "[stack]"); + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) + seq_printf(m, "[stack]"); + else + seq_printf(m, "[stack:%d]", tid); } } @@ -182,11 +193,22 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) /* * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *_p) +static int show_map(struct seq_file *m, void *_p, int is_pid) { struct rb_node *p = _p; - return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb), + is_pid); +} + +static int show_pid_map(struct seq_file *m, void *_p) +{ + return show_map(m, _p, 1); +} + +static int show_tid_map(struct seq_file *m, void *_p) +{ + return show_map(m, _p, 0); } static void *m_start(struct seq_file *m, loff_t *pos) @@ -240,10 +262,18 @@ static const struct seq_operations proc_pid_maps_ops = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_map + .show = show_pid_map +}; + +static const struct seq_operations proc_tid_maps_ops = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_map }; -static int maps_open(struct inode *inode, struct file *file) +static int maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) { struct proc_maps_private *priv; int ret = -ENOMEM; @@ -251,7 +281,7 @@ static int maps_open(struct inode *inode, struct file *file) priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv) { priv->pid = proc_pid(inode); - ret = seq_open(file, &proc_pid_maps_ops); + ret = seq_open(file, ops); if (!ret) { struct seq_file *m = file->private_data; m->private = priv; @@ -262,8 +292,25 @@ static int maps_open(struct inode *inode, struct file *file) return ret; } -const struct file_operations proc_maps_operations = { - .open = maps_open, +static int pid_maps_open(struct inode *inode, struct file *file) +{ + return maps_open(inode, file, &proc_pid_maps_ops); +} + +static int tid_maps_open(struct inode *inode, struct file *file) +{ + return maps_open(inode, file, &proc_tid_maps_ops); +} + +const struct file_operations proc_pid_maps_operations = { + .open = pid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +const struct file_operations proc_tid_maps_operations = { + .open = tid_maps_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, -- cgit v1.2.3 From 4e474a00d7ff746ed177ddae14fa8b2d4bad7a00 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 22 Mar 2012 14:42:22 -0700 Subject: sysctl: protect poll() in entries that may go away Protect code accessing ctl_table by grabbing the header with grab_header() and after releasing with sysctl_head_finish(). This is needed if poll() is called in entries created by modules: currently only hostname and domainname support poll(), but this bug may be triggered when/if modules use it and if user called poll() in a file that doesn't support it. Dave Jones reported the following when using a syscall fuzzer while hibernating/resuming: RIP: 0010:[] [] proc_sys_poll+0x4e/0x90 RAX: 0000000000000145 RBX: ffff88020cab6940 RCX: 0000000000000000 RDX: ffffffff81233df0 RSI: 6b6b6b6b6b6b6b6b RDI: ffff88020cab6940 [ ... ] Code: 00 48 89 fb 48 89 f1 48 8b 40 30 4c 8b 60 e8 b8 45 01 00 00 49 83 7c 24 28 00 74 2e 49 8b 74 24 30 48 85 f6 74 24 48 85 c9 75 32 <8b> 16 b8 45 01 00 00 48 63 d2 49 39 d5 74 10 8b 06 48 98 48 89 If an entry goes away while we are polling() it, ctl_table may not exist anymore. Reported-by: Dave Jones Signed-off-by: Lucas De Marchi Cc: Al Viro Cc: Linus Torvalds Cc: Alexey Dobriyan Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a7708b7c957..47b474b572c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -525,20 +525,32 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, static int proc_sys_open(struct inode *inode, struct file *filp) { + struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; + /* sysctl was unregistered */ + if (IS_ERR(head)) + return PTR_ERR(head); + if (table->poll) filp->private_data = proc_sys_poll_event(table->poll); + sysctl_head_finish(head); + return 0; } static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; - unsigned long event = (unsigned long)filp->private_data; unsigned int ret = DEFAULT_POLLMASK; + unsigned long event; + + /* sysctl was unregistered */ + if (IS_ERR(head)) + return POLLERR | POLLHUP; if (!table->proc_handler) goto out; @@ -546,6 +558,7 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) if (!table->poll) goto out; + event = (unsigned long)filp->private_data; poll_wait(filp, &table->poll->wait, wait); if (event != atomic_read(&table->poll->event)) { @@ -554,6 +567,8 @@ static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) } out: + sysctl_head_finish(head); + return ret; } -- cgit v1.2.3 From b908243c549448fc0662f9cdd8d5cfe620fcdc31 Mon Sep 17 00:00:00 2001 From: Djalal Harouni Date: Fri, 23 Mar 2012 15:02:52 -0700 Subject: fs/proc/kcore.c: make get_sparsemem_vmemmap_info() static get_sparsemem_vmemmap_info() is only used inside fs/proc/kcore.c Signed-off-by: Djalal Harouni Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e5e69aff6c6..86c67eee439 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -157,7 +157,8 @@ static int kcore_update_ram(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP /* calculate vmemmap's address from given system ram pfn and register it */ -int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) { unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; unsigned long nr_pages = ent->size >> PAGE_SHIFT; @@ -189,7 +190,8 @@ int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) } #else -int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) { return 1; } -- cgit v1.2.3 From 59a32e2ce5eb809967cac4e718bc527beca83c59 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Mar 2012 15:02:53 -0700 Subject: proc: speed up /proc/stat handling On a typical 16 cpus machine, "cat /proc/stat" gives more than 4096 bytes, and is slow : # strace -T -o /tmp/STRACE cat /proc/stat | wc -c 5826 # grep "cpu " /tmp/STRACE read(0, "cpu 1949310 19 2144714 12117253"..., 32768) = 5826 <0.001504> Thats partly because show_stat() must be called twice since initial buffer size is too small (4096 bytes for less than 32 possible cpus) Fix this by : 1) Taking into account nr_irqs in the initial buffer sizing. 2) Using ksize() to allow better filling of initial buffer. Signed-off-by: Eric Dumazet Cc: Glauber Costa Cc: Russell King - ARM Linux Cc: KAMEZAWA Hiroyuki Cc: Paul Turner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 121f77cfef7..ac446114cd4 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -157,11 +157,14 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - unsigned size = 4096 * (1 + num_possible_cpus() / 32); + unsigned size = 1024 + 128 * num_possible_cpus(); char *buf; struct seq_file *m; int res; + /* minimum size to display an interrupt count : 2 bytes */ + size += 2 * nr_irqs; + /* don't ask for more than the kmalloc() max size */ if (size > KMALLOC_MAX_SIZE) size = KMALLOC_MAX_SIZE; @@ -173,7 +176,7 @@ static int stat_open(struct inode *inode, struct file *file) if (!res) { m = file->private_data; m->buf = buf; - m->size = size; + m->size = ksize(buf); } else kfree(buf); return res; -- cgit v1.2.3 From 1ac101a5d675aca2426c5cd460c73fb95acb8391 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 23 Mar 2012 15:02:54 -0700 Subject: procfs: add num_to_str() to speed up /proc/stat == stat_check.py num = 0 with open("/proc/stat") as f: while num < 1000 : data = f.read() f.seek(0, 0) num = num + 1 == perf shows 20.39% stat_check.py [kernel.kallsyms] [k] format_decode 13.41% stat_check.py [kernel.kallsyms] [k] number 12.61% stat_check.py [kernel.kallsyms] [k] vsnprintf 10.85% stat_check.py [kernel.kallsyms] [k] memcpy 4.85% stat_check.py [kernel.kallsyms] [k] radix_tree_lookup 4.43% stat_check.py [kernel.kallsyms] [k] seq_printf This patch removes most of calls to vsnprintf() by adding num_to_str() and seq_print_decimal_ull(), which prints decimal numbers without rich functions provided by printf(). On my 8cpu box. == Before patch == [root@bluextal test]# time ./stat_check.py real 0m0.150s user 0m0.026s sys 0m0.121s == After patch == [root@bluextal test]# time ./stat_check.py real 0m0.055s user 0m0.022s sys 0m0.030s [akpm@linux-foundation.org: remove incorrect comment, use less statck in num_to_str(), move comment from .h to .c, simplify seq_put_decimal_ull()] [andrea@betterlinux.com: avoid breaking the ABI in /proc/stat] Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrea Righi Cc: Eric Dumazet Cc: Glauber Costa Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Paul Turner Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 55 +++++++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 28 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index ac446114cd4..6a0c62d6e44 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -89,18 +89,19 @@ static int show_stat(struct seq_file *p, void *v) } sum += arch_irq_stat(); - seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu " - "%llu\n", - (unsigned long long)cputime64_to_clock_t(user), - (unsigned long long)cputime64_to_clock_t(nice), - (unsigned long long)cputime64_to_clock_t(system), - (unsigned long long)cputime64_to_clock_t(idle), - (unsigned long long)cputime64_to_clock_t(iowait), - (unsigned long long)cputime64_to_clock_t(irq), - (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest), - (unsigned long long)cputime64_to_clock_t(guest_nice)); + seq_puts(p, "cpu "); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; @@ -113,26 +114,24 @@ static int show_stat(struct seq_file *p, void *v) steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; - seq_printf(p, - "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " - "%llu\n", - i, - (unsigned long long)cputime64_to_clock_t(user), - (unsigned long long)cputime64_to_clock_t(nice), - (unsigned long long)cputime64_to_clock_t(system), - (unsigned long long)cputime64_to_clock_t(idle), - (unsigned long long)cputime64_to_clock_t(iowait), - (unsigned long long)cputime64_to_clock_t(irq), - (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal), - (unsigned long long)cputime64_to_clock_t(guest), - (unsigned long long)cputime64_to_clock_t(guest_nice)); + seq_printf(p, "cpu%d", i); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_putc(p, '\n'); } seq_printf(p, "intr %llu", (unsigned long long)sum); /* sum again ? it could be updated? */ for_each_irq_nr(j) - seq_printf(p, " %u", kstat_irqs(j)); + seq_put_decimal_ull(p, ' ', kstat_irqs(j)); seq_printf(p, "\nctxt %llu\n" @@ -149,7 +148,7 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) - seq_printf(p, " %u", per_softirq_sums[i]); + seq_put_decimal_ull(p, ' ', per_softirq_sums[i]); seq_putc(p, '\n'); return 0; -- cgit v1.2.3 From bda7bad62bc4c4e0783348e8db51abe094153c56 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 23 Mar 2012 15:02:54 -0700 Subject: procfs: speed up /proc/pid/stat, statm Process accounting applications as top, ps visit some files under /proc/. With seq_put_decimal_ull(), we can optimize /proc//stat and /proc//statm files. This patch adds - seq_put_decimal_ll() for signed values. - allow delimiter == 0. - convert seq_printf() to seq_put_decimal_ull/ll in /proc/stat, statm. Test result on a system with 2000+ procs. Before patch: [kamezawa@bluextal test]$ top -b -n 1 | wc -l 2223 [kamezawa@bluextal test]$ time top -b -n 1 > /dev/null real 0m0.675s user 0m0.044s sys 0m0.121s [kamezawa@bluextal test]$ time ps -elf > /dev/null real 0m0.236s user 0m0.056s sys 0m0.176s After patch: kamezawa@bluextal ~]$ time top -b -n 1 > /dev/null real 0m0.657s user 0m0.052s sys 0m0.100s [kamezawa@bluextal ~]$ time ps -elf > /dev/null real 0m0.198s user 0m0.050s sys 0m0.145s Considering top, ps tend to scan /proc periodically, this will reduce cpu consumption by top/ps to some extent. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: KAMEZAWA Hiroyuki Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 119 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 64 insertions(+), 55 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index c602b8d20f0..fbb53c24908 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -462,59 +462,56 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); - seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ -%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", - pid_nr_ns(pid, ns), - tcomm, - state, - ppid, - pgid, - sid, - tty_nr, - tty_pgrp, - task->flags, - min_flt, - cmin_flt, - maj_flt, - cmaj_flt, - cputime_to_clock_t(utime), - cputime_to_clock_t(stime), - cputime_to_clock_t(cutime), - cputime_to_clock_t(cstime), - priority, - nice, - num_threads, - start_time, - vsize, - mm ? get_mm_rss(mm) : 0, - rsslim, - mm ? (permitted ? mm->start_code : 1) : 0, - mm ? (permitted ? mm->end_code : 1) : 0, - (permitted && mm) ? mm->start_stack : 0, - esp, - eip, - /* The signal information here is obsolete. - * It must be decimal for Linux 2.0 compatibility. - * Use /proc/#/status for real-time signals. - */ - task->pending.signal.sig[0] & 0x7fffffffUL, - task->blocked.sig[0] & 0x7fffffffUL, - sigign .sig[0] & 0x7fffffffUL, - sigcatch .sig[0] & 0x7fffffffUL, - wchan, - 0UL, - 0UL, - task->exit_signal, - task_cpu(task), - task->rt_priority, - task->policy, - (unsigned long long)delayacct_blkio_ticks(task), - cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime), - (mm && permitted) ? mm->start_data : 0, - (mm && permitted) ? mm->end_data : 0, - (mm && permitted) ? mm->start_brk : 0); + seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); + seq_put_decimal_ll(m, ' ', ppid); + seq_put_decimal_ll(m, ' ', pgid); + seq_put_decimal_ll(m, ' ', sid); + seq_put_decimal_ll(m, ' ', tty_nr); + seq_put_decimal_ll(m, ' ', tty_pgrp); + seq_put_decimal_ull(m, ' ', task->flags); + seq_put_decimal_ull(m, ' ', min_flt); + seq_put_decimal_ull(m, ' ', cmin_flt); + seq_put_decimal_ull(m, ' ', maj_flt); + seq_put_decimal_ull(m, ' ', cmaj_flt); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime)); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime)); + seq_put_decimal_ll(m, ' ', priority); + seq_put_decimal_ll(m, ' ', nice); + seq_put_decimal_ll(m, ' ', num_threads); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', start_time); + seq_put_decimal_ull(m, ' ', vsize); + seq_put_decimal_ll(m, ' ', mm ? get_mm_rss(mm) : 0); + seq_put_decimal_ull(m, ' ', rsslim); + seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); + seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); + seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0); + seq_put_decimal_ull(m, ' ', esp); + seq_put_decimal_ull(m, ' ', eip); + /* The signal information here is obsolete. + * It must be decimal for Linux 2.0 compatibility. + * Use /proc/#/status for real-time signals. + */ + seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', wchan); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ll(m, ' ', task->exit_signal); + seq_put_decimal_ll(m, ' ', task_cpu(task)); + seq_put_decimal_ull(m, ' ', task->rt_priority); + seq_put_decimal_ull(m, ' ', task->policy); + seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); + seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_data : 0); + seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->end_data : 0); + seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_brk : 0); + seq_putc(m, '\n'); if (mm) mmput(mm); return 0; @@ -542,8 +539,20 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); } - seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", - size, resident, shared, text, data); + /* + * For quick read, open code by putting numbers directly + * expected format is + * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + * size, resident, shared, text, data); + */ + seq_put_decimal_ull(m, 0, size); + seq_put_decimal_ull(m, ' ', resident); + seq_put_decimal_ull(m, ' ', shared); + seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', 0); + seq_putc(m, '\n'); return 0; } -- cgit v1.2.3 From 1b26c9b334044cff6d1d2698f2be41bc7d9a0864 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 23 Mar 2012 15:02:55 -0700 Subject: proc-ns: use d_set_d_op() API to set dentry ops in proc_ns_instantiate(). The namespace cleanup path leaks a dentry which holds a reference count on a network namespace. Keeping that network namespace from being freed when the last user goes away. Leaving things like vlan devices in the leaked network namespace. If you use ip netns add for much real work this problem becomes apparent pretty quickly. It light testing the problem hides because frequently you simply don't notice the leak. Use d_set_d_op() so that DCACHE_OP_* flags are set correctly. This issue exists back to 3.0. Acked-by: "Eric W. Biederman" Reported-by: Justin Pettit Signed-off-by: Pravin B Shelar Signed-off-by: Jesse Gross Cc: David Miller Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/namespaces.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 27da860115c..3551f1f839e 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -53,7 +53,7 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, ei->ns_ops = ns_ops; ei->ns = ns; - dentry->d_op = &pid_dentry_operations; + d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, NULL)) -- cgit v1.2.3 From 9ffc93f203c18a70623f21950f1dd473c9ec48cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Remove all #inclusions of asm/system.h Remove all #inclusions of asm/system.h preparatory to splitting and killing it. Performed with the following command: perl -p -i -e 's!^#\s*include\s*.*\n!!' `grep -Irl '^#\s*include\s*' *` Signed-off-by: David Howells --- fs/proc/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8461a7b82fd..205c9228083 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -22,7 +22,6 @@ #include #include -#include #include #include "internal.h" -- cgit v1.2.3 From 3748b2f15b06ea1861df39d5e9693dcd6e9542b1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 28 Mar 2012 14:42:39 -0700 Subject: procfs: fix /proc/statm bda7bad62bc4 ("procfs: speed up /proc/pid/stat, statm") broke /proc/statm - 'text' is printed twice by mistake. Signed-off-by: KAMEZAWA Hiroyuki Reported-by: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index fbb53c24908..f9bd395b347 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -550,7 +550,7 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', shared); seq_put_decimal_ull(m, ' ', text); seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', data); seq_put_decimal_ull(m, ' ', 0); seq_putc(m, '\n'); -- cgit v1.2.3 From 45f83cefe3a5f0476ac3f96382ebfdc3fe4caab2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 28 Mar 2012 14:42:40 -0700 Subject: mm: thp: fix up pmd_trans_unstable() locations pmd_trans_unstable() should be called before pmd_offset_map() in the locations where the mmap_sem is held for reading. Signed-off-by: Andrea Arcangeli Cc: Mel Gorman Cc: Hugh Dickins Cc: Larry Woodman Cc: Ulrich Obergfell Cc: Rik van Riel Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9694cc28351..c283832d411 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -781,9 +781,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); - if (pmd_trans_unstable(pmd)) - return 0; - /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); spin_lock(&walk->mm->page_table_lock); @@ -802,6 +799,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } + if (pmd_trans_unstable(pmd)) + return 0; for (; addr != end; addr += PAGE_SIZE) { /* check to see if we've left 'vma' behind -- cgit v1.2.3 From 4c619aa0ba171c092a0ae5d969364deb82dbe371 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 28 Mar 2012 14:42:52 -0700 Subject: fs/proc/namespaces.c: prevent crash when ns_entries[] is empty If CONFIG_NET_NS, CONFIG_UTS_NS and CONFIG_IPC_NS are disabled, ns_entries[] becomes empty and things like ns_entries[ARRAY_SIZE(ns_entries) - 1] will explode. Reported-by: Richard Weinberger Cc: "Eric W. Biederman" Cc: Daniel Lezcano Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/namespaces.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 3551f1f839e..0d9e23a39e4 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -156,15 +156,15 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out; - last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; - for (entry = ns_entries; entry <= last; entry++) { + last = &ns_entries[ARRAY_SIZE(ns_entries)]; + for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) continue; if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } error = ERR_PTR(-ENOENT); - if (entry > last) + if (entry == last) goto out; error = proc_ns_instantiate(dir, dentry, task, *entry); -- cgit v1.2.3 From 10bdfb5ef7e1a429a3de31e498942a8ae5749a46 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 29 Mar 2012 13:58:17 -0700 Subject: pagemap: remove remaining unneeded spin_lock() Commit 025c5b2451e4 ("thp: optimize away unnecessary page table locking") moves spin_lock() into pmd_trans_huge_lock() in order to avoid locking unless pmd is for thp. So this spin_lock() is a bug. Reported-by: Sasha Levin Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c283832d411..2b9a7607cbd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -783,7 +783,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); - spin_lock(&walk->mm->page_table_lock); if (pmd_trans_huge_lock(pmd, vma) == 1) { for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; -- cgit v1.2.3 From cb85a6ed67e979c59a29b7b4e8217e755b951cf4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 30 Mar 2012 12:23:08 +0200 Subject: proc: stats: Use arch_idle_time for idle and iowait times if available Git commit a25cac5198d4ff28 "proc: Consider NO_HZ when printing idle and iowait times" changes the code for /proc/stat to use get_cpu_idle_time_us and get_cpu_iowait_time_us if the system is running with nohz enabled. For architectures which define arch_idle_time (currently s390 only) this is a change for the worse. The result of arch_idle_time is supposed to be the exact sleep time of the target cpu and should be used instead of the value kept by the scheduler. Signed-off-by: Martin Schwidefsky Reviewed-by: Michal Hocko Reviewed-by: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/20120330122308.18720283@de.ibm.com Signed-off-by: Thomas Gleixner --- fs/proc/stat.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 6a0c62d6e44..64c3b317236 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -18,19 +18,39 @@ #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif -#ifndef arch_idle_time -#define arch_idle_time(cpu) 0 -#endif + +#ifdef arch_idle_time + +static cputime64_t get_idle_time(int cpu) +{ + cputime64_t idle; + + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) + idle += arch_idle_time(cpu); + return idle; +} + +static cputime64_t get_iowait_time(int cpu) +{ + cputime64_t iowait; + + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + if (cpu_online(cpu) && nr_iowait_cpu(cpu)) + iowait += arch_idle_time(cpu); + return iowait; +} + +#else static u64 get_idle_time(int cpu) { u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL); - if (idle_time == -1ULL) { + if (idle_time == -1ULL) /* !NO_HZ so we can rely on cpustat.idle */ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; - idle += arch_idle_time(cpu); - } else + else idle = usecs_to_cputime64(idle_time); return idle; @@ -49,6 +69,8 @@ static u64 get_iowait_time(int cpu) return iowait; } +#endif + static int show_stat(struct seq_file *p, void *v) { int i, j; -- cgit v1.2.3 From 99663be772c827b8f5f594fe87eb4807be1994e5 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Thu, 5 Apr 2012 14:25:04 -0700 Subject: proc: fix mount -t proc -o AAA The proc_parse_options() call from proc_mount() runs only once at boot time. So on any later mount attempt, any mount options are ignored because ->s_root is already initialized. As a consequence, "mount -o " will ignore the options. The only way to change mount options is "mount -o remount,". To fix this, parse the mount options unconditionally. Signed-off-by: Vasiliy Kulikov Reported-by: Arkadiusz Miskiewicz Tested-by: Arkadiusz Miskiewicz Cc: Alexey Dobriyan Cc: Al Viro Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/root.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/root.c b/fs/proc/root.c index 46a15d8a29c..eed44bfc85d 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -115,12 +115,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) return ERR_CAST(sb); + if (!proc_parse_options(options, ns)) { + deactivate_locked_super(sb); + return ERR_PTR(-EINVAL); + } + if (!sb->s_root) { sb->s_flags = flags; - if (!proc_parse_options(options, ns)) { - deactivate_locked_super(sb); - return ERR_PTR(-EINVAL); - } err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); -- cgit v1.2.3 From 63f61a6f4633ff34c17bea7a0ed827eaeb0733e1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 25 Apr 2012 16:01:52 -0700 Subject: revert "proc: clear_refs: do not clear reserved pages" Revert commit 85e72aa5384 ("proc: clear_refs: do not clear reserved pages"), which was a quick fix suitable for -stable until ARM had been moved over to the gate_vma mechanism: https://lkml.org/lkml/2012/1/14/55 With commit f9d4861f ("ARM: 7294/1: vectors: use gate_vma for vectors user mapping"), ARM does now use the gate_vma, so the PageReserved check can be removed from the proc code. Signed-off-by: Will Deacon Cc: Nicolas Pitre Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2b9a7607cbd..2d60492d6df 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -597,9 +597,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!page) continue; - if (PageReserved(page)) - continue; - /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); ClearPageReferenced(page); -- cgit v1.2.3 From 22d917d80e842829d0ca0a561967d728eb1d6303 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 17 Nov 2011 00:11:58 -0800 Subject: userns: Rework the user_namespace adding uid/gid mapping support - Convert the old uid mapping functions into compatibility wrappers - Add a uid/gid mapping layer from user space uid and gids to kernel internal uids and gids that is extent based for simplicty and speed. * Working with number space after mapping uids/gids into their kernel internal version adds only mapping complexity over what we have today, leaving the kernel code easy to understand and test. - Add proc files /proc/self/uid_map /proc/self/gid_map These files display the mapping and allow a mapping to be added if a mapping does not exist. - Allow entering the user namespace without a uid or gid mapping. Since we are starting with an existing user our uids and gids still have global mappings so are still valid and useful they just don't have local mappings. The requirement for things to work are global uid and gid so it is odd but perfectly fine not to have a local uid and gid mapping. Not requiring global uid and gid mappings greatly simplifies the logic of setting up the uid and gid mappings by allowing the mappings to be set after the namespace is created which makes the slight weirdness worth it. - Make the mappings in the initial user namespace to the global uid/gid space explicit. Today it is an identity mapping but in the future we may want to twist this for debugging, similar to what we do with jiffies. - Document the memory ordering requirements of setting the uid and gid mappings. We only allow the mappings to be set once and there are no pointers involved so the requirments are trivial but a little atypical. Performance: In this scheme for the permission checks the performance is expected to stay the same as the actuall machine instructions should remain the same. The worst case I could think of is ls -l on a large directory where all of the stat results need to be translated with from kuids and kgids to uids and gids. So I benchmarked that case on my laptop with a dual core hyperthread Intel i5-2520M cpu with 3M of cpu cache. My benchmark consisted of going to single user mode where nothing else was running. On an ext4 filesystem opening 1,000,000 files and looping through all of the files 1000 times and calling fstat on the individuals files. This was to ensure I was benchmarking stat times where the inodes were in the kernels cache, but the inode values were not in the processors cache. My results: v3.4-rc1: ~= 156ns (unmodified v3.4-rc1 with user namespace support disabled) v3.4-rc1-userns-: ~= 155ns (v3.4-rc1 with my user namespace patches and user namespace support disabled) v3.4-rc1-userns+: ~= 164ns (v3.4-rc1 with my user namespace patches and user namespace support enabled) All of the configurations ran in roughly 120ns when I performed tests that ran in the cpu cache. So in summary the performance impact is: 1ns improvement in the worst case with user namespace support compiled out. 8ns aka 5% slowdown in the worst case with user namespace support compiled in. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/proc/base.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1c8b280146d..2ee514c7e64 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include @@ -2943,6 +2944,74 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) } #endif /* CONFIG_TASK_IO_ACCOUNTING */ +#ifdef CONFIG_USER_NS +static int proc_id_map_open(struct inode *inode, struct file *file, + struct seq_operations *seq_ops) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + struct seq_file *seq; + int ret = -EINVAL; + + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + ret = seq_open(file, seq_ops); + if (ret) + goto err_put_ns; + + seq = file->private_data; + seq->private = ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_id_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + put_user_ns(ns); + return seq_release(inode, file); +} + +static int proc_uid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_uid_seq_operations); +} + +static int proc_gid_map_open(struct inode *inode, struct file *file) +{ + return proc_id_map_open(inode, file, &proc_gid_seq_operations); +} + +static const struct file_operations proc_uid_map_operations = { + .open = proc_uid_map_open, + .write = proc_uid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; + +static const struct file_operations proc_gid_map_operations = { + .open = proc_gid_map_open, + .write = proc_gid_map_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_id_map_release, +}; +#endif /* CONFIG_USER_NS */ + static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -3045,6 +3114,10 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), #endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), +#endif }; static int proc_tgid_base_readdir(struct file * filp, @@ -3400,6 +3473,10 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), #endif +#ifdef CONFIG_USER_NS + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), +#endif }; static int proc_tid_base_readdir(struct file * filp, -- cgit v1.2.3 From ae2975bc3476243b45a1e2344236d7920c268f38 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Nov 2011 15:56:38 -0800 Subject: userns: Convert group_info values from gid_t to kgid_t. As a first step to converting struct cred to be all kuid_t and kgid_t values convert the group values stored in group_info to always be kgid_t values. Unless user namespaces are used this change should have no effect. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/proc/array.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index f9bd395b347..36a0a9192ec 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include @@ -161,6 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk) static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { + struct user_namespace *user_ns = current_user_ns(); struct group_info *group_info; int g; struct fdtable *fdt = NULL; @@ -205,7 +207,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_unlock(p); for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) - seq_printf(m, "%d ", GROUP_AT(group_info, g)); + seq_printf(m, "%d ", + from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); seq_putc(m, '\n'); -- cgit v1.2.3 From dbd5768f87ff6fb0a4fe09c4d7b6c4a24de99430 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 May 2012 14:48:02 +0200 Subject: vfs: Rename end_writeback() to clear_inode() After we moved inode_sync_wait() from end_writeback() it doesn't make sense to call the function end_writeback() anymore. Rename it to clear_inode() which well says what the function really does - set I_CLEAR flag. Signed-off-by: Jan Kara Signed-off-by: Fengguang Wu --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 205c9228083..29ab406b370 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -33,7 +33,7 @@ static void proc_evict_inode(struct inode *inode) const struct proc_ns_operations *ns_ops; truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + clear_inode(inode); /* Stop tracking associated processes */ put_pid(PROC_I(inode)->pid); -- cgit v1.2.3 From 16fbdce62d9c89b794e303f4a232e4749b77e9ac Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 10 May 2012 13:01:43 -0700 Subject: proc/pid/pagemap: correctly report non-present ptes and holes between vmas Reset the current pagemap-entry if the current pte isn't present, or if current vma is over. Otherwise pagemap reports last entry again and again. Non-present pte reporting was broken in commit 092b50bacd1c ("pagemap: introduce data structure for pagemap entry") Reporting for holes was broken in commit 5aaabe831eb5 ("pagemap: avoid splitting thp when reading /proc/pid/pagemap") Signed-off-by: Konstantin Khlebnikov Reported-by: Pavel Emelyanov Cc: Naoya Horiguchi Cc: KAMEZAWA Hiroyuki Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2d60492d6df..1030a716d15 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -747,6 +747,8 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte) else if (pte_present(pte)) *pme = make_pme(PM_PFRAME(pte_pfn(pte)) | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + else + *pme = make_pme(PM_NOT_PRESENT); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -761,6 +763,8 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + else + *pme = make_pme(PM_NOT_PRESENT); } #else static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, @@ -801,8 +805,10 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, /* check to see if we've left 'vma' behind * and need a new, higher one */ - if (vma && (addr >= vma->vm_end)) + if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); + pme = make_pme(PM_NOT_PRESENT); + } /* check that 'vma' actually covers this address, * and that it isn't a huge page vma */ @@ -830,6 +836,8 @@ static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, if (pte_present(pte)) *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + else + *pme = make_pme(PM_NOT_PRESENT); } /* This function walks within one hugetlb entry in the single call */ @@ -839,7 +847,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, { struct pagemapread *pm = walk->private; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme; for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; -- cgit v1.2.3 From dcb0f22282e680ee5202ab7574ce78beb3803a9f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 9 Feb 2012 08:48:21 -0800 Subject: userns: Convert proc to use kuid/kgid where appropriate Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/proc/array.c | 10 ++++++++-- fs/proc/base.c | 16 ++++++++-------- fs/proc/inode.c | 4 ++-- fs/proc/root.c | 2 +- 4 files changed, 19 insertions(+), 13 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 36a0a9192ec..dc4c5a7b9ec 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -191,8 +191,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_tgid_nr_ns(p, ns), pid_nr_ns(pid, ns), ppid, tpid, - cred->uid, cred->euid, cred->suid, cred->fsuid, - cred->gid, cred->egid, cred->sgid, cred->fsgid); + from_kuid_munged(user_ns, cred->uid), + from_kuid_munged(user_ns, cred->euid), + from_kuid_munged(user_ns, cred->suid), + from_kuid_munged(user_ns, cred->fsuid), + from_kgid_munged(user_ns, cred->gid), + from_kgid_munged(user_ns, cred->egid), + from_kgid_munged(user_ns, cred->sgid), + from_kgid_munged(user_ns, cred->fsgid)); task_lock(p); if (p->files) diff --git a/fs/proc/base.c b/fs/proc/base.c index 2ee514c7e64..c47904994b7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1562,8 +1562,8 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) generic_fillattr(inode, stat); rcu_read_lock(); - stat->uid = 0; - stat->gid = 0; + stat->uid = GLOBAL_ROOT_UID; + stat->gid = GLOBAL_ROOT_GID; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(pid, task, 2)) { @@ -1623,8 +1623,8 @@ int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); @@ -1811,8 +1811,8 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } inode->i_mode &= ~(S_ISUID | S_ISGID); security_task_to_inode(task, inode); @@ -2061,8 +2061,8 @@ static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = cred->egid; rcu_read_unlock(); } else { - inode->i_uid = 0; - inode->i_gid = 0; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; } security_task_to_inode(task, inode); status = 1; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 205c9228083..554ecc54799 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -108,8 +108,8 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) struct super_block *sb = root->d_sb; struct pid_namespace *pid = sb->s_fs_info; - if (pid->pid_gid) - seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); if (pid->hide_pid != 0) seq_printf(seq, ",hidepid=%u", pid->hide_pid); diff --git a/fs/proc/root.c b/fs/proc/root.c index 46a15d8a29c..df4e4561dbb 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -67,7 +67,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) case Opt_gid: if (match_int(&args[0], &option)) return 0; - pid->pid_gid = option; + pid->pid_gid = make_kgid(current_user_ns(), option); break; case Opt_hidepid: if (match_int(&args[0], &option)) -- cgit v1.2.3 From 091bd3ea4e7ff4da8509978b9be93dc9d8cf0680 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 13 Feb 2012 18:02:50 -0800 Subject: userns: Convert sysctl permission checks to use kuid and kgids. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 21d836f4029..3476bca8f7a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -371,9 +371,9 @@ void register_sysctl_root(struct ctl_table_root *root) static int test_perm(int mode, int op) { - if (!current_euid()) + if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) mode >>= 6; - else if (in_egroup_p(0)) + else if (in_egroup_p(GLOBAL_ROOT_GID)) mode >>= 3; if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; -- cgit v1.2.3 From eb94cd96e05d6c65a07937e66a04ea265c1b767d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 17 May 2012 17:03:25 -0700 Subject: fs, proc: fix ABBA deadlock in case of execution attempt of map_files/ entries map_files/ entries are never supposed to be executed, still curious minds might try to run them, which leads to the following deadlock ====================================================== [ INFO: possible circular locking dependency detected ] 3.4.0-rc4-24406-g841e6a6 #121 Not tainted ------------------------------------------------------- bash/1556 is trying to acquire lock: (&sb->s_type->i_mutex_key#8){+.+.+.}, at: do_lookup+0x267/0x2b1 but task is already holding lock: (&sig->cred_guard_mutex){+.+.+.}, at: prepare_bprm_creds+0x2d/0x69 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&sig->cred_guard_mutex){+.+.+.}: validate_chain+0x444/0x4f4 __lock_acquire+0x387/0x3f8 lock_acquire+0x12b/0x158 __mutex_lock_common+0x56/0x3a9 mutex_lock_killable_nested+0x40/0x45 lock_trace+0x24/0x59 proc_map_files_lookup+0x5a/0x165 __lookup_hash+0x52/0x73 do_lookup+0x276/0x2b1 walk_component+0x3d/0x114 do_last+0xfc/0x540 path_openat+0xd3/0x306 do_filp_open+0x3d/0x89 do_sys_open+0x74/0x106 sys_open+0x21/0x23 tracesys+0xdd/0xe2 -> #0 (&sb->s_type->i_mutex_key#8){+.+.+.}: check_prev_add+0x6a/0x1ef validate_chain+0x444/0x4f4 __lock_acquire+0x387/0x3f8 lock_acquire+0x12b/0x158 __mutex_lock_common+0x56/0x3a9 mutex_lock_nested+0x40/0x45 do_lookup+0x267/0x2b1 walk_component+0x3d/0x114 link_path_walk+0x1f9/0x48f path_openat+0xb6/0x306 do_filp_open+0x3d/0x89 open_exec+0x25/0xa0 do_execve_common+0xea/0x2f9 do_execve+0x43/0x45 sys_execve+0x43/0x5a stub_execve+0x6c/0xc0 This is because prepare_bprm_creds grabs task->signal->cred_guard_mutex and when do_lookup happens we try to grab task->signal->cred_guard_mutex again in lock_trace. Fix it using plain ptrace_may_access() helper in proc_map_files_lookup() and in proc_map_files_readdir() instead of lock_trace(), the caller must be CAP_SYS_ADMIN granted anyway. Signed-off-by: Cyrill Gorcunov Reported-by: Sasha Levin Cc: Konstantin Khlebnikov Cc: Pavel Emelyanov Cc: Dave Jones Cc: Vasiliy Kulikov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1c8b280146d..8e139c90f9f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2177,16 +2177,16 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out; result = ERR_PTR(-EACCES); - if (lock_trace(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; result = ERR_PTR(-ENOENT); if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) - goto out_unlock; + goto out_put_task; mm = get_task_mm(task); if (!mm) - goto out_unlock; + goto out_put_task; down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); @@ -2198,8 +2198,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, out_no_vma: up_read(&mm->mmap_sem); mmput(mm); -out_unlock: - unlock_trace(task); out_put_task: put_task_struct(task); out: @@ -2233,7 +2231,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; ret = -EACCES; - if (lock_trace(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; ret = 0; @@ -2241,12 +2239,12 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) case 0: ino = inode->i_ino; if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) - goto out_unlock; + goto out_put_task; filp->f_pos++; case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out_unlock; + goto out_put_task; filp->f_pos++; default: { @@ -2257,7 +2255,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) mm = get_task_mm(task); if (!mm) - goto out_unlock; + goto out_put_task; down_read(&mm->mmap_sem); nr_files = 0; @@ -2287,7 +2285,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) flex_array_free(fa); up_read(&mm->mmap_sem); mmput(mm); - goto out_unlock; + goto out_put_task; } for (i = 0, vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { @@ -2332,8 +2330,6 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) } } -out_unlock: - unlock_trace(task); out_put_task: put_task_struct(task); out: -- cgit v1.2.3 From 30a08bf2d31d275c6fc71dd1811342777e95c831 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 18 May 2012 11:32:15 -0700 Subject: proc: move fd symlink i_mode calculations into tid_fd_revalidate() Instead of doing the i_mode calculations at proc_fd_instantiate() time, move them into tid_fd_revalidate(), which is where the other inode state (notably uid/gid information) is updated too. Otherwise we'll end up with stale i_mode information if an fd is re-used while the dentry still hangs around. Not that anything really *cares* (symlink permissions don't really matter), but Tetsuo Handa noticed that the owner read/write bits don't always match the state of the readability of the file descriptor, and we _used_ to get this right a long time ago in a galaxy far, far away. Besides, aside from fixing an ugly detail (that has apparently been this way since commit 61a28784028e: "proc: Remove the hard coded inode numbers" in 2006), this removes more lines of code than it adds. And it just makes sense to update i_mode in the same place we update i_uid/gid. Al Viro correctly points out that we could just do the inode fill in the inode iops ->getattr() function instead. However, that does require somewhat slightly more invasive changes, and adds yet *another* lookup of the file descriptor. We need to do the revalidate() for other reasons anyway, and have the file descriptor handy, so we might as well fill in the information at this point. Reported-by: Tetsuo Handa Cc: Al Viro Acked-by: Eric Biederman Signed-off-by: Linus Torvalds --- fs/proc/base.c | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1c8b280146d..7d6ad98631f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1799,10 +1799,15 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) if (task) { files = get_files_struct(task); if (files) { + struct file *file; rcu_read_lock(); - if (fcheck_files(files, fd)) { + file = fcheck_files(files, fd); + if (file) { + unsigned i_mode, f_mode = file->f_mode; + rcu_read_unlock(); put_files_struct(files); + if (task_dumpable(task)) { rcu_read_lock(); cred = __task_cred(task); @@ -1813,7 +1818,14 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_uid = 0; inode->i_gid = 0; } - inode->i_mode &= ~(S_ISUID | S_ISGID); + + i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + security_task_to_inode(task, inode); put_task_struct(task); return 1; @@ -1837,8 +1849,6 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { unsigned fd = *(const unsigned *)ptr; - struct file *file; - struct files_struct *files; struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); @@ -1848,25 +1858,6 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, goto out; ei = PROC_I(inode); ei->fd = fd; - files = get_files_struct(task); - if (!files) - goto out_iput; - inode->i_mode = S_IFLNK; - - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (!file) - goto out_unlock; - if (file->f_mode & FMODE_READ) - inode->i_mode |= S_IRUSR | S_IXUSR; - if (file->f_mode & FMODE_WRITE) - inode->i_mode |= S_IWUSR | S_IXUSR; - spin_unlock(&files->file_lock); - put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1879,12 +1870,6 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, out: return error; -out_unlock: - spin_unlock(&files->file_lock); - put_files_struct(files); -out_iput: - iput(inode); - goto out; } static struct dentry *proc_lookupfd_common(struct inode *dir, -- cgit v1.2.3 From 08fa29d916c6e271ad13978cd993e7238c68db97 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 29 May 2012 15:06:15 -0700 Subject: mm: fix NULL ptr deref when walking hugepages A missing validation of the value returned by find_vma() could cause a NULL ptr dereference when walking the pagetable. This is triggerable from usermode by a simple user by trying to read a page info out of /proc/pid/pagemap which doesn't exist. Introduced by commit 025c5b2451e4 ("thp: optimize away unnecessary page table locking"). Signed-off-by: Sasha Levin Reviewed-by: Naoya Horiguchi Cc: David Rientjes Cc: Andi Kleen Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: [3.4.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1030a716d15..7faaf2acc57 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -784,7 +784,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; -- cgit v1.2.3 From a7f638f999ff42310e9582273b1fe25ea6e469ba Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 29 May 2012 15:06:47 -0700 Subject: mm, oom: normalize oom scores to oom_score_adj scale only for userspace The oom_score_adj scale ranges from -1000 to 1000 and represents the proportion of memory available to the process at allocation time. This means an oom_score_adj value of 300, for example, will bias a process as though it was using an extra 30.0% of available memory and a value of -350 will discount 35.0% of available memory from its usage. The oom killer badness heuristic also uses this scale to report the oom score for each eligible process in determining the "best" process to kill. Thus, it can only differentiate each process's memory usage by 0.1% of system RAM. On large systems, this can end up being a large amount of memory: 256MB on 256GB systems, for example. This can be fixed by having the badness heuristic to use the actual memory usage in scoring threads and then normalizing it to the oom_score_adj scale for userspace. This results in better comparison between eligible threads for kill and no change from the userspace perspective. Suggested-by: KOSAKI Motohiro Tested-by: Dave Jones Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index d2d3108a611..d7d711876b6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -411,12 +411,13 @@ static const struct file_operations proc_lstats_operations = { static int proc_oom_score(struct task_struct *task, char *buffer) { + unsigned long totalpages = totalram_pages + total_swap_pages; unsigned long points = 0; read_lock(&tasklist_lock); if (pid_alive(task)) - points = oom_badness(task, NULL, NULL, - totalram_pages + total_swap_pages); + points = oom_badness(task, NULL, NULL, totalpages) * + 1000 / totalpages; read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } -- cgit v1.2.3 From b409e578d9a4ec95913e06d8fea2a33f1754ea69 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 31 May 2012 16:26:17 -0700 Subject: proc: clean up /proc//environ handling Similar to e268337dfe26 ("proc: clean up and fix /proc//mem handling"), move the check of permission to open(), this will simplify read() code. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Cong Wang Cc: Oleg Nesterov Cc: Alexey Dobriyan Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index d7d711876b6..e2d23424326 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -679,7 +679,7 @@ static const struct file_operations proc_single_file_operations = { .release = single_release, }; -static int mem_open(struct inode* inode, struct file* file) +static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); struct mm_struct *mm; @@ -687,7 +687,7 @@ static int mem_open(struct inode* inode, struct file* file) if (!task) return -ESRCH; - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, mode); put_task_struct(task); if (IS_ERR(mm)) @@ -707,6 +707,11 @@ static int mem_open(struct inode* inode, struct file* file) return 0; } +static int mem_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_ATTACH); +} + static ssize_t mem_rw(struct file *file, char __user *buf, size_t count, loff_t *ppos, int write) { @@ -803,30 +808,29 @@ static const struct file_operations proc_mem_operations = { .release = mem_release, }; +static int environ_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_READ); +} + static ssize_t environ_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char *page; unsigned long src = *ppos; - int ret = -ESRCH; - struct mm_struct *mm; + int ret = 0; + struct mm_struct *mm = file->private_data; - if (!task) - goto out_no_task; + if (!mm) + return 0; - ret = -ENOMEM; page = (char *)__get_free_page(GFP_TEMPORARY); if (!page) - goto out; - - - mm = mm_for_maps(task); - ret = PTR_ERR(mm); - if (!mm || IS_ERR(mm)) - goto out_free; + return -ENOMEM; ret = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; while (count > 0) { int this_len, retval, max_len; @@ -838,7 +842,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; this_len = (this_len > max_len) ? max_len : this_len; - retval = access_process_vm(task, (mm->env_start + src), + retval = access_remote_vm(mm, (mm->env_start + src), page, this_len, 0); if (retval <= 0) { @@ -857,19 +861,18 @@ static ssize_t environ_read(struct file *file, char __user *buf, count -= retval; } *ppos = src; - mmput(mm); -out_free: + +free: free_page((unsigned long) page); -out: - put_task_struct(task); -out_no_task: return ret; } static const struct file_operations proc_environ_operations = { + .open = environ_open, .read = environ_read, .llseek = generic_file_llseek, + .release = mem_release, }; static ssize_t oom_adjust_read(struct file *file, char __user *buf, -- cgit v1.2.3 From e7dcd9990e42ccfc798d4eb55e2dbf9d7d434c6b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 31 May 2012 16:26:17 -0700 Subject: proc: remove mm_for_maps() mm_for_maps() is a simple wrapper for mm_access(), and the name is misleading, so just remove it and use mm_access() directly. Signed-off-by: Cong Wang Cc: Oleg Nesterov Cc: Alexey Dobriyan Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 7 +------ fs/proc/internal.h | 2 -- fs/proc/task_mmu.c | 4 ++-- fs/proc/task_nommu.c | 2 +- 4 files changed, 4 insertions(+), 11 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index e2d23424326..cca635d252d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -199,11 +199,6 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -struct mm_struct *mm_for_maps(struct task_struct *task) -{ - return mm_access(task, PTRACE_MODE_READ); -} - static int proc_pid_cmdline(struct task_struct *task, char * buffer) { int res = 0; @@ -243,7 +238,7 @@ out: static int proc_pid_auxv(struct task_struct *task, char *buffer) { - struct mm_struct *mm = mm_for_maps(task); + struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); int res = PTR_ERR(mm); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5f79bb8b4c6..a30643784db 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -31,8 +31,6 @@ struct vmalloc_info { unsigned long largest_chunk; }; -extern struct mm_struct *mm_for_maps(struct task_struct *); - #ifdef CONFIG_MMU #define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) extern void get_vmalloc_info(struct vmalloc_info *vmi); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7faaf2acc57..83ede6e3b5e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -125,7 +125,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (!priv->task) return ERR_PTR(-ESRCH); - mm = mm_for_maps(priv->task); + mm = mm_access(priv->task, PTRACE_MODE_READ); if (!mm || IS_ERR(mm)) return mm; down_read(&mm->mmap_sem); @@ -919,7 +919,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!pm.buffer) goto out_task; - mm = mm_for_maps(task); + mm = mm_access(task, PTRACE_MODE_READ); ret = PTR_ERR(mm); if (!mm || IS_ERR(mm)) goto out_free; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 74fe164d1b2..1ccfa537f5f 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -223,7 +223,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (!priv->task) return ERR_PTR(-ESRCH); - mm = mm_for_maps(priv->task); + mm = mm_access(priv->task, PTRACE_MODE_READ); if (!mm || IS_ERR(mm)) { put_task_struct(priv->task); priv->task = NULL; -- cgit v1.2.3 From 2344bec788b097b2d1198758bd29c583812b864e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 31 May 2012 16:26:18 -0700 Subject: proc: use mm_access() instead of ptrace_may_access() mm_access() handles this much better, and avoids some race conditions. Signed-off-by: Cong Wang Cc: Oleg Nesterov Cc: Alexey Dobriyan Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index cca635d252d..155dee600ed 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2022,11 +2022,8 @@ static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) if (!task) goto out_notask; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out; - - mm = get_task_mm(task); - if (!mm) + mm = mm_access(task, PTRACE_MODE_READ); + if (IS_ERR_OR_NULL(mm)) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { -- cgit v1.2.3 From f05ed3f1abefd37c08fbf08c766d2abd40607777 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 31 May 2012 16:26:18 -0700 Subject: proc: don't do dummy rcu_read_lock/rcu_read_unlock on error path rcu_read_lock()/rcu_read_unlock() is nop for TINY_RCU, but is not a nop for, say, PREEMPT_RCU. proc_fill_cache() is called without RCU lock, there is no need to lock/unlock on error path, simply jump out of the loop. Signed-off-by: Alexey Dobriyan Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 155dee600ed..0f928cbba4a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1928,21 +1928,22 @@ static int proc_readfd_common(struct file * filp, void * dirent, fd++, filp->f_pos++) { char name[PROC_NUMBUF]; int len; + int rv; if (!fcheck_files(files, fd)) continue; rcu_read_unlock(); len = snprintf(name, sizeof(name), "%d", fd); - if (proc_fill_cache(filp, dirent, filldir, - name, len, instantiate, - p, &fd) < 0) { - rcu_read_lock(); - break; - } + rv = proc_fill_cache(filp, dirent, filldir, + name, len, instantiate, p, + &fd); + if (rv < 0) + goto out_fd_loop; rcu_read_lock(); } rcu_read_unlock(); +out_fd_loop: put_files_struct(files); } out: -- cgit v1.2.3 From af5e6171437c9d62d84459b24877c94c23782676 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 31 May 2012 16:26:18 -0700 Subject: proc: pass "fd" by value in /proc/*/{fd,fdinfo} code Pass "fd" directly, not via pointer -- one less memory read. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0f928cbba4a..bd8b4ca6a61 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1848,7 +1848,7 @@ static const struct dentry_operations tid_fd_dentry_operations = static struct dentry *proc_fd_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - unsigned fd = *(const unsigned *)ptr; + unsigned fd = (unsigned long)ptr; struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); @@ -1885,7 +1885,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, if (fd == ~0U) goto out; - result = instantiate(dir, dentry, task, &fd); + result = instantiate(dir, dentry, task, (void *)(unsigned long)fd); out: put_task_struct(task); out_no_task: @@ -1937,7 +1937,7 @@ static int proc_readfd_common(struct file * filp, void * dirent, len = snprintf(name, sizeof(name), "%d", fd); rv = proc_fill_cache(filp, dirent, filldir, name, len, instantiate, p, - &fd); + (void *)(unsigned long)fd); if (rv < 0) goto out_fd_loop; rcu_read_lock(); @@ -2353,7 +2353,7 @@ static const struct inode_operations proc_fd_inode_operations = { static struct dentry *proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { - unsigned fd = *(unsigned *)ptr; + unsigned fd = (unsigned long)ptr; struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); -- cgit v1.2.3 From 715be1fce0d964aca15618b24f6f415f3cbd03c8 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Thu, 31 May 2012 16:26:19 -0700 Subject: procfs: use more apprioriate types when dumping /proc/N/stat - use int fpr priority and nice, since task_nice()/task_prio() return that - field 24: get_mm_rss() returns unsigned long Signed-off-by: Jan Engelhardt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index dc4c5a7b9ec..3a26d23420c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -370,7 +370,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { unsigned long vsize, eip, esp, wchan = ~0UL; - long priority, nice; + int priority, nice; int tty_pgrp = -1, tty_nr = 0; sigset_t sigign, sigcatch; char state; @@ -492,7 +492,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', 0); seq_put_decimal_ull(m, ' ', start_time); seq_put_decimal_ull(m, ' ', vsize); - seq_put_decimal_ll(m, ' ', mm ? get_mm_rss(mm) : 0); + seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0); seq_put_decimal_ull(m, ' ', rsslim); seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); -- cgit v1.2.3 From 052fb0d635df5d49dfc85687d94e1a87bf09378d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 31 May 2012 16:26:19 -0700 Subject: proc: report file/anon bit in /proc/pid/pagemap This is an implementation of Andrew's proposal to extend the pagemap file bits to report what is missing about tasks' working set. The problem with the working set detection is multilateral. In the criu (checkpoint/restore) project we dump the tasks' memory into image files and to do it properly we need to detect which pages inside mappings are really in use. The mincore syscall I though could help with this did not. First, it doesn't report swapped pages, thus we cannot find out which parts of anonymous mappings to dump. Next, it does report pages from page cache as present even if they are not mapped, and it doesn't make that has not been cow-ed. Note, that issue with swap pages is critical -- we must dump swap pages to image file. But the issues with file pages are optimization -- we can take all file pages to image, this would be correct, but if we know that a page is not mapped or not cow-ed, we can remove them from dump file. The dump would still be self-consistent, though significantly smaller in size (up to 10 times smaller on real apps). Andrew noticed, that the proc pagemap file solved 2 of 3 above issues -- it reports whether a page is present or swapped and it doesn't report not mapped page cache pages. But, it doesn't distinguish cow-ed file pages from not cow-ed. I would like to make the last unused bit in this file to report whether the page mapped into respective pte is PageAnon or not. [comment stolen from Pavel Emelyanov's v1 patch] Signed-off-by: Konstantin Khlebnikov Cc: Pavel Emelyanov Cc: Matt Mackall Cc: Hugh Dickins Cc: Rik van Riel Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 83ede6e3b5e..02476f5889f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -700,6 +700,7 @@ struct pagemapread { #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) +#define PM_FILE PM_STATUS(1LL) #define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) #define PM_END_OF_BUFFER 1 @@ -733,22 +734,33 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, return err; } -static u64 swap_pte_to_pagemap_entry(pte_t pte) +static void pte_to_pagemap_entry(pagemap_entry_t *pme, + struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - swp_entry_t e = pte_to_swp_entry(pte); - return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); -} - -static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte) -{ - if (is_swap_pte(pte)) - *pme = make_pme(PM_PFRAME(swap_pte_to_pagemap_entry(pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP); - else if (pte_present(pte)) - *pme = make_pme(PM_PFRAME(pte_pfn(pte)) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); - else + u64 frame, flags; + struct page *page = NULL; + + if (pte_present(pte)) { + frame = pte_pfn(pte); + flags = PM_PRESENT; + page = vm_normal_page(vma, addr, pte); + } else if (is_swap_pte(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + flags = PM_SWAP; + if (is_migration_entry(entry)) + page = migration_entry_to_page(entry); + } else { *pme = make_pme(PM_NOT_PRESENT); + return; + } + + if (page && !PageAnon(page)) + flags |= PM_FILE; + + *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -815,7 +827,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pte_to_pagemap_entry(&pme, *pte); + pte_to_pagemap_entry(&pme, vma, addr, *pte); /* unmap before userspace copy */ pte_unmap(pte); } @@ -869,11 +881,11 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * For each page in the address space, this file contains one 64-bit entry * consisting of the following: * - * Bits 0-55 page frame number (PFN) if present + * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped - * Bits 5-55 swap offset if swapped + * Bits 5-54 swap offset if swapped * Bits 55-60 page shift (page size = 1< Date: Thu, 31 May 2012 16:26:20 -0700 Subject: proc/smaps: carefully handle migration entries Currently smaps reports migration entries as "swap", as result "swap" can appears in shared mapping. This patch converts migration entries into pages and handles them as usual. Signed-off-by: Konstantin Khlebnikov Cc: Andi Kleen Cc: Pavel Emelyanov Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 02476f5889f..e2c1155ac09 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -402,18 +402,20 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; - struct page *page; + struct page *page = NULL; int mapcount; - if (is_swap_pte(ptent)) { - mss->swap += ptent_size; - return; - } + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); - if (!pte_present(ptent)) - return; + if (!non_swap_entry(swpent)) + mss->swap += ptent_size; + else if (is_migration_entry(swpent)) + page = migration_entry_to_page(swpent); + } - page = vm_normal_page(vma, addr, ptent); if (!page) return; -- cgit v1.2.3 From bca15543736f9be6d84e0bbc262ea7069076b9e6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 31 May 2012 16:26:20 -0700 Subject: proc/smaps: show amount of nonlinear ptes in vma Currently, nonlinear mappings can not be distinguished from ordinary mappings. This patch adds into /proc/pid/smaps line "Nonlinear: kB", where size is amount of nonlinear ptes in vma, this line appears only if VM_NONLINEAR is set. This information may be useful not only for checkpoint/restore project. Requested by Pavel Emelyanov. Signed-off-by: Konstantin Khlebnikov Cc: Andi Kleen Cc: Pavel Emelyanov Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e2c1155ac09..4540b8f76f1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -393,6 +393,7 @@ struct mem_size_stats { unsigned long anonymous; unsigned long anonymous_thp; unsigned long swap; + unsigned long nonlinear; u64 pss; }; @@ -402,6 +403,7 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; + pgoff_t pgoff = linear_page_index(vma, addr); struct page *page = NULL; int mapcount; @@ -414,6 +416,9 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr, mss->swap += ptent_size; else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + } else if (pte_file(ptent)) { + if (pte_to_pgoff(ptent) != pgoff) + mss->nonlinear += ptent_size; } if (!page) @@ -422,6 +427,9 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr, if (PageAnon(page)) mss->anonymous += ptent_size; + if (page->index != pgoff) + mss->nonlinear += ptent_size; + mss->resident += ptent_size; /* Accumulate the size in pages that have been accessed. */ if (pte_young(ptent) || PageReferenced(page)) @@ -523,6 +531,10 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) (vma->vm_flags & VM_LOCKED) ? (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); + if (vma->vm_flags & VM_NONLINEAR) + seq_printf(m, "Nonlinear: %8lu kB\n", + mss.nonlinear >> 10); + if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task->mm)) ? vma->vm_start : 0; -- cgit v1.2.3 From 818411616baf46ceba0cff6f05af3a9b294734f7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:43 -0700 Subject: fs, proc: introduce /proc//task//children entry When we do checkpoint of a task we need to know the list of children the task, has but there is no easy and fast way to generate reverse parent->children chain from arbitrary (while a parent pid is provided in "PPid" field of /proc//status). So instead of walking over all pids in the system (creating one big process tree in memory, just to figure out which children a task has) -- we add explicit /proc//task//children entry, because the kernel already has this kind of information but it is not yet exported. This is a first level children, not the whole process tree. Signed-off-by: Cyrill Gorcunov Reviewed-by: Oleg Nesterov Reviewed-by: Kees Cook Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/proc/base.c | 3 ++ fs/proc/internal.h | 1 + 3 files changed, 127 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 3a26d23420c..62887e39a2d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -565,3 +565,126 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, return 0; } + +#ifdef CONFIG_CHECKPOINT_RESTORE +static struct pid * +get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) +{ + struct task_struct *start, *task; + struct pid *pid = NULL; + + read_lock(&tasklist_lock); + + start = pid_task(proc_pid(inode), PIDTYPE_PID); + if (!start) + goto out; + + /* + * Lets try to continue searching first, this gives + * us significant speedup on children-rich processes. + */ + if (pid_prev) { + task = pid_task(pid_prev, PIDTYPE_PID); + if (task && task->real_parent == start && + !(list_empty(&task->sibling))) { + if (list_is_last(&task->sibling, &start->children)) + goto out; + task = list_first_entry(&task->sibling, + struct task_struct, sibling); + pid = get_pid(task_pid(task)); + goto out; + } + } + + /* + * Slow search case. + * + * We might miss some children here if children + * are exited while we were not holding the lock, + * but it was never promised to be accurate that + * much. + * + * "Just suppose that the parent sleeps, but N children + * exit after we printed their tids. Now the slow paths + * skips N extra children, we miss N tasks." (c) + * + * So one need to stop or freeze the leader and all + * its children to get a precise result. + */ + list_for_each_entry(task, &start->children, sibling) { + if (pos-- == 0) { + pid = get_pid(task_pid(task)); + break; + } + } + +out: + read_unlock(&tasklist_lock); + return pid; +} + +static int children_seq_show(struct seq_file *seq, void *v) +{ + struct inode *inode = seq->private; + pid_t pid; + + pid = pid_nr_ns(v, inode->i_sb->s_fs_info); + return seq_printf(seq, "%d ", pid); +} + +static void *children_seq_start(struct seq_file *seq, loff_t *pos) +{ + return get_children_pid(seq->private, NULL, *pos); +} + +static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct pid *pid; + + pid = get_children_pid(seq->private, v, *pos + 1); + put_pid(v); + + ++*pos; + return pid; +} + +static void children_seq_stop(struct seq_file *seq, void *v) +{ + put_pid(v); +} + +static const struct seq_operations children_seq_ops = { + .start = children_seq_start, + .next = children_seq_next, + .stop = children_seq_stop, + .show = children_seq_show, +}; + +static int children_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &children_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = inode; + + return ret; +} + +int children_seq_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + return 0; +} + +const struct file_operations proc_tid_children_operations = { + .open = children_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = children_seq_release, +}; +#endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/fs/proc/base.c b/fs/proc/base.c index bd8b4ca6a61..616f41a7cde 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3400,6 +3400,9 @@ static const struct pid_entry tid_base_stuff[] = { ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_tid_maps_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + REG("children", S_IRUGO, proc_tid_children_operations), +#endif #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), #endif diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a30643784db..eca4aca5b6e 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -54,6 +54,7 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); +extern const struct file_operations proc_tid_children_operations; extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_tid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; -- cgit v1.2.3 From 5b172087f99189416d5f47fd7ab5e6fb762a9ba3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:44 -0700 Subject: c/r: procfs: add arg_start/end, env_start/end and exit_code members to /proc/$pid/stat We would like to have an ability to restore command line arguments and program environment pointers but first we need to obtain them somehow. Thus we put these values into /proc/$pid/stat. The exit_code is needed to restore zombie tasks. Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Cc: Alexey Dobriyan Cc: Tejun Heo Cc: Andrew Vagin Cc: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 62887e39a2d..c1c207c36ca 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -517,9 +517,23 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); - seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_data : 0); - seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->end_data : 0); - seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_brk : 0); + + if (mm && permitted) { + seq_put_decimal_ull(m, ' ', mm->start_data); + seq_put_decimal_ull(m, ' ', mm->end_data); + seq_put_decimal_ull(m, ' ', mm->start_brk); + seq_put_decimal_ull(m, ' ', mm->arg_start); + seq_put_decimal_ull(m, ' ', mm->arg_end); + seq_put_decimal_ull(m, ' ', mm->env_start); + seq_put_decimal_ull(m, ' ', mm->env_end); + } else + seq_printf(m, " 0 0 0 0 0 0 0"); + + if (permitted) + seq_put_decimal_ll(m, ' ', task->exit_code); + else + seq_put_decimal_ll(m, ' ', 0); + seq_putc(m, '\n'); if (mm) mmput(mm); -- cgit v1.2.3 From 0640113be25d283e0ff77a9f041e1242182387f0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 4 Jun 2012 11:00:45 -0700 Subject: vfs: Fix /proc//fdinfo/ file handling Cyrill Gorcunov reports that I broke the fdinfo files with commit 30a08bf2d31d ("proc: move fd symlink i_mode calculations into tid_fd_revalidate()"), and he's quite right. The tid_fd_revalidate() function is not just used for the /fd symlinks, it's also used for the /fdinfo/ files, and the permission model for those are different. So do the dynamic symlink permission handling just for symlinks, making the fdinfo files once more appear as the proper regular files they are. Of course, Al Viro argued (probably correctly) that we shouldn't do the symlink permission games at all, and make the symlinks always just be the normal 'lrwxrwxrwx'. That would have avoided this issue too, but since somebody noticed that the permissions had changed (which was the reason for that original commit 30a08bf2d31d in the first place), people do apparently use this feature. [ Basically, you can use the symlink permission data as a cheap "fdinfo" replacement, since you see whether the file is open for reading and/or writing by just looking at st_mode of the symlink. So the feature does make sense, even if the pain it has caused means we probably shouldn't have done it to begin with. ] Reported-and-tested-by: Cyrill Gorcunov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 616f41a7cde..437195f204e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1803,7 +1803,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - unsigned i_mode, f_mode = file->f_mode; + unsigned f_mode = file->f_mode; rcu_read_unlock(); put_files_struct(files); @@ -1819,12 +1819,14 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = GLOBAL_ROOT_GID; } - i_mode = S_IFLNK; - if (f_mode & FMODE_READ) - i_mode |= S_IRUSR | S_IXUSR; - if (f_mode & FMODE_WRITE) - i_mode |= S_IWUSR | S_IXUSR; - inode->i_mode = i_mode; + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } security_task_to_inode(task, inode); put_task_struct(task); @@ -1859,6 +1861,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, ei = PROC_I(inode); ei->fd = fd; + inode->i_mode = S_IFLNK; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; -- cgit v1.2.3