From b2f73922d119686323f14fbbe46587f863852328 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Sep 2015 15:59:17 +0200 Subject: fs/proc, core/debug: Don't expose absolute kernel addresses via wchan So the /proc/PID/stat 'wchan' field (the 35th field, which contains the absolute kernel address of the kernel function a task is blocked in) leaks absolute kernel addresses to unprivileged user-space: seq_put_decimal_ull(m, ' ', wchan); The absolute address might also leak via /proc/PID/wchan, if KALLSYMS is turned off or if the symbol lookup fails for some reason: static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; char symname[KSYM_NAME_LEN]; wchan = get_wchan(task); if (lookup_symbol_name(wchan, symname) < 0) { if (!ptrace_may_access(task, PTRACE_MODE_READ)) return 0; seq_printf(m, "%lu", wchan); } else { seq_printf(m, "%s", symname); } return 0; } This isn't ideal, because for example it trivially leaks the KASLR offset to any local attacker: fomalhaut:~> printf "%016lx\n" $(cat /proc/$$/stat | cut -d' ' -f35) ffffffff8123b380 Most real-life uses of wchan are symbolic: ps -eo pid:10,tid:10,wchan:30,comm and procps uses /proc/PID/wchan, not the absolute address in /proc/PID/stat: triton:~/tip> strace -f ps -eo pid:10,tid:10,wchan:30,comm 2>&1 | grep wchan | tail -1 open("/proc/30833/wchan", O_RDONLY) = 6 There's one compatibility quirk here: procps relies on whether the absolute value is non-zero - and we can provide that functionality by outputting "0" or "1" depending on whether the task is blocked (whether there's a wchan address). These days there appears to be very little legitimate reason user-space would be interested in the absolute address. The absolute address is mostly historic: from the days when we didn't have kallsyms and user-space procps had to do the decoding itself via the System.map. 
So this patch sets all numeric output to "0" or "1" and keeps only symbolic output, in /proc/PID/wchan. ( The absolute sleep address can generally still be profiled via perf, by tasks with sufficient privileges. ) Reviewed-by: Thomas Gleixner Acked-by: Kees Cook Acked-by: Linus Torvalds Cc: Cc: Al Viro Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: Kostya Serebryany Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Sasha Levin Cc: kasan-dev Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150930135917.GA3285@gmail.com Signed-off-by: Ingo Molnar --- fs/proc/base.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index b25eee4cead5..29595af32866 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -430,13 +430,10 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (lookup_symbol_name(wchan, symname) < 0) { - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - return 0; - seq_printf(m, "%lu", wchan); - } else { + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); - } + else + seq_putc(m, '0'); return 0; } -- cgit v1.2.3 From b72bdfa73603f2f81fbac651b9ae36807e877752 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Nov 2015 18:50:32 -0800 Subject: mm, oom: add comment for why oom_adj exists /proc/pid/oom_adj exists solely to avoid breaking existing userspace binaries that write to the tunable. Add a comment in the only possible location within the kernel tree to describe the situation and motivation for keeping it around. 
Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 29595af32866..bd3e9e68125b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1032,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, return simple_read_from_buffer(buf, count, ppos, buffer, len); } +/* + * /proc/pid/oom_adj exists solely for backwards compatibility with previous + * kernels. The effective policy is defined by oom_score_adj, which has a + * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. + * Values written to oom_adj are simply mapped linearly to oom_score_adj. + * Processes that become oom disabled via oom_adj will still be oom disabled + * with this implementation. + * + * oom_adj cannot be removed since existing userspace binaries use it. + */ static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { -- cgit v1.2.3