From c82389947d90c8b0dd206d9ef082191eb58df8fc Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 11 Apr 2024 12:20:57 +0200 Subject: tracing: Add sched_prepare_exec tracepoint Add "sched_prepare_exec" tracepoint, which is run right after the point of no return but before the current task assumes its new exec identity. Unlike the tracepoint "sched_process_exec", the "sched_prepare_exec" tracepoint runs before flushing the old exec, i.e. while the task still has the original state (such as original MM), but when the new exec either succeeds or crashes (but never returns to the original exec). Being able to trace this event can be helpful in a number of use cases: * allowing tracing eBPF programs access to the original MM on exec, before current->mm is replaced; * counting exec in the original task (via perf event); * profiling flush time ("sched_prepare_exec" to "sched_process_exec"). Example of tracing output: $ cat /sys/kernel/debug/tracing/trace_pipe <...>-379 [003] ..... 179.626921: sched_prepare_exec: interp=/usr/bin/sshd filename=/usr/bin/sshd pid=379 comm=sshd <...>-381 [002] ..... 180.048580: sched_prepare_exec: interp=/bin/bash filename=/bin/bash pid=381 comm=sshd <...>-385 [001] ..... 180.068277: sched_prepare_exec: interp=/usr/bin/tty filename=/usr/bin/tty pid=385 comm=bash <...>-389 [006] ..... 192.020147: sched_prepare_exec: interp=/usr/bin/dmesg filename=/usr/bin/dmesg pid=389 comm=bash Signed-off-by: Marco Elver Acked-by: Steven Rostedt (Google) Reviewed-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20240411102158.1272267-1-elver@google.com Signed-off-by: Kees Cook --- fs/exec.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index cf1df7f16e55..b3c40fbb325f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1267,6 +1267,14 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) return retval; + /* + * This tracepoint marks the point before flushing the old exec where + * the current task is still unchanged, but errors are fatal (point of + * no return). The later "sched_process_exec" tracepoint is called after + * the current task has successfully switched to the new exec. + */ + trace_sched_prepare_exec(current, bprm); + /* * Ensure all future errors are fatal. */ -- cgit v1.2.3 From 2a5eb9995528441447d33838727f6ec1caf08139 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 16 Feb 2024 22:25:44 -0800 Subject: binfmt_elf: Leave a gap between .bss and brk Currently the brk starts its randomization immediately after .bss, which means there is a chance that when the random offset is 0, linear overflows from .bss can reach into the brk area. Leave at least a single page gap between .bss and brk (when it has not already been explicitly relocated into the mmap range). Reported-by: Closes: https://lore.kernel.org/linux-hardening/CA+2EKTVLvc8hDZc+2Yhwmus=dzOUG5E4gV7ayCbu0MPJTZzWkw@mail.gmail.com/ Link: https://lore.kernel.org/r/20240217062545.1631668-2-keescook@chromium.org Signed-off-by: Kees Cook --- fs/binfmt_elf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5397b552fbeb..7862962f7a85 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1262,6 +1262,9 @@ out_free_interp: if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && elf_ex->e_type == ET_DYN && !interpreter) { mm->brk = mm->start_brk = ELF_ET_DYN_BASE; + } else { + /* Otherwise leave a gap between .bss and brk. */ + mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; } mm->brk = mm->start_brk = arch_randomize_brk(mm); -- cgit v1.2.3 From 10e29251be0e9f774910c1baaa89355859491769 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 22 Mar 2024 12:54:18 -0700 Subject: binfmt_elf_fdpic: fix /proc//auxv Althought FDPIC linux kernel provides /proc//auxv files they are empty because there's no code that initializes mm->saved_auxv in the FDPIC ELF loader. Synchronize FDPIC ELF aux vector setup with ELF. Replace entry-by-entry aux vector copying to userspace with initialization of mm->saved_auxv first and then copying it to userspace as a whole. Signed-off-by: Max Filippov Link: https://lore.kernel.org/r/20240322195418.2160164-1-jcmvbkbc@gmail.com Signed-off-by: Kees Cook --- fs/binfmt_elf_fdpic.c | 85 +++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 3314249e8674..b799701454a9 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -505,8 +505,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, char *k_platform, *k_base_platform; char __user *u_platform, *u_base_platform, *p; int loop; - int nr; /* reset for each csp adjustment */ unsigned long flags = 0; + int ei_index; + elf_addr_t *elf_info; #ifdef CONFIG_MMU /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions @@ -601,44 +602,24 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, csp -= sp & 15UL; sp -= sp & 15UL; - /* put the ELF interpreter info on the stack */ -#define NEW_AUX_ENT(id, val) \ - do { \ - struct { unsigned long _id, _val; } __user *ent, v; \ - \ - ent = (void __user *) csp; \ - v._id = (id); \ - v._val = (val); \ - if (copy_to_user(ent + nr, &v, sizeof(v))) \ - return -EFAULT; \ - nr++; \ + /* Create the ELF interpreter info */ + elf_info = (elf_addr_t *)mm->saved_auxv; + /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ +#define NEW_AUX_ENT(id, val) \ + do { \ + *elf_info++ = id; \ + *elf_info++ = val; \ } while (0) - nr = 0; - csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_NULL, 0); - if (k_platform) { - nr = 0; - csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_PLATFORM, - (elf_addr_t) (unsigned long) u_platform); - } - - if (k_base_platform) { - nr = 0; - csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_BASE_PLATFORM, - (elf_addr_t) (unsigned long) u_base_platform); - } - - if (bprm->have_execfd) { - nr = 0; - csp -= 2 * sizeof(unsigned long); - NEW_AUX_ENT(AT_EXECFD, bprm->execfd); - } - - nr = 0; - csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); +#ifdef ARCH_DLINFO + /* + * ARCH_DLINFO must come first so PPC can do its special alignment of + * AUXV. + * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in + * ARCH_DLINFO changes + */ + ARCH_DLINFO; +#endif NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); @@ -659,17 +640,29 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_EXECFN, bprm->exec); + if (k_platform) + NEW_AUX_ENT(AT_PLATFORM, + (elf_addr_t)(unsigned long)u_platform); + if (k_base_platform) + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t)(unsigned long)u_base_platform); + if (bprm->have_execfd) + NEW_AUX_ENT(AT_EXECFD, bprm->execfd); +#undef NEW_AUX_ENT + /* AT_NULL is zero; clear the rest too */ + memset(elf_info, 0, (char *)mm->saved_auxv + + sizeof(mm->saved_auxv) - (char *)elf_info); -#ifdef ARCH_DLINFO - nr = 0; - csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long); + /* And advance past the AT_NULL entry. */ + elf_info += 2; - /* ARCH_DLINFO must come last so platform specific code can enforce - * special alignment requirements on the AUXV if necessary (eg. PPC). - */ - ARCH_DLINFO; -#endif -#undef NEW_AUX_ENT + ei_index = elf_info - (elf_addr_t *)mm->saved_auxv; + csp -= ei_index * sizeof(elf_addr_t); + + /* Put the elf_info on the stack in the right place. */ + if (copy_to_user((void __user *)csp, mm->saved_auxv, + ei_index * sizeof(elf_addr_t))) + return -EFAULT; /* allocate room for argv[] and envv[] */ csp -= (bprm->envc + 1) * sizeof(elf_caddr_t); -- cgit v1.2.3 From 4bbf9c3b53e637eb3a14ee27b996300ce88e752a Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Mon, 6 May 2024 19:37:00 +0000 Subject: fs/coredump: Enable dynamic configuration of max file note size Introduce the capability to dynamically configure the maximum file note size for ELF core dumps via sysctl. Why is this being done? We have observed that during a crash when there are more than 65k mmaps in memory, the existing fixed limit on the size of the ELF notes section becomes a bottleneck. The notes section quickly reaches its capacity, leading to incomplete memory segment information in the resulting coredump. This truncation compromises the utility of the coredumps, as crucial information about the memory state at the time of the crash might be omitted. This enhancement removes the previous static limit of 4MB, allowing system administrators to adjust the size based on system-specific requirements or constraints. Eg: $ sysctl -a | grep core_file_note_size_limit kernel.core_file_note_size_limit = 4194304 $ sysctl -n kernel.core_file_note_size_limit 4194304 $echo 519304 > /proc/sys/kernel/core_file_note_size_limit $sysctl -n kernel.core_file_note_size_limit 519304 Attempting to write beyond the ceiling value of 16MB $echo 17194304 > /proc/sys/kernel/core_file_note_size_limit bash: echo: write error: Invalid argument Signed-off-by: Vijay Nag Signed-off-by: Allen Pais Link: https://lore.kernel.org/r/20240506193700.7884-1-apais@linux.microsoft.com Signed-off-by: Kees Cook --- fs/binfmt_elf.c | 7 +++++-- fs/coredump.c | 17 +++++++++++++++++ include/linux/coredump.h | 2 ++ 3 files changed, 24 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7862962f7a85..b5a25ee49eea 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1567,7 +1567,6 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); } -#define MAX_FILE_NOTE_SIZE (4*1024*1024) /* * Format of NT_FILE note: * @@ -1595,8 +1594,12 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm names_ofs = (2 + 3 * count) * sizeof(data[0]); alloc: - if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ + /* paranoia check */ + if (size >= core_file_note_size_limit) { + pr_warn_once("coredump Note size too large: %u (does kernel.core_file_note_size_limit sysctl need adjustment?\n", + size); return -EINVAL; + } size = round_up(size, PAGE_SIZE); /* * "size" can be 0 here legitimately. diff --git a/fs/coredump.c b/fs/coredump.c index be6403b4b14b..317065e3eb9b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -56,10 +56,15 @@ static bool dump_vma_snapshot(struct coredump_params *cprm); static void free_vma_snapshot(struct coredump_params *cprm); +#define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024) +/* Define a reasonable max cap */ +#define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024) + static int core_uses_pid; static unsigned int core_pipe_limit; static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; +unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; struct core_name { char *corename; @@ -998,6 +1003,9 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, return error; } +static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT; +static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX; + static struct ctl_table coredump_sysctls[] = { { .procname = "core_uses_pid", @@ -1020,6 +1028,15 @@ static struct ctl_table coredump_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "core_file_note_size_limit", + .data = &core_file_note_size_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (unsigned int *)&core_file_note_size_min, + .extra2 = (unsigned int *)&core_file_note_size_max, + }, }; static int __init init_fs_coredump_sysctls(void) diff --git a/include/linux/coredump.h b/include/linux/coredump.h index d3eba4360150..0904ba010341 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -30,6 +30,8 @@ struct coredump_params { struct core_vma_metadata *vma_meta; }; +extern unsigned int core_file_note_size_limit; + /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. -- cgit v1.2.3