summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2025-04-14 16:41:42 +0200
committerChristian Brauner <brauner@kernel.org>2025-05-02 14:28:47 +0200
commit4dd6566b5a8ca1e8c9ff2652c2249715d6c64217 (patch)
tree19d4689861b1b941221e84cfea2a8ff23603a151
parente1b477c21300a928a7765a03cb96ca36c4dcf5ed (diff)
parentb5325b2a270fcaf7b2a9a0f23d422ca8a5a8bdea (diff)
Merge patch series "coredump: hand a pidfd to the usermode coredump helper"
Christian Brauner <brauner@kernel.org> says: Give userspace a way to instruct the kernel to install a pidfd for the crashing process into the process started as a usermode helper. There's still tricky race-windows that cannot be easily or sometimes not closed at all by userspace. There's various ways like looking at the start time of a process to make sure that the usermode helper process is started after the crashing process but it's all very very brittle and fraught with peril. The crashed-but-not-reaped process can be killed by userspace before coredump processing programs like systemd-coredump have had time to manually open a PIDFD from the PID the kernel provides them, which means they can be tricked into reading from an arbitrary process, and they run with full privileges as they are usermode helper processes. Even if that specific race-window wouldn't exist it's still the safest and cleanest way to let the kernel provide the pidfd directly instead of requiring userspace to do it manually. In parallel with this commit we already have systemd adding support for this in [1]. When the usermode helper process is forked we install a pidfd file descriptor three into the usermode helper's file descriptor table so it's available to the exec'd program. Since usermode helpers are either children of the system_unbound_wq workqueue or kthreadd we know that the file descriptor table is empty and can thus always use three as the file descriptor number. Note, that we'll install a pidfd for the thread-group leader even if a subthread is calling do_coredump(). We know that task linkage hasn't been removed yet and even if this @current isn't the actual thread-group leader we know that the thread-group leader cannot be reaped until @current has exited. [1]: https://github.com/systemd/systemd/pull/37125 * patches from https://lore.kernel.org/20250414-work-coredump-v2-0-685bf231f828@kernel.org: coredump: hand a pidfd to the usermode coredump helper coredump: fix error handling for replace_fd() pidfs: move O_RDWR into pidfs_alloc_file() Link: https://lore.kernel.org/20250414-work-coredump-v2-0-685bf231f828@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
-rw-r--r--fs/coredump.c65
-rw-r--r--fs/pidfs.c1
-rw-r--r--include/linux/coredump.h1
3 files changed, 61 insertions, 6 deletions
diff --git a/fs/coredump.c b/fs/coredump.c
index c33c177a701b..d740a0411266 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -43,6 +43,8 @@
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
#include <linux/elf.h>
+#include <linux/pidfs.h>
+#include <uapi/linux/pidfd.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -60,6 +62,12 @@ static void free_vma_snapshot(struct coredump_params *cprm);
#define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024)
/* Define a reasonable max cap */
#define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024)
+/*
+ * File descriptor number for the pidfd for the thread-group leader of
+ * the coredumping task installed into the usermode helper's file
+ * descriptor table.
+ */
+#define COREDUMP_PIDFD_NUMBER 3
static int core_uses_pid;
static unsigned int core_pipe_limit;
@@ -339,6 +347,27 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
case 'C':
err = cn_printf(cn, "%d", cprm->cpu);
break;
+ /* pidfd number */
+ case 'F': {
+ /*
+ * Installing a pidfd only makes sense if
+ * we actually spawn a usermode helper.
+ */
+ if (!ispipe)
+ break;
+
+ /*
+ * Note that we'll install a pidfd for the
+ * thread-group leader. We know that task
+ * linkage hasn't been removed yet and even if
+ * this @current isn't the actual thread-group
+ * leader we know that the thread-group leader
+ * cannot be reaped until @current has exited.
+ */
+ cprm->pid = task_tgid(current);
+ err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER);
+ break;
+ }
default:
break;
}
@@ -493,7 +522,7 @@ static void wait_for_dump_helpers(struct file *file)
}
/*
- * umh_pipe_setup
+ * umh_coredump_setup
* helper function to customize the process used
* to collect the core in userspace. Specifically
* it sets up a pipe and installs it as fd 0 (stdin)
@@ -503,11 +532,32 @@ static void wait_for_dump_helpers(struct file *file)
* is a special value that we use to trap recursive
* core dumps
*/
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
{
struct file *files[2];
struct coredump_params *cp = (struct coredump_params *)info->data;
- int err = create_pipe_files(files, 0);
+ int err;
+
+ if (cp->pid) {
+ struct file *pidfs_file __free(fput) = NULL;
+
+ pidfs_file = pidfs_alloc_file(cp->pid, 0);
+ if (IS_ERR(pidfs_file))
+ return PTR_ERR(pidfs_file);
+
+ /*
+ * Usermode helpers are childen of either
+ * system_unbound_wq or of kthreadd. So we know that
+ * we're starting off with a clean file descriptor
+ * table. So we should always be able to use
+ * COREDUMP_PIDFD_NUMBER as our file descriptor value.
+ */
+ err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0);
+ if (err < 0)
+ return err;
+ }
+
+ err = create_pipe_files(files, 0);
if (err)
return err;
@@ -515,10 +565,13 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
err = replace_fd(0, files[0], 0);
fput(files[0]);
+ if (err < 0)
+ return err;
+
/* and disallow core files too */
current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
- return err;
+ return 0;
}
void do_coredump(const kernel_siginfo_t *siginfo)
@@ -593,7 +646,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
}
if (cprm.limit == 1) {
- /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
+ /* See umh_coredump_setup() which sets RLIMIT_CORE = 1.
*
* Normally core limits are irrelevant to pipes, since
* we're not writing to the file system, but we use
@@ -632,7 +685,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
retval = -ENOMEM;
sub_info = call_usermodehelper_setup(helper_argv[0],
helper_argv, NULL, GFP_KERNEL,
- umh_pipe_setup, NULL, &cprm);
+ umh_coredump_setup, NULL, &cprm);
if (sub_info)
retval = call_usermodehelper_exec(sub_info,
UMH_WAIT_EXEC);
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 9d993ecadad7..3b39e471840b 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -892,6 +892,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
return ERR_PTR(-ESRCH);
flags &= ~PIDFD_STALE;
+ flags |= O_RDWR;
pidfd_file = dentry_open(&path, flags, current_cred());
/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
if (!IS_ERR(pidfd_file))
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 77e6e195d1d6..76e41805b92d 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -28,6 +28,7 @@ struct coredump_params {
int vma_count;
size_t vma_data_size;
struct core_vma_metadata *vma_meta;
+ struct pid *pid;
};
extern unsigned int core_file_note_size_limit;