diff options
author | Josh Poimboeuf <jpoimboe@kernel.org> | 2025-07-29 14:23:08 -0400 |
---|---|---|
committer | Steven Rostedt (Google) <rostedt@goodmis.org> | 2025-07-31 10:20:10 -0400 |
commit | 2dffa355f6c279e7d2e574abf9446c41a631c9e5 (patch) | |
tree | 671cedcf45eeff3e1fee729ab799df5d014ae85c /kernel/unwind/deferred.c | |
parent | b9c73524106e1c0c857006fb9ff2e5a510dc4021 (diff) |
unwind_user/deferred: Add deferred unwinding interface
Add an interface for scheduling task work to unwind the user space stack
before returning to user space. This solves several problems for its
callers:
- Ensure the unwind happens in task context even if the caller may be
running in interrupt context.
- Avoid duplicate unwinds, whether called multiple times by the same
caller or by different callers.
- Create a "context cookie" which allows trace post-processing to
correlate kernel unwinds/traces with the user unwind.
A concept of a "cookie" is created to detect when the stacktrace is the
same. A cookie is generated the first time a user space stacktrace is
requested after the task enters the kernel. As the stacktrace is saved on
the task_struct while the task is in the kernel, if another request comes
in, if the cookie is still the same, it will use the saved stacktrace,
and not have to regenerate one.
The cookie is passed to the caller on request, and when the stacktrace is
generated upon returning to user space, it calls the requester's callback
with the cookie as well as the stacktrace. The cookie is cleared
when it goes back to user space. Note, this currently adds another
conditional to the unwind_reset_info() path that is always called
returning to user space, but future changes will put this back to a single
conditional.
A global list is created and protected by a global mutex that holds
tracers that register with the unwind infrastructure. The number of
registered tracers will be limited in future changes. Each perf program or
ftrace instance will register its own descriptor to use for deferred
unwind stack traces.
Note, in the function unwind_deferred_task_work() that gets called when
returning to user space, it uses a global mutex for synchronization which
will cause a big bottleneck. This will be replaced by SRCU, but that
change adds some complex synchronization that deservers its own commit.
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Indu Bhagat <indu.bhagat@oracle.com>
Cc: "Jose E. Marchesi" <jemarch@gnu.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Jens Remus <jremus@linux.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/20250729182405.488066537@kernel.org
Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Diffstat (limited to 'kernel/unwind/deferred.c')
-rw-r--r-- | kernel/unwind/deferred.c | 156 |
1 files changed, 155 insertions, 1 deletions
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index 96368a5aa522..2cbae2ada309 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -2,16 +2,63 @@ /* * Deferred user space unwinding */ +#include <linux/sched/task_stack.h> +#include <linux/unwind_deferred.h> +#include <linux/sched/clock.h> +#include <linux/task_work.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sizes.h> #include <linux/slab.h> -#include <linux/unwind_deferred.h> +#include <linux/mm.h> /* Make the cache fit in a 4K page */ #define UNWIND_MAX_ENTRIES \ ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long)) +/* Guards adding to and reading the list of callbacks */ +static DEFINE_MUTEX(callback_mutex); +static LIST_HEAD(callbacks); + +/* + * This is a unique percpu identifier for a given task entry context. + * Conceptually, it's incremented every time the CPU enters the kernel from + * user space, so that each "entry context" on the CPU gets a unique ID. In + * reality, as an optimization, it's only incremented on demand for the first + * deferred unwind request after a given entry-from-user. + * + * It's combined with the CPU id to make a systemwide-unique "context cookie". + */ +static DEFINE_PER_CPU(u32, unwind_ctx_ctr); + +/* + * The context cookie is a unique identifier that is assigned to a user + * space stacktrace. As the user space stacktrace remains the same while + * the task is in the kernel, the cookie is an identifier for the stacktrace. + * Although it is possible for the stacktrace to get another cookie if another + * request is made after the cookie was cleared and before reentering user + * space. + */ +static u64 get_cookie(struct unwind_task_info *info) +{ + u32 cnt = 1; + u32 old = 0; + + if (info->id.cpu) + return info->id.id; + + /* LSB is always set to ensure 0 is an invalid value */ + cnt |= __this_cpu_read(unwind_ctx_ctr) + 2; + if (try_cmpxchg(&info->id.cnt, &old, cnt)) { + /* Update the per cpu counter */ + __this_cpu_write(unwind_ctx_ctr, cnt); + } + /* Interrupts are disabled, the CPU will always be same */ + info->id.cpu = smp_processor_id() + 1; /* Must be non zero */ + + return info->id.id; +} + /** * unwind_user_faultable - Produce a user stacktrace in faultable context * @trace: The descriptor that will store the user stacktrace @@ -62,11 +109,117 @@ int unwind_user_faultable(struct unwind_stacktrace *trace) return 0; } +static void unwind_deferred_task_work(struct callback_head *head) +{ + struct unwind_task_info *info = container_of(head, struct unwind_task_info, work); + struct unwind_stacktrace trace; + struct unwind_work *work; + u64 cookie; + + if (WARN_ON_ONCE(!info->pending)) + return; + + /* Allow work to come in again */ + WRITE_ONCE(info->pending, 0); + + /* + * From here on out, the callback must always be called, even if it's + * just an empty trace. + */ + trace.nr = 0; + trace.entries = NULL; + + unwind_user_faultable(&trace); + + cookie = info->id.id; + + guard(mutex)(&callback_mutex); + list_for_each_entry(work, &callbacks, list) { + work->func(work, &trace, cookie); + } +} + +/** + * unwind_deferred_request - Request a user stacktrace on task kernel exit + * @work: Unwind descriptor requesting the trace + * @cookie: The cookie of the first request made for this task + * + * Schedule a user space unwind to be done in task work before exiting the + * kernel. + * + * The returned @cookie output is the generated cookie of the very first + * request for a user space stacktrace for this task since it entered the + * kernel. It can be from a request by any caller of this infrastructure. + * Its value will also be passed to the callback function. It can be + * used to stitch kernel and user stack traces together in post-processing. + * + * It's valid to call this function multiple times for the same @work within + * the same task entry context. Each call will return the same cookie + * while the task hasn't left the kernel. If the callback is not pending + * because it has already been previously called for the same entry context, + * it will be called again with the same stack trace and cookie. + * + * Return: 1 if the the callback was already queued. + * 0 if the callback successfully was queued. + * Negative if there's an error. + * @cookie holds the cookie of the first request by any user + */ +int unwind_deferred_request(struct unwind_work *work, u64 *cookie) +{ + struct unwind_task_info *info = ¤t->unwind_info; + int ret; + + *cookie = 0; + + if (WARN_ON_ONCE(in_nmi())) + return -EINVAL; + + if ((current->flags & (PF_KTHREAD | PF_EXITING)) || + !user_mode(task_pt_regs(current))) + return -EINVAL; + + guard(irqsave)(); + + *cookie = get_cookie(info); + + /* callback already pending? */ + if (info->pending) + return 1; + + /* The work has been claimed, now schedule it. */ + ret = task_work_add(current, &info->work, TWA_RESUME); + if (WARN_ON_ONCE(ret)) + return ret; + + info->pending = 1; + return 0; +} + +void unwind_deferred_cancel(struct unwind_work *work) +{ + if (!work) + return; + + guard(mutex)(&callback_mutex); + list_del(&work->list); +} + +int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) +{ + memset(work, 0, sizeof(*work)); + + guard(mutex)(&callback_mutex); + list_add(&work->list, &callbacks); + work->func = func; + return 0; +} + void unwind_task_init(struct task_struct *task) { struct unwind_task_info *info = &task->unwind_info; memset(info, 0, sizeof(*info)); + init_task_work(&info->work, unwind_deferred_task_work); } void unwind_task_free(struct task_struct *task) @@ -74,4 +227,5 @@ void unwind_task_free(struct task_struct *task) struct unwind_task_info *info = &task->unwind_info; kfree(info->cache); + task_work_cancel(task, &info->work); } |