From: Steven Rostedt <rostedt@goodmis.org>
To: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
bpf@vger.kernel.org, x86@kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
Josh Poimboeuf <jpoimboe@kernel.org>,
Peter Zijlstra <peterz@infradead.org>,
Ingo Molnar <mingo@kernel.org>, Jiri Olsa <jolsa@kernel.org>,
Namhyung Kim <namhyung@kernel.org>
Subject: [PATCH v8 09/18] unwind_user/deferred: Add deferred unwinding interface
Date: Fri, 09 May 2025 12:45:33 -0400 [thread overview]
Message-ID: <20250509165155.124809873@goodmis.org> (raw)
In-Reply-To: 20250509164524.448387100@goodmis.org
From: Josh Poimboeuf <jpoimboe@kernel.org>
Add an interface for scheduling task work to unwind the user space stack
before returning to user space. This solves several problems for its
callers:
- Ensure the unwind happens in task context even if the caller may be
running in NMI or interrupt context.
- Avoid duplicate unwinds, whether called multiple times by the same
caller or by different callers.
- Take a timestamp when the first request comes in since the task
entered the kernel. This will be returned to the calling function
along with the stack trace when the task leaves the kernel. This
timestamp can be used to correlate kernel unwinds/traces with the user
unwind.
The timestamp is created to detect when the stacktrace is the same. It is
generated the first time a user space stacktrace is requested after the
task enters the kernel.
The timestamp is passed to the caller on request, and when the stacktrace is
generated upon returning to user space, the requester's callback is called
with the timestamp as well as the stacktrace.
Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v7: https://lore.kernel.org/20250502165008.904786447@goodmis.org
- Use a timestamp instead of a "cookie"
- Updated comments to kerneldoc for unwind_deferred_request()
include/linux/unwind_deferred.h | 18 ++++
include/linux/unwind_deferred_types.h | 3 +
kernel/unwind/deferred.c | 131 +++++++++++++++++++++++++-
3 files changed, 151 insertions(+), 1 deletion(-)
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 7d6cb2ffd084..a384eef719a3 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -2,9 +2,19 @@
#ifndef _LINUX_UNWIND_USER_DEFERRED_H
#define _LINUX_UNWIND_USER_DEFERRED_H
+#include <linux/task_work.h>
#include <linux/unwind_user.h>
#include <linux/unwind_deferred_types.h>
+struct unwind_work;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 timestamp);
+
+struct unwind_work {
+ struct list_head list;
+ unwind_callback_t func;
+};
+
#ifdef CONFIG_UNWIND_USER
void unwind_task_init(struct task_struct *task);
@@ -12,10 +22,15 @@ void unwind_task_free(struct task_struct *task);
int unwind_deferred_trace(struct unwind_stacktrace *trace);
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
+int unwind_deferred_request(struct unwind_work *work, u64 *timestamp);
+void unwind_deferred_cancel(struct unwind_work *work);
+
static __always_inline void unwind_exit_to_user_mode(void)
{
if (unlikely(current->unwind_info.cache))
current->unwind_info.cache->nr_entries = 0;
+ current->unwind_info.timestamp = 0;
}
#else /* !CONFIG_UNWIND_USER */
@@ -24,6 +39,9 @@ static inline void unwind_task_init(struct task_struct *task) {}
static inline void unwind_task_free(struct task_struct *task) {}
static inline int unwind_deferred_trace(struct unwind_stacktrace *trace) { return -ENOSYS; }
+static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
+static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
+static inline void unwind_deferred_cancel(struct unwind_work *work) {}
static inline void unwind_exit_to_user_mode(void) {}
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index db5b54b18828..5df264cf81ad 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -9,6 +9,9 @@ struct unwind_cache {
struct unwind_task_info {
struct unwind_cache *cache;
+ struct callback_head work;
+ u64 timestamp;
+ int pending;
};
#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index e3913781c8c6..b76c704ddc6d 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -2,13 +2,35 @@
/*
* Deferred user space unwinding
*/
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_deferred.h>
+#include <linux/sched/clock.h>
+#include <linux/task_work.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/unwind_deferred.h>
+#include <linux/mm.h>
#define UNWIND_MAX_ENTRIES 512
+/* Guards adding to and reading the list of callbacks */
+static DEFINE_MUTEX(callback_mutex);
+static LIST_HEAD(callbacks);
+
+/*
+ * Read the task context timestamp. If this is the first caller, then
+ * it will set the timestamp.
+ */
+static u64 get_timestamp(struct unwind_task_info *info)
+{
+ lockdep_assert_irqs_disabled();
+
+ if (!info->timestamp)
+ info->timestamp = local_clock();
+
+ return info->timestamp;
+}
+
/**
* unwind_deferred_trace - Produce a user stacktrace in faultable context
* @trace: The descriptor that will store the user stacktrace
@@ -59,11 +81,117 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
return 0;
}
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+ struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
+ struct unwind_stacktrace trace;
+ struct unwind_work *work;
+ u64 timestamp;
+
+ if (WARN_ON_ONCE(!info->pending))
+ return;
+
+ /* Allow work to come in again */
+ WRITE_ONCE(info->pending, 0);
+
+ /*
+ * From here on out, the callback must always be called, even if it's
+ * just an empty trace.
+ */
+ trace.nr = 0;
+ trace.entries = NULL;
+
+ unwind_deferred_trace(&trace);
+
+ timestamp = info->timestamp;
+
+ guard(mutex)(&callback_mutex);
+ list_for_each_entry(work, &callbacks, list) {
+ work->func(work, &trace, timestamp);
+ }
+}
+
+/**
+ * unwind_deferred_request - Request a user stacktrace on task kernel exit
+ * @work: Unwind descriptor requesting the trace
+ * @timestamp: The time stamp of the first request made for this task
+ *
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The returned @timestamp output is the timestamp of the very first request
+ * for a user space stacktrace for this task since it entered the kernel.
+ * It can be from a request by any caller of this infrastructure.
+ * Its value will also be passed to the callback function. It can be
+ * used to stitch kernel and user stack traces together in post-processing.
+ *
+ * It's valid to call this function multiple times for the same @work within
+ * the same task entry context. Each call will return the same timestamp
+ * while the task hasn't left the kernel. If the callback is not pending because
+ * it has already been previously called for the same entry context, it will be
+ * called again with the same stack trace and timestamp.
+ *
+ * Return: 1 if the callback was already queued.
+ * 0 if the callback was successfully queued.
+ * Negative if there's an error.
+ * @timestamp holds the timestamp of the first request by any user
+ */
+int unwind_deferred_request(struct unwind_work *work, u64 *timestamp)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+ int ret;
+
+ *timestamp = 0;
+
+ if (WARN_ON_ONCE(in_nmi()))
+ return -EINVAL;
+
+ if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
+ !user_mode(task_pt_regs(current)))
+ return -EINVAL;
+
+ guard(irqsave)();
+
+ *timestamp = get_timestamp(info);
+
+ /* callback already pending? */
+ if (info->pending)
+ return 1;
+
+ /* The work has been claimed, now schedule it. */
+ ret = task_work_add(current, &info->work, TWA_RESUME);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
+ info->pending = 1;
+ return 0;
+}
+
+void unwind_deferred_cancel(struct unwind_work *work)
+{
+ if (!work)
+ return;
+
+ guard(mutex)(&callback_mutex);
+ list_del(&work->list);
+}
+
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{
+ memset(work, 0, sizeof(*work));
+
+ guard(mutex)(&callback_mutex);
+ list_add(&work->list, &callbacks);
+ work->func = func;
+ return 0;
+}
+
void unwind_task_init(struct task_struct *task)
{
struct unwind_task_info *info = &task->unwind_info;
memset(info, 0, sizeof(*info));
+ init_task_work(&info->work, unwind_deferred_task_work);
}
void unwind_task_free(struct task_struct *task)
@@ -71,4 +199,5 @@ void unwind_task_free(struct task_struct *task)
struct unwind_task_info *info = &task->unwind_info;
kfree(info->cache);
+ task_work_cancel(task, &info->work);
}
--
2.47.2
next prev parent reply other threads:[~2025-05-09 16:51 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-05-09 16:45 [PATCH v8 00/18] unwind_user: perf: x86: Deferred unwinding infrastructure Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 01/18] unwind_user: Add user space unwinding API Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 02/18] unwind_user: Add frame pointer support Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 03/18] unwind_user/x86: Enable frame pointer unwinding on x86 Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 04/18] perf/x86: Rename and move get_segment_base() and make it global Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 05/18] unwind_user: Add compat mode frame pointer support Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 06/18] unwind_user/x86: Enable compat mode frame pointer unwinding on x86 Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 07/18] unwind_user/deferred: Add unwind_deferred_trace() Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 08/18] unwind_user/deferred: Add unwind cache Steven Rostedt
2025-05-09 16:45 ` Steven Rostedt [this message]
2025-05-09 16:45 ` [PATCH v8 10/18] unwind_user/deferred: Make unwind deferral requests NMI-safe Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 11/18] unwind deferred: Use bitmask to determine which callbacks to call Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 12/18] unwind deferred: Use SRCU unwind_deferred_task_work() Steven Rostedt
2025-05-09 21:49 ` Andrii Nakryiko
2025-05-10 13:41 ` Steven Rostedt
2025-05-12 16:17 ` Andrii Nakryiko
2025-05-09 16:45 ` [PATCH v8 13/18] unwind: Clear unwind_mask on exit back to user space Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 14/18] perf: Remove get_perf_callchain() init_nr argument Steven Rostedt
2025-05-09 17:11 ` Alexei Starovoitov
2025-05-09 16:45 ` [PATCH v8 15/18] perf: Have get_perf_callchain() return NULL if crosstask and user are set Steven Rostedt
2025-05-09 21:53 ` Andrii Nakryiko
2025-05-10 13:46 ` Steven Rostedt
2025-05-10 17:59 ` Josh Poimboeuf
2025-05-12 22:27 ` Andrii Nakryiko
2025-05-09 16:45 ` [PATCH v8 16/18] perf: Use current->flags & PF_KTHREAD instead of current->mm == NULL Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 17/18] perf: Simplify get_perf_callchain() user logic Steven Rostedt
2025-05-09 16:45 ` [PATCH v8 18/18] perf: Skip user unwind if the task is a kernel thread Steven Rostedt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250509165155.124809873@goodmis.org \
--to=rostedt@goodmis.org \
--cc=bpf@vger.kernel.org \
--cc=jolsa@kernel.org \
--cc=jpoimboe@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=mhiramat@kernel.org \
--cc=mingo@kernel.org \
--cc=namhyung@kernel.org \
--cc=peterz@infradead.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).