From: Josh Poimboeuf <jpoimboe@kernel.org>
To: x86@kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>,
Steven Rostedt <rostedt@goodmis.org>,
Ingo Molnar <mingo@kernel.org>,
Arnaldo Carvalho de Melo <acme@kernel.org>,
linux-kernel@vger.kernel.org,
Indu Bhagat <indu.bhagat@oracle.com>,
Mark Rutland <mark.rutland@arm.com>,
Alexander Shishkin <alexander.shishkin@linux.intel.com>,
Jiri Olsa <jolsa@kernel.org>, Namhyung Kim <namhyung@kernel.org>,
Ian Rogers <irogers@google.com>,
Adrian Hunter <adrian.hunter@intel.com>,
linux-perf-users@vger.kernel.org, Mark Brown <broonie@kernel.org>,
linux-toolchains@vger.kernel.org, Jordan Rome <jordalgo@meta.com>,
Sam James <sam@gentoo.org>,
linux-trace-kernel@vger.kernel.org,
Andrii Nakryiko <andrii.nakryiko@gmail.com>,
Jens Remus <jremus@linux.ibm.com>,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
Florian Weimer <fweimer@redhat.com>,
Andy Lutomirski <luto@kernel.org>
Subject: [PATCH v3 15/19] perf: Add deferred user callchains
Date: Mon, 28 Oct 2024 14:47:42 -0700 [thread overview]
Message-ID: <1ce857387c781afa66efaa61eb88ff596b352500.1730150953.git.jpoimboe@kernel.org> (raw)
In-Reply-To: <cover.1730150953.git.jpoimboe@kernel.org>

Instead of attempting to unwind user space from the NMI handler, defer
the unwind to task context: send a self-IPI, then schedule the unwind in
the IRQ exit's task work so it runs just before returning to user space.
This allows the user stack page to be paged in if needed, avoids
duplicate unwinds for kernel-bound workloads, and prepares for SFrame
unwinding (so .sframe sections can be paged in on demand).
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
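
For illustration, a minimal consumer-side sketch of opting in to the new
ABI. It assumes only what this patch adds (the defer_callchain attribute
bit and the PERF_RECORD_CALLCHAIN_DEFERRED record); the surrounding setup
is ordinary perf_event_open() usage and the helper name is hypothetical:

	/*
	 * Sketch: opt in to deferred user callchains. Builds against the
	 * patched uapi header; attr.defer_callchain does not exist before
	 * this series.
	 */
	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <unistd.h>

	static int open_deferred_cycles_event(pid_t pid)
	{
		struct perf_event_attr attr = {
			.type		= PERF_TYPE_HARDWARE,
			.size		= sizeof(attr),
			.config		= PERF_COUNT_HW_CPU_CYCLES,
			.sample_period	= 100000,
			.sample_type	= PERF_SAMPLE_CALLCHAIN,
			.defer_callchain = 1,	/* new in this patch */
		};

		/* cpu = -1, group_fd = -1, flags = 0: plain per-task event */
		return syscall(SYS_perf_event_open, &attr, pid, -1, -1, 0);
	}

With this bit set, each sample carries the kernel part of the callchain
immediately (terminated by PERF_CONTEXT_USER_DEFERRED) and the user part
arrives later in a PERF_RECORD_CALLCHAIN_DEFERRED record.
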
arch/Kconfig | 3 ++
include/linux/perf_event.h | 10 ++++-
include/uapi/linux/perf_event.h | 22 +++++++++-
kernel/bpf/stackmap.c | 6 +--
kernel/events/callchain.c | 11 ++++-
kernel/events/core.c | 63 ++++++++++++++++++++++++++-
tools/include/uapi/linux/perf_event.h | 22 +++++++++-
7 files changed, 129 insertions(+), 8 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index e769c39dd221..33449485eafd 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -446,6 +446,9 @@ config HAVE_UNWIND_USER_SFRAME
bool
select UNWIND_USER
+config HAVE_PERF_CALLCHAIN_DEFERRED
+ bool
+
config HAVE_PERF_REGS
bool
help
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 788f6971d32d..2193b3d16820 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -808,9 +808,11 @@ struct perf_event {
unsigned long pending_addr; /* SIGTRAP */
struct irq_work pending_irq;
struct irq_work pending_disable_irq;
+ struct irq_work pending_unwind_irq;
struct callback_head pending_task;
unsigned int pending_work;
struct rcuwait pending_work_wait;
+ unsigned int pending_unwind;
atomic_t event_limit;
@@ -1569,12 +1571,18 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
- u32 max_stack, bool add_mark);
+ u32 max_stack, bool add_mark, bool defer_user);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
extern void put_callchain_entry(int rctx);
+#ifdef CONFIG_HAVE_PERF_CALLCHAIN_DEFERRED
+extern void perf_callchain_user_deferred(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
+#else
+static inline void perf_callchain_user_deferred(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) {}
+#endif
+
extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 4842c36fdf80..6d0524b7d082 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -460,7 +460,8 @@ struct perf_event_attr {
inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
remove_on_exec : 1, /* event is removed from task on exec */
sigtrap : 1, /* send synchronous SIGTRAP on event */
- __reserved_1 : 26;
+ defer_callchain: 1, /* generate PERF_RECORD_CALLCHAIN_DEFERRED records */
+ __reserved_1 : 25;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -1217,6 +1218,24 @@ enum perf_event_type {
*/
PERF_RECORD_AUX_OUTPUT_HW_ID = 21,
+ /*
+ * This user callchain capture was deferred until shortly before
+ * returning to user space. Previous samples would have kernel
+ * callchains only and they need to be stitched with this to make full
+ * callchains.
+ *
+ * TODO: do PERF_SAMPLE_{REGS,STACK}_USER also need deferral?
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u64 ctx_cookie;
+ * u64 nr;
+ * u64 ips[nr];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_CALLCHAIN_DEFERRED = 22,
+
PERF_RECORD_MAX, /* non-ABI */
};
@@ -1247,6 +1266,7 @@ enum perf_callchain_context {
PERF_CONTEXT_HV = (__u64)-32,
PERF_CONTEXT_KERNEL = (__u64)-128,
PERF_CONTEXT_USER = (__u64)-512,
+ PERF_CONTEXT_USER_DEFERRED = (__u64)-640,
PERF_CONTEXT_GUEST = (__u64)-2048,
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index ee9701337912..f073ebaf9c30 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -314,8 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
- trace = get_perf_callchain(regs, kernel, user, max_depth, false);
-
+ trace = get_perf_callchain(regs, kernel, user, max_depth, false, false);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
return -EFAULT;
@@ -448,7 +447,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else if (kernel && task)
trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, kernel, user, max_depth,false);
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
+ false, false);
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 2278402b7ac9..eeb15ba0137f 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -217,7 +217,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
- u32 max_stack, bool add_mark)
+ u32 max_stack, bool add_mark, bool defer_user)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
@@ -246,6 +246,15 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
regs = task_pt_regs(current);
}
+ if (defer_user) {
+ /*
+ * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+ * which can be stitched to this one.
+ */
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+ goto exit_put;
+ }
+
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ebf143aa427b..bf97b2fa8a9c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -55,11 +55,14 @@
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>
+#include <linux/unwind_user.h>
#include "internal.h"
#include <asm/irq_regs.h>
+static struct unwind_callback perf_unwind_callback_cb;
+
typedef int (*remote_function_f)(void *);
struct remote_function_call {
@@ -6955,6 +6958,53 @@ static void perf_pending_irq(struct irq_work *entry)
perf_swevent_put_recursion_context(rctx);
}
+static void perf_pending_unwind_irq(struct irq_work *entry)
+{
+ struct perf_event *event = container_of(entry, struct perf_event, pending_unwind_irq);
+
+ if (event->pending_unwind) {
+ unwind_user_deferred(&perf_unwind_callback_cb, NULL, event);
+ event->pending_unwind = 0;
+ }
+}
+
+struct perf_callchain_deferred_event {
+ struct perf_event_header header;
+ u64 ctx_cookie;
+ u64 nr;
+ u64 ips[];
+};
+
+static void perf_event_callchain_deferred(struct unwind_stacktrace *trace,
+ u64 ctx_cookie, void *_data)
+{
+ struct perf_callchain_deferred_event deferred_event;
+ u64 callchain_context = PERF_CONTEXT_USER;
+ struct perf_output_handle handle;
+ struct perf_event *event = _data;
+ struct perf_sample_data data;
+ u64 nr = trace->nr + 1 /* callchain_context */;
+
+ deferred_event.header.type = PERF_RECORD_CALLCHAIN_DEFERRED;
+ deferred_event.header.misc = PERF_RECORD_MISC_USER;
+ deferred_event.header.size = sizeof(deferred_event) + (nr * sizeof(u64));
+
+ deferred_event.ctx_cookie = ctx_cookie;
+ deferred_event.nr = nr;
+
+ perf_event_header__init_id(&deferred_event.header, &data, event);
+
+ if (perf_output_begin(&handle, &data, event, deferred_event.header.size))
+ return;
+
+ perf_output_put(&handle, deferred_event);
+ perf_output_put(&handle, callchain_context);
+ perf_output_copy(&handle, trace->entries, trace->nr * sizeof(u64));
+ perf_event__output_id_sample(event, &handle, &data);
+
+ perf_output_end(&handle);
+}
+
static void perf_pending_task(struct callback_head *head)
{
struct perf_event *event = container_of(head, struct perf_event, pending_task);
@@ -7794,6 +7844,8 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
bool user = !event->attr.exclude_callchain_user;
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) &&
+ event->attr.defer_callchain;
if (!kernel && !user)
return &__empty_callchain;
@@ -7802,7 +7854,14 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (event->ctx->task && event->ctx->task != current)
return &__empty_callchain;
- callchain = get_perf_callchain(regs, kernel, user, max_stack, true);
+ callchain = get_perf_callchain(regs, kernel, user, max_stack, true,
+ defer_user);
+
+ if (user && defer_user && !event->pending_unwind) {
+ event->pending_unwind = 1;
+ irq_work_queue(&event->pending_unwind_irq);
+ }
+
return callchain ?: &__empty_callchain;
}
@@ -12171,6 +12230,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending_irq, perf_pending_irq);
+ event->pending_unwind_irq = IRQ_WORK_INIT_HARD(perf_pending_unwind_irq);
event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
init_task_work(&event->pending_task, perf_pending_task);
rcuwait_init(&event->pending_work_wait);
@@ -14093,6 +14153,7 @@ void __init perf_event_init(void)
perf_tp_register();
perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
+ unwind_user_register(&perf_unwind_callback_cb, perf_event_callchain_deferred);
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 4842c36fdf80..6d0524b7d082 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -460,7 +460,8 @@ struct perf_event_attr {
inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
remove_on_exec : 1, /* event is removed from task on exec */
sigtrap : 1, /* send synchronous SIGTRAP on event */
- __reserved_1 : 26;
+ defer_callchain: 1, /* generate PERF_RECORD_CALLCHAIN_DEFERRED records */
+ __reserved_1 : 25;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -1217,6 +1218,24 @@ enum perf_event_type {
*/
PERF_RECORD_AUX_OUTPUT_HW_ID = 21,
+ /*
+ * This user callchain capture was deferred until shortly before
+ * returning to user space. Previous samples would have kernel
+ * callchains only and they need to be stitched with this to make full
+ * callchains.
+ *
+ * TODO: do PERF_SAMPLE_{REGS,STACK}_USER also need deferral?
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u64 ctx_cookie;
+ * u64 nr;
+ * u64 ips[nr];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_CALLCHAIN_DEFERRED = 22,
+
PERF_RECORD_MAX, /* non-ABI */
};
@@ -1247,6 +1266,7 @@ enum perf_callchain_context {
PERF_CONTEXT_HV = (__u64)-32,
PERF_CONTEXT_KERNEL = (__u64)-128,
PERF_CONTEXT_USER = (__u64)-512,
+ PERF_CONTEXT_USER_DEFERRED = (__u64)-640,
PERF_CONTEXT_GUEST = (__u64)-2048,
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
--
2.47.0
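
For completeness, a simplified sketch of the stitching a consumer would
do: hold a sample whose callchain ends in PERF_CONTEXT_USER_DEFERRED, then
append the user frames when the PERF_RECORD_CALLCHAIN_DEFERRED record
arrives. The helper names and single-slot buffering are hypothetical; real
support lands in the perf tools patches later in this series, and a real
consumer would stitch every sample held since the last kernel entry, not
just the most recent one:

	#include <stdint.h>
	#include <string.h>

	#define PERF_CONTEXT_USER_DEFERRED	((uint64_t)-640)
	#define MAX_STITCHED			1024

	/* Hypothetical output hook for a fully stitched callchain. */
	extern void emit_callchain(const uint64_t *ips, uint64_t nr);

	static uint64_t held_ips[MAX_STITCHED];
	static uint64_t held_nr;
	static int held_valid;

	/* Feed each PERF_SAMPLE_CALLCHAIN body here. */
	static void on_sample_callchain(const uint64_t *ips, uint64_t nr)
	{
		if (nr && nr <= MAX_STITCHED &&
		    ips[nr - 1] == PERF_CONTEXT_USER_DEFERRED) {
			/* Kernel-only part: drop the marker and wait. */
			held_nr = nr - 1;
			memcpy(held_ips, ips, held_nr * sizeof(*ips));
			held_valid = 1;
			return;
		}
		emit_callchain(ips, nr);
	}

	/*
	 * Feed each PERF_RECORD_CALLCHAIN_DEFERRED body here; ips[0] is
	 * PERF_CONTEXT_USER, followed by the user frames.
	 */
	static void on_deferred_callchain(const uint64_t *ips, uint64_t nr)
	{
		if (!held_valid || held_nr + nr > MAX_STITCHED)
			return;
		memcpy(&held_ips[held_nr], ips, nr * sizeof(*ips));
		emit_callchain(held_ips, held_nr + nr);
		held_valid = 0;
	}
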
Thread overview: 119+ messages
2024-10-28 21:47 [PATCH v3 00/19] unwind, perf: sframe user space unwinding Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 01/19] x86/vdso: Fix DWARF generation for getrandom() Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 02/19] x86/asm: Avoid emitting DWARF CFI for non-VDSO Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-30 17:19 ` Jens Remus
2024-10-30 17:51 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 03/19] x86/asm: Fix VDSO DWARF generation with kernel IBT enabled Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 04/19] x86/vdso: Use SYM_FUNC_{START,END} in __kernel_vsyscall() Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 05/19] x86/vdso: Use CFI macros in __vdso_sgx_enter_enclave() Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 06/19] x86/vdso: Enable sframe generation in VDSO Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-30 18:20 ` Jens Remus
2024-10-30 19:17 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 07/19] unwind: Add user space unwinding API Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-12-06 10:29 ` Jens Remus
2024-12-09 20:54 ` Josh Poimboeuf
2024-12-11 14:53 ` Jens Remus
2024-12-11 17:48 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 08/19] unwind/x86: Enable CONFIG_HAVE_UNWIND_USER_FP Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-29 13:13 ` Peter Zijlstra
2024-10-29 16:31 ` Josh Poimboeuf
2024-10-29 18:08 ` Peter Zijlstra
2024-10-28 21:47 ` [PATCH v3 09/19] unwind: Introduce sframe user space unwinding Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-29 13:27 ` Peter Zijlstra
2024-10-29 16:50 ` Josh Poimboeuf
2024-10-29 18:10 ` Peter Zijlstra
2024-10-29 23:32 ` Andrii Nakryiko
2024-10-30 5:53 ` Josh Poimboeuf
2024-10-31 20:57 ` Andrii Nakryiko
2024-10-31 21:00 ` Nick Desaulniers
2024-10-31 21:38 ` Indu Bhagat
2024-11-01 18:38 ` Andrii Nakryiko
2024-11-01 18:47 ` Steven Rostedt
2024-11-01 18:54 ` Andrii Nakryiko
2024-11-03 0:07 ` Indu Bhagat
2024-10-31 23:03 ` Josh Poimboeuf
2024-11-01 18:34 ` Andrii Nakryiko
2024-11-01 19:29 ` Josh Poimboeuf
2024-11-01 19:44 ` Andrii Nakryiko
2024-11-01 19:46 ` Andrii Nakryiko
2024-11-01 19:51 ` Josh Poimboeuf
2024-11-01 19:09 ` Segher Boessenkool
2024-11-01 19:33 ` Josh Poimboeuf
2024-11-01 19:35 ` Josh Poimboeuf
2024-11-01 19:48 ` Josh Poimboeuf
2024-11-01 21:35 ` Segher Boessenkool
2024-11-05 17:40 ` Steven Rostedt
2024-11-05 17:45 ` Steven Rostedt
2024-11-06 17:04 ` Jens Remus
2024-11-07 8:25 ` Weinan Liu
2024-11-07 16:59 ` Jens Remus
2024-11-13 20:50 ` Steven Rostedt
2024-11-13 21:15 ` Josh Poimboeuf
2024-11-13 22:13 ` Steven Rostedt
2024-11-13 22:21 ` Steven Rostedt
2024-11-13 22:25 ` Steven Rostedt
2024-11-14 9:57 ` Jens Remus
2024-11-13 15:56 ` Jens Remus
2024-11-13 20:50 ` Steven Rostedt
2024-11-13 21:13 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 10/19] unwind/x86: Enable CONFIG_HAVE_UNWIND_USER_SFRAME Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-29 13:14 ` Peter Zijlstra
2024-10-28 21:47 ` [PATCH v3 11/19] unwind: Add deferred user space unwinding API Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-29 13:48 ` Peter Zijlstra
2024-10-29 16:51 ` Josh Poimboeuf
2024-10-29 13:49 ` Peter Zijlstra
2024-10-29 17:05 ` Josh Poimboeuf
2024-10-29 18:11 ` Peter Zijlstra
2024-10-29 13:56 ` Peter Zijlstra
2024-10-29 17:17 ` Josh Poimboeuf
2024-10-29 17:47 ` Mathieu Desnoyers
2024-10-29 18:20 ` Peter Zijlstra
2024-10-30 6:17 ` Steven Rostedt
2024-10-30 14:03 ` Peter Zijlstra
2024-10-30 19:58 ` Steven Rostedt
2024-10-30 20:48 ` Josh Poimboeuf
2024-10-29 18:34 ` Josh Poimboeuf
2024-10-30 13:44 ` Mathieu Desnoyers
2024-10-30 17:47 ` Josh Poimboeuf
2024-10-30 17:55 ` Josh Poimboeuf
2024-10-30 18:25 ` Josh Poimboeuf
2024-10-29 23:32 ` Andrii Nakryiko
2024-10-30 6:10 ` Josh Poimboeuf
2024-10-31 21:22 ` Andrii Nakryiko
2024-10-31 23:13 ` Josh Poimboeuf
2024-10-31 23:28 ` Andrii Nakryiko
2024-11-01 17:41 ` Josh Poimboeuf
2024-11-01 18:05 ` Andrii Nakryiko
2024-10-28 21:47 ` [PATCH v3 12/19] perf: Remove get_perf_callchain() 'init_nr' argument Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 13/19] perf: Remove get_perf_callchain() 'crosstask' argument Josh Poimboeuf
2024-10-28 21:48 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 14/19] perf: Simplify get_perf_callchain() user logic Josh Poimboeuf
2024-10-28 21:48 ` Josh Poimboeuf
2024-10-28 21:47 ` Josh Poimboeuf [this message]
2024-10-28 21:48 ` [PATCH v3 15/19] perf: Add deferred user callchains Josh Poimboeuf
2024-10-29 14:06 ` Peter Zijlstra
2024-11-06 9:45 ` Jens Remus
2024-10-28 21:47 ` [PATCH v3 16/19] perf tools: Minimal CALLCHAIN_DEFERRED support Josh Poimboeuf
2024-10-28 21:48 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 17/19] perf record: Enable defer_callchain for user callchains Josh Poimboeuf
2024-10-28 21:48 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 18/19] perf script: Display PERF_RECORD_CALLCHAIN_DEFERRED Josh Poimboeuf
2024-10-28 21:48 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 19/19] perf tools: Merge deferred user callchains Josh Poimboeuf
2024-10-28 21:48 ` Josh Poimboeuf
2024-10-28 21:47 ` [PATCH v3 00/19] unwind, perf: sframe user space unwinding Josh Poimboeuf
2024-10-28 21:54 ` Josh Poimboeuf
2024-10-28 23:55 ` Josh Poimboeuf
2024-10-29 14:08 ` Peter Zijlstra