* [PATCH v7 01/18] unwind_user: Add user space unwinding API
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
Introduce a generic API for unwinding user stacks.
To expand user space unwinding to handle more complex scenarios, such
as deferred unwinding and reading user space information, create a
generic interface that all architectures supporting the various
unwinding methods can use.
This is an alternative to the simple stack_trace_save_user() API for
handling user space stack traces. It does not replace that interface;
rather, it will be used to expand the functionality of user space
stack walking.
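As a rough usage sketch (not part of this patch), a tracer could
collect the current task's user stack with the new interface like
this; the buffer size and the pr_info() output are arbitrary choices
for the sketch:

    /* sketch: dump the current task's user stack */
    static void example_dump_user_stack(void)
    {
            unsigned long buf[64];
            struct unwind_stacktrace trace = { .entries = buf };
            unsigned int i;

            if (unwind_user(&trace, 64))
                    return;

            for (i = 0; i < trace.nr; i++)
                    pr_info("user frame %u: 0x%lx\n", i, trace.entries[i]);
    }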
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes from v6: https://lore.kernel.org/20250425145811.822676841@goodmis.org
- Use (current->flags & PF_KTHREAD) instead of !(current->mm) for testing
if a task is a kernel thread or not. (Josh Poimboeuf)
MAINTAINERS | 8 +++++
arch/Kconfig | 3 ++
include/linux/unwind_user.h | 15 +++++++++
include/linux/unwind_user_types.h | 31 +++++++++++++++++
kernel/Makefile | 1 +
kernel/unwind/Makefile | 1 +
kernel/unwind/user.c | 55 +++++++++++++++++++++++++++++++
7 files changed, 114 insertions(+)
create mode 100644 include/linux/unwind_user.h
create mode 100644 include/linux/unwind_user_types.h
create mode 100644 kernel/unwind/Makefile
create mode 100644 kernel/unwind/user.c
diff --git a/MAINTAINERS b/MAINTAINERS
index fedcbcba8397..f94b8d05543d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -25308,6 +25308,14 @@ F: Documentation/driver-api/uio-howto.rst
F: drivers/uio/
F: include/linux/uio_driver.h
+USERSPACE STACK UNWINDING
+M: Josh Poimboeuf <jpoimboe@kernel.org>
+M: Steven Rostedt <rostedt@goodmis.org>
+S: Maintained
+F: include/linux/unwind*.h
+F: kernel/unwind/
+
+
UTIL-LINUX PACKAGE
M: Karel Zak <kzak@redhat.com>
L: util-linux@vger.kernel.org
diff --git a/arch/Kconfig b/arch/Kconfig
index b0adb665041f..ccbcead9fac0 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -435,6 +435,9 @@ config HAVE_HARDLOCKUP_DETECTOR_ARCH
It uses the same command line parameters, and sysctl interface,
as the generic hardlockup detectors.
+config UNWIND_USER
+ bool
+
config HAVE_PERF_REGS
bool
help
diff --git a/include/linux/unwind_user.h b/include/linux/unwind_user.h
new file mode 100644
index 000000000000..aa7923c1384f
--- /dev/null
+++ b/include/linux/unwind_user.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_H
+#define _LINUX_UNWIND_USER_H
+
+#include <linux/unwind_user_types.h>
+
+int unwind_user_start(struct unwind_user_state *state);
+int unwind_user_next(struct unwind_user_state *state);
+
+int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries);
+
+#define for_each_user_frame(state) \
+ for (unwind_user_start((state)); !(state)->done; unwind_user_next((state)))
+
+#endif /* _LINUX_UNWIND_USER_H */
diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
new file mode 100644
index 000000000000..6ed1b4ae74e1
--- /dev/null
+++ b/include/linux/unwind_user_types.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_TYPES_H
+#define _LINUX_UNWIND_USER_TYPES_H
+
+#include <linux/types.h>
+
+enum unwind_user_type {
+ UNWIND_USER_TYPE_NONE,
+};
+
+struct unwind_stacktrace {
+ unsigned int nr;
+ unsigned long *entries;
+};
+
+struct unwind_user_frame {
+ s32 cfa_off;
+ s32 ra_off;
+ s32 fp_off;
+ bool use_fp;
+};
+
+struct unwind_user_state {
+ unsigned long ip;
+ unsigned long sp;
+ unsigned long fp;
+ enum unwind_user_type type;
+ bool done;
+};
+
+#endif /* _LINUX_UNWIND_USER_TYPES_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 434929de17ef..5a2b2be2a32d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-y += rcu/
obj-y += livepatch/
obj-y += dma/
obj-y += entry/
+obj-y += unwind/
obj-$(CONFIG_MODULES) += module/
obj-$(CONFIG_KCMP) += kcmp.o
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
new file mode 100644
index 000000000000..349ce3677526
--- /dev/null
+++ b/kernel/unwind/Makefile
@@ -0,0 +1 @@
+ obj-$(CONFIG_UNWIND_USER) += user.o
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
new file mode 100644
index 000000000000..d30449328981
--- /dev/null
+++ b/kernel/unwind/user.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic interfaces for unwinding user space
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_user.h>
+
+int unwind_user_next(struct unwind_user_state *state)
+{
+ /* no implementation yet */
+ return -EINVAL;
+}
+
+int unwind_user_start(struct unwind_user_state *state)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+
+ memset(state, 0, sizeof(*state));
+
+ if ((current->flags & PF_KTHREAD) || !user_mode(regs)) {
+ state->done = true;
+ return -EINVAL;
+ }
+
+ state->type = UNWIND_USER_TYPE_NONE;
+
+ state->ip = instruction_pointer(regs);
+ state->sp = user_stack_pointer(regs);
+ state->fp = frame_pointer(regs);
+
+ return 0;
+}
+
+int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries)
+{
+ struct unwind_user_state state;
+
+ trace->nr = 0;
+
+ if (!max_entries)
+ return -EINVAL;
+
+ if (current->flags & PF_KTHREAD)
+ return 0;
+
+ for_each_user_frame(&state) {
+ trace->entries[trace->nr++] = state.ip;
+ if (trace->nr >= max_entries)
+ break;
+ }
+
+ return 0;
+}
--
2.47.2
* [PATCH v7 02/18] unwind_user: Add frame pointer support
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
Add optional support for user space frame pointer unwinding. If
supported, the arch needs to enable CONFIG_HAVE_UNWIND_USER_FP and
define ARCH_INIT_USER_FP_FRAME.
By encoding the frame offsets in struct unwind_user_frame, much of this
code can also be reused for future unwinder implementations like sframe.
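Concretely, one unwind step of a frame-pointer frame works like this
(a sketch of the core of what unwind_user_next() below does with
those offsets; the real code adds sanity checks and does the
dereferences with get_user(), since the frame lives in user space):

    cfa = (frame->use_fp ? state->fp : state->sp) + frame->cfa_off;
    ra  = *(cfa + frame->ra_off);   /* return address -> new ip */
    fp  = *(cfa + frame->fp_off);   /* saved frame pointer -> new fp */
    state->ip = ra;
    state->sp = cfa;                /* the CFA becomes the new sp */
    state->fp = fp;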
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
arch/Kconfig | 4 +++
include/asm-generic/unwind_user.h | 9 ++++++
include/linux/unwind_user_types.h | 1 +
kernel/unwind/user.c | 51 +++++++++++++++++++++++++++++--
4 files changed, 63 insertions(+), 2 deletions(-)
create mode 100644 include/asm-generic/unwind_user.h
diff --git a/arch/Kconfig b/arch/Kconfig
index ccbcead9fac0..0e3844c0e200 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -438,6 +438,10 @@ config HAVE_HARDLOCKUP_DETECTOR_ARCH
config UNWIND_USER
bool
+config HAVE_UNWIND_USER_FP
+ bool
+ select UNWIND_USER
+
config HAVE_PERF_REGS
bool
help
diff --git a/include/asm-generic/unwind_user.h b/include/asm-generic/unwind_user.h
new file mode 100644
index 000000000000..832425502fb3
--- /dev/null
+++ b/include/asm-generic/unwind_user.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_UNWIND_USER_H
+#define _ASM_GENERIC_UNWIND_USER_H
+
+#ifndef ARCH_INIT_USER_FP_FRAME
+ #define ARCH_INIT_USER_FP_FRAME
+#endif
+
+#endif /* _ASM_GENERIC_UNWIND_USER_H */
diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
index 6ed1b4ae74e1..65bd070eb6b0 100644
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -6,6 +6,7 @@
enum unwind_user_type {
UNWIND_USER_TYPE_NONE,
+ UNWIND_USER_TYPE_FP,
};
struct unwind_stacktrace {
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index d30449328981..0671a81494d3 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -6,10 +6,54 @@
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/unwind_user.h>
+#include <linux/uaccess.h>
+#include <asm/unwind_user.h>
+
+static struct unwind_user_frame fp_frame = {
+ ARCH_INIT_USER_FP_FRAME
+};
+
+static inline bool fp_state(struct unwind_user_state *state)
+{
+ return IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP) &&
+ state->type == UNWIND_USER_TYPE_FP;
+}
int unwind_user_next(struct unwind_user_state *state)
{
- /* no implementation yet */
+ struct unwind_user_frame _frame;
+ struct unwind_user_frame *frame = &_frame;
+ unsigned long cfa = 0, fp, ra = 0;
+
+ if (state->done)
+ return -EINVAL;
+
+ if (fp_state(state))
+ frame = &fp_frame;
+ else
+ goto the_end;
+
+ cfa = (frame->use_fp ? state->fp : state->sp) + frame->cfa_off;
+
+ /* stack going in wrong direction? */
+ if (cfa <= state->sp)
+ goto the_end;
+
+ if (get_user(ra, (unsigned long __user *)(cfa + frame->ra_off)))
+ goto the_end;
+
+ if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+ goto the_end;
+
+ state->ip = ra;
+ state->sp = cfa;
+ if (frame->fp_off)
+ state->fp = fp;
+
+ return 0;
+
+the_end:
+ state->done = true;
return -EINVAL;
}
@@ -24,7 +68,10 @@ int unwind_user_start(struct unwind_user_state *state)
return -EINVAL;
}
- state->type = UNWIND_USER_TYPE_NONE;
+ if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
+ state->type = UNWIND_USER_TYPE_FP;
+ else
+ state->type = UNWIND_USER_TYPE_NONE;
state->ip = instruction_pointer(regs);
state->sp = user_stack_pointer(regs);
--
2.47.2
* [PATCH v7 03/18] unwind_user/x86: Enable frame pointer unwinding on x86
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
Use ARCH_INIT_USER_FP_FRAME to describe how frame pointers are unwound
on x86, and enable CONFIG_HAVE_UNWIND_USER_FP accordingly so the
unwind_user interfaces can be used.
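For reference, with sizeof(long) == 8 on x86-64 the frame description
below expands to:

    cfa = fp + 16;    /* caller's %rsp just before the call */
    /* return address at cfa -  8  (i.e. fp + 8) */
    /* saved %rbp     at cfa - 16  (i.e. fp + 0) */

which matches the standard "push %rbp; mov %rsp, %rbp" prologue.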
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
arch/x86/Kconfig | 1 +
arch/x86/include/asm/unwind_user.h | 11 +++++++++++
2 files changed, 12 insertions(+)
create mode 100644 arch/x86/include/asm/unwind_user.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index aeac63b11fc2..b5a85d2be5ee 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -301,6 +301,7 @@ config X86
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL
select HAVE_UNSTABLE_SCHED_CLOCK
+ select HAVE_UNWIND_USER_FP if X86_64
select HAVE_USER_RETURN_NOTIFIER
select HAVE_GENERIC_VDSO
select VDSO_GETRANDOM if X86_64
diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h
new file mode 100644
index 000000000000..8597857bf896
--- /dev/null
+++ b/arch/x86/include/asm/unwind_user.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UNWIND_USER_H
+#define _ASM_X86_UNWIND_USER_H
+
+#define ARCH_INIT_USER_FP_FRAME \
+ .cfa_off = (s32)sizeof(long) * 2, \
+ .ra_off = (s32)sizeof(long) * -1, \
+ .fp_off = (s32)sizeof(long) * -2, \
+ .use_fp = true,
+
+#endif /* _ASM_X86_UNWIND_USER_H */
--
2.47.2
* [PATCH v7 04/18] perf/x86: Rename and move get_segment_base() and make it global
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
get_segment_base() will be used by the unwind_user code, so make it
global and rename it to avoid conflicting with a KVM function of the
same name.
As the function is no longer specific to perf, move it to ptrace.c as that
seems to be a better location for a generic function like this.
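For example, the compat callchain code below uses it to turn
segment-relative register values into linear addresses:

    cs_base = segment_base_address(regs->cs);
    ss_base = segment_base_address(regs->ss);
    fp = compat_ptr(ss_base + regs->bp);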
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
arch/x86/events/core.c | 44 ++++-------------------------------
arch/x86/include/asm/ptrace.h | 2 ++
arch/x86/kernel/ptrace.c | 38 ++++++++++++++++++++++++++++++
3 files changed, 45 insertions(+), 39 deletions(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 85eb0eb1b284..524a59d9c2c4 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -42,6 +42,7 @@
#include <asm/ldt.h>
#include <asm/unwind.h>
#include <asm/uprobes.h>
+#include <asm/ptrace.h>
#include <asm/ibt.h>
#include "perf_event.h"
@@ -2807,41 +2808,6 @@ valid_user_frame(const void __user *fp, unsigned long size)
return __access_ok(fp, size);
}
-static unsigned long get_segment_base(unsigned int segment)
-{
- struct desc_struct *desc;
- unsigned int idx = segment >> 3;
-
- if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
- struct ldt_struct *ldt;
-
- /*
- * If we're not in a valid context with a real (not just lazy)
- * user mm, then don't even try.
- */
- if (!nmi_uaccess_okay())
- return 0;
-
- /* IRQs are off, so this synchronizes with smp_store_release */
- ldt = smp_load_acquire(&current->mm->context.ldt);
- if (!ldt || idx >= ldt->nr_entries)
- return 0;
-
- desc = &ldt->entries[idx];
-#else
- return 0;
-#endif
- } else {
- if (idx >= GDT_ENTRIES)
- return 0;
-
- desc = raw_cpu_ptr(gdt_page.gdt) + idx;
- }
-
- return get_desc_base(desc);
-}
-
#ifdef CONFIG_UPROBES
/*
* Heuristic-based check if uprobe is installed at the function entry.
@@ -2898,8 +2864,8 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
if (user_64bit_mode(regs))
return 0;
- cs_base = get_segment_base(regs->cs);
- ss_base = get_segment_base(regs->ss);
+ cs_base = segment_base_address(regs->cs);
+ ss_base = segment_base_address(regs->ss);
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
@@ -3018,11 +2984,11 @@ static unsigned long code_segment_base(struct pt_regs *regs)
return 0x10 * regs->cs;
if (user_mode(regs) && regs->cs != __USER_CS)
- return get_segment_base(regs->cs);
+ return segment_base_address(regs->cs);
#else
if (user_mode(regs) && !user_64bit_mode(regs) &&
regs->cs != __USER32_CS)
- return get_segment_base(regs->cs);
+ return segment_base_address(regs->cs);
#endif
return 0;
}
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 50f75467f73d..59357ec98e52 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -314,6 +314,8 @@ static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
return !(regs->flags & X86_EFLAGS_IF);
}
+unsigned long segment_base_address(unsigned int segment);
+
/* Query offset/name of register from its name/offset */
extern int regs_query_register_offset(const char *name);
extern const char *regs_query_register_name(unsigned int offset);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 095f04bdabdc..81353a09701b 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -41,6 +41,7 @@
#include <asm/syscall.h>
#include <asm/fsgsbase.h>
#include <asm/io_bitmap.h>
+#include <asm/mmu_context.h>
#include "tls.h"
@@ -339,6 +340,43 @@ static int set_segment_reg(struct task_struct *task,
#endif /* CONFIG_X86_32 */
+unsigned long segment_base_address(unsigned int segment)
+{
+ struct desc_struct *desc;
+ unsigned int idx = segment >> 3;
+
+ lockdep_assert_irqs_disabled();
+
+ if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+ /*
+ * If we're not in a valid context with a real (not just lazy)
+ * user mm, then don't even try.
+ */
+ if (!nmi_uaccess_okay())
+ return 0;
+
+ /* IRQs are off, so this synchronizes with smp_store_release */
+ ldt = smp_load_acquire(&current->mm->context.ldt);
+ if (!ldt || idx >= ldt->nr_entries)
+ return 0;
+
+ desc = &ldt->entries[idx];
+#else
+ return 0;
+#endif
+ } else {
+ if (idx >= GDT_ENTRIES)
+ return 0;
+
+ desc = raw_cpu_ptr(gdt_page.gdt) + idx;
+ }
+
+ return get_desc_base(desc);
+}
+
static unsigned long get_flags(struct task_struct *task)
{
unsigned long retval = task_pt_regs(task)->flags;
--
2.47.2
* [PATCH v7 05/18] unwind_user: Add compat mode frame pointer support
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
Add optional support for user space compat mode frame pointer unwinding.
If supported, the arch needs to enable CONFIG_HAVE_UNWIND_USER_COMPAT_FP
and define ARCH_INIT_USER_COMPAT_FP_FRAME.
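A compat frame stores 32-bit values, so the unwinder has to read
frame words at 32-bit width. A sketch of the selection the new
UNWIND_GET_USER_LONG() macro below performs:

    if (compat_state(state))
            ret = get_user(val, (u32 __user *)addr);  /* 32-bit frame word */
    else
            ret = get_user(val, (u64 __user *)addr);  /* native frame word */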
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
arch/Kconfig | 4 +++
include/asm-generic/Kbuild | 2 ++
include/asm-generic/unwind_user.h | 15 +++++++++++
include/asm-generic/unwind_user_types.h | 9 +++++++
include/linux/unwind_user_types.h | 3 +++
kernel/unwind/user.c | 36 ++++++++++++++++++++++---
6 files changed, 65 insertions(+), 4 deletions(-)
create mode 100644 include/asm-generic/unwind_user_types.h
diff --git a/arch/Kconfig b/arch/Kconfig
index 0e3844c0e200..dbb1cc89e040 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -442,6 +442,10 @@ config HAVE_UNWIND_USER_FP
bool
select UNWIND_USER
+config HAVE_UNWIND_USER_COMPAT_FP
+ bool
+ depends on HAVE_UNWIND_USER_FP
+
config HAVE_PERF_REGS
bool
help
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 8675b7b4ad23..b797a2434396 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -59,6 +59,8 @@ mandatory-y += tlbflush.h
mandatory-y += topology.h
mandatory-y += trace_clock.h
mandatory-y += uaccess.h
+mandatory-y += unwind_user.h
+mandatory-y += unwind_user_types.h
mandatory-y += vermagic.h
mandatory-y += vga.h
mandatory-y += video.h
diff --git a/include/asm-generic/unwind_user.h b/include/asm-generic/unwind_user.h
index 832425502fb3..385638ce4aec 100644
--- a/include/asm-generic/unwind_user.h
+++ b/include/asm-generic/unwind_user.h
@@ -2,8 +2,23 @@
#ifndef _ASM_GENERIC_UNWIND_USER_H
#define _ASM_GENERIC_UNWIND_USER_H
+#include <asm/unwind_user_types.h>
+
#ifndef ARCH_INIT_USER_FP_FRAME
#define ARCH_INIT_USER_FP_FRAME
#endif
+#ifndef ARCH_INIT_USER_COMPAT_FP_FRAME
+ #define ARCH_INIT_USER_COMPAT_FP_FRAME
+ #define in_compat_mode(regs) false
+#endif
+
+#ifndef arch_unwind_user_init
+static inline void arch_unwind_user_init(struct unwind_user_state *state, struct pt_regs *reg) {}
+#endif
+
+#ifndef arch_unwind_user_next
+static inline void arch_unwind_user_next(struct unwind_user_state *state) {}
+#endif
+
#endif /* _ASM_GENERIC_UNWIND_USER_H */
diff --git a/include/asm-generic/unwind_user_types.h b/include/asm-generic/unwind_user_types.h
new file mode 100644
index 000000000000..ee803de7c998
--- /dev/null
+++ b/include/asm-generic/unwind_user_types.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_UNWIND_USER_TYPES_H
+#define _ASM_GENERIC_UNWIND_USER_TYPES_H
+
+#ifndef arch_unwind_user_state
+struct arch_unwind_user_state {};
+#endif
+
+#endif /* _ASM_GENERIC_UNWIND_USER_TYPES_H */
diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
index 65bd070eb6b0..3ec4a097a3dd 100644
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -3,10 +3,12 @@
#define _LINUX_UNWIND_USER_TYPES_H
#include <linux/types.h>
+#include <asm/unwind_user_types.h>
enum unwind_user_type {
UNWIND_USER_TYPE_NONE,
UNWIND_USER_TYPE_FP,
+ UNWIND_USER_TYPE_COMPAT_FP,
};
struct unwind_stacktrace {
@@ -25,6 +27,7 @@ struct unwind_user_state {
unsigned long ip;
unsigned long sp;
unsigned long fp;
+ struct arch_unwind_user_state arch;
enum unwind_user_type type;
bool done;
};
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 0671a81494d3..635cc04bb299 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -13,12 +13,32 @@ static struct unwind_user_frame fp_frame = {
ARCH_INIT_USER_FP_FRAME
};
+static struct unwind_user_frame compat_fp_frame = {
+ ARCH_INIT_USER_COMPAT_FP_FRAME
+};
+
static inline bool fp_state(struct unwind_user_state *state)
{
return IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP) &&
state->type == UNWIND_USER_TYPE_FP;
}
+static inline bool compat_state(struct unwind_user_state *state)
+{
+ return IS_ENABLED(CONFIG_HAVE_UNWIND_USER_COMPAT_FP) &&
+ state->type == UNWIND_USER_TYPE_COMPAT_FP;
+}
+
+#define UNWIND_GET_USER_LONG(to, from, state) \
+({ \
+ int __ret; \
+ if (compat_state(state)) \
+ __ret = get_user(to, (u32 __user *)(from)); \
+ else \
+ __ret = get_user(to, (u64 __user *)(from)); \
+ __ret; \
+})
+
int unwind_user_next(struct unwind_user_state *state)
{
struct unwind_user_frame _frame;
@@ -28,7 +48,9 @@ int unwind_user_next(struct unwind_user_state *state)
if (state->done)
return -EINVAL;
- if (fp_state(state))
+ if (compat_state(state))
+ frame = &compat_fp_frame;
+ else if (fp_state(state))
frame = &fp_frame;
else
goto the_end;
@@ -39,10 +61,10 @@ int unwind_user_next(struct unwind_user_state *state)
if (cfa <= state->sp)
goto the_end;
- if (get_user(ra, (unsigned long __user *)(cfa + frame->ra_off)))
+ if (UNWIND_GET_USER_LONG(ra, cfa + frame->ra_off, state))
goto the_end;
- if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+ if (frame->fp_off && UNWIND_GET_USER_LONG(fp, cfa + frame->fp_off, state))
goto the_end;
state->ip = ra;
@@ -50,6 +72,8 @@ int unwind_user_next(struct unwind_user_state *state)
if (frame->fp_off)
state->fp = fp;
+ arch_unwind_user_next(state);
+
return 0;
the_end:
@@ -68,7 +92,9 @@ int unwind_user_start(struct unwind_user_state *state)
return -EINVAL;
}
- if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
+ if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_COMPAT_FP) && in_compat_mode(regs))
+ state->type = UNWIND_USER_TYPE_COMPAT_FP;
+ else if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
state->type = UNWIND_USER_TYPE_FP;
else
state->type = UNWIND_USER_TYPE_NONE;
@@ -77,6 +103,8 @@ int unwind_user_start(struct unwind_user_state *state)
state->sp = user_stack_pointer(regs);
state->fp = frame_pointer(regs);
+ arch_unwind_user_init(state, regs);
+
return 0;
}
--
2.47.2
* [PATCH v7 06/18] unwind_user/x86: Enable compat mode frame pointer unwinding on x86
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
Use ARCH_INIT_USER_COMPAT_FP_FRAME to describe how frame pointers are
unwound on x86, and implement the hooks needed to add the segment base
addresses. Enable HAVE_UNWIND_USER_COMPAT_FP if the system has compat
mode compiled in.
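For reference, with sizeof(u32) == 4 the compat frame description
below expands to:

    cfa = fp + 8;    /* 32-bit caller's stack pointer */
    /* return address at cfa - 4 */
    /* saved %ebp     at cfa - 8 */

and the arch hooks then add the CS/SS segment bases, since compat
mode instruction and stack addresses are segment-relative.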
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
arch/x86/Kconfig | 1 +
arch/x86/include/asm/unwind_user.h | 50 ++++++++++++++++++++++++
arch/x86/include/asm/unwind_user_types.h | 17 ++++++++
3 files changed, 68 insertions(+)
create mode 100644 arch/x86/include/asm/unwind_user_types.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b5a85d2be5ee..35d3b01b65c6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -301,6 +301,7 @@ config X86
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL
select HAVE_UNSTABLE_SCHED_CLOCK
+ select HAVE_UNWIND_USER_COMPAT_FP if IA32_EMULATION
select HAVE_UNWIND_USER_FP if X86_64
select HAVE_USER_RETURN_NOTIFIER
select HAVE_GENERIC_VDSO
diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h
index 8597857bf896..bb1148111259 100644
--- a/arch/x86/include/asm/unwind_user.h
+++ b/arch/x86/include/asm/unwind_user.h
@@ -2,10 +2,60 @@
#ifndef _ASM_X86_UNWIND_USER_H
#define _ASM_X86_UNWIND_USER_H
+#include <linux/unwind_user_types.h>
+#include <asm/ptrace.h>
+#include <asm/perf_event.h>
+
#define ARCH_INIT_USER_FP_FRAME \
.cfa_off = (s32)sizeof(long) * 2, \
.ra_off = (s32)sizeof(long) * -1, \
.fp_off = (s32)sizeof(long) * -2, \
.use_fp = true,
+#ifdef CONFIG_IA32_EMULATION
+
+#define ARCH_INIT_USER_COMPAT_FP_FRAME \
+ .cfa_off = (s32)sizeof(u32) * 2, \
+ .ra_off = (s32)sizeof(u32) * -1, \
+ .fp_off = (s32)sizeof(u32) * -2, \
+ .use_fp = true,
+
+#define in_compat_mode(regs) !user_64bit_mode(regs)
+
+static inline void arch_unwind_user_init(struct unwind_user_state *state,
+ struct pt_regs *regs)
+{
+ unsigned long cs_base, ss_base;
+
+ if (state->type != UNWIND_USER_TYPE_COMPAT_FP)
+ return;
+
+ scoped_guard(irqsave) {
+ cs_base = segment_base_address(regs->cs);
+ ss_base = segment_base_address(regs->ss);
+ }
+
+ state->arch.cs_base = cs_base;
+ state->arch.ss_base = ss_base;
+
+ state->ip += cs_base;
+ state->sp += ss_base;
+ state->fp += ss_base;
+}
+#define arch_unwind_user_init arch_unwind_user_init
+
+static inline void arch_unwind_user_next(struct unwind_user_state *state)
+{
+ if (state->type != UNWIND_USER_TYPE_COMPAT_FP)
+ return;
+
+ state->ip += state->arch.cs_base;
+ state->fp += state->arch.ss_base;
+}
+#define arch_unwind_user_next arch_unwind_user_next
+
+#endif /* CONFIG_IA32_EMULATION */
+
+#include <asm-generic/unwind_user.h>
+
#endif /* _ASM_X86_UNWIND_USER_H */
diff --git a/arch/x86/include/asm/unwind_user_types.h b/arch/x86/include/asm/unwind_user_types.h
new file mode 100644
index 000000000000..d7074dc5f0ce
--- /dev/null
+++ b/arch/x86/include/asm/unwind_user_types.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_UNWIND_USER_TYPES_H
+#define _ASM_UNWIND_USER_TYPES_H
+
+#ifdef CONFIG_IA32_EMULATION
+
+struct arch_unwind_user_state {
+ unsigned long ss_base;
+ unsigned long cs_base;
+};
+#define arch_unwind_user_state arch_unwind_user_state
+
+#endif /* CONFIG_IA32_EMULATION */
+
+#include <asm-generic/unwind_user_types.h>
+
+#endif /* _ASM_UNWIND_USER_TYPES_H */
--
2.47.2
* [PATCH v7 07/18] unwind_user/deferred: Add unwind_deferred_trace()
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Steven Rostedt <rostedt@goodmis.org>
Add a function, unwind_deferred_trace(), that must be called from a
faultable context and retrieves a user space stack trace. A tracer can
call it when a task is about to enter user space, or has just come
back from user space and still has interrupts enabled.
This code is based on Josh Poimboeuf's deferred unwinding work:
Link: https://lore.kernel.org/all/6052e8487746603bdb29b65f4033e739092d9925.1737511963.git.jpoimboe@kernel.org/
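As a rough usage sketch (not part of this patch), a tracer would call
it from task context, for example from a task_work callback queued
before the task returns to user space (the callback name here is
hypothetical):

    static void example_unwind_work(struct callback_head *head)
    {
            struct unwind_stacktrace trace;

            if (unwind_deferred_trace(&trace))
                    return;

            /* trace.entries[0 .. trace.nr - 1] holds the user stack */
    }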
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v6: https://lore.kernel.org/20250425145812.835672647@goodmis.org
- Use (current->flags & PF_EXITING) instead of checking !current->mm
include/linux/sched.h | 5 +++
include/linux/unwind_deferred.h | 24 ++++++++++++++
include/linux/unwind_deferred_types.h | 9 +++++
kernel/fork.c | 4 +++
kernel/unwind/Makefile | 2 +-
kernel/unwind/deferred.c | 48 +++++++++++++++++++++++++++
6 files changed, 91 insertions(+), 1 deletion(-)
create mode 100644 include/linux/unwind_deferred.h
create mode 100644 include/linux/unwind_deferred_types.h
create mode 100644 kernel/unwind/deferred.c
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ecc0c6b1cb0..a1e1c07cadfb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -47,6 +47,7 @@
#include <linux/livepatch_sched.h>
#include <linux/uidgid_types.h>
#include <linux/tracepoint-defs.h>
+#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
@@ -1646,6 +1647,10 @@ struct task_struct {
struct user_event_mm *user_event_mm;
#endif
+#ifdef CONFIG_UNWIND_USER
+ struct unwind_task_info unwind_info;
+#endif
+
/* CPU-specific state of this task: */
struct thread_struct thread;
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
new file mode 100644
index 000000000000..5064ebe38c4f
--- /dev/null
+++ b/include/linux/unwind_deferred.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_DEFERRED_H
+#define _LINUX_UNWIND_USER_DEFERRED_H
+
+#include <linux/unwind_user.h>
+#include <linux/unwind_deferred_types.h>
+
+#ifdef CONFIG_UNWIND_USER
+
+void unwind_task_init(struct task_struct *task);
+void unwind_task_free(struct task_struct *task);
+
+int unwind_deferred_trace(struct unwind_stacktrace *trace);
+
+#else /* !CONFIG_UNWIND_USER */
+
+static inline void unwind_task_init(struct task_struct *task) {}
+static inline void unwind_task_free(struct task_struct *task) {}
+
+static inline int unwind_deferred_trace(struct unwind_stacktrace *trace) { return -ENOSYS; }
+
+#endif /* !CONFIG_UNWIND_USER */
+
+#endif /* _LINUX_UNWIND_USER_DEFERRED_H */
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
new file mode 100644
index 000000000000..aa32db574e43
--- /dev/null
+++ b/include/linux/unwind_deferred_types.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+#define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+
+struct unwind_task_info {
+ unsigned long *entries;
+};
+
+#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index c4b26cd8998b..8c79c7c2c553 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>
#include <linux/tick.h>
+#include <linux/unwind_deferred.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -991,6 +992,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ unwind_task_free(tsk);
sched_ext_free(tsk);
io_uring_free(tsk);
cgroup_free(tsk);
@@ -2395,6 +2397,8 @@ __latent_entropy struct task_struct *copy_process(
p->bpf_ctx = NULL;
#endif
+ unwind_task_init(p);
+
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index 349ce3677526..6752ac96d7e2 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1 +1 @@
- obj-$(CONFIG_UNWIND_USER) += user.o
+ obj-$(CONFIG_UNWIND_USER) += user.o deferred.o
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
new file mode 100644
index 000000000000..5a3789e38c00
--- /dev/null
+++ b/kernel/unwind/deferred.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Deferred user space unwinding
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/unwind_deferred.h>
+
+#define UNWIND_MAX_ENTRIES 512
+
+int unwind_deferred_trace(struct unwind_stacktrace *trace)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+
+ /* Should always be called from faultable context */
+ might_fault();
+
+ if (current->flags & PF_EXITING)
+ return -EINVAL;
+
+ if (!info->entries) {
+ info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
+ GFP_KERNEL);
+ if (!info->entries)
+ return -ENOMEM;
+ }
+
+ trace->nr = 0;
+ trace->entries = info->entries;
+ unwind_user(trace, UNWIND_MAX_ENTRIES);
+
+ return 0;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+
+ memset(info, 0, sizeof(*info));
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+
+ kfree(info->entries);
+}
--
2.47.2
* [PATCH v7 08/18] unwind_user/deferred: Add unwind cache
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
Cache the results of the unwind to ensure the unwind is only performed
once, even when it is requested by multiple tracers.
The cached nr_entries gets cleared every time the task exits the
kernel. When a stacktrace is requested, nr_entries gets set to the
number of entries in the stacktrace. If another stacktrace is
requested while nr_entries is still non-zero, the cache already holds
the stacktrace that would be retrieved, so the unwind is skipped and
the cached entries are handed to the caller.
Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/linux/entry-common.h | 2 ++
include/linux/unwind_deferred.h | 7 +++++++
include/linux/unwind_deferred_types.h | 7 ++++++-
kernel/unwind/deferred.c | 27 ++++++++++++++++++++-------
4 files changed, 35 insertions(+), 8 deletions(-)
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index fc61d0205c97..725ec0e87cdd 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -12,6 +12,7 @@
#include <linux/resume_user_mode.h>
#include <linux/tick.h>
#include <linux/kmsan.h>
+#include <linux/unwind_deferred.h>
#include <asm/entry-common.h>
@@ -361,6 +362,7 @@ static __always_inline void exit_to_user_mode(void)
lockdep_hardirqs_on_prepare();
instrumentation_end();
+ unwind_exit_to_user_mode();
user_enter_irqoff();
arch_exit_to_user_mode();
lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 5064ebe38c4f..c2d760e5e257 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -12,6 +12,11 @@ void unwind_task_free(struct task_struct *task);
int unwind_deferred_trace(struct unwind_stacktrace *trace);
+static __always_inline void unwind_exit_to_user_mode(void)
+{
+ current->unwind_info.cache.nr_entries = 0;
+}
+
#else /* !CONFIG_UNWIND_USER */
static inline void unwind_task_init(struct task_struct *task) {}
@@ -19,6 +24,8 @@ static inline void unwind_task_free(struct task_struct *task) {}
static inline int unwind_deferred_trace(struct unwind_stacktrace *trace) { return -ENOSYS; }
+static inline void unwind_exit_to_user_mode(void) {}
+
#endif /* !CONFIG_UNWIND_USER */
#endif /* _LINUX_UNWIND_USER_DEFERRED_H */
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index aa32db574e43..b3b7389ee6eb 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -2,8 +2,13 @@
#ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
#define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
-struct unwind_task_info {
+struct unwind_cache {
unsigned long *entries;
+ unsigned int nr_entries;
+};
+
+struct unwind_task_info {
+ struct unwind_cache cache;
};
#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index 5a3789e38c00..89ed04b1c527 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -12,6 +12,7 @@
int unwind_deferred_trace(struct unwind_stacktrace *trace)
{
struct unwind_task_info *info = &current->unwind_info;
+ struct unwind_cache *cache = &info->cache;
/* Should always be called from faultable context */
might_fault();
@@ -19,17 +20,29 @@ int unwind_deferred_trace(struct unwind_stacktrace *trace)
if (current->flags & PF_EXITING)
return -EINVAL;
- if (!info->entries) {
- info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
- GFP_KERNEL);
- if (!info->entries)
- return -ENOMEM;
+ if (!cache->entries) {
+ cache->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
+ GFP_KERNEL);
+ if (!cache->entries)
+ return -ENOMEM;
+ }
+
+ trace->entries = cache->entries;
+
+ if (cache->nr_entries) {
+ /*
+ * The user stack has already been previously unwound in this
+ * entry context. Skip the unwind and use the cache.
+ */
+ trace->nr = cache->nr_entries;
+ return 0;
}
trace->nr = 0;
- trace->entries = info->entries;
unwind_user(trace, UNWIND_MAX_ENTRIES);
+ cache->nr_entries = trace->nr;
+
return 0;
}
@@ -44,5 +57,5 @@ void unwind_task_free(struct task_struct *task)
{
struct unwind_task_info *info = &task->unwind_info;
- kfree(info->entries);
+ kfree(info->cache.entries);
}
--
2.47.2
* [PATCH v7 09/18] perf: Remove get_perf_callchain() init_nr argument
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring, Namhyung Kim
From: Josh Poimboeuf <jpoimboe@kernel.org>
The 'init_nr' argument has double duty: it's used to initialize both the
number of contexts and the number of stack entries. That's confusing
and the callers always pass zero anyway. Hard code the zero.
Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/linux/perf_event.h | 2 +-
kernel/bpf/stackmap.c | 4 ++--
kernel/events/callchain.c | 12 ++++++------
kernel/events/core.c | 2 +-
4 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 947ad12dfdbe..3cc0b0ea0afa 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1651,7 +1651,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
-get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3615c06b7dfa..ec3a57a5fba1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
- trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
@@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else if (kernel && task)
trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ trace = get_perf_callchain(regs, kernel, user, max_depth,
crosstask, false);
if (unlikely(!trace) || trace->nr < skip) {
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6c83ad674d01..b0f5bd228cd8 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -217,7 +217,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
}
struct perf_callchain_entry *
-get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
struct perf_callchain_entry *entry;
@@ -228,11 +228,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
if (!entry)
return NULL;
- ctx.entry = entry;
- ctx.max_stack = max_stack;
- ctx.nr = entry->nr = init_nr;
- ctx.contexts = 0;
- ctx.contexts_maxed = false;
+ ctx.entry = entry;
+ ctx.max_stack = max_stack;
+ ctx.nr = entry->nr = 0;
+ ctx.contexts = 0;
+ ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) {
if (add_mark)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3c69a1a3f41c..67581babe9ba 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8110,7 +8110,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!kernel && !user)
return &__empty_callchain;
- callchain = get_perf_callchain(regs, 0, kernel, user,
+ callchain = get_perf_callchain(regs, kernel, user,
max_stack, crosstask, true);
return callchain ?: &__empty_callchain;
}
--
2.47.2
* [PATCH v7 10/18] perf: Have get_perf_callchain() return NULL if crosstask and user are set
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
get_perf_callchain() doesn't support cross-task unwinding for user
space stacks, so have it return NULL if both the crosstask and user
arguments are set.
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/events/callchain.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index b0f5bd228cd8..abf258913ab6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -224,6 +224,10 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
struct perf_callchain_entry_ctx ctx;
int rctx, start_entry_idx;
+ /* crosstask is not supported for user stacks */
+ if (crosstask && user)
+ return NULL;
+
entry = get_callchain_entry(&rctx);
if (!entry)
return NULL;
@@ -249,9 +253,6 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
}
if (regs) {
- if (crosstask)
- goto exit_put;
-
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
@@ -261,7 +262,6 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
}
}
-exit_put:
put_callchain_entry(rctx);
return entry;
--
2.47.2
* [PATCH v7 11/18] perf: Use current->flags & PF_KTHREAD instead of current->mm == NULL
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Steven Rostedt <rostedt@goodmis.org>
To determine whether a task is a kernel thread, it is more reliable
to check (current->flags & PF_KTHREAD) than to rely on current->mm
being NULL. That is because some kernel threads (io_uring helpers)
may have a non-NULL mm.
Link: https://lore.kernel.org/linux-trace-kernel/20250424163607.GE18306@noisy.programming.kicks-ass.net/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/events/callchain.c | 6 +++---
kernel/events/core.c | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index abf258913ab6..cda145dc11bd 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -246,10 +246,10 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
if (user) {
if (!user_mode(regs)) {
- if (current->mm)
- regs = task_pt_regs(current);
- else
+ if (current->flags & PF_KTHREAD)
regs = NULL;
+ else
+ regs = task_pt_regs(current);
}
if (regs) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67581babe9ba..430dd158b1ee 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7989,7 +7989,7 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
- if (current->mm != NULL) {
+ if (!(current->flags & PF_KTHREAD)) {
struct page *p;
pagefault_disable();
--
2.47.2
* [PATCH v7 12/18] perf: Simplify get_perf_callchain() user logic
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
Simplify the get_perf_callchain() user logic a bit. task_pt_regs()
should never be NULL for a user thread, so there is no need to check
the regs pointer before using it.
Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/events/callchain.c | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index cda145dc11bd..2798c0c9f782 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -247,21 +247,19 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
if (user) {
if (!user_mode(regs)) {
if (current->flags & PF_KTHREAD)
- regs = NULL;
- else
- regs = task_pt_regs(current);
+ goto exit_put;
+ regs = task_pt_regs(current);
}
- if (regs) {
- if (add_mark)
- perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+ if (add_mark)
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- start_entry_idx = entry->nr;
- perf_callchain_user(&ctx, regs);
- fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
- }
+ start_entry_idx = entry->nr;
+ perf_callchain_user(&ctx, regs);
+ fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
}
+exit_put:
put_callchain_entry(rctx);
return entry;
--
2.47.2
* [PATCH v7 13/18] perf: Skip user unwind if the task is a kernel thread
From: Steven Rostedt @ 2025-04-30 19:57 UTC
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
If the task is not a user thread, there's no user stack to unwind.
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/events/core.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 430dd158b1ee..ec9edf602974 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8101,7 +8101,8 @@ struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
- bool user = !event->attr.exclude_callchain_user;
+ bool user = !event->attr.exclude_callchain_user &&
+ !(current->flags & PF_KTHREAD);
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
const u32 max_stack = event->attr.sample_max_stack;
--
2.47.2
* [PATCH v7 14/18] perf: Support deferred user callchains
2025-04-30 19:57 [PATCH v7 00/18] perf: Deferred unwinding of user space stack traces Steven Rostedt
` (12 preceding siblings ...)
2025-04-30 19:57 ` [PATCH v7 13/18] perf: Skip user unwind if the task is a kernel thread Steven Rostedt
@ 2025-04-30 19:58 ` Steven Rostedt
2025-04-30 19:58 ` [PATCH v7 15/18] perf tools: Minimal CALLCHAIN_DEFERRED support Steven Rostedt
` (3 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Steven Rostedt @ 2025-04-30 19:58 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Josh Poimboeuf <jpoimboe@kernel.org>
Use the new unwind_deferred_trace() interface (if available) to defer
unwinds to task context. This will allow the use of .sframe (when it
becomes available) and also prevent duplicate userspace unwinds.
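End to end, the flow is: the NMI-time sample stores only the kernel
callchain plus a trailing PERF_CONTEXT_USER_DEFERRED marker, a task_work
unwinds the user stack just before the task returns to user space, and
the user entries arrive in a separate PERF_RECORD_CALLCHAIN_DEFERRED
record that consumers stitch back onto the earlier sample. A minimal
consumer-side sketch of that stitch (the helper name and the fixed-size
buffer are made up for illustration; the marker value matches the uapi
addition below):

	#include <stdint.h>
	#include <string.h>

	#define PERF_CONTEXT_USER_DEFERRED	((uint64_t)-640)

	/* Illustrative consumer-side view of one sample's callchain. */
	struct stitched_chain {
		uint64_t nr;
		uint64_t ips[1024];
	};

	/*
	 * Replace the trailing PERF_CONTEXT_USER_DEFERRED marker of the
	 * original (kernel-only) callchain with the entries of a later
	 * PERF_RECORD_CALLCHAIN_DEFERRED record from the same task.
	 */
	static void stitch_deferred(struct stitched_chain *orig,
				    const uint64_t *deferred_ips,
				    uint64_t deferred_nr)
	{
		if (orig->nr &&
		    orig->ips[orig->nr - 1] == PERF_CONTEXT_USER_DEFERRED)
			orig->nr--;		/* drop the marker */

		if (orig->nr + deferred_nr > 1024)
			return;			/* sketch: no room, skip */

		memcpy(&orig->ips[orig->nr], deferred_ips,
		       deferred_nr * sizeof(uint64_t));
		orig->nr += deferred_nr;
	}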
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v6: https://lore.kernel.org/20250425145814.033122445@goodmis.org
- Only defer unwind if event is attached to a specific task (not global per CPU)
- Changed a !current->mm to a (current->flags & PF_KTHREAD)
- Added a missing rcuwait_init(&event->pending_unwind_wait);
arch/Kconfig | 3 +
include/linux/perf_event.h | 7 +-
include/uapi/linux/perf_event.h | 19 ++-
kernel/bpf/stackmap.c | 4 +-
kernel/events/callchain.c | 11 +-
kernel/events/core.c | 168 +++++++++++++++++++++++++-
tools/include/uapi/linux/perf_event.h | 19 ++-
7 files changed, 223 insertions(+), 8 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index dbb1cc89e040..681946b5f2c4 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -446,6 +446,9 @@ config HAVE_UNWIND_USER_COMPAT_FP
bool
depends on HAVE_UNWIND_USER_FP
+config HAVE_PERF_CALLCHAIN_DEFERRED
+ bool
+
config HAVE_PERF_REGS
bool
help
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3cc0b0ea0afa..10603a8344d3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -62,6 +62,7 @@ struct perf_guest_info_callbacks {
#include <linux/security.h>
#include <linux/static_call.h>
#include <linux/lockdep.h>
+#include <linux/unwind_deferred.h>
#include <asm/local.h>
struct perf_callchain_entry {
@@ -830,6 +831,10 @@ struct perf_event {
struct callback_head pending_task;
unsigned int pending_work;
+ unsigned int pending_unwind_callback;
+ struct callback_head pending_unwind_work;
+ struct rcuwait pending_unwind_wait;
+
atomic_t event_limit;
/* address range filters */
@@ -1652,7 +1657,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
- u32 max_stack, bool crosstask, bool add_mark);
+ u32 max_stack, bool crosstask, bool add_mark, bool defer_user);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 5fc753c23734..65fe495c012e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -462,7 +462,8 @@ struct perf_event_attr {
inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
remove_on_exec : 1, /* event is removed from task on exec */
sigtrap : 1, /* send synchronous SIGTRAP on event */
- __reserved_1 : 26;
+ defer_callchain: 1, /* generate PERF_RECORD_CALLCHAIN_DEFERRED records */
+ __reserved_1 : 25;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -1228,6 +1229,21 @@ enum perf_event_type {
*/
PERF_RECORD_AUX_OUTPUT_HW_ID = 21,
+ /*
+ * This user callchain capture was deferred until shortly before
+ * returning to user space. Previous samples would have kernel
+ * callchains only and they need to be stitched with this to make full
+ * callchains.
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u64 nr;
+ * u64 ips[nr];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_CALLCHAIN_DEFERRED = 22,
+
PERF_RECORD_MAX, /* non-ABI */
};
@@ -1258,6 +1274,7 @@ enum perf_callchain_context {
PERF_CONTEXT_HV = (__u64)-32,
PERF_CONTEXT_KERNEL = (__u64)-128,
PERF_CONTEXT_USER = (__u64)-512,
+ PERF_CONTEXT_USER_DEFERRED = (__u64)-640,
PERF_CONTEXT_GUEST = (__u64)-2048,
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index ec3a57a5fba1..339f7cbbcf36 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
max_depth = sysctl_perf_event_max_stack;
trace = get_perf_callchain(regs, kernel, user, max_depth,
- false, false);
+ false, false, false);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
trace = get_callchain_entry_for_task(task, max_depth);
else
trace = get_perf_callchain(regs, kernel, user, max_depth,
- crosstask, false);
+ crosstask, false, false);
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 2798c0c9f782..50c637e960b9 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
- u32 max_stack, bool crosstask, bool add_mark)
+ u32 max_stack, bool crosstask, bool add_mark, bool defer_user)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,15 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
regs = task_pt_regs(current);
}
+ if (defer_user) {
+ /*
+ * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+ * which can be stitched to this one.
+ */
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+ goto exit_put;
+ }
+
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ec9edf602974..a5d9c6220589 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5537,6 +5537,89 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
+static void perf_pending_unwind_sync(struct perf_event *event)
+{
+ might_sleep();
+
+ if (!event->pending_unwind_callback)
+ return;
+
+ /*
+ * If the work is queued on the current task's queue, we
+ * obviously can't wait for it to complete. Simply cancel it.
+ */
+ if (task_work_cancel(current, &event->pending_unwind_work)) {
+ event->pending_unwind_callback = 0;
+ local_dec(&event->ctx->nr_no_switch_fast);
+ return;
+ }
+
+ /*
+ * All accesses related to the event are within the same RCU section in
+ * perf_event_callchain_deferred(). The RCU grace period before the
+ * event is freed will make sure all those accesses are complete by then.
+ */
+ rcuwait_wait_event(&event->pending_unwind_wait, !event->pending_unwind_callback, TASK_UNINTERRUPTIBLE);
+}
+
+struct perf_callchain_deferred_event {
+ struct perf_event_header header;
+ u64 nr;
+ u64 ips[];
+};
+
+static void perf_event_callchain_deferred(struct callback_head *work)
+{
+ struct perf_event *event = container_of(work, struct perf_event, pending_unwind_work);
+ struct perf_callchain_deferred_event deferred_event;
+ u64 callchain_context = PERF_CONTEXT_USER;
+ struct unwind_stacktrace trace;
+ struct perf_output_handle handle;
+ struct perf_sample_data data;
+ u64 nr;
+
+ if (!event->pending_unwind_callback)
+ return;
+
+ if (unwind_deferred_trace(&trace) < 0)
+ goto out;
+
+ /*
+ * All accesses to the event must belong to the same implicit RCU
+ * read-side critical section as the ->pending_unwind_callback reset.
+ * See comment in perf_pending_unwind_sync().
+ */
+ guard(rcu)();
+
+ if (current->flags & PF_KTHREAD)
+ goto out;
+
+ nr = trace.nr + 1; /* '+1' == callchain_context */
+
+ deferred_event.header.type = PERF_RECORD_CALLCHAIN_DEFERRED;
+ deferred_event.header.misc = PERF_RECORD_MISC_USER;
+ deferred_event.header.size = sizeof(deferred_event) + (nr * sizeof(u64));
+
+ deferred_event.nr = nr;
+
+ perf_event_header__init_id(&deferred_event.header, &data, event);
+
+ if (perf_output_begin(&handle, &data, event, deferred_event.header.size))
+ goto out;
+
+ perf_output_put(&handle, deferred_event);
+ perf_output_put(&handle, callchain_context);
+ perf_output_copy(&handle, trace.entries, trace.nr * sizeof(u64));
+ perf_event__output_id_sample(event, &handle, &data);
+
+ perf_output_end(&handle);
+
+out:
+ event->pending_unwind_callback = 0;
+ local_dec(&event->ctx->nr_no_switch_fast);
+ rcuwait_wake_up(&event->pending_unwind_wait);
+}
+
static void perf_free_addr_filters(struct perf_event *event);
/* vs perf_event_alloc() error */
@@ -5604,6 +5687,7 @@ static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending_irq);
irq_work_sync(&event->pending_disable_irq);
+ perf_pending_unwind_sync(event);
unaccount_event(event);
@@ -8097,6 +8181,65 @@ static u64 perf_get_page_size(unsigned long addr)
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
+/* Returns the same as deferred_request() below */
+static int deferred_request_nmi(struct perf_event *event)
+{
+ struct callback_head *work = &event->pending_unwind_work;
+ int ret;
+
+ if (event->pending_unwind_callback)
+ return 1;
+
+ ret = task_work_add(current, work, TWA_NMI_CURRENT);
+ if (ret)
+ return ret;
+
+ event->pending_unwind_callback = 1;
+ return 0;
+}
+
+/*
+ * Returns:
+ * > 0 : if already queued.
+ * 0 : if it performed the queuing.
+ * < 0 : if it did not get queued.
+ */
+static int deferred_request(struct perf_event *event)
+{
+ struct callback_head *work = &event->pending_unwind_work;
+ int pending;
+ int ret;
+
+ /* Only defer for task events */
+ if (!event->ctx->task)
+ return -EINVAL;
+
+ if ((current->flags & PF_KTHREAD) || !user_mode(task_pt_regs(current)))
+ return -EINVAL;
+
+ if (in_nmi())
+ return deferred_request_nmi(event);
+
+ guard(irqsave)();
+
+ /* callback already pending? */
+ pending = READ_ONCE(event->pending_unwind_callback);
+ if (pending)
+ return 1;
+
+ /* Claim the work unless an NMI just now swooped in to do so. */
+ if (!try_cmpxchg(&event->pending_unwind_callback, &pending, 1))
+ return 1;
+
+ /* The work has been claimed, now schedule it. */
+ ret = task_work_add(current, work, TWA_RESUME);
+ if (WARN_ON_ONCE(ret)) {
+ WRITE_ONCE(event->pending_unwind_callback, 0);
+ return ret;
+ }
+ return 0;
+}
+
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
@@ -8107,12 +8250,27 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
bool crosstask = event->ctx->task && event->ctx->task != current;
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+ event->attr.defer_callchain;
if (!kernel && !user)
return &__empty_callchain;
- callchain = get_perf_callchain(regs, kernel, user,
- max_stack, crosstask, true);
+ /* Disallow cross-task callchains. */
+ if (event->ctx->task && event->ctx->task != current)
+ return &__empty_callchain;
+
+ if (defer_user) {
+ int ret = deferred_request(event);
+ if (!ret)
+ local_inc(&event->ctx->nr_no_switch_fast);
+ else if (ret < 0)
+ defer_user = false;
+ }
+
+ callchain = get_perf_callchain(regs, kernel, user, max_stack,
+ crosstask, true, defer_user);
+
return callchain ?: &__empty_callchain;
}
@@ -12776,6 +12934,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
init_task_work(&event->pending_task, perf_pending_task);
+ rcuwait_init(&event->pending_unwind_wait);
+
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
@@ -12944,6 +13104,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (err)
return ERR_PTR(err);
+ if (event->attr.defer_callchain)
+ init_task_work(&event->pending_unwind_work,
+ perf_event_callchain_deferred);
+
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 5fc753c23734..65fe495c012e 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -462,7 +462,8 @@ struct perf_event_attr {
inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */
remove_on_exec : 1, /* event is removed from task on exec */
sigtrap : 1, /* send synchronous SIGTRAP on event */
- __reserved_1 : 26;
+ defer_callchain: 1, /* generate PERF_RECORD_CALLCHAIN_DEFERRED records */
+ __reserved_1 : 25;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -1228,6 +1229,21 @@ enum perf_event_type {
*/
PERF_RECORD_AUX_OUTPUT_HW_ID = 21,
+ /*
+ * This user callchain capture was deferred until shortly before
+ * returning to user space. Previous samples would have kernel
+ * callchains only and they need to be stitched with this to make full
+ * callchains.
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u64 nr;
+ * u64 ips[nr];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_CALLCHAIN_DEFERRED = 22,
+
PERF_RECORD_MAX, /* non-ABI */
};
@@ -1258,6 +1274,7 @@ enum perf_callchain_context {
PERF_CONTEXT_HV = (__u64)-32,
PERF_CONTEXT_KERNEL = (__u64)-128,
PERF_CONTEXT_USER = (__u64)-512,
+ PERF_CONTEXT_USER_DEFERRED = (__u64)-640,
PERF_CONTEXT_GUEST = (__u64)-2048,
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
--
2.47.2
* [PATCH v7 15/18] perf tools: Minimal CALLCHAIN_DEFERRED support
2025-04-30 19:57 [PATCH v7 00/18] perf: Deferred unwinding of user space stack traces Steven Rostedt
` (13 preceding siblings ...)
2025-04-30 19:58 ` [PATCH v7 14/18] perf: Support deferred user callchains Steven Rostedt
@ 2025-04-30 19:58 ` Steven Rostedt
2025-04-30 19:58 ` [PATCH v7 16/18] perf record: Enable defer_callchain for user callchains Steven Rostedt
` (2 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Steven Rostedt @ 2025-04-30 19:58 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Namhyung Kim <namhyung@kernel.org>
Add a new event type for deferred callchains and a new callback for the
struct perf_tool. For now it doesn't actually handle the deferred
callchains but it just marks the sample if it has the
PERF_CONTEXT_USER_DEFERRED marker in the callchain array.
At least, perf report can dump the raw data with this change. Actually
this requires the next commit to enable attr.defer_callchain, but if you
already have a data file, it'll show the following result.
$ perf report -D
...
0x5fe0@perf.data [0x40]: event: 22
.
. ... raw event: size 64 bytes
. 0000: 16 00 00 00 02 00 40 00 02 00 00 00 00 00 00 00 ......@.........
. 0010: 00 fe ff ff ff ff ff ff 4b d3 3f 25 45 7f 00 00 ........K.?%E...
. 0020: 21 03 00 00 21 03 00 00 43 02 12 ab 05 00 00 00 !...!...C.......
. 0030: 00 00 00 00 00 00 00 00 09 00 00 00 00 00 00 00 ................
0 24344920643 0x5fe0 [0x40]: PERF_RECORD_CALLCHAIN_DEFERRED(IP, 0x2): 801/801: 0
... FP chain: nr:2
..... 0: fffffffffffffe00
..... 1: 00007f45253fd34b
: unhandled!
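The marking itself is a one-line check; a standalone sketch of the
predicate that the evsel__parse_sample() change below implements
(struct ip_chain is a stand-in for the real struct ip_callchain):

	#include <stdbool.h>
	#include <stdint.h>

	#define PERF_CONTEXT_USER_DEFERRED	((uint64_t)-640)

	struct ip_chain {
		uint64_t nr;
		uint64_t ips[];
	};

	/*
	 * A sample carries a deferred user callchain when its FP chain
	 * ends with the PERF_CONTEXT_USER_DEFERRED marker.
	 */
	static bool chain_is_deferred(const struct ip_chain *chain)
	{
		return chain->nr >= 1 &&
		       chain->ips[chain->nr - 1] == PERF_CONTEXT_USER_DEFERRED;
	}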
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tools/lib/perf/include/perf/event.h | 7 +++++++
tools/perf/util/event.c | 1 +
tools/perf/util/evsel.c | 15 +++++++++++++++
tools/perf/util/machine.c | 1 +
tools/perf/util/perf_event_attr_fprintf.c | 1 +
tools/perf/util/sample.h | 3 ++-
tools/perf/util/session.c | 17 +++++++++++++++++
tools/perf/util/tool.c | 1 +
tools/perf/util/tool.h | 3 ++-
9 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
index 37bb7771d914..f643a6a2b9fc 100644
--- a/tools/lib/perf/include/perf/event.h
+++ b/tools/lib/perf/include/perf/event.h
@@ -151,6 +151,12 @@ struct perf_record_switch {
__u32 next_prev_tid;
};
+struct perf_record_callchain_deferred {
+ struct perf_event_header header;
+ __u64 nr;
+ __u64 ips[];
+};
+
struct perf_record_header_attr {
struct perf_event_header header;
struct perf_event_attr attr;
@@ -494,6 +500,7 @@ union perf_event {
struct perf_record_read read;
struct perf_record_throttle throttle;
struct perf_record_sample sample;
+ struct perf_record_callchain_deferred callchain_deferred;
struct perf_record_bpf_event bpf;
struct perf_record_ksymbol ksymbol;
struct perf_record_text_poke_event text_poke;
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index c23b77f8f854..fec86519b7d4 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -58,6 +58,7 @@ static const char *perf_event__names[] = {
[PERF_RECORD_CGROUP] = "CGROUP",
[PERF_RECORD_TEXT_POKE] = "TEXT_POKE",
[PERF_RECORD_AUX_OUTPUT_HW_ID] = "AUX_OUTPUT_HW_ID",
+ [PERF_RECORD_CALLCHAIN_DEFERRED] = "CALLCHAIN_DEFERRED",
[PERF_RECORD_HEADER_ATTR] = "ATTR",
[PERF_RECORD_HEADER_EVENT_TYPE] = "EVENT_TYPE",
[PERF_RECORD_HEADER_TRACING_DATA] = "TRACING_DATA",
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 3c030da2e477..b872236a2413 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2948,6 +2948,18 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
data->data_src = PERF_MEM_DATA_SRC_NONE;
data->vcpu = -1;
+ if (event->header.type == PERF_RECORD_CALLCHAIN_DEFERRED) {
+ const u64 max_callchain_nr = UINT64_MAX / sizeof(u64);
+
+ data->callchain = (struct ip_callchain *)&event->callchain_deferred.nr;
+ if (data->callchain->nr > max_callchain_nr)
+ return -EFAULT;
+
+ if (evsel->core.attr.sample_id_all)
+ perf_evsel__parse_id_sample(evsel, event, data);
+ return 0;
+ }
+
if (event->header.type != PERF_RECORD_SAMPLE) {
if (!evsel->core.attr.sample_id_all)
return 0;
@@ -3078,6 +3090,9 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
if (data->callchain->nr > max_callchain_nr)
return -EFAULT;
sz = data->callchain->nr * sizeof(u64);
+ if (evsel->core.attr.defer_callchain && data->callchain->nr >= 1 &&
+ data->callchain->ips[data->callchain->nr - 1] == PERF_CONTEXT_USER_DEFERRED)
+ data->deferred_callchain = true;
OVERFLOW_CHECK(array, sz, max_size);
array = (void *)array + sz;
}
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 2531b373f2cf..df76adce89ff 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2089,6 +2089,7 @@ static int add_callchain_ip(struct thread *thread,
*cpumode = PERF_RECORD_MISC_KERNEL;
break;
case PERF_CONTEXT_USER:
+ case PERF_CONTEXT_USER_DEFERRED:
*cpumode = PERF_RECORD_MISC_USER;
break;
default:
diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c
index 66b666d9ce64..abfd9b9a718c 100644
--- a/tools/perf/util/perf_event_attr_fprintf.c
+++ b/tools/perf/util/perf_event_attr_fprintf.c
@@ -343,6 +343,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
PRINT_ATTRf(inherit_thread, p_unsigned);
PRINT_ATTRf(remove_on_exec, p_unsigned);
PRINT_ATTRf(sigtrap, p_unsigned);
+ PRINT_ATTRf(defer_callchain, p_unsigned);
PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned, false);
PRINT_ATTRf(bp_type, p_unsigned);
diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index 0e96240052e9..9d6e2f14551c 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -108,7 +108,8 @@ struct perf_sample {
u16 p_stage_cyc;
u16 retire_lat;
};
- bool no_hw_idx; /* No hw_idx collected in branch_stack */
+ bool no_hw_idx; /* No hw_idx collected in branch_stack */
+ bool deferred_callchain; /* Has deferred user callchains */
char insn[MAX_INSN];
void *raw_data;
struct ip_callchain *callchain;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 60fb9997ea0d..30fb1d281be8 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -715,6 +715,7 @@ static perf_event__swap_op perf_event__swap_ops[] = {
[PERF_RECORD_CGROUP] = perf_event__cgroup_swap,
[PERF_RECORD_TEXT_POKE] = perf_event__text_poke_swap,
[PERF_RECORD_AUX_OUTPUT_HW_ID] = perf_event__all64_swap,
+ [PERF_RECORD_CALLCHAIN_DEFERRED] = perf_event__all64_swap,
[PERF_RECORD_HEADER_ATTR] = perf_event__hdr_attr_swap,
[PERF_RECORD_HEADER_EVENT_TYPE] = perf_event__event_type_swap,
[PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap,
@@ -1118,6 +1119,19 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
sample_read__printf(sample, evsel->core.attr.read_format);
}
+static void dump_deferred_callchain(struct evsel *evsel, union perf_event *event,
+ struct perf_sample *sample)
+{
+ if (!dump_trace)
+ return;
+
+ printf("(IP, 0x%x): %d/%d: %#" PRIx64 "\n",
+ event->header.misc, sample->pid, sample->tid, sample->ip);
+
+ if (evsel__has_callchain(evsel))
+ callchain__printf(evsel, sample);
+}
+
static void dump_read(struct evsel *evsel, union perf_event *event)
{
struct perf_record_read *read_event = &event->read;
@@ -1348,6 +1362,9 @@ static int machines__deliver_event(struct machines *machines,
return tool->text_poke(tool, event, sample, machine);
case PERF_RECORD_AUX_OUTPUT_HW_ID:
return tool->aux_output_hw_id(tool, event, sample, machine);
+ case PERF_RECORD_CALLCHAIN_DEFERRED:
+ dump_deferred_callchain(evsel, event, sample);
+ return tool->callchain_deferred(tool, event, sample, evsel, machine);
default:
++evlist->stats.nr_unknown_events;
return -1;
diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c
index 3b7f390f26eb..e78f16de912e 100644
--- a/tools/perf/util/tool.c
+++ b/tools/perf/util/tool.c
@@ -259,6 +259,7 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
tool->read = process_event_sample_stub;
tool->throttle = process_event_stub;
tool->unthrottle = process_event_stub;
+ tool->callchain_deferred = process_event_sample_stub;
tool->attr = process_event_synth_attr_stub;
tool->event_update = process_event_synth_event_update_stub;
tool->tracing_data = process_event_synth_tracing_data_stub;
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index db1c7642b0d1..9987bbde6d5e 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -42,7 +42,8 @@ enum show_feature_header {
struct perf_tool {
event_sample sample,
- read;
+ read,
+ callchain_deferred;
event_op mmap,
mmap2,
comm,
--
2.47.2
* [PATCH v7 16/18] perf record: Enable defer_callchain for user callchains
2025-04-30 19:57 [PATCH v7 00/18] perf: Deferred unwinding of user space stack traces Steven Rostedt
` (14 preceding siblings ...)
2025-04-30 19:58 ` [PATCH v7 15/18] perf tools: Minimal CALLCHAIN_DEFERRED support Steven Rostedt
@ 2025-04-30 19:58 ` Steven Rostedt
2025-04-30 19:58 ` [PATCH v7 17/18] perf script: Display PERF_RECORD_CALLCHAIN_DEFERRED Steven Rostedt
2025-04-30 19:58 ` [PATCH v7 18/18] perf tools: Merge deferred user callchains Steven Rostedt
17 siblings, 0 replies; 20+ messages in thread
From: Steven Rostedt @ 2025-04-30 19:58 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Namhyung Kim <namhyung@kernel.org>
Also add the missing feature detection logic to clear the flag on old
kernels.
$ perf record -g -vv true
...
------------------------------------------------------------
perf_event_attr:
type 0 (PERF_TYPE_HARDWARE)
size 136
config 0 (PERF_COUNT_HW_CPU_CYCLES)
{ sample_period, sample_freq } 4000
sample_type IP|TID|TIME|CALLCHAIN|PERIOD
read_format ID|LOST
disabled 1
inherit 1
mmap 1
comm 1
freq 1
enable_on_exec 1
task 1
sample_id_all 1
mmap2 1
comm_exec 1
ksymbol 1
bpf_event 1
defer_callchain 1
------------------------------------------------------------
sys_perf_event_open: pid 162755 cpu 0 group_fd -1 flags 0x8
sys_perf_event_open failed, error -22
switching off deferred callchain support
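The probe follows perf's usual missing-feature pattern, as in the
evsel__detect_missing_features() hunk below: optimistically set the bit
on a throwaway event, try perf_event_open(), and clear the bit if the
kernel rejects it. A standalone sketch of that pattern (the helper is
made up; it assumes a uapi header that already carries the
defer_callchain bit):

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Returns 1 if the running kernel accepts attr.defer_callchain. */
	static int defer_callchain_supported(void)
	{
		struct perf_event_attr attr;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_SOFTWARE;
		attr.config = PERF_COUNT_SW_DUMMY;
		attr.sample_type = PERF_SAMPLE_CALLCHAIN;
		attr.defer_callchain = 1;	/* the new bit under test */

		fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0)
			return 0;	/* old kernel: clear the flag */
		close(fd);
		return 1;
	}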
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tools/perf/util/evsel.c | 24 ++++++++++++++++++++++++
tools/perf/util/evsel.h | 1 +
2 files changed, 25 insertions(+)
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index b872236a2413..669e585dedee 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1076,6 +1076,14 @@ static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *o
}
}
+ if (param->record_mode == CALLCHAIN_FP && !attr->exclude_callchain_user) {
+ /*
+ * Enable deferred callchains optimistically. It'll be switched
+ * off later if the kernel doesn't support it.
+ */
+ attr->defer_callchain = 1;
+ }
+
if (function) {
pr_info("Disabling user space callchains for function trace event.\n");
attr->exclude_callchain_user = 1;
@@ -2123,6 +2131,8 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus,
static void evsel__disable_missing_features(struct evsel *evsel)
{
+ if (perf_missing_features.defer_callchain)
+ evsel->core.attr.defer_callchain = 0;
if (perf_missing_features.inherit_sample_read && evsel->core.attr.inherit &&
(evsel->core.attr.sample_type & PERF_SAMPLE_READ))
evsel->core.attr.inherit = 0;
@@ -2397,6 +2407,15 @@ static bool evsel__detect_missing_features(struct evsel *evsel, struct perf_cpu
/* Please add new feature detection here. */
+ attr.defer_callchain = true;
+ attr.sample_type = PERF_SAMPLE_CALLCHAIN;
+ if (has_attr_feature(&attr, /*flags=*/0))
+ goto found;
+ perf_missing_features.defer_callchain = true;
+ pr_debug2("switching off deferred callchain support\n");
+ attr.defer_callchain = false;
+ attr.sample_type = 0;
+
attr.inherit = true;
attr.sample_type = PERF_SAMPLE_READ;
if (has_attr_feature(&attr, /*flags=*/0))
@@ -2508,6 +2527,11 @@ static bool evsel__detect_missing_features(struct evsel *evsel, struct perf_cpu
errno = old_errno;
check:
+ if (evsel->core.attr.defer_callchain &&
+ evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN &&
+ perf_missing_features.defer_callchain)
+ return true;
+
if (evsel->core.attr.inherit &&
(evsel->core.attr.sample_type & PERF_SAMPLE_READ) &&
perf_missing_features.inherit_sample_read)
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index aae431d63d64..7ded99c774c7 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -211,6 +211,7 @@ struct perf_missing_features {
bool branch_counters;
bool aux_action;
bool inherit_sample_read;
+ bool defer_callchain;
};
extern struct perf_missing_features perf_missing_features;
--
2.47.2
* [PATCH v7 17/18] perf script: Display PERF_RECORD_CALLCHAIN_DEFERRED
2025-04-30 19:57 [PATCH v7 00/18] perf: Deferred unwinding of user space stack traces Steven Rostedt
` (15 preceding siblings ...)
2025-04-30 19:58 ` [PATCH v7 16/18] perf record: Enable defer_callchain for user callchains Steven Rostedt
@ 2025-04-30 19:58 ` Steven Rostedt
2025-04-30 19:58 ` [PATCH v7 18/18] perf tools: Merge deferred user callchains Steven Rostedt
17 siblings, 0 replies; 20+ messages in thread
From: Steven Rostedt @ 2025-04-30 19:58 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Namhyung Kim <namhyung@kernel.org>
Handle the deferred callchains in the script output.
$ perf script
perf 801 [000] 18.031793: 1 cycles:P:
ffffffff91a14c36 __intel_pmu_enable_all.isra.0+0x56 ([kernel.kallsyms])
ffffffff91d373e9 perf_ctx_enable+0x39 ([kernel.kallsyms])
ffffffff91d36af7 event_function+0xd7 ([kernel.kallsyms])
ffffffff91d34222 remote_function+0x42 ([kernel.kallsyms])
ffffffff91c1ebe1 generic_exec_single+0x61 ([kernel.kallsyms])
ffffffff91c1edac smp_call_function_single+0xec ([kernel.kallsyms])
ffffffff91d37a9d event_function_call+0x10d ([kernel.kallsyms])
ffffffff91d33557 perf_event_for_each_child+0x37 ([kernel.kallsyms])
ffffffff91d47324 _perf_ioctl+0x204 ([kernel.kallsyms])
ffffffff91d47c43 perf_ioctl+0x33 ([kernel.kallsyms])
ffffffff91e2f216 __x64_sys_ioctl+0x96 ([kernel.kallsyms])
ffffffff9265f1ae do_syscall_64+0x9e ([kernel.kallsyms])
ffffffff92800130 entry_SYSCALL_64+0xb0 ([kernel.kallsyms])
perf 801 [000] 18.031814: DEFERRED CALLCHAIN
7fb5fc22034b __GI___ioctl+0x3b (/usr/lib/x86_64-linux-gnu/libc.so.6)
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tools/perf/builtin-script.c | 89 +++++++++++++++++++++++++++++++++++++
1 file changed, 89 insertions(+)
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 9b16df881af8..176b8f299afc 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2483,6 +2483,93 @@ static int process_sample_event(const struct perf_tool *tool,
return ret;
}
+static int process_deferred_sample_event(const struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct evsel *evsel,
+ struct machine *machine)
+{
+ struct perf_script *scr = container_of(tool, struct perf_script, tool);
+ struct perf_event_attr *attr = &evsel->core.attr;
+ struct evsel_script *es = evsel->priv;
+ unsigned int type = output_type(attr->type);
+ struct addr_location al;
+ FILE *fp = es->fp;
+ int ret = 0;
+
+ if (output[type].fields == 0)
+ return 0;
+
+ /* Set thread to NULL to indicate addr_al and al are not initialized */
+ addr_location__init(&al);
+
+ if (perf_time__ranges_skip_sample(scr->ptime_range, scr->range_num,
+ sample->time)) {
+ goto out_put;
+ }
+
+ if (debug_mode) {
+ if (sample->time < last_timestamp) {
+ pr_err("Samples misordered, previous: %" PRIu64
+ " this: %" PRIu64 "\n", last_timestamp,
+ sample->time);
+ nr_unordered++;
+ }
+ last_timestamp = sample->time;
+ goto out_put;
+ }
+
+ if (filter_cpu(sample))
+ goto out_put;
+
+ if (machine__resolve(machine, &al, sample) < 0) {
+ pr_err("problem processing %d event, skipping it.\n",
+ event->header.type);
+ ret = -1;
+ goto out_put;
+ }
+
+ if (al.filtered)
+ goto out_put;
+
+ if (!show_event(sample, evsel, al.thread, &al, NULL))
+ goto out_put;
+
+ if (evswitch__discard(&scr->evswitch, evsel))
+ goto out_put;
+
+ perf_sample__fprintf_start(scr, sample, al.thread, evsel,
+ PERF_RECORD_CALLCHAIN_DEFERRED, fp);
+ fprintf(fp, "DEFERRED CALLCHAIN");
+
+ if (PRINT_FIELD(IP)) {
+ struct callchain_cursor *cursor = NULL;
+
+ if (symbol_conf.use_callchain && sample->callchain) {
+ cursor = get_tls_callchain_cursor();
+ if (thread__resolve_callchain(al.thread, cursor, evsel,
+ sample, NULL, NULL,
+ scripting_max_stack)) {
+ pr_info("cannot resolve deferred callchains\n");
+ cursor = NULL;
+ }
+ }
+
+ fputc(cursor ? '\n' : ' ', fp);
+ sample__fprintf_sym(sample, &al, 0, output[type].print_ip_opts,
+ cursor, symbol_conf.bt_stop_list, fp);
+ }
+
+ fprintf(fp, "\n");
+
+ if (verbose > 0)
+ fflush(fp);
+
+out_put:
+ addr_location__exit(&al);
+ return ret;
+}
+
// Used when scr->per_event_dump is not set
static struct evsel_script es_stdout;
@@ -4069,6 +4156,7 @@ int cmd_script(int argc, const char **argv)
perf_tool__init(&script.tool, !unsorted_dump);
script.tool.sample = process_sample_event;
+ script.tool.callchain_deferred = process_deferred_sample_event;
script.tool.mmap = perf_event__process_mmap;
script.tool.mmap2 = perf_event__process_mmap2;
script.tool.comm = perf_event__process_comm;
@@ -4095,6 +4183,7 @@ int cmd_script(int argc, const char **argv)
script.tool.throttle = process_throttle_event;
script.tool.unthrottle = process_throttle_event;
script.tool.ordering_requires_timestamps = true;
+ script.tool.merge_deferred_callchains = false;
session = perf_session__new(&data, &script.tool);
if (IS_ERR(session))
return PTR_ERR(session);
--
2.47.2
* [PATCH v7 18/18] perf tools: Merge deferred user callchains
2025-04-30 19:57 [PATCH v7 00/18] perf: Deferred unwinding of user space stack traces Steven Rostedt
` (16 preceding siblings ...)
2025-04-30 19:58 ` [PATCH v7 17/18] perf script: Display PERF_RECORD_CALLCHAIN_DEFERRED Steven Rostedt
@ 2025-04-30 19:58 ` Steven Rostedt
2025-04-30 20:07 ` Ian Rogers
17 siblings, 1 reply; 20+ messages in thread
From: Steven Rostedt @ 2025-04-30 19:58 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Josh Poimboeuf, x86, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Indu Bhagat, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Ian Rogers, Adrian Hunter,
linux-perf-users, Mark Brown, linux-toolchains, Jordan Rome,
Sam James, Andrii Nakryiko, Jens Remus, Florian Weimer,
Andy Lutomirski, Weinan Liu, Blake Jones, Beau Belgrave,
Jose E. Marchesi, Alexander Aring
From: Namhyung Kim <namhyung@kernel.org>
Save samples with deferred callchains in a separate list and deliver
them after merging the user callchains. If users don't want to merge
they can set tool->merge_deferred_callchains to false to prevent the
behavior.
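The pairing rule is: a SAMPLE whose callchain ends in
PERF_CONTEXT_USER_DEFERRED is parked on a per-evlist list, and a later
CALLCHAIN_DEFERRED record flushes every parked sample with the same tid,
merged. A reduced sketch of that pairing (hypothetical types, no error
handling; the real code keeps FIFO order with list_add_tail() while this
sketch pushes at the head for brevity):

	#include <stdint.h>
	#include <stdlib.h>

	struct parked_sample {
		struct parked_sample *next;
		uint64_t tid;
		void *event;		/* the original SAMPLE record */
	};

	static struct parked_sample *deferred_samples;

	/* SAMPLE with a trailing PERF_CONTEXT_USER_DEFERRED: park it. */
	static void park_sample(uint64_t tid, void *event)
	{
		struct parked_sample *ps = malloc(sizeof(*ps));

		ps->tid = tid;
		ps->event = event;
		ps->next = deferred_samples;
		deferred_samples = ps;
	}

	/* CALLCHAIN_DEFERRED for @tid: merge and deliver every match. */
	static void flush_deferred(uint64_t tid,
				   void (*deliver_merged)(void *event))
	{
		struct parked_sample **pp = &deferred_samples;

		while (*pp) {
			struct parked_sample *ps = *pp;

			if (ps->tid != tid) {
				pp = &ps->next;
				continue;
			}
			deliver_merged(ps->event);
			*pp = ps->next;
			free(ps);
		}
	}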
With the previous result, perf script will now show the merged callchains.
$ perf script
perf 801 [000] 18.031793: 1 cycles:P:
ffffffff91a14c36 __intel_pmu_enable_all.isra.0+0x56 ([kernel.kallsyms])
ffffffff91d373e9 perf_ctx_enable+0x39 ([kernel.kallsyms])
ffffffff91d36af7 event_function+0xd7 ([kernel.kallsyms])
ffffffff91d34222 remote_function+0x42 ([kernel.kallsyms])
ffffffff91c1ebe1 generic_exec_single+0x61 ([kernel.kallsyms])
ffffffff91c1edac smp_call_function_single+0xec ([kernel.kallsyms])
ffffffff91d37a9d event_function_call+0x10d ([kernel.kallsyms])
ffffffff91d33557 perf_event_for_each_child+0x37 ([kernel.kallsyms])
ffffffff91d47324 _perf_ioctl+0x204 ([kernel.kallsyms])
ffffffff91d47c43 perf_ioctl+0x33 ([kernel.kallsyms])
ffffffff91e2f216 __x64_sys_ioctl+0x96 ([kernel.kallsyms])
ffffffff9265f1ae do_syscall_64+0x9e ([kernel.kallsyms])
ffffffff92800130 entry_SYSCALL_64+0xb0 ([kernel.kallsyms])
7fb5fc22034b __GI___ioctl+0x3b (/usr/lib/x86_64-linux-gnu/libc.so.6)
...
The old output can be obtained using the --no-merge-callchains option.
Also, perf report now shows the user callchain entry at the end.
$ perf report --no-children --percent-limit=0 --stdio -q -S __intel_pmu_enable_all.isra.0
# symbol: __intel_pmu_enable_all.isra.0
0.00% perf [kernel.kallsyms]
|
---__intel_pmu_enable_all.isra.0
perf_ctx_enable
event_function
remote_function
generic_exec_single
smp_call_function_single
event_function_call
perf_event_for_each_child
_perf_ioctl
perf_ioctl
__x64_sys_ioctl
do_syscall_64
entry_SYSCALL_64
__GI___ioctl
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
tools/perf/Documentation/perf-script.txt | 5 ++
tools/perf/builtin-script.c | 5 +-
tools/perf/util/callchain.c | 24 +++++++++
tools/perf/util/callchain.h | 3 ++
tools/perf/util/evlist.c | 1 +
tools/perf/util/evlist.h | 1 +
tools/perf/util/session.c | 63 +++++++++++++++++++++++-
tools/perf/util/tool.c | 1 +
tools/perf/util/tool.h | 1 +
9 files changed, 102 insertions(+), 2 deletions(-)
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 28bec7e78bc8..03d112960632 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -527,6 +527,11 @@ include::itrace.txt[]
The known limitations include exception handing such as
setjmp/longjmp will have calls/returns not match.
+--merge-callchains::
+ Enable merging deferred user callchains if available. This is the
+ default behavior. If you want to see separate CALLCHAIN_DEFERRED
+ records for some reason, use --no-merge-callchains explicitly.
+
:GMEXAMPLECMD: script
:GMEXAMPLESUBCMD:
include::guest-files.txt[]
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 176b8f299afc..dd17c11af0c8 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -3775,6 +3775,7 @@ int cmd_script(int argc, const char **argv)
bool header_only = false;
bool script_started = false;
bool unsorted_dump = false;
+ bool merge_deferred_callchains = true;
char *rec_script_path = NULL;
char *rep_script_path = NULL;
struct perf_session *session;
@@ -3928,6 +3929,8 @@ int cmd_script(int argc, const char **argv)
"Guest code can be found in hypervisor process"),
OPT_BOOLEAN('\0', "stitch-lbr", &script.stitch_lbr,
"Enable LBR callgraph stitching approach"),
+ OPT_BOOLEAN('\0', "merge-callchains", &merge_deferred_callchains,
+ "Enable merge deferred user callchains"),
OPTS_EVSWITCH(&script.evswitch),
OPT_END()
};
@@ -4183,7 +4186,7 @@ int cmd_script(int argc, const char **argv)
script.tool.throttle = process_throttle_event;
script.tool.unthrottle = process_throttle_event;
script.tool.ordering_requires_timestamps = true;
- script.tool.merge_deferred_callchains = false;
+ script.tool.merge_deferred_callchains = merge_deferred_callchains;
session = perf_session__new(&data, &script.tool);
if (IS_ERR(session))
return PTR_ERR(session);
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index d7b7eef740b9..6d423d92861b 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1828,3 +1828,27 @@ int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel,
}
return 0;
}
+
+int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
+ struct perf_sample *sample_callchain)
+{
+ u64 nr_orig = sample_orig->callchain->nr - 1;
+ u64 nr_deferred = sample_callchain->callchain->nr;
+ struct ip_callchain *callchain;
+
+ callchain = calloc(1 + nr_orig + nr_deferred, sizeof(u64));
+ if (callchain == NULL) {
+ sample_orig->deferred_callchain = false;
+ return -ENOMEM;
+ }
+
+ callchain->nr = nr_orig + nr_deferred;
+ /* copy except for the last PERF_CONTEXT_USER_DEFERRED */
+ memcpy(callchain->ips, sample_orig->callchain->ips, nr_orig * sizeof(u64));
+ /* copy deferred user callchains */
+ memcpy(&callchain->ips[nr_orig], sample_callchain->callchain->ips,
+ nr_deferred * sizeof(u64));
+
+ sample_orig->callchain = callchain;
+ return 0;
+}
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 86ed9e4d04f9..89785125ed25 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -317,4 +317,7 @@ int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel,
struct perf_sample *sample, int max_stack,
bool symbols, callchain_iter_fn cb, void *data);
+int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
+ struct perf_sample *sample_callchain);
+
#endif /* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index c1a04141aed0..d23a3f8e8649 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -82,6 +82,7 @@ void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus,
evlist->ctl_fd.ack = -1;
evlist->ctl_fd.pos = -1;
evlist->nr_br_cntr = -1;
+ INIT_LIST_HEAD(&evlist->deferred_samples);
}
struct evlist *evlist__new(void)
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index edcbf1c10e92..a8cb5a29d55e 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -84,6 +84,7 @@ struct evlist {
int pos; /* index at evlist core object to check signals */
} ctl_fd;
struct event_enable_timer *eet;
+ struct list_head deferred_samples;
};
struct evsel_str_handler {
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 30fb1d281be8..51f17bf42dd9 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1277,6 +1277,56 @@ static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool
per_thread);
}
+struct deferred_event {
+ struct list_head list;
+ union perf_event *event;
+};
+
+static int evlist__deliver_deferred_samples(struct evlist *evlist,
+ const struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct deferred_event *de, *tmp;
+ struct evsel *evsel;
+ int ret = 0;
+
+ if (!tool->merge_deferred_callchains) {
+ evsel = evlist__id2evsel(evlist, sample->id);
+ return tool->callchain_deferred(tool, event, sample,
+ evsel, machine);
+ }
+
+ list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
+ struct perf_sample orig_sample;
+
+ ret = evlist__parse_sample(evlist, de->event, &orig_sample);
+ if (ret < 0) {
+ pr_err("failed to parse original sample\n");
+ break;
+ }
+
+ if (sample->tid != orig_sample.tid)
+ continue;
+
+ evsel = evlist__id2evsel(evlist, orig_sample.id);
+ sample__merge_deferred_callchain(&orig_sample, sample);
+ ret = evlist__deliver_sample(evlist, tool, de->event,
+ &orig_sample, evsel, machine);
+
+ if (orig_sample.deferred_callchain)
+ free(orig_sample.callchain);
+
+ list_del(&de->list);
+ free(de);
+
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
static int machines__deliver_event(struct machines *machines,
struct evlist *evlist,
union perf_event *event,
@@ -1305,6 +1355,16 @@ static int machines__deliver_event(struct machines *machines,
return 0;
}
dump_sample(evsel, event, sample, perf_env__arch(machine->env));
+ if (sample->deferred_callchain && tool->merge_deferred_callchains) {
+ struct deferred_event *de = malloc(sizeof(*de));
+
+ if (de == NULL)
+ return -ENOMEM;
+
+ de->event = event;
+ list_add_tail(&de->list, &evlist->deferred_samples);
+ return 0;
+ }
return evlist__deliver_sample(evlist, tool, event, sample, evsel, machine);
case PERF_RECORD_MMAP:
return tool->mmap(tool, event, sample, machine);
@@ -1364,7 +1424,8 @@ static int machines__deliver_event(struct machines *machines,
return tool->aux_output_hw_id(tool, event, sample, machine);
case PERF_RECORD_CALLCHAIN_DEFERRED:
dump_deferred_callchain(evsel, event, sample);
- return tool->callchain_deferred(tool, event, sample, evsel, machine);
+ return evlist__deliver_deferred_samples(evlist, tool, event,
+ sample, machine);
default:
++evlist->stats.nr_unknown_events;
return -1;
diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c
index e78f16de912e..385043e06627 100644
--- a/tools/perf/util/tool.c
+++ b/tools/perf/util/tool.c
@@ -238,6 +238,7 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
tool->cgroup_events = false;
tool->no_warn = false;
tool->show_feat_hdr = SHOW_FEAT_NO_HEADER;
+ tool->merge_deferred_callchains = true;
tool->sample = process_event_sample_stub;
tool->mmap = process_event_stub;
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 9987bbde6d5e..d06580478ab1 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -87,6 +87,7 @@ struct perf_tool {
bool cgroup_events;
bool no_warn;
bool dont_split_sample_group;
+ bool merge_deferred_callchains;
enum show_feature_header show_feat_hdr;
};
--
2.47.2
* Re: [PATCH v7 18/18] perf tools: Merge deferred user callchains
2025-04-30 19:58 ` [PATCH v7 18/18] perf tools: Merge deferred user callchains Steven Rostedt
@ 2025-04-30 20:07 ` Ian Rogers
0 siblings, 0 replies; 20+ messages in thread
From: Ian Rogers @ 2025-04-30 20:07 UTC (permalink / raw)
To: Steven Rostedt
Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
Mathieu Desnoyers, Andrew Morton, Josh Poimboeuf, x86,
Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
Indu Bhagat, Alexander Shishkin, Jiri Olsa, Namhyung Kim,
Adrian Hunter, linux-perf-users, Mark Brown, linux-toolchains,
Jordan Rome, Sam James, Andrii Nakryiko, Jens Remus,
Florian Weimer, Andy Lutomirski, Weinan Liu, Blake Jones,
Beau Belgrave, Jose E. Marchesi, Alexander Aring
On Wed, Apr 30, 2025 at 1:01 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> From: Namhyung Kim <namhyung@kernel.org>
>
> Save samples with deferred callchains in a separate list and deliver
> them after merging the user callchains. If users don't want to merge
> they can set tool->merge_deferred_callchains to false to prevent the
> behavior.
>
> With the previous result, perf script will now show the merged callchains.
>
> $ perf script
> perf 801 [000] 18.031793: 1 cycles:P:
> ffffffff91a14c36 __intel_pmu_enable_all.isra.0+0x56 ([kernel.kallsyms])
> ffffffff91d373e9 perf_ctx_enable+0x39 ([kernel.kallsyms])
> ffffffff91d36af7 event_function+0xd7 ([kernel.kallsyms])
> ffffffff91d34222 remote_function+0x42 ([kernel.kallsyms])
> ffffffff91c1ebe1 generic_exec_single+0x61 ([kernel.kallsyms])
> ffffffff91c1edac smp_call_function_single+0xec ([kernel.kallsyms])
> ffffffff91d37a9d event_function_call+0x10d ([kernel.kallsyms])
> ffffffff91d33557 perf_event_for_each_child+0x37 ([kernel.kallsyms])
> ffffffff91d47324 _perf_ioctl+0x204 ([kernel.kallsyms])
> ffffffff91d47c43 perf_ioctl+0x33 ([kernel.kallsyms])
> ffffffff91e2f216 __x64_sys_ioctl+0x96 ([kernel.kallsyms])
> ffffffff9265f1ae do_syscall_64+0x9e ([kernel.kallsyms])
> ffffffff92800130 entry_SYSCALL_64+0xb0 ([kernel.kallsyms])
> 7fb5fc22034b __GI___ioctl+0x3b (/usr/lib/x86_64-linux-gnu/libc.so.6)
> ...
>
> The old output can be obtained using the --no-merge-callchains option.
> Also, perf report now shows the user callchain entry at the end.
>
> $ perf report --no-children --percent-limit=0 --stdio -q -S __intel_pmu_enable_all.isra.0
> # symbol: __intel_pmu_enable_all.isra.0
> 0.00% perf [kernel.kallsyms]
> |
> ---__intel_pmu_enable_all.isra.0
> perf_ctx_enable
> event_function
> remote_function
> generic_exec_single
> smp_call_function_single
> event_function_call
> perf_event_for_each_child
> _perf_ioctl
> perf_ioctl
> __x64_sys_ioctl
> do_syscall_64
> entry_SYSCALL_64
> __GI___ioctl
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> tools/perf/Documentation/perf-script.txt | 5 ++
> tools/perf/builtin-script.c | 5 +-
> tools/perf/util/callchain.c | 24 +++++++++
> tools/perf/util/callchain.h | 3 ++
> tools/perf/util/evlist.c | 1 +
> tools/perf/util/evlist.h | 1 +
> tools/perf/util/session.c | 63 +++++++++++++++++++++++-
> tools/perf/util/tool.c | 1 +
> tools/perf/util/tool.h | 1 +
> 9 files changed, 102 insertions(+), 2 deletions(-)
>
> diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
> index 28bec7e78bc8..03d112960632 100644
> --- a/tools/perf/Documentation/perf-script.txt
> +++ b/tools/perf/Documentation/perf-script.txt
> @@ -527,6 +527,11 @@ include::itrace.txt[]
> The known limitations include exception handing such as
> setjmp/longjmp will have calls/returns not match.
>
> +--merge-callchains::
> + Enable merging deferred user callchains if available. This is the
> + default behavior. If you want to see separate CALLCHAIN_DEFERRED
> + records for some reason, use --no-merge-callchains explicitly.
> +
> :GMEXAMPLECMD: script
> :GMEXAMPLESUBCMD:
> include::guest-files.txt[]
> diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
> index 176b8f299afc..dd17c11af0c8 100644
> --- a/tools/perf/builtin-script.c
> +++ b/tools/perf/builtin-script.c
> @@ -3775,6 +3775,7 @@ int cmd_script(int argc, const char **argv)
> bool header_only = false;
> bool script_started = false;
> bool unsorted_dump = false;
> + bool merge_deferred_callchains = true;
> char *rec_script_path = NULL;
> char *rep_script_path = NULL;
> struct perf_session *session;
> @@ -3928,6 +3929,8 @@ int cmd_script(int argc, const char **argv)
> "Guest code can be found in hypervisor process"),
> OPT_BOOLEAN('\0', "stitch-lbr", &script.stitch_lbr,
> "Enable LBR callgraph stitching approach"),
> + OPT_BOOLEAN('\0', "merge-callchains", &merge_deferred_callchains,
> + "Enable merge deferred user callchains"),
> OPTS_EVSWITCH(&script.evswitch),
> OPT_END()
> };
> @@ -4183,7 +4186,7 @@ int cmd_script(int argc, const char **argv)
> script.tool.throttle = process_throttle_event;
> script.tool.unthrottle = process_throttle_event;
> script.tool.ordering_requires_timestamps = true;
> - script.tool.merge_deferred_callchains = false;
> + script.tool.merge_deferred_callchains = merge_deferred_callchains;
> session = perf_session__new(&data, &script.tool);
> if (IS_ERR(session))
> return PTR_ERR(session);
> diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
> index d7b7eef740b9..6d423d92861b 100644
> --- a/tools/perf/util/callchain.c
> +++ b/tools/perf/util/callchain.c
> @@ -1828,3 +1828,27 @@ int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel,
> }
> return 0;
> }
> +
> +int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
> + struct perf_sample *sample_callchain)
> +{
> + u64 nr_orig = sample_orig->callchain->nr - 1;
> + u64 nr_deferred = sample_callchain->callchain->nr;
> + struct ip_callchain *callchain;
> +
> + callchain = calloc(1 + nr_orig + nr_deferred, sizeof(u64));
> + if (callchain == NULL) {
> + sample_orig->deferred_callchain = false;
> + return -ENOMEM;
> + }
> +
> + callchain->nr = nr_orig + nr_deferred;
> + /* copy except for the last PERF_CONTEXT_USER_DEFERRED */
> + memcpy(callchain->ips, sample_orig->callchain->ips, nr_orig * sizeof(u64));
> + /* copy deferred user callchains */
> + memcpy(&callchain->ips[nr_orig], sample_callchain->callchain->ips,
> + nr_deferred * sizeof(u64));
> +
> + sample_orig->callchain = callchain;
> + return 0;
> +}
> diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
> index 86ed9e4d04f9..89785125ed25 100644
> --- a/tools/perf/util/callchain.h
> +++ b/tools/perf/util/callchain.h
> @@ -317,4 +317,7 @@ int sample__for_each_callchain_node(struct thread *thread, struct evsel *evsel,
> struct perf_sample *sample, int max_stack,
> bool symbols, callchain_iter_fn cb, void *data);
>
> +int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
> + struct perf_sample *sample_callchain);
> +
> #endif /* __PERF_CALLCHAIN_H */
> diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
> index c1a04141aed0..d23a3f8e8649 100644
> --- a/tools/perf/util/evlist.c
> +++ b/tools/perf/util/evlist.c
> @@ -82,6 +82,7 @@ void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus,
> evlist->ctl_fd.ack = -1;
> evlist->ctl_fd.pos = -1;
> evlist->nr_br_cntr = -1;
> + INIT_LIST_HEAD(&evlist->deferred_samples);
> }
>
> struct evlist *evlist__new(void)
> diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
> index edcbf1c10e92..a8cb5a29d55e 100644
> --- a/tools/perf/util/evlist.h
> +++ b/tools/perf/util/evlist.h
> @@ -84,6 +84,7 @@ struct evlist {
> int pos; /* index at evlist core object to check signals */
> } ctl_fd;
> struct event_enable_timer *eet;
> + struct list_head deferred_samples;
> };
>
> struct evsel_str_handler {
> diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
> index 30fb1d281be8..51f17bf42dd9 100644
> --- a/tools/perf/util/session.c
> +++ b/tools/perf/util/session.c
> @@ -1277,6 +1277,56 @@ static int evlist__deliver_sample(struct evlist *evlist, const struct perf_tool
> per_thread);
> }
>
> +struct deferred_event {
> + struct list_head list;
> + union perf_event *event;
> +};
> +
> +static int evlist__deliver_deferred_samples(struct evlist *evlist,
> + const struct perf_tool *tool,
> + union perf_event *event,
> + struct perf_sample *sample,
> + struct machine *machine)
> +{
> + struct deferred_event *de, *tmp;
> + struct evsel *evsel;
> + int ret = 0;
> +
> + if (!tool->merge_deferred_callchains) {
> + evsel = evlist__id2evsel(evlist, sample->id);
> + return tool->callchain_deferred(tool, event, sample,
> + evsel, machine);
> + }
> +
> + list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
> + struct perf_sample orig_sample;
> +
> + ret = evlist__parse_sample(evlist, de->event, &orig_sample);
> + if (ret < 0) {
> + pr_err("failed to parse original sample\n");
> + break;
> + }
> +
> + if (sample->tid != orig_sample.tid)
> + continue;
> +
> + evsel = evlist__id2evsel(evlist, orig_sample.id);
> + sample__merge_deferred_callchain(&orig_sample, sample);
> + ret = evlist__deliver_sample(evlist, tool, de->event,
> + &orig_sample, evsel, machine);
> +
> + if (orig_sample.deferred_callchain)
> + free(orig_sample.callchain);
> +
> + list_del(&de->list);
> + free(de);
> +
> + if (ret)
> + break;
> + }
> + return ret;
> +}
> +
> static int machines__deliver_event(struct machines *machines,
> struct evlist *evlist,
> union perf_event *event,
> @@ -1305,6 +1355,16 @@ static int machines__deliver_event(struct machines *machines,
> return 0;
> }
> dump_sample(evsel, event, sample, perf_env__arch(machine->env));
> + if (sample->deferred_callchain && tool->merge_deferred_callchains) {
> + struct deferred_event *de = malloc(sizeof(*de));
> +
> + if (de == NULL)
> + return -ENOMEM;
> +
> + de->event = event;
> + list_add_tail(&de->list, &evlist->deferred_samples);
> + return 0;
> + }
> return evlist__deliver_sample(evlist, tool, event, sample, evsel, machine);
> case PERF_RECORD_MMAP:
> return tool->mmap(tool, event, sample, machine);
> @@ -1364,7 +1424,8 @@ static int machines__deliver_event(struct machines *machines,
> return tool->aux_output_hw_id(tool, event, sample, machine);
> case PERF_RECORD_CALLCHAIN_DEFERRED:
> dump_deferred_callchain(evsel, event, sample);
> - return tool->callchain_deferred(tool, event, sample, evsel, machine);
> + return evlist__deliver_deferred_samples(evlist, tool, event,
> + sample, machine);
> default:
> ++evlist->stats.nr_unknown_events;
> return -1;
> diff --git a/tools/perf/util/tool.c b/tools/perf/util/tool.c
> index e78f16de912e..385043e06627 100644
> --- a/tools/perf/util/tool.c
> +++ b/tools/perf/util/tool.c
> @@ -238,6 +238,7 @@ void perf_tool__init(struct perf_tool *tool, bool ordered_events)
> tool->cgroup_events = false;
> tool->no_warn = false;
> tool->show_feat_hdr = SHOW_FEAT_NO_HEADER;
> + tool->merge_deferred_callchains = true;
I think this should be false as otherwise we're going to duplicate
callchains in the same way leader sampling duplicated the sibling
event samples and needed fixing in:
https://lore.kernel.org/r/20240729220620.2957754-1-irogers@google.com
Thanks,
Ian
>
> tool->sample = process_event_sample_stub;
> tool->mmap = process_event_stub;
> diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
> index 9987bbde6d5e..d06580478ab1 100644
> --- a/tools/perf/util/tool.h
> +++ b/tools/perf/util/tool.h
> @@ -87,6 +87,7 @@ struct perf_tool {
> bool cgroup_events;
> bool no_warn;
> bool dont_split_sample_group;
> + bool merge_deferred_callchains;
> enum show_feature_header show_feat_hdr;
> };
>
> --
> 2.47.2
>
>