public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Steven Rostedt <rostedt@kernel.org>, jremus@linux.ibm.com
Cc: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
	bpf@vger.kernel.org, x86@kernel.org,
	Masami Hiramatsu <mhiramat@kernel.org>,
	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	Josh Poimboeuf <jpoimboe@kernel.org>,
	Ingo Molnar <mingo@kernel.org>, Jiri Olsa <jolsa@kernel.org>,
	Arnaldo Carvalho de Melo <acme@kernel.org>,
	Namhyung Kim <namhyung@kernel.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Andrii Nakryiko <andrii@kernel.org>,
	Indu Bhagat <indu.bhagat@oracle.com>,
	"Jose E. Marchesi" <jemarch@gnu.org>,
	Beau Belgrave <beaub@linux.microsoft.com>,
	Jens Remus <jremus@linux.ibm.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Florian Weimer <fweimer@redhat.com>, Sam James <sam@gentoo.org>,
	Kees Cook <kees@kernel.org>,
	Carlos O'Donell <codonell@redhat.com>
Subject: Re: [PATCH v16 0/4] perf: Support the deferred unwinding infrastructure
Date: Fri, 24 Oct 2025 12:41:19 +0200	[thread overview]
Message-ID: <20251024104119.GJ4068168@noisy.programming.kicks-ass.net> (raw)
In-Reply-To: <20251024092926.GI4068168@noisy.programming.kicks-ass.net>

On Fri, Oct 24, 2025 at 11:29:26AM +0200, Peter Zijlstra wrote:
> On Thu, Oct 23, 2025 at 05:00:02PM +0200, Peter Zijlstra wrote:
> 
> > Trouble is, pretty much every unwind is 510 entries long -- this cannot
> > be right. I'm sure there's a silly mistake in unwind/user.c but I'm too
> > tired to find it just now. I'll try again tomorrow.
> 
> PEBKAC

Anyway, while staring at this, I noted that the perf userspace unwind
code has a few bits that are missing from the new shiny thing.

How about something like so? This add an optional arch specific unwinder
at the very highest priority (bit 0) and uses that to do a few extra
bits before disabling itself and falling back to whatever lower prio
unwinder to do the actual unwinding.

---
 arch/x86/events/core.c             |   40 ---------------------------
 arch/x86/include/asm/unwind_user.h |    4 ++
 arch/x86/include/asm/uprobes.h     |    9 ++++++
 arch/x86/kernel/unwind_user.c      |   53 +++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/uprobes.c          |   32 ++++++++++++++++++++++
 include/linux/unwind_user_types.h  |    5 ++-
 kernel/unwind/user.c               |    7 ++++
 7 files changed, 109 insertions(+), 41 deletions(-)

--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2845,46 +2845,6 @@ static unsigned long get_segment_base(un
 	return get_desc_base(desc);
 }
 
-#ifdef CONFIG_UPROBES
-/*
- * Heuristic-based check if uprobe is installed at the function entry.
- *
- * Under assumption of user code being compiled with frame pointers,
- * `push %rbp/%ebp` is a good indicator that we indeed are.
- *
- * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
- * If we get this wrong, captured stack trace might have one extra bogus
- * entry, but the rest of stack trace will still be meaningful.
- */
-static bool is_uprobe_at_func_entry(struct pt_regs *regs)
-{
-	struct arch_uprobe *auprobe;
-
-	if (!current->utask)
-		return false;
-
-	auprobe = current->utask->auprobe;
-	if (!auprobe)
-		return false;
-
-	/* push %rbp/%ebp */
-	if (auprobe->insn[0] == 0x55)
-		return true;
-
-	/* endbr64 (64-bit only) */
-	if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
-		return true;
-
-	return false;
-}
-
-#else
-static bool is_uprobe_at_func_entry(struct pt_regs *regs)
-{
-	return false;
-}
-#endif /* CONFIG_UPROBES */
-
 #ifdef CONFIG_IA32_EMULATION
 
 #include <linux/compat.h>
--- a/arch/x86/include/asm/unwind_user.h
+++ b/arch/x86/include/asm/unwind_user.h
@@ -8,4 +8,8 @@
 	.fp_off		= -2*(ws),			\
 	.use_fp		= true,
 
+#define HAVE_UNWIND_USER_ARCH 1
+
+extern int unwind_user_next_arch(struct unwind_user_state *state);
+
 #endif /* _ASM_X86_UNWIND_USER_H */
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -62,4 +62,13 @@ struct arch_uprobe_task {
 	unsigned int			saved_tf;
 };
 
+#ifdef CONFIG_UPROBES
+extern bool is_uprobe_at_func_entry(struct pt_regs *regs);
+#else
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+	return false;
+}
+#endif /* CONFIG_UPROBES */
+
 #endif	/* _ASM_UPROBES_H */
--- /dev/null
+++ b/arch/x86/kernel/unwind_user.c
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/unwind_user.h>
+#include <linux/uprobes.h>
+#include <linux/uaccess.h>
+#include <linux/sched/task_stack.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+
+int unwind_user_next_arch(struct unwind_user_state *state)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+
+	/* only once, on the first iteration */
+	state->available_types &= ~UNWIND_USER_TYPE_ARCH;
+
+	/* We don't know how to unwind VM86 stacks. */
+	if (regs->flags & X86_VM_MASK) {
+		state->done = true;
+		return 0;
+	}
+
+	/*
+	 * If we are called from uprobe handler, and we are indeed at the very
+	 * entry to user function (which is normally a `push %rbp` instruction,
+	 * under assumption of application being compiled with frame pointers),
+	 * we should read return address from *regs->sp before proceeding
+	 * to follow frame pointers, otherwise we'll skip immediate caller
+	 * as %rbp is not yet setup.
+	 */
+	if (!is_uprobe_at_func_entry(regs))
+		return -EINVAL;
+
+#ifdef CONFIG_COMPAT
+	if (state->ws == sizeof(int)) {
+		unsigned int retaddr;
+		int ret = get_user(retaddr, (unsigned int __user *)regs->sp);
+		if (ret)
+			return ret;
+
+		state->ip = retaddr;
+		return 0;
+	}
+#endif
+	unsigned long retaddr;
+	int ret = get_user(retaddr, (unsigned long __user *)regs->sp);
+	if (ret)
+		return ret;
+
+	state->ip = retaddr;
+	return 0;
+}
+
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1791,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct retu
 	else
 		return regs->sp <= ret->stack;
 }
+
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under assumption of user code being compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+	struct arch_uprobe *auprobe;
+
+	if (!current->utask)
+		return false;
+
+	auprobe = current->utask->auprobe;
+	if (!auprobe)
+		return false;
+
+	/* push %rbp/%ebp */
+	if (auprobe->insn[0] == 0x55)
+		return true;
+
+	/* endbr64 (64-bit only) */
+	if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn))
+		return true;
+
+	return false;
+}
--- a/include/linux/unwind_user_types.h
+++ b/include/linux/unwind_user_types.h
@@ -3,13 +3,15 @@
 #define _LINUX_UNWIND_USER_TYPES_H
 
 #include <linux/types.h>
+#include <linux/bits.h>
 
 /*
  * Unwind types, listed in priority order: lower numbers are attempted first if
  * available.
  */
 enum unwind_user_type_bits {
-	UNWIND_USER_TYPE_FP_BIT =		0,
+	UNWIND_USER_TYPE_ARCH_BIT = 0,
+	UNWIND_USER_TYPE_FP_BIT,
 
 	NR_UNWIND_USER_TYPE_BITS,
 };
@@ -17,6 +19,7 @@ enum unwind_user_type_bits {
 enum unwind_user_type {
 	/* Type "none" for the start of stack walk iteration. */
 	UNWIND_USER_TYPE_NONE =			0,
+	UNWIND_USER_TYPE_ARCH =			BIT(UNWIND_USER_TYPE_ARCH_BIT),
 	UNWIND_USER_TYPE_FP =			BIT(UNWIND_USER_TYPE_FP_BIT),
 };
 
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -79,6 +79,10 @@ static int unwind_user_next(struct unwin
 
 		state->current_type = type;
 		switch (type) {
+		case UNWIND_USER_TYPE_ARCH:
+			if (!unwind_user_next_arch(state))
+				return 0;
+			continue;
 		case UNWIND_USER_TYPE_FP:
 			if (!unwind_user_next_fp(state))
 				return 0;
@@ -107,6 +111,9 @@ static int unwind_user_start(struct unwi
 		return -EINVAL;
 	}
 
+	if (HAVE_UNWIND_USER_ARCH)
+		state->available_types |= UNWIND_USER_TYPE_ARCH;
+
 	if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
 		state->available_types |= UNWIND_USER_TYPE_FP;
 

  reply	other threads:[~2025-10-24 10:41 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-07 21:40 [PATCH v16 0/4] perf: Support the deferred unwinding infrastructure Steven Rostedt
2025-10-07 21:40 ` [PATCH v16 1/4] unwind: Add interface to allow tracing a single task Steven Rostedt
2025-10-07 21:40 ` [PATCH v16 2/4] perf: Support deferred user callchains Steven Rostedt
2025-10-07 21:40 ` [PATCH v16 3/4] perf: Have the deferred request record the user context cookie Steven Rostedt
2025-10-07 21:40 ` [PATCH v16 4/4] perf: Support deferred user callchains for per CPU events Steven Rostedt
2025-10-23 15:00 ` [PATCH v16 0/4] perf: Support the deferred unwinding infrastructure Peter Zijlstra
2025-10-23 16:40   ` Steven Rostedt
2025-10-24  8:26     ` Peter Zijlstra
2025-10-24 12:58       ` Peter Zijlstra
2025-10-26  0:58         ` Namhyung Kim
2025-10-24  9:29   ` Peter Zijlstra
2025-10-24 10:41     ` Peter Zijlstra [this message]
2025-10-24 13:58       ` Jens Remus
2025-10-24 14:08         ` Peter Zijlstra
2025-10-24 14:51           ` Peter Zijlstra
2025-10-24 14:54             ` Peter Zijlstra
2025-10-24 14:57               ` Peter Zijlstra
2025-10-24 15:10                 ` Peter Zijlstra
2025-10-24 15:09             ` Jens Remus
2025-10-24 15:11               ` Peter Zijlstra
2025-10-29  9:36             ` [tip: perf/core] unwind_user/x86: Teach FP unwind about start of function tip-bot2 for Peter Zijlstra
2025-11-04 11:22             ` [PATCH v16 0/4] perf: Support the deferred unwinding infrastructure Florian Weimer
2025-11-05 13:08               ` Peter Zijlstra
2025-10-29  9:36   ` [tip: perf/core] perf: Support deferred user unwind tip-bot2 for Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251024104119.GJ4068168@noisy.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=acme@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=andrii@kernel.org \
    --cc=beaub@linux.microsoft.com \
    --cc=bpf@vger.kernel.org \
    --cc=codonell@redhat.com \
    --cc=fweimer@redhat.com \
    --cc=indu.bhagat@oracle.com \
    --cc=jemarch@gnu.org \
    --cc=jolsa@kernel.org \
    --cc=jpoimboe@kernel.org \
    --cc=jremus@linux.ibm.com \
    --cc=kees@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mhiramat@kernel.org \
    --cc=mingo@kernel.org \
    --cc=namhyung@kernel.org \
    --cc=rostedt@kernel.org \
    --cc=sam@gentoo.org \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox