Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCHv4 08/13] selftests/bpf: Emit nop,nop10 instructions combo for x86_64 arch
From: Jiri Olsa @ 2026-05-26 20:58 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: Jakub Sitnicki, bpf, linux-trace-kernel
In-Reply-To: <20260526205840.173790-1-jolsa@kernel.org>

Syncing latest usdt.h change [1].

Now that we have nop10 optimization support in kernel, let's emit
nop,nop10 for usdt probe. We leave it up to the library to use
the desired nop instruction.

[1] TBD
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/testing/selftests/bpf/usdt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h
index c71e21df38b3..75687f50f4e2 100644
--- a/tools/testing/selftests/bpf/usdt.h
+++ b/tools/testing/selftests/bpf/usdt.h
@@ -313,7 +313,7 @@ struct usdt_sema { volatile unsigned short active; };
 #if defined(__ia64__) || defined(__s390__) || defined(__s390x__)
 #define USDT_NOP			nop 0
 #elif defined(__x86_64__)
-#define USDT_NOP                       .byte 0x90, 0x0f, 0x1f, 0x44, 0x00, 0x0 /* nop, nop5 */
+#define USDT_NOP                       .byte 0x90, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 /* nop, nop10 */
 #else
 #define USDT_NOP			nop
 #endif
-- 
2.54.0


^ permalink raw reply related

* [PATCHv4 07/13] libbpf: Detect uprobe syscall with new error
From: Jiri Olsa @ 2026-05-26 20:58 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260526205840.173790-1-jolsa@kernel.org>

In the previous optimized uprobe fix we changed the syscall
error used for its detection from ENXIO to EPROTO.

Changing related probe_uprobe_syscall detection check.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Fixes: 05738da0efa1 ("libbpf: Add uprobe syscall feature detection")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/lib/bpf/features.c                                | 4 ++--
 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c
index b7e388f99d0b..e5641fa60163 100644
--- a/tools/lib/bpf/features.c
+++ b/tools/lib/bpf/features.c
@@ -577,10 +577,10 @@ static int probe_ldimm64_full_range_off(int token_fd)
 static int probe_uprobe_syscall(int token_fd)
 {
 	/*
-	 * If kernel supports uprobe() syscall, it will return -ENXIO when called
+	 * If kernel supports uprobe() syscall, it will return -EPROTO when called
 	 * from the outside of a kernel-generated uprobe trampoline.
 	 */
-	return syscall(__NR_uprobe) < 0 && errno == ENXIO;
+	return syscall(__NR_uprobe) < 0 && errno == EPROTO;
 }
 #else
 static int probe_uprobe_syscall(int token_fd)
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 955a37751b52..c944136252c6 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -762,7 +762,7 @@ static void test_uprobe_error(void)
 	long err = syscall(__NR_uprobe);
 
 	ASSERT_EQ(err, -1, "error");
-	ASSERT_EQ(errno, ENXIO, "errno");
+	ASSERT_EQ(errno, EPROTO, "errno");
 }
 
 static void __test_uprobe_syscall(void)
-- 
2.54.0


^ permalink raw reply related

* [PATCHv4 06/13] libbpf: Change has_nop_combo to work on top of nop10
From: Jiri Olsa @ 2026-05-26 20:58 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: Jakub Sitnicki, bpf, linux-trace-kernel
In-Reply-To: <20260526205840.173790-1-jolsa@kernel.org>

We now expect nop combo with 10 bytes nop instead of 5 bytes nop,
fixing has_nop_combo to reflect that.

Fixes: 41a5c7df4466 ("libbpf: Add support to detect nop,nop5 instructions combo for usdt probe")
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/lib/bpf/usdt.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
index e3710933fd52..484a4354e82b 100644
--- a/tools/lib/bpf/usdt.c
+++ b/tools/lib/bpf/usdt.c
@@ -305,7 +305,7 @@ struct usdt_manager *usdt_manager_new(struct bpf_object *obj)
 
 	/*
 	 * Detect kernel support for uprobe() syscall, it's presence means we can
-	 * take advantage of faster nop5 uprobe handling.
+	 * take advantage of faster nop10 uprobe handling.
 	 * Added in: 56101b69c919 ("uprobes/x86: Add uprobe syscall to speed up uprobe")
 	 */
 	man->has_uprobe_syscall = kernel_supports(obj, FEAT_UPROBE_SYSCALL);
@@ -596,14 +596,14 @@ static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note,
 #if defined(__x86_64__)
 static bool has_nop_combo(int fd, long off)
 {
-	unsigned char nop_combo[6] = {
-		0x90, 0x0f, 0x1f, 0x44, 0x00, 0x00 /* nop,nop5 */
+	unsigned char nop_combo[11] = {
+		0x90, 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00,
 	};
-	unsigned char buf[6];
+	unsigned char buf[11];
 
-	if (pread(fd, buf, 6, off) != 6)
+	if (pread(fd, buf, 11, off) != 11)
 		return false;
-	return memcmp(buf, nop_combo, 6) == 0;
+	return memcmp(buf, nop_combo, 11) == 0;
 }
 #else
 static bool has_nop_combo(int fd, long off)
@@ -814,8 +814,8 @@ static int collect_usdt_targets(struct usdt_manager *man, struct elf_fd *elf_fd,
 		memset(target, 0, sizeof(*target));
 
 		/*
-		 * We have uprobe syscall and usdt with nop,nop5 instructions combo,
-		 * so we can place the uprobe directly on nop5 (+1) and get this probe
+		 * We have uprobe syscall and usdt with nop,nop10 instructions combo,
+		 * so we can place the uprobe directly on nop10 (+1) and get this probe
 		 * optimized.
 		 */
 		if (man->has_uprobe_syscall && has_nop_combo(elf_fd->fd, usdt_rel_ip)) {
-- 
2.54.0


^ permalink raw reply related

* [PATCHv4 05/13] uprobes/x86: Move optimized uprobe from nop5 to nop10
From: Jiri Olsa @ 2026-05-26 20:58 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260526205840.173790-1-jolsa@kernel.org>

Andrii reported an issue with optimized uprobes [1] that can clobber
redzone area with call instruction storing return address on stack
where user code may keep temporary data without adjusting rsp.

Fixing this by moving the optimized uprobes on top of 10-bytes nop
instruction, so we can squeeze another instruction to escape the
redzone area before doing the call, like:

  lea -0x80(%rsp), %rsp
  call tramp

Note the lea instruction is used to adjust the rsp register without
changing the flags.

We use nop10 and the following transformation to the optimized
instructions above and back, as suggested by Peterz [2].

Optimize path (int3_update_optimize):

  1) Initial state after set_swbp() installed the uprobe:
      cc 2e 0f 1f 84 00 00 00 00 00

     From offset 0 this is INT3 followed by the tail of the original
     10-byte NOP.

     After a previous unoptimization bytes 5..9 may still contain the
     old call instruction, which remains valid for threads already there.

  2) Rewrite the LEA tail and call displacement:
      cc [8d 64 24 80 e8 d0 d1 d2 d3]

     From offset 0 this traps on the uprobe INT3.  Bytes 1..9 are not
     executable entry points while byte 0 is trapped.

  3) Publish the first LEA byte:
      [48] 8d 64 24 80 e8 d0 d1 d2 d3

     From offset 0 this is:
        lea -0x80(%rsp), %rsp
        call <uprobe-trampoline>

Unoptimize path (int3_update_unoptimize):

  1) Initial optimized state:
      48 8d 64 24 80 e8 d0 d1 d2 d3
     Same as 3) above.

  2) Trap new entries before restoring the NOP bytes:
      [cc] 8d 64 24 80 e8 d0 d1 d2 d3

     From offset 0 this traps. A thread that had already executed the
     LEA can still reach the intact CALL at offset 5.

  3) Restore bytes 1..4 of the original NOP while keeping byte 0 trapped
     and byte 5 as CALL.
      cc [2e 0f 1f 84] e8 d0 d1 d2 d3

     From offset 0 this still traps. Offset 5 is still the CALL for any
     thread that was already past the first LEA byte.

  4) Publish the first byte of the original NOP:
      [66] 2e 0f 1f 84 e8 d0 d1 d2 d3

     From offset 0 this is the restored 10-byte NOP; the CALL opcode and
     displacement are now only NOP operands.  Offset 5 still decodes as
     CALL for a thread that was already there.

     There is only a single target uprobe-trampoline for the given nop10
     instruction address, so the CALL instruction will not be changed across
     unoptimization/optimization cycles.
     Therefore, any task that is preempted at the CALL instruction is guaranteed
     to observe that CALL and not anything else.

Note that, as explained in [2], we need to use the following nop10:
       PF1   PF2   ESC   NOPL  MOD   SIB   DISP32
NOP10: 0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 -- cs nopw 0x00000000(%rax,%rax,1)

which means we need to allow 0x2e prefix which maps to INAT_PFX_CS
attribute in is_prefix_bad function.

Also changing the uprobe syscall error when called out of uprobe
trampoline to -EPROTO, so we are able to detect the fixed kernel.

The optimized uprobe performance stays the same:

        uprobe-nop     :    3.129 ± 0.013M/s
        uprobe-push    :    3.045 ± 0.006M/s
        uprobe-ret     :    1.095 ± 0.004M/s
  -->   uprobe-nop10   :    7.170 ± 0.020M/s
        uretprobe-nop  :    2.143 ± 0.021M/s
        uretprobe-push :    2.090 ± 0.000M/s
        uretprobe-ret  :    0.942 ± 0.000M/s
  -->   uretprobe-nop10:    3.381 ± 0.003M/s
        usdt-nop       :    3.245 ± 0.004M/s
  -->   usdt-nop10     :    7.256 ± 0.023M/s

[1] https://lore.kernel.org/bpf/20260509003146.976844-1-andrii@kernel.org/
[2] https://lore.kernel.org/bpf/20260518104306.GU3102624@noisy.programming.kicks-ass.net/#t
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Closes: https://lore.kernel.org/bpf/20260509003146.976844-1-andrii@kernel.org/
Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
Assisted-by: Codex:GPT-5.5
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/kernel/uprobes.c | 255 ++++++++++++++++++++++++++++----------
 1 file changed, 190 insertions(+), 65 deletions(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index af5af7d67999..de544516ea70 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -266,7 +266,6 @@ static bool is_prefix_bad(struct insn *insn)
 		attr = inat_get_opcode_attribute(p);
 		switch (attr) {
 		case INAT_MAKE_PREFIX(INAT_PFX_ES):
-		case INAT_MAKE_PREFIX(INAT_PFX_CS):
 		case INAT_MAKE_PREFIX(INAT_PFX_DS):
 		case INAT_MAKE_PREFIX(INAT_PFX_SS):
 		case INAT_MAKE_PREFIX(INAT_PFX_LOCK):
@@ -631,9 +630,29 @@ static struct vm_special_mapping tramp_mapping = {
 	.pages  = tramp_mapping_pages,
 };
 
+
+#define LEA_INSN_SIZE		5
+#define OPT_INSN_SIZE		(LEA_INSN_SIZE + CALL_INSN_SIZE)
+#define REDZONE_SIZE		0x80
+
+static const u8 lea_rsp[] = { 0x48, 0x8d, 0x64, 0x24, 0x80 };
+
+static bool is_opt_insns(const uprobe_opcode_t *insn)
+{
+	return !memcmp(insn, lea_rsp, LEA_INSN_SIZE) &&
+	       insn[LEA_INSN_SIZE] == CALL_INSN_OPCODE;
+}
+
+static bool is_swbp_opt_insns(uprobe_opcode_t *insn)
+{
+	return is_swbp_insn(&insn[0]) &&
+	       !memcmp(&insn[1], &lea_rsp[1], LEA_INSN_SIZE - 1) &&
+	       insn[LEA_INSN_SIZE] == CALL_INSN_OPCODE;
+}
+
 static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
 {
-	long delta = (long)(vaddr + 5 - vtramp);
+	long delta = (long)(vaddr + OPT_INSN_SIZE - vtramp);
 
 	return delta >= INT_MIN && delta <= INT_MAX;
 }
@@ -646,7 +665,7 @@ static unsigned long find_nearest_trampoline(unsigned long vaddr)
 	};
 	unsigned long low_limit, high_limit;
 	unsigned long low_tramp, high_tramp;
-	unsigned long call_end = vaddr + 5;
+	unsigned long call_end = vaddr + OPT_INSN_SIZE;
 
 	if (check_add_overflow(call_end, INT_MIN, &low_limit))
 		low_limit = PAGE_SIZE;
@@ -754,7 +773,7 @@ SYSCALL_DEFINE0(uprobe)
 
 	/* Allow execution only from uprobe trampolines. */
 	if (!in_uprobe_trampoline(regs->ip))
-		return -ENXIO;
+		return -EPROTO;
 
 	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
 	if (err)
@@ -770,8 +789,8 @@ SYSCALL_DEFINE0(uprobe)
 	regs->ax  = args.ax;
 	regs->r11 = args.r11;
 	regs->cx  = args.cx;
-	regs->ip  = args.retaddr - 5;
-	regs->sp += sizeof(args);
+	regs->ip  = args.retaddr - OPT_INSN_SIZE;
+	regs->sp += sizeof(args) + REDZONE_SIZE;
 	regs->orig_ax = -1;
 
 	sp = regs->sp;
@@ -788,12 +807,12 @@ SYSCALL_DEFINE0(uprobe)
 	 */
 	if (regs->sp != sp) {
 		/* skip the trampoline call */
-		if (args.retaddr - 5 == regs->ip)
-			regs->ip += 5;
+		if (args.retaddr - OPT_INSN_SIZE == regs->ip)
+			regs->ip += OPT_INSN_SIZE;
 		return regs->ax;
 	}
 
-	regs->sp -= sizeof(args);
+	regs->sp -= sizeof(args) + REDZONE_SIZE;
 
 	/* for the case uprobe_consumer has changed ax/r11/cx */
 	args.ax  = regs->ax;
@@ -801,7 +820,7 @@ SYSCALL_DEFINE0(uprobe)
 	args.cx  = regs->cx;
 
 	/* keep return address unless we are instructed otherwise */
-	if (args.retaddr - 5 != regs->ip)
+	if (args.retaddr - OPT_INSN_SIZE != regs->ip)
 		args.retaddr = regs->ip;
 
 	if (shstk_push(args.retaddr) == -EFAULT)
@@ -835,7 +854,7 @@ asm (
 	"pop %rax\n"
 	"pop %r11\n"
 	"pop %rcx\n"
-	"ret\n"
+	"ret $" __stringify(REDZONE_SIZE) "\n"
 	"int3\n"
 	".balign " __stringify(PAGE_SIZE) "\n"
 	".popsection\n"
@@ -853,7 +872,8 @@ late_initcall(arch_uprobes_init);
 
 enum {
 	EXPECT_SWBP,
-	EXPECT_CALL,
+	EXPECT_OPTIMIZED,
+	EXPECT_SWBP_OPTIMIZED,
 };
 
 struct write_opcode_ctx {
@@ -861,30 +881,29 @@ struct write_opcode_ctx {
 	int expect;
 };
 
-static int is_call_insn(uprobe_opcode_t *insn)
-{
-	return *insn == CALL_INSN_OPCODE;
-}
-
 /*
- * Verification callback used by int3_update uprobe_write calls to make sure
- * the underlying instruction is as expected - either int3 or call.
+ * Verification callback used by uprobe_write calls to make sure the underlying
+ * instruction is in the expected stage of the INT3 update sequence.
  */
 static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
 		       int nbytes, void *data)
 {
 	struct write_opcode_ctx *ctx = data;
-	uprobe_opcode_t old_opcode[5];
+	uprobe_opcode_t old_opcode[OPT_INSN_SIZE];
 
-	uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+	uprobe_copy_from_page(page, ctx->base, old_opcode, OPT_INSN_SIZE);
 
 	switch (ctx->expect) {
 	case EXPECT_SWBP:
 		if (is_swbp_insn(&old_opcode[0]))
 			return 1;
 		break;
-	case EXPECT_CALL:
-		if (is_call_insn(&old_opcode[0]))
+	case EXPECT_OPTIMIZED:
+		if (is_opt_insns(&old_opcode[0]))
+			return 1;
+		break;
+	case EXPECT_SWBP_OPTIMIZED:
+		if (is_swbp_opt_insns(&old_opcode[0]))
 			return 1;
 		break;
 	}
@@ -893,48 +912,112 @@ static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *
 }
 
 /*
- * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * Modify the optimized instruction by using INT3 breakpoints on SMP.
  * We completely avoid using stop_machine() here, and achieve the
  * synchronization using INT3 breakpoints and SMP cross-calls.
  * (borrowed comment from smp_text_poke_batch_finish)
  *
- * The way it is done:
- *   - Add an INT3 trap to the address that will be patched
- *   - SMP sync all CPUs
- *   - Update all but the first byte of the patched range
- *   - SMP sync all CPUs
- *   - Replace the first byte (INT3) by the first byte of the replacing opcode
- *   - SMP sync all CPUs
+ * For optimization (int3_update_optimize):
+ *   1) Start with the uprobe INT3 trap already installed
+ *   2) Update everything but the first byte
+ *   3) Replace the first INT3 by the first byte of the LEA instruction
+ *
+ * For unoptimization (int3_update_unoptimize):
+ *   1) Start with the optimized uprobe lea/call instructions
+ *   2) Add an INT3 trap to the address that will be patched
+ *   3) Restore the NOP bytes before the call opcode
+ *   4) Replace the first INT3 by the first byte of the NOP instruction
+ *
+ * Note that unoptimization deliberately keeps the call opcode and displacement
+ * in bytes 5..9. Those bytes become operands of the restored 10-byte NOP.
+ *
+ * Since there is only a single target uprobe-trampoline for the given nop10
+ * instruction address, the CALL instruction will not be changed across
+ * unoptimization/optimization cycles.
+ * Therefore, any task that is preempted at the CALL instruction is guaranteed
+ * to observe that CALL and not anything else.
  */
-static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
-		       unsigned long vaddr, char *insn, bool optimize)
+static int int3_update_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+				unsigned long vaddr, uprobe_opcode_t *insn)
 {
-	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
 	struct write_opcode_ctx ctx = {
 		.base = vaddr,
 	};
 	int err;
 
 	/*
-	 * Write int3 trap.
+	 * 1) Initial state after set_swbp() installed the uprobe:
+	 *    cc 2e 0f 1f 84 00 00 00 00 00
 	 *
-	 * The swbp_optimize path comes with breakpoint already installed,
-	 * so we can skip this step for optimize == true.
+	 *    After a previous unoptimization bytes 5..9 may still contain the
+	 *    old call instruction, which remains valid for threads already there.
 	 */
-	if (!optimize) {
-		ctx.expect = EXPECT_CALL;
-		err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
-				   true /* is_register */, false /* do_update_ref_ctr */,
-				   &ctx);
-		if (err)
-			return err;
-	}
+	smp_text_poke_sync_each_cpu();
+
+	/*
+	 * 2) Rewrite the LEA tail and call displacement:
+	 *    cc [8d 64 24 80 e8 d0 d1 d2 d3]
+	 */
+	ctx.expect = EXPECT_SWBP;
+	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1,
+			   OPT_INSN_SIZE - 1, verify_insn,
+			   true /* is_register */, false /* do_update_ref_ctr */,
+			   &ctx);
+	if (err)
+		return err;
 
 	smp_text_poke_sync_each_cpu();
 
-	/* Write all but the first byte of the patched range. */
+	/*
+	 * 3) Publish the first LEA byte:
+	 *    [48] 8d 64 24 80 e8 d0 d1 d2 d3
+	 *
+	 *    From offset 0 this is:
+	 *      lea -0x80(%rsp), %rsp
+	 *      call <uprobe-trampoline>
+	 */
+	ctx.expect = EXPECT_SWBP_OPTIMIZED;
+	err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
+			   true /* is_register */, false /* do_update_ref_ctr */,
+			   &ctx);
+	if (err)
+		goto error;
+
+	smp_text_poke_sync_each_cpu();
+	return 0;
+
+error:
+	/*
+	 * In all intermediate states byte 0 is INT3, so EXPECT_SWBP covers every
+	 * case. Restore NOP bytes 1..4, but keep the valid CALL at bytes 5..9
+	 * for a thread that had already executed the LEA before a previous
+	 * unoptimization.
+	 */
 	ctx.expect = EXPECT_SWBP;
-	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+	uprobe_write(auprobe, vma, vaddr + 1, auprobe->insn + 1,
+		     LEA_INSN_SIZE - 1, verify_insn, true, false, &ctx);
+	smp_text_poke_sync_each_cpu();
+	return err;
+}
+
+static int int3_update_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+				  unsigned long vaddr, uprobe_opcode_t *insn)
+{
+	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+	struct write_opcode_ctx ctx = {
+		.base = vaddr,
+		.expect = EXPECT_OPTIMIZED,
+	};
+	int err;
+
+	/*
+	 * 1) Initial optimized state:
+	 *    48 8d 64 24 80 e8 d0 d1 d2 d3
+	 *
+	 * 2) Trap new entries before restoring the NOP bytes:
+	 *    [cc] 8d 64 24 80 e8 d0 d1 d2 d3
+	 */
+	err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
 			   true /* is_register */, false /* do_update_ref_ctr */,
 			   &ctx);
 	if (err)
@@ -943,13 +1026,31 @@ static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 	smp_text_poke_sync_each_cpu();
 
 	/*
-	 * Write first byte.
+	 * 3) Restore bytes 1..4 of the original NOP while keeping byte 0 trapped
+	 *    and byte 5 as CALL:
+	 *    cc [2e 0f 1f 84] e8 d0 d1 d2 d3
+	 */
+	ctx.expect = EXPECT_SWBP_OPTIMIZED;
+	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1,
+			   LEA_INSN_SIZE - 1, verify_insn,
+			   true /* is_register */, false /* do_update_ref_ctr */,
+			   &ctx);
+	if (err)
+		return err;
+
+	smp_text_poke_sync_each_cpu();
+
+	/*
+	 * 4) Publish the first byte of the original NOP:
+	 *    [66] 2e 0f 1f 84 e8 d0 d1 d2 d3
 	 *
-	 * The swbp_unoptimize needs to finish uprobe removal together
-	 * with ref_ctr update, using uprobe_write with proper flags.
+	 * From offset 0 this is the restored 10-byte NOP; the CALL opcode and
+	 * displacement are now only NOP operands.  Offset 5 still decodes as
+	 * CALL for a thread that was already there.
 	 */
+	ctx.expect = EXPECT_SWBP;
 	err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
-			   optimize /* is_register */, !optimize /* do_update_ref_ctr */,
+			   false /* is_register */, true /* do_update_ref_ctr */,
 			   &ctx);
 	if (err)
 		return err;
@@ -961,17 +1062,25 @@ static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 			 unsigned long vaddr, unsigned long tramp)
 {
-	u8 call[5];
+	u8 insn[OPT_INSN_SIZE], *call = &insn[LEA_INSN_SIZE];
 
-	__text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+	/*
+	 * We have nop10 instruction (with first byte overwritten to int3),
+	 * changing it to:
+	 *   lea -0x80(%rsp), %rsp
+	 *   call tramp
+	 */
+	memcpy(insn, lea_rsp, LEA_INSN_SIZE);
+	__text_gen_insn(call, CALL_INSN_OPCODE,
+			(const void *) (vaddr + LEA_INSN_SIZE),
 			(const void *) tramp, CALL_INSN_SIZE);
-	return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+	return int3_update_optimize(auprobe, vma, vaddr, insn);
 }
 
 static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 			   unsigned long vaddr)
 {
-	return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+	return int3_update_unoptimize(auprobe, vma, vaddr, auprobe->insn);
 }
 
 static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
@@ -993,19 +1102,19 @@ static bool __is_optimized(struct mm_struct *mm, uprobe_opcode_t *insn, unsigned
 	struct __packed __arch_relative_insn {
 		u8 op;
 		s32 raddr;
-	} *call = (struct __arch_relative_insn *) insn;
+	} *call = (struct __arch_relative_insn *)(insn + LEA_INSN_SIZE);
 
-	if (!is_call_insn(insn))
+	if (!is_opt_insns(insn))
 		return false;
-	return __in_uprobe_trampoline(mm, vaddr + 5 + call->raddr);
+	return __in_uprobe_trampoline(mm, vaddr + OPT_INSN_SIZE + call->raddr);
 }
 
 static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
 {
-	uprobe_opcode_t insn[5];
+	uprobe_opcode_t insn[OPT_INSN_SIZE];
 	int err;
 
-	err = copy_from_vaddr(mm, vaddr, &insn, 5);
+	err = copy_from_vaddr(mm, vaddr, &insn, OPT_INSN_SIZE);
 	if (err)
 		return err;
 	return __is_optimized(mm, (uprobe_opcode_t *)&insn, vaddr);
@@ -1077,7 +1186,7 @@ static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct
 void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
 {
 	struct mm_struct *mm = current->mm;
-	uprobe_opcode_t insn[5];
+	uprobe_opcode_t insn[OPT_INSN_SIZE];
 
 	if (!should_optimize(auprobe))
 		return;
@@ -1088,7 +1197,7 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
 	 * Check if some other thread already optimized the uprobe for us,
 	 * if it's the case just go away silently.
 	 */
-	if (copy_from_vaddr(mm, vaddr, &insn, 5))
+	if (copy_from_vaddr(mm, vaddr, &insn, OPT_INSN_SIZE))
 		goto unlock;
 	if (!is_swbp_insn((uprobe_opcode_t*) &insn))
 		goto unlock;
@@ -1104,16 +1213,32 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
 	mmap_write_unlock(mm);
 }
 
+static bool is_optimizable_nop10(struct insn *insn)
+{
+	static const u8 nop10_prefix[] = {
+		0x66, 0x2e, 0x0f, 0x1f, 0x84
+	};
+
+	/*
+	 * Restrict this to the 10-byte NOP form whose last 5 bytes are
+	 * SIB/displacement operands. Unoptimization keeps the call opcode and
+	 * displacement in those bytes, so other NOP encodings are not safe.
+	 */
+	return insn->length == OPT_INSN_SIZE &&
+	       insn_is_nop(insn) &&
+	       !memcmp(insn->kaddr, nop10_prefix, ARRAY_SIZE(nop10_prefix));
+}
+
 static bool can_optimize(struct insn *insn, unsigned long vaddr)
 {
-	if (!insn->x86_64 || insn->length != 5)
+	if (!insn->x86_64)
 		return false;
 
-	if (!insn_is_nop(insn))
+	if (!is_optimizable_nop10(insn))
 		return false;
 
 	/* We can't do cross page atomic writes yet. */
-	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= OPT_INSN_SIZE;
 }
 #else /* 32-bit: */
 /*
-- 
2.54.0


^ permalink raw reply related

* [PATCHv4 04/13] uprobes/x86: Unmap trampoline vma object in case it's unused
From: Jiri Olsa @ 2026-05-26 20:58 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260526205840.173790-1-jolsa@kernel.org>

In case the optimization fails, we leak the newly created trampoline
vma mapping (in case we just created it), so let's unmap it.

Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/kernel/uprobes.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index c8af41ed681a..af5af7d67999 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -677,11 +677,14 @@ static unsigned long find_nearest_trampoline(unsigned long vaddr)
 	return high_tramp;
 }
 
-static struct vm_area_struct *get_uprobe_trampoline(struct mm_struct *mm, unsigned long vaddr)
+static struct vm_area_struct *get_uprobe_trampoline(struct mm_struct *mm, unsigned long vaddr,
+						    bool *new_mapping)
 {
 	VMA_ITERATOR(vmi, mm, 0);
 	struct vm_area_struct *vma;
 
+	*new_mapping = false;
+
 	if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
 		return ERR_PTR(-EINVAL);
 
@@ -696,6 +699,7 @@ static struct vm_area_struct *get_uprobe_trampoline(struct mm_struct *mm, unsign
 	if (IS_ERR_VALUE(vaddr))
 		return ERR_PTR(vaddr);
 
+	*new_mapping = true;
 	return _install_special_mapping(mm, vaddr, PAGE_SIZE,
 				VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_IO,
 				&tramp_mapping);
@@ -1053,16 +1057,21 @@ static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct
 {
 	struct pt_regs *regs = task_pt_regs(current);
 	struct vm_area_struct *vma, *tramp;
+	bool new_mapping;
+	int ret;
 
 	if (!user_64bit_mode(regs))
 		return -EINVAL;
 	vma = find_vma(mm, vaddr);
 	if (!vma)
 		return -EINVAL;
-	tramp = get_uprobe_trampoline(mm, vaddr);
+	tramp = get_uprobe_trampoline(mm, vaddr, &new_mapping);
 	if (IS_ERR(tramp))
 		return PTR_ERR(tramp);
-	return WARN_ON_ONCE(swbp_optimize(auprobe, vma, vaddr, tramp->vm_start));
+	ret = swbp_optimize(auprobe, vma, vaddr, tramp->vm_start);
+	if (WARN_ON_ONCE(ret) && new_mapping)
+		WARN_ON_ONCE(do_munmap(mm, tramp->vm_start, PAGE_SIZE, NULL));
+	return ret;
 }
 
 void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
-- 
2.54.0


^ permalink raw reply related

* [PATCHv4 03/13] uprobes/x86: Allow to copy uprobe trampolines on fork
From: Jiri Olsa @ 2026-05-26 20:58 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260526205840.173790-1-jolsa@kernel.org>

When we do fork or clone without CLONE_VM the new process won't
have uprobe trampoline vma objects and at the same time it will
have optimized code calling that trampoline and crash.

Fixing this by allowing vma uprobe trampoline objects to be copied
on fork to the new process.

Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/kernel/uprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 733655bc610e..c8af41ed681a 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -697,7 +697,7 @@ static struct vm_area_struct *get_uprobe_trampoline(struct mm_struct *mm, unsign
 		return ERR_PTR(vaddr);
 
 	return _install_special_mapping(mm, vaddr, PAGE_SIZE,
-				VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+				VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_IO,
 				&tramp_mapping);
 }
 
-- 
2.54.0


^ permalink raw reply related

* [PATCHv4 02/13] uprobes/x86: Remove struct uprobe_trampoline object
From: Jiri Olsa @ 2026-05-26 20:58 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260526205840.173790-1-jolsa@kernel.org>

Removing struct uprobe_trampoline object and its tracking code,
because it's not needed. We can do same thing directly on top of
struct vm_area_struct objects.

This makes the code simpler and allows easy propagation of the
trampoline vma object into child process in following change.

Note the original code called destroy_uprobe_trampoline if the
optimization failed, but it only freed the struct uprobe_trampoline
object, not the vma. The new vma leak is fixed in following change.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/kernel/uprobes.c | 105 ++++++++------------------------------
 include/linux/uprobes.h   |   5 --
 kernel/events/uprobes.c   |  10 ----
 kernel/fork.c             |   1 -
 4 files changed, 21 insertions(+), 100 deletions(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 2be6707e3320..733655bc610e 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -631,11 +631,6 @@ static struct vm_special_mapping tramp_mapping = {
 	.pages  = tramp_mapping_pages,
 };
 
-struct uprobe_trampoline {
-	struct hlist_node	node;
-	unsigned long		vaddr;
-};
-
 static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
 {
 	long delta = (long)(vaddr + 5 - vtramp);
@@ -682,83 +677,28 @@ static unsigned long find_nearest_trampoline(unsigned long vaddr)
 	return high_tramp;
 }
 
-static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
+static struct vm_area_struct *get_uprobe_trampoline(struct mm_struct *mm, unsigned long vaddr)
 {
-	struct pt_regs *regs = task_pt_regs(current);
-	struct mm_struct *mm = current->mm;
-	struct uprobe_trampoline *tramp;
+	VMA_ITERATOR(vmi, mm, 0);
 	struct vm_area_struct *vma;
 
-	if (!user_64bit_mode(regs))
-		return NULL;
+	if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
+		return ERR_PTR(-EINVAL);
+
+	for_each_vma(vmi, vma) {
+		if (!vma_is_special_mapping(vma, &tramp_mapping))
+			continue;
+		if (is_reachable_by_call(vma->vm_start, vaddr))
+			return vma;
+	}
 
 	vaddr = find_nearest_trampoline(vaddr);
 	if (IS_ERR_VALUE(vaddr))
-		return NULL;
+		return ERR_PTR(vaddr);
 
-	tramp = kzalloc_obj(*tramp);
-	if (unlikely(!tramp))
-		return NULL;
-
-	tramp->vaddr = vaddr;
-	vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
+	return _install_special_mapping(mm, vaddr, PAGE_SIZE,
 				VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
 				&tramp_mapping);
-	if (IS_ERR(vma)) {
-		kfree(tramp);
-		return NULL;
-	}
-	return tramp;
-}
-
-static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
-{
-	struct uprobes_state *state = &current->mm->uprobes_state;
-	struct uprobe_trampoline *tramp = NULL;
-
-	if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
-		return NULL;
-
-	hlist_for_each_entry(tramp, &state->head_tramps, node) {
-		if (is_reachable_by_call(tramp->vaddr, vaddr)) {
-			*new = false;
-			return tramp;
-		}
-	}
-
-	tramp = create_uprobe_trampoline(vaddr);
-	if (!tramp)
-		return NULL;
-
-	*new = true;
-	hlist_add_head(&tramp->node, &state->head_tramps);
-	return tramp;
-}
-
-static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
-{
-	/*
-	 * We do not unmap and release uprobe trampoline page itself,
-	 * because there's no easy way to make sure none of the threads
-	 * is still inside the trampoline.
-	 */
-	hlist_del(&tramp->node);
-	kfree(tramp);
-}
-
-void arch_uprobe_init_state(struct mm_struct *mm)
-{
-	INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
-}
-
-void arch_uprobe_clear_state(struct mm_struct *mm)
-{
-	struct uprobes_state *state = &mm->uprobes_state;
-	struct uprobe_trampoline *tramp;
-	struct hlist_node *n;
-
-	hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
-		destroy_uprobe_trampoline(tramp);
 }
 
 static bool __in_uprobe_trampoline(struct mm_struct *mm, unsigned long ip)
@@ -1111,21 +1051,18 @@ int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
 				  unsigned long vaddr)
 {
-	struct uprobe_trampoline *tramp;
-	struct vm_area_struct *vma;
-	bool new = false;
-	int err = 0;
+	struct pt_regs *regs = task_pt_regs(current);
+	struct vm_area_struct *vma, *tramp;
 
+	if (!user_64bit_mode(regs))
+		return -EINVAL;
 	vma = find_vma(mm, vaddr);
 	if (!vma)
 		return -EINVAL;
-	tramp = get_uprobe_trampoline(vaddr, &new);
-	if (!tramp)
-		return -EINVAL;
-	err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
-	if (WARN_ON_ONCE(err) && new)
-		destroy_uprobe_trampoline(tramp);
-	return err;
+	tramp = get_uprobe_trampoline(mm, vaddr);
+	if (IS_ERR(tramp))
+		return PTR_ERR(tramp);
+	return WARN_ON_ONCE(swbp_optimize(auprobe, vma, vaddr, tramp->vm_start));
 }
 
 void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f548fea2adec..18be159bbc34 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -186,9 +186,6 @@ struct xol_area;
 
 struct uprobes_state {
 	struct xol_area		*xol_area;
-#ifdef CONFIG_X86_64
-	struct hlist_head	head_tramps;
-#endif
 };
 
 typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
@@ -238,8 +235,6 @@ extern void uprobe_handle_trampoline(struct pt_regs *regs);
 extern void *arch_uretprobe_trampoline(unsigned long *psize);
 extern unsigned long uprobe_get_trampoline_vaddr(void);
 extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
-extern void arch_uprobe_clear_state(struct mm_struct *mm);
-extern void arch_uprobe_init_state(struct mm_struct *mm);
 extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
 extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
 extern unsigned long arch_uprobe_get_xol_area(void);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4084e926e284..b5c516168f84 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1806,14 +1806,6 @@ static struct xol_area *get_xol_area(void)
 	return area;
 }
 
-void __weak arch_uprobe_clear_state(struct mm_struct *mm)
-{
-}
-
-void __weak arch_uprobe_init_state(struct mm_struct *mm)
-{
-}
-
 /*
  * uprobe_clear_state - Free the area allocated for slots.
  */
@@ -1825,8 +1817,6 @@ void uprobe_clear_state(struct mm_struct *mm)
 	delayed_uprobe_remove(NULL, mm);
 	mutex_unlock(&delayed_uprobe_lock);
 
-	arch_uprobe_clear_state(mm);
-
 	if (!area)
 		return;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5f3fdfdb14c7..9c6baabdc961 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1059,7 +1059,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 {
 #ifdef CONFIG_UPROBES
 	mm->uprobes_state.xol_area = NULL;
-	arch_uprobe_init_state(mm);
 #endif
 }
 
-- 
2.54.0


^ permalink raw reply related

* [PATCHv4 01/13] uprobes/x86: Use proper mm_struct in __in_uprobe_trampoline
From: Jiri Olsa @ 2026-05-26 20:58 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260526205840.173790-1-jolsa@kernel.org>

In the unregister path we use __in_uprobe_trampoline check with
current->mm for the VMA lookup, which is wrong, because we are
in the tracer context, not the traced process.

Add an mm_struct pointer argument to __in_uprobe_trampoline and
change the related callers to pass the proper mm_struct pointer.

Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/kernel/uprobes.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index ebb1baf1eb1d..2be6707e3320 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -761,9 +761,9 @@ void arch_uprobe_clear_state(struct mm_struct *mm)
 		destroy_uprobe_trampoline(tramp);
 }
 
-static bool __in_uprobe_trampoline(unsigned long ip)
+static bool __in_uprobe_trampoline(struct mm_struct *mm, unsigned long ip)
 {
-	struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+	struct vm_area_struct *vma = vma_lookup(mm, ip);
 
 	return vma && vma_is_special_mapping(vma, &tramp_mapping);
 }
@@ -776,14 +776,14 @@ static bool in_uprobe_trampoline(unsigned long ip)
 
 	rcu_read_lock();
 	if (mmap_lock_speculate_try_begin(mm, &seq)) {
-		found = __in_uprobe_trampoline(ip);
+		found = __in_uprobe_trampoline(mm, ip);
 		retry = mmap_lock_speculate_retry(mm, seq);
 	}
 	rcu_read_unlock();
 
 	if (retry) {
 		mmap_read_lock(mm);
-		found = __in_uprobe_trampoline(ip);
+		found = __in_uprobe_trampoline(mm, ip);
 		mmap_read_unlock(mm);
 	}
 	return found;
@@ -1044,7 +1044,7 @@ static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst,
 	return 0;
 }
 
-static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+static bool __is_optimized(struct mm_struct *mm, uprobe_opcode_t *insn, unsigned long vaddr)
 {
 	struct __packed __arch_relative_insn {
 		u8 op;
@@ -1053,7 +1053,7 @@ static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
 
 	if (!is_call_insn(insn))
 		return false;
-	return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+	return __in_uprobe_trampoline(mm, vaddr + 5 + call->raddr);
 }
 
 static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
@@ -1064,7 +1064,7 @@ static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
 	err = copy_from_vaddr(mm, vaddr, &insn, 5);
 	if (err)
 		return err;
-	return __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+	return __is_optimized(mm, (uprobe_opcode_t *)&insn, vaddr);
 }
 
 static bool should_optimize(struct arch_uprobe *auprobe)
-- 
2.54.0


^ permalink raw reply related

* [PATCHv4 00/13] uprobes/x86: Fix red zone issue for optimized uprobes
From: Jiri Olsa @ 2026-05-26 20:58 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel

hi,
Andrii reported an issue with optimized uprobes [1] that can clobber
redzone area with call instruction storing return address on stack
where user code may keep temporary data without adjusting rsp.

Fixing this by moving the optimized uprobes on top of 10-bytes nop
instruction, so we can squeeze another instruction to escape the
redzone area before doing the call.

Note we need upstream update first for patch 3 (github.com/libbpf/usdt),
if we decide to take this change.

thanks,
jirka


v1: https://lore.kernel.org/bpf/20260514135342.22130-1-jolsa@kernel.org/
v2: https://lore.kernel.org/bpf/20260518105957.123445-1-jolsa@kernel.org/
v3: https://lore.kernel.org/bpf/20260521124411.31133-1-jolsa@kernel.org/

v4 changes:
- do not use 2nd int3 (on +5 offset) because the call instruction
  is always the same for the given nop10 address [Andrii/Peter]
- unmap unused trampoline vma after unsuccessful optimization [sashiko]
- small change to patch#2 moved user_64bit_mode earlier in the path
  and pass/use mm_struct pointer directly from arch_uprobe_optimize
  instead of getting current->mm
  Andrii, keeping your ack, please shout otherwise

v3 changes:
- use nop10 update suggested by Peter in [2]
- remove struct uprobe_trampoline object, use vma objects directly instead
- selftests fixes [sashiko]
- ack from Andrii

v2 changes:
- several selftest fixes [sashiko]
- consolidate is_lea_insn and is_call_insn into single check [Jakub Sitnicki]
- use proper mm_struct object in __in_uprobe_trampoline check [sashiko]
- allow to copy uprobe trampolines vma objects on fork [sashiko]
- change uprobe syscall detection error from -ENXIO to -EPROTO [Andrii]
- added fork/clone tests
- I kept the selftest changes and nop5->nop10 changes in separate
  commits for easier review, we can squash them later if we want to keep
  bisect working properly


[1] https://lore.kernel.org/bpf/20260509003146.976844-1-andrii@kernel.org/
[2] https://lore.kernel.org/bpf/20260518104306.GU3102624@noisy.programming.kicks-ass.net/#t
---
Andrii Nakryiko (1):
      selftests/bpf: Add tests for uprobe nop10 red zone clobbering

Jiri Olsa (12):
      uprobes/x86: Use proper mm_struct in __in_uprobe_trampoline
      uprobes/x86: Remove struct uprobe_trampoline object
      uprobes/x86: Allow to copy uprobe trampolines on fork
      uprobes/x86: Unmap trampoline vma object in case it's unused
      uprobes/x86: Move optimized uprobe from nop5 to nop10
      libbpf: Change has_nop_combo to work on top of nop10
      libbpf: Detect uprobe syscall with new error
      selftests/bpf: Emit nop,nop10 instructions combo for x86_64 arch
      selftests/bpf: Change uprobe syscall tests to use nop10
      selftests/bpf: Change uprobe/usdt trigger bench code to use nop10
      selftests/bpf: Add reattach tests for uprobe syscall
      selftests/bpf: Add tests for forked/cloned optimized uprobes

 arch/x86/kernel/uprobes.c                               | 379 +++++++++++++++++++++++++++++++++++++++++++-----------------------------
 include/linux/uprobes.h                                 |   5 -
 kernel/events/uprobes.c                                 |  10 --
 kernel/fork.c                                           |   1 -
 tools/lib/bpf/features.c                                |   4 +-
 tools/lib/bpf/usdt.c                                    |  16 +--
 tools/testing/selftests/bpf/bench.c                     |  20 ++--
 tools/testing/selftests/bpf/benchs/bench_trigger.c      |  38 ++++----
 tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh |   2 +-
 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 307 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 tools/testing/selftests/bpf/prog_tests/usdt.c           |  74 ++++++++++++--
 tools/testing/selftests/bpf/progs/test_usdt.c           |  25 +++++
 tools/testing/selftests/bpf/usdt.h                      |   2 +-
 tools/testing/selftests/bpf/usdt_2.c                    |  15 ++-
 14 files changed, 653 insertions(+), 245 deletions(-)

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] trace: stack trace deduplication for ftrace ring buffer
From: Steven Rostedt @ 2026-05-26 19:39 UTC (permalink / raw)
  To: Li Pengfei
  Cc: mhiramat, linux-trace-kernel, linux-kernel, cmllamas, zhangbo56,
	Pengfei Li
In-Reply-To: <cover.1779769138.git.lipengfei28@xiaomi.com>


Please DO NOT SEND new versions of a patch or patch series as a reply to
the old one. It makes it extremely difficult for maintainers to manage the
replies and patches.

A new version should ALWAYS start a new email thread!

On Tue, 26 May 2026 19:52:42 +0800
Li Pengfei <ljdlns1987@gmail.com> wrote:

> From: Pengfei Li <lipengfei28@xiaomi.com>
> 
> Hi Masami, Steven, all,
> 
> This is v3 of the ftrace stackmap series. It addresses the Sashiko
> review on v2 [1] that Masami pointed out.
> 
> [1] https://sashiko.dev/#/patchset/20260522104017.1668638-1-lipengfei28%40xiaomi.com
> 
> The series adds stack trace deduplication to ftrace. When the
> stacktrace option is enabled, the ring buffer stores a 4-byte
> stack_id instead of a full kernel stack trace, while the full
> stacks are exported via tracefs.
> 
> Rebased onto v7.1-rc5 (e8c2f9fdadee) before sending.
> 
> Changes since v2
> ================

Then you can use lore to add a link to the old version via the Message-ID
of the old version. You can have the above as:

 Changes since v2: https://lore.kernel.org/all/20260522104017.1668638-1-lipengfei28@xiaomi.com/

-- Steve

^ permalink raw reply

* [PATCH 6.12] x86/fgraph: Fix return_to_handler regs.rsp value
From: Gyokhan Kochmarla @ 2026-05-26 19:23 UTC (permalink / raw)
  To: stable, gregkh
  Cc: jolsa, rostedt, mhiramat, tglx, mingo, bp, x86,
	linux-trace-kernel, bpf, Andrii Nakryiko, Gyokhan Kochmarla

From: Jiri Olsa <jolsa@kernel.org>

commit 8bc11700e0d23d4fdb7d8d5a73b2e95de427cabc upstream.

The previous change (Fixes commit) messed up the rsp register value,
which is wrong because it's already adjusted with FRAME_SIZE, we need
the original rsp value.

This change does not affect fprobe current kernel unwind, the !perf_hw_regs
path perf_callchain_kernel:

        if (perf_hw_regs(regs)) {
                if (perf_callchain_store(entry, regs->ip))
                        return;
                unwind_start(&state, current, regs, NULL);
        } else {
                unwind_start(&state, current, NULL, (void *)regs->sp);
        }

which uses pt_regs.sp as first_frame boundary (FRAME_SIZE shift makes
no difference, unwind still stops at the right frame).

This change fixes the other path when we want to unwind directly from
pt_regs sp/fp/ip state, which is coming in following change.

Fixes: 20a0bc10272f ("x86/fgraph,bpf: Fix stack ORC unwind from kprobe_multi return probe")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/bpf/20260126211837.472802-2-jolsa@kernel.org
Signed-off-by: Gyokhan Kochmarla <gyokhan@amazon.de>
---
 arch/x86/kernel/ftrace_64.S | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 8a3cff618692..143fc62bf6f8 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -349,6 +349,9 @@ SYM_CODE_START(return_to_handler)
 	UNWIND_HINT_UNDEFINED
 	ANNOTATE_NOENDBR
 
+	/* Store original rsp for pt_regs.sp value. */
+	movq %rsp, %rdi
+
 	/* Restore return_to_handler value that got eaten by previous ret instruction. */
 	subq $8, %rsp
 	UNWIND_HINT_FUNC
@@ -359,7 +362,7 @@ SYM_CODE_START(return_to_handler)
 	movq %rax, RAX(%rsp)
 	movq %rdx, RDX(%rsp)
 	movq %rbp, RBP(%rsp)
-	movq %rsp, RSP(%rsp)
+	movq %rdi, RSP(%rsp)
 	movq %rsp, %rdi
 
 	call ftrace_return_to_handler
-- 
2.47.3




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related

* [PATCH 6.12] tracing: Fix the bug where bpf_get_stackid returns -EFAULT on the ARM64
From: Gyokhan Kochmarla @ 2026-05-26 19:20 UTC (permalink / raw)
  To: stable, gregkh
  Cc: yangfeng, rostedt, mhiramat, mark.rutland, catalin.marinas, will,
	jolsa, linux-trace-kernel, linux-arm-kernel, bpf,
	Gyokhan Kochmarla

From: Feng Yang <yangfeng@kylinos.cn>

commit fd2f74f8f3d3c1a524637caf5bead9757fae4332 upstream.

When using bpf_program__attach_kprobe_multi_opts on ARM64 to hook a BPF program
that contains the bpf_get_stackid function, the BPF program fails
to obtain the stack trace and returns -EFAULT.

This is because ftrace_partial_regs omits the configuration of the pstate register,
leaving pstate at the default value of 0. When get_perf_callchain executes,
it uses user_mode(regs) to determine whether it is in kernel mode.
This leads to a misjudgment that the code is in user mode,
so perf_callchain_kernel is not executed and the function returns directly.
As a result, trace->nr becomes 0, and finally -EFAULT is returned.

Therefore, the assignment of the pstate register is added here.

Fixes: b9b55c8912ce ("tracing: Add ftrace_partial_regs() for converting ftrace_regs to pt_regs")
Closes: https://lore.kernel.org/bpf/20250919071902.554223-1-yangfeng59949@163.com/
Signed-off-by: Feng Yang <yangfeng@kylinos.cn>
Tested-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Gyokhan Kochmarla <gyokhan@amazon.de>
---
 arch/arm64/include/asm/ftrace.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index 10e56522122a..46d4300dd48d 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -145,6 +145,7 @@ ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs *regs)
 	regs->pc = afregs->pc;
 	regs->regs[29] = afregs->fp;
 	regs->regs[30] = afregs->lr;
+	regs->pstate = PSR_MODE_EL1h;
 	return regs;
 }
 
-- 
2.47.3




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related

* Re: [PATCH v21 0/9] ring-buffer: Making persistent ring buffers robust
From: Steven Rostedt @ 2026-05-26 17:42 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: linux-kernel, linux-trace-kernel, Mark Rutland, Mathieu Desnoyers,
	Andrew Morton, Ian Rogers
In-Reply-To: <20260526141746.cf0b3c0bed5db3baf8914095@kernel.org>

On Tue, 26 May 2026 14:17:46 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Fri, 22 May 2026 13:08:57 -0400
> Steven Rostedt <rostedt@kernel.org> wrote:
> 
> > This is to make the persistent ring buffer more robust when sub-buffers
> > are detected to be corrupted. Instead of invalidating the entire buffer,
> > just invalidate the individual sub-buffers.
> > 
> > I started with Masami's patches and modified some from Sashiko reviews.
> > I added a few patches to display the dropped events when the persistent
> > ring buffers validation checks found sub-buffers were dropped due to being
> > corrupted data.  
> 
> It seems that Sashiko still marks it "Incompleted".
> Maybe we need base-commit: tag in this cover mail?
> I also guess that this series does not use "In-Reply-To:" but
> only uses "References:" tag in the mail header. I guess
> Sashiko's mail header parser missed it.

But this applies to mainline. I'm not sure why it's having problems.

-- Steve

^ permalink raw reply

* Re: [PATCH v21 8/9] ring-buffer: Show persistent buffer dropped events in trace file
From: Steven Rostedt @ 2026-05-26 17:41 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: linux-kernel, linux-trace-kernel, Mark Rutland, Mathieu Desnoyers,
	Andrew Morton, Ian Rogers
In-Reply-To: <20260526140609.28faf45d9d347e5d748e7cf1@kernel.org>

On Tue, 26 May 2026 14:06:09 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> > @@ -7204,10 +7209,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
> >  	 * Set a flag in the commit field if we lost events
> >  	 */
> >  	if (missed_events) {
> > -		/* If there is room at the end of the page to save the
> > +		/*
> > +		 * If there is room at the end of the page to save the
> >  		 * missed events, then record it there.
> >  		 */
> > -		if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
> > +		if (missed_events > 0 &&
> > +		    buffer->subbuf_size - commit >= sizeof(missed_events)) {
> >  			memcpy(&dpage->data[commit], &missed_events,
> >  			       sizeof(missed_events));
> >  			local_add(RB_MISSED_STORED, &dpage->commit);  
> 
> After this line, we "add" RB_MISSED_EVENTS instead of set.
> In this case, does it clear the RB_MISSED_EVENTS bit because
> it already sets RB_MISSED_EVENTS.
> 
> 			commit += sizeof(missed_events);
> 		}
> 		local_add(RB_MISSED_EVENTS, &bpage->commit);
>                       ^^^ here.

Perhaps this needs to be commented better.

The answer to your question is "No". The reason is that this is a *copy* of
the page we are reading. As persistent pages are always assigned to
specific memory, it can never leave the buffer even for the splice system
call. It is always copied to a new page.

The new page doesn't have these bits set and needs to set them depending on
what was found when reading the page from the buffer.

Now if this was a normal ring buffer where it did a zero copy from the
buffer itself by swapping pages with the passed in page, if the bit was set
before, then adding would cause a problem. But normal ring buffer pages
never set these bits while in the buffer. They are only set by this function.

-- Steve

^ permalink raw reply

* [PATCH v4 2/2] serial: qcom-geni: Add tracepoints for Qualcomm GENI serial driver
From: Praveen Talari @ 2026-05-26 17:37 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Greg Kroah-Hartman, Jiri Slaby, konrad.dybcio
  Cc: Praveen Talari, linux-kernel, linux-trace-kernel, linux-arm-msm,
	linux-serial, mukesh.savaliya, aniket.randive, chandana.chiluveru
In-Reply-To: <20260526-add-tracepoints-for-qcom-geni-serial-v4-0-e94fbaec0232@oss.qualcomm.com>

Add tracing to the Qualcomm GENI serial driver to improve runtime
observability.

Trace hooks are added at key points including termios and clock
configuration, manual control get/set, interrupt handling, and data
TX/RX paths.

Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Signed-off-by: Praveen Talari <praveen.talari@oss.qualcomm.com>
---
v2->v3:
- Updated commit text(removed example as it was available on cover
  letter).
---
 drivers/tty/serial/qcom_geni_serial.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index d81b539cff7f..4b62e58d4918 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -7,6 +7,9 @@
 /* Disable MMIO tracing to prevent excessive logging of unwanted MMIO traces */
 #define __DISABLE_TRACE_MMIO__
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/qcom_geni_serial.h>
+
 #include <linux/clk.h>
 #include <linux/console.h>
 #include <linux/io.h>
@@ -226,7 +229,7 @@ static void qcom_geni_serial_config_port(struct uart_port *uport, int cfg_flags)
 static unsigned int qcom_geni_serial_get_mctrl(struct uart_port *uport)
 {
 	unsigned int mctrl = TIOCM_DSR | TIOCM_CAR;
-	u32 geni_ios;
+	u32 geni_ios = 0;
 
 	if (uart_console(uport)) {
 		mctrl |= TIOCM_CTS;
@@ -236,6 +239,8 @@ static unsigned int qcom_geni_serial_get_mctrl(struct uart_port *uport)
 			mctrl |= TIOCM_CTS;
 	}
 
+	trace_geni_serial_get_mctrl(uport->dev, mctrl, geni_ios);
+
 	return mctrl;
 }
 
@@ -254,6 +259,8 @@ static void qcom_geni_serial_set_mctrl(struct uart_port *uport,
 	if (port->manual_flow && !(mctrl & TIOCM_RTS) && !uport->suspended)
 		uart_manual_rfr = UART_MANUAL_RFR_EN | UART_RFR_NOT_READY;
 	writel(uart_manual_rfr, uport->membase + SE_UART_MANUAL_RFR);
+
+	trace_geni_serial_set_mctrl(uport->dev, mctrl, uart_manual_rfr);
 }
 
 static const char *qcom_geni_serial_get_type(struct uart_port *uport)
@@ -684,6 +691,8 @@ static void qcom_geni_serial_start_tx_dma(struct uart_port *uport)
 	xmit_size = kfifo_out_linear_ptr(&tport->xmit_fifo, &tail,
 			UART_XMIT_SIZE);
 
+	trace_geni_serial_tx_data(uport->dev, tail, xmit_size);
+
 	qcom_geni_set_rs485_mode(uport, SER_RS485_RTS_ON_SEND);
 
 	qcom_geni_serial_setup_tx(uport, xmit_size);
@@ -910,8 +919,10 @@ static void qcom_geni_serial_handle_rx_dma(struct uart_port *uport, bool drop)
 		return;
 	}
 
-	if (!drop)
+	if (!drop) {
+		trace_geni_serial_rx_data(uport->dev, port->rx_buf, rx_in);
 		handle_rx_uart(uport, rx_in);
+	}
 
 	ret = geni_se_rx_dma_prep(&port->se, port->rx_buf,
 				  DMA_RX_BUF_SIZE,
@@ -1082,6 +1093,10 @@ static irqreturn_t qcom_geni_serial_isr(int isr, void *dev)
 	geni_status = readl(uport->membase + SE_GENI_STATUS);
 	dma = readl(uport->membase + SE_GENI_DMA_MODE_EN);
 	m_irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN);
+
+	trace_geni_serial_irq(uport->dev, m_irq_status, s_irq_status,
+			      dma_tx_status, dma_rx_status);
+
 	writel(m_irq_status, uport->membase + SE_GENI_M_IRQ_CLEAR);
 	writel(s_irq_status, uport->membase + SE_GENI_S_IRQ_CLEAR);
 	writel(dma_tx_status, uport->membase + SE_DMA_TX_IRQ_CLR);
@@ -1294,8 +1309,8 @@ static int geni_serial_set_rate(struct uart_port *uport, unsigned int baud)
 		return -EINVAL;
 	}
 
-	dev_dbg(port->se.dev, "desired_rate = %u, clk_rate = %lu, clk_div = %u, clk_idx = %u\n",
-		baud * sampling_rate, clk_rate, clk_div, clk_idx);
+	trace_geni_serial_clk_cfg(uport->dev, baud * sampling_rate, clk_rate,
+				  clk_div, clk_idx);
 
 	uport->uartclk = clk_rate;
 	port->clk_rate = clk_rate;
@@ -1455,6 +1470,10 @@ static void qcom_geni_serial_set_termios(struct uart_port *uport,
 	writel(bits_per_char, uport->membase + SE_UART_TX_WORD_LEN);
 	writel(bits_per_char, uport->membase + SE_UART_RX_WORD_LEN);
 	writel(stop_bit_len, uport->membase + SE_UART_TX_STOP_BIT_LEN);
+
+	trace_geni_serial_set_termios(uport->dev, baud, bits_per_char,
+				      tx_trans_cfg, tx_parity_cfg, rx_trans_cfg,
+				      rx_parity_cfg, stop_bit_len);
 }
 
 #ifdef CONFIG_SERIAL_QCOM_GENI_CONSOLE

-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 1/2] serial: qcom-geni: trace: Add tracepoint support for Qualcomm GENI serial
From: Praveen Talari @ 2026-05-26 17:37 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Greg Kroah-Hartman, Jiri Slaby, konrad.dybcio
  Cc: Praveen Talari, linux-kernel, linux-trace-kernel, linux-arm-msm,
	linux-serial, mukesh.savaliya, aniket.randive, chandana.chiluveru
In-Reply-To: <20260526-add-tracepoints-for-qcom-geni-serial-v4-0-e94fbaec0232@oss.qualcomm.com>

Add tracepoint support to the Qualcomm GENI serial driver to provide
runtime visibility into driver behavior without requiring invasive debug
patches.

The trace events cover UART termios configuration, clock setup, modem
control state, interrupt status, and TX/RX data, making it easier to
diagnose communication issues in the field.

Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Signed-off-by: Praveen Talari <praveen.talari@oss.qualcomm.com>
---
v2->v3:
- Removed \n from geni_serial_tx_data and geni_serial_rx_data events.
- Resolved alignment issues in geni_serial_data, geni_serial_tx_data and
  geni_serial_rx_data events.

v1->v2:
- Removed multiple TX/RX trace events, instead used
  DECLARE_EVENT_CLASS and DEFINE_EVENT.
---
 include/trace/events/qcom_geni_serial.h | 164 ++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/include/trace/events/qcom_geni_serial.h b/include/trace/events/qcom_geni_serial.h
new file mode 100644
index 000000000000..417ec01f9fc8
--- /dev/null
+++ b/include/trace/events/qcom_geni_serial.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM qcom_geni_serial
+
+#if !defined(_TRACE_QCOM_GENI_SERIAL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_QCOM_GENI_SERIAL_H
+
+#include <linux/device.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(geni_serial_set_termios,
+	    TP_PROTO(struct device *dev, unsigned int baud,
+		     unsigned int bits_per_char, u32 tx_trans_cfg,
+		     u32 tx_parity_cfg, u32 rx_trans_cfg,
+		     u32 rx_parity_cfg, u32 stop_bit_len),
+	    TP_ARGS(dev, baud, bits_per_char, tx_trans_cfg, tx_parity_cfg,
+		    rx_trans_cfg, rx_parity_cfg, stop_bit_len),
+
+	    TP_STRUCT__entry(__string(name, dev_name(dev))
+			     __field(unsigned int, baud)
+			     __field(unsigned int, bits_per_char)
+			     __field(u32, tx_trans_cfg)
+			     __field(u32, tx_parity_cfg)
+			     __field(u32, rx_trans_cfg)
+			     __field(u32, rx_parity_cfg)
+			     __field(u32, stop_bit_len)
+	    ),
+
+	    TP_fast_assign(__assign_str(name);
+			   __entry->baud = baud;
+			   __entry->bits_per_char = bits_per_char;
+			   __entry->tx_trans_cfg = tx_trans_cfg;
+			   __entry->tx_parity_cfg = tx_parity_cfg;
+			   __entry->rx_trans_cfg = rx_trans_cfg;
+			   __entry->rx_parity_cfg = rx_parity_cfg;
+			   __entry->stop_bit_len = stop_bit_len;
+	    ),
+
+	    TP_printk("%s: baud=%u bpc=%u tx_trans=0x%08x tx_par=0x%08x rx_trans=0x%08x rx_par=0x%08x stop=%u",
+		      __get_str(name), __entry->baud, __entry->bits_per_char,
+		      __entry->tx_trans_cfg, __entry->tx_parity_cfg,
+		      __entry->rx_trans_cfg, __entry->rx_parity_cfg,
+		      __entry->stop_bit_len)
+);
+
+TRACE_EVENT(geni_serial_clk_cfg,
+	    TP_PROTO(struct device *dev, unsigned int desired_rate,
+		     unsigned long clk_rate, unsigned int clk_div,
+		     unsigned int clk_idx),
+	    TP_ARGS(dev, desired_rate, clk_rate, clk_div, clk_idx),
+
+	    TP_STRUCT__entry(__string(name, dev_name(dev))
+			     __field(unsigned int, desired_rate)
+			     __field(unsigned long, clk_rate)
+			     __field(unsigned int, clk_div)
+			     __field(unsigned int, clk_idx)
+	    ),
+
+	    TP_fast_assign(__assign_str(name);
+			   __entry->desired_rate = desired_rate;
+			   __entry->clk_rate = clk_rate;
+			   __entry->clk_div = clk_div;
+			   __entry->clk_idx = clk_idx;
+	    ),
+
+	    TP_printk("%s: desired_rate=%u clk_rate=%lu clk_div=%u clk_idx=%u",
+		      __get_str(name), __entry->desired_rate, __entry->clk_rate,
+		      __entry->clk_div, __entry->clk_idx)
+);
+
+TRACE_EVENT(geni_serial_irq,
+	    TP_PROTO(struct device *dev, u32 m_irq, u32 s_irq,
+		     u32 dma_tx, u32 dma_rx),
+	    TP_ARGS(dev, m_irq, s_irq, dma_tx, dma_rx),
+
+	    TP_STRUCT__entry(__string(name, dev_name(dev))
+			     __field(u32, m_irq)
+			     __field(u32, s_irq)
+			     __field(u32, dma_tx)
+			     __field(u32, dma_rx)
+	    ),
+
+	    TP_fast_assign(__assign_str(name);
+			   __entry->m_irq = m_irq;
+			   __entry->s_irq = s_irq;
+			   __entry->dma_tx = dma_tx;
+			   __entry->dma_rx = dma_rx;
+	    ),
+
+	    TP_printk("%s: m_irq=0x%08x s_irq=0x%08x dma_tx=0x%08x dma_rx=0x%08x",
+		      __get_str(name), __entry->m_irq, __entry->s_irq,
+		      __entry->dma_tx, __entry->dma_rx)
+);
+
+DECLARE_EVENT_CLASS(geni_serial_data,
+		    TP_PROTO(struct device *dev, const u8 *buf, unsigned int len),
+		    TP_ARGS(dev, buf, len),
+
+		    TP_STRUCT__entry(__string(name, dev_name(dev))
+				     __field(unsigned int, len)
+				     __dynamic_array(u8, data, len)
+		    ),
+
+		    TP_fast_assign(__assign_str(name);
+				   __entry->len = len;
+				   memcpy(__get_dynamic_array(data), buf, len);
+		    ),
+
+		    TP_printk("%s: len=%u data=%s",
+			      __get_str(name), __entry->len,
+			      __print_hex(__get_dynamic_array(data), __entry->len))
+);
+
+DEFINE_EVENT(geni_serial_data, geni_serial_tx_data,
+	     TP_PROTO(struct device *dev, const u8 *buf, unsigned int len),
+	     TP_ARGS(dev, buf, len)
+);
+
+DEFINE_EVENT(geni_serial_data, geni_serial_rx_data,
+	     TP_PROTO(struct device *dev, const u8 *buf, unsigned int len),
+	     TP_ARGS(dev, buf, len)
+);
+
+TRACE_EVENT(geni_serial_set_mctrl,
+	    TP_PROTO(struct device *dev, unsigned int mctrl,
+		     u32 uart_manual_rfr),
+	    TP_ARGS(dev, mctrl, uart_manual_rfr),
+
+	    TP_STRUCT__entry(__string(name, dev_name(dev))
+			     __field(unsigned int, mctrl)
+			     __field(u32, uart_manual_rfr)
+	    ),
+
+	    TP_fast_assign(__assign_str(name);
+			   __entry->mctrl = mctrl;
+			   __entry->uart_manual_rfr = uart_manual_rfr;
+	    ),
+
+	    TP_printk("%s: mctrl=0x%04x uart_manual_rfr=0x%08x",
+		      __get_str(name), __entry->mctrl, __entry->uart_manual_rfr)
+);
+
+TRACE_EVENT(geni_serial_get_mctrl,
+	    TP_PROTO(struct device *dev, unsigned int mctrl, u32 geni_ios),
+	    TP_ARGS(dev, mctrl, geni_ios),
+
+	    TP_STRUCT__entry(__string(name, dev_name(dev))
+			     __field(unsigned int, mctrl)
+			     __field(u32, geni_ios)
+	    ),
+
+	    TP_fast_assign(__assign_str(name);
+			   __entry->mctrl = mctrl;
+			   __entry->geni_ios = geni_ios;
+	    ),
+
+	    TP_printk("%s: mctrl=0x%04x geni_ios=0x%08x",
+		      __get_str(name), __entry->mctrl, __entry->geni_ios)
+);
+
+#endif /* _TRACE_QCOM_GENI_SERIAL_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 0/2] Add tracepoints support for Qualcomm GENI Serial drivers
From: Praveen Talari @ 2026-05-26 17:37 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Greg Kroah-Hartman, Jiri Slaby, konrad.dybcio
  Cc: Praveen Talari, linux-kernel, linux-trace-kernel, linux-arm-msm,
	linux-serial, mukesh.savaliya, aniket.randive, chandana.chiluveru

Add tracepoints to the Qualcomm GENI (Generic Interface) serial driver.
These trace events enable runtime debugging and performance analysis of
UART operations.

The trace events cover UART termios configuration, clock setup, modem
control state, interrupt status, and actual transmitted/received data in
hexadecimal format.

Usage examples:

Enable all serial traces:
  echo 1 > /sys/kernel/debug/tracing/events/qcom_geni_serial/enable
  cat /sys/kernel/debug/tracing/trace_pipe

Example trace output:

2517.938432: geni_serial_clk_cfg: a94000.serial: desired_rate=1843200
     clk_rate=7372800 clk_div=4 clk_idx=0
2517.938753: geni_serial_irq: a94000.serial: m_irq=0x88800000
     s_irq=0x08000111 dma_tx=0x00000000 dma_rx=0x00000000
2517.938803: geni_serial_set_termios: a94000.serial: baud=115200 bpc=8
     tx_trans=0x00000002 tx_par=0x00000000 rx_trans=0x00000000
rx_par=0x00000000 stop=0
2517.938807: geni_serial_set_mctrl: a94000.serial: mctrl=0x8006
     uart_manual_rfr=0x00000000
2517.938818: geni_serial_get_mctrl: a94000.serial: mctrl=0x0160
     geni_ios=0x00000001
2517.939165: geni_serial_irq: a94000.serial: m_irq=0x00400000
     s_irq=0x00000000 dma_tx=0x00000000 dma_rx=0x00000000
2517.939592: geni_serial_tx_data: a94000.serial: len=8 data=61 62 63
     64 65 66 67 68
2517.940610: geni_serial_irq: a94000.serial: m_irq=0x00000001
     s_irq=0x00000000 dma_tx=0x00000003 dma_rx=0x00000000
2517.942174: geni_serial_irq: a94000.serial: m_irq=0x08000000
     s_irq=0x08000100 dma_tx=0x00000000 dma_rx=0x00000003
2517.942323: geni_serial_rx_data: a94000.serial: len=8 data=61 62 63
     64 65 66 67 68
2517.942680: geni_serial_set_mctrl: a94000.serial: mctrl=0x8000
     uart_manual_rfr=0x80000002

Signed-off-by: Praveen Talari <praveen.talari@oss.qualcomm.com>
---
Changes in v4:
- Rebased patch(02/02) on latest linux-next.
- Link to v3: https://lore.kernel.org/all/20260518-add-tracepoints-for-qcom-geni-serial-v3-0-b4addb151376@oss.qualcomm.com

Changes in v3:
- Removed \n from geni_serial_tx_data and geni_serial_rx_data events.
- Resolved alignment issues in geni_serial_data, geni_serial_tx_data and
  geni_serial_rx_data events.
- Link to v2: https://lore.kernel.org/r/20260512-add-tracepoints-for-qcom-geni-serial-v2-0-a5726421b3af@oss.qualcomm.com

Changes in v2:
- removed multiple trace events for TX/RX events, instead used
  DECLARE_EVENT_CLASS and DEFINE_EVENT.
- Link to v1: https://lore.kernel.org/r/20260506-add-tracepoints-for-qcom-geni-serial-v1-0-544b22612e08@oss.qualcomm.com

To: Steven Rostedt <rostedt@goodmis.org>
To: Masami Hiramatsu <mhiramat@kernel.org>
To: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: Jiri Slaby <jirislaby@kernel.org>
To: konrad.dybcio@oss.qualcomm.com
Cc: linux-kernel@vger.kernel.org
Cc: linux-trace-kernel@vger.kernel.org
Cc: linux-arm-msm@vger.kernel.org
Cc: linux-serial@vger.kernel.org
Cc: mukesh.savaliya@oss.qualcomm.com
Cc: aniket.randive@oss.qualcomm.com
Cc: chandana.chiluveru@oss.qualcomm.com

---
Praveen Talari (2):
      serial: qcom-geni: trace: Add tracepoint support for Qualcomm GENI serial
      serial: qcom-geni: Add tracepoints for Qualcomm GENI serial driver

 drivers/tty/serial/qcom_geni_serial.c   |  27 +++++-
 include/trace/events/qcom_geni_serial.h | 164 ++++++++++++++++++++++++++++++++
 2 files changed, 187 insertions(+), 4 deletions(-)
---
base-commit: d387b06f7c15b4639244ad66b4b0900c6a02b430
change-id: 20260427-add-tracepoints-for-qcom-geni-serial-948777218b7b

Best regards,
--  
Praveen Talari <praveen.talari@oss.qualcomm.com>


^ permalink raw reply

* Re: [PATCH 6/9] rv: Ensure synchronous cleanup for HA monitors
From: Wen Yang @ 2026-05-26 17:27 UTC (permalink / raw)
  To: Gabriele Monaco; +Cc: linux-kernel, Steven Rostedt, Nam Cao, linux-trace-kernel
In-Reply-To: <1d7268da9c74d99c84a83d7039e67a9f0da14d98.camel@redhat.com>



On 5/20/26 19:22, Gabriele Monaco wrote:
> On Wed, 2026-05-20 at 00:48 +0800, Wen Yang wrote:
>> The goal is right.  One thing worth double-checking is the load order
>> in the callback against the "SMP BARRIER PAIRING" section of
>> Documentation/memory-barriers.txt, which states:
>>
> 
> Yeah I realised that after I sent my answer.. You might have noticed but I
> proposed a version using acquire/release semantics in [1].
> 
> I'm waiting to send it all in a V2 for the fixes series.
> 
> Are you going to send your patch with tracepoint_synchronize_unregister() in the
> per-task destruction (can be a patch alone)?
> 
> If not I'll do it myself and append that too, I prefer to have everything
> together to avoid conflict resolution issues.
> 

Thanks for the update.

I did notice your acquire/release version — much appreciated.

For the tracepoint_synchronize_unregister() patch, I would prefer that 
you send it yourself, whether as a standalone patch or within your fix 
series. That way we can concentrate our efforts on the tlob monitor 
without worrying about cross-patch conflicts.

So please go ahead and submit it.
Thanks a lot for handling this!


--
Best wishes,
Wen

> 
> [1] -
> https://lore.kernel.org/lkml/02c522f2a09183c9e1a6ff5b0110d0d5cc5e35bd.camel@redhat.com/
> 
>>     [!] Note that the stores before the write barrier would normally be
>>     expected to match the loads after the read barrier or the
>>     address-dependency barrier, and vice versa ...
>>
>> So, we should swap the read order in the callback so that it matches
>> the standard pattern:
>>
>>     void __ha_monitor_timer_callback() {
>>           guard(rcu)();
>>           curr_state = READ_ONCE(ha_mon->da_mon.curr_state);  /* B:
>> before rmb */
>>           smp_rmb();
>>           if (unlikely(!da_monitoring(&ha_mon->da_mon)))       /* A:
>> after rmb */
>>                   return;
>>           /*
>>            * Reached here: monitoring = 1 (old_A).
>>            * Standard wmb/rmb guarantee: curr_state (read before rmb) is also
>>            * old, i.e. not initial_state.
>>            */
>>           ha_react(curr_state, EVENT_NONE, env_string.buffer);
>>           ...
>>     }
>>
>>     void da_monitor_reset() {
>>           da_monitor_reset_hook(da_mon);
>>           WRITE_ONCE(da_mon->monitoring, 0);   /* A: before wmb */
>>           smp_wmb();
>>           WRITE_ONCE(da_mon->curr_state, model_get_initial_state());  /*
>> B: after wmb */
>>     }
>>
>>
>>
>> --
>> Best wishes,
>> Wen
>>
>>>
>>> [1] -
>>> https://lore.kernel.org/lkml/8af5ba4bd93d2acb8a546e8e47ced974a87c1eb8.1778522945.git.wen.yang@linux.dev
>>>
>>>>
>>>>
>>>> --
>>>> Best wishes,
>>>> Wen
>>>>
>>>>
>>>> On 5/12/26 22:02, Gabriele Monaco wrote:
>>>>> HA monitors may start timers, all cleanup functions currently stop the
>>>>> timers asynchronously to avoid sleeping in the wrong context.
>>>>> Nothing makes sure running callbacks terminate on cleanup.
>>>>>
>>>>> Run the entire HA timer callback in an RCU read-side critical section,
>>>>> this way we can simply synchronize_rcu() with any pending timer and are
>>>>> sure any cleanup using kfree_rcu() runs after callbacks terminated.
>>>>> Additionally make sure any unlikely callback running late won't run any
>>>>> code if the monitor is marked as disabled.
>>>>>
>>>>> Fixes: f5587d1b6ec9 ("rv: Add Hybrid Automata monitor type")
>>>>> Fixes: 4a24127bd6cb ("rv: Add support for per-object monitors in DA/HA")
>>>>> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
>>>>> ---
>>>>>     include/rv/da_monitor.h | 23 +++++++++++++++++++----
>>>>>     include/rv/ha_monitor.h | 18 ++++++++++++++++--
>>>>>     2 files changed, 35 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
>>>>> index a4a13b62d1a4..402d3b935c08 100644
>>>>> --- a/include/rv/da_monitor.h
>>>>> +++ b/include/rv/da_monitor.h
>>>>> @@ -57,6 +57,15 @@ static struct rv_monitor rv_this;
>>>>>     #define da_monitor_reset_hook(da_mon)
>>>>>     #endif
>>>>>     
>>>>> +/*
>>>>> + * Hook to allow the implementation of hybrid automata: define it with
>>>>> a
>>>>> + * function that waits for the termination of all monitors background
>>>>> + * activities (e.g. all timers). This hook can sleep.
>>>>> + */
>>>>> +#ifndef da_monitor_sync_hook
>>>>> +#define da_monitor_sync_hook()
>>>>> +#endif
>>>>> +
>>>>>     /*
>>>>>      * Type for the target id, default to int but can be overridden.
>>>>>      * A long type can work as hash table key (PER_OBJ) but will be
>>>>> downgraded
>>>>> to
>>>>> @@ -179,6 +188,7 @@ static inline int da_monitor_init(void)
>>>>>     static inline void da_monitor_destroy(void)
>>>>>     {
>>>>>     	da_monitor_reset_all();
>>>>> +	da_monitor_sync_hook();
>>>>>     }
>>>>>     
>>>>>     #ifndef da_implicit_guard
>>>>> @@ -232,6 +242,7 @@ static inline int da_monitor_init(void)
>>>>>     static inline void da_monitor_destroy(void)
>>>>>     {
>>>>>     	da_monitor_reset_all();
>>>>> +	da_monitor_sync_hook();
>>>>>     }
>>>>>     
>>>>>     #ifndef da_implicit_guard
>>>>> @@ -319,6 +330,7 @@ static inline void da_monitor_destroy(void)
>>>>>     	}
>>>>>     
>>>>>     	da_monitor_reset_all();
>>>>> +	da_monitor_sync_hook();
>>>>>     
>>>>>     	rv_put_task_monitor_slot(task_mon_slot);
>>>>>     	task_mon_slot = RV_PER_TASK_MONITOR_INIT;
>>>>> @@ -497,10 +509,9 @@ static void da_monitor_reset_all(void)
>>>>>     	struct da_monitor_storage *mon_storage;
>>>>>     	int bkt;
>>>>>     
>>>>> -	rcu_read_lock();
>>>>> +	guard(rcu)();
>>>>>     	hash_for_each_rcu(da_monitor_ht, bkt, mon_storage, node)
>>>>>     		da_monitor_reset(&mon_storage->rv.da_mon);
>>>>> -	rcu_read_unlock();
>>>>>     }
>>>>>     
>>>>>     static inline int da_monitor_init(void)
>>>>> @@ -516,13 +527,17 @@ static inline void da_monitor_destroy(void)
>>>>>     	int bkt;
>>>>>     
>>>>>     	tracepoint_synchronize_unregister();
>>>>> +	scoped_guard(rcu) {
>>>>> +		hash_for_each_rcu(da_monitor_ht, bkt, mon_storage,
>>>>> node) {
>>>>> +			da_monitor_reset_hook(&mon_storage->rv.da_mon);
>>>>> +		}
>>>>> +	}
>>>>> +	da_monitor_sync_hook();
>>>>>     	/*
>>>>>     	 * This function is called after all probes are disabled and no
>>>>> longer
>>>>>     	 * pending, we can safely assume no concurrent user.
>>>>>     	 */
>>>>> -	synchronize_rcu();
>>>>>     	hash_for_each_safe(da_monitor_ht, bkt, tmp, mon_storage, node)
>>>>> {
>>>>> -		da_monitor_reset_hook(&mon_storage->rv.da_mon);
>>>>>     		hash_del_rcu(&mon_storage->node);
>>>>>     		kfree(mon_storage);
>>>>>     	}
>>>>> diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
>>>>> index d59507e8cb30..47ff1a41febe 100644
>>>>> --- a/include/rv/ha_monitor.h
>>>>> +++ b/include/rv/ha_monitor.h
>>>>> @@ -36,6 +36,7 @@ static bool ha_monitor_handle_constraint(struct
>>>>> da_monitor
>>>>> *da_mon,
>>>>>     #define da_monitor_event_hook ha_monitor_handle_constraint
>>>>>     #define da_monitor_init_hook ha_monitor_init_env
>>>>>     #define da_monitor_reset_hook ha_monitor_reset_env
>>>>> +#define da_monitor_sync_hook() synchronize_rcu()
>>>>>     
>>>>>     #include <rv/da_monitor.h>
>>>>>     #include <linux/seq_buf.h>
>>>>> @@ -237,12 +238,25 @@ static bool ha_monitor_handle_constraint(struct
>>>>> da_monitor *da_mon,
>>>>>     	return false;
>>>>>     }
>>>>>     
>>>>> +/*
>>>>> + * __ha_monitor_timer_callback - generic callback representation
>>>>> + *
>>>>> + * This callback runs in an RCU read-side critical section to allow the
>>>>> + * destruction sequence to easily synchronize_rcu() with all pending
>>>>> timer
>>>>> + * after asynchronously disabling them.
>>>>> + */
>>>>>     static inline void __ha_monitor_timer_callback(struct ha_monitor
>>>>> *ha_mon)
>>>>>     {
>>>>> -	enum states curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
>>>>>     	DECLARE_SEQ_BUF(env_string, ENV_BUFFER_SIZE);
>>>>> -	u64 time_ns = ha_get_ns();
>>>>> +	enum states curr_state;
>>>>> +	u64 time_ns;
>>>>> +
>>>>> +	if (unlikely(!da_monitor_handling_event(&ha_mon->da_mon)))
>>>>> +		return;
>>>>>     
>>>>> +	guard(rcu)();
>>>>> +	curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
>>>>> +	time_ns = ha_get_ns();
>>>>>     	ha_get_env_string(&env_string, ha_mon, time_ns);
>>>>>     	ha_react(curr_state, EVENT_NONE, env_string.buffer);
>>>>>     	ha_trace_error_env(ha_mon, model_get_state_name(curr_state),
>>>
> 

^ permalink raw reply

* Re: [PATCH] tracing: Do not call map->ops->elt_free() if elt_alloc() is not succeeded
From: Tom Zanussi @ 2026-05-26 16:48 UTC (permalink / raw)
  To: Masami Hiramatsu (Google), Steven Rostedt, Tom Zanussi
  Cc: linux-trace-kernel, linux-kernel, Mathieu Desnoyers, Rosen Penev
In-Reply-To: <177933895460.108746.5396070821443932634.stgit@devnote2>

Hi Masami,

On Thu, 2026-05-21 at 13:49 +0900, Masami Hiramatsu (Google) wrote:
> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> In paths where tracing_map_elt_alloc() failed to allocate objects,
> the map->ops->elt_alloc() call was never successful. In this case,
> map->ops->elt_free() should not be called.
> 
> This bug was found by Sashiko.
> Link: https://sashiko.dev/#/patchset/20260520223101.34710-1-rosenp%40gmail.com
> 
> Fixes: 2734b629525a ("tracing: Add per-element variable support to tracing_map")
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> Cc: stable@vger.kernel.org
> ---
>  kernel/trace/tracing_map.c |   17 +++++++++++++----
>  1 file changed, 13 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
> index bf1a507695b6..0dd7927df22a 100644
> --- a/kernel/trace/tracing_map.c
> +++ b/kernel/trace/tracing_map.c
> @@ -386,13 +386,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt)
>  	}
>  }
>  
> -static void tracing_map_elt_free(struct tracing_map_elt *elt)
> +static void __tracing_map_elt_free(struct tracing_map_elt *elt)
>  {
>  	if (!elt)
>  		return;
>  
> -	if (elt->map->ops && elt->map->ops->elt_free)
> -		elt->map->ops->elt_free(elt);
>  	kfree(elt->fields);
>  	kfree(elt->vars);
>  	kfree(elt->var_set);
> @@ -400,6 +398,17 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
>  	kfree(elt);
>  }
>  
> +static void tracing_map_elt_free(struct tracing_map_elt *elt)
> +{
> +	if (!elt)
> +		return;
> +
> +	/* Only objects initialized with alloc_elt() should be passed to free_elt().*/
> +	if (elt->map->ops && elt->map->ops->elt_free)
> +		elt->map->ops->elt_free(elt);
> +	__tracing_map_elt_free(elt);
> +}
> +
>  static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
>  {
>  	struct tracing_map_elt *elt;
> @@ -444,7 +453,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
>  	}
>  	return elt;
>   free:
> -	tracing_map_elt_free(elt);
> +	__tracing_map_elt_free(elt);
>  
>  	return ERR_PTR(err);
>  }
> 
> 

Looks good to me. Thanks for fixing it!

Reviewed-by: Tom Zanussi <zanussi@kernel.org>


^ permalink raw reply

* Re: [PATCH v9] blk-mq: add tracepoint block_rq_tag_wait
From: Jens Axboe @ 2026-05-26 16:37 UTC (permalink / raw)
  To: rostedt, mhiramat, mathieu.desnoyers, Aaron Tomlin
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	john.g.garry, loberman, neelx, sean, mproche, chjohnst,
	linux-block, linux-kernel, linux-trace-kernel
In-Reply-To: <20260525005123.722277-1-atomlin@atomlin.com>


On Sun, 24 May 2026 20:51:23 -0400, Aaron Tomlin wrote:
> In high-performance storage environments, particularly when utilising
> RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
> latency spikes can occur when fast devices (SSDs) are starved of hardware
> tags when sharing the same blk_mq_tag_set.
> 
> Currently, diagnosing this specific hardware queue contention is
> difficult. When a CPU thread exhausts the tag pool, blk_mq_get_tag()
> forces the current thread to block uninterruptible via io_schedule().
> While this can be inferred via sched:sched_switch or dynamically
> traced by attaching a kprobe to blk_mq_mark_tag_wait(), there is no
> dedicated, out-of-the-box observability for this event.
> 
> [...]

Applied, thanks!

[1/1] blk-mq: add tracepoint block_rq_tag_wait
      commit: 9ece10778f8931630f86e802f94dc71115de0c8c

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH v2] perf/ftrace: Fix WARNING in __unregister_ftrace_function
From: Steven Rostedt @ 2026-05-26 15:38 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Rik van Riel, Mathieu Desnoyers, linux-kernel, linux-trace-kernel,
	kernel-team, sashiko-bot, sashiko-reviews
In-Reply-To: <20260525143947.ea1c0a7b29146d22faa5feda@kernel.org>

On Mon, 25 May 2026 14:39:47 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> > The error handling there all seems to "goto err_destroy"
> > 
> > err_destroy:
> >         if (event->destroy) {
> >                 event->destroy(event);
> >                 event->destroy = NULL;
> >         }
> > 
> >   
> > > > However, it does not set event->destroy to NULL.  
> > 
> > ... but it does?
> > 
> > I am not sure what code Sashiko is looking at,
> > but it does not look like the code I just pulled.  
> 
> Indeed.
> 
> > 
> > Is there a different tree I should be looking at
> > than upstream Linus?  
> 
> You can see the baseline info if you expand the collapsed triangle.
> Anyway, it said:
> 
> linux-trace/HEAD (70575e77839f4c5337ce2653b39b86bb365a870e)
> 
> So that is linux-trace/master.
> 
> commit 70575e77839f4c5337ce2653b39b86bb365a870e (linux-trace/master)
> Merge: 7bc6e90d7aa4 a43ae8057cc1
> Author: Linus Torvalds <torvalds@linux-foundation.org>
> Date:   Fri Sep 30 09:41:34 2022 -0700
> 
>     Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
> 
> 
> Hmm, this is too old. And linux-trace/master is not used anymore.


Hmm, I probably should delete that branch.

Thanks,

-- Steve

^ permalink raw reply

* Re: [PATCH v6] tracing/eprobes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-26 15:33 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: LKML, Linux trace kernel, Mathieu Desnoyers, Mark Rutland,
	Peter Zijlstra, Namhyung Kim, Takaya Saeki, Douglas Raillard,
	Tom Zanussi, Andrew Morton, Thomas Gleixner, Ian Rogers,
	Jiri Olsa
In-Reply-To: <20260526090910.8aae5c9357e119bd0043b18b@kernel.org>

On Tue, 26 May 2026 09:09:10 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Thu, 21 May 2026 22:50:33 -0400
> Steven Rostedt <rostedt@kernel.org> wrote:
> 
> > @@ -640,7 +673,7 @@ static int parse_btf_arg(char *varname,
> >  	int i, is_ptr, ret;
> >  	u32 tid;
> >  
> > -	if (WARN_ON_ONCE(!ctx->funcname))
> > +	if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_TEVENT)))
> >  		return -EINVAL;
> >  
> >  	is_ptr = split_next_field(varname, &field, ctx);
> > @@ -653,6 +686,20 @@ static int parse_btf_arg(char *varname,
> >  		return -EOPNOTSUPP;
> >  	}
> >  
> > +	if (ctx->flags & TPARG_FL_TEVENT) {
> > +		int ret;  
> 
> nit: parse_btf_arg already declared @ret. So we don't need this.

Ah, will fix.

Thanks,

-- Steve


^ permalink raw reply

* Re: [PATCH v6] tracing/eprobes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-26 15:33 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: LKML, Linux trace kernel, Mathieu Desnoyers, Mark Rutland,
	Peter Zijlstra, Namhyung Kim, Takaya Saeki, Douglas Raillard,
	Tom Zanussi, Andrew Morton, Thomas Gleixner, Ian Rogers,
	Jiri Olsa
In-Reply-To: <20260525235507.a81c565023258c63fc9201f4@kernel.org>

On Mon, 25 May 2026 23:55:07 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Thu, 21 May 2026 22:50:33 -0400
> Steven Rostedt <rostedt@kernel.org> wrote:
> 
> > +static int handle_typecast(char *arg, struct fetch_insn **pcode,
> > +			   struct fetch_insn *end,
> > +			   struct traceprobe_parse_context *ctx)
> > +{
> > +	char *tmp;
> > +	int ret;
> > +
> > +	/* Currently this only works for eprobes */
> > +	if (!(ctx->flags & TPARG_FL_TEVENT)) {
> > +		trace_probe_log_err(ctx->offset, TYPECAST_NOT_EVENT);
> > +		return -EINVAL;
> > +	}
> > +
> > +	tmp = strchr(arg, ')');
> > +	if (!tmp) {
> > +		trace_probe_log_err(ctx->offset + strlen(arg),
> > +				    DEREF_OPEN_BRACE);
> > +		return -EINVAL;
> > +	}
> > +	*tmp = '\0';
> > +	ret = query_btf_struct(arg + 1, ctx);
> > +	*tmp = ')';  
> 
> BTW, is there any reason to recover this? The @arg is copied
> string, see traceprobe_parse_probe_arg_body().

Yeah I know. But it's just something I prefer to do to keep code more
robust. That is, don't leave side effects if you can help it. It likely
doesn't matter here, but I've always erred on the side of caution. ;-)

-- Steve


^ permalink raw reply

* Re: [PATCH RFC 3/3] mm: use non-temporal stores for demotion
From: Gregory Price @ 2026-05-26 15:25 UTC (permalink / raw)
  To: Yiannis Nikolakopoulos
  Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Trond Myklebust, Anna Schumaker,
	Zi Yan, Matthew Brost, Joshua Hahn, Rakie Kim, Byungchul Park,
	Ying Huang, Alistair Popple, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Brendan Jackman, Johannes Weiner,
	David Rientjes, Davidlohr Bueso, Fan Ni, Frank van der Linden,
	Jonathan Cameron, Raghavendra K T, Rao, Bharata Bhasker,
	SeongJae Park, Wei Xu, Xuezheng Chu, Yiannis Nikolakopoulos,
	dimitrios, Ryan Roberts, linux-kernel, linux-mm, linux-nfs,
	linux-trace-kernel, Alirad Malek
In-Reply-To: <20260526-rfc-nt-demote-v1-3-eb9c9422daef@zptcorp.com>

On Tue, May 26, 2026 at 01:37:04PM +0200, Yiannis Nikolakopoulos wrote:
> From: Alirad Malek <alirad.malek@zptcorp.com>
> 
> Memory demoted to a lower tier is assumed to be cold and most likely out of
> the CPU's last level cache. Additionally, in certain demotion targets (e.g.
> CXL devices with compressed memory) the bandwidth can be negatively
> impacted by the eviction patterns of the last level cache when standard
> memcpy is used. When the feature is enabled, use the
> MIGRATE_ASYNC_NON_TEMPORAL_STORES flag in demotions to trigger the folio
> copy path using non-temporal stores.
> 
> Signed-off-by: Alirad Malek <alirad.malek@zptcorp.com>
> Co-developed-by: Yiannis Nikolakopoulos <yiannis.nikolakop@gmail.com>
> Signed-off-by: Yiannis Nikolakopoulos <yiannis.nikolakop@gmail.com>
> ---
>  mm/Kconfig   | 8 ++++++++
>  mm/migrate.c | 9 ++++++++-
>  2 files changed, 16 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/Kconfig b/mm/Kconfig
> index ebd8ea353687..4b7a75b57f6e 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -645,6 +645,14 @@ config MIGRATION
>  	  pages as migration can relocate pages to satisfy a huge page
>  	  allocation instead of reclaiming.
>  
> +config DEMOTION_WITH_NON_TEMPORAL_STORES
> +	bool "Use non-temporal stores for demotion"
> +	default n
> +	depends on MIGRATION
> +	help
> +	  Enable non-temporal stores when migrating pages due to demotion.
> +	  If disabled, demotion uses regular migration copy paths.
> +

Do we actually need this config flag or should we just default to this
(if the arch supports NT stores)?

>  config DEVICE_MIGRATION
>  	def_bool MIGRATION && ZONE_DEVICE
>  
> diff --git a/mm/migrate.c b/mm/migrate.c
> index ff6cf50e7b0b..368d40dc8772 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -862,7 +862,10 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
>  	if (folio_ref_count(src) != expected_count)
>  		return -EAGAIN;
>  
> -	rc = folio_mc_copy(dst, src);
> +	if (mode == MIGRATE_ASYNC_NON_TEMPORAL_STORES)
> +		rc = folio_mc_copy_nt(dst, src);
> +	else
> +		rc = folio_mc_copy(dst, src);
>  	if (unlikely(rc))
>  		return rc;
>  
> @@ -2081,6 +2084,10 @@ int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
>  	LIST_HEAD(split_folios);
>  	struct migrate_pages_stats stats;
>  
> +	if (IS_ENABLED(CONFIG_DEMOTION_WITH_NON_TEMPORAL_STORES) &&
> +		reason == MR_DEMOTION && mode == MIGRATE_ASYNC)
> +		mode = MIGRATE_ASYNC_NON_TEMPORAL_STORES;
> +
>  	trace_mm_migrate_pages_start(mode, reason);
>  
>  	memset(&stats, 0, sizeof(stats));
> 
> -- 
> 2.43.0
> 

^ permalink raw reply

* [PATCH 1/5] x86/xen: Drop lazy mode from trace entries
From: Juergen Gross @ 2026-05-26 15:05 UTC (permalink / raw)
  To: linux-kernel, x86, linux-trace-kernel
  Cc: Juergen Gross, Boris Ostrovsky, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, xen-devel
In-Reply-To: <20260526150514.129330-1-jgross@suse.com>

Drop the lazy mode (cpu or mmu) from the xen_mc_batch and xen_mc_issue
trace entries.

This is done in preparation for removing the xen_lazy_mode percpu
variable.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/xen/xen-ops.h     | 11 +++++++----
 include/trace/events/xen.h | 33 +++++++++++++++++++--------------
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 6808010ac379..dc892f421f25 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -98,7 +98,7 @@ static inline void xen_mc_batch(void)
 
 	/* need to disable interrupts until this entry is complete */
 	local_irq_save(flags);
-	trace_xen_mc_batch(xen_get_lazy_mode());
+	trace_xen_mc_batch(flags);
 	__this_cpu_write(xen_mc_irq_flags, flags);
 }
 
@@ -114,13 +114,16 @@ void xen_mc_flush(void);
 /* Issue a multicall if we're not in a lazy mode */
 static inline void xen_mc_issue(unsigned mode)
 {
-	trace_xen_mc_issue(mode);
+	bool flush = !(xen_get_lazy_mode() & mode);
+	unsigned long flags = this_cpu_read(xen_mc_irq_flags);
 
-	if ((xen_get_lazy_mode() & mode) == 0)
+	trace_xen_mc_issue(flush, flags);
+
+	if (flush)
 		xen_mc_flush();
 
 	/* restore flags saved in xen_mc_batch */
-	local_irq_restore(this_cpu_read(xen_mc_irq_flags));
+	local_irq_restore(flags);
 }
 
 /* Set up a callback to be called when the current batch is flushed */
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
index e3f139f0bc78..ad384969e2cb 100644
--- a/include/trace/events/xen.h
+++ b/include/trace/events/xen.h
@@ -12,24 +12,29 @@
 struct multicall_entry;
 
 /* Multicalls */
-DECLARE_EVENT_CLASS(xen_mc__batch,
-	    TP_PROTO(enum xen_lazy_mode mode),
-	    TP_ARGS(mode),
+TRACE_EVENT(xen_mc_batch,
+	    TP_PROTO(unsigned long flags),
+	    TP_ARGS(flags),
 	    TP_STRUCT__entry(
-		    __field(enum xen_lazy_mode, mode)
+		    __field(unsigned long, flags)
 		    ),
-	    TP_fast_assign(__entry->mode = mode),
-	    TP_printk("start batch LAZY_%s",
-		      (__entry->mode == XEN_LAZY_MMU) ? "MMU" :
-		      (__entry->mode == XEN_LAZY_CPU) ? "CPU" : "NONE")
+	    TP_fast_assign(__entry->flags = flags),
+	    TP_printk("start batch lazy flags %lx", __entry->flags)
 	);
-#define DEFINE_XEN_MC_BATCH(name)			\
-	DEFINE_EVENT(xen_mc__batch, name,		\
-		TP_PROTO(enum xen_lazy_mode mode),	\
-		     TP_ARGS(mode))
 
-DEFINE_XEN_MC_BATCH(xen_mc_batch);
-DEFINE_XEN_MC_BATCH(xen_mc_issue);
+TRACE_EVENT(xen_mc_issue,
+	    TP_PROTO(bool flush, unsigned long flags),
+	    TP_ARGS(flush, flags),
+	    TP_STRUCT__entry(
+		    __field(unsigned long, flags)
+		    __field(bool, flush)
+		    ),
+	    TP_fast_assign(__entry->flush = flush;
+			   __entry->flags = flags;
+		    ),
+	    TP_printk("flush: %s, flags %lx",
+		      __entry->flush ? "yes" : "no", __entry->flags)
+	);
 
 TRACE_DEFINE_SIZEOF(ulong);
 
-- 
2.54.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox