Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCHv2 00/11] uprobes/x86: Fix red zone issue for optimized uprobes
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel

hi,
Andrii reported an issue with optimized uprobes [1] that can clobber
redzone area with call instruction storing return address on stack
where user code may keep temporary data without adjusting rsp.

Fixing this by moving the optimized uprobes on top of 10-bytes nop
instruction, so we can squeeze another instruction to escape the
redzone area before doing the call.

Note we need upstream update first for patch 3 (github.com/libbpf/usdt),
if we decide to take this change.

thanks,
jirka

v1: https://lore.kernel.org/bpf/20260514135342.22130-1-jolsa@kernel.org/

v2 changes:
- several selftest fixes [sashiko]
- consolidate is_lea_insn and is_call_insn insto single check [Jakub Sitnicki]
- use proper mm_struct object in __in_uprobe_trampoline check [sashiko]
- allow to copy uprobe trampolines vma objects on fork [sashiko]
- change uprobe syscall detection error from -ENXIO to -EPROTO [Andrii]
- added fork/clone tests
- I kept the selftest changes and nop5->nop10 changes in separate
  commits for easier review, we can squash them later if we want to keep
  bisect working properly


[1] https://lore.kernel.org/bpf/20260509003146.976844-1-andrii@kernel.org/
---
Andrii Nakryiko (1):
      selftests/bpf: Add tests for uprobe nop10 red zone clobbering

Jiri Olsa (10):
      uprobes/x86: Use proper mm_struct in __in_uprobe_trampoline
      uprobes/x86: Allow to copy uprobe trampolines on fork
      uprobes/x86: Move optimized uprobe from nop5 to nop10
      libbpf: Change has_nop_combo to work on top of nop10
      libbpf: Detect uprobe syscall with new error
      selftests/bpf: Emit nop,nop10 instructions combo for x86_64 arch
      selftests/bpf: Change uprobe syscall tests to use nop10
      selftests/bpf: Change uprobe/usdt trigger bench code to use nop10
      selftests/bpf: Add reattach tests for uprobe syscall
      selftests/bpf: Add tests for forked/cloned optimized uprobes

 arch/x86/kernel/uprobes.c                               | 144 ++++++++++++++++++++++------------
 tools/lib/bpf/features.c                                |   4 +-
 tools/lib/bpf/usdt.c                                    |  16 ++--
 tools/testing/selftests/bpf/bench.c                     |  20 ++---
 tools/testing/selftests/bpf/benchs/bench_trigger.c      |  38 ++++-----
 tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh |   2 +-
 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 tools/testing/selftests/bpf/prog_tests/usdt.c           |  74 +++++++++++++++---
 tools/testing/selftests/bpf/progs/test_usdt.c           |  25 ++++++
 tools/testing/selftests/bpf/usdt.h                      |   2 +-
 tools/testing/selftests/bpf/usdt_2.c                    |  15 +++-
 11 files changed, 524 insertions(+), 123 deletions(-)

^ permalink raw reply

* [PATCHv2 01/11] uprobes/x86: Use proper mm_struct in __in_uprobe_trampoline
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

In the unregister path we use __in_uprobe_trampoline check with
current->mm for the VMA lookup, which is wrong, because we are
in the tracer context, not the traced process.

Add mm_struct pointer argument to __in_uprobe_trampoline and
changing related callers to pass proper mm_struct pointer.

Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/kernel/uprobes.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index ebb1baf1eb1d..2be6707e3320 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -761,9 +761,9 @@ void arch_uprobe_clear_state(struct mm_struct *mm)
 		destroy_uprobe_trampoline(tramp);
 }
 
-static bool __in_uprobe_trampoline(unsigned long ip)
+static bool __in_uprobe_trampoline(struct mm_struct *mm, unsigned long ip)
 {
-	struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+	struct vm_area_struct *vma = vma_lookup(mm, ip);
 
 	return vma && vma_is_special_mapping(vma, &tramp_mapping);
 }
@@ -776,14 +776,14 @@ static bool in_uprobe_trampoline(unsigned long ip)
 
 	rcu_read_lock();
 	if (mmap_lock_speculate_try_begin(mm, &seq)) {
-		found = __in_uprobe_trampoline(ip);
+		found = __in_uprobe_trampoline(mm, ip);
 		retry = mmap_lock_speculate_retry(mm, seq);
 	}
 	rcu_read_unlock();
 
 	if (retry) {
 		mmap_read_lock(mm);
-		found = __in_uprobe_trampoline(ip);
+		found = __in_uprobe_trampoline(mm, ip);
 		mmap_read_unlock(mm);
 	}
 	return found;
@@ -1044,7 +1044,7 @@ static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst,
 	return 0;
 }
 
-static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+static bool __is_optimized(struct mm_struct *mm, uprobe_opcode_t *insn, unsigned long vaddr)
 {
 	struct __packed __arch_relative_insn {
 		u8 op;
@@ -1053,7 +1053,7 @@ static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
 
 	if (!is_call_insn(insn))
 		return false;
-	return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+	return __in_uprobe_trampoline(mm, vaddr + 5 + call->raddr);
 }
 
 static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
@@ -1064,7 +1064,7 @@ static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
 	err = copy_from_vaddr(mm, vaddr, &insn, 5);
 	if (err)
 		return err;
-	return __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+	return __is_optimized(mm, (uprobe_opcode_t *)&insn, vaddr);
 }
 
 static bool should_optimize(struct arch_uprobe *auprobe)
-- 
2.53.0


^ permalink raw reply related

* [PATCHv2 02/11] uprobes/x86: Allow to copy uprobe trampolines on fork
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

When we do fork or clone without CLONE_VM the new process won't
have uprobe trampoline vma objects and at the same time it will
have optimized code calling the trampolines.

Fixing this by allowing vma uprobe trampoline objects to be copied
on fork to the new process.

Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/kernel/uprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 2be6707e3320..37faf038be33 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -702,7 +702,7 @@ static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
 
 	tramp->vaddr = vaddr;
 	vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
-				VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+				VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_IO,
 				&tramp_mapping);
 	if (IS_ERR(vma)) {
 		kfree(tramp);
-- 
2.53.0


^ permalink raw reply related

* [PATCHv2 03/11] uprobes/x86: Move optimized uprobe from nop5 to nop10
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

Andrii reported an issue with optimized uprobes [1] that can clobber
redzone area with call instruction storing return address on stack
where user code may keep temporary data without adjusting rsp.

Fixing this by moving the optimized uprobes on top of 10-bytes nop
instruction, so we can squeeze another instruction to escape the
redzone area before doing the call, like:

  lea -0x80(%rsp), %rsp
  call tramp

Note the lea instruction is used to adjust the rsp register without
changing the flags.

The unoptimize path is bit tricky, because we can't change back to nop10
instruction, because we could have some thread already inside lea instruction.
Instead we change it to 'jmp rel8' jump instruction to end of the 10-byte
slot. The `jmp rel8' is also added as another instruction that allows
optimized uprobe in can_optimize function.

The optimized uprobe performance stays the same:

        uprobe-nop     :    3.129 ± 0.013M/s
        uprobe-push    :    3.045 ± 0.006M/s
        uprobe-ret     :    1.095 ± 0.004M/s
  -->   uprobe-nop10   :    7.170 ± 0.020M/s
        uretprobe-nop  :    2.143 ± 0.021M/s
        uretprobe-push :    2.090 ± 0.000M/s
        uretprobe-ret  :    0.942 ± 0.000M/s
  -->   uretprobe-nop10:    3.381 ± 0.003M/s
        usdt-nop       :    3.245 ± 0.004M/s
  -->   usdt-nop10     :    7.256 ± 0.023M/s

[1] https://lore.kernel.org/bpf/20260509003146.976844-1-andrii@kernel.org/
Reported-by: Andrii Nakryiko <andrii@kernel.org>
Closes: https://lore.kernel.org/bpf/20260509003146.976844-1-andrii@kernel.org/
Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/kernel/uprobes.c | 130 ++++++++++++++++++++++++++------------
 1 file changed, 89 insertions(+), 41 deletions(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 37faf038be33..e0067d1b6242 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -636,9 +636,26 @@ struct uprobe_trampoline {
 	unsigned long		vaddr;
 };
 
+#define LEA_INSN_SIZE		5
+#define OPT_INSN_SIZE		(LEA_INSN_SIZE + CALL_INSN_SIZE)
+#define OPT_JMP8_OFFSET		(OPT_INSN_SIZE - JMP8_INSN_SIZE)
+#define REDZONE_SIZE		0x80
+
+static const u8 lea_rsp[] = { 0x48, 0x8d, 0x64, 0x24, 0x80 };
+
+static bool is_opt_insns(const uprobe_opcode_t *insn)
+{
+	static const u8 opt_insns[] = {
+		0x48, 0x8d, 0x64, 0x24, REDZONE_SIZE, /* lea -0x80(%rsp), %rsp */
+		CALL_INSN_OPCODE
+	};
+
+	return !memcmp(insn, opt_insns, ARRAY_SIZE(opt_insns));
+}
+
 static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
 {
-	long delta = (long)(vaddr + 5 - vtramp);
+	long delta = (long)(vaddr + OPT_INSN_SIZE - vtramp);
 
 	return delta >= INT_MIN && delta <= INT_MAX;
 }
@@ -651,7 +668,7 @@ static unsigned long find_nearest_trampoline(unsigned long vaddr)
 	};
 	unsigned long low_limit, high_limit;
 	unsigned long low_tramp, high_tramp;
-	unsigned long call_end = vaddr + 5;
+	unsigned long call_end = vaddr + OPT_INSN_SIZE;
 
 	if (check_add_overflow(call_end, INT_MIN, &low_limit))
 		low_limit = PAGE_SIZE;
@@ -810,7 +827,7 @@ SYSCALL_DEFINE0(uprobe)
 
 	/* Allow execution only from uprobe trampolines. */
 	if (!in_uprobe_trampoline(regs->ip))
-		return -ENXIO;
+		return -EPROTO;
 
 	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
 	if (err)
@@ -826,8 +843,8 @@ SYSCALL_DEFINE0(uprobe)
 	regs->ax  = args.ax;
 	regs->r11 = args.r11;
 	regs->cx  = args.cx;
-	regs->ip  = args.retaddr - 5;
-	regs->sp += sizeof(args);
+	regs->ip  = args.retaddr - OPT_INSN_SIZE;
+	regs->sp += sizeof(args) + REDZONE_SIZE;
 	regs->orig_ax = -1;
 
 	sp = regs->sp;
@@ -844,12 +861,12 @@ SYSCALL_DEFINE0(uprobe)
 	 */
 	if (regs->sp != sp) {
 		/* skip the trampoline call */
-		if (args.retaddr - 5 == regs->ip)
-			regs->ip += 5;
+		if (args.retaddr - OPT_INSN_SIZE == regs->ip)
+			regs->ip += OPT_INSN_SIZE;
 		return regs->ax;
 	}
 
-	regs->sp -= sizeof(args);
+	regs->sp -= sizeof(args) + REDZONE_SIZE;
 
 	/* for the case uprobe_consumer has changed ax/r11/cx */
 	args.ax  = regs->ax;
@@ -857,7 +874,7 @@ SYSCALL_DEFINE0(uprobe)
 	args.cx  = regs->cx;
 
 	/* keep return address unless we are instructed otherwise */
-	if (args.retaddr - 5 != regs->ip)
+	if (args.retaddr - OPT_INSN_SIZE != regs->ip)
 		args.retaddr = regs->ip;
 
 	if (shstk_push(args.retaddr) == -EFAULT)
@@ -891,7 +908,7 @@ asm (
 	"pop %rax\n"
 	"pop %r11\n"
 	"pop %rcx\n"
-	"ret\n"
+	"ret $" __stringify(REDZONE_SIZE) "\n"
 	"int3\n"
 	".balign " __stringify(PAGE_SIZE) "\n"
 	".popsection\n"
@@ -909,7 +926,7 @@ late_initcall(arch_uprobes_init);
 
 enum {
 	EXPECT_SWBP,
-	EXPECT_CALL,
+	EXPECT_OPTIMIZED,
 };
 
 struct write_opcode_ctx {
@@ -917,11 +934,6 @@ struct write_opcode_ctx {
 	int expect;
 };
 
-static int is_call_insn(uprobe_opcode_t *insn)
-{
-	return *insn == CALL_INSN_OPCODE;
-}
-
 /*
  * Verification callback used by int3_update uprobe_write calls to make sure
  * the underlying instruction is as expected - either int3 or call.
@@ -930,17 +942,17 @@ static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *
 		       int nbytes, void *data)
 {
 	struct write_opcode_ctx *ctx = data;
-	uprobe_opcode_t old_opcode[5];
+	uprobe_opcode_t old_opcode[OPT_INSN_SIZE];
 
-	uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+	uprobe_copy_from_page(page, ctx->base, old_opcode, OPT_INSN_SIZE);
 
 	switch (ctx->expect) {
 	case EXPECT_SWBP:
 		if (is_swbp_insn(&old_opcode[0]))
 			return 1;
 		break;
-	case EXPECT_CALL:
-		if (is_call_insn(&old_opcode[0]))
+	case EXPECT_OPTIMIZED:
+		if (is_opt_insns(&old_opcode[0]))
 			return 1;
 		break;
 	}
@@ -963,7 +975,7 @@ static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *
  *   - SMP sync all CPUs
  */
 static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
-		       unsigned long vaddr, char *insn, bool optimize)
+		       unsigned long vaddr, char *insn, int size, bool optimize)
 {
 	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
 	struct write_opcode_ctx ctx = {
@@ -978,7 +990,7 @@ static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 	 * so we can skip this step for optimize == true.
 	 */
 	if (!optimize) {
-		ctx.expect = EXPECT_CALL;
+		ctx.expect = EXPECT_OPTIMIZED;
 		err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
 				   true /* is_register */, false /* do_update_ref_ctr */,
 				   &ctx);
@@ -990,7 +1002,7 @@ static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 
 	/* Write all but the first byte of the patched range. */
 	ctx.expect = EXPECT_SWBP;
-	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, size - 1, verify_insn,
 			   true /* is_register */, false /* do_update_ref_ctr */,
 			   &ctx);
 	if (err)
@@ -1017,17 +1029,32 @@ static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 			 unsigned long vaddr, unsigned long tramp)
 {
-	u8 call[5];
+	u8 insn[OPT_INSN_SIZE], *call = &insn[LEA_INSN_SIZE];
 
-	__text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+	/*
+	 * We have nop10 instruction (with first byte overwritten to int3),
+	 * changing it to:
+	 *   lea -0x80(%rsp), %rsp
+	 *   call tramp
+	 */
+	memcpy(insn, lea_rsp, LEA_INSN_SIZE);
+	__text_gen_insn(call, CALL_INSN_OPCODE,
+			(const void *) (vaddr + LEA_INSN_SIZE),
 			(const void *) tramp, CALL_INSN_SIZE);
-	return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+	return int3_update(auprobe, vma, vaddr, insn, OPT_INSN_SIZE, true /* optimize */);
 }
 
 static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 			   unsigned long vaddr)
 {
-	return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+	/*
+	 * We have optimized nop10 (lea, call), changing it to 'jmp rel8' to
+	 * end of the 10-byte slot instead of restoring the original nop10,
+	 * because we could have thread already inside lea instruction.
+	 */
+	u8 jmp[OPT_INSN_SIZE] = { JMP8_INSN_OPCODE, OPT_JMP8_OFFSET };
+
+	return int3_update(auprobe, vma, vaddr, jmp, JMP8_INSN_SIZE, false /* optimize */);
 }
 
 static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
@@ -1049,19 +1076,19 @@ static bool __is_optimized(struct mm_struct *mm, uprobe_opcode_t *insn, unsigned
 	struct __packed __arch_relative_insn {
 		u8 op;
 		s32 raddr;
-	} *call = (struct __arch_relative_insn *) insn;
+	} *call = (struct __arch_relative_insn *)(insn + LEA_INSN_SIZE);
 
-	if (!is_call_insn(insn))
+	if (!is_opt_insns(insn))
 		return false;
-	return __in_uprobe_trampoline(mm, vaddr + 5 + call->raddr);
+	return __in_uprobe_trampoline(mm, vaddr + OPT_INSN_SIZE + call->raddr);
 }
 
 static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
 {
-	uprobe_opcode_t insn[5];
+	uprobe_opcode_t insn[OPT_INSN_SIZE];
 	int err;
 
-	err = copy_from_vaddr(mm, vaddr, &insn, 5);
+	err = copy_from_vaddr(mm, vaddr, &insn, OPT_INSN_SIZE);
 	if (err)
 		return err;
 	return __is_optimized(mm, (uprobe_opcode_t *)&insn, vaddr);
@@ -1095,14 +1122,25 @@ int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		  unsigned long vaddr)
 {
 	if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
-		int ret = is_optimized(vma->vm_mm, vaddr);
-		if (ret < 0)
+		uprobe_opcode_t insn[OPT_INSN_SIZE];
+		int ret;
+
+		ret = copy_from_vaddr(vma->vm_mm, vaddr, &insn, OPT_INSN_SIZE);
+		if (ret)
 			return ret;
-		if (ret) {
+		if (__is_optimized(vma->vm_mm, (uprobe_opcode_t *)&insn, vaddr)) {
 			ret = swbp_unoptimize(auprobe, vma, vaddr);
 			WARN_ON_ONCE(ret);
 			return ret;
 		}
+		/*
+		 * We can have re-attached probe on top of jmp8 instruction,
+		 * which did not get optimized. We need to restore the jmp8
+		 * instruction, instead of the original instruction (nop10).
+		 */
+		if (is_swbp_insn(&insn[0]) && insn[1] == OPT_JMP8_OFFSET)
+			return uprobe_write_opcode(auprobe, vma, vaddr, JMP8_INSN_OPCODE,
+						   false /* is_register */);
 	}
 	return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
 				   false /* is_register */);
@@ -1131,7 +1169,7 @@ static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct
 void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
 {
 	struct mm_struct *mm = current->mm;
-	uprobe_opcode_t insn[5];
+	uprobe_opcode_t insn[OPT_INSN_SIZE];
 
 	if (!should_optimize(auprobe))
 		return;
@@ -1142,7 +1180,7 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
 	 * Check if some other thread already optimized the uprobe for us,
 	 * if it's the case just go away silently.
 	 */
-	if (copy_from_vaddr(mm, vaddr, &insn, 5))
+	if (copy_from_vaddr(mm, vaddr, &insn, OPT_INSN_SIZE))
 		goto unlock;
 	if (!is_swbp_insn((uprobe_opcode_t*) &insn))
 		goto unlock;
@@ -1160,14 +1198,24 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
 
 static bool can_optimize(struct insn *insn, unsigned long vaddr)
 {
-	if (!insn->x86_64 || insn->length != 5)
+	if (!insn->x86_64)
 		return false;
 
-	if (!insn_is_nop(insn))
+	/* We can't do cross page atomic writes yet. */
+	if (PAGE_SIZE - (vaddr & ~PAGE_MASK) < OPT_INSN_SIZE)
 		return false;
 
-	/* We can't do cross page atomic writes yet. */
-	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+	/* We can optimize on top of nop10.. */
+	if (insn->length == OPT_INSN_SIZE && insn_is_nop(insn))
+		return true;
+
+	/* .. and JMP rel8 to end of slot — check swbp_unoptimize. */
+	if (insn->length == 2 &&
+	    insn->opcode.bytes[0] == JMP8_INSN_OPCODE &&
+	    insn->immediate.value == OPT_JMP8_OFFSET)
+		return true;
+
+	return false;
 }
 #else /* 32-bit: */
 /*
-- 
2.53.0


^ permalink raw reply related

* [PATCHv2 04/11] libbpf: Change has_nop_combo to work on top of nop10
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: Jakub Sitnicki, bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

We now expect nop combo with 10 bytes nop instead of 5 bytes nop,
fixing has_nop_combo to reflect that.

Fixes: 41a5c7df4466 ("libbpf: Add support to detect nop,nop5 instructions combo for usdt probe")
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/lib/bpf/usdt.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
index e3710933fd52..7e62e4d5bedd 100644
--- a/tools/lib/bpf/usdt.c
+++ b/tools/lib/bpf/usdt.c
@@ -305,7 +305,7 @@ struct usdt_manager *usdt_manager_new(struct bpf_object *obj)
 
 	/*
 	 * Detect kernel support for uprobe() syscall, it's presence means we can
-	 * take advantage of faster nop5 uprobe handling.
+	 * take advantage of faster nop10 uprobe handling.
 	 * Added in: 56101b69c919 ("uprobes/x86: Add uprobe syscall to speed up uprobe")
 	 */
 	man->has_uprobe_syscall = kernel_supports(obj, FEAT_UPROBE_SYSCALL);
@@ -596,14 +596,14 @@ static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note,
 #if defined(__x86_64__)
 static bool has_nop_combo(int fd, long off)
 {
-	unsigned char nop_combo[6] = {
-		0x90, 0x0f, 0x1f, 0x44, 0x00, 0x00 /* nop,nop5 */
+	unsigned char nop_combo[11] = {
+		0x90, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00,
 	};
-	unsigned char buf[6];
+	unsigned char buf[11];
 
-	if (pread(fd, buf, 6, off) != 6)
+	if (pread(fd, buf, 11, off) != 11)
 		return false;
-	return memcmp(buf, nop_combo, 6) == 0;
+	return memcmp(buf, nop_combo, 11) == 0;
 }
 #else
 static bool has_nop_combo(int fd, long off)
@@ -814,8 +814,8 @@ static int collect_usdt_targets(struct usdt_manager *man, struct elf_fd *elf_fd,
 		memset(target, 0, sizeof(*target));
 
 		/*
-		 * We have uprobe syscall and usdt with nop,nop5 instructions combo,
-		 * so we can place the uprobe directly on nop5 (+1) and get this probe
+		 * We have uprobe syscall and usdt with nop,nop10 instructions combo,
+		 * so we can place the uprobe directly on nop10 (+1) and get this probe
 		 * optimized.
 		 */
 		if (man->has_uprobe_syscall && has_nop_combo(elf_fd->fd, usdt_rel_ip)) {
-- 
2.53.0


^ permalink raw reply related

* [PATCHv2 05/11] libbpf: Detect uprobe syscall with new error
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

In the previous optimized uprobe fix we changed the syscall
error used for its detection from ENXIO to EPROTO.

Changing related probe_uprobe_syscall detection check.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Fixes: 05738da0efa1 ("libbpf: Add uprobe syscall feature detection")
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/lib/bpf/features.c                                | 4 ++--
 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c
index b7e388f99d0b..e5641fa60163 100644
--- a/tools/lib/bpf/features.c
+++ b/tools/lib/bpf/features.c
@@ -577,10 +577,10 @@ static int probe_ldimm64_full_range_off(int token_fd)
 static int probe_uprobe_syscall(int token_fd)
 {
 	/*
-	 * If kernel supports uprobe() syscall, it will return -ENXIO when called
+	 * If kernel supports uprobe() syscall, it will return -EPROTO when called
 	 * from the outside of a kernel-generated uprobe trampoline.
 	 */
-	return syscall(__NR_uprobe) < 0 && errno == ENXIO;
+	return syscall(__NR_uprobe) < 0 && errno == EPROTO;
 }
 #else
 static int probe_uprobe_syscall(int token_fd)
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 955a37751b52..c944136252c6 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -762,7 +762,7 @@ static void test_uprobe_error(void)
 	long err = syscall(__NR_uprobe);
 
 	ASSERT_EQ(err, -1, "error");
-	ASSERT_EQ(errno, ENXIO, "errno");
+	ASSERT_EQ(errno, EPROTO, "errno");
 }
 
 static void __test_uprobe_syscall(void)
-- 
2.53.0


^ permalink raw reply related

* [PATCHv2 06/11] selftests/bpf: Emit nop,nop10 instructions combo for x86_64 arch
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

Syncing latest usdt.h change [1].

Now that we have nop10 optimization support in kernel, let's emit
nop,nop10 for usdt probe. We leave it up to the library to use
desirable nop instruction.

[1] TBD
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/testing/selftests/bpf/usdt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h
index c71e21df38b3..d359663b9c32 100644
--- a/tools/testing/selftests/bpf/usdt.h
+++ b/tools/testing/selftests/bpf/usdt.h
@@ -313,7 +313,7 @@ struct usdt_sema { volatile unsigned short active; };
 #if defined(__ia64__) || defined(__s390__) || defined(__s390x__)
 #define USDT_NOP			nop 0
 #elif defined(__x86_64__)
-#define USDT_NOP                       .byte 0x90, 0x0f, 0x1f, 0x44, 0x00, 0x0 /* nop, nop5 */
+#define USDT_NOP                       .byte 0x90, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 /* nop, nop10 */
 #else
 #define USDT_NOP			nop
 #endif
-- 
2.53.0


^ permalink raw reply related

* [PATCHv2 07/11] selftests/bpf: Change uprobe syscall tests to use nop10
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

Optimized uprobes are now on top of 10-bytes nop instructions,
reflect that in existing tests.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 .../selftests/bpf/benchs/bench_trigger.c      |  2 +-
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 29 ++++++++++---------
 tools/testing/selftests/bpf/prog_tests/usdt.c | 25 +++++++++-------
 tools/testing/selftests/bpf/usdt_2.c          |  2 +-
 4 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 2f22ec61667b..bcc4820c802e 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -398,7 +398,7 @@ static void *uprobe_producer_ret(void *input)
 #ifdef __x86_64__
 __nocf_check __weak void uprobe_target_nop5(void)
 {
-	asm volatile (".byte 0x0f, 0x1f, 0x44, 0x00, 0x00");
+	asm volatile (".byte 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
 }
 
 static void *uprobe_producer_nop5(void *input)
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index c944136252c6..e4a19dc9df69 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -17,7 +17,7 @@
 #include "uprobe_syscall_executed.skel.h"
 #include "bpf/libbpf_internal.h"
 
-#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00
+#define USDT_NOP .byte 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
 #include "usdt.h"
 
 #pragma GCC diagnostic ignored "-Wattributes"
@@ -26,7 +26,7 @@ __attribute__((aligned(16)))
 __nocf_check __weak __naked unsigned long uprobe_regs_trigger(void)
 {
 	asm volatile (
-		".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */
+		".byte 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00\n" /* nop10 */
 		"movq $0xdeadbeef, %rax\n"
 		"ret\n"
 	);
@@ -345,9 +345,9 @@ static void test_uretprobe_syscall_call(void)
 __attribute__((aligned(16)))
 __nocf_check __weak __naked void uprobe_test(void)
 {
-	asm volatile ("					\n"
-		".byte 0x0f, 0x1f, 0x44, 0x00, 0x00	\n"
-		"ret					\n"
+	asm volatile (
+		".byte 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00\n" /* nop10 */
+		"ret\n"
 	);
 }
 
@@ -388,14 +388,16 @@ static int find_uprobes_trampoline(void *tramp_addr)
 	return ret;
 }
 
-static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };
+static unsigned char jmp2B[2]   = { 0xeb, 8 };
+static unsigned char nop10[10]  = { 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };
+static unsigned char lea_rsp[5] = { 0x48, 0x8d, 0x64, 0x24, 0x80 };
 
-static void *find_nop5(void *fn)
+static void *find_nop10(void *fn)
 {
 	int i;
 
-	for (i = 0; i < 10; i++) {
-		if (!memcmp(nop5, fn + i, 5))
+	for (i = 0; i < 128; i++) {
+		if (!memcmp(nop10, fn + i, 10))
 			return fn + i;
 	}
 	return NULL;
@@ -420,7 +422,8 @@ static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigge
 	ASSERT_EQ(skel->bss->executed, executed, "executed");
 
 	/* .. and check the trampoline is as expected. */
-	call = (struct __arch_relative_insn *) addr;
+	ASSERT_OK(memcmp(addr, lea_rsp, 5), "lea_rsp");
+	call = (struct __arch_relative_insn *)(addr + 5);
 	tramp = (void *) (call + 1) + call->raddr;
 	ASSERT_EQ(call->op, 0xe8, "call");
 	ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
@@ -432,7 +435,7 @@ static void check_detach(void *addr, void *tramp)
 {
 	/* [uprobes_trampoline] stays after detach */
 	ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
-	ASSERT_OK(memcmp(addr, nop5, 5), "nop5");
+	ASSERT_OK(memcmp(addr, jmp2B, 2), "jmp2B");
 }
 
 static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link,
@@ -568,8 +571,8 @@ static void test_uprobe_usdt(void)
 	void *addr;
 
 	errno = 0;
-	addr = find_nop5(usdt_test);
-	if (!ASSERT_OK_PTR(addr, "find_nop5"))
+	addr = find_nop10(usdt_test);
+	if (!ASSERT_OK_PTR(addr, "find_nop10"))
 		return;
 
 	skel = uprobe_syscall_executed__open_and_load();
diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c
index 69759b27794d..a160d7c4fa0d 100644
--- a/tools/testing/selftests/bpf/prog_tests/usdt.c
+++ b/tools/testing/selftests/bpf/prog_tests/usdt.c
@@ -252,7 +252,7 @@ extern void usdt_1(void);
 extern void usdt_2(void);
 
 static unsigned char nop1[1] = { 0x90 };
-static unsigned char nop1_nop5_combo[6] = { 0x90, 0x0f, 0x1f, 0x44, 0x00, 0x00 };
+static unsigned char nop1_nop10_combo[11] = { 0x90, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };
 
 static void *find_instr(void *fn, unsigned char *instr, size_t cnt)
 {
@@ -271,17 +271,17 @@ static void subtest_optimized_attach(void)
 	__u8 *addr_1, *addr_2;
 
 	/* usdt_1 USDT probe has single nop instruction */
-	addr_1 = find_instr(usdt_1, nop1_nop5_combo, 6);
-	if (!ASSERT_NULL(addr_1, "usdt_1_find_nop1_nop5_combo"))
+	addr_1 = find_instr(usdt_1, nop1_nop10_combo, 11);
+	if (!ASSERT_NULL(addr_1, "usdt_1_find_nop1_nop10_combo"))
 		return;
 
 	addr_1 = find_instr(usdt_1, nop1, 1);
 	if (!ASSERT_OK_PTR(addr_1, "usdt_1_find_nop1"))
 		return;
 
-	/* usdt_2 USDT probe has nop,nop5 instructions combo */
-	addr_2 = find_instr(usdt_2, nop1_nop5_combo, 6);
-	if (!ASSERT_OK_PTR(addr_2, "usdt_2_find_nop1_nop5_combo"))
+	/* usdt_2 USDT probe has nop,nop10 instructions combo */
+	addr_2 = find_instr(usdt_2, nop1_nop10_combo, 11);
+	if (!ASSERT_OK_PTR(addr_2, "usdt_2_find_nop1_nop10_combo"))
 		return;
 
 	skel = test_usdt__open_and_load();
@@ -309,12 +309,12 @@ static void subtest_optimized_attach(void)
 
 	bpf_link__destroy(skel->links.usdt_executed);
 
-	/* we expect the nop5 ip */
+	/* we expect the nop10 ip */
 	skel->bss->expected_ip = (unsigned long) addr_2 + 1;
 
 	/*
 	 * Attach program on top of usdt_2 which is probe defined on top
-	 * of nop1,nop5 combo, so the probe gets optimized on top of nop5.
+	 * of nop1,nop10 combo, so the probe gets optimized on top of nop10.
 	 */
 	skel->links.usdt_executed = bpf_program__attach_usdt(skel->progs.usdt_executed,
 						     0 /*self*/, "/proc/self/exe",
@@ -328,8 +328,13 @@ static void subtest_optimized_attach(void)
 	/* nop stays on addr_2 address */
 	ASSERT_EQ(*addr_2, 0x90, "nop");
 
-	/* call is on addr_2 + 1 address */
-	ASSERT_EQ(*(addr_2 + 1), 0xe8, "call");
+	/*
+	 * lea -0x80(%rsp), %rsp
+	 * call ...
+	 */
+	static unsigned char expected[] = { 0x48, 0x8d, 0x64, 0x24, 0x80, 0xe8 };
+
+	ASSERT_MEMEQ(addr_2 + 1, expected, sizeof(expected), "lea_and_call");
 	ASSERT_EQ(skel->bss->executed, 4, "executed");
 
 cleanup:
diff --git a/tools/testing/selftests/bpf/usdt_2.c b/tools/testing/selftests/bpf/usdt_2.c
index 789883aaca4c..b359b389f6c0 100644
--- a/tools/testing/selftests/bpf/usdt_2.c
+++ b/tools/testing/selftests/bpf/usdt_2.c
@@ -3,7 +3,7 @@
 #if defined(__x86_64__)
 
 /*
- * Include usdt.h with default nop,nop5 instructions combo.
+ * Include usdt.h with default nop,nop10 instructions combo.
  */
 #include "usdt.h"
 
-- 
2.53.0


^ permalink raw reply related

* [PATCHv2 08/11] selftests/bpf: Change uprobe/usdt trigger bench code to use nop10
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

Changing uprobe/usdt trigger bench code to use nop10 instead
of nop5. Also changing un_bench_uprobes.sh to use nop10 triggers.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/testing/selftests/bpf/bench.c           | 20 +++++------
 .../selftests/bpf/benchs/bench_trigger.c      | 36 +++++++++----------
 .../selftests/bpf/benchs/run_bench_uprobes.sh |  2 +-
 3 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 6155ce455c27..1252a1af2e84 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -539,12 +539,12 @@ extern const struct bench bench_trig_uretprobe_multi_push;
 extern const struct bench bench_trig_uprobe_multi_ret;
 extern const struct bench bench_trig_uretprobe_multi_ret;
 #ifdef __x86_64__
-extern const struct bench bench_trig_uprobe_nop5;
-extern const struct bench bench_trig_uretprobe_nop5;
-extern const struct bench bench_trig_uprobe_multi_nop5;
-extern const struct bench bench_trig_uretprobe_multi_nop5;
+extern const struct bench bench_trig_uprobe_nop10;
+extern const struct bench bench_trig_uretprobe_nop10;
+extern const struct bench bench_trig_uprobe_multi_nop10;
+extern const struct bench bench_trig_uretprobe_multi_nop10;
 extern const struct bench bench_trig_usdt_nop;
-extern const struct bench bench_trig_usdt_nop5;
+extern const struct bench bench_trig_usdt_nop10;
 #endif
 
 extern const struct bench bench_rb_libbpf;
@@ -619,12 +619,12 @@ static const struct bench *benchs[] = {
 	&bench_trig_uprobe_multi_ret,
 	&bench_trig_uretprobe_multi_ret,
 #ifdef __x86_64__
-	&bench_trig_uprobe_nop5,
-	&bench_trig_uretprobe_nop5,
-	&bench_trig_uprobe_multi_nop5,
-	&bench_trig_uretprobe_multi_nop5,
+	&bench_trig_uprobe_nop10,
+	&bench_trig_uretprobe_nop10,
+	&bench_trig_uprobe_multi_nop10,
+	&bench_trig_uretprobe_multi_nop10,
 	&bench_trig_usdt_nop,
-	&bench_trig_usdt_nop5,
+	&bench_trig_usdt_nop10,
 #endif
 	/* ringbuf/perfbuf benchmarks */
 	&bench_rb_libbpf,
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index bcc4820c802e..3998ea8ff9aa 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -396,15 +396,15 @@ static void *uprobe_producer_ret(void *input)
 }
 
 #ifdef __x86_64__
-__nocf_check __weak void uprobe_target_nop5(void)
+__nocf_check __weak void uprobe_target_nop10(void)
 {
 	asm volatile (".byte 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
 }
 
-static void *uprobe_producer_nop5(void *input)
+static void *uprobe_producer_nop10(void *input)
 {
 	while (true)
-		uprobe_target_nop5();
+		uprobe_target_nop10();
 	return NULL;
 }
 
@@ -418,7 +418,7 @@ static void *uprobe_producer_usdt_nop(void *input)
 	return NULL;
 }
 
-static void *uprobe_producer_usdt_nop5(void *input)
+static void *uprobe_producer_usdt_nop10(void *input)
 {
 	while (true)
 		usdt_2();
@@ -542,24 +542,24 @@ static void uretprobe_multi_ret_setup(void)
 }
 
 #ifdef __x86_64__
-static void uprobe_nop5_setup(void)
+static void uprobe_nop10_setup(void)
 {
-	usetup(false, false /* !use_multi */, &uprobe_target_nop5);
+	usetup(false, false /* !use_multi */, &uprobe_target_nop10);
 }
 
-static void uretprobe_nop5_setup(void)
+static void uretprobe_nop10_setup(void)
 {
-	usetup(true, false /* !use_multi */, &uprobe_target_nop5);
+	usetup(true, false /* !use_multi */, &uprobe_target_nop10);
 }
 
-static void uprobe_multi_nop5_setup(void)
+static void uprobe_multi_nop10_setup(void)
 {
-	usetup(false, true /* use_multi */, &uprobe_target_nop5);
+	usetup(false, true /* use_multi */, &uprobe_target_nop10);
 }
 
-static void uretprobe_multi_nop5_setup(void)
+static void uretprobe_multi_nop10_setup(void)
 {
-	usetup(true, true /* use_multi */, &uprobe_target_nop5);
+	usetup(true, true /* use_multi */, &uprobe_target_nop10);
 }
 
 static void usdt_setup(const char *name)
@@ -598,7 +598,7 @@ static void usdt_nop_setup(void)
 	usdt_setup("usdt_1");
 }
 
-static void usdt_nop5_setup(void)
+static void usdt_nop10_setup(void)
 {
 	usdt_setup("usdt_2");
 }
@@ -665,10 +665,10 @@ BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop");
 BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push");
 BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret");
 #ifdef __x86_64__
-BENCH_TRIG_USERMODE(uprobe_nop5, nop5, "uprobe-nop5");
-BENCH_TRIG_USERMODE(uretprobe_nop5, nop5, "uretprobe-nop5");
-BENCH_TRIG_USERMODE(uprobe_multi_nop5, nop5, "uprobe-multi-nop5");
-BENCH_TRIG_USERMODE(uretprobe_multi_nop5, nop5, "uretprobe-multi-nop5");
+BENCH_TRIG_USERMODE(uprobe_nop10, nop10, "uprobe-nop10");
+BENCH_TRIG_USERMODE(uretprobe_nop10, nop10, "uretprobe-nop10");
+BENCH_TRIG_USERMODE(uprobe_multi_nop10, nop10, "uprobe-multi-nop10");
+BENCH_TRIG_USERMODE(uretprobe_multi_nop10, nop10, "uretprobe-multi-nop10");
 BENCH_TRIG_USERMODE(usdt_nop, usdt_nop, "usdt-nop");
-BENCH_TRIG_USERMODE(usdt_nop5, usdt_nop5, "usdt-nop5");
+BENCH_TRIG_USERMODE(usdt_nop10, usdt_nop10, "usdt-nop10");
 #endif
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh
index 9ec59423b949..e490b337e960 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_uprobes.sh
@@ -2,7 +2,7 @@
 
 set -eufo pipefail
 
-for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret,nop5} usdt-nop usdt-nop5
+for i in usermode-count syscall-count {uprobe,uretprobe}-{nop,push,ret,nop10} usdt-nop usdt-nop10
 do
 	summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
 	printf "%-15s: %s\n" $i "$summary"
-- 
2.53.0


^ permalink raw reply related

* [PATCHv2 09/11] selftests/bpf: Add reattach tests for uprobe syscall
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

Adding reattach tests for uprobe syscall tests to make sure
we can re-attach and optimize same uprobe multiple times.

The reason is that optimized uprobe does not restore original
nop10 after detach, but instead it uses 'jmp 8' instruction.

Making sure we can still install and optimize uprobe on top
of the 'jmp 8' instruction.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 115 ++++++++++++++++--
 1 file changed, 105 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index e4a19dc9df69..e88b316a3f2c 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -431,21 +431,27 @@ static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigge
 	return tramp;
 }
 
-static void check_detach(void *addr, void *tramp)
+static bool check_detach(void *addr, void *tramp)
 {
+	bool ok = true;
+
 	/* [uprobes_trampoline] stays after detach */
-	ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
-	ASSERT_OK(memcmp(addr, jmp2B, 2), "jmp2B");
+	if (!ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"))
+		ok = false;
+	if (!ASSERT_OK(memcmp(addr, jmp2B, 2), "jmp2B"))
+		ok = false;
+	return ok;
 }
 
-static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link,
-		  trigger_t trigger, void *addr, int executed)
+static void *check(struct uprobe_syscall_executed *skel, struct bpf_link *link,
+		   trigger_t trigger, void *addr, int executed)
 {
 	void *tramp;
 
 	tramp = check_attach(skel, trigger, addr, executed);
 	bpf_link__destroy(link);
 	check_detach(addr, tramp);
+	return tramp;
 }
 
 static void test_uprobe_legacy(void)
@@ -456,6 +462,7 @@ static void test_uprobe_legacy(void)
 	);
 	struct bpf_link *link;
 	unsigned long offset;
+	void *tramp;
 
 	offset = get_uprobe_offset(&uprobe_test);
 	if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
@@ -473,7 +480,28 @@ static void test_uprobe_legacy(void)
 	if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts"))
 		goto cleanup;
 
-	check(skel, link, uprobe_test, uprobe_test, 2);
+	tramp = check(skel, link, uprobe_test, uprobe_test, 2);
+
+	/* reattach and detach without triggering optimization */
+	link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe,
+					       0, "/proc/self/exe", offset, NULL);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts"))
+		goto cleanup;
+
+	bpf_link__destroy(link);
+	if (!check_detach(uprobe_test, tramp))
+		goto cleanup;
+
+	uprobe_test();
+	ASSERT_EQ(skel->bss->executed, 2, "executed_no_probe");
+
+	/* reattach with triggering optimization */
+	link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe,
+				0, "/proc/self/exe", offset, NULL);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts"))
+		goto cleanup;
+
+	check(skel, link, uprobe_test, uprobe_test, 4);
 
 	/* uretprobe */
 	skel->bss->executed = 0;
@@ -495,6 +523,7 @@ static void test_uprobe_multi(void)
 	LIBBPF_OPTS(bpf_uprobe_multi_opts, opts);
 	struct bpf_link *link;
 	unsigned long offset;
+	void *tramp;
 
 	offset = get_uprobe_offset(&uprobe_test);
 	if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
@@ -515,7 +544,28 @@ static void test_uprobe_multi(void)
 	if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
 		goto cleanup;
 
-	check(skel, link, uprobe_test, uprobe_test, 2);
+	tramp = check(skel, link, uprobe_test, uprobe_test, 2);
+
+	/* reattach and detach without triggering optimization */
+	link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi,
+				0, "/proc/self/exe", NULL, &opts);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
+		goto cleanup;
+
+	bpf_link__destroy(link);
+	if (!check_detach(uprobe_test, tramp))
+		goto cleanup;
+
+	uprobe_test();
+	ASSERT_EQ(skel->bss->executed, 2, "executed_no_probe");
+
+	/* reattach with triggering optimization */
+	link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi,
+				0, "/proc/self/exe", NULL, &opts);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
+		goto cleanup;
+
+	check(skel, link, uprobe_test, uprobe_test, 4);
 
 	/* uretprobe.multi */
 	skel->bss->executed = 0;
@@ -539,6 +589,7 @@ static void test_uprobe_session(void)
 	);
 	struct bpf_link *link;
 	unsigned long offset;
+	void *tramp;
 
 	offset = get_uprobe_offset(&uprobe_test);
 	if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
@@ -558,7 +609,28 @@ static void test_uprobe_session(void)
 	if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
 		goto cleanup;
 
-	check(skel, link, uprobe_test, uprobe_test, 4);
+	tramp = check(skel, link, uprobe_test, uprobe_test, 4);
+
+	/* reattach and detach without triggering optimization */
+	link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session,
+				0, "/proc/self/exe", NULL, &opts);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
+		goto cleanup;
+
+	bpf_link__destroy(link);
+	if (!check_detach(uprobe_test, tramp))
+		goto cleanup;
+
+	uprobe_test();
+	ASSERT_EQ(skel->bss->executed, 4, "executed_no_probe");
+
+	/* reattach with triggering optimization */
+	link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session,
+				0, "/proc/self/exe", NULL, &opts);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
+		goto cleanup;
+
+	check(skel, link, uprobe_test, uprobe_test, 8);
 
 cleanup:
 	uprobe_syscall_executed__destroy(skel);
@@ -568,7 +640,7 @@ static void test_uprobe_usdt(void)
 {
 	struct uprobe_syscall_executed *skel;
 	struct bpf_link *link;
-	void *addr;
+	void *addr, *tramp;
 
 	errno = 0;
 	addr = find_nop10(usdt_test);
@@ -587,7 +659,30 @@ static void test_uprobe_usdt(void)
 	if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt"))
 		goto cleanup;
 
-	check(skel, link, usdt_test, addr, 2);
+	tramp = check(skel, link, usdt_test, addr, 2);
+
+	/* reattach and detach without triggering optimization */
+	link = bpf_program__attach_usdt(skel->progs.test_usdt,
+				-1 /* all PIDs */, "/proc/self/exe",
+				"optimized_uprobe", "usdt", NULL);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt"))
+		goto cleanup;
+
+	bpf_link__destroy(link);
+	if (!check_detach(addr, tramp))
+		goto cleanup;
+
+	usdt_test();
+	ASSERT_EQ(skel->bss->executed, 2, "executed_no_probe");
+
+	/* reattach with triggering optimization */
+	link = bpf_program__attach_usdt(skel->progs.test_usdt,
+				-1 /* all PIDs */, "/proc/self/exe",
+				"optimized_uprobe", "usdt", NULL);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt"))
+		goto cleanup;
+
+	check(skel, link, usdt_test, addr, 4);
 
 cleanup:
 	uprobe_syscall_executed__destroy(skel);
-- 
2.53.0


^ permalink raw reply related

* [PATCHv2 10/11] selftests/bpf: Add tests for uprobe nop10 red zone clobbering
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

From: Andrii Nakryiko <andrii@kernel.org>

The uprobe nop5 optimization used to replace a 5-byte NOP with a 5-byte
CALL to a trampoline. The CALL pushes a return address onto the stack at
[rsp-8], clobbering whatever was stored there.

On x86-64, the red zone is the 128 bytes below rsp that user code may use
for temporary storage without adjusting rsp. Compilers can place USDT
argument operands there, generating specs like "8@-8(%rbp)" when rbp ==
rsp. With the CALL-based optimization, the return address overwrites that
argument before the BPF-side USDT argument fetch runs.

Add two tests for this case. The uprobe_syscall subtest stores known values
at -8(%rsp), -16(%rsp), and -24(%rsp), executes an optimized nop10 uprobe,
and verifies the red-zone data is still intact. The USDT subtest triggers a
probe in a function where the compiler places three USDT operands in the
red zone and verifies that all 10 optimized invocations deliver the expected
argument values to BPF.

On an unfixed kernel, the first hit goes through the INT3 path and later
hits use the optimized CALL path, so the red-zone checks fail after
optimization.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
[ updates to use nop10 ]
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 75 +++++++++++++++++++
 tools/testing/selftests/bpf/prog_tests/usdt.c | 49 ++++++++++++
 tools/testing/selftests/bpf/progs/test_usdt.c | 25 +++++++
 tools/testing/selftests/bpf/usdt_2.c          | 13 ++++
 4 files changed, 162 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index e88b316a3f2c..739b0be13aa3 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -357,6 +357,48 @@ __nocf_check __weak void usdt_test(void)
 	USDT(optimized_uprobe, usdt);
 }
 
+/*
+ * Assembly-level red zone clobbering test. Stores known values in the
+ * red zone (below RSP), executes a nop10 (uprobe site), and checks that
+ * the values survived. Returns 0 if intact, 1 if clobbered.
+ *
+ * The nop5 optimization used CALL (which pushes a return address to
+ * [rsp-8]), the value at -8(%rsp) was overwritten. The nop10 optimization
+ * should escape that by moving stackpointer below the redzone before
+ * doing the CALL.
+ */
+__attribute__((aligned(16)))
+__nocf_check __weak __naked unsigned long uprobe_red_zone_test(void)
+{
+	asm volatile (
+		"movabs $0x1111111111111111, %%rax\n"
+		"movq   %%rax, -8(%%rsp)\n"
+		"movabs $0x2222222222222222, %%rax\n"
+		"movq   %%rax, -16(%%rsp)\n"
+		"movabs $0x3333333333333333, %%rax\n"
+		"movq   %%rax, -24(%%rsp)\n"
+
+		".byte 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00\n" /* nop10: uprobe site */
+
+		"movabs $0x1111111111111111, %%rax\n"
+		"cmpq   %%rax, -8(%%rsp)\n"
+		"jne    1f\n"
+		"movabs $0x2222222222222222, %%rax\n"
+		"cmpq   %%rax, -16(%%rsp)\n"
+		"jne    1f\n"
+		"movabs $0x3333333333333333, %%rax\n"
+		"cmpq   %%rax, -24(%%rsp)\n"
+		"jne    1f\n"
+
+		"xorl   %%eax, %%eax\n"
+		"retq\n"
+		"1:\n"
+		"movl   $1, %%eax\n"
+		"retq\n"
+		::: "rax", "memory"
+	);
+}
+
 static int find_uprobes_trampoline(void *tramp_addr)
 {
 	void *start, *end;
@@ -855,6 +897,37 @@ static void test_uprobe_race(void)
 #define __NR_uprobe 336
 #endif
 
+static void test_uprobe_red_zone(void)
+{
+	struct uprobe_syscall_executed *skel;
+	struct bpf_link *link;
+	void *nop10_addr;
+	size_t offset;
+	int i;
+
+	nop10_addr = find_nop10(uprobe_red_zone_test);
+	if (!ASSERT_NEQ(nop10_addr, NULL, "find_nop10"))
+		return;
+
+	skel = uprobe_syscall_executed__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	offset = get_uprobe_offset(nop10_addr);
+	link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe,
+			0, "/proc/self/exe", offset, NULL);
+	if (!ASSERT_OK_PTR(link, "attach_uprobe"))
+		goto cleanup;
+
+	for (i = 0; i < 10; i++)
+		ASSERT_EQ(uprobe_red_zone_test(), 0, "red_zone_intact");
+
+	bpf_link__destroy(link);
+
+cleanup:
+	uprobe_syscall_executed__destroy(skel);
+}
+
 static void test_uprobe_error(void)
 {
 	long err = syscall(__NR_uprobe);
@@ -881,6 +954,8 @@ static void __test_uprobe_syscall(void)
 		test_uprobe_usdt();
 	if (test__start_subtest("uprobe_race"))
 		test_uprobe_race();
+	if (test__start_subtest("uprobe_red_zone"))
+		test_uprobe_red_zone();
 	if (test__start_subtest("uprobe_error"))
 		test_uprobe_error();
 	if (test__start_subtest("uprobe_regs_equal"))
diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c
index a160d7c4fa0d..8954f543d68e 100644
--- a/tools/testing/selftests/bpf/prog_tests/usdt.c
+++ b/tools/testing/selftests/bpf/prog_tests/usdt.c
@@ -250,6 +250,7 @@ static void subtest_basic_usdt(bool optimized)
 #ifdef __x86_64__
 extern void usdt_1(void);
 extern void usdt_2(void);
+extern void usdt_red_zone_trigger(void);
 
 static unsigned char nop1[1] = { 0x90 };
 static unsigned char nop1_nop10_combo[11] = { 0x90, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };
@@ -340,6 +341,52 @@ static void subtest_optimized_attach(void)
 cleanup:
 	test_usdt__destroy(skel);
 }
+
+/*
+ * Test that USDT arguments survive nop10 optimization in a function where
+ * the compiler places operands in the red zone.
+ *
+ * Signal handlers are prone to having the compiler place USDT argument
+ * operands in the red zone (below rsp).
+ *
+ * The nop5 optimization used CALL (which pushes a return address to
+ * [rsp-8]), the value at -8(%rsp) was overwritten. The nop10 optimization
+ * should escape that by moving stackpointer below the redzone before
+ * doing the CALL.
+ */
+static void subtest_optimized_red_zone(void)
+{
+	struct test_usdt *skel;
+	int i;
+
+	skel = test_usdt__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		return;
+
+	skel->bss->expected_arg[0] = 0xDEADBEEF;
+	skel->bss->expected_arg[1] = 0xCAFEBABE;
+	skel->bss->expected_arg[2] = 0xFEEDFACE;
+	skel->bss->expected_pid = getpid();
+
+	skel->links.usdt_check_arg = bpf_program__attach_usdt(
+		skel->progs.usdt_check_arg, 0, "/proc/self/exe",
+		"optimized_attach", "usdt_red_zone", NULL);
+	if (!ASSERT_OK_PTR(skel->links.usdt_check_arg, "attach_usdt_red_zone"))
+		goto cleanup;
+
+	for (i = 0; i < 10; i++)
+		usdt_red_zone_trigger();
+
+	ASSERT_EQ(skel->bss->arg_total, 10, "arg_total");
+	ASSERT_EQ(skel->bss->arg_bad, 0, "arg_bad");
+	ASSERT_EQ(skel->bss->arg_last[0], 0xDEADBEEF, "arg_last_1");
+	ASSERT_EQ(skel->bss->arg_last[1], 0xCAFEBABE, "arg_last_2");
+	ASSERT_EQ(skel->bss->arg_last[2], 0xFEEDFACE, "arg_last_3");
+
+cleanup:
+	test_usdt__destroy(skel);
+}
+
 #endif
 
 unsigned short test_usdt_100_semaphore SEC(".probes");
@@ -613,6 +660,8 @@ void test_usdt(void)
 		subtest_basic_usdt(true);
 	if (test__start_subtest("optimized_attach"))
 		subtest_optimized_attach();
+	if (test__start_subtest("optimized_red_zone"))
+		subtest_optimized_red_zone();
 #endif
 	if (test__start_subtest("multispec"))
 		subtest_multispec_usdt();
diff --git a/tools/testing/selftests/bpf/progs/test_usdt.c b/tools/testing/selftests/bpf/progs/test_usdt.c
index f00cb52874e0..0ee78fb050a1 100644
--- a/tools/testing/selftests/bpf/progs/test_usdt.c
+++ b/tools/testing/selftests/bpf/progs/test_usdt.c
@@ -149,5 +149,30 @@ int usdt_executed(struct pt_regs *ctx)
 		executed++;
 	return 0;
 }
+
+int arg_total;
+int arg_bad;
+long arg_last[3];
+long expected_arg[3];
+int expected_pid;
+
+SEC("usdt")
+int BPF_USDT(usdt_check_arg, long arg1, long arg2, long arg3)
+{
+	if (expected_pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+
+	__sync_fetch_and_add(&arg_total, 1);
+	arg_last[0] = arg1;
+	arg_last[1] = arg2;
+	arg_last[2] = arg3;
+
+	if (arg1 != expected_arg[0] ||
+	    arg2 != expected_arg[1] ||
+	    arg3 != expected_arg[2])
+		__sync_fetch_and_add(&arg_bad, 1);
+
+	return 0;
+}
 #endif
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/usdt_2.c b/tools/testing/selftests/bpf/usdt_2.c
index b359b389f6c0..5e38f8605b02 100644
--- a/tools/testing/selftests/bpf/usdt_2.c
+++ b/tools/testing/selftests/bpf/usdt_2.c
@@ -13,4 +13,17 @@ void usdt_2(void)
 	USDT(optimized_attach, usdt_2);
 }
 
+static volatile unsigned long usdt_red_zone_arg1 = 0xDEADBEEF;
+static volatile unsigned long usdt_red_zone_arg2 = 0xCAFEBABE;
+static volatile unsigned long usdt_red_zone_arg3 = 0xFEEDFACE;
+
+void __attribute__((noinline)) usdt_red_zone_trigger(void)
+{
+	unsigned long a1 = usdt_red_zone_arg1;
+	unsigned long a2 = usdt_red_zone_arg2;
+	unsigned long a3 = usdt_red_zone_arg3;
+
+	USDT(optimized_attach, usdt_red_zone, a1, a2, a3);
+}
+
 #endif
-- 
2.53.0


^ permalink raw reply related

* [PATCHv2 11/11] selftests/bpf: Add tests for forked/cloned optimized uprobes
From: Jiri Olsa @ 2026-05-18 10:59 UTC (permalink / raw)
  To: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Masami Hiramatsu,
	Andrii Nakryiko
  Cc: bpf, linux-trace-kernel
In-Reply-To: <20260518105957.123445-1-jolsa@kernel.org>

Adding tests for forked/cloned optimized uprobes and make
sure the child can properly execute optimized probe for
both fork (dups mm) and clone with CLONE_VM.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 88 +++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 739b0be13aa3..713b42d4f190 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -4,6 +4,8 @@
 
 #ifdef __x86_64__
 
+#define _GNU_SOURCE
+#include <sched.h>
 #include <unistd.h>
 #include <asm/ptrace.h>
 #include <linux/compiler.h>
@@ -936,6 +938,88 @@ static void test_uprobe_error(void)
 	ASSERT_EQ(errno, EPROTO, "errno");
 }
 
+__attribute__((aligned(16)))
+__nocf_check __weak __naked void uprobe_fork_test(void)
+{
+	asm volatile (
+		".byte 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00\n" /* nop10 */
+		"ret\n"
+	);
+}
+
+static int child_func(void *arg)
+{
+	struct uprobe_syscall_executed *skel = arg;
+
+	/* Make sure the child's probe is still there and optimized.. */
+	if (memcmp(uprobe_fork_test, lea_rsp, sizeof(lea_rsp)))
+		_exit(1);
+
+	skel->bss->pid = getpid();
+
+	/* .. and it executes properly. */
+	uprobe_fork_test();
+
+	if (skel->bss->executed != 3)
+		_exit(2);
+
+	_exit(0);
+}
+
+static void test_uprobe_fork_optimized(bool clone_vm)
+{
+	struct uprobe_syscall_executed *skel = NULL;
+	struct bpf_link *link = NULL;
+	unsigned long offset;
+	int pid, status, err;
+	char stack[65535];
+
+	offset = get_uprobe_offset(&uprobe_fork_test);
+	if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
+		return;
+
+	skel = uprobe_syscall_executed__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto cleanup;
+
+	link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe,
+				-1, "/proc/self/exe", offset, NULL);
+	if (!ASSERT_OK_PTR(link, "attach_uprobe"))
+		goto cleanup;
+
+	skel->bss->pid = getpid();
+
+	/* Trigger optimization of uprobe in uprobe_fork_test.  */
+	uprobe_fork_test();
+	uprobe_fork_test();
+
+	/* Make sure it got optimied. */
+	if (!ASSERT_OK(memcmp(uprobe_fork_test, lea_rsp, sizeof(lea_rsp)), "optimized"))
+		goto cleanup;
+
+	if (clone_vm) {
+		pid = clone(child_func, stack + sizeof(stack), CLONE_VM|SIGCHLD, skel);
+		if (!ASSERT_GT(pid, 0, "clone"))
+			goto cleanup;
+	} else {
+		pid = fork();
+		if (!ASSERT_GE(pid, 0, "fork"))
+			goto cleanup;
+		if (pid == 0)
+			child_func(skel);
+	}
+
+	/* Wait for the child and verify it exited properly with 0. */
+	err = waitpid(pid, &status, 0);
+	if (ASSERT_EQ(err, pid, "waitpid")) {
+		ASSERT_EQ(WIFEXITED(status), 1, "child_exited");
+		ASSERT_EQ(WEXITSTATUS(status), 0, "child_exit_code");
+	}
+
+cleanup:
+	uprobe_syscall_executed__destroy(skel);
+}
+
 static void __test_uprobe_syscall(void)
 {
 	if (test__start_subtest("uretprobe_regs_equal"))
@@ -956,6 +1040,10 @@ static void __test_uprobe_syscall(void)
 		test_uprobe_race();
 	if (test__start_subtest("uprobe_red_zone"))
 		test_uprobe_red_zone();
+	if (test__start_subtest("uprobe_optimized_fork"))
+		test_uprobe_fork_optimized(false);
+	if (test__start_subtest("uprobe_optimized_clone_vm"))
+		test_uprobe_fork_optimized(true);
 	if (test__start_subtest("uprobe_error"))
 		test_uprobe_error();
 	if (test__start_subtest("uprobe_regs_equal"))
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCHv2 05/11] libbpf: Detect uprobe syscall with new error
From: bot+bpf-ci @ 2026-05-18 11:37 UTC (permalink / raw)
  To: jolsa, oleg, peterz, mingo, mhiramat, andrii
  Cc: bpf, linux-trace-kernel, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260518105957.123445-6-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2554 bytes --]

>     libbpf: Detect uprobe syscall with new error
>
>     In the previous optimized uprobe fix we changed the syscall
>     error used for its detection from ENXIO to EPROTO.
>
>     Changing related probe_uprobe_syscall detection check.
>
>     Suggested-by: Andrii Nakryiko <andrii@kernel.org>
>     Fixes: 05738da0efa1 ("libbpf: Add uprobe syscall feature detection")
>     Signed-off-by: Jiri Olsa <jolsa@kernel.org>

The Fixes: tag points to commit 05738da0efa1 ("libbpf: Add uprobe
syscall feature detection"), but wasn't the bug actually introduced by
commit 5c3c59c7952a ("uprobes/x86: Move optimized uprobe from nop5 to
nop10")? Commit 05738da0efa1 originally added the ENXIO check
correctly, but commit 5c3c59c7952a changed the kernel to return EPROTO
instead of ENXIO, breaking the libbpf detection. The commit message
even says "In the previous optimized uprobe fix we changed the syscall
error," which refers to 5c3c59c7952a.

Should this use:

  Fixes: 5c3c59c7952a ("uprobes/x86: Move optimized uprobe from nop5 to nop10")

> diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c
> index b7e388f99d0b..e5641fa60163 100644
> --- a/tools/lib/bpf/features.c
> +++ b/tools/lib/bpf/features.c
> @@ -577,10 +577,10 @@ static int probe_ldimm64_full_range_off(int token_fd)
>  static int probe_uprobe_syscall(int token_fd)
>  {
>  	/*
> -	 * If kernel supports uprobe() syscall, it will return -ENXIO when called
> +	 * If kernel supports uprobe() syscall, it will return -EPROTO when called
>  	 * from the outside of a kernel-generated uprobe trampoline.
>  	 */
> -	return syscall(__NR_uprobe) < 0 && errno == ENXIO;
> +	return syscall(__NR_uprobe) < 0 && errno == EPROTO;
>  }
>  #else
>  static int probe_uprobe_syscall(int token_fd)
> diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> index 955a37751b52..c944136252c6 100644
> --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
> @@ -762,7 +762,7 @@ static void test_uprobe_error(void)
>  	long err = syscall(__NR_uprobe);
>
>  	ASSERT_EQ(err, -1, "error");
> -	ASSERT_EQ(errno, ENXIO, "errno");
> +	ASSERT_EQ(errno, EPROTO, "errno");
>  }
>
>  static void __test_uprobe_syscall(void)

---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26030080109

^ permalink raw reply

* Re: [PATCHv2 08/11] selftests/bpf: Change uprobe/usdt trigger bench code to use nop10
From: bot+bpf-ci @ 2026-05-18 11:37 UTC (permalink / raw)
  To: jolsa, oleg, peterz, mingo, mhiramat, andrii
  Cc: bpf, linux-trace-kernel, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260518105957.123445-9-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 658 bytes --]

> selftests/bpf: Change uprobe/usdt trigger bench code to use nop10
>
> Changing uprobe/usdt trigger bench code to use nop10 instead
> of nop5. Also changing un_bench_uprobes.sh to use nop10 triggers.
>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>

This isn't a bug, but there's a typo in the commit message: the filename
is written as "un_bench_uprobes.sh" but should be "run_bench_uprobes.sh"
(starting with 'r').


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26030080109

^ permalink raw reply

* Re: [PATCHv2 04/11] libbpf: Change has_nop_combo to work on top of nop10
From: bot+bpf-ci @ 2026-05-18 11:37 UTC (permalink / raw)
  To: jolsa, oleg, peterz, mingo, mhiramat, andrii
  Cc: jakub, bpf, linux-trace-kernel, ast, andrii, daniel, martin.lau,
	eddyz87, yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260518105957.123445-5-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2138 bytes --]

> libbpf: Change has_nop_combo to work on top of nop10
>
> We now expect nop combo with 10 bytes nop instead of 5 bytes nop,
> fixing has_nop_combo to reflect that.
>
> Fixes: 41a5c7df4466 ("libbpf: Add support to detect nop,nop5 instructions combo for usdt probe")

Should the Fixes tag reference commit 5c3c59c7952a ("uprobes/x86: Move
optimized uprobe from nop5 to nop10") instead? Commit 41a5c7df4466 originally
introduced has_nop_combo() with the correct nop5 check. Commit 5c3c59c7952a
later changed the kernel to use nop10 instead of nop5 for uprobe optimization
but failed to update the libbpf has_nop_combo() function accordingly, creating
a mismatch between kernel and userspace.

> Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
>
> diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
> index e3710933fd52..7e62e4d5bedd 100644
> --- a/tools/lib/bpf/usdt.c
> +++ b/tools/lib/bpf/usdt.c

[ ... ]

> @@ -596,14 +596,14 @@ static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note,
>  #if defined(__x86_64__)
>  static bool has_nop_combo(int fd, long off)
>  {
> -	unsigned char nop_combo[6] = {
> -		0x90, 0x0f, 0x1f, 0x44, 0x00, 0x00 /* nop,nop5 */
> +	unsigned char nop_combo[11] = {
> +		0x90, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00,
>  	};
> -	unsigned char buf[6];
> +	unsigned char buf[11];
>
> -	if (pread(fd, buf, 6, off) != 6)
> +	if (pread(fd, buf, 11, off) != 11)
>  		return false;
> -	return memcmp(buf, nop_combo, 6) == 0;
> +	return memcmp(buf, nop_combo, 11) == 0;
>  }

This isn't a bug, but Jakub Sitnicki suggested in v1 review to use
ARRAY_SIZE(buf) instead of repeating the scalar value 11 in multiple places.
The suggestion was included alongside the Reviewed-by tag but was not
addressed.

Reference: https://lore.kernel.org/bpf/87fr3t7wpu.fsf@cloudflare.com/

[ ... ]


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26030080109

^ permalink raw reply

* Re: [PATCHv2 07/11] selftests/bpf: Change uprobe syscall tests to use nop10
From: bot+bpf-ci @ 2026-05-18 11:50 UTC (permalink / raw)
  To: jolsa, oleg, peterz, mingo, mhiramat, andrii
  Cc: bpf, linux-trace-kernel, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260518105957.123445-8-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2341 bytes --]

> diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> index 2f22ec61667b..bcc4820c802e 100644
> --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
> +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
> @@ -398,7 +398,7 @@ static void *uprobe_producer_ret(void *input)
>  #ifdef __x86_64__
>  __nocf_check __weak void uprobe_target_nop5(void)
>  {
> -	asm volatile (".byte 0x0f, 0x1f, 0x44, 0x00, 0x00");
> +	asm volatile (".byte 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00");
>  }
>
>  static void *uprobe_producer_nop5(void *input)

This isn't a bug, but should the function and benchmark names also be
updated? The function is still called uprobe_target_nop5() and the
benchmark exposes names like uprobe-nop5, uretprobe-nop5,
uprobe-multi-nop5, uretprobe-multi-nop5, and usdt-nop5, but they are
now all testing a 10-byte nop instruction.

[ ... ]

> diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c
> index 69759b27794d..a160d7c4fa0d 100644
> --- a/tools/testing/selftests/bpf/prog_tests/usdt.c
> +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c
> @@ -252,7 +252,7 @@ extern void usdt_1(void);
>  extern void usdt_2(void);
>
>  static unsigned char nop1[1] = { 0x90 };
> -static unsigned char nop1_nop5_combo[6] = { 0x90, 0x0f, 0x1f, 0x44, 0x00, 0x00 };
> +static unsigned char nop1_nop10_combo[11] = { 0x90, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };
>
>  static void *find_instr(void *fn, unsigned char *instr, size_t cnt)
>  {
>  	int i;
>
>  	for (i = 0; i < 10; i++) {
>  		if (!memcmp(instr, fn + i, cnt))
>  			return fn + i;
>  	}
>  	return NULL;
>  }

Is the loop in find_instr() adequate to find this instruction sequence?
In uprobe_syscall.c, find_nop10() was updated to search up to 128 bytes
to account for compiler-generated prologues. If a modern compiler
generates a prologue longer than 9 bytes for usdt_2(), this will
prematurely terminate and cause the subtest_optimized_attach() test to
fail.

[ ... ]


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26030080109

^ permalink raw reply

* Re: [PATCHv2 03/11] uprobes/x86: Move optimized uprobe from nop5 to nop10
From: bot+bpf-ci @ 2026-05-18 11:50 UTC (permalink / raw)
  To: jolsa, oleg, peterz, mingo, mhiramat, andrii
  Cc: bpf, linux-trace-kernel, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260518105957.123445-4-jolsa@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 7654 bytes --]

> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c

[ ... ]

> @@ -963,7 +975,7 @@ static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
>   *   - SMP sync all CPUs
>   */
>  static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> -		       unsigned long vaddr, char *insn, bool optimize)
> +		       unsigned long vaddr, char *insn, int size, bool optimize)
>  {
>  	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
>  	struct write_opcode_ctx ctx = {
> @@ -978,7 +990,7 @@ struct write_opcode_ctx ctx = {
>  	 * so we can skip this step for optimize == true.
>  	 */
>  	if (!optimize) {
> -		ctx.expect = EXPECT_CALL;
> +		ctx.expect = EXPECT_OPTIMIZED;
>  		err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
>  				   true /* is_register */, false /* do_update_ref_ctr */,
>  				   &ctx);
> @@ -990,7 +1002,7 @@ struct write_opcode_ctx ctx = {
>
>  	/* Write all but the first byte of the patched range. */
>  	ctx.expect = EXPECT_SWBP;
> -	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
> +	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, size - 1, verify_insn,

When re-optimizing from JMP8 back to LEA+CALL, does the CALL instruction at
offset 5 get rewritten without INT3 protection?

The int3_update() sequence writes INT3 at offset 0 during unoptimize
(!optimize path), then later writes bytes 1..9 while verifying offset 0
stays INT3. During re-optimization (optimize == true), the code skips the
initial INT3 write and directly writes bytes 1..9.

Consider this sequence when re-optimizing a previously unoptimized uprobe
that is currently 'jmp rel8' back to 'lea; call':

  Step 7: Current state is 0xCC 0x08 ... (INT3 followed by JMP8 offset)
  Step 8: Write bytes 1..9, transforming to 0xCC 0x8d 0x64 0x24 0x80 0xE8 ...
          The CALL opcode (0xE8) at offset 5 and its 4-byte displacement
          get written without an INT3 protecting that location.

A concurrent thread executing at vaddr could observe a partially-written CALL
instruction during the transition. Peter Zijlstra suggested placing INT3 at
both offset 0 and offset 5 to protect both the LEA and CALL instructions
during modification:

  Reference: https://lore.kernel.org/bpf/20260518104306.GU3102624@noisy.programming.kicks-ass.net/

  "Note that from step 7 to step 8, you re-write the CALL instruction
   without going through INT3. This means it is entirely possible for a
   concurrent execution to observe a composite instruction. This is NOT
   sound! However, I think it can be salvaged, if instead of only writing
   INT3 at +0, you also write INT3 at +5."

[ ... ]

> @@ -1095,14 +1122,25 @@ int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
>  		  unsigned long vaddr)
>  {
>  	if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
> -		int ret = is_optimized(vma->vm_mm, vaddr);
> -		if (ret < 0)
> +		uprobe_opcode_t insn[OPT_INSN_SIZE];
> +		int ret;
> +
> +		ret = copy_from_vaddr(vma->vm_mm, vaddr, &insn, OPT_INSN_SIZE);
> +		if (ret)
>  			return ret;
> -		if (ret) {
> +		if (__is_optimized(vma->vm_mm, (uprobe_opcode_t *)&insn, vaddr)) {
>  			ret = swbp_unoptimize(auprobe, vma, vaddr);
>  			WARN_ON_ONCE(ret);
>  			return ret;
>  		}
> +		/*
> +		 * We can have re-attached probe on top of jmp8 instruction,
> +		 * which did not get optimized. We need to restore the jmp8
> +		 * instruction, instead of the original instruction (nop10).
> +		 */
> +		if (is_swbp_insn(&insn[0]) && insn[1] == OPT_JMP8_OFFSET)
> +			return uprobe_write_opcode(auprobe, vma, vaddr, JMP8_INSN_OPCODE,

This isn't a bug, but the multi-line if statement is missing braces:

		if (is_swbp_insn(&insn[0]) && insn[1] == OPT_JMP8_OFFSET)
			return uprobe_write_opcode(auprobe, vma, vaddr, JMP8_INSN_OPCODE,
						   false /* is_register */);

Kernel coding style requires braces on any multi-line statement.

  Reference: https://lore.kernel.org/bpf/20260518104306.GU3102624@noisy.programming.kicks-ass.net/

[ ... ]

> @@ -1160,14 +1198,24 @@ void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
>
>  static bool can_optimize(struct insn *insn, unsigned long vaddr)
>  {
> -	if (!insn->x86_64 || insn->length != 5)
> +	if (!insn->x86_64)
>  		return false;
>
> -	if (!insn_is_nop(insn))
> +	/* We can't do cross page atomic writes yet. */
> +	if (PAGE_SIZE - (vaddr & ~PAGE_MASK) < OPT_INSN_SIZE)
>  		return false;
>
> -	/* We can't do cross page atomic writes yet. */
> -	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
> +	/* We can optimize on top of nop10.. */
> +	if (insn->length == OPT_INSN_SIZE && insn_is_nop(insn))
> +		return true;
> +
> +	/* .. and JMP rel8 to end of slot — check swbp_unoptimize. */
> +	if (insn->length == 2 &&
> +	    insn->opcode.bytes[0] == JMP8_INSN_OPCODE &&
> +	    insn->immediate.value == OPT_JMP8_OFFSET)
> +		return true;
> +
> +	return false;
>  }

Can can_optimize() safely accept naturally occurring 'jmp +8' instructions
from user binaries?

The JMP8 branch returns true for any 2-byte 'jmp rel8' with displacement 8
(bytes 0xEB 0x08). The comment says this is for swbp_unoptimize()'s
kernel-placed jmp, but can_optimize() is called during initial analysis via
prepare_uprobe() -> arch_uprobe_analyze_insn() -> uprobe_init_insn(), before
any kernel modifications occur. The instruction bytes come directly from the
user binary through copy_insn().

If a user binary naturally contains 'EB 08' at the uprobe location (e.g., a
forward branch over an 8-byte literal pool or alignment padding), this path
would trigger:

  1. uprobe_init_insn() decodes the user's 'jmp 0x08' (length=2, opcode=0xEB,
     immediate=8).
  2. can_optimize() matches the new JMP8 branch and returns true.
  3. ARCH_UPROBE_FLAG_CAN_OPTIMIZE is set.
  4. set_swbp() writes INT3 at vaddr, replacing the 0xEB.
  5. When the breakpoint fires, swbp_optimize() -> int3_update() writes 10
     bytes (lea -0x80(%rsp),%rsp; call tramp) over vaddr..vaddr+9.
  6. verify_insn() only checks that byte 0 is INT3 (EXPECT_SWBP) before
     writing bytes 1..9. It does not verify those 8 bytes are from a prior
     optimization.
  7. The 8 bytes at vaddr+2..vaddr+9 (original user code/data following the
     jmp) get overwritten with the tail of the LEA and the CALL encoding.
  8. On unregister, swbp_unoptimize() only writes 2 bytes back. Bytes
     vaddr+2..vaddr+9 stay corrupted with call instruction remnants in the
     COW'd page.

The corruption persists even though the original jmp would skip those bytes,
because other control flow (indirect branches, function pointers, alternative
entry points) might reach that range.

Contrast with nop10: when can_optimize returns true for a 10-byte nop
(length==OPT_INSN_SIZE && insn_is_nop), writing 10 bytes only overwrites the
nop itself. The earlier nop5 code was safe because it required insn->length
== 5 and wrote exactly 5 bytes (write size matched instruction size).

The JMP8 case breaks that invariant: write size (10 bytes) exceeds
instruction size (2 bytes), so the 8 trailing bytes are arbitrary user
content. Would a length check (requiring insn->length == OPT_INSN_SIZE for
the JMP8 case) or tracking optimization state via a flag prevent optimizing
user instructions that happen to match the JMP8 pattern?


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26030080109

^ permalink raw reply

* Re: [PATCH 6/9] rv: Ensure synchronous cleanup for HA monitors
From: Gabriele Monaco @ 2026-05-18 11:54 UTC (permalink / raw)
  To: Wen Yang; +Cc: linux-kernel, Steven Rostedt, Nam Cao, linux-trace-kernel
In-Reply-To: <b7d760dc-162a-48e8-af75-e566bfa46493@linux.dev>

On Sun, 2026-05-17 at 17:12 +0800, Wen Yang wrote:
> The guard(rcu)() + synchronize_rcu() mechanism for HA timer callbacks
> is correct.
> 
> One concern: TOCTOU between the pre-check and guard(rcu)().

Yes, this could happen, but I'm not sure it's really a big issue:

> 
> da_monitor_reset() calls reset_hook BEFORE clearing monitoring:
> 
>    da_monitor_reset_hook(da_mon);        /* ha_cancel_timer [async]   */
>    WRITE_ONCE(da_mon->monitoring, 0);    /* cleared AFTER reset_hook  */
>    da_mon->curr_state = model_get_initial_state();
> 
> This may creates a window where the callback pre-check passes but the
> monitor is reset before guard(rcu)() is acquired:

If a callback is running, there was a violation because the timer expired, so it
isn't wrong to report, although we are unloading the monitor.

> 
>    /* __ha_monitor_timer_callback() */
>    if (unlikely(!da_monitor_handling_event(&ha_mon->da_mon)))
>        return;
> 
>    /* passes: monitoring=1
>     *
>     * WINDOW ─ CPU A runs da_monitor_reset_all() here:
>     *   ha_cancel_timer()  [returns: callback is running, cannot cancel]
>     *   WRITE_ONCE(monitoring, 0)
>     *   curr_state = model_get_initial_state()
>     */
>    guard(rcu)();
>    curr_state = READ_ONCE(ha_mon->da_mon.curr_state);  /* initial_state */
>    /* no second da_monitoring() check */
>    ha_react(curr_state, EVENT_NONE, env_string.buffer); /* spurious call */
>    ha_trace_error_env(ha_mon, ...);                     /* fires 
> unconditionally */
> 
> Result: spurious ha_trace_error_env() for initial_state.  For existing
> monitors (stall/nomiss/opid), model_should_send_event_env(initial, NONE)
> returns false, so no false-positive reaction, but the trace event fires.
> Monitors where initial_state carries a constraint would produce a false
> positive.

I'm not sure what you mean here, if I understand the situation correctly: the
callback is running (so we should react), da_monitor_reset() is too late to stop
it but somehow manages to reset curr_state on time for the callback to see it
change: react reports the wrong state in an otherwise valid reaction.

> 
> Proposed fix : re-check inside the RCU critical section:
> 
>    guard(rcu)();
>    if (unlikely(!da_monitoring(&ha_mon->da_mon)))  /* re-check here */
>        return;
>    curr_state = READ_ONCE(ha_mon->da_mon.curr_state);

I'm not sure that's going to fix it anyway, RCU cannot synchronise readers,
checking again would at most (mildly) reduce the race window, not remove it.

What we could do is to play with barriersin for the callback to either:
* see monitoring = 1 AND the old curr_state
* see monitoring = 0 AND the new curr_state

Something like:

void __ha_monitor_timer_callback() {
	guard(rcu)(); //this is only for waiters, let them wait more

	if (unlikely(!da_monitor_handling_event(&ha_mon->da_mon)))
		return;
	smp_rmb();
	curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
	...
}

void da_monitor_reset() {
	da_monitor_reset_hook(da_mon);
	WRITE_ONCE(da_mon->monitoring, 0);
	smp_wmb();
	WRITE_ONCE(da_mon->curr_state, model_get_initial_state());
}

Coupled with your patch [1] adding more atomic accesses to da_mon->monitoring
should probably do the trick.

Am I missing anything?

Thanks,
Gabriele

[1] -
https://lore.kernel.org/lkml/8af5ba4bd93d2acb8a546e8e47ced974a87c1eb8.1778522945.git.wen.yang@linux.dev

> 
> 
> --
> Best wishes,
> Wen
> 
> 
> On 5/12/26 22:02, Gabriele Monaco wrote:
> > HA monitors may start timers, all cleanup functions currently stop the
> > timers asynchronously to avoid sleeping in the wrong context.
> > Nothing makes sure running callbacks terminate on cleanup.
> > 
> > Run the entire HA timer callback in an RCU read-side critical section,
> > this way we can simply synchronize_rcu() with any pending timer and are
> > sure any cleanup using kfree_rcu() runs after callbacks terminated.
> > Additionally make sure any unlikely callback running late won't run any
> > code if the monitor is marked as disabled.
> > 
> > Fixes: f5587d1b6ec9 ("rv: Add Hybrid Automata monitor type")
> > Fixes: 4a24127bd6cb ("rv: Add support for per-object monitors in DA/HA")
> > Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> > ---
> >   include/rv/da_monitor.h | 23 +++++++++++++++++++----
> >   include/rv/ha_monitor.h | 18 ++++++++++++++++--
> >   2 files changed, 35 insertions(+), 6 deletions(-)
> > 
> > diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
> > index a4a13b62d1a4..402d3b935c08 100644
> > --- a/include/rv/da_monitor.h
> > +++ b/include/rv/da_monitor.h
> > @@ -57,6 +57,15 @@ static struct rv_monitor rv_this;
> >   #define da_monitor_reset_hook(da_mon)
> >   #endif
> >   
> > +/*
> > + * Hook to allow the implementation of hybrid automata: define it with a
> > + * function that waits for the termination of all monitors background
> > + * activities (e.g. all timers). This hook can sleep.
> > + */
> > +#ifndef da_monitor_sync_hook
> > +#define da_monitor_sync_hook()
> > +#endif
> > +
> >   /*
> >    * Type for the target id, default to int but can be overridden.
> >    * A long type can work as hash table key (PER_OBJ) but will be downgraded
> > to
> > @@ -179,6 +188,7 @@ static inline int da_monitor_init(void)
> >   static inline void da_monitor_destroy(void)
> >   {
> >   	da_monitor_reset_all();
> > +	da_monitor_sync_hook();
> >   }
> >   
> >   #ifndef da_implicit_guard
> > @@ -232,6 +242,7 @@ static inline int da_monitor_init(void)
> >   static inline void da_monitor_destroy(void)
> >   {
> >   	da_monitor_reset_all();
> > +	da_monitor_sync_hook();
> >   }
> >   
> >   #ifndef da_implicit_guard
> > @@ -319,6 +330,7 @@ static inline void da_monitor_destroy(void)
> >   	}
> >   
> >   	da_monitor_reset_all();
> > +	da_monitor_sync_hook();
> >   
> >   	rv_put_task_monitor_slot(task_mon_slot);
> >   	task_mon_slot = RV_PER_TASK_MONITOR_INIT;
> > @@ -497,10 +509,9 @@ static void da_monitor_reset_all(void)
> >   	struct da_monitor_storage *mon_storage;
> >   	int bkt;
> >   
> > -	rcu_read_lock();
> > +	guard(rcu)();
> >   	hash_for_each_rcu(da_monitor_ht, bkt, mon_storage, node)
> >   		da_monitor_reset(&mon_storage->rv.da_mon);
> > -	rcu_read_unlock();
> >   }
> >   
> >   static inline int da_monitor_init(void)
> > @@ -516,13 +527,17 @@ static inline void da_monitor_destroy(void)
> >   	int bkt;
> >   
> >   	tracepoint_synchronize_unregister();
> > +	scoped_guard(rcu) {
> > +		hash_for_each_rcu(da_monitor_ht, bkt, mon_storage, node) {
> > +			da_monitor_reset_hook(&mon_storage->rv.da_mon);
> > +		}
> > +	}
> > +	da_monitor_sync_hook();
> >   	/*
> >   	 * This function is called after all probes are disabled and no
> > longer
> >   	 * pending, we can safely assume no concurrent user.
> >   	 */
> > -	synchronize_rcu();
> >   	hash_for_each_safe(da_monitor_ht, bkt, tmp, mon_storage, node) {
> > -		da_monitor_reset_hook(&mon_storage->rv.da_mon);
> >   		hash_del_rcu(&mon_storage->node);
> >   		kfree(mon_storage);
> >   	}
> > diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
> > index d59507e8cb30..47ff1a41febe 100644
> > --- a/include/rv/ha_monitor.h
> > +++ b/include/rv/ha_monitor.h
> > @@ -36,6 +36,7 @@ static bool ha_monitor_handle_constraint(struct da_monitor
> > *da_mon,
> >   #define da_monitor_event_hook ha_monitor_handle_constraint
> >   #define da_monitor_init_hook ha_monitor_init_env
> >   #define da_monitor_reset_hook ha_monitor_reset_env
> > +#define da_monitor_sync_hook() synchronize_rcu()
> >   
> >   #include <rv/da_monitor.h>
> >   #include <linux/seq_buf.h>
> > @@ -237,12 +238,25 @@ static bool ha_monitor_handle_constraint(struct
> > da_monitor *da_mon,
> >   	return false;
> >   }
> >   
> > +/*
> > + * __ha_monitor_timer_callback - generic callback representation
> > + *
> > + * This callback runs in an RCU read-side critical section to allow the
> > + * destruction sequence to easily synchronize_rcu() with all pending timer
> > + * after asynchronously disabling them.
> > + */
> >   static inline void __ha_monitor_timer_callback(struct ha_monitor *ha_mon)
> >   {
> > -	enum states curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
> >   	DECLARE_SEQ_BUF(env_string, ENV_BUFFER_SIZE);
> > -	u64 time_ns = ha_get_ns();
> > +	enum states curr_state;
> > +	u64 time_ns;
> > +
> > +	if (unlikely(!da_monitor_handling_event(&ha_mon->da_mon)))
> > +		return;
> >   
> > +	guard(rcu)();
> > +	curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
> > +	time_ns = ha_get_ns();
> >   	ha_get_env_string(&env_string, ha_mon, time_ns);
> >   	ha_react(curr_state, EVENT_NONE, env_string.buffer);
> >   	ha_trace_error_env(ha_mon, model_get_state_name(curr_state),


^ permalink raw reply

* Re: [PATCH mm-unstable v17 02/14] mm/khugepaged: generalize alloc_charge_folio()
From: Usama Arif @ 2026-05-18 11:55 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260511185817.686831-3-npache@redhat.com>

On Mon, 11 May 2026 12:58:02 -0600 Nico Pache <npache@redhat.com> wrote:

> From: Dev Jain <dev.jain@arm.com>
> 
> Pass order to alloc_charge_folio() and update mTHP statistics.
> 
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Acked-by: Usama Arif <usama.arif@linux.dev>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Co-developed-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst |  8 ++++++++
>  include/linux/huge_mm.h                    |  2 ++
>  mm/huge_memory.c                           |  4 ++++
>  mm/khugepaged.c                            | 17 +++++++++++------
>  4 files changed, 25 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 5fbc3d89bb07..c51932e6275d 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -639,6 +639,14 @@ anon_fault_fallback_charge
>  	instead falls back to using huge pages with lower orders or
>  	small pages even though the allocation was successful.
>  
> +collapse_alloc
> +	is incremented every time a huge page is successfully allocated for a
> +	khugepaged collapse.
> +
> +collapse_alloc_failed
> +	is incremented every time a huge page allocation fails during a
> +	khugepaged collapse.
> +
>  zswpout
>  	is incremented every time a huge page is swapped out to zswap in one
>  	piece without splitting.
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 2949e5acff35..ba7ae6808544 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -128,6 +128,8 @@ enum mthp_stat_item {
>  	MTHP_STAT_ANON_FAULT_ALLOC,
>  	MTHP_STAT_ANON_FAULT_FALLBACK,
>  	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> +	MTHP_STAT_COLLAPSE_ALLOC,
> +	MTHP_STAT_COLLAPSE_ALLOC_FAILED,
>  	MTHP_STAT_ZSWPOUT,
>  	MTHP_STAT_SWPIN,
>  	MTHP_STAT_SWPIN_FALLBACK,
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index e9d499da0ac7..05f482a72a89 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -699,6 +699,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
>  DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +DEFINE_MTHP_STAT_ATTR(collapse_alloc, MTHP_STAT_COLLAPSE_ALLOC);
> +DEFINE_MTHP_STAT_ATTR(collapse_alloc_failed, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
>  DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
>  DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
>  DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
> @@ -764,6 +766,8 @@ static struct attribute *any_stats_attrs[] = {
>  #endif
>  	&split_attr.attr,
>  	&split_failed_attr.attr,
> +	&collapse_alloc_attr.attr,
> +	&collapse_alloc_failed_attr.attr,
>  	NULL,
>  };
>  
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 979885694351..f0e29d5c7b1f 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1068,21 +1068,26 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
>  }
>  
>  static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> -		struct collapse_control *cc)
> +		struct collapse_control *cc, unsigned int order)
>  {
>  	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
>  		     GFP_TRANSHUGE);
>  	int node = collapse_find_target_node(cc);
>  	struct folio *folio;
>  
> -	folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
> +	folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
>  	if (!folio) {
>  		*foliop = NULL;
> -		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> +		if (is_pmd_order(order))
> +			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> +		count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
>  		return SCAN_ALLOC_HUGE_PAGE_FAIL;
>  	}
>  
> -	count_vm_event(THP_COLLAPSE_ALLOC);
> +	if (is_pmd_order(order))
> +		count_vm_event(THP_COLLAPSE_ALLOC);
> +	count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC);
> +

The vmstat THP_COLLAPSE_ALLOC counter is pmd order only.
But after this we have

	count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);

which is not being guarded with is_pmd_order().

I think we want this to be pmd order only as well so that
the meaning of the vmstat and cgroup counter remains the same?


>  	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
>  		folio_put(folio);
>  		*foliop = NULL;
> @@ -1118,7 +1123,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	 */
>  	mmap_read_unlock(mm);
>  
> -	result = alloc_charge_folio(&folio, mm, cc);
> +	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
>  	if (result != SCAN_SUCCEED)
>  		goto out_nolock;
>  
> @@ -1899,7 +1904,7 @@ static enum scan_result collapse_file(struct mm_struct *mm, unsigned long addr,
>  	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
>  	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
>  
> -	result = alloc_charge_folio(&new_folio, mm, cc);
> +	result = alloc_charge_folio(&new_folio, mm, cc, HPAGE_PMD_ORDER);
>  	if (result != SCAN_SUCCEED)
>  		goto out;
>  
> -- 
> 2.54.0
> 
> 

^ permalink raw reply

* Re: [PATCH 8/9] rv: Add automatic cleanup handlers for per-task HA monitors
From: Gabriele Monaco @ 2026-05-18 12:18 UTC (permalink / raw)
  To: Wen Yang; +Cc: linux-kernel, Steven Rostedt, Nam Cao, linux-trace-kernel
In-Reply-To: <843882d4-3cf1-403f-8d49-172c8efb8201@linux.dev>

On Sun, 2026-05-17 at 17:40 +0800, Wen Yang wrote:
> ha_cancel_timer_sync() is the right choice for task exit; the
> ha_mon_initializing guard correctly handles the init-window race.
> 
> One issue: after ha_monitor_disable_hook(), an in-flight
> ha_handle_sched_process_exit() handler may still be executing.  It
> reads task_mon_slot via da_get_monitor() (&p->rv[task_mon_slot]);
> da_monitor_sync_hook() = synchronize_rcu() cannot drain it because
> tracepoint handlers run outside any RCU read-side section.  If
> rv_put_task_monitor_slot() writes RV_PER_TASK_MONITOR_INIT to
> task_mon_slot first, the handler dereferences an OOB index.
> 
> This is the same race Patch 5 closes for PER_OBJ with
> tracepoint_synchronize_unregister(); the PER_TASK da_monitor_destroy()
> needs the same call (and so does every other PER_TASK monitor, not only
> the new exit handler).
> 
> Could you add tracepoint_synchronize_unregister() to the PER_TASK
> da_monitor_destroy() ?  Alternatively, we can carry the fix on top of
> your series.

Yeah you're right, that's the neatest way to solve it.

Indeed, any in-flight handler would do da_get_monitor() not only the newly added
exit hook, so we're going to need tracepoint_synchronize_unregister() before
touching task_mon_slot in any per-task monitor (not only HA, also DA and even
LTL).

Feel free to send your patch and I'll apply it to this series, including also
ltl_monitor_destroy().

Thanks,
Gabriele

> 
> --
> Best wishes,
> Wen
> 
> 
> On 5/12/26 22:02, Gabriele Monaco wrote:
> > Hybrid automata monitors may start timers, depending on the model, these
> > may remain active on an exiting task and cause false positives or even
> > access freed memory.
> > 
> > Add an enable/disable hook in the HA code, currently only populated by
> > the per-task handler for registration and deregistration.
> > This hooks to the sched_process_exit event and ensures the timer is
> > stopped for every exiting task. The handler is enabled automatically but
> > may be disabled, for instance if the monitor uses the event for another
> > purpose (but should still manually ensure timers are stopped).
> > 
> > Fixes: f5587d1b6ec9 ("rv: Add Hybrid Automata monitor type")
> > Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> > ---
> >   include/rv/ha_monitor.h | 44 +++++++++++++++++++++++++++++++++++++++++
> >   1 file changed, 44 insertions(+)
> > 
> > diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
> > index 11ae85bad492..1bdf866e9c63 100644
> > --- a/include/rv/ha_monitor.h
> > +++ b/include/rv/ha_monitor.h
> > @@ -28,6 +28,7 @@ static inline void ha_monitor_init_env(struct da_monitor
> > *da_mon);
> >   static inline void ha_monitor_reset_env(struct da_monitor *da_mon);
> >   static inline void ha_setup_timer(struct ha_monitor *ha_mon);
> >   static inline bool ha_cancel_timer(struct ha_monitor *ha_mon);
> > +static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon);
> >   static bool ha_monitor_handle_constraint(struct da_monitor *da_mon,
> >   					 enum states curr_state,
> >   					 enum events event,
> > @@ -38,6 +39,26 @@ static bool ha_monitor_handle_constraint(struct
> > da_monitor *da_mon,
> >   #define da_monitor_reset_hook ha_monitor_reset_env
> >   #define da_monitor_sync_hook() synchronize_rcu()
> >   
> > +#if !defined(HA_SKIP_AUTO_CLEANUP) && RV_MON_TYPE == RV_MON_PER_TASK
> > +/*
> > + * Automatic cleanup handlers for per-task HA monitors, only skip if you
> > know
> > + * what you are doing (e.g. you want to implement cleanup manually in
> > another
> > + * handler doing more things).
> > + */
> > +static void ha_handle_sched_process_exit(void *data, struct task_struct *p,
> > +					 bool group_dead);
> > +
> > +#define
> > ha_monitor_enable_hook()                                             \
> > +	rv_attach_trace_probe(__stringify(MONITOR_NAME),
> > sched_process_exit, \
> > +			      ha_handle_sched_process_exit)
> > +#define
> > ha_monitor_disable_hook()                                            \
> > +	rv_detach_trace_probe(__stringify(MONITOR_NAME),
> > sched_process_exit, \
> > +			      ha_handle_sched_process_exit)
> > +#else
> > +#define ha_monitor_enable_hook()
> > +#define ha_monitor_disable_hook()
> > +#endif
> > +
> >   #include <rv/da_monitor.h>
> >   #include <linux/seq_buf.h>
> >   
> > @@ -124,12 +145,14 @@ static int ha_monitor_init(void)
> >   
> >   	ha_mon_initializing = true;
> >   	ret = da_monitor_init();
> > +	ha_monitor_enable_hook();
> >   	ha_mon_initializing = false;
> >   	return ret;
> >   }
> >   
> >   static void ha_monitor_destroy(void)
> >   {
> > +	ha_monitor_disable_hook();
> >   	da_monitor_destroy();
> >   }
> >   
> > @@ -230,6 +253,18 @@ static inline void ha_trace_error_env(struct ha_monitor
> > *ha_mon,
> >   {
> >   	CONCATENATE(trace_error_env_, MONITOR_NAME)(id, curr_state, event,
> > env);
> >   }
> > +
> > +#if !defined(HA_SKIP_AUTO_CLEANUP) && RV_MON_TYPE == RV_MON_PER_TASK
> > +static void ha_handle_sched_process_exit(void *data, struct task_struct *p,
> > +					 bool group_dead)
> > +{
> > +	struct da_monitor *da_mon = da_get_monitor(p);
> > +
> > +	if (likely(!ha_monitor_uninitialized(da_mon)))
> > +		ha_cancel_timer_sync(to_ha_monitor(da_mon));
> > +}
> > +#endif
> > +
> >   #endif /* RV_MON_TYPE */
> >   
> >   /*
> > @@ -455,6 +490,10 @@ static inline bool ha_cancel_timer(struct ha_monitor
> > *ha_mon)
> >   {
> >   	return timer_delete(&ha_mon->timer);
> >   }
> > +static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon)
> > +{
> > +	timer_delete_sync(&ha_mon->timer);
> > +}
> >   #elif HA_TIMER_TYPE == HA_TIMER_HRTIMER
> >   /*
> >    * Helper functions to handle the monitor timer.
> > @@ -506,6 +545,10 @@ static inline bool ha_cancel_timer(struct ha_monitor
> > *ha_mon)
> >   {
> >   	return hrtimer_try_to_cancel(&ha_mon->hrtimer) == 1;
> >   }
> > +static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon)
> > +{
> > +	hrtimer_cancel(&ha_mon->hrtimer);
> > +}
> >   #else /* HA_TIMER_NONE */
> >   /*
> >    * Start function is intentionally not defined, monitors using timers must
> > @@ -516,6 +559,7 @@ static inline bool ha_cancel_timer(struct ha_monitor
> > *ha_mon)
> >   {
> >   	return false;
> >   }
> > +static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon) { }
> >   #endif
> >   
> >   #endif


^ permalink raw reply

* Re: [PATCH mm-unstable v17 00/14] khugepaged: mTHP support
From: Wei Yang @ 2026-05-18 12:50 UTC (permalink / raw)
  To: Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260511185817.686831-1-npache@redhat.com>

On Mon, May 11, 2026 at 12:58:00PM -0600, Nico Pache wrote:
>The following series provides khugepaged with the capability to collapse
>anonymous memory regions to mTHPs.
>
>To achieve this we generalize the khugepaged functions to no longer depend
>on PMD_ORDER. Then during the PMD scan, we use a bitmap to track individual
>pages that are occupied (!none/zero). After the PMD scan is done, we use
>the bitmap to find the optimal mTHP sizes for the PMD range. The
>restriction on max_ptes_none is removed during the scan, to make sure we
>account for the whole PMD range in the bitmap. When no mTHP size is
>enabled, the legacy behavior of khugepaged is maintained.
>
>We currently only support max_ptes_none values of 0 or HPAGE_PMD_NR - 1
>(ie 511). If any other value is specified, the kernel will emit a warning
>and no mTHP collapse will be attempted. If a mTHP collapse is attempted,
>but contains swapped out, or shared pages, we don't perform the collapse.
>It is now also possible to collapse to mTHPs without requiring the PMD THP
>size to be enabled. These limitations are to prevent collapse "creep"
>behavior. This prevents constantly promoting mTHPs to the next available
>size, which would occur because a collapse introduces more non-zero pages
>that would satisfy the promotion condition on subsequent scans.
>
>Patch 1-2:   Generalize hugepage_vma_revalidate and alloc_charge_folio
>	     for arbitrary orders.
>Patch 3:     Rework max_ptes_* handling into helper functions
>Patch 4:     Generalize __collapse_huge_page_* for mTHP support
>Patch 5:     Require collapse_huge_page to enter/exit with the lock dropped
>Patch 6:     Generalize collapse_huge_page for mTHP collapse
>Patch 7:     Skip collapsing mTHP to smaller orders
>Patch 8-9:   Add per-order mTHP statistics and tracepoints
>Patch 10:    Introduce collapse_allowable_orders helper function
>Patch 11-13: Introduce bitmap and mTHP collapse support, fully enabled
>Patch 14:    Documentation
>
>Testing:
>- Built for x86_64, aarch64, ppc64le, and s390x
>- ran all arches on test suites provided by the kernel-tests project
>- internal testing suites: functional testing and performance testing
>- selftests mm
>- I created a test script that I used to push khugepaged to its limits
>   while monitoring a number of stats and tracepoints. The code is
>   available here[1] (Run in legacy mode for these changes and set mthp
>   sizes to inherit)
>   The summary from my testings was that there was no significant
>   regression noticed through this test. In some cases my changes had
>   better collapse latencies, and was able to scan more pages in the same
>   amount of time/work, but for the most part the results were consistent.
>- redis testing. I did some testing with these changes along with my defer
>  changes (see followup [2] post for more details). We've decided to get
>  the mTHP changes merged first before attempting the defer series.
>- some basic testing on 64k page size.
>- lots of general use.
>

Two links are missing. I got them from previous version.

[1] - https://gitlab.com/npache/khugepaged_mthp_test
[2] - https://lore.kernel.org/lkml/20250515033857.132535-1-npache@redhat.com/

And the test in [1] is a performance test. I am thinking whether we want a
functional test in selftests.

I did a quick try with following change and some hack.

@@ -744,6 +765,51 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o
 	ksft_test_result_report(exit_status, "%s\n", __func__);
 }
 
+static void collapse_mth_ptes(struct collapse_context *c, struct mem_ops *ops)
+{
+	struct thp_settings settings = *thp_current_settings();
+	void *p;
+	int i;
+
+	/* Disable mthp on fault */
+	for (i = 0; i < NR_ORDERS; i++) {
+		settings.hugepages[i].enabled = THP_NEVER;
+	}
+	thp_push_settings(&settings);
+
+	p = ops->setup_area(1);
+
+	ops->fault(p, 0, hpage_pmd_size);
+
+	/* Expect all order-0 folio after fault */
+	memset(expected_orders, 0, sizeof(int) * (pmd_order + 1));
+	expected_orders[0] = hpage_pmd_nr;
+	if (check_folio_orders(p, hpage_pmd_size, pagemap_fd,
+					   kpageflags_fd, expected_orders,
+					   (pmd_order + 1)))
+		ksft_exit_fail_msg("Unexpected huge page at fault\n");
+
+	/* Enable mthp before collapse */
+	thp_pop_settings();
+	settings.hugepages[2].enabled = THP_ALWAYS;
+	thp_push_settings(&settings);
+
+	c->collapse("Collapse fully populated PTE table with order 2", p, 1,
+		    ops, true);
+
+	/* Expect all order-2 folio after collapse */
+	memset(expected_orders, 0, sizeof(int) * (pmd_order + 1));
+	expected_orders[2] = 1 << (pmd_order - 2);
+	if (check_folio_orders(p, hpage_pmd_size, pagemap_fd,
+					   kpageflags_fd, expected_orders,
+					   (pmd_order + 1)))
+		ksft_exit_fail_msg("Unexpected page order\n");
+
+	ops->cleanup_area(p, hpage_pmd_size);
+	thp_pop_settings();
+	ksft_test_result_report(exit_status, "%s\n", __func__);
+}
+
 static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
 {
 	void *p;

This leverage check_after_split_folio_orders() in split_huge_page_test.c to
check folio order in PMD range.

-- 
Wei Yang
Help you, Help me

^ permalink raw reply

* Re: [PATCH mm-unstable v17 04/14] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: David Hildenbrand (Arm) @ 2026-05-18 13:16 UTC (permalink / raw)
  To: Wei Yang, Lance Yang
  Cc: npache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, liam, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260514031009.f66cgop3ctgiqxz3@master>

On 5/14/26 05:10, Wei Yang wrote:
> On Tue, May 12, 2026 at 03:42:02PM +0800, Lance Yang wrote:
>>
>> On Mon, May 11, 2026 at 12:58:04PM -0600, Nico Pache wrote:
>>> generalize the order of the __collapse_huge_page_* and collapse_max_*
>>> functions to support future mTHP collapse.
>>>
>>> The current mechanism for determining collapse with the
>>> khugepaged_max_ptes_none value is not designed with mTHP in mind. This
>>> raises a key design issue: if we support user defined max_pte_none values
>>> (even those scaled by order), a collapse of a lower order can introduces
>>> an feedback loop, or "creep", when max_ptes_none is set to a value greater
>>> than HPAGE_PMD_NR / 2. [1]
>>>
>>> With this configuration, a successful collapse to order N will populate
>>> enough pages to satisfy the collapse condition on order N+1 on the next
>>> scan. This leads to unnecessary work and memory churn.
>>>
>>> To fix this issue introduce a helper function that will limit mTHP
>>> collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
>>> This effectively supports two modes: [2]
>>>
>>> - max_ptes_none=0: never collapses if it encounters an empty PTE or a PTE
>>>  that maps the shared zeropage. Consequently, no memory bloat.
>>> - max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
>>>  available mTHP order.
>>>
>>> This removes the possiblilty of "creep", while not modifying any uAPI
>>> expectations. A warning will be emitted if any non-supported
>>> max_ptes_none value is configured with mTHP enabled.
>>>
>>> mTHP collapse will not honor the khugepaged_max_ptes_shared or
>>> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
>>> shared or swapped entry.
>>>
>>> No functional changes in this patch; however it defines future behavior
>>> for mTHP collapse.
>>>
>>> [1] - https://lore.kernel.org/all/e46ab3ab-a3d7-4fb7-9970-d0704bd5d05a@arm.com
>>> [2] - https://lore.kernel.org/all/37375ace-5601-4d6c-9dac-d1c8268698e9@redhat.com
>>>
>>> Co-developed-by: Dev Jain <dev.jain@arm.com>
>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>> Signed-off-by: Nico Pache <npache@redhat.com>
>>> ---
>>> include/trace/events/huge_memory.h |   3 +-
>>> mm/khugepaged.c                    | 117 ++++++++++++++++++++---------
>>> 2 files changed, 85 insertions(+), 35 deletions(-)
>>>
>>> diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
>>> index bcdc57eea270..443e0bd13fdb 100644
>>> --- a/include/trace/events/huge_memory.h
>>> +++ b/include/trace/events/huge_memory.h
>>> @@ -39,7 +39,8 @@
>>> 	EM( SCAN_STORE_FAILED,		"store_failed")			\
>>> 	EM( SCAN_COPY_MC,		"copy_poisoned_page")		\
>>> 	EM( SCAN_PAGE_FILLED,		"page_filled")			\
>>> -	EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")
>>> +	EM(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")	\
>>> +	EMe(SCAN_INVALID_PTES_NONE,	"invalid_ptes_none")
>>>
>>> #undef EM
>>> #undef EMe
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index f68853b3caa7..27465161fa6d 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -61,6 +61,7 @@ enum scan_result {
>>> 	SCAN_COPY_MC,
>>> 	SCAN_PAGE_FILLED,
>>> 	SCAN_PAGE_DIRTY_OR_WRITEBACK,
>>> +	SCAN_INVALID_PTES_NONE,
>>> };
>>>
>>> #define CREATE_TRACE_POINTS
>>> @@ -353,37 +354,60 @@ static bool pte_none_or_zero(pte_t pte)
>>>  * PTEs for the given collapse operation.
>>>  * @cc: The collapse control struct
>>>  * @vma: The vma to check for userfaultfd
>>> + * @order: The folio order being collapsed to
>>>  *
>>>  * Return: Maximum number of none-page or zero-page PTEs allowed for the
>>>  * collapse operation.
>>>  */
>>> -static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
>>> -		struct vm_area_struct *vma)
>>> +static int collapse_max_ptes_none(struct collapse_control *cc,
>>> +		struct vm_area_struct *vma, unsigned int order)
>>> {
>>> +	unsigned int max_ptes_none = khugepaged_max_ptes_none;
>>> 	// If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs.
>>
>> One thing I still want to call out: kernel code usually uses C-style
>> comments :)
>>
>>> 	if (vma && userfaultfd_armed(vma))
>>> 		return 0;
>>> 	// for MADV_COLLAPSE, allow any none-page or zero-page PTEs.
>>> 	if (!cc->is_khugepaged)
>>> 		return HPAGE_PMD_NR;
>>> -	// For all other cases repect the user defined maximum.
>>> -	return khugepaged_max_ptes_none;
>>> +	// for PMD collapse, respect the user defined maximum.
>>> +	if (is_pmd_order(order))
>>> +		return max_ptes_none;
>>> +	/* Zero/non-present collapse disabled. */
>>> +	if (!max_ptes_none)
>>> +		return 0;
>>> +	// for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
>>> +	// scale the maximum number of PTEs to the order of the collapse.
>>> +	if (max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
>>> +		return (1 << order) - 1;
>>> +
>>> +	// We currently only support max_ptes_none values of 0 or KHUGEPAGED_MAX_PTES_LIMIT.
>>> +	// Emit a warning and return -EINVAL.
>>> +	pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %u\n",
>>> +		      KHUGEPAGED_MAX_PTES_LIMIT);
>>
>> Maybe fallback to 0 instead, as David suggested earlier?
>>
> 
> It looks reasonable to fallback to 0.
> 
> But as the updated Document says in patch 14:
> 
>   For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) are supported. Any other
>   value will emit a warning and no mTHP collapse will be attempted.
> 
> This is why it does like this now.
> 
>     mthp_collapse()
>         max_ptes_none = collapse_max_ptes_none();
>         if (max_ptes_none < 0)
>             return collapsed;
> 
>> max_ptes_none is mostly legacy PMD THP behavior. mTHP is new, and any
>> intermediate value in (0, KHUGEPAGED_MAX_PTES_LIMIT) would implicitly
>> disable it :(
>>
> 
> So it depends on what we want to do here :-)
> 
> For me, I would vote for fallback to 0.

At this point I'll prefer to not return errors from collapse_max_ptes_none().
It's just rather awkward to return an error deep down in collapse code for a
configuration problem.

For mthp collapse, we only support max_ptes_none==0 and
max_ptes_none=="HPAGE_PMD_NR - 1" (default).

If another value is specified while collapsing mTHP, print a warning and treat
it as 0 (save value, no creep, no memory waste).

In a sense, this is similar to how we handle max_ptes_shared + max_ptes_swap:
for mTHP: we always treat them as being 0 for mTHP collapse (and don't issue a
warning, because we would issue a warning with the default settings).

@Lorenzo, fine with you?

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v6 0/2] blk-mq: introduce tag starvation observability
From: Jens Axboe @ 2026-05-18 13:31 UTC (permalink / raw)
  To: Aaron Tomlin, rostedt, mhiramat, mathieu.desnoyers
  Cc: bvanassche, johannes.thumshirn, kch, dlemoal, ritesh.list,
	loberman, neelx, sean, mproche, chjohnst, linux-block,
	linux-kernel, linux-trace-kernel
In-Reply-To: <20260517213614.350367-1-atomlin@atomlin.com>

On 5/17/26 3:36 PM, Aaron Tomlin wrote:
> Hi Jens, Steve, Masami,
> 
> In high-performance storage environments, particularly when utilising RAID
> controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe latency
> spikes can occur when fast devices are starved of available tags.
> Currently, diagnosing this specific queue contention requires deploying
> dynamic kprobes or inferring sleep states, which lacks a simple,
> out-of-the-box diagnostic path.
> 
> This short series introduces dedicated, low-overhead observability for tag
> exhaustion events in the block layer:
> 
>   - Patch 1 introduces the "block_rq_tag_wait" tracepoint in the tag
>     allocation slow-path to capture precise, event-based starvation.
> 
>   - Patch 2 complements this by exposing "wait_on_hw_tag" and
>     "wait_on_sched_tag" per-CPU counters via debugfs for quick,
>     point-in-time cumulative polling.
> 
> Together, these provide storage engineers with zero-configuration
> mechanisms to definitively identify shared-tag bottlenecks.

Why not just issue the trace points? Then there's close to zero
overhead, rather than needing to need added counters for this, and the
kernel to keep track. If you just issue the get/put tag kind of traces,
then userspace can keep track. That's what blktrace has done for decades
for things like inflight/queue depth accounting.

IOW, seems to me, this could be done with basically zero kernel
additions outside of perhaps a trace point or two.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH v2 1/2] serial: qcom-geni: trace: Add tracepoint support for Qualcomm GENI serial
From: Konrad Dybcio @ 2026-05-18 13:40 UTC (permalink / raw)
  To: Praveen Talari, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Greg Kroah-Hartman, Jiri Slaby
  Cc: linux-kernel, linux-trace-kernel, linux-arm-msm, linux-serial,
	Mukesh Kumar Savaliya, Aniket Randive, chandana.chiluveru,
	jyothi.seerapu
In-Reply-To: <20260512-add-tracepoints-for-qcom-geni-serial-v2-1-a5726421b3af@oss.qualcomm.com>

On 5/12/26 7:14 PM, Praveen Talari wrote:
> Add tracepoint support to the Qualcomm GENI serial driver to provide
> runtime visibility into driver behavior without requiring invasive debug
> patches.
> 
> The trace events cover UART termios configuration, clock setup, modem
> control state, interrupt status, and TX/RX data, making it easier to
> diagnose communication issues in the field.
> 
> Signed-off-by: Praveen Talari <praveen.talari@oss.qualcomm.com>
> ---

[...]

> +DEFINE_EVENT(geni_serial_data, geni_serial_tx_data,
> +
> +	TP_PROTO(struct device *dev, const u8 *buf, unsigned int len),
> +
> +	TP_ARGS(dev, buf, len)
> +
> +);
> +
> +DEFINE_EVENT(geni_serial_data, geni_serial_rx_data,
> +
> +	TP_PROTO(struct device *dev, const u8 *buf, unsigned int len),
> +
> +	TP_ARGS(dev, buf, len)
> +
> +);

stray \ns above

otherwise lgtm

Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>

Konrad

^ permalink raw reply

* Re: [PATCH v2 2/2] serial: qcom-geni: Add tracepoints for Qualcomm GENI serial driver
From: Konrad Dybcio @ 2026-05-18 13:41 UTC (permalink / raw)
  To: Praveen Talari, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Greg Kroah-Hartman, Jiri Slaby
  Cc: linux-kernel, linux-trace-kernel, linux-arm-msm, linux-serial,
	Mukesh Kumar Savaliya, Aniket Randive, chandana.chiluveru,
	jyothi.seerapu
In-Reply-To: <20260512-add-tracepoints-for-qcom-geni-serial-v2-2-a5726421b3af@oss.qualcomm.com>

On 5/12/26 7:14 PM, Praveen Talari wrote:
> Add tracing to the Qualcomm GENI serial driver to improve runtime
> observability.
> 
> Trace hooks are added at key points including termios and clock
> configuration, manual control get/set, interrupt handling, and data
> TX/RX paths.
> 
> Usage examples:
> 
> Enable all serial traces:
>   echo 1 > /sys/kernel/debug/tracing/events/qcom_geni_serial/enable
>   cat /sys/kernel/debug/tracing/trace_pipe
> 
> Example trace output:
> 
> 2517.938432: geni_serial_clk_cfg: a94000.serial: desired_rate=1843200
>      clk_rate=7372800 clk_div=4 clk_idx=0
> 2517.938753: geni_serial_irq: a94000.serial: m_irq=0x88800000
>      s_irq=0x08000111 dma_tx=0x00000000 dma_rx=0x00000000
> 2517.938803: geni_serial_set_termios: a94000.serial: baud=115200 bpc=8
>      tx_trans=0x00000002 tx_par=0x00000000 rx_trans=0x00000000
> rx_par=0x00000000 stop=0
> 2517.938807: geni_serial_set_mctrl: a94000.serial: mctrl=0x8006
>      uart_manual_rfr=0x00000000
> 2517.938818: geni_serial_get_mctrl: a94000.serial: mctrl=0x0160
>      geni_ios=0x00000001
> 2517.939165: geni_serial_irq: a94000.serial: m_irq=0x00400000
>      s_irq=0x00000000 dma_tx=0x00000000 dma_rx=0x00000000
> 2517.939592: geni_serial_tx_data: a94000.serial: tx_len=8 data=61 62 63
>      64 65 66 67 68
> 2517.940610: geni_serial_irq: a94000.serial: m_irq=0x00000001
>      s_irq=0x00000000 dma_tx=0x00000003 dma_rx=0x00000000
> 2517.942174: geni_serial_irq: a94000.serial: m_irq=0x08000000
>      s_irq=0x08000100 dma_tx=0x00000000 dma_rx=0x00000003
> 2517.942323: geni_serial_rx_data: a94000.serial: rx_len=8 data=61 62 63
>      64 65 66 67 68
> 2517.942680: geni_serial_set_mctrl: a94000.serial: mctrl=0x8000
>      uart_manual_rfr=0x80000002

I think the example (or at least the data that it produces) could go
under the --- line, there's plenty of docs regarding tracing on
docs.kernel.org

Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>

Konrad

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox