BPF List
 help / color / mirror / Atom feed
From: Jiri Olsa <jolsa@kernel.org>
To: Oleg Nesterov <oleg@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Andrii Nakryiko <andrii@kernel.org>
Cc: bpf@vger.kernel.org, Song Liu <songliubraving@fb.com>,
	Yonghong Song <yhs@fb.com>,
	John Fastabend <john.fastabend@gmail.com>,
	Hao Luo <haoluo@google.com>, Steven Rostedt <rostedt@goodmis.org>,
	Masami Hiramatsu <mhiramat@kernel.org>,
	Alan Maguire <alan.maguire@oracle.com>,
	linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org
Subject: [PATCH bpf-next 08/13] uprobes/x86: Add support to optimize uprobes
Date: Wed, 11 Dec 2024 14:33:57 +0100	[thread overview]
Message-ID: <20241211133403.208920-9-jolsa@kernel.org> (raw)
In-Reply-To: <20241211133403.208920-1-jolsa@kernel.org>

Put together all the previously added pieces to support optimized
uprobes on top of the 5-byte nop instruction.

The current uprobe execution goes through the following steps:
  - installs breakpoint instruction over original instruction
  - exception handler is hit and calls related uprobe consumers
  - and either simulates original instruction or does out of line single step
    execution of it
  - returns to user space

The optimized uprobe path does the following:

  - checks the original instruction is 5-byte nop (plus other checks)
  - adds (or uses existing) user space trampoline and overwrites original
    instruction (5-byte nop) with call to user space trampoline
  - the user space trampoline executes uprobe syscall that calls related uprobe
    consumers
  - trampoline returns back to next instruction

This approach won't speed up all uprobes as it's limited to using nop5 as
original instruction, but we could use nop5 as USDT probe instruction (which
uses single byte nop ATM) and speed up the USDT probes.

This patch overloads related arch functions in uprobe_write_opcode and
set_orig_insn so they can install call instruction if needed.

The arch_uprobe_optimize triggers the uprobe optimization and is called after
the first uprobe hit. I originally had it called on uprobe installation but then
it clashed with the ELF loader, because the user space trampoline was added in a
place where the loader might need to put ELF segments, so I decided to do it after
the first uprobe hit, when loading is done.

We do not unmap and release the uprobe trampoline when it's no longer needed,
because there's no easy way to make sure none of the threads is still
inside the trampoline. But we do not waste memory, because there's just
a single page for all the uprobe trampoline mappings.

We do waste a frame on page mapping for every 4GB by keeping the uprobe
trampoline page mapped, but that seems ok.

Attaching the speed up from benchs/run_bench_uprobes.sh script:

current:

     uprobe-nop     :    3.281 ± 0.003M/s
     uprobe-push    :    3.085 ± 0.003M/s
     uprobe-ret     :    1.130 ± 0.000M/s
 --> uprobe-nop5    :    3.276 ± 0.007M/s
     uretprobe-nop  :    1.716 ± 0.016M/s
     uretprobe-push :    1.651 ± 0.017M/s
     uretprobe-ret  :    0.846 ± 0.006M/s
 --> uretprobe-nop5 :    3.279 ± 0.002M/s

after the change:

     uprobe-nop     :    3.246 ± 0.004M/s
     uprobe-push    :    3.057 ± 0.000M/s
     uprobe-ret     :    1.113 ± 0.003M/s
 --> uprobe-nop5    :    6.751 ± 0.037M/s
     uretprobe-nop  :    1.740 ± 0.015M/s
     uretprobe-push :    1.677 ± 0.018M/s
     uretprobe-ret  :    0.852 ± 0.005M/s
 --> uretprobe-nop5 :    6.769 ± 0.040M/s

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/include/asm/uprobes.h |   7 ++
 arch/x86/kernel/uprobes.c      | 168 ++++++++++++++++++++++++++++++++-
 include/linux/uprobes.h        |   1 +
 kernel/events/uprobes.c        |   8 ++
 4 files changed, 181 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 678fb546f0a7..84a75ed748f0 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
 #define UPROBE_SWBP_INSN		0xcc
 #define UPROBE_SWBP_INSN_SIZE		   1
 
+enum {
+	ARCH_UPROBE_FLAG_CAN_OPTIMIZE	= 0,
+	ARCH_UPROBE_FLAG_OPTIMIZED	= 1,
+};
+
 struct uprobe_xol_ops;
 
 struct arch_uprobe {
@@ -45,6 +50,8 @@ struct arch_uprobe {
 			u8	ilen;
 		}			push;
 	};
+
+	unsigned long flags;
 };
 
 struct arch_uprobe_task {
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index cdea97f8cd39..b2420eeee23a 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -18,6 +18,7 @@
 #include <asm/processor.h>
 #include <asm/insn.h>
 #include <asm/mmu_context.h>
+#include <asm/nops.h>
 
 /* Post-execution fixups. */
 
@@ -914,8 +915,37 @@ static int is_nop5_insn(uprobe_opcode_t *insn)
 	return !memcmp(insn, x86_nops[5], 5);
 }
 
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+	return *insn == CALL_INSN_OPCODE;
+}
+
+static void relative_insn(void *dest, void *from, void *to, u8 op)
+{
+	struct __arch_relative_insn {
+		u8 op;
+		s32 raddr;
+	} __packed *insn;
+
+	insn = (struct __arch_relative_insn *)dest;
+	insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+	insn->op = op;
+}
+
+static void relative_call(void *dest, void *from, void *to)
+{
+	relative_insn(dest, from, to, CALL_INSN_OPCODE);
+}
+
+static bool can_optimize_vaddr(unsigned long vaddr)
+{
+	/* We can't do cross page atomic writes yet. */
+	return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
+}
+
 /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
-static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn,
+				unsigned long vaddr)
 {
 	u8 opc1 = OPCODE1(insn);
 	insn_byte_t p;
@@ -933,8 +963,11 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 		break;
 
 	case 0x0f:
-		if (is_nop5_insn((uprobe_opcode_t *) &auprobe->insn))
+		if (is_nop5_insn((uprobe_opcode_t *) &auprobe->insn)) {
+			if (can_optimize_vaddr(vaddr))
+				set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
 			goto setup;
+		}
 		if (insn->opcode.nbytes != 2)
 			return -ENOSYS;
 		/*
@@ -1065,7 +1098,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
 	if (ret)
 		return ret;
 
-	ret = branch_setup_xol_ops(auprobe, &insn);
+	ret = branch_setup_xol_ops(auprobe, &insn, addr);
 	if (ret != -ENOSYS)
 		return ret;
 
@@ -1306,3 +1339,132 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
 	else
 		return regs->sp <= ret->stack;
 }
+
+int arch_uprobe_verify_opcode(struct arch_uprobe *auprobe, struct page *page,
+			      unsigned long vaddr, uprobe_opcode_t *new_opcode,
+			      int nbytes)
+{
+	uprobe_opcode_t old_opcode[5];
+	bool is_call, is_swbp, is_nop5;
+
+	if (!test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags))
+		return uprobe_verify_opcode(page, vaddr, new_opcode);
+
+	/*
+	 * The ARCH_UPROBE_FLAG_CAN_OPTIMIZE flag guarantees the following
+	 * 5 bytes read won't cross the page boundary.
+	 */
+	uprobe_copy_from_page(page, vaddr, (uprobe_opcode_t *) &old_opcode, 5);
+	is_call = is_call_insn((uprobe_opcode_t *) &old_opcode);
+	is_swbp = is_swbp_insn((uprobe_opcode_t *) &old_opcode);
+	is_nop5 = is_nop5_insn((uprobe_opcode_t *) &old_opcode);
+
+	/*
+	 * We allow following transitions for optimized uprobes:
+	 *
+	 *   nop5 -> swbp -> call
+	 *   ||      |       |
+	 *   |'--<---'       |
+	 *   '---<-----------'
+	 *
+	 * We return 1 to ack the write, 0 to do nothing, -1 to fail write.
+	 *
+	 * If the current opcode (old_opcode) has already desired value,
+	 * we do nothing, because we are racing with another thread doing
+	 * the update.
+	 */
+	switch (nbytes) {
+	case 5:
+		if (is_call_insn(new_opcode)) {
+			if (is_swbp)
+				return 1;
+			if (is_call && !memcmp(new_opcode, &old_opcode, 5))
+				return 0;
+		} else {
+			if (is_call || is_swbp)
+				return 1;
+			if (is_nop5)
+				return 0;
+		}
+		break;
+	case 1:
+		if (is_swbp_insn(new_opcode)) {
+			if (is_nop5)
+				return 1;
+			if (is_swbp || is_call)
+				return 0;
+		} else {
+			if (is_swbp || is_call)
+				return 1;
+			if (is_nop5)
+				return 0;
+		}
+	}
+	return -1;
+}
+
+bool arch_uprobe_is_register(uprobe_opcode_t *insn, int nbytes)
+{
+	return nbytes == 5 ? is_call_insn(insn) : is_swbp_insn(insn);
+}
+
+static void __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+				   unsigned long vaddr)
+{
+	struct uprobe_trampoline *tramp;
+	char call[5];
+
+	tramp = uprobe_trampoline_get(vaddr);
+	if (!tramp)
+		goto fail;
+
+	relative_call(call, (void *) vaddr, (void *) tramp->vaddr);
+	if (uprobe_write_opcode(auprobe, mm, vaddr, call, 5))
+		goto fail;
+
+	set_bit(ARCH_UPROBE_FLAG_OPTIMIZED, &auprobe->flags);
+	return;
+
+fail:
+	/* Once we fail we never try again. */
+	clear_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+	uprobe_trampoline_put(tramp);
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+	if (!test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags))
+		return false;
+	if (test_bit(ARCH_UPROBE_FLAG_OPTIMIZED, &auprobe->flags))
+		return false;
+	return true;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (!should_optimize(auprobe))
+		return;
+
+	mmap_write_lock(mm);
+	if (should_optimize(auprobe))
+		__arch_uprobe_optimize(auprobe, mm, vaddr);
+	mmap_write_unlock(mm);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+{
+	uprobe_opcode_t *insn = (uprobe_opcode_t *) auprobe->insn;
+
+	if (test_bit(ARCH_UPROBE_FLAG_OPTIMIZED, &auprobe->flags))
+		return uprobe_write_opcode(auprobe, mm, vaddr, insn, 5);
+
+	return uprobe_write_opcode(auprobe, mm, vaddr, insn, UPROBE_SWBP_INSN_SIZE);
+}
+
+bool arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr)
+{
+	long delta = (long)(vaddr + 5 - vtramp);
+	return delta >= INT_MIN && delta <= INT_MAX;
+}
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 5e9a33bfb747..1b14b9f2f8d0 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -233,6 +233,7 @@ extern void uprobe_trampoline_put(struct uprobe_trampoline *area);
 extern bool arch_uprobe_is_callable(unsigned long vtramp, unsigned long vaddr);
 extern const struct vm_special_mapping *arch_uprobe_trampoline_mapping(void);
 extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
+extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 52f38d1ef276..a7a3eeec9e51 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2654,6 +2654,11 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
 	return true;
 }
 
+void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	return;
+}
+
 /*
  * Run handler and ask thread to singlestep.
  * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2718,6 +2723,9 @@ static void handle_swbp(struct pt_regs *regs)
 
 	handler_chain(uprobe, regs);
 
+	/* Try to optimize after first hit. */
+	arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
+
 	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
 		goto out;
 
-- 
2.47.0


  parent reply	other threads:[~2024-12-11 13:35 UTC|newest]

Thread overview: 63+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-11 13:33 [PATCH bpf-next 00/13] uprobes: Add support to optimize usdt probes on x86_64 Jiri Olsa
2024-12-11 13:33 ` [PATCH bpf-next 01/13] uprobes: Rename arch_uretprobe_trampoline function Jiri Olsa
2024-12-13  0:42   ` Andrii Nakryiko
2024-12-11 13:33 ` [PATCH bpf-next 02/13] uprobes: Make copy_from_page global Jiri Olsa
2024-12-13  0:43   ` Andrii Nakryiko
2024-12-11 13:33 ` [PATCH bpf-next 03/13] uprobes: Add nbytes argument to uprobe_write_opcode Jiri Olsa
2024-12-13  0:45   ` Andrii Nakryiko
2024-12-11 13:33 ` [PATCH bpf-next 04/13] uprobes: Add arch_uprobe_verify_opcode function Jiri Olsa
2024-12-13  0:48   ` Andrii Nakryiko
2024-12-13 13:21     ` Jiri Olsa
2024-12-13 21:11       ` Andrii Nakryiko
2024-12-13 21:52         ` Jiri Olsa
2024-12-11 13:33 ` [PATCH bpf-next 05/13] uprobes: Add mapping for optimized uprobe trampolines Jiri Olsa
2024-12-13  1:01   ` Andrii Nakryiko
2024-12-13 13:42     ` Jiri Olsa
2024-12-13 21:58       ` Andrii Nakryiko
2024-12-11 13:33 ` [PATCH bpf-next 06/13] uprobes/x86: Add uprobe syscall to speed up uprobe Jiri Olsa
2024-12-13 13:48   ` Thomas Weißschuh
2024-12-13 14:51     ` Jiri Olsa
2024-12-13 15:12       ` Thomas Weißschuh
2024-12-13 21:52         ` Jiri Olsa
2024-12-14 13:21           ` Thomas Weißschuh
2024-12-16  8:03             ` Jiri Olsa
2024-12-11 13:33 ` [PATCH bpf-next 07/13] uprobes/x86: Add support to emulate nop5 instruction Jiri Olsa
2024-12-13 10:45   ` Peter Zijlstra
2024-12-13 13:02     ` Jiri Olsa
2024-12-11 13:33 ` Jiri Olsa [this message]
2024-12-13 10:49   ` [PATCH bpf-next 08/13] uprobes/x86: Add support to optimize uprobes Peter Zijlstra
2024-12-13 13:06     ` Jiri Olsa
2024-12-13 21:58   ` Andrii Nakryiko
2024-12-15 12:06   ` David Laight
2024-12-15 14:14     ` Oleg Nesterov
2024-12-16  8:08       ` Jiri Olsa
2024-12-16  9:18         ` David Laight
2024-12-16 10:12           ` Oleg Nesterov
2024-12-16 11:10             ` David Laight
2024-12-16 12:22               ` Oleg Nesterov
2024-12-16 12:50                 ` Jiri Olsa
2024-12-16 15:08                   ` David Laight
2024-12-16 16:06                     ` Jiri Olsa
2024-12-11 13:33 ` [PATCH bpf-next 09/13] selftests/bpf: Use 5-byte nop for x86 usdt probes Jiri Olsa
2024-12-13 21:58   ` Andrii Nakryiko
2024-12-16  8:32     ` Jiri Olsa
2024-12-16 23:06       ` Andrii Nakryiko
2024-12-11 13:33 ` [PATCH bpf-next 10/13] selftests/bpf: Add uprobe/usdt optimized test Jiri Olsa
2024-12-13 21:58   ` Andrii Nakryiko
2024-12-16  7:58     ` Jiri Olsa
2024-12-11 13:34 ` [PATCH bpf-next 11/13] selftests/bpf: Add hit/attach/detach race optimized uprobe test Jiri Olsa
2024-12-13 21:58   ` Andrii Nakryiko
2024-12-16  7:59     ` Jiri Olsa
2024-12-11 13:34 ` [PATCH bpf-next 12/13] selftests/bpf: Add uprobe syscall sigill signal test Jiri Olsa
2024-12-11 13:34 ` [PATCH bpf-next 13/13] selftests/bpf: Add 5-byte nop uprobe trigger bench Jiri Olsa
2024-12-13 21:57   ` Andrii Nakryiko
2024-12-16  7:56     ` Jiri Olsa
2024-12-13  0:43 ` [PATCH bpf-next 00/13] uprobes: Add support to optimize usdt probes on x86_64 Andrii Nakryiko
2024-12-13  9:46   ` Jiri Olsa
2024-12-13 10:51 ` Peter Zijlstra
2024-12-13 13:07   ` Jiri Olsa
2024-12-13 13:54     ` Peter Zijlstra
2024-12-13 14:05       ` Jiri Olsa
2024-12-13 18:39         ` Peter Zijlstra
2024-12-13 21:52           ` Jiri Olsa
2024-12-13 21:59             ` Andrii Nakryiko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241211133403.208920-9-jolsa@kernel.org \
    --to=jolsa@kernel.org \
    --cc=alan.maguire@oracle.com \
    --cc=andrii@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=haoluo@google.com \
    --cc=john.fastabend@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=mhiramat@kernel.org \
    --cc=oleg@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=songliubraving@fb.com \
    --cc=yhs@fb.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox