* [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-15 2:25 ` Alexei Starovoitov
2025-07-03 12:15 ` [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args Menglong Dong
` (3 subsequent siblings)
4 siblings, 1 reply; 32+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, H. Peter Anvin, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, linux-kernel, netdev
Implement the BPF global trampoline "bpf_global_caller" for x86_64. Thanks
to Alexei's advice, most of the global trampoline is implemented in C
instead of asm.
We implement the entry of the trampoline as a "__naked" function, which
saves the regs into an array on the stack and calls
bpf_global_caller_run(). The entry passes the address of the array and
the address of the rip to bpf_global_caller_run().
In bpf_global_caller_run(), we look up the metadata by the function ip.
For the origin-call case, we call kfunc_md_enter() to protect the
metadata, which is similar to __bpf_tramp_enter(). Then we call all the
BPF progs, just like the BPF trampoline does.
Without an origin call, bpf_global_caller_run() returns 0, and the entry
restores the regs and returns; in the origin-call case, it returns 1, and
the entry makes the RSP skip the rip before returning.
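For reference, a rough sketch of the contract between the naked entry and
the run function (illustration only; the authoritative code is the
CALLER_DEFINE() macro in the diff below):

	/*
	 *   ret = bpf_global_caller_N_run(args, &rip);
	 *   if (ret == 0)   // fentry only
	 *       restore the arg regs, pop the frame, return into the
	 *       traced function body;
	 *   else            // fexit/modify_return, origin already called
	 *       rax = args[nr_args], pop the frame and the saved rip,
	 *       return straight to the caller of the traced function;
	 */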
In the FENTRY case, the performance of the global trampoline is ~10%
slower than the BPF trampoline. The global trampoline is optimized by
inlining some function calls, such as __bpf_prog_enter_recur and
__bpf_prog_exit_recur. However, the bpf_global_caller still performs more
condition checks, branches and memory reads than the BPF trampoline.
In the FEXIT and MODIFY_RETURN cases, the performance of the global
trampoline is the same as (or even better than) the BPF trampoline. This
makes sense, as the calls to __bpf_tramp_enter and __bpf_tramp_exit are
also inlined in the bpf_global_caller.
In fact, we can optimize the bpf_global_caller further. For example, we
can define more bpf_global_caller_xx_run() functions and make the
"if (prog->sleepable)" and "if (do_origin_call)" checks compile-time
constants, as sketched below. That can be done in a follow-up series.
After such optimization, I believe the performance of FENTRY_MULTI can be
close or equal to FENTRY, and the FEXIT/MODIFY_RETURN cases can be even
better.
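A rough sketch of what such a specialized variant could look like
(hypothetical, not part of this patch; __bpf_global_caller_run() stands
for a variant of bpf_global_caller_run() that takes the two flags as
always-inlined constant parameters):

	#define CALLER_RUN_DEFINE(name, nr_args, sleepable, origin)	\
	static __always_used notrace int				\
	name##_run(unsigned long *args, unsigned long *ip)		\
	{								\
		return __bpf_global_caller_run(args, ip, nr_args,	\
					       sleepable, origin);	\
	}

With 'sleepable' and 'origin' constant at every call site, the compiler
can drop the dead branches entirely.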
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
v2:
- rewrite the global trampoline with C instead of asm
---
arch/x86/Kconfig | 4 +
arch/x86/net/bpf_jit_comp.c | 268 ++++++++++++++++++++++++++++++++++++
include/linux/bpf_tramp.h | 72 ++++++++++
kernel/bpf/trampoline.c | 23 +---
4 files changed, 346 insertions(+), 21 deletions(-)
create mode 100644 include/linux/bpf_tramp.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 71019b3b54ea..96962c61419a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -155,6 +155,7 @@ config X86
select ARCH_WANTS_THP_SWAP if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
+ select ARCH_HAS_BPF_GLOBAL_CALLER if X86_64
select BUILDTIME_TABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_WATCHDOG
@@ -432,6 +433,9 @@ config PGTABLE_LEVELS
default 3 if X86_PAE
default 2
+config ARCH_HAS_BPF_GLOBAL_CALLER
+ bool
+
menu "Processor type and features"
config SMP
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 15672cb926fc..8d2fc436a748 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -11,6 +11,8 @@
#include <linux/bpf.h>
#include <linux/memory.h>
#include <linux/sort.h>
+#include <linux/bpf_tramp.h>
+#include <linux/kfunc_md.h>
#include <asm/extable.h>
#include <asm/ftrace.h>
#include <asm/set_memory.h>
@@ -3413,6 +3415,272 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
return ret;
}
+#define FUNC_ARGS_0 ((2 - 1) * 8)
+#define FUNC_ARGS_1 ((2 + 0) * 8)
+#define FUNC_ARGS_2 ((2 + 1) * 8)
+#define FUNC_ARGS_3 ((2 + 2) * 8)
+#define FUNC_ARGS_4 ((2 + 3) * 8)
+#define FUNC_ARGS_5 ((2 + 4) * 8)
+#define FUNC_ARGS_6 ((2 + 5) * 8)
+
+#define SAVE_ARGS_0
+#define SAVE_ARGS_1 \
+ "movq %rdi, " __stringify(FUNC_ARGS_1) "(%rsp)\n"
+#define SAVE_ARGS_2 SAVE_ARGS_1 \
+ "movq %rsi, " __stringify(FUNC_ARGS_2) "(%rsp)\n"
+#define SAVE_ARGS_3 SAVE_ARGS_2 \
+ "movq %rdx, " __stringify(FUNC_ARGS_3) "(%rsp)\n"
+#define SAVE_ARGS_4 SAVE_ARGS_3 \
+ "movq %rcx, " __stringify(FUNC_ARGS_4) "(%rsp)\n"
+#define SAVE_ARGS_5 SAVE_ARGS_4 \
+ "movq %r8, " __stringify(FUNC_ARGS_5) "(%rsp)\n"
+#define SAVE_ARGS_6 SAVE_ARGS_5 \
+ "movq %r9, " __stringify(FUNC_ARGS_6) "(%rsp)\n" \
+
+#define RESTORE_ARGS_0
+#define RESTORE_ARGS_1 \
+ "movq " __stringify(FUNC_ARGS_1) "(%rsp), %rdi\n"
+#define RESTORE_ARGS_2 RESTORE_ARGS_1 \
+ "movq " __stringify(FUNC_ARGS_2) "(%rsp), %rsi\n"
+#define RESTORE_ARGS_3 RESTORE_ARGS_2 \
+ "movq " __stringify(FUNC_ARGS_3) "(%rsp), %rdx\n"
+#define RESTORE_ARGS_4 RESTORE_ARGS_3 \
+ "movq " __stringify(FUNC_ARGS_4) "(%rsp), %rcx\n"
+#define RESTORE_ARGS_5 RESTORE_ARGS_4 \
+ "movq " __stringify(FUNC_ARGS_5) "(%rsp), %r8\n"
+#define RESTORE_ARGS_6 RESTORE_ARGS_5 \
+ "movq " __stringify(FUNC_ARGS_6) "(%rsp), %r9\n"
+
+#define RESTORE_ORIGIN_0
+#define RESTORE_ORIGIN_1 \
+ "movq " __stringify(FUNC_ARGS_1 - FUNC_ARGS_1) "(%[args]), %%rdi\n"
+#define RESTORE_ORIGIN_2 RESTORE_ORIGIN_1 \
+ "movq " __stringify(FUNC_ARGS_2 - FUNC_ARGS_1) "(%[args]), %%rsi\n"
+#define RESTORE_ORIGIN_3 RESTORE_ORIGIN_2 \
+ "movq " __stringify(FUNC_ARGS_3 - FUNC_ARGS_1) "(%[args]), %%rdx\n"
+#define RESTORE_ORIGIN_4 RESTORE_ORIGIN_3 \
+ "movq " __stringify(FUNC_ARGS_4 - FUNC_ARGS_1) "(%[args]), %%rcx\n"
+#define RESTORE_ORIGIN_5 RESTORE_ORIGIN_4 \
+ "movq " __stringify(FUNC_ARGS_5 - FUNC_ARGS_1) "(%[args]), %%r8\n"
+#define RESTORE_ORIGIN_6 RESTORE_ORIGIN_5 \
+ "movq " __stringify(FUNC_ARGS_6 - FUNC_ARGS_1) "(%[args]), %%r9\n"
+
+static __always_inline void
+do_origin_call(unsigned long *args, unsigned long *ip, int nr_args)
+{
+ /* Following code will be optimized by the compiler, as nr_args
+ * is a const, and there will be no condition here.
+ */
+ if (nr_args == 0) {
+ asm volatile(
+ RESTORE_ORIGIN_0 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ :
+ );
+ } else if (nr_args == 1) {
+ asm volatile(
+ RESTORE_ORIGIN_1 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi"
+ );
+ } else if (nr_args == 2) {
+ asm volatile(
+ RESTORE_ORIGIN_2 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi", "rsi"
+ );
+ } else if (nr_args == 3) {
+ asm volatile(
+ RESTORE_ORIGIN_3 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi", "rsi", "rdx"
+ );
+ } else if (nr_args == 4) {
+ asm volatile(
+ RESTORE_ORIGIN_4 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi", "rsi", "rdx", "rcx"
+ );
+ } else if (nr_args == 5) {
+ asm volatile(
+ RESTORE_ORIGIN_5 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi", "rsi", "rdx", "rcx", "r8"
+ );
+ } else if (nr_args == 6) {
+ asm volatile(
+ RESTORE_ORIGIN_6 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi", "rsi", "rdx", "rcx", "r8", "r9"
+ );
+ }
+}
+
+static __always_inline notrace void
+run_tramp_prog(struct kfunc_md_tramp_prog *tramp_prog,
+ struct bpf_tramp_run_ctx *run_ctx, unsigned long *args)
+{
+ struct bpf_prog *prog;
+ u64 start_time;
+
+ while (tramp_prog) {
+ prog = tramp_prog->prog;
+ run_ctx->bpf_cookie = tramp_prog->cookie;
+ start_time = bpf_gtramp_enter(prog, run_ctx);
+
+ if (likely(start_time)) {
+ asm volatile(
+ CALL_NOSPEC "\n"
+ : : [thunk_target]"r"(prog->bpf_func), [args]"D"(args)
+ );
+ }
+
+ bpf_gtramp_exit(prog, start_time, run_ctx);
+ tramp_prog = tramp_prog->next;
+ }
+}
+
+static __always_inline notrace int
+bpf_global_caller_run(unsigned long *args, unsigned long *ip, int nr_args)
+{
+ unsigned long origin_ip = (*ip) & 0xfffffffffffffff0; // Align to 16 bytes
+ struct kfunc_md_tramp_prog *tramp_prog;
+ struct bpf_tramp_run_ctx run_ctx;
+ struct kfunc_md *md;
+ bool do_origin;
+
+ rcu_read_lock();
+ md = kfunc_md_get_rcu(origin_ip);
+ do_origin = md->bpf_origin_call;
+ if (do_origin)
+ kfunc_md_enter(md);
+ rcu_read_unlock();
+
+ /* save the origin function ip for bpf_get_func_ip() */
+ *(args - 2) = origin_ip;
+ *(args - 1) = nr_args;
+
+ run_tramp_prog(md->bpf_progs[BPF_TRAMP_FENTRY], &run_ctx, args);
+
+ /* no fexit and modify_return, return directly */
+ if (!do_origin)
+ return 0;
+
+ /* modify return case */
+ tramp_prog = md->bpf_progs[BPF_TRAMP_MODIFY_RETURN];
+ /* initialize return value */
+ args[nr_args] = 0;
+ while (tramp_prog) {
+ struct bpf_prog *prog;
+ u64 start_time, ret;
+
+ prog = tramp_prog->prog;
+ run_ctx.bpf_cookie = tramp_prog->cookie;
+ start_time = bpf_gtramp_enter(prog, &run_ctx);
+
+ if (likely(start_time)) {
+ asm volatile(
+ CALL_NOSPEC "\n"
+ : "=a"(ret), ASM_CALL_CONSTRAINT
+ : [thunk_target]"r"(prog->bpf_func),
+ [args]"D"(args)
+ );
+ args[nr_args] = ret;
+ } else {
+ ret = 0;
+ }
+
+ bpf_gtramp_exit(prog, start_time, &run_ctx);
+ if (ret)
+ goto do_fexit;
+ tramp_prog = tramp_prog->next;
+ }
+
+ /* restore the function arguments and call the origin function */
+ do_origin_call(args, ip, nr_args);
+do_fexit:
+ run_tramp_prog(md->bpf_progs[BPF_TRAMP_FEXIT], &run_ctx, args);
+ kfunc_md_exit(md);
+ return 1;
+}
+
+/* Layout of the stack frame:
+ * rip ----> 8 bytes
+ * return value ----> 8 bytes
+ * args ----> 8 * 6 bytes
+ * arg count ----> 8 bytes
+ * origin ip ----> 8 bytes
+ */
+#define stack_size __stringify(8 + 8 + 6 * 8 + 8)
+
+#define CALLER_DEFINE(name, nr_args) \
+static __always_used __no_stack_protector notrace int \
+name##_run(unsigned long *args, unsigned long *ip) \
+{ \
+ return bpf_global_caller_run(args, ip, nr_args); \
+} \
+static __naked void name(void) \
+{ \
+ asm volatile( \
+ "subq $" stack_size ", %rsp\n" \
+ SAVE_ARGS_##nr_args \
+ ); \
+ \
+ asm volatile( \
+ "leaq " __stringify(FUNC_ARGS_1) "(%rsp), %rdi\n" \
+ "leaq " stack_size "(%rsp), %rsi\n" \
+ "call " #name "_run\n" \
+ "test %rax, %rax\n" \
+ "jne 1f\n" \
+ ); \
+ \
+ asm volatile( \
+ RESTORE_ARGS_##nr_args \
+ "addq $" stack_size ", %rsp\n" \
+ ASM_RET \
+ ); \
+ \
+ asm volatile( \
+ "1:\n" \
+ "movq " __stringify(FUNC_ARGS_##nr_args + 8) \
+ "(%rsp), %rax\n" \
+ "addq $(" stack_size " + 8), %rsp\n" \
+ ASM_RET); \
+} \
+STACK_FRAME_NON_STANDARD(name)
+
+CALLER_DEFINE(bpf_global_caller_0, 0);
+CALLER_DEFINE(bpf_global_caller_1, 1);
+CALLER_DEFINE(bpf_global_caller_2, 2);
+CALLER_DEFINE(bpf_global_caller_3, 3);
+CALLER_DEFINE(bpf_global_caller_4, 4);
+CALLER_DEFINE(bpf_global_caller_5, 5);
+CALLER_DEFINE(bpf_global_caller_6, 6);
+
+void *bpf_global_caller_array[MAX_BPF_FUNC_ARGS + 1] = {
+ bpf_global_caller_0,
+ bpf_global_caller_1,
+ bpf_global_caller_2,
+ bpf_global_caller_3,
+ bpf_global_caller_4,
+ bpf_global_caller_5,
+ bpf_global_caller_6,
+};
+
static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
{
u8 *jg_reloc, *prog = *pprog;
diff --git a/include/linux/bpf_tramp.h b/include/linux/bpf_tramp.h
new file mode 100644
index 000000000000..32447fcfc017
--- /dev/null
+++ b/include/linux/bpf_tramp.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __LINUX_BPF_TRAMP_H__
+#define __LINUX_BPF_TRAMP_H__
+#ifdef CONFIG_BPF_JIT
+#include <linux/filter.h>
+
+#ifdef CONFIG_ARCH_HAS_BPF_GLOBAL_CALLER
+extern void *bpf_global_caller_array[MAX_BPF_FUNC_ARGS + 1];
+#endif
+
+void notrace __update_prog_stats(struct bpf_prog *prog, u64 start);
+
+#define NO_START_TIME 1
+static __always_inline u64 notrace bpf_prog_start_time(void)
+{
+ u64 start = NO_START_TIME;
+
+ if (static_branch_unlikely(&bpf_stats_enabled_key)) {
+ start = sched_clock();
+ if (unlikely(!start))
+ start = NO_START_TIME;
+ }
+ return start;
+}
+
+static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
+ u64 start)
+{
+ if (static_branch_unlikely(&bpf_stats_enabled_key))
+ __update_prog_stats(prog, start);
+}
+
+static __always_inline u64 notrace
+bpf_gtramp_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ if (unlikely(prog->sleepable)) {
+ rcu_read_lock_trace();
+ might_fault();
+ } else {
+ rcu_read_lock();
+ }
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
+ return 0;
+ }
+ return bpf_prog_start_time();
+}
+
+static __always_inline void notrace
+bpf_gtramp_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ this_cpu_dec(*(prog->active));
+ migrate_enable();
+ if (unlikely(prog->sleepable))
+ rcu_read_unlock_trace();
+ else
+ rcu_read_unlock();
+}
+
+#endif
+#endif
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index b1e358c16eeb..fa90c225c93b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -13,6 +13,7 @@
#include <linux/bpf_verifier.h>
#include <linux/bpf_lsm.h>
#include <linux/delay.h>
+#include <linux/bpf_tramp.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -868,19 +869,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
mutex_unlock(&trampoline_mutex);
}
-#define NO_START_TIME 1
-static __always_inline u64 notrace bpf_prog_start_time(void)
-{
- u64 start = NO_START_TIME;
-
- if (static_branch_unlikely(&bpf_stats_enabled_key)) {
- start = sched_clock();
- if (unlikely(!start))
- start = NO_START_TIME;
- }
- return start;
-}
-
/* The logic is similar to bpf_prog_run(), but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
@@ -911,7 +899,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
return bpf_prog_start_time();
}
-static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
+void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
{
struct bpf_prog_stats *stats;
unsigned long flags;
@@ -932,13 +920,6 @@ static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
-static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
- u64 start)
-{
- if (static_branch_unlikely(&bpf_stats_enabled_key))
- __update_prog_stats(prog, start);
-}
-
static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
--
2.39.5
* [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
2025-07-03 12:15 ` [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-03 12:15 ` [PATCH bpf-next v2 10/18] x86,bpf: factor out arch_bpf_get_regs_nr Menglong Dong
` (2 subsequent siblings)
4 siblings, 1 reply; 32+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, John Fastabend, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
Stanislav Fomichev, Hao Luo, Simon Horman, linux-kernel, netdev
Add the 'accessed_args' field to struct bpf_prog_aux, which is used to
record the indexes of the function args that are accessed in
btf_ctx_access().
Meanwhile, add the function btf_check_func_part_match() to compare the
accessed function args of two function prototypes. This function will be
used in a following commit.
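As an illustration of the semantics (hypothetical target, not taken from
this patch):

	/* For a first target
	 *
	 *	int foo(int a, struct sk_buff *skb, u64 c);
	 *
	 * a prog that only reads ctx slot 1 records
	 *
	 *	prog->aux->accessed_args == (1 << 1);
	 *
	 * and btf_check_func_part_match() then only checks that the other
	 * prototype has a 'struct sk_buff *' at the same argument offset;
	 * the remaining args (and the return value, bit nargs) are skipped.
	 */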
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
include/linux/bpf.h | 4 ++
include/linux/btf.h | 3 +-
kernel/bpf/btf.c | 108 +++++++++++++++++++++++++++++++++++++++++-
net/sched/bpf_qdisc.c | 2 +-
4 files changed, 113 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 70bf613d51d0..5e6d83750d39 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1605,6 +1605,7 @@ struct bpf_prog_aux {
const struct btf_type *attach_func_proto;
/* function name for valid attach_btf_id */
const char *attach_func_name;
+ u64 accessed_args;
struct bpf_prog **func;
void *jit_data; /* JIT specific data. arch dependent */
struct bpf_jit_poke_descriptor *poke_tab;
@@ -2790,6 +2791,9 @@ struct bpf_reg_state;
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
struct btf *btf, const struct btf_type *t);
+int btf_check_func_part_match(struct btf *btf1, const struct btf_type *t1,
+ struct btf *btf2, const struct btf_type *t2,
+ u64 func_args);
const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
int comp_idx, const char *tag_key);
int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index a40beb9cf160..b2b56249ce11 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -524,7 +524,8 @@ bool btf_param_match_suffix(const struct btf *btf,
const char *suffix);
int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
u32 arg_no);
-u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off);
+u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off, int *aligned_idx);
struct bpf_verifier_log;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 05fd64a371af..853ca19bbe81 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6404,19 +6404,24 @@ static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t)
}
u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
- int off)
+ int off, int *aligned_idx)
{
const struct btf_param *args;
const struct btf_type *t;
u32 offset = 0, nr_args;
int i;
+ if (aligned_idx)
+ *aligned_idx = -ENOENT;
+
if (!func_proto)
return off / 8;
nr_args = btf_type_vlen(func_proto);
args = (const struct btf_param *)(func_proto + 1);
for (i = 0; i < nr_args; i++) {
+ if (aligned_idx && offset == off)
+ *aligned_idx = i;
t = btf_type_skip_modifiers(btf, args[i].type, NULL);
offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
if (off < offset)
@@ -6684,7 +6689,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = btf_ctx_arg_idx(btf, t, off);
+ arg = btf_ctx_arg_idx(btf, t, off, NULL);
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -6694,6 +6699,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip first 'void *__data' argument in btf_trace_##name typedef */
args++;
nr_args--;
+ prog->aux->accessed_args |= (1 << (arg + 1));
+ } else {
+ prog->aux->accessed_args |= (1 << arg);
}
if (arg > nr_args) {
@@ -7553,6 +7561,102 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
+static u32 get_ctx_arg_total_size(struct btf *btf, const struct btf_type *t)
+{
+ const struct btf_param *args;
+ u32 size = 0, nr_args;
+ int i;
+
+ nr_args = btf_type_vlen(t);
+ args = (const struct btf_param *)(t + 1);
+ for (i = 0; i < nr_args; i++) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ size += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ }
+
+ return size;
+}
+
+/* This function is similar to btf_check_func_type_match(), except that it
+ * only compare some function args of the function prototype t1 and t2.
+ */
+int btf_check_func_part_match(struct btf *btf1, const struct btf_type *func1,
+ struct btf *btf2, const struct btf_type *func2,
+ u64 func_args)
+{
+ const struct btf_param *args1, *args2;
+ u32 nargs1, i, offset = 0;
+ const char *s1, *s2;
+
+ if (!btf_type_is_func_proto(func1) || !btf_type_is_func_proto(func2))
+ return -EINVAL;
+
+ args1 = (const struct btf_param *)(func1 + 1);
+ args2 = (const struct btf_param *)(func2 + 1);
+ nargs1 = btf_type_vlen(func1);
+
+ for (i = 0; i <= nargs1; i++) {
+ const struct btf_type *t1, *t2;
+
+ if (!(func_args & (1 << i)))
+ goto next;
+
+ if (i < nargs1) {
+ int t2_index;
+
+ /* get the index of the arg corresponding to args1[i]
+ * by the offset.
+ */
+ btf_ctx_arg_idx(btf2, func2, offset, &t2_index);
+ if (t2_index < 0)
+ return -EINVAL;
+
+ t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, args2[t2_index].type,
+ NULL);
+ } else {
+ /* i == nargs1, this is the index of return value of t1 */
+ if (get_ctx_arg_total_size(btf1, func1) !=
+ get_ctx_arg_total_size(btf2, func2))
+ return -EINVAL;
+
+ /* check the return type of t1 and t2 */
+ t1 = btf_type_skip_modifiers(btf1, func1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, func2->type, NULL);
+ }
+
+ if (t1->info != t2->info ||
+ (btf_type_has_size(t1) && t1->size != t2->size))
+ return -EINVAL;
+ if (btf_type_is_int(t1) || btf_is_any_enum(t1))
+ goto next;
+
+ if (btf_type_is_struct(t1))
+ goto on_struct;
+
+ if (!btf_type_is_ptr(t1))
+ return -EINVAL;
+
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (!btf_type_is_struct(t1) || !btf_type_is_struct(t2))
+ return -EINVAL;
+
+on_struct:
+ s1 = btf_name_by_offset(btf1, t1->name_off);
+ s2 = btf_name_by_offset(btf2, t2->name_off);
+ if (strcmp(s1, s2))
+ return -EINVAL;
+next:
+ if (i < nargs1) {
+ t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+ offset += btf_type_is_ptr(t1) ? 8 : roundup(t1->size, 8);
+ }
+ }
+
+ return 0;
+}
+
static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
{
const char *name;
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
index 7ea8b54b2ab1..4ce395a72996 100644
--- a/net/sched/bpf_qdisc.c
+++ b/net/sched/bpf_qdisc.c
@@ -38,7 +38,7 @@ static bool bpf_qdisc_is_valid_access(int off, int size,
struct btf *btf = prog->aux->attach_btf;
u32 arg;
- arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off);
+ arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off, NULL);
if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) {
if (arg == 2 && type == BPF_READ) {
info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
--
2.39.5
* [PATCH bpf-next v2 10/18] x86,bpf: factor out arch_bpf_get_regs_nr
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
2025-07-03 12:15 ` [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 11/18] bpf: tracing: add multi-link support Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 16/18] selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c Menglong Dong
4 siblings, 0 replies; 32+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, H. Peter Anvin, netdev, linux-kernel
Factor out the function arch_bpf_get_regs_nr(), which returns the number
of registers used by the function args.
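A worked example of the counting (hypothetical prototype):

	struct pair { long x, y; };	/* 16 bytes, BTF_FMODEL_STRUCT_ARG */
	void foo(int a, struct pair b);	/* m->nr_args == 2 */

	/* arch_bpf_get_regs_nr() returns 2 + (16 + 7) / 8 - 1 == 3, since
	 * 'b' is passed by value in two registers on x86-64.
	 */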
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
arch/x86/net/bpf_jit_comp.c | 22 +++++++++++++++-------
include/linux/bpf.h | 1 +
kernel/bpf/verifier.c | 5 +++++
3 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8d2fc436a748..7795917efc41 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3001,6 +3001,19 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
return 0;
}
+int arch_bpf_get_regs_nr(const struct btf_func_model *m)
+{
+ int nr_regs = m->nr_args;
+
+ /* extra registers for struct arguments */
+ for (int i = 0; i < m->nr_args; i++) {
+ if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
+ nr_regs += (m->arg_size[i] + 7) / 8 - 1;
+ }
+
+ return nr_regs;
+}
+
/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
#define LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack) \
__LOAD_TCC_PTR(-round_up(stack, 8) - 8)
@@ -3071,7 +3084,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
struct bpf_tramp_links *tlinks,
void *func_addr)
{
- int i, ret, nr_regs = m->nr_args, stack_size = 0;
+ int i, ret, nr_regs, stack_size = 0;
int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
@@ -3089,15 +3102,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
(flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));
- /* extra registers for struct arguments */
- for (i = 0; i < m->nr_args; i++) {
- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
- nr_regs += (m->arg_size[i] + 7) / 8 - 1;
- }
-
/* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
* are passed through regs, the remains are through stack.
*/
+ nr_regs = arch_bpf_get_regs_nr(m);
if (nr_regs > MAX_BPF_FUNC_ARGS)
return -ENOTSUPP;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bb3ab1aa3a9d..da5951b0335b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1220,6 +1220,7 @@ void arch_free_bpf_trampoline(void *image, unsigned int size);
int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size);
int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
struct bpf_tramp_links *tlinks, void *func_addr);
+int arch_bpf_get_regs_nr(const struct btf_func_model *m);
u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
struct bpf_tramp_run_ctx *run_ctx);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6311be5a63a..86a64d843465 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23274,6 +23274,11 @@ static int do_check_main(struct bpf_verifier_env *env)
}
+int __weak arch_bpf_get_regs_nr(const struct btf_func_model *m)
+{
+ return -ENODEV;
+}
+
static void print_verification_stats(struct bpf_verifier_env *env)
{
int i;
--
2.39.5
* [PATCH bpf-next v2 11/18] bpf: tracing: add multi-link support
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (2 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 10/18] x86,bpf: factor out arch_bpf_get_regs_nr Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 16/18] selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c Menglong Dong
4 siblings, 0 replies; 32+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, John Fastabend, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
Stanislav Fomichev, Hao Luo, Simon Horman, linux-kernel, netdev
Add support for attaching a tracing BPF program to multiple hooks, which
is similar to BPF_TRACE_KPROBE_MULTI.
The use case is obvious: for now, we have to create a separate BPF program
for each kernel function that we want to trace, even though all the
programs have the same (or similar) logic. This consumes extra memory and
makes program loading slow when there are plenty of kernel functions to
trace. KPROBE_MULTI may look like an alternative, but it can't do what
TRACING does. For example, a kretprobe can't access the function args,
while FEXIT can.
For now, we support creating a multi-link for fentry/fexit/modify_return
with the following new attach types that we introduce:
BPF_TRACE_FENTRY_MULTI
BPF_TRACE_FEXIT_MULTI
BPF_MODIFY_RETURN_MULTI
We introduce struct bpf_tracing_multi_link for this purpose, which holds
references to all the kernel modules, target BPF programs (for attaching
to a BPF program) or target BTFs (for attaching to kernel functions) that
we use.
During loading, the first target is used for verification by the
verifier, and during attaching, we check that all the targets are
consistent with the first one.
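For reference, a minimal sketch of how user space could create such a
link with the raw bpf(2) syscall (assuming UAPI headers that include this
series; error handling omitted; the btf_ids[] are resolved from vmlinux
BTF beforehand, e.g. with libbpf's btf__find_by_name_kind()):

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	/* prog_fd: a loaded BPF_PROG_TYPE_TRACING prog with
	 * expected_attach_type == BPF_TRACE_FENTRY_MULTI.
	 */
	static int attach_fentry_multi(int prog_fd, __u32 *btf_ids,
				       __u64 *cookies, __u32 cnt)
	{
		union bpf_attr attr = {};

		attr.link_create.prog_fd = prog_fd;
		attr.link_create.attach_type = BPF_TRACE_FENTRY_MULTI;
		attr.link_create.tracing_multi.cnt = cnt;
		attr.link_create.tracing_multi.btf_ids =
			(__u64)(unsigned long)btf_ids;
		attr.link_create.tracing_multi.cookies =
			(__u64)(unsigned long)cookies;
		/* tgt_fds left 0: all targets live in vmlinux BTF */

		return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
	}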
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
include/linux/bpf.h | 9 +
include/linux/bpf_types.h | 1 +
include/uapi/linux/bpf.h | 10 +
kernel/bpf/btf.c | 5 +
kernel/bpf/syscall.c | 353 +++++++++++++++++++++++++++++++++
kernel/bpf/trampoline.c | 7 +-
kernel/bpf/verifier.c | 25 ++-
net/bpf/test_run.c | 3 +
net/core/bpf_sk_storage.c | 2 +
tools/include/uapi/linux/bpf.h | 10 +
10 files changed, 421 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index da5951b0335b..b4f8e2a068e5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1806,6 +1806,15 @@ struct bpf_raw_tp_link {
u64 cookie;
};
+struct bpf_tracing_multi_link {
+ struct bpf_gtramp_link link;
+ enum bpf_attach_type attach_type;
+ struct btf **tgt_btfs;
+ struct module **mods;
+ u32 btf_cnt;
+ u32 mods_cnt;
+};
+
struct bpf_link_primer {
struct bpf_link *link;
struct file *file;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fa78f49d4a9a..139d5436ce4c 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -154,3 +154,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf)
BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi)
BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops)
BPF_LINK_TYPE(BPF_LINK_TYPE_UPROBE_MULTI, uprobe_multi)
+BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING_MULTI, tracing_multi)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 719ba230032f..a143a64f69ae 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1120,6 +1120,9 @@ enum bpf_attach_type {
BPF_NETKIT_PEER,
BPF_TRACE_KPROBE_SESSION,
BPF_TRACE_UPROBE_SESSION,
+ BPF_TRACE_FENTRY_MULTI,
+ BPF_TRACE_FEXIT_MULTI,
+ BPF_MODIFY_RETURN_MULTI,
__MAX_BPF_ATTACH_TYPE
};
@@ -1144,6 +1147,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_UPROBE_MULTI = 12,
BPF_LINK_TYPE_NETKIT = 13,
BPF_LINK_TYPE_SOCKMAP = 14,
+ BPF_LINK_TYPE_TRACING_MULTI = 15,
__MAX_BPF_LINK_TYPE,
};
@@ -1765,6 +1769,12 @@ union bpf_attr {
*/
__u64 cookie;
} tracing;
+ struct {
+ __u32 cnt;
+ __aligned_u64 tgt_fds;
+ __aligned_u64 btf_ids;
+ __aligned_u64 cookies;
+ } tracing_multi;
struct {
__u32 pf;
__u32 hooknum;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 853ca19bbe81..a25c2a0303f2 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6112,6 +6112,9 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ case BPF_MODIFY_RETURN_MULTI:
/* allow u64* as ctx */
if (btf_is_int(t) && t->size == 8)
return 0;
@@ -6718,6 +6721,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
fallthrough;
case BPF_LSM_CGROUP:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FEXIT_MULTI:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
* int LSM hooks, they use MODIFY_RETURN trampolines.
@@ -6736,6 +6740,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
t = btf_type_by_id(btf, t->type);
break;
case BPF_MODIFY_RETURN:
+ case BPF_MODIFY_RETURN_MULTI:
/* For now the BPF_MODIFY_RETURN can only be attached to
* functions that return an int.
*/
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b21bbbc4263d..01430308558f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -37,6 +37,7 @@
#include <linux/trace_events.h>
#include <linux/tracepoint.h>
#include <linux/overflow.h>
+#include <linux/kfunc_md.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -3469,6 +3470,34 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
.fill_link_info = bpf_tracing_link_fill_link_info,
};
+static int bpf_tracing_check_multi(struct bpf_prog *prog,
+ struct bpf_prog *tgt_prog,
+ struct btf *btf2,
+ const struct btf_type *t2)
+{
+ const struct btf_type *t1;
+ struct btf *btf1;
+
+ /* this case is already validated in bpf_check_attach_target() */
+ if (prog->type == BPF_PROG_TYPE_EXT)
+ return 0;
+
+ btf1 = prog->aux->dst_prog ? prog->aux->dst_prog->aux->btf :
+ prog->aux->attach_btf;
+ if (!btf1)
+ return -EOPNOTSUPP;
+
+ btf2 = btf2 ?: tgt_prog->aux->btf;
+ t1 = prog->aux->attach_func_proto;
+
+ /* the target is the same as the origin one, this is a re-attach */
+ if (t1 == t2)
+ return 0;
+
+ return btf_check_func_part_match(btf1, t1, btf2, t2,
+ prog->aux->accessed_args);
+}
+
static int bpf_tracing_prog_attach(struct bpf_prog *prog,
int tgt_prog_fd,
u32 btf_id,
@@ -3668,6 +3697,323 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
return err;
}
+static void __bpf_tracing_multi_link_release(struct bpf_tracing_multi_link *link)
+{
+ int i;
+
+ if (link->mods_cnt) {
+ for (i = 0; i < link->mods_cnt; i++)
+ module_put(link->mods[i]);
+ kfree(link->mods);
+ }
+
+ if (link->btf_cnt) {
+ for (i = 0; i < link->btf_cnt; i++)
+ btf_put(link->tgt_btfs[i]);
+ kfree(link->tgt_btfs);
+ }
+
+ kfree(link->link.entries);
+}
+
+static void bpf_tracing_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_tracing_multi_link *multi_link =
+ container_of(link, struct bpf_tracing_multi_link, link.link);
+
+ bpf_gtrampoline_unlink_prog(&multi_link->link);
+ __bpf_tracing_multi_link_release(multi_link);
+}
+
+static void bpf_tracing_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_tracing_multi_link *tr_link =
+ container_of(link, struct bpf_tracing_multi_link, link.link);
+
+ kfree(tr_link);
+}
+
+static void bpf_tracing_multi_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_tracing_multi_link *tr_link =
+ container_of(link, struct bpf_tracing_multi_link, link.link);
+ int i;
+
+ for (i = 0; i < tr_link->link.entry_cnt; i++) {
+ seq_printf(seq,
+ "attach_type:\t%d\n"
+ "target_addr:\t%p\n",
+ tr_link->attach_type,
+ tr_link->link.entries[i].addr);
+ }
+}
+
+static const struct bpf_link_ops bpf_tracing_multi_link_lops = {
+ .release = bpf_tracing_multi_link_release,
+ .dealloc = bpf_tracing_multi_link_dealloc,
+ .show_fdinfo = bpf_tracing_multi_link_show_fdinfo,
+};
+
+#define MAX_TRACING_MULTI_CNT 102400
+
+static int bpf_tracing_get_target(u32 fd, struct bpf_prog **tgt_prog,
+ struct btf **tgt_btf)
+{
+ struct bpf_prog *prog = NULL;
+ struct btf *btf = NULL;
+ int err = 0;
+
+ if (fd) {
+ prog = bpf_prog_get(fd);
+ if (!IS_ERR(prog))
+ goto found;
+
+ prog = NULL;
+ /* "fd" is the fd of the kernel module BTF */
+ btf = btf_get_by_fd(fd);
+ if (IS_ERR(btf)) {
+ err = PTR_ERR(btf);
+ goto err;
+ }
+ if (!btf_is_kernel(btf)) {
+ btf_put(btf);
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+ } else {
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf)) {
+ err = PTR_ERR(btf);
+ goto err;
+ }
+ if (!btf) {
+ err = -EINVAL;
+ goto err;
+ }
+ btf_get(btf);
+ }
+found:
+ *tgt_prog = prog;
+ *tgt_btf = btf;
+ return 0;
+err:
+ *tgt_prog = NULL;
+ *tgt_btf = NULL;
+ return err;
+}
+
+static int bpf_tracing_multi_link_check(const union bpf_attr *attr, u32 **btf_ids,
+ u32 **tgt_fds, u64 **cookies,
+ u32 cnt)
+{
+ void __user *ubtf_ids;
+ void __user *utgt_fds;
+ void __user *ucookies;
+ void *tmp;
+ int i;
+
+ if (!cnt)
+ return -EINVAL;
+
+ if (cnt > MAX_TRACING_MULTI_CNT)
+ return -E2BIG;
+
+ ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies);
+ if (ucookies) {
+ tmp = kvmalloc_array(cnt, sizeof(**cookies), GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ *cookies = tmp;
+ if (copy_from_user(tmp, ucookies, cnt * sizeof(**cookies)))
+ return -EFAULT;
+ }
+
+ utgt_fds = u64_to_user_ptr(attr->link_create.tracing_multi.tgt_fds);
+ if (utgt_fds) {
+ tmp = kvmalloc_array(cnt, sizeof(**tgt_fds), GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ *tgt_fds = tmp;
+ if (copy_from_user(tmp, utgt_fds, cnt * sizeof(**tgt_fds)))
+ return -EFAULT;
+ }
+
+ ubtf_ids = u64_to_user_ptr(attr->link_create.tracing_multi.btf_ids);
+ if (!ubtf_ids)
+ return -EINVAL;
+
+ tmp = kvmalloc_array(cnt, sizeof(**btf_ids), GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ *btf_ids = tmp;
+ if (copy_from_user(tmp, ubtf_ids, cnt * sizeof(**btf_ids)))
+ return -EFAULT;
+
+ for (i = 0; i < cnt; i++) {
+ if (!(*btf_ids)[i])
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void bpf_tracing_multi_link_ptr_fill(struct bpf_tracing_multi_link *link,
+ struct ptr_array *mods,
+ struct ptr_array *btfs)
+{
+ link->mods = (struct module **) mods->ptrs;
+ link->mods_cnt = mods->cnt;
+ link->tgt_btfs = (struct btf **) btfs->ptrs;
+ link->btf_cnt = btfs->cnt;
+}
+
+static int bpf_tracing_prog_attach_multi(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ struct bpf_tracing_multi_link *link = NULL;
+ u32 cnt, *btf_ids = NULL, *tgt_fds = NULL;
+ struct bpf_link_primer link_primer;
+ struct ptr_array btf_array = { };
+ struct ptr_array mod_array = { };
+ u64 *cookies = NULL;
+ int err = 0, i;
+
+ if ((prog->expected_attach_type != BPF_TRACE_FENTRY_MULTI &&
+ prog->expected_attach_type != BPF_TRACE_FEXIT_MULTI &&
+ prog->expected_attach_type != BPF_MODIFY_RETURN_MULTI) ||
+ prog->type != BPF_PROG_TYPE_TRACING)
+ return -EINVAL;
+
+ cnt = attr->link_create.tracing_multi.cnt;
+ err = bpf_tracing_multi_link_check(attr, &btf_ids, &tgt_fds, &cookies,
+ cnt);
+ if (err)
+ goto err_out;
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ link->link.entries = kzalloc(sizeof(*link->link.entries) * cnt,
+ GFP_USER);
+ if (!link->link.entries) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING_MULTI,
+ &bpf_tracing_multi_link_lops, prog);
+ link->attach_type = prog->expected_attach_type;
+
+ mutex_lock(&prog->aux->dst_mutex);
+
+ for (i = 0; i < cnt; i++) {
+ struct bpf_attach_target_info tgt_info = {};
+ struct bpf_gtramp_link_entry *entry;
+ struct bpf_prog *tgt_prog = NULL;
+ u32 tgt_fd, btf_id = btf_ids[i];
+ struct btf *tgt_btf = NULL;
+ struct module *mod = NULL;
+ int nr_regs;
+
+ entry = &link->link.entries[i];
+ tgt_fd = tgt_fds ? tgt_fds[i] : 0;
+ err = bpf_tracing_get_target(tgt_fd, &tgt_prog, &tgt_btf);
+ if (err)
+ goto err_out_unlock;
+
+ if (tgt_prog) {
+ /* the global trampoline link is ftrace based, bpf2bpf
+ * is not supported for now.
+ */
+ bpf_prog_put(tgt_prog);
+ err = -EOPNOTSUPP;
+ goto err_out_unlock;
+ }
+
+ if (tgt_btf) {
+ err = bpf_try_add_ptr(&btf_array, tgt_btf);
+ if (err) {
+ btf_put(tgt_btf);
+ if (err != -EEXIST)
+ goto err_out_unlock;
+ }
+ }
+
+ prog->aux->attach_tracing_prog = tgt_prog &&
+ tgt_prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->type == BPF_PROG_TYPE_TRACING;
+
+ err = bpf_check_attach_target(NULL, prog, tgt_prog, tgt_btf,
+ btf_id, &tgt_info);
+ if (err)
+ goto err_out_unlock;
+
+ nr_regs = arch_bpf_get_regs_nr(&tgt_info.fmodel);
+ if (nr_regs < 0) {
+ err = nr_regs;
+ goto err_out_unlock;
+ }
+
+ mod = tgt_info.tgt_mod;
+ if (mod) {
+ err = bpf_try_add_ptr(&mod_array, mod);
+ if (err) {
+ module_put(mod);
+ if (err != -EEXIST)
+ goto err_out_unlock;
+ }
+ }
+
+ err = bpf_tracing_check_multi(prog, tgt_prog, tgt_btf,
+ tgt_info.tgt_type);
+ if (err)
+ goto err_out_unlock;
+
+ entry->cookie = cookies ? cookies[i] : 0;
+ entry->addr = (void *)tgt_info.tgt_addr;
+ entry->tgt_prog = tgt_prog;
+ entry->attach_btf = tgt_btf;
+ entry->btf_id = btf_id;
+ entry->nr_args = nr_regs;
+
+ link->link.entry_cnt++;
+ }
+
+ err = bpf_gtrampoline_link_prog(&link->link);
+ if (err)
+ goto err_out_unlock;
+
+ err = bpf_link_prime(&link->link.link, &link_primer);
+ if (err) {
+ bpf_gtrampoline_unlink_prog(&link->link);
+ goto err_out_unlock;
+ }
+
+ bpf_tracing_multi_link_ptr_fill(link, &mod_array, &btf_array);
+ mutex_unlock(&prog->aux->dst_mutex);
+
+ kfree(btf_ids);
+ kfree(tgt_fds);
+ kfree(cookies);
+ return bpf_link_settle(&link_primer);
+err_out_unlock:
+ bpf_tracing_multi_link_ptr_fill(link, &mod_array, &btf_array);
+ __bpf_tracing_multi_link_release(link);
+ mutex_unlock(&prog->aux->dst_mutex);
+err_out:
+ kfree(btf_ids);
+ kfree(tgt_fds);
+ kfree(cookies);
+ kfree(link);
+ return err;
+}
+
static void bpf_raw_tp_link_release(struct bpf_link *link)
{
struct bpf_raw_tp_link *raw_tp =
@@ -4259,6 +4605,9 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ case BPF_MODIFY_RETURN_MULTI:
return BPF_PROG_TYPE_TRACING;
case BPF_LSM_MAC:
return BPF_PROG_TYPE_LSM;
@@ -5581,6 +5930,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_iter_link_attach(attr, uattr, prog);
else if (prog->expected_attach_type == BPF_LSM_CGROUP)
ret = cgroup_bpf_link_attach(attr, prog);
+ else if (prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN_MULTI)
+ ret = bpf_tracing_prog_attach_multi(attr, prog);
else
ret = bpf_tracing_prog_attach(prog,
attr->link_create.target_fd,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 8fcb0352f36e..07986669ada0 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -117,7 +117,9 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
return (ptype == BPF_PROG_TYPE_TRACING &&
(eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_MODIFY_RETURN)) ||
+ eatype == BPF_MODIFY_RETURN ||
+ eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI ||
+ eatype == BPF_MODIFY_RETURN_MULTI)) ||
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
@@ -516,10 +518,13 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
{
switch (prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FENTRY_MULTI:
return BPF_TRAMP_FENTRY;
case BPF_MODIFY_RETURN:
+ case BPF_MODIFY_RETURN_MULTI:
return BPF_TRAMP_MODIFY_RETURN;
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FEXIT_MULTI:
return BPF_TRAMP_FEXIT;
case BPF_LSM_MAC:
if (!prog->aux->attach_func_proto->type)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 86a64d843465..a44e1fed3fa1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17103,10 +17103,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
switch (env->prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
+ case BPF_MODIFY_RETURN_MULTI:
return 0;
case BPF_TRACE_ITER:
break;
@@ -22632,7 +22635,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_ret) {
if (eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_MODIFY_RETURN) {
+ eatype == BPF_MODIFY_RETURN ||
+ eatype == BPF_TRACE_FEXIT_MULTI ||
+ eatype == BPF_MODIFY_RETURN_MULTI) {
/* Load nr_args from ctx - 8 */
insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
@@ -23619,7 +23624,9 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
prog_extension &&
(tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
- tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI)) {
/* Program extensions can extend all program types
* except fentry/fexit. The reason is the following.
* The fentry/fexit programs are used for performance
@@ -23718,6 +23725,9 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
case BPF_LSM_CGROUP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN_MULTI:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
if (!btf_type_is_func(t)) {
bpf_log(log, "attach_btf_id %u is not a function\n",
btf_id);
@@ -23803,7 +23813,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "%s is not sleepable\n", tname);
return ret;
}
- } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
+ } else if (prog->expected_attach_type == BPF_MODIFY_RETURN ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN_MULTI) {
if (tgt_prog) {
module_put(mod);
bpf_log(log, "can't modify return codes of BPF programs\n");
@@ -23856,6 +23867,9 @@ static bool can_be_sleepable(struct bpf_prog *prog)
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
case BPF_TRACE_ITER:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ case BPF_MODIFY_RETURN_MULTI:
return true;
default:
return false;
@@ -23930,6 +23944,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return bpf_iter_prog_supported(prog);
}
+ if (prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN_MULTI)
+ return 0;
+
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr)
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 9728dbd4c66c..a5e5094a5189 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -696,6 +696,8 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
switch (prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
if (bpf_fentry_test1(1) != 2 ||
bpf_fentry_test2(2, 3) != 5 ||
bpf_fentry_test3(4, 5, 6) != 15 ||
@@ -709,6 +711,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
goto out;
break;
case BPF_MODIFY_RETURN:
+ case BPF_MODIFY_RETURN_MULTI:
ret = bpf_modify_return_test(1, &b);
if (b != 2)
side_effect++;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 2e538399757f..c5b1fd714b58 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -369,6 +369,8 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
return true;
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
strlen("bpf_sk_storage"));
default:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 719ba230032f..a143a64f69ae 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1120,6 +1120,9 @@ enum bpf_attach_type {
BPF_NETKIT_PEER,
BPF_TRACE_KPROBE_SESSION,
BPF_TRACE_UPROBE_SESSION,
+ BPF_TRACE_FENTRY_MULTI,
+ BPF_TRACE_FEXIT_MULTI,
+ BPF_MODIFY_RETURN_MULTI,
__MAX_BPF_ATTACH_TYPE
};
@@ -1144,6 +1147,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_UPROBE_MULTI = 12,
BPF_LINK_TYPE_NETKIT = 13,
BPF_LINK_TYPE_SOCKMAP = 14,
+ BPF_LINK_TYPE_TRACING_MULTI = 15,
__MAX_BPF_LINK_TYPE,
};
@@ -1765,6 +1769,12 @@ union bpf_attr {
*/
__u64 cookie;
} tracing;
+ struct {
+ __u32 cnt;
+ __aligned_u64 tgt_fds;
+ __aligned_u64 btf_ids;
+ __aligned_u64 cookies;
+ } tracing_multi;
struct {
__u32 pf;
__u32 hooknum;
--
2.39.5
* [PATCH bpf-next v2 16/18] selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (3 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 11/18] bpf: tracing: add multi-link support Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
4 siblings, 0 replies; 32+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Mykola Lysenko, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, Nick Desaulniers,
Bill Wendling, Justin Stitt, linux-kselftest, linux-kernel,
netdev, llvm
Sometimes we need to get all the kernel functions that can be traced, so
move get_syms() and get_addrs() from kprobe_multi_test.c to
trace_helpers.c and rename them to bpf_get_ksyms() and bpf_get_addrs().
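With that, other selftests can reuse the helpers directly, e.g. (a
minimal sketch, error handling trimmed):

	char **syms = NULL;
	size_t cnt = 0;

	if (!ASSERT_OK(bpf_get_ksyms(&syms, &cnt, true /* kernel */),
		       "bpf_get_ksyms"))
		return;
	/* use syms[0..cnt-1], then free(syms); the strings themselves are
	 * owned by the loaded kallsyms cache.
	 */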
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
.../bpf/prog_tests/kprobe_multi_test.c | 220 +-----------------
tools/testing/selftests/bpf/trace_helpers.c | 214 +++++++++++++++++
tools/testing/selftests/bpf/trace_helpers.h | 3 +
3 files changed, 220 insertions(+), 217 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
index e19ef509ebf8..171706e78da8 100644
--- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
@@ -422,220 +422,6 @@ static void test_unique_match(void)
kprobe_multi__destroy(skel);
}
-static size_t symbol_hash(long key, void *ctx __maybe_unused)
-{
- return str_hash((const char *) key);
-}
-
-static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused)
-{
- return strcmp((const char *) key1, (const char *) key2) == 0;
-}
-
-static bool is_invalid_entry(char *buf, bool kernel)
-{
- if (kernel && strchr(buf, '['))
- return true;
- if (!kernel && !strchr(buf, '['))
- return true;
- return false;
-}
-
-static bool skip_entry(char *name)
-{
- /*
- * We attach to almost all kernel functions and some of them
- * will cause 'suspicious RCU usage' when fprobe is attached
- * to them. Filter out the current culprits - arch_cpu_idle
- * default_idle and rcu_* functions.
- */
- if (!strcmp(name, "arch_cpu_idle"))
- return true;
- if (!strcmp(name, "default_idle"))
- return true;
- if (!strncmp(name, "rcu_", 4))
- return true;
- if (!strcmp(name, "bpf_dispatcher_xdp_func"))
- return true;
- if (!strncmp(name, "__ftrace_invalid_address__",
- sizeof("__ftrace_invalid_address__") - 1))
- return true;
- return false;
-}
-
-/* Do comparision by ignoring '.llvm.<hash>' suffixes. */
-static int compare_name(const char *name1, const char *name2)
-{
- const char *res1, *res2;
- int len1, len2;
-
- res1 = strstr(name1, ".llvm.");
- res2 = strstr(name2, ".llvm.");
- len1 = res1 ? res1 - name1 : strlen(name1);
- len2 = res2 ? res2 - name2 : strlen(name2);
-
- if (len1 == len2)
- return strncmp(name1, name2, len1);
- if (len1 < len2)
- return strncmp(name1, name2, len1) <= 0 ? -1 : 1;
- return strncmp(name1, name2, len2) >= 0 ? 1 : -1;
-}
-
-static int load_kallsyms_compare(const void *p1, const void *p2)
-{
- return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name);
-}
-
-static int search_kallsyms_compare(const void *p1, const struct ksym *p2)
-{
- return compare_name(p1, p2->name);
-}
-
-static int get_syms(char ***symsp, size_t *cntp, bool kernel)
-{
- size_t cap = 0, cnt = 0;
- char *name = NULL, *ksym_name, **syms = NULL;
- struct hashmap *map;
- struct ksyms *ksyms;
- struct ksym *ks;
- char buf[256];
- FILE *f;
- int err = 0;
-
- ksyms = load_kallsyms_custom_local(load_kallsyms_compare);
- if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local"))
- return -EINVAL;
-
- /*
- * The available_filter_functions contains many duplicates,
- * but other than that all symbols are usable in kprobe multi
- * interface.
- * Filtering out duplicates by using hashmap__add, which won't
- * add existing entry.
- */
-
- if (access("/sys/kernel/tracing/trace", F_OK) == 0)
- f = fopen("/sys/kernel/tracing/available_filter_functions", "r");
- else
- f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r");
-
- if (!f)
- return -EINVAL;
-
- map = hashmap__new(symbol_hash, symbol_equal, NULL);
- if (IS_ERR(map)) {
- err = libbpf_get_error(map);
- goto error;
- }
-
- while (fgets(buf, sizeof(buf), f)) {
- if (is_invalid_entry(buf, kernel))
- continue;
-
- free(name);
- if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
- continue;
- if (skip_entry(name))
- continue;
-
- ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
- if (!ks) {
- err = -EINVAL;
- goto error;
- }
-
- ksym_name = ks->name;
- err = hashmap__add(map, ksym_name, 0);
- if (err == -EEXIST) {
- err = 0;
- continue;
- }
- if (err)
- goto error;
-
- err = libbpf_ensure_mem((void **) &syms, &cap,
- sizeof(*syms), cnt + 1);
- if (err)
- goto error;
-
- syms[cnt++] = ksym_name;
- }
-
- *symsp = syms;
- *cntp = cnt;
-
-error:
- free(name);
- fclose(f);
- hashmap__free(map);
- if (err)
- free(syms);
- return err;
-}
-
-static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
-{
- unsigned long *addr, *addrs, *tmp_addrs;
- int err = 0, max_cnt, inc_cnt;
- char *name = NULL;
- size_t cnt = 0;
- char buf[256];
- FILE *f;
-
- if (access("/sys/kernel/tracing/trace", F_OK) == 0)
- f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r");
- else
- f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r");
-
- if (!f)
- return -ENOENT;
-
- /* In my local setup, the number of entries is 50k+ so Let us initially
- * allocate space to hold 64k entries. If 64k is not enough, incrementally
- * increase 1k each time.
- */
- max_cnt = 65536;
- inc_cnt = 1024;
- addrs = malloc(max_cnt * sizeof(long));
- if (addrs == NULL) {
- err = -ENOMEM;
- goto error;
- }
-
- while (fgets(buf, sizeof(buf), f)) {
- if (is_invalid_entry(buf, kernel))
- continue;
-
- free(name);
- if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
- continue;
- if (skip_entry(name))
- continue;
-
- if (cnt == max_cnt) {
- max_cnt += inc_cnt;
- tmp_addrs = realloc(addrs, max_cnt);
- if (!tmp_addrs) {
- err = -ENOMEM;
- goto error;
- }
- addrs = tmp_addrs;
- }
-
- addrs[cnt++] = (unsigned long)addr;
- }
-
- *addrsp = addrs;
- *cntp = cnt;
-
-error:
- free(name);
- fclose(f);
- if (err)
- free(addrs);
- return err;
-}
-
static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts)
{
long attach_start_ns, attach_end_ns;
@@ -670,7 +456,7 @@ static void test_kprobe_multi_bench_attach(bool kernel)
char **syms = NULL;
size_t cnt = 0;
- if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms"))
+ if (!ASSERT_OK(bpf_get_ksyms(&syms, &cnt, kernel), "bpf_get_ksyms"))
return;
skel = kprobe_multi_empty__open_and_load();
@@ -696,13 +482,13 @@ static void test_kprobe_multi_bench_attach_addr(bool kernel)
size_t cnt = 0;
int err;
- err = get_addrs(&addrs, &cnt, kernel);
+ err = bpf_get_addrs(&addrs, &cnt, kernel);
if (err == -ENOENT) {
test__skip();
return;
}
- if (!ASSERT_OK(err, "get_addrs"))
+ if (!ASSERT_OK(err, "bpf_get_addrs"))
return;
skel = kprobe_multi_empty__open_and_load();
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 81943c6254e6..d24baf244d1f 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -17,6 +17,7 @@
#include <linux/limits.h>
#include <libelf.h>
#include <gelf.h>
+#include "bpf/hashmap.h"
#include "bpf/libbpf_internal.h"
#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe"
@@ -519,3 +520,216 @@ void read_trace_pipe(void)
{
read_trace_pipe_iter(trace_pipe_cb, NULL, 0);
}
+
+static size_t symbol_hash(long key, void *ctx __maybe_unused)
+{
+ return str_hash((const char *) key);
+}
+
+static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+ return strcmp((const char *) key1, (const char *) key2) == 0;
+}
+
+static bool is_invalid_entry(char *buf, bool kernel)
+{
+ if (kernel && strchr(buf, '['))
+ return true;
+ if (!kernel && !strchr(buf, '['))
+ return true;
+ return false;
+}
+
+static bool skip_entry(char *name)
+{
+ /*
+ * We attach to almost all kernel functions and some of them
+ * will cause 'suspicious RCU usage' when fprobe is attached
+ * to them. Filter out the current culprits - arch_cpu_idle
+ * default_idle and rcu_* functions.
+ */
+ if (!strcmp(name, "arch_cpu_idle"))
+ return true;
+ if (!strcmp(name, "default_idle"))
+ return true;
+ if (!strncmp(name, "rcu_", 4))
+ return true;
+ if (!strcmp(name, "bpf_dispatcher_xdp_func"))
+ return true;
+ if (!strncmp(name, "__ftrace_invalid_address__",
+ sizeof("__ftrace_invalid_address__") - 1))
+ return true;
+ return false;
+}
+
+/* Do comparison by ignoring '.llvm.<hash>' suffixes. */
+static int compare_name(const char *name1, const char *name2)
+{
+ const char *res1, *res2;
+ int len1, len2;
+
+ res1 = strstr(name1, ".llvm.");
+ res2 = strstr(name2, ".llvm.");
+ len1 = res1 ? res1 - name1 : strlen(name1);
+ len2 = res2 ? res2 - name2 : strlen(name2);
+
+ if (len1 == len2)
+ return strncmp(name1, name2, len1);
+ if (len1 < len2)
+ return strncmp(name1, name2, len1) <= 0 ? -1 : 1;
+ return strncmp(name1, name2, len2) >= 0 ? 1 : -1;
+}
+
+static int load_kallsyms_compare(const void *p1, const void *p2)
+{
+ return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name);
+}
+
+static int search_kallsyms_compare(const void *p1, const struct ksym *p2)
+{
+ return compare_name(p1, p2->name);
+}
+
+int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel)
+{
+ size_t cap = 0, cnt = 0;
+ char *name = NULL, *ksym_name, **syms = NULL;
+ struct hashmap *map;
+ struct ksyms *ksyms;
+ struct ksym *ks;
+ char buf[256];
+ FILE *f;
+ int err = 0;
+
+ ksyms = load_kallsyms_custom_local(load_kallsyms_compare);
+ if (!ksyms)
+ return -EINVAL;
+
+ /*
+ * The available_filter_functions contains many duplicates,
+ * but other than that all symbols are usable to trace.
+ * Filtering out duplicates by using hashmap__add, which won't
+ * add existing entry.
+ */
+
+ if (access("/sys/kernel/tracing/trace", F_OK) == 0)
+ f = fopen("/sys/kernel/tracing/available_filter_functions", "r");
+ else
+ f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r");
+
+ if (!f)
+ return -EINVAL;
+
+ map = hashmap__new(symbol_hash, symbol_equal, NULL);
+ if (IS_ERR(map)) {
+ err = libbpf_get_error(map);
+ goto error;
+ }
+
+ while (fgets(buf, sizeof(buf), f)) {
+ if (is_invalid_entry(buf, kernel))
+ continue;
+
+ free(name);
+ if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
+ continue;
+ if (skip_entry(name))
+ continue;
+
+ ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
+ if (!ks) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ ksym_name = ks->name;
+ err = hashmap__add(map, ksym_name, 0);
+ if (err == -EEXIST) {
+ err = 0;
+ continue;
+ }
+ if (err)
+ goto error;
+
+ err = libbpf_ensure_mem((void **) &syms, &cap,
+ sizeof(*syms), cnt + 1);
+ if (err)
+ goto error;
+
+ syms[cnt++] = ksym_name;
+ }
+
+ *symsp = syms;
+ *cntp = cnt;
+
+error:
+ free(name);
+ fclose(f);
+ hashmap__free(map);
+ if (err)
+ free(syms);
+ return err;
+}
+
+int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
+{
+ unsigned long *addr, *addrs, *tmp_addrs;
+ int err = 0, max_cnt, inc_cnt;
+ char *name = NULL;
+ size_t cnt = 0;
+ char buf[256];
+ FILE *f;
+
+ if (access("/sys/kernel/tracing/trace", F_OK) == 0)
+ f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r");
+ else
+ f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r");
+
+ if (!f)
+ return -ENOENT;
+
+ /* In my local setup, the number of entries is 50k+ so Let us initially
+ * allocate space to hold 64k entries. If 64k is not enough, incrementally
+ * increase 1k each time.
+ */
+ max_cnt = 65536;
+ inc_cnt = 1024;
+ addrs = malloc(max_cnt * sizeof(long));
+ if (addrs == NULL) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ while (fgets(buf, sizeof(buf), f)) {
+ if (is_invalid_entry(buf, kernel))
+ continue;
+
+ free(name);
+ if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
+ continue;
+ if (skip_entry(name))
+ continue;
+
+ if (cnt == max_cnt) {
+ max_cnt += inc_cnt;
+ tmp_addrs = realloc(addrs, max_cnt * sizeof(long));
+ if (!tmp_addrs) {
+ err = -ENOMEM;
+ goto error;
+ }
+ addrs = tmp_addrs;
+ }
+
+ addrs[cnt++] = (unsigned long)addr;
+ }
+
+ *addrsp = addrs;
+ *cntp = cnt;
+
+error:
+ free(name);
+ fclose(f);
+ if (err)
+ free(addrs);
+ return err;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 2ce873c9f9aa..9437bdd4afa5 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -41,4 +41,7 @@ ssize_t get_rel_offset(uintptr_t addr);
int read_build_id(const char *path, char *build_id, size_t size);
+int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel);
+int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel);
+
#endif
--
2.39.5
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args
2025-07-03 12:15 ` [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args Menglong Dong
@ 2025-07-14 22:07 ` Andrii Nakryiko
2025-07-14 23:45 ` Menglong Dong
0 siblings, 1 reply; 32+ messages in thread
From: Andrii Nakryiko @ 2025-07-14 22:07 UTC (permalink / raw)
To: Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
John Fastabend, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo,
Simon Horman, linux-kernel, netdev
On Thu, Jul 3, 2025 at 5:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> which is used to record the accessed index of the function args in
> btf_ctx_access().
Do we need to bother giving access to arguments through direct ctx[i]
access for these multi-fentry/fexit programs? We have
bpf_get_func_arg_cnt() and bpf_get_func_arg() which can be used to get
any given argument at runtime.
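For reference, a minimal sketch of runtime argument access with those existing helpers; the attach target and program shape below are only an illustration, not part of this series:
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
SEC("fentry/do_unlinkat")          /* example target only */
int BPF_PROG(arg_probe)
{
        __u64 nr_args = bpf_get_func_arg_cnt(ctx);
        __u64 arg0;
        /* bpf_get_func_arg() returns -EINVAL if arg n does not exist */
        if (bpf_get_func_arg(ctx, 0, &arg0))
                return 0;
        bpf_printk("nr_args=%llu arg0=0x%llx", nr_args, arg0);
        return 0;
}
char _license[] SEC("license") = "GPL";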
>
> Meanwhile, we add the function btf_check_func_part_match() to compare the
> accessed function args of two function prototype. This function will be
> used in the following commit.
>
> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> ---
> include/linux/bpf.h | 4 ++
> include/linux/btf.h | 3 +-
> kernel/bpf/btf.c | 108 +++++++++++++++++++++++++++++++++++++++++-
> net/sched/bpf_qdisc.c | 2 +-
> 4 files changed, 113 insertions(+), 4 deletions(-)
>
[...]
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args
2025-07-14 22:07 ` Andrii Nakryiko
@ 2025-07-14 23:45 ` Menglong Dong
2025-07-15 17:11 ` Andrii Nakryiko
0 siblings, 1 reply; 32+ messages in thread
From: Menglong Dong @ 2025-07-14 23:45 UTC (permalink / raw)
To: Andrii Nakryiko, Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
John Fastabend, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo,
Simon Horman, linux-kernel, netdev
On 2025/7/15 06:07, Andrii Nakryiko wrote:
> On Thu, Jul 3, 2025 at 5:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>> In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
>> which is used to record the accessed index of the function args in
>> btf_ctx_access().
> Do we need to bother giving access to arguments through direct ctx[i]
> access for these multi-fentry/fexit programs? We have
> bpf_get_func_arg_cnt() and bpf_get_func_arg() which can be used to get
> any given argument at runtime.
Hi Andrii. This commit is not for that purpose. We record all the accessed
args in bpf_prog_aux->accessed_args. And when we attach the tracing-multi
prog to the kernel functions, we check that the accessed arguments are
consistent across all the target functions.
The bpf_prog_aux->accessed_args will be used in
https://lore.kernel.org/bpf/20250703121521.1874196-12-dongml2@chinatelecom.cn/
in bpf_tracing_check_multi() to do such checking.
With such checking, the target functions don't need to have
the same prototype, which makes tracing-multi more flexible.
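Purely as an illustration of that check (not the actual btf_check_func_part_match() from the series): for every argument index the prog accesses, the argument must exist, and have a compatible type, in every target prototype. Assuming accessed_args is a bitmap of accessed indexes, a sketch could be:
/* illustrative only; a real check also compares the BTF types of arg i */
static bool accessed_args_compatible(u64 accessed_args, int nr_args1, int nr_args2)
{
        int i;

        for (i = 0; i < 64; i++) {
                if (!(accessed_args & BIT_ULL(i)))
                        continue;
                if (i >= nr_args1 || i >= nr_args2)
                        return false;
        }
        return true;
}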
Thanks!
Menglong Dong
>
>> Meanwhile, we add the function btf_check_func_part_match() to compare the
>> accessed function args of two function prototype. This function will be
>> used in the following commit.
>>
>> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
>> ---
>> include/linux/bpf.h | 4 ++
>> include/linux/btf.h | 3 +-
>> kernel/bpf/btf.c | 108 +++++++++++++++++++++++++++++++++++++++++-
>> net/sched/bpf_qdisc.c | 2 +-
>> 4 files changed, 113 insertions(+), 4 deletions(-)
>>
> [...]
>
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-03 12:15 ` [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline Menglong Dong
@ 2025-07-15 2:25 ` Alexei Starovoitov
2025-07-15 8:36 ` Menglong Dong
0 siblings, 1 reply; 32+ messages in thread
From: Alexei Starovoitov @ 2025-07-15 2:25 UTC (permalink / raw)
To: Menglong Dong
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, H. Peter Anvin,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On Thu, Jul 3, 2025 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> +static __always_inline void
> +do_origin_call(unsigned long *args, unsigned long *ip, int nr_args)
> +{
> + /* Following code will be optimized by the compiler, as nr_args
> + * is a const, and there will be no condition here.
> + */
> + if (nr_args == 0) {
> + asm volatile(
> + RESTORE_ORIGIN_0 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + :
> + );
> + } else if (nr_args == 1) {
> + asm volatile(
> + RESTORE_ORIGIN_1 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi"
> + );
> + } else if (nr_args == 2) {
> + asm volatile(
> + RESTORE_ORIGIN_2 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi", "rsi"
> + );
> + } else if (nr_args == 3) {
> + asm volatile(
> + RESTORE_ORIGIN_3 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi", "rsi", "rdx"
> + );
> + } else if (nr_args == 4) {
> + asm volatile(
> + RESTORE_ORIGIN_4 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi", "rsi", "rdx", "rcx"
> + );
> + } else if (nr_args == 5) {
> + asm volatile(
> + RESTORE_ORIGIN_5 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi", "rsi", "rdx", "rcx", "r8"
> + );
> + } else if (nr_args == 6) {
> + asm volatile(
> + RESTORE_ORIGIN_6 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi", "rsi", "rdx", "rcx", "r8", "r9"
> + );
> + }
> +}
What is the performance difference between 0-6 variants?
I would think save/restore of regs shouldn't be that expensive.
bpf trampoline saves only what's necessary because it can do
this micro optimization, but for this one, I think, doing
_one_ global trampoline that covers all cases will simplify the code
a lot, but please benchmark the difference to understand
the trade-off.
The major simplification will be due to skipping nr_args.
There won't be a need to do btf model and count the args.
Just do one trampoline for them all.
Also funcs with 7+ arguments need to be thought through
from the start.
I think it's ok trade-off if we allow global trampoline
to be safe to attach to a function with 7+ args (and
it will not mess with the stack), but bpf prog can only
access up to 6 args. The kfuncs to access arg 7 might be
more complex and slower. It's ok trade off.
> +
> +static __always_inline notrace void
> +run_tramp_prog(struct kfunc_md_tramp_prog *tramp_prog,
> + struct bpf_tramp_run_ctx *run_ctx, unsigned long *args)
> +{
> + struct bpf_prog *prog;
> + u64 start_time;
> +
> + while (tramp_prog) {
> + prog = tramp_prog->prog;
> + run_ctx->bpf_cookie = tramp_prog->cookie;
> + start_time = bpf_gtramp_enter(prog, run_ctx);
> +
> + if (likely(start_time)) {
> + asm volatile(
> + CALL_NOSPEC "\n"
> + : : [thunk_target]"r"(prog->bpf_func), [args]"D"(args)
> + );
Why this cannot be "call *(prog->bpf_func)" ?
> + }
> +
> + bpf_gtramp_exit(prog, start_time, run_ctx);
> + tramp_prog = tramp_prog->next;
> + }
> +}
> +
> +static __always_inline notrace int
> +bpf_global_caller_run(unsigned long *args, unsigned long *ip, int nr_args)
Pls share top 10 from "perf report" while running the bench.
I'm curious about what's hot.
Last time I benchmarked fentry/fexit, migrate_disable/enable were
among the hottest functions. I suspect it's the case here as well.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 2:25 ` Alexei Starovoitov
@ 2025-07-15 8:36 ` Menglong Dong
2025-07-15 9:30 ` Menglong Dong
2025-07-15 16:35 ` Alexei Starovoitov
0 siblings, 2 replies; 32+ messages in thread
From: Menglong Dong @ 2025-07-15 8:36 UTC (permalink / raw)
To: Alexei Starovoitov, Menglong Dong
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, H. Peter Anvin,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On 7/15/25 10:25, Alexei Starovoitov wrote:
> On Thu, Jul 3, 2025 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>> +static __always_inline void
>> +do_origin_call(unsigned long *args, unsigned long *ip, int nr_args)
>> +{
>> + /* Following code will be optimized by the compiler, as nr_args
>> + * is a const, and there will be no condition here.
>> + */
>> + if (nr_args == 0) {
>> + asm volatile(
>> + RESTORE_ORIGIN_0 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + :
>> + );
>> + } else if (nr_args == 1) {
>> + asm volatile(
>> + RESTORE_ORIGIN_1 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi"
>> + );
>> + } else if (nr_args == 2) {
>> + asm volatile(
>> + RESTORE_ORIGIN_2 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi", "rsi"
>> + );
>> + } else if (nr_args == 3) {
>> + asm volatile(
>> + RESTORE_ORIGIN_3 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi", "rsi", "rdx"
>> + );
>> + } else if (nr_args == 4) {
>> + asm volatile(
>> + RESTORE_ORIGIN_4 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi", "rsi", "rdx", "rcx"
>> + );
>> + } else if (nr_args == 5) {
>> + asm volatile(
>> + RESTORE_ORIGIN_5 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi", "rsi", "rdx", "rcx", "r8"
>> + );
>> + } else if (nr_args == 6) {
>> + asm volatile(
>> + RESTORE_ORIGIN_6 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi", "rsi", "rdx", "rcx", "r8", "r9"
>> + );
>> + }
>> +}
> What is the performance difference between 0-6 variants?
> I would think save/restore of regs shouldn't be that expensive.
> bpf trampoline saves only what's necessary because it can do
> this micro optimization, but for this one, I think, doing
> _one_ global trampoline that covers all cases will simplify the code
> a lot, but please benchmark the difference to understand
> the trade-off.
According to my benchmark, the *5* variant has ~5% overhead for the
save/restore when compared with the *0* variant. The save/restore of regs
is fast, but it still needs 12 insns, which can produce ~6% overhead.
I think the performance is more important and we should keep this logic.
Should we? If you think do_origin_call() is not simple enough, we can
restore all 6 regs from the stack directly for the origin call, which
won't introduce too much overhead, and keep the save/restore logic.
What do you think?
>
> The major simplification will be due to skipping nr_args.
> There won't be a need to do btf model and count the args.
> Just do one trampoline for them all.
>
> Also funcs with 7+ arguments need to be thought through
> from the start.
In the current version, the attachment will be rejected if any of the
target functions has 7+ arguments.
> I think it's ok trade-off if we allow global trampoline
> to be safe to attach to a function with 7+ args (and
> it will not mess with the stack), but bpf prog can only
> access up to 6 args. The kfuncs to access arg 7 might be
> more complex and slower. It's ok trade off.
That's OK for fentry-multi, but we can't allow fexit-multi and
modify_return-multi to be attached to functions with 7+ args, as we need
to do the origin call, and we can't restore the arguments on the stack
for the origin call for now.
So for fentry-multi we can allow functions with 7+ args to be attached as
long as all the accessed arguments are in regs. And I think we need one
more patch to do the "all accessed arguments are in regs" checking, so
maybe we can put it in the next series, as the current series is already
a little complex :/
Anyway, I'll try to see if we can add this part in this series :)
>
>> +
>> +static __always_inline notrace void
>> +run_tramp_prog(struct kfunc_md_tramp_prog *tramp_prog,
>> + struct bpf_tramp_run_ctx *run_ctx, unsigned long *args)
>> +{
>> + struct bpf_prog *prog;
>> + u64 start_time;
>> +
>> + while (tramp_prog) {
>> + prog = tramp_prog->prog;
>> + run_ctx->bpf_cookie = tramp_prog->cookie;
>> + start_time = bpf_gtramp_enter(prog, run_ctx);
>> +
>> + if (likely(start_time)) {
>> + asm volatile(
>> + CALL_NOSPEC "\n"
>> + : : [thunk_target]"r"(prog->bpf_func), [args]"D"(args)
>> + );
> Why this cannot be "call *(prog->bpf_func)" ?
Do you mean "prog->bpf_func(args, NULL);"? In my previous testing, this
cause
bad performance, and I see others do the indirect call in this way. And
I just do
the benchmark again, it seems the performance is not affected in this
way anymore.
So I think I can replace it with "prog->bpf_func(args, NULL);" in the
next version.
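A minimal sketch of what the loop would look like with the plain C call, based on the run_tramp_prog() quoted above (bpf_gtramp_enter()/bpf_gtramp_exit() and struct kfunc_md_tramp_prog are from this series):
static __always_inline notrace void
run_tramp_prog(struct kfunc_md_tramp_prog *tramp_prog,
               struct bpf_tramp_run_ctx *run_ctx, unsigned long *args)
{
        struct bpf_prog *prog;
        u64 start_time;

        while (tramp_prog) {
                prog = tramp_prog->prog;
                run_ctx->bpf_cookie = tramp_prog->cookie;
                start_time = bpf_gtramp_enter(prog, run_ctx);

                /* plain indirect call instead of the CALL_NOSPEC asm */
                if (likely(start_time))
                        prog->bpf_func(args, NULL);

                bpf_gtramp_exit(prog, start_time, run_ctx);
                tramp_prog = tramp_prog->next;
        }
}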
>
>> + }
>> +
>> + bpf_gtramp_exit(prog, start_time, run_ctx);
>> + tramp_prog = tramp_prog->next;
>> + }
>> +}
>> +
>> +static __always_inline notrace int
>> +bpf_global_caller_run(unsigned long *args, unsigned long *ip, int nr_args)
> Pls share top 10 from "perf report" while running the bench.
> I'm curious about what's hot.
> Last time I benchmarked fentry/fexit migrate_disable/enable were
> one the hottest functions. I suspect it's the case here as well.
You are right, the migrate_disable/enable are the hottest functions in
both bpf trampoline and global trampoline. Following is the perf top
for fentry-multi:
 36.36%  bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi  [k] bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi
 20.54%  [kernel]    [k] migrate_enable
 19.35%  [kernel]    [k] bpf_global_caller_5_run
  6.52%  [kernel]    [k] bpf_global_caller_5
  3.58%  libc.so.6   [.] syscall
  2.88%  [kernel]    [k] entry_SYSCALL_64
  1.50%  [kernel]    [k] memchr_inv
  1.39%  [kernel]    [k] fput
  1.04%  [kernel]    [k] migrate_disable
  0.91%  [kernel]    [k] _copy_to_user
And I also did the testing for fentry:
 54.63%  bpf_prog_2dcccf652aac1793_bench_trigger_fentry  [k] bpf_prog_2dcccf652aac1793_bench_trigger_fentry
 10.43%  [kernel]    [k] migrate_enable
 10.07%  bpf_trampoline_6442517037  [k] bpf_trampoline_6442517037
  8.06%  [kernel]    [k] __bpf_prog_exit_recur
  4.11%  libc.so.6   [.] syscall
  2.15%  [kernel]    [k] entry_SYSCALL_64
  1.48%  [kernel]    [k] memchr_inv
  1.32%  [kernel]    [k] fput
  1.16%  [kernel]    [k] _copy_to_user
  0.73%  [kernel]    [k] bpf_prog_test_run_raw_tp
The migrate_enable/disable are used to do the recursive checking,
and I even wanted to perform recursive checks in the same way as
ftrace to eliminate this overhead :/
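For context, the recursion check that sits next to that migrate_disable()/enable() pair today is the per-prog, per-cpu counter in __bpf_prog_enter_recur(); paraphrasing kernel/bpf/trampoline.c:
static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog,
                                          struct bpf_tramp_run_ctx *run_ctx)
{
        rcu_read_lock();
        migrate_disable();              /* the hot spot in the profiles above */

        run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);

        /* per-prog, per-cpu recursion counter */
        if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
                bpf_prog_inc_misses_counter(prog);
                return 0;
        }
        return bpf_prog_start_time();
}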
>
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 8:36 ` Menglong Dong
@ 2025-07-15 9:30 ` Menglong Dong
2025-07-16 16:56 ` Inlining migrate_disable/enable. Was: " Alexei Starovoitov
2025-07-15 16:35 ` Alexei Starovoitov
1 sibling, 1 reply; 32+ messages in thread
From: Menglong Dong @ 2025-07-15 9:30 UTC (permalink / raw)
To: Alexei Starovoitov, Menglong Dong
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, H. Peter Anvin,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On 7/15/25 16:36, Menglong Dong wrote:
>
> On 7/15/25 10:25, Alexei Starovoitov wrote:
>> Pls share top 10 from "perf report" while running the bench.
>> I'm curious about what's hot.
>> Last time I benchmarked fentry/fexit migrate_disable/enable were
>> one the hottest functions. I suspect it's the case here as well.
>
>
> You are right, the migrate_disable/enable are the hottest functions in
> both bpf trampoline and global trampoline. Following is the perf top
> for fentry-multi:
> 36.36% bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi [k]
> bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi 20.54% [kernel]
> [k] migrate_enable 19.35% [kernel] [k] bpf_global_caller_5_run 6.52%
> [kernel] [k] bpf_global_caller_5 3.58% libc.so.6 [.] syscall 2.88%
> [kernel] [k] entry_SYSCALL_64 1.50% [kernel] [k] memchr_inv 1.39%
> [kernel] [k] fput 1.04% [kernel] [k] migrate_disable 0.91% [kernel]
> [k] _copy_to_user
>
> And I also did the testing for fentry:
>
> 54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k]
> bpf_prog_2dcccf652aac1793_bench_trigger_fentry
> 10.43% [kernel] [k] migrate_enable
> 10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
> 8.06% [kernel] [k] __bpf_prog_exit_recur 4.11% libc.so.6 [.] syscall
> 2.15% [kernel] [k] entry_SYSCALL_64 1.48% [kernel] [k] memchr_inv
> 1.32% [kernel] [k] fput 1.16% [kernel] [k] _copy_to_user 0.73%
> [kernel] [k] bpf_prog_test_run_raw_tp
> The migrate_enable/disable are used to do the recursive checking,
> and I even wanted to perform recursive checks in the same way as
> ftrace to eliminate this overhead :/
>
Sorry that I'm not familiar with Thunderbird yet, and the perf top
messed up. Following are the test results for fentry-multi:
36.36% bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi [k]
bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi
20.54% [kernel] [k] migrate_enable
19.35% [kernel] [k] bpf_global_caller_5_run
6.52% [kernel] [k] bpf_global_caller_5
3.58% libc.so.6 [.] syscall
2.88% [kernel] [k] entry_SYSCALL_64
1.50% [kernel] [k] memchr_inv
1.39% [kernel] [k] fput
1.04% [kernel] [k] migrate_disable
0.91% [kernel] [k] _copy_to_user
And I also did the testing for fentry:
54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k]
bpf_prog_2dcccf652aac1793_bench_trigger_fentry
10.43% [kernel] [k] migrate_enable
10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
8.06% [kernel] [k] __bpf_prog_exit_recur
4.11% libc.so.6 [.] syscall
2.15% [kernel] [k] entry_SYSCALL_64
1.48% [kernel] [k] memchr_inv
1.32% [kernel] [k] fput
1.16% [kernel] [k] _copy_to_user
0.73% [kernel] [k] bpf_prog_test_run_raw_tp
The migrate_enable/disable are used to do the recursive checking,
and I even wanted to perform recursive checks in the same way as
ftrace to eliminate this overhead :/
Thanks!
Menglong Dong
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 8:36 ` Menglong Dong
2025-07-15 9:30 ` Menglong Dong
@ 2025-07-15 16:35 ` Alexei Starovoitov
2025-07-16 13:05 ` Menglong Dong
2025-07-16 14:40 ` Menglong Dong
1 sibling, 2 replies; 32+ messages in thread
From: Alexei Starovoitov @ 2025-07-15 16:35 UTC (permalink / raw)
To: Menglong Dong
Cc: Menglong Dong, Steven Rostedt, Jiri Olsa, bpf, Menglong Dong,
H. Peter Anvin, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, LKML, Network Development
On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
>
>
> On 7/15/25 10:25, Alexei Starovoitov wrote:
> > On Thu, Jul 3, 2025 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >> +static __always_inline void
> >> +do_origin_call(unsigned long *args, unsigned long *ip, int nr_args)
> >> +{
> >> + /* Following code will be optimized by the compiler, as nr_args
> >> + * is a const, and there will be no condition here.
> >> + */
> >> + if (nr_args == 0) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_0 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + :
> >> + );
> >> + } else if (nr_args == 1) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_1 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi"
> >> + );
> >> + } else if (nr_args == 2) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_2 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi", "rsi"
> >> + );
> >> + } else if (nr_args == 3) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_3 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi", "rsi", "rdx"
> >> + );
> >> + } else if (nr_args == 4) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_4 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi", "rsi", "rdx", "rcx"
> >> + );
> >> + } else if (nr_args == 5) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_5 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi", "rsi", "rdx", "rcx", "r8"
> >> + );
> >> + } else if (nr_args == 6) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_6 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi", "rsi", "rdx", "rcx", "r8", "r9"
> >> + );
> >> + }
> >> +}
> > What is the performance difference between 0-6 variants?
> > I would think save/restore of regs shouldn't be that expensive.
> > bpf trampoline saves only what's necessary because it can do
> > this micro optimization, but for this one, I think, doing
> > _one_ global trampoline that covers all cases will simplify the code
> > a lot, but please benchmark the difference to understand
> > the trade-off.
>
> According to my benchmark, it has ~5% overhead to save/restore
> *5* variants when compared with *0* variant. The save/restore of regs
> is fast, but it still need 12 insn, which can produce ~6% overhead.
I think it's an ok trade off, because with one global trampoline
we do not need to call rhashtable lookup before entering bpf prog.
bpf prog will do it on demand if/when it needs to access arguments.
This will compensate for a bit of lost performance due to extra save/restore.
PS
pls don't add your chinatelecom.cn email in cc.
gmail just cannot deliver there and it's annoying to keep deleting
it manually in every reply.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args
2025-07-14 23:45 ` Menglong Dong
@ 2025-07-15 17:11 ` Andrii Nakryiko
2025-07-16 12:50 ` Menglong Dong
0 siblings, 1 reply; 32+ messages in thread
From: Andrii Nakryiko @ 2025-07-15 17:11 UTC (permalink / raw)
To: Menglong Dong
Cc: Menglong Dong, alexei.starovoitov, rostedt, jolsa, bpf,
Menglong Dong, John Fastabend, Martin KaFai Lau, Eduard Zingerman,
Song Liu, Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo,
Simon Horman, linux-kernel, netdev
On Mon, Jul 14, 2025 at 4:45 PM Menglong Dong <menglong.dong@linux.dev> wrote:
>
>
> On 2025/7/15 06:07, Andrii Nakryiko wrote:
> > On Thu, Jul 3, 2025 at 5:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >> In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> >> which is used to record the accessed index of the function args in
> >> btf_ctx_access().
> > Do we need to bother giving access to arguments through direct ctx[i]
> > access for these multi-fentry/fexit programs? We have
> > bpf_get_func_arg_cnt() and bpf_get_func_arg() which can be used to get
> > any given argument at runtime.
>
>
> Hi Andrii. This commit is not for that purpose. We remember all the accessed
> args to bpf_prog_aux->accessed_args. And when we attach the tracing-multi
> prog to the kernel functions, we will check if the accessed arguments are
> consistent between all the target functions.
>
> The bpf_prog_aux->accessed_args will be used in
> https://lore.kernel.org/bpf/20250703121521.1874196-12-dongml2@chinatelecom.cn/
>
> in bpf_tracing_check_multi() to do such checking.
>
> With such checking, the target functions don't need to have
> the same prototype, which makes tracing-multi more flexible.
Yeah, and my point is why even track this at verifier level. If we
don't allow direct ctx[i] access and only access arguments through
bpf_get_func_arg(), we can check actual number of arguments at runtime
and if program is trying to access something that's not there, we'll
just return error code, so user can handle this generically.
I'm just not sure if there is a need to do anything more than that.
>
> Thanks!
> Menglong Dong
>
>
> >
> >> Meanwhile, we add the function btf_check_func_part_match() to compare the
> >> accessed function args of two function prototype. This function will be
> >> used in the following commit.
> >>
> >> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> >> ---
> >> include/linux/bpf.h | 4 ++
> >> include/linux/btf.h | 3 +-
> >> kernel/bpf/btf.c | 108 +++++++++++++++++++++++++++++++++++++++++-
> >> net/sched/bpf_qdisc.c | 2 +-
> >> 4 files changed, 113 insertions(+), 4 deletions(-)
> >>
> > [...]
> >
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args
2025-07-15 17:11 ` Andrii Nakryiko
@ 2025-07-16 12:50 ` Menglong Dong
0 siblings, 0 replies; 32+ messages in thread
From: Menglong Dong @ 2025-07-16 12:50 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Menglong Dong, alexei.starovoitov, rostedt, jolsa, bpf,
John Fastabend, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo,
Simon Horman, linux-kernel, netdev
On Wednesday, July 16, 2025 1:11 AM Andrii Nakryiko <andrii.nakryiko@gmail.com> write:
> On Mon, Jul 14, 2025 at 4:45 PM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> >
> > On 2025/7/15 06:07, Andrii Nakryiko wrote:
> > > On Thu, Jul 3, 2025 at 5:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > >> In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> > >> which is used to record the accessed index of the function args in
> > >> btf_ctx_access().
> > > Do we need to bother giving access to arguments through direct ctx[i]
> > > access for these multi-fentry/fexit programs? We have
> > > bpf_get_func_arg_cnt() and bpf_get_func_arg() which can be used to get
> > > any given argument at runtime.
> >
> >
> > Hi Andrii. This commit is not for that purpose. We remember all the accessed
> > args to bpf_prog_aux->accessed_args. And when we attach the tracing-multi
> > prog to the kernel functions, we will check if the accessed arguments are
> > consistent between all the target functions.
> >
> > The bpf_prog_aux->accessed_args will be used in
> > https://lore.kernel.org/bpf/20250703121521.1874196-12-dongml2@chinatelecom.cn/
> >
> > in bpf_tracing_check_multi() to do such checking.
> >
> > With such checking, the target functions don't need to have
> > the same prototype, which makes tracing-multi more flexible.
>
> Yeah, and my point is why even track this at verifier level. If we
> don't allow direct ctx[i] access and only access arguments through
> bpf_get_func_arg(), we can check actual number of arguments at runtime
> and if program is trying to access something that's not there, we'll
> just return error code, so user can handle this generically.
>
> I'm just not sure if there is a need to do anything more than that.
This commit is for the ctx[i] direct access, and we can use
bpf_core_cast() instead, as you said in
https://lore.kernel.org/bpf/CADxym3Zrqb6MxoV6mg4ioQMCiR+Cden9tmD5YHj8DtRFjn14HA@mail.gmail.com/T/#m7daa262d423c0e8bb1c7033e51099ef06180d2c5
That means we don't need this commit any more.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 16:35 ` Alexei Starovoitov
@ 2025-07-16 13:05 ` Menglong Dong
2025-07-17 0:59 ` multi-fentry proposal. Was: " Alexei Starovoitov
2025-07-16 14:40 ` Menglong Dong
1 sibling, 1 reply; 32+ messages in thread
From: Menglong Dong @ 2025-07-16 13:05 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Menglong Dong, Steven Rostedt, Jiri Olsa, bpf, H. Peter Anvin,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On Wednesday, July 16, 2025 12:35 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> >
> > On 7/15/25 10:25, Alexei Starovoitov wrote:
[......]
> >
> > According to my benchmark, it has ~5% overhead to save/restore
> > *5* variants when compared with *0* variant. The save/restore of regs
> > is fast, but it still need 12 insn, which can produce ~6% overhead.
>
> I think it's an ok trade off, because with one global trampoline
> we do not need to call rhashtable lookup before entering bpf prog.
> bpf prog will do it on demand if/when it needs to access arguments.
> This will compensate for a bit of lost performance due to extra save/restore.
I don't understand here :/
The rhashtable lookup is done at the beginning of the global trampoline,
which is called before we enter the bpf prog. The bpf progs are stored in
the kfunc_md, and we need to get them from the hash table.
If this were the only change, it would still be OK. But according to my
previous benchmark, the rhashtable lookup can cause ~7% additional
overhead. So if we change both of them, the performance of tracing-multi
falls quite a bit behind tracing, which means a ~25% performance gap for
the functions that have no arguments.
About the rhashtable part, I'll do more research on it and report back later.
>
> PS
> pls don't add your chinatelecom.cn email in cc.
> gmail just cannot deliver there and it's annoying to keep deleting
> it manually in every reply.
Sorry about that. I filtered out such messages in my gmail, and
didn't notice it. I'll remove it from the CC in the future :)
Thanks!
Menglong Dong
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 16:35 ` Alexei Starovoitov
2025-07-16 13:05 ` Menglong Dong
@ 2025-07-16 14:40 ` Menglong Dong
1 sibling, 0 replies; 32+ messages in thread
From: Menglong Dong @ 2025-07-16 14:40 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Menglong Dong, Steven Rostedt, Jiri Olsa, bpf, H. Peter Anvin,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On Wed, Jul 16, 2025 at 12:35 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> >
> > On 7/15/25 10:25, Alexei Starovoitov wrote:
[......]
> >
> > According to my benchmark, it has ~5% overhead to save/restore
> > *5* variants when compared with *0* variant. The save/restore of regs
> > is fast, but it still need 12 insn, which can produce ~6% overhead.
>
> I think it's an ok trade off, because with one global trampoline
> we do not need to call rhashtable lookup before entering bpf prog.
> bpf prog will do it on demand if/when it needs to access arguments.
> This will compensate for a bit of lost performance due to extra save/restore.
I just thought of another benefit of defining multiple global trampolines
here, which you may be interested in. In the future, we can make
the global trampoline support functions that have 7+ arguments.
If we use _one_ global trampoline, that's not possible, as we can't handle
the arguments on the stack. However, it is possible if we define a
different global trampoline for each argument count, and what we need
to do in the future is some adjustment to CALLER_DEFINE().
I hope you find this idea interesting :)
Thanks!
Menglong Dong
>
> PS
> pls don't add your chinatelecom.cn email in cc.
> gmail just cannot deliver there and it's annoying to keep deleting
> it manually in every reply.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 9:30 ` Menglong Dong
@ 2025-07-16 16:56 ` Alexei Starovoitov
2025-07-16 18:24 ` Peter Zijlstra
0 siblings, 1 reply; 32+ messages in thread
From: Alexei Starovoitov @ 2025-07-16 16:56 UTC (permalink / raw)
To: Menglong Dong, Peter Zijlstra
Cc: Menglong Dong, Steven Rostedt, Jiri Olsa, bpf, Martin KaFai Lau,
Eduard Zingerman, LKML, Network Development
On Tue, Jul 15, 2025 at 2:31 AM Menglong Dong <menglong.dong@linux.dev> wrote:
>
> Following are the test results for fentry-multi:
> 36.36% bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi [k]
> bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi
> 20.54% [kernel] [k] migrate_enable
> 19.35% [kernel] [k] bpf_global_caller_5_run
> 6.52% [kernel] [k] bpf_global_caller_5
> 3.58% libc.so.6 [.] syscall
> 2.88% [kernel] [k] entry_SYSCALL_64
> 1.50% [kernel] [k] memchr_inv
> 1.39% [kernel] [k] fput
> 1.04% [kernel] [k] migrate_disable
> 0.91% [kernel] [k] _copy_to_user
>
> And I also did the testing for fentry:
> 54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k]
> bpf_prog_2dcccf652aac1793_bench_trigger_fentry
> 10.43% [kernel] [k] migrate_enable
> 10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
> 8.06% [kernel] [k] __bpf_prog_exit_recur
> 4.11% libc.so.6 [.] syscall
> 2.15% [kernel] [k] entry_SYSCALL_64
> 1.48% [kernel] [k] memchr_inv
> 1.32% [kernel] [k] fput
> 1.16% [kernel] [k] _copy_to_user
> 0.73% [kernel] [k] bpf_prog_test_run_raw_tp
Let's pause fentry-multi stuff and fix this as a higher priority.
Since migrate_disable/enable is so hot in yours and my tests,
let's figure out how to inline it.
As far as I can see both functions can be moved to a header file
including this_rq() macro, but we need to keep
struct rq private to sched.h. Moving the whole thing is not an option.
Luckily we only need nr_pinned from there.
Maybe we can offsetof(struct rq, nr_pinned) in a precompile step
the way it's done for asm-offsets ?
And then use that constant to do nr_pinned ++, --.
__set_cpus_allowed_ptr() is a slow path and can stay .c
Maybe Peter has better ideas ?
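A rough sketch of that direction, assuming asm-offsets gains OFFSET(RQ_nr_pinned, rq, nr_pinned) and the per-cpu 'runqueues' declaration is made visible as an opaque object; it mirrors what migrate_disable() does in kernel/sched/core.c today:
#include <generated/asm-offsets.h>      /* RQ_nr_pinned */

/* today this lives in kernel/sched/sched.h; struct rq stays incomplete here */
DECLARE_PER_CPU(struct rq, runqueues);

static __always_inline void migrate_disable(void)
{
        struct task_struct *p = current;

        if (p->migration_disabled) {
                p->migration_disabled++;
                return;
        }

        preempt_disable();
        /* equivalent of this_rq()->nr_pinned++ without exposing struct rq */
        (*(unsigned int *)((void *)this_cpu_ptr(&runqueues) + RQ_nr_pinned))++;
        p->migration_disabled = 1;
        preempt_enable();
}
migrate_enable() would be the mirror image, with the rare
p->cpus_ptr != &p->cpus_mask case still calling out to
__set_cpus_allowed_ptr() in core.c.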
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 16:56 ` Inlining migrate_disable/enable. Was: " Alexei Starovoitov
@ 2025-07-16 18:24 ` Peter Zijlstra
2025-07-16 22:35 ` Alexei Starovoitov
0 siblings, 1 reply; 32+ messages in thread
From: Peter Zijlstra @ 2025-07-16 18:24 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Menglong Dong, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> Maybe Peter has better ideas ?
Is it possible to express runqueues::nr_pinned as an alias?
extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
And use:
__this_cpu_inc(&this_nr_pinned);
This syntax doesn't actually seem to work; but can we construct
something like that?
Google finds me this:
https://gcc.gnu.org/pipermail/gcc-help/2012-February/109877.html
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 18:24 ` Peter Zijlstra
@ 2025-07-16 22:35 ` Alexei Starovoitov
2025-07-16 22:49 ` Steven Rostedt
2025-07-28 9:20 ` Menglong Dong
0 siblings, 2 replies; 32+ messages in thread
From: Alexei Starovoitov @ 2025-07-16 22:35 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Menglong Dong, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
>
> > Maybe Peter has better ideas ?
>
> Is it possible to express runqueues::nr_pinned as an alias?
>
> extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
>
> And use:
>
> __this_cpu_inc(&this_nr_pinned);
>
>
> This syntax doesn't actually seem to work; but can we construct
> something like that?
Yeah. Iant is right. It's a string and not a pointer dereference.
It never worked.
Few options:
1.
struct rq {
+#ifdef CONFIG_SMP
+ unsigned int nr_pinned;
+#endif
/* runqueue lock: */
raw_spinlock_t __lock;
@@ -1271,9 +1274,6 @@ struct rq {
struct cpuidle_state *idle_state;
#endif
-#ifdef CONFIG_SMP
- unsigned int nr_pinned;
-#endif
but ugly...
2.
static unsigned int nr_pinned_offset __ro_after_init __used;
RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
overkill for what's needed
3.
OFFSET(RQ_nr_pinned, rq, nr_pinned);
then
#include <generated/asm-offsets.h>
imo the best.
4.
Maybe we should extend clang/gcc to support attr(preserve_access_index)
on x86 and other architectures ;)
We rely heavily on it in bpf backend.
Then one can simply write:
struct rq___my {
unsigned int nr_pinned;
} __attribute__((preserve_access_index));
struct rq___my *rq;
rq = this_rq();
rq->nr_pinned++;
and the compiler will do its magic of offset adjustment.
That's how BPF CORE works.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 22:35 ` Alexei Starovoitov
@ 2025-07-16 22:49 ` Steven Rostedt
2025-07-16 22:50 ` Steven Rostedt
2025-07-28 9:20 ` Menglong Dong
1 sibling, 1 reply; 32+ messages in thread
From: Steven Rostedt @ 2025-07-16 22:49 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Menglong Dong, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development,
Jose E. Marchesi
On Wed, 16 Jul 2025 15:35:16 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 4.
> Maybe we should extend clang/gcc to support attr(preserve_access_index)
> on x86 and other architectures ;)
> We rely heavily on it in bpf backend.
> Then one can simply write:
>
> struct rq___my {
> unsigned int nr_pinned;
> } __attribute__((preserve_access_index));
>
> struct rq___my *rq;
>
> rq = this_rq();
> rq->nr_pinned++;
>
> and the compiler will do its magic of offset adjustment.
> That's how BPF CORE works.
GNU Cauldron in Porto, Portugal is having a kernel track (hopefully it
gets accepted). I highly recommend you attend and recommend these
features there. It's happening two days after Kernel Recipes (I already
booked my plane tickets).
https://gcc.gnu.org/wiki/cauldron2025
Peter, maybe you can attend too?
-- Steve
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 22:49 ` Steven Rostedt
@ 2025-07-16 22:50 ` Steven Rostedt
0 siblings, 0 replies; 32+ messages in thread
From: Steven Rostedt @ 2025-07-16 22:50 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Menglong Dong, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development,
Jose E. Marchesi
On Wed, 16 Jul 2025 18:49:40 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> GNU Cauldron in Porto, Portugal is having a kernel track (hopefully if it
> gets accepted). I highly recommend you attending and recommending these
> features. It's happening two days after Kernel Recipes (I already booked my
> plane tickets).
>
Bah, I forgot you are on the abstract so you already know about this! ;-)
[ But I might as well advertise to let other kernel devs know ]
-- Steve
^ permalink raw reply [flat|nested] 32+ messages in thread
* multi-fentry proposal. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 13:05 ` Menglong Dong
@ 2025-07-17 0:59 ` Alexei Starovoitov
2025-07-17 1:50 ` Menglong Dong
0 siblings, 1 reply; 32+ messages in thread
From: Alexei Starovoitov @ 2025-07-17 0:59 UTC (permalink / raw)
To: Menglong Dong, Jiri Olsa, Andrii Nakryiko
Cc: Menglong Dong, Steven Rostedt, bpf, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, LKML, Network Development
On Wed, Jul 16, 2025 at 6:06 AM Menglong Dong <menglong.dong@linux.dev> wrote:
>
> On Wednesday, July 16, 2025 12:35 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > >
> > >
> > > On 7/15/25 10:25, Alexei Starovoitov wrote:
> [......]
> > >
> > > According to my benchmark, it has ~5% overhead to save/restore
> > > *5* variants when compared with *0* variant. The save/restore of regs
> > > is fast, but it still need 12 insn, which can produce ~6% overhead.
> >
> > I think it's an ok trade off, because with one global trampoline
> > we do not need to call rhashtable lookup before entering bpf prog.
> > bpf prog will do it on demand if/when it needs to access arguments.
> > This will compensate for a bit of lost performance due to extra save/restore.
>
> I don't understand here :/
>
> The rhashtable lookup is done at the beginning of the global trampoline,
> which is called before we enter bpf prog. The bpf progs is stored in the
> kfunc_md, and we need get them from the hash table.
Ahh. Right.
Looking at the existing bpf trampoline... It has complicated logic
to handle livepatching and tailcalls. Your global trampoline
doesn't, and once that is added it's starting to feel that it will
look just as complex as the current one.
So I think we better repurpose what we have.
Maybe we can rewrite the existing one in C too.
How about the following approach.
I think we discussed something like this in the past
and Jiri tried to implement something like this.
Andrii reminded me recently about it.
Say, we need to attach prog A to 30k functions.
10k with 2 args, 10k with 3 args, and 10k with 7 args.
We can generate 3 _existing_ bpf trampolines for 2,3,7 args
with hard coded prog A in there (the cookies would need to be
fetched via binary search similar to kprobe-multi).
The arch_prepare_bpf_trampoline() supports BPF_TRAMP_F_ORIG_STACK.
So one 2-arg trampoline will work to invoke prog A in all 10k 2-arg functions.
We don't need to match types, but have to compare that btf_func_model-s
are the same.
Menglong, your global trampoline for 0,1,..6 args works only for x86,
because btf_func_model doesn't care about sizes of args,
but it's not the correct mental model to use.
The above "10k with 2 args" is a simplified example.
We will need an arch specific callback is_btf_func_model_equal()
that will compare func models in arch specific ways.
For x86-64 the number of args is all it needs.
For other archs it will compare sizes and flags too.
So 30k functions will be sorted into
10k with btf_func_model_1, 10k with btf_func_model_2 and so on.
And the corresponding number of equivalent trampolines will be generated.
Note there will be no actual BTF types. All args will be untyped and
untrusted unlike current fentry.
We can go further and sort 30k functions by comparing BTFs
instead of btf_func_model-s, but I suspect 30k funcs will be split
into several thousands of exact BTFs. At that point multi-fentry
benefits are diminishing and we might as well generate 30k unique
bpf trampolines for 30k functions and avoid all the complexity.
So I would sort by btf_func_model compared by arch specific comparator.
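A sketch of that comparator, using the existing struct btf_func_model fields; in a real patch this would be an arch callback rather than an IS_ENABLED() switch, and which fields matter per arch is an assumption here:
static bool is_btf_func_model_equal(const struct btf_func_model *a,
                                    const struct btf_func_model *b)
{
        if (a->nr_args != b->nr_args)
                return false;

        if (IS_ENABLED(CONFIG_X86_64))
                /* up to 6 args, one register each: the count is enough */
                return true;

        /* other archs also care about per-argument sizes and flags */
        return a->ret_size == b->ret_size &&
               !memcmp(a->arg_size, b->arg_size, a->nr_args) &&
               !memcmp(a->arg_flags, b->arg_flags, a->nr_args);
}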
Now say prog B needs to be attached to another 30k functions.
If all 30k+30k functions are different then it's the same as
the previous step.
Say, prog A is attached to 10k funcs with btf_func_model_1.
If prog B wants to attach to the exact same func set then we
just regenerate bpf trampoline with hard coded progs A and B
and reattach.
If not then we need to split the set into up to 3 sets.
Say, prog B wants 5k funcs, but only 1k func are common:
(prog_A, 9k func with btf_func_model_1) -> bpf trampoline X
(prog_A, prog_B, 1k funcs with btf_func_model_1) -> bpf trampoline Y
(prog_B, 4k funcs with btf_func_model_1) -> bpf trampoline Z
And so on when prog C needs to be attached.
At detach time we can merge sets/trampolines,
but for now we can leave it all fragmented.
Unlike regular fentry progs the multi-fentry progs are not going to
be attached for a long time. So we can reduce the detach complexity.
The nice part of the algorithm is that coexistence of fentry
and multi-fentry is easy.
If fentry is already attached to some function we just
attach multi-fentry prog to that bpf trampoline.
If multi-fentry was attached first and fentry needs to be attached,
we create a regular bpf trampoline and add both progs there.
The intersect and sorting by btf_func_model is not trivial,
but we can hold global trampoline_mutex, so no concerns of races.
Example:
bpf_link_A is a set of:
(prog_A, funcs X,Y with btf_func_model_1)
(prog_A, funcs N,M with btf_func_model_2)
To attach prog B via bpf_link_B that wants:
(prog_B, funcs Y,Z with btf_func_model_1)
(prog_B, funcs P,Q with btf_func_model_3)
walk all existing links, intersect and split, and update the links.
At the end:
bpf_link_A:
(prog_A, funcs X with btf_func_model_1)
(prog_A, prog_B funcs Y with btf_func_model_1)
(prog_A, funcs N,M with btf_func_model_2)
bpf_link_B:
(prog_A, prog_B funcs Y with btf_func_model_1)
(prog_B, funcs Z with btf_func_model_1)
(prog_B, funcs P,Q with btf_func_model_3)
When link is detached: walk its own tuples, remove the prog,
if nr_progs == 0 -> detach corresponding trampoline,
if nr_progs > 0 -> remove prog and regenerate trampoline.
If fentry prog C needs to be attached to N it might split bpf_link_A:
(prog_A, funcs X with btf_func_model_1)
(prog_A, prog_B funcs Y with btf_func_model_1)
(prog_A, funcs M with btf_func_model_2)
(prog_A, prog_C funcs N with _fentry_)
Last time we gave up on it because we discovered that
overlap support was too complicated, but I cannot recall now
what it was :)
Maybe all of the above repeating some old mistakes.
Jiri,
How does the above proposal look to you?
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: multi-fentry proposal. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-17 0:59 ` multi-fentry proposal. Was: " Alexei Starovoitov
@ 2025-07-17 1:50 ` Menglong Dong
2025-07-17 2:13 ` Alexei Starovoitov
0 siblings, 1 reply; 32+ messages in thread
From: Menglong Dong @ 2025-07-17 1:50 UTC (permalink / raw)
To: Jiri Olsa, Andrii Nakryiko, Alexei Starovoitov
Cc: Menglong Dong, Steven Rostedt, bpf, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, LKML, Network Development
On Thursday, July 17, 2025 8:59 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> On Wed, Jul 16, 2025 at 6:06 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> > On Wednesday, July 16, 2025 12:35 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > > On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > > >
> > > >
> > > > On 7/15/25 10:25, Alexei Starovoitov wrote:
> > [......]
> > > >
> > > > According to my benchmark, it has ~5% overhead to save/restore
> > > > *5* variants when compared with *0* variant. The save/restore of regs
> > > > is fast, but it still need 12 insn, which can produce ~6% overhead.
> > >
> > > I think it's an ok trade off, because with one global trampoline
> > > we do not need to call rhashtable lookup before entering bpf prog.
> > > bpf prog will do it on demand if/when it needs to access arguments.
> > > This will compensate for a bit of lost performance due to extra save/restore.
> >
> > I don't understand here :/
> >
> > The rhashtable lookup is done at the beginning of the global trampoline,
> > which is called before we enter bpf prog. The bpf progs is stored in the
> > kfunc_md, and we need get them from the hash table.
>
> Ahh. Right.
>
> Looking at the existing bpf trampoline... It has complicated logic
> to handle livepatching and tailcalls. Your global trampoline
> doesn't, and once that is added it's starting to feel that it will
> look just as complex as the current one.
> So I think we better repurpose what we have.
> Maybe we can rewrite the existing one in C too.
You are right, tailcalls are not handled yet. But livepatching
is already handled, as we always get the origin ip from the stack
and call it, just like how the bpf trampoline handles livepatching.
So no additional handling is needed here.
>
> How about the following approach.
> I think we discussed something like this in the past
> and Jiri tried to implement something like this.
> Andrii reminded me recently about it.
>
> Say, we need to attach prog A to 30k functions.
> 10k with 2 args, 10k with 3 args, and 10k with 7 args.
> We can generate 3 _existing_ bpf trampolines for 2,3,7 args
> with hard coded prog A in there (the cookies would need to be
> fetched via binary search similar to kprobe-multi).
> The arch_prepare_bpf_trampoline() supports BPF_TRAMP_F_ORIG_STACK.
> So one 2-arg trampoline will work to invoke prog A in all 10k 2-arg functions.
> We don't need to match types, but have to compare that btf_func_model-s
> are the same.
>
> Menglong, your global trampoline for 0,1,..6 args works only for x86,
> because btf_func_model doesn't care about sizes of args,
> but it's not the correct mental model to use.
>
> The above "10k with 2 args" is a simplified example.
> We will need an arch specific callback is_btf_func_model_equal()
> that will compare func models in arch specific ways.
> For x86-64 the number of args is all it needs.
> For other archs it will compare sizes and flags too.
> So 30k functions will be sorted into
> 10k with btf_func_model_1, 10k with btf_func_model_2 and so on.
> And the corresponding number of equivalent trampolines will be generated.
>
> Note there will be no actual BTF types. All args will be untyped and
> untrusted unlike current fentry.
> We can go further and sort 30k functions by comparing BTFs
> instead of btf_func_model-s, but I suspect 30k funcs will be split
> into several thousands of exact BTFs. At that point multi-fentry
> benefits are diminishing and we might as well generate 30k unique
> bpf trampolines for 30k functions and avoid all the complexity.
> So I would sort by btf_func_model compared by arch specific comparator.
>
> Now say prog B needs to be attached to another 30k functions.
> If all 30k+30k functions are different then it's the same as
> the previous step.
> Say, prog A is attached to 10k funcs with btf_func_model_1.
> If prog B wants to attach to the exact same func set then we
> just regenerate bpf trampoline with hard coded progs A and B
> and reattach.
> If not then we need to split the set into up to 3 sets.
> Say, prog B wants 5k funcs, but only 1k func are common:
> (prog_A, 9k func with btf_func_model_1) -> bpf trampoline X
> (prog_A, prog_B, 1k funcs with btf_func_model_1) -> bpf trampoline Y
> (prog_B, 4k funcs with btf_func_model_1) -> bpf trampoline Z
>
> And so on when prog C needs to be attached.
> At detach time we can merge sets/trampolines,
> but for now we can leave it all fragmented.
> Unlike regular fentry progs the multi-fentry progs are not going to
> be attached for long time. So we can reduce the detach complexity.
>
> The nice part of the algorithm is that coexistence of fentry
> and multi-fentry is easy.
> If fentry is already attached to some function we just
> attach multi-fentry prog to that bpf trampoline.
> If multi-fentry was attached first and fentry needs to be attached,
> we create a regular bpf trampoline and add both progs there.
This doesn't seem easy, and it is exactly how I handle the
coexistence now:
https://lore.kernel.org/bpf/20250528034712.138701-16-dongml2@chinatelecom.cn/
https://lore.kernel.org/bpf/20250528034712.138701-17-dongml2@chinatelecom.cn/
https://lore.kernel.org/bpf/20250528034712.138701-18-dongml2@chinatelecom.cn/
The most difficult part is that we need a way to replace the
multi-fentry with fentry for a function in ftrace atomically. Of
course, we can remove the global trampoline first and then attach
the bpf trampoline, which would make things much easier. But a
short suspension would happen for the progs in fentry-multi.
>
> The intersect and sorting by btf_func_model is not trivial,
> but we can hold global trampoline_mutex, so no concerns of races.
>
> Example:
> bpf_link_A is a set of:
> (prog_A, funcs X,Y with btf_func_model_1)
> (prog_A, funcs N,M with btf_func_model_2)
>
> To attach prog B via bpf_link_B that wants:
> (prog_B, funcs Y,Z with btf_func_model_1)
> (prog_B, funcs P,Q with btf_func_model_3)
>
> walk all existing links, intersect and split, and update the links.
> At the end:
>
> bpf_link_A:
> (prog_A, funcs X with btf_func_model_1)
> (prog_A, prog_B funcs Y with btf_func_model_1)
> (prog_A, funcs N,M with btf_func_model_2)
>
> bpf_link_B:
> (prog_A, prog_B funcs Y with btf_func_model_1)
> (prog_B, funcs Z with btf_func_model_1)
> (prog_B, funcs P,Q with btf_func_model_3)
>
> When link is detached: walk its own tuples, remove the prog,
> if nr_progs == 0 -> detach corresponding trampoline,
> if nr_progs > 0 -> remove prog and regenerate trampoline.
>
> If fentry prog C needs to be attached to N it might split bpf_link_A:
> (prog_A, funcs X with btf_func_model_1)
> (prog_A, prog_B funcs Y with btf_func_model_1)
> (prog_A, funcs M with btf_func_model_2)
> (prog_A, prog_C funcs N with _fentry_)
>
> Last time we gave up on it because we discovered that
> overlap support was too complicated, but I cannot recall now
> what it was :)
> Maybe all of the above repeating some old mistakes.
In my impression, this is exactly Jiri's solution, and this is the
part of the discussion that I know of:
https://lore.kernel.org/bpf/ZfKY6E8xhSgzYL1I@krava/
>
> Jiri,
> How does the above proposal look to you?
>
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: multi-fentry proposal. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-17 1:50 ` Menglong Dong
@ 2025-07-17 2:13 ` Alexei Starovoitov
2025-07-17 2:37 ` Menglong Dong
0 siblings, 1 reply; 32+ messages in thread
From: Alexei Starovoitov @ 2025-07-17 2:13 UTC (permalink / raw)
To: Menglong Dong
Cc: Jiri Olsa, Andrii Nakryiko, Menglong Dong, Steven Rostedt, bpf,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On Wed, Jul 16, 2025 at 6:51 PM Menglong Dong <menglong.dong@linux.dev> wrote:
>
> On Thursday, July 17, 2025 8:59 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > On Wed, Jul 16, 2025 at 6:06 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > >
> > > On Wednesday, July 16, 2025 12:35 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > > > On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > > > >
> > > > >
> > > > > On 7/15/25 10:25, Alexei Starovoitov wrote:
> > > [......]
> > > > >
> > > > > According to my benchmark, it has ~5% overhead to save/restore
> > > > > *5* variants when compared with *0* variant. The save/restore of regs
> > > > > is fast, but it still need 12 insn, which can produce ~6% overhead.
> > > >
> > > > I think it's an ok trade off, because with one global trampoline
> > > > we do not need to call rhashtable lookup before entering bpf prog.
> > > > bpf prog will do it on demand if/when it needs to access arguments.
> > > > This will compensate for a bit of lost performance due to extra save/restore.
> > >
> > > I don't understand here :/
> > >
> > > The rhashtable lookup is done at the beginning of the global trampoline,
> > > which is called before we enter bpf prog. The bpf progs is stored in the
> > > kfunc_md, and we need get them from the hash table.
> >
> > Ahh. Right.
> >
> > Looking at the existing bpf trampoline... It has complicated logic
> > to handle livepatching and tailcalls. Your global trampoline
> > doesn't, and once that is added it's starting to feel that it will
> > look just as complex as the current one.
> > So I think we better repurpose what we have.
> > Maybe we can rewrite the existing one in C too.
>
> You are right, the tailcalls is not handled yet. But for the livepatching,
> it is already handled, as we always get the origin ip from the stack
> and call it, just like how the bpf trampoline handle the livepatching.
> So no addition handling is needed here.
>
> >
> > How about the following approach.
> > I think we discussed something like this in the past
> > and Jiri tried to implement something like this.
> > Andrii reminded me recently about it.
> >
> > Say, we need to attach prog A to 30k functions.
> > 10k with 2 args, 10k with 3 args, and 10k with 7 args.
> > We can generate 3 _existing_ bpf trampolines for 2,3,7 args
> > with hard coded prog A in there (the cookies would need to be
> > fetched via binary search similar to kprobe-multi).
> > The arch_prepare_bpf_trampoline() supports BPF_TRAMP_F_ORIG_STACK.
> > So one 2-arg trampoline will work to invoke prog A in all 10k 2-arg functions.
> > We don't need to match types, but have to compare that btf_func_model-s
> > are the same.
> >
> > Menglong, your global trampoline for 0,1,..6 args works only for x86,
> > because btf_func_model doesn't care about sizes of args,
> > but it's not the correct mental model to use.
> >
> > The above "10k with 2 args" is a simplified example.
> > We will need an arch specific callback is_btf_func_model_equal()
> > that will compare func models in arch specific ways.
> > For x86-64 the number of args is all it needs.
> > For other archs it will compare sizes and flags too.
> > So 30k functions will be sorted into
> > 10k with btf_func_model_1, 10k with btf_func_model_2 and so on.
> > And the corresponding number of equivalent trampolines will be generated.
> >
> > Note there will be no actual BTF types. All args will be untyped and
> > untrusted unlike current fentry.
> > We can go further and sort 30k functions by comparing BTFs
> > instead of btf_func_model-s, but I suspect 30k funcs will be split
> > into several thousands of exact BTFs. At that point multi-fentry
> > benefits are diminishing and we might as well generate 30k unique
> > bpf trampolines for 30k functions and avoid all the complexity.
> > So I would sort by btf_func_model compared by arch specific comparator.
> >
> > Now say prog B needs to be attached to another 30k functions.
> > If all 30k+30k functions are different then it's the same as
> > the previous step.
> > Say, prog A is attached to 10k funcs with btf_func_model_1.
> > If prog B wants to attach to the exact same func set then we
> > just regenerate bpf trampoline with hard coded progs A and B
> > and reattach.
> > If not then we need to split the set into up to 3 sets.
> > Say, prog B wants 5k funcs, but only 1k func are common:
> > (prog_A, 9k func with btf_func_model_1) -> bpf trampoline X
> > (prog_A, prog_B, 1k funcs with btf_func_model_1) -> bpf trampoline Y
> > (prog_B, 4k funcs with btf_func_model_1) -> bpf trampoline Z
> >
> > And so on when prog C needs to be attached.
> > At detach time we can merge sets/trampolines,
> > but for now we can leave it all fragmented.
> > Unlike regular fentry progs the multi-fentry progs are not going to
> > be attached for long time. So we can reduce the detach complexity.
> >
> > The nice part of the algorithm is that coexistence of fentry
> > and multi-fentry is easy.
> > If fentry is already attached to some function we just
> > attach multi-fentry prog to that bpf trampoline.
> > If multi-fentry was attached first and fentry needs to be attached,
> > we create a regular bpf trampoline and add both progs there.
>
> This seems not easy, and it is exactly how I handle the
> coexistence now:
>
> https://lore.kernel.org/bpf/20250528034712.138701-16-dongml2@chinatelecom.cn/
> https://lore.kernel.org/bpf/20250528034712.138701-17-dongml2@chinatelecom.cn/
> https://lore.kernel.org/bpf/20250528034712.138701-18-dongml2@chinatelecom.cn/
hmm. exactly? That's very different.
You're relying on kfunc_md for prog list.
The above proposal doesn't need kfunc_md in the critical path.
All progs are built into the trampolines.
> The most difficult part is that we need a way to replace the the
> multi-fentry with fentry for the function in the ftrace atomically. Of
> course, we can remove the global trampoline first, and then attach
> the bpf trampoline, which will make things much easier. But a
> short suspend will happen for the progs in fentry-multi.
I don't follow.
In the above proposal fentry attach/detach is atomic.
Prepare a new trampoline, single call to ftrace to modify_fentry().
> >
> > The intersect and sorting by btf_func_model is not trivial,
> > but we can hold global trampoline_mutex, so no concerns of races.
> >
> > Example:
> > bpf_link_A is a set of:
> > (prog_A, funcs X,Y with btf_func_model_1)
> > (prog_A, funcs N,M with btf_func_model_2)
> >
> > To attach prog B via bpf_link_B that wants:
> > (prog_B, funcs Y,Z with btf_func_model_1)
> > (prog_B, funcs P,Q with btf_func_model_3)
> >
> > walk all existing links, intersect and split, and update the links.
> > At the end:
> >
> > bpf_link_A:
> > (prog_A, funcs X with btf_func_model_1)
> > (prog_A, prog_B funcs Y with btf_func_model_1)
> > (prog_A, funcs N,M with btf_func_model_2)
> >
> > bpf_link_B:
> > (prog_A, prog_B funcs Y with btf_func_model_1)
> > (prog_B, funcs Z with btf_func_model_1)
> > (prog_B, funcs P,Q with btf_func_model_3)
> >
> > When link is detached: walk its own tuples, remove the prog,
> > if nr_progs == 0 -> detach corresponding trampoline,
> > if nr_progs > 0 -> remove prog and regenerate trampoline.
> >
> > If fentry prog C needs to be attached to N it might split bpf_link_A:
> > (prog_A, funcs X with btf_func_model_1)
> > (prog_A, prog_B funcs Y with btf_func_model_1)
> > (prog_A, funcs M with btf_func_model_2)
> > (prog_A, prog_C funcs N with _fentry_)
> >
> > Last time we gave up on it because we discovered that
> > overlap support was too complicated, but I cannot recall now
> > what it was :)
> > Maybe all of the above repeating some old mistakes.
>
> In my impression, this is exactly the solution of Jiri's, and this is
> part of the discussion that I know:
>
> https://lore.kernel.org/bpf/ZfKY6E8xhSgzYL1I@krava/
Yes. It's similar, but somehow it feels simple enough now.
The algorithms for both detach and attach fit on one page,
and everything is uniform. There is no spaghetti of corner cases.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: multi-fentry proposal. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-17 2:13 ` Alexei Starovoitov
@ 2025-07-17 2:37 ` Menglong Dong
0 siblings, 0 replies; 32+ messages in thread
From: Menglong Dong @ 2025-07-17 2:37 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Jiri Olsa, Andrii Nakryiko, Menglong Dong, Steven Rostedt, bpf,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On Thursday, July 17, 2025 10:13 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> On Wed, Jul 16, 2025 at 6:51 PM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> > On Thursday, July 17, 2025 8:59 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > > On Wed, Jul 16, 2025 at 6:06 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > > >
> > > > On Wednesday, July 16, 2025 12:35 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > > > > On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > > > > >
> > > > > >
> > > > > > On 7/15/25 10:25, Alexei Starovoitov wrote:
> > > > [......]
> > > > > >
> > > > > > According to my benchmark, it has ~5% overhead to save/restore
> > > > > > *5* variants when compared with *0* variant. The save/restore of regs
> > > > > > is fast, but it still need 12 insn, which can produce ~6% overhead.
> > > > >
> > > > > I think it's an ok trade off, because with one global trampoline
> > > > > we do not need to call rhashtable lookup before entering bpf prog.
> > > > > bpf prog will do it on demand if/when it needs to access arguments.
> > > > > This will compensate for a bit of lost performance due to extra save/restore.
> > > >
> > > > I don't understand here :/
> > > >
> > > > The rhashtable lookup is done at the beginning of the global trampoline,
> > > > which is called before we enter bpf prog. The bpf progs is stored in the
> > > > kfunc_md, and we need get them from the hash table.
> > >
> > > Ahh. Right.
> > >
> > > Looking at the existing bpf trampoline... It has complicated logic
> > > to handle livepatching and tailcalls. Your global trampoline
> > > doesn't, and once that is added it's starting to feel that it will
> > > look just as complex as the current one.
> > > So I think we better repurpose what we have.
> > > Maybe we can rewrite the existing one in C too.
> >
> > You are right, the tailcalls is not handled yet. But for the livepatching,
> > it is already handled, as we always get the origin ip from the stack
> > and call it, just like how the bpf trampoline handle the livepatching.
> > So no addition handling is needed here.
> >
> > >
> > > How about the following approach.
> > > I think we discussed something like this in the past
> > > and Jiri tried to implement something like this.
> > > Andrii reminded me recently about it.
> > >
> > > Say, we need to attach prog A to 30k functions.
> > > 10k with 2 args, 10k with 3 args, and 10k with 7 args.
> > > We can generate 3 _existing_ bpf trampolines for 2,3,7 args
> > > with hard coded prog A in there (the cookies would need to be
> > > fetched via binary search similar to kprobe-multi).
> > > The arch_prepare_bpf_trampoline() supports BPF_TRAMP_F_ORIG_STACK.
> > > So one 2-arg trampoline will work to invoke prog A in all 10k 2-arg functions.
> > > We don't need to match types, but have to compare that btf_func_model-s
> > > are the same.
> > >
> > > Menglong, your global trampoline for 0,1,..6 args works only for x86,
> > > because btf_func_model doesn't care about sizes of args,
> > > but it's not the correct mental model to use.
> > >
> > > The above "10k with 2 args" is a simplified example.
> > > We will need an arch specific callback is_btf_func_model_equal()
> > > that will compare func models in arch specific ways.
> > > For x86-64 the number of args is all it needs.
> > > For other archs it will compare sizes and flags too.
> > > So 30k functions will be sorted into
> > > 10k with btf_func_model_1, 10k with btf_func_model_2 and so on.
> > > And the corresponding number of equivalent trampolines will be generated.
> > >
> > > Note there will be no actual BTF types. All args will be untyped and
> > > untrusted unlike current fentry.
> > > We can go further and sort 30k functions by comparing BTFs
> > > instead of btf_func_model-s, but I suspect 30k funcs will be split
> > > into several thousands of exact BTFs. At that point multi-fentry
> > > benefits are diminishing and we might as well generate 30k unique
> > > bpf trampolines for 30k functions and avoid all the complexity.
> > > So I would sort by btf_func_model compared by arch specific comparator.
> > >
> > > Now say prog B needs to be attached to another 30k functions.
> > > If all 30k+30k functions are different then it's the same as
> > > the previous step.
> > > Say, prog A is attached to 10k funcs with btf_func_model_1.
> > > If prog B wants to attach to the exact same func set then we
> > > just regenerate bpf trampoline with hard coded progs A and B
> > > and reattach.
> > > If not then we need to split the set into up to 3 sets.
> > > Say, prog B wants 5k funcs, but only 1k func are common:
> > > (prog_A, 9k func with btf_func_model_1) -> bpf trampoline X
> > > (prog_A, prog_B, 1k funcs with btf_func_model_1) -> bpf trampoline Y
> > > (prog_B, 4k funcs with btf_func_model_1) -> bpf trampoline Z
> > >
> > > And so on when prog C needs to be attached.
> > > At detach time we can merge sets/trampolines,
> > > but for now we can leave it all fragmented.
> > > Unlike regular fentry progs the multi-fentry progs are not going to
> > > be attached for long time. So we can reduce the detach complexity.
> > >
> > > The nice part of the algorithm is that coexistence of fentry
> > > and multi-fentry is easy.
> > > If fentry is already attached to some function we just
> > > attach multi-fentry prog to that bpf trampoline.
> > > If multi-fentry was attached first and fentry needs to be attached,
> > > we create a regular bpf trampoline and add both progs there.
> >
> > This seems not easy, and it is exactly how I handle the
> > coexistence now:
> >
> > https://lore.kernel.org/bpf/20250528034712.138701-16-dongml2@chinatelecom.cn/
> > https://lore.kernel.org/bpf/20250528034712.138701-17-dongml2@chinatelecom.cn/
> > https://lore.kernel.org/bpf/20250528034712.138701-18-dongml2@chinatelecom.cn/
>
> hmm. exactly? That's very different.
> You're relying on kfunc_md for prog list.
> The above proposal doesn't need kfunc_md in the critical path.
> All progs are built into the trampolines.
>
> > The most difficult part is that we need a way to replace the the
> > multi-fentry with fentry for the function in the ftrace atomically. Of
> > course, we can remove the global trampoline first, and then attach
> > the bpf trampoline, which will make things much easier. But a
> > short suspend will happen for the progs in fentry-multi.
>
> I don't follow.
> In the above proposal fentry attach/detach is atomic.
> Prepare a new trampoline, single call to ftrace to modify_fentry().
modify_fentry() is used to operate on the same ftrace_ops. For
example, we have bpf trampoline A, and its corresponding
ftrace_ops is opsA. When the image of trampolineA is updated,
we call modify_fentry() on opsA to update its direct call.
When we talk about the coexistence, it means functionA is
attached to the global trampoline B, whose ftrace_ops is
opsB. We can't call modify_fentry(trampolineA, new_addr) here,
as opsA is not registered yet. And we can't call register_fentry()
either, as functionA is already in direct_functions when we
registered opsB.
So we need a way to do such a transition.
>
> > >
> > > The intersect and sorting by btf_func_model is not trivial,
> > > but we can hold global trampoline_mutex, so no concerns of races.
> > >
> > > Example:
> > > bpf_link_A is a set of:
> > > (prog_A, funcs X,Y with btf_func_model_1)
> > > (prog_A, funcs N,M with btf_func_model_2)
> > >
> > > To attach prog B via bpf_link_B that wants:
> > > (prog_B, funcs Y,Z with btf_func_model_1)
> > > (prog_B, funcs P,Q with btf_func_model_3)
> > >
> > > walk all existing links, intersect and split, and update the links.
> > > At the end:
> > >
> > > bpf_link_A:
> > > (prog_A, funcs X with btf_func_model_1)
> > > (prog_A, prog_B funcs Y with btf_func_model_1)
> > > (prog_A, funcs N,M with btf_func_model_2)
> > >
> > > bpf_link_B:
> > > (prog_A, prog_B funcs Y with btf_func_model_1)
> > > (prog_B, funcs Z with btf_func_model_1)
> > > (prog_B, funcs P,Q with btf_func_model_3)
> > >
> > > When link is detached: walk its own tuples, remove the prog,
> > > if nr_progs == 0 -> detach corresponding trampoline,
> > > if nr_progs > 0 -> remove prog and regenerate trampoline.
> > >
> > > If fentry prog C needs to be attached to N it might split bpf_link_A:
> > > (prog_A, funcs X with btf_func_model_1)
> > > (prog_A, prog_B funcs Y with btf_func_model_1)
> > > (prog_A, funcs M with btf_func_model_2)
> > > (prog_A, prog_C funcs N with _fentry_)
> > >
> > > Last time we gave up on it because we discovered that
> > > overlap support was too complicated, but I cannot recall now
> > > what it was :)
> > > Maybe all of the above repeating some old mistakes.
> >
> > In my impression, this is exactly the solution of Jiri's, and this is
> > part of the discussion that I know:
> >
> > https://lore.kernel.org/bpf/ZfKY6E8xhSgzYL1I@krava/
>
> Yes. It's similar, but somehow it feels simple enough now.
> The algorithms for both detach and attach fit on one page,
> and everything is uniform. There are no spaghetty of corner cases.
>
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 22:35 ` Alexei Starovoitov
2025-07-16 22:49 ` Steven Rostedt
@ 2025-07-28 9:20 ` Menglong Dong
2025-07-31 16:15 ` Alexei Starovoitov
1 sibling, 1 reply; 32+ messages in thread
From: Menglong Dong @ 2025-07-28 9:20 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> >
> > > Maybe Peter has better ideas ?
> >
> > Is it possible to express runqueues::nr_pinned as an alias?
> >
> > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> >
> > And use:
> >
> > __this_cpu_inc(&this_nr_pinned);
> >
> >
> > This syntax doesn't actually seem to work; but can we construct
> > something like that?
>
> Yeah. Iant is right. It's a string and not a pointer dereference.
> It never worked.
>
> Few options:
>
> 1.
> struct rq {
> +#ifdef CONFIG_SMP
> + unsigned int nr_pinned;
> +#endif
> /* runqueue lock: */
> raw_spinlock_t __lock;
>
> @@ -1271,9 +1274,6 @@ struct rq {
> struct cpuidle_state *idle_state;
> #endif
>
> -#ifdef CONFIG_SMP
> - unsigned int nr_pinned;
> -#endif
>
> but ugly...
>
> 2.
> static unsigned int nr_pinned_offset __ro_after_init __used;
> RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
>
> overkill for what's needed
>
> 3.
> OFFSET(RQ_nr_pinned, rq, nr_pinned);
> then
> #include <generated/asm-offsets.h>
>
> imo the best.
I had a try. The struct rq is not visible to asm-offsets.c, so we
can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
to define a similar rq-offsets.c in kernel/sched/ ? That will be
more complex than way 2, and I think way 2 is easier :/
>
> 4.
> Maybe we should extend clang/gcc to support attr(preserve_access_index)
> on x86 and other architectures ;)
> We rely heavily on it in bpf backend.
> Then one can simply write:
>
> struct rq___my {
> unsigned int nr_pinned;
> } __attribute__((preserve_access_index));
>
> struct rq___my *rq;
>
> rq = this_rq();
> rq->nr_pinned++;
>
> and the compiler will do its magic of offset adjustment.
> That's how BPF CORE works.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-28 9:20 ` Menglong Dong
@ 2025-07-31 16:15 ` Alexei Starovoitov
2025-08-01 1:42 ` Menglong Dong
2025-08-06 8:44 ` Menglong Dong
0 siblings, 2 replies; 32+ messages in thread
From: Alexei Starovoitov @ 2025-07-31 16:15 UTC (permalink / raw)
To: Menglong Dong
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Mon, Jul 28, 2025 at 2:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> > >
> > > > Maybe Peter has better ideas ?
> > >
> > > Is it possible to express runqueues::nr_pinned as an alias?
> > >
> > > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> > >
> > > And use:
> > >
> > > __this_cpu_inc(&this_nr_pinned);
> > >
> > >
> > > This syntax doesn't actually seem to work; but can we construct
> > > something like that?
> >
> > Yeah. Iant is right. It's a string and not a pointer dereference.
> > It never worked.
> >
> > Few options:
> >
> > 1.
> > struct rq {
> > +#ifdef CONFIG_SMP
> > + unsigned int nr_pinned;
> > +#endif
> > /* runqueue lock: */
> > raw_spinlock_t __lock;
> >
> > @@ -1271,9 +1274,6 @@ struct rq {
> > struct cpuidle_state *idle_state;
> > #endif
> >
> > -#ifdef CONFIG_SMP
> > - unsigned int nr_pinned;
> > -#endif
> >
> > but ugly...
> >
> > 2.
> > static unsigned int nr_pinned_offset __ro_after_init __used;
> > RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
> >
> > overkill for what's needed
> >
> > 3.
> > OFFSET(RQ_nr_pinned, rq, nr_pinned);
> > then
> > #include <generated/asm-offsets.h>
> >
> > imo the best.
>
> I had a try. The struct rq is not visible to asm-offsets.c, so we
> can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
> to define a similar rq-offsets.c in kernel/sched/ ? It will be more
> complex than the way 2, and I think the second way 2 is
> easier :/
2 may be easier, but it's overkill.
I still think asm-offsets is cleaner.
arch/xx shouldn't be used, of course, since this nr_pinned should
be generic for all archs.
We can do something similar to drivers/memory/emif-asm-offsets.c
and do that within kernel/sched/.
rq-offsets.c as you said.
It will generate rq-offsets.h in a build dir that can be #include-d.
I thought about another alternative (as a derivative of 1):
split nr_pinned from 'struct rq' into its own per-cpu variable,
but I don't think that will work, since rq_has_pinned_tasks()
doesn't always operate on this_rq().
So the acceptable choices are realistically 1 and 3 and
rq-offsets.c seems cleaner.
Pls give it another try.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-31 16:15 ` Alexei Starovoitov
@ 2025-08-01 1:42 ` Menglong Dong
2025-08-06 8:44 ` Menglong Dong
1 sibling, 0 replies; 32+ messages in thread
From: Menglong Dong @ 2025-08-01 1:42 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Fri, Aug 1, 2025 at 12:15 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Jul 28, 2025 at 2:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> > > >
> > > > > Maybe Peter has better ideas ?
> > > >
> > > > Is it possible to express runqueues::nr_pinned as an alias?
> > > >
> > > > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> > > >
> > > > And use:
> > > >
> > > > __this_cpu_inc(&this_nr_pinned);
> > > >
> > > >
> > > > This syntax doesn't actually seem to work; but can we construct
> > > > something like that?
> > >
> > > Yeah. Iant is right. It's a string and not a pointer dereference.
> > > It never worked.
> > >
> > > Few options:
> > >
> > > 1.
> > > struct rq {
> > > +#ifdef CONFIG_SMP
> > > + unsigned int nr_pinned;
> > > +#endif
> > > /* runqueue lock: */
> > > raw_spinlock_t __lock;
> > >
> > > @@ -1271,9 +1274,6 @@ struct rq {
> > > struct cpuidle_state *idle_state;
> > > #endif
> > >
> > > -#ifdef CONFIG_SMP
> > > - unsigned int nr_pinned;
> > > -#endif
> > >
> > > but ugly...
> > >
> > > 2.
> > > static unsigned int nr_pinned_offset __ro_after_init __used;
> > > RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
> > >
> > > overkill for what's needed
> > >
> > > 3.
> > > OFFSET(RQ_nr_pinned, rq, nr_pinned);
> > > then
> > > #include <generated/asm-offsets.h>
> > >
> > > imo the best.
> >
> > I had a try. The struct rq is not visible to asm-offsets.c, so we
> > can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
> > to define a similar rq-offsets.c in kernel/sched/ ? It will be more
> > complex than the way 2, and I think the second way 2 is
> > easier :/
>
> 2 maybe easier, but it's an overkill.
> I still think asm-offset is cleaner.
> arch/xx shouldn't be used, of course, since this nr_pinned should
> be generic for all archs.
> We can do something similar to drivers/memory/emif-asm-offsets.c
Great, I'll give this approach a try!
> and do that within kernel/sched/.
> rq-offsets.c as you said.
> It will generate rq-offsets.h in a build dir that can be #include-d.
>
> I thought about another alternative (as a derivative of 1):
> split nr_pinned from 'struct rq' into its own per-cpu variable,
> but I don't think that will work, since rq_has_pinned_tasks()
> doesn't always operate on this_rq().
> So the acceptable choices are realistically 1 and 3 and
> rq-offsets.c seems cleaner.
> Pls give it another try.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-31 16:15 ` Alexei Starovoitov
2025-08-01 1:42 ` Menglong Dong
@ 2025-08-06 8:44 ` Menglong Dong
2025-08-08 0:58 ` Alexei Starovoitov
1 sibling, 1 reply; 32+ messages in thread
From: Menglong Dong @ 2025-08-06 8:44 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Fri, Aug 1, 2025 at 12:15 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Jul 28, 2025 at 2:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> > > >
> > > > > Maybe Peter has better ideas ?
> > > >
> > > > Is it possible to express runqueues::nr_pinned as an alias?
> > > >
> > > > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> > > >
> > > > And use:
> > > >
> > > > __this_cpu_inc(&this_nr_pinned);
> > > >
> > > >
> > > > This syntax doesn't actually seem to work; but can we construct
> > > > something like that?
> > >
> > > Yeah. Iant is right. It's a string and not a pointer dereference.
> > > It never worked.
> > >
> > > Few options:
> > >
> > > 1.
> > > struct rq {
> > > +#ifdef CONFIG_SMP
> > > + unsigned int nr_pinned;
> > > +#endif
> > > /* runqueue lock: */
> > > raw_spinlock_t __lock;
> > >
> > > @@ -1271,9 +1274,6 @@ struct rq {
> > > struct cpuidle_state *idle_state;
> > > #endif
> > >
> > > -#ifdef CONFIG_SMP
> > > - unsigned int nr_pinned;
> > > -#endif
> > >
> > > but ugly...
> > >
> > > 2.
> > > static unsigned int nr_pinned_offset __ro_after_init __used;
> > > RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
> > >
> > > overkill for what's needed
> > >
> > > 3.
> > > OFFSET(RQ_nr_pinned, rq, nr_pinned);
> > > then
> > > #include <generated/asm-offsets.h>
> > >
> > > imo the best.
> >
> > I had a try. The struct rq is not visible to asm-offsets.c, so we
> > can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
> > to define a similar rq-offsets.c in kernel/sched/ ? It will be more
> > complex than the way 2, and I think the second way 2 is
> > easier :/
>
> 2 maybe easier, but it's an overkill.
> I still think asm-offset is cleaner.
> arch/xx shouldn't be used, of course, since this nr_pinned should
> be generic for all archs.
> We can do something similar to drivers/memory/emif-asm-offsets.c
> and do that within kernel/sched/.
> rq-offsets.c as you said.
> It will generate rq-offsets.h in a build dir that can be #include-d.
>
> I thought about another alternative (as a derivative of 1):
> split nr_pinned from 'struct rq' into its own per-cpu variable,
> but I don't think that will work, since rq_has_pinned_tasks()
> doesn't always operate on this_rq().
> So the acceptable choices are realistically 1 and 3 and
> rq-offsets.c seems cleaner.
> Pls give it another try.
Generally speaking, way 3 works. The only problem is how
we handle this_rq(). I introduced the following code in
include/linux/sched.h:
struct rq;
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
this_rq_ptr() is used in migrate_enable(). I have to use
arch_raw_cpu_ptr() for it. this_cpu_ptr() can't be used here, as
it fails in this_cpu_ptr -> raw_cpu_ptr -> __verify_pcpu_ptr:
#define __verify_pcpu_ptr(ptr) \
do { \
const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
(void)__vpp_verify; \
} while (0)
The definition of struct rq is not available here, which makes the
typeof((ptr) + 0) fail during compilation. What can we do here?
According to my testing, the performance of fentry increased from
111M/s to 121M/s with migrate_enable/disable inlined.
Following is the whole patch:
-------------------------------------------------------------------------------------------
diff --git a/Kbuild b/Kbuild
index f327ca86990c..13324b4bbe23 100644
--- a/Kbuild
+++ b/Kbuild
@@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s:
$(timeconst-file) $(bounds-file)
$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
$(call filechk,offsets,__ASM_OFFSETS_H__)
+# Generate rq-offsets.h
+
+rq-offsets-file := include/generated/rq-offsets.h
+
+targets += kernel/sched/rq-offsets.s
+
+kernel/sched/rq-offsets.s: $(offsets-file)
+
+$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
+ $(call filechk,offsets,__RQ_OFFSETS_H__)
+
# Check for missing system calls
quiet_cmd_syscalls = CALL $<
cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags)
$(missing_syscalls_flags)
PHONY += missing-syscalls
-missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
+missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
$(call cmd,syscalls)
# Check the manual modification of atomic headers
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1fad1c8a4c76..3a1c08a75c09 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -369,64 +369,6 @@ static inline void preempt_notifier_init(struct
preempt_notifier *notifier,
#endif
-/*
- * Migrate-Disable and why it is undesired.
- *
- * When a preempted task becomes elegible to run under the ideal model (IOW it
- * becomes one of the M highest priority tasks), it might still have to wait
- * for the preemptee's migrate_disable() section to complete. Thereby suffering
- * a reduction in bandwidth in the exact duration of the migrate_disable()
- * section.
- *
- * Per this argument, the change from preempt_disable() to migrate_disable()
- * gets us:
- *
- * - a higher priority tasks gains reduced wake-up latency; with
preempt_disable()
- * it would have had to wait for the lower priority task.
- *
- * - a lower priority tasks; which under preempt_disable() could've instantly
- * migrated away when another CPU becomes available, is now constrained
- * by the ability to push the higher priority task away, which
might itself be
- * in a migrate_disable() section, reducing it's available bandwidth.
- *
- * IOW it trades latency / moves the interference term, but it stays in the
- * system, and as long as it remains unbounded, the system is not fully
- * deterministic.
- *
- *
- * The reason we have it anyway.
- *
- * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
- * number of primitives into becoming preemptible, they would also allow
- * migration. This turns out to break a bunch of per-cpu usage. To this end,
- * all these primitives employ migirate_disable() to restore this implicit
- * assumption.
- *
- * This is a 'temporary' work-around at best. The correct solution is getting
- * rid of the above assumptions and reworking the code to employ explicit
- * per-cpu locking or short preempt-disable regions.
- *
- * The end goal must be to get rid of migrate_disable(), alternatively we need
- * a schedulability theory that does not depend on abritrary migration.
- *
- *
- * Notes on the implementation.
- *
- * The implementation is particularly tricky since existing code patterns
- * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
- * This means that it cannot use cpus_read_lock() to serialize against hotplug,
- * nor can it easily migrate itself into a pending affinity mask change on
- * migrate_enable().
- *
- *
- * Note: even non-work-conserving schedulers like semi-partitioned depends on
- * migration, so migrate_disable() is not only a problem for
- * work-conserving schedulers.
- *
- */
-extern void migrate_disable(void);
-extern void migrate_enable(void);
-
/**
* preempt_disable_nested - Disable preemption inside a normally
preempt disabled section
*
@@ -471,7 +413,6 @@ static __always_inline void preempt_enable_nested(void)
DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(),
preempt_enable_notrace())
-DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
#ifdef CONFIG_PREEMPT_DYNAMIC
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 40d2fa90df42..365ac6d17504 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -48,6 +48,9 @@
#include <linux/uidgid_types.h>
#include <linux/tracepoint-defs.h>
#include <asm/kmap_size.h>
+#ifndef COMPILE_OFFSETS
+#include <generated/rq-offsets.h>
+#endif
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -2299,4 +2302,127 @@ static __always_inline void
alloc_tag_restore(struct alloc_tag *tag, struct allo
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
+#if defined(CONFIG_SMP) && !defined(COMPILE_OFFSETS)
+
+extern void __migrate_enable(void);
+
+struct rq;
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
+
+/*
+ * Migrate-Disable and why it is undesired.
+ *
+ * When a preempted task becomes elegible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority tasks gains reduced wake-up latency; with
preempt_disable()
+ * it would have had to wait for the lower priority task.
+ *
+ * - a lower priority tasks; which under preempt_disable() could've instantly
+ * migrated away when another CPU becomes available, is now constrained
+ * by the ability to push the higher priority task away, which
might itself be
+ * in a migrate_disable() section, reducing it's available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
+ *
+ *
+ * The reason we have it anyway.
+ *
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migirate_disable() to restore this implicit
+ * assumption.
+ *
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on abritrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depends on
+ * migration, so migrate_disable() is not only a problem for
+ * work-conserving schedulers.
+ *
+ */
+static inline void migrate_enable(void)
+{
+ struct task_struct *p = current;
+
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Check both overflow from migrate_disable() and superfluous
+ * migrate_enable().
+ */
+ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
+ return;
+#endif
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ guard(preempt)();
+ __migrate_enable();
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))--;
+}
+
+static inline void migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ *Warn about overflow half-way through the range.
+ */
+ WARN_ON_ONCE((s16)p->migration_disabled < 0);
+#endif
+ p->migration_disabled++;
+ return;
+ }
+
+ guard(preempt)();
+ (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))++;
+ p->migration_disabled = 1;
+}
+#else
+static inline void migrate_disable(void) { }
+static inline void migrate_enable(void) { }
+#endif
+
+DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
+
#endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 399f03e62508..75d5f145ca60 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23853,8 +23853,7 @@ int bpf_check_attach_target(struct
bpf_verifier_log *log,
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
-BTF_ID(func, migrate_disable)
-BTF_ID(func, migrate_enable)
+BTF_ID(func, __migrate_enable)
#endif
#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
BTF_ID(func, rcu_read_unlock_strict)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3ec00d08d46a..b521024c99ed 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,6 +119,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+EXPORT_SYMBOL_GPL(runqueues);
#ifdef CONFIG_SCHED_PROXY_EXEC
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -2375,28 +2376,7 @@ static void migrate_disable_switch(struct rq
*rq, struct task_struct *p)
__do_set_cpus_allowed(p, &ac);
}
-void migrate_disable(void)
-{
- struct task_struct *p = current;
-
- if (p->migration_disabled) {
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- *Warn about overflow half-way through the range.
- */
- WARN_ON_ONCE((s16)p->migration_disabled < 0);
-#endif
- p->migration_disabled++;
- return;
- }
-
- guard(preempt)();
- this_rq()->nr_pinned++;
- p->migration_disabled = 1;
-}
-EXPORT_SYMBOL_GPL(migrate_disable);
-
-void migrate_enable(void)
+void __migrate_enable(void)
{
struct task_struct *p = current;
struct affinity_context ac = {
@@ -2404,37 +2384,10 @@ void migrate_enable(void)
.flags = SCA_MIGRATE_ENABLE,
};
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- * Check both overflow from migrate_disable() and superfluous
- * migrate_enable().
- */
- if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
- return;
-#endif
-
- if (p->migration_disabled > 1) {
- p->migration_disabled--;
- return;
- }
-
- /*
- * Ensure stop_task runs either before or after this, and that
- * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
- */
- guard(preempt)();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &ac);
- /*
- * Mustn't clear migration_disabled() until cpus_ptr points back at the
- * regular cpus_mask, otherwise things that race (eg.
- * select_fallback_rq) get confused.
- */
- barrier();
- p->migration_disabled = 0;
- this_rq()->nr_pinned--;
}
-EXPORT_SYMBOL_GPL(migrate_enable);
+EXPORT_SYMBOL_GPL(__migrate_enable);
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
new file mode 100644
index 000000000000..a23747bbe25b
--- /dev/null
+++ b/kernel/sched/rq-offsets.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#define COMPILE_OFFSETS
+#include <linux/kbuild.h>
+#include <linux/types.h>
+#include "sched.h"
+
+int main(void)
+{
+ DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
+
+ return 0;
+}
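For reference, the generated include/generated/rq-offsets.h produced by the
rules above ends up looking roughly like this (the offset value is only
illustrative, it depends on the config):

#ifndef __RQ_OFFSETS_H__
#define __RQ_OFFSETS_H__
/*
 * DO NOT MODIFY.
 *
 * This file was generated by Kbuild
 */

#define RQ_nr_pinned 2464 /* offsetof(struct rq, nr_pinned) */

#endif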
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-08-06 8:44 ` Menglong Dong
@ 2025-08-08 0:58 ` Alexei Starovoitov
2025-08-08 5:48 ` Menglong Dong
2025-08-08 6:32 ` Menglong Dong
0 siblings, 2 replies; 32+ messages in thread
From: Alexei Starovoitov @ 2025-08-08 0:58 UTC (permalink / raw)
To: Menglong Dong
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Wed, Aug 6, 2025 at 1:44 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> On Fri, Aug 1, 2025 at 12:15 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Jul 28, 2025 at 2:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > >
> > > On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > >
> > > > > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> > > > >
> > > > > > Maybe Peter has better ideas ?
> > > > >
> > > > > Is it possible to express runqueues::nr_pinned as an alias?
> > > > >
> > > > > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> > > > >
> > > > > And use:
> > > > >
> > > > > __this_cpu_inc(&this_nr_pinned);
> > > > >
> > > > >
> > > > > This syntax doesn't actually seem to work; but can we construct
> > > > > something like that?
> > > >
> > > > Yeah. Iant is right. It's a string and not a pointer dereference.
> > > > It never worked.
> > > >
> > > > Few options:
> > > >
> > > > 1.
> > > > struct rq {
> > > > +#ifdef CONFIG_SMP
> > > > + unsigned int nr_pinned;
> > > > +#endif
> > > > /* runqueue lock: */
> > > > raw_spinlock_t __lock;
> > > >
> > > > @@ -1271,9 +1274,6 @@ struct rq {
> > > > struct cpuidle_state *idle_state;
> > > > #endif
> > > >
> > > > -#ifdef CONFIG_SMP
> > > > - unsigned int nr_pinned;
> > > > -#endif
> > > >
> > > > but ugly...
> > > >
> > > > 2.
> > > > static unsigned int nr_pinned_offset __ro_after_init __used;
> > > > RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
> > > >
> > > > overkill for what's needed
> > > >
> > > > 3.
> > > > OFFSET(RQ_nr_pinned, rq, nr_pinned);
> > > > then
> > > > #include <generated/asm-offsets.h>
> > > >
> > > > imo the best.
> > >
> > > I had a try. The struct rq is not visible to asm-offsets.c, so we
> > > can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
> > > to define a similar rq-offsets.c in kernel/sched/ ? It will be more
> > > complex than the way 2, and I think the second way 2 is
> > > easier :/
> >
> > 2 maybe easier, but it's an overkill.
> > I still think asm-offset is cleaner.
> > arch/xx shouldn't be used, of course, since this nr_pinned should
> > be generic for all archs.
> > We can do something similar to drivers/memory/emif-asm-offsets.c
> > and do that within kernel/sched/.
> > rq-offsets.c as you said.
> > It will generate rq-offsets.h in a build dir that can be #include-d.
> >
> > I thought about another alternative (as a derivative of 1):
> > split nr_pinned from 'struct rq' into its own per-cpu variable,
> > but I don't think that will work, since rq_has_pinned_tasks()
> > doesn't always operate on this_rq().
> > So the acceptable choices are realistically 1 and 3 and
> > rq-offsets.c seems cleaner.
> > Pls give it another try.
>
> Generally speaking, the way 3 works. The only problem is how
> we handle this_rq(). I introduced following code in
> include/linux/sched.h:
>
> struct rq;
> DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> #define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
>
> The this_rq_ptr() is used in migrate_enable(). I have to use the
> arch_raw_cpu_ptr() for it. this_cpu_ptr() can't be used here, as
> it will fail on this_cpu_ptr -> raw_cpu_ptr -> __verify_pcpu_ptr:
>
> #define __verify_pcpu_ptr(ptr) \
> do { \
> const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
> (void)__vpp_verify; \
> } while (0)
>
> The struct rq is not available here, which makes the typeof((ptr) + 0)
> fail during compiling. What can we do here?
Interesting.
The comment says:
* + 0 is required in order to convert the pointer type from a
* potential array type to a pointer to a single item of the array.
so maybe we can do some macro magic to avoid the '+ 0'
when the type is already a pointer,
but for now let's proceed with arch_raw_cpu_ptr().
> According to my testing, the performance of fentry increased from
> 111M/s to 121M/s with migrate_enable/disable inlined.
Very nice.
> Following is the whole patch:
> -------------------------------------------------------------------------------------------
> diff --git a/Kbuild b/Kbuild
> index f327ca86990c..13324b4bbe23 100644
> --- a/Kbuild
> +++ b/Kbuild
> @@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s:
> $(timeconst-file) $(bounds-file)
> $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
> $(call filechk,offsets,__ASM_OFFSETS_H__)
>
> +# Generate rq-offsets.h
> +
> +rq-offsets-file := include/generated/rq-offsets.h
> +
> +targets += kernel/sched/rq-offsets.s
> +
> +kernel/sched/rq-offsets.s: $(offsets-file)
> +
> +$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
> + $(call filechk,offsets,__RQ_OFFSETS_H__)
> +
> # Check for missing system calls
>
> quiet_cmd_syscalls = CALL $<
> cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags)
> $(missing_syscalls_flags)
>
> PHONY += missing-syscalls
> -missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
> +missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
> $(call cmd,syscalls)
>
> # Check the manual modification of atomic headers
> diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> index 1fad1c8a4c76..3a1c08a75c09 100644
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -369,64 +369,6 @@ static inline void preempt_notifier_init(struct
> preempt_notifier *notifier,
>
> #endif
>
> -/*
> - * Migrate-Disable and why it is undesired.
Keep the comment where it is. It will keep the diff smaller.
There is really no need to move it.
> - *
> - * When a preempted task becomes elegible to run under the ideal model (IOW it
but fix the typos.
> - * becomes one of the M highest priority tasks), it might still have to wait
> - * for the preemptee's migrate_disable() section to complete. Thereby suffering
> - * a reduction in bandwidth in the exact duration of the migrate_disable()
> - * section.
> - *
> - * Per this argument, the change from preempt_disable() to migrate_disable()
> - * gets us:
> - *
> - * - a higher priority tasks gains reduced wake-up latency; with
> preempt_disable()
> - * it would have had to wait for the lower priority task.
> - *
> - * - a lower priority tasks; which under preempt_disable() could've instantly
> - * migrated away when another CPU becomes available, is now constrained
> - * by the ability to push the higher priority task away, which
> might itself be
> - * in a migrate_disable() section, reducing it's available bandwidth.
> - *
> - * IOW it trades latency / moves the interference term, but it stays in the
> - * system, and as long as it remains unbounded, the system is not fully
> - * deterministic.
> - *
> - *
> - * The reason we have it anyway.
> - *
> - * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
> - * number of primitives into becoming preemptible, they would also allow
> - * migration. This turns out to break a bunch of per-cpu usage. To this end,
> - * all these primitives employ migirate_disable() to restore this implicit
> - * assumption.
> - *
> - * This is a 'temporary' work-around at best. The correct solution is getting
> - * rid of the above assumptions and reworking the code to employ explicit
> - * per-cpu locking or short preempt-disable regions.
> - *
> - * The end goal must be to get rid of migrate_disable(), alternatively we need
> - * a schedulability theory that does not depend on abritrary migration.
and this one.
> - *
> - *
> - * Notes on the implementation.
> - *
> - * The implementation is particularly tricky since existing code patterns
> - * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
> - * This means that it cannot use cpus_read_lock() to serialize against hotplug,
> - * nor can it easily migrate itself into a pending affinity mask change on
> - * migrate_enable().
> - *
> - *
> - * Note: even non-work-conserving schedulers like semi-partitioned depends on
> - * migration, so migrate_disable() is not only a problem for
> - * work-conserving schedulers.
> - *
> - */
> -extern void migrate_disable(void);
> -extern void migrate_enable(void);
> -
> /**
> * preempt_disable_nested - Disable preemption inside a normally
> preempt disabled section
> *
> @@ -471,7 +413,6 @@ static __always_inline void preempt_enable_nested(void)
>
> DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
> DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(),
> preempt_enable_notrace())
> -DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
hmm. why?
> #ifdef CONFIG_PREEMPT_DYNAMIC
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 40d2fa90df42..365ac6d17504 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -48,6 +48,9 @@
> #include <linux/uidgid_types.h>
> #include <linux/tracepoint-defs.h>
> #include <asm/kmap_size.h>
> +#ifndef COMPILE_OFFSETS
> +#include <generated/rq-offsets.h>
> +#endif
>
> /* task_struct member predeclarations (sorted alphabetically): */
> struct audit_context;
> @@ -2299,4 +2302,127 @@ static __always_inline void
> alloc_tag_restore(struct alloc_tag *tag, struct allo
> #define alloc_tag_restore(_tag, _old) do {} while (0)
> #endif
>
> +#if defined(CONFIG_SMP) && !defined(COMPILE_OFFSETS)
> +
> +extern void __migrate_enable(void);
> +
> +struct rq;
> +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> +#define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
> +
> +/*
> + * Migrate-Disable and why it is undesired.
> + *
> + * When a preempted task becomes elegible to run under the ideal model (IOW it
> + * becomes one of the M highest priority tasks), it might still have to wait
> + * for the preemptee's migrate_disable() section to complete. Thereby suffering
> + * a reduction in bandwidth in the exact duration of the migrate_disable()
> + * section.
> + *
> + * Per this argument, the change from preempt_disable() to migrate_disable()
> + * gets us:
> + *
> + * - a higher priority tasks gains reduced wake-up latency; with
> preempt_disable()
> + * it would have had to wait for the lower priority task.
> + *
> + * - a lower priority tasks; which under preempt_disable() could've instantly
> + * migrated away when another CPU becomes available, is now constrained
> + * by the ability to push the higher priority task away, which
> might itself be
> + * in a migrate_disable() section, reducing it's available bandwidth.
> + *
> + * IOW it trades latency / moves the interference term, but it stays in the
> + * system, and as long as it remains unbounded, the system is not fully
> + * deterministic.
> + *
> + *
> + * The reason we have it anyway.
> + *
> + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
> + * number of primitives into becoming preemptible, they would also allow
> + * migration. This turns out to break a bunch of per-cpu usage. To this end,
> + * all these primitives employ migirate_disable() to restore this implicit
> + * assumption.
> + *
> + * This is a 'temporary' work-around at best. The correct solution is getting
> + * rid of the above assumptions and reworking the code to employ explicit
> + * per-cpu locking or short preempt-disable regions.
> + *
> + * The end goal must be to get rid of migrate_disable(), alternatively we need
> + * a schedulability theory that does not depend on abritrary migration.
> + *
> + *
> + * Notes on the implementation.
> + *
> + * The implementation is particularly tricky since existing code patterns
> + * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
> + * This means that it cannot use cpus_read_lock() to serialize against hotplug,
> + * nor can it easily migrate itself into a pending affinity mask change on
> + * migrate_enable().
> + *
> + *
> + * Note: even non-work-conserving schedulers like semi-partitioned depends on
> + * migration, so migrate_disable() is not only a problem for
> + * work-conserving schedulers.
> + *
> + */
> +static inline void migrate_enable(void)
> +{
> + struct task_struct *p = current;
> +
> +#ifdef CONFIG_DEBUG_PREEMPT
> + /*
> + * Check both overflow from migrate_disable() and superfluous
> + * migrate_enable().
> + */
> + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
> + return;
> +#endif
> +
> + if (p->migration_disabled > 1) {
> + p->migration_disabled--;
> + return;
> + }
> +
> + /*
> + * Ensure stop_task runs either before or after this, and that
> + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
> + */
> + guard(preempt)();
> + __migrate_enable();
You're leaving performance on the table.
In many cases bpf is the one and only user of migrate_enable/disable
and it's not nested.
So this call is likely hot.
Move 'if (p->cpus_ptr != &p->cpus_mask)' check into .h
and only keep slow path of __set_cpus_allowed_ptr() in .c
Can probably wrap it with likely() too.
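For illustration only, a rough sketch of that split (not the actual patch; it
assumes __migrate_enable() keeps just the affinity-restore slow path, and the
DEBUG_PREEMPT check is elided for brevity):

  /* include/linux/sched.h -- fast path stays inline */
  static inline void migrate_enable(void)
  {
          struct task_struct *p = current;

          if (p->migration_disabled > 1) {
                  p->migration_disabled--;
                  return;
          }

          guard(preempt)();
          /* only pay for the call when the affinity mask was actually clipped */
          if (unlikely(p->cpus_ptr != &p->cpus_mask))
                  __migrate_enable();
          barrier();
          p->migration_disabled = 0;
          (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))--;
  }

  /* kernel/sched/core.c -- only the slow path remains */
  void __migrate_enable(void)
  {
          struct affinity_context ac = {
                  .new_mask = &current->cpus_mask,
                  .flags    = SCA_MIGRATE_ENABLE,
          };

          __set_cpus_allowed_ptr(current, &ac);
  }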
> + /*
> + * Mustn't clear migration_disabled() until cpus_ptr points back at the
> + * regular cpus_mask, otherwise things that race (eg.
> + * select_fallback_rq) get confused.
> + */
> + barrier();
> + p->migration_disabled = 0;
> + (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))--;
> +}
> +
> +static inline void migrate_disable(void)
> +{
> + struct task_struct *p = current;
> +
> + if (p->migration_disabled) {
> +#ifdef CONFIG_DEBUG_PREEMPT
> + /*
> + *Warn about overflow half-way through the range.
> + */
> + WARN_ON_ONCE((s16)p->migration_disabled < 0);
> +#endif
> + p->migration_disabled++;
> + return;
> + }
> +
> + guard(preempt)();
> + (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))++;
> + p->migration_disabled = 1;
> +}
> +#else
> +static inline void migrate_disable(void) { }
> +static inline void migrate_enable(void) { }
> +#endif
> +
> +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
> +
> #endif
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 399f03e62508..75d5f145ca60 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -23853,8 +23853,7 @@ int bpf_check_attach_target(struct
> bpf_verifier_log *log,
> BTF_SET_START(btf_id_deny)
> BTF_ID_UNUSED
> #ifdef CONFIG_SMP
> -BTF_ID(func, migrate_disable)
> -BTF_ID(func, migrate_enable)
> +BTF_ID(func, __migrate_enable)
> #endif
> #if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
> BTF_ID(func, rcu_read_unlock_strict)
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3ec00d08d46a..b521024c99ed 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -119,6 +119,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
>
> DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> +EXPORT_SYMBOL_GPL(runqueues);
why?
>
> #ifdef CONFIG_SCHED_PROXY_EXEC
> DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
> @@ -2375,28 +2376,7 @@ static void migrate_disable_switch(struct rq
> *rq, struct task_struct *p)
> __do_set_cpus_allowed(p, &ac);
> }
>
> -void migrate_disable(void)
> -{
> - struct task_struct *p = current;
> -
> - if (p->migration_disabled) {
> -#ifdef CONFIG_DEBUG_PREEMPT
> - /*
> - *Warn about overflow half-way through the range.
> - */
> - WARN_ON_ONCE((s16)p->migration_disabled < 0);
> -#endif
> - p->migration_disabled++;
> - return;
> - }
> -
> - guard(preempt)();
> - this_rq()->nr_pinned++;
> - p->migration_disabled = 1;
> -}
> -EXPORT_SYMBOL_GPL(migrate_disable);
> -
> -void migrate_enable(void)
> +void __migrate_enable(void)
> {
> struct task_struct *p = current;
> struct affinity_context ac = {
> @@ -2404,37 +2384,10 @@ void migrate_enable(void)
> .flags = SCA_MIGRATE_ENABLE,
> };
>
> -#ifdef CONFIG_DEBUG_PREEMPT
> - /*
> - * Check both overflow from migrate_disable() and superfluous
> - * migrate_enable().
> - */
> - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
> - return;
> -#endif
> -
> - if (p->migration_disabled > 1) {
> - p->migration_disabled--;
> - return;
> - }
> -
> - /*
> - * Ensure stop_task runs either before or after this, and that
> - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
> - */
> - guard(preempt)();
> if (p->cpus_ptr != &p->cpus_mask)
> __set_cpus_allowed_ptr(p, &ac);
> - /*
> - * Mustn't clear migration_disabled() until cpus_ptr points back at the
> - * regular cpus_mask, otherwise things that race (eg.
> - * select_fallback_rq) get confused.
> - */
> - barrier();
> - p->migration_disabled = 0;
> - this_rq()->nr_pinned--;
> }
> -EXPORT_SYMBOL_GPL(migrate_enable);
> +EXPORT_SYMBOL_GPL(__migrate_enable);
>
> static inline bool rq_has_pinned_tasks(struct rq *rq)
> {
> diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
> new file mode 100644
> index 000000000000..a23747bbe25b
> --- /dev/null
> +++ b/kernel/sched/rq-offsets.c
> @@ -0,0 +1,12 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define COMPILE_OFFSETS
> +#include <linux/kbuild.h>
> +#include <linux/types.h>
> +#include "sched.h"
> +
> +int main(void)
> +{
> + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
This part looks nice and sweet. Not sure what you were concerned about.
Respin it as a proper patch targeting tip tree.
And explain the motivation in commit log with detailed
'perf report' before/after along with 111M/s to 121M/s speed up,
I suspect with my other __set_cpus_allowed_ptr() suggestion
the speed up should be even bigger.
> + return 0;
> +}
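For reference, the generated <generated/rq-offsets.h> would look roughly like
the usual asm-offsets output; the numeric value below is made up for
illustration:

  #ifndef __RQ_OFFSETS_H__
  #define __RQ_OFFSETS_H__
  /*
   * DO NOT MODIFY.
   *
   * This file was generated by Kbuild
   */

  #define RQ_nr_pinned 2960 /* offsetof(struct rq, nr_pinned) */

  #endif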
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-08-08 0:58 ` Alexei Starovoitov
@ 2025-08-08 5:48 ` Menglong Dong
2025-08-08 6:32 ` Menglong Dong
1 sibling, 0 replies; 32+ messages in thread
From: Menglong Dong @ 2025-08-08 5:48 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Fri, Aug 8, 2025 at 8:58 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Aug 6, 2025 at 1:44 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > On Fri, Aug 1, 2025 at 12:15 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Mon, Jul 28, 2025 at 2:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > > >
> > > > On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > >
> > > > > > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> > > > > >
> > > > > > > Maybe Peter has better ideas ?
> > > > > >
> > > > > > Is it possible to express runqueues::nr_pinned as an alias?
> > > > > >
> > > > > > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> > > > > >
> > > > > > And use:
> > > > > >
> > > > > > __this_cpu_inc(&this_nr_pinned);
> > > > > >
> > > > > >
> > > > > > This syntax doesn't actually seem to work; but can we construct
> > > > > > something like that?
> > > > >
> > > > > Yeah. Iant is right. It's a string and not a pointer dereference.
> > > > > It never worked.
> > > > >
> > > > > Few options:
> > > > >
> > > > > 1.
> > > > > struct rq {
> > > > > +#ifdef CONFIG_SMP
> > > > > + unsigned int nr_pinned;
> > > > > +#endif
> > > > > /* runqueue lock: */
> > > > > raw_spinlock_t __lock;
> > > > >
> > > > > @@ -1271,9 +1274,6 @@ struct rq {
> > > > > struct cpuidle_state *idle_state;
> > > > > #endif
> > > > >
> > > > > -#ifdef CONFIG_SMP
> > > > > - unsigned int nr_pinned;
> > > > > -#endif
> > > > >
> > > > > but ugly...
> > > > >
> > > > > 2.
> > > > > static unsigned int nr_pinned_offset __ro_after_init __used;
> > > > > RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
> > > > >
> > > > > overkill for what's needed
> > > > >
> > > > > 3.
> > > > > OFFSET(RQ_nr_pinned, rq, nr_pinned);
> > > > > then
> > > > > #include <generated/asm-offsets.h>
> > > > >
> > > > > imo the best.
> > > >
> > > > I had a try. The struct rq is not visible to asm-offsets.c, so we
> > > > can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
> > > > to define a similar rq-offsets.c in kernel/sched/ ? It will be more
> > > > complex than way 2, and I think way 2 is
> > > > easier :/
> > >
> > > 2 maybe easier, but it's an overkill.
> > > I still think asm-offset is cleaner.
> > > arch/xx shouldn't be used, of course, since this nr_pinned should
> > > be generic for all archs.
> > > We can do something similar to drivers/memory/emif-asm-offsets.c
> > > and do that within kernel/sched/.
> > > rq-offsets.c as you said.
> > > It will generate rq-offsets.h in a build dir that can be #include-d.
> > >
> > > I thought about another alternative (as a derivative of 1):
> > > split nr_pinned from 'struct rq' into its own per-cpu variable,
> > > but I don't think that will work, since rq_has_pinned_tasks()
> > > doesn't always operate on this_rq().
> > > So the acceptable choices are realistically 1 and 3 and
> > > rq-offsets.c seems cleaner.
> > > Pls give it another try.
> >
> > Generally speaking, way 3 works. The only problem is how
> > we handle this_rq(). I introduced the following code in
> > include/linux/sched.h:
> >
> > struct rq;
> > DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> > #define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
> >
> > The this_rq_ptr() is used in migrate_enable(). I have to use the
> > arch_raw_cpu_ptr() for it. this_cpu_ptr() can't be used here, as
> > it will fail on this_cpu_ptr -> raw_cpu_ptr -> __verify_pcpu_ptr:
> >
> > #define __verify_pcpu_ptr(ptr) \
> > do { \
> > const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
> > (void)__vpp_verify; \
> > } while (0)
> >
> > The struct rq is not available here, which makes the typeof((ptr) + 0)
> > fail to compile. What can we do here?
>
> Interesting.
> The comment says:
> * + 0 is required in order to convert the pointer type from a
> * potential array type to a pointer to a single item of the array.
>
> so maybe we can do some macro magic to avoid '+ 0'
> when type is already pointer,
> but for now let's proceed with arch_raw_cpu_ptr().
OK
>
> > According to my testing, the performance of fentry increased from
> > 111M/s to 121M/s with migrate_enable/disable inlined.
>
> Very nice.
>
> > Following is the whole patch:
> > -------------------------------------------------------------------------------------------
> > diff --git a/Kbuild b/Kbuild
> > index f327ca86990c..13324b4bbe23 100644
> > --- a/Kbuild
> > +++ b/Kbuild
> > @@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s:
> > $(timeconst-file) $(bounds-file)
> > $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
> > $(call filechk,offsets,__ASM_OFFSETS_H__)
> >
> > +# Generate rq-offsets.h
> > +
> > +rq-offsets-file := include/generated/rq-offsets.h
> > +
> > +targets += kernel/sched/rq-offsets.s
> > +
> > +kernel/sched/rq-offsets.s: $(offsets-file)
> > +
> > +$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
> > + $(call filechk,offsets,__RQ_OFFSETS_H__)
> > +
> > # Check for missing system calls
> >
> > quiet_cmd_syscalls = CALL $<
> > cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags)
> > $(missing_syscalls_flags)
> >
> > PHONY += missing-syscalls
> > -missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
> > +missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
> > $(call cmd,syscalls)
> >
> > # Check the manual modification of atomic headers
> > diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> > index 1fad1c8a4c76..3a1c08a75c09 100644
> > --- a/include/linux/preempt.h
> > +++ b/include/linux/preempt.h
> > @@ -369,64 +369,6 @@ static inline void preempt_notifier_init(struct
> > preempt_notifier *notifier,
> >
> > #endif
> >
> > -/*
> > - * Migrate-Disable and why it is undesired.
>
> Keep the comment where it is. It will keep the diff smaller.
> There is really no need to move it.
OK
>
> > - *
> > - * When a preempted task becomes elegible to run under the ideal model (IOW it
>
> but fix the typos.
OK
>
> > - * becomes one of the M highest priority tasks), it might still have to wait
> > - * for the preemptee's migrate_disable() section to complete. Thereby suffering
> > - * a reduction in bandwidth in the exact duration of the migrate_disable()
> > - * section.
> > - *
> > - * Per this argument, the change from preempt_disable() to migrate_disable()
> > - * gets us:
> > - *
> > - * - a higher priority tasks gains reduced wake-up latency; with
> > preempt_disable()
> > - * it would have had to wait for the lower priority task.
> > - *
> > - * - a lower priority tasks; which under preempt_disable() could've instantly
> > - * migrated away when another CPU becomes available, is now constrained
> > - * by the ability to push the higher priority task away, which
> > might itself be
> > - * in a migrate_disable() section, reducing it's available bandwidth.
> > - *
> > - * IOW it trades latency / moves the interference term, but it stays in the
> > - * system, and as long as it remains unbounded, the system is not fully
> > - * deterministic.
> > - *
> > - *
> > - * The reason we have it anyway.
> > - *
> > - * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
> > - * number of primitives into becoming preemptible, they would also allow
> > - * migration. This turns out to break a bunch of per-cpu usage. To this end,
> > - * all these primitives employ migirate_disable() to restore this implicit
> > - * assumption.
> > - *
> > - * This is a 'temporary' work-around at best. The correct solution is getting
> > - * rid of the above assumptions and reworking the code to employ explicit
> > - * per-cpu locking or short preempt-disable regions.
> > - *
> > - * The end goal must be to get rid of migrate_disable(), alternatively we need
> > - * a schedulability theory that does not depend on abritrary migration.
>
> and this one.
OK
>
> > - *
> > - *
> > - * Notes on the implementation.
> > - *
> > - * The implementation is particularly tricky since existing code patterns
> > - * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
> > - * This means that it cannot use cpus_read_lock() to serialize against hotplug,
> > - * nor can it easily migrate itself into a pending affinity mask change on
> > - * migrate_enable().
> > - *
> > - *
> > - * Note: even non-work-conserving schedulers like semi-partitioned depends on
> > - * migration, so migrate_disable() is not only a problem for
> > - * work-conserving schedulers.
> > - *
> > - */
> > -extern void migrate_disable(void);
> > -extern void migrate_enable(void);
> > -
> > /**
> > * preempt_disable_nested - Disable preemption inside a normally
> > preempt disabled section
> > *
> > @@ -471,7 +413,6 @@ static __always_inline void preempt_enable_nested(void)
> >
> > DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
> > DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(),
> > preempt_enable_notrace())
> > -DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>
> hmm. why?
Because we moved migrate_disable and migrate_enable to
include/linux/sched.h, they are no longer available in
include/linux/preempt.h, so we need to move this line to
include/linux/sched.h too.
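For context, callers keep using the guard exactly as before; a hedged example
(the function below is hypothetical):

  static void touch_per_cpu_state(void)
  {
          /* migrate_disable() here, migrate_enable() on scope exit */
          guard(migrate)();

          /* ... per-CPU work that must not migrate ... */
  }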
>
> > #ifdef CONFIG_PREEMPT_DYNAMIC
> >
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 40d2fa90df42..365ac6d17504 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -48,6 +48,9 @@
> > #include <linux/uidgid_types.h>
> > #include <linux/tracepoint-defs.h>
> > #include <asm/kmap_size.h>
> > +#ifndef COMPILE_OFFSETS
> > +#include <generated/rq-offsets.h>
> > +#endif
> >
> > /* task_struct member predeclarations (sorted alphabetically): */
> > struct audit_context;
> > @@ -2299,4 +2302,127 @@ static __always_inline void
> > alloc_tag_restore(struct alloc_tag *tag, struct allo
> > #define alloc_tag_restore(_tag, _old) do {} while (0)
> > #endif
> >
> > +#if defined(CONFIG_SMP) && !defined(COMPILE_OFFSETS)
> > +
> > +extern void __migrate_enable(void);
> > +
> > +struct rq;
> > +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> > +#define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
> > +
> > +/*
> > + * Migrate-Disable and why it is undesired.
> > + *
> > + * When a preempted task becomes elegible to run under the ideal model (IOW it
> > + * becomes one of the M highest priority tasks), it might still have to wait
> > + * for the preemptee's migrate_disable() section to complete. Thereby suffering
> > + * a reduction in bandwidth in the exact duration of the migrate_disable()
> > + * section.
> > + *
> > + * Per this argument, the change from preempt_disable() to migrate_disable()
> > + * gets us:
> > + *
> > + * - a higher priority tasks gains reduced wake-up latency; with
> > preempt_disable()
> > + * it would have had to wait for the lower priority task.
> > + *
> > + * - a lower priority tasks; which under preempt_disable() could've instantly
> > + * migrated away when another CPU becomes available, is now constrained
> > + * by the ability to push the higher priority task away, which
> > might itself be
> > + * in a migrate_disable() section, reducing it's available bandwidth.
> > + *
> > + * IOW it trades latency / moves the interference term, but it stays in the
> > + * system, and as long as it remains unbounded, the system is not fully
> > + * deterministic.
> > + *
> > + *
> > + * The reason we have it anyway.
> > + *
> > + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
> > + * number of primitives into becoming preemptible, they would also allow
> > + * migration. This turns out to break a bunch of per-cpu usage. To this end,
> > + * all these primitives employ migirate_disable() to restore this implicit
> > + * assumption.
> > + *
> > + * This is a 'temporary' work-around at best. The correct solution is getting
> > + * rid of the above assumptions and reworking the code to employ explicit
> > + * per-cpu locking or short preempt-disable regions.
> > + *
> > + * The end goal must be to get rid of migrate_disable(), alternatively we need
> > + * a schedulability theory that does not depend on abritrary migration.
> > + *
> > + *
> > + * Notes on the implementation.
> > + *
> > + * The implementation is particularly tricky since existing code patterns
> > + * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
> > + * This means that it cannot use cpus_read_lock() to serialize against hotplug,
> > + * nor can it easily migrate itself into a pending affinity mask change on
> > + * migrate_enable().
> > + *
> > + *
> > + * Note: even non-work-conserving schedulers like semi-partitioned depends on
> > + * migration, so migrate_disable() is not only a problem for
> > + * work-conserving schedulers.
> > + *
> > + */
> > +static inline void migrate_enable(void)
> > +{
> > + struct task_struct *p = current;
> > +
> > +#ifdef CONFIG_DEBUG_PREEMPT
> > + /*
> > + * Check both overflow from migrate_disable() and superfluous
> > + * migrate_enable().
> > + */
> > + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
> > + return;
> > +#endif
> > +
> > + if (p->migration_disabled > 1) {
> > + p->migration_disabled--;
> > + return;
> > + }
> > +
> > + /*
> > + * Ensure stop_task runs either before or after this, and that
> > + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
> > + */
> > + guard(preempt)();
> > + __migrate_enable();
>
> You're leaving performance on the table.
> In many cases bpf is the one and only user of migrate_enable/disable
> and it's not nested.
> So this call is likely hot.
> Move 'if (p->cpus_ptr != &p->cpus_mask)' check into .h
> and only keep slow path of __set_cpus_allowed_ptr() in .c
Oops, my mistake, I should do it this way :/
>
> Can probably wrap it with likely() too.
>
> > + /*
> > + * Mustn't clear migration_disabled() until cpus_ptr points back at the
> > + * regular cpus_mask, otherwise things that race (eg.
> > + * select_fallback_rq) get confused.
> > + */
> > + barrier();
> > + p->migration_disabled = 0;
> > + (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))--;
> > +}
> > +
> > +static inline void migrate_disable(void)
> > +{
> > + struct task_struct *p = current;
> > +
> > + if (p->migration_disabled) {
> > +#ifdef CONFIG_DEBUG_PREEMPT
> > + /*
> > + *Warn about overflow half-way through the range.
> > + */
> > + WARN_ON_ONCE((s16)p->migration_disabled < 0);
> > +#endif
> > + p->migration_disabled++;
> > + return;
> > + }
> > +
> > + guard(preempt)();
> > + (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))++;
> > + p->migration_disabled = 1;
> > +}
> > +#else
> > +static inline void migrate_disable(void) { }
> > +static inline void migrate_enable(void) { }
> > +#endif
> > +
> > +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
> > +
> > #endif
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 399f03e62508..75d5f145ca60 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -23853,8 +23853,7 @@ int bpf_check_attach_target(struct
> > bpf_verifier_log *log,
> > BTF_SET_START(btf_id_deny)
> > BTF_ID_UNUSED
> > #ifdef CONFIG_SMP
> > -BTF_ID(func, migrate_disable)
> > -BTF_ID(func, migrate_enable)
> > +BTF_ID(func, __migrate_enable)
> > #endif
> > #if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
> > BTF_ID(func, rcu_read_unlock_strict)
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 3ec00d08d46a..b521024c99ed 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -119,6 +119,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
> > EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
> >
> > DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> > +EXPORT_SYMBOL_GPL(runqueues);
>
> why?
Because runqueues is now referenced directly in
migrate_enable/migrate_disable, and those can be called from
modules, we need to export runqueues. Don't we?
>
> >
> > #ifdef CONFIG_SCHED_PROXY_EXEC
> > DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
> > @@ -2375,28 +2376,7 @@ static void migrate_disable_switch(struct rq
> > *rq, struct task_struct *p)
> > __do_set_cpus_allowed(p, &ac);
> > }
> >
> > -void migrate_disable(void)
> > -{
> > - struct task_struct *p = current;
> > -
> > - if (p->migration_disabled) {
> > -#ifdef CONFIG_DEBUG_PREEMPT
> > - /*
> > - *Warn about overflow half-way through the range.
> > - */
> > - WARN_ON_ONCE((s16)p->migration_disabled < 0);
> > -#endif
> > - p->migration_disabled++;
> > - return;
> > - }
> > -
> > - guard(preempt)();
> > - this_rq()->nr_pinned++;
> > - p->migration_disabled = 1;
> > -}
> > -EXPORT_SYMBOL_GPL(migrate_disable);
> > -
> > -void migrate_enable(void)
> > +void __migrate_enable(void)
> > {
> > struct task_struct *p = current;
> > struct affinity_context ac = {
> > @@ -2404,37 +2384,10 @@ void migrate_enable(void)
> > .flags = SCA_MIGRATE_ENABLE,
> > };
> >
> > -#ifdef CONFIG_DEBUG_PREEMPT
> > - /*
> > - * Check both overflow from migrate_disable() and superfluous
> > - * migrate_enable().
> > - */
> > - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
> > - return;
> > -#endif
> > -
> > - if (p->migration_disabled > 1) {
> > - p->migration_disabled--;
> > - return;
> > - }
> > -
> > - /*
> > - * Ensure stop_task runs either before or after this, and that
> > - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
> > - */
> > - guard(preempt)();
> > if (p->cpus_ptr != &p->cpus_mask)
> > __set_cpus_allowed_ptr(p, &ac);
> > - /*
> > - * Mustn't clear migration_disabled() until cpus_ptr points back at the
> > - * regular cpus_mask, otherwise things that race (eg.
> > - * select_fallback_rq) get confused.
> > - */
> > - barrier();
> > - p->migration_disabled = 0;
> > - this_rq()->nr_pinned--;
> > }
> > -EXPORT_SYMBOL_GPL(migrate_enable);
> > +EXPORT_SYMBOL_GPL(__migrate_enable);
> >
> > static inline bool rq_has_pinned_tasks(struct rq *rq)
> > {
> > diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
> > new file mode 100644
> > index 000000000000..a23747bbe25b
> > --- /dev/null
> > +++ b/kernel/sched/rq-offsets.c
> > @@ -0,0 +1,12 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#define COMPILE_OFFSETS
> > +#include <linux/kbuild.h>
> > +#include <linux/types.h>
> > +#include "sched.h"
> > +
> > +int main(void)
> > +{
> > + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
>
> This part looks nice and sweet. Not sure what you were concerned about.
The usage of arch_raw_cpu_ptr() looks ugly and there is
no existing usage like it, which made me concerned.
>
> Respin it as a proper patch targeting tip tree.
>
> And explain the motivation in commit log with detailed
> 'perf report' before/after along with 111M/s to 121M/s speed up,
>
> I suspect with my other __set_cpus_allowed_ptr() suggestion
> the speed up should be even bigger.
It should be. I'll respin this patch and send it to the tip tree.
Thanks!
Menglong Dong
>
> > + return 0;
> > +}
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-08-08 0:58 ` Alexei Starovoitov
2025-08-08 5:48 ` Menglong Dong
@ 2025-08-08 6:32 ` Menglong Dong
2025-08-08 15:47 ` Alexei Starovoitov
1 sibling, 1 reply; 32+ messages in thread
From: Menglong Dong @ 2025-08-08 6:32 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Fri, Aug 8, 2025 at 8:58 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
[......]
> > +{
> > + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
>
> This part looks nice and sweet. Not sure what you were concerned about.
>
> Respin it as a proper patch targeting tip tree.
>
> And explain the motivation in commit log with detailed
> 'perf report' before/after along with 111M/s to 121M/s speed up,
>
> I suspect with my other __set_cpus_allowed_ptr() suggestion
> the speed up should be even bigger.
Much better.
Before:
fentry : 113.030 ± 0.149M/s
fentry : 112.501 ± 0.187M/s
fentry : 112.828 ± 0.267M/s
fentry : 115.287 ± 0.241M/s
After:
fentry : 143.644 ± 0.670M/s
fentry : 149.764 ± 0.362M/s
fentry : 149.642 ± 0.156M/s
fentry : 145.263 ± 0.221M/s
fentry : 145.558 ± 0.145M/s
>
> > + return 0;
> > +}
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-08-08 6:32 ` Menglong Dong
@ 2025-08-08 15:47 ` Alexei Starovoitov
0 siblings, 0 replies; 32+ messages in thread
From: Alexei Starovoitov @ 2025-08-08 15:47 UTC (permalink / raw)
To: Menglong Dong
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Thu, Aug 7, 2025 at 11:32 PM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> On Fri, Aug 8, 2025 at 8:58 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> [......]
> > > +{
> > > + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
> >
> > This part looks nice and sweet. Not sure what you were concerned about.
> >
> > Respin it as a proper patch targeting tip tree.
> >
> > And explain the motivation in commit log with detailed
> > 'perf report' before/after along with 111M/s to 121M/s speed up,
> >
> > I suspect with my other __set_cpus_allowed_ptr() suggestion
> > the speed up should be even bigger.
>
> Much better.
>
> Before:
> fentry : 113.030 ± 0.149M/s
> fentry : 112.501 ± 0.187M/s
> fentry : 112.828 ± 0.267M/s
> fentry : 115.287 ± 0.241M/s
>
> After:
> fentry : 143.644 ± 0.670M/s
> fentry : 149.764 ± 0.362M/s
> fentry : 149.642 ± 0.156M/s
> fentry : 145.263 ± 0.221M/s
> fentry : 145.558 ± 0.145M/s
Nice!
Thread overview: 32+ messages
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
2025-07-03 12:15 ` [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline Menglong Dong
2025-07-15 2:25 ` Alexei Starovoitov
2025-07-15 8:36 ` Menglong Dong
2025-07-15 9:30 ` Menglong Dong
2025-07-16 16:56 ` Inlining migrate_disable/enable. Was: " Alexei Starovoitov
2025-07-16 18:24 ` Peter Zijlstra
2025-07-16 22:35 ` Alexei Starovoitov
2025-07-16 22:49 ` Steven Rostedt
2025-07-16 22:50 ` Steven Rostedt
2025-07-28 9:20 ` Menglong Dong
2025-07-31 16:15 ` Alexei Starovoitov
2025-08-01 1:42 ` Menglong Dong
2025-08-06 8:44 ` Menglong Dong
2025-08-08 0:58 ` Alexei Starovoitov
2025-08-08 5:48 ` Menglong Dong
2025-08-08 6:32 ` Menglong Dong
2025-08-08 15:47 ` Alexei Starovoitov
2025-07-15 16:35 ` Alexei Starovoitov
2025-07-16 13:05 ` Menglong Dong
2025-07-17 0:59 ` multi-fentry proposal. Was: " Alexei Starovoitov
2025-07-17 1:50 ` Menglong Dong
2025-07-17 2:13 ` Alexei Starovoitov
2025-07-17 2:37 ` Menglong Dong
2025-07-16 14:40 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-14 23:45 ` Menglong Dong
2025-07-15 17:11 ` Andrii Nakryiko
2025-07-16 12:50 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 10/18] x86,bpf: factor out arch_bpf_get_regs_nr Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 11/18] bpf: tracing: add multi-link support Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 16/18] selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c Menglong Dong