* [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-04 16:07 ` kernel test robot
2025-07-15 1:55 ` Alexei Starovoitov
2025-07-03 12:15 ` [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline Menglong Dong
` (16 subsequent siblings)
17 siblings, 2 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, linux-kernel
Implement a hash table to store the BPF progs and the function metadata.
The key of this hash table is the kernel function address, and the
following data is stored in the hash value:
- The BPF progs, whose type is FENTRY, FEXIT or MODIFY_RETURN. The struct
kfunc_md_tramp_prog is introduced to store the BPF prog and the cookie,
and it chains the BPF progs of the same type into a list through the
"next" field.
- The kernel function address
- The kernel function argument count
- Whether the origin call is needed
An hlist is used, and we double the number of buckets when the entry
count grows beyond 90% of the bucket count. Meanwhile, we shrink the
buckets when the entry count falls below 30% of the bucket count.
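For example (numbers derived from the checks in kfunc_md_array_inc() and
kfunc_md_array_dec() below): with the minimum of 16 buckets, the table is
doubled to 32 buckets once it already holds 15 entries (15/16 > 90%) and a
new entry is created; with 32 buckets, it is shrunk back to 16 once the
entry count drops to 9 (9/32 < 30%), and it never shrinks below the
initial 16 buckets.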
We don't use rhashtable here, as the compiler is not clever enough and
refused to inline the hash lookup for me, which brings additional
overhead to the BPF global trampoline introduced in the following
patches.
The release of the metadata is controlled by the percpu ref and RCU
together, and has logic similar to the release of the bpf trampoline
image in bpf_tramp_image_put().
All of this will be used in the next patch.
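A minimal usage sketch of the API added here (the example_*() callers are
hypothetical; only the kfunc_md_*() helpers come from this patch):

static int example_attach(unsigned long func_ip, int nr_args)
{
        struct kfunc_md *md;

        /* create (or take a reference on) the metadata of func_ip */
        md = kfunc_md_create(func_ip, nr_args);
        if (!md)
                return -ENOMEM;
        /* ... kfunc_md_bpf_link(md, prog, BPF_TRAMP_FENTRY, cookie) ... */
        return 0;
}

static void example_fast_path(unsigned long func_ip)
{
        struct kfunc_md *md;
        bool pinned;

        /* lockless lookup, intended for the trampoline fast path */
        rcu_read_lock();
        md = kfunc_md_get_rcu(func_ip);
        pinned = md && md->bpf_origin_call;
        if (pinned)
                kfunc_md_enter(md);     /* pin it across the origin call */
        rcu_read_unlock();

        /* ... run the attached progs, maybe call the origin function ... */

        if (pinned)
                kfunc_md_exit(md);
}

static void example_detach(unsigned long func_ip)
{
        /* the last put removes the md and starts the staged release:
         * percpu_ref_kill -> RCU tasks trace -> RCU tasks -> kfree
         */
        kfunc_md_put_ip(func_ip);
}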
Link: https://lore.kernel.org/bpf/CADxym3anLzM6cAkn_z71GDd_VeKiqqk1ts=xuiP7pr4PO6USPA@mail.gmail.com/
Link: https://lore.kernel.org/bpf/CAADnVQ+G+mQPJ+O1Oc9+UW=J17CGNC5B=usCmUDxBA-ze+gZGw@mail.gmail.com/
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
v2:
- implement the function metadata with hash table, as Alexei advised
---
include/linux/kfunc_md.h | 91 ++++++++++
kernel/bpf/Makefile | 1 +
kernel/bpf/kfunc_md.c | 352 +++++++++++++++++++++++++++++++++++++++
3 files changed, 444 insertions(+)
create mode 100644 include/linux/kfunc_md.h
create mode 100644 kernel/bpf/kfunc_md.c
diff --git a/include/linux/kfunc_md.h b/include/linux/kfunc_md.h
new file mode 100644
index 000000000000..1a766aa160f5
--- /dev/null
+++ b/include/linux/kfunc_md.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KFUNC_MD_H
+#define _LINUX_KFUNC_MD_H
+
+#include <linux/kernel.h>
+#include <linux/bpf.h>
+#include <linux/rhashtable.h>
+
+struct kfunc_md_tramp_prog {
+ struct kfunc_md_tramp_prog *next;
+ struct bpf_prog *prog;
+ u64 cookie;
+ struct rcu_head rcu;
+};
+
+struct kfunc_md {
+ struct hlist_node hash;
+ struct rcu_head rcu;
+ unsigned long func;
+ struct kfunc_md_tramp_prog *bpf_progs[BPF_TRAMP_MAX];
+ struct percpu_ref pcref;
+ u16 users;
+ bool bpf_origin_call;
+ u8 bpf_prog_cnt;
+ u8 nr_args;
+};
+
+struct kfunc_md_array {
+ atomic_t used;
+ struct rcu_head rcu;
+ int hash_bits;
+ struct hlist_head mds[];
+};
+
+extern struct kfunc_md_array __rcu *kfunc_mds;
+
+struct kfunc_md *kfunc_md_create(unsigned long ip, int nr_args);
+struct kfunc_md *kfunc_md_get(unsigned long ip);
+void kfunc_md_put(struct kfunc_md *meta);
+bool kfunc_md_arch_support(int *insn, int *data);
+
+int kfunc_md_bpf_ips(void ***ips, int nr_args);
+int kfunc_md_bpf_unlink(struct kfunc_md *md, struct bpf_prog *prog, int type);
+int kfunc_md_bpf_link(struct kfunc_md *md, struct bpf_prog *prog, int type,
+ u64 cookie);
+
+static __always_inline notrace struct hlist_head *
+kfunc_md_hash_head(struct kfunc_md_array *mds, unsigned long ip)
+{
+ return &mds->mds[hash_ptr((void *)ip, mds->hash_bits)];
+}
+
+static __always_inline notrace struct kfunc_md *
+__kfunc_md_get(struct kfunc_md_array *mds, unsigned long ip)
+{
+ struct hlist_head *head;
+ struct kfunc_md *md;
+
+ head = kfunc_md_hash_head(mds, ip);
+ hlist_for_each_entry_rcu_notrace(md, head, hash) {
+ if (md->func == ip)
+ return md;
+ }
+
+ return NULL;
+}
+
+/* This function will be called in the bpf global trampoline, so it can't
+ * be traced, and the "notrace" is necessary.
+ */
+static __always_inline notrace struct kfunc_md *kfunc_md_get_rcu(unsigned long ip)
+{
+ return __kfunc_md_get(rcu_dereference_raw(kfunc_mds), ip);
+}
+
+static __always_inline notrace void kfunc_md_enter(struct kfunc_md *md)
+{
+ percpu_ref_get(&md->pcref);
+}
+
+static __always_inline notrace void kfunc_md_exit(struct kfunc_md *md)
+{
+ percpu_ref_put(&md->pcref);
+}
+
+static inline void kfunc_md_put_ip(unsigned long ip)
+{
+ kfunc_md_put(kfunc_md_get(ip));
+}
+
+#endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3a335c50e6e3..a8a404e82e3d 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
+obj-$(CONFIG_BPF_JIT) += kfunc_md.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o
ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
diff --git a/kernel/bpf/kfunc_md.c b/kernel/bpf/kfunc_md.c
new file mode 100644
index 000000000000..152d6741d06d
--- /dev/null
+++ b/kernel/bpf/kfunc_md.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 ChinaTelecom */
+
+#include <linux/slab.h>
+#include <linux/memory.h>
+#include <linux/rcupdate.h>
+#include <linux/ftrace.h>
+#include <linux/rhashtable.h>
+#include <linux/kfunc_md.h>
+
+#include <uapi/linux/bpf.h>
+
+#define MIN_KFUNC_MD_ARRAY_BITS 4
+struct kfunc_md_array default_mds = {
+ .used = ATOMIC_INIT(0),
+ .hash_bits = MIN_KFUNC_MD_ARRAY_BITS,
+ .mds = {
+ [0 ... ((1 << MIN_KFUNC_MD_ARRAY_BITS) - 1)] = HLIST_HEAD_INIT,
+ },
+};
+struct kfunc_md_array __rcu *kfunc_mds = &default_mds;
+EXPORT_SYMBOL_GPL(kfunc_mds);
+
+static DEFINE_MUTEX(kfunc_md_mutex);
+
+static int kfunc_md_array_inc(void);
+
+static void kfunc_md_release_rcu(struct rcu_head *rcu)
+{
+ struct kfunc_md *md;
+
+ md = container_of(rcu, struct kfunc_md, rcu);
+ /* Step 4, free the md */
+ kfree(md);
+}
+
+static void kfunc_md_release_rcu_tasks(struct rcu_head *rcu)
+{
+ struct kfunc_md *md;
+
+ md = container_of(rcu, struct kfunc_md, rcu);
+ /* Step 3, wait for the normal progs and bpf_global_caller to finish */
+ call_rcu_tasks(&md->rcu, kfunc_md_release_rcu);
+}
+
+static void kfunc_md_release(struct percpu_ref *pcref)
+{
+ struct kfunc_md *md;
+
+ md = container_of(pcref, struct kfunc_md, pcref);
+ percpu_ref_exit(&md->pcref);
+
+ /* Step 2, wait for sleepable progs to finish. */
+ call_rcu_tasks_trace(&md->rcu, kfunc_md_release_rcu_tasks);
+}
+
+struct kfunc_md *kfunc_md_get(unsigned long ip)
+{
+ struct kfunc_md_array *mds;
+ struct kfunc_md *md;
+
+ rcu_read_lock();
+ mds = rcu_dereference(kfunc_mds);
+ md = __kfunc_md_get(mds, ip);
+ rcu_read_unlock();
+
+ return md;
+}
+EXPORT_SYMBOL_GPL(kfunc_md_get);
+
+static struct kfunc_md *__kfunc_md_create(struct kfunc_md_array *mds, unsigned long ip,
+ int nr_args)
+{
+ struct kfunc_md *md = __kfunc_md_get(mds, ip);
+ int err;
+
+ if (md) {
+ md->users++;
+ return md;
+ }
+
+ md = kzalloc(sizeof(*md), GFP_KERNEL);
+ if (!md)
+ return NULL;
+
+ md->users = 1;
+ md->func = ip;
+ md->nr_args = nr_args;
+
+ err = percpu_ref_init(&md->pcref, kfunc_md_release, 0, GFP_KERNEL);
+ if (err) {
+ kfree(md);
+ return NULL;
+ }
+
+ hlist_add_head_rcu(&md->hash, kfunc_md_hash_head(mds, ip));
+ atomic_inc(&mds->used);
+
+ return md;
+}
+
+struct kfunc_md *kfunc_md_create(unsigned long ip, int nr_args)
+{
+ struct kfunc_md *md = NULL;
+
+ mutex_lock(&kfunc_md_mutex);
+
+ if (kfunc_md_array_inc())
+ goto out;
+
+ md = __kfunc_md_create(kfunc_mds, ip, nr_args);
+out:
+ mutex_unlock(&kfunc_md_mutex);
+
+ return md;
+}
+EXPORT_SYMBOL_GPL(kfunc_md_create);
+
+static int kfunc_md_array_adjust(bool inc)
+{
+ struct kfunc_md_array *new_mds, *old_mds;
+ struct kfunc_md *md, *new_md;
+ struct hlist_node *n;
+ int size, hash_bits, i;
+
+ hash_bits = kfunc_mds->hash_bits;
+ hash_bits += inc ? 1 : -1;
+
+ size = sizeof(*new_mds) + sizeof(struct hlist_head) * (1 << hash_bits);
+ new_mds = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ if (!new_mds)
+ return -ENOMEM;
+
+ new_mds->hash_bits = hash_bits;
+ for (i = 0; i < (1 << new_mds->hash_bits); i++)
+ INIT_HLIST_HEAD(&new_mds->mds[i]);
+
+ /* copy all the mds from kfunc_mds to new_mds */
+ for (i = 0; i < (1 << kfunc_mds->hash_bits); i++) {
+ hlist_for_each_entry(md, &kfunc_mds->mds[i], hash) {
+ new_md = __kfunc_md_create(new_mds, md->func, md->nr_args);
+ if (!new_md)
+ goto err_out;
+
+ new_md->bpf_prog_cnt = md->bpf_prog_cnt;
+ new_md->bpf_origin_call = md->bpf_origin_call;
+ new_md->users = md->users;
+
+ memcpy(new_md->bpf_progs, md->bpf_progs, sizeof(md->bpf_progs));
+ }
+ }
+
+ old_mds = kfunc_mds;
+ rcu_assign_pointer(kfunc_mds, new_mds);
+ synchronize_rcu();
+
+ /* free all the mds in the old_mds. See kfunc_md_put() for the
+ * complete release process.
+ */
+ for (i = 0; i < (1 << old_mds->hash_bits); i++) {
+ hlist_for_each_entry_safe(md, n, &old_mds->mds[i], hash) {
+ percpu_ref_kill(&md->pcref);
+ hlist_del(&md->hash);
+ }
+ }
+
+ if (old_mds != &default_mds)
+ kfree_rcu(old_mds, rcu);
+
+ return 0;
+
+err_out:
+ for (i = 0; i < (1 << new_mds->hash_bits); i++) {
+ hlist_for_each_entry_safe(md, n, &new_mds->mds[i], hash) {
+ percpu_ref_exit(&md->pcref);
+ hlist_del(&md->hash);
+ kfree(md);
+ }
+ }
+ return -ENOMEM;
+}
+
+static int kfunc_md_array_inc(void)
+{
+ /* increase the hash table if greater than 90% */
+ if (atomic_read(&kfunc_mds->used) * 10 < (1 << (kfunc_mds->hash_bits)) * 9)
+ return 0;
+ return kfunc_md_array_adjust(true);
+}
+
+static int kfunc_md_array_dec(void)
+{
+ /* decrease the hash table if less than 30%. */
+ if (atomic_read(&kfunc_mds->used) * 10 > (1 << (kfunc_mds->hash_bits)) * 3)
+ return 0;
+
+ if (kfunc_mds->hash_bits <= MIN_KFUNC_MD_ARRAY_BITS)
+ return 0;
+
+ return kfunc_md_array_adjust(false);
+}
+
+void kfunc_md_put(struct kfunc_md *md)
+{
+ if (!md || WARN_ON_ONCE(md->users <= 0))
+ return;
+
+ mutex_lock(&kfunc_md_mutex);
+ md->users--;
+ if (md->users > 0)
+ goto out_unlock;
+
+ hlist_del_rcu(&md->hash);
+ atomic_dec(&kfunc_mds->used);
+ /* Step 1, use percpu_ref_kill to wait for the origin function to
+ * finish. See kfunc_md_release for step 2.
+ */
+ percpu_ref_kill(&md->pcref);
+ kfunc_md_array_dec();
+
+out_unlock:
+ mutex_unlock(&kfunc_md_mutex);
+}
+EXPORT_SYMBOL_GPL(kfunc_md_put);
+
+static bool kfunc_md_bpf_check(struct kfunc_md *md, int nr_args)
+{
+ return md->bpf_prog_cnt && md->nr_args == nr_args;
+}
+
+int kfunc_md_bpf_ips(void ***ips_ptr, int nr_args)
+{
+ struct kfunc_md *md;
+ int count, res = 0;
+ void **ips;
+
+ mutex_lock(&kfunc_md_mutex);
+ count = atomic_read(&kfunc_mds->used);
+ if (count <= 0)
+ goto out_unlock;
+
+ ips = kmalloc_array(count, sizeof(*ips), GFP_KERNEL);
+ if (!ips) {
+ res = -ENOMEM;
+ goto out_unlock;
+ }
+
+ for (int j = 0; j < (1 << kfunc_mds->hash_bits); j++) {
+ hlist_for_each_entry(md, &kfunc_mds->mds[j], hash) {
+ if (kfunc_md_bpf_check(md, nr_args))
+ ips[res++] = (void *)md->func;
+ }
+ }
+ *ips_ptr = ips;
+
+out_unlock:
+ mutex_unlock(&kfunc_md_mutex);
+
+ return res;
+}
+
+int kfunc_md_bpf_link(struct kfunc_md *md, struct bpf_prog *prog, int type,
+ u64 cookie)
+{
+ struct kfunc_md_tramp_prog *tramp_prog, **last;
+ int err = 0;
+
+ mutex_lock(&kfunc_md_mutex);
+ tramp_prog = md->bpf_progs[type];
+ /* check if the prog is already linked */
+ while (tramp_prog) {
+ if (tramp_prog->prog == prog) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+ tramp_prog = tramp_prog->next;
+ }
+
+ tramp_prog = kmalloc(sizeof(*tramp_prog), GFP_KERNEL);
+ if (!tramp_prog) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ WRITE_ONCE(tramp_prog->prog, prog);
+ WRITE_ONCE(tramp_prog->cookie, cookie);
+ WRITE_ONCE(tramp_prog->next, NULL);
+
+ /* add the new prog to the list tail */
+ last = &md->bpf_progs[type];
+ while (*last)
+ last = &(*last)->next;
+
+ WRITE_ONCE(*last, tramp_prog);
+
+ md->bpf_prog_cnt++;
+ if (type == BPF_TRAMP_FEXIT || type == BPF_TRAMP_MODIFY_RETURN)
+ md->bpf_origin_call = true;
+
+out_unlock:
+ mutex_unlock(&kfunc_md_mutex);
+ return err;
+}
+
+static void link_free_rcu(struct rcu_head *rcu)
+{
+ struct kfunc_md_tramp_prog *tramp_prog;
+
+ tramp_prog = container_of(rcu, struct kfunc_md_tramp_prog, rcu);
+ /* Step 3, free the tramp_prog */
+ kfree(tramp_prog);
+}
+
+static void link_free_rcu_tasks(struct rcu_head *rcu)
+{
+ struct kfunc_md_tramp_prog *tramp_prog;
+
+ tramp_prog = container_of(rcu, struct kfunc_md_tramp_prog, rcu);
+ /* Step 2, wait for normal progs to finish, which means all the
+ * progs in the list have finished.
+ */
+ call_rcu_tasks(&tramp_prog->rcu, link_free_rcu);
+}
+
+int kfunc_md_bpf_unlink(struct kfunc_md *md, struct bpf_prog *prog, int type)
+{
+ struct kfunc_md_tramp_prog *cur, **prev, **progs;
+
+ mutex_lock(&kfunc_md_mutex);
+ progs = md->bpf_progs;
+ prev = progs + type;
+ while (*prev && (*prev)->prog != prog)
+ prev = &(*prev)->next;
+
+ cur = *prev;
+ if (!cur) {
+ mutex_unlock(&kfunc_md_mutex);
+ return -EINVAL;
+ }
+
+ WRITE_ONCE(*prev, cur->next);
+ WRITE_ONCE(md->bpf_origin_call, progs[BPF_TRAMP_MODIFY_RETURN] ||
+ progs[BPF_TRAMP_FEXIT]);
+
+ md->bpf_prog_cnt--;
+
+ /* Step 1, wait for sleepable progs to finish. */
+ call_rcu_tasks_trace(&cur->rcu, link_free_rcu_tasks);
+ mutex_unlock(&kfunc_md_mutex);
+
+ return 0;
+}
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
2025-07-03 12:15 ` [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-15 2:25 ` Alexei Starovoitov
2025-07-03 12:15 ` [PATCH bpf-next v2 03/18] ftrace: factor out ftrace_direct_update from register_ftrace_direct Menglong Dong
` (15 subsequent siblings)
17 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, H. Peter Anvin, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, linux-kernel, netdev
Implement the bpf global trampoline "bpf_global_caller" for x86_64.
Following Alexei's advice, we implement most of the global trampoline in
C instead of asm.
We implement the entry of the trampoline with a "__naked" function, which
saves the regs to an array on the stack and calls bpf_global_caller_run().
The entry passes the address of the array and the address of the saved
rip to bpf_global_caller_run().
In bpf_global_caller_run(), we find the metadata by the function ip. For
the origin call case, we call kfunc_md_enter() to protect the metadata,
which is similar to __bpf_tramp_enter(). Then we call all the BPF progs,
just like the BPF trampoline does.
Without the origin call, bpf_global_caller_run() returns 0, and the entry
restores the regs and returns; in the origin call case, it returns 1, and
the entry makes RSP skip the saved rip before returning.
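Schematically, the contract between the naked entry and the C part is as
follows (pseudocode, not the exact asm emitted by this patch):

        /* ret = bpf_global_caller_N_run(args, &saved_rip);
         *
         * ret == 0: there are no fexit/modify_return progs, so restore the
         *           argument registers, pop the frame and return into the
         *           traced function body; the origin function then runs
         *           normally.
         * ret == 1: the origin function was already called from the C part,
         *           so load args[nr_args] (the slot holding its return
         *           value) into %rax, pop the frame plus the saved rip, and
         *           return directly to the traced function's caller.
         */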
In the FENTRY case, the performance of the global trampoline is ~10%
slower than the BPF trampoline. The global trampoline is optimized by
inlining some function calls, such as __bpf_prog_enter_recur and
__bpf_prog_exit_recur. However, more conditions, branches and memory
reads are used in bpf_global_caller.
In the FEXIT and MODIFY_RETURN cases, the performance of the global
trampoline is the same as (or even better than) the BPF trampoline. This
makes sense, as the calls to __bpf_tramp_enter and __bpf_tramp_exit are
also inlined in bpf_global_caller.
In fact, we can optimize bpf_global_caller further. For example, we can
define more bpf_global_caller_xx_run() functions and make the
"if (prog->sleepable)" and "if (do_origin_call)" conditions fixed. That
can be done in a later series. After such optimization, I believe the
performance of FENTRY_MULTI can be closer to, or the same as, FENTRY.
And for the FEXIT/MODIFY_RETURN cases, the performance can be better.
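A rough sketch of that idea (purely illustrative; none of the names below
exist in this series, and it assumes the run function grows
"sleepable"/"do_origin_call" parameters):

/* shared body; the constant bool parameters are folded away */
static __always_inline notrace int
__bpf_global_caller_run(unsigned long *args, unsigned long *ip,
                        int nr_args, bool sleepable, bool do_origin_call)
{
        /* same logic as bpf_global_caller_run(), with the
         * "if (prog->sleepable)" and "if (do_origin_call)" branches
         * resolved at compile time; body elided in this sketch
         */
        return do_origin_call ? 1 : 0;
}

/* fentry-only, non-sleepable variant: no origin call, plain RCU */
static __always_used notrace int
bpf_global_caller_5_fentry_run(unsigned long *args, unsigned long *ip)
{
        return __bpf_global_caller_run(args, ip, 5, false, false);
}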
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
v2:
- rewrite the global trampoline with C instead of asm
---
arch/x86/Kconfig | 4 +
arch/x86/net/bpf_jit_comp.c | 268 ++++++++++++++++++++++++++++++++++++
include/linux/bpf_tramp.h | 72 ++++++++++
kernel/bpf/trampoline.c | 23 +---
4 files changed, 346 insertions(+), 21 deletions(-)
create mode 100644 include/linux/bpf_tramp.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 71019b3b54ea..96962c61419a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -155,6 +155,7 @@ config X86
select ARCH_WANTS_THP_SWAP if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
+ select ARCH_HAS_BPF_GLOBAL_CALLER if X86_64
select BUILDTIME_TABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_WATCHDOG
@@ -432,6 +433,9 @@ config PGTABLE_LEVELS
default 3 if X86_PAE
default 2
+config ARCH_HAS_BPF_GLOBAL_CALLER
+ bool
+
menu "Processor type and features"
config SMP
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 15672cb926fc..8d2fc436a748 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -11,6 +11,8 @@
#include <linux/bpf.h>
#include <linux/memory.h>
#include <linux/sort.h>
+#include <linux/bpf_tramp.h>
+#include <linux/kfunc_md.h>
#include <asm/extable.h>
#include <asm/ftrace.h>
#include <asm/set_memory.h>
@@ -3413,6 +3415,272 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
return ret;
}
+#define FUNC_ARGS_0 ((2 - 1) * 8)
+#define FUNC_ARGS_1 ((2 + 0) * 8)
+#define FUNC_ARGS_2 ((2 + 1) * 8)
+#define FUNC_ARGS_3 ((2 + 2) * 8)
+#define FUNC_ARGS_4 ((2 + 3) * 8)
+#define FUNC_ARGS_5 ((2 + 4) * 8)
+#define FUNC_ARGS_6 ((2 + 5) * 8)
+
+#define SAVE_ARGS_0
+#define SAVE_ARGS_1 \
+ "movq %rdi, " __stringify(FUNC_ARGS_1) "(%rsp)\n"
+#define SAVE_ARGS_2 SAVE_ARGS_1 \
+ "movq %rsi, " __stringify(FUNC_ARGS_2) "(%rsp)\n"
+#define SAVE_ARGS_3 SAVE_ARGS_2 \
+ "movq %rdx, " __stringify(FUNC_ARGS_3) "(%rsp)\n"
+#define SAVE_ARGS_4 SAVE_ARGS_3 \
+ "movq %rcx, " __stringify(FUNC_ARGS_4) "(%rsp)\n"
+#define SAVE_ARGS_5 SAVE_ARGS_4 \
+ "movq %r8, " __stringify(FUNC_ARGS_5) "(%rsp)\n"
+#define SAVE_ARGS_6 SAVE_ARGS_5 \
+ "movq %r9, " __stringify(FUNC_ARGS_6) "(%rsp)\n" \
+
+#define RESTORE_ARGS_0
+#define RESTORE_ARGS_1 \
+ "movq " __stringify(FUNC_ARGS_1) "(%rsp), %rdi\n"
+#define RESTORE_ARGS_2 RESTORE_ARGS_1 \
+ "movq " __stringify(FUNC_ARGS_2) "(%rsp), %rsi\n"
+#define RESTORE_ARGS_3 RESTORE_ARGS_2 \
+ "movq " __stringify(FUNC_ARGS_3) "(%rsp), %rdx\n"
+#define RESTORE_ARGS_4 RESTORE_ARGS_3 \
+ "movq " __stringify(FUNC_ARGS_4) "(%rsp), %rcx\n"
+#define RESTORE_ARGS_5 RESTORE_ARGS_4 \
+ "movq " __stringify(FUNC_ARGS_5) "(%rsp), %r8\n"
+#define RESTORE_ARGS_6 RESTORE_ARGS_5 \
+ "movq " __stringify(FUNC_ARGS_6) "(%rsp), %r9\n"
+
+#define RESTORE_ORIGIN_0
+#define RESTORE_ORIGIN_1 \
+ "movq " __stringify(FUNC_ARGS_1 - FUNC_ARGS_1) "(%[args]), %%rdi\n"
+#define RESTORE_ORIGIN_2 RESTORE_ORIGIN_1 \
+ "movq " __stringify(FUNC_ARGS_2 - FUNC_ARGS_1) "(%[args]), %%rsi\n"
+#define RESTORE_ORIGIN_3 RESTORE_ORIGIN_2 \
+ "movq " __stringify(FUNC_ARGS_3 - FUNC_ARGS_1) "(%[args]), %%rdx\n"
+#define RESTORE_ORIGIN_4 RESTORE_ORIGIN_3 \
+ "movq " __stringify(FUNC_ARGS_4 - FUNC_ARGS_1) "(%[args]), %%rcx\n"
+#define RESTORE_ORIGIN_5 RESTORE_ORIGIN_4 \
+ "movq " __stringify(FUNC_ARGS_5 - FUNC_ARGS_1) "(%[args]), %%r8\n"
+#define RESTORE_ORIGIN_6 RESTORE_ORIGIN_5 \
+ "movq " __stringify(FUNC_ARGS_6 - FUNC_ARGS_1) "(%[args]), %%r9\n"
+
+static __always_inline void
+do_origin_call(unsigned long *args, unsigned long *ip, int nr_args)
+{
+ /* The following code will be optimized by the compiler: nr_args
+ * is a constant, so there will be no conditionals here.
+ */
+ if (nr_args == 0) {
+ asm volatile(
+ RESTORE_ORIGIN_0 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ :
+ );
+ } else if (nr_args == 1) {
+ asm volatile(
+ RESTORE_ORIGIN_1 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi"
+ );
+ } else if (nr_args == 2) {
+ asm volatile(
+ RESTORE_ORIGIN_2 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi", "rsi"
+ );
+ } else if (nr_args == 3) {
+ asm volatile(
+ RESTORE_ORIGIN_3 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi", "rsi", "rdx"
+ );
+ } else if (nr_args == 4) {
+ asm volatile(
+ RESTORE_ORIGIN_4 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi", "rsi", "rdx", "rcx"
+ );
+ } else if (nr_args == 5) {
+ asm volatile(
+ RESTORE_ORIGIN_5 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi", "rsi", "rdx", "rcx", "r8"
+ );
+ } else if (nr_args == 6) {
+ asm volatile(
+ RESTORE_ORIGIN_6 CALL_NOSPEC "\n"
+ "movq %%rax, %0\n"
+ : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+ : [args]"r"(args), [thunk_target]"r"(*ip)
+ : "rdi", "rsi", "rdx", "rcx", "r8", "r9"
+ );
+ }
+}
+
+static __always_inline notrace void
+run_tramp_prog(struct kfunc_md_tramp_prog *tramp_prog,
+ struct bpf_tramp_run_ctx *run_ctx, unsigned long *args)
+{
+ struct bpf_prog *prog;
+ u64 start_time;
+
+ while (tramp_prog) {
+ prog = tramp_prog->prog;
+ run_ctx->bpf_cookie = tramp_prog->cookie;
+ start_time = bpf_gtramp_enter(prog, run_ctx);
+
+ if (likely(start_time)) {
+ asm volatile(
+ CALL_NOSPEC "\n"
+ : : [thunk_target]"r"(prog->bpf_func), [args]"D"(args)
+ );
+ }
+
+ bpf_gtramp_exit(prog, start_time, run_ctx);
+ tramp_prog = tramp_prog->next;
+ }
+}
+
+static __always_inline notrace int
+bpf_global_caller_run(unsigned long *args, unsigned long *ip, int nr_args)
+{
+ unsigned long origin_ip = (*ip) & 0xfffffffffffffff0; // Align to 16 bytes
+ struct kfunc_md_tramp_prog *tramp_prog;
+ struct bpf_tramp_run_ctx run_ctx;
+ struct kfunc_md *md;
+ bool do_origin;
+
+ rcu_read_lock();
+ md = kfunc_md_get_rcu(origin_ip);
+ do_origin = md->bpf_origin_call;
+ if (do_origin)
+ kfunc_md_enter(md);
+ rcu_read_unlock();
+
+ /* save the origin function ip for bpf_get_func_ip() */
+ *(args - 2) = origin_ip;
+ *(args - 1) = nr_args;
+
+ run_tramp_prog(md->bpf_progs[BPF_TRAMP_FENTRY], &run_ctx, args);
+
+ /* no fexit and modify_return, return directly */
+ if (!do_origin)
+ return 0;
+
+ /* modify return case */
+ tramp_prog = md->bpf_progs[BPF_TRAMP_MODIFY_RETURN];
+ /* initialize return value */
+ args[nr_args] = 0;
+ while (tramp_prog) {
+ struct bpf_prog *prog;
+ u64 start_time, ret;
+
+ prog = tramp_prog->prog;
+ run_ctx.bpf_cookie = tramp_prog->cookie;
+ start_time = bpf_gtramp_enter(prog, &run_ctx);
+
+ if (likely(start_time)) {
+ asm volatile(
+ CALL_NOSPEC "\n"
+ : "=a"(ret), ASM_CALL_CONSTRAINT
+ : [thunk_target]"r"(prog->bpf_func),
+ [args]"D"(args)
+ );
+ args[nr_args] = ret;
+ } else {
+ ret = 0;
+ }
+
+ bpf_gtramp_exit(prog, start_time, &run_ctx);
+ if (ret)
+ goto do_fexit;
+ tramp_prog = tramp_prog->next;
+ }
+
+ /* restore the function arguments and call the origin function */
+ do_origin_call(args, ip, nr_args);
+do_fexit:
+ run_tramp_prog(md->bpf_progs[BPF_TRAMP_FEXIT], &run_ctx, args);
+ kfunc_md_exit(md);
+ return 1;
+}
+
+/* Layout of the stack frame:
+ * rip ----> 8 bytes
+ * return value ----> 8 bytes
+ * args ----> 8 * 6 bytes
+ * arg count ----> 8 bytes
+ * origin ip ----> 8 bytes
+ */
+#define stack_size __stringify(8 + 8 + 6 * 8 + 8)
+
+#define CALLER_DEFINE(name, nr_args) \
+static __always_used __no_stack_protector notrace int \
+name##_run(unsigned long *args, unsigned long *ip) \
+{ \
+ return bpf_global_caller_run(args, ip, nr_args); \
+} \
+static __naked void name(void) \
+{ \
+ asm volatile( \
+ "subq $" stack_size ", %rsp\n" \
+ SAVE_ARGS_##nr_args \
+ ); \
+ \
+ asm volatile( \
+ "leaq " __stringify(FUNC_ARGS_1) "(%rsp), %rdi\n" \
+ "leaq " stack_size "(%rsp), %rsi\n" \
+ "call " #name "_run\n" \
+ "test %rax, %rax\n" \
+ "jne 1f\n" \
+ ); \
+ \
+ asm volatile( \
+ RESTORE_ARGS_##nr_args \
+ "addq $" stack_size ", %rsp\n" \
+ ASM_RET \
+ ); \
+ \
+ asm volatile( \
+ "1:\n" \
+ "movq " __stringify(FUNC_ARGS_##nr_args + 8) \
+ "(%rsp), %rax\n" \
+ "addq $(" stack_size " + 8), %rsp\n" \
+ ASM_RET); \
+} \
+STACK_FRAME_NON_STANDARD(name)
+
+CALLER_DEFINE(bpf_global_caller_0, 0);
+CALLER_DEFINE(bpf_global_caller_1, 1);
+CALLER_DEFINE(bpf_global_caller_2, 2);
+CALLER_DEFINE(bpf_global_caller_3, 3);
+CALLER_DEFINE(bpf_global_caller_4, 4);
+CALLER_DEFINE(bpf_global_caller_5, 5);
+CALLER_DEFINE(bpf_global_caller_6, 6);
+
+void *bpf_global_caller_array[MAX_BPF_FUNC_ARGS + 1] = {
+ bpf_global_caller_0,
+ bpf_global_caller_1,
+ bpf_global_caller_2,
+ bpf_global_caller_3,
+ bpf_global_caller_4,
+ bpf_global_caller_5,
+ bpf_global_caller_6,
+};
+
static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
{
u8 *jg_reloc, *prog = *pprog;
diff --git a/include/linux/bpf_tramp.h b/include/linux/bpf_tramp.h
new file mode 100644
index 000000000000..32447fcfc017
--- /dev/null
+++ b/include/linux/bpf_tramp.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __LINUX_BPF_TRAMP_H__
+#define __LINUX_BPF_TRAMP_H__
+#ifdef CONFIG_BPF_JIT
+#include <linux/filter.h>
+
+#ifdef CONFIG_ARCH_HAS_BPF_GLOBAL_CALLER
+extern void *bpf_global_caller_array[MAX_BPF_FUNC_ARGS + 1];
+#endif
+
+void notrace __update_prog_stats(struct bpf_prog *prog, u64 start);
+
+#define NO_START_TIME 1
+static __always_inline u64 notrace bpf_prog_start_time(void)
+{
+ u64 start = NO_START_TIME;
+
+ if (static_branch_unlikely(&bpf_stats_enabled_key)) {
+ start = sched_clock();
+ if (unlikely(!start))
+ start = NO_START_TIME;
+ }
+ return start;
+}
+
+static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
+ u64 start)
+{
+ if (static_branch_unlikely(&bpf_stats_enabled_key))
+ __update_prog_stats(prog, start);
+}
+
+static __always_inline u64 notrace
+bpf_gtramp_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ if (unlikely(prog->sleepable)) {
+ rcu_read_lock_trace();
+ might_fault();
+ } else {
+ rcu_read_lock();
+ }
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+ bpf_prog_inc_misses_counter(prog);
+ if (prog->aux->recursion_detected)
+ prog->aux->recursion_detected(prog);
+ return 0;
+ }
+ return bpf_prog_start_time();
+}
+
+static __always_inline void notrace
+bpf_gtramp_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ this_cpu_dec(*(prog->active));
+ migrate_enable();
+ if (unlikely(prog->sleepable))
+ rcu_read_unlock_trace();
+ else
+ rcu_read_unlock();
+}
+
+#endif
+#endif
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index b1e358c16eeb..fa90c225c93b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -13,6 +13,7 @@
#include <linux/bpf_verifier.h>
#include <linux/bpf_lsm.h>
#include <linux/delay.h>
+#include <linux/bpf_tramp.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -868,19 +869,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
mutex_unlock(&trampoline_mutex);
}
-#define NO_START_TIME 1
-static __always_inline u64 notrace bpf_prog_start_time(void)
-{
- u64 start = NO_START_TIME;
-
- if (static_branch_unlikely(&bpf_stats_enabled_key)) {
- start = sched_clock();
- if (unlikely(!start))
- start = NO_START_TIME;
- }
- return start;
-}
-
/* The logic is similar to bpf_prog_run(), but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
@@ -911,7 +899,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
return bpf_prog_start_time();
}
-static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
+void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
{
struct bpf_prog_stats *stats;
unsigned long flags;
@@ -932,13 +920,6 @@ static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
-static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
- u64 start)
-{
- if (static_branch_unlikely(&bpf_stats_enabled_key))
- __update_prog_stats(prog, start);
-}
-
static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 03/18] ftrace: factor out ftrace_direct_update from register_ftrace_direct
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
2025-07-03 12:15 ` [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-05 2:41 ` kernel test robot
2025-07-03 12:15 ` [PATCH bpf-next v2 04/18] ftrace: add reset_ftrace_direct_ips Menglong Dong
` (14 subsequent siblings)
17 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Mark Rutland, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel
Factor out ftrace_direct_update() from register_ftrace_direct(); it is
used to add new entries to the direct_functions hash. This function will
be used in a later patch.
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
kernel/trace/ftrace.c | 108 +++++++++++++++++++++++-------------------
1 file changed, 60 insertions(+), 48 deletions(-)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4203fad56b6c..f5f6d7bc26f0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5953,53 +5953,18 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp)
free_ftrace_hash(fhp);
}
-/**
- * register_ftrace_direct - Call a custom trampoline directly
- * for multiple functions registered in @ops
- * @ops: The address of the struct ftrace_ops object
- * @addr: The address of the trampoline to call at @ops functions
- *
- * This is used to connect a direct calls to @addr from the nop locations
- * of the functions registered in @ops (with by ftrace_set_filter_ip
- * function).
- *
- * The location that it calls (@addr) must be able to handle a direct call,
- * and save the parameters of the function being traced, and restore them
- * (or inject new ones if needed), before returning.
- *
- * Returns:
- * 0 on success
- * -EINVAL - The @ops object was already registered with this call or
- * when there are no functions in @ops object.
- * -EBUSY - Another direct function is already attached (there can be only one)
- * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
- * -ENOMEM - There was an allocation failure.
- */
-int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
+static int ftrace_direct_update(struct ftrace_hash *hash, unsigned long addr)
{
- struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL;
struct ftrace_func_entry *entry, *new;
+ struct ftrace_hash *new_hash = NULL;
int err = -EBUSY, size, i;
- if (ops->func || ops->trampoline)
- return -EINVAL;
- if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED))
- return -EINVAL;
- if (ops->flags & FTRACE_OPS_FL_ENABLED)
- return -EINVAL;
-
- hash = ops->func_hash->filter_hash;
- if (ftrace_hash_empty(hash))
- return -EINVAL;
-
- mutex_lock(&direct_mutex);
-
/* Make sure requested entries are not already registered.. */
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
if (ftrace_find_rec_direct(entry->ip))
- goto out_unlock;
+ goto out;
}
}
@@ -6012,7 +5977,7 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
size = FTRACE_HASH_MAX_BITS;
new_hash = alloc_ftrace_hash(size);
if (!new_hash)
- goto out_unlock;
+ goto out;
/* Now copy over the existing direct entries */
size = 1 << direct_functions->size_bits;
@@ -6020,7 +5985,7 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) {
new = add_hash_entry(new_hash, entry->ip);
if (!new)
- goto out_unlock;
+ goto out;
new->direct = entry->direct;
}
}
@@ -6031,16 +5996,67 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
new = add_hash_entry(new_hash, entry->ip);
if (!new)
- goto out_unlock;
+ goto out;
/* Update both the copy and the hash entry */
new->direct = addr;
entry->direct = addr;
}
}
- free_hash = direct_functions;
rcu_assign_pointer(direct_functions, new_hash);
new_hash = NULL;
+ err = 0;
+out:
+ if (new_hash)
+ free_ftrace_hash(new_hash);
+
+ return err;
+}
+
+/**
+ * register_ftrace_direct - Call a custom trampoline directly
+ * for multiple functions registered in @ops
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the trampoline to call at @ops functions
+ *
+ * This is used to connect a direct calls to @addr from the nop locations
+ * of the functions registered in @ops (with by ftrace_set_filter_ip
+ * function).
+ *
+ * The location that it calls (@addr) must be able to handle a direct call,
+ * and save the parameters of the function being traced, and restore them
+ * (or inject new ones if needed), before returning.
+ *
+ * Returns:
+ * 0 on success
+ * -EINVAL - The @ops object was already registered with this call or
+ * when there are no functions in @ops object.
+ * -EBUSY - Another direct function is already attached (there can be only one)
+ * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
+ * -ENOMEM - There was an allocation failure.
+ */
+int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
+{
+ struct ftrace_hash *hash, *free_hash = NULL;
+ int err = -EBUSY;
+
+ if (ops->func || ops->trampoline)
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED))
+ return -EINVAL;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED)
+ return -EINVAL;
+
+ hash = ops->func_hash->filter_hash;
+ if (ftrace_hash_empty(hash))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+
+ free_hash = direct_functions;
+ err = ftrace_direct_update(hash, addr);
+ if (err)
+ goto out_unlock;
ops->func = call_direct_funcs;
ops->flags = MULTI_FLAGS;
@@ -6048,15 +6064,11 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
ops->direct_call = addr;
err = register_ftrace_function_nolock(ops);
-
- out_unlock:
- mutex_unlock(&direct_mutex);
-
if (free_hash && free_hash != EMPTY_HASH)
call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
- if (new_hash)
- free_ftrace_hash(new_hash);
+ out_unlock:
+ mutex_unlock(&direct_mutex);
return err;
}
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 04/18] ftrace: add reset_ftrace_direct_ips
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (2 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 03/18] ftrace: factor out ftrace_direct_update from register_ftrace_direct Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-03 15:30 ` Steven Rostedt
2025-07-03 12:15 ` [PATCH bpf-next v2 05/18] bpf: introduce bpf_gtramp_link Menglong Dong
` (13 subsequent siblings)
17 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Mark Rutland, Mathieu Desnoyers, linux-kernel,
linux-trace-kernel
Currently, we can change the address of a direct ftrace_ops with
modify_ftrace_direct(). However, we can't change the set of functions
that a direct ftrace_ops filters on. Therefore, introduce the function
reset_ftrace_direct_ips(), which resets the filtered functions of a
direct ftrace_ops.
The function works in the following steps (a usage sketch follows the
list):
1. Filter out the functions in ips that don't exist in
ops->func_hash->filter_hash yet and add them to a new ftrace_hash.
2. Add all the functions in the new ftrace_hash to direct_functions with
ftrace_direct_update().
3. Reset the filtered functions of the ftrace_ops to ips with
ftrace_set_filter_ips().
4. Remove the functions that are in the old filter_hash, but not in the
new one, from direct_functions.
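For example, a (hypothetical) caller that re-targets an already registered
direct ftrace_ops would simply do:

static int example_retarget(struct ftrace_ops *ops,
                            unsigned long *new_ips, unsigned int cnt)
{
        /* ops must already be registered via register_ftrace_direct();
         * ips already in the filter are kept, new ones are added to
         * direct_functions, and dropped ones are removed from it.
         */
        return reset_ftrace_direct_ips(ops, new_ips, cnt);
}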
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
include/linux/ftrace.h | 7 ++++
kernel/trace/ftrace.c | 75 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 82 insertions(+)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index b672ca15f265..b7c60f5a4120 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -528,6 +528,8 @@ int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr);
void ftrace_stub_direct_tramp(void);
+int reset_ftrace_direct_ips(struct ftrace_ops *ops, unsigned long *ips,
+ unsigned int cnt);
#else
struct ftrace_ops;
static inline unsigned long ftrace_find_rec_direct(unsigned long ip)
@@ -551,6 +553,11 @@ static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned l
{
return -ENODEV;
}
+static inline int reset_ftrace_direct_ips(struct ftrace_ops *ops, unsigned long *ips,
+ unsigned int cnt)
+{
+ return -ENODEV;
+}
/*
* This must be implemented by the architecture.
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f5f6d7bc26f0..db3aa61889d3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6224,6 +6224,81 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
return err;
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
+
+/* reset the ips for a direct ftrace (add or remove) */
+int reset_ftrace_direct_ips(struct ftrace_ops *ops, unsigned long *ips,
+ unsigned int cnt)
+{
+ struct ftrace_hash *hash, *free_hash;
+ struct ftrace_func_entry *entry, *del;
+ unsigned long ip;
+ int err, size;
+
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ if (!hash) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /* find out the new functions from ips and add to hash */
+ for (int i = 0; i < cnt; i++) {
+ ip = ftrace_location(ips[i]);
+ if (!ip) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+ if (__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
+ continue;
+ err = __ftrace_match_addr(hash, ip, 0);
+ if (err)
+ goto out_unlock;
+ }
+
+ free_hash = direct_functions;
+ /* add the new ips to direct hash. */
+ err = ftrace_direct_update(hash, ops->direct_call);
+ if (err)
+ goto out_unlock;
+
+ if (free_hash && free_hash != EMPTY_HASH)
+ call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
+
+ free_ftrace_hash(hash);
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
+ ops->func_hash->filter_hash);
+ if (!hash) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ err = ftrace_set_filter_ips(ops, ips, cnt, 0, 1);
+
+ /* remove the entries that don't exist in our filter_hash anymore
+ * from the direct_functions.
+ */
+ size = 1 << hash->size_bits;
+ for (int i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ if (__ftrace_lookup_ip(ops->func_hash->filter_hash, entry->ip))
+ continue;
+ del = __ftrace_lookup_ip(direct_functions, entry->ip);
+ if (del && del->direct == ops->direct_call) {
+ remove_hash_entry(direct_functions, del);
+ kfree(del);
+ }
+ }
+ }
+out_unlock:
+ mutex_unlock(&direct_mutex);
+ if (hash)
+ free_ftrace_hash(hash);
+ return err;
+}
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 05/18] bpf: introduce bpf_gtramp_link
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (3 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 04/18] ftrace: add reset_ftrace_direct_ips Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-04 7:00 ` kernel test robot
2025-07-04 7:52 ` kernel test robot
2025-07-03 12:15 ` [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args Menglong Dong
` (12 subsequent siblings)
17 siblings, 2 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, John Fastabend, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
Stanislav Fomichev, Hao Luo, linux-kernel
Introduce the struct bpf_gtramp_link, which is used to attach a bpf prog
to multiple functions. Meanwhile, introduce the corresponding functions
bpf_gtrampoline_{link,unlink}_prog(). The lock global_tr_lock is held
during global trampoline link and unlink.
We create a different global trampoline for each supported function
argument count. If the corresponding global_tr->image is NULL, that
argument count is not supported.
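The expected usage from the link-creation path looks roughly like this
(illustrative only; struct bpf_gtramp_link and bpf_gtrampoline_link_prog()
come from this patch, the rest is a sketch of a later patch):

static int example_attach_multi(struct bpf_prog *prog,
                                struct bpf_gtramp_link_entry *entries,
                                u32 cnt)
{
        struct bpf_gtramp_link *link;
        int err;

        link = kzalloc(sizeof(*link), GFP_KERNEL);
        if (!link)
                return -ENOMEM;

        /* one entry per target kernel function: addr, nr_args, cookie, ... */
        link->entries = entries;
        link->entry_cnt = cnt;
        /* real code would go through bpf_link_init()/bpf_link_prime() */
        link->link.prog = prog;

        err = bpf_gtrampoline_link_prog(link);
        if (err)
                kfree(link);
        return err;
}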
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
include/linux/bpf.h | 35 ++++++++
kernel/bpf/trampoline.c | 189 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 224 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5dd556e89cce..70bf613d51d0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -58,6 +58,8 @@ struct bpf_token;
struct user_namespace;
struct super_block;
struct inode;
+struct bpf_tramp_link;
+struct bpf_gtramp_link;
extern struct idr btf_idr;
extern spinlock_t btf_idr_lock;
@@ -1279,6 +1281,12 @@ struct bpf_trampoline {
struct bpf_tramp_image *cur_image;
};
+struct bpf_global_trampoline {
+ struct ftrace_ops *fops;
+ void *image;
+ int nr_args;
+};
+
struct bpf_attach_target_info {
struct btf_func_model fmodel;
long tgt_addr;
@@ -1382,6 +1390,9 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key,
void bpf_trampoline_put(struct bpf_trampoline *tr);
int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs);
+int bpf_gtrampoline_link_prog(struct bpf_gtramp_link *link);
+int bpf_gtrampoline_unlink_prog(struct bpf_gtramp_link *link);
+
/*
* When the architecture supports STATIC_CALL replace the bpf_dispatcher_fn
* indirection with a direct call to the bpf program. If the architecture does
@@ -1490,6 +1501,14 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
{
return false;
}
+static inline int bpf_gtrampoline_link_prog(struct bpf_gtramp_link *link)
+{
+ return -ENODEV;
+}
+static inline int bpf_gtrampoline_unlink_prog(struct bpf_gtramp_link *link)
+{
+ return -ENODEV;
+}
#endif
struct bpf_func_info_aux {
@@ -1746,6 +1765,22 @@ struct bpf_shim_tramp_link {
struct bpf_trampoline *trampoline;
};
+struct bpf_gtramp_link_entry {
+ struct bpf_prog *tgt_prog;
+ struct bpf_trampoline *trampoline;
+ void *addr;
+ struct btf *attach_btf;
+ u64 cookie;
+ u32 btf_id;
+ u32 nr_args;
+};
+
+struct bpf_gtramp_link {
+ struct bpf_link link;
+ struct bpf_gtramp_link_entry *entries;
+ u32 entry_cnt;
+};
+
struct bpf_tracing_link {
struct bpf_tramp_link link;
enum bpf_attach_type attach_type;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index fa90c225c93b..f70921ce4e97 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -14,6 +14,7 @@
#include <linux/bpf_lsm.h>
#include <linux/delay.h>
#include <linux/bpf_tramp.h>
+#include <linux/kfunc_md.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -30,6 +31,10 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
+static struct bpf_global_trampoline global_tr_array[MAX_BPF_FUNC_ARGS + 1];
+static DEFINE_MUTEX(global_tr_lock);
+static const struct bpf_link_ops bpf_shim_tramp_link_lops;
+
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
@@ -646,6 +651,172 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
return err;
}
+#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && defined(CONFIG_ARCH_HAS_BPF_GLOBAL_CALLER)
+static int bpf_gtrampoline_update(struct bpf_global_trampoline *tr)
+{
+ struct ftrace_ops *fops;
+ int ips_count, err = 0;
+ void **ips = NULL;
+
+ ips_count = kfunc_md_bpf_ips(&ips, tr->nr_args);
+ if (ips_count < 0) {
+ err = ips_count;
+ goto out;
+ }
+
+ fops = tr->fops;
+ if (ips_count == 0) {
+ if (!(fops->flags & FTRACE_OPS_FL_ENABLED))
+ goto out;
+ err = unregister_ftrace_direct(fops, (unsigned long)tr->image,
+ true);
+ goto out;
+ }
+
+ if (fops->flags & FTRACE_OPS_FL_ENABLED) {
+ err = reset_ftrace_direct_ips(fops, (unsigned long *)ips,
+ ips_count);
+ goto out;
+ }
+
+ err = ftrace_set_filter_ips(tr->fops, (unsigned long *)ips,
+ ips_count, 0, 1);
+ if (err)
+ goto out;
+
+ err = register_ftrace_direct(fops, (unsigned long)tr->image);
+out:
+ kfree(ips);
+
+ return err;
+}
+
+static int bpf_gtrampoline_update_all(void)
+{
+ struct bpf_global_trampoline *gtr;
+ int err;
+
+ for (int i = 0; i <= MAX_BPF_FUNC_ARGS; i++) {
+ gtr = &global_tr_array[i];
+ if (!gtr->image)
+ break;
+ err = bpf_gtrampoline_update(gtr);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+#else
+static int bpf_gtrampoline_update_all(void)
+{
+ return -ENODEV;
+}
+#endif
+
+static int __bpf_gtrampoline_unlink_prog(struct bpf_gtramp_link *link,
+ u32 cnt)
+{
+ enum bpf_tramp_prog_type kind;
+ struct kfunc_md *md;
+ int err = 0;
+
+ kind = bpf_attach_type_to_tramp(link->link.prog);
+
+ /* remove the prog from all the corresponding mds */
+ for (int i = 0; i < link->entry_cnt; i++) {
+ md = kfunc_md_get((long)link->entries[i].addr);
+ if (WARN_ON_ONCE(!md))
+ continue;
+
+ err = kfunc_md_bpf_unlink(md, link->link.prog, kind);
+ if (err)
+ return err;
+ }
+
+ bpf_gtrampoline_update_all();
+ for (int i = 0; i < cnt; i++)
+ kfunc_md_put_ip((long)link->entries[i].addr);
+
+ return 0;
+}
+
+int bpf_gtrampoline_unlink_prog(struct bpf_gtramp_link *link)
+{
+ int err;
+
+ /* hold the global trampoline lock, to keep the target functions
+ * consistent while we unlink the prog.
+ */
+ mutex_lock(&global_tr_lock);
+ err = __bpf_gtrampoline_unlink_prog(link, link->entry_cnt);
+ mutex_unlock(&global_tr_lock);
+
+ return err;
+}
+
+int bpf_gtrampoline_link_prog(struct bpf_gtramp_link *link)
+{
+ struct bpf_gtramp_link_entry *entry;
+ enum bpf_tramp_prog_type kind;
+ struct bpf_prog *prog;
+ struct kfunc_md *md;
+ bool update = false;
+ int err = 0, i;
+
+ /* check if the function arguments count is supported by the arch */
+ for (int i = 0; i < link->entry_cnt; i++) {
+ entry = &link->entries[i];
+ if (entry->nr_args > MAX_BPF_FUNC_ARGS ||
+ !global_tr_array[entry->nr_args].image)
+ return -EOPNOTSUPP;
+ }
+
+ prog = link->link.prog;
+ kind = bpf_attach_type_to_tramp(prog);
+
+ /* hold the global trampoline lock, to keep the target functions
+ * consistent while we link the prog.
+ */
+ mutex_lock(&global_tr_lock);
+
+ /* update the bpf prog to all the corresponding function metadata */
+ for (i = 0; i < link->entry_cnt; i++) {
+ entry = &link->entries[i];
+ md = kfunc_md_create((long)entry->addr, entry->nr_args);
+ if (md) {
+ /* the function is not in the filter hash of gtr,
+ * we need to update the global trampoline.
+ */
+ if (!md->bpf_prog_cnt)
+ update = true;
+ err = kfunc_md_bpf_link(md, prog, kind, entry->cookie);
+ } else {
+ err = -ENOMEM;
+ }
+
+ if (err) {
+ kfunc_md_put(md);
+ goto on_fallback;
+ }
+ }
+
+ if (update) {
+ err = bpf_gtrampoline_update_all();
+ if (err)
+ goto on_fallback;
+ }
+ mutex_unlock(&global_tr_lock);
+
+ return 0;
+
+on_fallback:
+ __bpf_gtrampoline_unlink_prog(link, i);
+ mutex_unlock(&global_tr_lock);
+
+ return err;
+}
+
#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
static void bpf_shim_tramp_link_release(struct bpf_link *link)
{
@@ -1117,6 +1288,24 @@ static int __init init_trampolines(void)
{
int i;
+ for (i = 0; i <= MAX_BPF_FUNC_ARGS; i++) {
+ struct bpf_global_trampoline *global_tr;
+
+ global_tr = &global_tr_array[i];
+ global_tr->nr_args = i;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ global_tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+ if (!global_tr->fops)
+ return -ENOMEM;
+
+ global_tr->fops->private = global_tr;
+ global_tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+#endif
+#ifdef CONFIG_ARCH_HAS_BPF_GLOBAL_CALLER
+ global_tr->image = bpf_global_caller_array[i];
+#endif
+ }
+
for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&trampoline_table[i]);
return 0;
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (4 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 05/18] bpf: introduce bpf_gtramp_link Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-03 12:15 ` [PATCH bpf-next v2 07/18] bpf: refactor the modules_array to ptr_array Menglong Dong
` (11 subsequent siblings)
17 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, John Fastabend, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
Stanislav Fomichev, Hao Luo, Simon Horman, linux-kernel, netdev
In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
which is used to record the indexes of the function args accessed in
btf_ctx_access().
Meanwhile, we add the function btf_check_func_part_match() to compare the
accessed function args of two function prototypes. This function will be
used in the following commit.
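A worked example of the bookkeeping (the traced function below is
hypothetical):

        /* target prototype:  void foo(int a, struct sk_buff *skb, u64 c);
         *
         * a prog that only dereferences ctx + 8 (i.e. "skb") ends up with
         *
         *      prog->aux->accessed_args == BIT(1);
         *
         * and btf_check_func_part_match() will then only require argument 1
         * to be compatible between the two prototypes it compares.
         */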
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
include/linux/bpf.h | 4 ++
include/linux/btf.h | 3 +-
kernel/bpf/btf.c | 108 +++++++++++++++++++++++++++++++++++++++++-
net/sched/bpf_qdisc.c | 2 +-
4 files changed, 113 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 70bf613d51d0..5e6d83750d39 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1605,6 +1605,7 @@ struct bpf_prog_aux {
const struct btf_type *attach_func_proto;
/* function name for valid attach_btf_id */
const char *attach_func_name;
+ u64 accessed_args;
struct bpf_prog **func;
void *jit_data; /* JIT specific data. arch dependent */
struct bpf_jit_poke_descriptor *poke_tab;
@@ -2790,6 +2791,9 @@ struct bpf_reg_state;
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog);
int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
struct btf *btf, const struct btf_type *t);
+int btf_check_func_part_match(struct btf *btf1, const struct btf_type *t1,
+ struct btf *btf2, const struct btf_type *t2,
+ u64 func_args);
const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt,
int comp_idx, const char *tag_key);
int btf_find_next_decl_tag(const struct btf *btf, const struct btf_type *pt,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index a40beb9cf160..b2b56249ce11 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -524,7 +524,8 @@ bool btf_param_match_suffix(const struct btf *btf,
const char *suffix);
int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto,
u32 arg_no);
-u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off);
+u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off, int *aligned_idx);
struct bpf_verifier_log;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 05fd64a371af..853ca19bbe81 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6404,19 +6404,24 @@ static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t)
}
u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
- int off)
+ int off, int *aligned_idx)
{
const struct btf_param *args;
const struct btf_type *t;
u32 offset = 0, nr_args;
int i;
+ if (aligned_idx)
+ *aligned_idx = -ENOENT;
+
if (!func_proto)
return off / 8;
nr_args = btf_type_vlen(func_proto);
args = (const struct btf_param *)(func_proto + 1);
for (i = 0; i < nr_args; i++) {
+ if (aligned_idx && offset == off)
+ *aligned_idx = i;
t = btf_type_skip_modifiers(btf, args[i].type, NULL);
offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
if (off < offset)
@@ -6684,7 +6689,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = btf_ctx_arg_idx(btf, t, off);
+ arg = btf_ctx_arg_idx(btf, t, off, NULL);
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -6694,6 +6699,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip first 'void *__data' argument in btf_trace_##name typedef */
args++;
nr_args--;
+ prog->aux->accessed_args |= (1 << (arg + 1));
+ } else {
+ prog->aux->accessed_args |= (1 << arg);
}
if (arg > nr_args) {
@@ -7553,6 +7561,102 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
+static u32 get_ctx_arg_total_size(struct btf *btf, const struct btf_type *t)
+{
+ const struct btf_param *args;
+ u32 size = 0, nr_args;
+ int i;
+
+ nr_args = btf_type_vlen(t);
+ args = (const struct btf_param *)(t + 1);
+ for (i = 0; i < nr_args; i++) {
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ size += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+ }
+
+ return size;
+}
+
+/* This function is similar to btf_check_func_type_match(), except that it
+ * only compare some function args of the function prototype t1 and t2.
+ */
+int btf_check_func_part_match(struct btf *btf1, const struct btf_type *func1,
+ struct btf *btf2, const struct btf_type *func2,
+ u64 func_args)
+{
+ const struct btf_param *args1, *args2;
+ u32 nargs1, i, offset = 0;
+ const char *s1, *s2;
+
+ if (!btf_type_is_func_proto(func1) || !btf_type_is_func_proto(func2))
+ return -EINVAL;
+
+ args1 = (const struct btf_param *)(func1 + 1);
+ args2 = (const struct btf_param *)(func2 + 1);
+ nargs1 = btf_type_vlen(func1);
+
+ for (i = 0; i <= nargs1; i++) {
+ const struct btf_type *t1, *t2;
+
+ if (!(func_args & (1 << i)))
+ goto next;
+
+ if (i < nargs1) {
+ int t2_index;
+
+ /* get the index of the arg corresponding to args1[i]
+ * by the offset.
+ */
+ btf_ctx_arg_idx(btf2, func2, offset, &t2_index);
+ if (t2_index < 0)
+ return -EINVAL;
+
+ t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, args2[t2_index].type,
+ NULL);
+ } else {
+ /* i == nargs1, this is the index of return value of t1 */
+ if (get_ctx_arg_total_size(btf1, func1) !=
+ get_ctx_arg_total_size(btf2, func2))
+ return -EINVAL;
+
+ /* check the return type of t1 and t2 */
+ t1 = btf_type_skip_modifiers(btf1, func1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, func2->type, NULL);
+ }
+
+ if (t1->info != t2->info ||
+ (btf_type_has_size(t1) && t1->size != t2->size))
+ return -EINVAL;
+ if (btf_type_is_int(t1) || btf_is_any_enum(t1))
+ goto next;
+
+ if (btf_type_is_struct(t1))
+ goto on_struct;
+
+ if (!btf_type_is_ptr(t1))
+ return -EINVAL;
+
+ t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
+ t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
+ if (!btf_type_is_struct(t1) || !btf_type_is_struct(t2))
+ return -EINVAL;
+
+on_struct:
+ s1 = btf_name_by_offset(btf1, t1->name_off);
+ s2 = btf_name_by_offset(btf2, t2->name_off);
+ if (strcmp(s1, s2))
+ return -EINVAL;
+next:
+ if (i < nargs1) {
+ t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
+ offset += btf_type_is_ptr(t1) ? 8 : roundup(t1->size, 8);
+ }
+ }
+
+ return 0;
+}
+
static bool btf_is_dynptr_ptr(const struct btf *btf, const struct btf_type *t)
{
const char *name;
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
index 7ea8b54b2ab1..4ce395a72996 100644
--- a/net/sched/bpf_qdisc.c
+++ b/net/sched/bpf_qdisc.c
@@ -38,7 +38,7 @@ static bool bpf_qdisc_is_valid_access(int off, int size,
struct btf *btf = prog->aux->attach_btf;
u32 arg;
- arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off);
+ arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off, NULL);
if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) {
if (arg == 2 && type == BPF_READ) {
info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 07/18] bpf: refactor the modules_array to ptr_array
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (5 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 08/18] bpf: verifier: add btf to the function args of bpf_check_attach_target Menglong Dong
` (10 subsequent siblings)
17 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, Mathieu Desnoyers, linux-kernel, linux-trace-kernel
Refactor the struct modules_array into the more general struct ptr_array,
which is used to store pointers of any kind.
Meanwhile, introduce bpf_try_add_ptr(), which checks whether the pointer
already exists before adding it to the array.
These helpers probably belong in a separate file under "lib", but it is not
clear yet where to put them, so keep them in kernel/bpf/syscall.c for now.
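For reference, here is a minimal user-space sketch of the same
grow-and-deduplicate pattern; the ptr_array_* names are illustrative
analogues of bpf_add_ptr()/bpf_has_ptr()/bpf_try_add_ptr(), not the kernel
helpers themselves:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct ptr_array {
	void **ptrs;
	int cnt;
	int cap;
};

static int ptr_array_add(struct ptr_array *arr, void *ptr)
{
	if (arr->cnt == arr->cap) {
		int cap = arr->cap * 3 / 2;
		void **ptrs;

		if (cap < 16)
			cap = 16;
		ptrs = realloc(arr->ptrs, cap * sizeof(*ptrs));
		if (!ptrs)
			return -ENOMEM;
		arr->ptrs = ptrs;
		arr->cap = cap;
	}
	arr->ptrs[arr->cnt++] = ptr;
	return 0;
}

static int ptr_array_has(const struct ptr_array *arr, const void *ptr)
{
	for (int i = 0; i < arr->cnt; i++)
		if (arr->ptrs[i] == ptr)
			return 1;
	return 0;
}

static int ptr_array_try_add(struct ptr_array *arr, void *ptr)
{
	if (ptr_array_has(arr, ptr))
		return -EEXIST;
	return ptr_array_add(arr, ptr);
}

int main(void)
{
	struct ptr_array arr = {};
	int a, b;

	ptr_array_try_add(&arr, &a);
	ptr_array_try_add(&arr, &b);
	/* adding &a again is rejected: prints -17 (-EEXIST) and cnt 2 */
	printf("%d %d\n", ptr_array_try_add(&arr, &a), arr.cnt);
	free(arr.ptrs);
	return 0;
}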
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
include/linux/bpf.h | 10 +++++++++
kernel/bpf/syscall.c | 36 ++++++++++++++++++++++++++++++
kernel/trace/bpf_trace.c | 48 ++++++----------------------------------
3 files changed, 53 insertions(+), 41 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5e6d83750d39..bb3ab1aa3a9d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -314,6 +314,16 @@ struct bpf_map {
s64 __percpu *elem_count;
};
+struct ptr_array {
+ void **ptrs;
+ int cnt;
+ int cap;
+};
+
+int bpf_add_ptr(struct ptr_array *arr, void *ptr);
+bool bpf_has_ptr(struct ptr_array *arr, struct module *mod);
+int bpf_try_add_ptr(struct ptr_array *arr, void *ptr);
+
static inline const char *btf_field_type_name(enum btf_field_type type)
{
switch (type) {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 56500381c28a..8ce061b079ec 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -619,6 +619,42 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
return ret;
}
+int bpf_add_ptr(struct ptr_array *arr, void *ptr)
+{
+ void **ptrs;
+
+ if (arr->cnt == arr->cap) {
+ arr->cap = max(16, arr->cap * 3 / 2);
+ ptrs = krealloc_array(arr->ptrs, arr->cap, sizeof(*ptrs), GFP_KERNEL);
+ if (!ptrs)
+ return -ENOMEM;
+ arr->ptrs = ptrs;
+ }
+
+ arr->ptrs[arr->cnt] = ptr;
+ arr->cnt++;
+ return 0;
+}
+
+bool bpf_has_ptr(struct ptr_array *arr, struct module *mod)
+{
+ int i;
+
+ for (i = arr->cnt - 1; i >= 0; i--) {
+ if (arr->ptrs[i] == mod)
+ return true;
+ }
+ return false;
+}
+
+int bpf_try_add_ptr(struct ptr_array *arr, void *ptr)
+{
+ if (bpf_has_ptr(arr, ptr))
+ return -EEXIST;
+ if (bpf_add_ptr(arr, ptr))
+ return -ENOMEM;
+ return 0;
+}
static int btf_field_cmp(const void *a, const void *b)
{
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0a06ea6638fe..167fd1dcc28b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2779,43 +2779,9 @@ static void symbols_swap_r(void *a, void *b, int size, const void *priv)
}
}
-struct modules_array {
- struct module **mods;
- int mods_cnt;
- int mods_cap;
-};
-
-static int add_module(struct modules_array *arr, struct module *mod)
-{
- struct module **mods;
-
- if (arr->mods_cnt == arr->mods_cap) {
- arr->mods_cap = max(16, arr->mods_cap * 3 / 2);
- mods = krealloc_array(arr->mods, arr->mods_cap, sizeof(*mods), GFP_KERNEL);
- if (!mods)
- return -ENOMEM;
- arr->mods = mods;
- }
-
- arr->mods[arr->mods_cnt] = mod;
- arr->mods_cnt++;
- return 0;
-}
-
-static bool has_module(struct modules_array *arr, struct module *mod)
-{
- int i;
-
- for (i = arr->mods_cnt - 1; i >= 0; i--) {
- if (arr->mods[i] == mod)
- return true;
- }
- return false;
-}
-
static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt)
{
- struct modules_array arr = {};
+ struct ptr_array arr = {};
u32 i, err = 0;
for (i = 0; i < addrs_cnt; i++) {
@@ -2825,7 +2791,7 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
scoped_guard(rcu) {
mod = __module_address(addrs[i]);
/* Either no module or it's already stored */
- if (!mod || has_module(&arr, mod)) {
+ if (!mod || bpf_has_ptr(&arr, mod)) {
skip_add = true;
break; /* scoped_guard */
}
@@ -2836,7 +2802,7 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
continue;
if (err)
break;
- err = add_module(&arr, mod);
+ err = bpf_add_ptr(&arr, mod);
if (err) {
module_put(mod);
break;
@@ -2845,14 +2811,14 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
/* We return either err < 0 in case of error, ... */
if (err) {
- kprobe_multi_put_modules(arr.mods, arr.mods_cnt);
- kfree(arr.mods);
+ kprobe_multi_put_modules((struct module **)arr.ptrs, arr.cnt);
+ kfree(arr.ptrs);
return err;
}
/* or number of modules found if everything is ok. */
- *mods = arr.mods;
- return arr.mods_cnt;
+ *mods = (struct module **)arr.ptrs;
+ return arr.cnt;
}
static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt)
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 08/18] bpf: verifier: add btf to the function args of bpf_check_attach_target
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (6 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 07/18] bpf: refactor the modules_array to ptr_array Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 09/18] bpf: verifier: move btf_id_deny to bpf_check_attach_target Menglong Dong
` (9 subsequent siblings)
17 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, linux-kernel
Add a target btf argument to bpf_check_attach_target(), so that the caller
can specify which btf to check against.
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
include/linux/bpf_verifier.h | 1 +
kernel/bpf/syscall.c | 6 ++++--
kernel/bpf/trampoline.c | 1 +
kernel/bpf/verifier.c | 8 +++++---
4 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7e459e839f8b..5db2e006d5ac 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -926,6 +926,7 @@ static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id)
int bpf_check_attach_target(struct bpf_verifier_log *log,
const struct bpf_prog *prog,
const struct bpf_prog *tgt_prog,
+ struct btf *btf,
u32 btf_id,
struct bpf_attach_target_info *tgt_info);
void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8ce061b079ec..b21bbbc4263d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3596,9 +3596,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
* need a new trampoline and a check for compatibility
*/
struct bpf_attach_target_info tgt_info = {};
+ struct btf *btf;
- err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
- &tgt_info);
+ btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
+ err = bpf_check_attach_target(NULL, prog, tgt_prog, btf,
+ btf_id, &tgt_info);
if (err)
goto out_unlock;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index f70921ce4e97..8fcb0352f36e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -908,6 +908,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
int err;
err = bpf_check_attach_target(NULL, prog, NULL,
+ prog->aux->attach_btf,
prog->aux->attach_btf_id,
&tgt_info);
if (err)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 90e688f81a48..8e5c4280745f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23451,6 +23451,7 @@ static int check_non_sleepable_error_inject(u32 btf_id)
int bpf_check_attach_target(struct bpf_verifier_log *log,
const struct bpf_prog *prog,
const struct bpf_prog *tgt_prog,
+ struct btf *btf,
u32 btf_id,
struct bpf_attach_target_info *tgt_info)
{
@@ -23463,7 +23464,6 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
const struct btf_type *t;
bool conservative = true;
const char *tname, *fname;
- struct btf *btf;
long addr = 0;
struct module *mod = NULL;
@@ -23471,7 +23471,6 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "Tracing programs must provide btf_id\n");
return -EINVAL;
}
- btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
if (!btf) {
bpf_log(log,
"FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
@@ -23850,6 +23849,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
struct bpf_attach_target_info tgt_info = {};
u32 btf_id = prog->aux->attach_btf_id;
struct bpf_trampoline *tr;
+ struct btf *btf;
int ret;
u64 key;
@@ -23874,7 +23874,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->type != BPF_PROG_TYPE_EXT)
return 0;
- ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info);
+ btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf;
+ ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf,
+ btf_id, &tgt_info);
if (ret)
return ret;
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 09/18] bpf: verifier: move btf_id_deny to bpf_check_attach_target
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (7 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 08/18] bpf: verifier: add btf to the function args of bpf_check_attach_target Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 10/18] x86,bpf: factor out arch_bpf_get_regs_nr Menglong Dong
` (8 subsequent siblings)
17 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, John Fastabend, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
Stanislav Fomichev, Hao Luo, linux-kernel
Move the checking of btf_id_deny and noreturn_deny from
check_attach_btf_id() to bpf_check_attach_target(), so that the same checks
can also be performed at attach time for the tracing multi-link introduced
in later patches.
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
kernel/bpf/verifier.c | 125 ++++++++++++++++++++++--------------------
1 file changed, 65 insertions(+), 60 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8e5c4280745f..d6311be5a63a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23448,6 +23448,52 @@ static int check_non_sleepable_error_inject(u32 btf_id)
return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id);
}
+BTF_SET_START(btf_id_deny)
+BTF_ID_UNUSED
+#ifdef CONFIG_SMP
+BTF_ID(func, migrate_disable)
+BTF_ID(func, migrate_enable)
+#endif
+#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
+BTF_ID(func, rcu_read_unlock_strict)
+#endif
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
+BTF_ID(func, preempt_count_add)
+BTF_ID(func, preempt_count_sub)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
+BTF_ID(func, __rcu_read_lock)
+BTF_ID(func, __rcu_read_unlock)
+#endif
+BTF_SET_END(btf_id_deny)
+
+/* fexit and fmod_ret can't be used to attach to __noreturn functions.
+ * Currently, we must manually list all __noreturn functions here. Once a more
+ * robust solution is implemented, this workaround can be removed.
+ */
+BTF_SET_START(noreturn_deny)
+#ifdef CONFIG_IA32_EMULATION
+BTF_ID(func, __ia32_sys_exit)
+BTF_ID(func, __ia32_sys_exit_group)
+#endif
+#ifdef CONFIG_KUNIT
+BTF_ID(func, __kunit_abort)
+BTF_ID(func, kunit_try_catch_throw)
+#endif
+#ifdef CONFIG_MODULES
+BTF_ID(func, __module_put_and_kthread_exit)
+#endif
+#ifdef CONFIG_X86_64
+BTF_ID(func, __x64_sys_exit)
+BTF_ID(func, __x64_sys_exit_group)
+#endif
+BTF_ID(func, do_exit)
+BTF_ID(func, do_group_exit)
+BTF_ID(func, kthread_complete_and_exit)
+BTF_ID(func, kthread_exit)
+BTF_ID(func, make_task_dead)
+BTF_SET_END(noreturn_deny)
+
int bpf_check_attach_target(struct bpf_verifier_log *log,
const struct bpf_prog *prog,
const struct bpf_prog *tgt_prog,
@@ -23771,6 +23817,25 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
break;
}
+
+ if (prog->type == BPF_PROG_TYPE_LSM) {
+ ret = bpf_lsm_verify_prog(log, prog);
+ if (ret < 0) {
+ module_put(mod);
+ return ret;
+ }
+ } else if (prog->type == BPF_PROG_TYPE_TRACING &&
+ btf_id_set_contains(&btf_id_deny, btf_id)) {
+ module_put(mod);
+ return -EINVAL;
+ } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN) &&
+ btf_id_set_contains(&noreturn_deny, btf_id)) {
+ module_put(mod);
+ bpf_log(log, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n");
+ return -EINVAL;
+ }
+
tgt_info->tgt_addr = addr;
tgt_info->tgt_name = tname;
tgt_info->tgt_type = t;
@@ -23778,52 +23843,6 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return 0;
}
-BTF_SET_START(btf_id_deny)
-BTF_ID_UNUSED
-#ifdef CONFIG_SMP
-BTF_ID(func, migrate_disable)
-BTF_ID(func, migrate_enable)
-#endif
-#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
-BTF_ID(func, rcu_read_unlock_strict)
-#endif
-#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
-BTF_ID(func, preempt_count_add)
-BTF_ID(func, preempt_count_sub)
-#endif
-#ifdef CONFIG_PREEMPT_RCU
-BTF_ID(func, __rcu_read_lock)
-BTF_ID(func, __rcu_read_unlock)
-#endif
-BTF_SET_END(btf_id_deny)
-
-/* fexit and fmod_ret can't be used to attach to __noreturn functions.
- * Currently, we must manually list all __noreturn functions here. Once a more
- * robust solution is implemented, this workaround can be removed.
- */
-BTF_SET_START(noreturn_deny)
-#ifdef CONFIG_IA32_EMULATION
-BTF_ID(func, __ia32_sys_exit)
-BTF_ID(func, __ia32_sys_exit_group)
-#endif
-#ifdef CONFIG_KUNIT
-BTF_ID(func, __kunit_abort)
-BTF_ID(func, kunit_try_catch_throw)
-#endif
-#ifdef CONFIG_MODULES
-BTF_ID(func, __module_put_and_kthread_exit)
-#endif
-#ifdef CONFIG_X86_64
-BTF_ID(func, __x64_sys_exit)
-BTF_ID(func, __x64_sys_exit_group)
-#endif
-BTF_ID(func, do_exit)
-BTF_ID(func, do_group_exit)
-BTF_ID(func, kthread_complete_and_exit)
-BTF_ID(func, kthread_exit)
-BTF_ID(func, make_task_dead)
-BTF_SET_END(noreturn_deny)
-
static bool can_be_sleepable(struct bpf_prog *prog)
{
if (prog->type == BPF_PROG_TYPE_TRACING) {
@@ -23906,20 +23925,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return bpf_iter_prog_supported(prog);
}
- if (prog->type == BPF_PROG_TYPE_LSM) {
- ret = bpf_lsm_verify_prog(&env->log, prog);
- if (ret < 0)
- return ret;
- } else if (prog->type == BPF_PROG_TYPE_TRACING &&
- btf_id_set_contains(&btf_id_deny, btf_id)) {
- return -EINVAL;
- } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
- prog->expected_attach_type == BPF_MODIFY_RETURN) &&
- btf_id_set_contains(&noreturn_deny, btf_id)) {
- verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n");
- return -EINVAL;
- }
-
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr)
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 10/18] x86,bpf: factor out arch_bpf_get_regs_nr
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (8 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 09/18] bpf: verifier: move btf_id_deny to bpf_check_attach_target Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 11/18] bpf: tracing: add multi-link support Menglong Dong
` (7 subsequent siblings)
17 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, H. Peter Anvin, netdev, linux-kernel
Factor out the function arch_bpf_get_regs_nr(), which returns the number of
registers used by the function arguments.
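A small user-space sketch of the register-counting rule the helper
implements (struct func_model here is a simplified stand-in for
struct btf_func_model, for illustration only):

#include <stdio.h>

/* simplified stand-in for struct btf_func_model */
struct func_model {
	int nr_args;
	int arg_size[6];
	int arg_is_struct[6];
};

static int get_regs_nr(const struct func_model *m)
{
	int nr_regs = m->nr_args;

	/* a struct arg of N bytes occupies roundup(N, 8) / 8 registers */
	for (int i = 0; i < m->nr_args; i++) {
		if (m->arg_is_struct[i])
			nr_regs += (m->arg_size[i] + 7) / 8 - 1;
	}
	return nr_regs;
}

int main(void)
{
	/* two scalar args (2 regs) plus one 16-byte struct arg (2 regs) */
	struct func_model m = {
		.nr_args = 3,
		.arg_size = { 8, 8, 16 },
		.arg_is_struct = { 0, 0, 1 },
	};

	printf("nr_regs = %d\n", get_regs_nr(&m)); /* prints 4 */
	return 0;
}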
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
arch/x86/net/bpf_jit_comp.c | 22 +++++++++++++++-------
include/linux/bpf.h | 1 +
kernel/bpf/verifier.c | 5 +++++
3 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8d2fc436a748..7795917efc41 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3001,6 +3001,19 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
return 0;
}
+int arch_bpf_get_regs_nr(const struct btf_func_model *m)
+{
+ int nr_regs = m->nr_args;
+
+ /* extra registers for struct arguments */
+ for (int i = 0; i < m->nr_args; i++) {
+ if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
+ nr_regs += (m->arg_size[i] + 7) / 8 - 1;
+ }
+
+ return nr_regs;
+}
+
/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
#define LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack) \
__LOAD_TCC_PTR(-round_up(stack, 8) - 8)
@@ -3071,7 +3084,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
struct bpf_tramp_links *tlinks,
void *func_addr)
{
- int i, ret, nr_regs = m->nr_args, stack_size = 0;
+ int i, ret, nr_regs, stack_size = 0;
int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
@@ -3089,15 +3102,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
(flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));
- /* extra registers for struct arguments */
- for (i = 0; i < m->nr_args; i++) {
- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
- nr_regs += (m->arg_size[i] + 7) / 8 - 1;
- }
-
/* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
* are passed through regs, the remains are through stack.
*/
+ nr_regs = arch_bpf_get_regs_nr(m);
if (nr_regs > MAX_BPF_FUNC_ARGS)
return -ENOTSUPP;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bb3ab1aa3a9d..da5951b0335b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1220,6 +1220,7 @@ void arch_free_bpf_trampoline(void *image, unsigned int size);
int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size);
int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
struct bpf_tramp_links *tlinks, void *func_addr);
+int arch_bpf_get_regs_nr(const struct btf_func_model *m);
u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
struct bpf_tramp_run_ctx *run_ctx);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6311be5a63a..86a64d843465 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23274,6 +23274,11 @@ static int do_check_main(struct bpf_verifier_env *env)
}
+int __weak arch_bpf_get_regs_nr(const struct btf_func_model *m)
+{
+ return -ENODEV;
+}
+
static void print_verification_stats(struct bpf_verifier_env *env)
{
int i;
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 11/18] bpf: tracing: add multi-link support
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (9 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 10/18] x86,bpf: factor out arch_bpf_get_regs_nr Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 12/18] libbpf: don't free btf if tracing_multi progs existing Menglong Dong
` (6 subsequent siblings)
17 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, John Fastabend, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
Stanislav Fomichev, Hao Luo, Simon Horman, linux-kernel, netdev
In this commit, we add support for attaching a tracing BPF program to
multiple hooks, similar to BPF_TRACE_KPROBE_MULTI.
The use case is obvious. Currently, we have to create a BPF program for
each kernel function we want to trace, even though all the programs have
the same (or similar) logic. This consumes extra memory and makes program
loading slow when there are plenty of kernel functions to trace.
KPROBE_MULTI may be an alternative, but it can't do everything TRACING
does. For example, a kretprobe can't obtain the function arguments, but
FEXIT can.
For now, we support creating multi-links for fentry/fexit/modify_return
with the following new attach types:
BPF_TRACE_FENTRY_MULTI
BPF_TRACE_FEXIT_MULTI
BPF_MODIFY_RETURN_MULTI
We introduce struct bpf_tracing_multi_link for this purpose. It holds
references to all the kernel modules, target bpf programs (when attaching
to a bpf program) or target btfs (when attaching to kernel functions) that
the link uses.
During loading, the first target is used for verification by the verifier.
During attaching, we check that all the other targets are consistent with
the first one.
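A minimal sketch of how user space could drive the new attach type through
the raw bpf() syscall, assuming kernel headers with this series applied
(BPF_TRACE_FENTRY_MULTI and the tracing_multi fields do not exist
upstream); tracing_multi_attach is an illustrative name, and btf_ids[] and
cookies[] are assumed to be resolved against vmlinux BTF by the caller.
libbpf support is added in a later patch of this series.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* prog_fd: a loaded BPF_PROG_TYPE_TRACING prog whose
 * expected_attach_type is BPF_TRACE_FENTRY_MULTI.
 */
static int tracing_multi_attach(int prog_fd, const __u32 *btf_ids,
				const __u64 *cookies, __u32 cnt)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.attach_type = BPF_TRACE_FENTRY_MULTI;
	attr.link_create.tracing_multi.cnt = cnt;
	attr.link_create.tracing_multi.btf_ids = (__u64)(unsigned long)btf_ids;
	attr.link_create.tracing_multi.cookies = (__u64)(unsigned long)cookies;
	/* tgt_fds left 0: all btf ids resolve against vmlinux BTF */

	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}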
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
include/linux/bpf.h | 9 +
include/linux/bpf_types.h | 1 +
include/uapi/linux/bpf.h | 10 +
kernel/bpf/btf.c | 5 +
kernel/bpf/syscall.c | 353 +++++++++++++++++++++++++++++++++
kernel/bpf/trampoline.c | 7 +-
kernel/bpf/verifier.c | 25 ++-
net/bpf/test_run.c | 3 +
net/core/bpf_sk_storage.c | 2 +
tools/include/uapi/linux/bpf.h | 10 +
10 files changed, 421 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index da5951b0335b..b4f8e2a068e5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1806,6 +1806,15 @@ struct bpf_raw_tp_link {
u64 cookie;
};
+struct bpf_tracing_multi_link {
+ struct bpf_gtramp_link link;
+ enum bpf_attach_type attach_type;
+ struct btf **tgt_btfs;
+ struct module **mods;
+ u32 btf_cnt;
+ u32 mods_cnt;
+};
+
struct bpf_link_primer {
struct bpf_link *link;
struct file *file;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fa78f49d4a9a..139d5436ce4c 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -154,3 +154,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf)
BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi)
BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops)
BPF_LINK_TYPE(BPF_LINK_TYPE_UPROBE_MULTI, uprobe_multi)
+BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING_MULTI, tracing_multi)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 719ba230032f..a143a64f69ae 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1120,6 +1120,9 @@ enum bpf_attach_type {
BPF_NETKIT_PEER,
BPF_TRACE_KPROBE_SESSION,
BPF_TRACE_UPROBE_SESSION,
+ BPF_TRACE_FENTRY_MULTI,
+ BPF_TRACE_FEXIT_MULTI,
+ BPF_MODIFY_RETURN_MULTI,
__MAX_BPF_ATTACH_TYPE
};
@@ -1144,6 +1147,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_UPROBE_MULTI = 12,
BPF_LINK_TYPE_NETKIT = 13,
BPF_LINK_TYPE_SOCKMAP = 14,
+ BPF_LINK_TYPE_TRACING_MULTI = 15,
__MAX_BPF_LINK_TYPE,
};
@@ -1765,6 +1769,12 @@ union bpf_attr {
*/
__u64 cookie;
} tracing;
+ struct {
+ __u32 cnt;
+ __aligned_u64 tgt_fds;
+ __aligned_u64 btf_ids;
+ __aligned_u64 cookies;
+ } tracing_multi;
struct {
__u32 pf;
__u32 hooknum;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 853ca19bbe81..a25c2a0303f2 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6112,6 +6112,9 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ case BPF_MODIFY_RETURN_MULTI:
/* allow u64* as ctx */
if (btf_is_int(t) && t->size == 8)
return 0;
@@ -6718,6 +6721,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
fallthrough;
case BPF_LSM_CGROUP:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FEXIT_MULTI:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
* int LSM hooks, they use MODIFY_RETURN trampolines.
@@ -6736,6 +6740,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
t = btf_type_by_id(btf, t->type);
break;
case BPF_MODIFY_RETURN:
+ case BPF_MODIFY_RETURN_MULTI:
/* For now the BPF_MODIFY_RETURN can only be attached to
* functions that return an int.
*/
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b21bbbc4263d..01430308558f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -37,6 +37,7 @@
#include <linux/trace_events.h>
#include <linux/tracepoint.h>
#include <linux/overflow.h>
+#include <linux/kfunc_md.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -3469,6 +3470,34 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
.fill_link_info = bpf_tracing_link_fill_link_info,
};
+static int bpf_tracing_check_multi(struct bpf_prog *prog,
+ struct bpf_prog *tgt_prog,
+ struct btf *btf2,
+ const struct btf_type *t2)
+{
+ const struct btf_type *t1;
+ struct btf *btf1;
+
+ /* this case is already validated in bpf_check_attach_target() */
+ if (prog->type == BPF_PROG_TYPE_EXT)
+ return 0;
+
+ btf1 = prog->aux->dst_prog ? prog->aux->dst_prog->aux->btf :
+ prog->aux->attach_btf;
+ if (!btf1)
+ return -EOPNOTSUPP;
+
+ btf2 = btf2 ?: tgt_prog->aux->btf;
+ t1 = prog->aux->attach_func_proto;
+
+ /* the target is the same as the original one, this is a re-attach */
+ if (t1 == t2)
+ return 0;
+
+ return btf_check_func_part_match(btf1, t1, btf2, t2,
+ prog->aux->accessed_args);
+}
+
static int bpf_tracing_prog_attach(struct bpf_prog *prog,
int tgt_prog_fd,
u32 btf_id,
@@ -3668,6 +3697,323 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
return err;
}
+static void __bpf_tracing_multi_link_release(struct bpf_tracing_multi_link *link)
+{
+ int i;
+
+ if (link->mods_cnt) {
+ for (i = 0; i < link->mods_cnt; i++)
+ module_put(link->mods[i]);
+ kfree(link->mods);
+ }
+
+ if (link->btf_cnt) {
+ for (i = 0; i < link->btf_cnt; i++)
+ btf_put(link->tgt_btfs[i]);
+ kfree(link->tgt_btfs);
+ }
+
+ kfree(link->link.entries);
+}
+
+static void bpf_tracing_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_tracing_multi_link *multi_link =
+ container_of(link, struct bpf_tracing_multi_link, link.link);
+
+ bpf_gtrampoline_unlink_prog(&multi_link->link);
+ __bpf_tracing_multi_link_release(multi_link);
+}
+
+static void bpf_tracing_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_tracing_multi_link *tr_link =
+ container_of(link, struct bpf_tracing_multi_link, link.link);
+
+ kfree(tr_link);
+}
+
+static void bpf_tracing_multi_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_tracing_multi_link *tr_link =
+ container_of(link, struct bpf_tracing_multi_link, link.link);
+ int i;
+
+ for (i = 0; i < tr_link->link.entry_cnt; i++) {
+ seq_printf(seq,
+ "attach_type:\t%d\n"
+ "target_addr:\t%p\n",
+ tr_link->attach_type,
+ tr_link->link.entries[i].addr);
+ }
+}
+
+static const struct bpf_link_ops bpf_tracing_multi_link_lops = {
+ .release = bpf_tracing_multi_link_release,
+ .dealloc = bpf_tracing_multi_link_dealloc,
+ .show_fdinfo = bpf_tracing_multi_link_show_fdinfo,
+};
+
+#define MAX_TRACING_MULTI_CNT 102400
+
+static int bpf_tracing_get_target(u32 fd, struct bpf_prog **tgt_prog,
+ struct btf **tgt_btf)
+{
+ struct bpf_prog *prog = NULL;
+ struct btf *btf = NULL;
+ int err = 0;
+
+ if (fd) {
+ prog = bpf_prog_get(fd);
+ if (!IS_ERR(prog))
+ goto found;
+
+ prog = NULL;
+ /* "fd" is the fd of the kernel module BTF */
+ btf = btf_get_by_fd(fd);
+ if (IS_ERR(btf)) {
+ err = PTR_ERR(btf);
+ goto err;
+ }
+ if (!btf_is_kernel(btf)) {
+ btf_put(btf);
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+ } else {
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf)) {
+ err = PTR_ERR(btf);
+ goto err;
+ }
+ if (!btf) {
+ err = -EINVAL;
+ goto err;
+ }
+ btf_get(btf);
+ }
+found:
+ *tgt_prog = prog;
+ *tgt_btf = btf;
+ return 0;
+err:
+ *tgt_prog = NULL;
+ *tgt_btf = NULL;
+ return err;
+}
+
+static int bpf_tracing_multi_link_check(const union bpf_attr *attr, u32 **btf_ids,
+ u32 **tgt_fds, u64 **cookies,
+ u32 cnt)
+{
+ void __user *ubtf_ids;
+ void __user *utgt_fds;
+ void __user *ucookies;
+ void *tmp;
+ int i;
+
+ if (!cnt)
+ return -EINVAL;
+
+ if (cnt > MAX_TRACING_MULTI_CNT)
+ return -E2BIG;
+
+ ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies);
+ if (ucookies) {
+ tmp = kvmalloc_array(cnt, sizeof(**cookies), GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ *cookies = tmp;
+ if (copy_from_user(tmp, ucookies, cnt * sizeof(**cookies)))
+ return -EFAULT;
+ }
+
+ utgt_fds = u64_to_user_ptr(attr->link_create.tracing_multi.tgt_fds);
+ if (utgt_fds) {
+ tmp = kvmalloc_array(cnt, sizeof(**tgt_fds), GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ *tgt_fds = tmp;
+ if (copy_from_user(tmp, utgt_fds, cnt * sizeof(**tgt_fds)))
+ return -EFAULT;
+ }
+
+ ubtf_ids = u64_to_user_ptr(attr->link_create.tracing_multi.btf_ids);
+ if (!ubtf_ids)
+ return -EINVAL;
+
+ tmp = kvmalloc_array(cnt, sizeof(**btf_ids), GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ *btf_ids = tmp;
+ if (copy_from_user(tmp, ubtf_ids, cnt * sizeof(**btf_ids)))
+ return -EFAULT;
+
+ for (i = 0; i < cnt; i++) {
+ if (!(*btf_ids)[i])
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void bpf_tracing_multi_link_ptr_fill(struct bpf_tracing_multi_link *link,
+ struct ptr_array *mods,
+ struct ptr_array *btfs)
+{
+ link->mods = (struct module **) mods->ptrs;
+ link->mods_cnt = mods->cnt;
+ link->tgt_btfs = (struct btf **) btfs->ptrs;
+ link->btf_cnt = btfs->cnt;
+}
+
+static int bpf_tracing_prog_attach_multi(const union bpf_attr *attr,
+ struct bpf_prog *prog)
+{
+ struct bpf_tracing_multi_link *link = NULL;
+ u32 cnt, *btf_ids = NULL, *tgt_fds = NULL;
+ struct bpf_link_primer link_primer;
+ struct ptr_array btf_array = { };
+ struct ptr_array mod_array = { };
+ u64 *cookies = NULL;
+ int err = 0, i;
+
+ if ((prog->expected_attach_type != BPF_TRACE_FENTRY_MULTI &&
+ prog->expected_attach_type != BPF_TRACE_FEXIT_MULTI &&
+ prog->expected_attach_type != BPF_MODIFY_RETURN_MULTI) ||
+ prog->type != BPF_PROG_TYPE_TRACING)
+ return -EINVAL;
+
+ cnt = attr->link_create.tracing_multi.cnt;
+ err = bpf_tracing_multi_link_check(attr, &btf_ids, &tgt_fds, &cookies,
+ cnt);
+ if (err)
+ goto err_out;
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ link->link.entries = kzalloc(sizeof(*link->link.entries) * cnt,
+ GFP_USER);
+ if (!link->link.entries) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING_MULTI,
+ &bpf_tracing_multi_link_lops, prog);
+ link->attach_type = prog->expected_attach_type;
+
+ mutex_lock(&prog->aux->dst_mutex);
+
+ for (i = 0; i < cnt; i++) {
+ struct bpf_attach_target_info tgt_info = {};
+ struct bpf_gtramp_link_entry *entry;
+ struct bpf_prog *tgt_prog = NULL;
+ u32 tgt_fd, btf_id = btf_ids[i];
+ struct btf *tgt_btf = NULL;
+ struct module *mod = NULL;
+ int nr_regs;
+
+ entry = &link->link.entries[i];
+ tgt_fd = tgt_fds ? tgt_fds[i] : 0;
+ err = bpf_tracing_get_target(tgt_fd, &tgt_prog, &tgt_btf);
+ if (err)
+ goto err_out_unlock;
+
+ if (tgt_prog) {
+ /* the global trampoline link is ftrace based, bpf2bpf
+ * is not supported for now.
+ */
+ bpf_prog_put(tgt_prog);
+ err = -EOPNOTSUPP;
+ goto err_out_unlock;
+ }
+
+ if (tgt_btf) {
+ err = bpf_try_add_ptr(&btf_array, tgt_btf);
+ if (err) {
+ btf_put(tgt_btf);
+ if (err != -EEXIST)
+ goto err_out_unlock;
+ }
+ }
+
+ prog->aux->attach_tracing_prog = tgt_prog &&
+ tgt_prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->type == BPF_PROG_TYPE_TRACING;
+
+ err = bpf_check_attach_target(NULL, prog, tgt_prog, tgt_btf,
+ btf_id, &tgt_info);
+ if (err)
+ goto err_out_unlock;
+
+ nr_regs = arch_bpf_get_regs_nr(&tgt_info.fmodel);
+ if (nr_regs < 0) {
+ err = nr_regs;
+ goto err_out_unlock;
+ }
+
+ mod = tgt_info.tgt_mod;
+ if (mod) {
+ err = bpf_try_add_ptr(&mod_array, mod);
+ if (err) {
+ module_put(mod);
+ if (err != -EEXIST)
+ goto err_out_unlock;
+ }
+ }
+
+ err = bpf_tracing_check_multi(prog, tgt_prog, tgt_btf,
+ tgt_info.tgt_type);
+ if (err)
+ goto err_out_unlock;
+
+ entry->cookie = cookies ? cookies[i] : 0;
+ entry->addr = (void *)tgt_info.tgt_addr;
+ entry->tgt_prog = tgt_prog;
+ entry->attach_btf = tgt_btf;
+ entry->btf_id = btf_id;
+ entry->nr_args = nr_regs;
+
+ link->link.entry_cnt++;
+ }
+
+ err = bpf_gtrampoline_link_prog(&link->link);
+ if (err)
+ goto err_out_unlock;
+
+ err = bpf_link_prime(&link->link.link, &link_primer);
+ if (err) {
+ bpf_gtrampoline_unlink_prog(&link->link);
+ goto err_out_unlock;
+ }
+
+ bpf_tracing_multi_link_ptr_fill(link, &mod_array, &btf_array);
+ mutex_unlock(&prog->aux->dst_mutex);
+
+ kfree(btf_ids);
+ kfree(tgt_fds);
+ kfree(cookies);
+ return bpf_link_settle(&link_primer);
+err_out_unlock:
+ bpf_tracing_multi_link_ptr_fill(link, &mod_array, &btf_array);
+ __bpf_tracing_multi_link_release(link);
+ mutex_unlock(&prog->aux->dst_mutex);
+err_out:
+ kfree(btf_ids);
+ kfree(tgt_fds);
+ kfree(cookies);
+ kfree(link);
+ return err;
+}
+
static void bpf_raw_tp_link_release(struct bpf_link *link)
{
struct bpf_raw_tp_link *raw_tp =
@@ -4259,6 +4605,9 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ case BPF_MODIFY_RETURN_MULTI:
return BPF_PROG_TYPE_TRACING;
case BPF_LSM_MAC:
return BPF_PROG_TYPE_LSM;
@@ -5581,6 +5930,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_iter_link_attach(attr, uattr, prog);
else if (prog->expected_attach_type == BPF_LSM_CGROUP)
ret = cgroup_bpf_link_attach(attr, prog);
+ else if (prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN_MULTI)
+ ret = bpf_tracing_prog_attach_multi(attr, prog);
else
ret = bpf_tracing_prog_attach(prog,
attr->link_create.target_fd,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 8fcb0352f36e..07986669ada0 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -117,7 +117,9 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
return (ptype == BPF_PROG_TYPE_TRACING &&
(eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_MODIFY_RETURN)) ||
+ eatype == BPF_MODIFY_RETURN ||
+ eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI ||
+ eatype == BPF_MODIFY_RETURN_MULTI)) ||
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
@@ -516,10 +518,13 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
{
switch (prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FENTRY_MULTI:
return BPF_TRAMP_FENTRY;
case BPF_MODIFY_RETURN:
+ case BPF_MODIFY_RETURN_MULTI:
return BPF_TRAMP_MODIFY_RETURN;
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FEXIT_MULTI:
return BPF_TRAMP_FEXIT;
case BPF_LSM_MAC:
if (!prog->aux->attach_func_proto->type)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 86a64d843465..a44e1fed3fa1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17103,10 +17103,13 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
switch (env->prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
range = retval_range(0, 0);
break;
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
+ case BPF_MODIFY_RETURN_MULTI:
return 0;
case BPF_TRACE_ITER:
break;
@@ -22632,7 +22635,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_ret) {
if (eatype == BPF_TRACE_FEXIT ||
- eatype == BPF_MODIFY_RETURN) {
+ eatype == BPF_MODIFY_RETURN ||
+ eatype == BPF_TRACE_FEXIT_MULTI ||
+ eatype == BPF_MODIFY_RETURN_MULTI) {
/* Load nr_args from ctx - 8 */
insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
@@ -23619,7 +23624,9 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (tgt_prog->type == BPF_PROG_TYPE_TRACING &&
prog_extension &&
(tgt_prog->expected_attach_type == BPF_TRACE_FENTRY ||
- tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) {
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
+ tgt_prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI)) {
/* Program extensions can extend all program types
* except fentry/fexit. The reason is the following.
* The fentry/fexit programs are used for performance
@@ -23718,6 +23725,9 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
case BPF_LSM_CGROUP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_MODIFY_RETURN_MULTI:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
if (!btf_type_is_func(t)) {
bpf_log(log, "attach_btf_id %u is not a function\n",
btf_id);
@@ -23803,7 +23813,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "%s is not sleepable\n", tname);
return ret;
}
- } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
+ } else if (prog->expected_attach_type == BPF_MODIFY_RETURN ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN_MULTI) {
if (tgt_prog) {
module_put(mod);
bpf_log(log, "can't modify return codes of BPF programs\n");
@@ -23856,6 +23867,9 @@ static bool can_be_sleepable(struct bpf_prog *prog)
case BPF_TRACE_FEXIT:
case BPF_MODIFY_RETURN:
case BPF_TRACE_ITER:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ case BPF_MODIFY_RETURN_MULTI:
return true;
default:
return false;
@@ -23930,6 +23944,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return bpf_iter_prog_supported(prog);
}
+ if (prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN_MULTI)
+ return 0;
+
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr)
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 9728dbd4c66c..a5e5094a5189 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -696,6 +696,8 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
switch (prog->expected_attach_type) {
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
if (bpf_fentry_test1(1) != 2 ||
bpf_fentry_test2(2, 3) != 5 ||
bpf_fentry_test3(4, 5, 6) != 15 ||
@@ -709,6 +711,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog,
goto out;
break;
case BPF_MODIFY_RETURN:
+ case BPF_MODIFY_RETURN_MULTI:
ret = bpf_modify_return_test(1, &b);
if (b != 2)
side_effect++;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 2e538399757f..c5b1fd714b58 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -369,6 +369,8 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
return true;
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
strlen("bpf_sk_storage"));
default:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 719ba230032f..a143a64f69ae 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1120,6 +1120,9 @@ enum bpf_attach_type {
BPF_NETKIT_PEER,
BPF_TRACE_KPROBE_SESSION,
BPF_TRACE_UPROBE_SESSION,
+ BPF_TRACE_FENTRY_MULTI,
+ BPF_TRACE_FEXIT_MULTI,
+ BPF_MODIFY_RETURN_MULTI,
__MAX_BPF_ATTACH_TYPE
};
@@ -1144,6 +1147,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_UPROBE_MULTI = 12,
BPF_LINK_TYPE_NETKIT = 13,
BPF_LINK_TYPE_SOCKMAP = 14,
+ BPF_LINK_TYPE_TRACING_MULTI = 15,
__MAX_BPF_LINK_TYPE,
};
@@ -1765,6 +1769,12 @@ union bpf_attr {
*/
__u64 cookie;
} tracing;
+ struct {
+ __u32 cnt;
+ __aligned_u64 tgt_fds;
+ __aligned_u64 btf_ids;
+ __aligned_u64 cookies;
+ } tracing_multi;
struct {
__u32 pf;
__u32 hooknum;
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 12/18] libbpf: don't free btf if tracing_multi progs existing
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (10 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 11/18] bpf: tracing: add multi-link support Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-03 12:15 ` [PATCH bpf-next v2 13/18] libbpf: support tracing_multi Menglong Dong
` (5 subsequent siblings)
17 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, linux-kernel
By default, the kernel btfs that we load while loading the programs are
freed in bpf_object_load() once loading finishes. However, multi-link
tracing programs still need these btfs during attaching. Therefore, if any
multi-link tracing programs exist in the object, don't free the btfs until
the bpf object is closed.
Meanwhile, introduce the new API bpf_object__free_btfs() to manually free
the btfs after attaching.
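A minimal usage sketch of the new API (error handling trimmed; the object
file name "tracing_multi.bpf.o" is just an example, and the object is
assumed to contain a program in a *_MULTI tracing section):

#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	struct bpf_link *link;

	obj = bpf_object__open_file("tracing_multi.bpf.o", NULL);
	if (!obj || bpf_object__load(obj))
		return 1;

	/* the kernel/module BTFs are kept alive here because the object
	 * contains *_MULTI tracing programs; attaching still needs them.
	 */
	prog = bpf_object__next_program(obj, NULL);
	link = bpf_program__attach(prog);
	if (!link)
		return 1;

	/* once all multi-link programs are attached, the BTFs can go */
	bpf_object__free_btfs(obj);

	/* ... run ... */
	bpf_link__destroy(link);
	bpf_object__close(obj);
	return 0;
}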
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
tools/lib/bpf/libbpf.c | 24 +++++++++++++++++++++++-
tools/lib/bpf/libbpf.h | 2 ++
tools/lib/bpf/libbpf.map | 1 +
3 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index aee36402f0a3..530c29f2f5fc 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8583,6 +8583,28 @@ static void bpf_object_post_load_cleanup(struct bpf_object *obj)
obj->btf_vmlinux = NULL;
}
+void bpf_object__free_btfs(struct bpf_object *obj)
+{
+ if (!obj->btf_vmlinux || obj->state != OBJ_LOADED)
+ return;
+
+ bpf_object_post_load_cleanup(obj);
+}
+
+static void bpf_object_early_free_btf(struct bpf_object *obj)
+{
+ struct bpf_program *prog;
+
+ bpf_object__for_each_program(prog, obj) {
+ if (prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN_MULTI)
+ return;
+ }
+
+ bpf_object_post_load_cleanup(obj);
+}
+
static int bpf_object_prepare(struct bpf_object *obj, const char *target_btf_path)
{
int err;
@@ -8654,7 +8676,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps);
}
- bpf_object_post_load_cleanup(obj);
+ bpf_object_early_free_btf(obj);
obj->state = OBJ_LOADED; /* doesn't matter if successfully or not */
if (err) {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index d1cf813a057b..7cc810aa7967 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -323,6 +323,8 @@ LIBBPF_API struct bpf_program *
bpf_object__find_program_by_name(const struct bpf_object *obj,
const char *name);
+LIBBPF_API void bpf_object__free_btfs(struct bpf_object *obj);
+
LIBBPF_API int
libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
enum bpf_attach_type *expected_attach_type);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index c7fc0bde5648..4a0c993221a5 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -444,4 +444,5 @@ LIBBPF_1.6.0 {
bpf_program__line_info_cnt;
btf__add_decl_attr;
btf__add_type_attr;
+ bpf_object__free_btfs;
} LIBBPF_1.5.0;
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 13/18] libbpf: support tracing_multi
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (11 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 12/18] libbpf: don't free btf if tracing_multi progs existing Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-03 12:15 ` [PATCH bpf-next v2 14/18] libbpf: add btf type hash lookup support Menglong Dong
` (4 subsequent siblings)
17 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, linux-kernel
Add support for the following attach types; a usage sketch of the new API
follows the list:
BPF_TRACE_FENTRY_MULTI
BPF_TRACE_FEXIT_MULTI
BPF_MODIFY_RETURN_MULTI
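A minimal sketch of attaching one program to several kernel functions with
the new bpf_program__attach_trace_multi_opts() API; the symbols
"tcp_v4_rcv"/"tcp_v6_rcv" and the cookie values are just examples, and prog
is assumed to be loaded from a "fentry.multi+" section:

#include <bpf/libbpf.h>

static struct bpf_link *attach_two_funcs(struct bpf_program *prog)
{
	const char *syms[] = { "tcp_v4_rcv", "tcp_v6_rcv" };
	__u64 cookies[] = { 1, 2 };
	LIBBPF_OPTS(bpf_trace_multi_opts, opts,
		.syms = syms,
		.cookies = cookies,
		.cnt = 2,
	);

	/* resolves the symbols to BTF ids internally and creates one
	 * BPF_LINK_TYPE_TRACING_MULTI link covering both functions
	 */
	return bpf_program__attach_trace_multi_opts(prog, &opts);
}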
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
tools/bpf/bpftool/common.c | 3 +
tools/lib/bpf/bpf.c | 10 +++
tools/lib/bpf/bpf.h | 6 ++
tools/lib/bpf/libbpf.c | 168 ++++++++++++++++++++++++++++++++++++-
tools/lib/bpf/libbpf.h | 19 +++++
tools/lib/bpf/libbpf.map | 1 +
6 files changed, 204 insertions(+), 3 deletions(-)
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index b07317d2842f..bcf2991a3f8f 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -1189,6 +1189,9 @@ const char *bpf_attach_type_input_str(enum bpf_attach_type t)
case BPF_TRACE_FENTRY: return "fentry";
case BPF_TRACE_FEXIT: return "fexit";
case BPF_MODIFY_RETURN: return "mod_ret";
+ case BPF_TRACE_FENTRY_MULTI: return "fentry_multi";
+ case BPF_TRACE_FEXIT_MULTI: return "fexit_multi";
+ case BPF_MODIFY_RETURN_MULTI: return "mod_ret_multi";
case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select";
case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate";
default: return libbpf_bpf_attach_type_str(t);
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 6eb421ccf91b..dd65e133d412 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -797,6 +797,16 @@ int bpf_link_create(int prog_fd, int target_fd,
if (!OPTS_ZEROED(opts, tracing))
return libbpf_err(-EINVAL);
break;
+ case BPF_TRACE_FENTRY_MULTI:
+ case BPF_TRACE_FEXIT_MULTI:
+ case BPF_MODIFY_RETURN_MULTI:
+ attr.link_create.tracing_multi.btf_ids = ptr_to_u64(OPTS_GET(opts, tracing_multi.btf_ids, 0));
+ attr.link_create.tracing_multi.tgt_fds = ptr_to_u64(OPTS_GET(opts, tracing_multi.tgt_fds, 0));
+ attr.link_create.tracing_multi.cookies = ptr_to_u64(OPTS_GET(opts, tracing_multi.cookies, 0));
+ attr.link_create.tracing_multi.cnt = OPTS_GET(opts, tracing_multi.cnt, 0);
+ if (!OPTS_ZEROED(opts, tracing_multi))
+ return libbpf_err(-EINVAL);
+ break;
case BPF_NETFILTER:
attr.link_create.netfilter.pf = OPTS_GET(opts, netfilter.pf, 0);
attr.link_create.netfilter.hooknum = OPTS_GET(opts, netfilter.hooknum, 0);
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 1342564214c8..5c97acec643d 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -422,6 +422,12 @@ struct bpf_link_create_opts {
struct {
__u64 cookie;
} tracing;
+ struct {
+ __u32 cnt;
+ const __u32 *btf_ids;
+ const __u32 *tgt_fds;
+ const __u64 *cookies;
+ } tracing_multi;
struct {
__u32 pf;
__u32 hooknum;
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 530c29f2f5fc..ae38b3ab84c7 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -136,6 +136,9 @@ static const char * const attach_type_name[] = {
[BPF_NETKIT_PEER] = "netkit_peer",
[BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session",
[BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session",
+ [BPF_TRACE_FENTRY_MULTI] = "trace_fentry_multi",
+ [BPF_TRACE_FEXIT_MULTI] = "trace_fexit_multi",
+ [BPF_MODIFY_RETURN_MULTI] = "modify_return_multi",
};
static const char * const link_type_name[] = {
@@ -410,6 +413,8 @@ enum sec_def_flags {
SEC_XDP_FRAGS = 16,
/* Setup proper attach type for usdt probes. */
SEC_USDT = 32,
+ /* attachment target is multi-link */
+ SEC_ATTACH_BTF_MULTI = 64,
};
struct bpf_sec_def {
@@ -7419,9 +7424,9 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
opts->expected_attach_type = BPF_TRACE_UPROBE_MULTI;
}
- if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) {
+ if ((def & (SEC_ATTACH_BTF | SEC_ATTACH_BTF_MULTI)) && !prog->attach_btf_id) {
int btf_obj_fd = 0, btf_type_id = 0, err;
- const char *attach_name;
+ const char *attach_name, *name_end;
attach_name = strchr(prog->sec_name, '/');
if (!attach_name) {
@@ -7440,7 +7445,27 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
}
attach_name++; /* skip over / */
- err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd, &btf_type_id);
+ name_end = strchr(attach_name, ',');
+ /* for multi-link tracing, use the first target symbol during
+ * loading.
+ */
+ if ((def & SEC_ATTACH_BTF_MULTI) && name_end) {
+ int len = name_end - attach_name + 1;
+ char *first_tgt;
+
+ first_tgt = malloc(len);
+ if (!first_tgt)
+ return -ENOMEM;
+ libbpf_strlcpy(first_tgt, attach_name, len);
+ first_tgt[len - 1] = '\0';
+ err = libbpf_find_attach_btf_id(prog, first_tgt, &btf_obj_fd,
+ &btf_type_id);
+ free(first_tgt);
+ } else {
+ err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd,
+ &btf_type_id);
+ }
+
if (err)
return err;
@@ -9519,6 +9544,7 @@ static int attach_kprobe_session(const struct bpf_program *prog, long cookie, st
static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);
+static int attach_trace_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
static const struct bpf_sec_def section_defs[] = {
SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE),
@@ -9565,6 +9591,13 @@ static const struct bpf_sec_def section_defs[] = {
SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
+ SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace),
+ SEC_DEF("fentry.multi+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
+ SEC_DEF("fmod_ret.multi+", TRACING, BPF_MODIFY_RETURN_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
+ SEC_DEF("fexit.multi+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
+ SEC_DEF("fentry.multi.s+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
+ SEC_DEF("fmod_ret.multi.s+", TRACING, BPF_MODIFY_RETURN_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
+ SEC_DEF("fexit.multi.s+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace),
SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
@@ -12799,6 +12832,135 @@ static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_
return libbpf_get_error(*link);
}
+struct bpf_link *bpf_program__attach_trace_multi_opts(const struct bpf_program *prog,
+ const struct bpf_trace_multi_opts *opts)
+{
+ LIBBPF_OPTS(bpf_link_create_opts, link_opts);
+ __u32 *btf_ids = NULL, *tgt_fds = NULL;
+ struct bpf_link *link = NULL;
+ char errmsg[STRERR_BUFSIZE];
+ int prog_fd, pfd, cnt, err;
+
+ if (!OPTS_VALID(opts, bpf_trace_multi_opts))
+ return libbpf_err_ptr(-EINVAL);
+
+ prog_fd = bpf_program__fd(prog);
+ if (prog_fd < 0) {
+ pr_warn("prog '%s': can't attach before loaded\n", prog->name);
+ return libbpf_err_ptr(-EINVAL);
+ }
+
+ cnt = OPTS_GET(opts, cnt, 0);
+ if (opts->syms) {
+ int btf_obj_fd, btf_type_id, i;
+
+ if (opts->btf_ids || opts->tgt_fds) {
+ pr_warn("can't set both opts->syms and opts->btf_ids\n");
+ return libbpf_err_ptr(-EINVAL);
+ }
+
+ btf_ids = malloc(sizeof(*btf_ids) * cnt);
+ tgt_fds = malloc(sizeof(*tgt_fds) * cnt);
+ if (!btf_ids || !tgt_fds) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+ for (i = 0; i < cnt; i++) {
+ btf_obj_fd = btf_type_id = 0;
+
+ err = find_kernel_btf_id(prog->obj, opts->syms[i],
+ prog->expected_attach_type, &btf_obj_fd,
+ &btf_type_id);
+ if (err)
+ goto err_free;
+ btf_ids[i] = btf_type_id;
+ tgt_fds[i] = btf_obj_fd;
+ }
+ link_opts.tracing_multi.btf_ids = btf_ids;
+ link_opts.tracing_multi.tgt_fds = tgt_fds;
+ } else {
+ link_opts.tracing_multi.btf_ids = OPTS_GET(opts, btf_ids, 0);
+ link_opts.tracing_multi.tgt_fds = OPTS_GET(opts, tgt_fds, 0);
+ }
+
+ link = calloc(1, sizeof(*link));
+ if (!link) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+ link->detach = &bpf_link__detach_fd;
+
+ link_opts.tracing_multi.cookies = OPTS_GET(opts, cookies, 0);
+ link_opts.tracing_multi.cnt = cnt;
+
+ pfd = bpf_link_create(prog_fd, 0, bpf_program__expected_attach_type(prog), &link_opts);
+ if (pfd < 0) {
+ err = -errno;
+ pr_warn("prog '%s': failed to attach: %s\n",
+ prog->name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg)));
+ goto err_free;
+ }
+ link->fd = pfd;
+
+ free(btf_ids);
+ free(tgt_fds);
+ return link;
+err_free:
+ free(btf_ids);
+ free(tgt_fds);
+ free(link);
+ return libbpf_err_ptr(err);
+}
+
+static int attach_trace_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link)
+{
+ LIBBPF_OPTS(bpf_trace_multi_opts, opts);
+ int i, err, len, cnt = 1;
+ char **syms, *buf, *name;
+ const char *spec;
+
+ spec = strchr(prog->sec_name, '/');
+ if (!spec || !*(++spec))
+ return -EINVAL;
+
+ len = strlen(spec) + 1;
+ buf = malloc(len);
+ if (!buf)
+ return -ENOMEM;
+
+ libbpf_strlcpy(buf, spec, len);
+ for (i = 0; i < len; i++) {
+ if (buf[i] == ',')
+ cnt++;
+ }
+
+ syms = malloc(sizeof(*syms) * cnt);
+ if (!syms) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ opts.syms = (const char **)syms;
+ opts.cnt = cnt;
+ name = buf;
+ err = -EINVAL;
+ while (name) {
+ if (*name == '\0')
+ goto out_free;
+ *(syms++) = name;
+ name = strchr(name, ',');
+ if (name)
+ *(name++) = '\0';
+ }
+
+ *link = bpf_program__attach_trace_multi_opts(prog, &opts);
+ err = libbpf_get_error(*link);
+out_free:
+ free(buf);
+ free(opts.syms);
+ return err;
+}
+
static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link)
{
*link = bpf_program__attach_lsm(prog);
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 7cc810aa7967..1e7603c75224 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -833,6 +833,25 @@ bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex);
LIBBPF_API struct bpf_link *
bpf_program__attach_freplace(const struct bpf_program *prog,
int target_fd, const char *attach_func_name);
+struct bpf_trace_multi_opts {
+ /* size of this struct, for forward/backward compatibility */
+ size_t sz;
+ /* array of function symbols to attach */
+ const char **syms;
+ /* array of the btf type id to attach */
+ __u32 *btf_ids;
+ /* array of the target fds */
+ __u32 *tgt_fds;
+ /* array of the cookies */
+ __u64 *cookies;
+ /* number of elements in syms/btf_ids/cookies arrays */
+ size_t cnt;
+};
+#define bpf_trace_multi_opts__last_field cnt
+
+LIBBPF_API struct bpf_link *
+bpf_program__attach_trace_multi_opts(const struct bpf_program *prog,
+ const struct bpf_trace_multi_opts *opts);
struct bpf_netfilter_opts {
/* size of this struct, for forward/backward compatibility */
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 4a0c993221a5..5f580b134d18 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -445,4 +445,5 @@ LIBBPF_1.6.0 {
btf__add_decl_attr;
btf__add_type_attr;
bpf_object__free_btfs;
+ bpf_program__attach_trace_multi_opts;
} LIBBPF_1.5.0;
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 14/18] libbpf: add btf type hash lookup support
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (12 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 13/18] libbpf: support tracing_multi Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-03 12:15 ` [PATCH bpf-next v2 15/18] libbpf: add skip_invalid and attach_tracing for tracing_multi Menglong Dong
` (3 subsequent siblings)
17 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, linux-kernel
For now, libbpf finds the BTF type id by looping over all the BTF types
and comparing their names, which is inefficient if we have many functions
to look up.
Add a "use_hash" argument to find_kernel_btf_id() to indicate whether the
BTF type id should be looked up through a hash table. The hash table is
initialized on first use if it does not exist yet.
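Roughly, a caller is expected to use the new helpers like this (a minimal
sketch, not part of this patch; it only relies on the btf__make_hash(),
btf__find_by_func_hash() and BTF_KIND_FUNC names used below):

  #include <bpf/btf.h>

  /* build the BTF_KIND_FUNC hashmap once, then do O(1) lookups
   * instead of scanning all BTF types for every symbol
   */
  static int resolve_func_ids(struct btf *btf, const char **names,
                              __s32 *ids, int cnt)
  {
          int i, err;

          err = btf__make_hash(btf);
          if (err)
                  return err;

          for (i = 0; i < cnt; i++) {
                  ids[i] = btf__find_by_func_hash(btf, names[i],
                                                  BTF_KIND_FUNC);
                  if (ids[i] < 0)
                          return ids[i]; /* -ENOENT when not found */
          }
          return 0;
  }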
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
tools/lib/bpf/btf.c | 102 +++++++++++++++++++++++++++++++++++++++
tools/lib/bpf/btf.h | 6 +++
tools/lib/bpf/libbpf.c | 37 +++++++++++---
tools/lib/bpf/libbpf.map | 3 ++
4 files changed, 140 insertions(+), 8 deletions(-)
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 37682908cb0f..e8ed8e6de7d7 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -35,6 +35,7 @@ struct btf {
void *raw_data;
/* raw BTF data in non-native endianness */
void *raw_data_swapped;
+ struct hashmap *func_hash;
__u32 raw_size;
/* whether target endianness differs from the native one */
bool swapped_endian;
@@ -131,6 +132,12 @@ struct btf {
int ptr_sz;
};
+struct btf_type_key {
+ __u32 dummy;
+ const char *name;
+ int kind;
+};
+
static inline __u64 ptr_to_u64(const void *ptr)
{
return (__u64) (unsigned long) ptr;
@@ -938,6 +945,100 @@ static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id,
return libbpf_err(-ENOENT);
}
+static size_t btf_hash_name(long key, void *btf)
+{
+ const struct btf_type *t = (const struct btf_type *)key;
+ const char *name;
+
+ if (t->name_off > BTF_MAX_NAME_OFFSET)
+ name = ((struct btf_type_key *)key)->name;
+ else
+ name = btf__name_by_offset(btf, t->name_off);
+
+ return str_hash(name);
+}
+
+static bool btf_name_equal(long key1, long key2, void *btf)
+{
+ const struct btf_type *t1 = (const struct btf_type *)key1,
+ *t2 = (const struct btf_type *)key2;
+ const char *name1, *name2;
+ int k1, k2;
+
+ name1 = btf__name_by_offset(btf, t1->name_off);
+ k1 = btf_kind(t1);
+
+ if (t2->name_off > BTF_MAX_NAME_OFFSET) {
+ struct btf_type_key *t2_key = (struct btf_type_key *)key2;
+
+ name2 = t2_key->name;
+ k2 = t2_key->kind;
+ } else {
+ name2 = btf__name_by_offset(btf, t2->name_off);
+ k2 = btf_kind(t2);
+ }
+
+ return k1 == k2 && strcmp(name1, name2) == 0;
+}
+
+__s32 btf__make_hash(struct btf *btf)
+{
+ __u32 i, nr_types = btf__type_cnt(btf);
+ struct hashmap *map;
+
+ if (btf->func_hash)
+ return 0;
+
+ map = hashmap__new(btf_hash_name, btf_name_equal, (void *)btf);
+ if (!map)
+ return libbpf_err(-ENOMEM);
+
+ for (i = btf->start_id; i < nr_types; i++) {
+ const struct btf_type *t = btf__type_by_id(btf, i);
+ int err;
+
+ /* only functions need this */
+ if (btf_kind(t) != BTF_KIND_FUNC)
+ continue;
+
+ err = hashmap__add(map, t, i);
+ if (err == -EEXIST) {
+ pr_warn("btf type exist: name=%s\n",
+ btf__name_by_offset(btf, t->name_off));
+ continue;
+ }
+
+ if (err) {
+ hashmap__free(map);
+ return libbpf_err(err);
+ }
+ }
+
+ btf->func_hash = map;
+ return 0;
+}
+
+bool btf__has_hash(struct btf *btf)
+{
+ return !!btf->func_hash;
+}
+
+int btf__find_by_func_hash(struct btf *btf, const char *type_name, __u32 kind)
+{
+ struct btf_type_key key = {
+ .dummy = 0xffffffff,
+ .name = type_name,
+ .kind = kind,
+ };
+ long t;
+
+ if (!btf->func_hash)
+ return -ENOENT;
+
+ if (hashmap__find(btf->func_hash, &key, &t))
+ return t;
+
+ return -ENOENT;
+}
+
__s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name,
__u32 kind)
{
@@ -974,6 +1075,7 @@ void btf__free(struct btf *btf)
if (btf->fd >= 0)
close(btf->fd);
+ hashmap__free(btf->func_hash);
if (btf_is_modifiable(btf)) {
/* if BTF was modified after loading, it will have a split
* in-memory representation for header, types, and strings
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index ccfd905f03df..dd88800684c0 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -336,6 +336,12 @@ btf_dump__dump_type_data(struct btf_dump *d, __u32 id,
const void *data, size_t data_sz,
const struct btf_dump_type_data_opts *opts);
+
+LIBBPF_API __s32 btf__make_hash(struct btf *btf);
+LIBBPF_API bool btf__has_hash(struct btf *btf);
+LIBBPF_API int
+btf__find_by_func_hash(struct btf *btf, const char *type_name, __u32 kind);
+
/*
* A set of helpers for easier BTF types handling.
*
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ae38b3ab84c7..4c67f6ee8a90 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -634,6 +634,7 @@ struct extern_desc {
struct module_btf {
struct btf *btf;
+ struct hashmap *btf_name_hash;
char *name;
__u32 id;
int fd;
@@ -717,6 +718,7 @@ struct bpf_object {
* it at load time.
*/
struct btf *btf_vmlinux;
+ struct hashmap *btf_name_hash;
/* Path to the custom BTF to be used for BPF CO-RE relocations as an
* override for vmlinux BTF.
*/
@@ -1004,7 +1006,7 @@ static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name,
struct module_btf **res_mod_btf);
#define STRUCT_OPS_VALUE_PREFIX "bpf_struct_ops_"
-static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix,
+static int find_btf_by_prefix_kind(struct btf *btf, const char *prefix,
const char *name, __u32 kind);
static int
@@ -10052,7 +10054,7 @@ void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type,
}
}
-static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix,
+static int find_btf_by_prefix_kind(struct btf *btf, const char *prefix,
const char *name, __u32 kind)
{
char btf_type_name[BTF_MAX_NAME_SIZE];
@@ -10066,6 +10068,10 @@ static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix,
*/
if (ret < 0 || ret >= sizeof(btf_type_name))
return -ENAMETOOLONG;
+
+ if (btf__has_hash(btf))
+ return btf__find_by_func_hash(btf, btf_type_name, kind);
+
return btf__find_by_name_kind(btf, btf_type_name, kind);
}
@@ -10138,9 +10144,9 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd, int t
static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name,
enum bpf_attach_type attach_type,
- int *btf_obj_fd, int *btf_type_id)
+ int *btf_obj_fd, int *btf_type_id, bool use_hash)
{
- int ret, i, mod_len;
+ int ret, i, mod_len, err;
const char *fn_name, *mod_name = NULL;
fn_name = strchr(attach_name, ':');
@@ -10151,6 +10157,11 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name,
}
if (!mod_name || strncmp(mod_name, "vmlinux", mod_len) == 0) {
+ if (use_hash) {
+ err = btf__make_hash(obj->btf_vmlinux);
+ if (err)
+ return err;
+ }
ret = find_attach_btf_id(obj->btf_vmlinux,
mod_name ? fn_name : attach_name,
attach_type);
@@ -10173,6 +10184,11 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name,
if (mod_name && strncmp(mod->name, mod_name, mod_len) != 0)
continue;
+ if (use_hash) {
+ err = btf__make_hash(mod->btf);
+ if (err)
+ return err;
+ }
ret = find_attach_btf_id(mod->btf,
mod_name ? fn_name : attach_name,
attach_type);
@@ -10222,7 +10238,7 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac
} else {
err = find_kernel_btf_id(prog->obj, attach_name,
attach_type, btf_obj_fd,
- btf_type_id);
+ btf_type_id, false);
}
if (err) {
pr_warn("prog '%s': failed to find kernel BTF type ID of '%s': %s\n",
@@ -12866,11 +12882,16 @@ struct bpf_link *bpf_program__attach_trace_multi_opts(const struct bpf_program *
goto err_free;
}
for (i = 0; i < cnt; i++) {
- btf_obj_fd = btf_type_id = 0;
+ /* only use btf type function hashmap when the count
+ * is big enough.
+ */
+ bool func_hash = cnt > 1024;
+
+ btf_obj_fd = btf_type_id = 0;
err = find_kernel_btf_id(prog->obj, opts->syms[i],
prog->expected_attach_type, &btf_obj_fd,
- &btf_type_id);
+ &btf_type_id, func_hash);
if (err)
goto err_free;
btf_ids[i] = btf_type_id;
@@ -13976,7 +13997,7 @@ int bpf_program__set_attach_target(struct bpf_program *prog,
return libbpf_err(err);
err = find_kernel_btf_id(prog->obj, attach_func_name,
prog->expected_attach_type,
- &btf_obj_fd, &btf_id);
+ &btf_obj_fd, &btf_id, false);
if (err)
return libbpf_err(err);
}
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 5f580b134d18..e7435252d15d 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -446,4 +446,7 @@ LIBBPF_1.6.0 {
btf__add_type_attr;
bpf_object__free_btfs;
bpf_program__attach_trace_multi_opts;
+ btf__has_hash;
+ btf__find_by_func_hash;
+ btf__make_hash;
} LIBBPF_1.5.0;
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 15/18] libbpf: add skip_invalid and attach_tracing for tracing_multi
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (13 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 14/18] libbpf: add btf type hash lookup support Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-03 12:15 ` [PATCH bpf-next v2 16/18] selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c Menglong Dong
` (2 subsequent siblings)
17 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, linux-kernel
Add skip_invalid and attach_tracing to tracing_multi for the selftests.
When we try to attach all the functions in available_filter_functions with
tracing_multi, we can't tell in advance whether a target symbol can be
attached successfully, so the whole attachment can fail. When skip_invalid
is set to true, libbpf checks whether each symbol can be attached and
skips the invalid entries.
We skip a symbol in the following cases:
1. the btf type doesn't exist
2. the btf type is not a function proto
3. the function has more than 6 args
4. the return type is a struct or union
5. any function arg is a struct or union
The 5th rule may wrongly exclude some attachable functions, but that's
acceptable for the tests.
"attach_tracing" is used to attach a TRACING prog as TRACING_MULTI. For
example, we can set the attach type to FENTRY_MULTI before we load the
skel, and then attach the prog with bpf_program__attach_trace_multi_opts()
and "attach_tracing=1". libbpf will then attach the prog to its own target
btf type automatically. This is also used to reuse the existing tracing
selftests.
(Oh my goodness! What am I doing?)
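For illustration, a minimal sketch of the attach_tracing flow (the
"fentry_test" skeleton and the "test1" prog are just examples; the APIs
are the ones added in this series):

  LIBBPF_OPTS(bpf_trace_multi_opts, opts, .attach_tracing = true);
  struct bpf_link *link;

  /* switch the FENTRY prog to FENTRY_MULTI before loading */
  bpf_program__set_expected_attach_type(skel->progs.test1,
                                        BPF_TRACE_FENTRY_MULTI);
  if (fentry_test__load(skel))
          return -1;

  /* no syms/btf_ids given: libbpf reuses the prog's own
   * attach_btf_id/attach_btf_obj_fd as the single target
   */
  link = bpf_program__attach_trace_multi_opts(skel->progs.test1, &opts);
  if (!link)
          return -1;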
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
tools/lib/bpf/libbpf.c | 97 ++++++++++++++++++++++++++++++++++++------
tools/lib/bpf/libbpf.h | 6 ++-
2 files changed, 89 insertions(+), 14 deletions(-)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 4c67f6ee8a90..e8068ca58149 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -10144,7 +10144,8 @@ static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd, int t
static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name,
enum bpf_attach_type attach_type,
- int *btf_obj_fd, int *btf_type_id, bool use_hash)
+ int *btf_obj_fd, int *btf_type_id, bool use_hash,
+ const struct btf **btf)
{
int ret, i, mod_len, err;
const char *fn_name, *mod_name = NULL;
@@ -10168,6 +10169,8 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name,
if (ret > 0) {
*btf_obj_fd = 0; /* vmlinux BTF */
*btf_type_id = ret;
+ if (btf)
+ *btf = obj->btf_vmlinux;
return 0;
}
if (ret != -ENOENT)
@@ -10195,6 +10198,8 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name,
if (ret > 0) {
*btf_obj_fd = mod->fd;
*btf_type_id = ret;
+ if (btf)
+ *btf = mod->btf;
return 0;
}
if (ret == -ENOENT)
@@ -10238,7 +10243,7 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac
} else {
err = find_kernel_btf_id(prog->obj, attach_name,
attach_type, btf_obj_fd,
- btf_type_id, false);
+ btf_type_id, false, NULL);
}
if (err) {
pr_warn("prog '%s': failed to find kernel BTF type ID of '%s': %s\n",
@@ -12848,6 +12853,53 @@ static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_
return libbpf_get_error(*link);
}
+static bool is_trace_valid(const struct btf *btf, int btf_type_id, const char *name)
+{
+ const struct btf_type *t;
+
+ t = skip_mods_and_typedefs(btf, btf_type_id, NULL);
+ if (btf_is_func(t)) {
+ const struct btf_param *args;
+ __u32 nargs, m;
+
+ t = skip_mods_and_typedefs(btf, t->type, NULL);
+ if (!btf_is_func_proto(t)) {
+ pr_debug("skipping no function btf type for %s\n",
+ name);
+ return false;
+ }
+
+ args = (const struct btf_param *)(t + 1);
+ nargs = btf_vlen(t);
+ if (nargs > 6) {
+ pr_debug("skipping args count more than 6 for %s\n",
+ name);
+ return false;
+ }
+
+ t = skip_mods_and_typedefs(btf, t->type, NULL);
+ if (btf_is_struct(t) || btf_is_union(t) ||
+ (nargs && args[nargs - 1].type == 0)) {
+ pr_debug("skipping invalid return type for %s\n",
+ name);
+ return false;
+ }
+
+ for (m = 0; m < nargs; m++) {
+ t = skip_mods_and_typedefs(btf, args[m].type, NULL);
+ if (btf_is_struct(t) || btf_is_union(t)) {
+ pr_debug("skipping not supported arg type %s\n",
+ name);
+ break;
+ }
+ }
+ if (m < nargs)
+ return false;
+ }
+
+ return true;
+}
+
struct bpf_link *bpf_program__attach_trace_multi_opts(const struct bpf_program *prog,
const struct bpf_trace_multi_opts *opts)
{
@@ -12868,7 +12920,7 @@ struct bpf_link *bpf_program__attach_trace_multi_opts(const struct bpf_program *
cnt = OPTS_GET(opts, cnt, 0);
if (opts->syms) {
- int btf_obj_fd, btf_type_id, i;
+ int btf_obj_fd, btf_type_id, i, j = 0;
if (opts->btf_ids || opts->tgt_fds) {
pr_warn("can set both opts->syms and opts->btf_ids\n");
@@ -12882,23 +12934,41 @@ struct bpf_link *bpf_program__attach_trace_multi_opts(const struct bpf_program *
goto err_free;
}
for (i = 0; i < cnt; i++) {
+ const struct btf *btf = NULL;
+ bool func_hash;
+
/* only use btf type function hashmap when the count
* is big enough.
*/
- bool func_hash = cnt > 1024;
-
-
+ func_hash = cnt > 1024;
btf_obj_fd = btf_type_id = 0;
err = find_kernel_btf_id(prog->obj, opts->syms[i],
- prog->expected_attach_type, &btf_obj_fd,
- &btf_type_id, func_hash);
- if (err)
- goto err_free;
- btf_ids[i] = btf_type_id;
- tgt_fds[i] = btf_obj_fd;
+ prog->expected_attach_type, &btf_obj_fd,
+ &btf_type_id, func_hash, &btf);
+ if (err) {
+ if (!opts->skip_invalid)
+ goto err_free;
+
+ pr_debug("can't find btf type for %s, skip\n",
+ opts->syms[i]);
+ continue;
+ }
+
+ if (opts->skip_invalid &&
+ !is_trace_valid(btf, btf_type_id, opts->syms[i]))
+ continue;
+
+ btf_ids[j] = btf_type_id;
+ tgt_fds[j] = btf_obj_fd;
+ j++;
}
+ cnt = j;
link_opts.tracing_multi.btf_ids = btf_ids;
link_opts.tracing_multi.tgt_fds = tgt_fds;
+ } else if (opts->attach_tracing) {
+ link_opts.tracing_multi.btf_ids = &prog->attach_btf_id;
+ link_opts.tracing_multi.tgt_fds = &prog->attach_btf_obj_fd;
+ cnt = 1;
} else {
link_opts.tracing_multi.btf_ids = OPTS_GET(opts, btf_ids, 0);
link_opts.tracing_multi.tgt_fds = OPTS_GET(opts, tgt_fds, 0);
@@ -13997,7 +14067,8 @@ int bpf_program__set_attach_target(struct bpf_program *prog,
return libbpf_err(err);
err = find_kernel_btf_id(prog->obj, attach_func_name,
prog->expected_attach_type,
- &btf_obj_fd, &btf_id, false);
+ &btf_obj_fd, &btf_id, false,
+ NULL);
if (err)
return libbpf_err(err);
}
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 1e7603c75224..2f65a9cd57f9 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -846,8 +846,12 @@ struct bpf_trace_multi_opts {
__u64 *cookies;
/* number of elements in syms/btf_ids/cookies arrays */
size_t cnt;
+ /* skip the invalid btf type before attaching */
+ bool skip_invalid;
+ /* attach a TRACING prog as TRACING_MULTI */
+ bool attach_tracing;
};
-#define bpf_trace_multi_opts__last_field cnt
+#define bpf_trace_multi_opts__last_field attach_tracing
LIBBPF_API struct bpf_link *
bpf_program__attach_trace_multi_opts(const struct bpf_program *prog,
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 16/18] selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (14 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 15/18] libbpf: add skip_invalid and attach_tracing for tracing_multi Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 17/18] selftests/bpf: add basic testcases for tracing_multi Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 18/18] selftests/bpf: add bench tests " Menglong Dong
17 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Mykola Lysenko, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, Nick Desaulniers,
Bill Wendling, Justin Stitt, linux-kselftest, linux-kernel,
netdev, llvm
Sometimes we need to get all the kernel functions that can be traced, so
move get_syms() and get_addrs() from kprobe_multi_test.c to
trace_helpers.c and rename them to bpf_get_ksyms() and bpf_get_addrs().
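A short sketch of how a test is expected to use the renamed helpers
(error handling trimmed; kernel=true means module symbols are skipped):

  char **syms = NULL;
  unsigned long *addrs = NULL;
  size_t sym_cnt = 0, addr_cnt = 0;

  if (!bpf_get_ksyms(&syms, &sym_cnt, true))
          printf("%zu traceable kernel symbols\n", sym_cnt);
  if (!bpf_get_addrs(&addrs, &addr_cnt, true))
          printf("%zu traceable kernel addresses\n", addr_cnt);

  free(syms);
  free(addrs);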
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
.../bpf/prog_tests/kprobe_multi_test.c | 220 +-----------------
tools/testing/selftests/bpf/trace_helpers.c | 214 +++++++++++++++++
tools/testing/selftests/bpf/trace_helpers.h | 3 +
3 files changed, 220 insertions(+), 217 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
index e19ef509ebf8..171706e78da8 100644
--- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c
@@ -422,220 +422,6 @@ static void test_unique_match(void)
kprobe_multi__destroy(skel);
}
-static size_t symbol_hash(long key, void *ctx __maybe_unused)
-{
- return str_hash((const char *) key);
-}
-
-static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused)
-{
- return strcmp((const char *) key1, (const char *) key2) == 0;
-}
-
-static bool is_invalid_entry(char *buf, bool kernel)
-{
- if (kernel && strchr(buf, '['))
- return true;
- if (!kernel && !strchr(buf, '['))
- return true;
- return false;
-}
-
-static bool skip_entry(char *name)
-{
- /*
- * We attach to almost all kernel functions and some of them
- * will cause 'suspicious RCU usage' when fprobe is attached
- * to them. Filter out the current culprits - arch_cpu_idle
- * default_idle and rcu_* functions.
- */
- if (!strcmp(name, "arch_cpu_idle"))
- return true;
- if (!strcmp(name, "default_idle"))
- return true;
- if (!strncmp(name, "rcu_", 4))
- return true;
- if (!strcmp(name, "bpf_dispatcher_xdp_func"))
- return true;
- if (!strncmp(name, "__ftrace_invalid_address__",
- sizeof("__ftrace_invalid_address__") - 1))
- return true;
- return false;
-}
-
-/* Do comparision by ignoring '.llvm.<hash>' suffixes. */
-static int compare_name(const char *name1, const char *name2)
-{
- const char *res1, *res2;
- int len1, len2;
-
- res1 = strstr(name1, ".llvm.");
- res2 = strstr(name2, ".llvm.");
- len1 = res1 ? res1 - name1 : strlen(name1);
- len2 = res2 ? res2 - name2 : strlen(name2);
-
- if (len1 == len2)
- return strncmp(name1, name2, len1);
- if (len1 < len2)
- return strncmp(name1, name2, len1) <= 0 ? -1 : 1;
- return strncmp(name1, name2, len2) >= 0 ? 1 : -1;
-}
-
-static int load_kallsyms_compare(const void *p1, const void *p2)
-{
- return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name);
-}
-
-static int search_kallsyms_compare(const void *p1, const struct ksym *p2)
-{
- return compare_name(p1, p2->name);
-}
-
-static int get_syms(char ***symsp, size_t *cntp, bool kernel)
-{
- size_t cap = 0, cnt = 0;
- char *name = NULL, *ksym_name, **syms = NULL;
- struct hashmap *map;
- struct ksyms *ksyms;
- struct ksym *ks;
- char buf[256];
- FILE *f;
- int err = 0;
-
- ksyms = load_kallsyms_custom_local(load_kallsyms_compare);
- if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_custom_local"))
- return -EINVAL;
-
- /*
- * The available_filter_functions contains many duplicates,
- * but other than that all symbols are usable in kprobe multi
- * interface.
- * Filtering out duplicates by using hashmap__add, which won't
- * add existing entry.
- */
-
- if (access("/sys/kernel/tracing/trace", F_OK) == 0)
- f = fopen("/sys/kernel/tracing/available_filter_functions", "r");
- else
- f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r");
-
- if (!f)
- return -EINVAL;
-
- map = hashmap__new(symbol_hash, symbol_equal, NULL);
- if (IS_ERR(map)) {
- err = libbpf_get_error(map);
- goto error;
- }
-
- while (fgets(buf, sizeof(buf), f)) {
- if (is_invalid_entry(buf, kernel))
- continue;
-
- free(name);
- if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
- continue;
- if (skip_entry(name))
- continue;
-
- ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
- if (!ks) {
- err = -EINVAL;
- goto error;
- }
-
- ksym_name = ks->name;
- err = hashmap__add(map, ksym_name, 0);
- if (err == -EEXIST) {
- err = 0;
- continue;
- }
- if (err)
- goto error;
-
- err = libbpf_ensure_mem((void **) &syms, &cap,
- sizeof(*syms), cnt + 1);
- if (err)
- goto error;
-
- syms[cnt++] = ksym_name;
- }
-
- *symsp = syms;
- *cntp = cnt;
-
-error:
- free(name);
- fclose(f);
- hashmap__free(map);
- if (err)
- free(syms);
- return err;
-}
-
-static int get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
-{
- unsigned long *addr, *addrs, *tmp_addrs;
- int err = 0, max_cnt, inc_cnt;
- char *name = NULL;
- size_t cnt = 0;
- char buf[256];
- FILE *f;
-
- if (access("/sys/kernel/tracing/trace", F_OK) == 0)
- f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r");
- else
- f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r");
-
- if (!f)
- return -ENOENT;
-
- /* In my local setup, the number of entries is 50k+ so Let us initially
- * allocate space to hold 64k entries. If 64k is not enough, incrementally
- * increase 1k each time.
- */
- max_cnt = 65536;
- inc_cnt = 1024;
- addrs = malloc(max_cnt * sizeof(long));
- if (addrs == NULL) {
- err = -ENOMEM;
- goto error;
- }
-
- while (fgets(buf, sizeof(buf), f)) {
- if (is_invalid_entry(buf, kernel))
- continue;
-
- free(name);
- if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
- continue;
- if (skip_entry(name))
- continue;
-
- if (cnt == max_cnt) {
- max_cnt += inc_cnt;
- tmp_addrs = realloc(addrs, max_cnt);
- if (!tmp_addrs) {
- err = -ENOMEM;
- goto error;
- }
- addrs = tmp_addrs;
- }
-
- addrs[cnt++] = (unsigned long)addr;
- }
-
- *addrsp = addrs;
- *cntp = cnt;
-
-error:
- free(name);
- fclose(f);
- if (err)
- free(addrs);
- return err;
-}
-
static void do_bench_test(struct kprobe_multi_empty *skel, struct bpf_kprobe_multi_opts *opts)
{
long attach_start_ns, attach_end_ns;
@@ -670,7 +456,7 @@ static void test_kprobe_multi_bench_attach(bool kernel)
char **syms = NULL;
size_t cnt = 0;
- if (!ASSERT_OK(get_syms(&syms, &cnt, kernel), "get_syms"))
+ if (!ASSERT_OK(bpf_get_ksyms(&syms, &cnt, kernel), "bpf_get_ksyms"))
return;
skel = kprobe_multi_empty__open_and_load();
@@ -696,13 +482,13 @@ static void test_kprobe_multi_bench_attach_addr(bool kernel)
size_t cnt = 0;
int err;
- err = get_addrs(&addrs, &cnt, kernel);
+ err = bpf_get_addrs(&addrs, &cnt, kernel);
if (err == -ENOENT) {
test__skip();
return;
}
- if (!ASSERT_OK(err, "get_addrs"))
+ if (!ASSERT_OK(err, "bpf_get_addrs"))
return;
skel = kprobe_multi_empty__open_and_load();
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 81943c6254e6..d24baf244d1f 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -17,6 +17,7 @@
#include <linux/limits.h>
#include <libelf.h>
#include <gelf.h>
+#include "bpf/hashmap.h"
#include "bpf/libbpf_internal.h"
#define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe"
@@ -519,3 +520,216 @@ void read_trace_pipe(void)
{
read_trace_pipe_iter(trace_pipe_cb, NULL, 0);
}
+
+static size_t symbol_hash(long key, void *ctx __maybe_unused)
+{
+ return str_hash((const char *) key);
+}
+
+static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused)
+{
+ return strcmp((const char *) key1, (const char *) key2) == 0;
+}
+
+static bool is_invalid_entry(char *buf, bool kernel)
+{
+ if (kernel && strchr(buf, '['))
+ return true;
+ if (!kernel && !strchr(buf, '['))
+ return true;
+ return false;
+}
+
+static bool skip_entry(char *name)
+{
+ /*
+ * We attach to almost all kernel functions and some of them
+ * will cause 'suspicious RCU usage' when fprobe is attached
+ * to them. Filter out the current culprits - arch_cpu_idle
+ * default_idle and rcu_* functions.
+ */
+ if (!strcmp(name, "arch_cpu_idle"))
+ return true;
+ if (!strcmp(name, "default_idle"))
+ return true;
+ if (!strncmp(name, "rcu_", 4))
+ return true;
+ if (!strcmp(name, "bpf_dispatcher_xdp_func"))
+ return true;
+ if (!strncmp(name, "__ftrace_invalid_address__",
+ sizeof("__ftrace_invalid_address__") - 1))
+ return true;
+ return false;
+}
+
+/* Do comparison by ignoring '.llvm.<hash>' suffixes. */
+static int compare_name(const char *name1, const char *name2)
+{
+ const char *res1, *res2;
+ int len1, len2;
+
+ res1 = strstr(name1, ".llvm.");
+ res2 = strstr(name2, ".llvm.");
+ len1 = res1 ? res1 - name1 : strlen(name1);
+ len2 = res2 ? res2 - name2 : strlen(name2);
+
+ if (len1 == len2)
+ return strncmp(name1, name2, len1);
+ if (len1 < len2)
+ return strncmp(name1, name2, len1) <= 0 ? -1 : 1;
+ return strncmp(name1, name2, len2) >= 0 ? 1 : -1;
+}
+
+static int load_kallsyms_compare(const void *p1, const void *p2)
+{
+ return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name);
+}
+
+static int search_kallsyms_compare(const void *p1, const struct ksym *p2)
+{
+ return compare_name(p1, p2->name);
+}
+
+int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel)
+{
+ size_t cap = 0, cnt = 0;
+ char *name = NULL, *ksym_name, **syms = NULL;
+ struct hashmap *map;
+ struct ksyms *ksyms;
+ struct ksym *ks;
+ char buf[256];
+ FILE *f;
+ int err = 0;
+
+ ksyms = load_kallsyms_custom_local(load_kallsyms_compare);
+ if (!ksyms)
+ return -EINVAL;
+
+ /*
+ * The available_filter_functions contains many duplicates,
+ * but other than that all symbols are usable to trace.
+ * Filtering out duplicates by using hashmap__add, which won't
+ * add existing entry.
+ */
+
+ if (access("/sys/kernel/tracing/trace", F_OK) == 0)
+ f = fopen("/sys/kernel/tracing/available_filter_functions", "r");
+ else
+ f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r");
+
+ if (!f)
+ return -EINVAL;
+
+ map = hashmap__new(symbol_hash, symbol_equal, NULL);
+ if (IS_ERR(map)) {
+ err = libbpf_get_error(map);
+ goto error;
+ }
+
+ while (fgets(buf, sizeof(buf), f)) {
+ if (is_invalid_entry(buf, kernel))
+ continue;
+
+ free(name);
+ if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1)
+ continue;
+ if (skip_entry(name))
+ continue;
+
+ ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare);
+ if (!ks) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ ksym_name = ks->name;
+ err = hashmap__add(map, ksym_name, 0);
+ if (err == -EEXIST) {
+ err = 0;
+ continue;
+ }
+ if (err)
+ goto error;
+
+ err = libbpf_ensure_mem((void **) &syms, &cap,
+ sizeof(*syms), cnt + 1);
+ if (err)
+ goto error;
+
+ syms[cnt++] = ksym_name;
+ }
+
+ *symsp = syms;
+ *cntp = cnt;
+
+error:
+ free(name);
+ fclose(f);
+ hashmap__free(map);
+ if (err)
+ free(syms);
+ return err;
+}
+
+int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel)
+{
+ unsigned long *addr, *addrs, *tmp_addrs;
+ int err = 0, max_cnt, inc_cnt;
+ char *name = NULL;
+ size_t cnt = 0;
+ char buf[256];
+ FILE *f;
+
+ if (access("/sys/kernel/tracing/trace", F_OK) == 0)
+ f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r");
+ else
+ f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r");
+
+ if (!f)
+ return -ENOENT;
+
+ /* In my local setup, the number of entries is 50k+ so let us initially
+ * allocate space to hold 64k entries. If 64k is not enough, incrementally
+ * increase by 1k each time.
+ */
+ max_cnt = 65536;
+ inc_cnt = 1024;
+ addrs = malloc(max_cnt * sizeof(long));
+ if (addrs == NULL) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ while (fgets(buf, sizeof(buf), f)) {
+ if (is_invalid_entry(buf, kernel))
+ continue;
+
+ free(name);
+ if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2)
+ continue;
+ if (skip_entry(name))
+ continue;
+
+ if (cnt == max_cnt) {
+ max_cnt += inc_cnt;
+ tmp_addrs = realloc(addrs, max_cnt * sizeof(long));
+ if (!tmp_addrs) {
+ err = -ENOMEM;
+ goto error;
+ }
+ addrs = tmp_addrs;
+ }
+
+ addrs[cnt++] = (unsigned long)addr;
+ }
+
+ *addrsp = addrs;
+ *cntp = cnt;
+
+error:
+ free(name);
+ fclose(f);
+ if (err)
+ free(addrs);
+ return err;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 2ce873c9f9aa..9437bdd4afa5 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -41,4 +41,7 @@ ssize_t get_rel_offset(uintptr_t addr);
int read_build_id(const char *path, char *build_id, size_t size);
+int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel);
+int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel);
+
#endif
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 17/18] selftests/bpf: add basic testcases for tracing_multi
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (15 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 16/18] selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 18/18] selftests/bpf: add bench tests " Menglong Dong
17 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, Mykola Lysenko, linux-kernel, linux-kselftest,
linux-stm32, linux-arm-kernel
Add some testcases for the following attach types:
BPF_TRACE_FENTRY_MULTI
BPF_TRACE_FEXIT_MULTI
BPF_MODIFY_RETURN_MULTI
We reuse the tests in fentry_test.c, fexit_test.c and modify_return.c by
attaching the tracing BPF progs as tracing_multi.
We also add some functions for bpf_get_ksyms() to skip for tracing progs:
the functions in "btf_id_deny" should be skipped. What's more, the kernel
can't find the right function address from the btf type id when duplicated
function names exist, so we skip the functions that we have hit so far.
The list is not complete, so attaching FENTRY_MULTI to all the kernel
functions can still fail. This is something we need to fix in the future.
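The reuse pattern, roughly (it mirrors the bpf_to_tracing_multi() and
bpf_attach_as_tracing_multi() helpers added to test_progs.c below; the
fentry_test skeleton is just one example):

  skel = fentry_test__open();
  prog_cnt = sizeof(skel->progs) / sizeof(long);

  /* FENTRY/FEXIT/MODIFY_RETURN -> the corresponding *_MULTI types */
  err = bpf_to_tracing_multi((void *)&skel->progs, prog_cnt);
  if (!err)
          err = fentry_test__load(skel);

  /* attach each prog to its own attach_btf_id as tracing_multi */
  if (!err)
          err = bpf_attach_as_tracing_multi((void *)&skel->progs, prog_cnt,
                                            (void *)&skel->links);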
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
tools/testing/selftests/bpf/Makefile | 2 +-
.../selftests/bpf/prog_tests/fentry_fexit.c | 22 +-
.../selftests/bpf/prog_tests/fentry_test.c | 79 +++++--
.../selftests/bpf/prog_tests/fexit_test.c | 79 +++++--
.../selftests/bpf/prog_tests/modify_return.c | 60 +++++
.../bpf/prog_tests/tracing_multi_link.c | 210 ++++++++++++++++++
.../selftests/bpf/progs/fentry_multi_empty.c | 13 ++
.../selftests/bpf/progs/tracing_multi_test.c | 181 +++++++++++++++
.../selftests/bpf/test_kmods/bpf_testmod.c | 24 ++
tools/testing/selftests/bpf/test_progs.c | 50 +++++
tools/testing/selftests/bpf/test_progs.h | 3 +
tools/testing/selftests/bpf/trace_helpers.c | 69 ++++++
12 files changed, 744 insertions(+), 48 deletions(-)
create mode 100644 tools/testing/selftests/bpf/prog_tests/tracing_multi_link.c
create mode 100644 tools/testing/selftests/bpf/progs/fentry_multi_empty.c
create mode 100644 tools/testing/selftests/bpf/progs/tracing_multi_test.c
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 4863106034df..1fa0da096262 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -496,7 +496,7 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \
test_subskeleton.skel.h test_subskeleton_lib.skel.h \
test_usdt.skel.h
-LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \
+LSKELS := fexit_sleep.c atomics.c \
trace_printk.c trace_vprintk.c map_ptr_kern.c \
core_kern.c core_kern_overflow.c test_ringbuf.c \
test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c
index 130f5b82d2e6..84cc8b669684 100644
--- a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c
+++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c
@@ -1,32 +1,32 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <test_progs.h>
-#include "fentry_test.lskel.h"
-#include "fexit_test.lskel.h"
+#include "fentry_test.skel.h"
+#include "fexit_test.skel.h"
void test_fentry_fexit(void)
{
- struct fentry_test_lskel *fentry_skel = NULL;
- struct fexit_test_lskel *fexit_skel = NULL;
+ struct fentry_test *fentry_skel = NULL;
+ struct fexit_test *fexit_skel = NULL;
__u64 *fentry_res, *fexit_res;
int err, prog_fd, i;
LIBBPF_OPTS(bpf_test_run_opts, topts);
- fentry_skel = fentry_test_lskel__open_and_load();
+ fentry_skel = fentry_test__open_and_load();
if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load"))
goto close_prog;
- fexit_skel = fexit_test_lskel__open_and_load();
+ fexit_skel = fexit_test__open_and_load();
if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load"))
goto close_prog;
- err = fentry_test_lskel__attach(fentry_skel);
+ err = fentry_test__attach(fentry_skel);
if (!ASSERT_OK(err, "fentry_attach"))
goto close_prog;
- err = fexit_test_lskel__attach(fexit_skel);
+ err = fexit_test__attach(fexit_skel);
if (!ASSERT_OK(err, "fexit_attach"))
goto close_prog;
- prog_fd = fexit_skel->progs.test1.prog_fd;
+ prog_fd = bpf_program__fd(fexit_skel->progs.test1);
err = bpf_prog_test_run_opts(prog_fd, &topts);
ASSERT_OK(err, "ipv6 test_run");
ASSERT_OK(topts.retval, "ipv6 test retval");
@@ -40,6 +40,6 @@ void test_fentry_fexit(void)
}
close_prog:
- fentry_test_lskel__destroy(fentry_skel);
- fexit_test_lskel__destroy(fexit_skel);
+ fentry_test__destroy(fentry_skel);
+ fexit_test__destroy(fexit_skel);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c
index aee1bc77a17f..9edd383feabd 100644
--- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c
@@ -1,26 +1,16 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <test_progs.h>
-#include "fentry_test.lskel.h"
+#include "fentry_test.skel.h"
#include "fentry_many_args.skel.h"
-static int fentry_test_common(struct fentry_test_lskel *fentry_skel)
+static int fentry_test_check(struct fentry_test *fentry_skel)
{
+ LIBBPF_OPTS(bpf_test_run_opts, topts);
int err, prog_fd, i;
- int link_fd;
__u64 *result;
- LIBBPF_OPTS(bpf_test_run_opts, topts);
-
- err = fentry_test_lskel__attach(fentry_skel);
- if (!ASSERT_OK(err, "fentry_attach"))
- return err;
- /* Check that already linked program can't be attached again. */
- link_fd = fentry_test_lskel__test1__attach(fentry_skel);
- if (!ASSERT_LT(link_fd, 0, "fentry_attach_link"))
- return -1;
-
- prog_fd = fentry_skel->progs.test1.prog_fd;
+ prog_fd = bpf_program__fd(fentry_skel->progs.test1);
err = bpf_prog_test_run_opts(prog_fd, &topts);
ASSERT_OK(err, "test_run");
ASSERT_EQ(topts.retval, 0, "test_run");
@@ -31,7 +21,28 @@ static int fentry_test_common(struct fentry_test_lskel *fentry_skel)
return -1;
}
- fentry_test_lskel__detach(fentry_skel);
+ return 0;
+}
+
+static int fentry_test_common(struct fentry_test *fentry_skel)
+{
+ struct bpf_link *link;
+ int err;
+
+ err = fentry_test__attach(fentry_skel);
+ if (!ASSERT_OK(err, "fentry_attach"))
+ return err;
+
+ /* Check that already linked program can't be attached again. */
+ link = bpf_program__attach(fentry_skel->progs.test1);
+ if (!ASSERT_ERR_PTR(link, "fentry_attach_link"))
+ return -1;
+
+ err = fentry_test_check(fentry_skel);
+ if (!ASSERT_OK(err, "fentry_test_check"))
+ return err;
+
+ fentry_test__detach(fentry_skel);
/* zero results for re-attach test */
memset(fentry_skel->bss, 0, sizeof(*fentry_skel->bss));
@@ -40,10 +51,10 @@ static int fentry_test_common(struct fentry_test_lskel *fentry_skel)
static void fentry_test(void)
{
- struct fentry_test_lskel *fentry_skel = NULL;
+ struct fentry_test *fentry_skel = NULL;
int err;
- fentry_skel = fentry_test_lskel__open_and_load();
+ fentry_skel = fentry_test__open_and_load();
if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_load"))
goto cleanup;
@@ -55,7 +66,7 @@ static void fentry_test(void)
ASSERT_OK(err, "fentry_second_attach");
cleanup:
- fentry_test_lskel__destroy(fentry_skel);
+ fentry_test__destroy(fentry_skel);
}
static void fentry_many_args(void)
@@ -84,10 +95,42 @@ static void fentry_many_args(void)
fentry_many_args__destroy(fentry_skel);
}
+static void fentry_multi_test(void)
+{
+ struct fentry_test *fentry_skel = NULL;
+ int err, prog_cnt;
+
+ fentry_skel = fentry_test__open();
+ if (!ASSERT_OK_PTR(fentry_skel, "fentry_skel_open"))
+ goto cleanup;
+
+ prog_cnt = sizeof(fentry_skel->progs) / sizeof(long);
+ err = bpf_to_tracing_multi((void *)&fentry_skel->progs, prog_cnt);
+ if (!ASSERT_OK(err, "fentry_to_multi"))
+ goto cleanup;
+
+ err = fentry_test__load(fentry_skel);
+ if (!ASSERT_OK(err, "fentry_skel_load"))
+ goto cleanup;
+
+ err = bpf_attach_as_tracing_multi((void *)&fentry_skel->progs,
+ prog_cnt,
+ (void *)&fentry_skel->links);
+ if (!ASSERT_OK(err, "fentry_attach_multi"))
+ goto cleanup;
+
+ err = fentry_test_check(fentry_skel);
+ ASSERT_OK(err, "fentry_first_attach");
+cleanup:
+ fentry_test__destroy(fentry_skel);
+}
+
void test_fentry_test(void)
{
if (test__start_subtest("fentry"))
fentry_test();
+ if (test__start_subtest("fentry_multi"))
+ fentry_multi_test();
if (test__start_subtest("fentry_many_args"))
fentry_many_args();
}
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c
index 1c13007e37dd..5652d02b3ad9 100644
--- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c
@@ -1,26 +1,16 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <test_progs.h>
-#include "fexit_test.lskel.h"
+#include "fexit_test.skel.h"
#include "fexit_many_args.skel.h"
-static int fexit_test_common(struct fexit_test_lskel *fexit_skel)
+static int fexit_test_check(struct fexit_test *fexit_skel)
{
+ LIBBPF_OPTS(bpf_test_run_opts, topts);
int err, prog_fd, i;
- int link_fd;
__u64 *result;
- LIBBPF_OPTS(bpf_test_run_opts, topts);
-
- err = fexit_test_lskel__attach(fexit_skel);
- if (!ASSERT_OK(err, "fexit_attach"))
- return err;
- /* Check that already linked program can't be attached again. */
- link_fd = fexit_test_lskel__test1__attach(fexit_skel);
- if (!ASSERT_LT(link_fd, 0, "fexit_attach_link"))
- return -1;
-
- prog_fd = fexit_skel->progs.test1.prog_fd;
+ prog_fd = bpf_program__fd(fexit_skel->progs.test1);
err = bpf_prog_test_run_opts(prog_fd, &topts);
ASSERT_OK(err, "test_run");
ASSERT_EQ(topts.retval, 0, "test_run");
@@ -31,7 +21,28 @@ static int fexit_test_common(struct fexit_test_lskel *fexit_skel)
return -1;
}
- fexit_test_lskel__detach(fexit_skel);
+ return 0;
+}
+
+static int fexit_test_common(struct fexit_test *fexit_skel)
+{
+ struct bpf_link *link;
+ int err;
+
+ err = fexit_test__attach(fexit_skel);
+ if (!ASSERT_OK(err, "fexit_attach"))
+ return err;
+
+ /* Check that already linked program can't be attached again. */
+ link = bpf_program__attach(fexit_skel->progs.test1);
+ if (!ASSERT_ERR_PTR(link, "fexit_attach_link"))
+ return -1;
+
+ err = fexit_test_check(fexit_skel);
+ if (!ASSERT_OK(err, "fexit_test_check"))
+ return err;
+
+ fexit_test__detach(fexit_skel);
/* zero results for re-attach test */
memset(fexit_skel->bss, 0, sizeof(*fexit_skel->bss));
@@ -40,10 +51,10 @@ static int fexit_test_common(struct fexit_test_lskel *fexit_skel)
static void fexit_test(void)
{
- struct fexit_test_lskel *fexit_skel = NULL;
+ struct fexit_test *fexit_skel = NULL;
int err;
- fexit_skel = fexit_test_lskel__open_and_load();
+ fexit_skel = fexit_test__open_and_load();
if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_load"))
goto cleanup;
@@ -55,7 +66,7 @@ static void fexit_test(void)
ASSERT_OK(err, "fexit_second_attach");
cleanup:
- fexit_test_lskel__destroy(fexit_skel);
+ fexit_test__destroy(fexit_skel);
}
static void fexit_many_args(void)
@@ -84,10 +95,42 @@ static void fexit_many_args(void)
fexit_many_args__destroy(fexit_skel);
}
+static void fexit_test_multi(void)
+{
+ struct fexit_test *fexit_skel = NULL;
+ int err, prog_cnt;
+
+ fexit_skel = fexit_test__open();
+ if (!ASSERT_OK_PTR(fexit_skel, "fexit_skel_open"))
+ goto cleanup;
+
+ prog_cnt = sizeof(fexit_skel->progs) / sizeof(long);
+ err = bpf_to_tracing_multi((void *)&fexit_skel->progs, prog_cnt);
+ if (!ASSERT_OK(err, "fexit_to_multi"))
+ goto cleanup;
+
+ err = fexit_test__load(fexit_skel);
+ if (!ASSERT_OK(err, "fexit_skel_load"))
+ goto cleanup;
+
+ err = bpf_attach_as_tracing_multi((void *)&fexit_skel->progs,
+ prog_cnt,
+ (void *)&fexit_skel->links);
+ if (!ASSERT_OK(err, "fexit_attach_multi"))
+ goto cleanup;
+
+ err = fexit_test_check(fexit_skel);
+ ASSERT_OK(err, "fexit_first_attach");
+cleanup:
+ fexit_test__destroy(fexit_skel);
+}
+
void test_fexit_test(void)
{
if (test__start_subtest("fexit"))
fexit_test();
+ if (test__start_subtest("fexit_multi"))
+ fexit_test_multi();
if (test__start_subtest("fexit_many_args"))
fexit_many_args();
}
diff --git a/tools/testing/selftests/bpf/prog_tests/modify_return.c b/tools/testing/selftests/bpf/prog_tests/modify_return.c
index a70c99c2f8c8..3ca454379e90 100644
--- a/tools/testing/selftests/bpf/prog_tests/modify_return.c
+++ b/tools/testing/selftests/bpf/prog_tests/modify_return.c
@@ -49,6 +49,56 @@ static void run_test(__u32 input_retval, __u16 want_side_effect, __s16 want_ret)
modify_return__destroy(skel);
}
+static void run_multi_test(__u32 input_retval, __u16 want_side_effect, __s16 want_ret)
+{
+ struct modify_return *skel = NULL;
+ int err, prog_fd, prog_cnt;
+ __u16 side_effect;
+ __s16 ret;
+ LIBBPF_OPTS(bpf_test_run_opts, topts);
+
+ skel = modify_return__open();
+ if (!ASSERT_OK_PTR(skel, "skel_open"))
+ goto cleanup;
+
+ /* function args passed on the stack are not supported by the tracing
+ * multi-link yet, so we only enable the bpf progs without such args.
+ */
+ bpf_program__set_expected_attach_type(skel->progs.fentry_test,
+ BPF_TRACE_FENTRY_MULTI);
+ bpf_program__set_expected_attach_type(skel->progs.fexit_test,
+ BPF_TRACE_FEXIT_MULTI);
+ bpf_program__set_expected_attach_type(skel->progs.fmod_ret_test,
+ BPF_MODIFY_RETURN_MULTI);
+
+ err = modify_return__load(skel);
+ if (!ASSERT_OK(err, "skel_load"))
+ goto cleanup;
+
+ prog_cnt = sizeof(skel->progs) / sizeof(long);
+ err = bpf_attach_as_tracing_multi((void *)&skel->progs,
+ prog_cnt,
+ (void *)&skel->links);
+ if (!ASSERT_OK(err, "modify_return__attach failed"))
+ goto cleanup;
+
+ skel->bss->input_retval = input_retval;
+ prog_fd = bpf_program__fd(skel->progs.fmod_ret_test);
+ err = bpf_prog_test_run_opts(prog_fd, &topts);
+ ASSERT_OK(err, "test_run");
+
+ side_effect = UPPER(topts.retval);
+ ret = LOWER(topts.retval);
+
+ ASSERT_EQ(ret, want_ret, "test_run ret");
+ ASSERT_EQ(side_effect, want_side_effect, "modify_return side_effect");
+ ASSERT_EQ(skel->bss->fentry_result, 1, "modify_return fentry_result");
+ ASSERT_EQ(skel->bss->fexit_result, 1, "modify_return fexit_result");
+ ASSERT_EQ(skel->bss->fmod_ret_result, 1, "modify_return fmod_ret_result");
+cleanup:
+ modify_return__destroy(skel);
+}
+
/* TODO: conflict with get_func_ip_test */
void serial_test_modify_return(void)
{
@@ -59,3 +109,13 @@ void serial_test_modify_return(void)
0 /* want_side_effect */,
-EINVAL * 2 /* want_ret */);
}
+
+void serial_test_modify_return_multi(void)
+{
+ run_multi_test(0 /* input_retval */,
+ 2 /* want_side_effect */,
+ 33 /* want_ret */);
+ run_multi_test(-EINVAL /* input_retval */,
+ 1 /* want_side_effect */,
+ -EINVAL + 29 /* want_ret */);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_multi_link.c b/tools/testing/selftests/bpf/prog_tests/tracing_multi_link.c
new file mode 100644
index 000000000000..1cbe6089472f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_multi_link.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 ChinaTelecom */
+
+#include <test_progs.h>
+
+#include "tracing_multi_test.skel.h"
+#include "fentry_multi_empty.skel.h"
+
+static void test_run(struct tracing_multi_test *skel)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, topts);
+ int err, prog_fd;
+
+ skel->bss->pid = getpid();
+ prog_fd = bpf_program__fd(skel->progs.fentry_cookie_test1);
+ err = bpf_prog_test_run_opts(prog_fd, &topts);
+ ASSERT_OK(err, "test_run");
+ ASSERT_EQ(topts.retval, 0, "test_run");
+
+ ASSERT_EQ(skel->bss->fentry_test1_result, 1, "fentry_test1_result");
+ ASSERT_EQ(skel->bss->fentry_test2_result, 1, "fentry_test2_result");
+ ASSERT_EQ(skel->bss->fentry_test3_result, 1, "fentry_test3_result");
+ ASSERT_EQ(skel->bss->fentry_test4_result, 1, "fentry_test4_result");
+ ASSERT_EQ(skel->bss->fentry_test5_result, 1, "fentry_test5_result");
+ ASSERT_EQ(skel->bss->fentry_test6_result, 1, "fentry_test6_result");
+ ASSERT_EQ(skel->bss->fentry_test7_result, 1, "fentry_test7_result");
+ ASSERT_EQ(skel->bss->fentry_test8_result, 1, "fentry_test8_result");
+}
+
+static void test_skel_auto_api(void)
+{
+ struct tracing_multi_test *skel;
+ int err;
+
+ skel = tracing_multi_test__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "tracing_multi_test__open_and_load"))
+ return;
+
+ /* disable all programs that should fail */
+ bpf_program__set_autoattach(skel->progs.fentry_fail_test1, false);
+ bpf_program__set_autoattach(skel->progs.fentry_fail_test2, false);
+ bpf_program__set_autoattach(skel->progs.fentry_fail_test3, false);
+ bpf_program__set_autoattach(skel->progs.fentry_fail_test4, false);
+ bpf_program__set_autoattach(skel->progs.fentry_fail_test5, false);
+ bpf_program__set_autoattach(skel->progs.fentry_fail_test6, false);
+
+ bpf_program__set_autoattach(skel->progs.fexit_fail_test1, false);
+ bpf_program__set_autoattach(skel->progs.fexit_fail_test2, false);
+ bpf_program__set_autoattach(skel->progs.fexit_fail_test3, false);
+
+ err = tracing_multi_test__attach(skel);
+ if (!ASSERT_OK(err, "tracing_multi_test__attach"))
+ goto cleanup;
+
+ test_run(skel);
+
+cleanup:
+ tracing_multi_test__destroy(skel);
+}
+
+static int attach_bpf(struct bpf_program *prog, struct bpf_link **link_ptr,
+ bool success)
+{
+ struct bpf_link *link;
+ int err;
+
+ link = bpf_program__attach(prog);
+ err = libbpf_get_error(link);
+ if (!ASSERT_OK(success ? err : !err, "attach_bpf"))
+ return err;
+ *link_ptr = link;
+
+ return 0;
+}
+
+#define attach_skel_bpf(name, success) \
+ attach_bpf(skel->progs.name, &skel->links.name, success)
+
+static void test_skel_manual_api(void)
+{
+ struct tracing_multi_test *skel;
+
+ skel = tracing_multi_test__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "tracing_multi_test__open_and_load"))
+ return;
+
+ if (attach_skel_bpf(fentry_success_test1, true) ||
+ attach_skel_bpf(fentry_success_test2, true) ||
+ attach_skel_bpf(fentry_success_test3, true) ||
+ attach_skel_bpf(fentry_success_test4, true) ||
+ attach_skel_bpf(fexit_success_test1, true) ||
+ attach_skel_bpf(fexit_success_test2, true) ||
+ attach_skel_bpf(fentry_fail_test1, false) ||
+ attach_skel_bpf(fentry_fail_test2, false) ||
+ attach_skel_bpf(fentry_fail_test3, false) ||
+ attach_skel_bpf(fentry_fail_test4, false) ||
+ attach_skel_bpf(fentry_fail_test5, false) ||
+ attach_skel_bpf(fentry_fail_test6, false) ||
+ attach_skel_bpf(fexit_fail_test1, false) ||
+ attach_skel_bpf(fexit_fail_test2, false) ||
+ attach_skel_bpf(fexit_fail_test3, false) ||
+ attach_skel_bpf(fentry_cookie_test1, true))
+ goto cleanup;
+
+ test_run(skel);
+
+cleanup:
+ tracing_multi_test__destroy(skel);
+}
+
+static void test_attach_api(void)
+{
+ LIBBPF_OPTS(bpf_trace_multi_opts, opts);
+ struct tracing_multi_test *skel;
+ struct bpf_link *link;
+ const char *syms[8] = {
+ "bpf_fentry_test1",
+ "bpf_fentry_test2",
+ "bpf_fentry_test3",
+ "bpf_fentry_test4",
+ "bpf_fentry_test5",
+ "bpf_fentry_test6",
+ "bpf_fentry_test7",
+ "bpf_fentry_test8",
+ };
+ __u64 cookies[] = {1, 7, 2, 3, 4, 5, 6, 8};
+
+ skel = tracing_multi_test__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "tracing_multi_test__open_and_load"))
+ return;
+
+ opts.syms = syms;
+ opts.cookies = cookies;
+ opts.cnt = ARRAY_SIZE(syms);
+ link = bpf_program__attach_trace_multi_opts(skel->progs.fentry_cookie_test1,
+ &opts);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_trace_multi_opts"))
+ goto cleanup;
+ skel->links.fentry_cookie_test1 = link;
+
+ skel->bss->test_cookie = true;
+ test_run(skel);
+cleanup:
+ tracing_multi_test__destroy(skel);
+}
+
+static void test_attach_bench(bool kernel)
+{
+ LIBBPF_OPTS(bpf_trace_multi_opts, opts);
+ struct fentry_multi_empty *skel;
+ long attach_start_ns, attach_end_ns;
+ long detach_start_ns, detach_end_ns;
+ double attach_delta, detach_delta;
+ struct bpf_link *link = NULL;
+ char **syms = NULL;
+ size_t cnt = 0;
+
+ if (!ASSERT_OK(bpf_get_ksyms(&syms, &cnt, kernel), "get_syms"))
+ return;
+
+ skel = fentry_multi_empty__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "fentry_multi_empty__open_and_load"))
+ goto cleanup;
+
+ opts.syms = (const char **) syms;
+ opts.cnt = cnt;
+ opts.skip_invalid = true;
+
+ attach_start_ns = get_time_ns();
+ link = bpf_program__attach_trace_multi_opts(skel->progs.fentry_multi_empty,
+ &opts);
+ attach_end_ns = get_time_ns();
+
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_trace_multi_opts"))
+ goto cleanup;
+
+ detach_start_ns = get_time_ns();
+ bpf_link__destroy(link);
+ detach_end_ns = get_time_ns();
+
+ attach_delta = (attach_end_ns - attach_start_ns) / 1000000000.0;
+ detach_delta = (detach_end_ns - detach_start_ns) / 1000000000.0;
+
+ printf("%s: found %lu functions\n", __func__, opts.cnt);
+ printf("%s: attached in %7.3lfs\n", __func__, attach_delta);
+ printf("%s: detached in %7.3lfs\n", __func__, detach_delta);
+
+cleanup:
+ fentry_multi_empty__destroy(skel);
+ if (syms)
+ free(syms);
+}
+
+void serial_test_tracing_multi_attach_bench(void)
+{
+ if (test__start_subtest("kernel"))
+ test_attach_bench(true);
+ if (test__start_subtest("modules"))
+ test_attach_bench(false);
+}
+
+void test_tracing_multi_attach_test(void)
+{
+ if (test__start_subtest("skel_auto_api"))
+ test_skel_auto_api();
+ if (test__start_subtest("skel_manual_api"))
+ test_skel_manual_api();
+ if (test__start_subtest("attach_api"))
+ test_attach_api();
+}
diff --git a/tools/testing/selftests/bpf/progs/fentry_multi_empty.c b/tools/testing/selftests/bpf/progs/fentry_multi_empty.c
new file mode 100644
index 000000000000..a09ba216dff8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/fentry_multi_empty.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 ChinaTelecom */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("fentry.multi/bpf_fentry_test1")
+int BPF_PROG(fentry_multi_empty)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_multi_test.c b/tools/testing/selftests/bpf/progs/tracing_multi_test.c
new file mode 100644
index 000000000000..fa27851896b9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_multi_test.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 ChinaTelecom */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct bpf_testmod_struct_arg_1 {
+ int a;
+};
+struct bpf_testmod_struct_arg_2 {
+ long a;
+ long b;
+};
+
+__u64 test_result = 0;
+
+int pid = 0;
+int test_cookie = 0;
+
+__u64 fentry_test1_result = 0;
+__u64 fentry_test2_result = 0;
+__u64 fentry_test3_result = 0;
+__u64 fentry_test4_result = 0;
+__u64 fentry_test5_result = 0;
+__u64 fentry_test6_result = 0;
+__u64 fentry_test7_result = 0;
+__u64 fentry_test8_result = 0;
+
+extern const void bpf_fentry_test1 __ksym;
+extern const void bpf_fentry_test2 __ksym;
+extern const void bpf_fentry_test3 __ksym;
+extern const void bpf_fentry_test4 __ksym;
+extern const void bpf_fentry_test5 __ksym;
+extern const void bpf_fentry_test6 __ksym;
+extern const void bpf_fentry_test7 __ksym;
+extern const void bpf_fentry_test8 __ksym;
+
+SEC("fentry.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_13")
+int BPF_PROG2(fentry_success_test1, struct bpf_testmod_struct_arg_2, a)
+{
+ test_result = a.a + a.b;
+ return 0;
+}
+
+SEC("fentry.multi/bpf_testmod_test_struct_arg_2,bpf_testmod_test_struct_arg_10")
+int BPF_PROG2(fentry_success_test2, int, a, struct bpf_testmod_struct_arg_2, b)
+{
+ test_result = a + b.a + b.b;
+ return 0;
+}
+
+SEC("fentry.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_4")
+int BPF_PROG2(fentry_success_test3, struct bpf_testmod_struct_arg_2, a, int, b,
+ int, c)
+{
+ test_result = c;
+ return 0;
+}
+
+SEC("fentry.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_2")
+int BPF_PROG2(fentry_success_test4, struct bpf_testmod_struct_arg_2, a, int, b,
+ int, c)
+{
+ test_result = c;
+ return 0;
+}
+
+SEC("fentry.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_1")
+int BPF_PROG2(fentry_fail_test1, struct bpf_testmod_struct_arg_2, a)
+{
+ test_result = a.a + a.b;
+ return 0;
+}
+
+SEC("fentry.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_2")
+int BPF_PROG2(fentry_fail_test2, struct bpf_testmod_struct_arg_2, a)
+{
+ test_result = a.a + a.b;
+ return 0;
+}
+
+SEC("fentry.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_4")
+int BPF_PROG2(fentry_fail_test3, struct bpf_testmod_struct_arg_2, a)
+{
+ test_result = a.a + a.b;
+ return 0;
+}
+
+SEC("fentry.multi/bpf_testmod_test_struct_arg_2,bpf_testmod_test_struct_arg_2")
+int BPF_PROG2(fentry_fail_test4, int, a, struct bpf_testmod_struct_arg_2, b)
+{
+ test_result = a + b.a + b.b;
+ return 0;
+}
+
+SEC("fentry.multi/bpf_testmod_test_struct_arg_2,bpf_testmod_test_struct_arg_13")
+int BPF_PROG2(fentry_fail_test5, int, a, struct bpf_testmod_struct_arg_2, b)
+{
+ test_result = a + b.a + b.b;
+ return 0;
+}
+
+SEC("fentry.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_12")
+int BPF_PROG2(fentry_fail_test6, struct bpf_testmod_struct_arg_2, a, int, b,
+ int, c)
+{
+ test_result = c;
+ return 0;
+}
+
+SEC("fexit.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_2,bpf_testmod_test_struct_arg_3")
+int BPF_PROG2(fexit_success_test1, struct bpf_testmod_struct_arg_2, a, int, b,
+ int, c, int, retval)
+{
+ test_result = retval;
+ return 0;
+}
+
+SEC("fexit.multi/bpf_testmod_test_struct_arg_2,bpf_testmod_test_struct_arg_12")
+int BPF_PROG2(fexit_success_test2, int, a, struct bpf_testmod_struct_arg_2, b,
+ int, c, int, retval)
+{
+ test_result = a + b.a + b.b + retval;
+ return 0;
+}
+
+SEC("fexit.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_4")
+int BPF_PROG2(fexit_fail_test1, struct bpf_testmod_struct_arg_2, a, int, b,
+ int, c, int, retval)
+{
+ test_result = retval;
+ return 0;
+}
+
+SEC("fexit.multi/bpf_testmod_test_struct_arg_2,bpf_testmod_test_struct_arg_10")
+int BPF_PROG2(fexit_fail_test2, int, a, struct bpf_testmod_struct_arg_2, b,
+ int, c, int, retval)
+{
+ test_result = a + b.a + b.b + retval;
+ return 0;
+}
+
+SEC("fexit.multi/bpf_testmod_test_struct_arg_2,bpf_testmod_test_struct_arg_11")
+int BPF_PROG2(fexit_fail_test3, int, a, struct bpf_testmod_struct_arg_2, b,
+ int, c, int, retval)
+{
+ test_result = a + b.a + b.b + retval;
+ return 0;
+}
+
+static void tracing_multi_check_cookie(unsigned long long *ctx)
+{
+ if (bpf_get_current_pid_tgid() >> 32 != pid)
+ return;
+
+ __u64 cookie = test_cookie ? bpf_get_attach_cookie(ctx) : 0;
+ __u64 addr = bpf_get_func_ip(ctx);
+
+#define SET(__var, __addr, __cookie) ({ \
+ if (((const void *) addr == __addr) && \
+ (!test_cookie || (cookie == __cookie))) \
+ __var = 1; \
+})
+ SET(fentry_test1_result, &bpf_fentry_test1, 1);
+ SET(fentry_test2_result, &bpf_fentry_test2, 7);
+ SET(fentry_test3_result, &bpf_fentry_test3, 2);
+ SET(fentry_test4_result, &bpf_fentry_test4, 3);
+ SET(fentry_test5_result, &bpf_fentry_test5, 4);
+ SET(fentry_test6_result, &bpf_fentry_test6, 5);
+ SET(fentry_test7_result, &bpf_fentry_test7, 6);
+ SET(fentry_test8_result, &bpf_fentry_test8, 8);
+}
+
+SEC("fentry.multi/bpf_fentry_test1,bpf_fentry_test2,bpf_fentry_test3,bpf_fentry_test4,bpf_fentry_test5,bpf_fentry_test6,bpf_fentry_test7,bpf_fentry_test8")
+int BPF_PROG(fentry_cookie_test1)
+{
+ tracing_multi_check_cookie(ctx);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index e9e918cdf31f..07ea1d5d3795 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -128,6 +128,30 @@ bpf_testmod_test_struct_arg_9(u64 a, void *b, short c, int d, void *e, char f,
return bpf_testmod_test_struct_arg_result;
}
+noinline int
+bpf_testmod_test_struct_arg_10(int a, struct bpf_testmod_struct_arg_2 b) {
+ bpf_testmod_test_struct_arg_result = a + b.a + b.b;
+ return bpf_testmod_test_struct_arg_result;
+}
+
+noinline struct bpf_testmod_struct_arg_2 *
+bpf_testmod_test_struct_arg_11(int a, struct bpf_testmod_struct_arg_2 b, int c) {
+ bpf_testmod_test_struct_arg_result = a + b.a + b.b + c;
+ return (void *)bpf_testmod_test_struct_arg_result;
+}
+
+noinline int
+bpf_testmod_test_struct_arg_12(int a, struct bpf_testmod_struct_arg_2 b, int *c) {
+ bpf_testmod_test_struct_arg_result = a + b.a + b.b + *c;
+ return bpf_testmod_test_struct_arg_result;
+}
+
+noinline int
+bpf_testmod_test_struct_arg_13(struct bpf_testmod_struct_arg_2 b) {
+ bpf_testmod_test_struct_arg_result = b.a + b.b;
+ return bpf_testmod_test_struct_arg_result;
+}
+
noinline int
bpf_testmod_test_arg_ptr_to_struct(struct bpf_testmod_struct_arg_1 *a) {
bpf_testmod_test_struct_arg_result = a->a;
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 309d9d4a8ace..533b714f1ca6 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -667,6 +667,56 @@ int bpf_find_map(const char *test, struct bpf_object *obj, const char *name)
return bpf_map__fd(map);
}
+int bpf_to_tracing_multi(struct bpf_program **progs, int prog_cnt)
+{
+ enum bpf_attach_type type;
+ int i, err;
+
+ for (i = 0; i < prog_cnt; i++) {
+ type = bpf_program__get_expected_attach_type(progs[i]);
+ if (type == BPF_TRACE_FENTRY)
+ type = BPF_TRACE_FENTRY_MULTI;
+ else if (type == BPF_TRACE_FEXIT)
+ type = BPF_TRACE_FEXIT_MULTI;
+ else if (type == BPF_MODIFY_RETURN)
+ type = BPF_MODIFY_RETURN_MULTI;
+ else
+ continue;
+ err = bpf_program__set_expected_attach_type(progs[i], type);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int bpf_attach_as_tracing_multi(struct bpf_program **progs, int prog_cnt,
+ struct bpf_link **link)
+{
+ struct bpf_link *__link;
+ int err, type;
+
+ for (int i = 0; i < prog_cnt; i++) {
+ LIBBPF_OPTS(bpf_trace_multi_opts, opts);
+
+ type = bpf_program__get_expected_attach_type(progs[i]);
+ if (type != BPF_TRACE_FENTRY_MULTI &&
+ type != BPF_TRACE_FEXIT_MULTI &&
+ type != BPF_MODIFY_RETURN_MULTI)
+ continue;
+
+ opts.attach_tracing = true;
+ __link = bpf_program__attach_trace_multi_opts(progs[i], &opts);
+ err = libbpf_get_error(__link);
+ if (err)
+ return err;
+
+ link[i] = __link;
+ }
+
+ return 0;
+}
+
int compare_map_keys(int map1_fd, int map2_fd)
{
__u32 key, next_key;
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index df2222a1806f..7e30c6dbf35c 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -496,6 +496,9 @@ int trigger_module_test_write(int write_sz);
int write_sysctl(const char *sysctl, const char *value);
int get_bpf_max_tramp_links_from(struct btf *btf);
int get_bpf_max_tramp_links(void);
+int bpf_to_tracing_multi(struct bpf_program **progs, int prog_cnt);
+int bpf_attach_as_tracing_multi(struct bpf_program **progs, int prog_cnt,
+ struct bpf_link **link);
struct netns_obj;
struct netns_obj *netns_new(const char *name, bool open);
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index d24baf244d1f..a9e9dd3be226 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -559,6 +559,75 @@ static bool skip_entry(char *name)
if (!strncmp(name, "__ftrace_invalid_address__",
sizeof("__ftrace_invalid_address__") - 1))
return true;
+
+ /* skip functions in "btf_id_deny" */
+ if (!strcmp(name, "migrate_disable"))
+ return true;
+ if (!strcmp(name, "migrate_enable"))
+ return true;
+ if (!strcmp(name, "rcu_read_unlock_strict"))
+ return true;
+ if (!strcmp(name, "preempt_count_add"))
+ return true;
+ if (!strcmp(name, "preempt_count_sub"))
+ return true;
+ if (!strcmp(name, "__rcu_read_lock"))
+ return true;
+ if (!strcmp(name, "__rcu_read_unlock"))
+ return true;
+
+ /* The following symbols have multiple definitions in kallsyms; take
+ * "t_next" for example:
+ *
+ * ffffffff813c10d0 t t_next
+ * ffffffff813d31b0 t t_next
+ * ffffffff813e06b0 t t_next
+ * ffffffff813eb360 t t_next
+ * ffffffff81613360 t t_next
+ *
+ * but only one of them has a corresponding mrecord:
+ * ffffffff81613364 t_next
+ *
+ * During attaching, the kernel looks up the target function
+ * address by the symbol name "t_next" with kallsyms_lookup_name(),
+ * which can match the function at 0xffffffff813c10d0 that has no
+ * corresponding mrecord, and the attach then fails. Skip
+ * functions like this.
+ *
+ * The list may not be complete, so attaching can still fail. We
+ * still need a proper way to get this fully right :/
+ */
+ if (!strcmp(name, "kill_pid_usb_asyncio"))
+ return true;
+ if (!strcmp(name, "t_next"))
+ return true;
+ if (!strcmp(name, "t_stop"))
+ return true;
+ if (!strcmp(name, "t_start"))
+ return true;
+ if (!strcmp(name, "p_next"))
+ return true;
+ if (!strcmp(name, "p_stop"))
+ return true;
+ if (!strcmp(name, "p_start"))
+ return true;
+ if (!strcmp(name, "mem32_serial_out"))
+ return true;
+ if (!strcmp(name, "mem32_serial_in"))
+ return true;
+ if (!strcmp(name, "io_serial_in"))
+ return true;
+ if (!strcmp(name, "io_serial_out"))
+ return true;
+ if (!strcmp(name, "event_callback"))
+ return true;
+ if (!strcmp(name, "amd_pmu_init"))
+ return true;
+ if (!strcmp(name, "sync_regs"))
+ return true;
+ if (!strcmp(name, "empty"))
+ return true;
+
return false;
}
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* [PATCH bpf-next v2 18/18] selftests/bpf: add bench tests for tracing_multi
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
` (16 preceding siblings ...)
2025-07-03 12:15 ` [PATCH bpf-next v2 17/18] selftests/bpf: add basic testcases for tracing_multi Menglong Dong
@ 2025-07-03 12:15 ` Menglong Dong
17 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-03 12:15 UTC (permalink / raw)
To: alexei.starovoitov, rostedt, jolsa
Cc: bpf, Menglong Dong, Mykola Lysenko, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, linux-kselftest,
linux-kernel
Add bench testcases for fentry_multi, fexit_multi and fmodret_multi in
bench_trigger.c.
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
v2:
- use the existing bpf bench framework instead of introducing a new one
---
tools/testing/selftests/bpf/bench.c | 8 +++
.../selftests/bpf/benchs/bench_trigger.c | 72 +++++++++++++++++++
.../selftests/bpf/benchs/run_bench_trigger.sh | 1 +
.../selftests/bpf/progs/trigger_bench.c | 22 ++++++
4 files changed, 103 insertions(+)
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index ddd73d06a1eb..32f1e2e936c0 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -510,8 +510,12 @@ extern const struct bench bench_trig_kretprobe;
extern const struct bench bench_trig_kprobe_multi;
extern const struct bench bench_trig_kretprobe_multi;
extern const struct bench bench_trig_fentry;
+extern const struct bench bench_trig_fentry_multi;
+extern const struct bench bench_trig_fentry_multi_all;
extern const struct bench bench_trig_fexit;
+extern const struct bench bench_trig_fexit_multi;
extern const struct bench bench_trig_fmodret;
+extern const struct bench bench_trig_fmodret_multi;
extern const struct bench bench_trig_tp;
extern const struct bench bench_trig_rawtp;
@@ -578,8 +582,12 @@ static const struct bench *benchs[] = {
&bench_trig_kprobe_multi,
&bench_trig_kretprobe_multi,
&bench_trig_fentry,
+ &bench_trig_fentry_multi,
+ &bench_trig_fentry_multi_all,
&bench_trig_fexit,
+ &bench_trig_fexit_multi,
&bench_trig_fmodret,
+ &bench_trig_fmodret_multi,
&bench_trig_tp,
&bench_trig_rawtp,
/* uprobes */
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 82327657846e..a1844ee358f1 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -226,6 +226,54 @@ static void trigger_fentry_setup(void)
attach_bpf(ctx.skel->progs.bench_trigger_fentry);
}
+static void trigger_fentry_multi_setup(void)
+{
+ setup_ctx();
+ bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fentry_multi, true);
+ load_ctx();
+ attach_bpf(ctx.skel->progs.bench_trigger_fentry_multi);
+}
+
+static void trigger_fentry_multi_all_setup(void)
+{
+ LIBBPF_OPTS(bpf_trace_multi_opts, opts);
+ struct bpf_program *prog;
+ struct bpf_link *link;
+ char **syms = NULL;
+ size_t cnt = 0;
+ int i;
+
+ setup_ctx();
+ prog = ctx.skel->progs.bench_trigger_fentry_multi;
+ bpf_program__set_autoload(prog, true);
+ load_ctx();
+
+ if (bpf_get_ksyms(&syms, &cnt, true)) {
+ printf("failed to get ksyms\n");
+ exit(1);
+ }
+
+ for (i = 0; i < cnt; i++) {
+ if (strcmp(syms[i], "bpf_get_numa_node_id") == 0)
+ break;
+ }
+ if (i == cnt) {
+ printf("bpf_get_numa_node_id not found in ksyms\n");
+ exit(1);
+ }
+
+ printf("found %zu ksyms\n", cnt);
+ opts.syms = (const char **) syms;
+ opts.cnt = cnt;
+ opts.skip_invalid = true;
+ link = bpf_program__attach_trace_multi_opts(prog, &opts);
+ if (!link) {
+ printf("failed to attach bench_trigger_fentry_multi to all\n");
+ exit(1);
+ }
+ ctx.skel->links.bench_trigger_fentry_multi = link;
+}
+
static void trigger_fexit_setup(void)
{
setup_ctx();
@@ -234,6 +282,14 @@ static void trigger_fexit_setup(void)
attach_bpf(ctx.skel->progs.bench_trigger_fexit);
}
+static void trigger_fexit_multi_setup(void)
+{
+ setup_ctx();
+ bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fexit_multi, true);
+ load_ctx();
+ attach_bpf(ctx.skel->progs.bench_trigger_fexit_multi);
+}
+
static void trigger_fmodret_setup(void)
{
setup_ctx();
@@ -246,6 +302,18 @@ static void trigger_fmodret_setup(void)
attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
}
+static void trigger_fmodret_multi_setup(void)
+{
+ setup_ctx();
+ bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
+ bpf_program__set_autoload(ctx.skel->progs.trigger_driver_kfunc, true);
+ bpf_program__set_autoload(ctx.skel->progs.bench_trigger_fmodret_multi, true);
+ load_ctx();
+ /* override driver program */
+ ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_driver_kfunc);
+ attach_bpf(ctx.skel->progs.bench_trigger_fmodret_multi);
+}
+
static void trigger_tp_setup(void)
{
setup_ctx();
@@ -512,8 +580,12 @@ BENCH_TRIG_KERNEL(kretprobe, "kretprobe");
BENCH_TRIG_KERNEL(kprobe_multi, "kprobe-multi");
BENCH_TRIG_KERNEL(kretprobe_multi, "kretprobe-multi");
BENCH_TRIG_KERNEL(fentry, "fentry");
+BENCH_TRIG_KERNEL(fentry_multi, "fentry-multi");
+BENCH_TRIG_KERNEL(fentry_multi_all, "fentry-multi-all");
BENCH_TRIG_KERNEL(fexit, "fexit");
+BENCH_TRIG_KERNEL(fexit_multi, "fexit-multi");
BENCH_TRIG_KERNEL(fmodret, "fmodret");
+BENCH_TRIG_KERNEL(fmodret_multi, "fmodret-multi");
BENCH_TRIG_KERNEL(tp, "tp");
BENCH_TRIG_KERNEL(rawtp, "rawtp");
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
index a690f5a68b6b..48a7f809d053 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
@@ -5,6 +5,7 @@ set -eufo pipefail
def_tests=( \
usermode-count kernel-count syscall-count \
fentry fexit fmodret \
+ fentry-multi fentry-multi-all fexit-multi fmodret-multi \
rawtp tp \
kprobe kprobe-multi \
kretprobe kretprobe-multi \
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 044a6d78923e..2ff1a7568080 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -111,6 +111,13 @@ int bench_trigger_fentry(void *ctx)
return 0;
}
+SEC("?fentry.multi/bpf_get_numa_node_id")
+int bench_trigger_fentry_multi(void *ctx)
+{
+ inc_counter();
+ return 0;
+}
+
SEC("?fexit/bpf_get_numa_node_id")
int bench_trigger_fexit(void *ctx)
{
@@ -118,6 +125,14 @@ int bench_trigger_fexit(void *ctx)
return 0;
}
+SEC("?fexit.multi/bpf_get_numa_node_id")
+int bench_trigger_fexit_multi(void *ctx)
+{
+ inc_counter();
+
+ return 0;
+}
+
SEC("?fmod_ret/bpf_modify_return_test_tp")
int bench_trigger_fmodret(void *ctx)
{
@@ -125,6 +140,13 @@ int bench_trigger_fmodret(void *ctx)
return -22;
}
+SEC("?fmod_ret.multi/bpf_modify_return_test_tp")
+int bench_trigger_fmodret_multi(void *ctx)
+{
+ inc_counter();
+ return -22;
+}
+
SEC("?tp/bpf_test_run/bpf_trigger_tp")
int bench_trigger_tp(void *ctx)
{
--
2.39.5
^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 04/18] ftrace: add reset_ftrace_direct_ips
2025-07-03 12:15 ` [PATCH bpf-next v2 04/18] ftrace: add reset_ftrace_direct_ips Menglong Dong
@ 2025-07-03 15:30 ` Steven Rostedt
2025-07-04 1:54 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Steven Rostedt @ 2025-07-03 15:30 UTC (permalink / raw)
To: Menglong Dong
Cc: alexei.starovoitov, jolsa, bpf, Menglong Dong, Mark Rutland,
Mathieu Desnoyers, linux-kernel, linux-trace-kernel
On Thu, 3 Jul 2025 20:15:07 +0800
Menglong Dong <menglong8.dong@gmail.com> wrote:
Note, the tracing subsystem uses capitalization in the subject:
ftrace: Add reset_ftrace_direct_ips
> For now, we can change the address of a direct ftrace_ops with
> modify_ftrace_direct(). However, we can't change the functions to filter
> for a direct ftrace_ops. Therefore, we introduce the function
> reset_ftrace_direct_ips() to do such things, and this function will reset
> the functions to filter for a direct ftrace_ops.
>
> This function do such thing in following steps:
>
> 1. filter out the new functions from ips that don't exist in the
> ops->func_hash->filter_hash and add them to the new hash.
> 2. add all the functions in the new ftrace_hash to direct_functions by
> ftrace_direct_update().
> 3. reset the functions to filter of the ftrace_ops to the ips with
> ftrace_set_filter_ips().
> 4. remove the functions that in the old ftrace_hash, but not in the new
> ftrace_hash from direct_functions.
Please also include a module that can be loaded for testing.
See samples/ftrace/ftrace-direct*
But make it a separate patch. And you'll need to add a test in selftests.
See tools/testing/selftests/ftrace/test.d/direct
>
> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> ---
> include/linux/ftrace.h | 7 ++++
> kernel/trace/ftrace.c | 75 ++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 82 insertions(+)
>
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index b672ca15f265..b7c60f5a4120 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -528,6 +528,8 @@ int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr);
>
> void ftrace_stub_direct_tramp(void);
>
> +int reset_ftrace_direct_ips(struct ftrace_ops *ops, unsigned long *ips,
> + unsigned int cnt);
> #else
> struct ftrace_ops;
> static inline unsigned long ftrace_find_rec_direct(unsigned long ip)
> @@ -551,6 +553,11 @@ static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned l
> {
> return -ENODEV;
> }
> +static inline int reset_ftrace_direct_ips(struct ftrace_ops *ops, unsigned long *ips,
> + unsigned int cnt)
> +{
> + return -ENODEV;
> +}
>
> /*
> * This must be implemented by the architecture.
> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index f5f6d7bc26f0..db3aa61889d3 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -6224,6 +6224,81 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
> return err;
> }
> EXPORT_SYMBOL_GPL(modify_ftrace_direct);
> +
> +/* reset the ips for a direct ftrace (add or remove) */
As this function is being used externally, it requires proper KernelDoc
headers.
What exactly do you mean by "reset"?
> +int reset_ftrace_direct_ips(struct ftrace_ops *ops, unsigned long *ips,
> + unsigned int cnt)
> +{
> + struct ftrace_hash *hash, *free_hash;
> + struct ftrace_func_entry *entry, *del;
> + unsigned long ip;
> + int err, size;
> +
> + if (check_direct_multi(ops))
> + return -EINVAL;
> + if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
> + return -EINVAL;
> +
> + mutex_lock(&direct_mutex);
> + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
> + if (!hash) {
> + err = -ENOMEM;
> + goto out_unlock;
> + }
> +
> + /* find out the new functions from ips and add to hash */
Capitalize comment: /* Find out ...
> + for (int i = 0; i < cnt; i++) {
> + ip = ftrace_location(ips[i]);
> + if (!ip) {
> + err = -ENOENT;
> + goto out_unlock;
> + }
> + if (__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
> + continue;
> + err = __ftrace_match_addr(hash, ip, 0);
> + if (err)
> + goto out_unlock;
> + }
> +
> + free_hash = direct_functions;
Add newline.
> + /* add the new ips to direct hash. */
Again capitalize.
> + err = ftrace_direct_update(hash, ops->direct_call);
> + if (err)
> + goto out_unlock;
> +
> + if (free_hash && free_hash != EMPTY_HASH)
> + call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
Since the above is now used more than once, let's make it into a helper
function so that if things change, there's only one place to change it:
free_ftrace_direct(free_hash);
static inline void free_ftrace_direct(struct ftrace_hash *hash)
{
if (hash && hash != EMPTY_HASH)
call_rcu_tasks(&hash->rcu, register_ftrace_direct_cb);
}
> +
> + free_ftrace_hash(hash);
> + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
> + ops->func_hash->filter_hash);
> + if (!hash) {
> + err = -ENOMEM;
> + goto out_unlock;
> + }
> + err = ftrace_set_filter_ips(ops, ips, cnt, 0, 1);
> +
> + /* remove the entries that don't exist in our filter_hash anymore
> + * from the direct_functions.
> + */
This isn't the network subsystem, we use the default comment style for multiple lines:
/*
* line 1
* line 2
* ...
*/
-- Steve
> + size = 1 << hash->size_bits;
> + for (int i = 0; i < size; i++) {
> + hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
> + if (__ftrace_lookup_ip(ops->func_hash->filter_hash, entry->ip))
> + continue;
> + del = __ftrace_lookup_ip(direct_functions, entry->ip);
> + if (del && del->direct == ops->direct_call) {
> + remove_hash_entry(direct_functions, del);
> + kfree(del);
> + }
> + }
> + }
> +out_unlock:
> + mutex_unlock(&direct_mutex);
> + if (hash)
> + free_ftrace_hash(hash);
> + return err;
> +}
> #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
>
> /**
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 04/18] ftrace: add reset_ftrace_direct_ips
2025-07-03 15:30 ` Steven Rostedt
@ 2025-07-04 1:54 ` Menglong Dong
2025-07-07 18:52 ` Steven Rostedt
0 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-04 1:54 UTC (permalink / raw)
To: Steven Rostedt
Cc: alexei.starovoitov, jolsa, bpf, Menglong Dong, Mark Rutland,
Mathieu Desnoyers, linux-kernel, linux-trace-kernel
On Thu, Jul 3, 2025 at 11:30 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Thu, 3 Jul 2025 20:15:07 +0800
> Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> Note, the tracing subsystem uses capitalization in the subject:
>
> ftrace: Add reset_ftrace_direct_ips
Hi, Steven. Thanks for your feedback. I'll keep this point
in mind. I was wondering why Alexei changed the "make" to "Make"
in c11f34e30088 :/
>
>
> > For now, we can change the address of a direct ftrace_ops with
> > modify_ftrace_direct(). However, we can't change the functions to filter
> > for a direct ftrace_ops. Therefore, we introduce the function
> > reset_ftrace_direct_ips() to do such things, and this function will reset
> > the functions to filter for a direct ftrace_ops.
> >
> > This function do such thing in following steps:
> >
> > 1. filter out the new functions from ips that don't exist in the
> > ops->func_hash->filter_hash and add them to the new hash.
> > 2. add all the functions in the new ftrace_hash to direct_functions by
> > ftrace_direct_update().
> > 3. reset the functions to filter of the ftrace_ops to the ips with
> > ftrace_set_filter_ips().
> > 4. remove the functions that in the old ftrace_hash, but not in the new
> > ftrace_hash from direct_functions.
>
> Please also include a module that can be loaded for testing.
> See samples/ftrace/ftrace-direct*
>
> But make it a separate patch. And you'll need to add a test in selftests.
> See tools/testing/selftests/ftrace/test.d/direct
Okay!
>
> >
> > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> > ---
> > include/linux/ftrace.h | 7 ++++
> > kernel/trace/ftrace.c | 75 ++++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 82 insertions(+)
> >
> > diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> > index b672ca15f265..b7c60f5a4120 100644
> > --- a/include/linux/ftrace.h
> > +++ b/include/linux/ftrace.h
> > @@ -528,6 +528,8 @@ int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr);
> >
> > void ftrace_stub_direct_tramp(void);
> >
> > +int reset_ftrace_direct_ips(struct ftrace_ops *ops, unsigned long *ips,
> > + unsigned int cnt);
> > #else
> > struct ftrace_ops;
> > static inline unsigned long ftrace_find_rec_direct(unsigned long ip)
> > @@ -551,6 +553,11 @@ static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned l
> > {
> > return -ENODEV;
> > }
> > +static inline int reset_ftrace_direct_ips(struct ftrace_ops *ops, unsigned long *ips,
> > + unsigned int cnt)
> > +{
> > + return -ENODEV;
> > +}
> >
> > /*
> > * This must be implemented by the architecture.
> > diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> > index f5f6d7bc26f0..db3aa61889d3 100644
> > --- a/kernel/trace/ftrace.c
> > +++ b/kernel/trace/ftrace.c
> > @@ -6224,6 +6224,81 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
> > return err;
> > }
> > EXPORT_SYMBOL_GPL(modify_ftrace_direct);
> > +
> > +/* reset the ips for a direct ftrace (add or remove) */
>
> As this function is being used externally, it requires proper KernelDoc
> headers.
Okay!
>
> What exactly do you mean by "reset"?
It means resetting the filter hash of the ftrace_ops to ips. In
the original logic, the filter hash of a direct ftrace_ops never
changes. However, in the tracing-multi case, there are multiple
functions in the filter hash and they can change. This function
is used to change the filter hash of a direct ftrace_ops.
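
To make this concrete, here is a minimal usage sketch in the style of
samples/ftrace/ftrace-direct-multi.c. It is illustrative only: the ops,
trampoline address and ip arrays are placeholders, not code from this
series.

	static struct ftrace_ops my_ops;

	static int example(unsigned long tramp, unsigned long *old_ips,
			   unsigned long *new_ips)
	{
		int err;

		/* Start out filtering on two functions, all redirected to tramp. */
		err = ftrace_set_filter_ips(&my_ops, old_ips, 2, 0, 1);
		if (err)
			return err;
		err = register_ftrace_direct(&my_ops, tramp);
		if (err)
			return err;

		/*
		 * Later, "reset" the filter to a different set of three
		 * functions: entries that are only in old_ips get removed
		 * from direct_functions, entries only in new_ips get added.
		 */
		return reset_ftrace_direct_ips(&my_ops, new_ips, 3);
	}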
>
> > +int reset_ftrace_direct_ips(struct ftrace_ops *ops, unsigned long *ips,
> > + unsigned int cnt)
> > +{
> > + struct ftrace_hash *hash, *free_hash;
> > + struct ftrace_func_entry *entry, *del;
> > + unsigned long ip;
> > + int err, size;
> > +
> > + if (check_direct_multi(ops))
> > + return -EINVAL;
> > + if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
> > + return -EINVAL;
> > +
> > + mutex_lock(&direct_mutex);
> > + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
> > + if (!hash) {
> > + err = -ENOMEM;
> > + goto out_unlock;
> > + }
> > +
> > + /* find out the new functions from ips and add to hash */
>
> Capitalize comment: /* Find out ...
>
> > + for (int i = 0; i < cnt; i++) {
> > + ip = ftrace_location(ips[i]);
> > + if (!ip) {
> > + err = -ENOENT;
> > + goto out_unlock;
> > + }
> > + if (__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
> > + continue;
> > + err = __ftrace_match_addr(hash, ip, 0);
> > + if (err)
> > + goto out_unlock;
> > + }
> > +
> > + free_hash = direct_functions;
>
> Add newline.
>
> > + /* add the new ips to direct hash. */
>
> Again capitalize.
>
> > + err = ftrace_direct_update(hash, ops->direct_call);
> > + if (err)
> > + goto out_unlock;
> > +
> > + if (free_hash && free_hash != EMPTY_HASH)
> > + call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
>
> Since the above is now used more than once, let's make it into a helper
> function so that if things change, there's only one place to change it:
>
> free_ftrace_direct(free_hash);
>
> static inline void free_ftrace_direct(struct ftrace_hash *hash)
> {
> if (hash && hash != EMPTY_HASH)
> call_rcu_tasks(&hash->rcu, register_ftrace_direct_cb);
> }
Sounds nice~
>
> > +
> > + free_ftrace_hash(hash);
> > + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
> > + ops->func_hash->filter_hash);
> > + if (!hash) {
> > + err = -ENOMEM;
> > + goto out_unlock;
> > + }
> > + err = ftrace_set_filter_ips(ops, ips, cnt, 0, 1);
> > +
> > + /* remove the entries that don't exist in our filter_hash anymore
> > + * from the direct_functions.
> > + */
>
> This isn't the network subsystem, we use the default comment style for multiple lines:
>
> /*
> * line 1
> * line 2
> * ...
> */
Okay! I'll make the modifications according to your comments in this
(and the other) patches.
Thanks!
Menglong Dong
>
> -- Steve
>
> > + size = 1 << hash->size_bits;
> > + for (int i = 0; i < size; i++) {
> > + hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
> > + if (__ftrace_lookup_ip(ops->func_hash->filter_hash, entry->ip))
> > + continue;
> > + del = __ftrace_lookup_ip(direct_functions, entry->ip);
> > + if (del && del->direct == ops->direct_call) {
> > + remove_hash_entry(direct_functions, del);
> > + kfree(del);
> > + }
> > + }
> > + }
> > +out_unlock:
> > + mutex_unlock(&direct_mutex);
> > + if (hash)
> > + free_ftrace_hash(hash);
> > + return err;
> > +}
> > #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
> >
> > /**
>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 05/18] bpf: introduce bpf_gtramp_link
2025-07-03 12:15 ` [PATCH bpf-next v2 05/18] bpf: introduce bpf_gtramp_link Menglong Dong
@ 2025-07-04 7:00 ` kernel test robot
2025-07-04 7:52 ` kernel test robot
1 sibling, 0 replies; 73+ messages in thread
From: kernel test robot @ 2025-07-04 7:00 UTC (permalink / raw)
To: Menglong Dong, alexei.starovoitov, rostedt, jolsa
Cc: oe-kbuild-all, bpf, Menglong Dong, John Fastabend,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
KP Singh, Stanislav Fomichev, Hao Luo, linux-kernel
Hi Menglong,
kernel test robot noticed the following build warnings:
[auto build test WARNING on bpf-next/master]
url: https://github.com/intel-lab-lkp/linux/commits/Menglong-Dong/bpf-add-function-hash-table-for-tracing-multi/20250703-203035
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link: https://lore.kernel.org/r/20250703121521.1874196-6-dongml2%40chinatelecom.cn
patch subject: [PATCH bpf-next v2 05/18] bpf: introduce bpf_gtramp_link
config: x86_64-buildonly-randconfig-002-20250704 (https://download.01.org/0day-ci/archive/20250704/202507041433.Taj70BHu-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250704/202507041433.Taj70BHu-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507041433.Taj70BHu-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> kernel/bpf/trampoline.c:36:34: warning: 'bpf_shim_tramp_link_lops' defined but not used [-Wunused-const-variable=]
36 | static const struct bpf_link_ops bpf_shim_tramp_link_lops;
| ^~~~~~~~~~~~~~~~~~~~~~~~
vim +/bpf_shim_tramp_link_lops +36 kernel/bpf/trampoline.c
33
34 static struct bpf_global_trampoline global_tr_array[MAX_BPF_FUNC_ARGS + 1];
35 static DEFINE_MUTEX(global_tr_lock);
> 36 static const struct bpf_link_ops bpf_shim_tramp_link_lops;
37
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 05/18] bpf: introduce bpf_gtramp_link
2025-07-03 12:15 ` [PATCH bpf-next v2 05/18] bpf: introduce bpf_gtramp_link Menglong Dong
2025-07-04 7:00 ` kernel test robot
@ 2025-07-04 7:52 ` kernel test robot
1 sibling, 0 replies; 73+ messages in thread
From: kernel test robot @ 2025-07-04 7:52 UTC (permalink / raw)
To: Menglong Dong, alexei.starovoitov, rostedt, jolsa
Cc: oe-kbuild-all, bpf, Menglong Dong, John Fastabend,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
KP Singh, Stanislav Fomichev, Hao Luo, linux-kernel
Hi Menglong,
kernel test robot noticed the following build errors:
[auto build test ERROR on bpf-next/master]
url: https://github.com/intel-lab-lkp/linux/commits/Menglong-Dong/bpf-add-function-hash-table-for-tracing-multi/20250703-203035
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link: https://lore.kernel.org/r/20250703121521.1874196-6-dongml2%40chinatelecom.cn
patch subject: [PATCH bpf-next v2 05/18] bpf: introduce bpf_gtramp_link
config: alpha-allnoconfig (https://download.01.org/0day-ci/archive/20250704/202507041510.pXgjmaZP-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250704/202507041510.pXgjmaZP-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507041510.pXgjmaZP-lkp@intel.com/
All errors (new ones prefixed by >>):
alpha-linux-ld: init/do_mounts.o: in function `bpf_gtrampoline_link_prog':
>> (.text+0x30): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: init/do_mounts.o: in function `bpf_gtrampoline_unlink_prog':
>> (.text+0x40): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: init/noinitramfs.o: in function `bpf_gtrampoline_link_prog':
(.text+0x0): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: init/noinitramfs.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x10): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: arch/alpha/kernel/osf_sys.o: in function `bpf_gtrampoline_link_prog':
(.text+0x1190): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: arch/alpha/kernel/osf_sys.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x11a0): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: arch/alpha/kernel/signal.o: in function `bpf_gtrampoline_link_prog':
(.text+0xb90): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: arch/alpha/kernel/signal.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0xba0): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: arch/alpha/kernel/ptrace.o: in function `bpf_gtrampoline_link_prog':
(.text+0xe0): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: arch/alpha/kernel/ptrace.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0xf0): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: arch/alpha/kernel/pci.o: in function `bpf_gtrampoline_link_prog':
(.text+0x1a0): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: arch/alpha/kernel/pci.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x1b0): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: arch/alpha/mm/fault.o: in function `bpf_gtrampoline_link_prog':
(.text+0x1c0): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: arch/alpha/mm/fault.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x1d0): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/fork.o: in function `bpf_gtrampoline_link_prog':
(.text+0xd30): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/fork.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0xd40): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/exec_domain.o: in function `bpf_gtrampoline_link_prog':
(.text+0x40): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/exec_domain.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x50): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/cpu.o: in function `bpf_gtrampoline_link_prog':
(.text+0x890): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/cpu.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x8a0): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/exit.o: in function `bpf_gtrampoline_link_prog':
(.text+0xac0): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/exit.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0xad0): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/resource.o: in function `bpf_gtrampoline_link_prog':
(.text+0x1560): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/resource.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x1570): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/sysctl.o: in function `bpf_gtrampoline_link_prog':
(.text+0x24c0): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/sysctl.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x24d0): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/capability.o: in function `bpf_gtrampoline_link_prog':
(.text+0x770): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/capability.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x780): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/ptrace.o: in function `bpf_gtrampoline_link_prog':
(.text+0x460): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/ptrace.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x470): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/signal.o: in function `bpf_gtrampoline_link_prog':
(.text+0x2270): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/signal.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x2280): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/sys.o: in function `bpf_gtrampoline_link_prog':
(.text+0x18e0): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/sys.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x18f0): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/umh.o: in function `bpf_gtrampoline_link_prog':
(.text+0xb40): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/umh.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0xb50): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/pid.o: in function `bpf_gtrampoline_link_prog':
(.text+0x890): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/pid.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x8a0): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/extable.o: in function `bpf_gtrampoline_link_prog':
(.text+0x0): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/extable.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x10): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/params.o: in function `bpf_gtrampoline_link_prog':
(.text+0x1240): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/params.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x1250): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/nsproxy.o: in function `bpf_gtrampoline_link_prog':
(.text+0x530): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/nsproxy.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x540): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/cred.o: in function `bpf_gtrampoline_link_prog':
(.text+0xb10): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/cred.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0xb20): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/reboot.o: in function `bpf_gtrampoline_link_prog':
(.text+0x1860): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/reboot.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x1870): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/ksyms_common.o: in function `bpf_gtrampoline_link_prog':
(.text+0x0): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/ksyms_common.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x10): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
alpha-linux-ld: kernel/groups.o: in function `bpf_gtrampoline_link_prog':
(.text+0x460): multiple definition of `bpf_gtrampoline_link_prog'; init/main.o:(.text+0x0): first defined here
alpha-linux-ld: kernel/groups.o: in function `bpf_gtrampoline_unlink_prog':
(.text+0x470): multiple definition of `bpf_gtrampoline_unlink_prog'; init/main.o:(.text+0x10): first defined here
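
These "multiple definition" link errors are the classic symptom of a stub
defined with a body in a shared header without "static inline", so every
translation unit that includes the header emits its own copy. A minimal
sketch of the usual pattern follows; the config guard and the argument
types are placeholders, not the actual declarations from this patch.

/* Placeholder names: only the static-inline pattern matters here. */
#ifdef CONFIG_BPF_GLOBAL_TRAMP	/* placeholder config option */
int bpf_gtrampoline_link_prog(struct bpf_gtramp_link *link);
int bpf_gtrampoline_unlink_prog(struct bpf_gtramp_link *link);
#else
static inline int bpf_gtrampoline_link_prog(struct bpf_gtramp_link *link)
{
	return -EOPNOTSUPP;
}
static inline int bpf_gtrampoline_unlink_prog(struct bpf_gtramp_link *link)
{
	return -EOPNOTSUPP;
}
#endif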
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi
2025-07-03 12:15 ` [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi Menglong Dong
@ 2025-07-04 16:07 ` kernel test robot
2025-07-15 1:55 ` Alexei Starovoitov
1 sibling, 0 replies; 73+ messages in thread
From: kernel test robot @ 2025-07-04 16:07 UTC (permalink / raw)
To: Menglong Dong, alexei.starovoitov, rostedt, jolsa
Cc: oe-kbuild-all, bpf, Menglong Dong, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, linux-kernel
Hi Menglong,
kernel test robot noticed the following build warnings:
[auto build test WARNING on bpf-next/master]
url: https://github.com/intel-lab-lkp/linux/commits/Menglong-Dong/bpf-add-function-hash-table-for-tracing-multi/20250703-203035
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link: https://lore.kernel.org/r/20250703121521.1874196-2-dongml2%40chinatelecom.cn
patch subject: [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi
config: i386-randconfig-062-20250704 (https://download.01.org/0day-ci/archive/20250704/202507042321.HJfEZMgT-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250704/202507042321.HJfEZMgT-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507042321.HJfEZMgT-lkp@intel.com/
sparse warnings: (new ones prefixed by >>)
>> kernel/bpf/kfunc_md.c:14:23: sparse: sparse: symbol 'default_mds' was not declared. Should it be static?
>> kernel/bpf/kfunc_md.c:21:43: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct kfunc_md_array [noderef] __rcu *[addressable] [toplevel] kfunc_mds @@ got struct kfunc_md_array * @@
kernel/bpf/kfunc_md.c:21:43: sparse: expected struct kfunc_md_array [noderef] __rcu *[addressable] [toplevel] kfunc_mds
kernel/bpf/kfunc_md.c:21:43: sparse: got struct kfunc_md_array *
>> kernel/bpf/kfunc_md.c:186:26: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct atomic_t const [usertype] *v @@ got struct atomic_t [noderef] __rcu * @@
kernel/bpf/kfunc_md.c:186:26: sparse: expected struct atomic_t const [usertype] *v
kernel/bpf/kfunc_md.c:186:26: sparse: got struct atomic_t [noderef] __rcu *
>> kernel/bpf/kfunc_md.c:111:32: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct kfunc_md_array *mds @@ got struct kfunc_md_array [noderef] __rcu *extern [addressable] [toplevel] kfunc_mds @@
kernel/bpf/kfunc_md.c:111:32: sparse: expected struct kfunc_md_array *mds
kernel/bpf/kfunc_md.c:111:32: sparse: got struct kfunc_md_array [noderef] __rcu *extern [addressable] [toplevel] kfunc_mds
>> kernel/bpf/kfunc_md.c:153:17: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct kfunc_md_array *old_mds @@ got struct kfunc_md_array [noderef] __rcu *extern [addressable] [toplevel] kfunc_mds @@
kernel/bpf/kfunc_md.c:153:17: sparse: expected struct kfunc_md_array *old_mds
kernel/bpf/kfunc_md.c:153:17: sparse: got struct kfunc_md_array [noderef] __rcu *extern [addressable] [toplevel] kfunc_mds
kernel/bpf/kfunc_md.c:194:26: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct atomic_t const [usertype] *v @@ got struct atomic_t [noderef] __rcu * @@
kernel/bpf/kfunc_md.c:194:26: sparse: expected struct atomic_t const [usertype] *v
kernel/bpf/kfunc_md.c:194:26: sparse: got struct atomic_t [noderef] __rcu *
>> kernel/bpf/kfunc_md.c:214:21: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct atomic_t [usertype] *v @@ got struct atomic_t [noderef] __rcu * @@
kernel/bpf/kfunc_md.c:214:21: sparse: expected struct atomic_t [usertype] *v
kernel/bpf/kfunc_md.c:214:21: sparse: got struct atomic_t [noderef] __rcu *
kernel/bpf/kfunc_md.c:238:30: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct atomic_t const [usertype] *v @@ got struct atomic_t [noderef] __rcu * @@
kernel/bpf/kfunc_md.c:238:30: sparse: expected struct atomic_t const [usertype] *v
kernel/bpf/kfunc_md.c:238:30: sparse: got struct atomic_t [noderef] __rcu *
>> kernel/bpf/kfunc_md.c:126:21: sparse: sparse: dereference of noderef expression
kernel/bpf/kfunc_md.c:139:31: sparse: sparse: dereference of noderef expression
kernel/bpf/kfunc_md.c:140:17: sparse: sparse: dereference of noderef expression
kernel/bpf/kfunc_md.c:186:57: sparse: sparse: dereference of noderef expression
kernel/bpf/kfunc_md.c:194:57: sparse: sparse: dereference of noderef expression
kernel/bpf/kfunc_md.c:197:13: sparse: sparse: dereference of noderef expression
kernel/bpf/kfunc_md.c:248:35: sparse: sparse: dereference of noderef expression
kernel/bpf/kfunc_md.c:249:17: sparse: sparse: dereference of noderef expression
vim +/default_mds +14 kernel/bpf/kfunc_md.c
12
13 #define MIN_KFUNC_MD_ARRAY_BITS 4
> 14 struct kfunc_md_array default_mds = {
15 .used = ATOMIC_INIT(0),
16 .hash_bits = MIN_KFUNC_MD_ARRAY_BITS,
17 .mds = {
18 [0 ... ((1 << MIN_KFUNC_MD_ARRAY_BITS) - 1)] = HLIST_HEAD_INIT,
19 },
20 };
> 21 struct kfunc_md_array __rcu *kfunc_mds = &default_mds;
22 EXPORT_SYMBOL_GPL(kfunc_mds);
23
24 static DEFINE_MUTEX(kfunc_md_mutex);
25
26 static int kfunc_md_array_inc(void);
27
28 static void kfunc_md_release_rcu(struct rcu_head *rcu)
29 {
30 struct kfunc_md *md;
31
32 md = container_of(rcu, struct kfunc_md, rcu);
33 /* Step 4, free the md */
34 kfree(md);
35 }
36
37 static void kfunc_md_release_rcu_tasks(struct rcu_head *rcu)
38 {
39 struct kfunc_md *md;
40
41 md = container_of(rcu, struct kfunc_md, rcu);
42 /* Step 3, wait for the nornal progs and bfp_global_caller to finish */
43 call_rcu_tasks(&md->rcu, kfunc_md_release_rcu);
44 }
45
46 static void kfunc_md_release(struct percpu_ref *pcref)
47 {
48 struct kfunc_md *md;
49
50 md = container_of(pcref, struct kfunc_md, pcref);
51 percpu_ref_exit(&md->pcref);
52
53 /* Step 2, wait for sleepable progs to finish. */
54 call_rcu_tasks_trace(&md->rcu, kfunc_md_release_rcu_tasks);
55 }
56
57 struct kfunc_md *kfunc_md_get(unsigned long ip)
58 {
59 struct kfunc_md_array *mds;
60 struct kfunc_md *md;
61
62 rcu_read_lock();
63 mds = rcu_dereference(kfunc_mds);
64 md = __kfunc_md_get(mds, ip);
65 rcu_read_unlock();
66
67 return md;
68 }
69 EXPORT_SYMBOL_GPL(kfunc_md_get);
70
71 static struct kfunc_md *__kfunc_md_create(struct kfunc_md_array *mds, unsigned long ip,
72 int nr_args)
73 {
74 struct kfunc_md *md = __kfunc_md_get(mds, ip);
75 int err;
76
77 if (md) {
78 md->users++;
79 return md;
80 }
81
82 md = kzalloc(sizeof(*md), GFP_KERNEL);
83 if (!md)
84 return NULL;
85
86 md->users = 1;
87 md->func = ip;
88 md->nr_args = nr_args;
89
90 err = percpu_ref_init(&md->pcref, kfunc_md_release, 0, GFP_KERNEL);
91 if (err) {
92 kfree(md);
93 return NULL;
94 }
95
96 hlist_add_head_rcu(&md->hash, kfunc_md_hash_head(mds, ip));
97 atomic_inc(&mds->used);
98
99 return md;
100 }
101
102 struct kfunc_md *kfunc_md_create(unsigned long ip, int nr_args)
103 {
104 struct kfunc_md *md = NULL;
105
106 mutex_lock(&kfunc_md_mutex);
107
108 if (kfunc_md_array_inc())
109 goto out;
110
> 111 md = __kfunc_md_create(kfunc_mds, ip, nr_args);
112 out:
113 mutex_unlock(&kfunc_md_mutex);
114
115 return md;
116 }
117 EXPORT_SYMBOL_GPL(kfunc_md_create);
118
119 static int kfunc_md_array_adjust(bool inc)
120 {
121 struct kfunc_md_array *new_mds, *old_mds;
122 struct kfunc_md *md, *new_md;
123 struct hlist_node *n;
124 int size, hash_bits, i;
125
> 126 hash_bits = kfunc_mds->hash_bits;
127 hash_bits += inc ? 1 : -1;
128
129 size = sizeof(*new_mds) + sizeof(struct hlist_head) * (1 << hash_bits);
130 new_mds = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
131 if (!new_mds)
132 return -ENOMEM;
133
134 new_mds->hash_bits = hash_bits;
135 for (i = 0; i < (1 << new_mds->hash_bits); i++)
136 INIT_HLIST_HEAD(&new_mds->mds[i]);
137
138 /* copy all the mds from kfunc_mds to new_mds */
139 for (i = 0; i < (1 << kfunc_mds->hash_bits); i++) {
140 hlist_for_each_entry(md, &kfunc_mds->mds[i], hash) {
141 new_md = __kfunc_md_create(new_mds, md->func, md->nr_args);
142 if (!new_md)
143 goto err_out;
144
145 new_md->bpf_prog_cnt = md->bpf_prog_cnt;
146 new_md->bpf_origin_call = md->bpf_origin_call;
147 new_md->users = md->users;
148
149 memcpy(new_md->bpf_progs, md->bpf_progs, sizeof(md->bpf_progs));
150 }
151 }
152
> 153 old_mds = kfunc_mds;
154 rcu_assign_pointer(kfunc_mds, new_mds);
155 synchronize_rcu();
156
157 /* free all the mds in the old_mds. See kfunc_md_put() for the
158 * complete release process.
159 */
160 for (i = 0; i < (1 << old_mds->hash_bits); i++) {
161 hlist_for_each_entry_safe(md, n, &old_mds->mds[i], hash) {
162 percpu_ref_kill(&md->pcref);
163 hlist_del(&md->hash);
164 }
165 }
166
167 if (old_mds != &default_mds)
168 kfree_rcu(old_mds, rcu);
169
170 return 0;
171
172 err_out:
173 for (i = 0; i < (1 << new_mds->hash_bits); i++) {
174 hlist_for_each_entry_safe(md, n, &new_mds->mds[i], hash) {
175 percpu_ref_exit(&md->pcref);
176 hlist_del(&md->hash);
177 kfree(md);
178 }
179 }
180 return -ENOMEM;
181 }
182
183 static int kfunc_md_array_inc(void)
184 {
185 /* increase the hash table if greater than 90% */
> 186 if (atomic_read(&kfunc_mds->used) * 10 < (1 << (kfunc_mds->hash_bits)) * 9)
187 return 0;
188 return kfunc_md_array_adjust(true);
189 }
190
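
Most of these sparse complaints boil down to two things: default_mds can
simply be marked static, and every access to the __rcu-annotated kfunc_mds
pointer needs to go through the RCU accessors. A minimal sketch of the
access pattern, assuming updates are serialized by kfunc_md_mutex (the
helper names below are illustrative, not from the patch):

	/* Read side: plain RCU dereference inside rcu_read_lock(). */
	static struct kfunc_md_array *kfunc_mds_rcu(void)
	{
		return rcu_dereference(kfunc_mds);
	}

	/* Update side: dereference under kfunc_md_mutex. */
	static struct kfunc_md_array *kfunc_mds_locked(void)
	{
		return rcu_dereference_protected(kfunc_mds,
						 lockdep_is_held(&kfunc_md_mutex));
	}

	static void kfunc_mds_replace(struct kfunc_md_array *new_mds)
	{
		struct kfunc_md_array *old_mds;

		old_mds = rcu_replace_pointer(kfunc_mds, new_mds,
					      lockdep_is_held(&kfunc_md_mutex));
		synchronize_rcu();
		/* ... free old_mds as the existing code already does ... */
	}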
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 03/18] ftrace: factor out ftrace_direct_update from register_ftrace_direct
2025-07-03 12:15 ` [PATCH bpf-next v2 03/18] ftrace: factor out ftrace_direct_update from register_ftrace_direct Menglong Dong
@ 2025-07-05 2:41 ` kernel test robot
0 siblings, 0 replies; 73+ messages in thread
From: kernel test robot @ 2025-07-05 2:41 UTC (permalink / raw)
To: Menglong Dong, alexei.starovoitov, rostedt, jolsa
Cc: oe-kbuild-all, bpf, Menglong Dong, Mark Rutland,
Mathieu Desnoyers, linux-kernel, linux-trace-kernel
Hi Menglong,
kernel test robot noticed the following build warnings:
[auto build test WARNING on bpf-next/master]
url: https://github.com/intel-lab-lkp/linux/commits/Menglong-Dong/bpf-add-function-hash-table-for-tracing-multi/20250703-203035
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link: https://lore.kernel.org/r/20250703121521.1874196-4-dongml2%40chinatelecom.cn
patch subject: [PATCH bpf-next v2 03/18] ftrace: factor out ftrace_direct_update from register_ftrace_direct
config: x86_64-randconfig-123-20250704 (https://download.01.org/0day-ci/archive/20250705/202507051048.PEDxVblg-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250705/202507051048.PEDxVblg-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507051048.PEDxVblg-lkp@intel.com/
sparse warnings: (new ones prefixed by >>)
kernel/trace/ftrace.c:233:49: sparse: got struct ftrace_ops [noderef] __rcu *[addressable] [toplevel] ftrace_ops_list
kernel/trace/ftrace.c:318:16: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_ops **p @@ got struct ftrace_ops [noderef] __rcu **list @@
kernel/trace/ftrace.c:318:16: sparse: expected struct ftrace_ops **p
kernel/trace/ftrace.c:318:16: sparse: got struct ftrace_ops [noderef] __rcu **list
kernel/trace/ftrace.c:318:50: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_ops **p @@ got struct ftrace_ops [noderef] __rcu ** @@
kernel/trace/ftrace.c:318:50: sparse: expected struct ftrace_ops **p
kernel/trace/ftrace.c:318:50: sparse: got struct ftrace_ops [noderef] __rcu **
kernel/trace/ftrace.c:325:12: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_ops * @@ got struct ftrace_ops [noderef] __rcu *next @@
kernel/trace/ftrace.c:325:12: sparse: expected struct ftrace_ops *
kernel/trace/ftrace.c:325:12: sparse: got struct ftrace_ops [noderef] __rcu *next
kernel/trace/ftrace.c:1072:43: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *notrace_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:1072:43: sparse: expected struct ftrace_hash [noderef] __rcu *notrace_hash
kernel/trace/ftrace.c:1072:43: sparse: got struct ftrace_hash *
kernel/trace/ftrace.c:1073:43: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *filter_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:1073:43: sparse: expected struct ftrace_hash [noderef] __rcu *filter_hash
kernel/trace/ftrace.c:1073:43: sparse: got struct ftrace_hash *
kernel/trace/ftrace.c:1298:40: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:1298:40: sparse: expected struct ftrace_hash *hash
kernel/trace/ftrace.c:1298:40: sparse: got struct ftrace_hash [noderef] __rcu *filter_hash
kernel/trace/ftrace.c:1299:40: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:1299:40: sparse: expected struct ftrace_hash *hash
kernel/trace/ftrace.c:1299:40: sparse: got struct ftrace_hash [noderef] __rcu *notrace_hash
kernel/trace/ftrace.c:1300:37: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *filter_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:1300:37: sparse: expected struct ftrace_hash [noderef] __rcu *filter_hash
kernel/trace/ftrace.c:1300:37: sparse: got struct ftrace_hash *
kernel/trace/ftrace.c:1301:38: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *notrace_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:1301:38: sparse: expected struct ftrace_hash [noderef] __rcu *notrace_hash
kernel/trace/ftrace.c:1301:38: sparse: got struct ftrace_hash *
kernel/trace/ftrace.c:2100:54: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct ftrace_hash *old_hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:2100:54: sparse: expected struct ftrace_hash *old_hash
kernel/trace/ftrace.c:2100:54: sparse: got struct ftrace_hash [noderef] __rcu *filter_hash
kernel/trace/ftrace.c:1505:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/trace/ftrace.c:1505:9: sparse: struct ftrace_hash [noderef] __rcu *
kernel/trace/ftrace.c:1505:9: sparse: struct ftrace_hash *
kernel/trace/ftrace.c:1521:39: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:1522:40: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:1523:40: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:1524:42: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:1695:18: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_ops *ops @@ got struct ftrace_ops [noderef] __rcu *[addressable] [toplevel] ftrace_ops_list @@
kernel/trace/ftrace.c:1696:43: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_ops *ops @@ got struct ftrace_ops [noderef] __rcu *next @@
kernel/trace/ftrace.c:1757:14: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:1758:22: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *notrace_hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:2078:50: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:2089:50: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:2572:53: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *static [toplevel] direct_functions @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:2583:36: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *static [toplevel] direct_functions @@
kernel/trace/ftrace.c:3379:51: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *B @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3380:66: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash **orig_hash @@ got struct ftrace_hash [noderef] __rcu ** @@
kernel/trace/ftrace.c:3386:52: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *B @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3387:66: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash **orig_hash @@ got struct ftrace_hash [noderef] __rcu ** @@
kernel/trace/ftrace.c:3400:41: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3401:51: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *src @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3404:52: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *notrace_hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3408:52: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *src @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3423:39: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3424:42: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3432:17: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3438:81: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3442:54: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *notrace_hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3444:56: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *new_hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3474:60: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *new_hash1 @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3475:49: sparse: sparse: incorrect type in argument 3 (different address spaces) @@ expected struct ftrace_hash *new_hash2 @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3514:45: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *filter_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:3516:46: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *notrace_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:3518:48: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *filter_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:3520:49: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *notrace_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:3526:17: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3527:17: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3533:34: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *save_filter_hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3534:35: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *save_notrace_hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3536:45: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *filter_hash @@ got struct ftrace_hash *[addressable] filter_hash @@
kernel/trace/ftrace.c:3537:46: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *notrace_hash @@ got struct ftrace_hash *[addressable] notrace_hash @@
kernel/trace/ftrace.c:3542:53: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *filter_hash @@ got struct ftrace_hash *save_filter_hash @@
kernel/trace/ftrace.c:3543:54: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *notrace_hash @@ got struct ftrace_hash *save_notrace_hash @@
kernel/trace/ftrace.c:3590:31: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *filter_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:3591:32: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *notrace_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:3606:59: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *[addressable] filter_hash @@
kernel/trace/ftrace.c:3607:59: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *[addressable] notrace_hash @@
kernel/trace/ftrace.c:3612:43: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *[addressable] filter_hash @@
kernel/trace/ftrace.c:3613:43: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *[addressable] notrace_hash @@
kernel/trace/ftrace.c:3615:39: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *[addressable] filter_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:3616:40: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *[addressable] notrace_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:3658:48: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3659:48: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3660:45: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *filter_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:3661:46: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *notrace_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:3947:14: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3964:22: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:4650:22: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:4653:22: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:5060:27: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash **orig_hash @@ got struct ftrace_hash [noderef] __rcu ** @@
kernel/trace/ftrace.c:5062:27: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash **orig_hash @@ got struct ftrace_hash [noderef] __rcu ** @@
kernel/trace/ftrace.c:5442:19: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash **orig_hash @@ got struct ftrace_hash [noderef] __rcu ** @@
kernel/trace/ftrace.c:5586:19: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash **orig_hash @@ got struct ftrace_hash [noderef] __rcu ** @@
kernel/trace/ftrace.c:5592:34: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *filter_hash @@ got struct ftrace_hash *[assigned] old_hash @@
kernel/trace/ftrace.c:5857:27: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash **orig_hash @@ got struct ftrace_hash [noderef] __rcu ** @@
kernel/trace/ftrace.c:5859:27: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash **orig_hash @@ got struct ftrace_hash [noderef] __rcu ** @@
kernel/trace/ftrace.c:5940:50: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *static [toplevel] direct_functions @@
kernel/trace/ftrace.c:5942:51: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *static [toplevel] direct_functions @@
kernel/trace/ftrace.c:6050:14: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
>> kernel/trace/ftrace.c:6056:19: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *free_hash @@ got struct ftrace_hash [noderef] __rcu *static [addressable] [assigned] [toplevel] direct_functions @@
kernel/trace/ftrace.c:6095:50: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:6147:14: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:6151:52: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *static [addressable] [assigned] [toplevel] direct_functions @@
kernel/trace/ftrace.c:6477:35: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *extern [addressable] [toplevel] ftrace_graph_hash @@ got struct ftrace_hash *[assigned] hash @@
kernel/trace/ftrace.c:6479:43: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *extern [addressable] [toplevel] ftrace_graph_notrace_hash @@ got struct ftrace_hash *[assigned] hash @@
kernel/trace/ftrace.c:6548:35: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash **orig_hash @@ got struct ftrace_hash [noderef] __rcu ** @@
kernel/trace/ftrace.c:6556:35: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash **orig_hash @@ got struct ftrace_hash [noderef] __rcu ** @@
kernel/trace/ftrace.c:6624:47: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *[addressable] [toplevel] ftrace_graph_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:6625:55: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct ftrace_hash [noderef] __rcu *[addressable] [toplevel] ftrace_graph_notrace_hash @@ got struct ftrace_hash * @@
kernel/trace/ftrace.c:7344:46: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:7345:47: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:7349:44: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:7367:18: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_ops *ops @@ got struct ftrace_ops [noderef] __rcu *[addressable] [toplevel] ftrace_ops_list @@
kernel/trace/ftrace.c:7367:66: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_ops *ops @@ got struct ftrace_ops [noderef] __rcu *next @@
kernel/trace/ftrace.c:7419:59: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:7420:59: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:7807:62: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:7808:62: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:7852:36: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/trace/ftrace.c:7852:36: sparse: struct ftrace_ops [noderef] __rcu *
kernel/trace/ftrace.c:7852:36: sparse: struct ftrace_ops *
kernel/trace/ftrace.c:8628:14: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:8628:14: sparse: expected struct ftrace_hash *hash
kernel/trace/ftrace.c:8628:14: sparse: got struct ftrace_hash [noderef] __rcu *filter_hash
kernel/trace/ftrace.c:8677:14: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:8677:14: sparse: expected struct ftrace_hash *hash
kernel/trace/ftrace.c:8677:14: sparse: got struct ftrace_hash [noderef] __rcu *filter_hash
kernel/trace/ftrace.c:231:20: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:231:20: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:231:20: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3434:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3434:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3434:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3434:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3434:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3434:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3468:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3468:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3468:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3468:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3468:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3468:29: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:5974:30: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:5983:21: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:5985:17: sparse: sparse: dereference of noderef expression
kernel/trace/ftrace.c:3739:48: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *filter_hash @@
kernel/trace/ftrace.c:3739:48: sparse: expected struct ftrace_hash *hash
kernel/trace/ftrace.c:3739:48: sparse: got struct ftrace_hash [noderef] __rcu *filter_hash
kernel/trace/ftrace.c:3740:49: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct ftrace_hash *hash @@ got struct ftrace_hash [noderef] __rcu *notrace_hash @@
kernel/trace/ftrace.c:3740:49: sparse: expected struct ftrace_hash *hash
kernel/trace/ftrace.c:3740:49: sparse: got struct ftrace_hash [noderef] __rcu *notrace_hash
vim +6056 kernel/trace/ftrace.c
6015
6016 /**
6017 * register_ftrace_direct - Call a custom trampoline directly
6018 * for multiple functions registered in @ops
6019 * @ops: The address of the struct ftrace_ops object
6020 * @addr: The address of the trampoline to call at @ops functions
6021 *
6022 * This is used to connect a direct calls to @addr from the nop locations
6023 * of the functions registered in @ops (with by ftrace_set_filter_ip
6024 * function).
6025 *
6026 * The location that it calls (@addr) must be able to handle a direct call,
6027 * and save the parameters of the function being traced, and restore them
6028 * (or inject new ones if needed), before returning.
6029 *
6030 * Returns:
6031 * 0 on success
6032 * -EINVAL - The @ops object was already registered with this call or
6033 * when there are no functions in @ops object.
6034 * -EBUSY - Another direct function is already attached (there can be only one)
6035 * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
6036 * -ENOMEM - There was an allocation failure.
6037 */
6038 int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
6039 {
6040 struct ftrace_hash *hash, *free_hash = NULL;
6041 int err = -EBUSY;
6042
6043 if (ops->func || ops->trampoline)
6044 return -EINVAL;
6045 if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED))
6046 return -EINVAL;
6047 if (ops->flags & FTRACE_OPS_FL_ENABLED)
6048 return -EINVAL;
6049
6050 hash = ops->func_hash->filter_hash;
6051 if (ftrace_hash_empty(hash))
6052 return -EINVAL;
6053
6054 mutex_lock(&direct_mutex);
6055
> 6056 free_hash = direct_functions;
6057 err = ftrace_direct_update(hash, addr);
6058 if (err)
6059 goto out_unlock;
6060
6061 ops->func = call_direct_funcs;
6062 ops->flags = MULTI_FLAGS;
6063 ops->trampoline = FTRACE_REGS_ADDR;
6064 ops->direct_call = addr;
6065
6066 err = register_ftrace_function_nolock(ops);
6067 if (free_hash && free_hash != EMPTY_HASH)
6068 call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
6069
6070 out_unlock:
6071 mutex_unlock(&direct_mutex);
6072
6073 return err;
6074 }
6075 EXPORT_SYMBOL_GPL(register_ftrace_direct);
6076
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 04/18] ftrace: add reset_ftrace_direct_ips
2025-07-04 1:54 ` Menglong Dong
@ 2025-07-07 18:52 ` Steven Rostedt
2025-07-08 1:26 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Steven Rostedt @ 2025-07-07 18:52 UTC (permalink / raw)
To: Menglong Dong
Cc: alexei.starovoitov, jolsa, bpf, Menglong Dong, Mark Rutland,
Mathieu Desnoyers, linux-kernel, linux-trace-kernel
On Fri, 4 Jul 2025 09:54:52 +0800
Menglong Dong <menglong8.dong@gmail.com> wrote:
> > What exactly do you mean by "reset"?
>
> It means to reset the filter hash of the ftrace_ops to ips. In
> the original logic, the filter hash of a direct ftrace_ops will not
> be changed. However, in the tracing-multi case, there are
> multiple functions in the filter hash, and they can change. This
> function is used to change the filter hash of a direct ftrace_ops.
The above still doesn't make sense to me.
Can you explain more what exactly you are doing at a higher level? To
me "reset" means to set back to what it originally was (which usually
is zero or nothing).
-- Steve
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 04/18] ftrace: add reset_ftrace_direct_ips
2025-07-07 18:52 ` Steven Rostedt
@ 2025-07-08 1:26 ` Menglong Dong
0 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-08 1:26 UTC (permalink / raw)
To: Steven Rostedt
Cc: alexei.starovoitov, jolsa, bpf, Menglong Dong, Mark Rutland,
Mathieu Desnoyers, linux-kernel, linux-trace-kernel
On Tue, Jul 8, 2025 at 2:52 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Fri, 4 Jul 2025 09:54:52 +0800
> Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> > > What exactly do you mean by "reset"?
> >
> > It means to reset the filter hash of the ftrace_ops to ips. In
> > the original logic, the filter hash of a direct ftrace_ops will not
> > be changed. However, in the tracing-multi case, there are
> > multiple functions in the filter hash, and they can change. This
> > function is used to change the filter hash of a direct ftrace_ops.
>
> The above still doesn't make sense to me.
>
> Can you explain more what exactly you are doing at a higher level? To
> me "reset" means to set back to what it originally was (which usually
> is zero or nothing).
Yeah, with pleasure. When we need to update the set of functions filtered
by a ftrace_ops, we can use ftrace_set_filter_ips(), which is able to
add, remove and reset the functions to filter.
However, we don't have a function that does the same for a direct
ftrace_ops. What reset_ftrace_direct_ips() does is the same as
ftrace_set_filter_ips() with its "reset" argument set to 1, and that's
why I call it "reset". Or we can name it something else, such as "update"?
The use case is the global trampoline. The BPF global trampoline has
a direct ftrace_ops. When we attach new tracing-multi progs to new
kernel functions, we need to add these functions to the ftrace_ops
of the bpf global trampoline.
reset_ftrace_direct_ips() can both add and remove entries in the filter
of the direct ftrace_ops: we collect all the functions currently needed
by the global trampoline and "reset" the ftrace_ops filter to exactly
that set.
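A minimal sketch of the intended call pattern (the signature of
reset_ftrace_direct_ips() below is assumed from this discussion, not
copied from the patch):

/* Hypothetical helper: make the direct ops of the BPF global trampoline
 * filter exactly the functions in @ips, much like
 * ftrace_set_filter_ips(ops, ips, cnt, 0, 1) (remove=0, reset=1) does
 * for a non-direct ops.
 */
static int gtramp_refresh_filter(struct ftrace_ops *direct_ops,
				 unsigned long *ips, unsigned int cnt)
{
	/* drop everything no longer in @ips and add whatever is new */
	return reset_ftrace_direct_ips(direct_ops, ips, cnt);
}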
Hoping I expressed it clearly :/
Thanks!
Menglong Dong
>
> -- Steve
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args
2025-07-03 12:15 ` [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args Menglong Dong
@ 2025-07-14 22:07 ` Andrii Nakryiko
2025-07-14 23:45 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Andrii Nakryiko @ 2025-07-14 22:07 UTC (permalink / raw)
To: Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
John Fastabend, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo,
Simon Horman, linux-kernel, netdev
On Thu, Jul 3, 2025 at 5:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> which is used to record the accessed index of the function args in
> btf_ctx_access().
Do we need to bother giving access to arguments through direct ctx[i]
access for these multi-fentry/fexit programs? We have
bpf_get_func_arg_cnt() and bpf_get_func_arg() which can be used to get
any given argument at runtime.
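For example, something like this already works for trampoline-based
progs (sketch only; the attach target below is an arbitrary example,
not something from this series):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("fentry/tcp_close")
int BPF_PROG(dump_first_arg)
{
	__u64 nr_args = bpf_get_func_arg_cnt(ctx);
	__u64 arg0 = 0;

	/* bpf_get_func_arg() returns 0 on success, -EINVAL if n >= nr_args */
	if (nr_args && !bpf_get_func_arg(ctx, 0, &arg0))
		bpf_printk("tcp_close: %llu args, arg0=%llx", nr_args, arg0);
	return 0;
}

char _license[] SEC("license") = "GPL";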
>
> Meanwhile, we add the function btf_check_func_part_match() to compare the
> accessed function args of two function prototype. This function will be
> used in the following commit.
>
> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> ---
> include/linux/bpf.h | 4 ++
> include/linux/btf.h | 3 +-
> kernel/bpf/btf.c | 108 +++++++++++++++++++++++++++++++++++++++++-
> net/sched/bpf_qdisc.c | 2 +-
> 4 files changed, 113 insertions(+), 4 deletions(-)
>
[...]
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 12/18] libbpf: don't free btf if tracing_multi progs existing
2025-07-03 12:15 ` [PATCH bpf-next v2 12/18] libbpf: don't free btf if tracing_multi progs existing Menglong Dong
@ 2025-07-14 22:07 ` Andrii Nakryiko
2025-07-15 1:15 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Andrii Nakryiko @ 2025-07-14 22:07 UTC (permalink / raw)
To: Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On Thu, Jul 3, 2025 at 5:21 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> By default, the kernel btf that we load during loading program will be
> freed after the programs are loaded in bpf_object_load(). However, we
> still need to use these btf for tracing of multi-link during attaching.
> Therefore, we don't free the btfs until the bpf object is closed if any
> bpf programs of the type multi-link tracing exist.
>
> Meanwhile, introduce the new api bpf_object__free_btf() to manually free
> the btfs after attaching.
>
> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> ---
> tools/lib/bpf/libbpf.c | 24 +++++++++++++++++++++++-
> tools/lib/bpf/libbpf.h | 2 ++
> tools/lib/bpf/libbpf.map | 1 +
> 3 files changed, 26 insertions(+), 1 deletion(-)
>
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index aee36402f0a3..530c29f2f5fc 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -8583,6 +8583,28 @@ static void bpf_object_post_load_cleanup(struct bpf_object *obj)
> obj->btf_vmlinux = NULL;
> }
>
> +void bpf_object__free_btfs(struct bpf_object *obj)
let's not add this as a new API. We'll keep BTF fds open, if
necessary, but not (yet) give user full control of when those FDs will
be closed, I'm not convinced yet we need that much user control over
this
> +{
> + if (!obj->btf_vmlinux || obj->state != OBJ_LOADED)
> + return;
> +
> + bpf_object_post_load_cleanup(obj);
> +}
> +
> +static void bpf_object_early_free_btf(struct bpf_object *obj)
> +{
> + struct bpf_program *prog;
> +
> + bpf_object__for_each_program(prog, obj) {
> + if (prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
> + prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
> + prog->expected_attach_type == BPF_MODIFY_RETURN_MULTI)
> + return;
> + }
> +
> + bpf_object_post_load_cleanup(obj);
> +}
> +
> static int bpf_object_prepare(struct bpf_object *obj, const char *target_btf_path)
> {
> int err;
> @@ -8654,7 +8676,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
> err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps);
> }
>
> - bpf_object_post_load_cleanup(obj);
> + bpf_object_early_free_btf(obj);
> obj->state = OBJ_LOADED; /* doesn't matter if successfully or not */
>
> if (err) {
> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> index d1cf813a057b..7cc810aa7967 100644
> --- a/tools/lib/bpf/libbpf.h
> +++ b/tools/lib/bpf/libbpf.h
> @@ -323,6 +323,8 @@ LIBBPF_API struct bpf_program *
> bpf_object__find_program_by_name(const struct bpf_object *obj,
> const char *name);
>
> +LIBBPF_API void bpf_object__free_btfs(struct bpf_object *obj);
> +
> LIBBPF_API int
> libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
> enum bpf_attach_type *expected_attach_type);
> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
> index c7fc0bde5648..4a0c993221a5 100644
> --- a/tools/lib/bpf/libbpf.map
> +++ b/tools/lib/bpf/libbpf.map
> @@ -444,4 +444,5 @@ LIBBPF_1.6.0 {
> bpf_program__line_info_cnt;
> btf__add_decl_attr;
> btf__add_type_attr;
> + bpf_object__free_btfs;
> } LIBBPF_1.5.0;
> --
> 2.39.5
>
>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 14/18] libbpf: add btf type hash lookup support
2025-07-03 12:15 ` [PATCH bpf-next v2 14/18] libbpf: add btf type hash lookup support Menglong Dong
@ 2025-07-14 22:07 ` Andrii Nakryiko
2025-07-15 4:40 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Andrii Nakryiko @ 2025-07-14 22:07 UTC (permalink / raw)
To: Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On Thu, Jul 3, 2025 at 5:22 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> For now, the libbpf find the btf type id by loop all the btf types and
> compare its name, which is inefficient if we have many functions to
> lookup.
>
> We add the "use_hash" to the function args of find_kernel_btf_id() to
> indicate if we should lookup the btf type id by hash. The hash table will
> be initialized if it has not yet.
Or we could build a hashtable-based index outside of struct btf for a
specific use case, because there is no one perfect hashtable-based
indexing that can be done generically (e.g., just by name, or
name+kind, or kind+name, or some more complicated lookup key) and
cover all potential use cases. I'd prefer not to get into the business of
defining and building indexes and leave it to callers (even if the
caller is another part of libbpf itself).
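A rough sketch of what such a caller-side index could look like, using
only the public btf__*() API (the struct and function names here are
made up for the illustration; a sorted array plus bsearch() is used just
to keep it self-contained):

#include <stdlib.h>
#include <string.h>
#include <bpf/btf.h>

struct func_idx_ent { const char *name; __u32 id; };

static int ent_cmp(const void *a, const void *b)
{
	return strcmp(((const struct func_idx_ent *)a)->name,
		      ((const struct func_idx_ent *)b)->name);
}

/* Scan the BTF once, keep (name, id) for every FUNC sorted by name, so
 * each lookup becomes a bsearch() instead of a linear walk of all types.
 * The caller owns and frees the returned array.
 */
static struct func_idx_ent *build_func_index(const struct btf *btf, size_t *cnt)
{
	__u32 n = btf__type_cnt(btf), i;
	struct func_idx_ent *idx = calloc(n, sizeof(*idx));
	size_t used = 0;

	if (!idx)
		return NULL;
	for (i = 1; i < n; i++) {
		const struct btf_type *t = btf__type_by_id(btf, i);

		if (!btf_is_func(t))
			continue;
		idx[used].name = btf__name_by_offset(btf, t->name_off);
		idx[used].id = i;
		used++;
	}
	qsort(idx, used, sizeof(*idx), ent_cmp);
	*cnt = used;
	return idx;
}

More elaborate keys (name+kind, etc.) then stay entirely in the caller.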
>
> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> ---
> tools/lib/bpf/btf.c | 102 +++++++++++++++++++++++++++++++++++++++
> tools/lib/bpf/btf.h | 6 +++
> tools/lib/bpf/libbpf.c | 37 +++++++++++---
> tools/lib/bpf/libbpf.map | 3 ++
> 4 files changed, 140 insertions(+), 8 deletions(-)
>
[...]
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 13/18] libbpf: support tracing_multi
2025-07-03 12:15 ` [PATCH bpf-next v2 13/18] libbpf: support tracing_multi Menglong Dong
@ 2025-07-14 22:07 ` Andrii Nakryiko
2025-07-15 1:58 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Andrii Nakryiko @ 2025-07-14 22:07 UTC (permalink / raw)
To: Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On Thu, Jul 3, 2025 at 5:24 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> Add supporting for the attach types of:
>
> BPF_TRACE_FENTRY_MULTI
> BPF_TRACE_FEXIT_MULTI
> BPF_MODIFY_RETURN_MULTI
>
> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> ---
> tools/bpf/bpftool/common.c | 3 +
> tools/lib/bpf/bpf.c | 10 +++
> tools/lib/bpf/bpf.h | 6 ++
> tools/lib/bpf/libbpf.c | 168 ++++++++++++++++++++++++++++++++++++-
> tools/lib/bpf/libbpf.h | 19 +++++
> tools/lib/bpf/libbpf.map | 1 +
> 6 files changed, 204 insertions(+), 3 deletions(-)
>
[...]
> diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
> index 1342564214c8..5c97acec643d 100644
> --- a/tools/lib/bpf/bpf.h
> +++ b/tools/lib/bpf/bpf.h
> @@ -422,6 +422,12 @@ struct bpf_link_create_opts {
> struct {
> __u64 cookie;
> } tracing;
> + struct {
> + __u32 cnt;
> + const __u32 *btf_ids;
> + const __u32 *tgt_fds;
tgt_fds are always BTF FDs, right? Do we intend to support
freplace-style multi attachment at all? If not, I'd name them btf_fds,
and btf_ids -> btf_type_ids (because BTF ID can also refer to kernel
ID of BTF object, so ambiguous and somewhat confusing)
> + const __u64 *cookies;
> + } tracing_multi;
> struct {
> __u32 pf;
> __u32 hooknum;
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 530c29f2f5fc..ae38b3ab84c7 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -136,6 +136,9 @@ static const char * const attach_type_name[] = {
> [BPF_NETKIT_PEER] = "netkit_peer",
> [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session",
> [BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session",
> + [BPF_TRACE_FENTRY_MULTI] = "trace_fentry_multi",
> + [BPF_TRACE_FEXIT_MULTI] = "trace_fexit_multi",
> + [BPF_MODIFY_RETURN_MULTI] = "modify_return_multi",
> };
>
> static const char * const link_type_name[] = {
> @@ -410,6 +413,8 @@ enum sec_def_flags {
> SEC_XDP_FRAGS = 16,
> /* Setup proper attach type for usdt probes. */
> SEC_USDT = 32,
> + /* attachment target is multi-link */
> + SEC_ATTACH_BTF_MULTI = 64,
> };
>
> struct bpf_sec_def {
> @@ -7419,9 +7424,9 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
> opts->expected_attach_type = BPF_TRACE_UPROBE_MULTI;
> }
>
> - if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) {
> + if ((def & (SEC_ATTACH_BTF | SEC_ATTACH_BTF_MULTI)) && !prog->attach_btf_id) {
> int btf_obj_fd = 0, btf_type_id = 0, err;
> - const char *attach_name;
> + const char *attach_name, *name_end;
>
> attach_name = strchr(prog->sec_name, '/');
> if (!attach_name) {
> @@ -7440,7 +7445,27 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
> }
> attach_name++; /* skip over / */
>
> - err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd, &btf_type_id);
> + name_end = strchr(attach_name, ',');
> + /* for multi-link tracing, use the first target symbol during
> + * loading.
> + */
> + if ((def & SEC_ATTACH_BTF_MULTI) && name_end) {
> + int len = name_end - attach_name + 1;
for multi-kprobe we decided to only support a single glob as a target
in the declarative SEC() definition. If a user needs more control, they
can always fall back to the programmatic bpf_program__attach_..._opts()
variant. Let's do the same here: a glob is good enough for declarative
use cases, and for complicated cases the programmatic API is the way to
go anyway. You'll avoid unnecessary complications like this one then.
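For reference, the existing multi-kprobe split looks roughly like this
(existing libbpf behavior, shown only as the pattern to mirror for
fentry.multi; the symbol names are illustrative):

/* BPF object side: a single glob in the declarative SEC() */
SEC("kprobe.multi/tcp_*")
int kmulti(struct pt_regs *ctx)
{
	return 0;
}

/* loader side: an explicit symbol list via the _opts() attach */
static struct bpf_link *attach_explicit(struct bpf_program *prog)
{
	const char *syms[] = { "tcp_close", "tcp_shutdown" };
	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts,
		.syms = syms,
		.cnt = 2,
	);

	return bpf_program__attach_kprobe_multi_opts(prog, NULL, &opts);
}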
BTW, it's not trivial to figure this out from earlier patches, but
does the BPF verifier need to know all these BTF type IDs at program
verification time? If yes, why, and then why do we need to specify them
during LINK_CREATE time? And if not, then great, and we don't need to
parse all this during load time.
> + char *first_tgt;
> +
> + first_tgt = malloc(len);
> + if (!first_tgt)
> + return -ENOMEM;
> + libbpf_strlcpy(first_tgt, attach_name, len);
> + first_tgt[len - 1] = '\0';
> + err = libbpf_find_attach_btf_id(prog, first_tgt, &btf_obj_fd,
> + &btf_type_id);
> + free(first_tgt);
> + } else {
> + err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd,
> + &btf_type_id);
> + }
> +
> if (err)
> return err;
>
> @@ -9519,6 +9544,7 @@ static int attach_kprobe_session(const struct bpf_program *prog, long cookie, st
> static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> +static int attach_trace_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
>
> static const struct bpf_sec_def section_defs[] = {
> SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE),
> @@ -9565,6 +9591,13 @@ static const struct bpf_sec_def section_defs[] = {
> SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
> SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
> SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
> + SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace),
duplicate
> + SEC_DEF("fentry.multi+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
> + SEC_DEF("fmod_ret.multi+", TRACING, BPF_MODIFY_RETURN_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
> + SEC_DEF("fexit.multi+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
> + SEC_DEF("fentry.multi.s+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
> + SEC_DEF("fmod_ret.multi.s+", TRACING, BPF_MODIFY_RETURN_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
> + SEC_DEF("fexit.multi.s+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
> SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace),
> SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
> SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
> @@ -12799,6 +12832,135 @@ static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_
> return libbpf_get_error(*link);
> }
>
[...]
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 15/18] libbpf: add skip_invalid and attach_tracing for tracing_multi
2025-07-03 12:15 ` [PATCH bpf-next v2 15/18] libbpf: add skip_invalid and attach_tracing for tracing_multi Menglong Dong
@ 2025-07-14 22:07 ` Andrii Nakryiko
2025-07-15 5:48 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Andrii Nakryiko @ 2025-07-14 22:07 UTC (permalink / raw)
To: Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On Thu, Jul 3, 2025 at 5:23 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> We add skip_invalid and attach_tracing for tracing_multi for the
> selftests.
>
> When we try to attach all the functions in available_filter_functions with
> tracing_multi, we can't tell if the target symbol can be attached
> successfully, and the attaching will fail. When skip_invalid is set to
> true, we will check if it can be attached in libbpf, and skip the invalid
> entries.
>
> We will skip the symbols in the following cases:
>
> 1. the btf type not exist
> 2. the btf type is not a function proto
> 3. the function args count more that 6
> 4. the return type is struct or union
> 5. any function args is struct or union
>
> The 5th rule can wrongly exclude some valid targets, but it's ok for the testing.
>
> "attach_tracing" is used to convert a TRACING prog to TRACING_MULTI. For
> example, we can set the attach type to FENTRY_MULTI before we load the
> skel. And we can attach the prog with
> bpf_program__attach_trace_multi_opts() with "attach_tracing=1". The libbpf
> will attach the target btf type of the prog automatically. This is also
> used to reuse the selftests of tracing.
>
> (Oh my goodness! What am I doing?)
exactly...
Let's think if we need any of that, as in: take a step back, and try
to explain why you think any of this should be part of libbpf's UAPI.
>
> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> ---
> tools/lib/bpf/libbpf.c | 97 ++++++++++++++++++++++++++++++++++++------
> tools/lib/bpf/libbpf.h | 6 ++-
> 2 files changed, 89 insertions(+), 14 deletions(-)
>
[...]
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args
2025-07-14 22:07 ` Andrii Nakryiko
@ 2025-07-14 23:45 ` Menglong Dong
2025-07-15 17:11 ` Andrii Nakryiko
0 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-14 23:45 UTC (permalink / raw)
To: Andrii Nakryiko, Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
John Fastabend, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo,
Simon Horman, linux-kernel, netdev
On 2025/7/15 06:07, Andrii Nakryiko wrote:
> On Thu, Jul 3, 2025 at 5:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>> In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
>> which is used to record the accessed index of the function args in
>> btf_ctx_access().
> Do we need to bother giving access to arguments through direct ctx[i]
> access for these multi-fentry/fexit programs? We have
> bpf_get_func_arg_cnt() and bpf_get_func_arg() which can be used to get
> any given argument at runtime.
Hi Andrii. This commit is not for that purpose. We record all the accessed
args in bpf_prog_aux->accessed_args. And when we attach the tracing-multi
prog to the kernel functions, we check that the accessed arguments are
consistent across all the target functions.
The bpf_prog_aux->accessed_args will be used in
https://lore.kernel.org/bpf/20250703121521.1874196-12-dongml2@chinatelecom.cn/
in bpf_tracing_check_multi() to do such checking.
With such checking, the target functions don't need to have
the same prototype, which makes tracing-multi more flexible.
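To illustrate with an example that is not from the series (the two
targets and the section string are just placeholders):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* void tcp_close(struct sock *sk, long timeout);
 * void tcp_shutdown(struct sock *sk, int how);
 *
 * The prototypes differ, but a prog that only accesses arg 0 sees
 * "struct sock *" at index 0 in both targets, so the accessed-args
 * check can accept the pair. A prog that also reads arg 1 would be
 * rejected, because the types at index 1 differ (long vs int).
 */
SEC("fentry.multi/tcp_close,tcp_shutdown")
int BPF_PROG(touch_sk, struct sock *sk)
{
	bpf_printk("sk=%p", sk);
	return 0;
}

char _license[] SEC("license") = "GPL";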
Thanks!
Menglong Dong
>
>> Meanwhile, we add the function btf_check_func_part_match() to compare the
>> accessed function args of two function prototype. This function will be
>> used in the following commit.
>>
>> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
>> ---
>> include/linux/bpf.h | 4 ++
>> include/linux/btf.h | 3 +-
>> kernel/bpf/btf.c | 108 +++++++++++++++++++++++++++++++++++++++++-
>> net/sched/bpf_qdisc.c | 2 +-
>> 4 files changed, 113 insertions(+), 4 deletions(-)
>>
> [...]
>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 12/18] libbpf: don't free btf if tracing_multi progs existing
2025-07-14 22:07 ` Andrii Nakryiko
@ 2025-07-15 1:15 ` Menglong Dong
0 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-15 1:15 UTC (permalink / raw)
To: Andrii Nakryiko, Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On 7/15/25 06:07, Andrii Nakryiko wrote:
> On Thu, Jul 3, 2025 at 5:21 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>> By default, the kernel btf that we load during loading program will be
>> freed after the programs are loaded in bpf_object_load(). However, we
>> still need to use these btf for tracing of multi-link during attaching.
>> Therefore, we don't free the btfs until the bpf object is closed if any
>> bpf programs of the type multi-link tracing exist.
>>
>> Meanwhile, introduce the new api bpf_object__free_btf() to manually free
>> the btfs after attaching.
>>
>> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
>> ---
>> tools/lib/bpf/libbpf.c | 24 +++++++++++++++++++++++-
>> tools/lib/bpf/libbpf.h | 2 ++
>> tools/lib/bpf/libbpf.map | 1 +
>> 3 files changed, 26 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index aee36402f0a3..530c29f2f5fc 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -8583,6 +8583,28 @@ static void bpf_object_post_load_cleanup(struct bpf_object *obj)
>> obj->btf_vmlinux = NULL;
>> }
>>
>> +void bpf_object__free_btfs(struct bpf_object *obj)
> let's not add this as a new API. We'll keep BTF fds open, if
> necessary, but not (yet) give user full control of when those FDs will
> be closed, I'm not convinced yet we need that much user control over
> this
Okay! I previously thought that this would take up a certain amount of
memory, but it seems I was overthinking :/
I'll remove this API in the next version.
Thanks!
Menglong Dong
>
>
>> +{
>> + if (!obj->btf_vmlinux || obj->state != OBJ_LOADED)
>> + return;
>> +
>> + bpf_object_post_load_cleanup(obj);
>> +}
>> +
>> +static void bpf_object_early_free_btf(struct bpf_object *obj)
>> +{
>> + struct bpf_program *prog;
>> +
>> + bpf_object__for_each_program(prog, obj) {
>> + if (prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI ||
>> + prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI ||
>> + prog->expected_attach_type == BPF_MODIFY_RETURN_MULTI)
>> + return;
>> + }
>> +
>> + bpf_object_post_load_cleanup(obj);
>> +}
>> +
>> static int bpf_object_prepare(struct bpf_object *obj, const char *target_btf_path)
>> {
>> int err;
>> @@ -8654,7 +8676,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
>> err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps);
>> }
>>
>> - bpf_object_post_load_cleanup(obj);
>> + bpf_object_early_free_btf(obj);
>> obj->state = OBJ_LOADED; /* doesn't matter if successfully or not */
>>
>> if (err) {
>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>> index d1cf813a057b..7cc810aa7967 100644
>> --- a/tools/lib/bpf/libbpf.h
>> +++ b/tools/lib/bpf/libbpf.h
>> @@ -323,6 +323,8 @@ LIBBPF_API struct bpf_program *
>> bpf_object__find_program_by_name(const struct bpf_object *obj,
>> const char *name);
>>
>> +LIBBPF_API void bpf_object__free_btfs(struct bpf_object *obj);
>> +
>> LIBBPF_API int
>> libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
>> enum bpf_attach_type *expected_attach_type);
>> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
>> index c7fc0bde5648..4a0c993221a5 100644
>> --- a/tools/lib/bpf/libbpf.map
>> +++ b/tools/lib/bpf/libbpf.map
>> @@ -444,4 +444,5 @@ LIBBPF_1.6.0 {
>> bpf_program__line_info_cnt;
>> btf__add_decl_attr;
>> btf__add_type_attr;
>> + bpf_object__free_btfs;
>> } LIBBPF_1.5.0;
>> --
>> 2.39.5
>>
>>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi
2025-07-03 12:15 ` [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi Menglong Dong
2025-07-04 16:07 ` kernel test robot
@ 2025-07-15 1:55 ` Alexei Starovoitov
2025-07-15 2:37 ` Menglong Dong
1 sibling, 1 reply; 73+ messages in thread
From: Alexei Starovoitov @ 2025-07-15 1:55 UTC (permalink / raw)
To: Menglong Dong
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, LKML
On Thu, Jul 3, 2025 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> We don't use rhashtable here, as the compiler is not clever enough and it
> refused to inline the hash lookup for me, which bring in addition overhead
> in the following BPF global trampoline.
That's not good enough justification.
rhashtable is used in many performance-critical components.
You need to figure out what was causing the compiler not to inline the
lookup in your case.
Did you make sure that the params are constant, as I suggested earlier?
If 'static inline' wasn't enough, have you tried always_inline?
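For reference, the pattern being referred to looks roughly like this
(a sketch only; the struct layout and names are made up, not taken from
the patch):

#include <linux/rhashtable.h>

struct kfunc_md_rht {			/* hypothetical rhashtable variant */
	struct rhash_head node;
	unsigned long func;
	/* ... the rest of the metadata ... */
};

/* A compile-time-constant params struct is what lets the (static inline)
 * rhashtable_lookup_fast() collapse into the caller.
 */
static const struct rhashtable_params kfunc_md_rht_params = {
	.key_len	= sizeof(unsigned long),
	.key_offset	= offsetof(struct kfunc_md_rht, func),
	.head_offset	= offsetof(struct kfunc_md_rht, node),
	.automatic_shrinking = true,
};

static __always_inline struct kfunc_md_rht *
kfunc_md_rht_lookup(struct rhashtable *ht, unsigned long ip)
{
	return rhashtable_lookup_fast(ht, &ip, kfunc_md_rht_params);
}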
> The release of the metadata is controlled by the percpu ref and RCU
> together, and have similar logic to the release of bpf trampoline image in
> bpf_tramp_image_put().
tbh the locking complexity in this patch is through the roof.
rcu, rcu_tasks, rcu_task_trace, percpu_ref, ...
all of that looks questionable.
kfunc_mds looks to be rcu protected, but md-s are percpu_ref.
Why? There were choices made that I don't understand the reasons for.
I don't think we should start an in-depth review of a rhashtable wannabe
when rhashtable should just work.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 13/18] libbpf: support tracing_multi
2025-07-14 22:07 ` Andrii Nakryiko
@ 2025-07-15 1:58 ` Menglong Dong
2025-07-15 17:20 ` Andrii Nakryiko
0 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-15 1:58 UTC (permalink / raw)
To: Andrii Nakryiko, Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On 7/15/25 06:07, Andrii Nakryiko wrote:
> On Thu, Jul 3, 2025 at 5:24 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>> Add supporting for the attach types of:
>>
>> BPF_TRACE_FENTRY_MULTI
>> BPF_TRACE_FEXIT_MULTI
>> BPF_MODIFY_RETURN_MULTI
>>
>> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
>> ---
>> tools/bpf/bpftool/common.c | 3 +
>> tools/lib/bpf/bpf.c | 10 +++
>> tools/lib/bpf/bpf.h | 6 ++
>> tools/lib/bpf/libbpf.c | 168 ++++++++++++++++++++++++++++++++++++-
>> tools/lib/bpf/libbpf.h | 19 +++++
>> tools/lib/bpf/libbpf.map | 1 +
>> 6 files changed, 204 insertions(+), 3 deletions(-)
>>
> [...]
>
>> diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
>> index 1342564214c8..5c97acec643d 100644
>> --- a/tools/lib/bpf/bpf.h
>> +++ b/tools/lib/bpf/bpf.h
>> @@ -422,6 +422,12 @@ struct bpf_link_create_opts {
>> struct {
>> __u64 cookie;
>> } tracing;
>> + struct {
>> + __u32 cnt;
>> + const __u32 *btf_ids;
>> + const __u32 *tgt_fds;
> tgt_fds are always BTF FDs, right? Do we intend to support
> freplace-style multi attachment at all? If not, I'd name them btf_fds,
> and btf_ids -> btf_type_ids (because BTF ID can also refer to kernel
> ID of BTF object, so ambiguous and somewhat confusing)
For now, freplace is not supported, and I'm not sure if we will support
it in the future.
I think there should be no need to attach freplace programs in large
numbers, so we don't need to support multi attachment for it in the future.
Yeah, I'll follow your advice in the next version.
>
>> + const __u64 *cookies;
>> + } tracing_multi;
>> struct {
>> __u32 pf;
>> __u32 hooknum;
>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
>> index 530c29f2f5fc..ae38b3ab84c7 100644
>> --- a/tools/lib/bpf/libbpf.c
>> +++ b/tools/lib/bpf/libbpf.c
>> @@ -136,6 +136,9 @@ static const char * const attach_type_name[] = {
>> [BPF_NETKIT_PEER] = "netkit_peer",
>> [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session",
>> [BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session",
>> + [BPF_TRACE_FENTRY_MULTI] = "trace_fentry_multi",
>> + [BPF_TRACE_FEXIT_MULTI] = "trace_fexit_multi",
>> + [BPF_MODIFY_RETURN_MULTI] = "modify_return_multi",
>> };
>>
>> static const char * const link_type_name[] = {
>> @@ -410,6 +413,8 @@ enum sec_def_flags {
>> SEC_XDP_FRAGS = 16,
>> /* Setup proper attach type for usdt probes. */
>> SEC_USDT = 32,
>> + /* attachment target is multi-link */
>> + SEC_ATTACH_BTF_MULTI = 64,
>> };
>>
>> struct bpf_sec_def {
>> @@ -7419,9 +7424,9 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
>> opts->expected_attach_type = BPF_TRACE_UPROBE_MULTI;
>> }
>>
>> - if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) {
>> + if ((def & (SEC_ATTACH_BTF | SEC_ATTACH_BTF_MULTI)) && !prog->attach_btf_id) {
>> int btf_obj_fd = 0, btf_type_id = 0, err;
>> - const char *attach_name;
>> + const char *attach_name, *name_end;
>>
>> attach_name = strchr(prog->sec_name, '/');
>> if (!attach_name) {
>> @@ -7440,7 +7445,27 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
>> }
>> attach_name++; /* skip over / */
>>
>> - err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd, &btf_type_id);
>> + name_end = strchr(attach_name, ',');
>> + /* for multi-link tracing, use the first target symbol during
>> + * loading.
>> + */
>> + if ((def & SEC_ATTACH_BTF_MULTI) && name_end) {
>> + int len = name_end - attach_name + 1;
> for multi-kprobe we decided to only support a single glob as a target
> in declarative SEC() definition. If a user needs more control, they
> can always fallback to the programmatic bpf_program__attach_..._opts()
> variant. Let's do the same here, glob is good enough for declarative
> use cases, and for complicated cases programmatic is the way to go
> anyways. You'll avoid unnecessary complications like this one then.
In fact, this is to make the BPF code in the selftests simple. With such
control, I can test different combinations of the target functions easily,
just like this:
SEC("fentry.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_13")
int BPF_PROG2(fentry_success_test1, struct bpf_testmod_struct_arg_2, a)
{
	test_result = a.a + a.b;
	return 0;
}
SEC("fentry.multi/bpf_testmod_test_struct_arg_2,bpf_testmod_test_struct_arg_10")
int BPF_PROG2(fentry_success_test2, int, a, struct bpf_testmod_struct_arg_2, b)
{
	test_result = a + b.a + b.b;
	return 0;
}
And you are right, we should design it for the users, and a single glob is
much better. Instead, I'll implement the combination tests in the
loader with bpf_program__attach_trace_multi_opts().
>
> BTW, it's not trivial to figure this out from earlier patches, but
> does BPF verifier need to know all these BTF type IDs during program
> verification time? If yes, why and then why do we need to specify them
> during LINK_CREATE time. And if not, then great, and we don't need to
> parse all this during load time.
It doesn't need to know all the BTF type IDs, but it needs to know one
of them (the first one), which means that we still need to do the parsing
during load time.
Of course, we can split it:
step 1: parse the glob and get the first BTF type ID during load time
step 2: parse the glob and get all the BTF type IDs during attachment
But it will make the code a little more complex. Should I do it this way?
I'd appreciate some advice here :/
>
>> + char *first_tgt;
>> +
>> + first_tgt = malloc(len);
>> + if (!first_tgt)
>> + return -ENOMEM;
>> + libbpf_strlcpy(first_tgt, attach_name, len);
>> + first_tgt[len - 1] = '\0';
>> + err = libbpf_find_attach_btf_id(prog, first_tgt, &btf_obj_fd,
>> + &btf_type_id);
>> + free(first_tgt);
>> + } else {
>> + err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd,
>> + &btf_type_id);
>> + }
>> +
>> if (err)
>> return err;
>>
>> @@ -9519,6 +9544,7 @@ static int attach_kprobe_session(const struct bpf_program *prog, long cookie, st
>> static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
>> static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
>> static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);
>> +static int attach_trace_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
>>
>> static const struct bpf_sec_def section_defs[] = {
>> SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE),
>> @@ -9565,6 +9591,13 @@ static const struct bpf_sec_def section_defs[] = {
>> SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
>> SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
>> SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
>> + SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace),
> duplicate
Got it :/
Thanks!
Menglong Dong
>
>
>> + SEC_DEF("fentry.multi+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
>> + SEC_DEF("fmod_ret.multi+", TRACING, BPF_MODIFY_RETURN_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
>> + SEC_DEF("fexit.multi+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
>> + SEC_DEF("fentry.multi.s+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
>> + SEC_DEF("fmod_ret.multi.s+", TRACING, BPF_MODIFY_RETURN_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
>> + SEC_DEF("fexit.multi.s+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
>> SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace),
>> SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
>> SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
>> @@ -12799,6 +12832,135 @@ static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_
>> return libbpf_get_error(*link);
>> }
>>
> [...]
>
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-03 12:15 ` [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline Menglong Dong
@ 2025-07-15 2:25 ` Alexei Starovoitov
2025-07-15 8:36 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Alexei Starovoitov @ 2025-07-15 2:25 UTC (permalink / raw)
To: Menglong Dong
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, H. Peter Anvin,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On Thu, Jul 3, 2025 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> +static __always_inline void
> +do_origin_call(unsigned long *args, unsigned long *ip, int nr_args)
> +{
> + /* Following code will be optimized by the compiler, as nr_args
> + * is a const, and there will be no condition here.
> + */
> + if (nr_args == 0) {
> + asm volatile(
> + RESTORE_ORIGIN_0 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + :
> + );
> + } else if (nr_args == 1) {
> + asm volatile(
> + RESTORE_ORIGIN_1 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi"
> + );
> + } else if (nr_args == 2) {
> + asm volatile(
> + RESTORE_ORIGIN_2 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi", "rsi"
> + );
> + } else if (nr_args == 3) {
> + asm volatile(
> + RESTORE_ORIGIN_3 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi", "rsi", "rdx"
> + );
> + } else if (nr_args == 4) {
> + asm volatile(
> + RESTORE_ORIGIN_4 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi", "rsi", "rdx", "rcx"
> + );
> + } else if (nr_args == 5) {
> + asm volatile(
> + RESTORE_ORIGIN_5 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi", "rsi", "rdx", "rcx", "r8"
> + );
> + } else if (nr_args == 6) {
> + asm volatile(
> + RESTORE_ORIGIN_6 CALL_NOSPEC "\n"
> + "movq %%rax, %0\n"
> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> + : [args]"r"(args), [thunk_target]"r"(*ip)
> + : "rdi", "rsi", "rdx", "rcx", "r8", "r9"
> + );
> + }
> +}
What is the performance difference between 0-6 variants?
I would think save/restore of regs shouldn't be that expensive.
bpf trampoline saves only what's necessary because it can do
this micro optimization, but for this one, I think, doing
_one_ global trampoline that covers all cases will simplify the code
a lot, but please benchmark the difference to understand
the trade-off.
The major simplification will be due to skipping nr_args.
There won't be a need to do btf model and count the args.
Just do one trampoline for them all.
Also funcs with 7+ arguments need to be thought through
from the start.
I think it's an ok trade-off if we allow the global trampoline
to be safe to attach to a function with 7+ args (and
it will not mess with the stack), but the bpf prog can only
access up to 6 args. The kfuncs to access arg 7 might be
more complex and slower. It's an ok trade-off.
> +
> +static __always_inline notrace void
> +run_tramp_prog(struct kfunc_md_tramp_prog *tramp_prog,
> + struct bpf_tramp_run_ctx *run_ctx, unsigned long *args)
> +{
> + struct bpf_prog *prog;
> + u64 start_time;
> +
> + while (tramp_prog) {
> + prog = tramp_prog->prog;
> + run_ctx->bpf_cookie = tramp_prog->cookie;
> + start_time = bpf_gtramp_enter(prog, run_ctx);
> +
> + if (likely(start_time)) {
> + asm volatile(
> + CALL_NOSPEC "\n"
> + : : [thunk_target]"r"(prog->bpf_func), [args]"D"(args)
> + );
Why this cannot be "call *(prog->bpf_func)" ?
> + }
> +
> + bpf_gtramp_exit(prog, start_time, run_ctx);
> + tramp_prog = tramp_prog->next;
> + }
> +}
> +
> +static __always_inline notrace int
> +bpf_global_caller_run(unsigned long *args, unsigned long *ip, int nr_args)
Pls share top 10 from "perf report" while running the bench.
I'm curious about what's hot.
Last time I benchmarked fentry/fexit, migrate_disable/enable were
among the hottest functions. I suspect it's the case here as well.
* Re: [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi
2025-07-15 1:55 ` Alexei Starovoitov
@ 2025-07-15 2:37 ` Menglong Dong
2025-07-15 2:49 ` Alexei Starovoitov
0 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-15 2:37 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, LKML
On Tue, Jul 15, 2025 at 9:55 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Jul 3, 2025 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > We don't use rhashtable here, as the compiler is not clever enough and it
> > refused to inline the hash lookup for me, which bring in addition overhead
> > in the following BPF global trampoline.
>
> That's not good enough justification.
> rhashtable is used in many performance critical components.
> You need to figure out what was causing compiler not to inline lookup
> in your case.
> Did you make sure that params are constant as I suggested earlier?
> If 'static inline' wasn't enough, have you tried always_inline ?
Yeah, I'm sure all the params are constant. always_inline works, but I have
to replace the "inline" with "__always_inline" for rhashtable_lookup_fast,
rhashtable_lookup, __rhashtable_lookup, rht_key_get_hash, etc. After that,
everything will be inlined.
In fact, I think rhashtable is not good enough in our case, which
has high performance requirements. With rhashtable, it takes 35 insns
to finish the hash lookup. With the hash table here, it needs only
17 insns, which means the rhashtable introduces ~5% overhead.
BTW, the function-padding-based metadata needs only 5 insns, which
would decrease the overhead by another ~5%.
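For context, an rhashtable-based version would look roughly like this
(illustrative only, needs <linux/rhashtable.h>; the kfunc_md in this
series embeds an hlist_node, so a real conversion would use a
struct rhash_head instead):

struct kfunc_md_rht {			/* illustrative only */
	struct rhash_head node;
	unsigned long func;
	/* ... rest of the metadata ... */
};

static const struct rhashtable_params kfunc_md_rht_params = {
	.key_len	= sizeof(unsigned long),
	.key_offset	= offsetof(struct kfunc_md_rht, func),
	.head_offset	= offsetof(struct kfunc_md_rht, node),
	.automatic_shrinking = true,
};

static __always_inline struct kfunc_md_rht *
kfunc_md_rht_lookup(struct rhashtable *ht, unsigned long ip)
{
	/* params is a compile-time constant here, so the indirect
	 * ht->p.hashfn path should be compiled out and the whole
	 * lookup is expected to be inlined into the caller
	 */
	return rhashtable_lookup_fast(ht, &ip, kfunc_md_rht_params);
}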
>
> > The release of the metadata is controlled by the percpu ref and RCU
> > together, and have similar logic to the release of bpf trampoline image in
> > bpf_tramp_image_put().
>
> tbh the locking complexity in this patch is through the roof.
> rcu, rcu_tasks, rcu_task_trace, percpu_ref, ...
> all that look questionable.
> kfunc_mds looks to be rcu protected, but md-s are percpu_ref.
> Why? There were choices made that I don't understand the reasons for.
> I don't think we should start in depth review of rhashtable-wanne-be
> when rhashtable should just work.
In fact, all this locking is not for the mds, but for the md. The mds are
protected with RCU only; what is complex is the md. So the logic for the
hash table that we introduced is quite simple: we allocate a new struct
kfunc_md_array, copy the old one into it, assign it to kfunc_mds with
rcu_assign_pointer(), and free the old one. That's all.
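Roughly, the grow path is just the following (kfunc_md_array_grow() and
kfunc_md_lock are illustrative names, not the actual patch code, and the
re-hash loop is elided):

static int kfunc_md_array_grow(void)
{
	struct kfunc_md_array *new_mds, *old_mds;

	old_mds = rcu_dereference_protected(kfunc_mds,
					    lockdep_is_held(&kfunc_md_lock));
	new_mds = kzalloc(struct_size(new_mds, mds,
				      1UL << (old_mds->hash_bits + 1)),
			  GFP_KERNEL);
	if (!new_mds)
		return -ENOMEM;

	new_mds->hash_bits = old_mds->hash_bits + 1;
	/* ... re-hash every md from old_mds->mds[] into new_mds->mds[] ... */

	rcu_assign_pointer(kfunc_mds, new_mds);
	/* readers of the array itself only need plain RCU */
	kfree_rcu(old_mds, rcu);
	return 0;
}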
We need to protect the md the same way we protect the trampoline image,
as it is used in the global trampoline from the beginning to the end.
The rcu_tasks, rcu_tasks_trace and percpu_ref are used for that purpose.
It's complex, but it is really the same as what we do in
bpf_tramp_image_put(). You wrote that part, and I think you understand me :/
For the fexit/modify_return case, percpu_ref will be used in the global
trampoline to protect the md, just like what we do with __bpf_tramp_enter().
When releasing, we will kill the percpu_ref first. Then, we use
rcu_tasks_trace to ensure the rest of the bpf global trampoline is finished.
Maybe I should split this part (the release of the md) into the next patch
(the bpf global trampoline) to make it easier to understand?
Thanks!
Menglong Dong
* Re: [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi
2025-07-15 2:37 ` Menglong Dong
@ 2025-07-15 2:49 ` Alexei Starovoitov
2025-07-15 3:13 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Alexei Starovoitov @ 2025-07-15 2:49 UTC (permalink / raw)
To: Menglong Dong
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, LKML
On Mon, Jul 14, 2025 at 7:38 PM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> On Tue, Jul 15, 2025 at 9:55 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Thu, Jul 3, 2025 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > >
> > > We don't use rhashtable here, as the compiler is not clever enough and it
> > > refused to inline the hash lookup for me, which bring in addition overhead
> > > in the following BPF global trampoline.
> >
> > That's not good enough justification.
> > rhashtable is used in many performance critical components.
> > You need to figure out what was causing compiler not to inline lookup
> > in your case.
> > Did you make sure that params are constant as I suggested earlier?
> > If 'static inline' wasn't enough, have you tried always_inline ?
>
> Yeah, I'm sure all the params are constant. always_inline works, but I have
> to replace the "inline" with "__always_inline" for rhashtable_lookup_fast,
> rhashtable_lookup, __rhashtable_lookup, rht_key_get_hash, etc. After that,
> everything will be inlined.
That doesn't sound right.
When everything is always_inline the compiler can inline the callback hashfn.
Without always_inline, do you see ht->p.hashfn in the assembly?
If so, the compiler is taking this path:
if (!__builtin_constant_p(params.key_len))
hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
which is back to const params.
> In fact, I think rhashtable is not good enough in our case, which
> has high performance requirements. With rhashtable, the insn count
> is 35 to finish the hash lookup. With the hash table here, it needs only
> 17 insn, which means the rhashtable introduces ~5% overhead.
I feel you're not using rhashtable correctly.
Try disasm of xdp_unreg_mem_model().
The inlined lookup is quite small.
> We need to protect the md the same as how we protect the trampoline image,
> as it is used in the global trampoline from the beginning to the ending.
> The rcu_tasks, rcu_task_trace, percpu_ref is used for that purpose. It's
> complex, but it is really the same as what we do in bpf_tramp_image_put().
> You wrote that part, and I think you understand me :/
Sounds like you copied it without understanding :(
bpf trampoline is dynamic. It can go away and all the complexity
because of that. global trampoline is global.
It never gets freed. It doesn't need any of bpf trampoline complexity.
* Re: [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi
2025-07-15 2:49 ` Alexei Starovoitov
@ 2025-07-15 3:13 ` Menglong Dong
2025-07-15 9:06 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-15 3:13 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, LKML
On Tue, Jul 15, 2025 at 10:49 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Jul 14, 2025 at 7:38 PM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > On Tue, Jul 15, 2025 at 9:55 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Thu, Jul 3, 2025 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > > >
> > > > We don't use rhashtable here, as the compiler is not clever enough and it
> > > > refused to inline the hash lookup for me, which bring in addition overhead
> > > > in the following BPF global trampoline.
> > >
> > > That's not good enough justification.
> > > rhashtable is used in many performance critical components.
> > > You need to figure out what was causing compiler not to inline lookup
> > > in your case.
> > > Did you make sure that params are constant as I suggested earlier?
> > > If 'static inline' wasn't enough, have you tried always_inline ?
> >
> > Yeah, I'm sure all the params are constant. always_inline works, but I have
> > to replace the "inline" with "__always_inline" for rhashtable_lookup_fast,
> > rhashtable_lookup, __rhashtable_lookup, rht_key_get_hash, etc. After that,
> > everything will be inlined.
>
> That doesn't sound right.
> When everything is always_inline the compiler can inline the callback hashfn.
> Without always inline do use see ht->p.hashfn in the assembly?
> If so, the compiler is taking this path:
> if (!__builtin_constant_p(params.key_len))
> hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
>
> which is back to const params.
I think the compiler considers the bpf_global_caller complex enough and
refuses to inline the lookup for me, so a call to __rhashtable_lookup() happens.
When I add always_inline to __rhashtable_lookup(), the compiler makes
a call to rht_key_get_hash(), which is annoying. And I'm sure params.key_len
is const, and the function call is not for ht->p.hashfn.
>
> > In fact, I think rhashtable is not good enough in our case, which
> > has high performance requirements. With rhashtable, the insn count
> > is 35 to finish the hash lookup. With the hash table here, it needs only
> > 17 insn, which means the rhashtable introduces ~5% overhead.
>
> I feel you're not using rhashtable correctly.
> Try disasm of xdp_unreg_mem_model().
> The inlined lookup is quite small.
Okay, I'll disasm it and have a look. In my case, it does consume 35 insns
when I disassemble it.
>
> > We need to protect the md the same as how we protect the trampoline image,
> > as it is used in the global trampoline from the beginning to the ending.
> > The rcu_tasks, rcu_task_trace, percpu_ref is used for that purpose. It's
> > complex, but it is really the same as what we do in bpf_tramp_image_put().
> > You wrote that part, and I think you understand me :/
>
> Sounds like you copied it without understanding :(
> bpf trampoline is dynamic. It can go away and all the complexity
> because of that. global trampoline is global.
> It never gets freed. It doesn't need any of bpf trampoline complexity.
Hi, I think you misunderstood something. I'm not protecting the
global trampoline, and it doesn't need to be protected. I'm protecting
the kfunc_md.
In the global trampoline, we will look up the kfunc_md of the current ip
and use it. And it will be used from the beginning to the end of the
global trampoline, so we need to protect it.
But we can see that the lifetime of the kfunc_md is exactly the same as
that of the bpf trampoline image. So we can protect it in the same way,
which prevents it from being freed while it is still in use in the global
trampoline.
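Concretely, the release ordering I mean is roughly this (function names
are illustrative, not the actual patch code; the pcref release callback
is the one passed to percpu_ref_init()):

static void kfunc_md_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct kfunc_md, rcu));
}

/* percpu_ref release callback: runs once every in-flight user of the
 * global trampoline that grabbed a reference has dropped it
 */
static void kfunc_md_pcref_release(struct percpu_ref *pcref)
{
	struct kfunc_md *md = container_of(pcref, struct kfunc_md, pcref);

	/* wait for tasks that are still between the ref drop and the
	 * final return of the trampoline before freeing the memory
	 */
	call_rcu_tasks_trace(&md->rcu, kfunc_md_free_rcu);
}

static void kfunc_md_release(struct kfunc_md *md)
{
	/* switch to atomic mode and drop the initial reference; the
	 * release callback above fires when the count hits zero
	 */
	percpu_ref_kill(&md->pcref);
}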
* Re: [PATCH bpf-next v2 14/18] libbpf: add btf type hash lookup support
2025-07-14 22:07 ` Andrii Nakryiko
@ 2025-07-15 4:40 ` Menglong Dong
2025-07-15 17:20 ` Andrii Nakryiko
0 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-15 4:40 UTC (permalink / raw)
To: Andrii Nakryiko, Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On 7/15/25 06:07, Andrii Nakryiko wrote:
> On Thu, Jul 3, 2025 at 5:22 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>> For now, the libbpf find the btf type id by loop all the btf types and
>> compare its name, which is inefficient if we have many functions to
>> lookup.
>>
>> We add the "use_hash" to the function args of find_kernel_btf_id() to
>> indicate if we should lookup the btf type id by hash. The hash table will
>> be initialized if it has not yet.
> Or we could build hashtable-based index outside of struct btf for a
> specific use case, because there is no one perfect hashtable-based
> indexing that can be done generically (e.g., just by name, or
> name+kind, or kind+name, or some more complicated lookup key) and
> cover all potential use cases. I'd prefer not to get into a problem of
> defining and building indexes and leave it to callers (even if the
> caller is other part of libbpf itself).
I think that works. We can define a global hash table in libbpf.c
and add all the btf types to it. I'll redesign this part and make it
separate from the btf.
Thanks!
Menglong Dong
>> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
>> ---
>> tools/lib/bpf/btf.c | 102 +++++++++++++++++++++++++++++++++++++++
>> tools/lib/bpf/btf.h | 6 +++
>> tools/lib/bpf/libbpf.c | 37 +++++++++++---
>> tools/lib/bpf/libbpf.map | 3 ++
>> 4 files changed, 140 insertions(+), 8 deletions(-)
>>
> [...]
>
* Re: [PATCH bpf-next v2 15/18] libbpf: add skip_invalid and attach_tracing for tracing_multi
2025-07-14 22:07 ` Andrii Nakryiko
@ 2025-07-15 5:48 ` Menglong Dong
2025-07-15 17:23 ` Andrii Nakryiko
0 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-15 5:48 UTC (permalink / raw)
To: Andrii Nakryiko, Menglong Dong
Cc: alexei.starovoitov, rostedt, jolsa, bpf, Menglong Dong,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On 7/15/25 06:07, Andrii Nakryiko wrote:
> On Thu, Jul 3, 2025 at 5:23 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>> We add skip_invalid and attach_tracing for tracing_multi for the
>> selftests.
>>
>> When we try to attach all the functions in available_filter_functions with
>> tracing_multi, we can't tell if the target symbol can be attached
>> successfully, and the attaching will fail. When skip_invalid is set to
>> true, we will check if it can be attached in libbpf, and skip the invalid
>> entries.
>>
>> We will skip the symbols in the following cases:
>>
>> 1. the btf type not exist
>> 2. the btf type is not a function proto
>> 3. the function args count more that 6
>> 4. the return type is struct or union
>> 5. any function args is struct or union
>>
>> The 5th rule can be a manslaughter, but it's ok for the testings.
>>
>> "attach_tracing" is used to convert a TRACING prog to TRACING_MULTI. For
>> example, we can set the attach type to FENTRY_MULTI before we load the
>> skel. And we can attach the prog with
>> bpf_program__attach_trace_multi_opts() with "attach_tracing=1". The libbpf
>> will attach the target btf type of the prog automatically. This is also
>> used to reuse the selftests of tracing.
>>
>> (Oh my goodness! What am I doing?)
> exactly...
>
> Let's think if we need any of that, as in: take a step back, and try
> to explain why you think any of this should be part of libbpf's UAPI.
I know it's weird. The "attach_tracing" is used for the selftests, and I can
use something else instead. But the "skip_invalid" is something that we
need.
For example, we have a function list, which contains 1000 kernel functions,
and we want to attach fentry-multi to them. However, we don't know which
of them can't be attached, so the attachment will fail. And we need a way to
skip the functions that can't be attached to make the attachment succeed.
This should be a common use case. And let me do more research to see if
we can do such filtering outside of libbpf.
Thanks!
Menglong Dong
>
>> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
>> ---
>> tools/lib/bpf/libbpf.c | 97 ++++++++++++++++++++++++++++++++++++------
>> tools/lib/bpf/libbpf.h | 6 ++-
>> 2 files changed, 89 insertions(+), 14 deletions(-)
>>
> [...]
>
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 2:25 ` Alexei Starovoitov
@ 2025-07-15 8:36 ` Menglong Dong
2025-07-15 9:30 ` Menglong Dong
2025-07-15 16:35 ` Alexei Starovoitov
0 siblings, 2 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-15 8:36 UTC (permalink / raw)
To: Alexei Starovoitov, Menglong Dong
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, H. Peter Anvin,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On 7/15/25 10:25, Alexei Starovoitov wrote:
> On Thu, Jul 3, 2025 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>> +static __always_inline void
>> +do_origin_call(unsigned long *args, unsigned long *ip, int nr_args)
>> +{
>> + /* Following code will be optimized by the compiler, as nr_args
>> + * is a const, and there will be no condition here.
>> + */
>> + if (nr_args == 0) {
>> + asm volatile(
>> + RESTORE_ORIGIN_0 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + :
>> + );
>> + } else if (nr_args == 1) {
>> + asm volatile(
>> + RESTORE_ORIGIN_1 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi"
>> + );
>> + } else if (nr_args == 2) {
>> + asm volatile(
>> + RESTORE_ORIGIN_2 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi", "rsi"
>> + );
>> + } else if (nr_args == 3) {
>> + asm volatile(
>> + RESTORE_ORIGIN_3 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi", "rsi", "rdx"
>> + );
>> + } else if (nr_args == 4) {
>> + asm volatile(
>> + RESTORE_ORIGIN_4 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi", "rsi", "rdx", "rcx"
>> + );
>> + } else if (nr_args == 5) {
>> + asm volatile(
>> + RESTORE_ORIGIN_5 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi", "rsi", "rdx", "rcx", "r8"
>> + );
>> + } else if (nr_args == 6) {
>> + asm volatile(
>> + RESTORE_ORIGIN_6 CALL_NOSPEC "\n"
>> + "movq %%rax, %0\n"
>> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
>> + : [args]"r"(args), [thunk_target]"r"(*ip)
>> + : "rdi", "rsi", "rdx", "rcx", "r8", "r9"
>> + );
>> + }
>> +}
> What is the performance difference between 0-6 variants?
> I would think save/restore of regs shouldn't be that expensive.
> bpf trampoline saves only what's necessary because it can do
> this micro optimization, but for this one, I think, doing
> _one_ global trampoline that covers all cases will simplify the code
> a lot, but please benchmark the difference to understand
> the trade-off.
According to my benchmark, saving/restoring the regs for the *5*-arg
variant adds ~5% overhead compared with the *0*-arg variant. The
save/restore of regs is fast, but it still needs 12 insns, which can
produce ~6% overhead. I think the performance is more important and we
should keep this logic.
Should we? If you think do_origin_call() is not simple enough, we can
restore all 6 regs from the stack directly for the origin call, which
won't introduce too much overhead, and keep the save/restore logic as
it is.
What do you think?
>
> The major simplification will be due to skipping nr_args.
> There won't be a need to do btf model and count the args.
> Just do one trampoline for them all.
>
> Also funcs with 7+ arguments need to be thought through
> from the start.
In the current version, the attachment will be rejected if any function
has 7+ arguments.
> I think it's ok trade-off if we allow global trampoline
> to be safe to attach to a function with 7+ args (and
> it will not mess with the stack), but bpf prog can only
> access up to 6 args. The kfuncs to access arg 7 might be
> more complex and slower. It's ok trade off.
It's OK for fentry-multi, but we can't allow fexit-multi and
modify_return-multi to be attached to functions with 7+ args, as we need
to do the origin call, and we can't restore the arguments on the stack
for the origin call for now.
So for fentry-multi we can allow functions with 7+ args to be attached as
long as the accessed arguments are all in regs. And I think we need one
more patch to do the "all accessed arguments are in regs" checking, so
maybe we can put it in the next series, as the current series is already
a little complex :/
Anyway, I'll have a try to see if we can add this part in this series :)
>
>> +
>> +static __always_inline notrace void
>> +run_tramp_prog(struct kfunc_md_tramp_prog *tramp_prog,
>> + struct bpf_tramp_run_ctx *run_ctx, unsigned long *args)
>> +{
>> + struct bpf_prog *prog;
>> + u64 start_time;
>> +
>> + while (tramp_prog) {
>> + prog = tramp_prog->prog;
>> + run_ctx->bpf_cookie = tramp_prog->cookie;
>> + start_time = bpf_gtramp_enter(prog, run_ctx);
>> +
>> + if (likely(start_time)) {
>> + asm volatile(
>> + CALL_NOSPEC "\n"
>> + : : [thunk_target]"r"(prog->bpf_func), [args]"D"(args)
>> + );
> Why this cannot be "call *(prog->bpf_func)" ?
Do you mean "prog->bpf_func(args, NULL);"? In my previous testing, this
caused bad performance, and I saw others do the indirect call in this way.
I just did the benchmark again, and it seems the performance is not
affected this way anymore.
So I think I can replace it with "prog->bpf_func(args, NULL);" in the
next version.
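i.e. the loop would simply become (same types and helpers as in this
patch; only the call site changes):

static __always_inline notrace void
run_tramp_prog(struct kfunc_md_tramp_prog *tramp_prog,
	       struct bpf_tramp_run_ctx *run_ctx, unsigned long *args)
{
	struct bpf_prog *prog;
	u64 start_time;

	while (tramp_prog) {
		prog = tramp_prog->prog;
		run_ctx->bpf_cookie = tramp_prog->cookie;
		start_time = bpf_gtramp_enter(prog, run_ctx);

		if (likely(start_time))
			prog->bpf_func(args, NULL);	/* plain indirect call */

		bpf_gtramp_exit(prog, start_time, run_ctx);
		tramp_prog = tramp_prog->next;
	}
}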
>
>> + }
>> +
>> + bpf_gtramp_exit(prog, start_time, run_ctx);
>> + tramp_prog = tramp_prog->next;
>> + }
>> +}
>> +
>> +static __always_inline notrace int
>> +bpf_global_caller_run(unsigned long *args, unsigned long *ip, int nr_args)
> Pls share top 10 from "perf report" while running the bench.
> I'm curious about what's hot.
> Last time I benchmarked fentry/fexit migrate_disable/enable were
> one the hottest functions. I suspect it's the case here as well.
You are right, the migrate_disable/enable are the hottest functions in
both bpf trampoline and global trampoline. Following is the perf top
for fentry-multi:
36.36% bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi [k]
bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi 20.54% [kernel] [k]
migrate_enable 19.35% [kernel] [k] bpf_global_caller_5_run 6.52%
[kernel] [k] bpf_global_caller_5 3.58% libc.so.6 [.] syscall 2.88%
[kernel] [k] entry_SYSCALL_64 1.50% [kernel] [k] memchr_inv 1.39%
[kernel] [k] fput 1.04% [kernel] [k] migrate_disable 0.91% [kernel] [k]
_copy_to_user
And I also did the testing for fentry:
54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k]
bpf_prog_2dcccf652aac1793_bench_trigger_fentry
10.43% [kernel] [k] migrate_enable
10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
8.06% [kernel] [k] __bpf_prog_exit_recur 4.11% libc.so.6 [.] syscall
2.15% [kernel] [k] entry_SYSCALL_64 1.48% [kernel] [k] memchr_inv 1.32%
[kernel] [k] fput 1.16% [kernel] [k] _copy_to_user 0.73% [kernel] [k]
bpf_prog_test_run_raw_tp
The migrate_enable/disable are used to do the recursive checking,
and I even wanted to perform recursive checks in the same way as
ftrace to eliminate this overhead :/
>
* Re: [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi
2025-07-15 3:13 ` Menglong Dong
@ 2025-07-15 9:06 ` Menglong Dong
2025-07-15 16:22 ` Alexei Starovoitov
0 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-15 9:06 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, LKML
On Tue, Jul 15, 2025 at 11:13 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> On Tue, Jul 15, 2025 at 10:49 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Jul 14, 2025 at 7:38 PM Menglong Dong <menglong8.dong@gmail.com> wrote:
[......]
> >
> > That doesn't sound right.
> > When everything is always_inline the compiler can inline the callback hashfn.
> > Without always inline do use see ht->p.hashfn in the assembly?
> > If so, the compiler is taking this path:
> > if (!__builtin_constant_p(params.key_len))
> > hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
> >
> > which is back to const params.
>
> I think the compiler thinks the bpf_global_caller is complex enough and
> refuses to inline it for me, and a call to __rhashtable_lookup() happens.
> When I add always_inline to __rhashtable_lookup(), the compiler makes
> a call to rht_key_get_hash(), which is annoying. And I'm sure the params.key_len
> is const, and the function call is not for the ht->p.hashfn.
>
> >
> > > In fact, I think rhashtable is not good enough in our case, which
> > > has high performance requirements. With rhashtable, the insn count
> > > is 35 to finish the hash lookup. With the hash table here, it needs only
> > > 17 insn, which means the rhashtable introduces ~5% overhead.
> >
> > I feel you're not using rhashtable correctly.
> > Try disasm of xdp_unreg_mem_model().
> > The inlined lookup is quite small.
>
> Okay, I'll disasm it and have a look. In my case, it does consume 35 insn
> after I disasm it.
You might not believe it, but the rhashtable lookup in my kernel is not
inlined in xdp_unreg_mem_model(). Following is the disasm:
disassemble xdp_unreg_mem_model
Dump of assembler code for function xdp_unreg_mem_model:
0xffffffff81e68760 <+0>: call 0xffffffff8127f9d0 <__fentry__>
0xffffffff81e68765 <+5>: push %rbx
0xffffffff81e68766 <+6>: sub $0x10,%rsp
[......]
/* we can see that the function call to __rhashtable_lookup happens
in this line. */
0xffffffff81e687ba <+90>: call 0xffffffff81e686c0 <__rhashtable_lookup>
0xffffffff81e687bf <+95>: test %rax,%rax
0xffffffff81e687c2 <+98>: je 0xffffffff81e687cb
<xdp_unreg_mem_model+107>
[......]
The gcc that I'm using is:
gcc --version
gcc (Debian 12.2.0-14+deb12u1) 12.2.0
I think there may be something wrong with the rhashtable, which needs some
fixing?
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 8:36 ` Menglong Dong
@ 2025-07-15 9:30 ` Menglong Dong
2025-07-16 16:56 ` Inlining migrate_disable/enable. Was: " Alexei Starovoitov
2025-07-15 16:35 ` Alexei Starovoitov
1 sibling, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-15 9:30 UTC (permalink / raw)
To: Alexei Starovoitov, Menglong Dong
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, H. Peter Anvin,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On 7/15/25 16:36, Menglong Dong wrote:
>
> On 7/15/25 10:25, Alexei Starovoitov wrote:
>> Pls share top 10 from "perf report" while running the bench.
>> I'm curious about what's hot.
>> Last time I benchmarked fentry/fexit migrate_disable/enable were
>> one the hottest functions. I suspect it's the case here as well.
>
>
> You are right, the migrate_disable/enable are the hottest functions in
> both bpf trampoline and global trampoline. Following is the perf top
> for fentry-multi:
> 36.36% bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi [k]
> bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi 20.54% [kernel]
> [k] migrate_enable 19.35% [kernel] [k] bpf_global_caller_5_run 6.52%
> [kernel] [k] bpf_global_caller_5 3.58% libc.so.6 [.] syscall 2.88%
> [kernel] [k] entry_SYSCALL_64 1.50% [kernel] [k] memchr_inv 1.39%
> [kernel] [k] fput 1.04% [kernel] [k] migrate_disable 0.91% [kernel]
> [k] _copy_to_user
>
> And I also did the testing for fentry:
>
> 54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k]
> bpf_prog_2dcccf652aac1793_bench_trigger_fentry
> 10.43% [kernel] [k] migrate_enable
> 10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
> 8.06% [kernel] [k] __bpf_prog_exit_recur 4.11% libc.so.6 [.] syscall
> 2.15% [kernel] [k] entry_SYSCALL_64 1.48% [kernel] [k] memchr_inv
> 1.32% [kernel] [k] fput 1.16% [kernel] [k] _copy_to_user 0.73%
> [kernel] [k] bpf_prog_test_run_raw_tp
> The migrate_enable/disable are used to do the recursive checking,
> and I even wanted to perform recursive checks in the same way as
> ftrace to eliminate this overhead :/
>
Sorry, I'm not familiar with Thunderbird yet, and the perf top output got
mangled. Following are the test results for fentry-multi:
36.36% bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi [k]
bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi
20.54% [kernel] [k] migrate_enable
19.35% [kernel] [k] bpf_global_caller_5_run
6.52% [kernel] [k] bpf_global_caller_5
3.58% libc.so.6 [.] syscall
2.88% [kernel] [k] entry_SYSCALL_64
1.50% [kernel] [k] memchr_inv
1.39% [kernel] [k] fput
1.04% [kernel] [k] migrate_disable
0.91% [kernel] [k] _copy_to_user
And I also did the testing for fentry:
54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k]
bpf_prog_2dcccf652aac1793_bench_trigger_fentry
10.43% [kernel] [k] migrate_enable
10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
8.06% [kernel] [k] __bpf_prog_exit_recur
4.11% libc.so.6 [.] syscall
2.15% [kernel] [k] entry_SYSCALL_64
1.48% [kernel] [k] memchr_inv
1.32% [kernel] [k] fput
1.16% [kernel] [k] _copy_to_user
0.73% [kernel] [k] bpf_prog_test_run_raw_tp
The migrate_enable/disable are used to do the recursive checking,
and I even wanted to perform recursive checks in the same way as
ftrace to eliminate this overhead :/
Thanks!
Menglong Dong
* Re: [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi
2025-07-15 9:06 ` Menglong Dong
@ 2025-07-15 16:22 ` Alexei Starovoitov
0 siblings, 0 replies; 73+ messages in thread
From: Alexei Starovoitov @ 2025-07-15 16:22 UTC (permalink / raw)
To: Menglong Dong
Cc: Steven Rostedt, Jiri Olsa, bpf, Menglong Dong, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, LKML
On Tue, Jul 15, 2025 at 2:07 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> On Tue, Jul 15, 2025 at 11:13 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > On Tue, Jul 15, 2025 at 10:49 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Mon, Jul 14, 2025 at 7:38 PM Menglong Dong <menglong8.dong@gmail.com> wrote:
> [......]
> > >
> > > That doesn't sound right.
> > > When everything is always_inline the compiler can inline the callback hashfn.
> > > Without always inline do use see ht->p.hashfn in the assembly?
> > > If so, the compiler is taking this path:
> > > if (!__builtin_constant_p(params.key_len))
> > > hash = ht->p.hashfn(key, ht->key_len, hash_rnd);
> > >
> > > which is back to const params.
> >
> > I think the compiler thinks the bpf_global_caller is complex enough and
> > refuses to inline it for me, and a call to __rhashtable_lookup() happens.
> > When I add always_inline to __rhashtable_lookup(), the compiler makes
> > a call to rht_key_get_hash(), which is annoying. And I'm sure the params.key_len
> > is const, and the function call is not for the ht->p.hashfn.
> >
> > >
> > > > In fact, I think rhashtable is not good enough in our case, which
> > > > has high performance requirements. With rhashtable, the insn count
> > > > is 35 to finish the hash lookup. With the hash table here, it needs only
> > > > 17 insn, which means the rhashtable introduces ~5% overhead.
> > >
> > > I feel you're not using rhashtable correctly.
> > > Try disasm of xdp_unreg_mem_model().
> > > The inlined lookup is quite small.
> >
> > Okay, I'll disasm it and have a look. In my case, it does consume 35 insn
> > after I disasm it.
>
> You might not believe it when I say this, the rhashtable lookup in my
> kernel is not inlined in xdp_unreg_mem_model(), and following is the
> disasm:
>
> disassemble xdp_unreg_mem_model
> Dump of assembler code for function xdp_unreg_mem_model:
> 0xffffffff81e68760 <+0>: call 0xffffffff8127f9d0 <__fentry__>
> 0xffffffff81e68765 <+5>: push %rbx
> 0xffffffff81e68766 <+6>: sub $0x10,%rsp
> [......]
>
> /* we can see that the function call to __rhashtable_lookup happens
> in this line. */
> 0xffffffff81e687ba <+90>: call 0xffffffff81e686c0 <__rhashtable_lookup>
> 0xffffffff81e687bf <+95>: test %rax,%rax
> 0xffffffff81e687c2 <+98>: je 0xffffffff81e687cb
> <xdp_unreg_mem_model+107>
> [......]
>
> The gcc that I'm using is:
> gcc --version
> gcc (Debian 12.2.0-14+deb12u1) 12.2.0
>
> I think there may be something wrong with the rhashtable, which needs some
> fixing?
Try multiple compilers.
gcc 12 is quite old.
Making software design decisions based on one specific compiler is just wrong.
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 8:36 ` Menglong Dong
2025-07-15 9:30 ` Menglong Dong
@ 2025-07-15 16:35 ` Alexei Starovoitov
2025-07-16 13:05 ` Menglong Dong
2025-07-16 14:40 ` Menglong Dong
1 sibling, 2 replies; 73+ messages in thread
From: Alexei Starovoitov @ 2025-07-15 16:35 UTC (permalink / raw)
To: Menglong Dong
Cc: Menglong Dong, Steven Rostedt, Jiri Olsa, bpf, Menglong Dong,
H. Peter Anvin, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, LKML, Network Development
On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
>
>
> On 7/15/25 10:25, Alexei Starovoitov wrote:
> > On Thu, Jul 3, 2025 at 5:17 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >> +static __always_inline void
> >> +do_origin_call(unsigned long *args, unsigned long *ip, int nr_args)
> >> +{
> >> + /* Following code will be optimized by the compiler, as nr_args
> >> + * is a const, and there will be no condition here.
> >> + */
> >> + if (nr_args == 0) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_0 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + :
> >> + );
> >> + } else if (nr_args == 1) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_1 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi"
> >> + );
> >> + } else if (nr_args == 2) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_2 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi", "rsi"
> >> + );
> >> + } else if (nr_args == 3) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_3 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi", "rsi", "rdx"
> >> + );
> >> + } else if (nr_args == 4) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_4 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi", "rsi", "rdx", "rcx"
> >> + );
> >> + } else if (nr_args == 5) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_5 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi", "rsi", "rdx", "rcx", "r8"
> >> + );
> >> + } else if (nr_args == 6) {
> >> + asm volatile(
> >> + RESTORE_ORIGIN_6 CALL_NOSPEC "\n"
> >> + "movq %%rax, %0\n"
> >> + : "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
> >> + : [args]"r"(args), [thunk_target]"r"(*ip)
> >> + : "rdi", "rsi", "rdx", "rcx", "r8", "r9"
> >> + );
> >> + }
> >> +}
> > What is the performance difference between 0-6 variants?
> > I would think save/restore of regs shouldn't be that expensive.
> > bpf trampoline saves only what's necessary because it can do
> > this micro optimization, but for this one, I think, doing
> > _one_ global trampoline that covers all cases will simplify the code
> > a lot, but please benchmark the difference to understand
> > the trade-off.
>
> According to my benchmark, it has ~5% overhead to save/restore
> *5* variants when compared with *0* variant. The save/restore of regs
> is fast, but it still need 12 insn, which can produce ~6% overhead.
I think it's an ok trade off, because with one global trampoline
we do not need to call rhashtable lookup before entering bpf prog.
bpf prog will do it on demand if/when it needs to access arguments.
This will compensate for a bit of lost performance due to extra save/restore.
PS
pls don't add your chinatelecom.cn email in cc.
gmail just cannot deliver there and it's annoying to keep deleting
it manually in every reply.
* Re: [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args
2025-07-14 23:45 ` Menglong Dong
@ 2025-07-15 17:11 ` Andrii Nakryiko
2025-07-16 12:50 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Andrii Nakryiko @ 2025-07-15 17:11 UTC (permalink / raw)
To: Menglong Dong
Cc: Menglong Dong, alexei.starovoitov, rostedt, jolsa, bpf,
Menglong Dong, John Fastabend, Martin KaFai Lau, Eduard Zingerman,
Song Liu, Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo,
Simon Horman, linux-kernel, netdev
On Mon, Jul 14, 2025 at 4:45 PM Menglong Dong <menglong.dong@linux.dev> wrote:
>
>
> On 2025/7/15 06:07, Andrii Nakryiko wrote:
> > On Thu, Jul 3, 2025 at 5:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >> In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> >> which is used to record the accessed index of the function args in
> >> btf_ctx_access().
> > Do we need to bother giving access to arguments through direct ctx[i]
> > access for these multi-fentry/fexit programs? We have
> > bpf_get_func_arg_cnt() and bpf_get_func_arg() which can be used to get
> > any given argument at runtime.
>
>
> Hi Andrii. This commit is not for that purpose. We remember all the accessed
> args to bpf_prog_aux->accessed_args. And when we attach the tracing-multi
> prog to the kernel functions, we will check if the accessed arguments are
> consistent between all the target functions.
>
> The bpf_prog_aux->accessed_args will be used in
> https://lore.kernel.org/bpf/20250703121521.1874196-12-dongml2@chinatelecom.cn/
>
> in bpf_tracing_check_multi() to do such checking.
>
> With such checking, the target functions don't need to have
> the same prototype, which makes tracing-multi more flexible.
Yeah, and my point is why even track this at the verifier level. If we
don't allow direct ctx[i] access and only access arguments through
bpf_get_func_arg(), we can check the actual number of arguments at runtime,
and if the program is trying to access something that's not there, we'll
just return an error code, so the user can handle this generically.
I'm just not sure if there is a need to do anything more than that.
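Something along these lines, as a sketch (the fentry.multi section name
and the glob are the ones proposed in this series; the helpers are the
existing bpf_get_func_arg/bpf_get_func_arg_cnt):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

SEC("fentry.multi/bpf_testmod_test_struct_arg_*")
int BPF_PROG(handle)
{
	u64 nr_args = bpf_get_func_arg_cnt(ctx);
	u64 arg0;

	if (nr_args < 1)
		return 0;
	if (bpf_get_func_arg(ctx, 0, &arg0))
		return 0;	/* not available, handled generically at runtime */

	/* use arg0 ... */
	return 0;
}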
>
> Thanks!
> Menglong Dong
>
>
> >
> >> Meanwhile, we add the function btf_check_func_part_match() to compare the
> >> accessed function args of two function prototype. This function will be
> >> used in the following commit.
> >>
> >> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> >> ---
> >> include/linux/bpf.h | 4 ++
> >> include/linux/btf.h | 3 +-
> >> kernel/bpf/btf.c | 108 +++++++++++++++++++++++++++++++++++++++++-
> >> net/sched/bpf_qdisc.c | 2 +-
> >> 4 files changed, 113 insertions(+), 4 deletions(-)
> >>
> > [...]
> >
* Re: [PATCH bpf-next v2 13/18] libbpf: support tracing_multi
2025-07-15 1:58 ` Menglong Dong
@ 2025-07-15 17:20 ` Andrii Nakryiko
2025-07-16 12:43 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Andrii Nakryiko @ 2025-07-15 17:20 UTC (permalink / raw)
To: Menglong Dong
Cc: Menglong Dong, alexei.starovoitov, rostedt, jolsa, bpf,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On Mon, Jul 14, 2025 at 6:59 PM Menglong Dong <menglong.dong@linux.dev> wrote:
>
>
> On 7/15/25 06:07, Andrii Nakryiko wrote:
> > On Thu, Jul 3, 2025 at 5:24 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >> Add supporting for the attach types of:
> >>
> >> BPF_TRACE_FENTRY_MULTI
> >> BPF_TRACE_FEXIT_MULTI
> >> BPF_MODIFY_RETURN_MULTI
> >>
> >> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> >> ---
> >> tools/bpf/bpftool/common.c | 3 +
> >> tools/lib/bpf/bpf.c | 10 +++
> >> tools/lib/bpf/bpf.h | 6 ++
> >> tools/lib/bpf/libbpf.c | 168 ++++++++++++++++++++++++++++++++++++-
> >> tools/lib/bpf/libbpf.h | 19 +++++
> >> tools/lib/bpf/libbpf.map | 1 +
> >> 6 files changed, 204 insertions(+), 3 deletions(-)
> >>
> > [...]
> >
> >> diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
> >> index 1342564214c8..5c97acec643d 100644
> >> --- a/tools/lib/bpf/bpf.h
> >> +++ b/tools/lib/bpf/bpf.h
> >> @@ -422,6 +422,12 @@ struct bpf_link_create_opts {
> >> struct {
> >> __u64 cookie;
> >> } tracing;
> >> + struct {
> >> + __u32 cnt;
> >> + const __u32 *btf_ids;
> >> + const __u32 *tgt_fds;
> > tgt_fds are always BTF FDs, right? Do we intend to support
> > freplace-style multi attachment at all? If not, I'd name them btf_fds,
> > and btf_ids -> btf_type_ids (because BTF ID can also refer to kernel
> > ID of BTF object, so ambiguous and somewhat confusing)
>
>
> For now, freplace is not supported. And I'm not sure if we will support
> it in the future.
>
>
> I think that there should be no need to use freplace in large quantities,
> so we don't need to support the multi attachment for it in the future.
>
>
> Yeah, I'll follow your advice in the next version.
>
great
>
> >
> >> + const __u64 *cookies;
> >> + } tracing_multi;
> >> struct {
> >> __u32 pf;
> >> __u32 hooknum;
> >> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> >> index 530c29f2f5fc..ae38b3ab84c7 100644
> >> --- a/tools/lib/bpf/libbpf.c
> >> +++ b/tools/lib/bpf/libbpf.c
> >> @@ -136,6 +136,9 @@ static const char * const attach_type_name[] = {
> >> [BPF_NETKIT_PEER] = "netkit_peer",
> >> [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session",
> >> [BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session",
> >> + [BPF_TRACE_FENTRY_MULTI] = "trace_fentry_multi",
> >> + [BPF_TRACE_FEXIT_MULTI] = "trace_fexit_multi",
> >> + [BPF_MODIFY_RETURN_MULTI] = "modify_return_multi",
> >> };
> >>
> >> static const char * const link_type_name[] = {
> >> @@ -410,6 +413,8 @@ enum sec_def_flags {
> >> SEC_XDP_FRAGS = 16,
> >> /* Setup proper attach type for usdt probes. */
> >> SEC_USDT = 32,
> >> + /* attachment target is multi-link */
> >> + SEC_ATTACH_BTF_MULTI = 64,
> >> };
> >>
> >> struct bpf_sec_def {
> >> @@ -7419,9 +7424,9 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
> >> opts->expected_attach_type = BPF_TRACE_UPROBE_MULTI;
> >> }
> >>
> >> - if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) {
> >> + if ((def & (SEC_ATTACH_BTF | SEC_ATTACH_BTF_MULTI)) && !prog->attach_btf_id) {
> >> int btf_obj_fd = 0, btf_type_id = 0, err;
> >> - const char *attach_name;
> >> + const char *attach_name, *name_end;
> >>
> >> attach_name = strchr(prog->sec_name, '/');
> >> if (!attach_name) {
> >> @@ -7440,7 +7445,27 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
> >> }
> >> attach_name++; /* skip over / */
> >>
> >> - err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd, &btf_type_id);
> >> + name_end = strchr(attach_name, ',');
> >> + /* for multi-link tracing, use the first target symbol during
> >> + * loading.
> >> + */
> >> + if ((def & SEC_ATTACH_BTF_MULTI) && name_end) {
> >> + int len = name_end - attach_name + 1;
> > for multi-kprobe we decided to only support a single glob as a target
> > in declarative SEC() definition. If a user needs more control, they
> > can always fallback to the programmatic bpf_program__attach_..._opts()
> > variant. Let's do the same here, glob is good enough for declarative
> > use cases, and for complicated cases programmatic is the way to go
> > anyways. You'll avoid unnecessary complications like this one then.
>
>
> In fact, this is to make the BPF code in the selftests simple. With such
>
> control, I can test different combination of the target functions easily,
>
> just like this:
>
>
> SEC("fentry.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_13")
> int BPF_PROG2(fentry_success_test1, struct bpf_testmod_struct_arg_2, a)
> {
> test_result = a.a + a.b;
> return 0;
> }
>
> SEC("fentry.multi/bpf_testmod_test_struct_arg_2,bpf_testmod_test_struct_arg_10")
> int BPF_PROG2(fentry_success_test2, int, a, struct
> bpf_testmod_struct_arg_2, b)
> {
> test_result = a + b.a + b.b;
> return 0;
> }
>
>
> And you are right, we should design it for the users, and a single glob is
>
> much better. Instead, I'll implement the combination testings in the
>
> loader with bpf_program__attach_trace_multi_opts().
>
sgtm. I'd also think about whether we can construct a glob that would
describe the functions you need (and if necessary rename testmod functions
slightly - so be it, it's all for testing anyways)
>
> >
> > BTW, it's not trivial to figure this out from earlier patches, but
> > does BPF verifier need to know all these BTF type IDs during program
> > verification time? If yes, why and then why do we need to specify them
> > during LINK_CREATE time. And if not, then great, and we don't need to
> > parse all this during load time.
>
>
> It doesn't need to know all the BTF type IDs, but it need to know one
>
> of them(the first one), which means that we still need to do the parse
>
> during load time.
>
>
> Of course, we can split it:
>
> step 1: parse the glob and get the first BTF type ID during load time
>
> step 2: parse the glob and get all the BTF type IDs during attachment
>
>
> But it will make the code a little more complex. Shoud I do it this way?
>
> I'd appreciate it to hear some advice here :/
I think I have a bit of disconnect here, because in my mind
multi-fentry/fexit cannot be type-aware, in general, at BPF
verification time. I.e., verifier should not assume any specific
prototype, and this gets back to my suggestion to just use
bpf_get_func_arg/cnt. While in some special cases you might want to
attach to a small number of functions that, say, have task_struct
argument and we can take a bit of advantage of this in BPF code by
verifier ensuring that all attached functions have that task_struct, I
do think this is unnecessary complication and limitation, and I'd
rather make multi-fentry/fexit not type-aware in the same way as
fentry/fexit is. With that, verifier won't need to know BTF ID, and so
multi-fentry will work very similarly to multi-kprobe, just will be
slightly cheaper at runtime.
And I'm saying all this, because even if all attached functions have
task_struct as that argument, you can achieve exactly that by just
doing `bpf_core_cast(bpf_get_func_arg(0), struct task_struct)`, and
that's all. So I'd simplify and make working with multi-fentry easier
for multi-function tracers (which is the challenging aspect with
fentry today). If you have 2-3-4-5 functions you are attaching to and
hoping to get that task_struct, you might as well just attach 2-3-4-5
times, get performance benefit, without really compromising much on
attachment time (because 5 attachments are plenty fast).
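I.e., roughly this in the program (sketch only; wake_up_process is just
an example target whose first argument happens to be a task_struct, and
bpf_core_cast is the existing libbpf macro from bpf_core_read.h):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

char LICENSE[] SEC("license") = "GPL";

SEC("fentry.multi/wake_up_process")
int BPF_PROG(trace_wakeup)
{
	struct task_struct *task;
	u64 arg0;

	if (bpf_get_func_arg(ctx, 0, &arg0))
		return 0;
	/* type-aware access without any verifier-level BTF knowledge */
	task = bpf_core_cast((void *)arg0, struct task_struct);
	bpf_printk("pid=%d", task->pid);
	return 0;
}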
>
>
> >
> >> + char *first_tgt;
> >> +
> >> + first_tgt = malloc(len);
> >> + if (!first_tgt)
> >> + return -ENOMEM;
> >> + libbpf_strlcpy(first_tgt, attach_name, len);
> >> + first_tgt[len - 1] = '\0';
> >> + err = libbpf_find_attach_btf_id(prog, first_tgt, &btf_obj_fd,
> >> + &btf_type_id);
> >> + free(first_tgt);
> >> + } else {
> >> + err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd,
> >> + &btf_type_id);
> >> + }
> >> +
> >> if (err)
> >> return err;
> >>
> >> @@ -9519,6 +9544,7 @@ static int attach_kprobe_session(const struct bpf_program *prog, long cookie, st
> >> static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> >> static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> >> static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> >> +static int attach_trace_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> >>
> >> static const struct bpf_sec_def section_defs[] = {
> >> SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE),
> >> @@ -9565,6 +9591,13 @@ static const struct bpf_sec_def section_defs[] = {
> >> SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
> >> SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
> >> SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
> >> + SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace),
> > duplicate
>
>
> Get it :/
>
>
> Thanks!
>
> Menglong Dong
>
>
> >
> >
> >> + SEC_DEF("fentry.multi+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
> >> + SEC_DEF("fmod_ret.multi+", TRACING, BPF_MODIFY_RETURN_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
> >> + SEC_DEF("fexit.multi+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
> >> + SEC_DEF("fentry.multi.s+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
> >> + SEC_DEF("fmod_ret.multi.s+", TRACING, BPF_MODIFY_RETURN_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
> >> + SEC_DEF("fexit.multi.s+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
> >> SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace),
> >> SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
> >> SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
> >> @@ -12799,6 +12832,135 @@ static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_
> >> return libbpf_get_error(*link);
> >> }
> >>
> > [...]
> >
* Re: [PATCH bpf-next v2 14/18] libbpf: add btf type hash lookup support
2025-07-15 4:40 ` Menglong Dong
@ 2025-07-15 17:20 ` Andrii Nakryiko
2025-07-16 11:53 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Andrii Nakryiko @ 2025-07-15 17:20 UTC (permalink / raw)
To: Menglong Dong
Cc: Menglong Dong, alexei.starovoitov, rostedt, jolsa, bpf,
Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, linux-kernel
On Mon, Jul 14, 2025 at 9:41 PM Menglong Dong <menglong.dong@linux.dev> wrote:
>
>
> On 7/15/25 06:07, Andrii Nakryiko wrote:
> > On Thu, Jul 3, 2025 at 5:22 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >> For now, the libbpf find the btf type id by loop all the btf types and
> >> compare its name, which is inefficient if we have many functions to
> >> lookup.
> >>
> >> We add the "use_hash" to the function args of find_kernel_btf_id() to
> >> indicate if we should lookup the btf type id by hash. The hash table will
> >> be initialized if it has not yet.
> > Or we could build hashtable-based index outside of struct btf for a
> > specific use case, because there is no one perfect hashtable-based
> > indexing that can be done generically (e.g., just by name, or
> > name+kind, or kind+name, or some more complicated lookup key) and
> > cover all potential use cases. I'd prefer not to get into a problem of
> > defining and building indexes and leave it to callers (even if the
> > caller is other part of libbpf itself).
>
>
> I think that works. We can define a global hash table in libbpf.c,
> and add all the btf type to it. I'll redesign this part, and make it
> separate with the btf.
No global things, please. It can be held per-bpf_object, or even
constructed on demand during attachment and then freed. No need for
anything global.
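For illustration only, a minimal sketch of such an on-demand index built
purely from public btf APIs and freed right after attachment; every name
below (struct func_idx_ent, build_func_idx, funcs_cmp) is made up for the
example and is not libbpf API:

#include <stdlib.h>
#include <string.h>
#include <bpf/btf.h>

struct func_idx_ent {
	const char *name;
	__u32 id;
};

static int funcs_cmp(const void *a, const void *b)
{
	const struct func_idx_ent *x = a, *y = b;

	return strcmp(x->name, y->name);
}

/* build a sorted (name, id) array over all BTF_KIND_FUNC types once;
 * each lookup is then a bsearch() instead of a full linear scan
 */
static struct func_idx_ent *build_func_idx(const struct btf *btf, size_t *cnt)
{
	__u32 n = btf__type_cnt(btf), i;
	struct func_idx_ent *idx;
	size_t used = 0;

	idx = calloc(n, sizeof(*idx));
	if (!idx)
		return NULL;
	for (i = 1; i < n; i++) {
		const struct btf_type *t = btf__type_by_id(btf, i);

		if (!btf_is_func(t))
			continue;
		idx[used].name = btf__name_by_offset(btf, t->name_off);
		idx[used].id = i;
		used++;
	}
	qsort(idx, used, sizeof(*idx), funcs_cmp);
	*cnt = used;
	return idx;
}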
>
> Thanks!
> Menglong Dong
>
> >> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> >> ---
> >> tools/lib/bpf/btf.c | 102 +++++++++++++++++++++++++++++++++++++++
> >> tools/lib/bpf/btf.h | 6 +++
> >> tools/lib/bpf/libbpf.c | 37 +++++++++++---
> >> tools/lib/bpf/libbpf.map | 3 ++
> >> 4 files changed, 140 insertions(+), 8 deletions(-)
> >>
> > [...]
> >
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 15/18] libbpf: add skip_invalid and attach_tracing for tracing_multi
2025-07-15 5:48 ` Menglong Dong
@ 2025-07-15 17:23 ` Andrii Nakryiko
2025-07-16 11:46 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Andrii Nakryiko @ 2025-07-15 17:23 UTC (permalink / raw)
To: Menglong Dong
Cc: Menglong Dong, alexei.starovoitov, rostedt, jolsa, bpf,
Menglong Dong, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, linux-kernel
On Mon, Jul 14, 2025 at 10:49 PM Menglong Dong <menglong.dong@linux.dev> wrote:
>
>
> On 7/15/25 06:07, Andrii Nakryiko wrote:
> > On Thu, Jul 3, 2025 at 5:23 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >> We add skip_invalid and attach_tracing for tracing_multi for the
> >> selftests.
> >>
> >> When we try to attach all the functions in available_filter_functions with
> >> tracing_multi, we can't tell if the target symbol can be attached
> >> successfully, and the attaching will fail. When skip_invalid is set to
> >> true, we will check if it can be attached in libbpf, and skip the invalid
> >> entries.
> >>
> >> We will skip the symbols in the following cases:
> >>
> >> 1. the btf type not exist
> >> 2. the btf type is not a function proto
> >> 3. the function args count more that 6
> >> 4. the return type is struct or union
> >> 5. any function args is struct or union
> >>
> >> The 5th rule can be a manslaughter, but it's ok for the testings.
> >>
> >> "attach_tracing" is used to convert a TRACING prog to TRACING_MULTI. For
> >> example, we can set the attach type to FENTRY_MULTI before we load the
> >> skel. And we can attach the prog with
> >> bpf_program__attach_trace_multi_opts() with "attach_tracing=1". The libbpf
> >> will attach the target btf type of the prog automatically. This is also
> >> used to reuse the selftests of tracing.
> >>
> >> (Oh my goodness! What am I doing?)
> > exactly...
> >
> > Let's think if we need any of that, as in: take a step back, and try
> > to explain why you think any of this should be part of libbpf's UAPI.
>
> I know it's weird. The "attach_tracing" is used for selftests, which I can
> use something else instead. But the "skip_invalid" is something that we
> need.
>
> For example, we have a function list, which contains 1000 kernel functions,
> and we want to attach fentry-multi to them. However, we don't know which
> of them can't be attached, so the attachment will fail. And we need a way to
> skip the functions that can't be attached to make the attachment success.
The right answer here is you need to know what's attachable and what's
not, instead of just ignoring attachment failures somewhere deep
inside libbpf API. Filter and check before you try to attach. There is
/sys/kernel/tracing/available_filter_functions and some similar
blacklist file, consult that, filter out stuff that's not attachable.
We won't be adding libbpf APIs just to make some selftests easier to
write by being sloppy.
>
> This should be a common use case. And let me do more research to see if
> we can do such filter out of the libbpf.
I have similar issues with retsnoop ([0]) and do just fine without
abusing libbpf API.
[0] https://github.com/anakryiko/retsnoop/blob/master/src/mass_attacher.c#L749
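For completeness, a rough sketch of the kind of pre-filtering meant here.
A real tool would load the file once into a hash set (as retsnoop does);
sym_is_attachable() is just an illustrative name and rescans the file for
simplicity:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool sym_is_attachable(const char *sym)
{
	char line[512], name[512];
	bool found = false;
	FILE *f;

	f = fopen("/sys/kernel/tracing/available_filter_functions", "r");
	if (!f)
		return false;
	while (fgets(line, sizeof(line), f)) {
		/* lines look like "func_name [module]"; compare the symbol only */
		if (sscanf(line, "%511s", name) == 1 && !strcmp(name, sym)) {
			found = true;
			break;
		}
	}
	fclose(f);
	return found;
}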
>
> Thanks!
> Menglong Dong
>
>
> >
> >> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> >> ---
> >> tools/lib/bpf/libbpf.c | 97 ++++++++++++++++++++++++++++++++++++------
> >> tools/lib/bpf/libbpf.h | 6 ++-
> >> 2 files changed, 89 insertions(+), 14 deletions(-)
> >>
> > [...]
> >
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 15/18] libbpf: add skip_invalid and attach_tracing for tracing_multi
2025-07-15 17:23 ` Andrii Nakryiko
@ 2025-07-16 11:46 ` Menglong Dong
0 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-16 11:46 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Menglong Dong, alexei.starovoitov, rostedt, jolsa, bpf,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On Wednesday, July 16, 2025 1:23 AM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
[......]
>
> The right answer here is you need to know what's attachable and what's
> not, instead of just ignoring attachment failures somewhere deep
> inside libbpf API. Filter and check before you try to attach. There is
> /sys/kernel/tracing/available_filter_functions and some similar
> blacklist file, consult that, filter out stuff that's not attachable.
>
> We won't be adding libbpf APIs just to make some selftests easier to
> write by being sloppy.
>
> >
> > This should be a common use case. And let me do more research to see if
> > we can do such filter out of the libbpf.
>
> I have similar issues with retsnoop ([0]) and do just fine without
> abusing libbpf API.
>
> [0] https://github.com/anakryiko/retsnoop/blob/master/src/mass_attacher.c#L749
Thank you for the reference, and I think it will work to do such
filtering in the selftests.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 14/18] libbpf: add btf type hash lookup support
2025-07-15 17:20 ` Andrii Nakryiko
@ 2025-07-16 11:53 ` Menglong Dong
0 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-16 11:53 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Menglong Dong, alexei.starovoitov, rostedt, jolsa, bpf,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On Wednesday, July 16, 2025 1:20 AM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> On Mon, Jul 14, 2025 at 9:41 PM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> >
> > On 7/15/25 06:07, Andrii Nakryiko wrote:
> > > On Thu, Jul 3, 2025 at 5:22 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > >> For now, the libbpf find the btf type id by loop all the btf types and
> > >> compare its name, which is inefficient if we have many functions to
> > >> lookup.
> > >>
> > >> We add the "use_hash" to the function args of find_kernel_btf_id() to
> > >> indicate if we should lookup the btf type id by hash. The hash table will
> > >> be initialized if it has not yet.
> > > Or we could build hashtable-based index outside of struct btf for a
> > > specific use case, because there is no one perfect hashtable-based
> > > indexing that can be done generically (e.g., just by name, or
> > > name+kind, or kind+name, or some more complicated lookup key) and
> > > cover all potential use cases. I'd prefer not to get into a problem of
> > > defining and building indexes and leave it to callers (even if the
> > > caller is other part of libbpf itself).
> >
> >
> > I think that works. We can define a global hash table in libbpf.c,
> > and add all the btf type to it. I'll redesign this part, and make it
> > separate with the btf.
>
> No global things, please. It can be held per-bpf_object, or even
> constructed on demand during attachment and then freed. No need for
> anything global.
Okay, holding it per-bpf_object is a good idea, and I'll try to
implement it that way.
Thanks!
Menglong Dong
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 13/18] libbpf: support tracing_multi
2025-07-15 17:20 ` Andrii Nakryiko
@ 2025-07-16 12:43 ` Menglong Dong
0 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-16 12:43 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Menglong Dong, alexei.starovoitov, rostedt, jolsa, bpf,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo,
linux-kernel
On Wednesday, July 16, 2025 1:20 AM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> On Mon, Jul 14, 2025 at 6:59 PM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> >
> > On 7/15/25 06:07, Andrii Nakryiko wrote:
> > > On Thu, Jul 3, 2025 at 5:24 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > >> Add supporting for the attach types of:
> > >>
> > >> BPF_TRACE_FENTRY_MULTI
> > >> BPF_TRACE_FEXIT_MULTI
> > >> BPF_MODIFY_RETURN_MULTI
> > >>
> > >> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> > >> ---
> > >> tools/bpf/bpftool/common.c | 3 +
> > >> tools/lib/bpf/bpf.c | 10 +++
> > >> tools/lib/bpf/bpf.h | 6 ++
> > >> tools/lib/bpf/libbpf.c | 168 ++++++++++++++++++++++++++++++++++++-
> > >> tools/lib/bpf/libbpf.h | 19 +++++
> > >> tools/lib/bpf/libbpf.map | 1 +
> > >> 6 files changed, 204 insertions(+), 3 deletions(-)
> > >>
> > > [...]
> > >
> > >> diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
> > >> index 1342564214c8..5c97acec643d 100644
> > >> --- a/tools/lib/bpf/bpf.h
> > >> +++ b/tools/lib/bpf/bpf.h
> > >> @@ -422,6 +422,12 @@ struct bpf_link_create_opts {
> > >> struct {
> > >> __u64 cookie;
> > >> } tracing;
> > >> + struct {
> > >> + __u32 cnt;
> > >> + const __u32 *btf_ids;
> > >> + const __u32 *tgt_fds;
> > > tgt_fds are always BTF FDs, right? Do we intend to support
> > > freplace-style multi attachment at all? If not, I'd name them btf_fds,
> > > and btf_ids -> btf_type_ids (because BTF ID can also refer to kernel
> > > ID of BTF object, so ambiguous and somewhat confusing)
> >
> >
> > For now, freplace is not supported. And I'm not sure if we will support
> >
> > it in the feature.
> >
> >
> > I think that there should be no need to use freplace in large quantities,
> >
> > so we don't need to support the multi attachment for it in the feature.
> >
> >
> > Yeah, I'll follow your advice in the next version.
> >
>
> great
>
> >
> > >
> > >> + const __u64 *cookies;
> > >> + } tracing_multi;
> > >> struct {
> > >> __u32 pf;
> > >> __u32 hooknum;
> > >> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> > >> index 530c29f2f5fc..ae38b3ab84c7 100644
> > >> --- a/tools/lib/bpf/libbpf.c
> > >> +++ b/tools/lib/bpf/libbpf.c
> > >> @@ -136,6 +136,9 @@ static const char * const attach_type_name[] = {
> > >> [BPF_NETKIT_PEER] = "netkit_peer",
> > >> [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session",
> > >> [BPF_TRACE_UPROBE_SESSION] = "trace_uprobe_session",
> > >> + [BPF_TRACE_FENTRY_MULTI] = "trace_fentry_multi",
> > >> + [BPF_TRACE_FEXIT_MULTI] = "trace_fexit_multi",
> > >> + [BPF_MODIFY_RETURN_MULTI] = "modify_return_multi",
> > >> };
> > >>
> > >> static const char * const link_type_name[] = {
> > >> @@ -410,6 +413,8 @@ enum sec_def_flags {
> > >> SEC_XDP_FRAGS = 16,
> > >> /* Setup proper attach type for usdt probes. */
> > >> SEC_USDT = 32,
> > >> + /* attachment target is multi-link */
> > >> + SEC_ATTACH_BTF_MULTI = 64,
> > >> };
> > >>
> > >> struct bpf_sec_def {
> > >> @@ -7419,9 +7424,9 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
> > >> opts->expected_attach_type = BPF_TRACE_UPROBE_MULTI;
> > >> }
> > >>
> > >> - if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) {
> > >> + if ((def & (SEC_ATTACH_BTF | SEC_ATTACH_BTF_MULTI)) && !prog->attach_btf_id) {
> > >> int btf_obj_fd = 0, btf_type_id = 0, err;
> > >> - const char *attach_name;
> > >> + const char *attach_name, *name_end;
> > >>
> > >> attach_name = strchr(prog->sec_name, '/');
> > >> if (!attach_name) {
> > >> @@ -7440,7 +7445,27 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog,
> > >> }
> > >> attach_name++; /* skip over / */
> > >>
> > >> - err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd, &btf_type_id);
> > >> + name_end = strchr(attach_name, ',');
> > >> + /* for multi-link tracing, use the first target symbol during
> > >> + * loading.
> > >> + */
> > >> + if ((def & SEC_ATTACH_BTF_MULTI) && name_end) {
> > >> + int len = name_end - attach_name + 1;
> > > for multi-kprobe we decided to only support a single glob as a target
> > > in declarative SEC() definition. If a user needs more control, they
> > > can always fallback to the programmatic bpf_program__attach_..._opts()
> > > variant. Let's do the same here, glob is good enough for declarative
> > > use cases, and for complicated cases programmatic is the way to go
> > > anyways. You'll avoid unnecessary complications like this one then.
> >
> >
> > In fact, this is to make the BPF code in the selftests simple. With such
> >
> > control, I can test different combination of the target functions easily,
> >
> > just like this:
> >
> >
> > SEC("fentry.multi/bpf_testmod_test_struct_arg_1,bpf_testmod_test_struct_arg_13")
> > int BPF_PROG2(fentry_success_test1, struct bpf_testmod_struct_arg_2, a)
> > {
> > test_result = a.a + a.b;
> > return 0;
> > }
> >
> > SEC("fentry.multi/bpf_testmod_test_struct_arg_2,bpf_testmod_test_struct_arg_10")
> > int BPF_PROG2(fentry_success_test2, int, a, struct
> > bpf_testmod_struct_arg_2, b)
> > {
> > test_result = a + b.a + b.b;
> > return 0;
> > }
> >
> >
> > And you are right, we should design it for the users, and a single glob is
> >
> > much better. Instead, I'll implement the combination testings in the
> >
> > loader with bpf_program__attach_trace_multi_opts().
> >
>
> sgtm. I'd also think if we can construct a glob that would describe
> functions you need (and if necessary to rename testmod functions
> slightly - so be it, it's all for testing anyways)
That works if I define all the functions that I need in the testmod.
However, most of the functions in the tests reuse existing kernel
functions, so it's a little complex to change them :/
>
> >
> > >
> > > BTW, it's not trivial to figure this out from earlier patches, but
> > > does BPF verifier need to know all these BTF type IDs during program
> > > verification time? If yes, why and then why do we need to specify them
> > > during LINK_CREATE time. And if not, then great, and we don't need to
> > > parse all this during load time.
> >
> >
> > It doesn't need to know all the BTF type IDs, but it need to know one
> >
> > of them(the first one), which means that we still need to do the parse
> >
> > during load time.
> >
> >
> > Of course, we can split it:
> >
> > step 1: parse the glob and get the first BTF type ID during load time
> >
> > step 2: parse the glob and get all the BTF type IDs during attachment
> >
> >
> > But it will make the code a little more complex. Shoud I do it this way?
> >
> > I'd appreciate it to hear some advice here :/
>
> I think I have a bit of disconnect here, because in my mind
> multi-fentry/fexit cannot be type-aware, in general, at BPF
> verification time. I.e., verifier should not assume any specific
> prototype, and this gets back to my suggestion to just use
> bpf_get_func_arg/cnt. While in some special cases you might want to
> attach to a small number of functions that, say, have task_struct
> argument and we can take a bit of advantage of this in BPF code by
> verifier ensuring that all attached functions have that task_struct, I
> do think this is unnecessary complication and limitation, and I'd
> rather make multi-fentry/fexit not type-aware in the same way as
> fentry/fexit is. With that, verifier won't need to know BTF ID, and so
> multi-fentry will work very similarly to multi-kprobe, just will be
> slightly cheaper at runtime.
I see your idea now: it frees us from the function prototype checking,
and we don't need to do any consistency checking during attachment.
In my original design, I tried to make fentry-multi easy to use and
keep the same usage as fentry.
So the only shortcoming of the method you describe is that the user
can't access the function arguments with ctx[x] directly, and
bpf_core_cast() needs to be used. Considering the use case, I think that's
OK. After all, the common use case is that we attach the bpf prog to all
the functions that have a "task_struct" argument and store the argument
index in the cookie, then get the task_struct with
`bpf_core_cast(bpf_get_func_arg(cookie), struct task_struct)`.
I'll implement this part that way, which removes 100+ lines of code :/
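For concreteness, a minimal sketch of that pattern on the BPF side. The
fentry.multi section name is the one from this series, the cookie holding
the argument index is just the convention described above, and note that
the real bpf_get_func_arg() takes the ctx plus an output pointer rather
than the cookie directly:

/* SPDX-License-Identifier: GPL-2.0 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

char _license[] SEC("license") = "GPL";

__u32 last_pid;

SEC("fentry.multi/...")	/* "..." stands for the target list/glob */
int BPF_PROG(untyped_task_tracer)
{
	/* the cookie set at attach time holds the task_struct arg index */
	__u64 idx = bpf_get_attach_cookie(ctx);
	struct task_struct *task;
	__u64 raw = 0;

	if (bpf_get_func_arg(ctx, idx, &raw))
		return 0;	/* the attached function has fewer args */
	task = bpf_core_cast((void *)raw, struct task_struct);
	last_pid = BPF_CORE_READ(task, pid);
	return 0;
}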
Thanks!
Menglong Dong
>
> And I'm saying all this, because even if all attached functions have
> task_struct as that argument, you can achieve exactly that by just
> doing `bpf_core_cast(bpf_get_func_arg(0), struct task_struct)`, and
> that's all. So I'd simplify and make working with multi-fentry easier
> for multi-function tracers (which is the challenging aspect with
> fentry today). If you have 2-3-4-5 functions you are attaching to and
> hoping to get that task_struct, you might as well just attach 2-3-4-5
> times, get performance benefit, without really compromising much on
> attachment time (because 5 attachments are plenty fast).
>
> >
> >
> > >
> > >> + char *first_tgt;
> > >> +
> > >> + first_tgt = malloc(len);
> > >> + if (!first_tgt)
> > >> + return -ENOMEM;
> > >> + libbpf_strlcpy(first_tgt, attach_name, len);
> > >> + first_tgt[len - 1] = '\0';
> > >> + err = libbpf_find_attach_btf_id(prog, first_tgt, &btf_obj_fd,
> > >> + &btf_type_id);
> > >> + free(first_tgt);
> > >> + } else {
> > >> + err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd,
> > >> + &btf_type_id);
> > >> + }
> > >> +
> > >> if (err)
> > >> return err;
> > >>
> > >> @@ -9519,6 +9544,7 @@ static int attach_kprobe_session(const struct bpf_program *prog, long cookie, st
> > >> static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> > >> static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> > >> static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> > >> +static int attach_trace_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link);
> > >>
> > >> static const struct bpf_sec_def section_defs[] = {
> > >> SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE),
> > >> @@ -9565,6 +9591,13 @@ static const struct bpf_sec_def section_defs[] = {
> > >> SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
> > >> SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
> > >> SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace),
> > >> + SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace),
> > > duplicate
> >
> >
> > Get it :/
> >
> >
> > Thanks!
> >
> > Menglong Dong
> >
> >
> > >
> > >
> > >> + SEC_DEF("fentry.multi+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
> > >> + SEC_DEF("fmod_ret.multi+", TRACING, BPF_MODIFY_RETURN_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
> > >> + SEC_DEF("fexit.multi+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_ATTACH_BTF_MULTI, attach_trace_multi),
> > >> + SEC_DEF("fentry.multi.s+", TRACING, BPF_TRACE_FENTRY_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
> > >> + SEC_DEF("fmod_ret.multi.s+", TRACING, BPF_MODIFY_RETURN_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
> > >> + SEC_DEF("fexit.multi.s+", TRACING, BPF_TRACE_FEXIT_MULTI, SEC_ATTACH_BTF_MULTI | SEC_SLEEPABLE, attach_trace_multi),
> > >> SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace),
> > >> SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
> > >> SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
> > >> @@ -12799,6 +12832,135 @@ static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_
> > >> return libbpf_get_error(*link);
> > >> }
> > >>
> > > [...]
> > >
>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args
2025-07-15 17:11 ` Andrii Nakryiko
@ 2025-07-16 12:50 ` Menglong Dong
0 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-16 12:50 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Menglong Dong, alexei.starovoitov, rostedt, jolsa, bpf,
John Fastabend, Martin KaFai Lau, Eduard Zingerman, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo,
Simon Horman, linux-kernel, netdev
On Wednesday, July 16, 2025 1:11 AM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> On Mon, Jul 14, 2025 at 4:45 PM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> >
> > On 2025/7/15 06:07, Andrii Nakryiko wrote:
> > > On Thu, Jul 3, 2025 at 5:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > >> In this commit, we add the 'accessed_args' field to struct bpf_prog_aux,
> > >> which is used to record the accessed index of the function args in
> > >> btf_ctx_access().
> > > Do we need to bother giving access to arguments through direct ctx[i]
> > > access for these multi-fentry/fexit programs? We have
> > > bpf_get_func_arg_cnt() and bpf_get_func_arg() which can be used to get
> > > any given argument at runtime.
> >
> >
> > Hi Andrii. This commit is not for that purpose. We remember all the accessed
> > args to bpf_prog_aux->accessed_args. And when we attach the tracing-multi
> > prog to the kernel functions, we will check if the accessed arguments are
> > consistent between all the target functions.
> >
> > The bpf_prog_aux->accessed_args will be used in
> > https://lore.kernel.org/bpf/20250703121521.1874196-12-dongml2@chinatelecom.cn/
> >
> > in bpf_tracing_check_multi() to do such checking.
> >
> > With such checking, the target functions don't need to have
> > the same prototype, which makes tracing-multi more flexible.
>
> Yeah, and my point is why even track this at verifier level. If we
> don't allow direct ctx[i] access and only access arguments through
> bpf_get_func_arg(), we can check actual number of arguments at runtime
> and if program is trying to access something that's not there, we'll
> just return error code, so user can handle this generically.
>
> I'm just not sure if there is a need to do anything more than that.
This commit is for direct ctx[i] access, and we can use
bpf_core_cast() instead, as you said in
https://lore.kernel.org/bpf/CADxym3Zrqb6MxoV6mg4ioQMCiR+Cden9tmD5YHj8DtRFjn14HA@mail.gmail.com/T/#m7daa262d423c0e8bb1c7033e51099ef06180d2c5
which means that we don't need this commit any more.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 16:35 ` Alexei Starovoitov
@ 2025-07-16 13:05 ` Menglong Dong
2025-07-17 0:59 ` multi-fentry proposal. Was: " Alexei Starovoitov
2025-07-16 14:40 ` Menglong Dong
1 sibling, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-16 13:05 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Menglong Dong, Steven Rostedt, Jiri Olsa, bpf, H. Peter Anvin,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On Wednesday, July 16, 2025 12:35 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> >
> > On 7/15/25 10:25, Alexei Starovoitov wrote:
[......]
> >
> > According to my benchmark, it has ~5% overhead to save/restore
> > *5* variants when compared with *0* variant. The save/restore of regs
> > is fast, but it still need 12 insn, which can produce ~6% overhead.
>
> I think it's an ok trade off, because with one global trampoline
> we do not need to call rhashtable lookup before entering bpf prog.
> bpf prog will do it on demand if/when it needs to access arguments.
> This will compensate for a bit of lost performance due to extra save/restore.
I don't understand here :/
The rhashtable lookup is done at the beginning of the global trampoline,
which runs before we enter the bpf prog. The bpf progs are stored in the
kfunc_md, and we need to get them from the hash table.
If this were the only change, it would still be OK. But according to my
previous benchmarks, the rhashtable can cause ~7% additional overhead. So if
we change both of them, the performance of tracing-multi ends up quite far
from tracing, which means a ~25% performance gap for functions that have
no arguments.
About the rhashtable part, I'll do more research on it and report back later.
>
> PS
> pls don't add your chinatelecom.cn email in cc.
> gmail just cannot deliver there and it's annoying to keep deleting
> it manually in every reply.
Sorry about that. I filtered out such messages in my gmail and
didn't notice it. I'll remove it from the CC in the future :)
Thanks!
Menglong Dong
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 16:35 ` Alexei Starovoitov
2025-07-16 13:05 ` Menglong Dong
@ 2025-07-16 14:40 ` Menglong Dong
1 sibling, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-16 14:40 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Menglong Dong, Steven Rostedt, Jiri Olsa, bpf, H. Peter Anvin,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On Wed, Jul 16, 2025 at 12:35 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> >
> > On 7/15/25 10:25, Alexei Starovoitov wrote:
[......]
> >
> > According to my benchmark, it has ~5% overhead to save/restore
> > *5* variants when compared with *0* variant. The save/restore of regs
> > is fast, but it still need 12 insn, which can produce ~6% overhead.
>
> I think it's an ok trade off, because with one global trampoline
> we do not need to call rhashtable lookup before entering bpf prog.
> bpf prog will do it on demand if/when it needs to access arguments.
> This will compensate for a bit of lost performance due to extra save/restore.
I just thought of another benefit of defining multiple global trampolines
here, which you may be interested in. In the future, we can make
the global trampoline support functions that have 7+ arguments.
If we use _one_ global trampoline, that's not possible, as we can't handle
the arguments on the stack. However, it is possible if we define a
different global trampoline for each argument count, and what we would
need to do in the future is some adjustment to CALLER_DEFINE().
I hope you are interested in this idea :)
Thanks!
Menglong Dong
>
> PS
> pls don't add your chinatelecom.cn email in cc.
> gmail just cannot deliver there and it's annoying to keep deleting
> it manually in every reply.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-15 9:30 ` Menglong Dong
@ 2025-07-16 16:56 ` Alexei Starovoitov
2025-07-16 18:24 ` Peter Zijlstra
0 siblings, 1 reply; 73+ messages in thread
From: Alexei Starovoitov @ 2025-07-16 16:56 UTC (permalink / raw)
To: Menglong Dong, Peter Zijlstra
Cc: Menglong Dong, Steven Rostedt, Jiri Olsa, bpf, Martin KaFai Lau,
Eduard Zingerman, LKML, Network Development
On Tue, Jul 15, 2025 at 2:31 AM Menglong Dong <menglong.dong@linux.dev> wrote:
>
> Following are the test results for fentry-multi:
> 36.36% bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi [k]
> bpf_prog_2dcccf652aac1793_bench_trigger_fentry_multi
> 20.54% [kernel] [k] migrate_enable
> 19.35% [kernel] [k] bpf_global_caller_5_run
> 6.52% [kernel] [k] bpf_global_caller_5
> 3.58% libc.so.6 [.] syscall
> 2.88% [kernel] [k] entry_SYSCALL_64
> 1.50% [kernel] [k] memchr_inv
> 1.39% [kernel] [k] fput
> 1.04% [kernel] [k] migrate_disable
> 0.91% [kernel] [k] _copy_to_user
>
> And I also did the testing for fentry:
> 54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k]
> bpf_prog_2dcccf652aac1793_bench_trigger_fentry
> 10.43% [kernel] [k] migrate_enable
> 10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
> 8.06% [kernel] [k] __bpf_prog_exit_recur
> 4.11% libc.so.6 [.] syscall
> 2.15% [kernel] [k] entry_SYSCALL_64
> 1.48% [kernel] [k] memchr_inv
> 1.32% [kernel] [k] fput
> 1.16% [kernel] [k] _copy_to_user
> 0.73% [kernel] [k] bpf_prog_test_run_raw_tp
Let's pause fentry-multi stuff and fix this as a higher priority.
Since migrate_disable/enable is so hot in both your tests and mine,
let's figure out how to inline it.
As far as I can see both functions can be moved to a header file
including this_rq() macro, but we need to keep
struct rq private to sched.h. Moving the whole thing is not an option.
Luckily we only need nr_pinned from there.
Maybe we can compute offsetof(struct rq, nr_pinned) in a precompile step,
the way it's done for asm-offsets?
And then use that constant to do the nr_pinned ++, --.
__set_cpus_allowed_ptr() is a slow path and can stay in .c.
Maybe Peter has better ideas ?
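A rough sketch of that precompile step, with made-up file placement and
symbol names; how the consumer gets the per-cpu rq pointer without seeing
struct rq is exactly the open question here:

/* --- offsets file (sketch): generated like asm-offsets, somewhere that
 * can see the private struct rq --- */
#include <linux/kbuild.h>
#include "sched.h"	/* kernel/sched/sched.h */

static void __used sched_offsets(void)
{
	OFFSET(RQ_nr_pinned, rq, nr_pinned);
}

/* --- consumer side (sketch), e.g. a header that cannot include sched.h --- */
#include <generated/asm-offsets.h>

static inline void rq_inc_nr_pinned(void *rq)
{
	(*(unsigned int *)(rq + RQ_nr_pinned))++;
}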
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 16:56 ` Inlining migrate_disable/enable. Was: " Alexei Starovoitov
@ 2025-07-16 18:24 ` Peter Zijlstra
2025-07-16 22:35 ` Alexei Starovoitov
0 siblings, 1 reply; 73+ messages in thread
From: Peter Zijlstra @ 2025-07-16 18:24 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Menglong Dong, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> Maybe Peter has better ideas ?
Is it possible to express runqueues::nr_pinned as an alias?
extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
And use:
__this_cpu_inc(&this_nr_pinned);
This syntax doesn't actually seem to work; but can we construct
something like that?
Google finds me this:
https://gcc.gnu.org/pipermail/gcc-help/2012-February/109877.html
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 18:24 ` Peter Zijlstra
@ 2025-07-16 22:35 ` Alexei Starovoitov
2025-07-16 22:49 ` Steven Rostedt
2025-07-28 9:20 ` Menglong Dong
0 siblings, 2 replies; 73+ messages in thread
From: Alexei Starovoitov @ 2025-07-16 22:35 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Menglong Dong, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
>
> > Maybe Peter has better ideas ?
>
> Is it possible to express runqueues::nr_pinned as an alias?
>
> extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
>
> And use:
>
> __this_cpu_inc(&this_nr_pinned);
>
>
> This syntax doesn't actually seem to work; but can we construct
> something like that?
Yeah. Iant is right. It's a string and not a pointer dereference.
It never worked.
Few options:
1.
struct rq {
+#ifdef CONFIG_SMP
+ unsigned int nr_pinned;
+#endif
/* runqueue lock: */
raw_spinlock_t __lock;
@@ -1271,9 +1274,6 @@ struct rq {
struct cpuidle_state *idle_state;
#endif
-#ifdef CONFIG_SMP
- unsigned int nr_pinned;
-#endif
but ugly...
2.
static unsigned int nr_pinned_offset __ro_after_init __used;
RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
overkill for what's needed
3.
OFFSET(RQ_nr_pinned, rq, nr_pinned);
then
#include <generated/asm-offsets.h>
imo the best.
4.
Maybe we should extend clang/gcc to support attr(preserve_access_index)
on x86 and other architectures ;)
We rely heavily on it in bpf backend.
Then one can simply write:
struct rq___my {
unsigned int nr_pinned;
} __attribute__((preserve_access_index));
struct rq___my *rq;
rq = this_rq();
rq->nr_pinned++;
and the compiler will do its magic of offset adjustment.
That's how BPF CORE works.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 22:35 ` Alexei Starovoitov
@ 2025-07-16 22:49 ` Steven Rostedt
2025-07-16 22:50 ` Steven Rostedt
2025-07-28 9:20 ` Menglong Dong
1 sibling, 1 reply; 73+ messages in thread
From: Steven Rostedt @ 2025-07-16 22:49 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Menglong Dong, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development,
Jose E. Marchesi
On Wed, 16 Jul 2025 15:35:16 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 4.
> Maybe we should extend clang/gcc to support attr(preserve_access_index)
> on x86 and other architectures ;)
> We rely heavily on it in bpf backend.
> Then one can simply write:
>
> struct rq___my {
> unsigned int nr_pinned;
> } __attribute__((preserve_access_index));
>
> struct rq___my *rq;
>
> rq = this_rq();
> rq->nr_pinned++;
>
> and the compiler will do its magic of offset adjustment.
> That's how BPF CORE works.
GNU Cauldron in Porto, Portugal is having a kernel track (hopefully if it
gets accepted). I highly recommend you attend and recommend these
features. It's happening two days after Kernel Recipes (I already booked my
plane tickets).
https://gcc.gnu.org/wiki/cauldron2025
Peter, maybe you can attend too?
-- Steve
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 22:49 ` Steven Rostedt
@ 2025-07-16 22:50 ` Steven Rostedt
0 siblings, 0 replies; 73+ messages in thread
From: Steven Rostedt @ 2025-07-16 22:50 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Menglong Dong, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development,
Jose E. Marchesi
On Wed, 16 Jul 2025 18:49:40 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> GNU Cauldron in Porto, Portugal is having a kernel track (hopefully if it
> gets accepted). I highly recommend you attending and recommending these
> features. It's happening two days after Kernel Recipes (I already booked my
> plane tickets).
>
Bah, I forgot you are on the abstract so you already know about this! ;-)
[ But I might as well advertise to let other kernel devs know ]
-- Steve
^ permalink raw reply [flat|nested] 73+ messages in thread
* multi-fentry proposal. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 13:05 ` Menglong Dong
@ 2025-07-17 0:59 ` Alexei Starovoitov
2025-07-17 1:50 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Alexei Starovoitov @ 2025-07-17 0:59 UTC (permalink / raw)
To: Menglong Dong, Jiri Olsa, Andrii Nakryiko
Cc: Menglong Dong, Steven Rostedt, bpf, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, LKML, Network Development
On Wed, Jul 16, 2025 at 6:06 AM Menglong Dong <menglong.dong@linux.dev> wrote:
>
> On Wednesday, July 16, 2025 12:35 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > >
> > >
> > > On 7/15/25 10:25, Alexei Starovoitov wrote:
> [......]
> > >
> > > According to my benchmark, it has ~5% overhead to save/restore
> > > *5* variants when compared with *0* variant. The save/restore of regs
> > > is fast, but it still need 12 insn, which can produce ~6% overhead.
> >
> > I think it's an ok trade off, because with one global trampoline
> > we do not need to call rhashtable lookup before entering bpf prog.
> > bpf prog will do it on demand if/when it needs to access arguments.
> > This will compensate for a bit of lost performance due to extra save/restore.
>
> I don't understand here :/
>
> The rhashtable lookup is done at the beginning of the global trampoline,
> which is called before we enter bpf prog. The bpf progs is stored in the
> kfunc_md, and we need get them from the hash table.
Ahh. Right.
Looking at the existing bpf trampoline... It has complicated logic
to handle livepatching and tailcalls. Your global trampoline
doesn't, and once that is added it's starting to feel that it will
look just as complex as the current one.
So I think we better repurpose what we have.
Maybe we can rewrite the existing one in C too.
How about the following approach.
I think we discussed something like this in the past
and Jiri tried to implement something like this.
Andrii reminded me recently about it.
Say, we need to attach prog A to 30k functions.
10k with 2 args, 10k with 3 args, and 10k with 7 args.
We can generate 3 _existing_ bpf trampolines for 2,3,7 args
with hard coded prog A in there (the cookies would need to be
fetched via binary search similar to kprobe-multi).
The arch_prepare_bpf_trampoline() supports BPF_TRAMP_F_ORIG_STACK.
So one 2-arg trampoline will work to invoke prog A in all 10k 2-arg functions.
We don't need to match types, but have to compare that btf_func_model-s
are the same.
Menglong, your global trampoline for 0,1,..6 args works only for x86,
because btf_func_model doesn't care about sizes of args,
but it's not the correct mental model to use.
The above "10k with 2 args" is a simplified example.
We will need an arch specific callback is_btf_func_model_equal()
that will compare func models in arch specific ways.
For x86-64 the number of args is all it needs.
For other archs it will compare sizes and flags too.
So 30k functions will be sorted into
10k with btf_func_model_1, 10k with btf_func_model_2 and so on.
And the corresponding number of equivalent trampolines will be generated.
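A sketch of what that comparator could look like; the name comes from
this mail, and the bodies are assumptions that only encode the grouping
rule described above:

#include <linux/bpf.h>
#include <linux/string.h>

static bool is_btf_func_model_equal(const struct btf_func_model *a,
				    const struct btf_func_model *b)
{
	if (a->nr_args != b->nr_args)
		return false;
#ifdef CONFIG_X86_64
	/* the number of args is all x86-64 needs */
	return true;
#else
	/* other archs also compare per-arg sizes and flags */
	return a->ret_size == b->ret_size &&
	       a->ret_flags == b->ret_flags &&
	       !memcmp(a->arg_size, b->arg_size, a->nr_args) &&
	       !memcmp(a->arg_flags, b->arg_flags, a->nr_args);
#endif
}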
Note there will be no actual BTF types. All args will be untyped and
untrusted unlike current fentry.
We can go further and sort 30k functions by comparing BTFs
instead of btf_func_model-s, but I suspect 30k funcs will be split
into several thousands of exact BTFs. At that point multi-fentry
benefits are diminishing and we might as well generate 30k unique
bpf trampolines for 30k functions and avoid all the complexity.
So I would sort by btf_func_model compared by arch specific comparator.
Now say prog B needs to be attached to another 30k functions.
If all 30k+30k functions are different then it's the same as
the previous step.
Say, prog A is attached to 10k funcs with btf_func_model_1.
If prog B wants to attach to the exact same func set then we
just regenerate bpf trampoline with hard coded progs A and B
and reattach.
If not then we need to split the set into up to 3 sets.
Say, prog B wants 5k funcs, but only 1k func are common:
(prog_A, 9k func with btf_func_model_1) -> bpf trampoline X
(prog_A, prog_B, 1k funcs with btf_func_model_1) -> bpf trampoline Y
(prog_B, 4k funcs with btf_func_model_1) -> bpf trampoline Z
And so on when prog C needs to be attached.
At detach time we can merge sets/trampolines,
but for now we can leave it all fragmented.
Unlike regular fentry progs the multi-fentry progs are not going to
be attached for a long time. So we can reduce the detach complexity.
The nice part of the algorithm is that coexistence of fentry
and multi-fentry is easy.
If fentry is already attached to some function we just
attach multi-fentry prog to that bpf trampoline.
If multi-fentry was attached first and fentry needs to be attached,
we create a regular bpf trampoline and add both progs there.
The intersect and sorting by btf_func_model is not trivial,
but we can hold global trampoline_mutex, so no concerns of races.
Example:
bpf_link_A is a set of:
(prog_A, funcs X,Y with btf_func_model_1)
(prog_A, funcs N,M with btf_func_model_2)
To attach prog B via bpf_link_B that wants:
(prog_B, funcs Y,Z with btf_func_model_1)
(prog_B, funcs P,Q with btf_func_model_3)
walk all existing links, intersect and split, and update the links.
At the end:
bpf_link_A:
(prog_A, funcs X with btf_func_model_1)
(prog_A, prog_B funcs Y with btf_func_model_1)
(prog_A, funcs N,M with btf_func_model_2)
bpf_link_B:
(prog_A, prog_B funcs Y with btf_func_model_1)
(prog_B, funcs Z with btf_func_model_1)
(prog_B, funcs P,Q with btf_func_model_3)
When link is detached: walk its own tuples, remove the prog,
if nr_progs == 0 -> detach corresponding trampoline,
if nr_progs > 0 -> remove prog and regenerate trampoline.
If fentry prog C needs to be attached to N it might split bpf_link_A:
(prog_A, funcs X with btf_func_model_1)
(prog_A, prog_B funcs Y with btf_func_model_1)
(prog_A, funcs M with btf_func_model_2)
(prog_A, prog_C funcs N with _fentry_)
Last time we gave up on it because we discovered that
overlap support was too complicated, but I cannot recall now
what it was :)
Maybe all of the above is repeating some old mistakes.
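To make the bookkeeping above a bit more concrete, the (progs, funcs,
btf_func_model) tuples could be represented roughly like this; all names
are made up for illustration:

#include <linux/bpf.h>
#include <linux/list.h>

struct multi_tramp_set {
	struct btf_func_model model;	/* grouped via the arch comparator */
	struct bpf_prog **progs;	/* progs hard-coded into the image */
	u32 nr_progs;
	unsigned long *funcs;		/* kernel function addresses */
	u32 nr_funcs;
	void *image;			/* regenerated when progs change */
	struct list_head node;		/* entry on the owning link's list */
};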
Jiri,
How does the above proposal look to you?
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: multi-fentry proposal. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-17 0:59 ` multi-fentry proposal. Was: " Alexei Starovoitov
@ 2025-07-17 1:50 ` Menglong Dong
2025-07-17 2:13 ` Alexei Starovoitov
0 siblings, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-17 1:50 UTC (permalink / raw)
To: Jiri Olsa, Andrii Nakryiko, Alexei Starovoitov
Cc: Menglong Dong, Steven Rostedt, bpf, Martin KaFai Lau,
Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
KP Singh, Stanislav Fomichev, Hao Luo, LKML, Network Development
On Thursday, July 17, 2025 8:59 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> On Wed, Jul 16, 2025 at 6:06 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> > On Wednesday, July 16, 2025 12:35 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > > On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > > >
> > > >
> > > > On 7/15/25 10:25, Alexei Starovoitov wrote:
> > [......]
> > > >
> > > > According to my benchmark, it has ~5% overhead to save/restore
> > > > *5* variants when compared with *0* variant. The save/restore of regs
> > > > is fast, but it still need 12 insn, which can produce ~6% overhead.
> > >
> > > I think it's an ok trade off, because with one global trampoline
> > > we do not need to call rhashtable lookup before entering bpf prog.
> > > bpf prog will do it on demand if/when it needs to access arguments.
> > > This will compensate for a bit of lost performance due to extra save/restore.
> >
> > I don't understand here :/
> >
> > The rhashtable lookup is done at the beginning of the global trampoline,
> > which is called before we enter bpf prog. The bpf progs is stored in the
> > kfunc_md, and we need get them from the hash table.
>
> Ahh. Right.
>
> Looking at the existing bpf trampoline... It has complicated logic
> to handle livepatching and tailcalls. Your global trampoline
> doesn't, and once that is added it's starting to feel that it will
> look just as complex as the current one.
> So I think we better repurpose what we have.
> Maybe we can rewrite the existing one in C too.
You are right, tailcalls are not handled yet. But livepatching
is already handled, as we always get the original ip from the stack
and call it, just like how the bpf trampoline handles livepatching.
So no additional handling is needed here.
>
> How about the following approach.
> I think we discussed something like this in the past
> and Jiri tried to implement something like this.
> Andrii reminded me recently about it.
>
> Say, we need to attach prog A to 30k functions.
> 10k with 2 args, 10k with 3 args, and 10k with 7 args.
> We can generate 3 _existing_ bpf trampolines for 2,3,7 args
> with hard coded prog A in there (the cookies would need to be
> fetched via binary search similar to kprobe-multi).
> The arch_prepare_bpf_trampoline() supports BPF_TRAMP_F_ORIG_STACK.
> So one 2-arg trampoline will work to invoke prog A in all 10k 2-arg functions.
> We don't need to match types, but have to compare that btf_func_model-s
> are the same.
>
> Menglong, your global trampoline for 0,1,..6 args works only for x86,
> because btf_func_model doesn't care about sizes of args,
> but it's not the correct mental model to use.
>
> The above "10k with 2 args" is a simplified example.
> We will need an arch specific callback is_btf_func_model_equal()
> that will compare func models in arch specific ways.
> For x86-64 the number of args is all it needs.
> For other archs it will compare sizes and flags too.
> So 30k functions will be sorted into
> 10k with btf_func_model_1, 10k with btf_func_model_2 and so on.
> And the corresponding number of equivalent trampolines will be generated.
>
> Note there will be no actual BTF types. All args will be untyped and
> untrusted unlike current fentry.
> We can go further and sort 30k functions by comparing BTFs
> instead of btf_func_model-s, but I suspect 30k funcs will be split
> into several thousands of exact BTFs. At that point multi-fentry
> benefits are diminishing and we might as well generate 30k unique
> bpf trampolines for 30k functions and avoid all the complexity.
> So I would sort by btf_func_model compared by arch specific comparator.
>
> Now say prog B needs to be attached to another 30k functions.
> If all 30k+30k functions are different then it's the same as
> the previous step.
> Say, prog A is attached to 10k funcs with btf_func_model_1.
> If prog B wants to attach to the exact same func set then we
> just regenerate bpf trampoline with hard coded progs A and B
> and reattach.
> If not then we need to split the set into up to 3 sets.
> Say, prog B wants 5k funcs, but only 1k func are common:
> (prog_A, 9k func with btf_func_model_1) -> bpf trampoline X
> (prog_A, prog_B, 1k funcs with btf_func_model_1) -> bpf trampoline Y
> (prog_B, 4k funcs with btf_func_model_1) -> bpf trampoline Z
>
> And so on when prog C needs to be attached.
> At detach time we can merge sets/trampolines,
> but for now we can leave it all fragmented.
> Unlike regular fentry progs the multi-fentry progs are not going to
> be attached for long time. So we can reduce the detach complexity.
>
> The nice part of the algorithm is that coexistence of fentry
> and multi-fentry is easy.
> If fentry is already attached to some function we just
> attach multi-fentry prog to that bpf trampoline.
> If multi-fentry was attached first and fentry needs to be attached,
> we create a regular bpf trampoline and add both progs there.
This is not so easy, and it is exactly how I handle the
coexistence now:
https://lore.kernel.org/bpf/20250528034712.138701-16-dongml2@chinatelecom.cn/
https://lore.kernel.org/bpf/20250528034712.138701-17-dongml2@chinatelecom.cn/
https://lore.kernel.org/bpf/20250528034712.138701-18-dongml2@chinatelecom.cn/
The most difficult part is that we need a way to atomically replace
multi-fentry with fentry for a function in ftrace. Of
course, we can remove the global trampoline first and then attach
the bpf trampoline, which would make things much easier, but that
would cause a short suspension of the progs in fentry-multi.
>
> The intersect and sorting by btf_func_model is not trivial,
> but we can hold global trampoline_mutex, so no concerns of races.
>
> Example:
> bpf_link_A is a set of:
> (prog_A, funcs X,Y with btf_func_model_1)
> (prog_A, funcs N,M with btf_func_model_2)
>
> To attach prog B via bpf_link_B that wants:
> (prog_B, funcs Y,Z with btf_func_model_1)
> (prog_B, funcs P,Q with btf_func_model_3)
>
> walk all existing links, intersect and split, and update the links.
> At the end:
>
> bpf_link_A:
> (prog_A, funcs X with btf_func_model_1)
> (prog_A, prog_B funcs Y with btf_func_model_1)
> (prog_A, funcs N,M with btf_func_model_2)
>
> bpf_link_B:
> (prog_A, prog_B funcs Y with btf_func_model_1)
> (prog_B, funcs Z with btf_func_model_1)
> (prog_B, funcs P,Q with btf_func_model_3)
>
> When link is detached: walk its own tuples, remove the prog,
> if nr_progs == 0 -> detach corresponding trampoline,
> if nr_progs > 0 -> remove prog and regenerate trampoline.
>
> If fentry prog C needs to be attached to N it might split bpf_link_A:
> (prog_A, funcs X with btf_func_model_1)
> (prog_A, prog_B funcs Y with btf_func_model_1)
> (prog_A, funcs M with btf_func_model_2)
> (prog_A, prog_C funcs N with _fentry_)
>
> Last time we gave up on it because we discovered that
> overlap support was too complicated, but I cannot recall now
> what it was :)
> Maybe all of the above repeating some old mistakes.
As I recall, this is exactly Jiri's solution, and this is the
part of the discussion that I know of:
https://lore.kernel.org/bpf/ZfKY6E8xhSgzYL1I@krava/
>
> Jiri,
> How does the above proposal look to you?
>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: multi-fentry proposal. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-17 1:50 ` Menglong Dong
@ 2025-07-17 2:13 ` Alexei Starovoitov
2025-07-17 2:37 ` Menglong Dong
0 siblings, 1 reply; 73+ messages in thread
From: Alexei Starovoitov @ 2025-07-17 2:13 UTC (permalink / raw)
To: Menglong Dong
Cc: Jiri Olsa, Andrii Nakryiko, Menglong Dong, Steven Rostedt, bpf,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On Wed, Jul 16, 2025 at 6:51 PM Menglong Dong <menglong.dong@linux.dev> wrote:
>
> On Thursday, July 17, 2025 8:59 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > On Wed, Jul 16, 2025 at 6:06 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > >
> > > On Wednesday, July 16, 2025 12:35 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > > > On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > > > >
> > > > >
> > > > > On 7/15/25 10:25, Alexei Starovoitov wrote:
> > > [......]
> > > > >
> > > > > According to my benchmark, it has ~5% overhead to save/restore
> > > > > *5* variants when compared with *0* variant. The save/restore of regs
> > > > > is fast, but it still need 12 insn, which can produce ~6% overhead.
> > > >
> > > > I think it's an ok trade off, because with one global trampoline
> > > > we do not need to call rhashtable lookup before entering bpf prog.
> > > > bpf prog will do it on demand if/when it needs to access arguments.
> > > > This will compensate for a bit of lost performance due to extra save/restore.
> > >
> > > I don't understand here :/
> > >
> > > The rhashtable lookup is done at the beginning of the global trampoline,
> > > which is called before we enter bpf prog. The bpf progs is stored in the
> > > kfunc_md, and we need get them from the hash table.
> >
> > Ahh. Right.
> >
> > Looking at the existing bpf trampoline... It has complicated logic
> > to handle livepatching and tailcalls. Your global trampoline
> > doesn't, and once that is added it's starting to feel that it will
> > look just as complex as the current one.
> > So I think we better repurpose what we have.
> > Maybe we can rewrite the existing one in C too.
>
> You are right, the tailcalls is not handled yet. But for the livepatching,
> it is already handled, as we always get the origin ip from the stack
> and call it, just like how the bpf trampoline handle the livepatching.
> So no addition handling is needed here.
>
> >
> > How about the following approach.
> > I think we discussed something like this in the past
> > and Jiri tried to implement something like this.
> > Andrii reminded me recently about it.
> >
> > Say, we need to attach prog A to 30k functions.
> > 10k with 2 args, 10k with 3 args, and 10k with 7 args.
> > We can generate 3 _existing_ bpf trampolines for 2,3,7 args
> > with hard coded prog A in there (the cookies would need to be
> > fetched via binary search similar to kprobe-multi).
> > The arch_prepare_bpf_trampoline() supports BPF_TRAMP_F_ORIG_STACK.
> > So one 2-arg trampoline will work to invoke prog A in all 10k 2-arg functions.
> > We don't need to match types, but have to compare that btf_func_model-s
> > are the same.
> >
> > Menglong, your global trampoline for 0,1,..6 args works only for x86,
> > because btf_func_model doesn't care about sizes of args,
> > but it's not the correct mental model to use.
> >
> > The above "10k with 2 args" is a simplified example.
> > We will need an arch specific callback is_btf_func_model_equal()
> > that will compare func models in arch specific ways.
> > For x86-64 the number of args is all it needs.
> > For other archs it will compare sizes and flags too.
> > So 30k functions will be sorted into
> > 10k with btf_func_model_1, 10k with btf_func_model_2 and so on.
> > And the corresponding number of equivalent trampolines will be generated.
> >
> > Note there will be no actual BTF types. All args will be untyped and
> > untrusted unlike current fentry.
> > We can go further and sort 30k functions by comparing BTFs
> > instead of btf_func_model-s, but I suspect 30k funcs will be split
> > into several thousands of exact BTFs. At that point multi-fentry
> > benefits are diminishing and we might as well generate 30k unique
> > bpf trampolines for 30k functions and avoid all the complexity.
> > So I would sort by btf_func_model compared by arch specific comparator.
> >
> > Now say prog B needs to be attached to another 30k functions.
> > If all 30k+30k functions are different then it's the same as
> > the previous step.
> > Say, prog A is attached to 10k funcs with btf_func_model_1.
> > If prog B wants to attach to the exact same func set then we
> > just regenerate bpf trampoline with hard coded progs A and B
> > and reattach.
> > If not then we need to split the set into up to 3 sets.
> > Say, prog B wants 5k funcs, but only 1k func are common:
> > (prog_A, 9k func with btf_func_model_1) -> bpf trampoline X
> > (prog_A, prog_B, 1k funcs with btf_func_model_1) -> bpf trampoline Y
> > (prog_B, 4k funcs with btf_func_model_1) -> bpf trampoline Z
> >
> > And so on when prog C needs to be attached.
> > At detach time we can merge sets/trampolines,
> > but for now we can leave it all fragmented.
> > Unlike regular fentry progs the multi-fentry progs are not going to
> > be attached for long time. So we can reduce the detach complexity.
> >
> > The nice part of the algorithm is that coexistence of fentry
> > and multi-fentry is easy.
> > If fentry is already attached to some function we just
> > attach multi-fentry prog to that bpf trampoline.
> > If multi-fentry was attached first and fentry needs to be attached,
> > we create a regular bpf trampoline and add both progs there.
>
> This seems not easy, and it is exactly how I handle the
> coexistence now:
>
> https://lore.kernel.org/bpf/20250528034712.138701-16-dongml2@chinatelecom.cn/
> https://lore.kernel.org/bpf/20250528034712.138701-17-dongml2@chinatelecom.cn/
> https://lore.kernel.org/bpf/20250528034712.138701-18-dongml2@chinatelecom.cn/
hmm. exactly? That's very different.
You're relying on kfunc_md for prog list.
The above proposal doesn't need kfunc_md in the critical path.
All progs are built into the trampolines.
> The most difficult part is that we need a way to replace the the
> multi-fentry with fentry for the function in the ftrace atomically. Of
> course, we can remove the global trampoline first, and then attach
> the bpf trampoline, which will make things much easier. But a
> short suspend will happen for the progs in fentry-multi.
I don't follow.
In the above proposal fentry attach/detach is atomic.
Prepare a new trampoline, single call to ftrace to modify_fentry().
> >
> > The intersect and sorting by btf_func_model is not trivial,
> > but we can hold global trampoline_mutex, so no concerns of races.
> >
> > Example:
> > bpf_link_A is a set of:
> > (prog_A, funcs X,Y with btf_func_model_1)
> > (prog_A, funcs N,M with btf_func_model_2)
> >
> > To attach prog B via bpf_link_B that wants:
> > (prog_B, funcs Y,Z with btf_func_model_1)
> > (prog_B, funcs P,Q with btf_func_model_3)
> >
> > walk all existing links, intersect and split, and update the links.
> > At the end:
> >
> > bpf_link_A:
> > (prog_A, funcs X with btf_func_model_1)
> > (prog_A, prog_B funcs Y with btf_func_model_1)
> > (prog_A, funcs N,M with btf_func_model_2)
> >
> > bpf_link_B:
> > (prog_A, prog_B funcs Y with btf_func_model_1)
> > (prog_B, funcs Z with btf_func_model_1)
> > (prog_B, funcs P,Q with btf_func_model_3)
> >
> > When link is detached: walk its own tuples, remove the prog,
> > if nr_progs == 0 -> detach corresponding trampoline,
> > if nr_progs > 0 -> remove prog and regenerate trampoline.
> >
> > If fentry prog C needs to be attached to N it might split bpf_link_A:
> > (prog_A, funcs X with btf_func_model_1)
> > (prog_A, prog_B funcs Y with btf_func_model_1)
> > (prog_A, funcs M with btf_func_model_2)
> > (prog_A, prog_C funcs N with _fentry_)
> >
> > Last time we gave up on it because we discovered that
> > overlap support was too complicated, but I cannot recall now
> > what it was :)
> > Maybe all of the above repeating some old mistakes.
>
> In my impression, this is exactly the solution of Jiri's, and this is
> part of the discussion that I know:
>
> https://lore.kernel.org/bpf/ZfKY6E8xhSgzYL1I@krava/
Yes. It's similar, but somehow it feels simple enough now.
The algorithms for both detach and attach fit on one page,
and everything is uniform. There is no spaghetti of corner cases.
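As a one-page illustration of the attach-time part, here is a
standalone C sketch (sets modeled as bitmasks; all names are
illustrative, not the proposed kernel data structures): an existing
(progs, funcs) tuple is split against a new prog's func set with the
same btf_func_model into up to three tuples, each of which would get
its own generated trampoline, as in the example above:

#include <stdio.h>

typedef unsigned long set_t;	/* bit i set => function i is in the set */

struct tuple {
	set_t progs;		/* bit j set => prog j attached */
	set_t funcs;
};

static void print_tuple(const char *tag, struct tuple t)
{
	printf("%s: progs=%#lx funcs=%#lx\n", tag, t.progs, t.funcs);
}

/*
 * Split @old against a new prog (bit @new_prog) that wants @new_funcs.
 * Produces up to three tuples: old-only, common, new-only.
 */
static void split(struct tuple old, int new_prog, set_t new_funcs)
{
	set_t common = old.funcs & new_funcs;
	struct tuple old_only = { old.progs, old.funcs & ~new_funcs };
	struct tuple both = { old.progs | (1UL << new_prog), common };
	struct tuple new_only = { 1UL << new_prog, new_funcs & ~old.funcs };

	if (old_only.funcs)
		print_tuple("trampoline X", old_only);
	if (both.funcs)
		print_tuple("trampoline Y", both);
	if (new_only.funcs)
		print_tuple("trampoline Z", new_only);
}

int main(void)
{
	/* prog_A (bit 0) on funcs {0..4}; prog_B (bit 1) wants funcs {4..6} */
	struct tuple link_a = { 1UL << 0, 0x1f };

	split(link_a, 1, 0x70);
	return 0;
}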
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: multi-fentry proposal. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-17 2:13 ` Alexei Starovoitov
@ 2025-07-17 2:37 ` Menglong Dong
0 siblings, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-07-17 2:37 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Jiri Olsa, Andrii Nakryiko, Menglong Dong, Steven Rostedt, bpf,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, LKML,
Network Development
On Thursday, July 17, 2025 10:13 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> On Wed, Jul 16, 2025 at 6:51 PM Menglong Dong <menglong.dong@linux.dev> wrote:
> >
> > On Thursday, July 17, 2025 8:59 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > > On Wed, Jul 16, 2025 at 6:06 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > > >
> > > > On Wednesday, July 16, 2025 12:35 AM Alexei Starovoitov <alexei.starovoitov@gmail.com> write:
> > > > > On Tue, Jul 15, 2025 at 1:37 AM Menglong Dong <menglong.dong@linux.dev> wrote:
> > > > > >
> > > > > >
> > > > > > On 7/15/25 10:25, Alexei Starovoitov wrote:
> > > > [......]
> > > > > >
> > > > > > According to my benchmark, it has ~5% overhead to save/restore
> > > > > > *5* variants when compared with *0* variant. The save/restore of regs
> > > > > > is fast, but it still need 12 insn, which can produce ~6% overhead.
> > > > >
> > > > > I think it's an ok trade off, because with one global trampoline
> > > > > we do not need to call rhashtable lookup before entering bpf prog.
> > > > > bpf prog will do it on demand if/when it needs to access arguments.
> > > > > This will compensate for a bit of lost performance due to extra save/restore.
> > > >
> > > > I don't understand here :/
> > > >
> > > > The rhashtable lookup is done at the beginning of the global trampoline,
> > > > which is called before we enter bpf prog. The bpf progs is stored in the
> > > > kfunc_md, and we need get them from the hash table.
> > >
> > > Ahh. Right.
> > >
> > > Looking at the existing bpf trampoline... It has complicated logic
> > > to handle livepatching and tailcalls. Your global trampoline
> > > doesn't, and once that is added it's starting to feel that it will
> > > look just as complex as the current one.
> > > So I think we better repurpose what we have.
> > > Maybe we can rewrite the existing one in C too.
> >
> > You are right, the tailcalls is not handled yet. But for the livepatching,
> > it is already handled, as we always get the origin ip from the stack
> > and call it, just like how the bpf trampoline handle the livepatching.
> > So no addition handling is needed here.
> >
> > >
> > > How about the following approach.
> > > I think we discussed something like this in the past
> > > and Jiri tried to implement something like this.
> > > Andrii reminded me recently about it.
> > >
> > > Say, we need to attach prog A to 30k functions.
> > > 10k with 2 args, 10k with 3 args, and 10k with 7 args.
> > > We can generate 3 _existing_ bpf trampolines for 2,3,7 args
> > > with hard coded prog A in there (the cookies would need to be
> > > fetched via binary search similar to kprobe-multi).
> > > The arch_prepare_bpf_trampoline() supports BPF_TRAMP_F_ORIG_STACK.
> > > So one 2-arg trampoline will work to invoke prog A in all 10k 2-arg functions.
> > > We don't need to match types, but have to compare that btf_func_model-s
> > > are the same.
> > >
> > > Menglong, your global trampoline for 0,1,..6 args works only for x86,
> > > because btf_func_model doesn't care about sizes of args,
> > > but it's not the correct mental model to use.
> > >
> > > The above "10k with 2 args" is a simplified example.
> > > We will need an arch specific callback is_btf_func_model_equal()
> > > that will compare func models in arch specific ways.
> > > For x86-64 the number of args is all it needs.
> > > For other archs it will compare sizes and flags too.
> > > So 30k functions will be sorted into
> > > 10k with btf_func_model_1, 10k with btf_func_model_2 and so on.
> > > And the corresponding number of equivalent trampolines will be generated.
> > >
> > > Note there will be no actual BTF types. All args will be untyped and
> > > untrusted unlike current fentry.
> > > We can go further and sort 30k functions by comparing BTFs
> > > instead of btf_func_model-s, but I suspect 30k funcs will be split
> > > into several thousands of exact BTFs. At that point multi-fentry
> > > benefits are diminishing and we might as well generate 30k unique
> > > bpf trampolines for 30k functions and avoid all the complexity.
> > > So I would sort by btf_func_model compared by arch specific comparator.
> > >
> > > Now say prog B needs to be attached to another 30k functions.
> > > If all 30k+30k functions are different then it's the same as
> > > the previous step.
> > > Say, prog A is attached to 10k funcs with btf_func_model_1.
> > > If prog B wants to attach to the exact same func set then we
> > > just regenerate bpf trampoline with hard coded progs A and B
> > > and reattach.
> > > If not then we need to split the set into up to 3 sets.
> > > Say, prog B wants 5k funcs, but only 1k func are common:
> > > (prog_A, 9k func with btf_func_model_1) -> bpf trampoline X
> > > (prog_A, prog_B, 1k funcs with btf_func_model_1) -> bpf trampoline Y
> > > (prog_B, 4k funcs with btf_func_model_1) -> bpf trampoline Z
> > >
> > > And so on when prog C needs to be attached.
> > > At detach time we can merge sets/trampolines,
> > > but for now we can leave it all fragmented.
> > > Unlike regular fentry progs the multi-fentry progs are not going to
> > > be attached for long time. So we can reduce the detach complexity.
> > >
> > > The nice part of the algorithm is that coexistence of fentry
> > > and multi-fentry is easy.
> > > If fentry is already attached to some function we just
> > > attach multi-fentry prog to that bpf trampoline.
> > > If multi-fentry was attached first and fentry needs to be attached,
> > > we create a regular bpf trampoline and add both progs there.
> >
> > This seems not easy, and it is exactly how I handle the
> > coexistence now:
> >
> > https://lore.kernel.org/bpf/20250528034712.138701-16-dongml2@chinatelecom.cn/
> > https://lore.kernel.org/bpf/20250528034712.138701-17-dongml2@chinatelecom.cn/
> > https://lore.kernel.org/bpf/20250528034712.138701-18-dongml2@chinatelecom.cn/
>
> hmm. exactly? That's very different.
> You're relying on kfunc_md for prog list.
> The above proposal doesn't need kfunc_md in the critical path.
> All progs are built into the trampolines.
>
> > The most difficult part is that we need a way to replace the the
> > multi-fentry with fentry for the function in the ftrace atomically. Of
> > course, we can remove the global trampoline first, and then attach
> > the bpf trampoline, which will make things much easier. But a
> > short suspend will happen for the progs in fentry-multi.
>
> I don't follow.
> In the above proposal fentry attach/detach is atomic.
> Prepare a new trampoline, single call to ftrace to modify_fentry().
modify_fentry() operates on the same ftrace_ops. For example, we
have the bpf trampoline A, and its corresponding ftrace_ops is
opsA. When the image of trampoline A is updated, we call
modify_fentry() on opsA to update its direct call.
When we talk about the coexistence, it means that function A is
attached to the global trampoline B, whose ftrace_ops is opsB.
We can't call modify_fentry(trampolineA, new_addr) here, as opsA
is not registered yet. And we can't call register_fentry() either,
as function A is already in direct_functions from when we
registered opsB.
So we need a way to do such a transition.
>
> > >
> > > The intersect and sorting by btf_func_model is not trivial,
> > > but we can hold global trampoline_mutex, so no concerns of races.
> > >
> > > Example:
> > > bpf_link_A is a set of:
> > > (prog_A, funcs X,Y with btf_func_model_1)
> > > (prog_A, funcs N,M with btf_func_model_2)
> > >
> > > To attach prog B via bpf_link_B that wants:
> > > (prog_B, funcs Y,Z with btf_func_model_1)
> > > (prog_B, funcs P,Q with btf_func_model_3)
> > >
> > > walk all existing links, intersect and split, and update the links.
> > > At the end:
> > >
> > > bpf_link_A:
> > > (prog_A, funcs X with btf_func_model_1)
> > > (prog_A, prog_B funcs Y with btf_func_model_1)
> > > (prog_A, funcs N,M with btf_func_model_2)
> > >
> > > bpf_link_B:
> > > (prog_A, prog_B funcs Y with btf_func_model_1)
> > > (prog_B, funcs Z with btf_func_model_1)
> > > (prog_B, funcs P,Q with btf_func_model_3)
> > >
> > > When link is detached: walk its own tuples, remove the prog,
> > > if nr_progs == 0 -> detach corresponding trampoline,
> > > if nr_progs > 0 -> remove prog and regenerate trampoline.
> > >
> > > If fentry prog C needs to be attached to N it might split bpf_link_A:
> > > (prog_A, funcs X with btf_func_model_1)
> > > (prog_A, prog_B funcs Y with btf_func_model_1)
> > > (prog_A, funcs M with btf_func_model_2)
> > > (prog_A, prog_C funcs N with _fentry_)
> > >
> > > Last time we gave up on it because we discovered that
> > > overlap support was too complicated, but I cannot recall now
> > > what it was :)
> > > Maybe all of the above repeating some old mistakes.
> >
> > In my impression, this is exactly the solution of Jiri's, and this is
> > part of the discussion that I know:
> >
> > https://lore.kernel.org/bpf/ZfKY6E8xhSgzYL1I@krava/
>
> Yes. It's similar, but somehow it feels simple enough now.
> The algorithms for both detach and attach fit on one page,
> and everything is uniform. There is no spaghetti of corner cases.
>
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-16 22:35 ` Alexei Starovoitov
2025-07-16 22:49 ` Steven Rostedt
@ 2025-07-28 9:20 ` Menglong Dong
2025-07-31 16:15 ` Alexei Starovoitov
1 sibling, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-07-28 9:20 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> >
> > > Maybe Peter has better ideas ?
> >
> > Is it possible to express runqueues::nr_pinned as an alias?
> >
> > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> >
> > And use:
> >
> > __this_cpu_inc(&this_nr_pinned);
> >
> >
> > This syntax doesn't actually seem to work; but can we construct
> > something like that?
>
> Yeah. Iant is right. It's a string and not a pointer dereference.
> It never worked.
>
> Few options:
>
> 1.
> struct rq {
> +#ifdef CONFIG_SMP
> + unsigned int nr_pinned;
> +#endif
> /* runqueue lock: */
> raw_spinlock_t __lock;
>
> @@ -1271,9 +1274,6 @@ struct rq {
> struct cpuidle_state *idle_state;
> #endif
>
> -#ifdef CONFIG_SMP
> - unsigned int nr_pinned;
> -#endif
>
> but ugly...
>
> 2.
> static unsigned int nr_pinned_offset __ro_after_init __used;
> RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
>
> overkill for what's needed
>
> 3.
> OFFSET(RQ_nr_pinned, rq, nr_pinned);
> then
> #include <generated/asm-offsets.h>
>
> imo the best.
I had a try. The struct rq is not visible to asm-offsets.c, so we
can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
to define a similar rq-offsets.c in kernel/sched/ ? It will be more
complex than way 2, and I think way 2 is easier :/
>
> 4.
> Maybe we should extend clang/gcc to support attr(preserve_access_index)
> on x86 and other architectures ;)
> We rely heavily on it in bpf backend.
> Then one can simply write:
>
> struct rq___my {
> unsigned int nr_pinned;
> } __attribute__((preserve_access_index));
>
> struct rq___my *rq;
>
> rq = this_rq();
> rq->nr_pinned++;
>
> and the compiler will do its magic of offset adjustment.
> That's how BPF CORE works.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-28 9:20 ` Menglong Dong
@ 2025-07-31 16:15 ` Alexei Starovoitov
2025-08-01 1:42 ` Menglong Dong
2025-08-06 8:44 ` Menglong Dong
0 siblings, 2 replies; 73+ messages in thread
From: Alexei Starovoitov @ 2025-07-31 16:15 UTC (permalink / raw)
To: Menglong Dong
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Mon, Jul 28, 2025 at 2:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> > >
> > > > Maybe Peter has better ideas ?
> > >
> > > Is it possible to express runqueues::nr_pinned as an alias?
> > >
> > > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> > >
> > > And use:
> > >
> > > __this_cpu_inc(&this_nr_pinned);
> > >
> > >
> > > This syntax doesn't actually seem to work; but can we construct
> > > something like that?
> >
> > Yeah. Iant is right. It's a string and not a pointer dereference.
> > It never worked.
> >
> > Few options:
> >
> > 1.
> > struct rq {
> > +#ifdef CONFIG_SMP
> > + unsigned int nr_pinned;
> > +#endif
> > /* runqueue lock: */
> > raw_spinlock_t __lock;
> >
> > @@ -1271,9 +1274,6 @@ struct rq {
> > struct cpuidle_state *idle_state;
> > #endif
> >
> > -#ifdef CONFIG_SMP
> > - unsigned int nr_pinned;
> > -#endif
> >
> > but ugly...
> >
> > 2.
> > static unsigned int nr_pinned_offset __ro_after_init __used;
> > RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
> >
> > overkill for what's needed
> >
> > 3.
> > OFFSET(RQ_nr_pinned, rq, nr_pinned);
> > then
> > #include <generated/asm-offsets.h>
> >
> > imo the best.
>
> I had a try. The struct rq is not visible to asm-offsets.c, so we
> can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
> to define a similar rq-offsets.c in kernel/sched/ ? It will be more
> complex than the way 2, and I think the second way 2 is
> easier :/
2 may be easier, but it's overkill.
I still think asm-offsets is cleaner.
arch/xx shouldn't be used, of course, since this nr_pinned should
be generic for all archs.
We can do something similar to drivers/memory/emif-asm-offsets.c
and do that within kernel/sched/.
rq-offsets.c as you said.
It will generate rq-offsets.h in a build dir that can be #include-d.
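For reference, the generated header would look roughly like the other
asm-offsets output, e.g. (the numeric value below is just a
placeholder, it depends on the config):

/* include/generated/rq-offsets.h (sketch) */
#ifndef __RQ_OFFSETS_H__
#define __RQ_OFFSETS_H__

#define RQ_nr_pinned 2752 /* offsetof(struct rq, nr_pinned) */

#endif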
I thought about another alternative (as a derivative of 1):
split nr_pinned from 'struct rq' into its own per-cpu variable,
but I don't think that will work, since rq_has_pinned_tasks()
doesn't always operate on this_rq().
So the acceptable choices are realistically 1 and 3 and
rq-offsets.c seems cleaner.
Pls give it another try.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-31 16:15 ` Alexei Starovoitov
@ 2025-08-01 1:42 ` Menglong Dong
2025-08-06 8:44 ` Menglong Dong
1 sibling, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-08-01 1:42 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Fri, Aug 1, 2025 at 12:15 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Jul 28, 2025 at 2:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> > > >
> > > > > Maybe Peter has better ideas ?
> > > >
> > > > Is it possible to express runqueues::nr_pinned as an alias?
> > > >
> > > > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> > > >
> > > > And use:
> > > >
> > > > __this_cpu_inc(&this_nr_pinned);
> > > >
> > > >
> > > > This syntax doesn't actually seem to work; but can we construct
> > > > something like that?
> > >
> > > Yeah. Iant is right. It's a string and not a pointer dereference.
> > > It never worked.
> > >
> > > Few options:
> > >
> > > 1.
> > > struct rq {
> > > +#ifdef CONFIG_SMP
> > > + unsigned int nr_pinned;
> > > +#endif
> > > /* runqueue lock: */
> > > raw_spinlock_t __lock;
> > >
> > > @@ -1271,9 +1274,6 @@ struct rq {
> > > struct cpuidle_state *idle_state;
> > > #endif
> > >
> > > -#ifdef CONFIG_SMP
> > > - unsigned int nr_pinned;
> > > -#endif
> > >
> > > but ugly...
> > >
> > > 2.
> > > static unsigned int nr_pinned_offset __ro_after_init __used;
> > > RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
> > >
> > > overkill for what's needed
> > >
> > > 3.
> > > OFFSET(RQ_nr_pinned, rq, nr_pinned);
> > > then
> > > #include <generated/asm-offsets.h>
> > >
> > > imo the best.
> >
> > I had a try. The struct rq is not visible to asm-offsets.c, so we
> > can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
> > to define a similar rq-offsets.c in kernel/sched/ ? It will be more
> > complex than the way 2, and I think the second way 2 is
> > easier :/
>
> 2 maybe easier, but it's an overkill.
> I still think asm-offset is cleaner.
> arch/xx shouldn't be used, of course, since this nr_pinned should
> be generic for all archs.
> We can do something similar to drivers/memory/emif-asm-offsets.c
Great, I'll have a try on this way!
> and do that within kernel/sched/.
> rq-offsets.c as you said.
> It will generate rq-offsets.h in a build dir that can be #include-d.
>
> I thought about another alternative (as a derivative of 1):
> split nr_pinned from 'struct rq' into its own per-cpu variable,
> but I don't think that will work, since rq_has_pinned_tasks()
> doesn't always operate on this_rq().
> So the acceptable choices are realistically 1 and 3 and
> rq-offsets.c seems cleaner.
> Pls give it another try.
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-07-31 16:15 ` Alexei Starovoitov
2025-08-01 1:42 ` Menglong Dong
@ 2025-08-06 8:44 ` Menglong Dong
2025-08-08 0:58 ` Alexei Starovoitov
1 sibling, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-08-06 8:44 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Fri, Aug 1, 2025 at 12:15 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Jul 28, 2025 at 2:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > >
> > > > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> > > >
> > > > > Maybe Peter has better ideas ?
> > > >
> > > > Is it possible to express runqueues::nr_pinned as an alias?
> > > >
> > > > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> > > >
> > > > And use:
> > > >
> > > > __this_cpu_inc(&this_nr_pinned);
> > > >
> > > >
> > > > This syntax doesn't actually seem to work; but can we construct
> > > > something like that?
> > >
> > > Yeah. Iant is right. It's a string and not a pointer dereference.
> > > It never worked.
> > >
> > > Few options:
> > >
> > > 1.
> > > struct rq {
> > > +#ifdef CONFIG_SMP
> > > + unsigned int nr_pinned;
> > > +#endif
> > > /* runqueue lock: */
> > > raw_spinlock_t __lock;
> > >
> > > @@ -1271,9 +1274,6 @@ struct rq {
> > > struct cpuidle_state *idle_state;
> > > #endif
> > >
> > > -#ifdef CONFIG_SMP
> > > - unsigned int nr_pinned;
> > > -#endif
> > >
> > > but ugly...
> > >
> > > 2.
> > > static unsigned int nr_pinned_offset __ro_after_init __used;
> > > RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
> > >
> > > overkill for what's needed
> > >
> > > 3.
> > > OFFSET(RQ_nr_pinned, rq, nr_pinned);
> > > then
> > > #include <generated/asm-offsets.h>
> > >
> > > imo the best.
> >
> > I had a try. The struct rq is not visible to asm-offsets.c, so we
> > can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
> > to define a similar rq-offsets.c in kernel/sched/ ? It will be more
> > complex than the way 2, and I think the second way 2 is
> > easier :/
>
> 2 maybe easier, but it's an overkill.
> I still think asm-offset is cleaner.
> arch/xx shouldn't be used, of course, since this nr_pinned should
> be generic for all archs.
> We can do something similar to drivers/memory/emif-asm-offsets.c
> and do that within kernel/sched/.
> rq-offsets.c as you said.
> It will generate rq-offsets.h in a build dir that can be #include-d.
>
> I thought about another alternative (as a derivative of 1):
> split nr_pinned from 'struct rq' into its own per-cpu variable,
> but I don't think that will work, since rq_has_pinned_tasks()
> doesn't always operate on this_rq().
> So the acceptable choices are realistically 1 and 3 and
> rq-offsets.c seems cleaner.
> Pls give it another try.
Generally speaking, way 3 works. The only problem is how
we handle this_rq(). I introduced the following code in
include/linux/sched.h:
struct rq;
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
The this_rq_ptr() is used in migrate_enable(). I have to use the
arch_raw_cpu_ptr() for it. this_cpu_ptr() can't be used here, as
it will fail on this_cpu_ptr -> raw_cpu_ptr -> __verify_pcpu_ptr:
#define __verify_pcpu_ptr(ptr) \
do { \
const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
(void)__vpp_verify; \
} while (0)
The struct rq is not available here, which makes the typeof((ptr) + 0)
fail to compile. What can we do here?
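A minimal userspace illustration of the failure (just to show the
mechanism, nothing kernel specific): '+ 0' is pointer arithmetic, and
pointer arithmetic needs the size of the pointed-to type, which is
unknown for a forward-declared struct:

struct rq;			/* forward declaration only, as in sched.h */

void demo(struct rq *p)
{
	struct rq *q = p;	/* fine: copying a pointer to an incomplete type */

	/*
	 * Does not compile: (p + 0) needs sizeof(struct rq), so
	 * typeof(p + 0) cannot be formed while struct rq is incomplete.
	 *
	 * typeof(p + 0) r = p;
	 */
	(void)q;
}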
According to my testing, the performance of fentry increased from
111M/s to 121M/s with migrate_enable/disable inlined.
Following is the whole patch:
-------------------------------------------------------------------------------------------
diff --git a/Kbuild b/Kbuild
index f327ca86990c..13324b4bbe23 100644
--- a/Kbuild
+++ b/Kbuild
@@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s:
$(timeconst-file) $(bounds-file)
$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
$(call filechk,offsets,__ASM_OFFSETS_H__)
+# Generate rq-offsets.h
+
+rq-offsets-file := include/generated/rq-offsets.h
+
+targets += kernel/sched/rq-offsets.s
+
+kernel/sched/rq-offsets.s: $(offsets-file)
+
+$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
+ $(call filechk,offsets,__RQ_OFFSETS_H__)
+
# Check for missing system calls
quiet_cmd_syscalls = CALL $<
cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags)
$(missing_syscalls_flags)
PHONY += missing-syscalls
-missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
+missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
$(call cmd,syscalls)
# Check the manual modification of atomic headers
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 1fad1c8a4c76..3a1c08a75c09 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -369,64 +369,6 @@ static inline void preempt_notifier_init(struct
preempt_notifier *notifier,
#endif
-/*
- * Migrate-Disable and why it is undesired.
- *
- * When a preempted task becomes elegible to run under the ideal model (IOW it
- * becomes one of the M highest priority tasks), it might still have to wait
- * for the preemptee's migrate_disable() section to complete. Thereby suffering
- * a reduction in bandwidth in the exact duration of the migrate_disable()
- * section.
- *
- * Per this argument, the change from preempt_disable() to migrate_disable()
- * gets us:
- *
- * - a higher priority tasks gains reduced wake-up latency; with
preempt_disable()
- * it would have had to wait for the lower priority task.
- *
- * - a lower priority tasks; which under preempt_disable() could've instantly
- * migrated away when another CPU becomes available, is now constrained
- * by the ability to push the higher priority task away, which
might itself be
- * in a migrate_disable() section, reducing it's available bandwidth.
- *
- * IOW it trades latency / moves the interference term, but it stays in the
- * system, and as long as it remains unbounded, the system is not fully
- * deterministic.
- *
- *
- * The reason we have it anyway.
- *
- * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
- * number of primitives into becoming preemptible, they would also allow
- * migration. This turns out to break a bunch of per-cpu usage. To this end,
- * all these primitives employ migirate_disable() to restore this implicit
- * assumption.
- *
- * This is a 'temporary' work-around at best. The correct solution is getting
- * rid of the above assumptions and reworking the code to employ explicit
- * per-cpu locking or short preempt-disable regions.
- *
- * The end goal must be to get rid of migrate_disable(), alternatively we need
- * a schedulability theory that does not depend on abritrary migration.
- *
- *
- * Notes on the implementation.
- *
- * The implementation is particularly tricky since existing code patterns
- * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
- * This means that it cannot use cpus_read_lock() to serialize against hotplug,
- * nor can it easily migrate itself into a pending affinity mask change on
- * migrate_enable().
- *
- *
- * Note: even non-work-conserving schedulers like semi-partitioned depends on
- * migration, so migrate_disable() is not only a problem for
- * work-conserving schedulers.
- *
- */
-extern void migrate_disable(void);
-extern void migrate_enable(void);
-
/**
* preempt_disable_nested - Disable preemption inside a normally
preempt disabled section
*
@@ -471,7 +413,6 @@ static __always_inline void preempt_enable_nested(void)
DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(),
preempt_enable_notrace())
-DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
#ifdef CONFIG_PREEMPT_DYNAMIC
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 40d2fa90df42..365ac6d17504 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -48,6 +48,9 @@
#include <linux/uidgid_types.h>
#include <linux/tracepoint-defs.h>
#include <asm/kmap_size.h>
+#ifndef COMPILE_OFFSETS
+#include <generated/rq-offsets.h>
+#endif
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -2299,4 +2302,127 @@ static __always_inline void
alloc_tag_restore(struct alloc_tag *tag, struct allo
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
+#if defined(CONFIG_SMP) && !defined(COMPILE_OFFSETS)
+
+extern void __migrate_enable(void);
+
+struct rq;
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
+
+/*
+ * Migrate-Disable and why it is undesired.
+ *
+ * When a preempted task becomes elegible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority tasks gains reduced wake-up latency; with
preempt_disable()
+ * it would have had to wait for the lower priority task.
+ *
+ * - a lower priority tasks; which under preempt_disable() could've instantly
+ * migrated away when another CPU becomes available, is now constrained
+ * by the ability to push the higher priority task away, which
might itself be
+ * in a migrate_disable() section, reducing it's available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
+ *
+ *
+ * The reason we have it anyway.
+ *
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migirate_disable() to restore this implicit
+ * assumption.
+ *
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on abritrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depends on
+ * migration, so migrate_disable() is not only a problem for
+ * work-conserving schedulers.
+ *
+ */
+static inline void migrate_enable(void)
+{
+ struct task_struct *p = current;
+
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Check both overflow from migrate_disable() and superfluous
+ * migrate_enable().
+ */
+ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
+ return;
+#endif
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ guard(preempt)();
+ __migrate_enable();
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))--;
+}
+
+static inline void migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+#ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ *Warn about overflow half-way through the range.
+ */
+ WARN_ON_ONCE((s16)p->migration_disabled < 0);
+#endif
+ p->migration_disabled++;
+ return;
+ }
+
+ guard(preempt)();
+ (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))++;
+ p->migration_disabled = 1;
+}
+#else
+static inline void migrate_disable(void) { }
+static inline void migrate_enable(void) { }
+#endif
+
+DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
+
#endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 399f03e62508..75d5f145ca60 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23853,8 +23853,7 @@ int bpf_check_attach_target(struct
bpf_verifier_log *log,
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
-BTF_ID(func, migrate_disable)
-BTF_ID(func, migrate_enable)
+BTF_ID(func, __migrate_enable)
#endif
#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
BTF_ID(func, rcu_read_unlock_strict)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3ec00d08d46a..b521024c99ed 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,6 +119,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+EXPORT_SYMBOL_GPL(runqueues);
#ifdef CONFIG_SCHED_PROXY_EXEC
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -2375,28 +2376,7 @@ static void migrate_disable_switch(struct rq
*rq, struct task_struct *p)
__do_set_cpus_allowed(p, &ac);
}
-void migrate_disable(void)
-{
- struct task_struct *p = current;
-
- if (p->migration_disabled) {
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- *Warn about overflow half-way through the range.
- */
- WARN_ON_ONCE((s16)p->migration_disabled < 0);
-#endif
- p->migration_disabled++;
- return;
- }
-
- guard(preempt)();
- this_rq()->nr_pinned++;
- p->migration_disabled = 1;
-}
-EXPORT_SYMBOL_GPL(migrate_disable);
-
-void migrate_enable(void)
+void __migrate_enable(void)
{
struct task_struct *p = current;
struct affinity_context ac = {
@@ -2404,37 +2384,10 @@ void migrate_enable(void)
.flags = SCA_MIGRATE_ENABLE,
};
-#ifdef CONFIG_DEBUG_PREEMPT
- /*
- * Check both overflow from migrate_disable() and superfluous
- * migrate_enable().
- */
- if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
- return;
-#endif
-
- if (p->migration_disabled > 1) {
- p->migration_disabled--;
- return;
- }
-
- /*
- * Ensure stop_task runs either before or after this, and that
- * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
- */
- guard(preempt)();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &ac);
- /*
- * Mustn't clear migration_disabled() until cpus_ptr points back at the
- * regular cpus_mask, otherwise things that race (eg.
- * select_fallback_rq) get confused.
- */
- barrier();
- p->migration_disabled = 0;
- this_rq()->nr_pinned--;
}
-EXPORT_SYMBOL_GPL(migrate_enable);
+EXPORT_SYMBOL_GPL(__migrate_enable);
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
new file mode 100644
index 000000000000..a23747bbe25b
--- /dev/null
+++ b/kernel/sched/rq-offsets.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#define COMPILE_OFFSETS
+#include <linux/kbuild.h>
+#include <linux/types.h>
+#include "sched.h"
+
+int main(void)
+{
+ DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
+
+ return 0;
+}
^ permalink raw reply related [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-08-06 8:44 ` Menglong Dong
@ 2025-08-08 0:58 ` Alexei Starovoitov
2025-08-08 5:48 ` Menglong Dong
2025-08-08 6:32 ` Menglong Dong
0 siblings, 2 replies; 73+ messages in thread
From: Alexei Starovoitov @ 2025-08-08 0:58 UTC (permalink / raw)
To: Menglong Dong
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Wed, Aug 6, 2025 at 1:44 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> On Fri, Aug 1, 2025 at 12:15 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Jul 28, 2025 at 2:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > >
> > > On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > >
> > > > > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> > > > >
> > > > > > Maybe Peter has better ideas ?
> > > > >
> > > > > Is it possible to express runqueues::nr_pinned as an alias?
> > > > >
> > > > > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> > > > >
> > > > > And use:
> > > > >
> > > > > __this_cpu_inc(&this_nr_pinned);
> > > > >
> > > > >
> > > > > This syntax doesn't actually seem to work; but can we construct
> > > > > something like that?
> > > >
> > > > Yeah. Iant is right. It's a string and not a pointer dereference.
> > > > It never worked.
> > > >
> > > > Few options:
> > > >
> > > > 1.
> > > > struct rq {
> > > > +#ifdef CONFIG_SMP
> > > > + unsigned int nr_pinned;
> > > > +#endif
> > > > /* runqueue lock: */
> > > > raw_spinlock_t __lock;
> > > >
> > > > @@ -1271,9 +1274,6 @@ struct rq {
> > > > struct cpuidle_state *idle_state;
> > > > #endif
> > > >
> > > > -#ifdef CONFIG_SMP
> > > > - unsigned int nr_pinned;
> > > > -#endif
> > > >
> > > > but ugly...
> > > >
> > > > 2.
> > > > static unsigned int nr_pinned_offset __ro_after_init __used;
> > > > RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
> > > >
> > > > overkill for what's needed
> > > >
> > > > 3.
> > > > OFFSET(RQ_nr_pinned, rq, nr_pinned);
> > > > then
> > > > #include <generated/asm-offsets.h>
> > > >
> > > > imo the best.
> > >
> > > I had a try. The struct rq is not visible to asm-offsets.c, so we
> > > can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
> > > to define a similar rq-offsets.c in kernel/sched/ ? It will be more
> > > complex than the way 2, and I think the second way 2 is
> > > easier :/
> >
> > 2 maybe easier, but it's an overkill.
> > I still think asm-offset is cleaner.
> > arch/xx shouldn't be used, of course, since this nr_pinned should
> > be generic for all archs.
> > We can do something similar to drivers/memory/emif-asm-offsets.c
> > and do that within kernel/sched/.
> > rq-offsets.c as you said.
> > It will generate rq-offsets.h in a build dir that can be #include-d.
> >
> > I thought about another alternative (as a derivative of 1):
> > split nr_pinned from 'struct rq' into its own per-cpu variable,
> > but I don't think that will work, since rq_has_pinned_tasks()
> > doesn't always operate on this_rq().
> > So the acceptable choices are realistically 1 and 3 and
> > rq-offsets.c seems cleaner.
> > Pls give it another try.
>
> Generally speaking, the way 3 works. The only problem is how
> we handle this_rq(). I introduced following code in
> include/linux/sched.h:
>
> struct rq;
> DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> #define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
>
> The this_rq_ptr() is used in migrate_enable(). I have to use the
> arch_raw_cpu_ptr() for it. this_cpu_ptr() can't be used here, as
> it will fail on this_cpu_ptr -> raw_cpu_ptr -> __verify_pcpu_ptr:
>
> #define __verify_pcpu_ptr(ptr) \
> do { \
> const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
> (void)__vpp_verify; \
> } while (0)
>
> The struct rq is not available here, which makes the typeof((ptr) + 0)
> fail during compiling. What can we do here?
Interesting.
The comment says:
* + 0 is required in order to convert the pointer type from a
* potential array type to a pointer to a single item of the array.
so maybe we can do some macro magic to avoid the '+ 0'
when the type is already a pointer,
but for now let's proceed with arch_raw_cpu_ptr().
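One possible shape of that macro magic, purely as an untested idea:
replace '(ptr) + 0' with '&*(ptr)'. The &* pair is folded away without
doing any arithmetic, so it doesn't require the pointee to be a
complete type, while an array argument still decays to a pointer to
its first element. Whether sparse keeps tracking the __percpu address
space through the (cancelled) dereference would need to be verified:

#define __verify_pcpu_ptr(ptr)						\
do {									\
	const void __percpu *__vpp_verify = (typeof(&*(ptr)))NULL;	\
	(void)__vpp_verify;						\
} while (0)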
> According to my testing, the performance of fentry increased from
> 111M/s to 121M/s with migrate_enable/disable inlined.
Very nice.
> Following is the whole patch:
> -------------------------------------------------------------------------------------------
> diff --git a/Kbuild b/Kbuild
> index f327ca86990c..13324b4bbe23 100644
> --- a/Kbuild
> +++ b/Kbuild
> @@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s:
> $(timeconst-file) $(bounds-file)
> $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
> $(call filechk,offsets,__ASM_OFFSETS_H__)
>
> +# Generate rq-offsets.h
> +
> +rq-offsets-file := include/generated/rq-offsets.h
> +
> +targets += kernel/sched/rq-offsets.s
> +
> +kernel/sched/rq-offsets.s: $(offsets-file)
> +
> +$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
> + $(call filechk,offsets,__RQ_OFFSETS_H__)
> +
> # Check for missing system calls
>
> quiet_cmd_syscalls = CALL $<
> cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags)
> $(missing_syscalls_flags)
>
> PHONY += missing-syscalls
> -missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
> +missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
> $(call cmd,syscalls)
>
> # Check the manual modification of atomic headers
> diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> index 1fad1c8a4c76..3a1c08a75c09 100644
> --- a/include/linux/preempt.h
> +++ b/include/linux/preempt.h
> @@ -369,64 +369,6 @@ static inline void preempt_notifier_init(struct
> preempt_notifier *notifier,
>
> #endif
>
> -/*
> - * Migrate-Disable and why it is undesired.
Keep the comment where it is. It will keep the diff smaller.
There is really no need to move it.
> - *
> - * When a preempted task becomes elegible to run under the ideal model (IOW it
but fix the typos.
> - * becomes one of the M highest priority tasks), it might still have to wait
> - * for the preemptee's migrate_disable() section to complete. Thereby suffering
> - * a reduction in bandwidth in the exact duration of the migrate_disable()
> - * section.
> - *
> - * Per this argument, the change from preempt_disable() to migrate_disable()
> - * gets us:
> - *
> - * - a higher priority tasks gains reduced wake-up latency; with
> preempt_disable()
> - * it would have had to wait for the lower priority task.
> - *
> - * - a lower priority tasks; which under preempt_disable() could've instantly
> - * migrated away when another CPU becomes available, is now constrained
> - * by the ability to push the higher priority task away, which
> might itself be
> - * in a migrate_disable() section, reducing it's available bandwidth.
> - *
> - * IOW it trades latency / moves the interference term, but it stays in the
> - * system, and as long as it remains unbounded, the system is not fully
> - * deterministic.
> - *
> - *
> - * The reason we have it anyway.
> - *
> - * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
> - * number of primitives into becoming preemptible, they would also allow
> - * migration. This turns out to break a bunch of per-cpu usage. To this end,
> - * all these primitives employ migirate_disable() to restore this implicit
> - * assumption.
> - *
> - * This is a 'temporary' work-around at best. The correct solution is getting
> - * rid of the above assumptions and reworking the code to employ explicit
> - * per-cpu locking or short preempt-disable regions.
> - *
> - * The end goal must be to get rid of migrate_disable(), alternatively we need
> - * a schedulability theory that does not depend on abritrary migration.
and this one.
> - *
> - *
> - * Notes on the implementation.
> - *
> - * The implementation is particularly tricky since existing code patterns
> - * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
> - * This means that it cannot use cpus_read_lock() to serialize against hotplug,
> - * nor can it easily migrate itself into a pending affinity mask change on
> - * migrate_enable().
> - *
> - *
> - * Note: even non-work-conserving schedulers like semi-partitioned depends on
> - * migration, so migrate_disable() is not only a problem for
> - * work-conserving schedulers.
> - *
> - */
> -extern void migrate_disable(void);
> -extern void migrate_enable(void);
> -
> /**
> * preempt_disable_nested - Disable preemption inside a normally
> preempt disabled section
> *
> @@ -471,7 +413,6 @@ static __always_inline void preempt_enable_nested(void)
>
> DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
> DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(),
> preempt_enable_notrace())
> -DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
hmm. why?
> #ifdef CONFIG_PREEMPT_DYNAMIC
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 40d2fa90df42..365ac6d17504 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -48,6 +48,9 @@
> #include <linux/uidgid_types.h>
> #include <linux/tracepoint-defs.h>
> #include <asm/kmap_size.h>
> +#ifndef COMPILE_OFFSETS
> +#include <generated/rq-offsets.h>
> +#endif
>
> /* task_struct member predeclarations (sorted alphabetically): */
> struct audit_context;
> @@ -2299,4 +2302,127 @@ static __always_inline void
> alloc_tag_restore(struct alloc_tag *tag, struct allo
> #define alloc_tag_restore(_tag, _old) do {} while (0)
> #endif
>
> +#if defined(CONFIG_SMP) && !defined(COMPILE_OFFSETS)
> +
> +extern void __migrate_enable(void);
> +
> +struct rq;
> +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> +#define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
> +
> +/*
> + * Migrate-Disable and why it is undesired.
> + *
> + * When a preempted task becomes elegible to run under the ideal model (IOW it
> + * becomes one of the M highest priority tasks), it might still have to wait
> + * for the preemptee's migrate_disable() section to complete. Thereby suffering
> + * a reduction in bandwidth in the exact duration of the migrate_disable()
> + * section.
> + *
> + * Per this argument, the change from preempt_disable() to migrate_disable()
> + * gets us:
> + *
> + * - a higher priority tasks gains reduced wake-up latency; with
> preempt_disable()
> + * it would have had to wait for the lower priority task.
> + *
> + * - a lower priority tasks; which under preempt_disable() could've instantly
> + * migrated away when another CPU becomes available, is now constrained
> + * by the ability to push the higher priority task away, which
> might itself be
> + * in a migrate_disable() section, reducing it's available bandwidth.
> + *
> + * IOW it trades latency / moves the interference term, but it stays in the
> + * system, and as long as it remains unbounded, the system is not fully
> + * deterministic.
> + *
> + *
> + * The reason we have it anyway.
> + *
> + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
> + * number of primitives into becoming preemptible, they would also allow
> + * migration. This turns out to break a bunch of per-cpu usage. To this end,
> + * all these primitives employ migirate_disable() to restore this implicit
> + * assumption.
> + *
> + * This is a 'temporary' work-around at best. The correct solution is getting
> + * rid of the above assumptions and reworking the code to employ explicit
> + * per-cpu locking or short preempt-disable regions.
> + *
> + * The end goal must be to get rid of migrate_disable(), alternatively we need
> + * a schedulability theory that does not depend on abritrary migration.
> + *
> + *
> + * Notes on the implementation.
> + *
> + * The implementation is particularly tricky since existing code patterns
> + * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
> + * This means that it cannot use cpus_read_lock() to serialize against hotplug,
> + * nor can it easily migrate itself into a pending affinity mask change on
> + * migrate_enable().
> + *
> + *
> + * Note: even non-work-conserving schedulers like semi-partitioned depends on
> + * migration, so migrate_disable() is not only a problem for
> + * work-conserving schedulers.
> + *
> + */
> +static inline void migrate_enable(void)
> +{
> + struct task_struct *p = current;
> +
> +#ifdef CONFIG_DEBUG_PREEMPT
> + /*
> + * Check both overflow from migrate_disable() and superfluous
> + * migrate_enable().
> + */
> + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
> + return;
> +#endif
> +
> + if (p->migration_disabled > 1) {
> + p->migration_disabled--;
> + return;
> + }
> +
> + /*
> + * Ensure stop_task runs either before or after this, and that
> + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
> + */
> + guard(preempt)();
> + __migrate_enable();
You're leaving performance on the table.
In many cases bpf is the one and only user of migrate_enable/disable
and it's not nested.
So this call is likely hot.
Move 'if (p->cpus_ptr != &p->cpus_mask)' check into .h
and only keep slow path of __set_cpus_allowed_ptr() in .c
Can probably wrap it with likely() too.
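Something like the sketch below, based on the migrate_enable() from
the patch above (the DEBUG_PREEMPT check is dropped for brevity, and
only the __set_cpus_allowed_ptr() call would remain in
__migrate_enable()). This is just a sketch of the suggestion, not the
final patch:

static inline void migrate_enable(void)
{
	struct task_struct *p = current;

	if (p->migration_disabled > 1) {
		p->migration_disabled--;
		return;
	}

	/*
	 * Ensure stop_task runs either before or after this, and that
	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
	 */
	guard(preempt)();
	/* common case: cpus_ptr was never redirected, stay inline */
	if (unlikely(p->cpus_ptr != &p->cpus_mask))
		__migrate_enable();	/* slow path: __set_cpus_allowed_ptr() */
	/*
	 * Mustn't clear migration_disabled() until cpus_ptr points back at
	 * the regular cpus_mask, see the comment in the patch above.
	 */
	barrier();
	p->migration_disabled = 0;
	(*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))--;
}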
> + /*
> + * Mustn't clear migration_disabled() until cpus_ptr points back at the
> + * regular cpus_mask, otherwise things that race (eg.
> + * select_fallback_rq) get confused.
> + */
> + barrier();
> + p->migration_disabled = 0;
> + (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))--;
> +}
> +
> +static inline void migrate_disable(void)
> +{
> + struct task_struct *p = current;
> +
> + if (p->migration_disabled) {
> +#ifdef CONFIG_DEBUG_PREEMPT
> + /*
> + *Warn about overflow half-way through the range.
> + */
> + WARN_ON_ONCE((s16)p->migration_disabled < 0);
> +#endif
> + p->migration_disabled++;
> + return;
> + }
> +
> + guard(preempt)();
> + (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))++;
> + p->migration_disabled = 1;
> +}
> +#else
> +static inline void migrate_disable(void) { }
> +static inline void migrate_enable(void) { }
> +#endif
> +
> +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
> +
> #endif
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 399f03e62508..75d5f145ca60 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -23853,8 +23853,7 @@ int bpf_check_attach_target(struct
> bpf_verifier_log *log,
> BTF_SET_START(btf_id_deny)
> BTF_ID_UNUSED
> #ifdef CONFIG_SMP
> -BTF_ID(func, migrate_disable)
> -BTF_ID(func, migrate_enable)
> +BTF_ID(func, __migrate_enable)
> #endif
> #if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
> BTF_ID(func, rcu_read_unlock_strict)
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3ec00d08d46a..b521024c99ed 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -119,6 +119,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
>
> DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> +EXPORT_SYMBOL_GPL(runqueues);
why?
>
> #ifdef CONFIG_SCHED_PROXY_EXEC
> DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
> @@ -2375,28 +2376,7 @@ static void migrate_disable_switch(struct rq
> *rq, struct task_struct *p)
> __do_set_cpus_allowed(p, &ac);
> }
>
> -void migrate_disable(void)
> -{
> - struct task_struct *p = current;
> -
> - if (p->migration_disabled) {
> -#ifdef CONFIG_DEBUG_PREEMPT
> - /*
> - *Warn about overflow half-way through the range.
> - */
> - WARN_ON_ONCE((s16)p->migration_disabled < 0);
> -#endif
> - p->migration_disabled++;
> - return;
> - }
> -
> - guard(preempt)();
> - this_rq()->nr_pinned++;
> - p->migration_disabled = 1;
> -}
> -EXPORT_SYMBOL_GPL(migrate_disable);
> -
> -void migrate_enable(void)
> +void __migrate_enable(void)
> {
> struct task_struct *p = current;
> struct affinity_context ac = {
> @@ -2404,37 +2384,10 @@ void migrate_enable(void)
> .flags = SCA_MIGRATE_ENABLE,
> };
>
> -#ifdef CONFIG_DEBUG_PREEMPT
> - /*
> - * Check both overflow from migrate_disable() and superfluous
> - * migrate_enable().
> - */
> - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
> - return;
> -#endif
> -
> - if (p->migration_disabled > 1) {
> - p->migration_disabled--;
> - return;
> - }
> -
> - /*
> - * Ensure stop_task runs either before or after this, and that
> - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
> - */
> - guard(preempt)();
> if (p->cpus_ptr != &p->cpus_mask)
> __set_cpus_allowed_ptr(p, &ac);
> - /*
> - * Mustn't clear migration_disabled() until cpus_ptr points back at the
> - * regular cpus_mask, otherwise things that race (eg.
> - * select_fallback_rq) get confused.
> - */
> - barrier();
> - p->migration_disabled = 0;
> - this_rq()->nr_pinned--;
> }
> -EXPORT_SYMBOL_GPL(migrate_enable);
> +EXPORT_SYMBOL_GPL(__migrate_enable);
>
> static inline bool rq_has_pinned_tasks(struct rq *rq)
> {
> diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
> new file mode 100644
> index 000000000000..a23747bbe25b
> --- /dev/null
> +++ b/kernel/sched/rq-offsets.c
> @@ -0,0 +1,12 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define COMPILE_OFFSETS
> +#include <linux/kbuild.h>
> +#include <linux/types.h>
> +#include "sched.h"
> +
> +int main(void)
> +{
> + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
This part looks nice and sweet. Not sure what you were concerned about.
Respin it as a proper patch targeting tip tree.
And explain the motivation in commit log with detailed
'perf report' before/after along with the 111M/s to 121M/s speed up.
I suspect that with my other __set_cpus_allowed_ptr() suggestion
the speed up should be even bigger.
> + return 0;
> +}
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-08-08 0:58 ` Alexei Starovoitov
@ 2025-08-08 5:48 ` Menglong Dong
2025-08-08 6:32 ` Menglong Dong
1 sibling, 0 replies; 73+ messages in thread
From: Menglong Dong @ 2025-08-08 5:48 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Fri, Aug 8, 2025 at 8:58 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Aug 6, 2025 at 1:44 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> >
> > On Fri, Aug 1, 2025 at 12:15 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Mon, Jul 28, 2025 at 2:20 AM Menglong Dong <menglong8.dong@gmail.com> wrote:
> > > >
> > > > On Thu, Jul 17, 2025 at 6:35 AM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > On Wed, Jul 16, 2025 at 11:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > > > >
> > > > > > On Wed, Jul 16, 2025 at 09:56:11AM -0700, Alexei Starovoitov wrote:
> > > > > >
> > > > > > > Maybe Peter has better ideas ?
> > > > > >
> > > > > > Is it possible to express runqueues::nr_pinned as an alias?
> > > > > >
> > > > > > extern unsigned int __attribute__((alias("runqueues.nr_pinned"))) this_nr_pinned;
> > > > > >
> > > > > > And use:
> > > > > >
> > > > > > __this_cpu_inc(&this_nr_pinned);
> > > > > >
> > > > > >
> > > > > > This syntax doesn't actually seem to work; but can we construct
> > > > > > something like that?
> > > > >
> > > > > Yeah. Iant is right. It's a string and not a pointer dereference.
> > > > > It never worked.
> > > > >
> > > > > Few options:
> > > > >
> > > > > 1.
> > > > > struct rq {
> > > > > +#ifdef CONFIG_SMP
> > > > > + unsigned int nr_pinned;
> > > > > +#endif
> > > > > /* runqueue lock: */
> > > > > raw_spinlock_t __lock;
> > > > >
> > > > > @@ -1271,9 +1274,6 @@ struct rq {
> > > > > struct cpuidle_state *idle_state;
> > > > > #endif
> > > > >
> > > > > -#ifdef CONFIG_SMP
> > > > > - unsigned int nr_pinned;
> > > > > -#endif
> > > > >
> > > > > but ugly...
> > > > >
> > > > > 2.
> > > > > static unsigned int nr_pinned_offset __ro_after_init __used;
> > > > > RUNTIME_CONST(nr_pinned_offset, nr_pinned_offset)
> > > > >
> > > > > overkill for what's needed
> > > > >
> > > > > 3.
> > > > > OFFSET(RQ_nr_pinned, rq, nr_pinned);
> > > > > then
> > > > > #include <generated/asm-offsets.h>
> > > > >
> > > > > imo the best.
> > > >
> > > > I had a try. The struct rq is not visible to asm-offsets.c, so we
> > > > can't define it in arch/xx/kernel/asm-offsets.c. Do you mean
> > > > to define a similar rq-offsets.c in kernel/sched/ ? It will be more
> > > > complex than the way 2, and I think the second way 2 is
> > > > easier :/
> > >
> > > 2 maybe easier, but it's an overkill.
> > > I still think asm-offset is cleaner.
> > > arch/xx shouldn't be used, of course, since this nr_pinned should
> > > be generic for all archs.
> > > We can do something similar to drivers/memory/emif-asm-offsets.c
> > > and do that within kernel/sched/.
> > > rq-offsets.c as you said.
> > > It will generate rq-offsets.h in a build dir that can be #include-d.
> > >
> > > I thought about another alternative (as a derivative of 1):
> > > split nr_pinned from 'struct rq' into its own per-cpu variable,
> > > but I don't think that will work, since rq_has_pinned_tasks()
> > > doesn't always operate on this_rq().
> > > So the acceptable choices are realistically 1 and 3 and
> > > rq-offsets.c seems cleaner.
> > > Pls give it another try.
> >
> > Generally speaking, the way 3 works. The only problem is how
> > we handle this_rq(). I introduced following code in
> > include/linux/sched.h:
> >
> > struct rq;
> > DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> > #define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
> >
> > The this_rq_ptr() is used in migrate_enable(). I have to use the
> > arch_raw_cpu_ptr() for it. this_cpu_ptr() can't be used here, as
> > it will fail on this_cpu_ptr -> raw_cpu_ptr -> __verify_pcpu_ptr:
> >
> > #define __verify_pcpu_ptr(ptr) \
> > do { \
> > const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
> > (void)__vpp_verify; \
> > } while (0)
> >
> > The struct rq is not available here, which makes the typeof((ptr) + 0)
> > fail to compile. What can we do here?
>
> Interesting.
> The comment says:
> * + 0 is required in order to convert the pointer type from a
> * potential array type to a pointer to a single item of the array.
>
> so maybe we can do some macro magic to avoid '+ 0'
> when type is already pointer,
> but for now let's proceed with arch_raw_cpu_ptr().
OK
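As one possible shape of that macro magic (an untested sketch, not part
of the posted series; whether sparse's __percpu/noderef checking is
still happy with it would need verifying): an &* pair gives the same
array-to-pointer decay as the '+ 0' trick, but without any pointer
arithmetic, so it also accepts a pointer to a type that is only
forward-declared, like struct rq here:

/*
 * Hypothetical variant: &*(ptr) decays a potential array to a pointer
 * to a single item, just like (ptr) + 0 does, but needs no pointer
 * arithmetic and therefore no complete type.
 */
#define __verify_pcpu_ptr(ptr)						\
do {									\
	const void __percpu *__vpp_verify = (typeof(&*(ptr)))NULL;	\
	(void)__vpp_verify;						\
} while (0)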
>
> > According to my testing, the performance of fentry increased from
> > 111M/s to 121M/s with migrate_enable/disable inlined.
>
> Very nice.
>
> > Following is the whole patch:
> > -------------------------------------------------------------------------------------------
> > diff --git a/Kbuild b/Kbuild
> > index f327ca86990c..13324b4bbe23 100644
> > --- a/Kbuild
> > +++ b/Kbuild
> > @@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s:
> > $(timeconst-file) $(bounds-file)
> > $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
> > $(call filechk,offsets,__ASM_OFFSETS_H__)
> >
> > +# Generate rq-offsets.h
> > +
> > +rq-offsets-file := include/generated/rq-offsets.h
> > +
> > +targets += kernel/sched/rq-offsets.s
> > +
> > +kernel/sched/rq-offsets.s: $(offsets-file)
> > +
> > +$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE
> > + $(call filechk,offsets,__RQ_OFFSETS_H__)
> > +
> > # Check for missing system calls
> >
> > quiet_cmd_syscalls = CALL $<
> > cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags)
> > $(missing_syscalls_flags)
> >
> > PHONY += missing-syscalls
> > -missing-syscalls: scripts/checksyscalls.sh $(offsets-file)
> > +missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file)
> > $(call cmd,syscalls)
> >
> > # Check the manual modification of atomic headers
> > diff --git a/include/linux/preempt.h b/include/linux/preempt.h
> > index 1fad1c8a4c76..3a1c08a75c09 100644
> > --- a/include/linux/preempt.h
> > +++ b/include/linux/preempt.h
> > @@ -369,64 +369,6 @@ static inline void preempt_notifier_init(struct
> > preempt_notifier *notifier,
> >
> > #endif
> >
> > -/*
> > - * Migrate-Disable and why it is undesired.
>
> Keep the comment where it is. It will keep the diff smaller.
> There is really no need to move it.
OK
>
> > - *
> > - * When a preempted task becomes elegible to run under the ideal model (IOW it
>
> but fix the typos.
OK
>
> > - * becomes one of the M highest priority tasks), it might still have to wait
> > - * for the preemptee's migrate_disable() section to complete. Thereby suffering
> > - * a reduction in bandwidth in the exact duration of the migrate_disable()
> > - * section.
> > - *
> > - * Per this argument, the change from preempt_disable() to migrate_disable()
> > - * gets us:
> > - *
> > - * - a higher priority tasks gains reduced wake-up latency; with
> > preempt_disable()
> > - * it would have had to wait for the lower priority task.
> > - *
> > - * - a lower priority tasks; which under preempt_disable() could've instantly
> > - * migrated away when another CPU becomes available, is now constrained
> > - * by the ability to push the higher priority task away, which
> > might itself be
> > - * in a migrate_disable() section, reducing it's available bandwidth.
> > - *
> > - * IOW it trades latency / moves the interference term, but it stays in the
> > - * system, and as long as it remains unbounded, the system is not fully
> > - * deterministic.
> > - *
> > - *
> > - * The reason we have it anyway.
> > - *
> > - * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
> > - * number of primitives into becoming preemptible, they would also allow
> > - * migration. This turns out to break a bunch of per-cpu usage. To this end,
> > - * all these primitives employ migirate_disable() to restore this implicit
> > - * assumption.
> > - *
> > - * This is a 'temporary' work-around at best. The correct solution is getting
> > - * rid of the above assumptions and reworking the code to employ explicit
> > - * per-cpu locking or short preempt-disable regions.
> > - *
> > - * The end goal must be to get rid of migrate_disable(), alternatively we need
> > - * a schedulability theory that does not depend on abritrary migration.
>
> and this one.
OK
>
> > - *
> > - *
> > - * Notes on the implementation.
> > - *
> > - * The implementation is particularly tricky since existing code patterns
> > - * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
> > - * This means that it cannot use cpus_read_lock() to serialize against hotplug,
> > - * nor can it easily migrate itself into a pending affinity mask change on
> > - * migrate_enable().
> > - *
> > - *
> > - * Note: even non-work-conserving schedulers like semi-partitioned depends on
> > - * migration, so migrate_disable() is not only a problem for
> > - * work-conserving schedulers.
> > - *
> > - */
> > -extern void migrate_disable(void);
> > -extern void migrate_enable(void);
> > -
> > /**
> > * preempt_disable_nested - Disable preemption inside a normally
> > preempt disabled section
> > *
> > @@ -471,7 +413,6 @@ static __always_inline void preempt_enable_nested(void)
> >
> > DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
> > DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(),
> > preempt_enable_notrace())
> > -DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>
> hmm. why?
Because migrate_disable() and migrate_enable() are moved to
include/linux/sched.h, they are no longer available in
include/linux/preempt.h, so this line needs to move to
include/linux/sched.h too.
>
> > #ifdef CONFIG_PREEMPT_DYNAMIC
> >
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 40d2fa90df42..365ac6d17504 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -48,6 +48,9 @@
> > #include <linux/uidgid_types.h>
> > #include <linux/tracepoint-defs.h>
> > #include <asm/kmap_size.h>
> > +#ifndef COMPILE_OFFSETS
> > +#include <generated/rq-offsets.h>
> > +#endif
> >
> > /* task_struct member predeclarations (sorted alphabetically): */
> > struct audit_context;
> > @@ -2299,4 +2302,127 @@ static __always_inline void
> > alloc_tag_restore(struct alloc_tag *tag, struct allo
> > #define alloc_tag_restore(_tag, _old) do {} while (0)
> > #endif
> >
> > +#if defined(CONFIG_SMP) && !defined(COMPILE_OFFSETS)
> > +
> > +extern void __migrate_enable(void);
> > +
> > +struct rq;
> > +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> > +#define this_rq_ptr() arch_raw_cpu_ptr(&runqueues)
> > +
> > +/*
> > + * Migrate-Disable and why it is undesired.
> > + *
> > + * When a preempted task becomes elegible to run under the ideal model (IOW it
> > + * becomes one of the M highest priority tasks), it might still have to wait
> > + * for the preemptee's migrate_disable() section to complete. Thereby suffering
> > + * a reduction in bandwidth in the exact duration of the migrate_disable()
> > + * section.
> > + *
> > + * Per this argument, the change from preempt_disable() to migrate_disable()
> > + * gets us:
> > + *
> > + * - a higher priority tasks gains reduced wake-up latency; with
> > preempt_disable()
> > + * it would have had to wait for the lower priority task.
> > + *
> > + * - a lower priority tasks; which under preempt_disable() could've instantly
> > + * migrated away when another CPU becomes available, is now constrained
> > + * by the ability to push the higher priority task away, which
> > might itself be
> > + * in a migrate_disable() section, reducing it's available bandwidth.
> > + *
> > + * IOW it trades latency / moves the interference term, but it stays in the
> > + * system, and as long as it remains unbounded, the system is not fully
> > + * deterministic.
> > + *
> > + *
> > + * The reason we have it anyway.
> > + *
> > + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
> > + * number of primitives into becoming preemptible, they would also allow
> > + * migration. This turns out to break a bunch of per-cpu usage. To this end,
> > + * all these primitives employ migirate_disable() to restore this implicit
> > + * assumption.
> > + *
> > + * This is a 'temporary' work-around at best. The correct solution is getting
> > + * rid of the above assumptions and reworking the code to employ explicit
> > + * per-cpu locking or short preempt-disable regions.
> > + *
> > + * The end goal must be to get rid of migrate_disable(), alternatively we need
> > + * a schedulability theory that does not depend on abritrary migration.
> > + *
> > + *
> > + * Notes on the implementation.
> > + *
> > + * The implementation is particularly tricky since existing code patterns
> > + * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
> > + * This means that it cannot use cpus_read_lock() to serialize against hotplug,
> > + * nor can it easily migrate itself into a pending affinity mask change on
> > + * migrate_enable().
> > + *
> > + *
> > + * Note: even non-work-conserving schedulers like semi-partitioned depends on
> > + * migration, so migrate_disable() is not only a problem for
> > + * work-conserving schedulers.
> > + *
> > + */
> > +static inline void migrate_enable(void)
> > +{
> > + struct task_struct *p = current;
> > +
> > +#ifdef CONFIG_DEBUG_PREEMPT
> > + /*
> > + * Check both overflow from migrate_disable() and superfluous
> > + * migrate_enable().
> > + */
> > + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
> > + return;
> > +#endif
> > +
> > + if (p->migration_disabled > 1) {
> > + p->migration_disabled--;
> > + return;
> > + }
> > +
> > + /*
> > + * Ensure stop_task runs either before or after this, and that
> > + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
> > + */
> > + guard(preempt)();
> > + __migrate_enable();
>
> You're leaving performance on the table.
> In many case bpf is one and only user of migrate_enable/disable
> and it's not nested.
> So this call is likely hot.
> Move 'if (p->cpus_ptr != &p->cpus_mask)' check into .h
> and only keep slow path of __set_cpus_allowed_ptr() in .c
Oops, my mistake, I should do it this way :/
>
> Can probably wrap it with likely() too.
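Concretely, the header side of that suggestion might look like the
sketch below (illustration only, not the code that was eventually
posted; it assumes __migrate_enable() keeps just the cpus_ptr fixup
done via __set_cpus_allowed_ptr()):

static inline void migrate_enable(void)
{
	struct task_struct *p = current;

#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Check both overflow from migrate_disable() and superfluous
	 * migrate_enable().
	 */
	if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
		return;
#endif

	if (p->migration_disabled > 1) {
		p->migration_disabled--;
		return;
	}

	/*
	 * Ensure stop_task runs either before or after this, and that
	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
	 */
	guard(preempt)();
	/*
	 * Only the rare case, an affinity change that arrived while
	 * migration was disabled, needs the out-of-line slow path.
	 */
	if (unlikely(p->cpus_ptr != &p->cpus_mask))
		__migrate_enable();
	/*
	 * Mustn't clear migration_disabled() until cpus_ptr points back at
	 * the regular cpus_mask, otherwise things that race (eg.
	 * select_fallback_rq) get confused.
	 */
	barrier();
	p->migration_disabled = 0;
	(*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))--;
}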
>
> > + /*
> > + * Mustn't clear migration_disabled() until cpus_ptr points back at the
> > + * regular cpus_mask, otherwise things that race (eg.
> > + * select_fallback_rq) get confused.
> > + */
> > + barrier();
> > + p->migration_disabled = 0;
> > + (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))--;
> > +}
> > +
> > +static inline void migrate_disable(void)
> > +{
> > + struct task_struct *p = current;
> > +
> > + if (p->migration_disabled) {
> > +#ifdef CONFIG_DEBUG_PREEMPT
> > + /*
> > + *Warn about overflow half-way through the range.
> > + */
> > + WARN_ON_ONCE((s16)p->migration_disabled < 0);
> > +#endif
> > + p->migration_disabled++;
> > + return;
> > + }
> > +
> > + guard(preempt)();
> > + (*(unsigned int *)((void *)this_rq_ptr() + RQ_nr_pinned))++;
> > + p->migration_disabled = 1;
> > +}
> > +#else
> > +static inline void migrate_disable(void) { }
> > +static inline void migrate_enable(void) { }
> > +#endif
> > +
> > +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
> > +
> > #endif
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 399f03e62508..75d5f145ca60 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -23853,8 +23853,7 @@ int bpf_check_attach_target(struct
> > bpf_verifier_log *log,
> > BTF_SET_START(btf_id_deny)
> > BTF_ID_UNUSED
> > #ifdef CONFIG_SMP
> > -BTF_ID(func, migrate_disable)
> > -BTF_ID(func, migrate_enable)
> > +BTF_ID(func, __migrate_enable)
> > #endif
> > #if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
> > BTF_ID(func, rcu_read_unlock_strict)
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 3ec00d08d46a..b521024c99ed 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -119,6 +119,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
> > EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
> >
> > DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
> > +EXPORT_SYMBOL_GPL(runqueues);
>
> why?
Because runqueues is now referenced directly in the inlined
migrate_enable()/migrate_disable(), which can be called from modules,
we need to export it. Don't we?
>
> >
> > #ifdef CONFIG_SCHED_PROXY_EXEC
> > DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
> > @@ -2375,28 +2376,7 @@ static void migrate_disable_switch(struct rq
> > *rq, struct task_struct *p)
> > __do_set_cpus_allowed(p, &ac);
> > }
> >
> > -void migrate_disable(void)
> > -{
> > - struct task_struct *p = current;
> > -
> > - if (p->migration_disabled) {
> > -#ifdef CONFIG_DEBUG_PREEMPT
> > - /*
> > - *Warn about overflow half-way through the range.
> > - */
> > - WARN_ON_ONCE((s16)p->migration_disabled < 0);
> > -#endif
> > - p->migration_disabled++;
> > - return;
> > - }
> > -
> > - guard(preempt)();
> > - this_rq()->nr_pinned++;
> > - p->migration_disabled = 1;
> > -}
> > -EXPORT_SYMBOL_GPL(migrate_disable);
> > -
> > -void migrate_enable(void)
> > +void __migrate_enable(void)
> > {
> > struct task_struct *p = current;
> > struct affinity_context ac = {
> > @@ -2404,37 +2384,10 @@ void migrate_enable(void)
> > .flags = SCA_MIGRATE_ENABLE,
> > };
> >
> > -#ifdef CONFIG_DEBUG_PREEMPT
> > - /*
> > - * Check both overflow from migrate_disable() and superfluous
> > - * migrate_enable().
> > - */
> > - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
> > - return;
> > -#endif
> > -
> > - if (p->migration_disabled > 1) {
> > - p->migration_disabled--;
> > - return;
> > - }
> > -
> > - /*
> > - * Ensure stop_task runs either before or after this, and that
> > - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
> > - */
> > - guard(preempt)();
> > if (p->cpus_ptr != &p->cpus_mask)
> > __set_cpus_allowed_ptr(p, &ac);
> > - /*
> > - * Mustn't clear migration_disabled() until cpus_ptr points back at the
> > - * regular cpus_mask, otherwise things that race (eg.
> > - * select_fallback_rq) get confused.
> > - */
> > - barrier();
> > - p->migration_disabled = 0;
> > - this_rq()->nr_pinned--;
> > }
> > -EXPORT_SYMBOL_GPL(migrate_enable);
> > +EXPORT_SYMBOL_GPL(__migrate_enable);
> >
> > static inline bool rq_has_pinned_tasks(struct rq *rq)
> > {
> > diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c
> > new file mode 100644
> > index 000000000000..a23747bbe25b
> > --- /dev/null
> > +++ b/kernel/sched/rq-offsets.c
> > @@ -0,0 +1,12 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#define COMPILE_OFFSETS
> > +#include <linux/kbuild.h>
> > +#include <linux/types.h>
> > +#include "sched.h"
> > +
> > +int main(void)
> > +{
> > + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
>
> This part looks nice and sweet. Not sure what you were concerned about.
The usage of arch_raw_cpu_ptr() looks ugly and there is
no existing usage like it, which made me concerned.
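For reference, the filechk offsets rule in the quoted Kbuild hunk
expands each DEFINE() into a plain #define in
include/generated/rq-offsets.h, roughly as below; the numeric value is
purely illustrative (it depends on the configuration), and it is this
constant that the (void *)this_rq_ptr() + RQ_nr_pinned arithmetic
relies on:

#ifndef __RQ_OFFSETS_H__
#define __RQ_OFFSETS_H__
/*
 * DO NOT MODIFY.
 *
 * This file was generated by Kbuild
 */

#define RQ_nr_pinned 2072 /* offsetof(struct rq, nr_pinned) */

#endif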
>
> Respin it as a proper patch targeting tip tree.
>
> And explain the motivation in commit log with detailed
> 'perf report' before/after along with 111M/s to 121M/s speed up,
>
> I suspect with my other __set_cpus_allowed_ptr() suggestion
> the speed up should be even bigger.
It should be. I'll respin this patch and send it to the tip tree.
Thanks!
Menglong Dong
>
> > + return 0;
> > +}
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-08-08 0:58 ` Alexei Starovoitov
2025-08-08 5:48 ` Menglong Dong
@ 2025-08-08 6:32 ` Menglong Dong
2025-08-08 15:47 ` Alexei Starovoitov
1 sibling, 1 reply; 73+ messages in thread
From: Menglong Dong @ 2025-08-08 6:32 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Fri, Aug 8, 2025 at 8:58 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
[......]
> > +{
> > + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
>
> This part looks nice and sweet. Not sure what you were concerned about.
>
> Respin it as a proper patch targeting tip tree.
>
> And explain the motivation in commit log with detailed
> 'perf report' before/after along with 111M/s to 121M/s speed up,
>
> I suspect with my other __set_cpus_allowed_ptr() suggestion
> the speed up should be even bigger.
Much better.
Before:
fentry : 113.030 ± 0.149M/s
fentry : 112.501 ± 0.187M/s
fentry : 112.828 ± 0.267M/s
fentry : 115.287 ± 0.241M/s
After:
fentry : 143.644 ± 0.670M/s
fentry : 149.764 ± 0.362M/s
fentry : 149.642 ± 0.156M/s
fentry : 145.263 ± 0.221M/s
fentry : 145.558 ± 0.145M/s
>
> > + return 0;
> > +}
^ permalink raw reply [flat|nested] 73+ messages in thread
* Re: Inlining migrate_disable/enable. Was: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline
2025-08-08 6:32 ` Menglong Dong
@ 2025-08-08 15:47 ` Alexei Starovoitov
0 siblings, 0 replies; 73+ messages in thread
From: Alexei Starovoitov @ 2025-08-08 15:47 UTC (permalink / raw)
To: Menglong Dong
Cc: Peter Zijlstra, Menglong Dong, Steven Rostedt, Jiri Olsa, bpf,
Martin KaFai Lau, Eduard Zingerman, LKML, Network Development
On Thu, Aug 7, 2025 at 11:32 PM Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> On Fri, Aug 8, 2025 at 8:58 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> [......]
> > > +{
> > > + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned));
> >
> > This part looks nice and sweet. Not sure what you were concerned about.
> >
> > Respin it as a proper patch targeting tip tree.
> >
> > And explain the motivation in commit log with detailed
> > 'perf report' before/after along with 111M/s to 121M/s speed up,
> >
> > I suspect with my other __set_cpus_allowed_ptr() suggestion
> > the speed up should be even bigger.
>
> Much better.
>
> Before:
> fentry : 113.030 ± 0.149M/s
> fentry : 112.501 ± 0.187M/s
> fentry : 112.828 ± 0.267M/s
> fentry : 115.287 ± 0.241M/s
>
> After:
> fentry : 143.644 ± 0.670M/s
> fentry : 149.764 ± 0.362M/s
> fentry : 149.642 ± 0.156M/s
> fentry : 145.263 ± 0.221M/s
> fentry : 145.558 ± 0.145M/s
Nice!
^ permalink raw reply [flat|nested] 73+ messages in thread
end of thread, other threads:[~2025-08-08 15:47 UTC | newest]
Thread overview: 73+ messages
[not found] <20250703121521.1874196-1-dongml2@chinatelecom.cn>
2025-07-03 12:15 ` [PATCH bpf-next v2 01/18] bpf: add function hash table for tracing-multi Menglong Dong
2025-07-04 16:07 ` kernel test robot
2025-07-15 1:55 ` Alexei Starovoitov
2025-07-15 2:37 ` Menglong Dong
2025-07-15 2:49 ` Alexei Starovoitov
2025-07-15 3:13 ` Menglong Dong
2025-07-15 9:06 ` Menglong Dong
2025-07-15 16:22 ` Alexei Starovoitov
2025-07-03 12:15 ` [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline Menglong Dong
2025-07-15 2:25 ` Alexei Starovoitov
2025-07-15 8:36 ` Menglong Dong
2025-07-15 9:30 ` Menglong Dong
2025-07-16 16:56 ` Inlining migrate_disable/enable. Was: " Alexei Starovoitov
2025-07-16 18:24 ` Peter Zijlstra
2025-07-16 22:35 ` Alexei Starovoitov
2025-07-16 22:49 ` Steven Rostedt
2025-07-16 22:50 ` Steven Rostedt
2025-07-28 9:20 ` Menglong Dong
2025-07-31 16:15 ` Alexei Starovoitov
2025-08-01 1:42 ` Menglong Dong
2025-08-06 8:44 ` Menglong Dong
2025-08-08 0:58 ` Alexei Starovoitov
2025-08-08 5:48 ` Menglong Dong
2025-08-08 6:32 ` Menglong Dong
2025-08-08 15:47 ` Alexei Starovoitov
2025-07-15 16:35 ` Alexei Starovoitov
2025-07-16 13:05 ` Menglong Dong
2025-07-17 0:59 ` multi-fentry proposal. Was: " Alexei Starovoitov
2025-07-17 1:50 ` Menglong Dong
2025-07-17 2:13 ` Alexei Starovoitov
2025-07-17 2:37 ` Menglong Dong
2025-07-16 14:40 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 03/18] ftrace: factor out ftrace_direct_update from register_ftrace_direct Menglong Dong
2025-07-05 2:41 ` kernel test robot
2025-07-03 12:15 ` [PATCH bpf-next v2 04/18] ftrace: add reset_ftrace_direct_ips Menglong Dong
2025-07-03 15:30 ` Steven Rostedt
2025-07-04 1:54 ` Menglong Dong
2025-07-07 18:52 ` Steven Rostedt
2025-07-08 1:26 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 05/18] bpf: introduce bpf_gtramp_link Menglong Dong
2025-07-04 7:00 ` kernel test robot
2025-07-04 7:52 ` kernel test robot
2025-07-03 12:15 ` [PATCH bpf-next v2 06/18] bpf: tracing: add support to record and check the accessed args Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-14 23:45 ` Menglong Dong
2025-07-15 17:11 ` Andrii Nakryiko
2025-07-16 12:50 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 07/18] bpf: refactor the modules_array to ptr_array Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 08/18] bpf: verifier: add btf to the function args of bpf_check_attach_target Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 09/18] bpf: verifier: move btf_id_deny to bpf_check_attach_target Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 10/18] x86,bpf: factor out arch_bpf_get_regs_nr Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 11/18] bpf: tracing: add multi-link support Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 12/18] libbpf: don't free btf if tracing_multi progs existing Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-15 1:15 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 13/18] libbpf: support tracing_multi Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-15 1:58 ` Menglong Dong
2025-07-15 17:20 ` Andrii Nakryiko
2025-07-16 12:43 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 14/18] libbpf: add btf type hash lookup support Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-15 4:40 ` Menglong Dong
2025-07-15 17:20 ` Andrii Nakryiko
2025-07-16 11:53 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 15/18] libbpf: add skip_invalid and attach_tracing for tracing_multi Menglong Dong
2025-07-14 22:07 ` Andrii Nakryiko
2025-07-15 5:48 ` Menglong Dong
2025-07-15 17:23 ` Andrii Nakryiko
2025-07-16 11:46 ` Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 16/18] selftests/bpf: move get_ksyms and get_addrs to trace_helpers.c Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 17/18] selftests/bpf: add basic testcases for tracing_multi Menglong Dong
2025-07-03 12:15 ` [PATCH bpf-next v2 18/18] selftests/bpf: add bench tests " Menglong Dong