From: Martin KaFai Lau <martin.lau@linux.dev>
To: bpf@vger.kernel.org
Cc: 'Alexei Starovoitov ' <ast@kernel.org>,
'Andrii Nakryiko ' <andrii@kernel.org>,
'Daniel Borkmann ' <daniel@iogearbox.net>,
'Shakeel Butt ' <shakeel.butt@linux.dev>,
'Roman Gushchin ' <roman.gushchin@linux.dev>,
'Amery Hung ' <ameryhung@gmail.com>,
netdev@vger.kernel.org
Subject: [RFC PATCH bpf-next 02/12] bpf: Make struct_ops tasks_rcu grace period optional
Date: Tue, 19 May 2026 14:58:09 -0700 [thread overview]
Message-ID: <20260519215841.2984970-3-martin.lau@linux.dev> (raw)
In-Reply-To: <20260519215841.2984970-1-martin.lau@linux.dev>
From: Martin KaFai Lau <martin.lau@kernel.org>
bpf_struct_ops_map_free() currently waits for both a regular RCU grace
period and a tasks RCU grace period for every struct_ops map through
synchronize_rcu_mult(call_rcu, call_rcu_tasks).
A regular RCU grace period is still required for all struct_ops maps
because the struct_ops trampoline ksyms require an RCU grace period
(see the list_del_rcu() in __bpf_ksym_del()).
Add a map_free_pre_rcu() callback so the struct_ops map can remove its
ksyms before bpf_map_put() waits for the regular RCU grace period.
The tasks RCU grace period is only needed by tcp_congestion_ops.
Add a free_after_tasks_rcu_gp flag to struct bpf_struct_ops, instead
of to struct bpf_map, and set it only for tcp_congestion_ops.
When CONFIG_TASKS_RCU=n, synchronize_rcu_tasks() is the same as
synchronize_rcu(). Since all struct_ops maps now complete a regular RCU
grace period before bpf_struct_ops_map_free() runs, skip the extra
synchronize_rcu_tasks() call in this case.
This cleanup prepares for a later patch that needs to support
free_after_mult_rcu_gp.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
include/linux/bpf.h | 7 +++++++
kernel/bpf/bpf_struct_ops.c | 31 +++++++++++++------------------
kernel/bpf/syscall.c | 3 +++
net/ipv4/bpf_tcp_ca.c | 16 ++++++++++++++++
4 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b28cacc3075..a276eada19c4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -86,6 +86,7 @@ struct bpf_map_ops {
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
+ void (*map_free_pre_rcu)(struct bpf_map *map);
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
@@ -1992,6 +1993,11 @@ struct btf_member;
* unloaded while in use.
* @name: The name of the struct bpf_struct_ops object.
* @func_models: Func models
+ * @free_after_tasks_rcu_gp: Set to true if it needs the bpf core to wait for
+ * a tasks_rcu gp before freeing the struct_ops map
+ * and its progs. It is unnecessary if the @unreg
+ * has waited for the correct rcu gp or the @unreg
+ * has ensured all struct_ops progs have finished running.
*/
struct bpf_struct_ops {
const struct bpf_verifier_ops *verifier_ops;
@@ -2010,6 +2016,7 @@ struct bpf_struct_ops {
struct module *owner;
const char *name;
struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
+ bool free_after_tasks_rcu_gp;
};
/* Every member of a struct_ops type has an instance even a member is not
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 08791180d71d..28eab24ef0ed 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -983,9 +983,18 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free_pre_rcu(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ bpf_struct_ops_map_del_ksyms(st_map);
+}
+
static void bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ struct bpf_struct_ops *st_ops = st_map->st_ops_desc->st_ops;
+ bool tasks_rcu = st_ops->free_after_tasks_rcu_gp;
/* st_ops->owner was acquired during map_alloc to implicitly holds
* the btf's refcnt. The acquire was only done when btf_is_module()
@@ -996,24 +1005,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
bpf_struct_ops_map_dissoc_progs(st_map);
- bpf_struct_ops_map_del_ksyms(st_map);
-
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its refcount may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
- * to finish. bpf-tcp-cc prog is non sleepable.
- * A rcu_tasks gp is to wait for the last few insn
- * in the tramopline image to finish before releasing
- * the trampoline image.
- */
- synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+ if (tasks_rcu && IS_ENABLED(CONFIG_TASKS_RCU))
+ synchronize_rcu_tasks();
__bpf_struct_ops_map_free(map);
}
@@ -1122,6 +1115,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
mutex_init(&st_map->lock);
bpf_map_init_from_attr(map, attr);
+ map->free_after_rcu_gp = true;
return map;
@@ -1154,6 +1148,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
.map_free = bpf_struct_ops_map_free,
+ .map_free_pre_rcu = bpf_struct_ops_map_free_pre_rcu,
.map_get_next_key = bpf_struct_ops_map_get_next_key,
.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
.map_delete_elem = bpf_struct_ops_map_delete_elem,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6600e126fbfb..d0e8e9c8c888 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -950,6 +950,9 @@ void bpf_map_put(struct bpf_map *map)
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map);
+ if (map->ops->map_free_pre_rcu)
+ map->ops->map_free_pre_rcu(map);
+
WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
/* RCU tasks trace grace period implies RCU grace period. */
if (READ_ONCE(map->free_after_mult_rcu_gp))
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 791e15063237..e224ecafbd69 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -339,6 +339,22 @@ static struct bpf_struct_ops bpf_tcp_congestion_ops = {
.validate = bpf_tcp_ca_validate,
.name = "tcp_congestion_ops",
.cfi_stubs = &__bpf_ops_tcp_congestion_ops,
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then frees its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+ * to finish. bpf-tcp-cc prog is non sleepable.
+ * A rcu_tasks gp is to wait for the last few insn
+ * in the trampoline image to finish before releasing
+ * the trampoline image.
+ */
+ .free_after_tasks_rcu_gp = true,
.owner = THIS_MODULE,
};
--
2.53.0-Meta
next prev parent reply other threads:[~2026-05-19 21:59 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 01/12] bpf: Remove __rcu tagging in st_link->map Martin KaFai Lau
2026-05-19 21:58 ` Martin KaFai Lau [this message]
2026-05-19 22:54 ` [RFC PATCH bpf-next 02/12] bpf: Make struct_ops tasks_rcu grace period optional sashiko-bot
2026-05-20 0:25 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 03/12] bpf: Add bpf_struct_ops accessor helpers Martin KaFai Lau
2026-05-19 22:25 ` sashiko-bot
2026-05-19 21:58 ` [RFC PATCH bpf-next 04/12] bpf: Remove unnecessary prog_list_prog() check Martin KaFai Lau
2026-05-19 22:49 ` sashiko-bot
2026-05-19 21:58 ` [RFC PATCH bpf-next 05/12] bpf: Replace prog_list_prog() check with direct pl->prog and pl->link check Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 06/12] bpf: Add prog_list_init_item(), prog_list_replace_item(), and prog_list_id() Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 07/12] bpf: Move LSM trampoline unlink into bpf_cgroup_link_auto_detach() Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 08/12] bpf: Add a few bpf_cgroup_array_* helper functions Martin KaFai Lau
2026-05-19 22:45 ` sashiko-bot
2026-05-19 22:50 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 09/12] bpf: Add infrastructure to support attaching struct_ops to cgroups Martin KaFai Lau
2026-05-19 22:50 ` sashiko-bot
2026-05-19 23:56 ` Martin KaFai Lau
2026-05-26 17:54 ` Amery Hung
2026-05-26 21:37 ` Martin KaFai Lau
2026-05-26 22:23 ` Amery Hung
2026-05-19 21:58 ` [RFC PATCH bpf-next 10/12] bpf: tcp: Support selected sock_ops callbacks as struct_ops Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 11/12] libbpf: Support attaching struct_ops to a cgroup Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 12/12] selftests/bpf: Test " Martin KaFai Lau
2026-05-19 23:03 ` sashiko-bot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260519215841.2984970-3-martin.lau@linux.dev \
--to=martin.lau@linux.dev \
--cc=ameryhung@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=netdev@vger.kernel.org \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.