From: Martin KaFai Lau <martin.lau@linux.dev>
To: bpf@vger.kernel.org
Cc: 'Alexei Starovoitov ' <ast@kernel.org>,
'Andrii Nakryiko ' <andrii@kernel.org>,
'Daniel Borkmann ' <daniel@iogearbox.net>,
'Shakeel Butt ' <shakeel.butt@linux.dev>,
'Roman Gushchin ' <roman.gushchin@linux.dev>,
'Amery Hung ' <ameryhung@gmail.com>,
netdev@vger.kernel.org
Subject: [RFC PATCH bpf-next 02/12] bpf: Make struct_ops tasks_rcu grace period optional
Date: Tue, 19 May 2026 14:58:09 -0700 [thread overview]
Message-ID: <20260519215841.2984970-3-martin.lau@linux.dev> (raw)
In-Reply-To: <20260519215841.2984970-1-martin.lau@linux.dev>
From: Martin KaFai Lau <martin.lau@kernel.org>
bpf_struct_ops_map_free() currently waits for both a regular RCU grace
period and a tasks RCU grace period for every struct_ops map through
synchronize_rcu_mult(call_rcu, call_rcu_tasks).
A regular RCU grace period is still required for all struct_ops maps
because the struct_ops trampoline ksyms require an RCU grace period
(see the list_del_rcu() in __bpf_ksym_del()).
Add a map_free_pre_rcu() callback so the struct_ops map can remove its
ksyms before bpf_map_put() waits for the regular RCU grace period.
The tasks RCU grace period is only needed by tcp_congestion_ops.
Add free_after_tasks_rcu_gp only to struct bpf_struct_ops instead
of the bpf_map.
When CONFIG_TASKS_RCU=n, synchronize_rcu_tasks() is the same as
synchronize_rcu(). Since all struct_ops maps now complete a regular RCU
grace period before bpf_struct_ops_map_free() runs, skip the extra
synchronize_rcu_tasks() call in this case.
This cleanup prepares for a later patch that needs to support
free_after_mult_rcu_gp.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
include/linux/bpf.h | 7 +++++++
kernel/bpf/bpf_struct_ops.c | 31 +++++++++++++------------------
kernel/bpf/syscall.c | 3 +++
net/ipv4/bpf_tcp_ca.c | 16 ++++++++++++++++
4 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b28cacc3075..a276eada19c4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -86,6 +86,7 @@ struct bpf_map_ops {
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
+ void (*map_free_pre_rcu)(struct bpf_map *map);
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
@@ -1992,6 +1993,11 @@ struct btf_member;
* unloaded while in use.
* @name: The name of the struct bpf_struct_ops object.
* @func_models: Func models
+ * @free_after_tasks_rcu_gp: Set to true if it needs the bpf core to wait for
+ * a tasks_rcu gp before freeing the struct_ops map
+ * and its progs. It is unnecessary if the @unreg
+ * has waited for the correct rcu gp or the @unreg
+ * has ensured all struct_ops progs have finished running.
*/
struct bpf_struct_ops {
const struct bpf_verifier_ops *verifier_ops;
@@ -2010,6 +2016,7 @@ struct bpf_struct_ops {
struct module *owner;
const char *name;
struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
+ bool free_after_tasks_rcu_gp;
};
/* Every member of a struct_ops type has an instance even a member is not
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 08791180d71d..28eab24ef0ed 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -983,9 +983,18 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free_pre_rcu(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ bpf_struct_ops_map_del_ksyms(st_map);
+}
+
static void bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ struct bpf_struct_ops *st_ops = st_map->st_ops_desc->st_ops;
+ bool tasks_rcu = st_ops->free_after_tasks_rcu_gp;
/* st_ops->owner was acquired during map_alloc to implicitly holds
* the btf's refcnt. The acquire was only done when btf_is_module()
@@ -996,24 +1005,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
bpf_struct_ops_map_dissoc_progs(st_map);
- bpf_struct_ops_map_del_ksyms(st_map);
-
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its refcount may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
- * to finish. bpf-tcp-cc prog is non sleepable.
- * A rcu_tasks gp is to wait for the last few insn
- * in the tramopline image to finish before releasing
- * the trampoline image.
- */
- synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+ if (tasks_rcu && IS_ENABLED(CONFIG_TASKS_RCU))
+ synchronize_rcu_tasks();
__bpf_struct_ops_map_free(map);
}
@@ -1122,6 +1115,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
mutex_init(&st_map->lock);
bpf_map_init_from_attr(map, attr);
+ map->free_after_rcu_gp = true;
return map;
@@ -1154,6 +1148,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
.map_free = bpf_struct_ops_map_free,
+ .map_free_pre_rcu = bpf_struct_ops_map_free_pre_rcu,
.map_get_next_key = bpf_struct_ops_map_get_next_key,
.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
.map_delete_elem = bpf_struct_ops_map_delete_elem,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6600e126fbfb..d0e8e9c8c888 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -950,6 +950,9 @@ void bpf_map_put(struct bpf_map *map)
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map);
+ if (map->ops->map_free_pre_rcu)
+ map->ops->map_free_pre_rcu(map);
+
WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
/* RCU tasks trace grace period implies RCU grace period. */
if (READ_ONCE(map->free_after_mult_rcu_gp))
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 791e15063237..e224ecafbd69 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -339,6 +339,22 @@ static struct bpf_struct_ops bpf_tcp_congestion_ops = {
.validate = bpf_tcp_ca_validate,
.name = "tcp_congestion_ops",
.cfi_stubs = &__bpf_ops_tcp_congestion_ops,
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then frees its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+ * to finish. bpf-tcp-cc prog is non sleepable.
+ * A rcu_tasks gp is to wait for the last few insn
+ * in the trampoline image to finish before releasing
+ * the trampoline image.
+ */
+ .free_after_tasks_rcu_gp = true,
.owner = THIS_MODULE,
};
--
2.53.0-Meta
next prev parent reply other threads:[~2026-05-19 21:59 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 01/12] bpf: Remove __rcu tagging in st_link->map Martin KaFai Lau
2026-05-19 21:58 ` Martin KaFai Lau [this message]
2026-05-19 22:54 ` [RFC PATCH bpf-next 02/12] bpf: Make struct_ops tasks_rcu grace period optional sashiko-bot
2026-05-20 0:25 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 03/12] bpf: Add bpf_struct_ops accessor helpers Martin KaFai Lau
2026-05-19 22:25 ` sashiko-bot
2026-05-19 21:58 ` [RFC PATCH bpf-next 04/12] bpf: Remove unnecessary prog_list_prog() check Martin KaFai Lau
2026-05-19 22:49 ` sashiko-bot
2026-05-19 21:58 ` [RFC PATCH bpf-next 05/12] bpf: Replace prog_list_prog() check with direct pl->prog and pl->link check Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 06/12] bpf: Add prog_list_init_item(), prog_list_replace_item(), and prog_list_id() Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 07/12] bpf: Move LSM trampoline unlink into bpf_cgroup_link_auto_detach() Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 08/12] bpf: Add a few bpf_cgroup_array_* helper functions Martin KaFai Lau
2026-05-19 22:45 ` sashiko-bot
2026-05-19 22:50 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 09/12] bpf: Add infrastructure to support attaching struct_ops to cgroups Martin KaFai Lau
2026-05-19 22:50 ` sashiko-bot
2026-05-19 23:56 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 10/12] bpf: tcp: Support selected sock_ops callbacks as struct_ops Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 11/12] libbpf: Support attaching struct_ops to a cgroup Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 12/12] selftests/bpf: Test " Martin KaFai Lau
2026-05-19 23:03 ` sashiko-bot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260519215841.2984970-3-martin.lau@linux.dev \
--to=martin.lau@linux.dev \
--cc=ameryhung@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=netdev@vger.kernel.org \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox