From: Amery Hung <ameryhung@gmail.com>
To: bpf@vger.kernel.org
Cc: netdev@vger.kernel.org, alexei.starovoitov@gmail.com,
andrii@kernel.org, daniel@iogearbox.net, eddyz87@gmail.com,
memxor@gmail.com, martin.lau@kernel.org, shakeel.butt@linux.dev,
roman.gushchin@linux.dev, kuniyu@google.com,
kerneljasonxing@gmail.com, ameryhung@gmail.com,
kernel-team@meta.com
Subject: [PATCH bpf-next v2 02/15] bpf: Make struct_ops tasks_rcu grace period optional
Date: Tue, 23 Jun 2026 10:49:50 -0700 [thread overview]
Message-ID: <20260623175006.3136053-3-ameryhung@gmail.com> (raw)
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>
From: Martin KaFai Lau <martin.lau@kernel.org>
bpf_struct_ops_map_free() currently waits for both a regular RCU grace
period and a tasks RCU grace period for every struct_ops map through
synchronize_rcu_mult(call_rcu, call_rcu_tasks).
A regular RCU grace period is still required for all struct_ops maps
because the struct_ops trampoline ksyms require an RCU grace period
(see the list_del_rcu() call in __bpf_ksym_del()).
Add a map_free_pre_rcu() callback so the struct_ops map can remove its
ksyms before bpf_map_put() waits for the regular RCU grace period.
The tasks RCU grace period is only needed by tcp_congestion_ops.
Add free_after_tasks_rcu_gp only to struct bpf_struct_ops instead
of the bpf_map.
When CONFIG_TASKS_RCU=n, synchronize_rcu_tasks() is the same as
synchronize_rcu(). Since all struct_ops maps now complete a regular RCU
grace period before bpf_struct_ops_map_free() runs, skip the extra
synchronize_rcu_tasks() call in this case.
This cleanup prepares for a later patch that needs to support
free_after_mult_rcu_gp.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
include/linux/bpf.h | 7 +++++++
kernel/bpf/bpf_struct_ops.c | 31 +++++++++++++------------------
kernel/bpf/syscall.c | 3 +++
net/ipv4/bpf_tcp_ca.c | 16 ++++++++++++++++
4 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7719f6528445..7ac8873839f4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -90,6 +90,7 @@ struct bpf_map_ops {
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
+ void (*map_free_pre_rcu)(struct bpf_map *map);
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
@@ -2099,6 +2100,11 @@ struct btf_member;
* unloaded while in use.
* @name: The name of the struct bpf_struct_ops object.
* @func_models: Func models
+ * @free_after_tasks_rcu_gp: Set to true if it needs the bpf core to wait for
+ * a tasks_rcu gp before freeing the struct_ops map
+ * and its progs. It is unnecessary if the @unreg
+ * has waited for the correct rcu gp or the @unreg
+ * has ensured all struct_ops progs have finished running.
*/
struct bpf_struct_ops {
const struct bpf_verifier_ops *verifier_ops;
@@ -2117,6 +2123,7 @@ struct bpf_struct_ops {
struct module *owner;
const char *name;
struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
+ bool free_after_tasks_rcu_gp;
};
/* Every member of a struct_ops type has an instance even a member is not
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d06b3d9bcc13..c422ce41873e 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -984,9 +984,18 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free_pre_rcu(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ bpf_struct_ops_map_del_ksyms(st_map);
+}
+
static void bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ struct bpf_struct_ops *st_ops = st_map->st_ops_desc->st_ops;
+ bool tasks_rcu = st_ops->free_after_tasks_rcu_gp;
/* st_ops->owner was acquired during map_alloc to implicitly holds
* the btf's refcnt. The acquire was only done when btf_is_module()
@@ -997,24 +1006,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
bpf_struct_ops_map_dissoc_progs(st_map);
- bpf_struct_ops_map_del_ksyms(st_map);
-
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its refcount may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
- * to finish. bpf-tcp-cc prog is non sleepable.
- * A rcu_tasks gp is to wait for the last few insn
- * in the tramopline image to finish before releasing
- * the trampoline image.
- */
- synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+ if (tasks_rcu && IS_ENABLED(CONFIG_TASKS_RCU))
+ synchronize_rcu_tasks();
__bpf_struct_ops_map_free(map);
}
@@ -1123,6 +1116,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
mutex_init(&st_map->lock);
bpf_map_init_from_attr(map, attr);
+ map->free_after_rcu_gp = true;
return map;
@@ -1155,6 +1149,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
.map_free = bpf_struct_ops_map_free,
+ .map_free_pre_rcu = bpf_struct_ops_map_free_pre_rcu,
.map_get_next_key = bpf_struct_ops_map_get_next_key,
.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
.map_delete_elem = bpf_struct_ops_map_delete_elem,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6db306d23b47..b07acf37ad1d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -956,6 +956,9 @@ void bpf_map_put(struct bpf_map *map)
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map);
+ if (map->ops->map_free_pre_rcu)
+ map->ops->map_free_pre_rcu(map);
+
WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
/* RCU tasks trace grace period implies RCU grace period. */
if (READ_ONCE(map->free_after_mult_rcu_gp))
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 791e15063237..e224ecafbd69 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -339,6 +339,22 @@ static struct bpf_struct_ops bpf_tcp_congestion_ops = {
.validate = bpf_tcp_ca_validate,
.name = "tcp_congestion_ops",
.cfi_stubs = &__bpf_ops_tcp_congestion_ops,
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0, which then frees its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc progs
+ * to finish. A bpf-tcp-cc prog is non-sleepable.
+ * A rcu_tasks gp is to wait for the last few insns
+ * in the trampoline image to finish before releasing
+ * the trampoline image.
+ */
+ .free_after_tasks_rcu_gp = true,
.owner = THIS_MODULE,
};
--
2.53.0-Meta
next prev parent reply other threads:[~2026-06-23 17:50 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-23 17:49 [PATCH bpf-next v2 00/15] bpf: A common way to attach struct_ops to a cgroup Amery Hung
2026-06-23 17:49 ` [PATCH bpf-next v2 01/15] bpf: Remove __rcu tagging in st_link->map Amery Hung
2026-06-23 17:49 ` Amery Hung [this message]
2026-06-23 17:49 ` [PATCH bpf-next v2 03/15] bpf: Add bpf_struct_ops accessor helpers Amery Hung
2026-06-23 17:49 ` [PATCH bpf-next v2 04/15] bpf: Remove unnecessary prog_list_prog() check Amery Hung
2026-06-23 17:49 ` [PATCH bpf-next v2 05/15] bpf: Replace prog_list_prog() check with direct pl->prog and pl->link check Amery Hung
2026-06-23 17:49 ` [PATCH bpf-next v2 06/15] bpf: Add prog_list_init_item(), prog_list_replace_item(), and prog_list_id() Amery Hung
2026-06-23 17:49 ` [PATCH bpf-next v2 07/15] bpf: Move LSM trampoline unlink into bpf_cgroup_link_auto_detach() Amery Hung
2026-06-23 17:49 ` [PATCH bpf-next v2 08/15] bpf: Add a few bpf_cgroup_array_* helper functions Amery Hung
2026-06-23 17:49 ` [PATCH bpf-next v2 09/15] bpf: Add infrastructure to support attaching struct_ops to cgroups Amery Hung
2026-06-23 17:49 ` [PATCH bpf-next v2 10/15] bpf: Allow all struct_ops to use bpf_dynptr_from_skb() Amery Hung
2026-06-23 17:49 ` [PATCH bpf-next v2 11/15] bpf: tcp: Support selected sock_ops callbacks as struct_ops Amery Hung
2026-06-23 17:50 ` [PATCH bpf-next v2 12/15] bpf: tcp: Support parse/len/write header option hooks in bpf_tcp_ops Amery Hung
2026-06-23 17:50 ` [PATCH bpf-next v2 13/15] libbpf: Support attaching struct_ops to a cgroup Amery Hung
2026-06-23 17:50 ` [PATCH bpf-next v2 14/15] selftests/bpf: Test " Amery Hung
2026-06-23 17:50 ` [PATCH bpf-next v2 15/15] selftests/bpf: Add test for bpf_tcp_ops header option hooks Amery Hung
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260623175006.3136053-3-ameryhung@gmail.com \
--to=ameryhung@gmail.com \
--cc=alexei.starovoitov@gmail.com \
--cc=andrii@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=eddyz87@gmail.com \
--cc=kernel-team@meta.com \
--cc=kerneljasonxing@gmail.com \
--cc=kuniyu@google.com \
--cc=martin.lau@kernel.org \
--cc=memxor@gmail.com \
--cc=netdev@vger.kernel.org \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox