* [RFC PATCH bpf-next 01/12] bpf: Remove __rcu tagging in st_link->map
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 02/12] bpf: Make struct_ops tasks_rcu grace period optional Martin KaFai Lau
` (10 subsequent siblings)
11 siblings, 0 replies; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
st_link->map is always written under update_mutex. The paths that read
st_link->map with rcu_read_lock() are not in the fast path, so they can
simply take update_mutex instead. Remove the __rcu annotation and replace
all RCU accessors with direct pointer reads under update_mutex. Use
READ_ONCE() in bpf_struct_ops_map_link_poll() which reads the pointer
without holding update_mutex.
It is a simplification change.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
kernel/bpf/bpf_struct_ops.c | 29 ++++++++++++++---------------
1 file changed, 14 insertions(+), 15 deletions(-)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 521cb9d7e8c7..08791180d71d 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -57,7 +57,7 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_link {
struct bpf_link link;
- struct bpf_map __rcu *map;
+ struct bpf_map *map;
wait_queue_head_t wait_hup;
};
@@ -1220,8 +1220,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
struct bpf_struct_ops_map *st_map;
st_link = container_of(link, struct bpf_struct_ops_link, link);
- st_map = (struct bpf_struct_ops_map *)
- rcu_dereference_protected(st_link->map, true);
+ st_map = (struct bpf_struct_ops_map *)st_link->map;
if (st_map) {
st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
bpf_map_put(&st_map->map);
@@ -1236,11 +1235,11 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
struct bpf_map *map;
st_link = container_of(link, struct bpf_struct_ops_link, link);
- rcu_read_lock();
- map = rcu_dereference(st_link->map);
+ mutex_lock(&update_mutex);
+ map = st_link->map;
if (map)
seq_printf(seq, "map_id:\t%d\n", map->id);
- rcu_read_unlock();
+ mutex_unlock(&update_mutex);
}
static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
@@ -1250,11 +1249,11 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
struct bpf_map *map;
st_link = container_of(link, struct bpf_struct_ops_link, link);
- rcu_read_lock();
- map = rcu_dereference(st_link->map);
+ mutex_lock(&update_mutex);
+ map = st_link->map;
if (map)
info->struct_ops.map_id = map->id;
- rcu_read_unlock();
+ mutex_unlock(&update_mutex);
return 0;
}
@@ -1277,7 +1276,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
mutex_lock(&update_mutex);
- old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ old_map = st_link->map;
if (!old_map) {
err = -ENOLINK;
goto err_out;
@@ -1299,7 +1298,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
goto err_out;
bpf_map_inc(new_map);
- rcu_assign_pointer(st_link->map, new_map);
+ WRITE_ONCE(st_link->map, new_map);
bpf_map_put(old_map);
err_out:
@@ -1316,7 +1315,7 @@ static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
mutex_lock(&update_mutex);
- map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ map = st_link->map;
if (!map) {
mutex_unlock(&update_mutex);
return 0;
@@ -1325,7 +1324,7 @@ static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
- RCU_INIT_POINTER(st_link->map, NULL);
+ WRITE_ONCE(st_link->map, NULL);
/* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
* bpf_map_inc() in bpf_struct_ops_map_link_update().
*/
@@ -1345,7 +1344,7 @@ static __poll_t bpf_struct_ops_map_link_poll(struct file *file,
poll_wait(file, &st_link->wait_hup, pts);
- return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP;
+ return READ_ONCE(st_link->map) ? 0 : EPOLLHUP;
}
static const struct bpf_link_ops bpf_struct_ops_map_lops = {
@@ -1401,7 +1400,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
link = NULL;
goto err_out;
}
- RCU_INIT_POINTER(link->map, map);
+ link->map = map;
mutex_unlock(&update_mutex);
return bpf_link_settle(&link_primer);
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* [RFC PATCH bpf-next 02/12] bpf: Make struct_ops tasks_rcu grace period optional
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 01/12] bpf: Remove __rcu tagging in st_link->map Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 22:54 ` sashiko-bot
2026-05-19 21:58 ` [RFC PATCH bpf-next 03/12] bpf: Add bpf_struct_ops accessor helpers Martin KaFai Lau
` (9 subsequent siblings)
11 siblings, 1 reply; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
bpf_struct_ops_map_free() currently waits for both a regular RCU grace
period and a tasks RCU grace period for every struct_ops map through
synchronize_rcu_mult(call_rcu, call_rcu_tasks).
A regular RCU grace period is still required for all struct_ops maps
because the struct_ops trampoline ksyms requires a rcu grace period
(take a look at the list_del_rcu in __bpf_ksym_del).
Add a map_free_pre_rcu() callback so the struct_ops map can remove
ksyms before bpf_map_put() wait for the regular rcu grace period.
The tasks RCU grace period is only needed by tcp_congestion_ops.
Add free_after_tasks_rcu_gp only to struct bpf_struct_ops instead
of the bpf_map.
When CONFIG_TASKS_RCU=n, synchronize_rcu_tasks() is the same as
synchronize_rcu(). Since all struct_ops maps now complete a regular RCU
grace period before bpf_struct_ops_map_free() runs, skip the extra
synchronize_rcu_tasks() call in this case.
This cleanup prepares for a later patch that needs to support
free_after_mult_rcu_gp.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
include/linux/bpf.h | 7 +++++++
kernel/bpf/bpf_struct_ops.c | 31 +++++++++++++------------------
kernel/bpf/syscall.c | 3 +++
net/ipv4/bpf_tcp_ca.c | 16 ++++++++++++++++
4 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b28cacc3075..a276eada19c4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -86,6 +86,7 @@ struct bpf_map_ops {
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_release)(struct bpf_map *map, struct file *map_file);
void (*map_free)(struct bpf_map *map);
+ void (*map_free_pre_rcu)(struct bpf_map *map);
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
@@ -1992,6 +1993,11 @@ struct btf_member;
* unloaded while in use.
* @name: The name of the struct bpf_struct_ops object.
* @func_models: Func models
+ * @free_after_tasks_rcu_gp: Set to true if it needs the bpf core to wait for
+ * a tasks_rcu gp before freeing the struct_ops map
+ * and its progs. It is unnecessary if the @unreg
+ * has waited for the correct rcu gp or the @unreg
+ * has ensured all struct_ops prog has finished running.
*/
struct bpf_struct_ops {
const struct bpf_verifier_ops *verifier_ops;
@@ -2010,6 +2016,7 @@ struct bpf_struct_ops {
struct module *owner;
const char *name;
struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
+ bool free_after_tasks_rcu_gp;
};
/* Every member of a struct_ops type has an instance even a member is not
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 08791180d71d..28eab24ef0ed 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -983,9 +983,18 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free_pre_rcu(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+ bpf_struct_ops_map_del_ksyms(st_map);
+}
+
static void bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ struct bpf_struct_ops *st_ops = st_map->st_ops_desc->st_ops;
+ bool tasks_rcu = st_ops->free_after_tasks_rcu_gp;
/* st_ops->owner was acquired during map_alloc to implicitly holds
* the btf's refcnt. The acquire was only done when btf_is_module()
@@ -996,24 +1005,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
bpf_struct_ops_map_dissoc_progs(st_map);
- bpf_struct_ops_map_del_ksyms(st_map);
-
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its refcount may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
- * to finish. bpf-tcp-cc prog is non sleepable.
- * A rcu_tasks gp is to wait for the last few insn
- * in the tramopline image to finish before releasing
- * the trampoline image.
- */
- synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+ if (tasks_rcu && IS_ENABLED(CONFIG_TASKS_RCU))
+ synchronize_rcu_tasks();
__bpf_struct_ops_map_free(map);
}
@@ -1122,6 +1115,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
mutex_init(&st_map->lock);
bpf_map_init_from_attr(map, attr);
+ map->free_after_rcu_gp = true;
return map;
@@ -1154,6 +1148,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
.map_free = bpf_struct_ops_map_free,
+ .map_free_pre_rcu = bpf_struct_ops_map_free_pre_rcu,
.map_get_next_key = bpf_struct_ops_map_get_next_key,
.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
.map_delete_elem = bpf_struct_ops_map_delete_elem,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6600e126fbfb..d0e8e9c8c888 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -950,6 +950,9 @@ void bpf_map_put(struct bpf_map *map)
/* bpf_map_free_id() must be called first */
bpf_map_free_id(map);
+ if (map->ops->map_free_pre_rcu)
+ map->ops->map_free_pre_rcu(map);
+
WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
/* RCU tasks trace grace period implies RCU grace period. */
if (READ_ONCE(map->free_after_mult_rcu_gp))
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 791e15063237..e224ecafbd69 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -339,6 +339,22 @@ static struct bpf_struct_ops bpf_tcp_congestion_ops = {
.validate = bpf_tcp_ca_validate,
.name = "tcp_congestion_ops",
.cfi_stubs = &__bpf_ops_tcp_congestion_ops,
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+ * to finish. bpf-tcp-cc prog is non sleepable.
+ * A rcu_tasks gp is to wait for the last few insn
+ * in the tramopline image to finish before releasing
+ * the trampoline image.
+ */
+ .free_after_tasks_rcu_gp = true,
.owner = THIS_MODULE,
};
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [RFC PATCH bpf-next 02/12] bpf: Make struct_ops tasks_rcu grace period optional
2026-05-19 21:58 ` [RFC PATCH bpf-next 02/12] bpf: Make struct_ops tasks_rcu grace period optional Martin KaFai Lau
@ 2026-05-19 22:54 ` sashiko-bot
2026-05-20 0:25 ` Martin KaFai Lau
0 siblings, 1 reply; 22+ messages in thread
From: sashiko-bot @ 2026-05-19 22:54 UTC (permalink / raw)
To: Martin KaFai Lau; +Cc: bpf
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [Critical] The patch incorrectly skips the Tasks RCU grace period (`synchronize_rcu_tasks()`) before freeing executable trampoline memory for all `struct_ops` except `tcp_congestion_ops`, leading to Use-After-Free of executable memory.
- [High] RCU-001 violation: `bpf_struct_ops_map_dissoc_progs()` removes the map from `prog->aux->st_ops_assoc` after the regular RCU grace period has already completed, leading to a Use-After-Free.
--
commit 85a451ad8aefbb364a1840344e08d2f34596b96c
Author: Martin KaFai Lau <martin.lau@kernel.org>
bpf: Make struct_ops tasks_rcu grace period optional
This commit makes the tasks RCU grace period optional during struct_ops map
freeing. It introduces a map_free_pre_rcu callback to remove ksyms before the
regular RCU grace period. The tasks RCU grace period is now only explicitly
requested by tcp_congestion_ops.
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index 08791180d71d9..28eab24ef0ed0 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -996,24 +1005,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
>
> bpf_struct_ops_map_dissoc_progs(st_map);
[Severity: High]
Does this violate the RCU remove-before-reclaim rule?
Because map->free_after_rcu_gp is set to true in bpf_struct_ops_map_alloc(),
the regular RCU grace period occurs before bpf_struct_ops_map_free() is called.
Leaving bpf_struct_ops_map_dissoc_progs() here means the map is removed
from prog->aux->st_ops_assoc after the grace period has already finished.
Could a concurrent execution of the BPF program acquire the RCU read lock, read
the still-valid aux->st_ops_assoc pointer, and then access the map after it is
freed?
Should bpf_struct_ops_map_dissoc_progs() be moved to the new
bpf_struct_ops_map_free_pre_rcu() callback instead?
> - /* The struct_ops's function may switch to another struct_ops.
> - *
> - * For example, bpf_tcp_cc_x->init() may switch to
> - * another tcp_cc_y by calling
> - * setsockopt(TCP_CONGESTION, "tcp_cc_y").
> - * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
> - * and its refcount may reach 0 which then free its
> - * trampoline image while tcp_cc_x is still running.
> - *
> - * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
> - * to finish. bpf-tcp-cc prog is non sleepable.
> - * A rcu_tasks gp is to wait for the last few insn
> - * in the tramopline image to finish before releasing
> - * the trampoline image.
> - */
> - synchronize_rcu_mult(call_rcu, call_rcu_tasks);
> + if (tasks_rcu && IS_ENABLED(CONFIG_TASKS_RCU))
> + synchronize_rcu_tasks();
>
> __bpf_struct_ops_map_free(map);
> }
[Severity: Critical]
Does skipping the Tasks RCU grace period for other struct_ops cause a
use-after-free of executable trampoline memory?
All struct_ops programs are executed via dynamically generated trampolines. When
the program returns, the RCU read lock is dropped and preemption is re-enabled.
If a task is preempted while still executing the trampoline epilogue, the
regular RCU grace period will not wait for it.
Without the Tasks RCU wait, could the executable trampoline image be freed
prematurely, causing preempted tasks to execute freed memory when they resume?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260519215841.2984970-1-martin.lau@linux.dev?part=2
^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [RFC PATCH bpf-next 02/12] bpf: Make struct_ops tasks_rcu grace period optional
2026-05-19 22:54 ` sashiko-bot
@ 2026-05-20 0:25 ` Martin KaFai Lau
0 siblings, 0 replies; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-20 0:25 UTC (permalink / raw)
To: sashiko-reviews; +Cc: bpf, Amery Hung
On Tue, May 19, 2026 at 10:54:20PM +0000, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
> - [Critical] The patch incorrectly skips the Tasks RCU grace period (`synchronize_rcu_tasks()`) before freeing executable trampoline memory for all `struct_ops` except `tcp_congestion_ops`, leading to Use-After-Free of executable memory.
> - [High] RCU-001 violation: `bpf_struct_ops_map_dissoc_progs()` removes the map from `prog->aux->st_ops_assoc` after the regular RCU grace period has already completed, leading to a Use-After-Free.
> --
>
> commit 85a451ad8aefbb364a1840344e08d2f34596b96c
> Author: Martin KaFai Lau <martin.lau@kernel.org>
>
> bpf: Make struct_ops tasks_rcu grace period optional
>
> This commit makes the tasks RCU grace period optional during struct_ops map
> freeing. It introduces a map_free_pre_rcu callback to remove ksyms before the
> regular RCU grace period. The tasks RCU grace period is now only explicitly
> requested by tcp_congestion_ops.
>
> > diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> > index 08791180d71d9..28eab24ef0ed0 100644
> > --- a/kernel/bpf/bpf_struct_ops.c
> > +++ b/kernel/bpf/bpf_struct_ops.c
> > @@ -996,24 +1005,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
> >
> > bpf_struct_ops_map_dissoc_progs(st_map);
>
> [Severity: High]
> Does this violate the RCU remove-before-reclaim rule?
>
> Because map->free_after_rcu_gp is set to true in bpf_struct_ops_map_alloc(),
> the regular RCU grace period occurs before bpf_struct_ops_map_free() is called.
> Leaving bpf_struct_ops_map_dissoc_progs() here means the map is removed
> from prog->aux->st_ops_assoc after the grace period has already finished.
After the grace period finished, all struct_ops's bpf prog has
finished running
>
> Could a concurrent execution of the BPF program acquire the RCU read lock, read
> the still-valid aux->st_ops_assoc pointer, and then access the map after it is
> freed?
so this cannot happen.
afaik, for non struct_ops's prog, the st_ops_assoc has a refcnt.
[I could be wrong here. CC Amery.]
Overall, moving bpf_struct_ops_map_dissoc_progs to map_free_pre_rcu
could be a simpler read but don't think it is needed.
>
> Should bpf_struct_ops_map_dissoc_progs() be moved to the new
> bpf_struct_ops_map_free_pre_rcu() callback instead?
>
> > - /* The struct_ops's function may switch to another struct_ops.
> > - *
> > - * For example, bpf_tcp_cc_x->init() may switch to
> > - * another tcp_cc_y by calling
> > - * setsockopt(TCP_CONGESTION, "tcp_cc_y").
> > - * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
> > - * and its refcount may reach 0 which then free its
> > - * trampoline image while tcp_cc_x is still running.
> > - *
> > - * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
> > - * to finish. bpf-tcp-cc prog is non sleepable.
> > - * A rcu_tasks gp is to wait for the last few insn
> > - * in the tramopline image to finish before releasing
> > - * the trampoline image.
> > - */
> > - synchronize_rcu_mult(call_rcu, call_rcu_tasks);
> > + if (tasks_rcu && IS_ENABLED(CONFIG_TASKS_RCU))
> > + synchronize_rcu_tasks();
> >
> > __bpf_struct_ops_map_free(map);
> > }
>
> [Severity: Critical]
> Does skipping the Tasks RCU grace period for other struct_ops cause a
> use-after-free of executable trampoline memory?
I don't think so. Other subsystems using struct_ops wait for
the bpf prog finished running in unreg().
Note that in general this should only matter to the bpf-tcp-cc's struct_ops prog
that does a bpf_setsockopt(TCP_CONGESTION) which triggers a bpf_map_put() on
its own running struct_ops's map refcnt.
>
> All struct_ops programs are executed via dynamically generated trampolines. When
> the program returns, the RCU read lock is dropped and preemption is re-enabled.
> If a task is preempted while still executing the trampoline epilogue, the
> regular RCU grace period will not wait for it.
>
> Without the Tasks RCU wait, could the executable trampoline image be freed
> prematurely, causing preempted tasks to execute freed memory when they resume?
>
> --
> Sashiko AI review · https://sashiko.dev/#/patchset/20260519215841.2984970-1-martin.lau@linux.dev?part=2
^ permalink raw reply [flat|nested] 22+ messages in thread
* [RFC PATCH bpf-next 03/12] bpf: Add bpf_struct_ops accessor helpers
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 01/12] bpf: Remove __rcu tagging in st_link->map Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 02/12] bpf: Make struct_ops tasks_rcu grace period optional Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 22:25 ` sashiko-bot
2026-05-19 21:58 ` [RFC PATCH bpf-next 04/12] bpf: Remove unnecessary prog_list_prog() check Martin KaFai Lau
` (8 subsequent siblings)
11 siblings, 1 reply; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
Add the helper functions bpf_struct_ops_map_kdata(),
bpf_struct_ops_kdata_map_id(), and bpf_struct_ops_map_cfi_stubs()
in bpf_struct_ops.c. They will be called from cgroup.c in the upcoming
patch to create a struct_ops to cgroup attachment link.
bpf_struct_ops_valid_to_reg() is also exposed for the upcoming caller
in cgroup.c.
The link update validation is also refactored into a new function
bpf_struct_ops_link_update_check() such that it can be reused by the caller
in cgroup.c in the upcoming patch.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
include/linux/bpf.h | 28 +++++++++++++++++
kernel/bpf/bpf_struct_ops.c | 63 ++++++++++++++++++++++++++++---------
2 files changed, 77 insertions(+), 14 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a276eada19c4..dbf98741f15b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2136,6 +2136,12 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map);
void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog);
void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux);
u32 bpf_struct_ops_id(const void *kdata);
+void *bpf_struct_ops_map_kdata(struct bpf_map *map);
+u32 bpf_struct_ops_kdata_map_id(void *kdata);
+void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map);
+bool bpf_struct_ops_valid_to_reg(struct bpf_map *map);
+int bpf_struct_ops_link_update_check(struct bpf_map *new_map, struct bpf_map *old_map,
+ struct bpf_map *expected_old_map);
#ifdef CONFIG_NET
/* Define it here to avoid the use of forward declaration */
@@ -2200,6 +2206,28 @@ static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struc
static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
{
}
+static inline void *bpf_struct_ops_map_kdata(struct bpf_map *map)
+{
+ return NULL;
+}
+static inline u32 bpf_struct_ops_kdata_map_id(void *kdata)
+{
+ return 0;
+}
+static inline void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map)
+{
+ return NULL;
+}
+static inline bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+{
+ return false;
+}
+static inline int bpf_struct_ops_link_update_check(struct bpf_map *new_map,
+ struct bpf_map *old_map,
+ struct bpf_map *expected_old_map)
+{
+ return -EOPNOTSUPP;
+}
#endif
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 28eab24ef0ed..8650a3b88bf6 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1199,7 +1199,33 @@ u32 bpf_struct_ops_id(const void *kdata)
}
EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
-static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+void *bpf_struct_ops_map_kdata(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = container_of(map, struct bpf_struct_ops_map, map);
+ return st_map->kvalue.data;
+}
+
+u32 bpf_struct_ops_kdata_map_id(void *kdata)
+{
+ struct bpf_struct_ops_value *kvalue =
+ container_of(kdata, struct bpf_struct_ops_value, data);
+ struct bpf_struct_ops_map *st_map =
+ container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ return st_map->map.id;
+}
+
+void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = container_of(map, struct bpf_struct_ops_map, map);
+ return st_map->st_ops_desc->st_ops->cfi_stubs;
+}
+
+bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
@@ -1252,6 +1278,26 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
return 0;
}
+int bpf_struct_ops_link_update_check(struct bpf_map *new_map,
+ struct bpf_map *old_map,
+ struct bpf_map *expected_old_map)
+{
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+
+ if (!old_map)
+ return -ENOLINK;
+ if (expected_old_map && old_map != expected_old_map)
+ return -EPERM;
+
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ /* The new and old struct_ops must be the same type. */
+ if (st_map->st_ops_desc != old_st_map->st_ops_desc)
+ return -EINVAL;
+
+ return 0;
+}
+
static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
struct bpf_map *expected_old_map)
{
@@ -1270,23 +1316,12 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
return -EOPNOTSUPP;
mutex_lock(&update_mutex);
-
old_map = st_link->map;
- if (!old_map) {
- err = -ENOLINK;
- goto err_out;
- }
- if (expected_old_map && old_map != expected_old_map) {
- err = -EPERM;
+ err = bpf_struct_ops_link_update_check(new_map, old_map, expected_old_map);
+ if (err)
goto err_out;
- }
old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
- /* The new and old struct_ops must be the same type. */
- if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
- err = -EINVAL;
- goto err_out;
- }
err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);
if (err)
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [RFC PATCH bpf-next 03/12] bpf: Add bpf_struct_ops accessor helpers
2026-05-19 21:58 ` [RFC PATCH bpf-next 03/12] bpf: Add bpf_struct_ops accessor helpers Martin KaFai Lau
@ 2026-05-19 22:25 ` sashiko-bot
0 siblings, 0 replies; 22+ messages in thread
From: sashiko-bot @ 2026-05-19 22:25 UTC (permalink / raw)
To: Martin KaFai Lau; +Cc: bpf
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Low] The newly added bpf_struct_ops_kdata_map_id() function is an exact, redundant duplicate of the existing bpf_struct_ops_id() function.
--
commit 4987adf60416fed4393443ee95dc6549cf2cfba0
Author: Martin KaFai Lau <martin.lau@kernel.org>
bpf: Add bpf_struct_ops accessor helpers
This commit adds several new helper functions, including
bpf_struct_ops_kdata_map_id(), to access bpf_struct_ops fields. It also
refactors the link update validation to allow reuse in the upcoming
cgroup attachment changes.
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index 28eab24ef0ed0..8650a3b88bf6a 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -1199,7 +1199,33 @@ u32 bpf_struct_ops_id(const void *kdata)
> }
> EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
>
[ ... ]
> +u32 bpf_struct_ops_kdata_map_id(void *kdata)
> +{
> + struct bpf_struct_ops_value *kvalue =
> + container_of(kdata, struct bpf_struct_ops_value, data);
> + struct bpf_struct_ops_map *st_map =
> + container_of(kvalue, struct bpf_struct_ops_map, kvalue);
> +
> + return st_map->map.id;
> +}
[Severity: Low]
Is there a specific reason this new function is needed when the existing
bpf_struct_ops_id() already provides this exact functionality?
Looking at the code just above this addition, bpf_struct_ops_id()
performs the identical mapping and returns the identical field:
u32 bpf_struct_ops_id(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
struct bpf_struct_ops_map *st_map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
return st_map->map.id;
}
Since C allows implicit conversion from void * to const void *, could the
upcoming caller just reuse bpf_struct_ops_id() directly? If a stub is
needed for when CONFIG_NET is disabled, perhaps a static inline fallback
for bpf_struct_ops_id() could be added to include/linux/bpf.h instead of
duplicating the implementation.
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260519215841.2984970-1-martin.lau@linux.dev?part=3
^ permalink raw reply [flat|nested] 22+ messages in thread
* [RFC PATCH bpf-next 04/12] bpf: Remove unnecessary prog_list_prog() check
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
` (2 preceding siblings ...)
2026-05-19 21:58 ` [RFC PATCH bpf-next 03/12] bpf: Add bpf_struct_ops accessor helpers Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 22:49 ` sashiko-bot
2026-05-19 21:58 ` [RFC PATCH bpf-next 05/12] bpf: Replace prog_list_prog() check with direct pl->prog and pl->link check Martin KaFai Lau
` (7 subsequent siblings)
11 siblings, 1 reply; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
Both replace_effective_prog() and purge_effective_progs() test
the "!prog_list_prog(pl)" to skip a 'detaching' pl.
When detaching a pl, pl->prog and pl->link are set to NULL in case
the update_effective_progs() failed.
However, replace_effective_prog() is not detaching a pl,
so the case "!prog_list_prog()" will not happen.
In purge_effective_prog(), the pl->prog and pl->link are restored
before calling purge_effective_progs(), so the case "!prog_list_prog()"
will not happen either.
This patch removes them as a prep work for the upcoming work
in attaching struct_ops to cgroup. When attaching a struct_ops
to cgroup, there is a link->map case and the prog_list_prog()
will not consider the link->map. The replace_effective_prog()
and purge_effective_progs() will then incorrectly skip a pl
with struct_ops map attached to it.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
kernel/bpf/cgroup.c | 4 ----
1 file changed, 4 deletions(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 876f6a81a9b6..71591f6dc458 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -946,8 +946,6 @@ static void replace_effective_prog(struct cgroup *cgrp,
head = &cg->bpf.progs[atype];
hlist_for_each_entry(pl, head, node) {
- if (!prog_list_prog(pl))
- continue;
if (pl->link == link)
goto found;
pos++;
@@ -1099,8 +1097,6 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
head = &cg->bpf.progs[atype];
hlist_for_each_entry(pl, head, node) {
- if (!prog_list_prog(pl))
- continue;
if (pl->prog == prog && pl->link == link)
goto found;
pos++;
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [RFC PATCH bpf-next 04/12] bpf: Remove unnecessary prog_list_prog() check
2026-05-19 21:58 ` [RFC PATCH bpf-next 04/12] bpf: Remove unnecessary prog_list_prog() check Martin KaFai Lau
@ 2026-05-19 22:49 ` sashiko-bot
0 siblings, 0 replies; 22+ messages in thread
From: sashiko-bot @ 2026-05-19 22:49 UTC (permalink / raw)
To: Martin KaFai Lau; +Cc: bpf
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
Pre-existing issues:
- [Critical] replace_effective_prog() incorrectly uses absolute indexing into the bpf_prog_array, failing to account for dummy programs left by failed detach operations.
--
commit 77c3b439749a112ce3763d2ced5be03b93639458
Author: Martin KaFai Lau <martin.lau@kernel.org>
bpf: Remove unnecessary prog_list_prog() check
This patch removes the !prog_list_prog(pl) checks in replace_effective_prog()
and purge_effective_progs(). It cleans up conditions that are not expected
to happen, serving as preparation for attaching struct_ops to cgroups.
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 876f6a81a9b61..71591f6dc4581 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -946,8 +946,6 @@ static void replace_effective_prog(struct cgroup *cgrp,
>
> head = &cg->bpf.progs[atype];
> hlist_for_each_entry(pl, head, node) {
> - if (!prog_list_prog(pl))
> - continue;
> if (pl->link == link)
> goto found;
> pos++;
[Severity: Critical]
This is a pre-existing issue, but does using pos as an absolute index here
cause problems if a prior detach operation failed?
When __cgroup_bpf_detach() fails to allocate a new array via
update_effective_progs(), it falls back to purge_effective_progs().
This fallback uses bpf_prog_array_delete_safe_at() to replace the detached
program with a dummy program in the active array, and then removes the
detached program's node from the cgroup's hlist.
Later in replace_effective_prog(), we iterate the hlist to calculate pos.
Because the detached program is no longer in the hlist, pos represents the
logical index of the program (skipping the dummy program).
However, looking at what happens immediately after the found label:
found:
BUG_ON(!cg);
progs = rcu_dereference_protected(
desc->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
item = &progs->items[pos];
WRITE_ONCE(item->prog, link->link.prog);
The code uses pos as an absolute index into the array (progs->items[pos]).
Because the array still contains the dummy program, the logical index
calculated from the hlist no longer matches the absolute index in the array.
Could this cause the wrong item to be overwritten, leaving the original
program pointer in the active effective array? Since the caller (like
__cgroup_bpf_replace()) drops the reference count of the replaced program,
would this directly lead to a use-after-free when the cgroup BPF hooks are
subsequently executed?
[ ... ]
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260519215841.2984970-1-martin.lau@linux.dev?part=4
^ permalink raw reply [flat|nested] 22+ messages in thread
* [RFC PATCH bpf-next 05/12] bpf: Replace prog_list_prog() check with direct pl->prog and pl->link check
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
` (3 preceding siblings ...)
2026-05-19 21:58 ` [RFC PATCH bpf-next 04/12] bpf: Remove unnecessary prog_list_prog() check Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 06/12] bpf: Add prog_list_init_item(), prog_list_replace_item(), and prog_list_id() Martin KaFai Lau
` (6 subsequent siblings)
11 siblings, 0 replies; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
prog_list_length() and compute_effective_progs() use !prog_list_prog(pl)
to skip a 'detaching' pl.
When pl->link is not NULL, prog_list_prog(pl) returns
the pl->link->link.prog. This does not work for the upcoming struct_ops
patch where pl->link is not NULL but pl->link->link.prog is NULL,
because a struct_ops map is attached to the cgroup instead of a BPF prog.
To prepare for the upcoming struct_ops patch, this patch
replaces the prog_list_prog() test with the
"!pl->prog && !pl->link". In __cgroup_bpf_detach(),
both pl->prog and pl->link are set to NULL, so testing
"!pl->prog && !pl->link" is the same test to tell
if a pl is being detached. This change should be a no-op.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
kernel/bpf/cgroup.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 71591f6dc458..abfd6df349d4 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -387,7 +387,7 @@ static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
u32 cnt = 0;
hlist_for_each_entry(pl, head, node) {
- if (!prog_list_prog(pl))
+ if (!pl->prog && !pl->link)
continue;
if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
(*preorder_cnt)++;
@@ -461,7 +461,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
init_bstart = bstart;
hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
- if (!prog_list_prog(pl))
+ if (!pl->prog && !pl->link)
continue;
if (pl->flags & BPF_F_PREORDER) {
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* [RFC PATCH bpf-next 06/12] bpf: Add prog_list_init_item(), prog_list_replace_item(), and prog_list_id()
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
` (4 preceding siblings ...)
2026-05-19 21:58 ` [RFC PATCH bpf-next 05/12] bpf: Replace prog_list_prog() check with direct pl->prog and pl->link check Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 07/12] bpf: Move LSM trampoline unlink into bpf_cgroup_link_auto_detach() Martin KaFai Lau
` (5 subsequent siblings)
11 siblings, 0 replies; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
Add three helpers to abstract operations on a bpf_prog_list entry.
Right now, bpf_prog_array_item is initialized from prog_list_prog(pl),
which returns either pl->prog or pl->link->link.prog. This will not work
when struct_ops is attached to a cgroup because the attachment is backed
by a struct_ops map instead of a BPF prog.
The same applies to __cgroup_bpf_query(). Instead of always copying a
prog id to userspace, struct_ops cgroup attachment will need to copy the
struct_ops map id.
Refactor bpf_prog_array_item initialization into prog_list_init_item()
and prog_list_replace_item(), and refactor id lookup into prog_list_id().
These helpers will be extended to support pl->link->map in a later patch.
This is a no-op change.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
kernel/bpf/cgroup.c | 26 +++++++++++++++++++-------
1 file changed, 19 insertions(+), 7 deletions(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index abfd6df349d4..720eaa3285a2 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -378,6 +378,22 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
return NULL;
}
+static void prog_list_init_item(struct bpf_prog_list *pl, struct bpf_prog_array_item *item)
+{
+ item->prog = prog_list_prog(pl);
+ bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage);
+}
+
+static void prog_list_replace_item(struct bpf_prog_list *pl, struct bpf_prog_array_item *item)
+{
+ WRITE_ONCE(item->prog, pl->link->link.prog);
+}
+
+static u32 prog_list_id(struct bpf_prog_list *pl)
+{
+ return prog_list_prog(pl)->aux->id;
+}
+
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
@@ -471,9 +487,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
item = &progs->items[fstart];
fstart++;
}
- item->prog = prog_list_prog(pl);
- bpf_cgroup_storages_assign(item->cgroup_storage,
- pl->storage);
+ prog_list_init_item(pl, item);
cnt++;
}
@@ -957,7 +971,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
desc->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
item = &progs->items[pos];
- WRITE_ONCE(item->prog, link->link.prog);
+ prog_list_replace_item(pl, item);
}
}
@@ -1275,15 +1289,13 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
} else {
struct hlist_head *progs;
struct bpf_prog_list *pl;
- struct bpf_prog *prog;
u32 id;
progs = &cgrp->bpf.progs[atype];
cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
i = 0;
hlist_for_each_entry(pl, progs, node) {
- prog = prog_list_prog(pl);
- id = prog->aux->id;
+ id = prog_list_id(pl);
if (copy_to_user(prog_ids + i, &id, sizeof(id)))
return -EFAULT;
if (++i == cnt)
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* [RFC PATCH bpf-next 07/12] bpf: Move LSM trampoline unlink into bpf_cgroup_link_auto_detach()
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
` (5 preceding siblings ...)
2026-05-19 21:58 ` [RFC PATCH bpf-next 06/12] bpf: Add prog_list_init_item(), prog_list_replace_item(), and prog_list_id() Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 08/12] bpf: Add a few bpf_cgroup_array_* helper functions Martin KaFai Lau
` (4 subsequent siblings)
11 siblings, 0 replies; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
Move the LSM trampoline unlink into bpf_cgroup_link_auto_detach().
The purpose is to consolidate the auto_detach cleanup logic.
It prepares for the upcoming struct_ops cgroup attachment patch where
bpf_cgroup_link_auto_detach() will need to handle the struct_ops case
(link->map != NULL).
This is a no-op change.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
kernel/bpf/cgroup.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 720eaa3285a2..b5769f5401e6 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -292,6 +292,8 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
*/
static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
{
+ if (link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(link->link.prog);
cgroup_put(link->cgroup);
link->cgroup = NULL;
}
@@ -325,11 +327,8 @@ static void cgroup_bpf_release(struct work_struct *work)
bpf_trampoline_unlink_cgroup_shim(pl->prog);
bpf_prog_put(pl->prog);
}
- if (pl->link) {
- if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
- bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
+ if (pl->link)
bpf_cgroup_link_auto_detach(pl->link);
- }
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
}
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* [RFC PATCH bpf-next 08/12] bpf: Add a few bpf_cgroup_array_* helper functions
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
` (6 preceding siblings ...)
2026-05-19 21:58 ` [RFC PATCH bpf-next 07/12] bpf: Move LSM trampoline unlink into bpf_cgroup_link_auto_detach() Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 22:45 ` sashiko-bot
2026-05-19 21:58 ` [RFC PATCH bpf-next 09/12] bpf: Add infrastructure to support attaching struct_ops to cgroups Martin KaFai Lau
` (3 subsequent siblings)
11 siblings, 1 reply; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
In the upcoming patch, the array can store a struct_ops map.
The array could have a cfi_stubs acting as a dummy instead of
the dummy_bpf_prog. The array logic will need to skip the cfi_stubs
also in order to support storing struct_ops map in the array.
bpf_cgroup_array_length(), bpf_cgroup_array_copy_to_user(), and
bpf_cgroup_array_delete_safe_at() are added as a preparation work
to allow skipping the cfi_stubs in the upcoming patch. This patch
only skips the dummy_bpf_prog which is the same as the existing behavior.
The current bpf_prog_array_*() callers are changed to call the new
bpf_cgroup_array_*(). This is a no-op change.
Unlike bpf_prog_array_copy_to_user(), bpf_cgroup_array_copy_to_user()
does not need a temporary buffer. The cgroup caller already holds
cgroup_mutex and dereferences the effective array with
rcu_dereference_protected(), so it does not copy to userspace
from an RCU read-side critical section. Details in commit 0911287ce32b.
Another addition is the bpf_cgroup_array_free(). This prepares
the array to have a different rcu gp for the struct_ops use case,
for example, a struct_ops could have mix of sleepable ops and
non-sleepable ops. In this patch, bpf_cgroup_array_free() only
goes through the regular rcu gp. This is a no-op change also.
bpf_prog_dummy() is also added to return the global dummy_bpf_prog.
bpf_cgroup_array_dummy() is added to decide the sentinel based on atype.
It now always returns bpf_prog_dummy(). In the upcoming patch,
it can return a cfi_stubs if the atype belongs to a struct_ops.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
include/linux/bpf.h | 1 +
kernel/bpf/cgroup.c | 79 +++++++++++++++++++++++++++++++++++++++------
kernel/bpf/core.c | 5 +++
3 files changed, 76 insertions(+), 9 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index dbf98741f15b..26d641300f30 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2443,6 +2443,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *include_prog,
u64 bpf_cookie,
struct bpf_prog_array **new_array);
+struct bpf_prog *bpf_prog_dummy(void);
struct bpf_run_ctx {};
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b5769f5401e6..a033aa479ab6 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -298,6 +298,67 @@ static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
link->cgroup = NULL;
}
+static void bpf_cgroup_array_free(struct bpf_prog_array *array)
+{
+ if (!array || array == &bpf_empty_prog_array)
+ return;
+ kfree_rcu(array, rcu);
+}
+
+static void *bpf_cgroup_array_dummy(enum cgroup_bpf_attach_type atype)
+{
+ return bpf_prog_dummy();
+}
+
+static int bpf_cgroup_array_length(struct bpf_prog_array *array,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct bpf_prog_array_item *item;
+ int cnt = 0;
+
+ for (item = array->items; item->prog; item++)
+ if (item->prog != bpf_cgroup_array_dummy(atype))
+ cnt++;
+
+ return cnt;
+}
+
+static int bpf_cgroup_array_copy_to_user(struct bpf_prog_array *array,
+ __u32 __user *prog_ids, int cnt,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct bpf_prog_array_item *item;
+ int i = 0;
+ u32 id;
+
+ for (item = array->items; item->prog && i < cnt; item++) {
+ if (item->prog == bpf_cgroup_array_dummy(atype))
+ continue;
+ id = item->prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ i++;
+ }
+ return item->prog ? -ENOSPC : 0;
+}
+
+static int bpf_cgroup_array_delete_safe_at(struct bpf_prog_array *array,
+ int index, enum cgroup_bpf_attach_type atype)
+{
+ struct bpf_prog_array_item *item;
+
+ for (item = array->items; item->prog; item++) {
+ if (item->prog == bpf_cgroup_array_dummy(atype))
+ continue;
+ if (!index) {
+ WRITE_ONCE(item->prog, bpf_cgroup_array_dummy(atype));
+ return 0;
+ }
+ index--;
+ }
+ return -ENOENT;
+}
+
/**
* cgroup_bpf_release() - put references of all bpf programs and
* release all cgroup bpf data
@@ -335,7 +396,7 @@ static void cgroup_bpf_release(struct work_struct *work)
old_array = rcu_dereference_protected(
cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
- bpf_prog_array_free(old_array);
+ bpf_cgroup_array_free(old_array);
}
list_for_each_entry_safe(storage, stmp, storages, list_cg) {
@@ -509,7 +570,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
*/
- bpf_prog_array_free(old_array);
+ bpf_cgroup_array_free(old_array);
}
/**
@@ -549,7 +610,7 @@ static int cgroup_bpf_inherit(struct cgroup *cgrp)
return 0;
cleanup:
for (i = 0; i < NR; i++)
- bpf_prog_array_free(arrays[i]);
+ bpf_cgroup_array_free(arrays[i]);
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
cgroup_bpf_put(p);
@@ -604,7 +665,7 @@ static int update_effective_progs(struct cgroup *cgrp,
if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
if (unlikely(desc->bpf.inactive)) {
- bpf_prog_array_free(desc->bpf.inactive);
+ bpf_cgroup_array_free(desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
continue;
@@ -623,7 +684,7 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
- bpf_prog_array_free(desc->bpf.inactive);
+ bpf_cgroup_array_free(desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -1124,7 +1185,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
lockdep_is_held(&cgroup_mutex));
/* Remove the program from the array */
- WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
+ WARN_ONCE(bpf_cgroup_array_delete_safe_at(progs, pos, atype),
"Failed to purge a prog from array at index %d", pos);
}
}
@@ -1254,7 +1315,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
if (effective_query) {
effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
- total_cnt += bpf_prog_array_length(effective);
+ total_cnt += bpf_cgroup_array_length(effective, atype);
} else {
total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
}
@@ -1283,8 +1344,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
if (effective_query) {
effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
- cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
- ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+ cnt = min_t(int, bpf_cgroup_array_length(effective, atype), total_cnt);
+ ret = bpf_cgroup_array_copy_to_user(effective, prog_ids, cnt, atype);
} else {
struct hlist_head *progs;
struct bpf_prog_list *pl;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index cdbe9fdf474f..7afca37fa315 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2729,6 +2729,11 @@ void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
}
+struct bpf_prog *bpf_prog_dummy(void)
+{
+ return &dummy_bpf_prog.prog;
+}
+
int bpf_prog_array_length(struct bpf_prog_array *array)
{
struct bpf_prog_array_item *item;
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [RFC PATCH bpf-next 08/12] bpf: Add a few bpf_cgroup_array_* helper functions
2026-05-19 21:58 ` [RFC PATCH bpf-next 08/12] bpf: Add a few bpf_cgroup_array_* helper functions Martin KaFai Lau
@ 2026-05-19 22:45 ` sashiko-bot
2026-05-19 22:50 ` Martin KaFai Lau
0 siblings, 1 reply; 22+ messages in thread
From: sashiko-bot @ 2026-05-19 22:45 UTC (permalink / raw)
To: Martin KaFai Lau; +Cc: bpf
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
New issues:
- [High] bpf_cgroup_array_copy_to_user() incorrectly returns -ENOSPC when the requested count is zero.
Pre-existing issues:
- [Critical] purge_effective_progs() calculates linear indices that do not account for BPF_F_PREORDER physical reordering, leading to Use-After-Free.
--
commit 23e774d89cf8101cef3b2e8789a63f7fb930be02
Author: Martin KaFai Lau <martin.lau@kernel.org>
bpf: Add a few bpf_cgroup_array_* helper functions
This patch adds bpf_cgroup_array_*() helper functions to prepare for
supporting struct_ops maps in the cgroup bpf array. These helpers currently
skip the dummy_bpf_prog, which is the same as the existing behavior.
The callers are changed to use these new helpers as a no-op change.
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index b5769f5401e64..a033aa479ab6e 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
[ ... ]
> +static int bpf_cgroup_array_copy_to_user(struct bpf_prog_array *array,
> + __u32 __user *prog_ids, int cnt,
> + enum cgroup_bpf_attach_type atype)
> +{
> + struct bpf_prog_array_item *item;
> + int i = 0;
> + u32 id;
> +
> + for (item = array->items; item->prog && i < cnt; item++) {
> + if (item->prog == bpf_cgroup_array_dummy(atype))
> + continue;
> + id = item->prog->aux->id;
> + if (copy_to_user(prog_ids + i, &id, sizeof(id)))
> + return -EFAULT;
> + i++;
> + }
> + return item->prog ? -ENOSPC : 0;
> +}
[Severity: High]
If cnt is 0 (which happens if the attach type only has dummy programs,
causing bpf_cgroup_array_length() to return 0), the loop condition i < cnt
will fail immediately. At this point, item->prog still points to a dummy
program (which is not NULL).
Does this incorrectly return -ENOSPC instead of 0?
If this occurs during __cgroup_bpf_query(), it looks like the entire
syscall could fail with -ENOSPC even when the user provided a large enough
buffer.
[ ... ]
> @@ -1124,7 +1185,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
> lockdep_is_held(&cgroup_mutex));
>
> /* Remove the program from the array */
> - WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
> + WARN_ONCE(bpf_cgroup_array_delete_safe_at(progs, pos, atype),
[Severity: Critical]
This is a pre-existing issue, but does pos correctly reflect the program's
physical index in the effective array?
When building the effective program array, compute_effective_progs() places
programs with BPF_F_PREORDER at the beginning of the array. However,
purge_effective_progs() calculates pos by sequentially iterating through
the hierarchy's hlists, which does not account for this reordering.
If update_effective_progs() fails (for example, due to OOM) during detach,
purge_effective_progs() is called as a fallback.
With the incorrect pos, could bpf_cgroup_array_delete_safe_at() overwrite
the wrong program with a dummy, leaving the intended program in the
effective array?
Since __cgroup_bpf_detach() subsequently calls bpf_prog_put() on the
detached program, the effective array would be left holding a dangling
pointer to a freed bpf_prog, which could result in a use-after-free when
the cgroup hook runs.
> "Failed to purge a prog from array at index %d", pos);
> }
> }
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260519215841.2984970-1-martin.lau@linux.dev?part=8
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [RFC PATCH bpf-next 08/12] bpf: Add a few bpf_cgroup_array_* helper functions
2026-05-19 22:45 ` sashiko-bot
@ 2026-05-19 22:50 ` Martin KaFai Lau
0 siblings, 0 replies; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 22:50 UTC (permalink / raw)
To: sashiko-reviews; +Cc: bpf, Yonghong Song
On 5/19/26 3:45 PM, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
>
> New issues:
> - [High] bpf_cgroup_array_copy_to_user() incorrectly returns -ENOSPC when the requested count is zero.
>
> Pre-existing issues:
> - [Critical] purge_effective_progs() calculates linear indices that do not account for BPF_F_PREORDER physical reordering, leading to Use-After-Free.
> --
>
> commit 23e774d89cf8101cef3b2e8789a63f7fb930be02
> Author: Martin KaFai Lau <martin.lau@kernel.org>
>
> bpf: Add a few bpf_cgroup_array_* helper functions
>
> This patch adds bpf_cgroup_array_*() helper functions to prepare for
> supporting struct_ops maps in the cgroup bpf array. These helpers currently
> skip the dummy_bpf_prog, which is the same as the existing behavior.
> The callers are changed to use these new helpers as a no-op change.
>
>> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
>> index b5769f5401e64..a033aa479ab6e 100644
>> --- a/kernel/bpf/cgroup.c
>> +++ b/kernel/bpf/cgroup.c
>
> [ ... ]
>
>> +static int bpf_cgroup_array_copy_to_user(struct bpf_prog_array *array,
>> + __u32 __user *prog_ids, int cnt,
>> + enum cgroup_bpf_attach_type atype)
>> +{
>> + struct bpf_prog_array_item *item;
>> + int i = 0;
>> + u32 id;
>> +
>> + for (item = array->items; item->prog && i < cnt; item++) {
>> + if (item->prog == bpf_cgroup_array_dummy(atype))
>> + continue;
>> + id = item->prog->aux->id;
>> + if (copy_to_user(prog_ids + i, &id, sizeof(id)))
>> + return -EFAULT;
>> + i++;
>> + }
>> + return item->prog ? -ENOSPC : 0;
>> +}
>
> [Severity: High]
> If cnt is 0 (which happens if the attach type only has dummy programs,
> causing bpf_cgroup_array_length() to return 0), the loop condition i < cnt
> will fail immediately. At this point, item->prog still points to a dummy
> program (which is not NULL).
>
> Does this incorrectly return -ENOSPC instead of 0?
>
> If this occurs during __cgroup_bpf_query(), it looks like the entire
> syscall could fail with -ENOSPC even when the user provided a large enough
> buffer.
>
> [ ... ]
>
>> @@ -1124,7 +1185,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
>> lockdep_is_held(&cgroup_mutex));
>>
>> /* Remove the program from the array */
>> - WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
>> + WARN_ONCE(bpf_cgroup_array_delete_safe_at(progs, pos, atype),
>
> [Severity: Critical]
> This is a pre-existing issue, but does pos correctly reflect the program's
> physical index in the effective array?
>
> When building the effective program array, compute_effective_progs() places
> programs with BPF_F_PREORDER at the beginning of the array. However,
> purge_effective_progs() calculates pos by sequentially iterating through
> the hierarchy's hlists, which does not account for this reordering.
It is a question I have also raised in the commit message of patch 9 also.
>
> If update_effective_progs() fails (for example, due to OOM) during detach,
> purge_effective_progs() is called as a fallback.
>
> With the incorrect pos, could bpf_cgroup_array_delete_safe_at() overwrite
> the wrong program with a dummy, leaving the intended program in the
> effective array?
>
> Since __cgroup_bpf_detach() subsequently calls bpf_prog_put() on the
> detached program, the effective array would be left holding a dangling
> pointer to a freed bpf_prog, which could result in a use-after-free when
> the cgroup hook runs.
>
>> "Failed to purge a prog from array at index %d", pos);
>> }
>> }
>
^ permalink raw reply [flat|nested] 22+ messages in thread
* [RFC PATCH bpf-next 09/12] bpf: Add infrastructure to support attaching struct_ops to cgroups
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
` (7 preceding siblings ...)
2026-05-19 21:58 ` [RFC PATCH bpf-next 08/12] bpf: Add a few bpf_cgroup_array_* helper functions Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 22:50 ` sashiko-bot
2026-05-19 21:58 ` [RFC PATCH bpf-next 10/12] bpf: tcp: Support selected sock_ops callbacks as struct_ops Martin KaFai Lau
` (2 subsequent siblings)
11 siblings, 1 reply; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
This patch adds necessary infrastructure to attach a struct_ops
map to a cgroup. The initial need was to support migrating
the legacy BPF_PROG_TYPE_SOCK_OPS to a struct_ops.
Recently, there are other struct_ops use cases that
need to attach struct_ops to a cgroup. For example,
the recent BPF OOM and memcg discussion in LSFMMBPF 2026.
The motivation is to create a consistent expectation
for attaching struct_ops to cgroup instead of each subsystem
creating its own infrastructure. This logic includes
hierarchy expectation, ordering expectation,
attachment API, and rcu gp.
There is already an existing implementation for attaching
multiple bpf progs to a cgroup. There are also tools
built around it for querying. Attaching a struct_ops map
(which is a group of bpf programs) could also adhere to
a similar API and potentially reuse most of the existing
implementation.
A couple of ideas have been tried. One of them
is to use mprog.c. In terms of the amount of changes,
I eventually came to the same conclusion as in
commit 120933984460 ("bpf: Implement mprog API on top of existing cgroup progs").
I then shifted the focus to reusing the current
{update,compute,activate,purge}_effective_progs() which has
the main logic that implements the mprog API.
Since then, I tried to add a 'struct cgroup *cgroup' member
to the existing 'struct bpf_struct_ops_link' and link_create
will create a 'struct bpf_struct_ops_link' object to be stored
in the pl->link. This turns out to have more changes on
both cgroup.c and bpf_struct_ops.c than I like.
This patch directly reuses the 'struct bpf_cgroup_link' which
cgroup.c already understands. Add 'struct bpf_map *map'
to 'struct bpf_cgroup_link'. In the future, as more subsystems
are extended by struct_ops, we may consider to make
'struct bpf_map *map' as a primary citizen of a link
like 'struct bpf_prog *prog' and directly add
'struct bpf_map *map' to the generic 'struct bpf_link'.
The pl->link could be the traditional 'prog' link or the
new 'map' link. The places that need to handle them differently
have already been refactored into the new prog_list_*() added in
the earlier patch. In those new prog_list_*(), this patch will
check "pl->link && pl->link->map", learn that it is a 'map' link
and handle it correctly.
The bpf_prog_array also needs to handle that its item can store
the traditional 'prog' or it can store a struct_ops map.
The places that need to handle them differently have also
been refactored into the new bpf_cgroup_array_*() added
in the earlier patch. The two differences are:
- different sentinel (dummy_bpf_prog in prog vs cfi_stub in struct_ops)
- the array for struct_ops may need to go through different
rcu gp.
The bpf_cgroup_array_*() functions use the cgroup_bpf_attach_type (ie atype)
to distinguish the array is storing prog or storing struct_ops map.
This patch also implements a separate struct bpf_link_ops
"cgroup_struct_ops_link_ops" to have a separate link_ops implementation
that only handles the cgroup's struct_ops link.
Questions:
- Although this patch did not change it, it is not obvious to me how
the replace_effective_progs() and purge_effective_progs() handle
cases when there are existing BPF_F_PREORDER progs attached
in the hlist.
Misc notes:
- CGROUP_TCP_SOCK_OPS is added to the 'enum cgroup_bpf_attach_type'.
The actual implementation of the tcp_bpf_ops (a struct_ops)
will be added in the next patch.
- free_after_mult_rcu_gp is added to 'struct bpf_struct_ops' such that
the bpf_prog_array can have a mix of sleepable and
non-sleepable prog in a struct_ops. This can tell
how the bpf_prog_array should be freed.
- For a struct_ops that supports cgroup attachment, it does not need to
implement its own reg/unreg function. reg/unreg to a cgroup is
done by the common infrastructure added in this patch.
- The cgroup's struct_ops link only supports BPF_F_ALLOW_MULTI.
This is enforced internally in cgroup_bpf_struct_ops_attach.
This should be consistent with the current prog's link
behavior in cgroup_bpf_link_attach.
In the future, we may allow each subsystem to choose differently.
- A cgroup_atype member is added to 'struct bpf_struct_ops'.
When a subsystem struct_ops needs to support cgroup attachment,
it needs to add a value to 'enum cgroup_bpf_attach_type'
and then assign it to the newly added cgroup_atype member
in the bpf_struct_ops.
- During LINK_CREATE in syscall, the patch uses the same
BPF_STRUCT_OPS (in attr->link_create.attach_type).
The bpf_struct_ops_link_create learns the map and
from the map it learns the st_ops. If the st_ops->cgroup_atype
is not 0, it will create a cgroup's link.
- When a subsystem registers a struct_ops that supports cgroup
attachment, the struct_ops infrastructure will also ask the
cgroup infrastructure to remember a few things. This is done
by calling cgroup_bpf_struct_ops_register().
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
include/linux/bpf-cgroup-defs.h | 1 +
include/linux/bpf-cgroup.h | 28 +++
include/linux/bpf.h | 19 +-
include/uapi/linux/bpf.h | 4 +-
kernel/bpf/bpf_struct_ops.c | 29 +++
kernel/bpf/btf.c | 23 +-
kernel/bpf/cgroup.c | 358 ++++++++++++++++++++++++++++++--
kernel/bpf/syscall.c | 1 +
tools/include/uapi/linux/bpf.h | 4 +-
9 files changed, 446 insertions(+), 21 deletions(-)
diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index c9e6b26abab6..0147b8bec973 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -47,6 +47,7 @@ enum cgroup_bpf_attach_type {
CGROUP_INET6_GETSOCKNAME,
CGROUP_UNIX_GETSOCKNAME,
CGROUP_INET_SOCK_RELEASE,
+ CGROUP_TCP_SOCK_OPS,
CGROUP_LSM_START,
CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1,
MAX_CGROUP_BPF_ATTACH_TYPE
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index b2e79c2b41d5..8080f4a5c14b 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -100,6 +100,8 @@ struct bpf_cgroup_storage {
struct bpf_cgroup_link {
struct bpf_link link;
struct cgroup *cgroup;
+ struct bpf_map *map;
+ wait_queue_head_t wait_hup;
};
struct bpf_prog_list {
@@ -110,6 +112,18 @@ struct bpf_prog_list {
u32 flags;
};
+#define bpf_cgroup_struct_ops_foreach(var, item, cgrp, atype) \
+ for (item = rcu_dereference((cgrp)->bpf.effective[atype])->items;\
+ ((var) = READ_ONCE(item->kdata)); \
+ item++)
+
+static inline bool cgroup_bpf_is_struct_ops_atype(enum cgroup_bpf_attach_type atype)
+{
+ return atype == CGROUP_TCP_SOCK_OPS;
+}
+void cgroup_bpf_struct_ops_register(int atype, u32 type_id, void *cfi_stubs, bool mult_trace);
+int cgroup_bpf_struct_ops_attach(struct bpf_map *map, const union bpf_attr *attr);
+
void __init cgroup_bpf_lifetime_notifier_init(void);
int __cgroup_bpf_run_filter_skb(struct sock *sk,
@@ -478,6 +492,20 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
return 0;
}
+static inline bool cgroup_bpf_is_struct_ops_atype(int atype)
+{
+ return false;
+}
+static inline void cgroup_bpf_struct_ops_register(int atype, u32 type_id, void *cfi_stubs,
+ bool mult_trace)
+{
+}
+static inline int cgroup_bpf_struct_ops_attach(struct bpf_map *map,
+ const union bpf_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
#define cgroup_bpf_enabled(atype) (0)
#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) ({ 0; })
#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 26d641300f30..90a0e0ae0e85 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1993,11 +1993,18 @@ struct btf_member;
* unloaded while in use.
* @name: The name of the struct bpf_struct_ops object.
* @func_models: Func models
+ * @cgroup_atype: A value in enum cgroup_bpf_attach_type for cgroup attachment.
+ * 0 means the struct_ops type does not support cgroup attachment.
+ * If cgroup_atype is non-zero, the @reg and @unreg must be NULL
+ * because the attachment/detachment will be handled by the bpf core.
* @free_after_tasks_rcu_gp: Set to true if it needs the bpf core to wait for
* a tasks_rcu gp before freeing the struct_ops map
* and its progs. It is unnecessary if the @unreg
* has waited for the correct rcu gp or the @unreg
* has ensured all struct_ops prog has finished running.
+ * @free_after_mult_rcu_gp: Same as @free_after_tasks_rcu_gp but waiting for
+ * both tasks_trace_rcu and regular rcu grace period.
+ * It is usually needed if the struct_ops has sleepable prog.
*/
struct bpf_struct_ops {
const struct bpf_verifier_ops *verifier_ops;
@@ -2016,7 +2023,9 @@ struct bpf_struct_ops {
struct module *owner;
const char *name;
struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
+ int cgroup_atype;
bool free_after_tasks_rcu_gp;
+ bool free_after_mult_rcu_gp;
};
/* Every member of a struct_ops type has an instance even a member is not
@@ -2142,6 +2151,7 @@ void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map);
bool bpf_struct_ops_valid_to_reg(struct bpf_map *map);
int bpf_struct_ops_link_update_check(struct bpf_map *new_map, struct bpf_map *old_map,
struct bpf_map *expected_old_map);
+int bpf_struct_ops_map_cgroup_atype(struct bpf_map *map);
#ifdef CONFIG_NET
/* Define it here to avoid the use of forward declaration */
@@ -2214,6 +2224,10 @@ static inline u32 bpf_struct_ops_kdata_map_id(void *kdata)
{
return 0;
}
+static inline int bpf_struct_ops_map_cgroup_atype(struct bpf_map *map)
+{
+ return 0;
+}
static inline void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map)
{
return NULL;
@@ -2401,7 +2415,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
* since other cpus are walking the array of pointers in parallel.
*/
struct bpf_prog_array_item {
- struct bpf_prog *prog;
+ union {
+ struct bpf_prog *prog;
+ void *kdata;
+ };
union {
struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
u64 bpf_cookie;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aec171ccb6ef..835aa27fde64 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1742,7 +1742,7 @@ union bpf_attr {
__u32 prog_cnt;
__u32 count;
};
- __u32 :32;
+ __u32 type_id;
/* output: per-program attach_flags.
* not allowed to be set during effective query.
*/
@@ -6793,6 +6793,8 @@ struct bpf_link_info {
} xdp;
struct {
__u32 map_id;
+ __u32 :32;
+ __u64 cgroup_id;
} struct_ops;
struct {
__u32 pf;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 8650a3b88bf6..1cf2a1ff0a7d 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -13,6 +13,7 @@
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>
#include <linux/poll.h>
+#include <linux/bpf-cgroup.h>
struct bpf_struct_ops_value {
struct bpf_struct_ops_common_value common;
@@ -1075,6 +1076,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
goto errout;
}
+ if (st_ops_desc->st_ops->cgroup_atype && !(attr->map_flags & BPF_F_LINK)) {
+ ret = -EOPNOTSUPP;
+ goto errout;
+ }
+
vt = st_ops_desc->value_type;
if (attr->value_size != vt->size) {
ret = -EINVAL;
@@ -1115,6 +1121,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
mutex_init(&st_map->lock);
bpf_map_init_from_attr(map, attr);
+ map->free_after_mult_rcu_gp = st_ops_desc->st_ops->free_after_mult_rcu_gp;
map->free_after_rcu_gp = true;
return map;
@@ -1217,6 +1224,14 @@ u32 bpf_struct_ops_kdata_map_id(void *kdata)
return st_map->map.id;
}
+int bpf_struct_ops_map_cgroup_atype(struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = container_of(map, struct bpf_struct_ops_map, map);
+ return st_map->st_ops_desc->st_ops->cgroup_atype;
+}
+
void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map;
@@ -1392,6 +1407,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
struct bpf_link_primer link_primer;
struct bpf_struct_ops_map *st_map;
struct bpf_map *map;
+ int cgroup_atype;
int err;
map = bpf_map_get(attr->link_create.map_fd);
@@ -1405,6 +1421,19 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
goto err_out;
}
+ cgroup_atype = st_map->st_ops_desc->st_ops->cgroup_atype;
+ if (cgroup_atype) {
+ err = cgroup_bpf_struct_ops_attach(map, attr);
+ bpf_map_put(map);
+ return err;
+ }
+
+ if (memchr_inv(&attr->link_create.cgroup, 0, sizeof(attr->link_create.cgroup)) ||
+ attr->link_create.target_fd) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
link = kzalloc_obj(*link, GFP_USER);
if (!link) {
err = -ENOMEM;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 17d4ab0a8206..d282a77544ea 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -20,6 +20,7 @@
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/bpf_lsm.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
@@ -9668,6 +9669,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
struct bpf_verifier_log *log)
{
struct btf_struct_ops_tab *tab, *new_tab;
+ int cgroup_atype;
int i, err;
tab = btf->struct_ops_tab;
@@ -9679,8 +9681,10 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
btf->struct_ops_tab = tab;
}
+ cgroup_atype = st_ops->cgroup_atype;
for (i = 0; i < tab->cnt; i++)
- if (tab->ops[i].st_ops == st_ops)
+ if (tab->ops[i].st_ops == st_ops ||
+ (cgroup_atype && cgroup_atype == tab->ops[i].st_ops->cgroup_atype))
return -EEXIST;
if (tab->cnt == tab->capacity) {
@@ -9700,6 +9704,23 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
if (err)
return err;
+ if (cgroup_atype) {
+ if (!cgroup_bpf_is_struct_ops_atype(cgroup_atype) ||
+ st_ops->reg || st_ops->unreg || st_ops->free_after_tasks_rcu_gp) {
+ bpf_struct_ops_desc_release(&tab->ops[btf->struct_ops_tab->cnt]);
+ return -EINVAL;
+ }
+
+ /* There is no need to unregister from cgroup when the
+ * btf_free(). No struct_ops map and its cgroup link
+ * can be created once its btf is gone.
+ */
+ cgroup_bpf_struct_ops_register(cgroup_atype,
+ tab->ops[btf->struct_ops_tab->cnt].type_id,
+ st_ops->cfi_stubs,
+ st_ops->free_after_mult_rcu_gp);
+ }
+
btf->struct_ops_tab->cnt++;
return 0;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index a033aa479ab6..d496db48d2b8 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -24,6 +24,29 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+static u32 struct_ops_type_id[MAX_CGROUP_BPF_ATTACH_TYPE];
+static void *struct_ops_cfi_stubs[MAX_CGROUP_BPF_ATTACH_TYPE];
+static bool struct_ops_mult_rcu[MAX_CGROUP_BPF_ATTACH_TYPE];
+
+void cgroup_bpf_struct_ops_register(int atype, u32 type_id, void *cfi_stubs, bool mult_rcu)
+{
+ struct_ops_type_id[atype] = type_id;
+ struct_ops_cfi_stubs[atype] = cfi_stubs;
+ struct_ops_mult_rcu[atype] = mult_rcu;
+}
+
+static enum cgroup_bpf_attach_type find_atype_by_struct_ops_id(u32 type_id)
+{
+ enum cgroup_bpf_attach_type atype;
+
+ for (atype = 0; atype < MAX_CGROUP_BPF_ATTACH_TYPE; atype++) {
+ if (cgroup_bpf_is_struct_ops_atype(atype) &&
+ struct_ops_type_id[atype] == type_id)
+ return atype;
+ }
+ return CGROUP_BPF_ATTACH_TYPE_INVALID;
+}
+
/*
* cgroup bpf destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup bpf
@@ -285,6 +308,19 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
}
+static void cgroup_struct_ops_link_detach_wake(struct bpf_cgroup_link *link, bool wake_poll)
+{
+ cgroup_put(link->cgroup);
+ link->cgroup = NULL;
+
+ bpf_map_put(link->map);
+ /* READ_ONCE in cgroup_struct_ops_link_poll */
+ WRITE_ONCE(link->map, NULL);
+
+ if (wake_poll)
+ wake_up_interruptible_poll(&link->wait_hup, EPOLLHUP);
+}
+
/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
* It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
* doesn't free link memory, which will eventually be done by bpf_link's
@@ -292,21 +328,37 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
*/
static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
{
- if (link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
- bpf_trampoline_unlink_cgroup_shim(link->link.prog);
- cgroup_put(link->cgroup);
- link->cgroup = NULL;
+ if (link->map) {
+ cgroup_struct_ops_link_detach_wake(link, true);
+ } else {
+ if (link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(link->link.prog);
+ cgroup_put(link->cgroup);
+ link->cgroup = NULL;
+ }
}
-static void bpf_cgroup_array_free(struct bpf_prog_array *array)
+static void bpf_cgroup_array_free_rcu(struct rcu_head *rcu)
+{
+ kfree(container_of(rcu, struct bpf_prog_array, rcu));
+}
+
+static void bpf_cgroup_array_free(struct bpf_prog_array *array,
+ enum cgroup_bpf_attach_type atype)
{
if (!array || array == &bpf_empty_prog_array)
return;
- kfree_rcu(array, rcu);
+ if (struct_ops_mult_rcu[atype])
+ /* RCU tasks trace grace period implies RCU grace period. */
+ call_rcu_tasks_trace(&array->rcu, bpf_cgroup_array_free_rcu);
+ else
+ kfree_rcu(array, rcu);
}
static void *bpf_cgroup_array_dummy(enum cgroup_bpf_attach_type atype)
{
+ if (cgroup_bpf_is_struct_ops_atype(atype))
+ return struct_ops_cfi_stubs[atype];
return bpf_prog_dummy();
}
@@ -334,7 +386,12 @@ static int bpf_cgroup_array_copy_to_user(struct bpf_prog_array *array,
for (item = array->items; item->prog && i < cnt; item++) {
if (item->prog == bpf_cgroup_array_dummy(atype))
continue;
- id = item->prog->aux->id;
+
+ if (cgroup_bpf_is_struct_ops_atype(atype))
+ id = bpf_struct_ops_kdata_map_id(item->kdata);
+ else
+ id = item->prog->aux->id;
+
if (copy_to_user(prog_ids + i, &id, sizeof(id)))
return -EFAULT;
i++;
@@ -396,7 +453,7 @@ static void cgroup_bpf_release(struct work_struct *work)
old_array = rcu_dereference_protected(
cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
- bpf_cgroup_array_free(old_array);
+ bpf_cgroup_array_free(old_array, atype);
}
list_for_each_entry_safe(storage, stmp, storages, list_cg) {
@@ -440,17 +497,26 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
static void prog_list_init_item(struct bpf_prog_list *pl, struct bpf_prog_array_item *item)
{
- item->prog = prog_list_prog(pl);
- bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage);
+ if (pl->link && pl->link->map) {
+ item->kdata = bpf_struct_ops_map_kdata(pl->link->map);
+ } else {
+ item->prog = prog_list_prog(pl);
+ bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage);
+ }
}
static void prog_list_replace_item(struct bpf_prog_list *pl, struct bpf_prog_array_item *item)
{
- WRITE_ONCE(item->prog, pl->link->link.prog);
+ if (pl->link && pl->link->map)
+ WRITE_ONCE(item->kdata, bpf_struct_ops_map_kdata(pl->link->map));
+ else
+ WRITE_ONCE(item->prog, pl->link->link.prog);
}
static u32 prog_list_id(struct bpf_prog_list *pl)
{
+ if (pl->link && pl->link->map)
+ return pl->link->map->id;
return prog_list_prog(pl)->aux->id;
}
@@ -570,7 +636,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
*/
- bpf_cgroup_array_free(old_array);
+ bpf_cgroup_array_free(old_array, atype);
}
/**
@@ -610,7 +676,7 @@ static int cgroup_bpf_inherit(struct cgroup *cgrp)
return 0;
cleanup:
for (i = 0; i < NR; i++)
- bpf_cgroup_array_free(arrays[i]);
+ bpf_cgroup_array_free(arrays[i], i);
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
cgroup_bpf_put(p);
@@ -665,7 +731,7 @@ static int update_effective_progs(struct cgroup *cgrp,
if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
if (unlikely(desc->bpf.inactive)) {
- bpf_cgroup_array_free(desc->bpf.inactive);
+ bpf_cgroup_array_free(desc->bpf.inactive, atype);
desc->bpf.inactive = NULL;
}
continue;
@@ -684,7 +750,7 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
- bpf_cgroup_array_free(desc->bpf.inactive);
+ bpf_cgroup_array_free(desc->bpf.inactive, atype);
desc->bpf.inactive = NULL;
}
@@ -919,7 +985,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
if (pl) {
old_prog = pl->prog;
} else {
- pl = kmalloc_obj(*pl);
+ pl = kzalloc_obj(*pl);
if (!pl) {
bpf_cgroup_storages_free(new_storage);
return -ENOMEM;
@@ -1295,7 +1361,15 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
if (effective_query && prog_attach_flags)
return -EINVAL;
- if (type == BPF_LSM_CGROUP) {
+ if (type == BPF_STRUCT_OPS) {
+ u32 type_id = attr->query.type_id;
+
+ atype = find_atype_by_struct_ops_id(type_id);
+ if (atype == CGROUP_BPF_ATTACH_TYPE_INVALID)
+ return -ENOENT;
+ from_atype = to_atype = atype;
+ flags = 0;
+ } else if (type == BPF_LSM_CGROUP) {
if (!effective_query && attr->query.prog_cnt &&
prog_ids && !prog_attach_flags)
return -EINVAL;
@@ -2776,6 +2850,256 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
const struct bpf_prog_ops cg_sockopt_prog_ops = {
};
+static int __cgroup_struct_ops_link_detach(struct bpf_link *link, bool wake_poll)
+{
+ struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link);
+ enum cgroup_bpf_attach_type atype;
+ struct bpf_prog_list *pl;
+ struct bpf_map *map;
+ struct cgroup *cgrp;
+
+ cgroup_lock();
+
+ cgrp = cg_link->cgroup;
+ if (!cgrp) {
+ cgroup_unlock();
+ return 0;
+ }
+
+ map = cg_link->map;
+ atype = bpf_struct_ops_map_cgroup_atype(map);
+
+ hlist_for_each_entry(pl, &cgrp->bpf.progs[atype], node) {
+ if (pl->link == cg_link)
+ break;
+ }
+
+ /* mark deleted so compute_effective_progs() skips it */
+ pl->link = NULL;
+ if (update_effective_progs(cgrp, atype)) {
+ pl->link = cg_link;
+ purge_effective_progs(cgrp, NULL, cg_link, atype);
+ }
+
+ hlist_del(&pl->node);
+ cgroup_struct_ops_link_detach_wake(cg_link, wake_poll);
+ cgrp->bpf.revisions[atype]++;
+
+ cgroup_unlock();
+
+ kfree(pl);
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
+
+ return 0;
+}
+
+static int cgroup_struct_ops_link_detach(struct bpf_link *link)
+{
+ return __cgroup_struct_ops_link_detach(link, true);
+}
+
+static void cgroup_struct_ops_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+ __cgroup_struct_ops_link_detach(link, false);
+ kfree(cg_link);
+}
+
+static void cgroup_struct_ops_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+
+ cgroup_lock();
+ if (!cg_link->cgroup) {
+ cgroup_unlock();
+ return;
+ }
+
+ seq_printf(seq, "map_id:\t%u\n", cg_link->map->id);
+ seq_printf(seq, "cgroup_id:\t%llu\n", cgroup_id(cg_link->cgroup));
+ cgroup_unlock();
+}
+
+static int cgroup_struct_ops_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+ cgroup_lock();
+ if (!cg_link->cgroup) {
+ cgroup_unlock();
+ return 0;
+ }
+
+ info->struct_ops.map_id = cg_link->map->id;
+ info->struct_ops.cgroup_id = cgroup_id(cg_link->cgroup);
+ cgroup_unlock();
+ return 0;
+}
+
+static int cgroup_struct_ops_link_update(struct bpf_link *link, struct bpf_map *new_map,
+ struct bpf_map *expected_old_map)
+{
+ struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link);
+ enum cgroup_bpf_attach_type atype;
+ struct bpf_map *old_map;
+ struct cgroup *cgrp;
+ int err;
+
+ if (!bpf_struct_ops_valid_to_reg(new_map))
+ return -EINVAL;
+
+ cgroup_lock();
+
+ cgrp = cg_link->cgroup;
+ if (!cgrp) {
+ err = -ENOLINK;
+ goto out;
+ }
+
+ old_map = cg_link->map;
+ err = bpf_struct_ops_link_update_check(new_map, old_map, expected_old_map);
+ if (err)
+ goto out;
+
+ atype = bpf_struct_ops_map_cgroup_atype(new_map);
+ bpf_map_inc(new_map);
+ WRITE_ONCE(cg_link->map, new_map);
+ replace_effective_prog(cg_link->cgroup, atype, cg_link);
+ bpf_map_put(old_map);
+ cgrp->bpf.revisions[atype]++;
+
+out:
+ cgroup_unlock();
+ return err;
+}
+
+static __poll_t cgroup_struct_ops_link_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct bpf_cgroup_link *link = file->private_data;
+
+ poll_wait(file, &link->wait_hup, pts);
+
+ return READ_ONCE(link->map) ? 0 : EPOLLHUP;
+}
+
+static const struct bpf_link_ops cgroup_struct_ops_link_ops = {
+ .dealloc = cgroup_struct_ops_link_dealloc,
+ .detach = cgroup_struct_ops_link_detach,
+ .show_fdinfo = cgroup_struct_ops_link_show_fdinfo,
+ .fill_link_info = cgroup_struct_ops_link_fill_link_info,
+ .update_map = cgroup_struct_ops_link_update,
+ .poll = cgroup_struct_ops_link_poll,
+};
+
+int cgroup_bpf_struct_ops_attach(struct bpf_map *map, const union bpf_attr *attr)
+{
+ u32 flags = attr->link_create.flags;
+ u32 pl_flags = (flags & BPF_F_PREORDER) | BPF_F_ALLOW_MULTI;
+ enum cgroup_bpf_attach_type atype;
+ struct bpf_link_primer link_primer;
+ struct bpf_cgroup_link *link;
+ struct bpf_prog_list *pl = NULL;
+ struct hlist_head *progs;
+ struct cgroup *cgrp;
+ int err;
+
+ if (flags & ~BPF_F_LINK_ATTACH_MASK)
+ return -EINVAL;
+
+ /*
+ * Attaching struct_ops to cgroup is through link only. All relative
+ * position must be corresponding to a link id or fd.
+ */
+ if (attr->link_create.cgroup.relative_fd && !(flags & BPF_F_LINK))
+ return -EINVAL;
+
+ link = kzalloc_obj(*link, GFP_USER);
+ if (!link)
+ return -ENOMEM;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
+ &cgroup_struct_ops_link_ops, NULL,
+ attr->link_create.attach_type);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ return err;
+ }
+
+ cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
+ if (IS_ERR(cgrp)) {
+ err = PTR_ERR(cgrp);
+ goto cleanup;
+ }
+
+ bpf_map_inc(map);
+ link->map = map;
+ link->cgroup = cgrp;
+ init_waitqueue_head(&link->wait_hup);
+
+ atype = bpf_struct_ops_map_cgroup_atype(map);
+ progs = &cgrp->bpf.progs[atype];
+
+ cgroup_lock();
+
+ if (attr->link_create.cgroup.expected_revision &&
+ attr->link_create.cgroup.expected_revision != cgrp->bpf.revisions[atype]) {
+ err = -ESTALE;
+ goto unlock;
+ }
+
+ if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) {
+ err = -E2BIG;
+ goto unlock;
+ }
+
+ pl = kzalloc_obj(*pl);
+ if (!pl) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ pl->link = link;
+ pl->flags = pl_flags;
+ cgrp->bpf.flags[atype] = BPF_F_ALLOW_MULTI;
+
+ err = insert_pl_to_hlist(pl, progs, NULL, link,
+ flags | BPF_F_ALLOW_MULTI, attr->link_create.cgroup.relative_fd);
+ if (err)
+ goto unlock;
+
+ err = update_effective_progs(cgrp, atype);
+ if (err) {
+ hlist_del(&pl->node);
+ goto unlock;
+ }
+
+ cgrp->bpf.revisions[atype]++;
+
+ cgroup_unlock();
+
+ static_branch_inc(&cgroup_bpf_enabled_key[atype]);
+ return bpf_link_settle(&link_primer);
+
+unlock:
+ cgroup_unlock();
+
+cleanup:
+ kfree(pl);
+ if (link->cgroup) {
+ cgroup_put(link->cgroup);
+ link->cgroup = NULL;
+ bpf_map_put(link->map);
+ link->map = NULL;
+ }
+ bpf_link_cleanup(&link_primer);
+ return err;
+}
+
/* Common helpers for cgroup hooks. */
const struct bpf_func_proto *
cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d0e8e9c8c888..eb2e5a668b6d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4747,6 +4747,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
case BPF_LSM_CGROUP:
+ case BPF_STRUCT_OPS:
return cgroup_bpf_prog_query(attr, uattr);
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 37142e6d911a..16582abe34f7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1742,7 +1742,7 @@ union bpf_attr {
__u32 prog_cnt;
__u32 count;
};
- __u32 :32;
+ __u32 type_id;
/* output: per-program attach_flags.
* not allowed to be set during effective query.
*/
@@ -6793,6 +6793,8 @@ struct bpf_link_info {
} xdp;
struct {
__u32 map_id;
+ __u32 :32;
+ __u64 cgroup_id;
} struct_ops;
struct {
__u32 pf;
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [RFC PATCH bpf-next 09/12] bpf: Add infrastructure to support attaching struct_ops to cgroups
2026-05-19 21:58 ` [RFC PATCH bpf-next 09/12] bpf: Add infrastructure to support attaching struct_ops to cgroups Martin KaFai Lau
@ 2026-05-19 22:50 ` sashiko-bot
2026-05-19 23:56 ` Martin KaFai Lau
0 siblings, 1 reply; 22+ messages in thread
From: sashiko-bot @ 2026-05-19 22:50 UTC (permalink / raw)
To: Martin KaFai Lau; +Cc: bpf
Thank you for your contribution! Sashiko AI review found 4 potential issue(s) to consider:
- [High] bpf_cgroup_struct_ops_foreach does not skip dummy items, leading to unintended execution of cfi_stubs.
- [High] Using a loadable module's cfi_stubs as a dummy sentinel causes a Use-After-Free when the module is unloaded and reloaded.
- [Medium] The macro bpf_cgroup_struct_ops_foreach uses rcu_dereference(), which is incompatible with sleepable struct_ops programs.
- [Medium] cgroup_bpf_struct_ops_register() lacks concurrency protection and global uniqueness enforcement.
--
commit bad29e4e0d1357905491993ee45063766d0f7705
Author: Martin KaFai Lau <martin.lau@kernel.org>
bpf: Add infrastructure to support attaching struct_ops to cgroups
This patch adds necessary infrastructure to attach a struct_ops
map to a cgroup. The initial need was to support migrating
the legacy BPF_PROG_TYPE_SOCK_OPS to a struct_ops.
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index b2e79c2b41d51..8080f4a5c14b6 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
[ ... ]
> @@ -110,6 +112,18 @@ struct bpf_prog_list {
> u32 flags;
> };
>
> +#define bpf_cgroup_struct_ops_foreach(var, item, cgrp, atype) \
> + for (item = rcu_dereference((cgrp)->bpf.effective[atype])->items;\
> + ((var) = READ_ONCE(item->kdata)); \
> + item++)
[Severity: High]
Does this macro need to skip dummy sentinel items?
In cgroups with BPF_F_ALLOW_MULTI, compute_effective_progs() inserts dummy
sentinel items at the boundaries between cgroup levels. For struct_ops, this
dummy is the cfi_stubs pointer.
This macro appears to iterate over all non-NULL items and fails to skip the
dummy sentinel, which could cause the subsystem to repeatedly yield the
cfi_stubs pointer as a valid attached struct_ops instance.
[Severity: Medium]
Will this rcu_dereference() trigger Lockdep warnings for sleepable struct_ops
programs?
The commit message mentions adding free_after_mult_rcu_gp to support a mix of
sleepable and non-sleepable struct_ops, but plain rcu_dereference() enforces
rcu_read_lock_held().
If a subsystem iterates the array while holding rcu_read_lock_trace()
(required for sleepable execution), this might trigger a 'suspicious
rcu_dereference_check() usage' warning. Conversely, if it holds
rcu_read_lock(), the struct_ops program cannot sleep without causing a panic.
> +
> +static inline bool cgroup_bpf_is_struct_ops_atype(enum cgroup_bpf_attach_type atype)
> +{
[ ... ]
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index a033aa479ab6e..d496db48d2b86 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -24,6 +24,29 @@
> DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
> EXPORT_SYMBOL(cgroup_bpf_enabled_key);
>
> +static u32 struct_ops_type_id[MAX_CGROUP_BPF_ATTACH_TYPE];
> +static void *struct_ops_cfi_stubs[MAX_CGROUP_BPF_ATTACH_TYPE];
> +static bool struct_ops_mult_rcu[MAX_CGROUP_BPF_ATTACH_TYPE];
> +
> +void cgroup_bpf_struct_ops_register(int atype, u32 type_id, void *cfi_stubs, bool mult_rcu)
> +{
> + struct_ops_type_id[atype] = type_id;
> + struct_ops_cfi_stubs[atype] = cfi_stubs;
> + struct_ops_mult_rcu[atype] = mult_rcu;
> +}
[Severity: Medium]
Does this registration function need concurrency protection to prevent global
array corruption?
If two modules are loaded concurrently and declare the same cgroup_atype, it
seems this could cause a data race or silently overwrite the registered type_id
and cfi_stubs of a legitimate module.
> +
> +static enum cgroup_bpf_attach_type find_atype_by_struct_ops_id(u32 type_id)
[ ... ]
> static void *bpf_cgroup_array_dummy(enum cgroup_bpf_attach_type atype)
> {
> + if (cgroup_bpf_is_struct_ops_atype(atype))
> + return struct_ops_cfi_stubs[atype];
> return bpf_prog_dummy();
> }
[Severity: High]
Can using a loadable module's cfi_stubs as a dummy sentinel cause a
use-after-free when the module is unloaded and reloaded?
If all maps are detached, the module providing the struct_ops can be unloaded,
and its cfi_stubs memory freed. However, the cgroup's effective array might
still retain the dangling pointer to the freed cfi_stubs if BPF_F_ALLOW_MULTI
was set.
If the module is reloaded, cgroup_bpf_struct_ops_register() updates
struct_ops_cfi_stubs[atype] with a new pointer. If a user then queries the
cgroup via BPF_PROG_QUERY, the loop checks
item->prog == bpf_cgroup_array_dummy(atype) to skip the dummy. Since the
array contains the old freed pointer, it won't match the new cfi_stubs
pointer, and the query loop might mistakenly treat the old dummy as a valid
map, causing a use-after-free when calling bpf_struct_ops_kdata_map_id().
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260519215841.2984970-1-martin.lau@linux.dev?part=9
^ permalink raw reply [flat|nested] 22+ messages in thread* Re: [RFC PATCH bpf-next 09/12] bpf: Add infrastructure to support attaching struct_ops to cgroups
2026-05-19 22:50 ` sashiko-bot
@ 2026-05-19 23:56 ` Martin KaFai Lau
0 siblings, 0 replies; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 23:56 UTC (permalink / raw)
To: sashiko-reviews; +Cc: bpf
On Tue, May 19, 2026 at 10:50:02PM +0000, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 4 potential issue(s) to consider:
> - [High] bpf_cgroup_struct_ops_foreach does not skip dummy items, leading to unintended execution of cfi_stubs.
> - [High] Using a loadable module's cfi_stubs as a dummy sentinel causes a Use-After-Free when the module is unloaded and reloaded.
> - [Medium] The macro bpf_cgroup_struct_ops_foreach uses rcu_dereference(), which is incompatible with sleepable struct_ops programs.
> - [Medium] cgroup_bpf_struct_ops_register() lacks concurrency protection and global uniqueness enforcement.
> --
>
> commit bad29e4e0d1357905491993ee45063766d0f7705
> Author: Martin KaFai Lau <martin.lau@kernel.org>
>
> bpf: Add infrastructure to support attaching struct_ops to cgroups
>
> This patch adds necessary infrastructure to attach a struct_ops
> map to a cgroup. The initial need was to support migrating
> the legacy BPF_PROG_TYPE_SOCK_OPS to a struct_ops.
>
> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > index b2e79c2b41d51..8080f4a5c14b6 100644
> > --- a/include/linux/bpf-cgroup.h
> > +++ b/include/linux/bpf-cgroup.h
>
> [ ... ]
>
> > @@ -110,6 +112,18 @@ struct bpf_prog_list {
> > u32 flags;
> > };
> >
> > +#define bpf_cgroup_struct_ops_foreach(var, item, cgrp, atype) \
> > + for (item = rcu_dereference((cgrp)->bpf.effective[atype])->items;\
> > + ((var) = READ_ONCE(item->kdata)); \
> > + item++)
>
> [Severity: High]
> Does this macro need to skip dummy sentinel items?
>
> In cgroups with BPF_F_ALLOW_MULTI, compute_effective_progs() inserts dummy
> sentinel items at the boundaries between cgroup levels. For struct_ops, this
> dummy is the cfi_stubs pointer.
>
> This macro appears to iterate over all non-NULL items and fails to skip the
> dummy sentinel, which could cause the subsystem to repeatedly yield the
> cfi_stubs pointer as a valid attached struct_ops instance.
The ops in the cfi_stubs is safe to be called. It intentionally skips
checking for dummy in the fast path.
>
> [Severity: Medium]
> Will this rcu_dereference() trigger Lockdep warnings for sleepable struct_ops
> programs?
>
> The commit message mentions adding free_after_mult_rcu_gp to support a mix of
> sleepable and non-sleepable struct_ops, but plain rcu_dereference() enforces
> rcu_read_lock_held().
>
> If a subsystem iterates the array while holding rcu_read_lock_trace()
> (required for sleepable execution), this might trigger a 'suspicious
> rcu_dereference_check() usage' warning. Conversely, if it holds
> rcu_read_lock(), the struct_ops program cannot sleep without causing a panic.
The proposed bpf_tcp_ops only has non-sleepable programs.
The future sleepable callsite will need another version of foreach or
rcu_dereference_check can be used here also. I think rcu_dereference_check
is better.
>
> > +
> > +static inline bool cgroup_bpf_is_struct_ops_atype(enum cgroup_bpf_attach_type atype)
> > +{
>
> [ ... ]
>
> > diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> > index a033aa479ab6e..d496db48d2b86 100644
> > --- a/kernel/bpf/cgroup.c
> > +++ b/kernel/bpf/cgroup.c
> > @@ -24,6 +24,29 @@
> > DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
> > EXPORT_SYMBOL(cgroup_bpf_enabled_key);
> >
> > +static u32 struct_ops_type_id[MAX_CGROUP_BPF_ATTACH_TYPE];
> > +static void *struct_ops_cfi_stubs[MAX_CGROUP_BPF_ATTACH_TYPE];
> > +static bool struct_ops_mult_rcu[MAX_CGROUP_BPF_ATTACH_TYPE];
> > +
> > +void cgroup_bpf_struct_ops_register(int atype, u32 type_id, void *cfi_stubs, bool mult_rcu)
> > +{
> > + struct_ops_type_id[atype] = type_id;
> > + struct_ops_cfi_stubs[atype] = cfi_stubs;
> > + struct_ops_mult_rcu[atype] = mult_rcu;
> > +}
>
> [Severity: Medium]
> Does this registration function need concurrency protection to prevent global
> array corruption?
>
> If two modules are loaded concurrently and declare the same cgroup_atype, it
> seems this could cause a data race or silently overwrite the registered type_id
> and cfi_stubs of a legitimate module.
This will be a bug in the subsystems having the same cgroup_atype
but (the next one)...
>
> > +
> > +static enum cgroup_bpf_attach_type find_atype_by_struct_ops_id(u32 type_id)
>
> [ ... ]
>
> > static void *bpf_cgroup_array_dummy(enum cgroup_bpf_attach_type atype)
> > {
> > + if (cgroup_bpf_is_struct_ops_atype(atype))
> > + return struct_ops_cfi_stubs[atype];
> > return bpf_prog_dummy();
> > }
>
> [Severity: High]
> Can using a loadable module's cfi_stubs as a dummy sentinel cause a
> use-after-free when the module is unloaded and reloaded?
>
> If all maps are detached, the module providing the struct_ops can be unloaded,
> and its cfi_stubs memory freed. However, the cgroup's effective array might
> still retain the dangling pointer to the freed cfi_stubs if BPF_F_ALLOW_MULTI
> was set.
>
> If the module is reloaded, cgroup_bpf_struct_ops_register() updates
> struct_ops_cfi_stubs[atype] with a new pointer. If a user then queries the
> cgroup via BPF_PROG_QUERY, the loop checks
> item->prog == bpf_cgroup_array_dummy(atype) to skip the dummy. Since the
> array contains the old freed pointer, it won't match the new cfi_stubs
> pointer, and the query loop might mistakenly treat the old dummy as a valid
> map, causing a use-after-free when calling bpf_struct_ops_kdata_map_id().
Agree. This will be an issue on BPF_PROG_QUERY when the module reload.
I did think about the module reload case but I have only considered the
rcu-reader in bpf_cgroup_struct_ops_foreach.
BPF_PROG_QUERY does not use rcu_read_lock and depends on cgroup_lock.
This will need to be addressed.
^ permalink raw reply [flat|nested] 22+ messages in thread
* [RFC PATCH bpf-next 10/12] bpf: tcp: Support selected sock_ops callbacks as struct_ops
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
` (8 preceding siblings ...)
2026-05-19 21:58 ` [RFC PATCH bpf-next 09/12] bpf: Add infrastructure to support attaching struct_ops to cgroups Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 11/12] libbpf: Support attaching struct_ops to a cgroup Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 12/12] selftests/bpf: Test " Martin KaFai Lau
11 siblings, 0 replies; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
In LSFMMBPF 2025, I have talked about moving the BPF_PROG_TYPE_SOCK_OPS
to a struct_ops interface [1].
The BPF_SOCK_OPS_*_CB enum interface has grown over time as new TCP callback
points were added. A BPF_PROG_TYPE_SOCK_OPS program now commonly needs a
large switch on sock_ops->op, and the shared bpf_sock_ops_kern context has
become harder to extend because different callbacks have different locking,
argument, skb, and helper requirements. The existing
'union { u32 args[4]; u32 replylong[4]; }' is also not reliable in
passing args to bpf prog when there are multiple progs attached to a cgroup.
The above has already been solved in struct_ops. Add a TCP-specific
struct_ops type, bpf_tcp_ops, and support attaching it to cgroups.
This allows each callback have its own func signature and allows
the verifier to select kfuncs/helpers based on the specific
struct_ops member being implemented.
This patch wires up the following existing sock_ops callbacks:
- BPF_SOCK_OPS_TIMEOUT_INIT
- BPF_SOCK_OPS_RWND_INIT
- BPF_SOCK_OPS_RTT_CB
- BPF_SOCK_OPS_STATE_CB
- BPF_SOCK_OPS_RETRANS_CB
- BPF_SOCK_OPS_TCP_CONNECT_CB
- BPF_SOCK_OPS_TCP_LISTEN_CB
I don't think BASE_RTT is useful. NEEDS_ECN should be done in
bpf-tcp-cc instead. The tstamp ones should be a separate
struct_ops (e.g. "bpf_sock_ops") that can work in both TCP and UDP.
timeout_init and rwnd_init could have a request_sock pointer. This patch
tries a different API and direclty passes the request_sock pointer as
an arg
The patch is incomplete. TODOs:
- A skb argument should be added to the retrans ops.
- ACTIVE_ESTABLISHED_CB and PASSIVE_ESTABLISHED_CB
- BPF_SOCK_OPS_*HDR related ops
- RTO
[1], page 13: https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view?usp=sharing
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
include/net/tcp.h | 74 ++++++++++++++++++++++-
net/ipv4/Makefile | 1 +
net/ipv4/af_inet.c | 1 +
net/ipv4/bpf_tcp_ops.c | 134 +++++++++++++++++++++++++++++++++++++++++
net/ipv4/tcp.c | 4 +-
net/ipv4/tcp_output.c | 5 +-
6 files changed, 214 insertions(+), 5 deletions(-)
create mode 100644 net/ipv4/bpf_tcp_ops.c
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ecbadcb3a744..eb9ff07f3c83 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2960,12 +2960,78 @@ static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
#endif
+#if defined(CONFIG_BPF_JIT) && defined(CONFIG_CGROUP_BPF)
+
+struct bpf_tcp_ops {
+ int (*timeout_init)(struct sock *sk, struct request_sock *req);
+ int (*rwnd_init)(struct sock *sk, struct request_sock *req);
+ void (*rtt)(struct sock *sk, long mrtt, u32 srtt);
+ void (*set_state)(struct sock *sk, int state);
+ void (*retrans)(struct sock *sk, int type);
+ void (*connect)(struct sock *sk);
+ void (*listen)(struct sock *sk);
+};
+
+#define bpf_tcp_ops_call(op, sk, ...) \
+do { \
+ if (cgroup_bpf_enabled(CGROUP_TCP_SOCK_OPS)) { \
+ const struct bpf_prog_array_item *item; \
+ const struct bpf_tcp_ops *tcp_ops; \
+ struct cgroup *cgrp; \
+ \
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); \
+ rcu_read_lock_dont_migrate(); \
+ bpf_cgroup_struct_ops_foreach(tcp_ops, item, cgrp, \
+ CGROUP_TCP_SOCK_OPS) { \
+ if (tcp_ops->op) \
+ tcp_ops->op(sk, ##__VA_ARGS__); \
+ } \
+ rcu_read_unlock_migrate(); \
+ } \
+} while (0)
+
+#define bpf_tcp_ops_call_int(op, init_retval, sk, ...) \
+({ \
+ int __retval = (init_retval); \
+ if (cgroup_bpf_enabled(CGROUP_TCP_SOCK_OPS)) { \
+ const struct bpf_prog_array_item *item; \
+ const struct bpf_tcp_ops *tcp_ops; \
+ struct bpf_run_ctx *old_run_ctx; \
+ struct bpf_cg_run_ctx run_ctx; \
+ struct sock *__sk = sk_to_full_sk(sk); \
+ struct request_sock *req = NULL; \
+ struct cgroup *cgrp; \
+ \
+ if (__sk) { \
+ run_ctx.retval = (init_retval); \
+ cgrp = sock_cgroup_ptr(&__sk->sk_cgrp_data); \
+ if (!sk_fullsock(sk)) \
+ req = (struct request_sock *)sk; \
+ rcu_read_lock_dont_migrate(); \
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\
+ bpf_cgroup_struct_ops_foreach(tcp_ops, item, cgrp, \
+ CGROUP_TCP_SOCK_OPS) { \
+ if (tcp_ops->op) \
+ run_ctx.retval = tcp_ops->op(__sk, req, ##__VA_ARGS__); \
+ } \
+ bpf_reset_run_ctx(old_run_ctx); \
+ rcu_read_unlock_migrate(); \
+ __retval = run_ctx.retval; \
+ } \
+ } \
+ __retval; \
+})
+#else
+#define bpf_tcp_ops_call(op, sk, ...) do { } while (0)
+#define bpf_tcp_ops_call_int(op, init_retval, sk, ...) (init_retval)
+#endif
+
static inline u32 tcp_timeout_init(struct sock *sk)
{
int timeout;
timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
-
+ timeout = bpf_tcp_ops_call_int(timeout_init, timeout, sk);
if (timeout <= 0)
timeout = TCP_TIMEOUT_INIT;
return min_t(int, timeout, TCP_RTO_MAX);
@@ -2976,7 +3042,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
int rwnd;
rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
-
+ rwnd = bpf_tcp_ops_call_int(rwnd_init, rwnd, sk);
if (rwnd < 0)
rwnd = 0;
return rwnd;
@@ -2989,8 +3055,10 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt)
{
- if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG))
+ if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) {
tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt);
+ bpf_tcp_ops_call(rtt, sk, mrtt, srtt);
+ }
}
#if IS_ENABLED(CONFIG_SMC)
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7f9f98813986..356335e06b4c 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -71,4 +71,5 @@ obj-$(CONFIG_TCP_AO) += tcp_ao.o
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o
+obj-$(CONFIG_CGROUP_BPF) += bpf_tcp_ops.o
endif
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0e62032e76b1..71f9a171310f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -227,6 +227,7 @@ int __inet_listen_sk(struct sock *sk, int backlog)
return err;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
+ bpf_tcp_ops_call(listen, sk);
}
return 0;
}
diff --git a/net/ipv4/bpf_tcp_ops.c b/net/ipv4/bpf_tcp_ops.c
new file mode 100644
index 000000000000..aa647d805882
--- /dev/null
+++ b/net/ipv4/bpf_tcp_ops.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_verifier.h>
+#include <net/bpf_sk_storage.h>
+#include <net/tcp.h>
+
+static int timeout_init_stub(struct sock *sk, struct request_sock *req__nullable)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ return ctx->retval;
+}
+
+static int rwnd_init_stub(struct sock *sk, struct request_sock *req__nullable)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ return ctx->retval;
+}
+
+static void rtt_stub(struct sock *sk, long mrtt, u32 srtt)
+{
+}
+
+static void set_state_stub(struct sock *sk, int state)
+{
+}
+
+static void retrans_stub(struct sock *sk, int type)
+{
+}
+
+static void connect_stub(struct sock *sk)
+{
+}
+
+static void listen_stub(struct sock *sk)
+{
+}
+
+static struct bpf_tcp_ops __bpf_tcp_ops = {
+ .timeout_init = timeout_init_stub,
+ .rwnd_init = rwnd_init_stub,
+ .rtt = rtt_stub,
+ .set_state = set_state_stub,
+ .retrans = retrans_stub,
+ .connect = connect_stub,
+ .listen = listen_stub,
+};
+
+static const struct bpf_func_proto *
+get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ u32 moff = prog->aux->attach_st_ops_member_off;
+
+ switch (func_id) {
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ /* The listener is not locked. */
+ if (moff == offsetof(struct bpf_tcp_ops, rwnd_init) ||
+ moff == offsetof(struct bpf_tcp_ops, timeout_init))
+ return NULL;
+ return &bpf_sk_setsockopt_proto;
+ case BPF_FUNC_getsockopt:
+ if (moff == offsetof(struct bpf_tcp_ops, rwnd_init) ||
+ moff == offsetof(struct bpf_tcp_ops, timeout_init))
+ return NULL;
+ return &bpf_sk_getsockopt_proto;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+}
+
+static bool is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog, struct bpf_insn_access_aux *info)
+{
+ if (!bpf_tracing_btf_ctx_access(off, size, type, prog, info))
+ return false;
+
+ if (base_type(info->reg_type) == PTR_TO_BTF_ID &&
+ !bpf_type_has_unsafe_modifiers(info->reg_type) &&
+ info->btf_id == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
+ /* promote it to tcp_sock */
+ info->btf_id = btf_sock_ids[BTF_SOCK_TYPE_TCP];
+
+ return true;
+}
+
+static int bpf_tcp_ops_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ return 0;
+}
+
+static int bpf_tcp_ops_init(struct btf *btf)
+{
+ return 0;
+}
+
+static int bpf_tcp_ops_validate(void *kdata)
+{
+ return 0;
+}
+
+static const struct bpf_verifier_ops bpf_tcp_ops_verifier = {
+ .get_func_proto = get_func_proto,
+ .is_valid_access = is_valid_access,
+};
+
+static struct bpf_struct_ops bpf_tcp_ops = {
+ .verifier_ops = &bpf_tcp_ops_verifier,
+ .init_member = bpf_tcp_ops_init_member,
+ .init = bpf_tcp_ops_init,
+ .validate = bpf_tcp_ops_validate,
+ .name = "bpf_tcp_ops",
+ .cgroup_atype = CGROUP_TCP_SOCK_OPS,
+ .cfi_stubs = &__bpf_tcp_ops,
+ .owner = THIS_MODULE,
+};
+
+static int __init __bpf_tcp_ops_init(void)
+{
+ return register_bpf_struct_ops(&bpf_tcp_ops, bpf_tcp_ops);
+}
+late_initcall(__bpf_tcp_ops_init)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 432fa28e47d4..c0f47a4c7980 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2998,8 +2998,10 @@ void tcp_set_state(struct sock *sk, int state)
*/
BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
- if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+ if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) {
tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
+ bpf_tcp_ops_call(set_state, sk, state);
+ }
switch (state) {
case TCP_ESTABLISHED:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f9d8755705f7..ed51713b2216 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3673,9 +3673,11 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
- if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) {
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
TCP_SKB_CB(skb)->seq, segs, err);
+ bpf_tcp_ops_call(retrans, sk, err);
+ }
if (unlikely(err) && err != -EBUSY)
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
@@ -4296,6 +4298,7 @@ int tcp_connect(struct sock *sk)
int err;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
+ bpf_tcp_ops_call(connect, sk);
#if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO)
/* Has to be checked late, after setting daddr/saddr/ops.
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* [RFC PATCH bpf-next 11/12] libbpf: Support attaching struct_ops to a cgroup
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
` (9 preceding siblings ...)
2026-05-19 21:58 ` [RFC PATCH bpf-next 10/12] bpf: tcp: Support selected sock_ops callbacks as struct_ops Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 21:58 ` [RFC PATCH bpf-next 12/12] selftests/bpf: Test " Martin KaFai Lau
11 siblings, 0 replies; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
Add bpf_map__attach_cgroup_opts.
Make bpf_prog_query_opts support type_id.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
tools/lib/bpf/bpf.c | 2 ++
tools/lib/bpf/bpf.h | 3 +-
tools/lib/bpf/libbpf.c | 59 ++++++++++++++++++++++++++++++++++
tools/lib/bpf/libbpf.h | 3 ++
tools/lib/bpf/libbpf.map | 5 +++
tools/lib/bpf/libbpf_version.h | 2 +-
6 files changed, 72 insertions(+), 2 deletions(-)
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 483c02cf21d1..0be1fc5a64e9 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -919,6 +919,7 @@ int bpf_link_create(int prog_fd, int target_fd,
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
case BPF_LSM_CGROUP:
+ case BPF_STRUCT_OPS:
relative_fd = OPTS_GET(opts, cgroup.relative_fd, 0);
relative_id = OPTS_GET(opts, cgroup.relative_id, 0);
if (relative_fd && relative_id)
@@ -1041,6 +1042,7 @@ int bpf_prog_query_opts(int target, enum bpf_attach_type type,
attr.query.attach_type = type;
attr.query.query_flags = OPTS_GET(opts, query_flags, 0);
attr.query.count = OPTS_GET(opts, count, 0);
+ attr.query.type_id = OPTS_GET(opts, type_id, 0);
attr.query.prog_ids = ptr_to_u64(OPTS_GET(opts, prog_ids, NULL));
attr.query.link_ids = ptr_to_u64(OPTS_GET(opts, link_ids, NULL));
attr.query.prog_attach_flags = ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL));
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 2312900a3263..aabac85399b4 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -631,9 +631,10 @@ struct bpf_prog_query_opts {
__u32 *link_ids;
__u32 *link_attach_flags;
__u64 revision;
+ __u32 type_id;
size_t :0;
};
-#define bpf_prog_query_opts__last_field revision
+#define bpf_prog_query_opts__last_field type_id
/**
* @brief **bpf_prog_query_opts()** queries the BPF programs and BPF links
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ab2071fdd3e8..7ac48b7eea6c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -13736,6 +13736,65 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
return &link->link;
}
+struct bpf_link *bpf_map__attach_cgroup_opts(const struct bpf_map *map, int cgroup_fd,
+ const struct bpf_cgroup_opts *opts)
+{
+ LIBBPF_OPTS(bpf_link_create_opts, link_create_opts);
+ struct bpf_link_struct_ops *link;
+ __u32 relative_id, zero = 0;
+ int err, fd, relative_fd;
+
+ if (!OPTS_VALID(opts, bpf_cgroup_opts))
+ return libbpf_err_ptr(-EINVAL);
+
+ if (!bpf_map__is_struct_ops(map)) {
+ pr_warn("map '%s': can't attach non-struct_ops map\n", map->name);
+ return libbpf_err_ptr(-EINVAL);
+ }
+
+ if (map->fd < 0) {
+ pr_warn("map '%s': can't attach BPF map without FD (was it created?)\n", map->name);
+ return libbpf_err_ptr(-EINVAL);
+ }
+
+ relative_id = OPTS_GET(opts, relative_id, 0);
+ relative_fd = OPTS_GET(opts, relative_fd, 0);
+
+ if (relative_fd && relative_id) {
+ pr_warn("map '%s': relative_fd and relative_id cannot be set at the same time\n",
+ map->name);
+ return libbpf_err_ptr(-EINVAL);
+ }
+
+ link_create_opts.cgroup.expected_revision = OPTS_GET(opts, expected_revision, 0);
+ link_create_opts.cgroup.relative_fd = relative_fd;
+ link_create_opts.cgroup.relative_id = relative_id;
+ link_create_opts.flags = OPTS_GET(opts, flags, 0);
+
+ link = calloc(1, sizeof(*link));
+ if (!link)
+ return libbpf_err_ptr(-ENOMEM);
+
+ err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
+ if (err && err != -EBUSY) {
+ free(link);
+ return libbpf_err_ptr(err);
+ }
+
+ link->link.detach = bpf_link__detach_struct_ops;
+
+ fd = bpf_link_create(map->fd, cgroup_fd, BPF_STRUCT_OPS, &link_create_opts);
+ if (fd < 0) {
+ free(link);
+ return libbpf_err_ptr(fd);
+ }
+
+ link->link.fd = fd;
+ link->map_fd = map->fd;
+
+ return &link->link;
+}
+
/*
* Swap the back struct_ops of a link with a new struct_ops map.
*/
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index bba4e8464396..33274f6f5710 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -945,6 +945,9 @@ bpf_program__attach_cgroup_opts(const struct bpf_program *prog, int cgroup_fd,
struct bpf_map;
LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
+LIBBPF_API struct bpf_link *bpf_map__attach_cgroup_opts(const struct bpf_map *map,
+ int cgroup_fd,
+ const struct bpf_cgroup_opts *opts);
LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map);
struct bpf_iter_attach_opts {
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index dfed8d60af05..38b49bbe3193 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -461,3 +461,8 @@ LIBBPF_1.8.0 {
bpf_program__clone;
btf__new_empty_opts;
} LIBBPF_1.7.0;
+
+LIBBPF_1.9.0 {
+ global:
+ bpf_map__attach_cgroup_opts;
+} LIBBPF_1.8.0;
diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h
index c446c0cd8cf9..57b74ef3618c 100644
--- a/tools/lib/bpf/libbpf_version.h
+++ b/tools/lib/bpf/libbpf_version.h
@@ -4,6 +4,6 @@
#define __LIBBPF_VERSION_H
#define LIBBPF_MAJOR_VERSION 1
-#define LIBBPF_MINOR_VERSION 8
+#define LIBBPF_MINOR_VERSION 9
#endif /* __LIBBPF_VERSION_H */
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* [RFC PATCH bpf-next 12/12] selftests/bpf: Test attaching struct_ops to a cgroup
2026-05-19 21:58 [RFC PATCH bpf-next 00/12] bpf: A common way to attach struct_ops to a cgroup Martin KaFai Lau
` (10 preceding siblings ...)
2026-05-19 21:58 ` [RFC PATCH bpf-next 11/12] libbpf: Support attaching struct_ops to a cgroup Martin KaFai Lau
@ 2026-05-19 21:58 ` Martin KaFai Lau
2026-05-19 23:03 ` sashiko-bot
11 siblings, 1 reply; 22+ messages in thread
From: Martin KaFai Lau @ 2026-05-19 21:58 UTC (permalink / raw)
To: bpf
Cc: 'Alexei Starovoitov ', 'Andrii Nakryiko ',
'Daniel Borkmann ', 'Shakeel Butt ',
'Roman Gushchin ', 'Amery Hung ', netdev
From: Martin KaFai Lau <martin.lau@kernel.org>
Basic tests for attaching struct_ops to cgroup.
TODO:
- more than one level of cgroup setup.
- BPF_F_BEFORE/AFTER.
- bpf_link__update_map an existing struct_ops with and without BPF_F_PREORDER.
- cgroup_bpf_inherit().
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
.../selftests/bpf/prog_tests/bpf_tcp_ops.c | 207 ++++++++++++++++++
.../testing/selftests/bpf/progs/bpf_tcp_ops.c | 97 ++++++++
2 files changed, 304 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c
create mode 100644 tools/testing/selftests/bpf/progs/bpf_tcp_ops.c
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c
new file mode 100644
index 000000000000..ae1cb874770b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include <network_helpers.h>
+#include <bpf/btf.h>
+#include "cgroup_helpers.h"
+#include "bpf_tcp_ops.skel.h"
+
+#define CGROUP_PATH "/bpf_tcp_ops"
+#define TEST_NETNS "bpf_tcp_ops"
+
+static __s32 get_bpf_tcp_ops_type_id(void)
+{
+ struct btf *vmlinux_btf;
+ __s32 type_id;
+
+ vmlinux_btf = btf__load_vmlinux_btf();
+ if (!ASSERT_OK_PTR(vmlinux_btf, "load_vmlinux_btf"))
+ return -1;
+
+ type_id = btf__find_by_name_kind(vmlinux_btf, "bpf_tcp_ops", BTF_KIND_STRUCT);
+ btf__free(vmlinux_btf);
+
+ ASSERT_GT(type_id, 0, "find_bpf_tcp_ops");
+ return type_id;
+}
+
+static void reset_order(struct bpf_tcp_ops *skel)
+{
+ memset(skel->bss->listen_order, 0, sizeof(skel->bss->listen_order));
+ memset(skel->bss->connect_order, 0, sizeof(skel->bss->connect_order));
+ skel->bss->listen_cnt = 0;
+ skel->bss->connect_cnt = 0;
+}
+
+static void do_listen_connect(int family)
+{
+ const char *addr = family == AF_INET ? "127.0.0.1" : "::1";
+ int server_fd, client_fd;
+
+ server_fd = start_server(family, SOCK_STREAM, addr, 0, 0);
+ if (!ASSERT_GE(server_fd, 0, "start_server"))
+ return;
+
+ client_fd = connect_to_fd(server_fd, 0);
+ if (ASSERT_OK_FD(client_fd, "connect_to_fd"))
+ close(client_fd);
+
+ close(server_fd);
+}
+
+/*
+ * Attach ops1 and ops2 normally (in that order), then ops3 with
+ * BPF_F_PREORDER. Expected execution order: [3, 1, 2] — ops3 runs
+ * first despite being attached last, ops1 before ops2 by attach order.
+ */
+static void test_order(int cgroup_fd, struct bpf_tcp_ops *skel, int family)
+{
+ LIBBPF_OPTS(bpf_cgroup_opts, preorder_opts, .flags = BPF_F_PREORDER);
+ struct bpf_link *link1 = NULL, *link2 = NULL, *link3 = NULL;
+
+ link1 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops1, cgroup_fd, NULL);
+ if (!ASSERT_OK_PTR(link1, "attach_ops1"))
+ goto done;
+
+ link2 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops2, cgroup_fd, NULL);
+ if (!ASSERT_OK_PTR(link2, "attach_ops2"))
+ goto done;
+
+ link3 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops3, cgroup_fd,
+ &preorder_opts);
+ if (!ASSERT_OK_PTR(link3, "attach_ops3_preorder"))
+ goto done;
+
+ reset_order(skel);
+ do_listen_connect(family);
+
+ ASSERT_EQ(skel->bss->listen_cnt, 3, "listen_cnt");
+ ASSERT_EQ(skel->bss->listen_order[0], 3, "listen_order[0]");
+ ASSERT_EQ(skel->bss->listen_order[1], 1, "listen_order[1]");
+ ASSERT_EQ(skel->bss->listen_order[2], 2, "listen_order[2]");
+
+ ASSERT_EQ(skel->bss->connect_cnt, 3, "connect_cnt");
+ ASSERT_EQ(skel->bss->connect_order[0], 3, "connect_order[0]");
+ ASSERT_EQ(skel->bss->connect_order[1], 1, "connect_order[1]");
+ ASSERT_EQ(skel->bss->connect_order[2], 2, "connect_order[2]");
+
+done:
+ bpf_link__destroy(link3);
+ bpf_link__destroy(link2);
+ bpf_link__destroy(link1);
+}
+
+static void test_query(int cgroup_fd, struct bpf_tcp_ops *skel)
+{
+ struct bpf_map_info info = {};
+ __u32 info_len = sizeof(info);
+ LIBBPF_OPTS(bpf_prog_query_opts, query_opts);
+ struct bpf_link *link1 = NULL, *link2 = NULL;
+ __u32 map1_id, map2_id, map_ids[2] = {};
+ __s32 type_id;
+
+ type_id = get_bpf_tcp_ops_type_id();
+ if (type_id <= 0)
+ return;
+
+ bpf_map_get_info_by_fd(bpf_map__fd(skel->maps.tcp_ops1), &info, &info_len);
+ map1_id = info.id;
+
+ bpf_map_get_info_by_fd(bpf_map__fd(skel->maps.tcp_ops2), &info, &info_len);
+ map2_id = info.id;
+
+ link1 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops1, cgroup_fd, NULL);
+ if (!ASSERT_OK_PTR(link1, "attach_ops1"))
+ goto done;
+
+ link2 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops2, cgroup_fd, NULL);
+ if (!ASSERT_OK_PTR(link2, "attach_ops2"))
+ goto done;
+
+ /* query effective: expect 2 entries in attachment order */
+ query_opts.type_id = type_id;
+ query_opts.prog_ids = map_ids;
+ query_opts.count = ARRAY_SIZE(map_ids);
+ query_opts.query_flags = BPF_F_QUERY_EFFECTIVE;
+ ASSERT_OK(bpf_prog_query_opts(cgroup_fd, BPF_STRUCT_OPS, &query_opts),
+ "query_effective");
+ ASSERT_EQ(query_opts.count, 2, "query_effective_count");
+ ASSERT_EQ(map_ids[0], map1_id, "map_ids[0]");
+ ASSERT_EQ(map_ids[1], map2_id, "map_ids[1]");
+
+ /* query attached (non-effective): expect 2 entries */
+ memset(map_ids, 0, sizeof(map_ids));
+ query_opts.query_flags = 0;
+ query_opts.count = ARRAY_SIZE(map_ids);
+ ASSERT_OK(bpf_prog_query_opts(cgroup_fd, BPF_STRUCT_OPS, &query_opts),
+ "query_attached");
+ ASSERT_EQ(query_opts.count, 2, "query_attached_count");
+ ASSERT_EQ(map_ids[0], map1_id, "attached_map_ids[0]");
+ ASSERT_EQ(map_ids[1], map2_id, "attached_map_ids[1]");
+
+done:
+ bpf_link__destroy(link2);
+ bpf_link__destroy(link1);
+}
+
+static void run_query_subtest(void)
+{
+ struct bpf_tcp_ops *skel = NULL;
+ struct netns_obj *ns = NULL;
+ int cgroup_fd;
+
+ cgroup_fd = test__join_cgroup(CGROUP_PATH);
+ if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+ return;
+
+ ns = netns_new(TEST_NETNS, true);
+ if (!ASSERT_OK_PTR(ns, "netns_new"))
+ goto done;
+
+ skel = bpf_tcp_ops__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open_and_load"))
+ goto done;
+
+ test_query(cgroup_fd, skel);
+
+done:
+ bpf_tcp_ops__destroy(skel);
+ netns_free(ns);
+ close(cgroup_fd);
+}
+
+static void run_order_subtest(void)
+{
+ struct bpf_tcp_ops *skel = NULL;
+ struct netns_obj *ns = NULL;
+ int cgroup_fd;
+
+ cgroup_fd = test__join_cgroup(CGROUP_PATH);
+ if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+ return;
+
+ ns = netns_new(TEST_NETNS, true);
+ if (!ASSERT_OK_PTR(ns, "netns_new"))
+ goto done;
+
+ skel = bpf_tcp_ops__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open_and_load"))
+ goto done;
+
+ test_order(cgroup_fd, skel, AF_INET);
+ test_order(cgroup_fd, skel, AF_INET6);
+
+done:
+ bpf_tcp_ops__destroy(skel);
+ netns_free(ns);
+ close(cgroup_fd);
+}
+
+void test_bpf_tcp_ops(void)
+{
+ if (test__start_subtest("query"))
+ run_query_subtest();
+ if (test__start_subtest("order"))
+ run_order_subtest();
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_ops.c b/tools/testing/selftests/bpf/progs/bpf_tcp_ops.c
new file mode 100644
index 000000000000..80590f097143
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_tcp_ops.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define MAX_CGROUP_OPS 8
+
+/* Call order for listen and connect, indexed by call sequence */
+u32 listen_order[MAX_CGROUP_OPS];
+u32 listen_cnt;
+
+u32 connect_order[MAX_CGROUP_OPS];
+u32 connect_cnt;
+
+static void record_listen(int id)
+{
+ u32 idx = listen_cnt;
+
+ if (idx < MAX_CGROUP_OPS) {
+ listen_order[idx] = id;
+ listen_cnt = idx + 1;
+ }
+}
+
+static void record_connect(int id)
+{
+ u32 idx = connect_cnt;
+
+ if (idx < MAX_CGROUP_OPS) {
+ connect_order[idx] = id;
+ connect_cnt = idx + 1;
+ }
+}
+
+/* struct_ops instance 1 */
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops1_listen, struct sock *sk)
+{
+ record_listen(1);
+}
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops1_connect, struct sock *sk)
+{
+ record_connect(1);
+}
+
+SEC(".struct_ops.link")
+struct bpf_tcp_ops tcp_ops1 = {
+ .listen = (void *)tcp_ops1_listen,
+ .connect = (void *)tcp_ops1_connect,
+};
+
+/* struct_ops instance 2 */
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops2_listen, struct sock *sk)
+{
+ record_listen(2);
+}
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops2_connect, struct sock *sk)
+{
+ record_connect(2);
+}
+
+SEC(".struct_ops.link")
+struct bpf_tcp_ops tcp_ops2 = {
+ .listen = (void *)tcp_ops2_listen,
+ .connect = (void *)tcp_ops2_connect,
+};
+
+/* struct_ops instance 3 */
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops3_listen, struct sock *sk)
+{
+ record_listen(3);
+}
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops3_connect, struct sock *sk)
+{
+ record_connect(3);
+}
+
+SEC(".struct_ops.link")
+struct bpf_tcp_ops tcp_ops3 = {
+ .listen = (void *)tcp_ops3_listen,
+ .connect = (void *)tcp_ops3_connect,
+};
+
+char _license[] SEC("license") = "GPL";
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 22+ messages in thread* Re: [RFC PATCH bpf-next 12/12] selftests/bpf: Test attaching struct_ops to a cgroup
2026-05-19 21:58 ` [RFC PATCH bpf-next 12/12] selftests/bpf: Test " Martin KaFai Lau
@ 2026-05-19 23:03 ` sashiko-bot
0 siblings, 0 replies; 22+ messages in thread
From: sashiko-bot @ 2026-05-19 23:03 UTC (permalink / raw)
To: Martin KaFai Lau; +Cc: bpf
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] The return values of `bpf_map_get_info_by_fd()` are not checked, and the `info` struct and `info_len` parameter are reused across successive calls without being reset.
--
commit 7b32bddd7d98a92b1f824ecda2b3ef053e323da3
Author: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue May 19 21:58:19 2026 +0000
selftests/bpf: Test attaching struct_ops to a cgroup
Basic tests for attaching struct_ops to cgroup.
> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c
[ ... ]
> +static void test_query(int cgroup_fd, struct bpf_tcp_ops *skel)
> +{
> + struct bpf_map_info info = {};
> + __u32 info_len = sizeof(info);
> + LIBBPF_OPTS(bpf_prog_query_opts, query_opts);
> + struct bpf_link *link1 = NULL, *link2 = NULL;
> + __u32 map1_id, map2_id, map_ids[2] = {};
> + __s32 type_id;
> +
> + type_id = get_bpf_tcp_ops_type_id();
> + if (type_id <= 0)
> + return;
> +
> + bpf_map_get_info_by_fd(bpf_map__fd(skel->maps.tcp_ops1), &info, &info_len);
> + map1_id = info.id;
> +
> + bpf_map_get_info_by_fd(bpf_map__fd(skel->maps.tcp_ops2), &info, &info_len);
> + map2_id = info.id;
[Severity: Medium]
Are the return values of bpf_map_get_info_by_fd() intentionally left
unchecked? If the function fails, it might lead to confusing test failures.
Additionally, does reusing the info struct and info_len between the two calls
without resetting them pose a risk?
The kernel updates info_len during the first call, so reusing it could be
problematic if sizes differ. Also, if the second call were to fail, info.id
would retain the value from the first call, causing map2_id to incorrectly
equal map1_id.
Would it be better to wrap these calls in the standard assert macros and reset
the variables before the second call?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260519215841.2984970-1-martin.lau@linux.dev?part=12
^ permalink raw reply [flat|nested] 22+ messages in thread