* [RFC PATCH 1/9] bpf/arena: Plumb struct bpf_arena * through PTE callbacks
From: Tejun Heo @ 2026-04-27 10:51 UTC
To: Kumar Kartikeya Dwivedi, Alexei Starovoitov, Emil Tsalapatis,
Eduard Zingerman, Andrii Nakryiko
Cc: David Vernet, Andrea Righi, Changwoo Min, bpf, sched-ext,
linux-kernel
A subsequent change needs the PTE callbacks in bpf_arena to consult
per-arena state. Make struct bpf_arena * reachable from each:
- apply_range_set_cb: add an arena field to apply_range_data. The
data arg can no longer be NULL (it now carries arena), so the
"skip PTE install" sentinel used by populate_pgtable_except_pte()
shifts from data == NULL to data->pages == NULL.
- apply_range_clear_cb: introduce struct apply_range_clear_data
{ arena, free_pages } in place of the bare struct llist_head *
arg.
- existing_page_cb: arena_map_free() passes arena instead of NULL.
The callback doesn't read it yet.
No behavior change.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/bpf/arena.c | 39 +++++++++++++++++++++++++++++----------
1 file changed, 29 insertions(+), 10 deletions(-)
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 08d008cc471e..02249d2514f8 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -114,16 +114,22 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
}
struct apply_range_data {
- struct page **pages;
+ struct bpf_arena *arena;
+ struct page **pages; /* NULL: skip PTE install */
int i;
};
+struct apply_range_clear_data {
+ struct bpf_arena *arena;
+ struct llist_head *free_pages;
+};
+
static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
struct apply_range_data *d = data;
struct page *page;
- if (!data)
+ if (!d->pages)
return 0;
/* sanity check */
if (unlikely(!pte_none(ptep_get(pte))))
@@ -144,8 +150,9 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
flush_cache_vmap(start, start + size);
}
-static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
{
+ struct apply_range_clear_data *d = data;
pte_t old_pte;
struct page *page;
@@ -161,16 +168,18 @@ static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages
pte_clear(&init_mm, addr, pte);
/* Add page to the list so it is freed later */
- if (free_pages)
- __llist_add(&page->pcp_llist, free_pages);
+ if (d->free_pages)
+ __llist_add(&page->pcp_llist, d->free_pages);
return 0;
}
static int populate_pgtable_except_pte(struct bpf_arena *arena)
{
+ struct apply_range_data data = { .arena = arena };
+
return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+ KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, &data);
}
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
@@ -286,7 +295,7 @@ static void arena_map_free(struct bpf_map *map)
* free those pages.
*/
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+ KERN_VM_SZ - GUARD_SZ, existing_page_cb, arena);
free_vm_area(arena->kern_vm);
range_tree_destroy(&arena->rt);
bpf_map_area_free(arena);
@@ -388,7 +397,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
if (ret)
goto out_unlock_sigsegv;
- struct apply_range_data data = { .pages = &page, .i = 0 };
+ struct apply_range_data data = { .arena = arena, .pages = &page, .i = 0 };
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
@@ -569,6 +578,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
bpf_map_memcg_exit(old_memcg, new_memcg);
return 0;
}
+ data.arena = arena;
data.pages = pages;
if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
@@ -696,9 +706,13 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
range_tree_set(&arena->rt, pgoff, page_cnt);
init_llist_head(&free_pages);
+ struct apply_range_clear_data clear_data = {
+ .arena = arena,
+ .free_pages = &free_pages,
+ };
/* clear ptes and collect struct pages */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &clear_data);
/* drop the lock to do the tlb flush and zap pages */
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
@@ -804,6 +818,11 @@ static void arena_free_worker(struct work_struct *work)
arena_vm_start = bpf_arena_get_kern_vm_start(arena);
user_vm_start = bpf_arena_get_user_vm_start(arena);
+ struct apply_range_clear_data clear_data = {
+ .arena = arena,
+ .free_pages = &free_pages,
+ };
+
list = llist_del_all(&arena->free_spans);
llist_for_each(pos, list) {
s = llist_entry(pos, struct arena_free_span, node);
@@ -813,7 +832,7 @@ static void arena_free_worker(struct work_struct *work)
/* clear ptes and collect pages in free_pages llist */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &clear_data);
range_tree_set(&arena->rt, pgoff, page_cnt);
}
--
2.53.0
* [RFC PATCH 2/9] bpf/arena: Add BPF_F_ARENA_MAP_ALWAYS for direct kernel access
From: Tejun Heo @ 2026-04-27 10:51 UTC
To: Kumar Kartikeya Dwivedi, Alexei Starovoitov, Emil Tsalapatis,
Eduard Zingerman, Andrii Nakryiko
Cc: David Vernet, Andrea Righi, Changwoo Min, bpf, sched-ext,
linux-kernel
bpf_arena's kern_vm range is selectively populated: only allocated pages
have PTEs. This catches a narrow class of buggy BPF programs that
dereference unmapped arena addresses, but the protection is shallow - within
the allocated set there are countless ways for a buggy program to corrupt
arena memory.
It does, however, impose a cost on kernel-side accesses. A kfunc or
struct_ops callback that wants to consume an arena pointer cannot simply
load through it; the page may have been freed underneath, so the access has
to go through copy_from_kernel_nofault(). Out-parameter writes currently
have no equivalent.
Arena is becoming the primary memory model for BPF programs, and more kfunc
/ struct_ops surfaces will want to read and write arena memory directly. The
actual answer for catching arena memory bugs is arena ASAN, which addresses
all memory access bugs meaningfully. Given that, it's worth offering an
opt-in mode that drops the partial fault protection in exchange for cheap
direct kernel-side access.
Add BPF_F_ARENA_MAP_ALWAYS. Arenas created with this flag allocate a
per-arena "garbage" page and pre-populate every PTE in the kern_vm range to
point at it. arena_alloc_pages() replaces the garbage PTE with a real page;
arena_free_pages() restores the garbage PTE instead of clearing.
arena_vm_fault() ignores the garbage page so user-side fault semantics are
unchanged.
Stores into garbage-backed addresses are silently absorbed; loads return
indeterminate bytes. Userspace mappings are unaffected. The flag is opt-in -
arenas without it behave exactly as before.
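For illustration only, a sketch of what the flag buys a kernel-side consumer
(struct foo, its val field and both helpers are made up; only
copy_from_kernel_nofault() is an existing primitive):

  struct foo { u64 val; };	/* made-up arena-resident layout */

  /* Without BPF_F_ARENA_MAP_ALWAYS the page behind an arena pointer may be
   * unmapped or already freed, so the kernel must use the nofault copy. */
  static u64 read_arena_val(struct foo *obj)
  {
          u64 v;

          if (copy_from_kernel_nofault(&v, &obj->val, sizeof(v)))
                  return 0;	/* PTE not populated */
          return v;
  }

  /* With BPF_F_ARENA_MAP_ALWAYS every PTE in the kern_vm range is populated
   * (real page or the shared garbage page), so a plain load/store can't
   * fault.  It may read indeterminate bytes, but never oopses. */
  static u64 read_arena_val_always(struct foo *obj)
  {
          return obj->val;
  }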
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/uapi/linux/bpf.h | 7 +++++
kernel/bpf/arena.c | 62 ++++++++++++++++++++++++++++++++++++----
2 files changed, 64 insertions(+), 5 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 552bc5d9afbd..2bd7f2a31a0f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1456,6 +1456,13 @@ enum {
/* Enable BPF ringbuf overwrite mode */
BPF_F_RB_OVERWRITE = (1U << 19),
+
+/* Keep every kernel-side PTE in a BPF_MAP_TYPE_ARENA backed by a per-arena
+ * "garbage" page so that kernel-side accesses anywhere in the arena's 4G range
+ * never fault. Loads from unallocated or freed regions return indeterminate
+ * bytes; stores are silently absorbed. Userspace mappings are unaffected.
+ */
+ BPF_F_ARENA_MAP_ALWAYS = (1U << 20),
};
/* Flags for BPF_PROG_QUERY. */
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 02249d2514f8..4e480c2f3786 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -62,6 +62,8 @@ struct bpf_arena {
struct irq_work free_irq;
struct work_struct free_work;
struct llist_head free_spans;
+ /* BPF_F_ARENA_MAP_ALWAYS fallback page; NULL if the flag is off */
+ struct page *garbage_page;
};
static void arena_free_worker(struct work_struct *work);
@@ -127,12 +129,14 @@ struct apply_range_clear_data {
static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
struct apply_range_data *d = data;
+ pte_t old_pte;
struct page *page;
if (!d->pages)
return 0;
- /* sanity check */
- if (unlikely(!pte_none(ptep_get(pte))))
+ /* slot must be empty, or point to garbage if MAP_ALWAYS */
+ old_pte = ptep_get(pte);
+ if (unlikely(!pte_none(old_pte) && pte_page(old_pte) != d->arena->garbage_page))
return -EBUSY;
page = d->pages[d->i];
@@ -153,6 +157,7 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
{
struct apply_range_clear_data *d = data;
+ struct page *garbage = d->arena->garbage_page;
pte_t old_pte;
struct page *page;
@@ -165,7 +170,14 @@ static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
if (WARN_ON_ONCE(!page))
return -EINVAL;
- pte_clear(&init_mm, addr, pte);
+ if (garbage) {
+ /* if already cleared, must not free the shared garbage page */
+ if (page == garbage)
+ return 0;
+ set_pte_at(&init_mm, addr, pte, mk_pte(garbage, PAGE_KERNEL));
+ } else {
+ pte_clear(&init_mm, addr, pte);
+ }
/* Add page to the list so it is freed later */
if (d->free_pages)
@@ -182,6 +194,21 @@ static int populate_pgtable_except_pte(struct bpf_arena *arena)
KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, &data);
}
+static int populate_garbage_pte_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ struct page *garbage = data;
+
+ set_pte_at(&init_mm, addr, pte, mk_pte(garbage, PAGE_KERNEL));
+ return 0;
+}
+
+static int populate_pgtable_with_garbage(struct bpf_arena *arena)
+{
+ return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+ KERN_VM_SZ - GUARD_SZ, populate_garbage_pte_cb,
+ arena->garbage_page);
+}
+
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
struct vm_struct *kern_vm;
@@ -197,7 +224,8 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
/* BPF_F_MMAPABLE must be set */
!(attr->map_flags & BPF_F_MMAPABLE) ||
/* No unsupported flags present */
- (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV)))
+ (attr->map_flags & ~(BPF_F_SEGV_ON_FAULT | BPF_F_MMAPABLE | BPF_F_NO_USER_CONV |
+ BPF_F_ARENA_MAP_ALWAYS)))
return ERR_PTR(-EINVAL);
if (attr->map_extra & ~PAGE_MASK)
@@ -245,7 +273,23 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
goto err;
}
+ if (attr->map_flags & BPF_F_ARENA_MAP_ALWAYS) {
+ arena->garbage_page = alloc_page(GFP_KERNEL);
+ if (!arena->garbage_page) {
+ err = -ENOMEM;
+ goto err_free_arena;
+ }
+ err = populate_pgtable_with_garbage(arena);
+ if (err)
+ goto err_free_garbage;
+ }
+
return &arena->map;
+err_free_garbage:
+ __free_page(arena->garbage_page);
+err_free_arena:
+ range_tree_destroy(&arena->rt);
+ bpf_map_area_free(arena);
err:
free_vm_area(kern_vm);
return ERR_PTR(err);
@@ -253,6 +297,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
+ struct bpf_arena *arena = data;
struct page *page;
pte_t pte;
@@ -260,6 +305,9 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
if (!pte_present(pte)) /* sanity check */
return 0;
page = pte_page(pte);
+ /* garbage is shared and will be freed once later */
+ if (page == arena->garbage_page)
+ return 0;
/*
* We do not update pte here:
* 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
@@ -297,6 +345,8 @@ static void arena_map_free(struct bpf_map *map)
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
KERN_VM_SZ - GUARD_SZ, existing_page_cb, arena);
free_vm_area(arena->kern_vm);
+ if (arena->garbage_page)
+ __free_page(arena->garbage_page);
range_tree_destroy(&arena->rt);
bpf_map_area_free(arena);
}
@@ -383,8 +433,10 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
return VM_FAULT_RETRY;
page = vmalloc_to_page((void *)kaddr);
+ if (page == arena->garbage_page)
+ page = NULL;
if (page)
- /* already have a page vmap-ed */
+ /* already have a real page vmap-ed */
goto out;
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
--
2.53.0
* [RFC PATCH 3/9] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers
From: Tejun Heo @ 2026-04-27 10:51 UTC
To: Kumar Kartikeya Dwivedi, Alexei Starovoitov, Emil Tsalapatis,
Eduard Zingerman, Andrii Nakryiko
Cc: David Vernet, Andrea Righi, Changwoo Min, bpf, sched-ext,
linux-kernel
The existing kernel-side export of bpf_arena_alloc_pages is _non_sleepable
only - it's used by the verifier to inline the kfunc when the call site is
non-sleepable. There is no sleepable equivalent for kernel callers; the
kfunc bpf_arena_alloc_pages itself is BPF-only.
sched_ext needs sleepable kernel-side allocs for its arena pool init/grow
paths. Add bpf_arena_alloc_pages_sleepable() mirroring the _non_sleepable
wrapper but passing sleepable=true to arena_alloc_pages().
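A minimal usage sketch from a hypothetical kernel caller (arena_map and the
page count are assumptions; the returned pointer is the arena-side address,
see the later sched_ext patches for the kern_vm translation):

  /* sleepable context required: the allocation path may sleep */
  void *uaddr = bpf_arena_alloc_pages_sleepable(arena_map, NULL /* any addr */,
                                                4 /* pages */, NUMA_NO_NODE, 0);
  if (!uaddr)
          return -ENOMEM;
  /* release with bpf_arena_free_pages_non_sleepable(arena_map, uaddr, 4) */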
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/bpf.h | 8 ++++++++
kernel/bpf/arena.c | 13 +++++++++++++
2 files changed, 21 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0136a108d083..af54705611d7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -678,6 +678,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
u64 flags);
void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt);
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
+ u64 flags);
#else
static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
int node_id, u64 flags)
@@ -688,6 +690,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr
static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
{
}
+
+static inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ return NULL;
+}
#endif
extern const struct bpf_map_ops bpf_map_offload_ops;
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 4e480c2f3786..73e43617761c 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -947,6 +947,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
}
+
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+ return NULL;
+
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+}
+
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
struct bpf_map *map = p__map;
--
2.53.0
* [RFC PATCH 4/9] bpf: Add bpf_struct_ops_for_each_prog()
From: Tejun Heo @ 2026-04-27 10:51 UTC
To: Kumar Kartikeya Dwivedi, Alexei Starovoitov, Emil Tsalapatis,
Eduard Zingerman, Andrii Nakryiko
Cc: David Vernet, Andrea Righi, Changwoo Min, bpf, sched-ext,
linux-kernel
Add a helper that walks the member progs of the struct_ops map
containing a given @kdata vmtable. struct_ops ->reg() callbacks (and
similar) sometimes need to inspect the loaded BPF programs, e.g. to
discover maps they reference via prog->aux->used_maps.
The implementation mirrors bpf_struct_ops_id(): container_of @kdata
to recover the bpf_struct_ops_map, then iterate st_map->links[i]->prog
for i in [0, funcs_cnt). Same access pattern, no new locking - by the
time ->reg() fires st_map is fully populated and stable.
A sched_ext follow-up uses this to require cid-form schedulers to use
exactly one BPF_F_ARENA_MAP_ALWAYS arena across their member progs,
without requiring the BPF program to call a registration kfunc.
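A minimal, hypothetical ->reg() sketch (example_reg() and the counting
callback are made up; only the iteration helper is added by this patch):

  static int count_prog_cb(struct bpf_prog *prog, void *data)
  {
          (*(int *)data)++;
          return 0;	/* non-zero would stop the walk */
  }

  static int example_reg(void *kdata, struct bpf_link *link)
  {
          int nr_progs = 0;

          bpf_struct_ops_for_each_prog(kdata, count_prog_cb, &nr_progs);
          pr_info("struct_ops registered with %d member progs\n", nr_progs);
          return 0;
  }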
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/bpf.h | 3 +++
kernel/bpf/bpf_struct_ops.c | 36 ++++++++++++++++++++++++++++++++++++
2 files changed, 39 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index af54705611d7..f4e4360b81f6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2128,6 +2128,9 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map);
void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog);
void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux);
u32 bpf_struct_ops_id(const void *kdata);
+int bpf_struct_ops_for_each_prog(const void *kdata,
+ int (*cb)(struct bpf_prog *prog, void *data),
+ void *data);
#ifdef CONFIG_NET
/* Define it here to avoid the use of forward declaration */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 05b366b821c3..16aec18ed31b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1203,6 +1203,42 @@ u32 bpf_struct_ops_id(const void *kdata)
}
EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
+/**
+ * bpf_struct_ops_for_each_prog - Invoke @cb for each member prog
+ * @kdata: kernel-side struct_ops vmtable (the @kdata arg to ->reg/->update/->unreg)
+ * @cb: callback invoked once per member prog; non-zero return stops iteration
+ * @data: opaque argument passed to @cb
+ *
+ * Walks the struct_ops member progs registered on the map containing @kdata.
+ * Intended for use from struct_ops ->reg() callbacks (and similar) that need to
+ * inspect the loaded BPF programs (for example to discover maps they reference
+ * via @prog->aux->used_maps).
+ *
+ * Return 0 if iteration completed, otherwise the first non-zero @cb return.
+ */
+int bpf_struct_ops_for_each_prog(const void *kdata,
+ int (*cb)(struct bpf_prog *prog, void *data),
+ void *data)
+{
+ struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ u32 i;
+ int ret;
+
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+ for (i = 0; i < st_map->funcs_cnt; i++) {
+ if (!st_map->links[i])
+ continue;
+ ret = cb(st_map->links[i]->prog, data);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog);
+
static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
--
2.53.0
* [RFC PATCH 5/9] bpf: Add bpf_prog_for_each_used_map()
From: Tejun Heo @ 2026-04-27 10:51 UTC
To: Kumar Kartikeya Dwivedi, Alexei Starovoitov, Emil Tsalapatis,
Eduard Zingerman, Andrii Nakryiko
Cc: David Vernet, Andrea Righi, Changwoo Min, bpf, sched-ext,
linux-kernel
Wrap the prog->aux->used_maps[] walk and its used_maps_mutex behind a
helper. Existing in-tree callers open-code the same lock + iterate pattern
(e.g. bpf_check_tail_call in core.c, the verifier and syscall paths); a
sched_ext follow-up needs the same loop and would otherwise reach into
bpf_prog_aux directly.
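A hypothetical caller sketch showing the callback contract (the logging
helper is made up for illustration):

  static int log_map_cb(struct bpf_map *map, void *data)
  {
          pr_info("prog references map %s (type %d)\n", map->name, map->map_type);
          return 0;	/* keep iterating */
  }

  static void log_used_maps(struct bpf_prog *prog)
  {
          bpf_prog_for_each_used_map(prog, log_map_cb, NULL);
  }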
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/bpf.h | 3 +++
kernel/bpf/core.c | 29 +++++++++++++++++++++++++++++
2 files changed, 32 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f4e4360b81f6..587e5ff387bf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2338,6 +2338,9 @@ static inline bool map_type_contains_progs(struct bpf_map *map)
bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp);
int bpf_prog_calc_tag(struct bpf_prog *fp);
+int bpf_prog_for_each_used_map(struct bpf_prog *prog,
+ int (*cb)(struct bpf_map *map, void *data),
+ void *data);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 066b86e7233c..aa590a817176 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2510,6 +2510,35 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
return ret;
}
+/**
+ * bpf_prog_for_each_used_map - Invoke @cb for each map @prog references
+ * @prog: BPF program whose used_maps to walk
+ * @cb: callback invoked once per map; non-zero return stops iteration
+ * @data: opaque argument passed to @cb
+ *
+ * Holds prog->aux->used_maps_mutex across the walk.
+ *
+ * Return 0 if iteration completed, otherwise the first non-zero @cb return.
+ */
+int bpf_prog_for_each_used_map(struct bpf_prog *prog,
+ int (*cb)(struct bpf_map *map, void *data),
+ void *data)
+{
+ struct bpf_prog_aux *aux = prog->aux;
+ int ret = 0;
+ u32 i;
+
+ mutex_lock(&aux->used_maps_mutex);
+ for (i = 0; i < aux->used_map_cnt; i++) {
+ ret = cb(aux->used_maps[i], data);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&aux->used_maps_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_for_each_used_map);
+
static bool bpf_prog_select_interpreter(struct bpf_prog *fp)
{
bool select_interpreter = false;
--
2.53.0
* [RFC PATCH 6/9] bpf/arena: Add bpf_arena_map_kern_vm_start()
From: Tejun Heo @ 2026-04-27 10:51 UTC
To: Kumar Kartikeya Dwivedi, Alexei Starovoitov, Emil Tsalapatis,
Eduard Zingerman, Andrii Nakryiko
Cc: David Vernet, Andrea Righi, Changwoo Min, bpf, sched-ext,
linux-kernel
bpf_arena_get_kern_vm_start() takes a struct bpf_arena *. The struct
itself isn't exposed, so callers outside arena.c that hold only a
struct bpf_map * (e.g. struct_ops subsystems handling an arena map fd)
can't reach it.
Add bpf_arena_map_kern_vm_start() which takes struct bpf_map * and
container_of()s to the arena, with a type check. A sched_ext follow-up
needs this to translate kern_va <-> uaddr for arena pages it claims.
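For illustration, the translation a holder of only the map pointer can now
do (uaddr is an arena address obtained elsewhere; the low-32-bit arithmetic
matches how the 4G arena range is laid out and is used the same way in the
later sched_ext patches):

  u64 kern_vm_start = bpf_arena_map_kern_vm_start(arena_map);
  void *kern_va = (void *)(kern_vm_start + (u32)(unsigned long)uaddr);

  /* kern_va and uaddr now name the same page from the kernel and BPF sides */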
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/bpf.h | 1 +
kernel/bpf/arena.c | 13 +++++++++++++
2 files changed, 14 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 587e5ff387bf..1614af65a2d1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -617,6 +617,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
struct bpf_spin_lock *spin_lock);
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map);
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
struct bpf_offload_dev;
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 73e43617761c..d67a6000221d 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -85,6 +85,19 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
return arena ? arena->user_vm_start : 0;
}
+/**
+ * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map *
+ * @map: a BPF_MAP_TYPE_ARENA map
+ *
+ * Return @map's kern_vm_start, or 0 (with WARN) if @map isn't an arena.
+ */
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map)
+{
+ if (WARN_ON_ONCE(!map || map->map_type != BPF_MAP_TYPE_ARENA))
+ return 0;
+ return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map));
+}
+
static long arena_map_peek_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
--
2.53.0
* [RFC PATCH 7/9] sched_ext: Require MAP_ALWAYS arena for cid-form schedulers
From: Tejun Heo @ 2026-04-27 10:51 UTC
To: Kumar Kartikeya Dwivedi, Alexei Starovoitov, Emil Tsalapatis,
Eduard Zingerman, Andrii Nakryiko
Cc: David Vernet, Andrea Righi, Changwoo Min, bpf, sched-ext,
linux-kernel
Upcoming patches will let the kernel place arena-resident scratch
shared with the BPF program (e.g. per-CPU set_cmask cmask) so the
BPF side can dereference it directly via __arena pointers, replacing
the current cmask_copy_from_kernel() probe-read loop. That requires
each cid-form scheduler to expose its arena to the kernel and to opt
into BPF_F_ARENA_MAP_ALWAYS so kernel-side stores never fault.
bpf_scx_reg_cid() walks the struct_ops member progs via the new
bpf_struct_ops_for_each_prog() helper and discovers the arena from
prog->aux->used_maps. It requires exactly one BPF_MAP_TYPE_ARENA
across all member progs and rejects if BPF_F_ARENA_MAP_ALWAYS is not
set. The map ref is held on scx_sched and dropped on sched destroy.
cpu-form schedulers (bpf_scx_reg) are unchanged - no arena
requirement.
scx_qmap adds BPF_F_ARENA_MAP_ALWAYS to its arena map definition.
v2: Defer sch->arena_map = cmd->arena_map consumption past
scx_alloc_and_add_sched() failure points so an early kzalloc/kstrdup
failure leaves cmd->arena_map set; bpf_scx_reg_cid() then drops the
ref via the existing cmd.arena_map cleanup.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 59 +++++++++++++++++++++++++++++++++-
kernel/sched/ext_internal.h | 9 ++++++
tools/sched_ext/scx_qmap.bpf.c | 2 +-
3 files changed, 68 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index a078cd4225c1..835ac505f991 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4916,6 +4916,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
+ if (sch->arena_map)
+ bpf_map_put(sch->arena_map);
kfree(sch);
}
@@ -6588,6 +6590,7 @@ struct scx_enable_cmd {
struct sched_ext_ops_cid *ops_cid;
};
bool is_cid_type;
+ struct bpf_map *arena_map; /* arena ref to transfer to sch */
int ret;
};
@@ -6751,6 +6754,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
return ERR_PTR(ret);
}
#endif /* CONFIG_EXT_SUB_SCHED */
+
+ /*
+ * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
+ * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
+ * drops the ref. After this point, sch owns the ref and any cleanup
+ * runs through scx_sched_free_rcu_work() which puts it.
+ */
+ sch->arena_map = cmd->arena_map;
+ cmd->arena_map = NULL;
return sch;
err_free_lb_resched:
@@ -7676,11 +7688,56 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
return scx_enable(&cmd, link);
}
+struct scx_arena_scan {
+ struct bpf_map *arena;
+ int err;
+};
+
+static int scx_arena_scan_map(struct bpf_map *m, void *data)
+{
+ struct scx_arena_scan *s = data;
+
+ if (m->map_type != BPF_MAP_TYPE_ARENA)
+ return 0;
+ if (s->arena && s->arena != m) {
+ s->err = -EINVAL;
+ return 1;
+ }
+ s->arena = m;
+ return 0;
+}
+
+static int scx_arena_scan_prog(struct bpf_prog *prog, void *data)
+{
+ return bpf_prog_for_each_used_map(prog, scx_arena_scan_map, data);
+}
+
static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link)
{
struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true };
+ struct scx_arena_scan scan = {};
+ int ret;
- return scx_enable(&cmd, link);
+ bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan);
+ if (scan.err) {
+ pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n");
+ return scan.err;
+ }
+ if (!scan.arena) {
+ pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n");
+ return -EINVAL;
+ }
+ if (!(scan.arena->map_flags & BPF_F_ARENA_MAP_ALWAYS)) {
+ pr_err("sched_ext: arena map requires BPF_F_ARENA_MAP_ALWAYS for cid-form\n");
+ return -EINVAL;
+ }
+
+ bpf_map_inc(scan.arena);
+ cmd.arena_map = scan.arena;
+ ret = scx_enable(&cmd, link);
+ if (cmd.arena_map) /* not consumed by scx_alloc_and_add_sched() */
+ bpf_map_put(cmd.arena_map);
+ return ret;
}
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index e5f52986d317..bcffbc32541c 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1102,6 +1102,15 @@ struct scx_sched {
struct sched_ext_ops_cid ops_cid;
};
bool is_cid_type; /* true if registered via bpf_sched_ext_ops_cid */
+
+ /*
+ * Arena map auto-discovered from member progs at struct_ops attach.
+ * cid-form schedulers must use exactly one arena with
+ * BPF_F_ARENA_MAP_ALWAYS to enable direct arena access from kernel
+ * side. NULL on cpu-form.
+ */
+ struct bpf_map *arena_map;
+
DECLARE_BITMAP(has_op, SCX_OPI_END);
/*
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 2ffea8a93217..edce734c3019 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -83,7 +83,7 @@ UEI_DEFINE(uei);
*/
struct {
__uint(type, BPF_MAP_TYPE_ARENA);
- __uint(map_flags, BPF_F_MMAPABLE);
+ __uint(map_flags, BPF_F_MMAPABLE | BPF_F_ARENA_MAP_ALWAYS);
__uint(max_entries, 1 << 16); /* upper bound in pages */
#if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
__ulong(map_extra, 0x1ull << 32); /* user/BPF mmap base */
--
2.53.0
* [RFC PATCH 8/9] sched_ext: Sub-allocator over kernel-claimed BPF arena pages
From: Tejun Heo @ 2026-04-27 10:51 UTC
To: Kumar Kartikeya Dwivedi, Alexei Starovoitov, Emil Tsalapatis,
Eduard Zingerman, Andrii Nakryiko
Cc: David Vernet, Andrea Righi, Changwoo Min, bpf, sched-ext,
linux-kernel
Build a per-scheduler sub-allocator on top of pages claimed from the BPF
arena registered in the previous patch. Subsequent kernel-managed
arena-resident structures (e.g. per-CPU set_cmask cmask) carve their storage
from this pool.
scx_arena_pool_init() creates a gen_pool. scx_arena_alloc() returns a kernel
VA and the matching BPF-arena uaddr in *@uaddr_out. On exhaustion, the pool
grows by claiming more pages via bpf_arena_alloc_pages_sleepable(). Each
chunk is added to the gen_pool with kern_va as the "virt" and uaddr as the
"phys", so gen_pool_virt_to_phys() recovers the uaddr for handing to BPF.
Allocations sleep (GFP_KERNEL) - they may grow the pool through vzalloc and
arena page allocation. All current consumers run from the enable path (after
ops.init() and the kernel-side arena auto-discovery, before validate_ops()),
where sleeping is fine.
scx_arena_pool_destroy() walks each chunk, returns outstanding ranges to the
gen_pool with gen_pool_free() and then calls gen_pool_destroy(). The
underlying arena pages are released when the arena map itself is torn down,
so the pool destroy doesn't free them explicitly.
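A minimal consumer sketch (struct foo is made up; the next patch's
set_cmask scratch follows exactly this pattern):

  struct foo { u64 field; };	/* made-up arena-resident layout */

  u32 uaddr;
  struct foo *obj = scx_arena_alloc(sch, sizeof(*obj), &uaddr);

  if (!obj)
          return -ENOMEM;
  obj->field = 42;	/* kernel writes through the kern_va side */
  /* hand (struct foo __arena *)(unsigned long)uaddr to the BPF program;
   * scx_arena_free(sch, obj, sizeof(*obj)) returns the range to the pool */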
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/build_policy.c | 4 ++
kernel/sched/ext.c | 11 ++++
kernel/sched/ext_arena.c | 128 ++++++++++++++++++++++++++++++++++++
kernel/sched/ext_arena.h | 18 +++++
kernel/sched/ext_internal.h | 6 ++
5 files changed, 167 insertions(+)
create mode 100644 kernel/sched/ext_arena.c
create mode 100644 kernel/sched/ext_arena.h
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 5e76c9177d54..067979a7b69e 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -59,12 +59,16 @@
#ifdef CONFIG_SCHED_CLASS_EXT
# include <linux/btf_ids.h>
+# include <linux/find.h>
+# include <linux/genalloc.h>
# include "ext_types.h"
# include "ext_internal.h"
# include "ext_cid.h"
+# include "ext_arena.h"
# include "ext_idle.h"
# include "ext.c"
# include "ext_cid.c"
+# include "ext_arena.c"
# include "ext_idle.c"
#endif
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 835ac505f991..27c2b4df79d5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4916,6 +4916,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
+ scx_arena_pool_destroy(sch);
if (sch->arena_map)
bpf_map_put(sch->arena_map);
kfree(sch);
@@ -6975,6 +6976,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
+ ret = scx_arena_pool_init(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
@@ -7264,6 +7271,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
+ ret = scx_arena_pool_init(sch);
+ if (ret)
+ goto err_disable;
+
if (validate_ops(sch, ops))
goto err_disable;
diff --git a/kernel/sched/ext_arena.c b/kernel/sched/ext_arena.c
new file mode 100644
index 000000000000..561cfe5418ff
--- /dev/null
+++ b/kernel/sched/ext_arena.c
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * scx_arena_pool: kernel-side sub-allocator over BPF-arena pages.
+ *
+ * Each chunk added to @sch->arena_pool comes from one
+ * bpf_arena_alloc_pages_sleepable() call. We add it to gen_pool with the
+ * kernel-VA as the "virt" address and the matching BPF uaddr as the "phys" so
+ * gen_pool_virt_to_phys() recovers the uaddr for handing back to BPF.
+ *
+ * Allocations grow the pool on demand. Underlying arena pages are released
+ * when the arena map itself is torn down.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+
+enum scx_arena_consts {
+ SCX_ARENA_MIN_ORDER = 3, /* 8-byte minimum sub-allocation */
+ SCX_ARENA_GROW_PAGES = 4, /* per growth */
+};
+
+s32 scx_arena_pool_init(struct scx_sched *sch)
+{
+ if (!sch->arena_map)
+ return 0;
+
+ sch->arena_pool = gen_pool_create(SCX_ARENA_MIN_ORDER, NUMA_NO_NODE);
+ if (!sch->arena_pool)
+ return -ENOMEM;
+ return 0;
+}
+
+static void scx_arena_clear_chunk(struct gen_pool *pool, struct gen_pool_chunk *chunk,
+ void *data)
+{
+ int order = pool->min_alloc_order;
+ size_t chunk_sz = chunk->end_addr - chunk->start_addr + 1;
+ unsigned long end_bit = chunk_sz >> order;
+ unsigned long b, e;
+
+ for_each_set_bitrange(b, e, chunk->bits, end_bit)
+ gen_pool_free(pool, chunk->start_addr + (b << order),
+ (e - b) << order);
+}
+
+/*
+ * Tear down the pool. Outstanding gen_pool allocations are freed via
+ * scx_arena_clear_chunk() so gen_pool_destroy() doesn't BUG. The underlying
+ * arena pages are released when the arena map itself is torn down.
+ */
+void scx_arena_pool_destroy(struct scx_sched *sch)
+{
+ if (!sch->arena_pool)
+ return;
+ gen_pool_for_each_chunk(sch->arena_pool, scx_arena_clear_chunk, NULL);
+ gen_pool_destroy(sch->arena_pool);
+ sch->arena_pool = NULL;
+}
+
+/*
+ * Grow the pool by @page_cnt pages. bpf_arena_alloc_pages_sleepable() and
+ * gen_pool_add_virt() (which calls vzalloc(GFP_KERNEL)) require a sleepable
+ * context.
+ */
+static int scx_arena_grow(struct scx_sched *sch, u32 page_cnt)
+{
+ u64 kern_vm_start;
+ u32 uaddr32;
+ void *p;
+ int ret;
+
+ if (!sch->arena_map || !sch->arena_pool)
+ return -EINVAL;
+
+ p = bpf_arena_alloc_pages_sleepable(sch->arena_map, NULL,
+ page_cnt, NUMA_NO_NODE, 0);
+ if (!p)
+ return -ENOMEM;
+
+ uaddr32 = (u32)(unsigned long)p;
+ kern_vm_start = bpf_arena_map_kern_vm_start(sch->arena_map);
+
+ ret = gen_pool_add_virt(sch->arena_pool, kern_vm_start + uaddr32,
+ uaddr32, page_cnt * PAGE_SIZE, NUMA_NO_NODE);
+ if (ret) {
+ bpf_arena_free_pages_non_sleepable(sch->arena_map, p, page_cnt);
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Allocate @size bytes from the arena pool. Returns kernel VA on success, NULL
+ * on failure. *@uaddr_out gets the BPF-arena address. May grow the pool via
+ * scx_arena_grow() which sleeps. Caller must be in a GFP_KERNEL context.
+ */
+void *scx_arena_alloc(struct scx_sched *sch, size_t size, u32 *uaddr_out)
+{
+ unsigned long kern_va;
+ u32 page_cnt;
+
+ might_sleep();
+
+ if (!sch->arena_pool)
+ return NULL;
+
+ kern_va = gen_pool_alloc(sch->arena_pool, size);
+ if (!kern_va) {
+ page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES,
+ (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ if (scx_arena_grow(sch, page_cnt))
+ return NULL;
+ kern_va = gen_pool_alloc(sch->arena_pool, size);
+ if (!kern_va)
+ return NULL;
+ }
+
+ *uaddr_out = (u32)gen_pool_virt_to_phys(sch->arena_pool, kern_va);
+ return (void *)kern_va;
+}
+
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size)
+{
+ if (sch->arena_pool && kern_va)
+ gen_pool_free(sch->arena_pool, (unsigned long)kern_va, size);
+}
diff --git a/kernel/sched/ext_arena.h b/kernel/sched/ext_arena.h
new file mode 100644
index 000000000000..d21b2e3fac93
--- /dev/null
+++ b/kernel/sched/ext_arena.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_ARENA_H
+#define _KERNEL_SCHED_EXT_ARENA_H
+
+struct scx_sched;
+
+s32 scx_arena_pool_init(struct scx_sched *sch);
+void scx_arena_pool_destroy(struct scx_sched *sch);
+void *scx_arena_alloc(struct scx_sched *sch, size_t size, u32 *uaddr_out);
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size);
+
+#endif /* _KERNEL_SCHED_EXT_ARENA_H */
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index bcffbc32541c..56d99e749c9d 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1108,8 +1108,14 @@ struct scx_sched {
* cid-form schedulers must use exactly one arena with
* BPF_F_ARENA_MAP_ALWAYS to enable direct arena access from kernel
* side. NULL on cpu-form.
+ *
+ * @arena_pool sub-allocates @arena_map. Each gen_pool chunk is added
+ * with kern_va as the "virt" address and the matching BPF uaddr as the
+ * "phys", so gen_pool_virt_to_phys() recovers the uaddr for handing to
+ * BPF. Grows on demand and pages are not released until sched destroy.
*/
struct bpf_map *arena_map;
+ struct gen_pool *arena_pool;
DECLARE_BITMAP(has_op, SCX_OPI_END);
--
2.53.0
* [RFC PATCH 9/9] sched_ext: Convert ops.set_cmask() to arena-resident cmask
From: Tejun Heo @ 2026-04-27 10:51 UTC
To: Kumar Kartikeya Dwivedi, Alexei Starovoitov, Emil Tsalapatis,
Eduard Zingerman, Andrii Nakryiko
Cc: David Vernet, Andrea Righi, Changwoo Min, bpf, sched-ext,
linux-kernel
ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
arena, so it translated cpumask -> cmask in kernel memory and passed the
result as a trusted pointer. The BPF cmask helpers all operate on arena
cmasks though, so the BPF side had to word-by-word probe-read the kernel
cmask into an arena cmask via cmask_copy_from_kernel() before any helper
could touch it. It works, but is clumsy.
With direct kernel-side arena access now in place, build the cmask in the
arena. The kernel writes to it through the kern_va side of the dual mapping;
BPF directly dereferences it via an __arena pointer like any other arena
struct.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 67 +++++++++++++++++++++++++--
kernel/sched/ext_cid.c | 16 +------
kernel/sched/ext_internal.h | 10 +++-
kernel/sched/ext_types.h | 10 ++++
tools/sched_ext/include/scx/cid.bpf.h | 44 ------------------
tools/sched_ext/scx_qmap.bpf.c | 6 ++-
6 files changed, 86 insertions(+), 67 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 27c2b4df79d5..30e29853edd0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -622,11 +622,15 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
update_locked_rq(rq);
if (scx_is_cid_type()) {
- struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
+ struct scx_cmask_scratch *s = this_cpu_ptr(sch->set_cmask_scratch);
- lockdep_assert_irqs_disabled();
- scx_cpumask_to_cmask(cpumask, cmask);
- sch->ops_cid.set_cmask(task, cmask);
+ /*
+ * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
+ * holds the rq lock with IRQs disabled, which makes us the sole
+ * user of the scratch area.
+ */
+ scx_cpumask_to_cmask(cpumask, s->kern_va);
+ sch->ops_cid.set_cmask(task, (struct scx_cmask *)(unsigned long)s->uaddr);
} else {
sch->ops.set_cpumask(task, cpumask);
}
@@ -4864,6 +4868,47 @@ static const struct attribute_group scx_global_attr_group = {
static void free_pnode(struct scx_sched_pnode *pnode);
static void free_exit_info(struct scx_exit_info *ei);
+/* Byte size of a struct scx_cmask covering num_possible_cpus(). Set at boot. */
+static size_t scx_possible_cmask_size;
+
+static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
+{
+ int cpu;
+
+ if (!sch->is_cid_type || !sch->arena_pool)
+ return 0;
+
+ sch->set_cmask_scratch = alloc_percpu(struct scx_cmask_scratch);
+ if (!sch->set_cmask_scratch)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_cmask_scratch *s = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+ s->kern_va = scx_arena_alloc(sch, scx_possible_cmask_size, &s->uaddr);
+ if (!s->kern_va)
+ return -ENOMEM;
+ scx_cmask_init(s->kern_va, 0, num_possible_cpus());
+ }
+ return 0;
+}
+
+static void scx_set_cmask_scratch_free(struct scx_sched *sch)
+{
+ int cpu;
+
+ if (!sch->set_cmask_scratch)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_cmask_scratch *s = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+ scx_arena_free(sch, s->kern_va, scx_possible_cmask_size);
+ }
+ free_percpu(sch->set_cmask_scratch);
+ sch->set_cmask_scratch = NULL;
+}
+
static void scx_sched_free_rcu_work(struct work_struct *work)
{
struct rcu_work *rcu_work = to_rcu_work(work);
@@ -4916,6 +4961,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
+ scx_set_cmask_scratch_free(sch);
scx_arena_pool_destroy(sch);
if (sch->arena_map)
bpf_map_put(sch->arena_map);
@@ -6982,6 +7028,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
goto err_disable;
}
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
@@ -7275,6 +7327,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
if (ret)
goto err_disable;
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret)
+ goto err_disable;
+
if (validate_ops(sch, ops))
goto err_disable;
@@ -8202,6 +8258,9 @@ void __init init_sched_ext_class(void)
WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
SCX_TG_ONLINE);
+ scx_possible_cmask_size = struct_size_t(struct scx_cmask, bits,
+ SCX_CMASK_NR_WORDS(num_possible_cpus()));
+
scx_idle_init_masks();
for_each_possible_cpu(cpu) {
diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index 71f7ef572eac..7ae251f20a13 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -7,14 +7,6 @@
*/
#include <linux/cacheinfo.h>
-/*
- * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
- * cmask from a cpumask. Allocated alongside the cid arrays on first enable
- * and never freed. Sized to the full cid space. Caller holds rq lock so
- * this_cpu_ptr is safe.
- */
-struct scx_cmask __percpu *scx_set_cmask_scratch;
-
/*
* cid tables.
*
@@ -54,7 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
u32 npossible = num_possible_cpus();
s16 *cid_to_cpu, *cpu_to_cid;
struct scx_cid_topo *cid_topo;
- struct scx_cmask __percpu *set_cmask_scratch;
if (scx_cid_to_cpu_tbl)
return 0;
@@ -62,22 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
- set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
- SCX_CMASK_NR_WORDS(npossible)),
- sizeof(u64));
- if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
+ if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
kfree(cid_to_cpu);
kfree(cpu_to_cid);
kfree(cid_topo);
- free_percpu(set_cmask_scratch);
return -ENOMEM;
}
WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
WRITE_ONCE(scx_cid_topo, cid_topo);
- WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
return 0;
}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 56d99e749c9d..d2ef8a5a3e69 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1117,6 +1117,14 @@ struct scx_sched {
struct bpf_map *arena_map;
struct gen_pool *arena_pool;
+ /*
+ * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
+ * to ops_cid.set_cmask(). Each entry stashes both the kernel VA (for
+ * the kernel to write into) and the BPF-arena uaddr (passed to BPF as
+ * the cmask pointer).
+ */
+ struct scx_cmask_scratch __percpu *set_cmask_scratch;
+
DECLARE_BITMAP(has_op, SCX_OPI_END);
/*
@@ -1473,8 +1481,6 @@ enum scx_ops_state {
extern struct scx_sched __rcu *scx_root;
DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
-extern struct scx_cmask __percpu *scx_set_cmask_scratch;
-
/*
* True when the currently loaded scheduler hierarchy is cid-form. All scheds
* in a hierarchy share one form, so this single key tells callsites which
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
index ebb8cdf90612..23edf73a84ae 100644
--- a/kernel/sched/ext_types.h
+++ b/kernel/sched/ext_types.h
@@ -101,4 +101,14 @@ struct scx_cmask {
#define SCX_CMASK_DEFINE(name, cap_bits) \
DEFINE_RAW_FLEX(struct scx_cmask, name, bits, SCX_CMASK_NR_WORDS(cap_bits))
+/*
+ * Stash for one arena-resident cmask. @kern_va points into the kernel's
+ * view of the BPF arena; @uaddr is the matching BPF-arena address to
+ * hand to BPF (cast to struct scx_cmask *).
+ */
+struct scx_cmask_scratch {
+ struct scx_cmask *kern_va;
+ u32 uaddr;
+};
+
#endif /* _KERNEL_SCHED_EXT_TYPES_H */
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index 629c3f078021..4e3c967151fc 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -612,48 +612,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
}
}
-/**
- * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
- * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
- * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
- *
- * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
- * scx_bpf_error() on probe failure or precondition violation.
- */
-static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
- const struct scx_cmask *src)
-{
- u32 nr_bits = 0, nr_words, dst_nr_words, wi;
-
- if (dst->base != 0) {
- scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
- return;
- }
-
- if (bpf_probe_read_kernel(&nr_bits, sizeof(nr_bits), &src->nr_bits)) {
- scx_bpf_error("probe-read cmask->nr_bits failed");
- return;
- }
-
- nr_words = CMASK_NR_WORDS(nr_bits);
- dst_nr_words = CMASK_NR_WORDS(dst->nr_bits);
- if (nr_words > dst_nr_words) {
- scx_bpf_error("src cmask nr_bits=%u exceeds dst capacity",
- nr_bits);
- return;
- }
-
- cmask_zero(dst);
- bpf_for(wi, 0, CMASK_MAX_WORDS) {
- u64 word = 0;
- if (wi >= nr_words)
- break;
- if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
- scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
- return;
- }
- dst->bits[wi] = word;
- }
-}
-
#endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index edce734c3019..3412cf0bff13 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -922,14 +922,16 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
}
void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
- const struct scx_cmask *cmask)
+ const struct scx_cmask *cmask_in)
{
+ struct scx_cmask __arena *cmask =
+ (struct scx_cmask __arena *)(long)cmask_in;
task_ctx_t *taskc;
taskc = lookup_task_ctx(p);
if (!taskc)
return;
- cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
+ cmask_copy(&taskc->cpus_allowed, cmask);
}
struct monitor_timer {
--
2.53.0