* [PATCH bpf-next 3/4] bpf,slab: Add slub-backed allocator for bpf_arena
2026-05-29 20:24 [PATCH bpf-next 0/4] bpf,slab: Introduce bpf_arena_alloc() kfuncs Alexei Starovoitov
2026-05-29 20:24 ` [PATCH bpf-next 1/4] slab: Introduce kmem_cache_alloc_nolock() Alexei Starovoitov
2026-05-29 20:24 ` [PATCH bpf-next 2/4] slub: Pass kmem_cache to alloc_slab_page() Alexei Starovoitov
@ 2026-05-29 20:24 ` Alexei Starovoitov
2026-05-29 22:04 ` sashiko-bot
2026-05-29 20:24 ` [PATCH bpf-next 4/4] selftests/bpf: Add tests for arena slub-backed allocator Alexei Starovoitov
3 siblings, 1 reply; 11+ messages in thread
From: Alexei Starovoitov @ 2026-05-29 20:24 UTC (permalink / raw)
To: bpf
Cc: daniel, andrii, memxor, eddyz87, vbabka, harry.yoo, david, tj,
roman.gushchin, peterz, linux-mm
From: Alexei Starovoitov <ast@kernel.org>
Let BPF programs allocate typed objects in a bpf_arena via a
kvmalloc-style API: bpf_arena_alloc() routes requests up to
PAGE_SIZE through per-arena slab buckets, and falls back to
arena_alloc_pages() for larger sizes -- analogous to kvmalloc()
choosing between kmalloc and vmalloc by size. The fallback page
is stashed in arena->slab_pages[pgoff] (without PageSlab) with
page_cnt in page->private, so bpf_arena_free() can recover the
multi-page allocation from the arena offset alone and release it
via arena_free_pages().
Each arena page now has two kernel VAs that alias the same bytes:
the page allocator's direct-map VA, and the arena's vmalloc mapping
at kern_vm_start + uaddr32. slub uses only the direct-map view --
slab_address(), virt_to_slab(), in-object freepointers, percpu
sheaves, partial lists all work unchanged. BPF programs see the
arena view via kern_vm_start + (u32)ptr addressing. Translation between
the two windows happens only at the bpf_arena_alloc/free kfunc boundary.
slub side:
- get_freepointer() clamps the decoded pointer to an s->size-aligned
offset within the same slab page via
(object & PAGE_MASK) | (decoded & ~PAGE_MASK & ~(s->size - 1)), NULL
preserved. Worst case under BPF corruption: chain aliases within
one arena page.
- bpf_arena_alloc_slab_page() stashes uaddr32 in slab->stride via
slab_set_stride(); arena_slab_uaddr32() reads it back via
slab_get_stride(). alloc_slab_obj_exts_early() is skipped for
SLAB_BPF_ARENA so its own slab_set_stride() doesn't clobber the
stash.
- Arena caches get percpu sheaves sized by object size like any
other runtime cache.
- __refill_objects_node()'s trailing freelist walk is bounded by
slab->objects so a BPF-induced freepointer cycle can't loop
forever.
arena side:
- Per-arena kmalloc-style bucket caches built at map_alloc cover
sizes up to PAGE_SIZE; larger requests fall back to
arena_alloc_pages().
- slab_pages[pgoff] gives O(1) page lookup, and also anchors
fallback multi-page allocations for bpf_arena_free().
- bpf_arena_alloc: kmem_cache_alloc_nolock -> slab_get_stride -> uaddr32.
- bpf_arena_free: slab_pages[pgoff] -> direct-map kva -> kfree_nolock,
or arena_free_pages() when page->private records a multi-page span.
- apply_range_clear_cb() leaves PTEs of PageSlab pages installed
and skips __free_page(), so bpf_arena_free_pages() on a slab-backed
offset can't free a page out from under slub. The page is torn
down later by bpf_arena_free_slab_page() after __ClearPageSlab().
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
include/linux/bpf_defs.h | 13 ++
include/linux/slab.h | 22 +++
kernel/bpf/Kconfig | 3 +
kernel/bpf/arena.c | 366 ++++++++++++++++++++++++++++++++++++++-
mm/slab.h | 6 +-
mm/slab_common.c | 2 +-
mm/slub.c | 155 +++++++++++++++--
7 files changed, 543 insertions(+), 24 deletions(-)
diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
index 2185cd3966d4..e271ae78c4ce 100644
--- a/include/linux/bpf_defs.h
+++ b/include/linux/bpf_defs.h
@@ -6,14 +6,27 @@
#ifndef _LINUX_BPF_DEFS_H
#define _LINUX_BPF_DEFS_H
+#include <linux/types.h>
+
+struct slab;
+
#ifdef CONFIG_BPF_SYSCALL
bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
+struct slab *bpf_arena_alloc_slab_page(void *arena, gfp_t flags, int node,
+ bool allow_spin);
+void bpf_arena_free_slab_page(void *arena, struct slab *slab);
#else
static inline bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
unsigned long fault_ip)
{
return false;
}
+static inline struct slab *bpf_arena_alloc_slab_page(void *arena, gfp_t flags,
+ int node, bool allow_spin)
+{
+ return NULL;
+}
+static inline void bpf_arena_free_slab_page(void *arena, struct slab *slab) { }
#endif
#endif /* _LINUX_BPF_DEFS_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7ce9125a6a2c..6c6f1ba83c7d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -62,6 +62,7 @@ enum _slab_flag_bits {
#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
_SLAB_OBJ_EXT_IN_OBJ,
#endif
+ _SLAB_BPF_ARENA,
_SLAB_FLAGS_LAST_BIT
};
@@ -248,6 +249,15 @@ enum _slab_flag_bits {
#define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_UNUSED
#endif
+/*
+ * Cache is backed by bpf_arena pages instead of the page allocator.
+ * Slab pages are also mapped into the arena's kernel vmalloc range, so BPF
+ * programs can reach them via 32-bit arena addressing. Freepointers stored
+ * inside free objects may be scribbled by BPF; get_freepointer() clamps the
+ * decoded pointer to an object boundary within the same slab page.
+ */
+#define SLAB_BPF_ARENA __SLAB_FLAG_BIT(_SLAB_BPF_ARENA)
+
/*
* ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
*
@@ -372,6 +382,15 @@ struct kmem_cache_args {
* %0 means no sheaves will be created.
*/
unsigned int sheaf_capacity;
+ /**
+ * @bpf_arena: Opaque arena pointer for SLAB_BPF_ARENA caches.
+ *
+ * When non-%NULL, slab pages for this cache are sourced from the
+ * arena via bpf_arena_alloc_slab_page()/bpf_arena_free_slab_page(),
+ * and freepointer reads are clamped to stay within the same slab page.
+ * Caller must also pass %SLAB_BPF_ARENA in the flags argument.
+ */
+ void *bpf_arena;
};
struct kmem_cache *__kmem_cache_create_args(const char *name,
@@ -963,6 +982,9 @@ void *kmem_cache_alloc_nolock_noprof(struct kmem_cache *s, gfp_t gfp_flags,
#define kmem_cache_alloc_nolock(...) \
alloc_hooks(kmem_cache_alloc_nolock_noprof(__VA_ARGS__))
+struct slab;
+void kmem_cache_force_discard_slab(struct kmem_cache *s, struct slab *slab);
+
/**
* __alloc_objs - Allocate objects of a given type using
* @KMALLOC: which size-based kmalloc wrapper to allocate with.
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index eb3de35734f0..42ef4fc3a6bd 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -34,6 +34,9 @@ config BPF_SYSCALL
select NET_SOCK_MSG if NET
select NET_XGRESS if NET
select PAGE_POOL if NET
+ # bpf_arena_alloc()/free() stashes uaddr32 in slab->stride which only
+ # becomes a real field with CONFIG_SLAB_OBJ_EXT.
+ select SLAB_OBJ_EXT if MMU && 64BIT
default n
help
Enable the bpf() system call that allows to manipulate BPF programs
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 1727503b25d8..807d806856d7 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -10,7 +10,9 @@
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
+#include <linux/slab.h>
#include <asm/tlbflush.h>
+#include "../../mm/slab.h"
#include "range_tree.h"
/*
@@ -48,6 +50,14 @@
static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable);
+/*
+ * Per-arena slab buckets. Mirrors the kmalloc size classes (powers of 2)
+ * up to one page.
+ */
+#define ARENA_KMALLOC_MIN_SHIFT KMALLOC_SHIFT_LOW
+#define ARENA_KMALLOC_MAX_SHIFT PAGE_SHIFT
+#define ARENA_KMALLOC_NUM_BUCKETS (ARENA_KMALLOC_MAX_SHIFT + 1)
+
struct bpf_arena {
struct bpf_map map;
u64 user_vm_start;
@@ -63,10 +73,20 @@ struct bpf_arena {
struct irq_work free_irq;
struct work_struct free_work;
struct llist_head free_spans;
+
+ /*
+ * SLAB_BPF_ARENA: kva <-> arena offset translation at the kfunc
+ * boundary. Forward (kva -> uaddr32) via slab->stride; reverse
+ * (uaddr32 -> page) via @slab_pages[pgoff], sized to max_entries.
+ */
+ struct page **slab_pages;
+ struct kmem_cache *kmalloc_caches[ARENA_KMALLOC_NUM_BUCKETS];
};
static void arena_free_worker(struct work_struct *work);
static void arena_free_irq(struct irq_work *iw);
+static int arena_init_slab_caches(struct bpf_arena *arena);
+static void arena_destroy_slab_caches(struct bpf_arena *arena);
struct arena_free_span {
struct llist_node node;
@@ -143,6 +163,7 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
struct apply_range_data {
struct page **pages;
int i;
+ bool set_page_slab;
};
struct clear_range_data {
@@ -166,6 +187,13 @@ static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
return -EINVAL;
+ /*
+ * Tag PageSlab under arena->spinlock so a racing bpf_arena_free_pages()
+ * sees the page as slub-owned (apply_range_clear_cb skips PageSlab).
+ */
+ if (d->set_page_slab)
+ __SetPageSlab(page);
+
set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
d->i++;
return 0;
@@ -179,9 +207,22 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
{
struct clear_range_data *d = data;
- pte_t old_pte;
+ pte_t old_pte, cur;
struct page *page;
+ /*
+ * Skip slub-owned pages: BPF must use bpf_arena_free() for per-object
+ * slab frees. The PTE stays; slub releases it via bpf_arena_free_slab_page()
+ * after __ClearPageSlab(). Non-atomic ptep_get() is safe -- ptep_try_set()
+ * only fires on pte_none, and bpf_arena_free_slab_page() can't race on this
+ * offset (range stays allocated in range_tree for our walk).
+ */
+ cur = ptep_get(pte);
+ if (pte_none(cur) || !pte_present(cur))
+ return 0;
+ if (PageSlab(pte_page(cur)))
+ return 0;
+
/*
* Pairs with ptep_try_set() in the kernel-fault scratch installer.
* Both sides must be atomic.
@@ -290,12 +331,25 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
goto err_free_scratch;
mutex_init(&arena->lock);
raw_res_spin_lock_init(&arena->spinlock);
+ arena->slab_pages = bpf_map_area_alloc(attr->max_entries *
+ sizeof(arena->slab_pages[0]),
+ numa_node);
+ if (!arena->slab_pages) {
+ err = -ENOMEM;
+ goto err_destroy_rt;
+ }
err = populate_pgtable_except_pte(arena);
if (err)
- goto err_destroy_rt;
+ goto err_free_slab_pages;
+
+ err = arena_init_slab_caches(arena);
+ if (err)
+ goto err_free_slab_pages;
return &arena->map;
+err_free_slab_pages:
+ bpf_map_area_free(arena->slab_pages);
err_destroy_rt:
range_tree_destroy(&arena->rt);
err_free_scratch:
@@ -347,6 +401,9 @@ static void arena_map_free(struct bpf_map *map)
if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
return;
+ /* Tear down slab caches first so all slab-backed pages return to arena. */
+ arena_destroy_slab_caches(arena);
+
/* Ensure no pending deferred frees */
irq_work_sync(&arena->free_irq);
flush_work(&arena->free_work);
@@ -359,6 +416,7 @@ static void arena_map_free(struct bpf_map *map)
*/
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
+ bpf_map_area_free(arena->slab_pages);
free_vm_area(arena->kern_vm);
range_tree_destroy(&arena->rt);
__free_page(arena->scratch_page);
@@ -461,6 +519,9 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
if (page == arena->scratch_page)
/* BPF triggered scratch here; don't lazy-alloc over it */
goto out_sigsegv;
+ if (PageSlab(page))
+ /* Don't return slab-backed arena page */
+ goto out_sigsegv;
/* already have a page vmap-ed */
goto out;
}
@@ -625,7 +686,8 @@ static u64 clear_lo32(u64 val)
* Later the pages will be mmaped into user space vma.
*/
static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id,
- bool sleepable)
+ bool sleepable, bool set_page_slab,
+ struct page **out_page)
{
/* user_vm_end/start are fixed before bpf prog runs */
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
@@ -633,6 +695,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
struct mem_cgroup *new_memcg, *old_memcg;
struct apply_range_data data;
struct page **pages = NULL;
+ struct page *first_page = NULL;
long remaining, mapped = 0;
long alloc_pages;
unsigned long flags;
@@ -647,6 +710,13 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
if (page_cnt > page_cnt_max)
return 0;
+ /*
+ * out-path rollback can't undo PageSlab on prior batches; restrict
+ * set_page_slab to the single-page bpf_arena_alloc_slab_page() caller.
+ */
+ if (WARN_ON_ONCE(set_page_slab && page_cnt > 1))
+ return 0;
+
if (uaddr) {
if (uaddr & ~PAGE_MASK)
return 0;
@@ -665,6 +735,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
return 0;
}
data.pages = pages;
+ data.set_page_slab = set_page_slab;
if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
goto out_free_pages;
@@ -695,6 +766,9 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
if (ret)
goto out;
+ if (!first_page)
+ first_page = pages[0];
+
/*
* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
* will not overflow 32-bit. Lower 32-bit need to represent
@@ -720,6 +794,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
}
flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+ if (out_page)
+ *out_page = first_page;
kfree_nolock(pages);
bpf_map_memcg_exit(old_memcg, new_memcg);
return clear_lo32(arena->user_vm_start) + uaddr32;
@@ -758,8 +834,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
{
struct mem_cgroup *new_memcg, *old_memcg;
u64 full_uaddr, uaddr_end;
- long kaddr, pgoff;
- struct page *page;
+ long kaddr, pgoff, i;
+ struct page *page, *fb_page;
struct llist_head free_pages;
struct llist_node *pos, *t;
struct arena_free_span *s;
@@ -778,6 +854,21 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
pgoff = compute_pgoff(arena, uaddr);
+
+ /*
+ * Drop bookkeeping for any bpf_arena_alloc() fallback pages within the
+ * freed range. PageSlab entries are owned by slub and must not be
+ * cleared here; slub clears them via bpf_arena_free_slab_page() when
+ * the slab page is released.
+ */
+ for (i = 0; i < page_cnt; i++) {
+ fb_page = READ_ONCE(arena->slab_pages[pgoff + i]);
+ if (fb_page && !PageSlab(fb_page)) {
+ WRITE_ONCE(arena->slab_pages[pgoff + i], NULL);
+ set_page_private(fb_page, 0);
+ }
+ }
+
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
if (!sleepable)
@@ -952,6 +1043,135 @@ static void arena_free_irq(struct irq_work *iw)
schedule_work(&arena->free_work);
}
+/*
+ * SLAB_BPF_ARENA: per-arena kmem_cache buckets backing bpf_arena_alloc/free.
+ * Slab pages come from the arena pool; slub uses direct-map VAs internally,
+ * BPF sees the arena vmalloc view, translation happens at the kfunc boundary.
+ */
+struct slab *bpf_arena_alloc_slab_page(void *arena_p, gfp_t flags, int node,
+ bool allow_spin)
+{
+ struct bpf_arena *arena = arena_p;
+ long ret_user_va;
+ struct page *page;
+ struct slab *slab;
+ u32 uaddr32;
+
+ /*
+ * set_page_slab=true makes apply_range_set_cb() tag PageSlab under
+ * arena->spinlock so a racing bpf_arena_free_pages() can't free it.
+ */
+ ret_user_va = arena_alloc_pages(arena, 0, 1, node, allow_spin, true, &page);
+ if (!ret_user_va)
+ return NULL;
+
+ uaddr32 = (u32)ret_user_va;
+ slab = page_slab(page);
+ /*
+ * Stash uaddr32 in slab->stride; allocate_slab() skips
+ * alloc_slab_obj_exts_early() for SLAB_BPF_ARENA so it survives.
+ */
+ slab_set_stride(slab, uaddr32);
+ WRITE_ONCE(arena->slab_pages[uaddr32 >> PAGE_SHIFT], page);
+
+ return slab;
+}
+
+static u32 arena_slab_uaddr32(const struct slab *slab)
+{
+ return slab_get_stride((struct slab *)slab);
+}
+
+void bpf_arena_free_slab_page(void *arena_p, struct slab *slab)
+{
+ struct bpf_arena *arena = arena_p;
+ u32 uaddr32 = arena_slab_uaddr32(slab);
+
+ WRITE_ONCE(arena->slab_pages[uaddr32 >> PAGE_SHIFT], NULL);
+ arena_free_pages(arena, uaddr32, 1, false);
+}
+
+static int arena_init_slab_caches(struct bpf_arena *arena)
+{
+ char name[KSYM_NAME_LEN];
+ unsigned int i;
+
+ for (i = ARENA_KMALLOC_MIN_SHIFT; i < ARENA_KMALLOC_NUM_BUCKETS; i++) {
+ struct kmem_cache *c;
+ struct kmem_cache_args args = {
+ .align = sizeof(void *),
+ .bpf_arena = arena,
+ };
+
+ snprintf(name, sizeof(name), "arena-%lx-%u",
+ (unsigned long)arena, 1U << i);
+ c = kmem_cache_create(name, 1U << i, &args, SLAB_BPF_ARENA);
+ if (!c)
+ goto err;
+ arena->kmalloc_caches[i] = c;
+ }
+ return 0;
+err:
+ arena_destroy_slab_caches(arena);
+ return -ENOMEM;
+}
+
+static void arena_destroy_slab_caches(struct bpf_arena *arena)
+{
+ long max = arena->map.max_entries;
+ unsigned int i;
+ long pgoff;
+
+ /*
+ * Drain per-cpu sheaves of every bucket before walking slab_pages[].
+ * Sheaves cache pointers into slab pages that the force-discard loop
+ * is about to release; kmem_cache_shrink() flushes those caches back
+ * into their slabs (and frees any slab that becomes empty), so the
+ * later force-discard cannot trigger __slab_free() on memory that has
+ * since been recycled. Frees triggered here go through
+ * bpf_arena_free_slab_page() which clears arena->slab_pages[], so
+ * those entries become NULL and the loop below skips them.
+ */
+ for (i = ARENA_KMALLOC_MIN_SHIFT; i < ARENA_KMALLOC_NUM_BUCKETS; i++) {
+ if (!arena->kmalloc_caches[i])
+ continue;
+ kmem_cache_shrink(arena->kmalloc_caches[i]);
+ }
+
+ /*
+ * Force-discard every slab page slub still tracks via slab_pages[].
+ * Catches orphans not on n->partial (trylock failures in __slab_free)
+ * and BPF-leaked slabs with inuse > 0; without this kmem_cache_destroy()
+ * would see n->nr_slabs > 0, WARN, and leak the kmem_cache descriptor.
+ */
+ for (pgoff = 0; pgoff < max; pgoff++) {
+ struct page *page = arena->slab_pages[pgoff];
+ struct slab *slab;
+
+ if (!page)
+ continue;
+ if (!PageSlab(page))
+ /*
+ * Leftover bpf_arena_alloc() fallback page; freed by
+ * existing_page_cb() in arena_map_free().
+ */
+ continue;
+ slab = page_slab(page);
+ kmem_cache_force_discard_slab(slab->slab_cache, slab);
+ }
+
+ /* Let deferred page frees from the discard pass run before teardown. */
+ irq_work_sync(&arena->free_irq);
+ flush_work(&arena->free_work);
+
+ for (i = 0; i < ARENA_KMALLOC_NUM_BUCKETS; i++) {
+ if (!arena->kmalloc_caches[i])
+ continue;
+ kmem_cache_destroy(arena->kmalloc_caches[i]);
+ arena->kmalloc_caches[i] = NULL;
+ }
+}
+
__bpf_kfunc_start_defs();
__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
@@ -963,7 +1183,8 @@ __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_
if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
return NULL;
- return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id,
+ true, false, NULL);
}
void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
@@ -975,7 +1196,8 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
return NULL;
- return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id,
+ false, false, NULL);
}
void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
@@ -987,7 +1209,8 @@ void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cn
if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
return NULL;
- return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+ return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id,
+ true, false, NULL);
}
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
@@ -1023,12 +1246,139 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c
return arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
}
+
+/*
+ * bpf_arena_alloc: allocate one object of @size bytes from the arena's
+ * slab buckets. Returns a value whose low 32 bits are the arena offset;
+ * BPF programs use it as a void __arena *. Slub gives us a direct-map kva;
+ * its slab page carries the arena uaddr32 in slab->stride.
+ *
+ * For @size > PAGE_SIZE the slab buckets cannot satisfy the request and
+ * the allocation falls back to arena_alloc_pages(). The first page of
+ * such a multi-page allocation is stashed in arena->slab_pages[pgoff]
+ * (without PageSlab) with page_cnt in page->private, so bpf_arena_free()
+ * can find it again from the arena offset alone.
+ */
+__bpf_kfunc void *bpf_arena_alloc(void *p__map, u32 size)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct kmem_cache *c;
+ struct slab *slab;
+ unsigned int idx;
+ void *kva;
+ u32 uaddr32;
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || !size)
+ return NULL;
+ if (size > (1U << ARENA_KMALLOC_MAX_SHIFT)) {
+ struct page *first_page;
+ long ret_user_va;
+ u32 page_cnt, pgoff;
+
+ page_cnt = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
+ if (!page_cnt)
+ return NULL;
+ /* sleepable=false mirrors kmem_cache_alloc_nolock() */
+ ret_user_va = arena_alloc_pages(arena, 0, page_cnt, NUMA_NO_NODE,
+ false, false, &first_page);
+ if (!ret_user_va)
+ return NULL;
+ pgoff = (u32)ret_user_va >> PAGE_SHIFT;
+ set_page_private(first_page, page_cnt);
+ WRITE_ONCE(arena->slab_pages[pgoff], first_page);
+ return (void *)ret_user_va;
+ }
+
+ idx = max_t(unsigned int, fls(size - 1), ARENA_KMALLOC_MIN_SHIFT);
+ if (idx >= ARENA_KMALLOC_NUM_BUCKETS)
+ return NULL;
+ c = arena->kmalloc_caches[idx];
+ if (!c)
+ return NULL;
+
+ /*
+ * Use the nolock variant so this kfunc is safe from any context.
+ * Skip __GFP_ACCOUNT because memcg charging already happens at
+ * the arena page level.
+ */
+ kva = kmem_cache_alloc_nolock(c, 0, NUMA_NO_NODE);
+ if (!kva)
+ return NULL;
+
+ slab = virt_to_slab(kva);
+ if (!slab || slab->slab_cache != c) {
+ bpf_prog_report_arena_violation(true, (long)kva, _RET_IP_);
+ return NULL;
+ }
+ uaddr32 = arena_slab_uaddr32(slab) |
+ ((u32)(unsigned long)kva & ~PAGE_MASK);
+ return (void *)(clear_lo32(arena->user_vm_start) + uaddr32);
+}
+
+/*
+ * bpf_arena_free: free an object previously returned by bpf_arena_alloc.
+ * The arena offset's high bits identify the slab page; slab->slab_cache's
+ * bpf_arena hook confirms it belongs to this arena. The kva handed to
+ * kfree_nolock is direct-map, so its virt_to_slab works normally.
+ */
+__bpf_kfunc void bpf_arena_free(void *p__map, void *ptr__ign)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+ struct page *page;
+ struct slab *slab;
+ u32 arena_off, pgoff;
+ void *kva;
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA || !ptr__ign)
+ return;
+
+ arena_off = (u32)(unsigned long)ptr__ign;
+ pgoff = arena_off >> PAGE_SHIFT;
+ if (pgoff >= arena->map.max_entries)
+ goto violation;
+ page = READ_ONCE(arena->slab_pages[pgoff]);
+ if (!page)
+ goto violation;
+ if (!PageSlab(page)) {
+ /*
+ * Multi-page allocation from the bpf_arena_alloc() fallback.
+ * page->private holds page_cnt stashed at allocation time.
+ */
+ u32 page_cnt = page_private(page);
+
+ WRITE_ONCE(arena->slab_pages[pgoff], NULL);
+ set_page_private(page, 0);
+ arena_free_pages(arena, arena_off, page_cnt, false);
+ return;
+ }
+ slab = page_slab(page);
+ if (slab->slab_cache->bpf_arena != arena)
+ goto violation;
+ /*
+ * Reject arena offsets that do not land on an object boundary. Arena
+ * bucket caches have power-of-two s->size, so a simple IS_ALIGNED()
+ * suffices; without this kfree_nolock() would set a freepointer inside
+ * an unrelated object on the same slab page.
+ */
+ if (!IS_ALIGNED(arena_off, slab->slab_cache->size))
+ goto violation;
+ kva = page_to_virt(page) + (arena_off & ~PAGE_MASK);
+ /* nolock free mirrors the nolock alloc -- safe from any context. */
+ kfree_nolock(kva);
+ return;
+violation:
+ bpf_prog_report_arena_violation(true, arena_off, _RET_IP_);
+}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(arena_kfuncs)
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_alloc, KF_ARENA_RET)
+BTF_ID_FLAGS(func, bpf_arena_free, KF_ARENA_ARG2)
BTF_KFUNCS_END(arena_kfuncs)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/mm/slab.h b/mm/slab.h
index bf2f87acf5e3..2b0272c3f5fe 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -248,6 +248,9 @@ struct kmem_cache {
struct kmem_cache_stats __percpu *cpu_stats;
#endif
+ /* NULL unless SLAB_BPF_ARENA; opaque arena pointer. */
+ void *bpf_arena;
+
struct kmem_cache_per_node_ptrs per_node[MAX_NUMNODES];
};
@@ -414,7 +417,8 @@ void flush_rcu_sheaves_on_cache(struct kmem_cache *s);
SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | SLAB_ACCOUNT | \
- SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
+ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE | \
+ SLAB_BPF_ARENA)
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 8b661fff5eed..c9eb6daf649a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -49,7 +49,7 @@ struct kmem_cache *kmem_cache;
*/
#define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \
SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \
- SLAB_OBJ_EXT_IN_OBJ)
+ SLAB_OBJ_EXT_IN_OBJ | SLAB_BPF_ARENA)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
diff --git a/mm/slub.c b/mm/slub.c
index 1daa89105e04..1a2e85605ab9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -21,6 +21,7 @@
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
+#include <linux/bpf_defs.h>
#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -531,11 +532,25 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
unsigned long ptr_addr;
freeptr_t p;
+ void *decoded;
object = kasan_reset_tag(object);
ptr_addr = (unsigned long)object + s->offset;
p = *(freeptr_t *)(ptr_addr);
- return freelist_ptr_decode(s, p, ptr_addr);
+ decoded = freelist_ptr_decode(s, p, ptr_addr);
+ /*
+ * SLAB_BPF_ARENA freepointer slots are BPF-writable. Clamp the decoded
+ * pointer to an s->size-aligned address within the same slab page so
+ * chain walks stay on legitimate object boundaries. Arena slabs are
+ * always one page (order 0). NULL preserved.
+ */
+ if (unlikely(s->bpf_arena) && decoded) {
+ unsigned long obj_mask = s->size - 1;
+
+ decoded = (void *)(((unsigned long)object & PAGE_MASK) |
+ ((unsigned long)decoded & ~PAGE_MASK & ~obj_mask));
+ }
+ return decoded;
}
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
@@ -543,7 +558,12 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
unsigned long freeptr_addr = (unsigned long)object + s->offset;
#ifdef CONFIG_SLAB_FREELIST_HARDENED
- BUG_ON(object == fp); /* naive detection of double free or corruption */
+ if (unlikely(object == fp)) {
+ /* BPF double-free of arena objects must not panic the kernel. */
+ if (s->bpf_arena)
+ return;
+ BUG_ON(object == fp); /* naive detection of double free or corruption */
+ }
#endif
freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
@@ -3270,6 +3290,9 @@ static inline struct slab *alloc_slab_page(struct kmem_cache *s, gfp_t flags,
struct slab *slab;
unsigned int order = oo_order(oo);
+ if (unlikely(s->bpf_arena))
+ return bpf_arena_alloc_slab_page(s->bpf_arena, flags, node, allow_spin);
+
if (unlikely(!allow_spin))
page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
node, order);
@@ -3493,9 +3516,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
init_slab_obj_exts(slab);
/*
* Poison the slab before initializing the slabobj_ext array
- * to prevent the array from being overwritten.
+ * to prevent the array from being overwritten. Arena caches
+ * stash uaddr32 in slab->stride; let them keep it.
*/
- alloc_slab_obj_exts_early(s, slab);
+ if (!(s->flags & SLAB_BPF_ARENA))
+ alloc_slab_obj_exts_early(s, slab);
account_slab(slab, oo_order(oo), s, flags);
shuffle = shuffle_freelist(s, slab, allow_spin);
@@ -3538,6 +3563,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin
__ClearPageSlab(page);
mm_account_reclaimed_pages(pages);
unaccount_slab(slab, order, s, allow_spin);
+ if (unlikely(s->bpf_arena)) {
+ bpf_arena_free_slab_page(s->bpf_arena, slab);
+ return;
+ }
if (allow_spin)
free_frozen_pages(page, order);
else
@@ -5442,6 +5471,32 @@ void *kmem_cache_alloc_nolock_noprof(struct kmem_cache *s, gfp_t gfp_flags,
}
EXPORT_SYMBOL_GPL(kmem_cache_alloc_nolock_noprof);
+/**
+ * kmem_cache_force_discard_slab - force-evict a slab page from its cache
+ * @s: kmem_cache that owns the slab
+ * @slab: the slab to evict
+ *
+ * Removes @slab from any per-node list it may be on and then discards it
+ * (decrements nr_slabs and frees the backing page). Intended for arena
+ * teardown: arena owns the page-tracking array and can enumerate every
+ * slab page it allocated, including orphans not on any partial list (left
+ * behind by spin_trylock failures in __slab_free()) and slabs whose
+ * objects were never returned (BPF program leak).
+ */
+void kmem_cache_force_discard_slab(struct kmem_cache *s, struct slab *slab)
+{
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
+ unsigned long flags;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ if (slab_test_node_partial(slab))
+ remove_partial(n, slab);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ discard_slab(s, slab);
+}
+EXPORT_SYMBOL_GPL(kmem_cache_force_discard_slab);
+
void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
int node, unsigned long caller)
{
@@ -5589,14 +5644,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
n = get_node(s, slab_nid(slab));
/*
- * Speculatively acquire the list_lock.
- * If the cmpxchg does not succeed then we may
- * drop the list_lock without any processing.
+ * Speculatively acquire the list_lock. If the cmpxchg
+ * does not succeed we drop the lock without processing.
*
- * Otherwise the list_lock will synchronize with
- * other processors updating the list of slabs.
+ * Arena caches may reach here from kfree_nolock() in
+ * NMI/irq-off context; trylock and orphan the slab on
+ * failure. A later allow_spin caller adopts it.
*/
- spin_lock_irqsave(&n->list_lock, flags);
+ if (unlikely(s->bpf_arena)) {
+ if (!spin_trylock_irqsave(&n->list_lock, flags))
+ n = NULL;
+ } else {
+ spin_lock_irqsave(&n->list_lock, flags);
+ }
on_node_partial = slab_test_node_partial(slab);
}
@@ -6666,6 +6726,15 @@ void kfree_nolock(const void *object)
if (likely(can_free_to_pcs(slab)) && likely(free_to_pcs(s, x, false)))
return;
+ /*
+ * Arena freepointer slots are BPF-writable; defer_free()'s in-object
+ * llist chain could be redirected. Route through __slab_free() instead;
+ * it trylocks n->list_lock and orphans the slab on failure.
+ */
+ if (s->bpf_arena) {
+ __slab_free(s, slab, x, x, 1, _RET_IP_);
+ return;
+ }
/*
* __slab_free() can locklessly cmpxchg16 into a slab, but then it might
* need to take spin_lock for further processing.
@@ -7181,16 +7250,22 @@ __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int mi
/*
* Freelist had more objects than we can accommodate, we need to
* free them back. We can treat it like a detached freelist, just
- * need to find the tail object.
+ * need to find the tail object. Bound the walk by slab->objects
+ * so a corrupted in-object freepointer (e.g. BPF arena cache
+ * where the slot is writable from BPF) cannot loop forever; a
+ * legitimate freelist on this slab has at most that many nodes.
*/
if (unlikely(object)) {
void *head = object;
void *tail;
- int cnt = 0;
+ unsigned int cnt = 0;
+ unsigned int limit = slab->objects;
do {
tail = object;
cnt++;
+ if (unlikely(cnt >= limit))
+ break;
object = get_freepointer(s, object);
} while (object);
__slab_free(s, slab, head, tail, cnt, _RET_IP_);
@@ -7763,12 +7838,21 @@ static unsigned int calculate_sheaf_capacity(struct kmem_cache *s,
return 0;
/*
- * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT).
+ * Bootstrap caches (kmem_cache, kmem_cache_node) carry SLAB_NO_OBJ_EXT
+ * and are created before kmalloc is available, so sheaf/barn setup
+ * can't run yet.
+ *
* SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not
* have sheaves to avoid recursion when sheaf allocation triggers
* kmemleak tracking.
+ *
+ * SLAB_BPF_ARENA caches also set SLAB_NO_OBJ_EXT to suppress per-object
+ * extensions, but they are created at runtime and want sheaves like any
+ * other cache, so exempt them.
*/
- if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
+ if (s->flags & SLAB_NOLEAKTRACE)
+ return 0;
+ if ((s->flags & SLAB_NO_OBJ_EXT) && !(s->flags & SLAB_BPF_ARENA))
return 0;
/*
@@ -8607,6 +8691,27 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
s->useroffset = args->useroffset;
s->usersize = args->usersize;
#endif
+ if (s->flags & SLAB_BPF_ARENA) {
+ if (!args->bpf_arena)
+ goto out;
+ /*
+ * Strip every SLAB_DEBUG_FLAGS bit from arena caches.
+ * Masking (rather than goto out) keeps arena maps creatable
+ * under slub_debug=... cmdline.
+ */
+ s->flags &= ~SLAB_DEBUG_FLAGS;
+ /* Non-debug knobs we cannot honor: refuse the cache. */
+ if (s->flags & (SLAB_KASAN | SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT))
+ goto out;
+ /*
+ * Suppress per-object obj_exts for arena caches: accounting
+ * already happens at arena-page granularity (bpf_map_memcg_enter
+ * in arena_alloc_pages), and per-slab obj_exts would cost
+ * sizeof(slabobj_ext) * objs_per_slab of overhead per page.
+ */
+ s->flags |= SLAB_NO_OBJ_EXT;
+ s->bpf_arena = args->bpf_arena;
+ }
if (!calculate_sizes(args, s))
goto out;
@@ -8623,6 +8728,17 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
}
}
+ if (s->flags & SLAB_BPF_ARENA) {
+ /*
+ * The arena page source currently allocates one page at a time;
+ * force order 0 and pin s->min to s->oo so allocate_slab() has
+ * no fallback path and get_freepointer()'s slab-mask clamp
+ * (oo_order(s->oo)) always matches the actual slab order.
+ */
+ s->oo = oo_make(0, s->size);
+ s->min = s->oo;
+ }
+
#ifdef system_has_freelist_aba
if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
/* Enable fast mode */
@@ -9628,6 +9744,17 @@ static int sysfs_slab_add(struct kmem_cache *s)
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
+ /*
+ * Hide arena caches from /sys/kernel/slab: shrink/validate/etc would
+ * BUG_ON on BPF-induced inuse underflow or corrupted freelists.
+ * kobject_init() (no kobject_add()) keeps the destroy-time
+ * kobject_put() -> slab_kmem_cache_release() path working.
+ */
+ if (s->bpf_arena) {
+ kobject_init(&s->kobj, &slab_ktype);
+ return 0;
+ }
+
if (!unmergeable && disable_higher_order_debug &&
(slub_debug & DEBUG_METADATA_FLAGS))
unmergeable = 1;
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 11+ messages in thread* [PATCH bpf-next 4/4] selftests/bpf: Add tests for arena slub-backed allocator
2026-05-29 20:24 [PATCH bpf-next 0/4] bpf,slab: Introduce bpf_arena_alloc() kfuncs Alexei Starovoitov
` (2 preceding siblings ...)
2026-05-29 20:24 ` [PATCH bpf-next 3/4] bpf,slab: Add slub-backed allocator for bpf_arena Alexei Starovoitov
@ 2026-05-29 20:24 ` Alexei Starovoitov
2026-05-29 22:39 ` sashiko-bot
3 siblings, 1 reply; 11+ messages in thread
From: Alexei Starovoitov @ 2026-05-29 20:24 UTC (permalink / raw)
To: bpf
Cc: daniel, andrii, memxor, eddyz87, vbabka, harry.yoo, david, tj,
roman.gushchin, peterz, linux-mm
From: Alexei Starovoitov <ast@kernel.org>
Cover the SLAB_BPF_ARENA path end-to-end:
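- arena_slab: smoke tests for the bpf_arena_alloc/free kfuncs, including the >PAGE_SIZE fallback, a defer_free() corruption repro, and an intentional-leak teardown case.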
- arena_slab_freeptr_stale_pcs: exercise corrupted in-object
freepointers to validate get_freepointer() clamping and the
__refill_objects_node() bounded walk.
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
.../selftests/bpf/prog_tests/arena_slab.c | 59 ++++++
.../prog_tests/arena_slab_freeptr_stale_pcs.c | 28 +++
.../testing/selftests/bpf/progs/arena_slab.c | 179 ++++++++++++++++++
.../bpf/progs/arena_slab_freeptr_stale_pcs.c | 120 ++++++++++++
4 files changed, 386 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_slab.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_slab_freeptr_stale_pcs.c
create mode 100644 tools/testing/selftests/bpf/progs/arena_slab.c
create mode 100644 tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_slab.c b/tools/testing/selftests/bpf/prog_tests/arena_slab.c
new file mode 100644
index 000000000000..6cbaa6991c6b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/arena_slab.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include "arena_slab.skel.h"
+
+void test_arena_slab(void)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, opts);
+ struct arena_slab *skel;
+ int ret;
+
+ skel = arena_slab__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "arena_slab__open_and_load"))
+ return;
+
+ ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_alloc), &opts);
+ ASSERT_OK(ret, "alloc_run");
+ ASSERT_OK(opts.retval, "alloc_retval");
+ ASSERT_EQ(skel->bss->alloc_failed, 0, "no alloc failures");
+
+ ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_free), &opts);
+ ASSERT_OK(ret, "free_run");
+ ASSERT_OK(opts.retval, "free_retval");
+ ASSERT_EQ(skel->bss->free_done, 1, "free completed"); /* NOTE(review): set only under __BPF_FEATURE_ADDR_SPACE_CAST, so this fails rather than skips without it */
+
+ /* Realloc to make sure freed objects can be handed out again. */
+ ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_alloc), &opts);
+ ASSERT_OK(ret, "realloc_run"); /* NOTE(review): opts.retval is not asserted for this run */
+ ASSERT_EQ(skel->bss->alloc_failed, 0, "no alloc failures after free");
+
+ ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_free), &opts);
+ ASSERT_OK(ret, "free_run_2"); /* NOTE(review): retval not asserted; free_done is still 1 from the first run */
+
+ /*
+ * defer_free() corruption repro. Allocates CORRUPT_N 4096-byte
+ * objects, then with local IRQs disabled frees each one and
+ * immediately overwrites its freepointer slot. With IRQs off the
+ * irq_work IPI raised by defer_free() is deferred; multiple
+ * defer_free()d objects chain onto the per-cpu llist via the
+ * poisoned freepointer slots. After local_irq_restore() the IPI
+ * fires and free_deferred_objects() walks the corrupted llist,
+ * oopsing on a pre-fix kernel. The spin_trylock __slab_free()
+ * fix keeps the freed objects out of any in-object llist, so the
+ * test completes cleanly.
+ */
+ ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_defer_corrupt), &opts);
+ ASSERT_OK(ret, "defer_corrupt_run");
+ ASSERT_OK(opts.retval, "defer_corrupt_retval");
+ ASSERT_EQ(skel->bss->corrupt_alloc_failed, 0, "no alloc failures in defer_corrupt");
+ ASSERT_EQ(skel->bss->corrupt_done, 1, "defer_corrupt completed");
+
+ ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_leak), &opts);
+ ASSERT_OK(ret, "leak_run");
+ ASSERT_OK(opts.retval, "leak_retval");
+ ASSERT_EQ(skel->bss->leak_alloc_failed, 0, "no alloc failures in leak");
+ ASSERT_EQ(skel->bss->leak_done, 1, "leak completed");
+
+ arena_slab__destroy(skel); /* runs with the arena_slab_leak objects still unfreed */
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_slab_freeptr_stale_pcs.c b/tools/testing/selftests/bpf/prog_tests/arena_slab_freeptr_stale_pcs.c
new file mode 100644
index 000000000000..fbf4f5a7e4b3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/arena_slab_freeptr_stale_pcs.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "arena_slab_freeptr_stale_pcs.skel.h"
+
+void test_arena_slab_freeptr_stale_pcs(void)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, opts);
+ struct arena_slab_freeptr_stale_pcs *skel;
+ int ret;
+
+ skel = arena_slab_freeptr_stale_pcs__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "arena_slab_freeptr_stale_pcs__open_and_load"))
+ return;
+
+ ret = bpf_prog_test_run_opts(
+ bpf_program__fd(skel->progs.arena_slab_freeptr_stale_pcs),
+ &opts);
+ ASSERT_OK(ret, "arena_slab_freeptr_stale_pcs_run");
+ ASSERT_OK(opts.retval, "arena_slab_freeptr_stale_pcs_retval");
+ ASSERT_EQ(skel->bss->alloc_failed, 0, "initial allocs");
+ ASSERT_EQ(skel->bss->drain_failed, 0, "drain sheaf allocs");
+ ASSERT_EQ(skel->bss->cycle_alloc_failed, 0, "self-cycle alloc");
+ ASSERT_EQ(skel->bss->cycle_alloc_mismatch, 0, "cycle returned victim");
+ ASSERT_EQ(skel->bss->stale_alloc_null, 1, "stale sheaf alloc rejected"); /* prog sets this when its final bpf_arena_alloc() returns NULL */
+ ASSERT_EQ(skel->bss->done, 1, "stale pcs trigger completed"); /* stays 0 if the prog bailed out early via an alloc failure */
+
+ arena_slab_freeptr_stale_pcs__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/arena_slab.c b/tools/testing/selftests/bpf/progs/arena_slab.c
new file mode 100644
index 000000000000..738a48c45da3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/arena_slab.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#define BPF_NO_KFUNC_PROTOTYPES
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_experimental.h"
+#include "bpf_arena_common.h"
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARENA);
+ __uint(map_flags, BPF_F_MMAPABLE);
+ __uint(max_entries, 256); /* number of pages */
+#ifdef __TARGET_ARCH_arm64
+ __ulong(map_extra, 0x1ull << 32); /* NOTE(review): presumably the arena's user VA base; arm64 uses a lower value -- confirm */
+#else
+ __ulong(map_extra, 0x1ull << 44);
+#endif
+} arena SEC(".maps"); /* the single arena passed to every bpf_arena_alloc()/bpf_arena_free() call in this file */
+
+void __arena *bpf_arena_alloc(void *map, __u32 size) __ksym __weak;
+void bpf_arena_free(void *map, void __arena *ptr) __ksym __weak;
+void bpf_local_irq_save(unsigned long *flags) __ksym;
+void bpf_local_irq_restore(unsigned long *flags) __ksym;
+
+#define N 64 /* small objects allocated by arena_slab_alloc */
+#define LARGE_N 2 /* entries in large_sizes[] */
+#define CORRUPT_N 8 /* objects freed and poisoned by arena_slab_defer_corrupt */
+#define LEAK_N 16 /* objects intentionally leaked by arena_slab_leak */
+
+int alloc_failed; /* 1-based index of the first failed alloc in arena_slab_alloc, 0 if none */
+int free_done; /* set to 1 at the end of arena_slab_free */
+int corrupt_alloc_failed; /* 1-based index of the first failed alloc in arena_slab_defer_corrupt */
+int corrupt_done; /* set to 1 at the end of arena_slab_defer_corrupt */
+int leak_alloc_failed; /* 1-based index of the first failed alloc in arena_slab_leak */
+int leak_done; /* set to 1 at the end of arena_slab_leak */
+
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+static __u8 __arena *objs[N]; /* filled by arena_slab_alloc, released by arena_slab_free */
+static __u8 __arena *large_objs[LARGE_N];
+static __u64 __arena *corrupt_objs[CORRUPT_N];
+
+/* Sizes > PAGE_SIZE force bpf_arena_alloc() onto the arena_alloc_pages() fallback. */
+static const __u32 large_sizes[LARGE_N] = {
+ PAGE_SIZE + 1,
+ 3 * PAGE_SIZE,
+};
+#endif
+
+SEC("syscall")
+int arena_slab_alloc(void *ctx)
+{
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+ int i;
+
+ for (i = 0; i < N; i++) {
+ __u32 size = 8U << (i & 7); /* 8, 16, 32, ... 1024 */
+ __u8 __arena *p = bpf_arena_alloc(&arena, size);
+
+ if (!p) {
+ alloc_failed = i + 1; /* 1-based so that 0 still means "no failure" */
+ return 0;
+ }
+ /*
+ * Write a slot-dependent sentinel to the first and last byte.
+ * NOTE(review): never read back, so slot aliasing goes unchecked.
+ */
+ p[0] = (__u8)(i + 1);
+ p[size - 1] = (__u8)(i + 1);
+ objs[i] = p;
+ }
+
+ /*
+ * Exercise the >PAGE_SIZE fallback. bpf_arena_alloc() routes these
+ * through arena_alloc_pages(); bpf_arena_free() recovers page_cnt
+ * from page->private and tears the range back down.
+ */
+ for (i = 0; i < LARGE_N; i++) {
+ __u32 size = large_sizes[i];
+ __u8 __arena *p = bpf_arena_alloc(&arena, size);
+
+ if (!p) {
+ alloc_failed = N + i + 1; /* offset by N to distinguish from small-object failures */
+ return 0;
+ }
+ /* Touch first, middle, and last byte to exercise every mapped page. */
+ p[0] = (__u8)(N + i + 1);
+ p[size / 2] = (__u8)(N + i + 1);
+ p[size - 1] = (__u8)(N + i + 1);
+ large_objs[i] = p;
+ }
+#endif
+ return 0;
+}
+
+SEC("syscall")
+int arena_slab_free(void *ctx)
+{
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+ int i;
+
+ for (i = 0; i < N; i++) {
+ if (!objs[i])
+ continue; /* slot was never filled or has already been freed */
+ bpf_arena_free(&arena, objs[i]);
+ objs[i] = NULL; /* lets the prog be run again without a double free */
+ }
+ for (i = 0; i < LARGE_N; i++) {
+ if (!large_objs[i])
+ continue;
+ bpf_arena_free(&arena, large_objs[i]);
+ large_objs[i] = NULL;
+ }
+ free_done = 1; /* userspace asserts on this to confirm the prog ran to the end */
+#endif
+ return 0;
+}
+
+SEC("syscall")
+int arena_slab_defer_corrupt(void *ctx)
+{
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < CORRUPT_N; i++) {
+ corrupt_objs[i] = bpf_arena_alloc(&arena, 4096);
+ if (!corrupt_objs[i]) {
+ corrupt_alloc_failed = i + 1;
+ return 0; /* NOTE(review): objects allocated so far are not freed on this path */
+ }
+ }
+
+ /*
+ * IRQs off so defer_free()'s irq_work IPI stays pending and the chain
+ * accumulates across iterations instead of draining after each free.
+ */
+ bpf_local_irq_save(&flags);
+ for (i = 0; i < CORRUPT_N; i++) {
+ bpf_arena_free(&arena, corrupt_objs[i]); /* the object is deliberately written to after this free */
+ /*
+ * Freepointer slot for non-debug caches is at object_size/2;
+ * 2048 for the 4096-byte bucket. Poison defer_free()'s next.
+ */
+ corrupt_objs[i][2048 / sizeof(__u64)] = 0xdeadbeefdeadbeefULL;
+ }
+ bpf_local_irq_restore(&flags);
+
+ corrupt_done = 1;
+#endif
+ return 0;
+}
+
+/*
+ * Intentionally leak objects across multiple bucket caches; destroying the
+ * arena must still tear down cleanly (no kmem_cache_destroy() WARN, no leak).
+ */
+SEC("syscall")
+int arena_slab_leak(void *ctx)
+{
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+ int i;
+
+ for (i = 0; i < LEAK_N; i++) {
+ __u32 size = 16U << (i & 7); /* 16, 32, ..., 2048 */
+ __u8 __arena *p = bpf_arena_alloc(&arena, size);
+
+ if (!p) {
+ leak_alloc_failed = i + 1;
+ return 0;
+ }
+ p[0] = (__u8)(i + 1); /* p is not stored anywhere, so the object can never be freed by the prog */
+ }
+ leak_done = 1;
+#endif
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c b/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c
new file mode 100644
index 000000000000..4d23d75419d6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0
+#define BPF_NO_KFUNC_PROTOTYPES
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_experimental.h"
+#include "bpf_arena_common.h"
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARENA);
+ __uint(map_flags, BPF_F_MMAPABLE);
+ __uint(max_entries, 256); /* NOTE(review): presumably a page count, as in arena_slab.c -- confirm */
+#ifdef __TARGET_ARCH_arm64
+ __ulong(map_extra, 0x1ull << 32);
+#else
+ __ulong(map_extra, 0x1ull << 44);
+#endif
+} arena SEC(".maps"); /* the single arena used by every alloc/free in this prog */
+
+void __arena *bpf_arena_alloc(void *map, __u32 size) __ksym __weak;
+void bpf_arena_free(void *map, void __arena *ptr) __ksym __weak;
+void bpf_preempt_disable(void) __ksym;
+void bpf_preempt_enable(void) __ksym;
+
+#define OBJ_SIZE 4096
+#define FREEPTR_OFFSET (OBJ_SIZE / 2) /* where the prog expects the in-object freepointer slot -- depends on SLUB layout */
+#define SHEAF_FILL 4 /* frees the prog uses to fill the per-cpu sheaf */
+#define TARGET_IDX SHEAF_FILL /* index of the victim object in objs[] */
+#define EXTRA_IDX (TARGET_IDX + 1) /* one more object, freed late in the sequence */
+#define NR_OBJS (EXTRA_IDX + 1)
+
+int alloc_failed; /* 1-based index of the first failed initial alloc, 0 if none */
+int drain_failed; /* 1-based index of the first failed sheaf-drain alloc, 0 if none */
+int cycle_alloc_failed; /* 1 if the alloc that should return the victim failed */
+int cycle_alloc_mismatch; /* 1 if that alloc returned something other than the victim */
+int stale_alloc_null; /* 1 if the final alloc returned NULL (userspace expects 1) */
+int done; /* 1 once the prog ran to the end without bailing out */
+
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+static __u8 __arena *objs[NR_OBJS];
+#endif
+
+SEC("syscall")
+int arena_slab_freeptr_stale_pcs(void *ctx)
+{
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+ __u8 __arena *victim, *p;
+ __u64 raw;
+ int i;
+
+ for (i = 0; i < NR_OBJS; i++) {
+ objs[i] = bpf_arena_alloc(&arena, OBJ_SIZE);
+ if (!objs[i]) {
+ alloc_failed = i + 1;
+ return 0; /* preemption is not disabled yet, so a plain return is fine here */
+ }
+ objs[i][0] = i + 1;
+ }
+
+ bpf_preempt_disable(); /* NOTE(review): presumably keeps the whole sequence on one CPU's sheaf -- confirm */
+
+ /* Fill the per-cpu sheaf so the next free reaches SLUB proper. */
+ for (i = 1; i <= SHEAF_FILL; i++)
+ bpf_arena_free(&arena, objs[i - 1]);
+
+ victim = objs[TARGET_IDX];
+
+ /*
+ * The 4096-byte bucket has one object per slab and a 4-object sheaf.
+ * Free @victim while the sheaf is full, then turn its encoded NULL
+ * freepointer into any non-NULL decoded value. The arena clamp keeps
+ * non-NULL decoded values in the same slab and object-aligned, so this
+ * becomes a freelist self-cycle back to @victim.
+ */
+ bpf_arena_free(&arena, victim);
+ raw = *(__u64 __arena *)(victim + FREEPTR_OFFSET);
+ *(__u64 __arena *)(victim + FREEPTR_OFFSET) = raw ^ 1; /* flip the low bit of the stored freepointer */
+
+ for (i = 0; i < SHEAF_FILL; i++) {
+ p = bpf_arena_alloc(&arena, OBJ_SIZE);
+ if (!p) {
+ drain_failed = i + 1;
+ goto out; /* must pass through bpf_preempt_enable() */
+ }
+ }
+
+ p = bpf_arena_alloc(&arena, OBJ_SIZE);
+ if (!p) {
+ cycle_alloc_failed = 1;
+ goto out;
+ }
+ if (p != victim)
+ cycle_alloc_mismatch = 1; /* recorded but not fatal: the sequence continues */
+
+ for (i = 0; i < SHEAF_FILL; i++)
+ bpf_arena_free(&arena, victim); /* deliberate repeated free of the same pointer */
+
+ /*
+ * The sheaf is full of duplicate victim pointers now. Free the four
+ * filler objects plus one extra object directly to SLUB, leaving enough
+ * partial slabs that the next target-slab zero-inuse transition discards
+ * the target page instead of keeping it on the partial list.
+ */
+ for (i = 0; i < SHEAF_FILL; i++)
+ bpf_arena_free(&arena, objs[i]);
+ bpf_arena_free(&arena, objs[EXTRA_IDX]);
+
+ bpf_arena_free(&arena, victim);
+
+ p = bpf_arena_alloc(&arena, OBJ_SIZE);
+ if (!p)
+ stale_alloc_null = 1; /* the outcome userspace asserts on */
+
+ done = 1;
+out:
+ bpf_preempt_enable();
+#endif
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 11+ messages in thread