BPF List
 help / color / mirror / Atom feed
* [PATCH bpf-next 0/4] bpf,slab: Introduce bpf_arena_alloc() kfuncs
@ 2026-05-29 20:24 Alexei Starovoitov
  2026-05-29 20:24 ` [PATCH bpf-next 1/4] slab: Introduce kmem_cache_alloc_nolock() Alexei Starovoitov
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Alexei Starovoitov @ 2026-05-29 20:24 UTC (permalink / raw)
  To: bpf
  Cc: daniel, andrii, memxor, eddyz87, vbabka, harry.yoo, david, tj,
	roman.gushchin, peterz, linux-mm

From: Alexei Starovoitov <ast@kernel.org>

Let BPF programs and the kernel allocate objects in a bpf arena via a
kvmalloc-style API: the bpf_arena_alloc() kfunc.

Currently sched-ext BPF programs implement various allocators
as BPF programs on top of the bpf_arena_alloc_pages() kfunc.
This is inefficient.

Recently the sched-ext kernel core introduced scx_arena_alloc(), backed
by gen_pool_alloc(), to allocate arena memory as well.
This is also inefficient and gen_pool_alloc() is not fast enough.

Hence adopt sheaves and slub as the allocator.
See patch 3 for algorithm details.

Alexei Starovoitov (4):
  slab: Introduce kmem_cache_alloc_nolock()
  slub: Pass kmem_cache to alloc_slab_page()
  bpf,slab: Add slub-backed allocator for bpf_arena
  selftests/bpf: Add tests for arena slub-backed allocator

 include/linux/bpf_defs.h                      |  13 +
 include/linux/slab.h                          |  27 ++
 kernel/bpf/Kconfig                            |   3 +
 kernel/bpf/arena.c                            | 366 +++++++++++++++++-
 mm/slab.h                                     |   6 +-
 mm/slab_common.c                              |   2 +-
 mm/slub.c                                     | 201 +++++++++-
 .../selftests/bpf/prog_tests/arena_slab.c     |  59 +++
 .../prog_tests/arena_slab_freeptr_stale_pcs.c |  28 ++
 .../testing/selftests/bpf/progs/arena_slab.c  | 179 +++++++++
 .../bpf/progs/arena_slab_freeptr_stale_pcs.c  | 120 ++++++
 11 files changed, 977 insertions(+), 27 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_slab.c
 create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_slab_freeptr_stale_pcs.c
 create mode 100644 tools/testing/selftests/bpf/progs/arena_slab.c
 create mode 100644 tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c

-- 
2.53.0-Meta


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH bpf-next 1/4] slab: Introduce kmem_cache_alloc_nolock()
  2026-05-29 20:24 [PATCH bpf-next 0/4] bpf,slab: Introduce bpf_arena_alloc() kfuncs Alexei Starovoitov
@ 2026-05-29 20:24 ` Alexei Starovoitov
  2026-05-29 21:20   ` sashiko-bot
  2026-05-29 20:24 ` [PATCH bpf-next 2/4] slub: Pass kmem_cache to alloc_slab_page() Alexei Starovoitov
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 11+ messages in thread
From: Alexei Starovoitov @ 2026-05-29 20:24 UTC (permalink / raw)
  To: bpf
  Cc: daniel, andrii, memxor, eddyz87, vbabka, harry.yoo, david, tj,
	roman.gushchin, peterz, linux-mm

From: Alexei Starovoitov <ast@kernel.org>

Add kmem_cache_alloc_nolock_noprof() and its kmem_cache_alloc_nolock()
alloc_hooks() wrapper, mirroring kmalloc_nolock(). Allocates one object
from a specific kmem_cache and is safe from any context, including NMI
and IRQ-off, returning NULL on transient trylock failure.

The new function is needed by the upcoming SLAB_BPF_ARENA kfunc path so
arena object allocation can run from BPF program context.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/slab.h |  5 +++++
 mm/slub.c            | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 2b5ab488e96b..7ce9125a6a2c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -958,6 +958,11 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f
 void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node);
 #define kmalloc_nolock(...)			alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__))
 
+void *kmem_cache_alloc_nolock_noprof(struct kmem_cache *s, gfp_t gfp_flags,
+				     int node);
+#define kmem_cache_alloc_nolock(...)		\
+	alloc_hooks(kmem_cache_alloc_nolock_noprof(__VA_ARGS__))
+
 /**
  * __alloc_objs - Allocate objects of a given type using
  * @KMALLOC: which size-based kmalloc wrapper to allocate with.
diff --git a/mm/slub.c b/mm/slub.c
index a2bf3756ca7d..601986aaebdf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5402,6 +5402,45 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
 }
 EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
 
+/**
+ * kmem_cache_alloc_nolock - Allocate one object from a specific cache,
+ * safe from any context (including NMI/IRQ-off), like kmalloc_nolock().
+ *
+ * Returns NULL on failure (including the trylock paths that may transiently
+ * fail under contention).
+ */
+void *kmem_cache_alloc_nolock_noprof(struct kmem_cache *s, gfp_t gfp_flags,
+				     int node)
+{
+	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
+	void *ret;
+
+	VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
+				      __GFP_NO_OBJ_EXT));
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+		return NULL;
+	if (!IS_ENABLED(CONFIG_SMP) && in_nmi())
+		return NULL;
+
+	if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
+		return NULL;
+
+	ret = alloc_from_pcs(s, alloc_gfp, node);
+	if (!ret)
+		ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_,
+					s->object_size);
+	if (!ret)
+		return NULL;
+
+	maybe_wipe_obj_freeptr(s, ret);
+	slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
+			     slab_want_init_on_alloc(alloc_gfp, s),
+			     s->object_size);
+	return kasan_kmalloc(s, ret, s->object_size, alloc_gfp);
+}
+EXPORT_SYMBOL_GPL(kmem_cache_alloc_nolock_noprof);
+
 void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
 					 int node, unsigned long caller)
 {
-- 
2.53.0-Meta


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH bpf-next 2/4] slub: Pass kmem_cache to alloc_slab_page()
  2026-05-29 20:24 [PATCH bpf-next 0/4] bpf,slab: Introduce bpf_arena_alloc() kfuncs Alexei Starovoitov
  2026-05-29 20:24 ` [PATCH bpf-next 1/4] slab: Introduce kmem_cache_alloc_nolock() Alexei Starovoitov
@ 2026-05-29 20:24 ` Alexei Starovoitov
  2026-05-29 20:24 ` [PATCH bpf-next 3/4] bpf,slab: Add slub-backed allocator for bpf_arena Alexei Starovoitov
  2026-05-29 20:24 ` [PATCH bpf-next 4/4] selftests/bpf: Add tests for arena slub-backed allocator Alexei Starovoitov
  3 siblings, 0 replies; 11+ messages in thread
From: Alexei Starovoitov @ 2026-05-29 20:24 UTC (permalink / raw)
  To: bpf
  Cc: daniel, andrii, memxor, eddyz87, vbabka, harry.yoo, david, tj,
	roman.gushchin, peterz, linux-mm

From: Alexei Starovoitov <ast@kernel.org>

Add a struct kmem_cache *s parameter to alloc_slab_page() and update
the two call sites in allocate_slab(). No functional change.

This is a preparatory step for the upcoming SLAB_BPF_ARENA support,
where alloc_slab_page() needs to consult cache-specific state to route
arena caches through bpf_arena_alloc_slab_page() instead of the page
allocator.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 mm/slub.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 601986aaebdf..1daa89105e04 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3261,7 +3261,8 @@ static void barn_shrink(struct kmem_cache *s, struct node_barn *barn)
 /*
  * Slab allocation and freeing
  */
-static inline struct slab *alloc_slab_page(gfp_t flags, int node,
+static inline struct slab *alloc_slab_page(struct kmem_cache *s, gfp_t flags,
+					   int node,
 					   struct kmem_cache_order_objects oo,
 					   bool allow_spin)
 {
@@ -3464,7 +3465,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	 * __GFP_RECLAIM could be cleared on the first allocation attempt,
 	 * so pass allow_spin flag directly.
 	 */
-	slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
+	slab = alloc_slab_page(s, alloc_gfp, node, oo, allow_spin);
 	if (unlikely(!slab)) {
 		oo = s->min;
 		alloc_gfp = flags;
@@ -3472,7 +3473,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 		 * Allocation may have failed due to fragmentation.
 		 * Try a lower order alloc if possible
 		 */
-		slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
+		slab = alloc_slab_page(s, alloc_gfp, node, oo, allow_spin);
 		if (unlikely(!slab))
 			return NULL;
 		stat(s, ORDER_FALLBACK);
-- 
2.53.0-Meta


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH bpf-next 3/4] bpf,slab: Add slub-backed allocator for bpf_arena
  2026-05-29 20:24 [PATCH bpf-next 0/4] bpf,slab: Introduce bpf_arena_alloc() kfuncs Alexei Starovoitov
  2026-05-29 20:24 ` [PATCH bpf-next 1/4] slab: Introduce kmem_cache_alloc_nolock() Alexei Starovoitov
  2026-05-29 20:24 ` [PATCH bpf-next 2/4] slub: Pass kmem_cache to alloc_slab_page() Alexei Starovoitov
@ 2026-05-29 20:24 ` Alexei Starovoitov
  2026-05-29 22:04   ` sashiko-bot
  2026-05-29 20:24 ` [PATCH bpf-next 4/4] selftests/bpf: Add tests for arena slub-backed allocator Alexei Starovoitov
  3 siblings, 1 reply; 11+ messages in thread
From: Alexei Starovoitov @ 2026-05-29 20:24 UTC (permalink / raw)
  To: bpf
  Cc: daniel, andrii, memxor, eddyz87, vbabka, harry.yoo, david, tj,
	roman.gushchin, peterz, linux-mm

From: Alexei Starovoitov <ast@kernel.org>

Let BPF programs allocate typed objects in a bpf_arena via a
kvmalloc-style API: bpf_arena_alloc() routes requests up to
PAGE_SIZE through per-arena slab buckets, and falls back to
arena_alloc_pages() for larger sizes -- analogous to kvmalloc()
choosing between kmalloc and vmalloc by size. The first page of a
fallback allocation is stashed in arena->slab_pages[pgoff] (without
PageSlab) with page_cnt in page->private, so bpf_arena_free() can
recover the multi-page allocation from the arena offset alone and
release it via arena_free_pages().

Each arena page now has two kernel VAs that alias the same bytes:
the page allocator's direct-map VA, and the arena's vmalloc mapping
at kern_vm_start + uaddr32. slub uses only the direct-map view --
slab_address(), virt_to_slab(), in-object freepointers, percpu
sheaves, partial lists all work unchanged. BPF programs see the
arena view via kern_vm_start + (u32)ptr addressing. Translation between
the two windows happens only at the bpf_arena_alloc/free kfunc boundary.

slub side:

  - get_freepointer() clamps the decoded pointer to the same slab
    page via (object & ~slab_mask) | (decoded & slab_mask), NULL
    preserved. Worst case under BPF corruption: chain aliases within
    one arena page.

  - arena_alloc_slab_page() stashes uaddr32 in slab->stride via
    slab_set_stride(); arena_slab_uaddr32() reads it back via
    slab_get_stride(). alloc_slab_obj_exts_early() is skipped for
    SLAB_BPF_ARENA so its own slab_set_stride() doesn't clobber the
    stash.

  - Arena caches get percpu sheaves sized by object size like any
    other runtime cache.

  - __refill_objects_node()'s trailing freelist walk is bounded by
    slab->objects so a BPF-induced freepointer cycle can't loop
    forever.

arena side:

  - Per-arena kmalloc-style bucket caches built at map_alloc cover
    sizes up to PAGE_SIZE; larger requests fall back to
    arena_alloc_pages().
  - slab_pages[pgoff] gives O(1) page lookup, and also anchors
    fallback multi-page allocations for bpf_arena_free().
  - bpf_arena_alloc: kmem_cache_alloc_nolock -> slab_get_stride -> uaddr32.
  - bpf_arena_free: slab_pages[pgoff] -> direct-map kva -> kfree_nolock,
    or arena_free_pages() when page->private records a multi-page span.
  - apply_range_clear_cb() leaves PTEs of PageSlab pages installed
    and skips __free_page(), so bpf_arena_free_pages() on a slab-backed
    offset can't free a page out from under slub. The page is torn
    down later by arena_free_slab_page() after __ClearPageSlab().

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_defs.h |  13 ++
 include/linux/slab.h     |  22 +++
 kernel/bpf/Kconfig       |   3 +
 kernel/bpf/arena.c       | 366 ++++++++++++++++++++++++++++++++++++++-
 mm/slab.h                |   6 +-
 mm/slab_common.c         |   2 +-
 mm/slub.c                | 155 +++++++++++++++--
 7 files changed, 543 insertions(+), 24 deletions(-)

diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
index 2185cd3966d4..e271ae78c4ce 100644
--- a/include/linux/bpf_defs.h
+++ b/include/linux/bpf_defs.h
@@ -6,14 +6,27 @@
 #ifndef _LINUX_BPF_DEFS_H
 #define _LINUX_BPF_DEFS_H
 
+#include <linux/types.h>
+
+struct slab;
+
 #ifdef CONFIG_BPF_SYSCALL
 bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
+struct slab *bpf_arena_alloc_slab_page(void *arena, gfp_t flags, int node,
+				       bool allow_spin);
+void bpf_arena_free_slab_page(void *arena, struct slab *slab);
 #else
 static inline bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
 					       unsigned long fault_ip)
 {
 	return false;
 }
+static inline struct slab *bpf_arena_alloc_slab_page(void *arena, gfp_t flags,
+						     int node, bool allow_spin)
+{
+	return NULL;
+}
+static inline void bpf_arena_free_slab_page(void *arena, struct slab *slab) { }
 #endif
 
 #endif /* _LINUX_BPF_DEFS_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7ce9125a6a2c..6c6f1ba83c7d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -62,6 +62,7 @@ enum _slab_flag_bits {
 #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
 	_SLAB_OBJ_EXT_IN_OBJ,
 #endif
+	_SLAB_BPF_ARENA,
 	_SLAB_FLAGS_LAST_BIT
 };
 
@@ -248,6 +249,15 @@ enum _slab_flag_bits {
 #define SLAB_OBJ_EXT_IN_OBJ	__SLAB_FLAG_UNUSED
 #endif
 
+/*
+ * Cache is backed by bpf_arena pages instead of the page allocator.
+ * Slab pages live in the arena's kernel vmalloc range and are visible to
+ * BPF programs via 32-bit arena addressing. Freepointers stored inside
+ * free objects may be scribbled by BPF; get_freepointer() reconstructs a
+ * pointer that is always within the arena's 4GB window.
+ */
+#define SLAB_BPF_ARENA		__SLAB_FLAG_BIT(_SLAB_BPF_ARENA)
+
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
  *
@@ -372,6 +382,15 @@ struct kmem_cache_args {
 	 * %0 means no sheaves will be created.
 	 */
 	unsigned int sheaf_capacity;
+	/**
+	 * @bpf_arena: Opaque arena pointer for SLAB_BPF_ARENA caches.
+	 *
+	 * When non-%NULL, slab pages for this cache are sourced from the
+	 * arena via bpf_arena_alloc_slab_page()/bpf_arena_free_slab_page(),
+	 * and freepointer reads are sanitized to remain inside the arena.
+	 * Caller must also pass %SLAB_BPF_ARENA in the flags argument.
+	 */
+	void *bpf_arena;
 };
 
 struct kmem_cache *__kmem_cache_create_args(const char *name,
@@ -963,6 +982,9 @@ void *kmem_cache_alloc_nolock_noprof(struct kmem_cache *s, gfp_t gfp_flags,
 #define kmem_cache_alloc_nolock(...)		\
 	alloc_hooks(kmem_cache_alloc_nolock_noprof(__VA_ARGS__))
 
+struct slab;
+void kmem_cache_force_discard_slab(struct kmem_cache *s, struct slab *slab);
+
 /**
  * __alloc_objs - Allocate objects of a given type using
  * @KMALLOC: which size-based kmalloc wrapper to allocate with.
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index eb3de35734f0..42ef4fc3a6bd 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -34,6 +34,9 @@ config BPF_SYSCALL
 	select NET_SOCK_MSG if NET
 	select NET_XGRESS if NET
 	select PAGE_POOL if NET
+	# bpf_arena_alloc()/free() stashes uaddr32 in slab->stride which only
+	# becomes a real field with CONFIG_SLAB_OBJ_EXT.
+	select SLAB_OBJ_EXT if MMU && 64BIT
 	default n
 	help
 	  Enable the bpf() system call that allows to manipulate BPF programs
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 1727503b25d8..807d806856d7 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -10,7 +10,9 @@
 #include <linux/btf_ids.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <asm/tlbflush.h>
+#include "../../mm/slab.h"
 #include "range_tree.h"
 
 /*
@@ -48,6 +50,14 @@
 
 static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable);
 
+/*
+ * Per-arena slab buckets. Mirrors the kmalloc size classes (powers of 2)
+ * up to one page.
+ */
+#define ARENA_KMALLOC_MIN_SHIFT		KMALLOC_SHIFT_LOW
+#define ARENA_KMALLOC_MAX_SHIFT		PAGE_SHIFT
+#define ARENA_KMALLOC_NUM_BUCKETS	(ARENA_KMALLOC_MAX_SHIFT + 1)
+
 struct bpf_arena {
 	struct bpf_map map;
 	u64 user_vm_start;
@@ -63,10 +73,20 @@ struct bpf_arena {
 	struct irq_work     free_irq;
 	struct work_struct  free_work;
 	struct llist_head   free_spans;
+
+	/*
+	 * SLAB_BPF_ARENA: kva <-> arena offset translation at the kfunc
+	 * boundary. Forward (kva -> uaddr32) via slab->stride; reverse
+	 * (uaddr32 -> page) via @slab_pages[pgoff], sized to max_entries.
+	 */
+	struct page **slab_pages;
+	struct kmem_cache *kmalloc_caches[ARENA_KMALLOC_NUM_BUCKETS];
 };
 
 static void arena_free_worker(struct work_struct *work);
 static void arena_free_irq(struct irq_work *iw);
+static int arena_init_slab_caches(struct bpf_arena *arena);
+static void arena_destroy_slab_caches(struct bpf_arena *arena);
 
 struct arena_free_span {
 	struct llist_node node;
@@ -143,6 +163,7 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
 struct apply_range_data {
 	struct page **pages;
 	int i;
+	bool set_page_slab;
 };
 
 struct clear_range_data {
@@ -166,6 +187,13 @@ static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
 	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
 		return -EINVAL;
 
+	/*
+	 * Tag PageSlab under arena->spinlock so a racing bpf_arena_free_pages()
+	 * sees the page as slub-owned (apply_range_clear_cb skips PageSlab).
+	 */
+	if (d->set_page_slab)
+		__SetPageSlab(page);
+
 	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
 	d->i++;
 	return 0;
@@ -179,9 +207,22 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
 static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
 {
 	struct clear_range_data *d = data;
-	pte_t old_pte;
+	pte_t old_pte, cur;
 	struct page *page;
 
+	/*
+	 * Skip slub-owned pages: BPF must use bpf_arena_free() for per-object
+	 * slab frees. The PTE stays; slub releases it via arena_free_slab_page()
+	 * after __ClearPageSlab(). Non-atomic ptep_get() is safe -- ptep_try_set()
+	 * only fires on pte_none, and arena_free_slab_page() can't race on this
+	 * offset (range stays allocated in range_tree for our walk).
+	 */
+	cur = ptep_get(pte);
+	if (pte_none(cur) || !pte_present(cur))
+		return 0;
+	if (PageSlab(pte_page(cur)))
+		return 0;
+
 	/*
 	 * Pairs with ptep_try_set() in the kernel-fault scratch installer.
 	 * Both sides must be atomic.
@@ -290,12 +331,25 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 		goto err_free_scratch;
 	mutex_init(&arena->lock);
 	raw_res_spin_lock_init(&arena->spinlock);
+	arena->slab_pages = bpf_map_area_alloc(attr->max_entries *
+					       sizeof(arena->slab_pages[0]),
+					       numa_node);
+	if (!arena->slab_pages) {
+		err = -ENOMEM;
+		goto err_destroy_rt;
+	}
 	err = populate_pgtable_except_pte(arena);
 	if (err)
-		goto err_destroy_rt;
+		goto err_free_slab_pages;
+
+	err = arena_init_slab_caches(arena);
+	if (err)
+		goto err_free_slab_pages;
 
 	return &arena->map;
 
+err_free_slab_pages:
+	bpf_map_area_free(arena->slab_pages);
 err_destroy_rt:
 	range_tree_destroy(&arena->rt);
 err_free_scratch:
@@ -347,6 +401,9 @@ static void arena_map_free(struct bpf_map *map)
 	if (WARN_ON_ONCE(!list_empty(&arena->vma_list)))
 		return;
 
+	/* Tear down slab caches first so all slab-backed pages return to arena. */
+	arena_destroy_slab_caches(arena);
+
 	/* Ensure no pending deferred frees */
 	irq_work_sync(&arena->free_irq);
 	flush_work(&arena->free_work);
@@ -359,6 +416,7 @@ static void arena_map_free(struct bpf_map *map)
 	 */
 	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
 				     SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
+	bpf_map_area_free(arena->slab_pages);
 	free_vm_area(arena->kern_vm);
 	range_tree_destroy(&arena->rt);
 	__free_page(arena->scratch_page);
@@ -461,6 +519,9 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		if (page == arena->scratch_page)
 			/* BPF triggered scratch here; don't lazy-alloc over it */
 			goto out_sigsegv;
+		if (PageSlab(page))
+			/* Don't return slab-backed arena page */
+			goto out_sigsegv;
 		/* already have a page vmap-ed */
 		goto out;
 	}
@@ -625,7 +686,8 @@ static u64 clear_lo32(u64 val)
  * Later the pages will be mmaped into user space vma.
  */
 static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id,
-			      bool sleepable)
+			      bool sleepable, bool set_page_slab,
+			      struct page **out_page)
 {
 	/* user_vm_end/start are fixed before bpf prog runs */
 	long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
@@ -633,6 +695,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	struct mem_cgroup *new_memcg, *old_memcg;
 	struct apply_range_data data;
 	struct page **pages = NULL;
+	struct page *first_page = NULL;
 	long remaining, mapped = 0;
 	long alloc_pages;
 	unsigned long flags;
@@ -647,6 +710,13 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	if (page_cnt > page_cnt_max)
 		return 0;
 
+	/*
+	 * out-path rollback can't undo PageSlab on prior batches; restrict
+	 * set_page_slab to the single-page arena_alloc_slab_page() caller.
+	 */
+	if (WARN_ON_ONCE(set_page_slab && page_cnt > 1))
+		return 0;
+
 	if (uaddr) {
 		if (uaddr & ~PAGE_MASK)
 			return 0;
@@ -665,6 +735,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 		return 0;
 	}
 	data.pages = pages;
+	data.set_page_slab = set_page_slab;
 
 	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
 		goto out_free_pages;
@@ -695,6 +766,9 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 		if (ret)
 			goto out;
 
+		if (!first_page)
+			first_page = pages[0];
+
 		/*
 		 * Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
 		 * will not overflow 32-bit. Lower 32-bit need to represent
@@ -720,6 +794,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	}
 	flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
+	if (out_page)
+		*out_page = first_page;
 	kfree_nolock(pages);
 	bpf_map_memcg_exit(old_memcg, new_memcg);
 	return clear_lo32(arena->user_vm_start) + uaddr32;
@@ -758,8 +834,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 {
 	struct mem_cgroup *new_memcg, *old_memcg;
 	u64 full_uaddr, uaddr_end;
-	long kaddr, pgoff;
-	struct page *page;
+	long kaddr, pgoff, i;
+	struct page *page, *fb_page;
 	struct llist_head free_pages;
 	struct llist_node *pos, *t;
 	struct arena_free_span *s;
@@ -778,6 +854,21 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 
 	page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
 	pgoff = compute_pgoff(arena, uaddr);
+
+	/*
+	 * Drop bookkeeping for any bpf_arena_alloc() fallback pages within the
+	 * freed range. PageSlab entries are owned by slub and must not be
+	 * cleared here; slub clears them via bpf_arena_free_slab_page() when
+	 * the slab page is released.
+	 */
+	for (i = 0; i < page_cnt; i++) {
+		fb_page = READ_ONCE(arena->slab_pages[pgoff + i]);
+		if (fb_page && !PageSlab(fb_page)) {
+			WRITE_ONCE(arena->slab_pages[pgoff + i], NULL);
+			set_page_private(fb_page, 0);
+		}
+	}
+
 	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
 
 	if (!sleepable)
@@ -952,6 +1043,135 @@ static void arena_free_irq(struct irq_work *iw)
 	schedule_work(&arena->free_work);
 }
 
+/*
+ * SLAB_BPF_ARENA: per-arena kmem_cache buckets backing bpf_arena_alloc/free.
+ * Slab pages come from the arena pool; slub uses direct-map VAs internally,
+ * BPF sees the arena vmalloc view, translation happens at the kfunc boundary.
+ */
+struct slab *bpf_arena_alloc_slab_page(void *arena_p, gfp_t flags, int node,
+				       bool allow_spin)
+{
+	struct bpf_arena *arena = arena_p;
+	long ret_user_va;
+	struct page *page;
+	struct slab *slab;
+	u32 uaddr32;
+
+	/*
+	 * set_page_slab=true makes apply_range_set_cb() tag PageSlab under
+	 * arena->spinlock so a racing bpf_arena_free_pages() can't free it.
+	 */
+	ret_user_va = arena_alloc_pages(arena, 0, 1, node, allow_spin, true, &page);
+	if (!ret_user_va)
+		return NULL;
+
+	uaddr32 = (u32)ret_user_va;
+	slab = page_slab(page);
+	/*
+	 * Stash uaddr32 in slab->stride; allocate_slab() skips
+	 * alloc_slab_obj_exts_early() for SLAB_BPF_ARENA so it survives.
+	 */
+	slab_set_stride(slab, uaddr32);
+	WRITE_ONCE(arena->slab_pages[uaddr32 >> PAGE_SHIFT], page);
+
+	return slab;
+}
+
+static u32 arena_slab_uaddr32(const struct slab *slab)
+{
+	return slab_get_stride((struct slab *)slab);
+}
+
+void bpf_arena_free_slab_page(void *arena_p, struct slab *slab)
+{
+	struct bpf_arena *arena = arena_p;
+	u32 uaddr32 = arena_slab_uaddr32(slab);
+
+	WRITE_ONCE(arena->slab_pages[uaddr32 >> PAGE_SHIFT], NULL);
+	arena_free_pages(arena, uaddr32, 1, false);
+}
+
+static int arena_init_slab_caches(struct bpf_arena *arena)
+{
+	char name[KSYM_NAME_LEN];
+	unsigned int i;
+
+	for (i = ARENA_KMALLOC_MIN_SHIFT; i < ARENA_KMALLOC_NUM_BUCKETS; i++) {
+		struct kmem_cache *c;
+		struct kmem_cache_args args = {
+			.align		= sizeof(void *),
+			.bpf_arena	= arena,
+		};
+
+		snprintf(name, sizeof(name), "arena-%lx-%u",
+			 (unsigned long)arena, 1U << i);
+		c = kmem_cache_create(name, 1U << i, &args, SLAB_BPF_ARENA);
+		if (!c)
+			goto err;
+		arena->kmalloc_caches[i] = c;
+	}
+	return 0;
+err:
+	arena_destroy_slab_caches(arena);
+	return -ENOMEM;
+}
+
+static void arena_destroy_slab_caches(struct bpf_arena *arena)
+{
+	long max = arena->map.max_entries;
+	unsigned int i;
+	long pgoff;
+
+	/*
+	 * Drain per-cpu sheaves of every bucket before walking slab_pages[].
+	 * Sheaves cache pointers into slab pages that the force-discard loop
+	 * is about to release; kmem_cache_shrink() flushes those caches back
+	 * into their slabs (and frees any slab that becomes empty), so the
+	 * later force-discard cannot trigger __slab_free() on memory that has
+	 * since been recycled. Frees triggered here go through
+	 * bpf_arena_free_slab_page() which clears arena->slab_pages[], so
+	 * those entries become NULL and the loop below skips them.
+	 */
+	for (i = ARENA_KMALLOC_MIN_SHIFT; i < ARENA_KMALLOC_NUM_BUCKETS; i++) {
+		if (!arena->kmalloc_caches[i])
+			continue;
+		kmem_cache_shrink(arena->kmalloc_caches[i]);
+	}
+
+	/*
+	 * Force-discard every slab page slub still tracks via slab_pages[].
+	 * Catches orphans not on n->partial (trylock failures in __slab_free)
+	 * and BPF-leaked slabs with inuse > 0; without this kmem_cache_destroy()
+	 * would see n->nr_slabs > 0, WARN, and leak the kmem_cache descriptor.
+	 */
+	for (pgoff = 0; pgoff < max; pgoff++) {
+		struct page *page = arena->slab_pages[pgoff];
+		struct slab *slab;
+
+		if (!page)
+			continue;
+		if (!PageSlab(page))
+			/*
+			 * Leftover bpf_arena_alloc() fallback page; freed by
+			 * existing_page_cb() in arena_map_free().
+			 */
+			continue;
+		slab = page_slab(page);
+		kmem_cache_force_discard_slab(slab->slab_cache, slab);
+	}
+
+	/* Let deferred page frees from the discard pass run before teardown. */
+	irq_work_sync(&arena->free_irq);
+	flush_work(&arena->free_work);
+
+	for (i = 0; i < ARENA_KMALLOC_NUM_BUCKETS; i++) {
+		if (!arena->kmalloc_caches[i])
+			continue;
+		kmem_cache_destroy(arena->kmalloc_caches[i]);
+		arena->kmalloc_caches[i] = NULL;
+	}
+}
+
 __bpf_kfunc_start_defs();
 
 __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
@@ -963,7 +1183,8 @@ __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_
 	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
 		return NULL;
 
-	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id,
+					 true, false, NULL);
 }
 
 void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
@@ -975,7 +1196,8 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
 	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
 		return NULL;
 
-	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
+	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id,
+					 false, false, NULL);
 }
 
 void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
@@ -987,7 +1209,8 @@ void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cn
 	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
 		return NULL;
 
-	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id,
+					 true, false, NULL);
 }
 
 __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
@@ -1023,12 +1246,139 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c
 
 	return arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
 }
+
+/*
+ * bpf_arena_alloc: allocate one object of @size bytes from arena @p__map.
+ * Returns clear_lo32(arena->user_vm_start) plus the 32-bit arena address,
+ * which BPF programs use as a void __arena *, or NULL if @p__map is not an
+ * arena, @size is 0, or the allocation fails.
+ *
+ * Up to 1 << ARENA_KMALLOC_MAX_SHIFT bytes come from slab buckets: slub
+ * returns a direct-map kva and the slab page carries the arena uaddr32 in
+ * slab->stride. Larger sizes fall back to arena_alloc_pages(); the first
+ * page is stashed in arena->slab_pages[pgoff] (without PageSlab) with
+ * page_cnt in page->private so bpf_arena_free() can find it again.
+ */
+__bpf_kfunc void *bpf_arena_alloc(void *p__map, u32 size)
+{
+	struct bpf_map *map = p__map;
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+	struct kmem_cache *c;
+	struct slab *slab;
+	unsigned int idx;
+	void *kva;
+	u32 uaddr32;
+
+	if (map->map_type != BPF_MAP_TYPE_ARENA || !size)
+		return NULL;
+	if (size > (1U << ARENA_KMALLOC_MAX_SHIFT)) {
+		struct page *first_page;
+		long ret_user_va;
+		u32 page_cnt, pgoff;
+
+		page_cnt = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
+		if (!page_cnt)
+			return NULL;
+		/* sleepable=false mirrors kmem_cache_alloc_nolock() */
+		ret_user_va = arena_alloc_pages(arena, 0, page_cnt, NUMA_NO_NODE,
+						false, false, &first_page);
+		if (!ret_user_va)
+			return NULL;
+		pgoff = (u32)ret_user_va >> PAGE_SHIFT;
+		set_page_private(first_page, page_cnt);
+		WRITE_ONCE(arena->slab_pages[pgoff], first_page);
+		return (void *)ret_user_va;
+	}
+
+	idx = max_t(unsigned int, fls(size - 1), ARENA_KMALLOC_MIN_SHIFT);
+	if (idx >= ARENA_KMALLOC_NUM_BUCKETS)
+		return NULL;
+	c = arena->kmalloc_caches[idx];
+	if (!c)
+		return NULL;
+
+	/*
+	 * Use the nolock variant so this kfunc is safe from any context.
+	 * Skip __GFP_ACCOUNT because memcg charging already happens at
+	 * the arena page level.
+	 */
+	kva = kmem_cache_alloc_nolock(c, 0, NUMA_NO_NODE);
+	if (!kva)
+		return NULL;
+
+	slab = virt_to_slab(kva);
+	if (!slab || slab->slab_cache != c) {
+		bpf_prog_report_arena_violation(true, (long)kva, _RET_IP_);
+		return NULL;
+	}
+	uaddr32 = arena_slab_uaddr32(slab) |
+		  ((u32)(unsigned long)kva & ~PAGE_MASK);
+	return (void *)(clear_lo32(arena->user_vm_start) + uaddr32);
+}
+
+/*
+ * bpf_arena_free: free an object returned by bpf_arena_alloc(). The arena
+ * offset's page selects arena->slab_pages[]: a multi-page allocation or a
+ * slab page whose slab_cache->bpf_arena must match. Slab objects go to
+ * kfree_nolock() via their direct-map kva. Bad offsets report a violation.
+ */
+__bpf_kfunc void bpf_arena_free(void *p__map, void *ptr__ign)
+{
+	struct bpf_map *map = p__map;
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+	struct page *page;
+	struct slab *slab;
+	u32 arena_off, pgoff, page_cnt;
+	void *kva;
+
+	if (map->map_type != BPF_MAP_TYPE_ARENA || !ptr__ign)
+		return;
+
+	arena_off = (u32)(unsigned long)ptr__ign;
+	pgoff = arena_off >> PAGE_SHIFT;
+	if (pgoff >= arena->map.max_entries)
+		goto violation;
+	page = READ_ONCE(arena->slab_pages[pgoff]);
+	if (!page)
+		goto violation;
+	if (!PageSlab(page)) {
+		/*
+		 * Multi-page fallback: claim the slot atomically so racing
+		 * frees cannot both read page->private and free the range.
+		 */
+		if (cmpxchg(&arena->slab_pages[pgoff], page, NULL) != page)
+			goto violation;
+		page_cnt = page_private(page);
+		set_page_private(page, 0);
+		arena_free_pages(arena, arena_off, page_cnt, false);
+		return;
+	}
+	slab = page_slab(page);
+	if (slab->slab_cache->bpf_arena != arena)
+		goto violation;
+	/*
+	 * Reject arena offsets that do not land on an object boundary. Arena
+	 * bucket caches have power-of-two s->size, so a simple IS_ALIGNED()
+	 * suffices; without this kfree_nolock() would set a freepointer inside
+	 * an unrelated object on the same slab page.
+	 */
+	if (!IS_ALIGNED(arena_off, slab->slab_cache->size))
+		goto violation;
+	kva = page_to_virt(page) + (arena_off & ~PAGE_MASK);
+	/* nolock free mirrors the nolock alloc — safe from any context. */
+	kfree_nolock(kva);
+	return;
+violation:
+	bpf_prog_report_arena_violation(true, arena_off, _RET_IP_);
+}
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(arena_kfuncs)
 BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2)
 BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2)
 BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_alloc, KF_ARENA_RET)
+BTF_ID_FLAGS(func, bpf_arena_free, KF_ARENA_ARG2)
 BTF_KFUNCS_END(arena_kfuncs)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/mm/slab.h b/mm/slab.h
index bf2f87acf5e3..2b0272c3f5fe 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -248,6 +248,9 @@ struct kmem_cache {
 	struct kmem_cache_stats __percpu *cpu_stats;
 #endif
 
+	/* NULL unless SLAB_BPF_ARENA; opaque arena pointer. */
+	void *bpf_arena;
+
 	struct kmem_cache_per_node_ptrs per_node[MAX_NUMNODES];
 };
 
@@ -414,7 +417,8 @@ void flush_rcu_sheaves_on_cache(struct kmem_cache *s);
 			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
 			 SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
 			 SLAB_TEMPORARY | SLAB_ACCOUNT | \
-			 SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
+			 SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE | \
+			 SLAB_BPF_ARENA)
 
 #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 			  SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 8b661fff5eed..c9eb6daf649a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -49,7 +49,7 @@ struct kmem_cache *kmem_cache;
  */
 #define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \
 		SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \
-		SLAB_OBJ_EXT_IN_OBJ)
+		SLAB_OBJ_EXT_IN_OBJ | SLAB_BPF_ARENA)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
diff --git a/mm/slub.c b/mm/slub.c
index 1daa89105e04..1a2e85605ab9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -21,6 +21,7 @@
 #include <linux/bitops.h>
 #include <linux/slab.h>
 #include "slab.h"
+#include <linux/bpf_defs.h>
 #include <linux/vmalloc.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -531,11 +532,25 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
 {
 	unsigned long ptr_addr;
 	freeptr_t p;
+	void *decoded;
 
 	object = kasan_reset_tag(object);
 	ptr_addr = (unsigned long)object + s->offset;
 	p = *(freeptr_t *)(ptr_addr);
-	return freelist_ptr_decode(s, p, ptr_addr);
+	decoded = freelist_ptr_decode(s, p, ptr_addr);
+	/*
+	 * SLAB_BPF_ARENA freepointer slots are BPF-writable. Clamp the decoded
+	 * pointer to an s->size-aligned address within the same slab page so
+	 * chain walks stay on legitimate object boundaries. Arena slabs are
+	 * always one page (order 0). NULL preserved.
+	 */
+	if (unlikely(s->bpf_arena) && decoded) {
+		unsigned long obj_mask = s->size - 1;
+
+		decoded = (void *)(((unsigned long)object & PAGE_MASK) |
+				   ((unsigned long)decoded & ~PAGE_MASK & ~obj_mask));
+	}
+	return decoded;
 }
 
 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
@@ -543,7 +558,12 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
 	unsigned long freeptr_addr = (unsigned long)object + s->offset;
 
 #ifdef CONFIG_SLAB_FREELIST_HARDENED
-	BUG_ON(object == fp); /* naive detection of double free or corruption */
+	if (unlikely(object == fp)) {
+		/* BPF double-free of arena objects must not panic the kernel. */
+		if (s->bpf_arena)
+			return;
+		BUG_ON(object == fp); /* naive detection of double free or corruption */
+	}
 #endif
 
 	freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
@@ -3270,6 +3290,9 @@ static inline struct slab *alloc_slab_page(struct kmem_cache *s, gfp_t flags,
 	struct slab *slab;
 	unsigned int order = oo_order(oo);
 
+	if (unlikely(s->bpf_arena))
+		return bpf_arena_alloc_slab_page(s->bpf_arena, flags, node, allow_spin);
+
 	if (unlikely(!allow_spin))
 		page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
 								  node, order);
@@ -3493,9 +3516,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	init_slab_obj_exts(slab);
 	/*
 	 * Poison the slab before initializing the slabobj_ext array
-	 * to prevent the array from being overwritten.
+	 * to prevent the array from being overwritten. Arena caches
+	 * stash uaddr32 in slab->stride; let them keep it.
 	 */
-	alloc_slab_obj_exts_early(s, slab);
+	if (!(s->flags & SLAB_BPF_ARENA))
+		alloc_slab_obj_exts_early(s, slab);
 	account_slab(slab, oo_order(oo), s, flags);
 
 	shuffle = shuffle_freelist(s, slab, allow_spin);
@@ -3538,6 +3563,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin
 	__ClearPageSlab(page);
 	mm_account_reclaimed_pages(pages);
 	unaccount_slab(slab, order, s, allow_spin);
+	if (unlikely(s->bpf_arena)) {
+		bpf_arena_free_slab_page(s->bpf_arena, slab);
+		return;
+	}
 	if (allow_spin)
 		free_frozen_pages(page, order);
 	else
@@ -5442,6 +5471,32 @@ void *kmem_cache_alloc_nolock_noprof(struct kmem_cache *s, gfp_t gfp_flags,
 }
 EXPORT_SYMBOL_GPL(kmem_cache_alloc_nolock_noprof);
 
+/**
+ * kmem_cache_force_discard_slab - force-evict a slab page from its cache
+ * @s: kmem_cache that owns the slab
+ * @slab: the slab to evict
+ *
+ * Under n->list_lock, unlinks @slab from the per-node partial list if it is
+ * on it, then discards it (decrements nr_slabs and frees the backing page).
+ * For arena teardown: the arena owns the page-tracking array and can
+ * enumerate every slab page it allocated, including orphans not on any
+ * partial list (left behind by spin_trylock failures in __slab_free()) and
+ * slabs whose objects were never returned (BPF program leak).
+ */
+void kmem_cache_force_discard_slab(struct kmem_cache *s, struct slab *slab)
+{
+	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
+	unsigned long flags;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	if (slab_test_node_partial(slab))
+		remove_partial(n, slab);
+	spin_unlock_irqrestore(&n->list_lock, flags);
+
+	discard_slab(s, slab);
+}
+EXPORT_SYMBOL_GPL(kmem_cache_force_discard_slab);
+
 void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
 					 int node, unsigned long caller)
 {
@@ -5589,14 +5644,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
 
 			n = get_node(s, slab_nid(slab));
 			/*
-			 * Speculatively acquire the list_lock.
-			 * If the cmpxchg does not succeed then we may
-			 * drop the list_lock without any processing.
+			 * Speculatively acquire the list_lock. If the cmpxchg
+			 * does not succeed we drop the lock without processing.
 			 *
-			 * Otherwise the list_lock will synchronize with
-			 * other processors updating the list of slabs.
+			 * Arena caches may reach here from kfree_nolock() in
+			 * NMI/irq-off context; trylock and orphan the slab on
+			 * failure. A later allow_spin caller adopts it.
 			 */
-			spin_lock_irqsave(&n->list_lock, flags);
+			if (unlikely(s->bpf_arena)) {
+				if (!spin_trylock_irqsave(&n->list_lock, flags))
+					n = NULL;
+			} else {
+				spin_lock_irqsave(&n->list_lock, flags);
+			}
 
 			on_node_partial = slab_test_node_partial(slab);
 		}
@@ -6666,6 +6726,15 @@ void kfree_nolock(const void *object)
 	if (likely(can_free_to_pcs(slab)) && likely(free_to_pcs(s, x, false)))
 		return;
 
+	/*
+	 * Arena freepointer slots are BPF-writable; defer_free()'s in-object
+	 * llist chain could be redirected. Route through __slab_free() instead;
+	 * it trylocks n->list_lock and orphans the slab on failure.
+	 */
+	if (s->bpf_arena) {
+		__slab_free(s, slab, x, x, 1, _RET_IP_);
+		return;
+	}
 	/*
 	 * __slab_free() can locklessly cmpxchg16 into a slab, but then it might
 	 * need to take spin_lock for further processing.
@@ -7181,16 +7250,22 @@ __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int mi
 		/*
 		 * Freelist had more objects than we can accommodate, we need to
 		 * free them back. We can treat it like a detached freelist, just
-		 * need to find the tail object.
+		 * need to find the tail object. Bound the walk by slab->objects
+		 * so a corrupted in-object freepointer (e.g. BPF arena cache
+		 * where the slot is writable from BPF) cannot loop forever; a
+		 * legitimate freelist on this slab has at most that many nodes.
 		 */
 		if (unlikely(object)) {
 			void *head = object;
 			void *tail;
-			int cnt = 0;
+			unsigned int cnt = 0;
+			unsigned int limit = slab->objects;
 
 			do {
 				tail = object;
 				cnt++;
+				if (unlikely(cnt >= limit))
+					break;
 				object = get_freepointer(s, object);
 			} while (object);
 			__slab_free(s, slab, head, tail, cnt, _RET_IP_);
@@ -7763,12 +7838,21 @@ static unsigned int calculate_sheaf_capacity(struct kmem_cache *s,
 		return 0;
 
 	/*
-	 * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT).
+	 * Bootstrap caches (kmem_cache, kmem_cache_node) carry SLAB_NO_OBJ_EXT
+	 * and are created before kmalloc is available, so sheaf/barn setup
+	 * can't run yet.
+	 *
 	 * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not
 	 * have sheaves to avoid recursion when sheaf allocation triggers
 	 * kmemleak tracking.
+	 *
+	 * SLAB_BPF_ARENA caches also set SLAB_NO_OBJ_EXT to suppress per-object
+	 * extensions, but they are created at runtime and want sheaves like any
+	 * other cache, so exempt them.
 	 */
-	if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
+	if (s->flags & SLAB_NOLEAKTRACE)
+		return 0;
+	if ((s->flags & SLAB_NO_OBJ_EXT) && !(s->flags & SLAB_BPF_ARENA))
 		return 0;
 
 	/*
@@ -8607,6 +8691,27 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
 	s->useroffset = args->useroffset;
 	s->usersize = args->usersize;
 #endif
+	if (s->flags & SLAB_BPF_ARENA) {
+		if (!args->bpf_arena)
+			goto out;
+		/*
+		 * Strip every SLAB_DEBUG_FLAGS bit from arena caches.
+		 * Masking (rather than goto out) keeps arena maps creatable
+		 * under slub_debug=... cmdline.
+		 */
+		s->flags &= ~SLAB_DEBUG_FLAGS;
+		/* Non-debug knobs we cannot honor: refuse the cache. */
+		if (s->flags & (SLAB_KASAN | SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT))
+			goto out;
+		/*
+		 * Suppress per-object obj_exts for arena caches: accounting
+		 * already happens at arena-page granularity (bpf_map_memcg_enter
+		 * in arena_alloc_pages), and per-slab obj_exts would cost
+		 * sizeof(slabobj_ext) * objs_per_slab of overhead per page.
+		 */
+		s->flags |= SLAB_NO_OBJ_EXT;
+		s->bpf_arena = args->bpf_arena;
+	}
 
 	if (!calculate_sizes(args, s))
 		goto out;
@@ -8623,6 +8728,17 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
 		}
 	}
 
+	if (s->flags & SLAB_BPF_ARENA) {
+		/*
+		 * Arena page source currently allocates one page at a time;
+		 * force order 0 and pin s->min to s->oo so allocate_slab() has
+		 * no fallback path and get_freepointer()'s PAGE_MASK clamp
+		 * always matches the actual (order-0) slab size.
+		 */
+		s->oo = oo_make(0, s->size);
+		s->min = s->oo;
+	}
+
 #ifdef system_has_freelist_aba
 	if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
 		/* Enable fast mode */
@@ -9628,6 +9744,17 @@ static int sysfs_slab_add(struct kmem_cache *s)
 	struct kset *kset = cache_kset(s);
 	int unmergeable = slab_unmergeable(s);
 
+	/*
+	 * Hide arena caches from /sys/kernel/slab: shrink/validate/etc would
+	 * BUG_ON on BPF-induced inuse underflow or corrupted freelists.
+	 * kobject_init() (no kobject_add()) keeps the destroy-time
+	 * kobject_put() -> slab_kmem_cache_release() path working.
+	 */
+	if (s->bpf_arena) {
+		kobject_init(&s->kobj, &slab_ktype);
+		return 0;
+	}
+
 	if (!unmergeable && disable_higher_order_debug &&
 			(slub_debug & DEBUG_METADATA_FLAGS))
 		unmergeable = 1;
-- 
2.53.0-Meta


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH bpf-next 4/4] selftests/bpf: Add tests for arena slub-backed allocator
  2026-05-29 20:24 [PATCH bpf-next 0/4] bpf,slab: Introduce bpf_arena_alloc() kfuncs Alexei Starovoitov
                   ` (2 preceding siblings ...)
  2026-05-29 20:24 ` [PATCH bpf-next 3/4] bpf,slab: Add slub-backed allocator for bpf_arena Alexei Starovoitov
@ 2026-05-29 20:24 ` Alexei Starovoitov
  2026-05-29 22:39   ` sashiko-bot
  3 siblings, 1 reply; 11+ messages in thread
From: Alexei Starovoitov @ 2026-05-29 20:24 UTC (permalink / raw)
  To: bpf
  Cc: daniel, andrii, memxor, eddyz87, vbabka, harry.yoo, david, tj,
	roman.gushchin, peterz, linux-mm

From: Alexei Starovoitov <ast@kernel.org>

Cover the SLAB_BPF_ARENA path end-to-end:

  - arena_slab: smoke tests for the bpf_arena_alloc/free kfuncs.
  - arena_slab_freeptr_stale_pcs: exercise corrupted in-object
    freepointers to validate get_freepointer() clamping and the
    __refill_objects_node() bounded walk.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/arena_slab.c     |  59 ++++++
 .../prog_tests/arena_slab_freeptr_stale_pcs.c |  28 +++
 .../testing/selftests/bpf/progs/arena_slab.c  | 179 ++++++++++++++++++
 .../bpf/progs/arena_slab_freeptr_stale_pcs.c  | 120 ++++++++++++
 4 files changed, 386 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_slab.c
 create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_slab_freeptr_stale_pcs.c
 create mode 100644 tools/testing/selftests/bpf/progs/arena_slab.c
 create mode 100644 tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c

diff --git a/tools/testing/selftests/bpf/prog_tests/arena_slab.c b/tools/testing/selftests/bpf/prog_tests/arena_slab.c
new file mode 100644
index 000000000000..6cbaa6991c6b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/arena_slab.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include "arena_slab.skel.h"
+
+void test_arena_slab(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct arena_slab *skel;
+	int ret;
+
+	skel = arena_slab__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "arena_slab__open_and_load"))
+		return;
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_alloc), &opts);
+	ASSERT_OK(ret, "alloc_run");
+	ASSERT_OK(opts.retval, "alloc_retval");
+	ASSERT_EQ(skel->bss->alloc_failed, 0, "no alloc failures");
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_free), &opts);
+	ASSERT_OK(ret, "free_run");
+	ASSERT_OK(opts.retval, "free_retval");
+	ASSERT_EQ(skel->bss->free_done, 1, "free completed");
+
+	/* Realloc to make sure freed objects can be returned again. */
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_alloc), &opts);
+	ASSERT_OK(ret, "realloc_run");
+	ASSERT_EQ(skel->bss->alloc_failed, 0, "no alloc failures after free");
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_free), &opts);
+	ASSERT_OK(ret, "free_run_2");
+
+	/*
+	 * defer_free() corruption repro. Allocates CORRUPT_N 4096-byte
+	 * objects, then with local IRQs disabled frees each one and
+	 * immediately overwrites its freepointer slot. With IRQs off the
+	 * irq_work IPI raised by defer_free() is deferred; multiple
+	 * defer_free()d objects chain onto the per-cpu llist via the
+	 * poisoned freepointer slots. After bpf_local_irq_restore() the
+	 * IPI fires and free_deferred_objects() walks the corrupted
+	 * llist, oopsing on a pre-fix kernel. The spin_trylock
+	 * __slab_free() fix keeps the freed objects out of any in-object
+	 * llist, so the test completes cleanly.
+	 */
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_defer_corrupt), &opts);
+	ASSERT_OK(ret, "defer_corrupt_run");
+	ASSERT_OK(opts.retval, "defer_corrupt_retval");
+	ASSERT_EQ(skel->bss->corrupt_alloc_failed, 0, "no alloc failures in defer_corrupt");
+	ASSERT_EQ(skel->bss->corrupt_done, 1, "defer_corrupt completed");
+
+	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_slab_leak), &opts);
+	ASSERT_OK(ret, "leak_run");
+	ASSERT_OK(opts.retval, "leak_retval");
+	ASSERT_EQ(skel->bss->leak_alloc_failed, 0, "no alloc failures in leak");
+	ASSERT_EQ(skel->bss->leak_done, 1, "leak completed");
+
+	arena_slab__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/arena_slab_freeptr_stale_pcs.c b/tools/testing/selftests/bpf/prog_tests/arena_slab_freeptr_stale_pcs.c
new file mode 100644
index 000000000000..fbf4f5a7e4b3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/arena_slab_freeptr_stale_pcs.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "arena_slab_freeptr_stale_pcs.skel.h"
+
+void test_arena_slab_freeptr_stale_pcs(void)
+{
+	LIBBPF_OPTS(bpf_test_run_opts, opts);
+	struct arena_slab_freeptr_stale_pcs *skel;
+	int ret;
+
+	skel = arena_slab_freeptr_stale_pcs__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "arena_slab_freeptr_stale_pcs__open_and_load"))
+		return;
+
+	ret = bpf_prog_test_run_opts(
+		bpf_program__fd(skel->progs.arena_slab_freeptr_stale_pcs),
+		&opts);
+	ASSERT_OK(ret, "arena_slab_freeptr_stale_pcs_run");
+	ASSERT_OK(opts.retval, "arena_slab_freeptr_stale_pcs_retval");
+	ASSERT_EQ(skel->bss->alloc_failed, 0, "initial allocs");
+	ASSERT_EQ(skel->bss->drain_failed, 0, "drain sheaf allocs");
+	ASSERT_EQ(skel->bss->cycle_alloc_failed, 0, "self-cycle alloc");
+	ASSERT_EQ(skel->bss->cycle_alloc_mismatch, 0, "cycle returned victim");
+	ASSERT_EQ(skel->bss->stale_alloc_null, 1, "stale sheaf alloc rejected");
+	ASSERT_EQ(skel->bss->done, 1, "stale pcs trigger completed");
+
+	arena_slab_freeptr_stale_pcs__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/arena_slab.c b/tools/testing/selftests/bpf/progs/arena_slab.c
new file mode 100644
index 000000000000..738a48c45da3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/arena_slab.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#define BPF_NO_KFUNC_PROTOTYPES
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_experimental.h"
+#include "bpf_arena_common.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARENA);
+	__uint(map_flags, BPF_F_MMAPABLE);
+	__uint(max_entries, 256); /* number of pages */
+#ifdef __TARGET_ARCH_arm64
+	__ulong(map_extra, 0x1ull << 32);
+#else
+	__ulong(map_extra, 0x1ull << 44);
+#endif
+} arena SEC(".maps");
+
+void __arena *bpf_arena_alloc(void *map, __u32 size) __ksym __weak;
+void bpf_arena_free(void *map, void __arena *ptr) __ksym __weak;
+void bpf_local_irq_save(unsigned long *flags) __ksym;
+void bpf_local_irq_restore(unsigned long *flags) __ksym;
+
+#define N 64
+#define LARGE_N 2
+#define CORRUPT_N 8
+#define LEAK_N 16
+
+int alloc_failed;
+int free_done;
+int corrupt_alloc_failed;
+int corrupt_done;
+int leak_alloc_failed;
+int leak_done;
+
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+static __u8 __arena *objs[N];
+static __u8 __arena *large_objs[LARGE_N];
+static __u64 __arena *corrupt_objs[CORRUPT_N];
+
+/* Sizes > PAGE_SIZE force bpf_arena_alloc() onto the arena_alloc_pages() fallback. */
+static const __u32 large_sizes[LARGE_N] = {
+	PAGE_SIZE + 1,
+	3 * PAGE_SIZE,
+};
+#endif
+
+SEC("syscall")
+int arena_slab_alloc(void *ctx)
+{
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+	int i;
+
+	for (i = 0; i < N; i++) {
+		__u32 size = 8U << (i & 7); /* 8, 16, 32, ... 1024 */
+		__u8 __arena *p = bpf_arena_alloc(&arena, size);
+
+		if (!p) {
+			alloc_failed = i + 1;
+			return 0;
+		}
+		/*
+		 * Write a sentinel that depends on the slot — proves the object
+		 * is real arena memory and not aliased with another slot.
+		 */
+		p[0] = (__u8)(i + 1);
+		p[size - 1] = (__u8)(i + 1);
+		objs[i] = p;
+	}
+
+	/*
+	 * Exercise the >PAGE_SIZE fallback. bpf_arena_alloc() routes these
+	 * through arena_alloc_pages(); bpf_arena_free() recovers page_cnt
+	 * from page->private and tears the range back down.
+	 */
+	for (i = 0; i < LARGE_N; i++) {
+		__u32 size = large_sizes[i];
+		__u8 __arena *p = bpf_arena_alloc(&arena, size);
+
+		if (!p) {
+			alloc_failed = N + i + 1;
+			return 0;
+		}
+		/* Touch first, middle, and last byte to exercise every mapped page. */
+		p[0] = (__u8)(N + i + 1);
+		p[size / 2] = (__u8)(N + i + 1);
+		p[size - 1] = (__u8)(N + i + 1);
+		large_objs[i] = p;
+	}
+#endif
+	return 0;
+}
+
+SEC("syscall")
+int arena_slab_free(void *ctx)
+{
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+	int i;
+
+	for (i = 0; i < N; i++) {
+		if (!objs[i])
+			continue;
+		bpf_arena_free(&arena, objs[i]);
+		objs[i] = NULL;
+	}
+	for (i = 0; i < LARGE_N; i++) {
+		if (!large_objs[i])
+			continue;
+		bpf_arena_free(&arena, large_objs[i]);
+		large_objs[i] = NULL;
+	}
+	free_done = 1;
+#endif
+	return 0;
+}
+
+SEC("syscall")
+int arena_slab_defer_corrupt(void *ctx)
+{
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < CORRUPT_N; i++) {
+		corrupt_objs[i] = bpf_arena_alloc(&arena, 4096);
+		if (!corrupt_objs[i]) {
+			corrupt_alloc_failed = i + 1;
+			return 0;
+		}
+	}
+
+	/*
+	 * IRQs off so defer_free()'s irq_work IPI accumulates the chain across
+	 * iterations instead of draining between each free.
+	 */
+	bpf_local_irq_save(&flags);
+	for (i = 0; i < CORRUPT_N; i++) {
+		bpf_arena_free(&arena, corrupt_objs[i]);
+		/*
+		 * Freepointer slot for non-debug caches is at object_size/2;
+		 * 2048 for the 4096-byte bucket. Poison defer_free()'s next.
+		 */
+		corrupt_objs[i][2048 / sizeof(__u64)] = 0xdeadbeefdeadbeefULL;
+	}
+	bpf_local_irq_restore(&flags);
+
+	corrupt_done = 1;
+#endif
+	return 0;
+}
+
+/*
+ * Intentional leak across multiple bucket caches; destroyed arena must
+ * still tear down cleanly (no kmem_cache_destroy() WARN, no leak).
+ */
+SEC("syscall")
+int arena_slab_leak(void *ctx)
+{
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+	int i;
+
+	for (i = 0; i < LEAK_N; i++) {
+		__u32 size = 16U << (i & 7); /* 16, 32, ..., 2048 */
+		__u8 __arena *p = bpf_arena_alloc(&arena, size);
+
+		if (!p) {
+			leak_alloc_failed = i + 1;
+			return 0;
+		}
+		p[0] = (__u8)(i + 1);
+	}
+	leak_done = 1;
+#endif
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c b/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c
new file mode 100644
index 000000000000..4d23d75419d6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0
+#define BPF_NO_KFUNC_PROTOTYPES
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_experimental.h"
+#include "bpf_arena_common.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARENA);
+	__uint(map_flags, BPF_F_MMAPABLE);
+	__uint(max_entries, 256);
+#ifdef __TARGET_ARCH_arm64
+	__ulong(map_extra, 0x1ull << 32);
+#else
+	__ulong(map_extra, 0x1ull << 44);
+#endif
+} arena SEC(".maps");
+
+void __arena *bpf_arena_alloc(void *map, __u32 size) __ksym __weak;
+void bpf_arena_free(void *map, void __arena *ptr) __ksym __weak;
+void bpf_preempt_disable(void) __ksym;
+void bpf_preempt_enable(void) __ksym;
+
+#define OBJ_SIZE 4096
+#define FREEPTR_OFFSET (OBJ_SIZE / 2)
+#define SHEAF_FILL 4
+#define TARGET_IDX SHEAF_FILL
+#define EXTRA_IDX (TARGET_IDX + 1)
+#define NR_OBJS (EXTRA_IDX + 1)
+
+int alloc_failed;
+int drain_failed;
+int cycle_alloc_failed;
+int cycle_alloc_mismatch;
+int stale_alloc_null;
+int done;
+
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+static __u8 __arena *objs[NR_OBJS];
+#endif
+
+SEC("syscall")
+int arena_slab_freeptr_stale_pcs(void *ctx)
+{
+#ifdef __BPF_FEATURE_ADDR_SPACE_CAST
+	__u8 __arena *victim, *p;
+	__u64 raw;
+	int i;
+
+	for (i = 0; i < NR_OBJS; i++) {
+		objs[i] = bpf_arena_alloc(&arena, OBJ_SIZE);
+		if (!objs[i]) {
+			alloc_failed = i + 1;
+			return 0;
+		}
+		objs[i][0] = i + 1;
+	}
+
+	bpf_preempt_disable();
+
+	/* Fill the per-cpu sheaf so the next free reaches SLUB proper. */
+	for (i = 1; i <= SHEAF_FILL; i++)
+		bpf_arena_free(&arena, objs[i - 1]);
+
+	victim = objs[TARGET_IDX];
+
+	/*
+	 * The 4096-byte bucket has one object per slab and a 4-object sheaf.
+	 * Free @victim while the sheaf is full, then turn its encoded NULL
+	 * freepointer into any non-NULL decoded value. The arena clamp keeps
+	 * non-NULL decoded values in the same slab and object-aligned, so this
+	 * becomes a freelist self-cycle back to @victim.
+	 */
+	bpf_arena_free(&arena, victim);
+	raw = *(__u64 __arena *)(victim + FREEPTR_OFFSET);
+	*(__u64 __arena *)(victim + FREEPTR_OFFSET) = raw ^ 1;
+
+	for (i = 0; i < SHEAF_FILL; i++) {
+		p = bpf_arena_alloc(&arena, OBJ_SIZE);
+		if (!p) {
+			drain_failed = i + 1;
+			goto out;
+		}
+	}
+
+	p = bpf_arena_alloc(&arena, OBJ_SIZE);
+	if (!p) {
+		cycle_alloc_failed = 1;
+		goto out;
+	}
+	if (p != victim)
+		cycle_alloc_mismatch = 1;
+
+	for (i = 0; i < SHEAF_FILL; i++)
+		bpf_arena_free(&arena, victim);
+
+	/*
+	 * The sheaf is full of duplicate victim pointers now. Free the four
+	 * filler objects plus one extra object directly to SLUB, leaving enough
+	 * partial slabs that the next target-slab zero-inuse transition discards
+	 * the target page instead of keeping it on the partial list.
+	 */
+	for (i = 0; i < SHEAF_FILL; i++)
+		bpf_arena_free(&arena, objs[i]);
+	bpf_arena_free(&arena, objs[EXTRA_IDX]);
+
+	bpf_arena_free(&arena, victim);
+
+	p = bpf_arena_alloc(&arena, OBJ_SIZE);
+	if (!p)
+		stale_alloc_null = 1;
+
+	done = 1;
+out:
+	bpf_preempt_enable();
+#endif
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
2.53.0-Meta


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH bpf-next 1/4] slab: Introduce kmem_cache_alloc_nolock()
  2026-05-29 20:24 ` [PATCH bpf-next 1/4] slab: Introduce kmem_cache_alloc_nolock() Alexei Starovoitov
@ 2026-05-29 21:20   ` sashiko-bot
  2026-05-29 22:37     ` Alexei Starovoitov
  0 siblings, 1 reply; 11+ messages in thread
From: sashiko-bot @ 2026-05-29 21:20 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: bpf

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Critical] Deadlock in NMI context due to synchronous freeing on memcg charging failure
--

commit 831b8a379d6dfbde2570286ba8453174bc236695
Author: Alexei Starovoitov <ast@kernel.org>

slab: Introduce kmem_cache_alloc_nolock()

This commit introduces kmem_cache_alloc_nolock_noprof() and its wrapper 
kmem_cache_alloc_nolock(). It is designed to allocate an object from a
specific cache and aims to be safe to call from any context, including NMI
and IRQ-off environments.

> diff --git a/mm/slub.c b/mm/slub.c
> index a2bf3756ca7d0..601986aaebdfc 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
[ ... ]
> @@ -5402,6 +5402,45 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
>  }
>  EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
>  
> +/**
> + * kmem_cache_alloc_nolock - Allocate one object from a specific cache,
> + * safe from any context (including NMI/IRQ-off), like kmalloc_nolock().
> + *
> + * Returns NULL on failure (including the trylock paths that may transiently
> + * fail under contention).
> + */
> +void *kmem_cache_alloc_nolock_noprof(struct kmem_cache *s, gfp_t gfp_flags,
> +				     int node)
> +{
> +	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
> +	void *ret;
> +
> +	VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
> +				      __GFP_NO_OBJ_EXT));
[ ... ]
> +	maybe_wipe_obj_freeptr(s, ret);
> +	slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
> +			     slab_want_init_on_alloc(alloc_gfp, s),
> +			     s->object_size);

[Severity: Critical]
Does this sequence risk a hard deadlock if called from NMI context?

The function permits __GFP_ACCOUNT, but if memory cgroup charging fails
inside memcg_slab_post_alloc_hook(), it synchronously aborts the allocation:

mm/slub.c:memcg_slab_post_alloc_hook() {
    ...
    if (memcg_alloc_charge_slab(s, alloc_gfp, lru, p, size)) {
        for (i = 0; i < size; i++)
            memcg_alloc_abort_single(s, p[i]);
    ...
}

This calls memcg_alloc_abort_single(), which immediately attempts to free
the object:

mm/slub.c:memcg_alloc_abort_single() {
    ...
    if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
        __slab_free(s, slab, object, object, 1, _RET_IP_);
    ...
}

Both slab_free_hook() (via kmemleak and kfence hooks) and __slab_free()
will attempt to acquire non-NMI-safe spinlocks like n->list_lock. If the
interrupted context was holding these locks, wouldn't this synchronous free
cause a deadlock, breaking the "safe from any context" guarantee?

> +	return kasan_kmalloc(s, ret, s->object_size, alloc_gfp);
> +}

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260529202438.16408-1-alexei.starovoitov@gmail.com?part=1

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH bpf-next 3/4] bpf,slab: Add slub-backed allocator for bpf_arena
  2026-05-29 20:24 ` [PATCH bpf-next 3/4] bpf,slab: Add slub-backed allocator for bpf_arena Alexei Starovoitov
@ 2026-05-29 22:04   ` sashiko-bot
  2026-05-30  0:04     ` Alexei Starovoitov
  0 siblings, 1 reply; 11+ messages in thread
From: sashiko-bot @ 2026-05-29 22:04 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: bpf

Thank you for your contribution! Sashiko AI review found 7 potential issue(s) to consider:
- [Critical] BPF-induced freelist double-frees artificially drive SLUB's inuse counter to zero, prematurely freeing active slabs and causing Use-After-Free.
- [Critical] Skipping alloc_slab_obj_exts_early() for arena caches leaves slab->obj_exts uninitialized, causing a kernel panic when SLUB attempts to kfree() the leftover mem_cgroup pointer.
- [Critical] An out-of-bounds array access in arena_free_pages allows a malicious BPF program to read and overwrite arbitrary kernel memory by bypassing the bounds check.
- [Critical] A TOCTOU race condition in bpf_arena_free for multi-page fallback allocations allows concurrent frees to corrupt kernel page allocator metadata.
- [High] BPF program can permanently brick the arena allocator by calling bpf_arena_free_pages() on an active slab page.
- [High] Invalid wait context: calling apply_to_page_range() under a raw spinlock causes scheduling while atomic on PREEMPT_RT.
- [Medium] Trylock failures in __slab_free permanently leak orphaned empty slabs.
--

commit 0bd3f7f14bd0e0d83546ab9ece9d884e9f9bd2e5
Author: Alexei Starovoitov <ast@kernel.org>

bpf,slab: Add slub-backed allocator for bpf_arena

Let BPF programs allocate typed objects in a bpf_arena via a
kvmalloc-style API: bpf_arena_alloc() routes requests up to
PAGE_SIZE through per-arena slab buckets, and falls back to
arena_alloc_pages() for larger sizes.

> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c

[ ... ]

> @@ -179,9 +207,22 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
>  static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
>  {
>  	struct clear_range_data *d = data;
> -	pte_t old_pte;
> +	pte_t old_pte, cur;
>  	struct page *page;
>  
> +	/*
> +	 * Skip slub-owned pages: BPF must use bpf_arena_free() for per-object
> +	 * slab frees. The PTE stays; slub releases it via arena_free_slab_page()
> +	 * after __ClearPageSlab(). Non-atomic ptep_get() is safe -- ptep_try_set()
> +	 * only fires on pte_none, and arena_free_slab_page() can't race on this
> +	 * offset (range stays allocated in range_tree for our walk).
> +	 */
> +	cur = ptep_get(pte);
> +	if (pte_none(cur) || !pte_present(cur))
> +		return 0;
> +	if (PageSlab(pte_page(cur)))
> +		return 0;

[Severity: High]
If this skips clearing the kernel PTEs for PageSlab pages, could it permanently
brick the arena allocator?

If a BPF program calls bpf_arena_free_pages() on an active slab page,
arena_free_pages() adds the page offset back to the arena's free range tree
unconditionally. 

Since the physical page remains mapped, won't subsequent allocations via
arena_alloc_pages() repeatedly pick this slot from the interval tree, hit
-EBUSY in apply_to_page_range() because the PTE is not empty, and abort?

[ ... ]

> @@ -625,7 +686,8 @@ static u64 clear_lo32(u64 val)
>   * Later the pages will be mmaped into user space vma.
>   */
>  static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id,
> -			      bool sleepable)
> +			      bool sleepable, bool set_page_slab,
> +			      struct page **out_page)

[Severity: High]
Does this code acquire a sleeping lock inside a raw spinlock?

Both arena_alloc_pages() and arena_free_pages() acquire arena->spinlock using
raw_res_spin_lock_irqsave(), which is a raw spinlock. While holding this, they
call apply_to_page_range() or apply_to_existing_page_range().

These functions walk the page tables and acquire the init_mm page table lock
(ptl), which is a standard spinlock_t. On PREEMPT_RT kernels, standard
spinlocks are sleepable rt_mutexes. Could this trigger a BUG/panic on
PREEMPT_RT due to an invalid wait context?

[ ... ]

> @@ -778,6 +854,21 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
>  
>  	page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
>  	pgoff = compute_pgoff(arena, uaddr);
> +
> +	/*
> +	 * Drop bookkeeping for any bpf_arena_alloc() fallback pages within the
> +	 * freed range. PageSlab entries are owned by slub and must not be
> +	 * cleared here; slub clears them via bpf_arena_free_slab_page() when
> +	 * the slab page is released.
> +	 */
> +	for (i = 0; i < page_cnt; i++) {
> +		fb_page = READ_ONCE(arena->slab_pages[pgoff + i]);

[Severity: Critical]
Is there a risk of an out-of-bounds array access here that could bypass the
bounds check and corrupt memory?

If a BPF program passes ptr__ign = 0, and arena->user_vm_start has non-zero
lower 32 bits, clear_lo32() rounds it down to a 4GB boundary, making
full_uaddr this rounded-down value.

The bounds check:
if (full_uaddr >= uaddr_end)

evaluates to false because full_uaddr is less than uaddr_end. However,
compute_pgoff() evaluates (u32)(0 - (u32)user_vm_start) >> PAGE_SHIFT, which
underflows to a massive positive index.

Could this massive pgoff allow the loop to iterate over
arena->slab_pages[pgoff + i] out of bounds, potentially writing NULL to
arbitrary kernel memory if it finds a value that passes the !PageSlab() check?

[ ... ]

> +__bpf_kfunc void bpf_arena_free(void *p__map, void *ptr__ign)
> +{
[ ... ]
> +	if (!PageSlab(page)) {
> +		/*
> +		 * Multi-page allocation from the bpf_arena_alloc() fallback.
> +		 * page->private holds page_cnt stashed at allocation time.
> +		 */
> +		u32 page_cnt = page_private(page);
> +
> +		WRITE_ONCE(arena->slab_pages[pgoff], NULL);
> +		set_page_private(page, 0);
> +		arena_free_pages(arena, arena_off, page_cnt, false);
> +		return;
> +	}

[Severity: Critical]
Is there a TOCTOU race condition here when freeing multi-page fallback
allocations?

If a BPF program executes bpf_arena_free() concurrently from multiple threads
on the same valid pointer, they might both read the page pointer locklessly and
pass the !PageSlab(page) check.

If the first thread calls arena_free_pages(), unmapping the page and freeing it
back to the kernel buddy allocator, the second thread will then execute
set_page_private(page, 0). 

Since the page is already in the buddy allocator at that point, could writing 0
to its private field corrupt the buddy allocator's internal order-tracking
metadata?

> diff --git a/mm/slub.c b/mm/slub.c
> --- a/mm/slub.c
> +++ b/mm/slub.c

[ ... ]

> @@ -543,7 +558,12 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
>  	unsigned long freeptr_addr = (unsigned long)object + s->offset;
>  
>  #ifdef CONFIG_SLAB_FREELIST_HARDENED
> -	BUG_ON(object == fp); /* naive detection of double free or corruption */
> +	if (unlikely(object == fp)) {
> +		/* BPF double-free of arena objects must not panic the kernel. */
> +		if (s->bpf_arena)
> +			return;
> +		BUG_ON(object == fp); /* naive detection of double free or corruption */
> +	}
>  #endif

[Severity: Critical]
Does returning early here to suppress the double-free warning lead to a
corrupted inuse counter?

If a BPF program calls bpf_arena_free() multiple times on the same object,
this early return bypasses the double-free detection for arena caches. However,
__slab_free() still unconditionally executes new.inuse -= cnt.

Repeated double-frees will drive the inuse counter to 0. When inuse hits 0,
SLUB assumes the slab is empty and frees the page back to the buddy allocator,
even though other objects in the slab may still be in use by the BPF program
or cached in SLUB's per-CPU sheaves. Could this lead to arbitrary memory
corruption if the page is reused for kernel data structures?

[ ... ]

> @@ -3493,9 +3516,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>  	init_slab_obj_exts(slab);
>  	/*
>  	 * Poison the slab before initializing the slabobj_ext array
> -	 * to prevent the array from being overwritten.
> +	 * to prevent the array from being overwritten. Arena caches
> +	 * stash uaddr32 in slab->stride; let them keep it.
>  	 */
> -	alloc_slab_obj_exts_early(s, slab);
> +	if (!(s->flags & SLAB_BPF_ARENA))
> +		alloc_slab_obj_exts_early(s, slab);

[Severity: Critical]
Does skipping alloc_slab_obj_exts_early() for arena caches leave
slab->obj_exts uninitialized?

Pages allocated for the BPF arena via bpf_map_alloc_pages() use __GFP_ACCOUNT,
which sets folio->memcg_data to point to the memcg. In SLUB, slab->obj_exts
shares the same memory location as folio->memcg_data.

If alloc_slab_obj_exts_early() is skipped, slab->obj_exts is never zeroed and
retains the memcg pointer. When the slab is freed, won't unaccount_slab() call
free_slab_obj_exts(), which reads the non-null slab->obj_exts and passes it to
kfree(), causing a kernel panic?

[ ... ]

> @@ -5589,14 +5644,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>  
>  			n = get_node(s, slab_nid(slab));
>  			/*
> -			 * Speculatively acquire the list_lock.
> -			 * If the cmpxchg does not succeed then we may
> -			 * drop the list_lock without any processing.
> -			 *
> -			 * Otherwise the list_lock will synchronize with
> -			 * other processors updating the list of slabs.
> +			 * Speculatively acquire the list_lock. If the cmpxchg
> +			 * does not succeed we drop the lock without processing.
>  			 *
> +			 * Arena caches may reach here from kfree_nolock() in
> +			 * NMI/irq-off context; trylock and orphan the slab on
> +			 * failure. A later allow_spin caller adopts it.
>  			 */
> -			spin_lock_irqsave(&n->list_lock, flags);
> +			if (unlikely(s->bpf_arena)) {
> +				if (!spin_trylock_irqsave(&n->list_lock, flags))
> +					n = NULL;
> +			} else {
> +				spin_lock_irqsave(&n->list_lock, flags);
> +			}

[Severity: Medium]
Can this trylock failure result in permanently leaking orphaned empty slabs?

If spin_trylock_irqsave() fails for an arena cache, n is set to NULL, and the
function returns early without adding the slab to the partial list, orphaning
the slab.

Later, when the slab becomes completely empty (new.inuse == 0), __slab_free()
acquires the list lock but evaluates:

if (!was_full && !on_node_partial)

Since the slab was orphaned, on_node_partial is false, causing the function
to unlock and return early. The slab_empty discard logic is bypassed. Does
this permanently leak the empty slab until the entire arena map is destroyed?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260529202438.16408-1-alexei.starovoitov@gmail.com?part=3

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH bpf-next 1/4] slab: Introduce kmem_cache_alloc_nolock()
  2026-05-29 21:20   ` sashiko-bot
@ 2026-05-29 22:37     ` Alexei Starovoitov
  0 siblings, 0 replies; 11+ messages in thread
From: Alexei Starovoitov @ 2026-05-29 22:37 UTC (permalink / raw)
  To: sashiko-reviews; +Cc: bpf

On Fri May 29, 2026 at 2:20 PM PDT, sashiko-bot wrote:
> Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
> - [Critical] Deadlock in NMI context due to synchronous freeing on memcg charging failure
> --
>
> commit 831b8a379d6dfbde2570286ba8453174bc236695
> Author: Alexei Starovoitov <ast@kernel.org>
>
> slab: Introduce kmem_cache_alloc_nolock()
>
> This commit introduces kmem_cache_alloc_nolock_noprof() and its wrapper 
> kmem_cache_alloc_nolock(). It is designed to allocate an object from a
> specific cache and aims to be safe to call from any context, including NMI
> and IRQ-off environments.
>
>> diff --git a/mm/slub.c b/mm/slub.c
>> index a2bf3756ca7d0..601986aaebdfc 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
> [ ... ]
>> @@ -5402,6 +5402,45 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
>>  }
>>  EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
>>  
>> +/**
>> + * kmem_cache_alloc_nolock - Allocate one object from a specific cache,
>> + * safe from any context (including NMI/IRQ-off), like kmalloc_nolock().
>> + *
>> + * Returns NULL on failure (including the trylock paths that may transiently
>> + * fail under contention).
>> + */
>> +void *kmem_cache_alloc_nolock_noprof(struct kmem_cache *s, gfp_t gfp_flags,
>> +				     int node)
>> +{
>> +	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
>> +	void *ret;
>> +
>> +	VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
>> +				      __GFP_NO_OBJ_EXT));
> [ ... ]
>> +	maybe_wipe_obj_freeptr(s, ret);
>> +	slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
>> +			     slab_want_init_on_alloc(alloc_gfp, s),
>> +			     s->object_size);
>
> [Severity: Critical]
> Does this sequence risk a hard deadlock if called from NMI context?
>
> The function permits __GFP_ACCOUNT, but if memory cgroup charging fails
> inside memcg_slab_post_alloc_hook(), it synchronously aborts the allocation:
>
> mm/slub.c:memcg_slab_post_alloc_hook() {
>     ...
>     if (memcg_alloc_charge_slab(s, alloc_gfp, lru, p, size)) {
>         for (i = 0; i < size; i++)
>             memcg_alloc_abort_single(s, p[i]);
>     ...
> }
>
> This calls memcg_alloc_abort_single(), which immediately attempts to free
> the object:
>
> mm/slub.c:memcg_alloc_abort_single() {
>     ...
>     if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
>         __slab_free(s, slab, object, object, 1, _RET_IP_);
>     ...
> }
>
> Both slab_free_hook() (via kmemleak and kfence hooks) and __slab_free()
> will attempt to acquire non-NMI-safe spinlocks like n->list_lock. If the
> interrupted context was holding these locks, wouldn't this synchronous free
> cause a deadlock, breaking the "safe from any context" guarantee?

sashiko should have said that it's a pre-existing issue,
since kmalloc_nolock() follows the exact same pattern.
Also patch 3 switches __slab_free() to use trylock.
So not really an issue.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH bpf-next 4/4] selftests/bpf: Add tests for arena slub-backed allocator
  2026-05-29 20:24 ` [PATCH bpf-next 4/4] selftests/bpf: Add tests for arena slub-backed allocator Alexei Starovoitov
@ 2026-05-29 22:39   ` sashiko-bot
  2026-05-30  0:13     ` Alexei Starovoitov
  0 siblings, 1 reply; 11+ messages in thread
From: sashiko-bot @ 2026-05-29 22:39 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: bpf

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] The `arena_slab_freeptr_stale_pcs` selftest will reliably fail on architectures with a page size larger than 4K (e.g., ARM64 with 64K pages) due to hardcoded assumptions about SLUB slab page capacities.
--

commit 307042b1f4f8cca55cad67f2c12869bf67a9f7c2
Author: Alexei Starovoitov <ast@kernel.org>

selftests/bpf: Add tests for arena slub-backed allocator

This commit adds end-to-end selftests covering the SLAB_BPF_ARENA path.
It includes smoke tests for basic alloc/free kfuncs and exercises corrupted
in-object freepointers to validate get_freepointer() clamping.

> diff --git a/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c b/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c
> new file mode 100644
> index 0000000000000..4d23d75419d61
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c

[ ... ]

> +	victim = objs[TARGET_IDX];
> +
> +	/*
> +	 * The 4096-byte bucket has one object per slab and a 4-object sheaf.
> +	 * Free @victim while the sheaf is full, then turn its encoded NULL
> +	 * freepointer into any non-NULL decoded value. The arena clamp keeps
> +	 * non-NULL decoded values in the same slab and object-aligned, so this
> +	 * becomes a freelist self-cycle back to @victim.
> +	 */

[Severity: Medium]
Is this assumption valid on architectures with page sizes larger than 4K?

On a system with 64KB pages, a 4096-byte bucket would hold multiple objects
per slab page instead of just one.

[ ... ]

> +	/*
> +	 * The sheaf is full of duplicate victim pointers now. Free the four
> +	 * filler objects plus one extra object directly to SLUB, leaving enough
> +	 * partial slabs that the next target-slab zero-inuse transition discards
> +	 * the target page instead of keeping it on the partial list.
> +	 */
> +	for (i = 0; i < SHEAF_FILL; i++)
> +		bpf_arena_free(&arena, objs[i]);
> +	bpf_arena_free(&arena, objs[EXTRA_IDX]);
> +
> +	bpf_arena_free(&arena, victim);
> +
> +	p = bpf_arena_alloc(&arena, OBJ_SIZE);
> +	if (!p)
> +		stale_alloc_null = 1;

[Severity: Medium]
Will this logic successfully discard the target page on architectures like
arm64 configured with 64K pages?

Since 64KB pages can hold multiple 4096-byte objects, all these allocations
might be placed into the exact same order-0 slab page. Freeing the filler
objects would merely drop the inuse count of that single slab page rather
than creating multiple empty slab pages to fill the partial list.

When the victim object is freed, wouldn't SLUB retain the page instead of
returning it to the OS, causing the final allocation to succeed and
stale_alloc_null to remain 0, thus failing the test?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260529202438.16408-1-alexei.starovoitov@gmail.com?part=4

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH bpf-next 3/4] bpf,slab: Add slub-backed allocator for bpf_arena
  2026-05-29 22:04   ` sashiko-bot
@ 2026-05-30  0:04     ` Alexei Starovoitov
  0 siblings, 0 replies; 11+ messages in thread
From: Alexei Starovoitov @ 2026-05-30  0:04 UTC (permalink / raw)
  To: sashiko-reviews; +Cc: bpf

On Fri May 29, 2026 at 3:04 PM PDT, sashiko-bot wrote:
> Thank you for your contribution! Sashiko AI review found 7 potential issue(s) to consider:
> - [Critical] BPF-induced freelist double-frees artificially drive SLUB's inuse counter to zero, prematurely freeing active slabs and causing Use-After-Free.
> - [Critical] Skipping alloc_slab_obj_exts_early() for arena caches leaves slab->obj_exts uninitialized, causing a kernel panic when SLUB attempts to kfree() the leftover mem_cgroup pointer.
> - [Critical] An out-of-bounds array access in arena_free_pages allows a malicious BPF program to read and overwrite arbitrary kernel memory by bypassing the bounds check.
> - [Critical] A TOCTOU race condition in bpf_arena_free for multi-page fallback allocations allows concurrent frees to corrupt kernel page allocator metadata.
> - [High] BPF program can permanently brick the arena allocator by calling bpf_arena_free_pages() on an active slab page.
> - [High] Invalid wait context: calling apply_to_page_range() under a raw spinlock causes scheduling while atomic on PREEMPT_RT.
> - [Medium] Trylock failures in __slab_free permanently leak orphaned empty slabs.

that's a lot of slop to unpack :)
I wish sashiko learned from these explanations, but here they are.. at least for humans.

>>  static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
>>  {
>>  	struct clear_range_data *d = data;
>> -	pte_t old_pte;
>> +	pte_t old_pte, cur;
>>  	struct page *page;
>>  
>> +	/*
>> +	 * Skip slub-owned pages: BPF must use bpf_arena_free() for per-object
>> +	 * slab frees. The PTE stays; slub releases it via arena_free_slab_page()
>> +	 * after __ClearPageSlab(). Non-atomic ptep_get() is safe -- ptep_try_set()
>> +	 * only fires on pte_none, and arena_free_slab_page() can't race on this
>> +	 * offset (range stays allocated in range_tree for our walk).
>> +	 */
>> +	cur = ptep_get(pte);
>> +	if (pte_none(cur) || !pte_present(cur))
>> +		return 0;
>> +	if (PageSlab(pte_page(cur)))
>> +		return 0;
>
> [Severity: High]
> If this skips clearing the kernel PTEs for PageSlab pages, could it permanently
> brick the arena allocator?

brick arena allocator? no.

> If a BPF program calls bpf_arena_free_pages() on an active slab page,
> arena_free_pages() adds the page offset back to the arena's free range tree
> unconditionally.

which is a bug in bpf prog.

> Since the physical page remains mapped, won't subsequent allocations via
> arena_alloc_pages() repeatedly pick this slot from the interval tree, hit
> -EBUSY in apply_to_page_range() because the PTE is not empty, and abort?

they won't "abort". subsequent arena_alloc_pages() will return NULL.
Works as designed.

>
> [ ... ]
>
>> @@ -625,7 +686,8 @@ static u64 clear_lo32(u64 val)
>>   * Later the pages will be mmaped into user space vma.
>>   */
>>  static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id,
>> -			      bool sleepable)
>> +			      bool sleepable, bool set_page_slab,
>> +			      struct page **out_page)
>
> [Severity: High]
> Does this code acquire a sleeping lock inside a raw spinlock?
>
> Both arena_alloc_pages() and arena_free_pages() acquire arena->spinlock using
> raw_res_spin_lock_irqsave(), which is a raw spinlock. While holding this, they
> call apply_to_page_range() or apply_to_existing_page_range().
>
> These functions walk the page tables and acquire the init_mm page table lock
> (ptl), which is a standard spinlock_t. On PREEMPT_RT kernels, standard
> spinlocks are sleepable rt_mutexes. Could this trigger a BUG/panic on
> PREEMPT_RT due to an invalid wait context?

Nope. There won't be any attempts to take locks.
sashiko is missing that populate_pgtable_except_pte() was called at arena creation time.

> [ ... ]
>
>> @@ -778,6 +854,21 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
>>  
>>  	page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
>>  	pgoff = compute_pgoff(arena, uaddr);
>> +
>> +	/*
>> +	 * Drop bookkeeping for any bpf_arena_alloc() fallback pages within the
>> +	 * freed range. PageSlab entries are owned by slub and must not be
>> +	 * cleared here; slub clears them via bpf_arena_free_slab_page() when
>> +	 * the slab page is released.
>> +	 */
>> +	for (i = 0; i < page_cnt; i++) {
>> +		fb_page = READ_ONCE(arena->slab_pages[pgoff + i]);
>
> [Severity: Critical]
> Is there a risk of an out-of-bounds array access here that could bypass the
> bounds check and corrupt memory?
>
> If a BPF program passes ptr__ign = 0, and arena->user_vm_start has non-zero
> lower 32 bits, clear_lo32() rounds it down to a 4GB boundary, making
> full_uaddr this rounded-down value.

sashiko is missing that bpf_arena_free() has if (!ptr__ign) return;
but a page alignment check is indeed missing in bpf_arena_free().
It shouldn't be possible to pass ptr__ign == 1 and go down
the arena_free_pages() path.
Will fix like:
      if (!IS_ALIGNED(arena_off, PAGE_SIZE))
              goto violation;

>> +__bpf_kfunc void bpf_arena_free(void *p__map, void *ptr__ign)
>> +{
> [ ... ]
>> +	if (!PageSlab(page)) {
>> +		/*
>> +		 * Multi-page allocation from the bpf_arena_alloc() fallback.
>> +		 * page->private holds page_cnt stashed at allocation time.
>> +		 */
>> +		u32 page_cnt = page_private(page);
>> +
>> +		WRITE_ONCE(arena->slab_pages[pgoff], NULL);
>> +		set_page_private(page, 0);
>> +		arena_free_pages(arena, arena_off, page_cnt, false);
>> +		return;
>> +	}
>
> [Severity: Critical]
> Is there a TOCTOU race condition here when freeing multi-page fallback
> allocations?
>
> If a BPF program executes bpf_arena_free() concurrently from multiple threads
> on the same valid pointer, they might both read the page pointer locklessly and
> pass the !PageSlab(page) check.
>
> If the first thread calls arena_free_pages(), unmapping the page and freeing it
> back to the kernel buddy allocator, the second thread will then execute
> set_page_private(page, 0). 
>
> Since the page is already in the buddy allocator at that point, could writing 0
> to its private field corrupt the buddy allocator's internal order-tracking
> metadata?

that's a good one. it's a race indeed.
Fixing like this:
      if (cmpxchg(&arena->slab_pages[pgoff], page, NULL) != page)
              goto violation;

>> diff --git a/mm/slub.c b/mm/slub.c
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>
> [ ... ]
>
>> @@ -543,7 +558,12 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
>>  	unsigned long freeptr_addr = (unsigned long)object + s->offset;
>>  
>>  #ifdef CONFIG_SLAB_FREELIST_HARDENED
>> -	BUG_ON(object == fp); /* naive detection of double free or corruption */
>> +	if (unlikely(object == fp)) {
>> +		/* BPF double-free of arena objects must not panic the kernel. */
>> +		if (s->bpf_arena)
>> +			return;
>> +		BUG_ON(object == fp); /* naive detection of double free or corruption */
>> +	}
>>  #endif
>
> [Severity: Critical]
> Does returning early here to suppress the double-free warning lead to a
> corrupted inuse counter?
>
> If a BPF program calls bpf_arena_free() multiple times on the same object,
> this early return bypasses the double-free detection for arena caches. However,
> __slab_free() still unconditionally executes new.inuse -= cnt.
>
> Repeated double-frees will drive the inuse counter to 0. When inuse hits 0,
> SLUB assumes the slab is empty and frees the page back to the buddy allocator,
> even though other objects in the slab may still be in use by the BPF program
> or cached in SLUB's per-CPU sheaves. Could this lead to arbitrary memory
> corruption if the page is reused for kernel data structures?

Not an issue. sashiko missed the key safety concept of the algorithm
that is described in the commit log.
__slab_free() calls bpf_arena_free_slab_page() which frees PTE for that page
in arena, but bpf progs never see kernel direct-map virtual addresses.
bpf progs operate on kern_vm_start + uaddr32.
So once arena page is freed, bpf has no way to access it.
new.inuse for this arena based kmem_cache can be garbage.
It doesn't affect kernel at all. Only bpf prog that caused corruption
will get garbage back.

>
> [ ... ]
>
>> @@ -3493,9 +3516,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>>  	init_slab_obj_exts(slab);
>>  	/*
>>  	 * Poison the slab before initializing the slabobj_ext array
>> -	 * to prevent the array from being overwritten.
>> +	 * to prevent the array from being overwritten. Arena caches
>> +	 * stash uaddr32 in slab->stride; let them keep it.
>>  	 */
>> -	alloc_slab_obj_exts_early(s, slab);
>> +	if (!(s->flags & SLAB_BPF_ARENA))
>> +		alloc_slab_obj_exts_early(s, slab);
>
> [Severity: Critical]
> Does skipping alloc_slab_obj_exts_early() for arena caches leave
> slab->obj_exts uninitialized?
>
> Pages allocated for the BPF arena via bpf_map_alloc_pages() use __GFP_ACCOUNT,
> which sets folio->memcg_data to point to the memcg. In SLUB, slab->obj_exts
> shares the same memory location as folio->memcg_data.
>
> If alloc_slab_obj_exts_early() is skipped, slab->obj_exts is never zeroed and
> retains the memcg pointer. When the slab is freed, won't unaccount_slab() call
> free_slab_obj_exts(), which reads the non-null slab->obj_exts and passes it to
> kfree(), causing a kernel panic?

sashiko is clearly confused in the last paragraph.
The code it quoted calls init_slab_obj_exts(), which zeroes it.
But the point about sharing the same memory location as folio->memcg_data
is valid. As implemented, it skips memcg uncharging when the page is freed.
Fixing it with extra arena specific wrapper that does:
    if (!page->memcg_data && objcg && !obj_cgroup_is_root(objcg))
            page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM;
    __free_page(page);

>
> [ ... ]
>
>> @@ -5589,14 +5644,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>>  
>>  			n = get_node(s, slab_nid(slab));
>>  			/*
>> -			 * Speculatively acquire the list_lock.
>> -			 * If the cmpxchg does not succeed then we may
>> -			 * drop the list_lock without any processing.
>> -			 *
>> -			 * Otherwise the list_lock will synchronize with
>> -			 * other processors updating the list of slabs.
>> +			 * Speculatively acquire the list_lock. If the cmpxchg
>> +			 * does not succeed we drop the lock without processing.
>>  			 *
>> +			 * Arena caches may reach here from kfree_nolock() in
>> +			 * NMI/irq-off context; trylock and orphan the slab on
>> +			 * failure. A later allow_spin caller adopts it.
>>  			 */
>> -			spin_lock_irqsave(&n->list_lock, flags);
>> +			if (unlikely(s->bpf_arena)) {
>> +				if (!spin_trylock_irqsave(&n->list_lock, flags))
>> +					n = NULL;
>> +			} else {
>> +				spin_lock_irqsave(&n->list_lock, flags);
>> +			}
>
> [Severity: Medium]
> Can this trylock failure result in permanently leaking orphaned empty slabs?
>
> If spin_trylock_irqsave() fails for an arena cache, n is set to NULL, and the
> function returns early without adding the slab to the partial list, orphaning
> the slab.
>
> Later, when the slab becomes completely empty (new.inuse == 0), __slab_free()
> acquires the list lock but evaluates:
>
> if (!was_full && !on_node_partial)
>
> Since the slab was orphaned, on_node_partial is false, causing the function
> to unlock and return early. The slab_empty discard logic is bypassed. Does
> this permanently leak the empty slab until the entire arena map is destroyed?

No, it doesn't leak. That was also explained in the comment quoted above
and elsewhere throughout arena.c. The whole arena_destroy_slab_caches() is about that.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH bpf-next 4/4] selftests/bpf: Add tests for arena slub-backed allocator
  2026-05-29 22:39   ` sashiko-bot
@ 2026-05-30  0:13     ` Alexei Starovoitov
  0 siblings, 0 replies; 11+ messages in thread
From: Alexei Starovoitov @ 2026-05-30  0:13 UTC (permalink / raw)
  To: sashiko-reviews; +Cc: bpf

On Fri May 29, 2026 at 3:39 PM PDT, sashiko-bot wrote:
> Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
> - [Medium] The `arena_slab_freeptr_stale_pcs` selftest will reliably fail on architectures with a page size larger than 4K (e.g., ARM64 with 64K pages) due to hardcoded assumptions about SLUB slab page capacities.
> --
>
> commit 307042b1f4f8cca55cad67f2c12869bf67a9f7c2
> Author: Alexei Starovoitov <ast@kernel.org>
>
> selftests/bpf: Add tests for arena slub-backed allocator
>
> This commit adds end-to-end selftests covering the SLAB_BPF_ARENA path.
> It includes smoke tests for basic alloc/free kfuncs and exercises corrupted
> in-object freepointers to validate get_freepointer() clamping.
>
>> diff --git a/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c b/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c
>> new file mode 100644
>> index 0000000000000..4d23d75419d61
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/progs/arena_slab_freeptr_stale_pcs.c
>
> [ ... ]
>
>> +	victim = objs[TARGET_IDX];
>> +
>> +	/*
>> +	 * The 4096-byte bucket has one object per slab and a 4-object sheaf.
>> +	 * Free @victim while the sheaf is full, then turn its encoded NULL
>> +	 * freepointer into any non-NULL decoded value. The arena clamp keeps
>> +	 * non-NULL decoded values in the same slab and object-aligned, so this
>> +	 * becomes a freelist self-cycle back to @victim.
>> +	 */
>
> [Severity: Medium]
> Is this assumption valid on architectures with page sizes larger than 4K?
>
> On a system with 64KB pages, a 4096-byte bucket would hold multiple objects
> per slab page instead of just one.

It's fine. It's a stress test for the algorithm. It's gotta work
on at least one arch in CI.
But I'll switch to __PAGE_SIZE to avoid sashiko complaints in v2.

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2026-05-30  0:13 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-05-29 20:24 [PATCH bpf-next 0/4] bpf,slab: Introduce bpf_arena_alloc() kfuncs Alexei Starovoitov
2026-05-29 20:24 ` [PATCH bpf-next 1/4] slab: Introduce kmem_cache_alloc_nolock() Alexei Starovoitov
2026-05-29 21:20   ` sashiko-bot
2026-05-29 22:37     ` Alexei Starovoitov
2026-05-29 20:24 ` [PATCH bpf-next 2/4] slub: Pass kmem_cache to alloc_slab_page() Alexei Starovoitov
2026-05-29 20:24 ` [PATCH bpf-next 3/4] bpf,slab: Add slub-backed allocator for bpf_arena Alexei Starovoitov
2026-05-29 22:04   ` sashiko-bot
2026-05-30  0:04     ` Alexei Starovoitov
2026-05-29 20:24 ` [PATCH bpf-next 4/4] selftests/bpf: Add tests for arena slub-backed allocator Alexei Starovoitov
2026-05-29 22:39   ` sashiko-bot
2026-05-30  0:13     ` Alexei Starovoitov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox