[PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()

Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-15 11:05 [PATCH for-next v3 0/9] mm/slab: introduce kfree_rcu_nolock() and improve slub_kunit coverage Harry Yoo (Oracle)
@ 2026-06-15 11:06 ` Harry Yoo (Oracle)
  2026-06-16 17:28   ` Vlastimil Babka (SUSE)
  2026-06-21  0:29   ` XIAO WU
  0 siblings, 2 replies; 8+ messages in thread
From: Harry Yoo (Oracle) @ 2026-06-15 11:06 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, Hao Li, Christoph Lameter,
	David Rientjes, Roman Gushchin, Alexei Starovoitov,
	Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf

Currently, k[v]free_rcu() cannot be called in unknown context since
it could lead to a deadlock when called in the middle of k[v]free_rcu().

Make users' lives easier by introducing kfree_rcu_nolock() variant,
now that kfree_rcu_sheaf() is available on PREEMPT_RT and
__kfree_rcu_sheaf() handles unknown context.

Unlike k[v]free_rcu(), kfree_rcu_nolock() does not fall back to
the kvfree_rcu batching when the sheaves path fails, and falls back to
defer_kfree_rcu() instead. In most cases, the sheaves path is expected
to succeed and it's unnecessary to add complexity to the existing
kvfree_rcu batching.

Since defer_kfree_rcu() can be called on caches without sheaves, move
deferred_work_barrier() and rcu_barrier() outside the branch in
kvfree_rcu_barrier_on_cache().

Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>
---
 include/linux/rcupdate.h | 12 ++++++++++++
 mm/slab.h                |  1 +
 mm/slab_common.c         | 22 ++++++++++++++++++++--
 mm/slub.c                | 23 ++++++++++++++++++++++-
 4 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5e95acc33989..3025249bfcb5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1099,6 +1099,7 @@ static inline void rcu_read_unlock_migrate(void)
  * In mm/slab_common.c, no suitable header to include here.
  */
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
+void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr);
 
 /*
  * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
@@ -1122,6 +1123,17 @@ do {								\
 		kvfree_call_rcu(NULL, (void *) (___p));		\
 } while (0)
 
+/* kfree_rcu_nolock() supports 2-arg variant only */
+#define kfree_rcu_nolock(ptr, krhf)					\
+do {									\
+	typeof (ptr) ___p = (ptr);					\
+									\
+	if (___p) {							\
+		BUILD_BUG_ON(offsetof(typeof(*(ptr)), krhf) >= 4096);	\
+		kfree_call_rcu_nolock(&((___p)->krhf), (void *) (___p));\
+	}								\
+} while (0)
+
 /*
  * Place this after a lock-acquisition primitive to guarantee that
  * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
diff --git a/mm/slab.h b/mm/slab.h
index 961581e35ec8..a493c5201e96 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -745,6 +745,7 @@ void __check_heap_object(const void *ptr, unsigned long n,
 			 const struct slab *slab, bool to_user);
 
 void deferred_work_barrier(void);
+void defer_kfree_rcu(struct rcu_head *head);
 
 static inline bool slub_debug_orig_size(struct kmem_cache *s)
 {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 807924a94fb0..5a39e6225160 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1263,6 +1263,23 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
 EXPORT_TRACEPOINT_SYMBOL(kfree);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
 
+void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
+{
+	struct slab *slab;
+	struct kmem_cache *s;
+
+	VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
+
+	slab = virt_to_slab(ptr);
+	s = slab->slab_cache;
+
+	if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
+		return;
+
+	defer_kfree_rcu(head);
+}
+EXPORT_SYMBOL_GPL(kfree_call_rcu_nolock);
+
 #ifndef CONFIG_KVFREE_RCU_BATCHED
 
 void kvfree_call_rcu(struct rcu_head *head, void *ptr)
@@ -2120,10 +2137,11 @@ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
 		cpus_read_lock();
 		flush_rcu_sheaves_on_cache(s);
 		cpus_read_unlock();
-		deferred_work_barrier();
-		rcu_barrier();
 	}
 
+	/* kfree_rcu_nolock() might have deferred frees even without sheaves */
+	deferred_work_barrier();
+	rcu_barrier();
 	__kvfree_rcu_barrier();
 }
 EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
diff --git a/mm/slub.c b/mm/slub.c
index 4850629774b2..19018a979445 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4075,6 +4075,7 @@ static void flush_all(struct kmem_cache *s)
 
 struct deferred_percpu_work {
 	struct llist_head objects;
+	struct llist_head objects_by_rcu;
 	struct llist_head rcu_sheaves;
 	struct irq_work work;
 };
@@ -4083,6 +4084,7 @@ static void deferred_percpu_work_fn(struct irq_work *work);
 
 static DEFINE_PER_CPU(struct deferred_percpu_work, deferred_percpu_work) = {
 	.objects = LLIST_HEAD_INIT(objects),
+	.objects_by_rcu = LLIST_HEAD_INIT(objects_by_rcu),
 	.rcu_sheaves = LLIST_HEAD_INIT(rcu_sheaves),
 	.work = IRQ_WORK_INIT(deferred_percpu_work_fn),
 };
@@ -6392,12 +6394,13 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 static void deferred_percpu_work_fn(struct irq_work *work)
 {
 	struct deferred_percpu_work *dpw;
-	struct llist_head *objs, *rcu_sheaves;
+	struct llist_head *objs, *objs_by_rcu, *rcu_sheaves;
 	struct llist_node *llnode, *pos, *t;
 
 	dpw = container_of(work, struct deferred_percpu_work, work);
 	rcu_sheaves = &dpw->rcu_sheaves;
 	objs = &dpw->objects;
+	objs_by_rcu = &dpw->objects_by_rcu;
 
 	llnode = llist_del_all(objs);
 	llist_for_each_safe(pos, t, llnode) {
@@ -6428,6 +6431,13 @@ static void deferred_percpu_work_fn(struct irq_work *work)
 
 		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
 	}
+
+	llnode = llist_del_all(objs_by_rcu);
+	llist_for_each_safe(pos, t, llnode) {
+		struct rcu_head *head = (struct rcu_head *)pos;
+
+		call_rcu(head, kvfree_rcu_cb);
+	}
 }
 
 static void defer_free(struct kmem_cache *s, void *head)
@@ -6443,6 +6453,17 @@ static void defer_free(struct kmem_cache *s, void *head)
 		irq_work_queue(&dpw->work);
 }
 
+void defer_kfree_rcu(struct rcu_head *head)
+{
+	struct deferred_percpu_work *dpw;
+
+	guard(preempt)();
+
+	dpw = this_cpu_ptr(&deferred_percpu_work);
+	if (llist_add((struct llist_node *)head, &dpw->objects_by_rcu))
+		irq_work_queue(&dpw->work);
+}
+
 void deferred_work_barrier(void)
 {
 	int cpu;

-- 
2.53.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-15 11:06 ` [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() Harry Yoo (Oracle)
@ 2026-06-16 17:28   ` Vlastimil Babka (SUSE)
  2026-06-21  0:29   ` XIAO WU
  1 sibling, 0 replies; 8+ messages in thread
From: Vlastimil Babka (SUSE) @ 2026-06-16 17:28 UTC (permalink / raw)
  To: Harry Yoo (Oracle), Andrew Morton, Hao Li, Christoph Lameter,
	David Rientjes, Roman Gushchin, Alexei Starovoitov,
	Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf

On 6/15/26 13:06, Harry Yoo (Oracle) wrote:
> Currently, k[v]free_rcu() cannot be called in unknown context since
> it could lead to a deadlock when called in the middle of k[v]free_rcu().
> 
> Make users' lives easier by introducing kfree_rcu_nolock() variant,
> now that kfree_rcu_sheaf() is available on PREEMPT_RT and
> __kfree_rcu_sheaf() handles unknown context.
> 
> Unlike k[v]free_rcu(), kfree_rcu_nolock() does not fall back to
> the kvfree_rcu batching when the sheaves path fails, and falls back to
> defer_kfree_rcu() instead. In most cases, the sheaves path is expected
> to succeed and it's unnecessary to add complexity to the existing
> kvfree_rcu batching.
> 
> Since defer_kfree_rcu() can be called on caches without sheaves, move
> deferred_work_barrier() and rcu_barrier() outside the branch in
> kvfree_rcu_barrier_on_cache().
> 
> Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>

LGTM, nice.
Reviewed-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-15 11:06 ` [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() Harry Yoo (Oracle)
  2026-06-16 17:28   ` Vlastimil Babka (SUSE)
@ 2026-06-21  0:29   ` XIAO WU
  2026-06-22  5:28     ` Harry Yoo
  1 sibling, 1 reply; 8+ messages in thread
From: XIAO WU @ 2026-06-21  0:29 UTC (permalink / raw)
  To: Harry Yoo (Oracle), Vlastimil Babka, Andrew Morton, Hao Li,
	Christoph Lameter, David Rientjes, Roman Gushchin,
	Alexei Starovoitov, Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf

Hi,

I noticed the Sashiko AI review [1] in this thread flagged that
kfree_call_rcu_nolock() dereferences slab->slab_cache even when
virt_to_slab() returns NULL (for large kmalloc objects that bypass
SLUB, or vmalloc addresses).  The VM_WARN_ON_ONCE fires but does not
stop execution, and the subsequent NULL dereference is deterministic.

I was able to reproduce this in QEMU with KASAN.  The trigger is as
simple as passing a large (>8KB) kmalloc buffer to the new function.

On Tue, Jun 16, 2026 at 12:06:14AM +0800, Harry Yoo (Oracle) wrote:
 > This commit introduces kfree_rcu_nolock(), a variant of kfree_rcu()
 > designed to be safely called from unknown contexts without falling
 > back to batched processing.
...
 > +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
 > +{
 > +    struct slab *slab;
 > +    struct kmem_cache *s;
 > +
 > +    VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
 > +
 > +    slab = virt_to_slab(ptr);
 > +    s = slab->slab_cache;

The problem: if ptr is a large kmalloc object (> KMALLOC_MAX_CACHE_SIZE,
which is 8 KB on x86_64), the allocation bypasses SLUB and comes from
the page allocator.  virt_to_slab() returns NULL.  VM_WARN_ON_ONCE
prints a warning but does NOT return, and the next line dereferences
NULL->slab_cache at offset 0x8.

[Reproduction]

I rebuilt the kernel with CONFIG_KASAN=y and added a small late_initcall
that allocates a 16 KB buffer and passes it to kfree_call_rcu_nolock():

   static int __init kfree_rcu_nolock_poc_trigger(void)
   {
       void *p = kmalloc(16384, GFP_KERNEL);
       struct rcu_head *head = kmalloc(sizeof(*head), GFP_KERNEL);
       kfree_call_rcu_nolock(head, p);
       return 0;
   }
   late_initcall(kfree_rcu_nolock_poc_trigger);

[Crash log — kernel 6.19.0-rc5, CONFIG_KASAN=y, CONFIG_DEBUG_VM=y]

   kfree_rcu_nolock PoC: calling kfree_call_rcu_nolock on large obj ffff888026c5c000

   WARNING: mm/slab_common.c:1271 at kfree_call_rcu_nolock+0x1e/0xc0
   VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr))

   BUG: kernel NULL pointer dereference, address: 0000000000000008
   #PF: supervisor read access in kernel mode
   #PF: error_code(0x0000) - not-present page

   RIP: 0010:kfree_call_rcu_nolock+0x5c/0xc0
   Call Trace:
    <TASK>
    poc_trigger_init+0x2a/0x40
    do_one_initcall+0x131/0x730
    kernel_init_freeable+0x471/0x7e0
    kernel_init+0x28/0x300
    ret_from_fork+0x2c/0xc0
    </TASK>

   Kernel panic - not syncing: Fatal exception

The crash is at offset 0x5c inside kfree_call_rcu_nolock(), which
corresponds to `s = slab->slab_cache`.  The fault address 0x8 is
exactly offsetof(struct slab, slab_cache).


[1] https://sashiko.dev/#/patchset/20260615-kfree_rcu_nolock-v3-0-70a54f3775bb%40kernel.org
     (Sashiko AI code review — "Null Pointer Dereference", Severity: Critical)

Thanks,
XIAO




^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-21  0:29   ` XIAO WU
@ 2026-06-22  5:28     ` Harry Yoo
  2026-06-22 14:56       ` XIAO WU
  0 siblings, 1 reply; 8+ messages in thread
From: Harry Yoo @ 2026-06-22  5:28 UTC (permalink / raw)
  To: XIAO WU, Vlastimil Babka, Andrew Morton, Hao Li,
	Christoph Lameter, David Rientjes, Roman Gushchin,
	Alexei Starovoitov, Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf


[-- Attachment #1.1: Type: text/plain, Size: 3544 bytes --]



On 6/21/26 9:29 AM, XIAO WU wrote:
> Hi,

Hi Xiao,

> I noticed the Sashiko AI review [1] in this thread flagged that
> kfree_call_rcu_nolock() dereferences slab->slab_cache even when
> virt_to_slab() returns NULL (for large kmalloc objects that bypass
> SLUB, or vmalloc addresses).  The VM_WARN_ON_ONCE fires but does not
> stop execution, and the subsequent NULL dereference is deterministic.

Thanks for taking a look, but this was intentional.

I should have documented that only kmalloc_nolock() ->
kfree_rcu_nolock() is allowed and kmalloc() -> kfree_rcu_nolock()
is not allowed (yet).

> I was able to reproduce this in QEMU with KASAN.  The trigger is as
> simple as passing a large (>8KB) kmalloc buffer to the new function.
> 
> On Tue, Jun 16, 2026 at 12:06:14AM +0800, Harry Yoo (Oracle) wrote:
>> This commit introduces kfree_rcu_nolock(), a variant of kfree_rcu()
>> designed to be safely called from unknown contexts without falling
>> back to batched processing.
> ...
>> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
>> +{
>> +    struct slab *slab;
>> +    struct kmem_cache *s;
>> +
>> +    VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
>> +
>> +    slab = virt_to_slab(ptr);
>> +    s = slab->slab_cache;
> 
> The problem: if ptr is a large kmalloc object (> KMALLOC_MAX_CACHE_SIZE,
> which is 8 KB on x86_64), the allocation bypasses SLUB and comes from
> the page allocator.  virt_to_slab() returns NULL.  VM_WARN_ON_ONCE
> prints a warning but does NOT return, and the next line dereferences
> NULL->slab_cache at offset 0x8.

Since kmalloc_nolock() does not support large kmalloc, the warning
is not supposed to trigger. That is why I added only debug warnings.

> [Reproduction]
> 
> I rebuilt the kernel with CONFIG_KASAN=y and added a small late_initcall
> that allocates a 16 KB buffer and passes it to kfree_call_rcu_nolock():
> 
>   static int __init kfree_rcu_nolock_poc_trigger(void)
>   {
>       void *p = kmalloc(16384, GFP_KERNEL);
>       struct rcu_head *head = kmalloc(sizeof(*head), GFP_KERNEL);
>       kfree_call_rcu_nolock(head, p);

As mentioned earlier, kmalloc() -> kfree_rcu_nolock() is not supported.

-- 
Cheers,
Harry / Hyeonggon

>       return 0;
>   }
>   late_initcall(kfree_rcu_nolock_poc_trigger);
> 
> [Crash log — kernel 6.19.0-rc5, CONFIG_KASAN=y, CONFIG_DEBUG_VM=y]
> 
>   kfree_rcu_nolock PoC: calling kfree_call_rcu_nolock on large obj ffff888026c5c000
> 
>   WARNING: mm/slab_common.c:1271 at kfree_call_rcu_nolock+0x1e/0xc0
>   VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr))
> 
>   BUG: kernel NULL pointer dereference, address: 0000000000000008
>   #PF: supervisor read access in kernel mode
>   #PF: error_code(0x0000) - not-present page
> 
>   RIP: 0010:kfree_call_rcu_nolock+0x5c/0xc0
>   Call Trace:
>    <TASK>
>    poc_trigger_init+0x2a/0x40
>    do_one_initcall+0x131/0x730
>    kernel_init_freeable+0x471/0x7e0
>    kernel_init+0x28/0x300
>    ret_from_fork+0x2c/0xc0
>    </TASK>
> 
>   Kernel panic - not syncing: Fatal exception
> 
> The crash is at offset 0x5c inside kfree_call_rcu_nolock(), which
> corresponds to `s = slab->slab_cache`.  The fault address 0x8 is
> exactly offsetof(struct slab, slab_cache).
> 
> [1] https://sashiko.dev/#/patchset/20260615-kfree_rcu_nolock-v3-0-70a54f3775bb%40kernel.org
>     (Sashiko AI code review — "Null Pointer Dereference", Severity: Critical)
> 
> Thanks,
> XIAO

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-22  5:28     ` Harry Yoo
@ 2026-06-22 14:56       ` XIAO WU
  2026-06-25  5:27         ` Harry Yoo
  0 siblings, 1 reply; 8+ messages in thread
From: XIAO WU @ 2026-06-22 14:56 UTC (permalink / raw)
  To: Harry Yoo, Vlastimil Babka, Andrew Morton, Hao Li,
	Christoph Lameter, David Rientjes, Roman Gushchin,
	Alexei Starovoitov, Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf

Hi Harry,

On Mon, Jun 22, 2026 at 02:28:44PM +0900, Harry Yoo wrote:
 > On 6/21/26 9:29 AM, XIAO WU wrote:
 > > I was able to reproduce this in QEMU with KASAN.  The trigger is as
 > > simple as passing a large (>8KB) kmalloc buffer to the new function.
 >
 > Thanks for taking a look, but this was intentional.
 >
 > I should have documented that only kmalloc_nolock() ->
 > kfree_rcu_nolock() is allowed and kmalloc() -> kfree_rcu_nolock()
 > is not allowed (yet).
 >
 > Since kmalloc_nolock() does not support large kmalloc, the warning
 > is not supposed to trigger. That is why I added only debug warnings.

Thank you very much for taking the time to explain — I really
appreciate it, especially since I'm still learning my way around the
mm/ subsystem.  You are absolutely right that kmalloc_nolock() returns
NULL for sizes above KMALLOC_MAX_CACHE_SIZE, so a proper caller using
the kmalloc_nolock() → kfree_rcu_nolock() pairing would never hit this.

I did notice one small thing that I wanted to gently bring up, though.
Please forgive me if I'm missing something obvious here.

When I was reading through the surrounding code to understand the
pattern better, I noticed that kfree_nolock() — which has the same
"only for kmalloc_nolock()" constraint (documented in the comment at
mm/slub.c:6828-6835) — actually does check for a NULL slab:

   void kfree_nolock(const void *object)
   {
   	...
   	slab = virt_to_slab(object);
   	if (unlikely(!slab)) {
   		WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
   		return;
   	}
   	s = slab->slab_cache;
   	...

So kfree_nolock() gracefully returns with a warning even though it too
expects only kmalloc_nolock() callers.  That pattern seemed really
sensible to me — it costs almost nothing and prevents a panic if
someone ever passes the wrong pointer (which they shouldn't, but as you
mentioned, the constraint isn't documented on kfree_call_rcu_nolock()
yet).

I also wondered about the difference between WARN_ONCE (used in
kfree_nolock) and VM_WARN_ON_ONCE (used in kfree_call_rcu_nolock).  If
I understand correctly, VM_WARN_ON_ONCE compiles away entirely on
production kernels without CONFIG_DEBUG_VM, which would make the
subsequent NULL dereference completely silent — no warning, just a
panic.  That seems a bit scary for something that's exported to
modules via EXPORT_SYMBOL_GPL.

And since you mentioned that kmalloc() → kfree_rcu_nolock() support is
planned for the future (the "yet") — wouldn't this code path need the
NULL check at that point anyway?

I was thinking something like this would make the function consistent
with kfree_nolock() and also make it forward-compatible with the
planned kmalloc() support:

--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1266,10 +1266,16 @@ void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
  {
  	struct slab *slab;
  	struct kmem_cache *s;

-	VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
-
  	slab = virt_to_slab(ptr);
+	/*
+	 * kmalloc_nolock() never produces large-kmalloc or vmalloc
+	 * addresses, but be defensive: fall back to defer_kfree_rcu()
+	 * for unsupported pointer types, consistent with kfree_nolock().
+	 */
+	if (unlikely(!slab))
+		goto fallback;
+
  	s = slab->slab_cache;

  	if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
  		return;

+fallback:
  	defer_kfree_rcu(head);
  }

Of course, this is just a suggestion — you know this code far better
than I do.  If you feel the current code is fine as-is with proper
documentation, I completely understand and won't press the point
further.

Either way, thank you again for the explanation, and for working on
this series — having kfree_rcu_nolock() available for BPF and other
contexts will be really valuable.

Thanks,
XIAO

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
@ 2026-06-24  9:22 hu.shengming
  2026-06-25  5:40 ` Harry Yoo
  0 siblings, 1 reply; 8+ messages in thread
From: hu.shengming @ 2026-06-24  9:22 UTC (permalink / raw)
  To: harry
  Cc: vbabka, hao.li, cl, rientjes, roman.gushchin, linux-mm,
	linux-kernel, zhang.run, cai.qu

Harry wrote:
> Currently, k[v]free_rcu() cannot be called in unknown context since
> it could lead to a deadlock when called in the middle of k[v]free_rcu().
> 
> Make users' lives easier by introducing kfree_rcu_nolock() variant,
> now that kfree_rcu_sheaf() is available on PREEMPT_RT and
> __kfree_rcu_sheaf() handles unknown context.
> 
> Unlike k[v]free_rcu(), kfree_rcu_nolock() does not fall back to
> the kvfree_rcu batching when the sheaves path fails, and falls back to
> defer_kfree_rcu() instead. In most cases, the sheaves path is expected
> to succeed and it's unnecessary to add complexity to the existing
> kvfree_rcu batching.
> 
> Since defer_kfree_rcu() can be called on caches without sheaves, move
> deferred_work_barrier() and rcu_barrier() outside the branch in
> kvfree_rcu_barrier_on_cache().
> 
> Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>

Hi Harry,

Thanks for the series. These patches fill a clear functional gap in the
existing free APIs by adding an RCU-deferred free interface for contexts
where kfree_rcu() cannot safely be used.

> ---
>  include/linux/rcupdate.h | 12 ++++++++++++
>  mm/slab.h                |  1 +
>  mm/slab_common.c         | 22 ++++++++++++++++++++--
>  mm/slub.c                | 23 ++++++++++++++++++++++-
>  4 files changed, 55 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 5e95acc33989..3025249bfcb5 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -1099,6 +1099,7 @@ static inline void rcu_read_unlock_migrate(void)
>   * In mm/slab_common.c, no suitable header to include here.
>   */
>  void kvfree_call_rcu(struct rcu_head *head, void *ptr);
> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr);
>  
>  /*
>   * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
> @@ -1122,6 +1123,17 @@ do {								\
>  		kvfree_call_rcu(NULL, (void *) (___p));		\
>  } while (0)
>  
> +/* kfree_rcu_nolock() supports 2-arg variant only */
> +#define kfree_rcu_nolock(ptr, krhf)					\
> +do {									\
> +	typeof (ptr) ___p = (ptr);					\
> +									\
> +	if (___p) {							\
> +		BUILD_BUG_ON(offsetof(typeof(*(ptr)), krhf) >= 4096);	\
> +		kfree_call_rcu_nolock(&((___p)->krhf), (void *) (___p));\
> +	}								\
> +} while (0)
> +
>  /*
>   * Place this after a lock-acquisition primitive to guarantee that
>   * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
> diff --git a/mm/slab.h b/mm/slab.h
> index 961581e35ec8..a493c5201e96 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -745,6 +745,7 @@ void __check_heap_object(const void *ptr, unsigned long n,
>  			 const struct slab *slab, bool to_user);
>  
>  void deferred_work_barrier(void);
> +void defer_kfree_rcu(struct rcu_head *head);
>  
>  static inline bool slub_debug_orig_size(struct kmem_cache *s)
>  {
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 807924a94fb0..5a39e6225160 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1263,6 +1263,23 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
>  EXPORT_TRACEPOINT_SYMBOL(kfree);
>  EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
>  
> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
> +{
> +	struct slab *slab;
> +	struct kmem_cache *s;
> +
> +	VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
> +
> +	slab = virt_to_slab(ptr);
> +	s = slab->slab_cache;
> +
> +	if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
> +		return;
> +

One consistency issue to address here: kfree_rcu_sheaf() only calls
__kfree_rcu_sheaf() for objects belonging to the local NUMA node. This
avoids filling a CPU's per-CPU sheaves with objects from remote slabs.

kfree_call_rcu_nolock() currently skips that check and may therefore
place remote-node objects into the local CPU's RCU sheaf.

Could you add the same local-node check used by kfree_rcu_sheaf()
before calling __kfree_rcu_sheaf(), and route remote-node objects
directly to the defer_kfree_rcu() fallback path instead?

--
With Best Regards,
Shengming

> +	defer_kfree_rcu(head);
> +}
> +EXPORT_SYMBOL_GPL(kfree_call_rcu_nolock);
> +
>  #ifndef CONFIG_KVFREE_RCU_BATCHED
>  
>  void kvfree_call_rcu(struct rcu_head *head, void *ptr)
> @@ -2120,10 +2137,11 @@ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
>  		cpus_read_lock();
>  		flush_rcu_sheaves_on_cache(s);
>  		cpus_read_unlock();
> -		deferred_work_barrier();
> -		rcu_barrier();
>  	}
>  
> +	/* kfree_rcu_nolock() might have deferred frees even without sheaves */
> +	deferred_work_barrier();
> +	rcu_barrier();
>  	__kvfree_rcu_barrier();
>  }
>  EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
> diff --git a/mm/slub.c b/mm/slub.c
> index 4850629774b2..19018a979445 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4075,6 +4075,7 @@ static void flush_all(struct kmem_cache *s)
>  
>  struct deferred_percpu_work {
>  	struct llist_head objects;
> +	struct llist_head objects_by_rcu;
>  	struct llist_head rcu_sheaves;
>  	struct irq_work work;
>  };
> @@ -4083,6 +4084,7 @@ static void deferred_percpu_work_fn(struct irq_work *work);
>  
>  static DEFINE_PER_CPU(struct deferred_percpu_work, deferred_percpu_work) = {
>  	.objects = LLIST_HEAD_INIT(objects),
> +	.objects_by_rcu = LLIST_HEAD_INIT(objects_by_rcu),
>  	.rcu_sheaves = LLIST_HEAD_INIT(rcu_sheaves),
>  	.work = IRQ_WORK_INIT(deferred_percpu_work_fn),
>  };
> @@ -6392,12 +6394,13 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
>  static void deferred_percpu_work_fn(struct irq_work *work)
>  {
>  	struct deferred_percpu_work *dpw;
> -	struct llist_head *objs, *rcu_sheaves;
> +	struct llist_head *objs, *objs_by_rcu, *rcu_sheaves;
>  	struct llist_node *llnode, *pos, *t;
>  
>  	dpw = container_of(work, struct deferred_percpu_work, work);
>  	rcu_sheaves = &dpw->rcu_sheaves;
>  	objs = &dpw->objects;
> +	objs_by_rcu = &dpw->objects_by_rcu;
>  
>  	llnode = llist_del_all(objs);
>  	llist_for_each_safe(pos, t, llnode) {
> @@ -6428,6 +6431,13 @@ static void deferred_percpu_work_fn(struct irq_work *work)
>  
>  		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
>  	}
> +
> +	llnode = llist_del_all(objs_by_rcu);
> +	llist_for_each_safe(pos, t, llnode) {
> +		struct rcu_head *head = (struct rcu_head *)pos;
> +
> +		call_rcu(head, kvfree_rcu_cb);
> +	}
>  }
>  
>  static void defer_free(struct kmem_cache *s, void *head)
> @@ -6443,6 +6453,17 @@ static void defer_free(struct kmem_cache *s, void *head)
>  		irq_work_queue(&dpw->work);
>  }
>  
> +void defer_kfree_rcu(struct rcu_head *head)
> +{
> +	struct deferred_percpu_work *dpw;
> +
> +	guard(preempt)();
> +
> +	dpw = this_cpu_ptr(&deferred_percpu_work);
> +	if (llist_add((struct llist_node *)head, &dpw->objects_by_rcu))
> +		irq_work_queue(&dpw->work);
> +}
> +
>  void deferred_work_barrier(void)
>  {
>  	int cpu;
> 
> -- 
> 2.53.0


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-22 14:56       ` XIAO WU
@ 2026-06-25  5:27         ` Harry Yoo
  0 siblings, 0 replies; 8+ messages in thread
From: Harry Yoo @ 2026-06-25  5:27 UTC (permalink / raw)
  To: XIAO WU, Vlastimil Babka, Andrew Morton, Hao Li,
	Christoph Lameter, David Rientjes, Roman Gushchin,
	Alexei Starovoitov, Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf


[-- Attachment #1.1: Type: text/plain, Size: 4556 bytes --]



On 6/22/26 11:56 PM, XIAO WU wrote:
> Hi Harry,
> 
> On Mon, Jun 22, 2026 at 02:28:44PM +0900, Harry Yoo wrote:
>> On 6/21/26 9:29 AM, XIAO WU wrote:
>> > I was able to reproduce this in QEMU with KASAN.  The trigger is as
>> > simple as passing a large (>8KB) kmalloc buffer to the new function.
>>
>> Thanks for taking a look, but this was intentional.
>>
>> I should have documented that only kmalloc_nolock() ->
>> kfree_rcu_nolock() is allowed and kmalloc() -> kfree_rcu_nolock()
>> is not allowed (yet).
>>
>> Since kmalloc_nolock() does not support large kmalloc, the warning
>> is not supposed to trigger. That is why I added only debug warnings.
> 
> Thank you very much for taking the time to explain — I really
> appreciate it, especially since I'm still learning my way around the
> mm/ subsystem.  You are absolutely right that kmalloc_nolock() returns
> NULL for sizes above KMALLOC_MAX_CACHE_SIZE, so a proper caller using
> the kmalloc_nolock() → kfree_rcu_nolock() pairing would never hit this.
> 
> I did notice one small thing that I wanted to gently bring up, though.
> Please forgive me if I'm missing something obvious here.
> 
> When I was reading through the surrounding code to understand the
> pattern better, I noticed that kfree_nolock() — which has the same
> "only for kmalloc_nolock()" constraint (documented in the comment at
> mm/slub.c:6828-6835) — actually does check for a NULL slab:
> 
>   void kfree_nolock(const void *object)
>   {
>       ...
>       slab = virt_to_slab(object);
>       if (unlikely(!slab)) {
>           WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
>           return;
>       }
>       s = slab->slab_cache;
>       ...
> 
> So kfree_nolock() gracefully returns with a warning even though it too
> expects only kmalloc_nolock() callers.  That pattern seemed really
> sensible to me — it costs almost nothing and prevents a panic if
> someone ever passes the wrong pointer (which they shouldn't, but as you
> mentioned, the constraint isn't documented on kfree_call_rcu_nolock()
> yet).
>
> I also wondered about the difference between WARN_ONCE (used in
> kfree_nolock) and VM_WARN_ON_ONCE (used in kfree_call_rcu_nolock). If
> I understand correctly, VM_WARN_ON_ONCE compiles away entirely on
> production kernels without CONFIG_DEBUG_VM, which would make the
> subsequent NULL dereference completely silent — no warning, just a
> panic.

It would crash without the debug option anyway, but the warnings are there
to make it easier to pinpoint what has gone wrong.

And not testing the code path at least once with debug option is a big
problem :)

> And since you mentioned that kmalloc() → kfree_rcu_nolock() support is
> planned for the future (the "yet") — wouldn't this code path need the
> NULL check at that point anyway?
> 
> I was thinking something like this would make the function consistent
> with kfree_nolock() and also make it forward-compatible with the
> planned kmalloc() support:
> 
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1266,10 +1266,16 @@ void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
>  {
>      struct slab *slab;
>      struct kmem_cache *s;
> 
> -    VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
> -
>      slab = virt_to_slab(ptr);
> +    /*
> +     * kmalloc_nolock() never produces large-kmalloc or vmalloc
> +     * addresses, but be defensive: fall back to defer_kfree_rcu()
> +     * for unsupported pointer types, consistent with kfree_nolock().
> +     */
> +    if (unlikely(!slab))
> +        goto fallback;

Just FYI, virt_to_slab() and virt_to_page()
don't work correctly for vmalloc addresses.

And I don't think silently making it work is good.

> +
>      s = slab->slab_cache;
> 
>      if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
>          return;
> 
> +fallback:
>      defer_kfree_rcu(head);
>  }
> 
> Of course, this is just a suggestion — you know this code far better
> than I do.  If you feel the current code is fine as-is with proper
> documentation, I completely understand and won't press the point
> further.
> 
> Either way, thank you again for the explanation, and for working on
> this series — having kfree_rcu_nolock() available for BPF and other
> contexts will be really valuable.
> 
> Thanks,
> XIAO

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-24  9:22 [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() hu.shengming
@ 2026-06-25  5:40 ` Harry Yoo
  0 siblings, 0 replies; 8+ messages in thread
From: Harry Yoo @ 2026-06-25  5:40 UTC (permalink / raw)
  To: hu.shengming
  Cc: vbabka, hao.li, cl, rientjes, roman.gushchin, linux-mm,
	linux-kernel, zhang.run, cai.qu


[-- Attachment #1.1: Type: text/plain, Size: 3078 bytes --]



On 6/24/26 6:22 PM, hu.shengming@zte.com.cn wrote:
> Harry wrote:
>> Currently, k[v]free_rcu() cannot be called in unknown context since
>> it could lead to a deadlock when called in the middle of k[v]free_rcu().
>>
>> Make users' lives easier by introducing kfree_rcu_nolock() variant,
>> now that kfree_rcu_sheaf() is available on PREEMPT_RT and
>> __kfree_rcu_sheaf() handles unknown context.
>>
>> Unlike k[v]free_rcu(), kfree_rcu_nolock() does not fall back to
>> the kvfree_rcu batching when the sheaves path fails, and falls back to
>> defer_kfree_rcu() instead. In most cases, the sheaves path is expected
>> to succeed and it's unnecessary to add complexity to the existing
>> kvfree_rcu batching.
>>
>> Since defer_kfree_rcu() can be called on caches without sheaves, move
>> deferred_work_barrier() and rcu_barrier() outside the branch in
>> kvfree_rcu_barrier_on_cache().
>>
>> Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>
> 
> Hi Harry,
> 
> Thanks for the series. These patches fill a clear functional gap in the
> existing free APIs by adding an RCU-deferred free interface for contexts
> where kfree_rcu() cannot safely be used.

Thanks for looking into this, Shengming.

>> ---
>>  include/linux/rcupdate.h | 12 ++++++++++++
>>  mm/slab.h                |  1 +
>>  mm/slab_common.c         | 22 ++++++++++++++++++++--
>>  mm/slub.c                | 23 ++++++++++++++++++++++-
>>  4 files changed, 55 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index 807924a94fb0..5a39e6225160 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -1263,6 +1263,23 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
>>  EXPORT_TRACEPOINT_SYMBOL(kfree);
>>  EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
>>  
>> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
>> +{
>> +	struct slab *slab;
>> +	struct kmem_cache *s;
>> +
>> +	VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
>> +
>> +	slab = virt_to_slab(ptr);
>> +	s = slab->slab_cache;
>> +
>> +	if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
>> +		return;
>> +
> 
> One consistency issue to address here: kfree_rcu_sheaf() only calls
> __kfree_rcu_sheaf() for objects belonging to the local NUMA node. This
> avoids filling a CPU's per-CPU sheaves with objects from remote slabs.
> 
> kfree_call_rcu_nolock() currently skips that check and may therefore
> place remote-node objects into the local CPU's RCU sheaf.

That was intentional, but actually, this is a good point. Thanks.

> Could you add the same local-node check used by kfree_rcu_sheaf()
> before calling __kfree_rcu_sheaf(), and route remote-node objects
> directly to the defer_kfree_rcu() fallback path instead?

Falling back to defer_kfree_rcu() in v3 didn't make much sense,
as the object is inserted into a global list, which would cause more
trouble than a NUMA miss.

But once we make the fallback path percpu, your suggestion would make
more sense.

-- 
Cheers,
Harry / Hyeonggon


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-06-25  5:40 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-06-24  9:22 [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() hu.shengming
2026-06-25  5:40 ` Harry Yoo
  -- strict thread matches above, loose matches on Subject: below --
2026-06-15 11:05 [PATCH for-next v3 0/9] mm/slab: introduce kfree_rcu_nolock() and improve slub_kunit coverage Harry Yoo (Oracle)
2026-06-15 11:06 ` [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() Harry Yoo (Oracle)
2026-06-16 17:28   ` Vlastimil Babka (SUSE)
2026-06-21  0:29   ` XIAO WU
2026-06-22  5:28     ` Harry Yoo
2026-06-22 14:56       ` XIAO WU
2026-06-25  5:27         ` Harry Yoo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox