Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()

Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
@ 2026-06-24  9:22 hu.shengming
  2026-06-25  5:40 ` Harry Yoo
  0 siblings, 1 reply; 8+ messages in thread
From: hu.shengming @ 2026-06-24  9:22 UTC (permalink / raw)
  To: harry
  Cc: vbabka, hao.li, cl, rientjes, roman.gushchin, linux-mm,
	linux-kernel, zhang.run, cai.qu

Harry wrote:
> Currently, k[v]free_rcu() cannot be called in unknown context since
> it could lead to a deadlock when called in the middle of k[v]free_rcu().
> 
> Make users' lives easier by introducing kfree_rcu_nolock() variant,
> now that kfree_rcu_sheaf() is available on PREEMPT_RT and
> __kfree_rcu_sheaf() handles unknown context.
> 
> Unlike k[v]free_rcu(), kfree_rcu_nolock() does not fall back to
> the kvfree_rcu batching when the sheaves path fails, and falls back to
> defer_kfree_rcu() instead. In most cases, the sheaves path is expected
> to succeed and it's unnecessary to add complexity to the existing
> kvfree_rcu batching.
> 
> Since defer_kfree_rcu() can be called on caches without sheaves, move
> deferred_work_barrier() and rcu_barrier() outside the branch in
> kvfree_rcu_barrier_on_cache().
> 
> Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>

Hi Harry,

Thanks for the series. These patches fill a clear functional gap in the
existing free APIs by adding an RCU-deferred free interface for contexts
where kfree_rcu() cannot safely be used.

> ---
>  include/linux/rcupdate.h | 12 ++++++++++++
>  mm/slab.h                |  1 +
>  mm/slab_common.c         | 22 ++++++++++++++++++++--
>  mm/slub.c                | 23 ++++++++++++++++++++++-
>  4 files changed, 55 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 5e95acc33989..3025249bfcb5 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -1099,6 +1099,7 @@ static inline void rcu_read_unlock_migrate(void)
>   * In mm/slab_common.c, no suitable header to include here.
>   */
>  void kvfree_call_rcu(struct rcu_head *head, void *ptr);
> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr);
>  
>  /*
>   * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
> @@ -1122,6 +1123,17 @@ do {								\
>  		kvfree_call_rcu(NULL, (void *) (___p));		\
>  } while (0)
>  
> +/* kfree_rcu_nolock() supports 2-arg variant only */
> +#define kfree_rcu_nolock(ptr, krhf)					\
> +do {									\
> +	typeof (ptr) ___p = (ptr);					\
> +									\
> +	if (___p) {							\
> +		BUILD_BUG_ON(offsetof(typeof(*(ptr)), krhf) >= 4096);	\
> +		kfree_call_rcu_nolock(&((___p)->krhf), (void *) (___p));\
> +	}								\
> +} while (0)
> +
>  /*
>   * Place this after a lock-acquisition primitive to guarantee that
>   * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
> diff --git a/mm/slab.h b/mm/slab.h
> index 961581e35ec8..a493c5201e96 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -745,6 +745,7 @@ void __check_heap_object(const void *ptr, unsigned long n,
>  			 const struct slab *slab, bool to_user);
>  
>  void deferred_work_barrier(void);
> +void defer_kfree_rcu(struct rcu_head *head);
>  
>  static inline bool slub_debug_orig_size(struct kmem_cache *s)
>  {
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 807924a94fb0..5a39e6225160 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1263,6 +1263,23 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
>  EXPORT_TRACEPOINT_SYMBOL(kfree);
>  EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
>  
> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
> +{
> +	struct slab *slab;
> +	struct kmem_cache *s;
> +
> +	VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
> +
> +	slab = virt_to_slab(ptr);
> +	s = slab->slab_cache;
> +
> +	if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
> +		return;
> +

One consistency issue to address here: kfree_rcu_sheaf() only calls
__kfree_rcu_sheaf() for objects belonging to the local NUMA node. This
avoids filling a CPU's per-CPU sheaves with objects from remote slabs.

kfree_call_rcu_nolock() currently skips that check and may therefore
place remote-node objects into the local CPU's RCU sheaf.

Could you add the same local-node check used by kfree_rcu_sheaf()
before calling __kfree_rcu_sheaf(), and route remote-node objects
directly to the defer_kfree_rcu() fallback path instead?

--
With Best Regards,
Shengming

> +	defer_kfree_rcu(head);
> +}
> +EXPORT_SYMBOL_GPL(kfree_call_rcu_nolock);
> +
>  #ifndef CONFIG_KVFREE_RCU_BATCHED
>  
>  void kvfree_call_rcu(struct rcu_head *head, void *ptr)
> @@ -2120,10 +2137,11 @@ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
>  		cpus_read_lock();
>  		flush_rcu_sheaves_on_cache(s);
>  		cpus_read_unlock();
> -		deferred_work_barrier();
> -		rcu_barrier();
>  	}
>  
> +	/* kfree_rcu_nolock() might have deferred frees even without sheaves */
> +	deferred_work_barrier();
> +	rcu_barrier();
>  	__kvfree_rcu_barrier();
>  }
>  EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
> diff --git a/mm/slub.c b/mm/slub.c
> index 4850629774b2..19018a979445 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4075,6 +4075,7 @@ static void flush_all(struct kmem_cache *s)
>  
>  struct deferred_percpu_work {
>  	struct llist_head objects;
> +	struct llist_head objects_by_rcu;
>  	struct llist_head rcu_sheaves;
>  	struct irq_work work;
>  };
> @@ -4083,6 +4084,7 @@ static void deferred_percpu_work_fn(struct irq_work *work);
>  
>  static DEFINE_PER_CPU(struct deferred_percpu_work, deferred_percpu_work) = {
>  	.objects = LLIST_HEAD_INIT(objects),
> +	.objects_by_rcu = LLIST_HEAD_INIT(objects_by_rcu),
>  	.rcu_sheaves = LLIST_HEAD_INIT(rcu_sheaves),
>  	.work = IRQ_WORK_INIT(deferred_percpu_work_fn),
>  };
> @@ -6392,12 +6394,13 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
>  static void deferred_percpu_work_fn(struct irq_work *work)
>  {
>  	struct deferred_percpu_work *dpw;
> -	struct llist_head *objs, *rcu_sheaves;
> +	struct llist_head *objs, *objs_by_rcu, *rcu_sheaves;
>  	struct llist_node *llnode, *pos, *t;
>  
>  	dpw = container_of(work, struct deferred_percpu_work, work);
>  	rcu_sheaves = &dpw->rcu_sheaves;
>  	objs = &dpw->objects;
> +	objs_by_rcu = &dpw->objects_by_rcu;
>  
>  	llnode = llist_del_all(objs);
>  	llist_for_each_safe(pos, t, llnode) {
> @@ -6428,6 +6431,13 @@ static void deferred_percpu_work_fn(struct irq_work *work)
>  
>  		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
>  	}
> +
> +	llnode = llist_del_all(objs_by_rcu);
> +	llist_for_each_safe(pos, t, llnode) {
> +		struct rcu_head *head = (struct rcu_head *)pos;
> +
> +		call_rcu(head, kvfree_rcu_cb);
> +	}
>  }
>  
>  static void defer_free(struct kmem_cache *s, void *head)
> @@ -6443,6 +6453,17 @@ static void defer_free(struct kmem_cache *s, void *head)
>  		irq_work_queue(&dpw->work);
>  }
>  
> +void defer_kfree_rcu(struct rcu_head *head)
> +{
> +	struct deferred_percpu_work *dpw;
> +
> +	guard(preempt)();
> +
> +	dpw = this_cpu_ptr(&deferred_percpu_work);
> +	if (llist_add((struct llist_node *)head, &dpw->objects_by_rcu))
> +		irq_work_queue(&dpw->work);
> +}
> +
>  void deferred_work_barrier(void)
>  {
>  	int cpu;
> 
> -- 
> 2.53.0


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-24  9:22 [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() hu.shengming
@ 2026-06-25  5:40 ` Harry Yoo
  0 siblings, 0 replies; 8+ messages in thread
From: Harry Yoo @ 2026-06-25  5:40 UTC (permalink / raw)
  To: hu.shengming
  Cc: vbabka, hao.li, cl, rientjes, roman.gushchin, linux-mm,
	linux-kernel, zhang.run, cai.qu


[-- Attachment #1.1: Type: text/plain, Size: 3078 bytes --]



On 6/24/26 6:22 PM, hu.shengming@zte.com.cn wrote:
> Harry wrote:
>> Currently, k[v]free_rcu() cannot be called in unknown context since
>> it could lead to a deadlock when called in the middle of k[v]free_rcu().
>>
>> Make users' lives easier by introducing kfree_rcu_nolock() variant,
>> now that kfree_rcu_sheaf() is available on PREEMPT_RT and
>> __kfree_rcu_sheaf() handles unknown context.
>>
>> Unlike k[v]free_rcu(), kfree_rcu_nolock() does not fall back to
>> the kvfree_rcu batching when the sheaves path fails, and falls back to
>> defer_kfree_rcu() instead. In most cases, the sheaves path is expected
>> to succeed and it's unnecessary to add complexity to the existing
>> kvfree_rcu batching.
>>
>> Since defer_kfree_rcu() can be called on caches without sheaves, move
>> deferred_work_barrier() and rcu_barrier() outside the branch in
>> kvfree_rcu_barrier_on_cache().
>>
>> Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>
> 
> Hi Harry,
> 
> Thanks for the series. These patches fill a clear functional gap in the
> existing free APIs by adding an RCU-deferred free interface for contexts
> where kfree_rcu() cannot safely be used.

Thanks for looking into this, Shengming.

>> ---
>>  include/linux/rcupdate.h | 12 ++++++++++++
>>  mm/slab.h                |  1 +
>>  mm/slab_common.c         | 22 ++++++++++++++++++++--
>>  mm/slub.c                | 23 ++++++++++++++++++++++-
>>  4 files changed, 55 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index 807924a94fb0..5a39e6225160 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -1263,6 +1263,23 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
>>  EXPORT_TRACEPOINT_SYMBOL(kfree);
>>  EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
>>  
>> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
>> +{
>> +	struct slab *slab;
>> +	struct kmem_cache *s;
>> +
>> +	VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
>> +
>> +	slab = virt_to_slab(ptr);
>> +	s = slab->slab_cache;
>> +
>> +	if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
>> +		return;
>> +
> 
> One consistency issue to address here: kfree_rcu_sheaf() only calls
> __kfree_rcu_sheaf() for objects belonging to the local NUMA node. This
> avoids filling a CPU's per-CPU sheaves with objects from remote slabs.
> 
> kfree_call_rcu_nolock() currently skips that check and may therefore
> place remote-node objects into the local CPU's RCU sheaf.

That was intentional, but actually, this is a good point. Thanks.

> Could you add the same local-node check used by kfree_rcu_sheaf()
> before calling __kfree_rcu_sheaf(), and route remote-node objects
> directly to the defer_kfree_rcu() fallback path instead?

Falling back to defer_kfree_rcu() in v3 didn't make much sense,
as the object is inserted into a global list, which would cause more
trouble than a NUMA miss.

But once we make the fallback path percpu, your suggestion would make
more sense.

-- 
Cheers,
Harry / Hyeonggon


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH for-next v3 0/9] mm/slab: introduce kfree_rcu_nolock() and improve slub_kunit coverage
@ 2026-06-15 11:05 Harry Yoo (Oracle)
  2026-06-15 11:06 ` [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() Harry Yoo (Oracle)
  0 siblings, 1 reply; 8+ messages in thread
From: Harry Yoo (Oracle) @ 2026-06-15 11:05 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, Hao Li, Christoph Lameter,
	David Rientjes, Roman Gushchin, Alexei Starovoitov,
	Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf

Not the best time to post a series, but didn't want to delay posting
the series for too long. No pressure ;)  This is intended to be queued
for review and testing after the merge window closes.

This series is based on next-20260612, and is also available on
git.kernel.org [3].

To RCU folks: It would be great if you could kindly take a quick look at
patch 4 and either ack or nack the patch ;)

To BPF folks: Ulad asked to share workloads to measure performance
of kfree_rcu_nolock(). Unfortunately, I focused more on correctness
and have not spent much effort on that. It would be nice if BPF folks
could help evaluate it on their relevant workloads.

To PREEMPT_RT folks: The most relevant part is allowing
kfree_rcu_sheaf() on PREEMPT_RT (patch 6). It carefully avoids sleeping
by acquiring the locks via local_trylock() or spin_trylock_irqsave()
to avoid sleeping within a raw spinlock. When trylock or unlock is
unsafe, kmalloc_nolock() always fails.

Changes since RFC v2
====================

Reduced complexity and intrusiveness (Uladzislau Rezki)
-------------------------------------------------------

While discussing concerns about the complexity of adding allow_spin
handling with Ulad (Thanks!), I realized that adding complexity to the
kvfree_rcu batching is not strictly necessary: only slab objects need to
be batched, they are already batched by rcu sheaves, and slab already
supports unknown context. So it is enough to implement only a minimal
fallback for the sheaves path.

I tried to avoid making intrusive changes to the existing kvfree_rcu
path as much as possible. struct rcu_ptr is renamed to kfree_rcu_head
following Vlastimil's suggestion, and it is used only in the
kfree_rcu_nolock() path for now.

As a result, the complexity is significantly reduced and the series
became much less intrusive. This is also reflected well in the diffstat
below.

RFC v2 diffstat:
  8 files changed, 514 insertions(+), 163 deletions(-)

v3 diffstat:
  6 files changed, 370 insertions(+), 105 deletions(-)

v3 diffstat (slub_kunit improvements - patch 1, 2, 9 excluded):
  5 files changed, 199 insertions(+), 66 deletions(-)

kfree_rcu_sheaf() PREEMPT_RT support (Vlastimil Babka)
------------------------------------------------------

As suggested by Vlastimil (Thanks!), kfree_rcu_sheaf() can now be used
on PREEMPT_RT as well, by always assuming allow_spin is false on
PREEMPT_RT.

slub_kunit enhancements
-----------------------

- Currently the test is skipped when there is no hardware PMU. This can
  happen on machines without a PMU, or in virtualized environments
  (e.g., automated testing or virtme). Implement a fallback based on SW
  perf events so that the test can still run in such environments, even
  though the coverage is slightly smaller.

- While testing on PREEMPT_RT, I found that kmalloc_nolock() fails every
  time, so the fallback path is not properly tested. This is a limitation
  of perf events: the handler is called in NMI (HW perf events) or
  interrupt context (SW perf events), where kmalloc_nolock() cannot
  succeed.

  slub_kunit now registers a kprobe pre-handler at the points in the slab
  allocator where lockdep_assert_held() is invoked. The pre-handler calls
  kmalloc_nolock() and friends, to improve coverage on PREEMPT_RT instead
  of relying on perf events.

One thing that needs to be further explored
-------------------------------------------

The global deferred_free_by_rcu (introduced by patch 8) list for the
fallback should probably be per-CPU [5].

Actual Cover Letter
===================

This series improves kmalloc_nolock() and kfree_nolock() coverage
in slub_kunit (patch 1 and 2) and introduces kfree_rcu_nolock() for
an unknown context as suggested by Alexei Starovoitov.

Unknown context means the caller does not know whether spinning on a lock
is safe (e.g., a BPF program attached to an arbitrary kernel function or
in NMI context).

The slab allocator already supports unknown context via kmalloc_nolock()
and kfree_nolock(), but the slab allocator does not support freeing
objects by RCU in unknown context.

It is not ideal to have completely separate batching for unknown context
because the worst scenario where spinning on a lock would lead to
deadlock is very rare, and in most cases, it is safe to use the
existing mechanism (kfree_rcu_sheaf()).

Since most of the slab allocator already supports unknown context
and sheaves support batching kvfree_rcu() calls for slab objects,
implement kfree_rcu_nolock() with minimal changes by teaching
kfree_rcu_sheaf() how to support unknown context and making
it a little bit harder to allocate an empty sheaf, instead of making
intrusive changes to the existing kvfree_rcu batching logic.

kfree_rcu_nolock() tries to free the object to the rcu sheaf if
trylock succeeds. Once the rcu sheaf becomes full, it is submitted to
RCU via call_rcu() if spinning is allowed or IRQs are enabled (to avoid
calling call_rcu() in the middle of call_rcu()). Otherwise, call_rcu()
is deferred via irq work.

In unknown context, when there is no sheaf available, kfree_rcu_sheaf()
falls back to defer_kfree_rcu(), which inserts the object into a global
lockless list [5] and those objects are freed after synchronize_rcu() in
a workqueue.

Unlike kfree_rcu(), only the 2-argument variant is supported.
This is because the last resort of the 1-arg variant is
synchronize_rcu(), which cannot be used in an unknown context.

As suggested by Alexei Starovoitov, kfree_rcu_nolock() can be used with
struct kfree_rcu_head (8 bytes), which is smaller than struct rcu_head
(16 bytes).

For more background and future plans, please see [4].

[1] RFC v1: https://lore.kernel.org/linux-mm/20260206093410.160622-1-harry.yoo@oracle.com

[2] RFC v2: https://lore.kernel.org/linux-mm/20260416091022.36823-1-harry@kernel.org

[3] https://git.kernel.org/pub/scm/linux/kernel/git/harry/linux.git/log/?h=kfree_rcu_nolock-v3r3

[4] kmalloc_nolock() follow-ups, including kfree_rcu_nolock(),
    https://lore.kernel.org/linux-mm/esepccfhqg7m6jo76ns2znj2cnuaepx2xvw5zaygtwohq4psma@563ypprp6rr3

[5] However, we should probably make the list percpu because,
    unlike RFC v2, it can be triggered more frequently under memory
    pressure.

    https://lore.kernel.org/linux-mm/805c33d7-3a7b-470c-bd9d-065717a3e3e2@paulmck-laptop

Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>
---
Harry Yoo (Oracle) (9):
      slub_kunit: fall back to SW perf events when HW PMU is not available
      mm/slab, slub_kunit: register kprobe to trigger _nolock APIs
      mm/slab: handle the !allow_spin case in kfree_rcu_sheaf()
      mm/slab: use call_rcu() in unknown context if irqs are enabled
      mm/slab: extend deferred free mechanism to handle rcu sheaves
      mm/slab: allow kfree_rcu_sheaf() on PREEMPT_RT
      mm/slab: introduce kfree_rcu_nolock()
      mm/slab: introduce struct kfree_rcu_head and use in kfree_rcu_nolock()
      slub_kunit: extend the test for kfree_rcu_nolock()

 include/linux/rcupdate.h |  12 +++
 include/linux/types.h    |   4 +
 lib/tests/slub_kunit.c   | 174 ++++++++++++++++++++++++++++------
 mm/slab.h                |   5 +-
 mm/slab_common.c         |  38 ++++++--
 mm/slub.c                | 242 ++++++++++++++++++++++++++++++++++-------------
 6 files changed, 370 insertions(+), 105 deletions(-)
---
base-commit: c425609d6ac4012c8bbf01ec2e10e801b1923a7b
change-id: 20260615-kfree_rcu_nolock-e5502555992f

Best regards,
-- 
Harry Yoo (Oracle) <harry@kernel.org>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-15 11:05 [PATCH for-next v3 0/9] mm/slab: introduce kfree_rcu_nolock() and improve slub_kunit coverage Harry Yoo (Oracle)
@ 2026-06-15 11:06 ` Harry Yoo (Oracle)
  2026-06-16 17:28   ` Vlastimil Babka (SUSE)
  2026-06-21  0:29   ` XIAO WU
  0 siblings, 2 replies; 8+ messages in thread
From: Harry Yoo (Oracle) @ 2026-06-15 11:06 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, Hao Li, Christoph Lameter,
	David Rientjes, Roman Gushchin, Alexei Starovoitov,
	Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf

Currently, k[v]free_rcu() cannot be called in unknown context since
it could lead to a deadlock when called in the middle of k[v]free_rcu().

Make users' lives easier by introducing kfree_rcu_nolock() variant,
now that kfree_rcu_sheaf() is available on PREEMPT_RT and
__kfree_rcu_sheaf() handles unknown context.

Unlike k[v]free_rcu(), kfree_rcu_nolock() does not fall back to
the kvfree_rcu batching when the sheaves path fails, and falls back to
defer_kfree_rcu() instead. In most cases, the sheaves path is expected
to succeed and it's unnecessary to add complexity to the existing
kvfree_rcu batching.

Since defer_kfree_rcu() can be called on caches without sheaves, move
deferred_work_barrier() and rcu_barrier() outside the branch in
kvfree_rcu_barrier_on_cache().

Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>
---
 include/linux/rcupdate.h | 12 ++++++++++++
 mm/slab.h                |  1 +
 mm/slab_common.c         | 22 ++++++++++++++++++++--
 mm/slub.c                | 23 ++++++++++++++++++++++-
 4 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5e95acc33989..3025249bfcb5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1099,6 +1099,7 @@ static inline void rcu_read_unlock_migrate(void)
  * In mm/slab_common.c, no suitable header to include here.
  */
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
+void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr);
 
 /*
  * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
@@ -1122,6 +1123,17 @@ do {								\
 		kvfree_call_rcu(NULL, (void *) (___p));		\
 } while (0)
 
+/* kfree_rcu_nolock() supports 2-arg variant only */
+#define kfree_rcu_nolock(ptr, krhf)					\
+do {									\
+	typeof (ptr) ___p = (ptr);					\
+									\
+	if (___p) {							\
+		BUILD_BUG_ON(offsetof(typeof(*(ptr)), krhf) >= 4096);	\
+		kfree_call_rcu_nolock(&((___p)->krhf), (void *) (___p));\
+	}								\
+} while (0)
+
 /*
  * Place this after a lock-acquisition primitive to guarantee that
  * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
diff --git a/mm/slab.h b/mm/slab.h
index 961581e35ec8..a493c5201e96 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -745,6 +745,7 @@ void __check_heap_object(const void *ptr, unsigned long n,
 			 const struct slab *slab, bool to_user);
 
 void deferred_work_barrier(void);
+void defer_kfree_rcu(struct rcu_head *head);
 
 static inline bool slub_debug_orig_size(struct kmem_cache *s)
 {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 807924a94fb0..5a39e6225160 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1263,6 +1263,23 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
 EXPORT_TRACEPOINT_SYMBOL(kfree);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
 
+void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
+{
+	struct slab *slab;
+	struct kmem_cache *s;
+
+	VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
+
+	slab = virt_to_slab(ptr);
+	s = slab->slab_cache;
+
+	if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
+		return;
+
+	defer_kfree_rcu(head);
+}
+EXPORT_SYMBOL_GPL(kfree_call_rcu_nolock);
+
 #ifndef CONFIG_KVFREE_RCU_BATCHED
 
 void kvfree_call_rcu(struct rcu_head *head, void *ptr)
@@ -2120,10 +2137,11 @@ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
 		cpus_read_lock();
 		flush_rcu_sheaves_on_cache(s);
 		cpus_read_unlock();
-		deferred_work_barrier();
-		rcu_barrier();
 	}
 
+	/* kfree_rcu_nolock() might have deferred frees even without sheaves */
+	deferred_work_barrier();
+	rcu_barrier();
 	__kvfree_rcu_barrier();
 }
 EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
diff --git a/mm/slub.c b/mm/slub.c
index 4850629774b2..19018a979445 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4075,6 +4075,7 @@ static void flush_all(struct kmem_cache *s)
 
 struct deferred_percpu_work {
 	struct llist_head objects;
+	struct llist_head objects_by_rcu;
 	struct llist_head rcu_sheaves;
 	struct irq_work work;
 };
@@ -4083,6 +4084,7 @@ static void deferred_percpu_work_fn(struct irq_work *work);
 
 static DEFINE_PER_CPU(struct deferred_percpu_work, deferred_percpu_work) = {
 	.objects = LLIST_HEAD_INIT(objects),
+	.objects_by_rcu = LLIST_HEAD_INIT(objects_by_rcu),
 	.rcu_sheaves = LLIST_HEAD_INIT(rcu_sheaves),
 	.work = IRQ_WORK_INIT(deferred_percpu_work_fn),
 };
@@ -6392,12 +6394,13 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 static void deferred_percpu_work_fn(struct irq_work *work)
 {
 	struct deferred_percpu_work *dpw;
-	struct llist_head *objs, *rcu_sheaves;
+	struct llist_head *objs, *objs_by_rcu, *rcu_sheaves;
 	struct llist_node *llnode, *pos, *t;
 
 	dpw = container_of(work, struct deferred_percpu_work, work);
 	rcu_sheaves = &dpw->rcu_sheaves;
 	objs = &dpw->objects;
+	objs_by_rcu = &dpw->objects_by_rcu;
 
 	llnode = llist_del_all(objs);
 	llist_for_each_safe(pos, t, llnode) {
@@ -6428,6 +6431,13 @@ static void deferred_percpu_work_fn(struct irq_work *work)
 
 		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
 	}
+
+	llnode = llist_del_all(objs_by_rcu);
+	llist_for_each_safe(pos, t, llnode) {
+		struct rcu_head *head = (struct rcu_head *)pos;
+
+		call_rcu(head, kvfree_rcu_cb);
+	}
 }
 
 static void defer_free(struct kmem_cache *s, void *head)
@@ -6443,6 +6453,17 @@ static void defer_free(struct kmem_cache *s, void *head)
 		irq_work_queue(&dpw->work);
 }
 
+void defer_kfree_rcu(struct rcu_head *head)
+{
+	struct deferred_percpu_work *dpw;
+
+	guard(preempt)();
+
+	dpw = this_cpu_ptr(&deferred_percpu_work);
+	if (llist_add((struct llist_node *)head, &dpw->objects_by_rcu))
+		irq_work_queue(&dpw->work);
+}
+
 void deferred_work_barrier(void)
 {
 	int cpu;

-- 
2.53.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-15 11:06 ` [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() Harry Yoo (Oracle)
@ 2026-06-16 17:28   ` Vlastimil Babka (SUSE)
  2026-06-21  0:29   ` XIAO WU
  1 sibling, 0 replies; 8+ messages in thread
From: Vlastimil Babka (SUSE) @ 2026-06-16 17:28 UTC (permalink / raw)
  To: Harry Yoo (Oracle), Andrew Morton, Hao Li, Christoph Lameter,
	David Rientjes, Roman Gushchin, Alexei Starovoitov,
	Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf

On 6/15/26 13:06, Harry Yoo (Oracle) wrote:
> Currently, k[v]free_rcu() cannot be called in unknown context since
> it could lead to a deadlock when called in the middle of k[v]free_rcu().
> 
> Make users' lives easier by introducing kfree_rcu_nolock() variant,
> now that kfree_rcu_sheaf() is available on PREEMPT_RT and
> __kfree_rcu_sheaf() handles unknown context.
> 
> Unlike k[v]free_rcu(), kfree_rcu_nolock() does not fall back to
> the kvfree_rcu batching when the sheaves path fails, and falls back to
> defer_kfree_rcu() instead. In most cases, the sheaves path is expected
> to succeed and it's unnecessary to add complexity to the existing
> kvfree_rcu batching.
> 
> Since defer_kfree_rcu() can be called on caches without sheaves, move
> deferred_work_barrier() and rcu_barrier() outside the branch in
> kvfree_rcu_barrier_on_cache().
> 
> Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>

LGTM, nice.
Reviewed-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-15 11:06 ` [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() Harry Yoo (Oracle)
  2026-06-16 17:28   ` Vlastimil Babka (SUSE)
@ 2026-06-21  0:29   ` XIAO WU
  2026-06-22  5:28     ` Harry Yoo
  1 sibling, 1 reply; 8+ messages in thread
From: XIAO WU @ 2026-06-21  0:29 UTC (permalink / raw)
  To: Harry Yoo (Oracle), Vlastimil Babka, Andrew Morton, Hao Li,
	Christoph Lameter, David Rientjes, Roman Gushchin,
	Alexei Starovoitov, Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf

Hi,

I noticed the Sashiko AI review [1] in this thread flagged that
kfree_call_rcu_nolock() dereferences slab->slab_cache even when
virt_to_slab() returns NULL (for large kmalloc objects that bypass
SLUB, or vmalloc addresses).  The VM_WARN_ON_ONCE fires but does not
stop execution, and the subsequent NULL dereference is deterministic.

I was able to reproduce this in QEMU with KASAN.  The trigger is as
simple as passing a large (>8KB) kmalloc buffer to the new function.

On Tue, Jun 16, 2026 at 12:06:14AM +0800, Harry Yoo (Oracle) wrote:
 > This commit introduces kfree_rcu_nolock(), a variant of kfree_rcu()
 > designed to be safely called from unknown contexts without falling
 > back to batched processing.
...
 > +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
 > +{
 > +    struct slab *slab;
 > +    struct kmem_cache *s;
 > +
 > +    VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
 > +
 > +    slab = virt_to_slab(ptr);
 > +    s = slab->slab_cache;

The problem: if ptr is a large kmalloc object (> KMALLOC_MAX_CACHE_SIZE,
which is 8 KB on x86_64), the allocation bypasses SLUB and comes from
the page allocator.  virt_to_slab() returns NULL.  VM_WARN_ON_ONCE
prints a warning but does NOT return, and the next line dereferences
NULL->slab_cache at offset 0x8.

[Reproduction]

I rebuilt the kernel with CONFIG_KASAN=y and added a small late_initcall
that allocates a 16 KB buffer and passes it to kfree_call_rcu_nolock():

   static int __init kfree_rcu_nolock_poc_trigger(void)
   {
       void *p = kmalloc(16384, GFP_KERNEL);
       struct rcu_head *head = kmalloc(sizeof(*head), GFP_KERNEL);
       kfree_call_rcu_nolock(head, p);
       return 0;
   }
   late_initcall(kfree_rcu_nolock_poc_trigger);

[Crash log — kernel 6.19.0-rc5, CONFIG_KASAN=y, CONFIG_DEBUG_VM=y]

   kfree_rcu_nolock PoC: calling kfree_call_rcu_nolock on large obj ffff888026c5c000

   WARNING: mm/slab_common.c:1271 at kfree_call_rcu_nolock+0x1e/0xc0
   VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr))

   BUG: kernel NULL pointer dereference, address: 0000000000000008
   #PF: supervisor read access in kernel mode
   #PF: error_code(0x0000) - not-present page

   RIP: 0010:kfree_call_rcu_nolock+0x5c/0xc0
   Call Trace:
    <TASK>
    poc_trigger_init+0x2a/0x40
    do_one_initcall+0x131/0x730
    kernel_init_freeable+0x471/0x7e0
    kernel_init+0x28/0x300
    ret_from_fork+0x2c/0xc0
    </TASK>

   Kernel panic - not syncing: Fatal exception

The crash is at offset 0x5c inside kfree_call_rcu_nolock(), which
corresponds to `s = slab->slab_cache`.  The fault address 0x8 is
exactly offsetof(struct slab, slab_cache).


[1] https://sashiko.dev/#/patchset/20260615-kfree_rcu_nolock-v3-0-70a54f3775bb%40kernel.org
     (Sashiko AI code review — "Null Pointer Dereference", Severity: Critical)

Thanks,
XIAO




^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-21  0:29   ` XIAO WU
@ 2026-06-22  5:28     ` Harry Yoo
  2026-06-22 14:56       ` XIAO WU
  0 siblings, 1 reply; 8+ messages in thread
From: Harry Yoo @ 2026-06-22  5:28 UTC (permalink / raw)
  To: XIAO WU, Vlastimil Babka, Andrew Morton, Hao Li,
	Christoph Lameter, David Rientjes, Roman Gushchin,
	Alexei Starovoitov, Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf


[-- Attachment #1.1: Type: text/plain, Size: 3544 bytes --]



On 6/21/26 9:29 AM, XIAO WU wrote:
> Hi,

Hi Xiao,

> I noticed the Sashiko AI review [1] in this thread flagged that
> kfree_call_rcu_nolock() dereferences slab->slab_cache even when
> virt_to_slab() returns NULL (for large kmalloc objects that bypass
> SLUB, or vmalloc addresses).  The VM_WARN_ON_ONCE fires but does not
> stop execution, and the subsequent NULL dereference is deterministic.

Thanks for taking a look, but this was intentional.

I should have documented that only kmalloc_nolock() ->
kfree_rcu_nolock() is allowed and kmalloc() -> kfree_rcu_nolock()
is not allowed (yet).

> I was able to reproduce this in QEMU with KASAN.  The trigger is as
> simple as passing a large (>8KB) kmalloc buffer to the new function.
> 
> On Tue, Jun 16, 2026 at 12:06:14AM +0800, Harry Yoo (Oracle) wrote:
>> This commit introduces kfree_rcu_nolock(), a variant of kfree_rcu()
>> designed to be safely called from unknown contexts without falling
>> back to batched processing.
> ...
>> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
>> +{
>> +    struct slab *slab;
>> +    struct kmem_cache *s;
>> +
>> +    VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
>> +
>> +    slab = virt_to_slab(ptr);
>> +    s = slab->slab_cache;
> 
> The problem: if ptr is a large kmalloc object (> KMALLOC_MAX_CACHE_SIZE,
> which is 8 KB on x86_64), the allocation bypasses SLUB and comes from
> the page allocator.  virt_to_slab() returns NULL.  VM_WARN_ON_ONCE
> prints a warning but does NOT return, and the next line dereferences
> NULL->slab_cache at offset 0x8.

Since kmalloc_nolock() does not support large kmalloc, the warning
is not supposed to trigger. That is why I added only debug warnings.

> [Reproduction]
> 
> I rebuilt the kernel with CONFIG_KASAN=y and added a small late_initcall
> that allocates a 16 KB buffer and passes it to kfree_call_rcu_nolock():
> 
>   static int __init kfree_rcu_nolock_poc_trigger(void)
>   {
>       void *p = kmalloc(16384, GFP_KERNEL);
>       struct rcu_head *head = kmalloc(sizeof(*head), GFP_KERNEL);
>       kfree_call_rcu_nolock(head, p);

As mentioned earlier, kmalloc() -> kfree_rcu_nolock() is not supported.

-- 
Cheers,
Harry / Hyeonggon

>       return 0;
>   }
>   late_initcall(kfree_rcu_nolock_poc_trigger);
> 
> [Crash log — kernel 6.19.0-rc5, CONFIG_KASAN=y, CONFIG_DEBUG_VM=y]
> 
>   kfree_rcu_nolock PoC: calling kfree_call_rcu_nolock on large obj ffff888026c5c000
> 
>   WARNING: mm/slab_common.c:1271 at kfree_call_rcu_nolock+0x1e/0xc0
>   VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr))
> 
>   BUG: kernel NULL pointer dereference, address: 0000000000000008
>   #PF: supervisor read access in kernel mode
>   #PF: error_code(0x0000) - not-present page
> 
>   RIP: 0010:kfree_call_rcu_nolock+0x5c/0xc0
>   Call Trace:
>    <TASK>
>    poc_trigger_init+0x2a/0x40
>    do_one_initcall+0x131/0x730
>    kernel_init_freeable+0x471/0x7e0
>    kernel_init+0x28/0x300
>    ret_from_fork+0x2c/0xc0
>    </TASK>
> 
>   Kernel panic - not syncing: Fatal exception
> 
> The crash is at offset 0x5c inside kfree_call_rcu_nolock(), which
> corresponds to `s = slab->slab_cache`.  The fault address 0x8 is
> exactly offsetof(struct slab, slab_cache).
> 
> [1] https://sashiko.dev/#/patchset/20260615-kfree_rcu_nolock-v3-0-70a54f3775bb%40kernel.org
>     (Sashiko AI code review — "Null Pointer Dereference", Severity: Critical)
> 
> Thanks,
> XIAO

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-22  5:28     ` Harry Yoo
@ 2026-06-22 14:56       ` XIAO WU
  2026-06-25  5:27         ` Harry Yoo
  0 siblings, 1 reply; 8+ messages in thread
From: XIAO WU @ 2026-06-22 14:56 UTC (permalink / raw)
  To: Harry Yoo, Vlastimil Babka, Andrew Morton, Hao Li,
	Christoph Lameter, David Rientjes, Roman Gushchin,
	Alexei Starovoitov, Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf

Hi Harry,

On Mon, Jun 22, 2026 at 02:28:44PM +0900, Harry Yoo wrote:
 > On 6/21/26 9:29 AM, XIAO WU wrote:
 > > I was able to reproduce this in QEMU with KASAN.  The trigger is as
 > > simple as passing a large (>8KB) kmalloc buffer to the new function.
 >
 > Thanks for taking a look, but this was intentional.
 >
 > I should have documented that only kmalloc_nolock() ->
 > kfree_rcu_nolock() is allowed and kmalloc() -> kfree_rcu_nolock()
 > is not allowed (yet).
 >
 > Since kmalloc_nolock() does not support large kmalloc, the warning
 > is not supposed to trigger. That is why I added only debug warnings.

Thank you very much for taking the time to explain — I really
appreciate it, especially since I'm still learning my way around the
mm/ subsystem.  You are absolutely right that kmalloc_nolock() returns
NULL for sizes above KMALLOC_MAX_CACHE_SIZE, so a proper caller using
the kmalloc_nolock() → kfree_rcu_nolock() pairing would never hit this.

I did notice one small thing that I wanted to gently bring up, though.
Please forgive me if I'm missing something obvious here.

When I was reading through the surrounding code to understand the
pattern better, I noticed that kfree_nolock() — which has the same
"only for kmalloc_nolock()" constraint (documented in the comment at
mm/slub.c:6828-6835) — actually does check for a NULL slab:

   void kfree_nolock(const void *object)
   {
   	...
   	slab = virt_to_slab(object);
   	if (unlikely(!slab)) {
   		WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
   		return;
   	}
   	s = slab->slab_cache;
   	...

So kfree_nolock() gracefully returns with a warning even though it too
expects only kmalloc_nolock() callers.  That pattern seemed really
sensible to me — it costs almost nothing and prevents a panic if
someone ever passes the wrong pointer (which they shouldn't, but as you
mentioned, the constraint isn't documented on kfree_call_rcu_nolock()
yet).

I also wondered about the difference between WARN_ONCE (used in
kfree_nolock) and VM_WARN_ON_ONCE (used in kfree_call_rcu_nolock).  If
I understand correctly, VM_WARN_ON_ONCE compiles away entirely on
production kernels without CONFIG_DEBUG_VM, which would make the
subsequent NULL dereference completely silent — no warning, just a
panic.  That seems a bit scary for something that's exported to
modules via EXPORT_SYMBOL_GPL.

And since you mentioned that kmalloc() → kfree_rcu_nolock() support is
planned for the future (the "yet") — wouldn't this code path need the
NULL check at that point anyway?

I was thinking something like this would make the function consistent
with kfree_nolock() and also make it forward-compatible with the
planned kmalloc() support:

--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1266,10 +1266,16 @@ void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
  {
  	struct slab *slab;
  	struct kmem_cache *s;

-	VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
-
  	slab = virt_to_slab(ptr);
+	/*
+	 * kmalloc_nolock() never produces large-kmalloc or vmalloc
+	 * addresses, but be defensive: fall back to defer_kfree_rcu()
+	 * for unsupported pointer types, consistent with kfree_nolock().
+	 */
+	if (unlikely(!slab))
+		goto fallback;
+
  	s = slab->slab_cache;

  	if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
  		return;

+fallback:
  	defer_kfree_rcu(head);
  }

Of course, this is just a suggestion — you know this code far better
than I do.  If you feel the current code is fine as-is with proper
documentation, I completely understand and won't press the point
further.

Either way, thank you again for the explanation, and for working on
this series — having kfree_rcu_nolock() available for BPF and other
contexts will be really valuable.

Thanks,
XIAO

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
  2026-06-22 14:56       ` XIAO WU
@ 2026-06-25  5:27         ` Harry Yoo
  0 siblings, 0 replies; 8+ messages in thread
From: Harry Yoo @ 2026-06-25  5:27 UTC (permalink / raw)
  To: XIAO WU, Vlastimil Babka, Andrew Morton, Hao Li,
	Christoph Lameter, David Rientjes, Roman Gushchin,
	Alexei Starovoitov, Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf


[-- Attachment #1.1: Type: text/plain, Size: 4556 bytes --]



On 6/22/26 11:56 PM, XIAO WU wrote:
> Hi Harry,
> 
> On Mon, Jun 22, 2026 at 02:28:44PM +0900, Harry Yoo wrote:
>> On 6/21/26 9:29 AM, XIAO WU wrote:
>> > I was able to reproduce this in QEMU with KASAN.  The trigger is as
>> > simple as passing a large (>8KB) kmalloc buffer to the new function.
>>
>> Thanks for taking a look, but this was intentional.
>>
>> I should have documented that only kmalloc_nolock() ->
>> kfree_rcu_nolock() is allowed and kmalloc() -> kfree_rcu_nolock()
>> is not allowed (yet).
>>
>> Since kmalloc_nolock() does not support large kmalloc, the warning
>> is not supposed to trigger. That is why I added only debug warnings.
> 
> Thank you very much for taking the time to explain — I really
> appreciate it, especially since I'm still learning my way around the
> mm/ subsystem.  You are absolutely right that kmalloc_nolock() returns
> NULL for sizes above KMALLOC_MAX_CACHE_SIZE, so a proper caller using
> the kmalloc_nolock() → kfree_rcu_nolock() pairing would never hit this.
> 
> I did notice one small thing that I wanted to gently bring up, though.
> Please forgive me if I'm missing something obvious here.
> 
> When I was reading through the surrounding code to understand the
> pattern better, I noticed that kfree_nolock() — which has the same
> "only for kmalloc_nolock()" constraint (documented in the comment at
> mm/slub.c:6828-6835) — actually does check for a NULL slab:
> 
>   void kfree_nolock(const void *object)
>   {
>       ...
>       slab = virt_to_slab(object);
>       if (unlikely(!slab)) {
>           WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
>           return;
>       }
>       s = slab->slab_cache;
>       ...
> 
> So kfree_nolock() gracefully returns with a warning even though it too
> expects only kmalloc_nolock() callers.  That pattern seemed really
> sensible to me — it costs almost nothing and prevents a panic if
> someone ever passes the wrong pointer (which they shouldn't, but as you
> mentioned, the constraint isn't documented on kfree_call_rcu_nolock()
> yet).
>
> I also wondered about the difference between WARN_ONCE (used in
> kfree_nolock) and VM_WARN_ON_ONCE (used in kfree_call_rcu_nolock). If
> I understand correctly, VM_WARN_ON_ONCE compiles away entirely on
> production kernels without CONFIG_DEBUG_VM, which would make the
> subsequent NULL dereference completely silent — no warning, just a
> panic.

It would crash without the debug option anyway, but the warnings are there to
make it easier to pinpoint what's gone wrong.

And not testing the code path at least once with debug option is a big
problem :)

> And since you mentioned that kmalloc() → kfree_rcu_nolock() support is
> planned for the future (the "yet") — wouldn't this code path need the
> NULL check at that point anyway?
> 
> I was thinking something like this would make the function consistent
> with kfree_nolock() and also make it forward-compatible with the
> planned kmalloc() support:
> 
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1266,10 +1266,16 @@ void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
>  {
>      struct slab *slab;
>      struct kmem_cache *s;
> 
> -    VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
> -
>      slab = virt_to_slab(ptr);
> +    /*
> +     * kmalloc_nolock() never produces large-kmalloc or vmalloc
> +     * addresses, but be defensive: fall back to defer_kfree_rcu()
> +     * for unsupported pointer types, consistent with kfree_nolock().
> +     */
> +    if (unlikely(!slab))
> +        goto fallback;

Just FYI, virt_to_slab() and virt_to_page()
don't work correctly for vmalloc addresses.

And I don't think silently making it work is good.

> +
>      s = slab->slab_cache;
> 
>      if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
>          return;
> 
> +fallback:
>      defer_kfree_rcu(head);
>  }
> 
> Of course, this is just a suggestion — you know this code far better
> than I do.  If you feel the current code is fine as-is with proper
> documentation, I completely understand and won't press the point
> further.
> 
> Either way, thank you again for the explanation, and for working on
> this series — having kfree_rcu_nolock() available for BPF and other
> contexts will be really valuable.
> 
> Thanks,
> XIAO

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-06-25  5:40 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-24  9:22 [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() hu.shengming
2026-06-25  5:40 ` Harry Yoo
  -- strict thread matches above, loose matches on Subject: below --
2026-06-15 11:05 [PATCH for-next v3 0/9] mm/slab: introduce kfree_rcu_nolock() and improve slub_kunit coverage Harry Yoo (Oracle)
2026-06-15 11:06 ` [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() Harry Yoo (Oracle)
2026-06-16 17:28   ` Vlastimil Babka (SUSE)
2026-06-21  0:29   ` XIAO WU
2026-06-22  5:28     ` Harry Yoo
2026-06-22 14:56       ` XIAO WU
2026-06-25  5:27         ` Harry Yoo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox