Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock()
@ 2026-06-24  9:22 hu.shengming
  2026-06-25  5:40 ` Harry Yoo
  0 siblings, 1 reply; 8+ messages in thread
From: hu.shengming @ 2026-06-24  9:22 UTC (permalink / raw)
  To: harry
  Cc: vbabka, hao.li, cl, rientjes, roman.gushchin, linux-mm,
	linux-kernel, zhang.run, cai.qu

Harry wrote:
> Currently, k[v]free_rcu() cannot be called in unknown context since
> it could lead to a deadlock when called in the middle of k[v]free_rcu().
> 
> Make users' lives easier by introducing kfree_rcu_nolock() variant,
> now that kfree_rcu_sheaf() is available on PREEMPT_RT and
> __kfree_rcu_sheaf() handles unknown context.
> 
> Unlike k[v]free_rcu(), kfree_rcu_nolock() does not fall back to
> the kvfree_rcu batching when the sheaves path fails, and falls back to
> defer_kfree_rcu() instead. In most cases, the sheaves path is expected
> to succeed and it's unnecessary to add complexity to the existing
> kvfree_rcu batching.
> 
> Since defer_kfree_rcu() can be called on caches without sheaves, move
> deferred_work_barrier() and rcu_barrier() outside the branch in
> kvfree_rcu_barrier_on_cache().
> 
> Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>

Hi Harry,

Thanks for the series. These patches fill a clear functional gap in the
existing free APIs by adding an RCU-deferred free interface for contexts
where kfree_rcu() cannot safely be used.

> ---
>  include/linux/rcupdate.h | 12 ++++++++++++
>  mm/slab.h                |  1 +
>  mm/slab_common.c         | 22 ++++++++++++++++++++--
>  mm/slub.c                | 23 ++++++++++++++++++++++-
>  4 files changed, 55 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 5e95acc33989..3025249bfcb5 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -1099,6 +1099,7 @@ static inline void rcu_read_unlock_migrate(void)
>   * In mm/slab_common.c, no suitable header to include here.
>   */
>  void kvfree_call_rcu(struct rcu_head *head, void *ptr);
> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr);
>  
>  /*
>   * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
> @@ -1122,6 +1123,17 @@ do {								\
>  		kvfree_call_rcu(NULL, (void *) (___p));		\
>  } while (0)
>  
> +/* kfree_rcu_nolock() supports 2-arg variant only */
> +#define kfree_rcu_nolock(ptr, krhf)					\
> +do {									\
> +	typeof (ptr) ___p = (ptr);					\
> +									\
> +	if (___p) {							\
> +		BUILD_BUG_ON(offsetof(typeof(*(ptr)), krhf) >= 4096);	\
> +		kfree_call_rcu_nolock(&((___p)->krhf), (void *) (___p));\
> +	}								\
> +} while (0)
> +
>  /*
>   * Place this after a lock-acquisition primitive to guarantee that
>   * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
> diff --git a/mm/slab.h b/mm/slab.h
> index 961581e35ec8..a493c5201e96 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -745,6 +745,7 @@ void __check_heap_object(const void *ptr, unsigned long n,
>  			 const struct slab *slab, bool to_user);
>  
>  void deferred_work_barrier(void);
> +void defer_kfree_rcu(struct rcu_head *head);
>  
>  static inline bool slub_debug_orig_size(struct kmem_cache *s)
>  {
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 807924a94fb0..5a39e6225160 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1263,6 +1263,23 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
>  EXPORT_TRACEPOINT_SYMBOL(kfree);
>  EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
>  
> +void kfree_call_rcu_nolock(struct rcu_head *head, void *ptr)
> +{
> +	struct slab *slab;
> +	struct kmem_cache *s;
> +
> +	VM_WARN_ON_ONCE(is_vmalloc_addr(ptr) || !virt_to_slab(ptr));
> +
> +	slab = virt_to_slab(ptr);
> +	s = slab->slab_cache;
> +
> +	if (__kfree_rcu_sheaf(s, ptr, /* allow_spin = */ false))
> +		return;
> +

One consistency issue to address here: kfree_rcu_sheaf() only calls
__kfree_rcu_sheaf() for objects belonging to the local NUMA node. This
avoids filling a CPU's per-CPU sheaves with objects from remote slabs.

kfree_call_rcu_nolock() currently skips that check and may therefore
place remote-node objects into the local CPU's RCU sheaf.

Could you add the same local-node check used by kfree_rcu_sheaf()
before calling __kfree_rcu_sheaf(), and route remote-node objects
directly to the defer_kfree_rcu() fallback path instead?

--
With Best Regards,
Shengming

> +	defer_kfree_rcu(head);
> +}
> +EXPORT_SYMBOL_GPL(kfree_call_rcu_nolock);
> +
>  #ifndef CONFIG_KVFREE_RCU_BATCHED
>  
>  void kvfree_call_rcu(struct rcu_head *head, void *ptr)
> @@ -2120,10 +2137,11 @@ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
>  		cpus_read_lock();
>  		flush_rcu_sheaves_on_cache(s);
>  		cpus_read_unlock();
> -		deferred_work_barrier();
> -		rcu_barrier();
>  	}
>  
> +	/* kfree_rcu_nolock() might have deferred frees even without sheaves */
> +	deferred_work_barrier();
> +	rcu_barrier();
>  	__kvfree_rcu_barrier();
>  }
>  EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
> diff --git a/mm/slub.c b/mm/slub.c
> index 4850629774b2..19018a979445 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4075,6 +4075,7 @@ static void flush_all(struct kmem_cache *s)
>  
>  struct deferred_percpu_work {
>  	struct llist_head objects;
> +	struct llist_head objects_by_rcu;
>  	struct llist_head rcu_sheaves;
>  	struct irq_work work;
>  };
> @@ -4083,6 +4084,7 @@ static void deferred_percpu_work_fn(struct irq_work *work);
>  
>  static DEFINE_PER_CPU(struct deferred_percpu_work, deferred_percpu_work) = {
>  	.objects = LLIST_HEAD_INIT(objects),
> +	.objects_by_rcu = LLIST_HEAD_INIT(objects_by_rcu),
>  	.rcu_sheaves = LLIST_HEAD_INIT(rcu_sheaves),
>  	.work = IRQ_WORK_INIT(deferred_percpu_work_fn),
>  };
> @@ -6392,12 +6394,13 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
>  static void deferred_percpu_work_fn(struct irq_work *work)
>  {
>  	struct deferred_percpu_work *dpw;
> -	struct llist_head *objs, *rcu_sheaves;
> +	struct llist_head *objs, *objs_by_rcu, *rcu_sheaves;
>  	struct llist_node *llnode, *pos, *t;
>  
>  	dpw = container_of(work, struct deferred_percpu_work, work);
>  	rcu_sheaves = &dpw->rcu_sheaves;
>  	objs = &dpw->objects;
> +	objs_by_rcu = &dpw->objects_by_rcu;
>  
>  	llnode = llist_del_all(objs);
>  	llist_for_each_safe(pos, t, llnode) {
> @@ -6428,6 +6431,13 @@ static void deferred_percpu_work_fn(struct irq_work *work)
>  
>  		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
>  	}
> +
> +	llnode = llist_del_all(objs_by_rcu);
> +	llist_for_each_safe(pos, t, llnode) {
> +		struct rcu_head *head = (struct rcu_head *)pos;
> +
> +		call_rcu(head, kvfree_rcu_cb);
> +	}
>  }
>  
>  static void defer_free(struct kmem_cache *s, void *head)
> @@ -6443,6 +6453,17 @@ static void defer_free(struct kmem_cache *s, void *head)
>  		irq_work_queue(&dpw->work);
>  }
>  
> +void defer_kfree_rcu(struct rcu_head *head)
> +{
> +	struct deferred_percpu_work *dpw;
> +
> +	guard(preempt)();
> +
> +	dpw = this_cpu_ptr(&deferred_percpu_work);
> +	if (llist_add((struct llist_node *)head, &dpw->objects_by_rcu))
> +		irq_work_queue(&dpw->work);
> +}
> +
>  void deferred_work_barrier(void)
>  {
>  	int cpu;
> 
> -- 
> 2.53.0


^ permalink raw reply	[flat|nested] 8+ messages in thread
* [PATCH for-next v3 0/9] mm/slab: introduce kfree_rcu_nolock() and improve slub_kunit coverage
@ 2026-06-15 11:05 Harry Yoo (Oracle)
  2026-06-15 11:06 ` [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() Harry Yoo (Oracle)
  0 siblings, 1 reply; 8+ messages in thread
From: Harry Yoo (Oracle) @ 2026-06-15 11:05 UTC (permalink / raw)
  To: Vlastimil Babka, Andrew Morton, Hao Li, Christoph Lameter,
	David Rientjes, Roman Gushchin, Alexei Starovoitov,
	Andrii Nakryiko, Puranjay Mohan, Amery Hung,
	Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
	Paul E. McKenney, Frederic Weisbecker, Neeraj Upadhyay,
	Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki,
	Mathieu Desnoyers, Lai Jiangshan, Zqiang, Pedro Falcato,
	Suren Baghdasaryan
  Cc: linux-mm, linux-kernel, linux-rt-devel, rcu, bpf

Not the best time to post a series, but I didn't want to delay posting
it for too long. No pressure ;)  This is aimed to be queued
for review and testing after the merge window closes.

This series is based on next-20260612, and is also available on
git.kernel.org [3].

To RCU folks: It would be great if you could kindly take a quick look at
patch 4 and either ack or nack the patch ;)

To BPF folks: Ulad asked to share workloads to measure performance
of kfree_rcu_nolock(). Unfortunately, I focused more on correctness
and have not spent much effort on that. It would be nice if BPF folks
could help evaluate it on their relevant workloads.

To PREEMPT_RT folks: The most relevant part is allowing
kfree_rcu_sheaf() on PREEMPT_RT (patch 6). It acquires the locks only
via local_trylock() or spin_trylock_irqsave(), so that it never sleeps
within a raw spinlock. When trylock or unlock is
unsafe, kmalloc_nolock() always fails.

Changes since RFC v2
====================

Reduced complexity and intrusiveness (Uladzislau Rezki)
-------------------------------------------------------

While discussing concerns about the complexity of adding allow_spin
handling with Ulad (Thanks!), I realized that adding complexity to the
kvfree_rcu batching is not strictly necessary: only slab objects need to
be batched, they are already batched by rcu sheaves, and slab already
supports unknown context. So it is enough to implement only a minimal
fallback for the sheaves path.

I tried to avoid making intrusive changes to the existing kvfree_rcu
path as much as possible. struct rcu_ptr is renamed to kfree_rcu_head
following Vlastimil's suggestion, and it is used only in the
kfree_rcu_nolock() path for now.

As a result, the complexity is significantly reduced and the series
became much less intrusive. This is also reflected well in the diffstat
below.

RFC v2 diffstat:
  8 files changed, 514 insertions(+), 163 deletions(-)

v3 diffstat:
  6 files changed, 370 insertions(+), 105 deletions(-)

v3 diffstat (slub_kunit improvements - patch 1, 2, 9 excluded):
  5 files changed, 199 insertions(+), 66 deletions(-)

kfree_rcu_sheaf() PREEMPT_RT support (Vlastimil Babka)
------------------------------------------------------

As suggested by Vlastimil (Thanks!), kfree_rcu_sheaf() can now be used
on PREEMPT_RT as well, by always assuming allow_spin is false on
PREEMPT_RT.

slub_kunit enhancements
-----------------------

- Currently the test is skipped when there is no hardware PMU. This can
  happen on machines without a PMU, or in virtualized environments
  (e.g., automated testing or virtme). Implement a fallback based on SW
  perf events so that the test can still run in such environments, even
  though the coverage is slightly smaller.

- While testing on PREEMPT_RT, I found that kmalloc_nolock() fails every
  time, so the fallback path is not properly tested. This is a limitation
  of perf events: the handler is called in NMI (HW perf events) or
  interrupt context (SW perf events), where kmalloc_nolock() cannot
  succeed.

  slub_kunit now registers a kprobe pre-handler at the points in the slab
  allocator where lockdep_assert_held() is invoked. The pre-handler calls
  kmalloc_nolock() and friends, to improve coverage on PREEMPT_RT instead
  of relying on perf events.

One thing that needs to be further explored
-------------------------------------------

The global deferred_free_by_rcu (introduced by patch 8) list for the
fallback should probably be per-CPU [5].

Actual Cover Letter
===================

This series improves kmalloc_nolock() and kfree_nolock() coverage
in slub_kunit (patch 1 and 2) and introduces kfree_rcu_nolock() for
an unknown context as suggested by Alexei Starovoitov.

Unknown context means the caller does not know whether spinning on a lock
is safe (e.g., a BPF program attached to an arbitrary kernel function or
in NMI context).

The slab allocator already supports unknown context via kmalloc_nolock()
and kfree_nolock(), but the slab allocator does not support freeing
objects by RCU in unknown context.

It is not ideal to have completely separate batching for unknown context
because the worst scenario where spinning on a lock would lead to
deadlock is very rare, and in most cases, it is safe to use the
existing mechanism (kfree_rcu_sheaf()).

Since most of the slab allocator already supports unknown context
and sheaves support batching kvfree_rcu() calls for slab objects,
implement kfree_rcu_nolock() with minimal changes by teaching
kfree_rcu_sheaf() how to support unknown context and making
it a little bit harder to allocate an empty sheaf, instead of making
intrusive changes to the existing kvfree_rcu batching logic.

kfree_rcu_nolock() tries to free the object to the rcu sheaf if
trylock succeeds. Once the rcu sheaf becomes full, it is submitted to
RCU via call_rcu() if spinning is allowed or IRQs are enabled (to avoid
calling call_rcu() in the middle of call_rcu()). Otherwise, call_rcu()
is deferred via irq work.

In unknown context, when there is no sheaf available, kfree_rcu_sheaf()
falls back to defer_kfree_rcu(), which inserts the object into a global
lockless list [5] and those objects are freed after synchronize_rcu() in
a workqueue.

Unlike kfree_rcu(), only the 2-argument variant is supported.
This is because the last resort of the 1-arg variant is
synchronize_rcu(), which cannot be used in an unknown context.

As suggested by Alexei Starovoitov, kfree_rcu_nolock() can be used with
struct kfree_rcu_head (8 bytes), which is smaller than struct rcu_head
(16 bytes).

For more background and future plans, please see [4].

[1] RFC v1: https://lore.kernel.org/linux-mm/20260206093410.160622-1-harry.yoo@oracle.com

[2] RFC v2: https://lore.kernel.org/linux-mm/20260416091022.36823-1-harry@kernel.org

[3] https://git.kernel.org/pub/scm/linux/kernel/git/harry/linux.git/log/?h=kfree_rcu_nolock-v3r3

[4] kmalloc_nolock() follow-ups, including kfree_rcu_nolock(),
    https://lore.kernel.org/linux-mm/esepccfhqg7m6jo76ns2znj2cnuaepx2xvw5zaygtwohq4psma@563ypprp6rr3

[5] However, we should probably make the list percpu because,
    unlike RFC v2, it can be triggered more frequently under memory
    pressure.

    https://lore.kernel.org/linux-mm/805c33d7-3a7b-470c-bd9d-065717a3e3e2@paulmck-laptop

Signed-off-by: Harry Yoo (Oracle) <harry@kernel.org>
---
Harry Yoo (Oracle) (9):
      slub_kunit: fall back to SW perf events when HW PMU is not available
      mm/slab, slub_kunit: register kprobe to trigger _nolock APIs
      mm/slab: handle the !allow_spin case in kfree_rcu_sheaf()
      mm/slab: use call_rcu() in unknown context if irqs are enabled
      mm/slab: extend deferred free mechanism to handle rcu sheaves
      mm/slab: allow kfree_rcu_sheaf() on PREEMPT_RT
      mm/slab: introduce kfree_rcu_nolock()
      mm/slab: introduce struct kfree_rcu_head and use in kfree_rcu_nolock()
      slub_kunit: extend the test for kfree_rcu_nolock()

 include/linux/rcupdate.h |  12 +++
 include/linux/types.h    |   4 +
 lib/tests/slub_kunit.c   | 174 ++++++++++++++++++++++++++++------
 mm/slab.h                |   5 +-
 mm/slab_common.c         |  38 ++++++--
 mm/slub.c                | 242 ++++++++++++++++++++++++++++++++++-------------
 6 files changed, 370 insertions(+), 105 deletions(-)
---
base-commit: c425609d6ac4012c8bbf01ec2e10e801b1923a7b
change-id: 20260615-kfree_rcu_nolock-e5502555992f

Best regards,
-- 
Harry Yoo (Oracle) <harry@kernel.org>



^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-06-25  5:40 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-24  9:22 [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() hu.shengming
2026-06-25  5:40 ` Harry Yoo
  -- strict thread matches above, loose matches on Subject: below --
2026-06-15 11:05 [PATCH for-next v3 0/9] mm/slab: introduce kfree_rcu_nolock() and improve slub_kunit coverage Harry Yoo (Oracle)
2026-06-15 11:06 ` [PATCH for-next v3 7/9] mm/slab: introduce kfree_rcu_nolock() Harry Yoo (Oracle)
2026-06-16 17:28   ` Vlastimil Babka (SUSE)
2026-06-21  0:29   ` XIAO WU
2026-06-22  5:28     ` Harry Yoo
2026-06-22 14:56       ` XIAO WU
2026-06-25  5:27         ` Harry Yoo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox