From: Harry Yoo <harry.yoo@oracle.com>
To: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: bpf@vger.kernel.org, linux-mm@kvack.org, vbabka@suse.cz,
shakeel.butt@linux.dev, mhocko@suse.com, bigeasy@linutronix.de,
andrii@kernel.org, memxor@gmail.com, akpm@linux-foundation.org,
peterz@infradead.org, rostedt@goodmis.org, hannes@cmpxchg.org,
willy@infradead.org
Subject: Re: [PATCH 6/6] slab: Introduce kmalloc_nolock() and kfree_nolock().
Date: Fri, 9 May 2025 10:03:18 +0900 [thread overview]
Message-ID: <aB1UVkKSeJWEGECq@hyeyoo> (raw)
In-Reply-To: <20250501032718.65476-7-alexei.starovoitov@gmail.com>
On Wed, Apr 30, 2025 at 08:27:18PM -0700, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> kmalloc_nolock() relies on ability of local_lock to detect the situation
> when it's locked.
> In !PREEMPT_RT local_lock_is_locked() is true only when NMI happened in
> irq saved region that protects _that specific_ per-cpu kmem_cache_cpu.
> In that case retry the operation in a different kmalloc bucket.
> The second attempt will likely succeed, since this cpu locked
> different kmem_cache_cpu.
> When lock_local_is_locked() sees locked memcg_stock.stock_lock
> fallback to atomic operations.
>
> Similarly, in PREEMPT_RT local_lock_is_locked() returns true when
> per-cpu rt_spin_lock is locked by current task. In this case re-entrance
> into the same kmalloc bucket is unsafe, and kmalloc_nolock() tries
> a different bucket that is most likely is not locked by current
> task. Though it may be locked by a different task it's safe to
> rt_spin_lock() on it.
>
> Similar to alloc_pages_nolock() the kmalloc_nolock() returns NULL
> immediately if called from hard irq or NMI in PREEMPT_RT.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
> include/linux/kasan.h | 13 +-
> include/linux/slab.h | 4 +
> mm/kasan/common.c | 5 +-
> mm/memcontrol.c | 60 ++++++++-
> mm/slab.h | 1 +
> mm/slub.c | 280 ++++++++++++++++++++++++++++++++++++++----
> 6 files changed, 330 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index d5a8ab98035c..743f6d196d57 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -470,6 +470,7 @@ void * __must_check krealloc_noprof(const void *objp, size_t new_size,
> #define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__))
>
> void kfree(const void *objp);
> +void kfree_nolock(const void *objp);
> void kfree_sensitive(const void *objp);
> size_t __ksize(const void *objp);
>
> @@ -910,6 +911,9 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f
> }
> #define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__))
>
> +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node);
> +#define kmalloc_nolock(...) alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__))
As it takes node parameter, it should be kmalloc_node_nolock() instead?
> diff --git a/mm/slab.h b/mm/slab.h
> index 05a21dc796e0..1688749d2995 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -273,6 +273,7 @@ struct kmem_cache {
> unsigned int cpu_partial_slabs;
> #endif
> struct kmem_cache_order_objects oo;
> + struct llist_head defer_free_objects;
>
> /* Allocation and freeing of slabs */
> struct kmem_cache_order_objects min;
> diff --git a/mm/slub.c b/mm/slub.c
> index dc9e729e1d26..307ea0135b92 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3918,7 +3949,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>
> retry_load_slab:
>
> - local_lock_irqsave(&s->cpu_slab->lock, flags);
> + local_lock_irqsave_check(&s->cpu_slab->lock, flags);
> if (unlikely(c->slab)) {
> void *flush_freelist = c->freelist;
> struct slab *flush_slab = c->slab;
> @@ -3958,8 +3989,28 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> */
> c = slub_get_cpu_ptr(s->cpu_slab);
> #endif
> + if (unlikely(!gfpflags_allow_spinning(gfpflags))) {
> + struct slab *slab;
> +
> + slab = c->slab;
> + if (slab && !node_match(slab, node))
> + /* In trylock mode numa node is a hint */
> + node = NUMA_NO_NODE;
This logic can be moved to ___slab_alloc() as the code to ignore
node constraint (on some conditions) is already there?
> +
> + if (!local_lock_is_locked(&s->cpu_slab->lock)) {
> + lockdep_assert_not_held(this_cpu_ptr(&s->cpu_slab->lock));
> + } else {
> + /*
> + * EBUSY is an internal signal to kmalloc_nolock() to
> + * retry a different bucket. It's not propagated further.
> + */
> + p = ERR_PTR(-EBUSY);
> + goto out;
> + }
> + }
> p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
> +out:
> #ifdef CONFIG_PREEMPT_COUNT
> slub_put_cpu_ptr(s->cpu_slab);
> #endif
> @@ -4354,6 +4406,88 @@ void *__kmalloc_noprof(size_t size, gfp_t flags)
> }
> EXPORT_SYMBOL(__kmalloc_noprof);
>
> +/**
> + * kmalloc_nolock - Allocate an object of given size from any context.
> + * @size: size to allocate
> + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO allowed.
> + * @node: node number of the target node.
> + *
> + * Return: pointer to the new object or NULL in case of error.
> + * NULL does not mean EBUSY or EAGAIN. It means ENOMEM.
> + * There is no reason to call it again and expect !NULL.
> + */
> +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
> +{
> + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
> + struct kmem_cache *s;
> + bool can_retry = true;
> + void *ret = ERR_PTR(-EBUSY);
> +
> + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO));
> +
> + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
> + return NULL;
> + if (unlikely(!size))
> + return ZERO_SIZE_PTR;
> +
> + if (!USE_LOCKLESS_FAST_PATH() && (in_nmi() || in_hardirq()))
> + /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */
> + return NULL;
> +retry:
> + s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_);
> +
> + if (!(s->flags & __CMPXCHG_DOUBLE))
> + /*
> + * kmalloc_nolock() is not supported on architectures that
> + * don't implement cmpxchg16b.
> + */
> + return NULL;
Hmm when someone uses slab debugging flags (e.g., passing boot
parameter slab_debug=FPZ as a hardening option on production [1], or
just for debugging), __CMPXCHG_DOUBLE is not set even when the arch
supports it.
Is it okay to fail all kmalloc_nolock() calls in such cases?
[1] https://lore.kernel.org/linux-mm/20250421165508.make.689-kees@kernel.org/
> +
> + /*
> + * Do not call slab_alloc_node(), since trylock mode isn't
> + * compatible with slab_pre_alloc_hook/should_failslab and
> + * kfence_alloc.
> + *
> + * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
> + * in irq saved region. It assumes that the same cpu will not
> + * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
> + * Therefore use in_nmi() to check whether particular bucket is in
> + * irq protected section.
> + */
> + if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
> + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
> +
> + if (PTR_ERR(ret) == -EBUSY) {
> + if (can_retry) {
> + /* pick the next kmalloc bucket */
> + size = s->object_size + 1;
> + /*
> + * Another alternative is to
> + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
> + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
> + * to retry from bucket of the same size.
> + */
> + can_retry = false;
> + goto retry;
> + }
> + ret = NULL;
> + }
> +
> + maybe_wipe_obj_freeptr(s, ret);
> + /*
> + * Make sure memcg_stock.stock_lock doesn't change cpu
> + * when memcg layers access it.
> + */
> + slub_get_cpu_ptr(s->cpu_slab);
> + slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
> + slab_want_init_on_alloc(alloc_gfp, s), size);
> + slub_put_cpu_ptr(s->cpu_slab);
Should this migration prevention really be in slab, not in memcg?
> + ret = kasan_kmalloc(s, ret, size, alloc_gfp);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
> +
> void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
> int node, unsigned long caller)
> {
> @@ -4568,6 +4701,30 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> }
>
> #ifndef CONFIG_SLUB_TINY
> +static void free_deferred_objects(struct llist_head *llhead, unsigned long addr)
> +{
If SLUB_TINY is enabled and thus it does not perform deferred free,
what happens?
> + struct llist_node *llnode, *pos, *t;
> +
> + if (likely(llist_empty(llhead)))
> + return;
> +
> + llnode = llist_del_all(llhead);
> + llist_for_each_safe(pos, t, llnode) {
> + struct kmem_cache *s;
> + struct slab *slab;
> + void *x = pos;
> +
> + slab = virt_to_slab(x);
> + s = slab->slab_cache;
> +
> + /*
> + * memcg, kasan_slab_pre are already done for 'x'.
> + * The only thing left is kasan_poison.
> + */
> + kasan_slab_free(s, x, false, false, true);
> + __slab_free(s, slab, x, x, 1, addr);
> + }
> +}
> /*
> * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
> * can perform fastpath freeing without additional function calls.
--
Cheers,
Harry / Hyeonggon
next prev parent reply other threads:[~2025-05-09 1:04 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-05-01 3:27 [PATCH 0/6] mm: Reentrant kmalloc Alexei Starovoitov
2025-05-01 3:27 ` [PATCH 1/6] mm: Rename try_alloc_pages() to alloc_pages_nolock() Alexei Starovoitov
2025-05-06 8:26 ` Vlastimil Babka
2025-05-07 1:24 ` Alexei Starovoitov
2025-05-01 3:27 ` [PATCH 2/6] locking/local_lock: Expose dep_map in local_trylock_t Alexei Starovoitov
2025-05-06 12:56 ` Vlastimil Babka
2025-05-06 14:55 ` Vlastimil Babka
2025-05-07 1:25 ` Alexei Starovoitov
2025-05-12 13:26 ` Sebastian Andrzej Siewior
2025-05-12 16:46 ` Alexei Starovoitov
2025-05-01 3:27 ` [PATCH 3/6] locking/local_lock: Introduce local_lock_is_locked() Alexei Starovoitov
2025-05-06 12:59 ` Vlastimil Babka
2025-05-07 1:28 ` Alexei Starovoitov
2025-05-12 14:56 ` Sebastian Andrzej Siewior
2025-05-12 15:01 ` Vlastimil Babka
2025-05-12 15:23 ` Sebastian Andrzej Siewior
2025-05-01 3:27 ` [PATCH 4/6] locking/local_lock: Introduce local_lock_irqsave_check() Alexei Starovoitov
2025-05-07 13:02 ` Vlastimil Babka
2025-05-12 14:03 ` Sebastian Andrzej Siewior
2025-05-12 17:16 ` Alexei Starovoitov
2025-05-13 6:58 ` Vlastimil Babka
2025-05-13 21:55 ` Alexei Starovoitov
2025-05-01 3:27 ` [PATCH 5/6] mm: Allow GFP_ACCOUNT and GFP_COMP to be used in alloc_pages_nolock() Alexei Starovoitov
2025-05-06 8:55 ` Vlastimil Babka
2025-05-07 1:33 ` Alexei Starovoitov
2025-05-01 3:27 ` [PATCH 6/6] slab: Introduce kmalloc_nolock() and kfree_nolock() Alexei Starovoitov
2025-05-05 18:46 ` Shakeel Butt
2025-05-06 0:49 ` Alexei Starovoitov
2025-05-06 1:24 ` Shakeel Butt
2025-05-06 1:51 ` Alexei Starovoitov
2025-05-06 18:05 ` Shakeel Butt
2025-05-06 12:01 ` Vlastimil Babka
2025-05-07 0:31 ` Harry Yoo
2025-05-07 2:23 ` Alexei Starovoitov
2025-05-07 8:38 ` Vlastimil Babka
2025-05-07 2:20 ` Alexei Starovoitov
2025-05-07 10:44 ` Vlastimil Babka
2025-05-09 1:03 ` Harry Yoo [this message]
2025-06-24 17:13 ` SLAB_NO_CMPXCHG was:: " Alexei Starovoitov
2025-06-25 11:38 ` Harry Yoo
2025-06-26 20:03 ` Alexei Starovoitov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aB1UVkKSeJWEGECq@hyeyoo \
--to=harry.yoo@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=alexei.starovoitov@gmail.com \
--cc=andrii@kernel.org \
--cc=bigeasy@linutronix.de \
--cc=bpf@vger.kernel.org \
--cc=hannes@cmpxchg.org \
--cc=linux-mm@kvack.org \
--cc=memxor@gmail.com \
--cc=mhocko@suse.com \
--cc=peterz@infradead.org \
--cc=rostedt@goodmis.org \
--cc=shakeel.butt@linux.dev \
--cc=vbabka@suse.cz \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.