From: Harry Yoo <harry@kernel.org>
To: Suren Baghdasaryan <surenb@google.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>,
"Vlastimil Babka (SUSE)" <vbabka@kernel.org>,
Andrew Morton <akpm@linux-foundation.org>,
Roman Gushchin <roman.gushchin@linux.dev>,
Hao Li <hao.li@linux.dev>, Christoph Lameter <cl@gentwo.org>,
David Rientjes <rientjes@google.com>,
Usama Arif <usama.arif@linux.dev>,
Meta kernel team <kernel-team@meta.com>,
linux-mm@kvack.org, linux-kernel@vger.kernel.org,
Danielle Costantino <dcostantino@meta.com>,
Kees Cook <kees@kernel.org>
Subject: Re: [PATCH] mm/slub: serve slabobj_ext array from a strictly larger kmalloc cache
Date: Wed, 1 Jul 2026 16:42:13 +0900 [thread overview]
Message-ID: <68e7f6cd-cf11-46b2-84a2-d512bb22dae4@kernel.org> (raw)
In-Reply-To: <92bf5e21-690e-4a77-929e-5217e0d7cb0c@kernel.org>
On 7/1/26 1:53 PM, Harry Yoo wrote:
>
>
> On 7/1/26 1:30 PM, Harry Yoo wrote:
>> We can do that in pre-7.2 kernels, by teaching kmalloc_type() and
>> kmalloc_slab() to select the new KMALLOC_TYPE based on __GFP_NO_OBJ_EXT?
>>
>> e.g.) Select the new KMALLOC_TYPE when KMALLOC_NOT_NORMAL_BITS is not
>> set AND __GFP_NO_OBJ_EXT is set.
>
> Uh, this is a bit subtle though.
>
> In some cases KMALLOC_DMA == KMALLOC_NORMAL,
> KMALLOC_CGROUP == KMALLOC_NORMAL,
> or KMALLOC_RECLAIM == KMALLOC_NORMAL.
>
> Just checking KMALLOC_NOT_NORMAL_BITS is misleading.
Here's a prototype for slab/for-next. Backporting it requires handling
__GFP_NO_OBJ_EXT instead of SLAB_ALLOC_NO_RECURSE, but that shouldn't be
too difficult. I'm now writing the changelog and going through testing...
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 51f03f18c9a7..91a71537a2fe 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -684,6 +684,26 @@ static inline unsigned int arch_slab_minalign(void)
#define KMALLOC_PARTITION_CACHES_NR 0
#endif
+/*
+ * SLUB needs a separate kmalloc type, KMALLOC_NO_RECURSE, when internal slab
+ * metadata of kmalloc objects can be allocated from the same kmalloc type.
+ */
+#if defined(CONFIG_MEM_ALLOC_PROFILING)
+/*
+ * Memory allocation profiling can allocate internal slab metadata
+ * for any slab cache.
+ */
+#define HAS_KMALLOC_NO_RECURSE
+#elif defined(CONFIG_SLUB_TINY) && defined(CONFIG_MEMCG)
+/*
+ * Accounted slab objects are usually allocated from KMALLOC_CGROUP.
+ * On SLUB_TINY, those can be allocated from KMALLOC_NORMAL because
+ * KMALLOC_RECLAIM aliases with KMALLOC_NORMAL and has higher priority than
+ * KMALLOC_CGROUP.
+ */
+#define HAS_KMALLOC_NO_RECURSE
+#endif
+
/*
* Whenever changing this, take care of that kmalloc_type() and
* create_kmalloc_caches() still work as intended.
@@ -702,6 +722,9 @@ enum kmalloc_cache_type {
#endif
KMALLOC_PARTITION_START = KMALLOC_NORMAL,
KMALLOC_PARTITION_END = KMALLOC_PARTITION_START +
KMALLOC_PARTITION_CACHES_NR,
+#ifdef HAS_KMALLOC_NO_RECURSE
+ KMALLOC_NO_RECURSE,
+#endif
#ifdef CONFIG_SLUB_TINY
KMALLOC_RECLAIM = KMALLOC_NORMAL,
#else
@@ -716,6 +739,16 @@ enum kmalloc_cache_type {
NR_KMALLOC_TYPES
};
+#if !defined(HAS_KMALLOC_NO_RECURSE) && defined(CONFIG_SLAB_OBJ_EXT)
+/*
+ * kmalloc_flags() with SLAB_ALLOC_NO_RECURSE should not use KMALLOC_NORMAL
+ * if any of these alias with KMALLOC_NORMAL.
+ */
+static_assert(KMALLOC_DMA != KMALLOC_NORMAL);
+static_assert(KMALLOC_CGROUP != KMALLOC_NORMAL);
+static_assert(KMALLOC_RECLAIM != KMALLOC_NORMAL);
+#endif
+
typedef struct kmem_cache * kmem_buckets[KMALLOC_SHIFT_HIGH + 1];
extern kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES];
diff --git a/mm/slab.h b/mm/slab.h
index 281a65233795..ba0560111488 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -386,12 +386,21 @@ static inline unsigned int
size_index_elem(unsigned int bytes)
* KMALLOC_MAX_CACHE_SIZE and the caller must check that.
*/
static inline struct kmem_cache *
-kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, kmalloc_token_t
token)
+kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, kmalloc_token_t
token,
+ unsigned int alloc_flags)
{
unsigned int index;
+ enum kmalloc_cache_type type = kmalloc_type(flags, token);
+
+#ifdef HAS_KMALLOC_NO_RECURSE
+ if (type >= KMALLOC_PARTITION_START &&
+ type <= KMALLOC_PARTITION_END &&
+ (alloc_flags & SLAB_ALLOC_NO_RECURSE))
+ type = KMALLOC_NO_RECURSE;
+#endif
if (!b)
- b = &kmalloc_caches[kmalloc_type(flags, token)];
+ b = &kmalloc_caches[type];
if (size <= 192)
index = kmalloc_size_index[size_index_elem(size)];
else
diff --git a/mm/slab_common.c b/mm/slab_common.c
index b6426d7ceec9..8541f4a9cfda 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -783,11 +783,15 @@ u8 kmalloc_size_index[24] __ro_after_init = {
size_t kmalloc_size_roundup(size_t size)
{
if (size && size <= KMALLOC_MAX_CACHE_SIZE) {
+ struct kmem_cache *s;
+
/*
* The flags don't matter since size_index is common to all.
* Neither does the caller for just getting ->object_size.
*/
- return kmalloc_slab(size, NULL, GFP_KERNEL,
__kmalloc_token(0))->object_size;
+ s = kmalloc_slab(size, NULL, GFP_KERNEL, __kmalloc_token(0),
+ SLAB_ALLOC_DEFAULT);
+ return s->object_size;
}
/* Above the smaller buckets, size is a multiple of page size. */
@@ -843,6 +847,12 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
#define KMALLOC_PARTITION_NAME(N, sz)
#endif
+#ifdef HAS_KMALLOC_NO_RECURSE
+#define KMALLOC_NO_RECURSE_NAME(sz) .name[KMALLOC_NO_RECURSE] =
"kmalloc-no-recurse-" #sz,
+#else
+#define KMALLOC_NO_RECURSE_NAME(sz)
+#endif
+
#define INIT_KMALLOC_INFO(__size, __short_size) \
{ \
.name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
@@ -850,6 +860,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
KMALLOC_CGROUP_NAME(__short_size) \
KMALLOC_DMA_NAME(__short_size) \
KMALLOC_PARTITION_NAME(KMALLOC_PARTITION_CACHES_NR, __short_size) \
+ KMALLOC_NO_RECURSE_NAME(__short_size) \
.size = __size, \
}
@@ -966,6 +977,11 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type
type)
flags |= SLAB_NO_MERGE;
#endif
+#ifdef HAS_KMALLOC_NO_RECURSE
+ if (type == KMALLOC_NO_RECURSE)
+ flags |= SLAB_NO_OBJ_EXT;
+#endif
+
/*
* If CONFIG_MEMCG is enabled, disable cache merging for
* KMALLOC_NORMAL caches.
diff --git a/mm/slub.c b/mm/slub.c
index 9f754cf1c187..a5745759f0af 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2123,42 +2123,6 @@ static inline void init_slab_obj_exts(struct slab
*slab)
slab->obj_exts = 0;
}
-/*
- * Calculate the allocation size for slabobj_ext array.
- *
- * When memory allocation profiling is enabled, the obj_exts array
- * could be allocated from the same slab cache it's being allocated for.
- * This would prevent the slab from ever being freed because it would
- * always contain at least one allocated object (its own obj_exts array).
- *
- * To avoid this, increase the allocation size when we detect the array
- * may come from the same cache, forcing it to use a different cache.
- */
-static inline size_t obj_exts_alloc_size(struct kmem_cache *s,
- struct slab *slab, gfp_t gfp)
-{
- size_t sz = sizeof(struct slabobj_ext) * slab->objects;
- struct kmem_cache *obj_exts_cache;
-
- if (sz > KMALLOC_MAX_CACHE_SIZE)
- return sz;
-
- if (!is_kmalloc_normal(s))
- return sz;
-
- obj_exts_cache = kmalloc_slab(sz, NULL, gfp, __kmalloc_token(0));
- /*
- * We can't simply compare s with obj_exts_cache, because partitioned kmalloc
- * caches have multiple caches per size, selected by caller address or type.
- * Since caller address or type may differ between kmalloc_slab() and actual
- * allocation, bump size when sizes are equal.
- */
- if (s->object_size == obj_exts_cache->object_size)
- return obj_exts_cache->object_size + 1;
-
- return sz;
-}
-
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, unsigned int alloc_flags)
{
@@ -2168,15 +2132,13 @@ int alloc_slab_obj_exts(struct slab *slab,
struct kmem_cache *s,
unsigned long new_exts;
unsigned long old_exts;
struct slabobj_ext *vec;
- size_t sz;
+ size_t sz = sizeof(struct slabobj_ext) * slab->objects;
gfp &= ~OBJCGS_CLEAR_MASK;
/* Prevent recursive extension vector allocation */
alloc_flags |= SLAB_ALLOC_NO_RECURSE;
alloc_flags &= ~SLAB_ALLOC_NEW_SLAB;
- sz = obj_exts_alloc_size(s, slab, gfp);
-
/* This will use kmalloc_nolock() if alloc_flags say so */
vec = kmalloc_flags(sz, gfp | __GFP_ZERO, alloc_flags, slab_nid(slab));
@@ -5330,7 +5292,7 @@ void *__do_kmalloc_node(kmem_buckets *b, gfp_t
flags, int node,
if (unlikely(!size))
return ZERO_SIZE_PTR;
- s = kmalloc_slab(size, b, flags, token);
+ s = kmalloc_slab(size, b, flags, token, ac->alloc_flags);
ret = slab_alloc_node(s, flags, node, ac);
ret = kasan_kmalloc(s, ret, size, flags);
@@ -5395,7 +5357,9 @@ static void
*__kmalloc_nolock_noprof(DECL_TOKEN_PARAMS(size, token), gfp_t gfp_f
retry:
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
return NULL;
- s = kmalloc_slab(size, NULL, gfp_flags, PASS_TOKEN_PARAM(token));
+
+ s = kmalloc_slab(size, NULL, gfp_flags, PASS_TOKEN_PARAM(token),
+ ac->alloc_flags);
if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
/*
--
Cheers,
Harry / Hyeonggon
next prev parent reply other threads:[~2026-07-01 7:42 UTC|newest]
Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-25 23:00 [PATCH] mm/slub: serve slabobj_ext array from a strictly larger kmalloc cache Shakeel Butt
2026-06-26 4:22 ` Harry Yoo
2026-06-26 16:49 ` Shakeel Butt
2026-06-26 17:11 ` Vlastimil Babka (SUSE)
2026-06-28 2:58 ` Shakeel Butt
2026-06-28 3:23 ` Shakeel Butt
2026-06-28 7:47 ` Vlastimil Babka (SUSE)
2026-06-28 9:22 ` Harry Yoo
2026-06-28 23:37 ` Suren Baghdasaryan
2026-06-29 3:57 ` Harry Yoo
2026-06-29 4:28 ` Suren Baghdasaryan
2026-06-29 19:52 ` Shakeel Butt
2026-06-30 2:03 ` Harry Yoo
2026-06-30 2:30 ` Harry Yoo
2026-06-30 4:38 ` Suren Baghdasaryan
2026-06-30 4:39 ` Suren Baghdasaryan
2026-06-30 4:42 ` Harry Yoo
2026-06-30 5:29 ` Suren Baghdasaryan
2026-06-30 6:12 ` Vlastimil Babka (SUSE)
2026-06-30 7:03 ` Harry Yoo
2026-06-30 14:35 ` Shakeel Butt
2026-06-30 14:52 ` Suren Baghdasaryan
2026-06-30 15:27 ` Harry Yoo
2026-06-30 23:55 ` Suren Baghdasaryan
2026-07-01 4:30 ` Harry Yoo
2026-07-01 4:53 ` Harry Yoo
2026-07-01 7:42 ` Harry Yoo [this message]
2026-07-01 8:43 ` Harry Yoo
2026-07-01 10:31 ` Harry Yoo
2026-07-01 11:37 ` Suren Baghdasaryan
2026-06-28 8:10 ` Harry Yoo
2026-06-28 8:36 ` Harry Yoo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=68e7f6cd-cf11-46b2-84a2-d512bb22dae4@kernel.org \
--to=harry@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=cl@gentwo.org \
--cc=dcostantino@meta.com \
--cc=hao.li@linux.dev \
--cc=kees@kernel.org \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=rientjes@google.com \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=surenb@google.com \
--cc=usama.arif@linux.dev \
--cc=vbabka@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox