* [RFC PATCH] mm/sparse: remove sparse_buffer
@ 2026-04-07 8:39 Muchun Song
2026-04-08 13:40 ` David Hildenbrand (Arm)
0 siblings, 1 reply; 5+ messages in thread
From: Muchun Song @ 2026-04-07 8:39 UTC (permalink / raw)
To: Andrew Morton, David Hildenbrand
Cc: yinghai, Muchun Song, Muchun Song, Lorenzo Stoakes,
Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, linux-mm, linux-kernel
The sparse_buffer was originally introduced in commit 9bdac9142407
("sparsemem: Put mem map for one node together.") to allocate a
contiguous block of memory for all memmaps of a NUMA node.
However, the original commit message did not clearly state the actual
benefits or the necessity of keeping all memmap areas strictly
contiguous for a given node.
With the evolution of memory management over the years, the current code
only requires a 2MB contiguous allocation to support huge page mappings
for CONFIG_SPARSEMEM_VMEMMAP. Thus, it seems we no longer need such
complex logic to keep all memmap allocations completely contiguous
across the entire node.
Since the original commit was merged 16 years ago and no additional
context regarding its original intention could be found, this patch
proposes removing this mechanism to reduce the maintenance burden.
If anyone knows the historical background or if there are specific
architectures (Note that the mechanism implemented in 9bdac9142407 was
restricted to x86_64. Therefore, I doubt there are any functional
dependencies for other architectures.) or edge cases that still
rely on this, sharing that context would be highly appreciated.
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
include/linux/mm.h | 1 -
mm/sparse-vmemmap.c | 7 +-----
mm/sparse.c | 58 +--------------------------------------------
3 files changed, 2 insertions(+), 64 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b776907152e..1d676fef4303 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4855,7 +4855,6 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
}
#endif
-void *sparse_buffer_alloc(unsigned long size);
unsigned long section_map_size(void);
struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 6eadb9d116e4..aca1b00e86dd 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -87,15 +87,10 @@ static void * __meminit altmap_alloc_block_buf(unsigned long size,
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
struct vmem_altmap *altmap)
{
- void *ptr;
-
if (altmap)
return altmap_alloc_block_buf(size, altmap);
- ptr = sparse_buffer_alloc(size);
- if (!ptr)
- ptr = vmemmap_alloc_block(size, node);
- return ptr;
+ return vmemmap_alloc_block(size, node);
}
static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
diff --git a/mm/sparse.c b/mm/sparse.c
index effdac6b0ab1..672e2ad396a8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -241,12 +241,9 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
struct dev_pagemap *pgmap)
{
unsigned long size = section_map_size();
- struct page *map = sparse_buffer_alloc(size);
+ struct page *map;
phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
- if (map)
- return map;
-
map = memmap_alloc(size, size, addr, nid, false);
if (!map)
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
@@ -256,55 +253,6 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-static void *sparsemap_buf __meminitdata;
-static void *sparsemap_buf_end __meminitdata;
-
-static inline void __meminit sparse_buffer_free(unsigned long size)
-{
- WARN_ON(!sparsemap_buf || size == 0);
- memblock_free(sparsemap_buf, size);
-}
-
-static void __init sparse_buffer_init(unsigned long size, int nid)
-{
- phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
- WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
- /*
- * Pre-allocated buffer is mainly used by __populate_section_memmap
- * and we want it to be properly aligned to the section size - this is
- * especially the case for VMEMMAP which maps memmap to PMDs
- */
- sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
- sparsemap_buf_end = sparsemap_buf + size;
-}
-
-static void __init sparse_buffer_fini(void)
-{
- unsigned long size = sparsemap_buf_end - sparsemap_buf;
-
- if (sparsemap_buf && size > 0)
- sparse_buffer_free(size);
- sparsemap_buf = NULL;
-}
-
-void * __meminit sparse_buffer_alloc(unsigned long size)
-{
- void *ptr = NULL;
-
- if (sparsemap_buf) {
- ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
- if (ptr + size > sparsemap_buf_end)
- ptr = NULL;
- else {
- /* Free redundant aligned space */
- if ((unsigned long)(ptr - sparsemap_buf) > 0)
- sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
- sparsemap_buf = ptr + size;
- }
- }
- return ptr;
-}
-
void __weak __meminit vmemmap_populate_print_last(void)
{
}
@@ -362,8 +310,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
goto failed;
}
- sparse_buffer_init(map_count * section_map_size(), nid);
-
sparse_vmemmap_init_nid_early(nid);
for_each_present_section_nr(pnum_begin, pnum) {
@@ -381,7 +327,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
__func__, nid);
pnum_begin = pnum;
sparse_usage_fini();
- sparse_buffer_fini();
goto failed;
}
memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
@@ -390,7 +335,6 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
}
}
sparse_usage_fini();
- sparse_buffer_fini();
return;
failed:
/*
--
2.20.1
^ permalink raw reply related [flat|nested] 5+ messages in thread* Re: [RFC PATCH] mm/sparse: remove sparse_buffer
2026-04-07 8:39 [RFC PATCH] mm/sparse: remove sparse_buffer Muchun Song
@ 2026-04-08 13:40 ` David Hildenbrand (Arm)
2026-04-09 11:40 ` Muchun Song
0 siblings, 1 reply; 5+ messages in thread
From: David Hildenbrand (Arm) @ 2026-04-08 13:40 UTC (permalink / raw)
To: Muchun Song, Andrew Morton
Cc: yinghai, Muchun Song, Lorenzo Stoakes, Liam R. Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
linux-mm, linux-kernel
On 4/7/26 10:39, Muchun Song wrote:
> The sparse_buffer was originally introduced in commit 9bdac9142407
> ("sparsemem: Put mem map for one node together.") to allocate a
> contiguous block of memory for all memmaps of a NUMA node.
>
> However, the original commit message did not clearly state the actual
> benefits or the necessity of keeping all memmap areas strictly
> contiguous for a given node.
We don't want the memmap to be scattered around, given that it is one of
the biggest allocations during boot.
It's related to not turning too many memory blocks/sections
un-offlinable I think.
I always imagined that memblock would still keep these allocations close
to each other. Can you verify if that is indeed true?
--
Cheers,
David
^ permalink raw reply [flat|nested] 5+ messages in thread* Re: [RFC PATCH] mm/sparse: remove sparse_buffer
2026-04-08 13:40 ` David Hildenbrand (Arm)
@ 2026-04-09 11:40 ` Muchun Song
2026-04-09 12:29 ` David Hildenbrand (Arm)
0 siblings, 1 reply; 5+ messages in thread
From: Muchun Song @ 2026-04-09 11:40 UTC (permalink / raw)
To: David Hildenbrand (Arm)
Cc: Muchun Song, Andrew Morton, yinghai, Lorenzo Stoakes,
Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, linux-mm, linux-kernel
> On Apr 8, 2026, at 21:40, David Hildenbrand (Arm) <david@kernel.org> wrote:
>
> On 4/7/26 10:39, Muchun Song wrote:
>> The sparse_buffer was originally introduced in commit 9bdac9142407
>> ("sparsemem: Put mem map for one node together.") to allocate a
>> contiguous block of memory for all memmaps of a NUMA node.
>>
>> However, the original commit message did not clearly state the actual
>> benefits or the necessity of keeping all memmap areas strictly
>> contiguous for a given node.
>
> We don't want the memmap to be scattered around, given that it is one of
> the biggest allocations during boot.
>
> It's related to not turning too many memory blocks/sections
> un-offlinable I think.
Hi David,
Got it.
>
> I always imagined that memblock would still keep these allocations close
> to each other. Can you verify if that is indeed true?
You raised a very interesting point about whether memblock keeps
these allocations close to each other. I've done a thorough test
on a 16GB VM by printing the actual physical allocations.
I enabled the existing debug logs in arch/x86/mm/init_64.c to
trace the vmemmap_set_pmd allocations. Here is what really happens:
When using vmemmap_alloc_block without sparse_buffer, the
memblock allocator allocates 2MB chunks. Because memblock
allocates top-down by default, the physical allocations look
like this:
[ffe6475cc0000000-ffe6475cc01fffff] PMD -> [ff3cb082bfc00000-ff3cb082bfdfffff] on node 0
[ffe6475cc0200000-ffe6475cc03fffff] PMD -> [ff3cb082bfa00000-ff3cb082bfbfffff] on node 0
[ffe6475cc0400000-ffe6475cc05fffff] PMD -> [ff3cb082bf800000-ff3cb082bf9fffff] on node 0
[ffe6475cc0600000-ffe6475cc07fffff] PMD -> [ff3cb082bf600000-ff3cb082bf7fffff] on node 0
[ffe6475cc0800000-ffe6475cc09fffff] PMD -> [ff3cb082bf400000-ff3cb082bf5fffff] on node 0
[ffe6475cc0a00000-ffe6475cc0bfffff] PMD -> [ff3cb082bf200000-ff3cb082bf3fffff] on node 0
[ffe6475cc0c00000-ffe6475cc0dfffff] PMD -> [ff3cb082bf000000-ff3cb082bf1fffff] on node 0
[ffe6475cc0e00000-ffe6475cc0ffffff] PMD -> [ff3cb082bee00000-ff3cb082beffffff] on node 0
[ffe6475cc1000000-ffe6475cc11fffff] PMD -> [ff3cb082bec00000-ff3cb082bedfffff] on node 0
[ffe6475cc1200000-ffe6475cc13fffff] PMD -> [ff3cb082bea00000-ff3cb082bebfffff] on node 0
[ffe6475cc1400000-ffe6475cc15fffff] PMD -> [ff3cb082be800000-ff3cb082be9fffff] on node 0
[ffe6475cc1600000-ffe6475cc17fffff] PMD -> [ff3cb082be600000-ff3cb082be7fffff] on node 0
[ffe6475cc1800000-ffe6475cc19fffff] PMD -> [ff3cb082be400000-ff3cb082be5fffff] on node 0
[ffe6475cc1a00000-ffe6475cc1bfffff] PMD -> [ff3cb082be200000-ff3cb082be3fffff] on node 0
[ffe6475cc1c00000-ffe6475cc1dfffff] PMD -> [ff3cb082be000000-ff3cb082be1fffff] on node 0
[ffe6475cc1e00000-ffe6475cc1ffffff] PMD -> [ff3cb082bde00000-ff3cb082bdffffff] on node 0
[ffe6475cc2000000-ffe6475cc21fffff] PMD -> [ff3cb082bdc00000-ff3cb082bddfffff] on node 0
[ffe6475cc2200000-ffe6475cc23fffff] PMD -> [ff3cb082bda00000-ff3cb082bdbfffff] on node 0
[ffe6475cc2400000-ffe6475cc25fffff] PMD -> [ff3cb082bd800000-ff3cb082bd9fffff] on node 0
[ffe6475cc2600000-ffe6475cc27fffff] PMD -> [ff3cb082bd600000-ff3cb082bd7fffff] on node 0
[ffe6475cc2800000-ffe6475cc29fffff] PMD -> [ff3cb082bd400000-ff3cb082bd5fffff] on node 0
[ffe6475cc2a00000-ffe6475cc2bfffff] PMD -> [ff3cb082bd200000-ff3cb082bd3fffff] on node 0
[ffe6475cc2c00000-ffe6475cc2dfffff] PMD -> [ff3cb082bd000000-ff3cb082bd1fffff] on node 0
[ffe6475cc2e00000-ffe6475cc2ffffff] PMD -> [ff3cb082bce00000-ff3cb082bcffffff] on node 0
[ffe6475cc4000000-ffe6475cc41fffff] PMD -> [ff3cb082bcc00000-ff3cb082bcdfffff] on node 0
[ffe6475cc4200000-ffe6475cc43fffff] PMD -> [ff3cb082bca00000-ff3cb082bcbfffff] on node 0
[ffe6475cc4400000-ffe6475cc45fffff] PMD -> [ff3cb082bc800000-ff3cb082bc9fffff] on node 0
[ffe6475cc4600000-ffe6475cc47fffff] PMD -> [ff3cb082bc600000-ff3cb082bc7fffff] on node 0
[ffe6475cc4800000-ffe6475cc49fffff] PMD -> [ff3cb082bc400000-ff3cb082bc5fffff] on node 0
[ffe6475cc4a00000-ffe6475cc4bfffff] PMD -> [ff3cb082bc200000-ff3cb082bc3fffff] on node 0
[ffe6475cc4c00000-ffe6475cc4dfffff] PMD -> [ff3cb082bc000000-ff3cb082bc1fffff] on node 0
[ffe6475cc4e00000-ffe6475cc4ffffff] PMD -> [ff3cb082bbe00000-ff3cb082bbffffff] on node 0
[ffe6475cc5000000-ffe6475cc51fffff] PMD -> [ff3cb083bfa00000-ff3cb083bfbfffff] on node 1
[ffe6475cc5200000-ffe6475cc53fffff] PMD -> [ff3cb083bf800000-ff3cb083bf9fffff] on node 1
[ffe6475cc5400000-ffe6475cc55fffff] PMD -> [ff3cb083bf600000-ff3cb083bf7fffff] on node 1
[ffe6475cc5600000-ffe6475cc57fffff] PMD -> [ff3cb083bf400000-ff3cb083bf5fffff] on node 1
[ffe6475cc5800000-ffe6475cc59fffff] PMD -> [ff3cb083bf200000-ff3cb083bf3fffff] on node 1
[ffe6475cc5a00000-ffe6475cc5bfffff] PMD -> [ff3cb083bf000000-ff3cb083bf1fffff] on node 1
[ffe6475cc5c00000-ffe6475cc5dfffff] PMD -> [ff3cb083b6e00000-ff3cb083b6ffffff] on node 1
[ffe6475cc5e00000-ffe6475cc5ffffff] PMD -> [ff3cb083b6c00000-ff3cb083b6dfffff] on node 1
[ffe6475cc6000000-ffe6475cc61fffff] PMD -> [ff3cb083b6a00000-ff3cb083b6bfffff] on node 1
[ffe6475cc6200000-ffe6475cc63fffff] PMD -> [ff3cb083b6800000-ff3cb083b69fffff] on node 1
[ffe6475cc6400000-ffe6475cc65fffff] PMD -> [ff3cb083b6600000-ff3cb083b67fffff] on node 1
[ffe6475cc6600000-ffe6475cc67fffff] PMD -> [ff3cb083b6400000-ff3cb083b65fffff] on node 1
[ffe6475cc6800000-ffe6475cc69fffff] PMD -> [ff3cb083b6200000-ff3cb083b63fffff] on node 1
[ffe6475cc6a00000-ffe6475cc6bfffff] PMD -> [ff3cb083b6000000-ff3cb083b61fffff] on node 1
[ffe6475cc6c00000-ffe6475cc6dfffff] PMD -> [ff3cb083b5e00000-ff3cb083b5ffffff] on node 1
[ffe6475cc6e00000-ffe6475cc6ffffff] PMD -> [ff3cb083b5c00000-ff3cb083b5dfffff] on node 1
[ffe6475cc7000000-ffe6475cc71fffff] PMD -> [ff3cb083b5a00000-ff3cb083b5bfffff] on node 1
[ffe6475cc7200000-ffe6475cc73fffff] PMD -> [ff3cb083b5800000-ff3cb083b59fffff] on node 1
[ffe6475cc7400000-ffe6475cc75fffff] PMD -> [ff3cb083b5600000-ff3cb083b57fffff] on node 1
[ffe6475cc7600000-ffe6475cc77fffff] PMD -> [ff3cb083b5400000-ff3cb083b55fffff] on node 1
[ffe6475cc7800000-ffe6475cc79fffff] PMD -> [ff3cb083b5200000-ff3cb083b53fffff] on node 1
[ffe6475cc7a00000-ffe6475cc7bfffff] PMD -> [ff3cb083b5000000-ff3cb083b51fffff] on node 1
[ffe6475cc7c00000-ffe6475cc7dfffff] PMD -> [ff3cb083b4e00000-ff3cb083b4ffffff] on node 1
[ffe6475cc7e00000-ffe6475cc7ffffff] PMD -> [ff3cb083b4c00000-ff3cb083b4dfffff] on node 1
[ffe6475cc8000000-ffe6475cc81fffff] PMD -> [ff3cb083b4a00000-ff3cb083b4bfffff] on node 1
[ffe6475cc8200000-ffe6475cc83fffff] PMD -> [ff3cb083b4800000-ff3cb083b49fffff] on node 1
[ffe6475cc8400000-ffe6475cc85fffff] PMD -> [ff3cb083b4600000-ff3cb083b47fffff] on node 1
[ffe6475cc8600000-ffe6475cc87fffff] PMD -> [ff3cb083b4400000-ff3cb083b45fffff] on node 1
[ffe6475cc8800000-ffe6475cc89fffff] PMD -> [ff3cb083b4200000-ff3cb083b43fffff] on node 1
[ffe6475cc8a00000-ffe6475cc8bfffff] PMD -> [ff3cb083b4000000-ff3cb083b41fffff] on node 1
[ffe6475cc8c00000-ffe6475cc8dfffff] PMD -> [ff3cb083b3e00000-ff3cb083b3ffffff] on node 1
[ffe6475cc8e00000-ffe6475cc8ffffff] PMD -> [ff3cb083b3c00000-ff3cb083b3dfffff] on node 1
Notice that the physical chunks are strictly adjacent to each
other, but in descending order!
So, they are NOT "scattered around" the whole node randomly.
Instead, they are packed densely back-to-back in a single
contiguous physical range (just mapped top-down in 2MB pieces).
Because they are packed tightly together within the same
contiguous physical memory range, they will at most consume or
pollute the exact same number of memory blocks as a single
contiguous allocation (like sparse_buffer did). Therefore, this
will NOT turn additional memory blocks/sections into an
"un-offlinable" state.
It seems we can safely remove the sparse buffer preallocation
mechanism, don't you think?
Thanks,
Muchun
>
> --
> Cheers,
>
> David
^ permalink raw reply [flat|nested] 5+ messages in thread* Re: [RFC PATCH] mm/sparse: remove sparse_buffer
2026-04-09 11:40 ` Muchun Song
@ 2026-04-09 12:29 ` David Hildenbrand (Arm)
2026-04-09 15:10 ` Mike Rapoport
0 siblings, 1 reply; 5+ messages in thread
From: David Hildenbrand (Arm) @ 2026-04-09 12:29 UTC (permalink / raw)
To: Muchun Song
Cc: Muchun Song, Andrew Morton, yinghai, Lorenzo Stoakes,
Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, linux-mm, linux-kernel
On 4/9/26 13:40, Muchun Song wrote:
>
>
>> On Apr 8, 2026, at 21:40, David Hildenbrand (Arm) <david@kernel.org> wrote:
>>
>> On 4/7/26 10:39, Muchun Song wrote:
>>> The sparse_buffer was originally introduced in commit 9bdac9142407
>>> ("sparsemem: Put mem map for one node together.") to allocate a
>>> contiguous block of memory for all memmaps of a NUMA node.
>>>
>>> However, the original commit message did not clearly state the actual
>>> benefits or the necessity of keeping all memmap areas strictly
>>> contiguous for a given node.
>>
>> We don't want the memmap to be scattered around, given that it is one of
>> the biggest allocations during boot.
>>
>> It's related to not turning too many memory blocks/sections
>> un-offlinable I think.
>
> Hi David,
>
> Got it.
>
>>
>> I always imagined that memblock would still keep these allocations close
>> to each other. Can you verify if that is indeed true?
>
> You raised a very interesting point about whether memblock keeps
> these allocations close to each other. I've done a thorough test
> on a 16GB VM by printing the actual physical allocations.
>
> I enabled the existing debug logs in arch/x86/mm/init_64.c to
> trace the vmemmap_set_pmd allocations. Here is what really happens:
>
> When using vmemmap_alloc_block without sparse_buffer, the
> memblock allocator allocates 2MB chunks. Because memblock
> allocates top-down by default, the physical allocations look
> like this:
>
> [ffe6475cc0000000-ffe6475cc01fffff] PMD -> [ff3cb082bfc00000-ff3cb082bfdfffff] on node 0
> [ffe6475cc0200000-ffe6475cc03fffff] PMD -> [ff3cb082bfa00000-ff3cb082bfbfffff] on node 0
> [ffe6475cc0400000-ffe6475cc05fffff] PMD -> [ff3cb082bf800000-ff3cb082bf9fffff] on node 0
> [ffe6475cc0600000-ffe6475cc07fffff] PMD -> [ff3cb082bf600000-ff3cb082bf7fffff] on node 0
> [ffe6475cc0800000-ffe6475cc09fffff] PMD -> [ff3cb082bf400000-ff3cb082bf5fffff] on node 0
> [ffe6475cc0a00000-ffe6475cc0bfffff] PMD -> [ff3cb082bf200000-ff3cb082bf3fffff] on node 0
> [ffe6475cc0c00000-ffe6475cc0dfffff] PMD -> [ff3cb082bf000000-ff3cb082bf1fffff] on node 0
> [ffe6475cc0e00000-ffe6475cc0ffffff] PMD -> [ff3cb082bee00000-ff3cb082beffffff] on node 0
> [ffe6475cc1000000-ffe6475cc11fffff] PMD -> [ff3cb082bec00000-ff3cb082bedfffff] on node 0
> [ffe6475cc1200000-ffe6475cc13fffff] PMD -> [ff3cb082bea00000-ff3cb082bebfffff] on node 0
> [ffe6475cc1400000-ffe6475cc15fffff] PMD -> [ff3cb082be800000-ff3cb082be9fffff] on node 0
> [ffe6475cc1600000-ffe6475cc17fffff] PMD -> [ff3cb082be600000-ff3cb082be7fffff] on node 0
> [ffe6475cc1800000-ffe6475cc19fffff] PMD -> [ff3cb082be400000-ff3cb082be5fffff] on node 0
> [ffe6475cc1a00000-ffe6475cc1bfffff] PMD -> [ff3cb082be200000-ff3cb082be3fffff] on node 0
> [ffe6475cc1c00000-ffe6475cc1dfffff] PMD -> [ff3cb082be000000-ff3cb082be1fffff] on node 0
> [ffe6475cc1e00000-ffe6475cc1ffffff] PMD -> [ff3cb082bde00000-ff3cb082bdffffff] on node 0
> [ffe6475cc2000000-ffe6475cc21fffff] PMD -> [ff3cb082bdc00000-ff3cb082bddfffff] on node 0
> [ffe6475cc2200000-ffe6475cc23fffff] PMD -> [ff3cb082bda00000-ff3cb082bdbfffff] on node 0
> [ffe6475cc2400000-ffe6475cc25fffff] PMD -> [ff3cb082bd800000-ff3cb082bd9fffff] on node 0
> [ffe6475cc2600000-ffe6475cc27fffff] PMD -> [ff3cb082bd600000-ff3cb082bd7fffff] on node 0
> [ffe6475cc2800000-ffe6475cc29fffff] PMD -> [ff3cb082bd400000-ff3cb082bd5fffff] on node 0
> [ffe6475cc2a00000-ffe6475cc2bfffff] PMD -> [ff3cb082bd200000-ff3cb082bd3fffff] on node 0
> [ffe6475cc2c00000-ffe6475cc2dfffff] PMD -> [ff3cb082bd000000-ff3cb082bd1fffff] on node 0
> [ffe6475cc2e00000-ffe6475cc2ffffff] PMD -> [ff3cb082bce00000-ff3cb082bcffffff] on node 0
> [ffe6475cc4000000-ffe6475cc41fffff] PMD -> [ff3cb082bcc00000-ff3cb082bcdfffff] on node 0
> [ffe6475cc4200000-ffe6475cc43fffff] PMD -> [ff3cb082bca00000-ff3cb082bcbfffff] on node 0
> [ffe6475cc4400000-ffe6475cc45fffff] PMD -> [ff3cb082bc800000-ff3cb082bc9fffff] on node 0
> [ffe6475cc4600000-ffe6475cc47fffff] PMD -> [ff3cb082bc600000-ff3cb082bc7fffff] on node 0
> [ffe6475cc4800000-ffe6475cc49fffff] PMD -> [ff3cb082bc400000-ff3cb082bc5fffff] on node 0
> [ffe6475cc4a00000-ffe6475cc4bfffff] PMD -> [ff3cb082bc200000-ff3cb082bc3fffff] on node 0
> [ffe6475cc4c00000-ffe6475cc4dfffff] PMD -> [ff3cb082bc000000-ff3cb082bc1fffff] on node 0
> [ffe6475cc4e00000-ffe6475cc4ffffff] PMD -> [ff3cb082bbe00000-ff3cb082bbffffff] on node 0
> [ffe6475cc5000000-ffe6475cc51fffff] PMD -> [ff3cb083bfa00000-ff3cb083bfbfffff] on node 1
> [ffe6475cc5200000-ffe6475cc53fffff] PMD -> [ff3cb083bf800000-ff3cb083bf9fffff] on node 1
> [ffe6475cc5400000-ffe6475cc55fffff] PMD -> [ff3cb083bf600000-ff3cb083bf7fffff] on node 1
> [ffe6475cc5600000-ffe6475cc57fffff] PMD -> [ff3cb083bf400000-ff3cb083bf5fffff] on node 1
> [ffe6475cc5800000-ffe6475cc59fffff] PMD -> [ff3cb083bf200000-ff3cb083bf3fffff] on node 1
> [ffe6475cc5a00000-ffe6475cc5bfffff] PMD -> [ff3cb083bf000000-ff3cb083bf1fffff] on node 1
> [ffe6475cc5c00000-ffe6475cc5dfffff] PMD -> [ff3cb083b6e00000-ff3cb083b6ffffff] on node 1
> [ffe6475cc5e00000-ffe6475cc5ffffff] PMD -> [ff3cb083b6c00000-ff3cb083b6dfffff] on node 1
> [ffe6475cc6000000-ffe6475cc61fffff] PMD -> [ff3cb083b6a00000-ff3cb083b6bfffff] on node 1
> [ffe6475cc6200000-ffe6475cc63fffff] PMD -> [ff3cb083b6800000-ff3cb083b69fffff] on node 1
> [ffe6475cc6400000-ffe6475cc65fffff] PMD -> [ff3cb083b6600000-ff3cb083b67fffff] on node 1
> [ffe6475cc6600000-ffe6475cc67fffff] PMD -> [ff3cb083b6400000-ff3cb083b65fffff] on node 1
> [ffe6475cc6800000-ffe6475cc69fffff] PMD -> [ff3cb083b6200000-ff3cb083b63fffff] on node 1
> [ffe6475cc6a00000-ffe6475cc6bfffff] PMD -> [ff3cb083b6000000-ff3cb083b61fffff] on node 1
> [ffe6475cc6c00000-ffe6475cc6dfffff] PMD -> [ff3cb083b5e00000-ff3cb083b5ffffff] on node 1
> [ffe6475cc6e00000-ffe6475cc6ffffff] PMD -> [ff3cb083b5c00000-ff3cb083b5dfffff] on node 1
> [ffe6475cc7000000-ffe6475cc71fffff] PMD -> [ff3cb083b5a00000-ff3cb083b5bfffff] on node 1
> [ffe6475cc7200000-ffe6475cc73fffff] PMD -> [ff3cb083b5800000-ff3cb083b59fffff] on node 1
> [ffe6475cc7400000-ffe6475cc75fffff] PMD -> [ff3cb083b5600000-ff3cb083b57fffff] on node 1
> [ffe6475cc7600000-ffe6475cc77fffff] PMD -> [ff3cb083b5400000-ff3cb083b55fffff] on node 1
> [ffe6475cc7800000-ffe6475cc79fffff] PMD -> [ff3cb083b5200000-ff3cb083b53fffff] on node 1
> [ffe6475cc7a00000-ffe6475cc7bfffff] PMD -> [ff3cb083b5000000-ff3cb083b51fffff] on node 1
> [ffe6475cc7c00000-ffe6475cc7dfffff] PMD -> [ff3cb083b4e00000-ff3cb083b4ffffff] on node 1
> [ffe6475cc7e00000-ffe6475cc7ffffff] PMD -> [ff3cb083b4c00000-ff3cb083b4dfffff] on node 1
> [ffe6475cc8000000-ffe6475cc81fffff] PMD -> [ff3cb083b4a00000-ff3cb083b4bfffff] on node 1
> [ffe6475cc8200000-ffe6475cc83fffff] PMD -> [ff3cb083b4800000-ff3cb083b49fffff] on node 1
> [ffe6475cc8400000-ffe6475cc85fffff] PMD -> [ff3cb083b4600000-ff3cb083b47fffff] on node 1
> [ffe6475cc8600000-ffe6475cc87fffff] PMD -> [ff3cb083b4400000-ff3cb083b45fffff] on node 1
> [ffe6475cc8800000-ffe6475cc89fffff] PMD -> [ff3cb083b4200000-ff3cb083b43fffff] on node 1
> [ffe6475cc8a00000-ffe6475cc8bfffff] PMD -> [ff3cb083b4000000-ff3cb083b41fffff] on node 1
> [ffe6475cc8c00000-ffe6475cc8dfffff] PMD -> [ff3cb083b3e00000-ff3cb083b3ffffff] on node 1
> [ffe6475cc8e00000-ffe6475cc8ffffff] PMD -> [ff3cb083b3c00000-ff3cb083b3dfffff] on node 1
>
> Notice that the physical chunks are strictly adjacent to each
> other, but in descending order!
>
> So, they are NOT "scattered around" the whole node randomly.
> Instead, they are packed densely back-to-back in a single
> contiguous physical range (just mapped top-down in 2MB pieces).
>
> Because they are packed tightly together within the same
> contiguous physical memory range, they will at most consume or
> pollute the exact same number of memory blocks as a single
> contiguous allocation (like sparse_buffer did). Therefore, this
> will NOT turn additional memory blocks/sections into an
> "un-offlinable" state.
>
> It seems we can safely remove the sparse buffer preallocation
> mechanism, don't you think?
Yes, what I suspected. Is there a performance implication when doing
many individual memmap_alloc(), for example, on a larger system with
many sections?
--
Cheers,
David
^ permalink raw reply [flat|nested] 5+ messages in thread* Re: [RFC PATCH] mm/sparse: remove sparse_buffer
2026-04-09 12:29 ` David Hildenbrand (Arm)
@ 2026-04-09 15:10 ` Mike Rapoport
0 siblings, 0 replies; 5+ messages in thread
From: Mike Rapoport @ 2026-04-09 15:10 UTC (permalink / raw)
To: David Hildenbrand (Arm)
Cc: Muchun Song, Muchun Song, Andrew Morton, yinghai, Lorenzo Stoakes,
Liam R. Howlett, Vlastimil Babka, Suren Baghdasaryan,
Michal Hocko, linux-mm, linux-kernel
Hi,
On Thu, Apr 09, 2026 at 02:29:38PM +0200, David Hildenbrand (Arm) wrote:
> On 4/9/26 13:40, Muchun Song wrote:
> >
> >
> >> On Apr 8, 2026, at 21:40, David Hildenbrand (Arm) <david@kernel.org> wrote:
> >>
> >> On 4/7/26 10:39, Muchun Song wrote:
> >>> The sparse_buffer was originally introduced in commit 9bdac9142407
> >>> ("sparsemem: Put mem map for one node together.") to allocate a
> >>> contiguous block of memory for all memmaps of a NUMA node.
> >>>
> >>> However, the original commit message did not clearly state the actual
> >>> benefits or the necessity of keeping all memmap areas strictly
> >>> contiguous for a given node.
> >>
> >> We don't want the memmap to be scattered around, given that it is one of
> >> the biggest allocations during boot.
> >>
> >> It's related to not turning too many memory blocks/sections
> >> un-offlinable I think.
> >>
> >> I always imagined that memblock would still keep these allocations close
> >> to each other. Can you verify if that is indeed true?
> >
> > You raised a very interesting point about whether memblock keeps
> > these allocations close to each other. I've done a thorough test
> > on a 16GB VM by printing the actual physical allocations.
memblock always allocates in order, so if there are no other memblock
allocations between the calls to memmap_alloc(), all these allocations will
be together and they all will be coalesced to a single region in
memblock.reserved.
> > I enabled the existing debug logs in arch/x86/mm/init_64.c to
> > trace the vmemmap_set_pmd allocations. Here is what really happens:
> >
> > When using vmemmap_alloc_block without sparse_buffer, the
> > memblock allocator allocates 2MB chunks. Because memblock
> > allocates top-down by default, the physical allocations look
> > like this:
> >
> > [ffe6475cc0000000-ffe6475cc01fffff] PMD -> [ff3cb082bfc00000-ff3cb082bfdfffff] on node 0
> > [ffe6475cc0200000-ffe6475cc03fffff] PMD -> [ff3cb082bfa00000-ff3cb082bfbfffff] on node 0
> > [ffe6475cc0400000-ffe6475cc05fffff] PMD -> [ff3cb082bf800000-ff3cb082bf9fffff] on node 0
...
> > Notice that the physical chunks are strictly adjacent to each
> > other, but in descending order!
> >
> > So, they are NOT "scattered around" the whole node randomly.
> > Instead, they are packed densely back-to-back in a single
> > contiguous physical range (just mapped top-down in 2MB pieces).
> >
> > Because they are packed tightly together within the same
> > contiguous physical memory range, they will at most consume or
> > pollute the exact same number of memory blocks as a single
> > contiguous allocation (like sparse_buffer did). Therefore, this
> > will NOT turn additional memory blocks/sections into an
> > "un-offlinable" state.
> >
> > It seems we can safely remove the sparse buffer preallocation
> > mechanism, don't you think?
>
> Yes, what I suspected. Is there a performance implication when doing
> many individual memmap_alloc(), for example, on a larger system with
> many sections?
memmap_alloc() will be slower than sparse_buffer_alloc(), allocating from
memblock is more involved than sparse_buffer_alloc(), but without
measurements it's hard to tell how much it'll affect overall sparse_init().
> --
> Cheers,
>
> David
--
Sincerely yours,
Mike.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2026-04-09 15:10 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-07 8:39 [RFC PATCH] mm/sparse: remove sparse_buffer Muchun Song
2026-04-08 13:40 ` David Hildenbrand (Arm)
2026-04-09 11:40 ` Muchun Song
2026-04-09 12:29 ` David Hildenbrand (Arm)
2026-04-09 15:10 ` Mike Rapoport
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox