LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 34/69] mm/sparse: Inline usemap allocation into sparse_init_nid()
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

After removing SPARSEMEM_VMEMMAP_PREINIT, sparse_init_nid() no longer
needs the transient sparse_usagebuf state and its helper wrappers.

Allocate the usemap buffer directly in sparse_init_nid(), pass it to
sparse_init_one_section(), and drop sparse_usage_init(),
sparse_usage_fini(), and sparse_init_early_section().

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/mmzone.h |  3 ---
 mm/sparse.c            | 46 +++++++-----------------------------------
 2 files changed, 7 insertions(+), 42 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b9baef8cca91..a60fd5785fa5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -2265,9 +2265,6 @@ static inline bool section_vmemmap_optimizable(const struct mem_section *section
 	return section_order(section) >= OPTIMIZABLE_FOLIO_MIN_ORDER;
 }
 
-void sparse_init_early_section(int nid, struct page *map, unsigned long pnum,
-			       unsigned long flags);
-
 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
 /**
  * pfn_valid - check if there is a valid memory map entry for a PFN
diff --git a/mm/sparse.c b/mm/sparse.c
index eab37504819d..54c38ea08190 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -237,42 +237,6 @@ void __weak __meminit vmemmap_populate_print_last(void)
 {
 }
 
-static void *sparse_usagebuf __meminitdata;
-static void *sparse_usagebuf_end __meminitdata;
-
-/*
- * Helper function that is used for generic section initialization, and
- * can also be used by any hooks added above.
- */
-void __init sparse_init_early_section(int nid, struct page *map,
-				      unsigned long pnum, unsigned long flags)
-{
-	BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end);
-	sparse_init_one_section(__nr_to_section(pnum), pnum, map,
-			sparse_usagebuf, SECTION_IS_EARLY | flags);
-	sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size();
-}
-
-static int __init sparse_usage_init(int nid, unsigned long map_count)
-{
-	unsigned long size;
-
-	size = mem_section_usage_size() * map_count;
-	sparse_usagebuf = memblock_alloc_node(size, SMP_CACHE_BYTES, nid);
-	if (!sparse_usagebuf) {
-		sparse_usagebuf_end = NULL;
-		return -ENOMEM;
-	}
-
-	sparse_usagebuf_end = sparse_usagebuf + size;
-	return 0;
-}
-
-static void __init sparse_usage_fini(void)
-{
-	sparse_usagebuf = sparse_usagebuf_end = NULL;
-}
-
 int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages,
 		struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
 {
@@ -312,8 +276,11 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 				   unsigned long map_count)
 {
 	unsigned long pnum;
+	struct mem_section_usage *usage;
 
-	if (sparse_usage_init(nid, map_count))
+	usage = memblock_alloc_node(map_count * mem_section_usage_size(),
+				    SMP_CACHE_BYTES, nid);
+	if (!usage)
 		panic("Failed to allocate usemap for node %d\n", nid);
 
 	for_each_present_section_nr(pnum_begin, pnum) {
@@ -329,9 +296,10 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 			panic("Failed to allocate memmap for section %lu\n", pnum);
 		memmap_boot_pages_add(section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION,
 							       NULL, NULL));
-		sparse_init_early_section(nid, map, pnum, 0);
+		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
+					SECTION_IS_EARLY);
+		usage = (void *)usage + mem_section_usage_size();
 	}
-	sparse_usage_fini();
 }
 
 /*
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 35/69] mm/hugetlb: Remove HUGE_BOOTMEM_HVO
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

The HUGE_BOOTMEM_HVO flag tracked whether a bootmem huge page had
already gone through the old early vmemmap optimization path.

Now that HugeTLB uses section-based vmemmap optimization, that state is
already reflected in the section order.

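Remove HUGE_BOOTMEM_HVO and its helper, and use the section state
directly when deciding whether to mark a folio as vmemmap-optimized.
To do so, add an order_vmemmap_optimizable() helper and reuse it in
section_vmemmap_optimizable().  While at it, renumber the remaining
HUGE_BOOTMEM_* flags using BIT().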

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/hugetlb.h |  5 ++---
 include/linux/mmzone.h  |  7 ++++++-
 mm/hugetlb.c            | 12 +-----------
 mm/hugetlb_vmemmap.c    |  2 --
 4 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index dce8969961ea..18af8f304b95 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -695,9 +695,8 @@ struct huge_bootmem_page {
 	unsigned long flags;
 };
 
-#define HUGE_BOOTMEM_HVO		0x0001
-#define HUGE_BOOTMEM_ZONES_VALID	0x0002
-#define HUGE_BOOTMEM_CMA		0x0004
+#define HUGE_BOOTMEM_ZONES_VALID	BIT(0)
+#define HUGE_BOOTMEM_CMA		BIT(1)
 
 int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list);
 int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a60fd5785fa5..9b87d798a365 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -117,6 +117,11 @@
 #define NR_OPTIMIZABLE_FOLIO_ORDERS		\
 	(__NR_OPTIMIZABLE_FOLIO_ORDERS > 0 ? __NR_OPTIMIZABLE_FOLIO_ORDERS : 0)
 
+static inline bool order_vmemmap_optimizable(unsigned int order)
+{
+	return order >= OPTIMIZABLE_FOLIO_MIN_ORDER;
+}
+
 enum migratetype {
 	MIGRATE_UNMOVABLE,
 	MIGRATE_MOVABLE,
@@ -2262,7 +2267,7 @@ static inline bool section_vmemmap_optimizable(const struct mem_section *section
 	if (!is_power_of_2(sizeof(struct page)))
 		return false;
 
-	return section_order(section) >= OPTIMIZABLE_FOLIO_MIN_ORDER;
+	return order_vmemmap_optimizable(section_order(section));
 }
 
 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 080f130017e3..abd79bb85b1c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3169,11 +3169,6 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
 	prep_compound_head(&folio->page, huge_page_order(h));
 }
 
-static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
-{
-	return m->flags & HUGE_BOOTMEM_HVO;
-}
-
 static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
 {
 	return m->flags & HUGE_BOOTMEM_CMA;
@@ -3265,12 +3260,7 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid)
 					   OPTIMIZED_FOLIO_VMEMMAP_NR_STRUCT_PAGES);
 		init_new_hugetlb_folio(folio);
 
-		if (hugetlb_bootmem_page_prehvo(m)) {
-			/*
-			 * If pre-HVO was done, just set the
-			 * flag, the HVO code will then skip
-			 * this folio.
-			 */
+		if (order_vmemmap_optimizable(pfn_to_section_order(folio_pfn(folio)))) {
 			folio_set_hugetlb_vmemmap_optimized(folio);
 			section_set_order_range(folio_pfn(folio), folio_nr_pages(folio), 0);
 		}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 730190390ba9..66362e553870 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -718,8 +718,6 @@ void __init hugetlb_vmemmap_optimize_bootmem_page(struct huge_bootmem_page *m)
 		return;
 
 	section_set_order_range(pfn, pages_per_huge_page(h), huge_page_order(h));
-	if (section_vmemmap_optimizable(__pfn_to_section(pfn)))
-		m->flags |= HUGE_BOOTMEM_HVO;
 }
 
 static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 36/69] mm/hugetlb: Remove HUGE_BOOTMEM_CMA
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

Track early CMA hugetlb pages from the hstate instead of storing a
redundant bootmem flag. This removes the HUGE_BOOTMEM_CMA flag together
with its hugetlb_bootmem_page_earlycma() helper, which becomes unused,
and keeps the bootmem metadata minimal.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/hugetlb.h | 1 -
 mm/hugetlb.c            | 9 ++-------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 18af8f304b95..82dbb9ebead8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -696,7 +696,6 @@ struct huge_bootmem_page {
 };
 
 #define HUGE_BOOTMEM_ZONES_VALID	BIT(0)
-#define HUGE_BOOTMEM_CMA		BIT(1)
 
 int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list);
 int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index abd79bb85b1c..74770c1648fc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3093,7 +3093,7 @@ static bool __init alloc_bootmem_huge_page(struct hstate *h, int nid)
 	 */
 	INIT_LIST_HEAD(&m->list);
 	m->hstate = h;
-	m->flags = hugetlb_early_cma(h) ? HUGE_BOOTMEM_CMA : 0;
+	m->flags = 0;
 
 	/* CMA pages: zone-crossing is validated in hugetlb_cma_reserve(). */
 	if (!hugetlb_early_cma(h) &&
@@ -3169,11 +3169,6 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
 	prep_compound_head(&folio->page, huge_page_order(h));
 }
 
-static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
-{
-	return m->flags & HUGE_BOOTMEM_CMA;
-}
-
 /*
  * memblock-allocated pageblocks might not have the migrate type set
  * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE)
@@ -3265,7 +3260,7 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid)
 			section_set_order_range(folio_pfn(folio), folio_nr_pages(folio), 0);
 		}
 
-		if (hugetlb_bootmem_page_earlycma(m))
+		if (hugetlb_early_cma(h))
 			folio_set_hugetlb_cma(folio);
 
 		list_add(&folio->lru, &folio_list);
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 37/69] mm/sparse-vmemmap: Factor out shared vmemmap page allocation
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

HugeTLB and sparse-vmemmap each have their own helper to allocate the
shared tail page used by vmemmap optimization.

Factor that logic into a common vmemmap_shared_tail_page() helper in
sparse-vmemmap.c.  It allocates the page through
vmemmap_alloc_block_zero(), initializes the tail struct pages, and uses
cmpxchg() to install the per-zone shared page.

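This removes duplicate allocation logic while still handling both the
early boot and runtime paths through the same helper.  Since the helper
is now also reached from the HugeTLB runtime path, vmemmap_alloc_block()
and vmemmap_alloc_block_zero() lose their __meminit annotations, and a
page that loses the cmpxchg() race is released with __free_page() or
memblock_free() depending on slab_is_available().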

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/mm.h   |  1 +
 mm/hugetlb_vmemmap.c | 28 +-----------------
 mm/sparse-vmemmap.c  | 67 ++++++++++++++++++--------------------------
 3 files changed, 29 insertions(+), 67 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fef39be8acd2..5281f073230c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4866,6 +4866,7 @@ int vmemmap_populate(unsigned long start, unsigned long end, int node,
 void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
 			  unsigned long headsize);
 void vmemmap_populate_print_last(void);
+struct page *vmemmap_shared_tail_page(unsigned int order, struct zone *zone);
 #ifdef CONFIG_MEMORY_HOTPLUG
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 66362e553870..d24143dd6051 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -499,32 +499,6 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *
 	return vmemmap_should_optimize(h);
 }
 
-static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone)
-{
-	const unsigned int idx = order - OPTIMIZABLE_FOLIO_MIN_ORDER;
-	struct page *tail, *p;
-	int node = zone_to_nid(zone);
-
-	tail = READ_ONCE(zone->vmemmap_tails[idx]);
-	if (likely(tail))
-		return tail;
-
-	tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
-	if (!tail)
-		return NULL;
-
-	p = page_to_virt(tail);
-	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
-		init_compound_tail(p + i, NULL, order, zone);
-
-	if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) {
-		__free_page(tail);
-		tail = READ_ONCE(zone->vmemmap_tails[idx]);
-	}
-
-	return tail;
-}
-
 static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 					    struct folio *folio,
 					    struct list_head *vmemmap_pages,
@@ -541,7 +515,7 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 		return ret;
 
 	nid = folio_nid(folio);
-	vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio));
+	vmemmap_tail = vmemmap_shared_tail_page(h->order, folio_zone(folio));
 	if (!vmemmap_tail)
 		return -ENOMEM;
 
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index dde4486195ad..53a341fcde74 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -34,27 +34,13 @@
 
 #include "internal.h"
 
-/*
- * Allocate a block of memory to be used to back the virtual memory map
- * or to back the page tables that are used to create the mapping.
- * Uses the main allocators if they are available, else bootmem.
- */
-
-static void * __ref __earlyonly_bootmem_alloc(int node,
-				unsigned long size,
-				unsigned long align,
-				unsigned long goal)
-{
-	return memmap_alloc(size, align, goal, node, false);
-}
-
-void * __meminit vmemmap_alloc_block(unsigned long size, int node)
+void __ref *vmemmap_alloc_block(unsigned long size, int node)
 {
 	/* If the main allocator is up use that, fallback to bootmem. */
 	if (slab_is_available()) {
 		gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
 		int order = get_order(size);
-		static bool warned __meminitdata;
+		static bool warned;
 		struct page *page;
 
 		page = alloc_pages_node(node, gfp_mask, order);
@@ -68,8 +54,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
 		}
 		return NULL;
 	} else
-		return __earlyonly_bootmem_alloc(node, size, size,
-				__pa(MAX_DMA_ADDRESS));
+		return memmap_alloc(size, size, __pa(MAX_DMA_ADDRESS), node, false);
 }
 
 static void * __meminit altmap_alloc_block_buf(unsigned long size,
@@ -138,8 +123,6 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
 			start, end - 1);
 }
 
-static __meminit struct page *vmemmap_get_tail(unsigned int order, struct zone *zone);
-
 static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
 					      struct vmem_altmap *altmap,
 					      unsigned long ptpfn)
@@ -158,7 +141,7 @@ static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, in
 
 			if (WARN_ON_ONCE(!zone))
 				return NULL;
-			page = vmemmap_get_tail(section_order(ms), zone);
+			page = vmemmap_shared_tail_page(section_order(ms), zone);
 			if (!page)
 				return NULL;
 			ptpfn = page_to_pfn(page);
@@ -190,7 +173,7 @@ static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, in
 	return pte;
 }
 
-static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
+static void *vmemmap_alloc_block_zero(unsigned long size, int node)
 {
 	void *p = vmemmap_alloc_block(size, node);
 
@@ -329,32 +312,36 @@ void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
 	}
 }
 
-static __meminit struct page *vmemmap_get_tail(unsigned int order, struct zone *zone)
+struct page __ref *vmemmap_shared_tail_page(unsigned int order, struct zone *zone)
 {
-	struct page *p, *tail;
-	unsigned int idx;
-	int node = zone_to_nid(zone);
+	void *addr;
+	struct page *page;
+	const unsigned int idx = order - OPTIMIZABLE_FOLIO_MIN_ORDER;
 
-	if (WARN_ON_ONCE(order < OPTIMIZABLE_FOLIO_MIN_ORDER))
-		return NULL;
-	if (WARN_ON_ONCE(order > MAX_FOLIO_ORDER))
+	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(zone->vmemmap_tails)))
 		return NULL;
 
-	idx = order - OPTIMIZABLE_FOLIO_MIN_ORDER;
-	tail = zone->vmemmap_tails[idx];
-	if (tail)
-		return tail;
+	page = READ_ONCE(zone->vmemmap_tails[idx]);
+	if (likely(page))
+		return page;
 
-	p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
-	if (!p)
+	addr = vmemmap_alloc_block_zero(PAGE_SIZE, zone_to_nid(zone));
+	if (!addr)
 		return NULL;
-	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
-		init_compound_tail(p + i, NULL, order, zone);
 
-	tail = virt_to_page(p);
-	zone->vmemmap_tails[idx] = tail;
+	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
+		init_compound_tail((struct page *)addr + i, NULL, order, zone);
+
+	page = virt_to_page(addr);
+	if (cmpxchg(&zone->vmemmap_tails[idx], NULL, page) != NULL) {
+		if (slab_is_available())
+			__free_page(page);
+		else
+			memblock_free(page_to_virt(page), PAGE_SIZE);
+		page = READ_ONCE(zone->vmemmap_tails[idx]);
+	}
 
-	return tail;
+	return page;
 }
 
 void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 38/69] mm/sparse-vmemmap: Introduce CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

The generic sparse-vmemmap optimization code is still guarded by
CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, even though it is no longer
HugeTLB-specific.

Introduce CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION to represent the common
vmemmap optimization infrastructure.  Have HugeTLB and DAX select it,
and use it to guard generic optimization code.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/x86/entry/vdso/vdso32/fake_32bit_build.h |  2 +-
 drivers/dax/Kconfig                           |  1 +
 fs/Kconfig                                    |  1 +
 include/linux/mmzone.h                        | 33 ++++++++++---------
 include/linux/page-flags.h                    |  5 +--
 mm/Kconfig                                    |  4 +++
 6 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/vdso/vdso32/fake_32bit_build.h b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
index bc3e549795c3..5f8424eade2b 100644
--- a/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
+++ b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
@@ -11,7 +11,7 @@
 #undef CONFIG_PGTABLE_LEVELS
 #undef CONFIG_ILLEGAL_POINTER_VALUE
 #undef CONFIG_SPARSEMEM_VMEMMAP
-#undef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+#undef CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION
 #undef CONFIG_NR_CPUS
 #undef CONFIG_PARAVIRT_XXL
 
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 602f9a0839a9..60cb05dce53d 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -8,6 +8,7 @@ if DAX
 config DEV_DAX
 	tristate "Device DAX: direct access mapping device"
 	depends on TRANSPARENT_HUGEPAGE
+	select SPARSEMEM_VMEMMAP_OPTIMIZATION if ARCH_WANT_OPTIMIZE_DAX_VMEMMAP && SPARSEMEM_VMEMMAP
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
diff --git a/fs/Kconfig b/fs/Kconfig
index ccb9dd480523..f6cee1bbb1fc 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -278,6 +278,7 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	def_bool HUGETLB_PAGE
 	depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
+	select SPARSEMEM_VMEMMAP_OPTIMIZATION
 
 config HUGETLB_PMD_PAGE_TABLE_SHARING
 	def_bool HUGETLB_PAGE
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9b87d798a365..5285d53b0c53 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -102,9 +102,9 @@
  *
  * HVO which is only active if the size of struct page is a power of 2.
  */
-#define MAX_FOLIO_VMEMMAP_ALIGN \
-	(IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) && \
-	 is_power_of_2(sizeof(struct page)) ? \
+#define MAX_FOLIO_VMEMMAP_ALIGN					\
+	(IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION) &&	\
+	 is_power_of_2(sizeof(struct page)) ?			\
 	 MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0)
 
 /* The number of vmemmap pages required by a vmemmap-optimized folio. */
@@ -115,7 +115,8 @@
 
 #define __NR_OPTIMIZABLE_FOLIO_ORDERS		(MAX_FOLIO_ORDER - OPTIMIZABLE_FOLIO_MIN_ORDER + 1)
 #define NR_OPTIMIZABLE_FOLIO_ORDERS		\
-	(__NR_OPTIMIZABLE_FOLIO_ORDERS > 0 ? __NR_OPTIMIZABLE_FOLIO_ORDERS : 0)
+	((__NR_OPTIMIZABLE_FOLIO_ORDERS > 0 &&	\
+	  IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION)) ? __NR_OPTIMIZABLE_FOLIO_ORDERS : 0)
 
 static inline bool order_vmemmap_optimizable(unsigned int order)
 {
@@ -2033,7 +2034,7 @@ struct mem_section {
 	 */
 	struct page_ext *page_ext;
 #endif
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION
 	/*
 	 * The order of compound pages in this section. Typically, the section
 	 * holds compound pages of this order; a larger compound page will span
@@ -2213,7 +2214,19 @@ static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long
 	*pfn = (*pfn & PAGE_SECTION_MASK) + (bit * PAGES_PER_SUBSECTION);
 	return true;
 }
+#else
+static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
+{
+	return 1;
+}
+
+static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn)
+{
+	return true;
+}
+#endif
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION
 static inline void section_set_order(struct mem_section *section, unsigned int order)
 {
 	VM_WARN_ON(section->order && order && section->order != order);
@@ -2225,16 +2238,6 @@ static inline unsigned int section_order(const struct mem_section *section)
 	return section->order;
 }
 #else
-static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
-{
-	return 1;
-}
-
-static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn)
-{
-	return true;
-}
-
 static inline void section_set_order(struct mem_section *section, unsigned int order)
 {
 }
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0e03d816e8b9..12665b34586c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -208,14 +208,11 @@ enum pageflags {
 static __always_inline bool compound_info_has_mask(void)
 {
 	/*
-	 * Limit mask usage to HugeTLB vmemmap optimization (HVO) where it
-	 * makes a difference.
-	 *
 	 * The approach with mask would work in the wider set of conditions,
 	 * but it requires validating that struct pages are naturally aligned
 	 * for all orders up to the MAX_FOLIO_ORDER, which can be tricky.
 	 */
-	if (!IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP))
+	if (!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION))
 		return false;
 
 	return is_power_of_2(sizeof(struct page));
diff --git a/mm/Kconfig b/mm/Kconfig
index c26d2d2050d5..ddd10cb4d0a3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -410,6 +410,10 @@ config SPARSEMEM_VMEMMAP
 	  pfn_to_page and page_to_pfn operations.  This is the most
 	  efficient option when sufficient kernel resources are available.
 
+config SPARSEMEM_VMEMMAP_OPTIMIZATION
+	bool
+	depends on SPARSEMEM_VMEMMAP
+
 #
 # Select this config option from the architecture Kconfig, if it is preferred
 # to enable the feature of HugeTLB/dev_dax vmemmap optimization.
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 39/69] mm/sparse-vmemmap: Switch DAX to vmemmap_shared_tail_page()
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

DAX compound vmemmap population still has its own way to find a reusable
tail page by walking the previous section's PTEs.

Switch it to the common vmemmap_shared_tail_page() helper instead, so
DAX uses the same per-zone shared tail page as the other vmemmap
optimization users.  This removes the PTE walk and lets both the section
reuse path and the populate path use the same shared page directly.

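When the target zone is ZONE_DEVICE, mark the shared tail page entries
PG_reserved as well, so they match the initialization requirements for
device pages.

Because the shared tail page now backs several vmemmap ranges, also skip
page_init_poison() for ZONE_DEVICE in remove_pfn_range_from_zone() (a
temporary workaround that will be removed later) and for
vmemmap-optimizable ranges in sparse_add_section(), so that the shared
page is not overwritten.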

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/mmzone.h | 10 +++++++++
 mm/memory_hotplug.c    |  9 ++++++--
 mm/sparse-vmemmap.c    | 48 ++++++++++++++----------------------------
 3 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5285d53b0c53..7484e7be7b6d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1693,11 +1693,21 @@ static inline bool zone_is_zone_device(const struct zone *zone)
 {
 	return zone_idx(zone) == ZONE_DEVICE;
 }
+
+static inline struct zone *device_zone(int nid)
+{
+	return &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+}
 #else
 static inline bool zone_is_zone_device(const struct zone *zone)
 {
 	return false;
 }
+
+static inline struct zone *device_zone(int nid)
+{
+	return NULL;
+}
 #endif
 
 /*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 462d8dcd636d..9ff830703785 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -551,8 +551,13 @@ void remove_pfn_range_from_zone(struct zone *zone,
 		/* Select all remaining pages up to the next section boundary */
 		cur_nr_pages =
 			min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
-		page_init_poison(pfn_to_page(pfn),
-				 sizeof(struct page) * cur_nr_pages);
+		/*
+		 * This is a temporary workaround to prevent the shared vmemmap
+		 * page from being overwritten; it will be removed later.
+		 */
+		if (!zone_is_zone_device(zone))
+			page_init_poison(pfn_to_page(pfn),
+					 sizeof(struct page) * cur_nr_pages);
 	}
 
 	/*
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 53a341fcde74..0c0b54e94c07 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -329,8 +329,12 @@ struct page __ref *vmemmap_shared_tail_page(unsigned int order, struct zone *zon
 	if (!addr)
 		return NULL;
 
-	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
-		init_compound_tail((struct page *)addr + i, NULL, order, zone);
+	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++) {
+		page = (struct page *)addr + i;
+		if (zone_is_zone_device(zone))
+			__SetPageReserved(page);
+		init_compound_tail(page, NULL, order, zone);
+	}
 
 	page = virt_to_page(addr);
 	if (cmpxchg(&zone->vmemmap_tails[idx], NULL, page) != NULL) {
@@ -442,23 +446,6 @@ static bool __meminit reuse_compound_section(unsigned long start_pfn,
 	return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
 }
 
-static pte_t * __meminit compound_section_tail_page(unsigned long addr)
-{
-	pte_t *pte;
-
-	addr -= PAGE_SIZE;
-
-	/*
-	 * Assuming sections are populated sequentially, the previous section's
-	 * page data can be reused.
-	 */
-	pte = pte_offset_kernel(pmd_off_k(addr), addr);
-	if (!pte)
-		return NULL;
-
-	return pte;
-}
-
 static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 						     unsigned long start,
 						     unsigned long end, int node,
@@ -467,19 +454,15 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 	unsigned long size, addr;
 	pte_t *pte;
 	int rc;
+	struct page *page;
 
-	if (reuse_compound_section(start_pfn, pgmap)) {
-		pte = compound_section_tail_page(start);
-		if (!pte)
-			return -ENOMEM;
+	page = vmemmap_shared_tail_page(pgmap->vmemmap_shift, device_zone(node));
+	if (!page)
+		return -ENOMEM;
 
-		/*
-		 * Reuse the page that was populated in the prior iteration
-		 * with just tail struct pages.
-		 */
+	if (reuse_compound_section(start_pfn, pgmap))
 		return vmemmap_populate_range(start, end, node, NULL,
-					      pte_pfn(ptep_get(pte)));
-	}
+					      page_to_pfn(page));
 
 	size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
 	for (addr = start; addr < end; addr += size) {
@@ -497,12 +480,12 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 			return -ENOMEM;
 
 		/*
-		 * Reuse the previous page for the rest of tail pages
+		 * Reuse the shared page for the rest of tail pages
 		 * See layout diagram in Documentation/mm/vmemmap_dedup.rst
 		 */
 		next += PAGE_SIZE;
 		rc = vmemmap_populate_range(next, last, node, NULL,
-					    pte_pfn(ptep_get(pte)));
+					    page_to_pfn(page));
 		if (rc)
 			return -ENOMEM;
 	}
@@ -828,7 +811,8 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 	 * Poison uninitialized struct pages in order to catch invalid flags
 	 * combinations.
 	 */
-	page_init_poison(memmap, sizeof(struct page) * nr_pages);
+	if (!vmemmap_can_optimize(altmap, pgmap))
+		page_init_poison(memmap, sizeof(struct page) * nr_pages);
 
 	ms = __nr_to_section(section_nr);
 	__section_mark_present(ms, section_nr);
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 40/69] powerpc/mm: Switch DAX to vmemmap_shared_tail_page()
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

powerpc compound vmemmap population still finds a reusable tail page by
walking the vmemmap page tables.

Switch it to the common vmemmap_shared_tail_page() helper instead, so it
can use the shared tail page directly without probing or populating
neighboring mappings.

This removes the powerpc-specific tail-page lookup and its fallback path
and aligns the radix vmemmap optimization path with the generic shared
tail-page scheme.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 76 ++----------------------
 1 file changed, 6 insertions(+), 70 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index cf692b2b5f7b..95e65ac8cdea 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1250,59 +1250,6 @@ static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int
 	return pte;
 }
 
-static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
-						    unsigned long pfn_offset, int node)
-{
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long map_addr;
-
-	/* the second vmemmap page which we use for duplication */
-	map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
-	pgd = pgd_offset_k(map_addr);
-	p4d = p4d_offset(pgd, map_addr);
-	pud = vmemmap_pud_alloc(p4d, node, map_addr);
-	if (!pud)
-		return NULL;
-	pmd = vmemmap_pmd_alloc(pud, node, map_addr);
-	if (!pmd)
-		return NULL;
-	if (pmd_leaf(*pmd))
-		/*
-		 * The second page is mapped as a hugepage due to a nearby request.
-		 * Force our mapping to page size without deduplication
-		 */
-		return NULL;
-	pte = vmemmap_pte_alloc(pmd, node, map_addr);
-	if (!pte)
-		return NULL;
-	/*
-	 * Check if there exist a mapping to the left
-	 */
-	if (pte_none(*pte)) {
-		/*
-		 * Populate the head page vmemmap page.
-		 * It can fall in different pmd, hence
-		 * vmemmap_populate_address()
-		 */
-		pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
-		if (!pte)
-			return NULL;
-		/*
-		 * Populate the tail pages vmemmap page
-		 */
-		pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
-		if (!pte)
-			return NULL;
-		vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
-		return pte;
-	}
-	return pte;
-}
-
 int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 					      unsigned long start,
 					      unsigned long end, int node,
@@ -1320,6 +1267,11 @@ int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	struct page *tail_page;
+
+	tail_page = vmemmap_shared_tail_page(pgmap->vmemmap_shift, device_zone(node));
+	if (!tail_page)
+		return -ENOMEM;
 
 	for (addr = start; addr < end; addr = next) {
 
@@ -1352,7 +1304,6 @@ int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 			unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
 			unsigned long addr_pfn = page_to_pfn((struct page *)addr);
 			unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
-			pte_t *tail_page_pte;
 
 			/*
 			 * if the address is aligned to huge page size it is the
@@ -1377,23 +1328,8 @@ int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 				next = addr + 2 * PAGE_SIZE;
 				continue;
 			}
-			/*
-			 * get the 2nd mapping details
-			 * Also create it if that doesn't exist
-			 */
-			tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
-			if (!tail_page_pte) {
-
-				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
-				if (!pte)
-					return -ENOMEM;
-				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
-
-				next = addr + PAGE_SIZE;
-				continue;
-			}
 
-			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
+			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, tail_page);
 			if (!pte)
 				return -ENOMEM;
 			vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 41/69] mm/sparse-vmemmap: Drop the extra tail page from DAX reservation
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

DAX compound vmemmap population still reserves one extra tail vmemmap
page after the head page, and only maps the remaining tail pages
through the shared tail page.

Drop that extra reservation and let the shared tail page cover all tail
vmemmap pages after the head page, so DAX follows the same reservation
model as HugeTLB.

This reduces the reserved vmemmap pages for optimized DAX mappings to
OPTIMIZED_FOLIO_VMEMMAP_PAGES and removes the now-unneeded first-tail
population from the generic and powerpc paths.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 44 +-----------------------
 include/linux/mm.h                       |  2 +-
 mm/sparse-vmemmap.c                      |  8 +----
 3 files changed, 3 insertions(+), 51 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 95e65ac8cdea..fb8738016b30 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1217,39 +1217,6 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
 	return 0;
 }
 
-static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
-							 struct vmem_altmap *altmap,
-							 struct page *reuse)
-{
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = pgd_offset_k(addr);
-	p4d = p4d_offset(pgd, addr);
-	pud = vmemmap_pud_alloc(p4d, node, addr);
-	if (!pud)
-		return NULL;
-	pmd = vmemmap_pmd_alloc(pud, node, addr);
-	if (!pmd)
-		return NULL;
-	if (pmd_leaf(*pmd))
-		/*
-		 * The second page is mapped as a hugepage due to a nearby request.
-		 * Force our mapping to page size without deduplication
-		 */
-		return NULL;
-	pte = vmemmap_pte_alloc(pmd, node, addr);
-	if (!pte)
-		return NULL;
-	radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
-	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
-
-	return pte;
-}
-
 int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 					      unsigned long start,
 					      unsigned long end, int node,
@@ -1316,16 +1283,7 @@ int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 					return -ENOMEM;
 				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
 
-				/*
-				 * Populate the tail pages vmemmap page
-				 * It can fall in different pmd, hence
-				 * vmemmap_populate_address()
-				 */
-				pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
-				if (!pte)
-					return -ENOMEM;
-
-				next = addr + 2 * PAGE_SIZE;
+				next = addr + PAGE_SIZE;
 				continue;
 			}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5281f073230c..86d7cecb834e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4898,7 +4898,7 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
 }
 #endif
 
-#define VMEMMAP_RESERVE_NR	2
+#define VMEMMAP_RESERVE_NR	OPTIMIZED_FOLIO_VMEMMAP_PAGES
 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
 static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
 					  struct dev_pagemap *pgmap)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 0c0b54e94c07..b5c109b8af6f 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -473,17 +473,11 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 		if (!pte)
 			return -ENOMEM;
 
-		/* Populate the tail pages vmemmap page */
-		next = addr + PAGE_SIZE;
-		pte = vmemmap_populate_address(next, node, NULL, -1);
-		if (!pte)
-			return -ENOMEM;
-
 		/*
 		 * Reuse the shared page for the rest of tail pages
 		 * See layout diagram in Documentation/mm/vmemmap_dedup.rst
 		 */
-		next += PAGE_SIZE;
+		next = addr + PAGE_SIZE;
 		rc = vmemmap_populate_range(next, last, node, NULL,
 					    page_to_pfn(page));
 		if (rc)
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 42/69] mm/sparse-vmemmap: Switch DAX to section-based vmemmap optimization
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

DAX vmemmap optimization still uses pgmap-specific state to decide
whether a section should use the optimized layout.

Switch DAX to the compound page order recorded in struct mem_section, so
it follows the same section-based optimization state as the rest of
sparse-vmemmap.

This lets the DAX population, initialization, and teardown paths make
their optimization decisions from the section metadata instead of
carrying separate pgmap-specific state.

This makes DAX vmemmap optimization section-granular. Only
section-aligned ranges record a compound page order, so subsection
mappings remain unoptimized. The resulting loss of vmemmap savings
is negligible.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/powerpc/mm/book3s64/radix_pgtable.c |  5 +++--
 mm/memory_hotplug.c                      |  6 +-----
 mm/mm_init.c                             | 13 ++++---------
 mm/sparse-vmemmap.c                      | 24 ++++++++++++++++++------
 mm/sparse.c                              |  2 +-
 5 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index fb8738016b30..f0043c57694e 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1235,8 +1235,9 @@ int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 	pmd_t *pmd;
 	pte_t *pte;
 	struct page *tail_page;
+	const struct mem_section *ms = __pfn_to_section(start_pfn);
 
-	tail_page = vmemmap_shared_tail_page(pgmap->vmemmap_shift, device_zone(node));
+	tail_page = vmemmap_shared_tail_page(section_order(ms), device_zone(node));
 	if (!tail_page)
 		return -ENOMEM;
 
@@ -1268,7 +1269,7 @@ int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 			next = addr + PAGE_SIZE;
 			continue;
 		} else {
-			unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+			unsigned long nr_pages = 1UL << section_order(ms);
 			unsigned long addr_pfn = page_to_pfn((struct page *)addr);
 			unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9ff830703785..c9c69f827efa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -551,11 +551,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
 		/* Select all remaining pages up to the next section boundary */
 		cur_nr_pages =
 			min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
-		/*
-		 * This is a temporary workaround to prevent the shared vmemmap
-		 * page from being overwritten; it will be removed later.
-		 */
-		if (!zone_is_zone_device(zone))
+		if (!section_vmemmap_optimizable(__pfn_to_section(pfn)))
 			page_init_poison(pfn_to_page(pfn),
 					 sizeof(struct page) * cur_nr_pages);
 	}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 35c99e5c215c..2b94115e6dd5 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1071,16 +1071,11 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
  * of an altmap. See vmemmap_populate_compound_pages().
  */
 static inline unsigned long compound_nr_pages(unsigned long pfn,
-					      struct vmem_altmap *altmap,
 					      struct dev_pagemap *pgmap)
 {
-	/*
-	 * If DAX memory is hot-plugged into an unoccupied subsection
-	 * of an early section, the unoptimized boot memmap is reused.
-	 * See section_activate().
-	 */
-	if (early_section(__pfn_to_section(pfn)) ||
-	    !vmemmap_can_optimize(altmap, pgmap))
+	const struct mem_section *ms = __pfn_to_section(pfn);
+
+	if (!section_vmemmap_optimizable(ms))
 		return pgmap_vmemmap_nr(pgmap);
 
 	return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
@@ -1150,7 +1145,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 			continue;
 
 		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
-				     compound_nr_pages(pfn, altmap, pgmap));
+				     compound_nr_pages(pfn, pgmap));
 	}
 
 	pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE, false, false);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index b5c109b8af6f..ad3e5b54abf7 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -455,8 +455,9 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 	pte_t *pte;
 	int rc;
 	struct page *page;
+	const struct mem_section *ms = __pfn_to_section(start_pfn);
 
-	page = vmemmap_shared_tail_page(pgmap->vmemmap_shift, device_zone(node));
+	page = vmemmap_shared_tail_page(section_order(ms), device_zone(node));
 	if (!page)
 		return -ENOMEM;
 
@@ -464,7 +465,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 		return vmemmap_populate_range(start, end, node, NULL,
 					      page_to_pfn(page));
 
-	size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
+	size = min(end - start, (1UL << section_order(ms)) * sizeof(struct page));
 	for (addr = start; addr < end; addr += size) {
 		unsigned long next, last = addr + size;
 
@@ -501,7 +502,9 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
 		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
 		return NULL;
 
-	if (vmemmap_can_optimize(altmap, pgmap))
+	/* This may occur in sub-section scenarios. */
+	if (vmemmap_can_optimize(altmap, pgmap) &&
+	    section_vmemmap_optimizable(__pfn_to_section(pfn)))
 		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
 	else
 		r = vmemmap_populate(start, end, nid, altmap);
@@ -718,8 +721,10 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 	else if (memmap)
 		free_map_bootmem(memmap);
 
-	if (empty)
+	if (empty) {
 		ms->section_mem_map = (unsigned long)NULL;
+		section_set_order(ms, 0);
+	}
 }
 
 static struct page * __meminit section_activate(int nid, unsigned long pfn,
@@ -729,8 +734,14 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
 	struct mem_section *ms = __pfn_to_section(pfn);
 	struct mem_section_usage *usage = NULL;
 	struct page *memmap;
+	unsigned int order;
 	int rc;
 
+	order = vmemmap_can_optimize(altmap, pgmap) ? pgmap->vmemmap_shift : 0;
+	/* All sub-sections within a section must share the same order. */
+	if (nr_pages < PAGES_PER_SECTION && section_order(ms) && section_order(ms) != order)
+		return ERR_PTR(-ENOTSUPP);
+
 	if (!ms->usage) {
 		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
 		if (!usage)
@@ -756,6 +767,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
 	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
 		return pfn_to_page(pfn);
 
+	section_set_order_range(pfn, nr_pages, order);
 	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
 	if (!memmap) {
 		section_deactivate(pfn, nr_pages, altmap, pgmap);
@@ -801,14 +813,14 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 	if (IS_ERR(memmap))
 		return PTR_ERR(memmap);
 
+	ms = __nr_to_section(section_nr);
 	/*
 	 * Poison uninitialized struct pages in order to catch invalid flags
 	 * combinations.
 	 */
-	if (!vmemmap_can_optimize(altmap, pgmap))
+	if (!section_vmemmap_optimizable(ms))
 		page_init_poison(memmap, sizeof(struct page) * nr_pages);
 
-	ms = __nr_to_section(section_nr);
 	__section_mark_present(ms, section_nr);
 
 	/* Align memmap to section boundary in the subsection case */
diff --git a/mm/sparse.c b/mm/sparse.c
index 54c38ea08190..6878f8941b4c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -251,7 +251,7 @@ int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages
 	if (vmemmap_can_optimize(altmap, pgmap))
 		vmemmap_pages = VMEMMAP_RESERVE_NR;
 
-	if (!vmemmap_can_optimize(altmap, pgmap) && !section_vmemmap_optimizable(ms))
+	if (!section_vmemmap_optimizable(ms))
 		return DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE);
 
 	if (order < PFN_SECTION_SHIFT) {
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 43/69] mm/sparse-vmemmap: Unify DAX and HugeTLB population paths
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

Now that DAX and HugeTLB use the same optimized vmemmap layout, they no
longer need separate population flows.

Move the shared-tail-page handling into vmemmap_pte_populate() so both
users can go through the normal basepage population path.  This removes
the compound-page-specific population helper and leaves the optimized
mapping decisions in one place.

At runtime, the optimized users are limited to ZONE_DEVICE memory, so
use device_zone() for shared-tail-page allocation instead of relying on
pfn_to_zone() before zone spans are available.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/powerpc/mm/book3s64/radix_pgtable.c |   3 +
 mm/mm_init.c                             |   2 +-
 mm/sparse-vmemmap.c                      | 183 ++++++-----------------
 3 files changed, 50 insertions(+), 138 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index f0043c57694e..c7f2327681cc 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1121,7 +1121,10 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	unsigned long pfn = page_to_pfn((struct page *)start);
 
+	if (section_vmemmap_optimizable(__pfn_to_section(pfn)))
+		return vmemmap_populate_compound_pages(pfn, start, end, node, NULL);
 	/*
 	 * If altmap is present, Make sure we align the start vmemmap addr
 	 * to PAGE_SIZE so that we calculate the correct start_pfn in
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2b94115e6dd5..9ff118e35641 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1068,7 +1068,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
  * initialize is a lot smaller that the total amount of struct pages being
  * mapped. This is a paired / mild layering violation with explicit knowledge
  * of how the sparse_vmemmap internals handle compound pages in the lack
- * of an altmap. See vmemmap_populate_compound_pages().
+ * of an altmap.
  */
 static inline unsigned long compound_nr_pages(unsigned long pfn,
 					      struct dev_pagemap *pgmap)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index ad3e5b54abf7..4833a2295abb 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -127,49 +127,48 @@ static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, in
 					      struct vmem_altmap *altmap,
 					      unsigned long ptpfn)
 {
-	pte_t *pte = pte_offset_kernel(pmd, addr);
-
-	if (pte_none(ptep_get(pte))) {
-		pte_t entry;
-
-		if (vmemmap_page_optimizable((struct page *)addr) &&
-		    ptpfn == (unsigned long)-1) {
-			struct page *page;
-			unsigned long pfn = page_to_pfn((struct page *)addr);
-			const struct mem_section *ms = __pfn_to_section(pfn);
-			struct zone *zone = pfn_to_zone(pfn, node);
-
-			if (WARN_ON_ONCE(!zone))
-				return NULL;
-			page = vmemmap_shared_tail_page(section_order(ms), zone);
-			if (!page)
-				return NULL;
-			ptpfn = page_to_pfn(page);
-		}
+	pte_t entry, *pte = pte_offset_kernel(pmd, addr);
+	struct page *page = (struct page *)addr;
+
+	if (!pte_none(ptep_get(pte)))
+		return WARN_ON_ONCE(vmemmap_page_optimizable(page)) ? NULL : pte;
+
+	/* See layout diagram in Documentation/mm/vmemmap_dedup.rst. */
+	if (vmemmap_page_optimizable(page)) {
+		struct zone *zone;
+		unsigned long pfn = page_to_pfn(page);
+
+		/*
+		 * At runtime (slab available), only ZONE_DEVICE pages (DAX)
+		 * trigger vmemmap optimization, so device_zone() suffices.
+		 * Note: pfn_to_zone() cannot be used at runtime because the
+		 * zone span is not set up now.
+		 */
+		zone = slab_is_available() ? device_zone(node) : pfn_to_zone(pfn, node);
+		if (WARN_ON_ONCE(!zone))
+			return NULL;
+		page = vmemmap_shared_tail_page(pfn_to_section_order(pfn), zone);
+		if (!page)
+			return NULL;
+
+		/*
+		 * When a PTE entry is freed, a free_pages() call occurs. This
+		 * get_page() pairs with put_page_testzero() on the freeing
+		 * path. This can only occur when slab is available.
+		 */
+		if (slab_is_available())
+			get_page(page);
+		ptpfn = page_to_pfn(page);
+	} else {
+		void *vaddr = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+
+		if (!vaddr)
+			return NULL;
+		ptpfn = PHYS_PFN(__pa(vaddr));
+	}
+	entry = pfn_pte(ptpfn, PAGE_KERNEL);
+	set_pte_at(&init_mm, addr, pte, entry);
 
-		if (ptpfn == (unsigned long)-1) {
-			void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
-
-			if (!p)
-				return NULL;
-			ptpfn = PHYS_PFN(__pa(p));
-		} else {
-			/*
-			 * When a PTE/PMD entry is freed from the init_mm
-			 * there's a free_pages() call to this page allocated
-			 * above. Thus this get_page() is paired with the
-			 * put_page_testzero() on the freeing path.
-			 * This can only called by certain ZONE_DEVICE path,
-			 * and through vmemmap_populate_compound_pages() when
-			 * slab is available.
-			 */
-			if (slab_is_available())
-				get_page(pfn_to_page(ptpfn));
-		}
-		entry = pfn_pte(ptpfn, PAGE_KERNEL);
-		set_pte_at(&init_mm, addr, pte, entry);
-	} else if (WARN_ON_ONCE(vmemmap_page_optimizable((struct page *)addr)))
-		return NULL;
 	return pte;
 }
 
@@ -265,30 +264,16 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
 	return pte;
 }
 
-static int __meminit vmemmap_populate_range(unsigned long start,
-					    unsigned long end, int node,
-					    struct vmem_altmap *altmap,
-					    unsigned long ptpfn)
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+					 int node, struct vmem_altmap *altmap)
 {
-	unsigned long addr = start;
-	pte_t *pte;
-
-	for (; addr < end; addr += PAGE_SIZE) {
-		pte = vmemmap_populate_address(addr, node, altmap,
-					       ptpfn);
-		if (!pte)
+	for (; start < end; start += PAGE_SIZE)
+		if (!vmemmap_populate_address(start, node, altmap, -1))
 			return -ENOMEM;
-	}
 
 	return 0;
 }
 
-int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
-					 int node, struct vmem_altmap *altmap)
-{
-	return vmemmap_populate_range(start, end, node, altmap, -1);
-}
-
 /*
  * Write protect the mirrored tail page structs for HVO. This will be
  * called from the hugetlb code when gathering and initializing the
@@ -425,94 +410,18 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
 	return 0;
 }
 
-#ifndef vmemmap_populate_compound_pages
-/*
- * For compound pages bigger than section size (e.g. x86 1G compound
- * pages with 2M subsection size) fill the rest of sections as tail
- * pages.
- *
- * Note that memremap_pages() resets @nr_range value and will increment
- * it after each range successful onlining. Thus the value or @nr_range
- * at section memmap populate corresponds to the in-progress range
- * being onlined here.
- */
-static bool __meminit reuse_compound_section(unsigned long start_pfn,
-					     struct dev_pagemap *pgmap)
-{
-	unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
-	unsigned long offset = start_pfn -
-		PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
-
-	return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
-}
-
-static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
-						     unsigned long start,
-						     unsigned long end, int node,
-						     struct dev_pagemap *pgmap)
-{
-	unsigned long size, addr;
-	pte_t *pte;
-	int rc;
-	struct page *page;
-	const struct mem_section *ms = __pfn_to_section(start_pfn);
-
-	page = vmemmap_shared_tail_page(section_order(ms), device_zone(node));
-	if (!page)
-		return -ENOMEM;
-
-	if (reuse_compound_section(start_pfn, pgmap))
-		return vmemmap_populate_range(start, end, node, NULL,
-					      page_to_pfn(page));
-
-	size = min(end - start, (1UL << section_order(ms)) * sizeof(struct page));
-	for (addr = start; addr < end; addr += size) {
-		unsigned long next, last = addr + size;
-
-		/* Populate the head page vmemmap page */
-		pte = vmemmap_populate_address(addr, node, NULL, -1);
-		if (!pte)
-			return -ENOMEM;
-
-		/*
-		 * Reuse the shared page for the rest of tail pages
-		 * See layout diagram in Documentation/mm/vmemmap_dedup.rst
-		 */
-		next = addr + PAGE_SIZE;
-		rc = vmemmap_populate_range(next, last, node, NULL,
-					    page_to_pfn(page));
-		if (rc)
-			return -ENOMEM;
-	}
-
-	return 0;
-}
-
-#endif
-
 struct page * __meminit __populate_section_memmap(unsigned long pfn,
 		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
 		struct dev_pagemap *pgmap)
 {
 	unsigned long start = (unsigned long) pfn_to_page(pfn);
 	unsigned long end = start + nr_pages * sizeof(struct page);
-	int r;
 
 	if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
 		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
 		return NULL;
 
-	/* This may occur in sub-section scenarios. */
-	if (vmemmap_can_optimize(altmap, pgmap) &&
-	    section_vmemmap_optimizable(__pfn_to_section(pfn)))
-		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
-	else
-		r = vmemmap_populate(start, end, nid, altmap);
-
-	if (r < 0)
-		return NULL;
-
-	return pfn_to_page(pfn);
+	return vmemmap_populate(start, end, nid, altmap) ? NULL : (void *)start;
 }
 
 static void subsection_mask_set(unsigned long *map, unsigned long pfn,
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 44/69] mm/sparse-vmemmap: Remove the unused ptpfn argument
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

vmemmap_pte_populate() no longer uses ptpfn as an input.  It computes
the PFN locally in both cases before building the PTE.

Drop the argument and inline the PFN computation at the PTE creation
sites.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 mm/sparse-vmemmap.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 4833a2295abb..182d0c7dd1e7 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -124,8 +124,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
 }
 
 static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
-					      struct vmem_altmap *altmap,
-					      unsigned long ptpfn)
+					      struct vmem_altmap *altmap)
 {
 	pte_t entry, *pte = pte_offset_kernel(pmd, addr);
 	struct page *page = (struct page *)addr;
@@ -158,15 +157,15 @@ static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, in
 		 */
 		if (slab_is_available())
 			get_page(page);
-		ptpfn = page_to_pfn(page);
+
+		entry = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
 	} else {
 		void *vaddr = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
 
 		if (!vaddr)
 			return NULL;
-		ptpfn = PHYS_PFN(__pa(vaddr));
+		entry = pfn_pte(PHYS_PFN(__pa(vaddr)), PAGE_KERNEL);
 	}
-	entry = pfn_pte(ptpfn, PAGE_KERNEL);
 	set_pte_at(&init_mm, addr, pte, entry);
 
 	return pte;
@@ -235,8 +234,7 @@ static pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
 }
 
 static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
-					      struct vmem_altmap *altmap,
-					      unsigned long ptpfn)
+						  struct vmem_altmap *altmap)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -256,7 +254,7 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
 	pmd = vmemmap_pmd_populate(pud, addr, node);
 	if (!pmd)
 		return NULL;
-	pte = vmemmap_pte_populate(pmd, addr, node, altmap, ptpfn);
+	pte = vmemmap_pte_populate(pmd, addr, node, altmap);
 	if (!pte)
 		return NULL;
 	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -268,7 +266,7 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
 					 int node, struct vmem_altmap *altmap)
 {
 	for (; start < end; start += PAGE_SIZE)
-		if (!vmemmap_populate_address(start, node, altmap, -1))
+		if (!vmemmap_populate_address(start, node, altmap))
 			return -ENOMEM;
 
 	return 0;
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 45/69] powerpc/mm: Make vmemmap_populate_compound_pages() static
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

vmemmap_populate_compound_pages() is no longer used outside
radix_pgtable.c.

Make it static, drop its unused start_pfn and dev_pagemap arguments,
and update its only remaining caller accordingly.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h |  6 ------
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 14 +++++++-------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index da954e779744..8452a2714cb1 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -356,11 +356,5 @@ int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #define vmemmap_can_optimize vmemmap_can_optimize
 bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
 #endif
-
-#define vmemmap_populate_compound_pages vmemmap_populate_compound_pages
-int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
-					      unsigned long start,
-					      unsigned long end, int node,
-					      struct dev_pagemap *pgmap);
 #endif /* __ASSEMBLER__ */
 #endif
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index c7f2327681cc..18b24bb891b7 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1109,7 +1109,8 @@ static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
 	return pte_offset_kernel(pmdp, address);
 }
 
-
+static int __meminit vmemmap_populate_compound_pages(unsigned long start,
+						     unsigned long end, int node);
 
 int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
 				      struct vmem_altmap *altmap)
@@ -1124,7 +1125,7 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
 	unsigned long pfn = page_to_pfn((struct page *)start);
 
 	if (section_vmemmap_optimizable(__pfn_to_section(pfn)))
-		return vmemmap_populate_compound_pages(pfn, start, end, node, NULL);
+		return vmemmap_populate_compound_pages(start, end, node);
 	/*
 	 * If altmap is present, Make sure we align the start vmemmap addr
 	 * to PAGE_SIZE so that we calculate the correct start_pfn in
@@ -1220,10 +1221,8 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
 	return 0;
 }
 
-int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
-					      unsigned long start,
-					      unsigned long end, int node,
-					      struct dev_pagemap *pgmap)
+static int __meminit vmemmap_populate_compound_pages(unsigned long start,
+						     unsigned long end, int node)
 {
 	/*
 	 * we want to map things as base page size mapping so that
@@ -1238,8 +1237,9 @@ int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
 	pmd_t *pmd;
 	pte_t *pte;
 	struct page *tail_page;
-	const struct mem_section *ms = __pfn_to_section(start_pfn);
+	const struct mem_section *ms;
 
+	ms = __pfn_to_section(page_to_pfn((struct page *)start));
 	tail_page = vmemmap_shared_tail_page(section_order(ms), device_zone(node));
 	if (!tail_page)
 		return -ENOMEM;
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 46/69] mm/sparse-vmemmap: Map shared vmemmap tail pages read-only
From: Muchun Song @ 2026-05-13 13:05 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

Shared vmemmap tail pages are now installed through
vmemmap_pte_populate().

Map those shared pages with PAGE_KERNEL_RO so writes to shared tail
vmemmap entries fault immediately instead of silently corrupting shared
metadata.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 mm/sparse-vmemmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 182d0c7dd1e7..9811c92ad258 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -158,7 +158,8 @@ static pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, in
 		if (slab_is_available())
 			get_page(page);
 
-		entry = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
+		/* Map shared tail page read-only to catch illegal writes. */
+		entry = pfn_pte(page_to_pfn(page), PAGE_KERNEL_RO);
 	} else {
 		void *vaddr = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
 
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 47/69] powerpc/mm: Map shared vmemmap tail pages read-only
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513130542.35604-1-songmuchun@bytedance.com>

Shared vmemmap tail pages can also be installed through the powerpc
radix vmemmap populate path.

Map reused tail pages with PAGE_KERNEL_RO so writes to shared tail
vmemmap entries fault immediately instead of silently corrupting shared
metadata.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 18b24bb891b7..4c3d027c823c 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1053,7 +1053,8 @@ static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long
 		}
 
 		VM_BUG_ON(!PAGE_ALIGNED(addr));
-		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+		entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+				reuse ? PAGE_KERNEL_RO : PAGE_KERNEL);
 		set_pte_at(&init_mm, addr, pte, entry);
 		asm volatile("ptesync": : :"memory");
 	}
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 48/69] mm/sparse-vmemmap: Inline vmemmap_populate_address() into its caller
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

vmemmap_populate_address() no longer has any callers that need the
returned PTE.  Its only remaining user just checks whether the call
succeeded.

Inline it back into vmemmap_populate_basepages() and return -ENOMEM
directly on failure.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 mm/sparse-vmemmap.c | 46 +++++++++++++++++++--------------------------
 1 file changed, 19 insertions(+), 27 deletions(-)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 9811c92ad258..5d5cd5f73365 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -234,8 +234,8 @@ static pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
 	return pgd;
 }
 
-static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
-						  struct vmem_altmap *altmap)
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+					 int node, struct vmem_altmap *altmap)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -243,32 +243,24 @@ static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
 	pmd_t *pmd;
 	pte_t *pte;
 
-	pgd = vmemmap_pgd_populate(addr, node);
-	if (!pgd)
-		return NULL;
-	p4d = vmemmap_p4d_populate(pgd, addr, node);
-	if (!p4d)
-		return NULL;
-	pud = vmemmap_pud_populate(p4d, addr, node);
-	if (!pud)
-		return NULL;
-	pmd = vmemmap_pmd_populate(pud, addr, node);
-	if (!pmd)
-		return NULL;
-	pte = vmemmap_pte_populate(pmd, addr, node, altmap);
-	if (!pte)
-		return NULL;
-	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
-
-	return pte;
-}
-
-int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
-					 int node, struct vmem_altmap *altmap)
-{
-	for (; start < end; start += PAGE_SIZE)
-		if (!vmemmap_populate_address(start, node, altmap))
+	for (; start < end; start += PAGE_SIZE) {
+		pgd = vmemmap_pgd_populate(start, node);
+		if (!pgd)
+			return -ENOMEM;
+		p4d = vmemmap_p4d_populate(pgd, start, node);
+		if (!p4d)
 			return -ENOMEM;
+		pud = vmemmap_pud_populate(p4d, start, node);
+		if (!pud)
+			return -ENOMEM;
+		pmd = vmemmap_pmd_populate(pud, start, node);
+		if (!pmd)
+			return -ENOMEM;
+		pte = vmemmap_pte_populate(pmd, start, node, altmap);
+		if (!pte)
+			return -ENOMEM;
+		vmemmap_verify(pte, node, start, start + PAGE_SIZE);
+	}
 
 	return 0;
 }
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 49/69] mm/hugetlb_vmemmap: Remove vmemmap_wrprotect_hvo()
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

Shared vmemmap tail pages are now mapped read-only when their PTEs are
installed, so HugeTLB bootmem optimization no longer needs a separate
write-protect pass afterwards.

Remove vmemmap_wrprotect_hvo() and the bootmem-specific HugeTLB wrapper,
and let bootmem folios use the normal hugetlb_vmemmap_optimize_folios()
path.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/mm.h   |  2 --
 mm/hugetlb.c         |  2 +-
 mm/hugetlb_vmemmap.c | 45 +++++++++-----------------------------------
 mm/hugetlb_vmemmap.h |  6 ------
 mm/sparse-vmemmap.c  | 23 ----------------------
 5 files changed, 10 insertions(+), 68 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 86d7cecb834e..5e38c9a16a0a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4863,8 +4863,6 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
 			       int node, struct vmem_altmap *altmap);
 int vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap);
-void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
-			  unsigned long headsize);
 void vmemmap_populate_print_last(void);
 struct page *vmemmap_shared_tail_page(unsigned int order, struct zone *zone);
 #ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 74770c1648fc..54ef7d12c585 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3202,7 +3202,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
 	struct folio *folio, *tmp_f;
 
 	/* Send list for bulk vmemmap optimization processing */
-	hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list);
+	hugetlb_vmemmap_optimize_folios(h, folio_list);
 
 	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
 		if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index d24143dd6051..fce772e95adc 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -595,31 +595,22 @@ static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *fol
 	return vmemmap_remap_split(vmemmap_start, vmemmap_end);
 }
 
-static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
-					      struct list_head *folio_list,
-					      bool boot)
+void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
 {
 	struct folio *folio;
-	int nr_to_optimize;
+	unsigned long nr_to_optimize = 0;
 	LIST_HEAD(vmemmap_pages);
 	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH;
 
-	nr_to_optimize = 0;
 	list_for_each_entry(folio, folio_list, lru) {
 		int ret;
-		unsigned long spfn, epfn;
-
-		if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
-			/*
-			 * Already optimized by pre-HVO, just map the
-			 * mirrored tail page structs RO.
-			 */
-			spfn = (unsigned long)&folio->page;
-			epfn = spfn + hugetlb_vmemmap_size(h);
-			vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
-					OPTIMIZED_FOLIO_VMEMMAP_SIZE);
+
+		/*
+		 * Bootmem gigantic folios may already be marked optimized when
+		 * their vmemmap layout was prepared earlier, so skip them here.
+		 */
+		if (folio_test_hugetlb_vmemmap_optimized(folio))
 			continue;
-		}
 
 		nr_to_optimize++;
 
@@ -636,14 +627,7 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
 	}
 
 	if (!nr_to_optimize)
-		/*
-		 * All pre-HVO folios, nothing left to do. It's ok if
-		 * there is a mix of pre-HVO and not yet HVO-ed folios
-		 * here, as __hugetlb_vmemmap_optimize_folio() will
-		 * skip any folios that already have the optimized flag
-		 * set, see vmemmap_should_optimize_folio().
-		 */
-		goto out;
+		return;
 
 	flush_tlb_all();
 
@@ -668,21 +652,10 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
 		}
 	}
 
-out:
 	flush_tlb_all();
 	free_vmemmap_page_list(&vmemmap_pages);
 }
 
-void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
-{
-	__hugetlb_vmemmap_optimize_folios(h, folio_list, false);
-}
-
-void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
-{
-	__hugetlb_vmemmap_optimize_folios(h, folio_list, true);
-}
-
 void __init hugetlb_vmemmap_optimize_bootmem_page(struct huge_bootmem_page *m)
 {
 	struct hstate *h = m->hstate;
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 0d8c88997066..2b0a85e09602 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -17,7 +17,6 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h,
 					struct list_head *non_hvo_folios);
 void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
-void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list);
 void hugetlb_vmemmap_optimize_bootmem_page(struct huge_bootmem_page *m);
 
 static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
@@ -59,11 +58,6 @@ static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list
 {
 }
 
-static inline void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h,
-						struct list_head *folio_list)
-{
-}
-
 static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
 {
 	return 0;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 5d5cd5f73365..ce1cf5cdf613 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -265,29 +265,6 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
 	return 0;
 }
 
-/*
- * Write protect the mirrored tail page structs for HVO. This will be
- * called from the hugetlb code when gathering and initializing the
- * memblock allocated gigantic pages. The write protect can't be
- * done earlier, since it can't be guaranteed that the reserved
- * page structures will not be written to during initialization,
- * even if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
- *
- * The PTEs are known to exist, and nothing else should be touching
- * these pages. The caller is responsible for any TLB flushing.
- */
-void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
-				    int node, unsigned long headsize)
-{
-	unsigned long maddr;
-	pte_t *pte;
-
-	for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
-		pte = virt_to_kpte(maddr);
-		ptep_set_wrprotect(&init_mm, maddr, pte);
-	}
-}
-
 struct page __ref *vmemmap_shared_tail_page(unsigned int order, struct zone *zone)
 {
 	void *addr;
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 50/69] mm/sparse: Simplify section_nr_vmemmap_pages()
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

section_nr_vmemmap_pages() no longer needs altmap- or pgmap-specific
state to decide whether a section uses the optimized vmemmap layout.

Now that the optimization state is recorded in struct mem_section, use
section_vmemmap_optimizable() and section_order() directly and drop the
redundant arguments from the helper and its callers.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 mm/internal.h       |  3 +--
 mm/sparse-vmemmap.c |  7 +++----
 mm/sparse.c         | 19 ++++++-------------
 3 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 18276cd15622..06022074ebcb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -997,8 +997,7 @@ static inline void __section_mark_present(struct mem_section *ms,
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 }
 
-int section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages,
-		struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
+int section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages);
 #else
 static inline void sparse_memblocks_present(void) {}
 static inline void sparse_init(void) {}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index ce1cf5cdf613..793fd4ce1393 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -468,7 +468,7 @@ static struct page * __meminit populate_section_memmap(unsigned long pfn,
 	struct page *page = __populate_section_memmap(pfn, nr_pages, nid, altmap,
 						      pgmap);
 
-	memmap_pages_add(section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap));
+	memmap_pages_add(section_nr_vmemmap_pages(pfn, nr_pages));
 
 	return page;
 }
@@ -479,7 +479,7 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
 	unsigned long start = (unsigned long) pfn_to_page(pfn);
 	unsigned long end = start + nr_pages * sizeof(struct page);
 
-	memmap_pages_add(-section_nr_vmemmap_pages(pfn, nr_pages, altmap, pgmap));
+	memmap_pages_add(-section_nr_vmemmap_pages(pfn, nr_pages));
 	vmemmap_free(start, end, altmap);
 }
 
@@ -489,8 +489,7 @@ static void free_map_bootmem(struct page *memmap)
 	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
 	unsigned long pfn = page_to_pfn(memmap);
 
-	memmap_boot_pages_add(-section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION,
-							NULL, NULL));
+	memmap_boot_pages_add(-section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION));
 	vmemmap_free(start, end, NULL);
 }
 
diff --git a/mm/sparse.c b/mm/sparse.c
index 6878f8941b4c..3390cb82f114 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -237,32 +237,26 @@ void __weak __meminit vmemmap_populate_print_last(void)
 {
 }
 
-int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages,
-		struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages)
 {
-	const struct mem_section *ms = __pfn_to_section(pfn);
-	const unsigned int order = pgmap ? pgmap->vmemmap_shift : section_order(ms);
+	const unsigned int order = pfn_to_section_order(pfn);
 	const unsigned long pages_per_compound = 1UL << order;
-	unsigned int vmemmap_pages = OPTIMIZED_FOLIO_VMEMMAP_PAGES;
 
 	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION));
 	VM_WARN_ON_ONCE(nr_pages > PAGES_PER_SECTION);
 
-	if (vmemmap_can_optimize(altmap, pgmap))
-		vmemmap_pages = VMEMMAP_RESERVE_NR;
-
-	if (!section_vmemmap_optimizable(ms))
+	if (!order_vmemmap_optimizable(order))
 		return DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE);
 
 	if (order < PFN_SECTION_SHIFT) {
 		VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, pages_per_compound));
-		return vmemmap_pages * nr_pages / pages_per_compound;
+		return OPTIMIZED_FOLIO_VMEMMAP_PAGES * nr_pages / pages_per_compound;
 	}
 
 	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION));
 
 	if (IS_ALIGNED(pfn, pages_per_compound))
-		return vmemmap_pages;
+		return OPTIMIZED_FOLIO_VMEMMAP_PAGES;
 
 	return 0;
 }
@@ -294,8 +288,7 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 						nid, NULL, NULL);
 		if (!map)
 			panic("Failed to allocate memmap for section %lu\n", pnum);
-		memmap_boot_pages_add(section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION,
-							       NULL, NULL));
+		memmap_boot_pages_add(section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION));
 		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
 					SECTION_IS_EARLY);
 		usage = (void *)usage + mem_section_usage_size();
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 51/69] mm/sparse-vmemmap: Introduce vmemmap_nr_struct_pages()
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

compound_nr_pages() exposes sparse vmemmap optimization details to the
core memory initialization code.

Introduce vmemmap_nr_struct_pages() to report how many struct pages are
actually allocated and need initialization for an optimized vmemmap
mapping. This gives memmap_init_zone_device() the information it needs
without depending on sparse-vmemmap internals.

With this helper in place, drop compound_nr_pages() and keep the
vmemmap-specific logic inside sparse-vmemmap code.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 mm/internal.h | 11 ++++++++++-
 mm/mm_init.c  | 21 +--------------------
 mm/sparse.c   | 13 ++++++-------
 3 files changed, 17 insertions(+), 28 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 06022074ebcb..9597a703bc73 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -997,7 +997,16 @@ static inline void __section_mark_present(struct mem_section *ms,
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 }
 
-int section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages);
+int vmemmap_nr_struct_pages(unsigned long pfn, unsigned long nr_pages);
+
+static inline int section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages)
+{
+	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION));
+	VM_WARN_ON_ONCE(nr_pages > PAGES_PER_SECTION);
+
+	return DIV_ROUND_UP(vmemmap_nr_struct_pages(pfn, nr_pages) * sizeof(struct page),
+			    PAGE_SIZE);
+}
 #else
 static inline void sparse_memblocks_present(void) {}
 static inline void sparse_init(void) {}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 9ff118e35641..4ea39392993b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1062,25 +1062,6 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
 	}
 }
 
-/*
- * With compound page geometry and when struct pages are stored in ram most
- * tail pages are reused. Consequently, the amount of unique struct pages to
- * initialize is a lot smaller that the total amount of struct pages being
- * mapped. This is a paired / mild layering violation with explicit knowledge
- * of how the sparse_vmemmap internals handle compound pages in the lack
- * of an altmap.
- */
-static inline unsigned long compound_nr_pages(unsigned long pfn,
-					      struct dev_pagemap *pgmap)
-{
-	const struct mem_section *ms = __pfn_to_section(pfn);
-
-	if (!section_vmemmap_optimizable(ms))
-		return pgmap_vmemmap_nr(pgmap);
-
-	return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
-}
-
 static void __ref memmap_init_compound(struct page *head,
 				       unsigned long head_pfn,
 				       unsigned long zone_idx, int nid,
@@ -1145,7 +1126,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
 			continue;
 
 		memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
-				     compound_nr_pages(pfn, pgmap));
+				     vmemmap_nr_struct_pages(pfn, pfns_per_compound));
 	}
 
 	pageblock_migratetype_init_range(start_pfn, nr_pages, MIGRATE_MOVABLE, false, false);
diff --git a/mm/sparse.c b/mm/sparse.c
index 3390cb82f114..f314b9babc4a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -237,26 +237,25 @@ void __weak __meminit vmemmap_populate_print_last(void)
 {
 }
 
-int __meminit section_nr_vmemmap_pages(unsigned long pfn, unsigned long nr_pages)
+int __meminit vmemmap_nr_struct_pages(unsigned long pfn, unsigned long nr_pages)
 {
 	const unsigned int order = pfn_to_section_order(pfn);
 	const unsigned long pages_per_compound = 1UL << order;
 
-	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SUBSECTION));
-	VM_WARN_ON_ONCE(nr_pages > PAGES_PER_SECTION);
-
 	if (!order_vmemmap_optimizable(order))
-		return DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE);
+		return nr_pages;
 
 	if (order < PFN_SECTION_SHIFT) {
 		VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, pages_per_compound));
-		return OPTIMIZED_FOLIO_VMEMMAP_PAGES * nr_pages / pages_per_compound;
+		return OPTIMIZED_FOLIO_VMEMMAP_NR_STRUCT_PAGES * nr_pages / pages_per_compound;
 	}
 
 	VM_WARN_ON_ONCE(!IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION));
+	/* Ensure the requested range does not cross a compound page boundary. */
+	VM_WARN_ON_ONCE((pfn % pages_per_compound) + nr_pages > pages_per_compound);
 
 	if (IS_ALIGNED(pfn, pages_per_compound))
-		return OPTIMIZED_FOLIO_VMEMMAP_PAGES;
+		return OPTIMIZED_FOLIO_VMEMMAP_NR_STRUCT_PAGES;
 
 	return 0;
 }
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 52/69] powerpc/mm: Drop powerpc vmemmap_can_optimize()
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

PowerPC no longer needs an architecture-specific vmemmap_can_optimize()
override for DAX vmemmap optimization.

Whether the optimized mapping can be used is now decided in the
architecture-specific vmemmap_populate() path. When PowerPC has to fall
back, such as on Hash MMU, it can simply clear the section order there
and disable the optimization for that section.

Drop the radix-specific vmemmap_can_optimize() override and rely on the
generic checks instead.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h |  5 -----
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 10 ----------
 arch/powerpc/mm/init_64.c                  |  1 +
 3 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 8452a2714cb1..df67209b0c5b 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -351,10 +351,5 @@ int radix__create_section_mapping(unsigned long start, unsigned long end,
 				  int nid, pgprot_t prot);
 int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
-
-#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
-#define vmemmap_can_optimize vmemmap_can_optimize
-bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
-#endif
 #endif /* __ASSEMBLER__ */
 #endif
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 4c3d027c823c..2f8783b3f678 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -977,16 +977,6 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
 	return 0;
 }
 
-#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
-bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
-{
-	if (radix_enabled())
-		return __vmemmap_can_optimize(altmap, pgmap);
-
-	return false;
-}
-#endif
-
 int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
 				unsigned long addr, unsigned long next)
 {
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index b6f3ae03ca9e..8e18ed427fdd 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -283,6 +283,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		return radix__vmemmap_populate(start, end, node, altmap);
 #endif
 
+	section_set_order(__pfn_to_section(page_to_pfn((struct page *)start)), 0);
 	return __vmemmap_populate(start, end, node, altmap);
 }
 
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 53/69] mm/sparse-vmemmap: Drop vmemmap_can_optimize()
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

vmemmap_can_optimize() no longer needs to gate section activation.

section_activate() can use pgmap->vmemmap_shift directly to record the
requested section order and leave support checks to the vmemmap
population path. That keeps the policy local to the code that actually
instantiates the mapping, instead of requiring callers to pre-filter
unsupported cases.

In particular, altmap-backed memmap allocation cannot support HVO, so
__populate_section_memmap() clears any inherited optimized section
order for full-section adds and rejects subsection re-adds. Unsupported
optimized mappings are therefore rejected where the vmemmap backing is
set up, and callers no longer have to care about that restriction.

With that handling in place, vmemmap_can_optimize() becomes redundant
and can be removed.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/mm.h  | 34 ----------------------------------
 mm/sparse-vmemmap.c | 14 +++++++++++++-
 2 files changed, 13 insertions(+), 35 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5e38c9a16a0a..5f45de90972d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4896,40 +4896,6 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap,
 }
 #endif
 
-#define VMEMMAP_RESERVE_NR	OPTIMIZED_FOLIO_VMEMMAP_PAGES
-#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
-static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
-					  struct dev_pagemap *pgmap)
-{
-	unsigned long nr_pages;
-	unsigned long nr_vmemmap_pages;
-
-	if (!pgmap || !is_power_of_2(sizeof(struct page)))
-		return false;
-
-	nr_pages = pgmap_vmemmap_nr(pgmap);
-	nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT);
-	/*
-	 * For vmemmap optimization with DAX we need minimum 2 vmemmap
-	 * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
-	 */
-	return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR);
-}
-/*
- * If we don't have an architecture override, use the generic rule
- */
-#ifndef vmemmap_can_optimize
-#define vmemmap_can_optimize __vmemmap_can_optimize
-#endif
-
-#else
-static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
-					   struct dev_pagemap *pgmap)
-{
-	return false;
-}
-#endif
-
 enum mf_flags {
 	MF_COUNT_INCREASED = 1 << 0,
 	MF_ACTION_REQUIRED = 1 << 1,
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 793fd4ce1393..549be01d90f8 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -384,11 +384,23 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn,
 {
 	unsigned long start = (unsigned long) pfn_to_page(pfn);
 	unsigned long end = start + nr_pages * sizeof(struct page);
+	struct mem_section *ms = __pfn_to_section(pfn);
 
 	if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
 		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
 		return NULL;
 
+	/* HVO is currently not supported when memmap pages are backed by an altmap. */
+	if (altmap && section_vmemmap_optimizable(ms)) {
+		/*
+		 * A subsection re-add can inherit the order left behind by a
+		 * partial remove that followed a full-section add.
+		 */
+		if (nr_pages < PAGES_PER_SECTION)
+			return NULL;
+		section_set_order(ms, 0);
+	}
+
 	return vmemmap_populate(start, end, nid, altmap) ? NULL : (void *)start;
 }
 
@@ -613,7 +625,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
 	unsigned int order;
 	int rc;
 
-	order = vmemmap_can_optimize(altmap, pgmap) ? pgmap->vmemmap_shift : 0;
+	order = pgmap ? pgmap->vmemmap_shift : 0;
 	/* All sub-sections within a section must share the same order. */
 	if (nr_pages < PAGES_PER_SECTION && section_order(ms) && section_order(ms) != order)
 		return ERR_PTR(-ENOTSUPP);
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 54/69] mm/sparse-vmemmap: Drop @pgmap from vmemmap population APIs
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

The vmemmap population and memory hotplug paths no longer need @pgmap
to decide whether a mapping can be optimized. That state is now carried
in mem_section, and the architecture-specific population code can make
the remaining decisions internally.

Drop the @pgmap parameter from the vmemmap population helpers and the
related memory hotplug interfaces, and remove the remaining
dev_pagemap-specific coupling from those call chains.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/arm64/mm/mmu.c                        |  5 ++---
 arch/loongarch/mm/init.c                   |  5 ++---
 arch/powerpc/include/asm/book3s/64/radix.h |  1 -
 arch/powerpc/mm/mem.c                      |  5 ++---
 arch/riscv/mm/init.c                       |  5 ++---
 arch/s390/mm/init.c                        |  5 ++---
 arch/x86/mm/init_64.c                      |  5 ++---
 include/linux/memory_hotplug.h             |  8 +++-----
 include/linux/mm.h                         |  3 +--
 mm/memory_hotplug.c                        | 13 ++++++------
 mm/memremap.c                              |  4 ++--
 mm/sparse-vmemmap.c                        | 23 ++++++++++------------
 mm/sparse.c                                |  6 ++----
 13 files changed, 36 insertions(+), 52 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index e5a42b7a0160..dd85e093ffdb 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -2024,13 +2024,12 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return ret;
 }
 
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
-			struct dev_pagemap *pgmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	__remove_pages(start_pfn, nr_pages, altmap, pgmap);
+	__remove_pages(start_pfn, nr_pages, altmap);
 	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
 }
 
diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c
index 055ecd2c8fd9..3f9ab54114c5 100644
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@@ -119,8 +119,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
 	return ret;
 }
 
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
-			struct dev_pagemap *pgmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -129,7 +128,7 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
 	/* With altmap the first mapped page is offset from @start */
 	if (altmap)
 		page += vmem_altmap_offset(altmap);
-	__remove_pages(start_pfn, nr_pages, altmap, pgmap);
+	__remove_pages(start_pfn, nr_pages, altmap);
 }
 #endif
 
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index df67209b0c5b..0c9195dd50c9 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -316,7 +316,6 @@ static inline int radix__has_transparent_pud_hugepage(void)
 #endif
 
 struct vmem_altmap;
-struct dev_pagemap;
 extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
 					     unsigned long page_size,
 					     unsigned long phys);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 4c1afab91996..648d0c5602ec 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -158,13 +158,12 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
 	return rc;
 }
 
-void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
-			      struct dev_pagemap *pgmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	__remove_pages(start_pfn, nr_pages, altmap, pgmap);
+	__remove_pages(start_pfn, nr_pages, altmap);
 	arch_remove_linear_mapping(start, size);
 }
 #endif
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 885f1db4e9bf..fa8d2f6f554b 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1742,10 +1742,9 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *param
 	return ret;
 }
 
-void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
-			      struct dev_pagemap *pgmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
-	__remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap, pgmap);
+	__remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
 	remove_linear_mapping(start, size);
 	flush_tlb_all();
 }
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 11a689423440..1f72efc2a579 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -276,13 +276,12 @@ int arch_add_memory(int nid, u64 start, u64 size,
 	return rc;
 }
 
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
-			struct dev_pagemap *pgmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	__remove_pages(start_pfn, nr_pages, altmap, pgmap);
+	__remove_pages(start_pfn, nr_pages, altmap);
 	vmem_remove_mapping(start, size);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 77b889b71cf3..df2261fa4f98 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1288,13 +1288,12 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
 	remove_pagetable(start, end, true, NULL);
 }
 
-void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
-			      struct dev_pagemap *pgmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	__remove_pages(start_pfn, nr_pages, altmap, pgmap);
+	__remove_pages(start_pfn, nr_pages, altmap);
 	kernel_physical_mapping_remove(start, start + size);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7c9d66729c60..815e908c4135 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -135,10 +135,9 @@ static inline bool movable_node_is_enabled(void)
 	return movable_node_enabled;
 }
 
-extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap,
-			       struct dev_pagemap *pgmap);
+extern void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap);
 extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages,
-			   struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
+			   struct vmem_altmap *altmap);
 
 /* reasonably generic interface to expand the physical pages */
 extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
@@ -308,8 +307,7 @@ extern int sparse_add_section(int nid, unsigned long pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap,
 		struct dev_pagemap *pgmap);
 extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
-				  struct vmem_altmap *altmap,
-				  struct dev_pagemap *pgmap);
+				  struct vmem_altmap *altmap);
 extern struct zone *zone_for_pfn_range(enum mmop online_type,
 		int nid, struct memory_group *group, unsigned long start_pfn,
 		unsigned long nr_pages);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5f45de90972d..87e98bdb0417 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4846,8 +4846,7 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 #endif
 
 struct page * __populate_section_memmap(unsigned long pfn,
-		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
-		struct dev_pagemap *pgmap);
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap);
 void *vmemmap_alloc_block(unsigned long size, int node);
 struct vmem_altmap;
 void *vmemmap_alloc_block_buf(unsigned long size, int node,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c9c69f827efa..5c60533677a1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -577,7 +577,6 @@ void remove_pfn_range_from_zone(struct zone *zone,
  * @pfn: starting pageframe (must be aligned to start of a section)
  * @nr_pages: number of pages to remove (must be multiple of section size)
  * @altmap: alternative device page map or %NULL if default memmap is used
- * @pgmap: device page map or %NULL if not ZONE_DEVICE
  *
  * Generic helper function to remove section mappings and sysfs entries
  * for the section of the memory we are removing. Caller needs to make
@@ -585,7 +584,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
  * calling offline_pages().
  */
 void __remove_pages(unsigned long pfn, unsigned long nr_pages,
-		    struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+		    struct vmem_altmap *altmap)
 {
 	const unsigned long end_pfn = pfn + nr_pages;
 	unsigned long cur_nr_pages;
@@ -600,7 +599,7 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
 		/* Select all remaining pages up to the next section boundary */
 		cur_nr_pages = min(end_pfn - pfn,
 				   SECTION_ALIGN_UP(pfn + 1) - pfn);
-		sparse_remove_section(pfn, cur_nr_pages, altmap, pgmap);
+		sparse_remove_section(pfn, cur_nr_pages, altmap);
 	}
 }
 
@@ -1429,7 +1428,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
 
 		remove_memory_block_devices(cur_start, memblock_size);
 
-		arch_remove_memory(cur_start, memblock_size, altmap, NULL);
+		arch_remove_memory(cur_start, memblock_size, altmap);
 
 		/* Verify that all vmemmap pages have actually been freed. */
 		WARN(altmap->alloc, "Altmap not fully unmapped");
@@ -1472,7 +1471,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
 		ret = create_memory_block_devices(cur_start, memblock_size, nid,
 						  params.altmap, group);
 		if (ret) {
-			arch_remove_memory(cur_start, memblock_size, params.altmap, NULL);
+			arch_remove_memory(cur_start, memblock_size, params.altmap);
 			kfree(params.altmap);
 			goto out;
 		}
@@ -1558,7 +1557,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		/* create memory block devices after memory was added */
 		ret = create_memory_block_devices(start, size, nid, NULL, group);
 		if (ret) {
-			arch_remove_memory(start, size, params.altmap, NULL);
+			arch_remove_memory(start, size, params.altmap);
 			goto error;
 		}
 	}
@@ -2270,7 +2269,7 @@ static int try_remove_memory(u64 start, u64 size)
 		 * No altmaps present, do the removal directly
 		 */
 		remove_memory_block_devices(start, size);
-		arch_remove_memory(start, size, NULL, NULL);
+		arch_remove_memory(start, size, NULL);
 	} else {
 		/* all memblocks in the range have altmaps */
 		remove_memory_blocks_and_altmaps(start, size);
diff --git a/mm/memremap.c b/mm/memremap.c
index 81766d822400..053842d45cb1 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -97,10 +97,10 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
 				   PHYS_PFN(range_len(range)));
 	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
 		__remove_pages(PHYS_PFN(range->start),
-			       PHYS_PFN(range_len(range)), NULL, pgmap);
+			       PHYS_PFN(range_len(range)), NULL);
 	} else {
 		arch_remove_memory(range->start, range_len(range),
-				pgmap_altmap(pgmap), pgmap);
+				pgmap_altmap(pgmap));
 		kasan_remove_zero_shadow(__va(range->start), range_len(range));
 	}
 	mem_hotplug_done();
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 549be01d90f8..a807210fe9e1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -379,8 +379,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
 }
 
 struct page * __meminit __populate_section_memmap(unsigned long pfn,
-		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
-		struct dev_pagemap *pgmap)
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
 	unsigned long start = (unsigned long) pfn_to_page(pfn);
 	unsigned long end = start + nr_pages * sizeof(struct page);
@@ -474,11 +473,9 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 }
 
 static struct page * __meminit populate_section_memmap(unsigned long pfn,
-		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
-		struct dev_pagemap *pgmap)
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
-	struct page *page = __populate_section_memmap(pfn, nr_pages, nid, altmap,
-						      pgmap);
+	struct page *page = __populate_section_memmap(pfn, nr_pages, nid, altmap);
 
 	memmap_pages_add(section_nr_vmemmap_pages(pfn, nr_pages));
 
@@ -486,7 +483,7 @@ static struct page * __meminit populate_section_memmap(unsigned long pfn,
 }
 
 static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
-		struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+		struct vmem_altmap *altmap)
 {
 	unsigned long start = (unsigned long) pfn_to_page(pfn);
 	unsigned long end = start + nr_pages * sizeof(struct page);
@@ -567,7 +564,7 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
  * usage map, but still need to free the vmemmap range.
  */
 static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
-		struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+		struct vmem_altmap *altmap)
 {
 	struct mem_section *ms = __pfn_to_section(pfn);
 	bool section_is_early = early_section(ms);
@@ -605,7 +602,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 	 * section_activate() and pfn_valid() .
 	 */
 	if (!section_is_early)
-		depopulate_section_memmap(pfn, nr_pages, altmap, pgmap);
+		depopulate_section_memmap(pfn, nr_pages, altmap);
 	else if (memmap)
 		free_map_bootmem(memmap);
 
@@ -656,9 +653,9 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
 		return pfn_to_page(pfn);
 
 	section_set_order_range(pfn, nr_pages, order);
-	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
+	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
 	if (!memmap) {
-		section_deactivate(pfn, nr_pages, altmap, pgmap);
+		section_deactivate(pfn, nr_pages, altmap);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -720,13 +717,13 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 }
 
 void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
-		struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
+		struct vmem_altmap *altmap)
 {
 	struct mem_section *ms = __pfn_to_section(pfn);
 
 	if (WARN_ON_ONCE(!valid_section(ms)))
 		return;
 
-	section_deactivate(pfn, nr_pages, altmap, pgmap);
+	section_deactivate(pfn, nr_pages, altmap);
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/sparse.c b/mm/sparse.c
index f314b9babc4a..bdf23709a1c7 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -224,8 +224,7 @@ size_t mem_section_usage_size(void)
 
 #ifndef CONFIG_SPARSEMEM_VMEMMAP
 struct page __init *__populate_section_memmap(unsigned long pfn,
-		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
-		struct dev_pagemap *pgmap)
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
 {
 	unsigned long size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
 
@@ -283,8 +282,7 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
 		if (pnum >= pnum_end)
 			break;
 
-		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
-						nid, NULL, NULL);
+		map = __populate_section_memmap(pfn, PAGES_PER_SECTION, nid, NULL);
 		if (!map)
 			panic("Failed to allocate memmap for section %lu\n", pnum);
 		memmap_boot_pages_add(section_nr_vmemmap_pages(pfn, PAGES_PER_SECTION));
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 55/69] mm/sparse: Decouple section activation from ZONE_DEVICE
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

sparse_add_section() and section_activate() currently take struct
dev_pagemap only to obtain the compound page order.

Pass the order explicitly instead of routing it through a
ZONE_DEVICE-specific structure. This removes the dev_pagemap dependency
from the generic sparse memory population path and keeps the interface
usable for other potential callers.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/memory_hotplug.h |  4 ++--
 mm/memory_hotplug.c            |  4 ++--
 mm/sparse-vmemmap.c            | 14 ++++++--------
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 815e908c4135..083f0abea62d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -304,8 +304,8 @@ extern void remove_pfn_range_from_zone(struct zone *zone,
 				       unsigned long start_pfn,
 				       unsigned long nr_pages);
 extern int sparse_add_section(int nid, unsigned long pfn,
-		unsigned long nr_pages, struct vmem_altmap *altmap,
-		struct dev_pagemap *pgmap);
+		unsigned long nr_pages, unsigned int order,
+		struct vmem_altmap *altmap);
 extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
 				  struct vmem_altmap *altmap);
 extern struct zone *zone_for_pfn_range(enum mmop online_type,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 5c60533677a1..ef1595bdfd3a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -385,6 +385,7 @@ int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
 	unsigned long cur_nr_pages;
 	int err;
 	struct vmem_altmap *altmap = params->altmap;
+	unsigned int order = params->pgmap ? params->pgmap->vmemmap_shift : 0;
 
 	if (WARN_ON_ONCE(!pgprot_val(params->pgprot)))
 		return -EINVAL;
@@ -412,8 +413,7 @@ int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
 		/* Select all remaining pages up to the next section boundary */
 		cur_nr_pages = min(end_pfn - pfn,
 				   SECTION_ALIGN_UP(pfn + 1) - pfn);
-		err = sparse_add_section(nid, pfn, cur_nr_pages, altmap,
-					 params->pgmap);
+		err = sparse_add_section(nid, pfn, cur_nr_pages, order, altmap);
 		if (err)
 			break;
 		cond_resched();
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a807210fe9e1..667424aadd6b 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -613,16 +613,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
 }
 
 static struct page * __meminit section_activate(int nid, unsigned long pfn,
-		unsigned long nr_pages, struct vmem_altmap *altmap,
-		struct dev_pagemap *pgmap)
+		unsigned long nr_pages, unsigned int order,
+		struct vmem_altmap *altmap)
 {
 	struct mem_section *ms = __pfn_to_section(pfn);
 	struct mem_section_usage *usage = NULL;
 	struct page *memmap;
-	unsigned int order;
 	int rc;
 
-	order = pgmap ? pgmap->vmemmap_shift : 0;
 	/* All sub-sections within a section must share the same order. */
 	if (nr_pages < PAGES_PER_SECTION && section_order(ms) && section_order(ms) != order)
 		return ERR_PTR(-ENOTSUPP);
@@ -667,8 +665,8 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
  * @nid: The node to add section on
  * @start_pfn: start pfn of the memory range
  * @nr_pages: number of pfns to add in the section
+ * @order: section order
  * @altmap: alternate pfns to allocate the memmap backing store
- * @pgmap: alternate compound page geometry for devmap mappings
  *
  * This is only intended for hotplug.
  *
@@ -682,8 +680,8 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
  * * -ENOMEM	- Out of memory.
  */
 int __meminit sparse_add_section(int nid, unsigned long start_pfn,
-		unsigned long nr_pages, struct vmem_altmap *altmap,
-		struct dev_pagemap *pgmap)
+		unsigned long nr_pages, unsigned int order,
+		struct vmem_altmap *altmap)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
 	struct mem_section *ms;
@@ -694,7 +692,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 	if (ret < 0)
 		return ret;
 
-	memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap);
+	memmap = section_activate(nid, start_pfn, nr_pages, order, altmap);
 	if (IS_ERR(memmap))
 		return PTR_ERR(memmap);
 
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 56/69] mm: Redefine HVO as Hugepage Vmemmap Optimization
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

HVO no longer refers only to HugeTLB vmemmap optimization. The same
optimization is now used more broadly for large compound-page mappings,
so the old expansion is too narrow.

Redefine HVO as Hugepage Vmemmap Optimization and update the generic
documentation, Kconfig text, and comments accordingly.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 2 +-
 Documentation/admin-guide/mm/hugetlbpage.rst    | 4 ++--
 Documentation/admin-guide/mm/memory-hotplug.rst | 2 +-
 Documentation/admin-guide/sysctl/vm.rst         | 3 ++-
 Documentation/mm/vmemmap_dedup.rst              | 2 +-
 fs/Kconfig                                      | 4 ++--
 include/linux/mmzone.h                          | 2 +-
 mm/Kconfig                                      | 2 +-
 mm/hugetlb_vmemmap.c                            | 2 +-
 mm/hugetlb_vmemmap.h                            | 2 +-
 mm/memory-failure.c                             | 6 +++---
 11 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0eb64aab3685..2d4cfdcb7535 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2114,7 +2114,7 @@ Kernel parameters
 	hugetlb_free_vmemmap=
 			[KNL] Requires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 			enabled.
-			Control if HugeTLB Vmemmap Optimization (HVO) is enabled.
+			Control if Hugepage Vmemmap Optimization (HVO) for HugeTLB is enabled.
 			Allows heavy hugetlb users to free up some more
 			memory (7 * PAGE_SIZE for each 2MB hugetlb page).
 			Format: { on | off (default) }
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 67a941903fd2..3f98ca1d7ce1 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -172,8 +172,8 @@ default_hugepagesz
 	will all result in 256 2M huge pages being allocated.  Valid default
 	huge page size is architecture dependent.
 hugetlb_free_vmemmap
-	When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables HugeTLB
-	Vmemmap Optimization (HVO).
+	When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables Hugepage
+	Vmemmap Optimization (HVO) for HugeTLB.
 
 When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
 indicates the current number of pre-allocated huge pages of the default size.
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst
index 0207f8725142..d5e350607baa 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -682,7 +682,7 @@ block might fail:
   ZONE_MOVABLE for increasing the reliability of gigantic page allocation
   against the potential loss of hot-unplug reliability.
 
-- Out of memory when dissolving huge pages, especially when HugeTLB Vmemmap
+- Out of memory when dissolving huge pages, especially when Hugepage Vmemmap
   Optimization (HVO) is enabled.
 
   Offlining code may be able to migrate huge page contents, but may not be able
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 97e12359775c..9f333970fdb2 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -665,7 +665,8 @@ This knob is not available when the size of 'struct page' (a structure defined
 in include/linux/mm_types.h) is not power of two (an unusual system config could
 result in this).
 
-Enable (set to 1) or disable (set to 0) HugeTLB Vmemmap Optimization (HVO).
+Enable (set to 1) or disable (set to 0) Hugepage Vmemmap Optimization (HVO) for
+HugeTLB.
 
 Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from
 buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages
diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst
index 9fa8642ded48..44e80bd2e398 100644
--- a/Documentation/mm/vmemmap_dedup.rst
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -8,7 +8,7 @@ A vmemmap diet for HugeTLB and Device DAX
 HugeTLB
 =======
 
-This section is to explain how HugeTLB Vmemmap Optimization (HVO) works.
+This section is to explain how Hugepage Vmemmap Optimization (HVO) for HugeTLB works.
 
 The ``struct page`` structures are used to describe a physical page frame. By
 default, there is a one-to-one mapping from a page frame to its corresponding
diff --git a/fs/Kconfig b/fs/Kconfig
index f6cee1bbb1fc..496cfa2379e5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -261,11 +261,11 @@ menuconfig HUGETLBFS
 
 if HUGETLBFS
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
-	bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
+	bool "Hugepage Vmemmap Optimization (HVO) for HugeTLB defaults to on"
 	default n
 	depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	help
-	  The HugeTLB Vmemmap Optimization (HVO) defaults to off. Say Y here to
+	  The Hugepage Vmemmap Optimization (HVO) for HugeTLB defaults to off. Say Y here to
 	  enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
 	  (boot command line) or hugetlb_optimize_vmemmap (sysctl).
 endif # HUGETLBFS
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7484e7be7b6d..efb37f2ffec4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -97,7 +97,7 @@
 #define MAX_FOLIO_NR_PAGES	(1UL << MAX_FOLIO_ORDER)
 
 /*
- * HugeTLB Vmemmap Optimization (HVO) requires struct pages of the head page to
+ * Hugepage Vmemmap Optimization (HVO) requires struct pages of the head page to
  * be naturally aligned with regard to the folio size.
  *
  * HVO which is only active if the size of struct page is a power of 2.
diff --git a/mm/Kconfig b/mm/Kconfig
index ddd10cb4d0a3..c85ed7d7f37d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -416,7 +416,7 @@ config SPARSEMEM_VMEMMAP_OPTIMIZATION
 
 #
 # Select this config option from the architecture Kconfig, if it is preferred
-# to enable the feature of HugeTLB/dev_dax vmemmap optimization.
+# to enable the feature of HVO.
 #
 config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
 	bool
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index fce772e95adc..6f6f1740f540 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * HugeTLB Vmemmap Optimization (HVO)
+ * Hugepage Vmemmap Optimization (HVO) for HugeTLB
  *
  * Copyright (c) 2020, ByteDance. All rights reserved.
  *
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 2b0a85e09602..b4d0ba27b42c 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * HugeTLB Vmemmap Optimization (HVO)
+ * Hugepage Vmemmap Optimization (HVO) for HugeTLB
  *
  * Copyright (c) 2020, ByteDance. All rights reserved.
  *
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 866c4428ac7e..ad6416145667 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -175,9 +175,9 @@ static int __page_handle_poison(struct page *page)
 	/*
 	 * zone_pcp_disable() can't be used here. It will
 	 * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
-	 * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
-	 * optimization is enabled. This will break current lock dependency
-	 * chain and leads to deadlock.
+	 * cpu_hotplug_lock via static_key_slow_dec() when HVO for HugeTLB
+	 * is enabled. This will break current lock dependency chain and leads
+	 * to deadlock.
 	 * Disabling pcp before dissolving the page was a deterministic
 	 * approach because we made sure that those pages cannot end up in any
 	 * PCP list. Draining PCP lists expels those pages to the buddy system,
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 57/69] mm/sparse-vmemmap: Consolidate HVO enable checks
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

HVO depends on build-time conditions that are not fully expressible in
Kconfig, including whether sizeof(struct page) is a power of two and
whether the supported folio order range can use the optimized layout.

Those checks are currently duplicated in several places. Define
SPARSEMEM_VMEMMAP_OPTIMIZATION in bounds.c when the build-time
requirements are met, and use that generated constant to guard the
generic HVO code.

This centralizes the build-time checks instead of repeating them
throughout the HVO paths.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 arch/x86/entry/vdso/vdso32/fake_32bit_build.h |  1 -
 drivers/dax/Kconfig                           |  2 +-
 fs/Kconfig                                    |  2 +-
 include/linux/mm_types.h                      |  3 +-
 include/linux/mmzone.h                        | 38 ++++++++-----------
 include/linux/page-flags-layout.h             |  2 +
 include/linux/page-flags.h                    | 28 ++------------
 kernel/bounds.c                               |  5 +++
 mm/Kconfig                                    |  2 +-
 mm/hugetlb_vmemmap.c                          |  2 +
 mm/hugetlb_vmemmap.h                          |  4 +-
 mm/internal.h                                 |  3 --
 mm/sparse.c                                   |  6 +--
 mm/util.c                                     |  2 +-
 14 files changed, 38 insertions(+), 62 deletions(-)

diff --git a/arch/x86/entry/vdso/vdso32/fake_32bit_build.h b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
index 5f8424eade2b..db1b15f686e3 100644
--- a/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
+++ b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
@@ -11,7 +11,6 @@
 #undef CONFIG_PGTABLE_LEVELS
 #undef CONFIG_ILLEGAL_POINTER_VALUE
 #undef CONFIG_SPARSEMEM_VMEMMAP
-#undef CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION
 #undef CONFIG_NR_CPUS
 #undef CONFIG_PARAVIRT_XXL
 
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 60cb05dce53d..cb7710c29885 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -8,7 +8,7 @@ if DAX
 config DEV_DAX
 	tristate "Device DAX: direct access mapping device"
 	depends on TRANSPARENT_HUGEPAGE
-	select SPARSEMEM_VMEMMAP_OPTIMIZATION if ARCH_WANT_OPTIMIZE_DAX_VMEMMAP && SPARSEMEM_VMEMMAP
+	select SPARSEMEM_VMEMMAP_OPTIMIZATION_ENABLE if ARCH_WANT_OPTIMIZE_DAX_VMEMMAP && SPARSEMEM_VMEMMAP
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
diff --git a/fs/Kconfig b/fs/Kconfig
index 496cfa2379e5..ab3937abe07f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -278,7 +278,7 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	def_bool HUGETLB_PAGE
 	depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
-	select SPARSEMEM_VMEMMAP_OPTIMIZATION
+	select SPARSEMEM_VMEMMAP_OPTIMIZATION_ENABLE
 
 config HUGETLB_PMD_PAGE_TABLE_SHARING
 	def_bool HUGETLB_PAGE
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a308e2c23b82..9a7cd7575f3a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -546,6 +546,7 @@ FOLIO_MATCH(flags, _flags_3);
 FOLIO_MATCH(compound_info, _head_3);
 #undef FOLIO_MATCH
 
+#ifndef __GENERATING_BOUNDS_H
 /**
  * struct ptdesc -    Memory descriptor for page tables.
  * @pt_flags: enum pt_flags plus zone/node/section.
@@ -1990,5 +1991,5 @@ static inline unsigned long mmf_init_legacy_flags(unsigned long flags)
 			   (1UL << MMF_HAS_MDWE_NO_INHERIT));
 	return flags & MMF_INIT_LEGACY_MASK;
 }
-
+#endif /* __GENERATING_BOUNDS_H */
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index efb37f2ffec4..0d49d6e163ff 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -3,8 +3,6 @@
 #define _LINUX_MMZONE_H
 
 #ifndef __ASSEMBLY__
-#ifndef __GENERATING_BOUNDS_H
-
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/list_nulls.h>
@@ -96,33 +94,32 @@
 
 #define MAX_FOLIO_NR_PAGES	(1UL << MAX_FOLIO_ORDER)
 
-/*
- * Hugepage Vmemmap Optimization (HVO) requires struct pages of the head page to
- * be naturally aligned with regard to the folio size.
- *
- * HVO which is only active if the size of struct page is a power of 2.
- */
-#define MAX_FOLIO_VMEMMAP_ALIGN					\
-	(IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION) &&	\
-	 is_power_of_2(sizeof(struct page)) ?			\
-	 MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0)
-
 /* The number of vmemmap pages required by a vmemmap-optimized folio. */
 #define OPTIMIZED_FOLIO_VMEMMAP_PAGES		1
 #define OPTIMIZED_FOLIO_VMEMMAP_SIZE		(OPTIMIZED_FOLIO_VMEMMAP_PAGES * PAGE_SIZE)
 #define OPTIMIZED_FOLIO_VMEMMAP_NR_STRUCT_PAGES	(OPTIMIZED_FOLIO_VMEMMAP_SIZE / sizeof(struct page))
 #define OPTIMIZABLE_FOLIO_MIN_ORDER		(ilog2(OPTIMIZED_FOLIO_VMEMMAP_NR_STRUCT_PAGES) + 1)
 
-#define __NR_OPTIMIZABLE_FOLIO_ORDERS		(MAX_FOLIO_ORDER - OPTIMIZABLE_FOLIO_MIN_ORDER + 1)
-#define NR_OPTIMIZABLE_FOLIO_ORDERS		\
-	((__NR_OPTIMIZABLE_FOLIO_ORDERS > 0 &&	\
-	  IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION)) ? __NR_OPTIMIZABLE_FOLIO_ORDERS : 0)
+#ifdef SPARSEMEM_VMEMMAP_OPTIMIZATION
+/*
+ * Hugepage Vmemmap Optimization (HVO) requires the struct page of the head page
+ * to be naturally aligned with regard to the vmemmap size of the maximal folio.
+ */
+#define MAX_FOLIO_VMEMMAP_ALIGN			(MAX_FOLIO_NR_PAGES * sizeof(struct page))
+#define NR_OPTIMIZABLE_FOLIO_ORDERS		(MAX_FOLIO_ORDER - OPTIMIZABLE_FOLIO_MIN_ORDER + 1)
+#else
+#define MAX_FOLIO_VMEMMAP_ALIGN			0
+#define NR_OPTIMIZABLE_FOLIO_ORDERS		0
+#endif
 
 static inline bool order_vmemmap_optimizable(unsigned int order)
 {
+	if (!IS_ENABLED(SPARSEMEM_VMEMMAP_OPTIMIZATION))
+		return false;
 	return order >= OPTIMIZABLE_FOLIO_MIN_ORDER;
 }
 
+#ifndef __GENERATING_BOUNDS_H
 enum migratetype {
 	MIGRATE_UNMOVABLE,
 	MIGRATE_MOVABLE,
@@ -2044,7 +2041,7 @@ struct mem_section {
 	 */
 	struct page_ext *page_ext;
 #endif
-#ifdef CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION
+#ifdef SPARSEMEM_VMEMMAP_OPTIMIZATION
 	/*
 	 * The order of compound pages in this section. Typically, the section
 	 * holds compound pages of this order; a larger compound page will span
@@ -2236,7 +2233,7 @@ static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long
 }
 #endif
 
-#ifdef CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION
+#ifdef SPARSEMEM_VMEMMAP_OPTIMIZATION
 static inline void section_set_order(struct mem_section *section, unsigned int order)
 {
 	VM_WARN_ON(section->order && order && section->order != order);
@@ -2277,9 +2274,6 @@ static inline unsigned int pfn_to_section_order(unsigned long pfn)
 
 static inline bool section_vmemmap_optimizable(const struct mem_section *section)
 {
-	if (!is_power_of_2(sizeof(struct page)))
-		return false;
-
 	return order_vmemmap_optimizable(section_order(section));
 }
 
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 760006b1c480..6a7e7f3dbb93 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -2,6 +2,7 @@
 #ifndef PAGE_FLAGS_LAYOUT_H
 #define PAGE_FLAGS_LAYOUT_H
 
+#ifndef __GENERATING_BOUNDS_H
 #include <linux/numa.h>
 #include <generated/bounds.h>
 
@@ -121,4 +122,5 @@
 				(NR_NON_PAGEFLAG_BITS + NR_PAGEFLAGS))
 
 #endif
+#endif /* __GENERATING_BOUNDS_H */
 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 12665b34586c..df7f6dea2e5b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -198,32 +198,12 @@ enum pageflags {
 
 #ifndef __GENERATING_BOUNDS_H
 
-/*
- * For tail pages, if the size of struct page is power-of-2 ->compound_info
- * encodes the mask that converts the address of the tail page address to
- * the head page address.
- *
- * Otherwise, ->compound_info has direct pointer to head pages.
- */
-static __always_inline bool compound_info_has_mask(void)
-{
-	/*
-	 * The approach with mask would work in the wider set of conditions,
-	 * but it requires validating that struct pages are naturally aligned
-	 * for all orders up to the MAX_FOLIO_ORDER, which can be tricky.
-	 */
-	if (!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION))
-		return false;
-
-	return is_power_of_2(sizeof(struct page));
-}
-
 static __always_inline unsigned long _compound_head(const struct page *page)
 {
 	unsigned long info = READ_ONCE(page->compound_info);
 	unsigned long mask;
 
-	if (!compound_info_has_mask()) {
+	if (!IS_ENABLED(SPARSEMEM_VMEMMAP_OPTIMIZATION)) {
 		/* Bit 0 encodes PageTail() */
 		if (info & 1)
 			return info - 1;
@@ -232,8 +212,8 @@ static __always_inline unsigned long _compound_head(const struct page *page)
 	}
 
 	/*
-	 * If compound_info_has_mask() is true the rest of the info encodes
-	 * the mask that converts the address of the tail page to the head page.
+	 * If HVO is enabled the rest of the info encodes the mask that converts
+	 * the address of the tail page to the head page.
 	 *
 	 * No need to clear bit 0 in the mask as 'page' always has it clear.
 	 *
@@ -257,7 +237,7 @@ static __always_inline void set_compound_head(struct page *tail,
 	unsigned int shift;
 	unsigned long mask;
 
-	if (!compound_info_has_mask()) {
+	if (!IS_ENABLED(SPARSEMEM_VMEMMAP_OPTIMIZATION)) {
 		WRITE_ONCE(tail->compound_info, (unsigned long)head | 1);
 		return;
 	}
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 02b619eb6106..9638260d67f8 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -8,6 +8,7 @@
 #define __GENERATING_BOUNDS_H
 #define COMPILE_OFFSETS
 /* Include headers that define the enum constants of interest */
+#include <linux/mm_types.h>
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
@@ -30,6 +31,10 @@ int main(void)
 	DEFINE(LRU_GEN_WIDTH, 0);
 	DEFINE(__LRU_REFS_WIDTH, 0);
 #endif
+	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP_OPTIMIZATION_ENABLE) &&
+	    is_power_of_2(sizeof(struct page)) &&
+	    MAX_FOLIO_ORDER >= OPTIMIZABLE_FOLIO_MIN_ORDER)
+		DEFINE(SPARSEMEM_VMEMMAP_OPTIMIZATION, 1);
 	/* End of constants */
 
 	return 0;
diff --git a/mm/Kconfig b/mm/Kconfig
index c85ed7d7f37d..52d9d69a95ff 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -410,7 +410,7 @@ config SPARSEMEM_VMEMMAP
 	  pfn_to_page and page_to_pfn operations.  This is the most
 	  efficient option when sufficient kernel resources are available.
 
-config SPARSEMEM_VMEMMAP_OPTIMIZATION
+config SPARSEMEM_VMEMMAP_OPTIMIZATION_ENABLE
 	bool
 	depends on SPARSEMEM_VMEMMAP
 
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 6f6f1740f540..1305bee1195a 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -22,6 +22,7 @@
 #include "hugetlb_vmemmap.h"
 #include "internal.h"
 
+#ifdef SPARSEMEM_VMEMMAP_OPTIMIZATION
 /**
  * struct vmemmap_remap_walk - walk vmemmap page table
  *
@@ -693,3 +694,4 @@ static int __init hugetlb_vmemmap_init(void)
 	return 0;
 }
 late_initcall(hugetlb_vmemmap_init);
+#endif
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index b4d0ba27b42c..dfd48be6b231 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -10,7 +10,7 @@
 #define _LINUX_HUGETLB_VMEMMAP_H
 #include <linux/hugetlb.h>
 
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+#if defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) && defined(SPARSEMEM_VMEMMAP_OPTIMIZATION)
 int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio);
 long hugetlb_vmemmap_restore_folios(const struct hstate *h,
 					struct list_head *folio_list,
@@ -32,8 +32,6 @@ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate
 {
 	int size = hugetlb_vmemmap_size(h) - OPTIMIZED_FOLIO_VMEMMAP_SIZE;
 
-	if (!is_power_of_2(sizeof(struct page)))
-		return 0;
 	return size > 0 ? size : 0;
 }
 #else
diff --git a/mm/internal.h b/mm/internal.h
index 9597a703bc73..afdae79640b5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1023,9 +1023,6 @@ static inline bool vmemmap_page_optimizable(const struct page *page)
 	unsigned long pfn = page_to_pfn(page);
 	unsigned long nr_pages = 1UL << pfn_to_section_order(pfn);
 
-	if (!is_power_of_2(sizeof(struct page)))
-		return false;
-
 	return (pfn & (nr_pages - 1)) >= OPTIMIZED_FOLIO_VMEMMAP_NR_STRUCT_PAGES;
 }
 #else
diff --git a/mm/sparse.c b/mm/sparse.c
index bdf23709a1c7..598da1651e49 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -301,10 +301,8 @@ void __init sparse_init(void)
 	unsigned long pnum_end, pnum_begin, map_count = 1;
 	int nid_begin;
 
-	if (compound_info_has_mask()) {
-		VM_WARN_ON_ONCE(!IS_ALIGNED((unsigned long) pfn_to_page(0),
-				    MAX_FOLIO_VMEMMAP_ALIGN));
-	}
+	VM_WARN_ON(IS_ENABLED(SPARSEMEM_VMEMMAP_OPTIMIZATION) &&
+		   !IS_ALIGNED((unsigned long)pfn_to_page(0), MAX_FOLIO_VMEMMAP_ALIGN));
 
 	pnum_begin = first_present_section_nr();
 	nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
diff --git a/mm/util.c b/mm/util.c
index 3cc949a0b7ed..4543f2b6ffa1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1338,7 +1338,7 @@ void snapshot_page(struct page_snapshot *ps, const struct page *page)
 		foliop = (struct folio *)page;
 	} else {
 		/* See compound_head() */
-		if (compound_info_has_mask()) {
+		if (IS_ENABLED(SPARSEMEM_VMEMMAP_OPTIMIZATION)) {
 			unsigned long p = (unsigned long)page;
 
 			foliop = (struct folio *)(p & info);
-- 
2.54.0



^ permalink raw reply related

* [PATCH v2 58/69] mm/hugetlb: Make HVO optimizable checks depend on generic logic
From: Muchun Song @ 2026-05-13 13:20 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Muchun Song, Oscar Salvador,
	Michael Ellerman, Madhavan Srinivasan
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Nicholas Piggin,
	Christophe Leroy, Ackerley Tng, Frank van der Linden,
	aneesh.kumar, joao.m.martins, linux-mm, linuxppc-dev,
	linux-kernel, Muchun Song
In-Reply-To: <20260513132044.41690-1-songmuchun@bytedance.com>

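Make hugetlb_vmemmap_optimizable() reuse the generic
order_vmemmap_optimizable() logic, and switch the HugeTLB call sites
that only need a yes/no answer from hugetlb_vmemmap_optimizable_size()
to call hugetlb_vmemmap_optimizable() directly. Constify the hstate
argument of huge_page_order() so the helper can call it.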

This keeps HugeTLB-specific optimizable checks aligned with the generic
vmemmap optimization rules and avoids open-coding the size-based test.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/hugetlb.h |  2 +-
 mm/hugetlb.c            |  4 ++--
 mm/hugetlb_vmemmap.h    | 43 ++++++++++++++++++++---------------------
 3 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 82dbb9ebead8..2383adb22ce1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -778,7 +778,7 @@ static inline unsigned long huge_page_mask(struct hstate *h)
 	return h->mask;
 }
 
-static inline unsigned int huge_page_order(struct hstate *h)
+static inline unsigned int huge_page_order(const struct hstate *h)
 {
 	return h->order;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54ef7d12c585..bd136fc6aec0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3351,7 +3351,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
 			folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
 					&node_states[N_MEMORY], NULL);
 			if (!folio && !list_empty(&folio_list) &&
-			    hugetlb_vmemmap_optimizable_size(h)) {
+			    hugetlb_vmemmap_optimizable(h)) {
 				prep_and_add_allocated_folios(h, &folio_list);
 				INIT_LIST_HEAD(&folio_list);
 				folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
@@ -3420,7 +3420,7 @@ static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned l
 	for (i = 0; i < num; ++i) {
 		struct folio *folio;
 
-		if (hugetlb_vmemmap_optimizable_size(h) &&
+		if (hugetlb_vmemmap_optimizable(h) &&
 		    (si_mem_available() == 0) && !list_empty(&folio_list)) {
 			prep_and_add_allocated_folios(h, &folio_list);
 			INIT_LIST_HEAD(&folio_list);
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index dfd48be6b231..1765f8274220 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -18,22 +18,6 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h,
 void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
 void hugetlb_vmemmap_optimize_bootmem_page(struct huge_bootmem_page *m);
-
-static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
-{
-	return pages_per_huge_page(h) * sizeof(struct page);
-}
-
-/*
- * Return how many vmemmap size associated with a HugeTLB page that can be
- * optimized and can be freed to the buddy allocator.
- */
-static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
-{
-	int size = hugetlb_vmemmap_size(h) - OPTIMIZED_FOLIO_VMEMMAP_SIZE;
-
-	return size > 0 ? size : 0;
-}
 #else
 static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
 {
@@ -56,11 +40,6 @@ static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list
 {
 }
 
-static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
-{
-	return 0;
-}
-
 static inline void hugetlb_vmemmap_optimize_bootmem_page(struct huge_bootmem_page *m)
 {
 }
@@ -68,6 +47,26 @@ static inline void hugetlb_vmemmap_optimize_bootmem_page(struct huge_bootmem_pag
 
 static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
 {
-	return hugetlb_vmemmap_optimizable_size(h) != 0;
+	if (!IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP))
+		return false;
+
+	return order_vmemmap_optimizable(huge_page_order(h));
+}
+
+static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
+{
+	return pages_per_huge_page(h) * sizeof(struct page);
+}
+
+/*
+ * Return the size of the vmemmap area associated with a HugeTLB page
+ * that can be optimized.
+ */
+static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
+{
+	if (!hugetlb_vmemmap_optimizable(h))
+		return 0;
+
+	return hugetlb_vmemmap_size(h) - OPTIMIZED_FOLIO_VMEMMAP_SIZE;
 }
 #endif /* _LINUX_HUGETLB_VMEMMAP_H */
-- 
2.54.0



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox