* [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages
@ 2025-08-18  6:46 Mike Rapoport
  2025-08-18  6:46 ` [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone() Mike Rapoport
                   ` (5 more replies)
  0 siblings, 6 replies; 19+ messages in thread
From: Mike Rapoport @ 2025-08-18  6:46 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrew Morton, Bill Wendling, Daniel Jordan, Justin Stitt,
	Michael Ellerman, Miguel Ojeda, Mike Rapoport, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

Hi,

These patches simplify deferred initialization of the memory map.

Besides the nice negative diffstat, I measured a 3ms reduction in the
initialization of deferred pages on a single-node system with 64GiB of RAM.

I don't have access to large memory machines, so I'd really appreciate
testing of these patches on them to make sure there's no regression there.

The patches are also available at git:
https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=deferred-memmap-init/v1

Mike Rapoport (Microsoft) (4):
  mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
  mm/mm_init: deferred_init_memmap: use a job per zone
  mm/mm_init: drop deferred_init_maxorder()
  memblock: drop for_each_free_mem_pfn_range_in_zone_from()

 .clang-format            |   1 -
 include/linux/memblock.h |  22 -----
 mm/memblock.c            |  64 -------------
 mm/mm_init.c             | 195 +++++++++++++--------------------------
 4 files changed, 62 insertions(+), 220 deletions(-)


base-commit: 8f5ae30d69d7543eee0d70083daf4de8fe15d585
-- 
2.50.1




* [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
  2025-08-18  6:46 [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages Mike Rapoport
@ 2025-08-18  6:46 ` Mike Rapoport
  2025-08-19  7:44   ` David Hildenbrand
  2025-08-19  9:52   ` Wei Yang
  2025-08-18  6:46 ` [PATCH 2/4] mm/mm_init: deferred_init_memmap: use a job per zone Mike Rapoport
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 19+ messages in thread
From: Mike Rapoport @ 2025-08-18  6:46 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrew Morton, Bill Wendling, Daniel Jordan, Justin Stitt,
	Michael Ellerman, Miguel Ojeda, Mike Rapoport, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

deferred_grow_zone() initializes one or more sections in the memory map
if the buddy allocator runs out of initialized struct pages when
CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.

It loops through memblock regions and initializes and frees pages in
MAX_ORDER_NR_PAGES chunks.

Essentially the same loop is implemented in deferred_init_memmap_chunk();
the only actual difference is that deferred_init_memmap_chunk() does not
count initialized pages.

Make deferred_init_memmap_chunk() count the initialized pages and return
their number, wrap it with deferred_init_memmap_job() for multithreaded
initialization with padata_do_multithreaded() and replace open-coded
initialization of struct pages in deferred_grow_zone() with a call to
deferred_init_memmap_chunk().

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 mm/mm_init.c | 65 ++++++++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 33 deletions(-)

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5c21b3af216b..81809b83814b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2134,12 +2134,12 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
 	return nr_pages;
 }
 
-static void __init
+static unsigned long __init
 deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
-			   void *arg)
+			   struct zone *zone)
 {
+	unsigned long nr_pages = 0;
 	unsigned long spfn, epfn;
-	struct zone *zone = arg;
 	u64 i = 0;
 
 	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
@@ -2149,9 +2149,20 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
 	 * we can avoid introducing any issues with the buddy allocator.
 	 */
 	while (spfn < end_pfn) {
-		deferred_init_maxorder(&i, zone, &spfn, &epfn);
+		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
 		cond_resched();
 	}
+
+	return nr_pages;
+}
+
+static void __init
+deferred_init_memmap_job(unsigned long start_pfn, unsigned long end_pfn,
+			 void *arg)
+{
+	struct zone *zone = arg;
+
+	deferred_init_memmap_chunk(start_pfn, end_pfn, zone);
 }
 
 static unsigned int __init
@@ -2204,7 +2215,7 @@ static int __init deferred_init_memmap(void *data)
 	while (deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, first_init_pfn)) {
 		first_init_pfn = ALIGN(epfn, PAGES_PER_SECTION);
 		struct padata_mt_job job = {
-			.thread_fn   = deferred_init_memmap_chunk,
+			.thread_fn   = deferred_init_memmap_job,
 			.fn_arg      = zone,
 			.start       = spfn,
 			.size        = first_init_pfn - spfn,
@@ -2240,12 +2251,11 @@ static int __init deferred_init_memmap(void *data)
  */
 bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
 {
-	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+	unsigned long nr_pages_needed = SECTION_ALIGN_UP(1 << order);
 	pg_data_t *pgdat = zone->zone_pgdat;
 	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
 	unsigned long spfn, epfn, flags;
 	unsigned long nr_pages = 0;
-	u64 i = 0;
 
 	/* Only the last zone may have deferred pages */
 	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
@@ -2262,37 +2272,26 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
 		return true;
 	}
 
-	/* If the zone is empty somebody else may have cleared out the zone */
-	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
-						 first_deferred_pfn)) {
-		pgdat->first_deferred_pfn = ULONG_MAX;
-		pgdat_resize_unlock(pgdat, &flags);
-		/* Retry only once. */
-		return first_deferred_pfn != ULONG_MAX;
+	/*
+	 * Initialize at least nr_pages_needed in section chunks.
+	 * If a section has less free memory than nr_pages_needed, the next
+	 * section will be also initalized.
+	 * Note, that it still does not guarantee that allocation of order can
+	 * be satisfied if the sections are fragmented because of memblock
+	 * allocations.
+	 */
+	for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1);
+	     nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone);
+	     spfn = epfn, epfn += PAGES_PER_SECTION) {
+		nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone);
 	}
 
 	/*
-	 * Initialize and free pages in MAX_PAGE_ORDER sized increments so
-	 * that we can avoid introducing any issues with the buddy
-	 * allocator.
+	 * There were no pages to initialize and free which means the zone's
+	 * memory map is completely initialized.
 	 */
-	while (spfn < epfn) {
-		/* update our first deferred PFN for this section */
-		first_deferred_pfn = spfn;
-
-		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
-		touch_nmi_watchdog();
-
-		/* We should only stop along section boundaries */
-		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
-			continue;
-
-		/* If our quota has been met we can stop here */
-		if (nr_pages >= nr_pages_needed)
-			break;
-	}
+	pgdat->first_deferred_pfn = nr_pages ? spfn : ULONG_MAX;
 
-	pgdat->first_deferred_pfn = spfn;
 	pgdat_resize_unlock(pgdat, &flags);
 
 	return nr_pages > 0;
-- 
2.50.1
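
As a quick illustration of the section-chunk growth loop introduced above,
here is a minimal userspace model. PAGES_PER_SECTION, the zone layout and
fake_init_chunk() are made-up stand-ins (the real deferred_init_memmap_chunk()
walks memblock's free ranges); only the loop shape mirrors the patch.

#include <stdio.h>

#define PAGES_PER_SECTION	32768UL	/* assumed value, arch/config dependent */
#define SECTION_ALIGN_UP(x)	(((x) + PAGES_PER_SECTION - 1) & ~(PAGES_PER_SECTION - 1))

/*
 * Stand-in for deferred_init_memmap_chunk(): pretend only half of every
 * section is free because the rest was taken by memblock allocations.
 */
static unsigned long fake_init_chunk(unsigned long spfn, unsigned long epfn)
{
	return (epfn - spfn) / 2;
}

int main(void)
{
	unsigned long zone_end_pfn = 8 * PAGES_PER_SECTION;	/* toy zone */
	unsigned long first_deferred_pfn = 3 * PAGES_PER_SECTION;
	unsigned int order = 10;
	unsigned long nr_pages_needed = SECTION_ALIGN_UP(1UL << order);
	unsigned long nr_pages = 0, spfn, epfn;

	/*
	 * Same shape as the new loop in deferred_grow_zone(): one section at
	 * a time until enough pages were initialized or the zone ends.
	 */
	for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1);
	     nr_pages < nr_pages_needed && spfn < zone_end_pfn;
	     spfn = epfn, epfn += PAGES_PER_SECTION)
		nr_pages += fake_init_chunk(spfn, epfn);

	printf("needed %lu, initialized %lu, next deferred pfn %lu\n",
	       nr_pages_needed, nr_pages, spfn);
	return 0;
}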




* [PATCH 2/4] mm/mm_init: deferred_init_memmap: use a job per zone
  2025-08-18  6:46 [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages Mike Rapoport
  2025-08-18  6:46 ` [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone() Mike Rapoport
@ 2025-08-18  6:46 ` Mike Rapoport
  2025-08-19  7:45   ` David Hildenbrand
  2025-08-18  6:46 ` [PATCH 3/4] mm/mm_init: drop deferred_init_maxorder() Mike Rapoport
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 19+ messages in thread
From: Mike Rapoport @ 2025-08-18  6:46 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrew Morton, Bill Wendling, Daniel Jordan, Justin Stitt,
	Michael Ellerman, Miguel Ojeda, Mike Rapoport, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

deferred_init_memmap() loops over free memory ranges and creates a
padata_mt_job for every free range that intersects with the zone being
initialized.

padata_do_multithreaded() then splits every such range into several chunks
and runs a thread that initializes struct pages in each chunk using
deferred_init_memmap_chunk(). The number of threads is limited by the number
of CPUs on the node (or 1 for memoryless nodes).

Looping through free memory ranges is then repeated in
deferred_init_memmap_chunk() first to find the first range that should be
initialized and then to traverse the ranges until the end of the chunk is
reached.

Remove the loop over free memory regions in deferred_init_memmap() and pass
the entire zone to padata_do_multithreaded() so that it will be divided into
several chunks by the parallelization code.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 mm/mm_init.c | 38 ++++++++++++++++----------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 81809b83814b..1ecfba98ddbe 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2176,12 +2176,10 @@ static int __init deferred_init_memmap(void *data)
 {
 	pg_data_t *pgdat = data;
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-	unsigned long spfn = 0, epfn = 0;
-	unsigned long first_init_pfn, flags;
+	int max_threads = deferred_page_init_max_threads(cpumask);
+	unsigned long first_init_pfn, last_pfn, flags;
 	unsigned long start = jiffies;
 	struct zone *zone;
-	int max_threads;
-	u64 i = 0;
 
 	/* Bind memory initialisation thread to a local node if possible */
 	if (!cpumask_empty(cpumask))
@@ -2209,24 +2207,20 @@ static int __init deferred_init_memmap(void *data)
 
 	/* Only the highest zone is deferred */
 	zone = pgdat->node_zones + pgdat->nr_zones - 1;
-
-	max_threads = deferred_page_init_max_threads(cpumask);
-
-	while (deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, first_init_pfn)) {
-		first_init_pfn = ALIGN(epfn, PAGES_PER_SECTION);
-		struct padata_mt_job job = {
-			.thread_fn   = deferred_init_memmap_job,
-			.fn_arg      = zone,
-			.start       = spfn,
-			.size        = first_init_pfn - spfn,
-			.align       = PAGES_PER_SECTION,
-			.min_chunk   = PAGES_PER_SECTION,
-			.max_threads = max_threads,
-			.numa_aware  = false,
-		};
-
-		padata_do_multithreaded(&job);
-	}
+	last_pfn = SECTION_ALIGN_UP(zone_end_pfn(zone));
+
+	struct padata_mt_job job = {
+		.thread_fn   = deferred_init_memmap_job,
+		.fn_arg      = zone,
+		.start       = first_init_pfn,
+		.size        = last_pfn - first_init_pfn,
+		.align       = PAGES_PER_SECTION,
+		.min_chunk   = PAGES_PER_SECTION,
+		.max_threads = max_threads,
+		.numa_aware  = false,
+	};
+
+	padata_do_multithreaded(&job);
 
 	/* Sanity check that the next zone really is unpopulated */
 	WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone));
-- 
2.50.1
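
For readers unfamiliar with padata, here is a rough userspace sketch of what a
single job covering the whole deferred span of the zone means: the
[start, start + size) range gets split into section-aligned chunks that worker
threads pick up. The actual chunking policy lives inside
padata_do_multithreaded() and differs in detail; all numbers below are made up.

#include <stdio.h>

#define PAGES_PER_SECTION	32768UL		/* assumed value */

int main(void)
{
	/* toy zone span, mirroring .start and .size of the job */
	unsigned long first_init_pfn = 4 * PAGES_PER_SECTION;
	unsigned long last_pfn = 36 * PAGES_PER_SECTION;
	unsigned long size = last_pfn - first_init_pfn;
	unsigned long max_threads = 4;

	/*
	 * Split into max_threads pieces, rounded up to a section as the
	 * .align/.min_chunk fields request; padata's real policy is more
	 * involved, this only shows the granularity.
	 */
	unsigned long chunk = size / max_threads;

	chunk = (chunk + PAGES_PER_SECTION - 1) & ~(PAGES_PER_SECTION - 1);

	for (unsigned long start = first_init_pfn; start < last_pfn; start += chunk) {
		unsigned long end = start + chunk < last_pfn ? start + chunk : last_pfn;

		printf("worker chunk: pfn [%lu, %lu)\n", start, end);
	}
	return 0;
}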




* [PATCH 3/4] mm/mm_init: drop deferred_init_maxorder()
  2025-08-18  6:46 [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages Mike Rapoport
  2025-08-18  6:46 ` [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone() Mike Rapoport
  2025-08-18  6:46 ` [PATCH 2/4] mm/mm_init: deferred_init_memmap: use a job per zone Mike Rapoport
@ 2025-08-18  6:46 ` Mike Rapoport
  2025-08-19  7:54   ` David Hildenbrand
  2025-08-18  6:46 ` [PATCH 4/4] memblock: drop for_each_free_mem_pfn_range_in_zone_from() Mike Rapoport
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 19+ messages in thread
From: Mike Rapoport @ 2025-08-18  6:46 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrew Morton, Bill Wendling, Daniel Jordan, Justin Stitt,
	Michael Ellerman, Miguel Ojeda, Mike Rapoport, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

deferred_init_memmap_chunk() calls deferred_init_maxorder() to initialize
struct pages in MAX_ORDER_NR_PAGES chunks because, according to commit
0e56acae4b4d ("mm: initialize MAX_ORDER_NR_PAGES at a time instead of doing
larger sections"), this provides better cache locality than initializing the
memory map in larger sections.

The looping through free memory ranges is quite cumbersome in the current
implementation as it is divided between deferred_init_memmap_chunk() and
deferred_init_maxorder(). Besides, the latter has two loops, one that
initializes struct pages and another one that frees them.

There is no need for two loops because it is safe to free pages in groups
smaller than MAX_ORDER_NR_PAGES. Even if the lookup for a buddy page
accesses a struct page ahead of the pages being initialized, that page is
guaranteed to be initialized either by memmap_init_reserved_pages() or by
init_unavailable_range().

Simplify the code by moving initialization and freeing of the pages into
deferred_init_memmap_chunk() and dropping deferred_init_maxorder().

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 mm/mm_init.c | 122 ++++++++++++---------------------------------------
 1 file changed, 29 insertions(+), 93 deletions(-)

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 1ecfba98ddbe..bca05891cb16 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2046,111 +2046,47 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
 }
 
 /*
- * This function is meant to pre-load the iterator for the zone init from
- * a given point.
- * Specifically it walks through the ranges starting with initial index
- * passed to it until we are caught up to the first_init_pfn value and
- * exits there. If we never encounter the value we return false indicating
- * there are no valid ranges left.
- */
-static bool __init
-deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
-				    unsigned long *spfn, unsigned long *epfn,
-				    unsigned long first_init_pfn)
-{
-	u64 j = *i;
-
-	if (j == 0)
-		__next_mem_pfn_range_in_zone(&j, zone, spfn, epfn);
-
-	/*
-	 * Start out by walking through the ranges in this zone that have
-	 * already been initialized. We don't need to do anything with them
-	 * so we just need to flush them out of the system.
-	 */
-	for_each_free_mem_pfn_range_in_zone_from(j, zone, spfn, epfn) {
-		if (*epfn <= first_init_pfn)
-			continue;
-		if (*spfn < first_init_pfn)
-			*spfn = first_init_pfn;
-		*i = j;
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, then free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages.
+ *
+ * At this point reserved pages and struct pages that correspond to holes in
+ * memblock.memory are already initialized so every free range has a valid
+ * memory map around it.
+ * This ensures that access of pages that are ahead of the range being
+ * initialized (computing buddy page in __free_one_page()) always reads a valid
+ * struct page.
  *
- * In order to try and keep some memory in the cache we have the loop
- * broken along max page order boundaries. This way we will not cause
- * any issues with the buddy page computation.
+ * In order to try and improve CPU cache locality we have the loop broken along
+ * max page order boundaries.
  */
 static unsigned long __init
-deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
-		       unsigned long *end_pfn)
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+			   struct zone *zone)
 {
-	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
-	unsigned long spfn = *start_pfn, epfn = *end_pfn;
+	int nid = zone_to_nid(zone);
 	unsigned long nr_pages = 0;
-	u64 j = *i;
-
-	/* First we loop through and initialize the page values */
-	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
-		unsigned long t;
-
-		if (mo_pfn <= *start_pfn)
-			break;
-
-		t = min(mo_pfn, *end_pfn);
-		nr_pages += deferred_init_pages(zone, *start_pfn, t);
-
-		if (mo_pfn < *end_pfn) {
-			*start_pfn = mo_pfn;
-			break;
-		}
-	}
-
-	/* Reset values and now loop through freeing pages as needed */
-	swap(j, *i);
-
-	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
-		unsigned long t;
-
-		if (mo_pfn <= spfn)
-			break;
+	phys_addr_t start, end;
+	u64 i = 0;
 
-		t = min(mo_pfn, epfn);
-		deferred_free_pages(spfn, t - spfn);
+	for_each_free_mem_range(i, nid, 0, &start, &end, NULL) {
+		unsigned long spfn = PFN_UP(start);
+		unsigned long epfn = PFN_DOWN(end);
 
-		if (mo_pfn <= epfn)
+		if (spfn >= end_pfn)
 			break;
-	}
 
-	return nr_pages;
-}
+		spfn = max(spfn, start_pfn);
+		epfn = min(epfn, end_pfn);
 
-static unsigned long __init
-deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
-			   struct zone *zone)
-{
-	unsigned long nr_pages = 0;
-	unsigned long spfn, epfn;
-	u64 i = 0;
+		while (spfn < epfn) {
+			unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
+			unsigned long chunk_end = min(mo_pfn, epfn);
 
-	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+			nr_pages += deferred_init_pages(zone, spfn, chunk_end);
+			deferred_free_pages(spfn, chunk_end - spfn);
 
-	/*
-	 * Initialize and free pages in MAX_PAGE_ORDER sized increments so that
-	 * we can avoid introducing any issues with the buddy allocator.
-	 */
-	while (spfn < end_pfn) {
-		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
-		cond_resched();
+			spfn = chunk_end;
+			cond_resched();
+		}
 	}
 
 	return nr_pages;
-- 
2.50.1
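
A minimal userspace model of the merged loop above: it walks a made-up list of
free pfn ranges (standing in for for_each_free_mem_range()), clips each range
to the [start_pfn, end_pfn) chunk and processes it in MAX_ORDER_NR_PAGES
steps. The init/free calls are replaced by a page counter; the constants are
assumptions, not the kernel's values.

#include <stdio.h>

#define MAX_ORDER_NR_PAGES	1024UL		/* assumed value */
#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

struct pfn_range { unsigned long spfn, epfn; };

int main(void)
{
	/* toy free ranges, a stand-in for memblock's free memory */
	struct pfn_range free_ranges[] = {
		{  1000,  5000 },
		{  9000, 20000 },
		{ 30000, 40000 },
	};
	unsigned long start_pfn = 4096, end_pfn = 32768;	/* the chunk */
	unsigned long nr_pages = 0;

	for (unsigned int i = 0; i < sizeof(free_ranges) / sizeof(free_ranges[0]); i++) {
		unsigned long spfn = free_ranges[i].spfn;
		unsigned long epfn = free_ranges[i].epfn;

		if (spfn >= end_pfn)
			break;

		/* clip the free range to the chunk boundaries */
		spfn = spfn > start_pfn ? spfn : start_pfn;
		epfn = epfn < end_pfn ? epfn : end_pfn;

		/* init and free in MAX_ORDER_NR_PAGES sized pieces */
		while (spfn < epfn) {
			unsigned long mo_pfn = ALIGN_UP(spfn + 1, MAX_ORDER_NR_PAGES);
			unsigned long chunk_end = mo_pfn < epfn ? mo_pfn : epfn;

			nr_pages += chunk_end - spfn;	/* would init + free here */
			spfn = chunk_end;
		}
	}
	printf("pages handled in this chunk: %lu\n", nr_pages);
	return 0;
}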




* [PATCH 4/4] memblock: drop for_each_free_mem_pfn_range_in_zone_from()
  2025-08-18  6:46 [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages Mike Rapoport
                   ` (2 preceding siblings ...)
  2025-08-18  6:46 ` [PATCH 3/4] mm/mm_init: drop deferred_init_maxorder() Mike Rapoport
@ 2025-08-18  6:46 ` Mike Rapoport
  2025-08-19  7:39 ` [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages Wei Yang
  2025-08-22  5:54 ` Mike Rapoport
  5 siblings, 0 replies; 19+ messages in thread
From: Mike Rapoport @ 2025-08-18  6:46 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrew Morton, Bill Wendling, Daniel Jordan, Justin Stitt,
	Michael Ellerman, Miguel Ojeda, Mike Rapoport, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

for_each_free_mem_pfn_range_in_zone_from() and its "backend" implementation
__next_mem_pfn_range_in_zone() were only used by the deferred initialization of
the memory map.

Remove them as they are not used anymore.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 .clang-format            |  1 -
 include/linux/memblock.h | 22 --------------
 mm/memblock.c            | 64 ----------------------------------------
 3 files changed, 87 deletions(-)

diff --git a/.clang-format b/.clang-format
index 48405c54ef27..f371a13b4d19 100644
--- a/.clang-format
+++ b/.clang-format
@@ -294,7 +294,6 @@ ForEachMacros:
   - 'for_each_fib6_node_rt_rcu'
   - 'for_each_fib6_walker_rt'
   - 'for_each_file_lock'
-  - 'for_each_free_mem_pfn_range_in_zone_from'
   - 'for_each_free_mem_range'
   - 'for_each_free_mem_range_reverse'
   - 'for_each_func_rsrc'
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index b96746376e17..20b61e910f4d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -323,28 +323,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
 	for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
 	     i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
 
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
-				  unsigned long *out_spfn,
-				  unsigned long *out_epfn);
-
-/**
- * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific
- * free memblock areas from a given point
- * @i: u64 used as loop variable
- * @zone: zone in which all of the memory blocks reside
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- *
- * Walks over free (memory && !reserved) areas of memblock in a specific
- * zone, continuing from current position. Available as soon as memblock is
- * initialized.
- */
-#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
-	for (; i != U64_MAX;					  \
-	     __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
-
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 /**
  * for_each_free_mem_range - iterate through free memblock areas
diff --git a/mm/memblock.c b/mm/memblock.c
index 154f1d73b61f..337c025109fa 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1438,70 +1438,6 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
 	return 0;
 }
 
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-/**
- * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
- *
- * @idx: pointer to u64 loop variable
- * @zone: zone in which all of the memory blocks reside
- * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
- * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
- *
- * This function is meant to be a zone/pfn specific wrapper for the
- * for_each_mem_range type iterators. Specifically they are used in the
- * deferred memory init routines and as such we were duplicating much of
- * this logic throughout the code. So instead of having it in multiple
- * locations it seemed like it would make more sense to centralize this to
- * one new iterator that does everything they need.
- */
-void __init_memblock
-__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
-			     unsigned long *out_spfn, unsigned long *out_epfn)
-{
-	int zone_nid = zone_to_nid(zone);
-	phys_addr_t spa, epa;
-
-	__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
-			 &memblock.memory, &memblock.reserved,
-			 &spa, &epa, NULL);
-
-	while (*idx != U64_MAX) {
-		unsigned long epfn = PFN_DOWN(epa);
-		unsigned long spfn = PFN_UP(spa);
-
-		/*
-		 * Verify the end is at least past the start of the zone and
-		 * that we have at least one PFN to initialize.
-		 */
-		if (zone->zone_start_pfn < epfn && spfn < epfn) {
-			/* if we went too far just stop searching */
-			if (zone_end_pfn(zone) <= spfn) {
-				*idx = U64_MAX;
-				break;
-			}
-
-			if (out_spfn)
-				*out_spfn = max(zone->zone_start_pfn, spfn);
-			if (out_epfn)
-				*out_epfn = min(zone_end_pfn(zone), epfn);
-
-			return;
-		}
-
-		__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
-				 &memblock.memory, &memblock.reserved,
-				 &spa, &epa, NULL);
-	}
-
-	/* signal end of iteration */
-	if (out_spfn)
-		*out_spfn = ULONG_MAX;
-	if (out_epfn)
-		*out_epfn = 0;
-}
-
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-
 /**
  * memblock_alloc_range_nid - allocate boot memory block
  * @size: size of memory block to be allocated in bytes
-- 
2.50.1




* Re: [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages
  2025-08-18  6:46 [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages Mike Rapoport
                   ` (3 preceding siblings ...)
  2025-08-18  6:46 ` [PATCH 4/4] memblock: drop for_each_free_mem_pfn_range_in_zone_from() Mike Rapoport
@ 2025-08-19  7:39 ` Wei Yang
  2025-08-19 10:41   ` Mike Rapoport
  2025-08-22  5:54 ` Mike Rapoport
  5 siblings, 1 reply; 19+ messages in thread
From: Wei Yang @ 2025-08-19  7:39 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-mm, Andrew Morton, Bill Wendling, Daniel Jordan,
	Justin Stitt, Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

On Mon, Aug 18, 2025 at 09:46:11AM +0300, Mike Rapoport wrote:
>From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
>Hi,
>
>These patches simplify deferred initialization of the memory map.
>
>Beside nice negative diffstat I measured 3ms reduction in the
>initialization of deferred pages on single node system with 64GiB of RAM.

Nice cleanup.

For this series:

Reviewed-by: Wei Yang <richard.weiyang@gmail.com>

I guess the speed up is from "use a job per zone". So we do initialization per
zone instead of per memblock range in the zone, right?

>
>I don't have access to large memory machines, so I'd really appreciate
>testing of these patches on them to make sure there's no regression there.
>
>The patches are also available at git:
>https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=deferred-memmap-init/v1
>
>Mike Rapoport (Microsoft) (4):
>  mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
>  mm/mm_init: deferred_init_memmap: use a job per zone
>  mm/mm_init: drop deferred_init_maxorder()
>  memblock: drop for_each_free_mem_pfn_range_in_zone_from()
>
> .clang-format            |   1 -
> include/linux/memblock.h |  22 -----
> mm/memblock.c            |  64 -------------
> mm/mm_init.c             | 195 +++++++++++++--------------------------
> 4 files changed, 62 insertions(+), 220 deletions(-)
>
>
>base-commit: 8f5ae30d69d7543eee0d70083daf4de8fe15d585
>-- 
>2.50.1
>

-- 
Wei Yang
Help you, Help me



* Re: [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
  2025-08-18  6:46 ` [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone() Mike Rapoport
@ 2025-08-19  7:44   ` David Hildenbrand
  2025-08-19  9:52   ` Wei Yang
  1 sibling, 0 replies; 19+ messages in thread
From: David Hildenbrand @ 2025-08-19  7:44 UTC (permalink / raw)
  To: Mike Rapoport, linux-mm
  Cc: Andrew Morton, Bill Wendling, Daniel Jordan, Justin Stitt,
	Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

On 18.08.25 08:46, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> 
> deferred_grow_zone() initializes one or more sections in the memory map
> if buddy runs out of initialized struct pages when
> CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
> 
> It loops through memblock regions and initializes and frees pages in
> MAX_ORDER_NR_PAGES chunks.
> 
> Essentially the same loop is implemented in deferred_init_memmap_chunk(),
> the only actual difference is that deferred_init_memmap_chunk() does not
> count initialized pages.
> 
> Make deferred_init_memmap_chunk() count the initialized pages and return
> their number, wrap it with deferred_init_memmap_job() for multithreaded
> initialization with padata_do_multithreaded() and replace open-coded
> initialization of struct pages in deferred_grow_zone() with a call to
> deferred_init_memmap_chunk().
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>

Reviewed-by: David Hildenbrand <david@redhat.com>

-- 
Cheers

David / dhildenb




* Re: [PATCH 2/4] mm/mm_init: deferred_init_memmap: use a job per zone
  2025-08-18  6:46 ` [PATCH 2/4] mm/mm_init: deferred_init_memmap: use a job per zone Mike Rapoport
@ 2025-08-19  7:45   ` David Hildenbrand
  0 siblings, 0 replies; 19+ messages in thread
From: David Hildenbrand @ 2025-08-19  7:45 UTC (permalink / raw)
  To: Mike Rapoport, linux-mm
  Cc: Andrew Morton, Bill Wendling, Daniel Jordan, Justin Stitt,
	Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

On 18.08.25 08:46, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> 
> deferred_init_memmap() loops over free memory ranges and creates a
> padata_mt_job for every free range that intersects with the zone being
> initialized.
> 
> padata_do_multithreaded() then splits every such range to several chunks
> and runs a thread that initializes struct pages in that chunk using
> deferred_init_memmap_chunk(). The number of threads is limited by amount of
> the CPUs on the node (or 1 for memoryless nodes).
> 
> Looping through free memory ranges is then repeated in
> deferred_init_memmap_chunk() first to find the first range that should be
> initialized and then to traverse the ranges until the end of the chunk is
> reached.
> 
> Remove the loop over free memory regions in deferred_init_memmap() and pass
> the entire zone to padata_do_multithreaded() so that it will be divided to
> several chunks by the parallelization code.
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---

Reviewed-by: David Hildenbrand <david@redhat.com>

-- 
Cheers

David / dhildenb




* Re: [PATCH 3/4] mm/mm_init: drop deferred_init_maxorder()
  2025-08-18  6:46 ` [PATCH 3/4] mm/mm_init: drop deferred_init_maxorder() Mike Rapoport
@ 2025-08-19  7:54   ` David Hildenbrand
  2025-08-19  9:22     ` Wei Yang
  0 siblings, 1 reply; 19+ messages in thread
From: David Hildenbrand @ 2025-08-19  7:54 UTC (permalink / raw)
  To: Mike Rapoport, linux-mm
  Cc: Andrew Morton, Bill Wendling, Daniel Jordan, Justin Stitt,
	Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

>   
> -static unsigned long __init
> -deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
> -			   struct zone *zone)
> -{
> -	unsigned long nr_pages = 0;
> -	unsigned long spfn, epfn;
> -	u64 i = 0;
> +		while (spfn < epfn) {
> +			unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
> +			unsigned long chunk_end = min(mo_pfn, epfn);
>   
> -	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
> +			nr_pages += deferred_init_pages(zone, spfn, chunk_end);
> +			deferred_free_pages(spfn, chunk_end - spfn);


I assume the expectation is that all PFNs in the start_pfn -> end_pfn 
range will go to this zone, correct?

-- 
Cheers

David / dhildenb




* Re: [PATCH 3/4] mm/mm_init: drop deferred_init_maxorder()
  2025-08-19  7:54   ` David Hildenbrand
@ 2025-08-19  9:22     ` Wei Yang
  2025-08-19 10:39       ` Mike Rapoport
  0 siblings, 1 reply; 19+ messages in thread
From: Wei Yang @ 2025-08-19  9:22 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Mike Rapoport, linux-mm, Andrew Morton, Bill Wendling,
	Daniel Jordan, Justin Stitt, Michael Ellerman, Miguel Ojeda,
	Nathan Chancellor, Nick Desaulniers, linux-kernel, llvm

On Tue, Aug 19, 2025 at 09:54:22AM +0200, David Hildenbrand wrote:
>> -static unsigned long __init
>> -deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
>> -			   struct zone *zone)
>> -{
>> -	unsigned long nr_pages = 0;
>> -	unsigned long spfn, epfn;
>> -	u64 i = 0;
>> +		while (spfn < epfn) {
>> +			unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
>> +			unsigned long chunk_end = min(mo_pfn, epfn);
>> -	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
>> +			nr_pages += deferred_init_pages(zone, spfn, chunk_end);
>> +			deferred_free_pages(spfn, chunk_end - spfn);
>
>
>I assume the expectation is that all PFNs in the start_pfn -> end_pfn range
>will go to this zone, correct?

I think so.

defer_init only applies to the highest zone in one node.

>
>-- 
>Cheers
>
>David / dhildenb
>

-- 
Wei Yang
Help you, Help me



* Re: [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
  2025-08-18  6:46 ` [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone() Mike Rapoport
  2025-08-19  7:44   ` David Hildenbrand
@ 2025-08-19  9:52   ` Wei Yang
  2025-08-19 10:54     ` Mike Rapoport
  1 sibling, 1 reply; 19+ messages in thread
From: Wei Yang @ 2025-08-19  9:52 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-mm, Andrew Morton, Bill Wendling, Daniel Jordan,
	Justin Stitt, Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

Hi, Mike

After going through the code again, I have some trivial thoughts to discuss
with you. If not right, please let me know.

On Mon, Aug 18, 2025 at 09:46:12AM +0300, Mike Rapoport wrote:
[...]
> bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
> {
>-	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
>+	unsigned long nr_pages_needed = SECTION_ALIGN_UP(1 << order);
> 	pg_data_t *pgdat = zone->zone_pgdat;
> 	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
> 	unsigned long spfn, epfn, flags;
> 	unsigned long nr_pages = 0;
>-	u64 i = 0;
> 
> 	/* Only the last zone may have deferred pages */
> 	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
>@@ -2262,37 +2272,26 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
> 		return true;
> 	}

In the code above this line, there is a comparison between first_deferred_pfn
and its original value after grabbing pgdat_resize_lock.

I am thinking of comparing first_deferred_pfn with ULONG_MAX, as it is compared
in deferred_init_memmap(). This would indicate the zone has already been fully
initialized.

The current code guards this with spfn < zone_end_pfn(zone). Maybe a check
ahead of the loop would be clearer?

> 
>-	/* If the zone is empty somebody else may have cleared out the zone */
>-	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
>-						 first_deferred_pfn)) {
>-		pgdat->first_deferred_pfn = ULONG_MAX;
>-		pgdat_resize_unlock(pgdat, &flags);
>-		/* Retry only once. */
>-		return first_deferred_pfn != ULONG_MAX;
>+	/*
>+	 * Initialize at least nr_pages_needed in section chunks.
>+	 * If a section has less free memory than nr_pages_needed, the next
>+	 * section will be also initalized.
>+	 * Note, that it still does not guarantee that allocation of order can
>+	 * be satisfied if the sections are fragmented because of memblock
>+	 * allocations.
>+	 */
>+	for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1);

I am expecting first_deferred_pfn to be section aligned, so
epfn += PAGES_PER_SECTION would be fine?

Maybe I missed something.

>+	     nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone);
>+	     spfn = epfn, epfn += PAGES_PER_SECTION) {
>+		nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone);
> 	}
> 
> 	/*
>-	 * Initialize and free pages in MAX_PAGE_ORDER sized increments so
>-	 * that we can avoid introducing any issues with the buddy
>-	 * allocator.
>+	 * There were no pages to initialize and free which means the zone's
>+	 * memory map is completely initialized.
> 	 */
>-	while (spfn < epfn) {
>-		/* update our first deferred PFN for this section */
>-		first_deferred_pfn = spfn;
>-
>-		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
>-		touch_nmi_watchdog();
>-
>-		/* We should only stop along section boundaries */
>-		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
>-			continue;
>-
>-		/* If our quota has been met we can stop here */
>-		if (nr_pages >= nr_pages_needed)
>-			break;
>-	}
>+	pgdat->first_deferred_pfn = nr_pages ? spfn : ULONG_MAX;

If we come here because spfn >= zone_end_pfn(zone), first_deferred_pfn is left
with a "valid" value and deferred_init_memmap() will try to do its job, but
actually nothing is left to initialize.

For this case, I suggest setting it to ULONG_MAX too. But this is really a
corner case.

> 
>-	pgdat->first_deferred_pfn = spfn;
> 	pgdat_resize_unlock(pgdat, &flags);
> 
> 	return nr_pages > 0;
>-- 
>2.50.1
>

-- 
Wei Yang
Help you, Help me



* Re: [PATCH 3/4] mm/mm_init: drop deferred_init_maxorder()
  2025-08-19  9:22     ` Wei Yang
@ 2025-08-19 10:39       ` Mike Rapoport
  2025-08-19 12:31         ` David Hildenbrand
  0 siblings, 1 reply; 19+ messages in thread
From: Mike Rapoport @ 2025-08-19 10:39 UTC (permalink / raw)
  To: Wei Yang
  Cc: David Hildenbrand, linux-mm, Andrew Morton, Bill Wendling,
	Daniel Jordan, Justin Stitt, Michael Ellerman, Miguel Ojeda,
	Nathan Chancellor, Nick Desaulniers, linux-kernel, llvm

On Tue, Aug 19, 2025 at 09:22:54AM +0000, Wei Yang wrote:
> On Tue, Aug 19, 2025 at 09:54:22AM +0200, David Hildenbrand wrote:
> >> -static unsigned long __init
> >> -deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
> >> -			   struct zone *zone)
> >> -{
> >> -	unsigned long nr_pages = 0;
> >> -	unsigned long spfn, epfn;
> >> -	u64 i = 0;
> >> +		while (spfn < epfn) {
> >> +			unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
> >> +			unsigned long chunk_end = min(mo_pfn, epfn);
> >> -	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
> >> +			nr_pages += deferred_init_pages(zone, spfn, chunk_end);
> >> +			deferred_free_pages(spfn, chunk_end - spfn);
> >
> >
> >I assume the expectation is that all PFNs in the start_pfn -> end_pfn range
> >will go to this zone, correct?
> 
> I think so.
> 
> defer_init only apply to the highest zone in one node.

Right, we defer initialization of the last zone in every node and there is a
thread per node that does the initialization.

-- 
Sincerely yours,
Mike.



* Re: [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages
  2025-08-19  7:39 ` [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages Wei Yang
@ 2025-08-19 10:41   ` Mike Rapoport
  0 siblings, 0 replies; 19+ messages in thread
From: Mike Rapoport @ 2025-08-19 10:41 UTC (permalink / raw)
  To: Wei Yang
  Cc: linux-mm, Andrew Morton, Bill Wendling, Daniel Jordan,
	Justin Stitt, Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

On Tue, Aug 19, 2025 at 07:39:41AM +0000, Wei Yang wrote:
> On Mon, Aug 18, 2025 at 09:46:11AM +0300, Mike Rapoport wrote:
> >From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> >
> >Hi,
> >
> >These patches simplify deferred initialization of the memory map.
> >
> >Beside nice negative diffstat I measured 3ms reduction in the
> >initialization of deferred pages on single node system with 64GiB of RAM.
> 
> Nice cleanup.
> 
> For this series:
> 
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>

Thanks!

> I guess the speed up is from "use a job per zone". So we do initialization per
> zone instead of per memblock range in the zone, right?
 
Yes, we run a job per zone instead of a job per memblock range in the zone.

> >I don't have access to large memory machines, so I'd really appreciate
> >testing of these patches on them to make sure there's no regression there.
> >
> >The patches are also available at git:
> >https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=deferred-memmap-init/v1
> >
> >Mike Rapoport (Microsoft) (4):
> >  mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
> >  mm/mm_init: deferred_init_memmap: use a job per zone
> >  mm/mm_init: drop deferred_init_maxorder()
> >  memblock: drop for_each_free_mem_pfn_range_in_zone_from()
> >
> > .clang-format            |   1 -
> > include/linux/memblock.h |  22 -----
> > mm/memblock.c            |  64 -------------
> > mm/mm_init.c             | 195 +++++++++++++--------------------------
> > 4 files changed, 62 insertions(+), 220 deletions(-)
> >
> >
> >base-commit: 8f5ae30d69d7543eee0d70083daf4de8fe15d585
> >-- 
> >2.50.1
> >
> 
> -- 
> Wei Yang
> Help you, Help me

-- 
Sincerely yours,
Mike.



* Re: [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
  2025-08-19  9:52   ` Wei Yang
@ 2025-08-19 10:54     ` Mike Rapoport
  2025-08-19 23:51       ` Wei Yang
  0 siblings, 1 reply; 19+ messages in thread
From: Mike Rapoport @ 2025-08-19 10:54 UTC (permalink / raw)
  To: Wei Yang
  Cc: linux-mm, Andrew Morton, Bill Wendling, Daniel Jordan,
	Justin Stitt, Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

On Tue, Aug 19, 2025 at 09:52:23AM +0000, Wei Yang wrote:
> Hi, Mike
> 
> After going through the code again, I have some trivial thoughts to discuss
> with you. If not right, please let me know.
> 
> On Mon, Aug 18, 2025 at 09:46:12AM +0300, Mike Rapoport wrote:
> [...]
> > bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
> > {
> >-	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
> >+	unsigned long nr_pages_needed = SECTION_ALIGN_UP(1 << order);
> > 	pg_data_t *pgdat = zone->zone_pgdat;
> > 	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
> > 	unsigned long spfn, epfn, flags;
> > 	unsigned long nr_pages = 0;
> >-	u64 i = 0;
> > 
> > 	/* Only the last zone may have deferred pages */
> > 	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
> >@@ -2262,37 +2272,26 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
> > 		return true;
> > 	}
> 
> In the file above this line, there is a compare between first_deferred_pfn and
> its original value after grab pgdat_resize_lock.

Do you mean this one:

	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
		pgdat_resize_unlock(pgdat, &flags);
		return true;
	}
 
> I am thinking to compare first_deferred_pfn with ULONG_MAX, as it compared in
> deferred_init_memmap(). This indicate this zone has already been initialized
> totally.

It may be that another CPU ran deferred_grow_zone() and won the race for the
resize lock. Then pgdat->first_deferred_pfn will be larger than
first_deferred_pfn, but the entire zone still would not be initialized.
 
> Current code guard this by spfn < zone_end_pfn(zone). Maybe a check ahead
> would be more clear?

Not sure I follow you here. The check that we don't pass zone_end_pfn is
inside the loop for every section we initialize.
 
> > 
> >-	/* If the zone is empty somebody else may have cleared out the zone */
> >-	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
> >-						 first_deferred_pfn)) {
> >-		pgdat->first_deferred_pfn = ULONG_MAX;
> >-		pgdat_resize_unlock(pgdat, &flags);
> >-		/* Retry only once. */
> >-		return first_deferred_pfn != ULONG_MAX;
> >+	/*
> >+	 * Initialize at least nr_pages_needed in section chunks.
> >+	 * If a section has less free memory than nr_pages_needed, the next
> >+	 * section will be also initalized.
> >+	 * Note, that it still does not guarantee that allocation of order can
> >+	 * be satisfied if the sections are fragmented because of memblock
> >+	 * allocations.
> >+	 */
> >+	for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1);
> 
> I am expecting first_deferred_pfn is section aligned. So epfn += PAGES_PER_SECTION
> is fine?

It should be, but I'd prefer to be on the safe side and keep it this way.
 
> Maybe I missed something.
> 
> >+	     nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone);
> >+	     spfn = epfn, epfn += PAGES_PER_SECTION) {
> >+		nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone);
> > 	}
> > 
> > 	/*
> >-	 * Initialize and free pages in MAX_PAGE_ORDER sized increments so
> >-	 * that we can avoid introducing any issues with the buddy
> >-	 * allocator.
> >+	 * There were no pages to initialize and free which means the zone's
> >+	 * memory map is completely initialized.
> > 	 */
> >-	while (spfn < epfn) {
> >-		/* update our first deferred PFN for this section */
> >-		first_deferred_pfn = spfn;
> >-
> >-		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> >-		touch_nmi_watchdog();
> >-
> >-		/* We should only stop along section boundaries */
> >-		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
> >-			continue;
> >-
> >-		/* If our quota has been met we can stop here */
> >-		if (nr_pages >= nr_pages_needed)
> >-			break;
> >-	}
> >+	pgdat->first_deferred_pfn = nr_pages ? spfn : ULONG_MAX;
> 
> If we come here because spfn >= zone_end_pfn(zone), first_deferred_pfn is left
> a "valid" value and deferred_init_memmap() will try to do its job. But
> actually nothing left to initialize.

We run a thread for each node with memory anyway. In the very unlikely case
we've completely initialized a deferred zone, that thread will finish much
faster :)
 
> For this case, I suggest to set it ULONG_MAX too. But this is really corner
> case.

-- 
Sincerely yours,
Mike.



* Re: [PATCH 3/4] mm/mm_init: drop deferred_init_maxorder()
  2025-08-19 10:39       ` Mike Rapoport
@ 2025-08-19 12:31         ` David Hildenbrand
  0 siblings, 0 replies; 19+ messages in thread
From: David Hildenbrand @ 2025-08-19 12:31 UTC (permalink / raw)
  To: Mike Rapoport, Wei Yang
  Cc: linux-mm, Andrew Morton, Bill Wendling, Daniel Jordan,
	Justin Stitt, Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

On 19.08.25 12:39, Mike Rapoport wrote:
> On Tue, Aug 19, 2025 at 09:22:54AM +0000, Wei Yang wrote:
>> On Tue, Aug 19, 2025 at 09:54:22AM +0200, David Hildenbrand wrote:
>>>> -static unsigned long __init
>>>> -deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
>>>> -			   struct zone *zone)
>>>> -{
>>>> -	unsigned long nr_pages = 0;
>>>> -	unsigned long spfn, epfn;
>>>> -	u64 i = 0;
>>>> +		while (spfn < epfn) {
>>>> +			unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
>>>> +			unsigned long chunk_end = min(mo_pfn, epfn);
>>>> -	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
>>>> +			nr_pages += deferred_init_pages(zone, spfn, chunk_end);
>>>> +			deferred_free_pages(spfn, chunk_end - spfn);
>>>
>>>
>>> I assume the expectation is that all PFNs in the start_pfn -> end_pfn range
>>> will go to this zone, correct?
>>
>> I think so.
>>
>> defer_init only apply to the highest zone in one node.
> 
> Right, we defer initialization of last zone in every node and there is a
> thread per node that does the initialization.

Thanks, my memory comes back :)

Reviewed-by: David Hildenbrand <david@redhat.com>

-- 
Cheers

David / dhildenb




* Re: [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
  2025-08-19 10:54     ` Mike Rapoport
@ 2025-08-19 23:51       ` Wei Yang
  2025-08-20  9:20         ` Mike Rapoport
  0 siblings, 1 reply; 19+ messages in thread
From: Wei Yang @ 2025-08-19 23:51 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Wei Yang, linux-mm, Andrew Morton, Bill Wendling, Daniel Jordan,
	Justin Stitt, Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

On Tue, Aug 19, 2025 at 01:54:46PM +0300, Mike Rapoport wrote:
>On Tue, Aug 19, 2025 at 09:52:23AM +0000, Wei Yang wrote:
>> Hi, Mike
>> 
>> After going through the code again, I have some trivial thoughts to discuss
>> with you. If not right, please let me know.
>> 
>> On Mon, Aug 18, 2025 at 09:46:12AM +0300, Mike Rapoport wrote:
>> [...]
>> > bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
>> > {
>> >-	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
>> >+	unsigned long nr_pages_needed = SECTION_ALIGN_UP(1 << order);
>> > 	pg_data_t *pgdat = zone->zone_pgdat;
>> > 	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
>> > 	unsigned long spfn, epfn, flags;
>> > 	unsigned long nr_pages = 0;
>> >-	u64 i = 0;
>> > 
>> > 	/* Only the last zone may have deferred pages */
>> > 	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
>> >@@ -2262,37 +2272,26 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
>> > 		return true;
>> > 	}
>> 
>> In the file above this line, there is a compare between first_deferred_pfn and
>> its original value after grab pgdat_resize_lock.
>
>Do you mean this one:
>
>	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
>		pgdat_resize_unlock(pgdat, &flags);
>		return true;
>	}
> 

Yes.

I am thinking something like this:

 	if (first_deferred_pfn != pgdat->first_deferred_pfn || 
	    first_deferred_pfn == ULONG_MAX)

This means

  * someone else has grown the zone before we grab the lock
  * or the whole zone has already been initialized

>> I am thinking to compare first_deferred_pfn with ULONG_MAX, as it compared in
>> deferred_init_memmap(). This indicate this zone has already been initialized
>> totally.
>
>It may be another CPU ran deferred_grow_zone() and won the race for resize
>lock. Then pgdat->first_deferred_pfn will be larger than
>first_deferred_pfn, but still not entire zone would be initialized.
> 
>> Current code guard this by spfn < zone_end_pfn(zone). Maybe a check ahead
>> would be more clear?
>
>Not sure I follow you here. The check that we don't pass zone_end_pfn is
>inside the loop for every section we initialize.
> 

In case the zone has been fully initialized, first_deferred_pfn = ULONG_MAX.

Then we come to the loop with the initial state:

    spfn = ULONG_MAX
    epfn = 0 (which is a wrap-around)

And the loop condition check (spfn < zone_end_pfn(zone)) is false, so the loop
is skipped. This is how we handle a fully initialized zone now.

Would this be a little uncommon?

>> > 
>> >-	/* If the zone is empty somebody else may have cleared out the zone */
>> >-	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
>> >-						 first_deferred_pfn)) {
>> >-		pgdat->first_deferred_pfn = ULONG_MAX;
>> >-		pgdat_resize_unlock(pgdat, &flags);
>> >-		/* Retry only once. */
>> >-		return first_deferred_pfn != ULONG_MAX;
>> >+	/*
>> >+	 * Initialize at least nr_pages_needed in section chunks.
>> >+	 * If a section has less free memory than nr_pages_needed, the next
>> >+	 * section will be also initalized.

Nit, one typo here. s/initalized/initialized/

>> >+	 * Note, that it still does not guarantee that allocation of order can
>> >+	 * be satisfied if the sections are fragmented because of memblock
>> >+	 * allocations.
>> >+	 */
>> >+	for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1);

-- 
Wei Yang
Help you, Help me
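
The wrap-around described above is easy to reproduce in a few lines of
userspace C, assuming SECTION_ALIGN_UP() rounds up to a multiple of
PAGES_PER_SECTION (the values below are made up):

#include <stdio.h>
#include <limits.h>

#define PAGES_PER_SECTION	32768UL	/* assumed value */
#define SECTION_ALIGN_UP(x)	(((x) + PAGES_PER_SECTION - 1) & ~(PAGES_PER_SECTION - 1))

int main(void)
{
	unsigned long first_deferred_pfn = ULONG_MAX;	/* fully initialized zone */
	unsigned long zone_end_pfn = 8 * PAGES_PER_SECTION;

	unsigned long spfn = first_deferred_pfn;
	unsigned long epfn = SECTION_ALIGN_UP(spfn + 1);	/* ULONG_MAX + 1 wraps to 0 */

	printf("spfn=%lu epfn=%lu\n", spfn, epfn);
	printf("loop runs: %s\n", spfn < zone_end_pfn ? "yes" : "no");
	return 0;
}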



* Re: [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
  2025-08-19 23:51       ` Wei Yang
@ 2025-08-20  9:20         ` Mike Rapoport
  2025-08-20 12:42           ` Wei Yang
  0 siblings, 1 reply; 19+ messages in thread
From: Mike Rapoport @ 2025-08-20  9:20 UTC (permalink / raw)
  To: Wei Yang
  Cc: linux-mm, Andrew Morton, Bill Wendling, Daniel Jordan,
	Justin Stitt, Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

On Tue, Aug 19, 2025 at 11:51:58PM +0000, Wei Yang wrote:
> On Tue, Aug 19, 2025 at 01:54:46PM +0300, Mike Rapoport wrote:
> >On Tue, Aug 19, 2025 at 09:52:23AM +0000, Wei Yang wrote:
> >> Hi, Mike
> >> 
> >> After going through the code again, I have some trivial thoughts to discuss
> >> with you. If not right, please let me know.
> >> 
> >> On Mon, Aug 18, 2025 at 09:46:12AM +0300, Mike Rapoport wrote:
> >> 
> >> In the file above this line, there is a compare between first_deferred_pfn and
> >> its original value after grab pgdat_resize_lock.
> >
> >Do you mean this one:
> >
> >	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
> >		pgdat_resize_unlock(pgdat, &flags);
> >		return true;
> >	}
> > 
> 
> Yes.
> 
> I am thinking something like this:
> 
>  	if (first_deferred_pfn != pgdat->first_deferred_pfn || 
> 	    first_deferred_pfn == ULONG_MAX)
> 
> This means
> 
>   * someone else has grow zone before we grab the lock
>   * or the whole zone has already been initialized

deferred_grow_zone() can be called only before deferred_init_memmap(), so
it's very unlikely that a zone will be completely initialized here. We
start with at least one section in each deferred zone and every call to
deferred_grow_zone() adds a section.

And even if that were the case and first_deferred_pfn were ULONG_MAX, the loop
below would end immediately, so I don't think an additional condition here
would be helpful.
 
> >> I am thinking to compare first_deferred_pfn with ULONG_MAX, as it compared in
> >> deferred_init_memmap(). This indicate this zone has already been initialized
> >> totally.
> >
> >It may be another CPU ran deferred_grow_zone() and won the race for resize
> >lock. Then pgdat->first_deferred_pfn will be larger than
> >first_deferred_pfn, but still not entire zone would be initialized.
> > 
> >> Current code guard this by spfn < zone_end_pfn(zone). Maybe a check ahead
> >> would be more clear?
> >
> >Not sure I follow you here. The check that we don't pass zone_end_pfn is
> >inside the loop for every section we initialize.
> > 
> 
> In case the zone has been initialized totally, first_deferred_pfn = ULONG_MAX.
> 
> Then we come to the loop with initial state:
> 
>     spfn = ULONG_MAX
>     epfn = 0 (which is wrap around)
> 
> And loop condition check (spfn < zone_end_pfn(zone)) is false, so the loop is
> skipped. This is how we handle a fully initialized zone now.
> 
> Would this be a little un-common?

Why? The important thing is (spfn < zone_end_pfn(zone)) is false, and I
think that's good enough.
 
> >> > 
> >> >-	/* If the zone is empty somebody else may have cleared out the zone */
> >> >-	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
> >> >-						 first_deferred_pfn)) {
> >> >-		pgdat->first_deferred_pfn = ULONG_MAX;
> >> >-		pgdat_resize_unlock(pgdat, &flags);
> >> >-		/* Retry only once. */
> >> >-		return first_deferred_pfn != ULONG_MAX;
> >> >+	/*
> >> >+	 * Initialize at least nr_pages_needed in section chunks.
> >> >+	 * If a section has less free memory than nr_pages_needed, the next
> >> >+	 * section will be also initalized.
> 
> Nit, one typo here. s/initalized/initialized/

Thanks, will fix.
 
-- 
Sincerely yours,
Mike.



* Re: [PATCH 1/4] mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
  2025-08-20  9:20         ` Mike Rapoport
@ 2025-08-20 12:42           ` Wei Yang
  0 siblings, 0 replies; 19+ messages in thread
From: Wei Yang @ 2025-08-20 12:42 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Wei Yang, linux-mm, Andrew Morton, Bill Wendling, Daniel Jordan,
	Justin Stitt, Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, linux-kernel, llvm

On Wed, Aug 20, 2025 at 12:20:10PM +0300, Mike Rapoport wrote:
>On Tue, Aug 19, 2025 at 11:51:58PM +0000, Wei Yang wrote:
>> On Tue, Aug 19, 2025 at 01:54:46PM +0300, Mike Rapoport wrote:
>> >On Tue, Aug 19, 2025 at 09:52:23AM +0000, Wei Yang wrote:
>> >> Hi, Mike
>> >> 
>> >> After going through the code again, I have some trivial thoughts to discuss
>> >> with you. If not right, please let me know.
>> >> 
>> >> On Mon, Aug 18, 2025 at 09:46:12AM +0300, Mike Rapoport wrote:
>> >> 
>> >> In the file above this line, there is a compare between first_deferred_pfn and
>> >> its original value after grab pgdat_resize_lock.
>> >
>> >Do you mean this one:
>> >
>> >	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
>> >		pgdat_resize_unlock(pgdat, &flags);
>> >		return true;
>> >	}
>> > 
>> 
>> Yes.
>> 
>> I am thinking something like this:
>> 
>>  	if (first_deferred_pfn != pgdat->first_deferred_pfn || 
>> 	    first_deferred_pfn == ULONG_MAX)
>> 
>> This means
>> 
>>   * someone else has grow zone before we grab the lock
>>   * or the whole zone has already been initialized
>
>deferred_grow_zone() can be called only before deferred_init_memmap(), so
>it's very unlikely that a zone will be completely initialized here. We
>start with at least one section with each deferred zone and every call to
>deferred_grow_zone() adds a section.
>
>And even if that was a case and first_deferred_pfn is ULONG_MAX, the loop
>below will end immediately, so I don't think additional condition here
>would be helpful.
> 

I think you are right.

>> >> I am thinking to compare first_deferred_pfn with ULONG_MAX, as it compared in
>> >> deferred_init_memmap(). This indicate this zone has already been initialized
>> >> totally.
>> >
>> >It may be another CPU ran deferred_grow_zone() and won the race for resize
>> >lock. Then pgdat->first_deferred_pfn will be larger than
>> >first_deferred_pfn, but still not entire zone would be initialized.
>> > 
>> >> Current code guard this by spfn < zone_end_pfn(zone). Maybe a check ahead
>> >> would be more clear?
>> >
>> >Not sure I follow you here. The check that we don't pass zone_end_pfn is
>> >inside the loop for every section we initialize.
>> > 
>> 
>> In case the zone has been initialized totally, first_deferred_pfn = ULONG_MAX.
>> 
>> Then we come to the loop with initial state:
>> 
>>     spfn = ULONG_MAX
>>     epfn = 0 (which is wrap around)
>> 
>> And loop condition check (spfn < zone_end_pfn(zone)) is false, so the loop is
>> skipped. This is how we handle a fully initialized zone now.
>> 
>> Would this be a little un-common?
>
>Why? The important thing is (spfn < zone_end_pfn(zone)) is false, and I
>think that's good enough.
> 

Well, nothing else from me.

>> >> > 
>> >> >-	/* If the zone is empty somebody else may have cleared out the zone */
>> >> >-	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
>> >> >-						 first_deferred_pfn)) {
>> >> >-		pgdat->first_deferred_pfn = ULONG_MAX;
>> >> >-		pgdat_resize_unlock(pgdat, &flags);
>> >> >-		/* Retry only once. */
>> >> >-		return first_deferred_pfn != ULONG_MAX;
>> >> >+	/*
>> >> >+	 * Initialize at least nr_pages_needed in section chunks.
>> >> >+	 * If a section has less free memory than nr_pages_needed, the next
>> >> >+	 * section will be also initalized.
>> 
>> Nit, one typo here. s/initalized/initialized/
>
>Thanks, will fix.
> 
>-- 
>Sincerely yours,
>Mike.

-- 
Wei Yang
Help you, Help me



* Re: [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages
  2025-08-18  6:46 [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages Mike Rapoport
                   ` (4 preceding siblings ...)
  2025-08-19  7:39 ` [PATCH 0/4] mm/mm_init: simplify deferred init of struct pages Wei Yang
@ 2025-08-22  5:54 ` Mike Rapoport
  5 siblings, 0 replies; 19+ messages in thread
From: Mike Rapoport @ 2025-08-22  5:54 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrew Morton, Bill Wendling, Daniel Jordan, Justin Stitt,
	Michael Ellerman, Miguel Ojeda, Nathan Chancellor,
	Nick Desaulniers, Wei Yang, linux-kernel, llvm

On Mon, Aug 18, 2025 at 09:46:11AM +0300, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> 
> Hi,
> 
> These patches simplify deferred initialization of the memory map.
> 
> Beside nice negative diffstat I measured 3ms reduction in the
> initialization of deferred pages on single node system with 64GiB of RAM.
> 
> I don't have access to large memory machines, so I'd really appreciate
> testing of these patches on them to make sure there's no regression there.
> 
> The patches are also available at git:
> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=deferred-memmap-init/v1
> 
> Mike Rapoport (Microsoft) (4):
>   mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
>   mm/mm_init: deferred_init_memmap: use a job per zone
>   mm/mm_init: drop deferred_init_maxorder()
>   memblock: drop for_each_free_mem_pfn_range_in_zone_from()

I've added those to the memblock tree
 
>  .clang-format            |   1 -
>  include/linux/memblock.h |  22 -----
>  mm/memblock.c            |  64 -------------
>  mm/mm_init.c             | 195 +++++++++++++--------------------------
>  4 files changed, 62 insertions(+), 220 deletions(-)
> 
> 
> base-commit: 8f5ae30d69d7543eee0d70083daf4de8fe15d585
> -- 
> 2.50.1
> 

-- 
Sincerely yours,
Mike.

