From: Mike Rapoport <rppt@kernel.org>
To: Michal Clapinski <mclapinski@google.com>
Cc: Evangelos Petrongonas <epetron@amazon.de>,
	Pasha Tatashin <pasha.tatashin@soleen.com>,
	Pratyush Yadav <pratyush@kernel.org>,
	Alexander Graf <graf@amazon.com>,
	Samiullah Khawaja <skhawaja@google.com>,
	kexec@lists.infradead.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org,
	Andrew Morton <akpm@linux-foundation.org>
Subject: Re: [PATCH v6 1/2] kho: fix deferred init of kho scratch
Date: Thu, 12 Mar 2026 14:50:38 +0200	[thread overview]
Message-ID: <abK2ntdGDpDS35J1@kernel.org>
In-Reply-To: <20260311125539.4123672-2-mclapinski@google.com>

On Wed, Mar 11, 2026 at 01:55:38PM +0100, Michal Clapinski wrote:
> Currently, if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled,
> kho_release_scratch will initialize the struct pages and set the
> migratetype of the KHO scratch area. Unless the whole scratch area fits
> below first_deferred_pfn, some of that work will be overwritten either
> by deferred_init_pages or memmap_init_reserved_pages.
> 
> To fix this, I initialize the KHO scratch memory map early and modify
> every other path to leave the scratch alone.
> 
> In detail:
> 1. Modify deferred_init_memmap_chunk to not initialize KHO scratch,
> since it was already initialized. Then, modify deferred_free_pages to
> not set the migratetype. Also modify reserve_bootmem_region to skip
> initializing KHO scratch.
> 
> 2. Since KHO scratch is now not initialized by any other code, we have
> to initialize it ourselves on cold boot as well. On cold boot, memblock
> doesn't mark scratch as scratch, so we also have to modify the
> initialization function to not rely on memblock regions.
> 
> Signed-off-by: Michal Clapinski <mclapinski@google.com>
> ---
> My previous idea of marking scratch as CMA late, after deferred struct
> page init was done, was flawed: allocations can happen before that
> point, and if they land in KHO scratch they become unpreservable.
> That was the case with IOMMU page tables.
> ---
>  include/linux/kexec_handover.h     |  6 +++++
>  include/linux/memblock.h           |  2 --
>  kernel/liveupdate/kexec_handover.c | 35 +++++++++++++++++++++++++++++-
>  mm/memblock.c                      | 22 -------------------
>  mm/mm_init.c                       | 17 ++++++++++-----
>  5 files changed, 52 insertions(+), 30 deletions(-)
> 
> diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
> index ac4129d1d741..612a6da6127a 100644
> --- a/include/linux/kexec_handover.h
> +++ b/include/linux/kexec_handover.h
> @@ -35,6 +35,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation);
>  int kho_add_subtree(const char *name, void *fdt);
>  void kho_remove_subtree(void *fdt);
>  int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
> +bool pfn_is_kho_scratch(unsigned long pfn);

I think we can rely on MEMBLOCK_KHO_SCRATCH and query ranges rather than
individual pfns.

This will also eliminate the need to special-case the scratch memory map
initialization on cold boot.
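
Something along these lines, perhaps (completely untested, and
kho_scratch_range() is just a made-up name; it assumes the scratch
regions are, or get, flagged MEMBLOCK_KHO_SCRATCH in memblock.memory on
both boot paths, the same assumption memmap_init_kho_scratch_pages()
below relies on):

static bool __init kho_scratch_range(phys_addr_t base, phys_addr_t size)
{
	phys_addr_t start, end;
	u64 i;

	/* walk only the regions flagged MEMBLOCK_KHO_SCRATCH */
	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
			     MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
		if (base < end && base + size > start)
			return true;
	}

	return false;
}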

>  void kho_memory_init(void);
>  
> @@ -109,6 +110,11 @@ static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
>  	return -EOPNOTSUPP;
>  }
>  
> +static inline bool pfn_is_kho_scratch(unsigned long pfn)
> +{
> +	return false;
> +}
> +
>  static inline void kho_memory_init(void) { }
>  
>  static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 6ec5e9ac0699..3e217414e12d 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -614,11 +614,9 @@ static inline void memtest_report_meminfo(struct seq_file *m) { }
>  #ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
>  void memblock_set_kho_scratch_only(void);
>  void memblock_clear_kho_scratch_only(void);
> -void memmap_init_kho_scratch_pages(void);
>  #else
>  static inline void memblock_set_kho_scratch_only(void) { }
>  static inline void memblock_clear_kho_scratch_only(void) { }
> -static inline void memmap_init_kho_scratch_pages(void) {}
>  #endif
>  
>  #endif /* _LINUX_MEMBLOCK_H */
> diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c
> index 532f455c5d4f..09cb6660ade7 100644
> --- a/kernel/liveupdate/kexec_handover.c
> +++ b/kernel/liveupdate/kexec_handover.c
> @@ -1327,6 +1327,23 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
>  }
>  EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
>  
> +bool pfn_is_kho_scratch(unsigned long pfn)
> +{
> +	unsigned int i;
> +	phys_addr_t scratch_start, scratch_end, phys = __pfn_to_phys(pfn);
> +
> +	for (i = 0; i < kho_scratch_cnt; i++) {
> +		scratch_start = kho_scratch[i].addr;
> +		scratch_end = kho_scratch[i].addr + kho_scratch[i].size;
> +
> +		if (scratch_start <= phys && phys < scratch_end)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +EXPORT_SYMBOL_GPL(pfn_is_kho_scratch);
> +
>  static int __init kho_mem_retrieve(const void *fdt)
>  {
>  	struct kho_radix_tree tree;
> @@ -1453,12 +1470,27 @@ static __init int kho_init(void)
>  }
>  fs_initcall(kho_init);
>  
> +static void __init kho_init_scratch_pages(void)
> +{
> +	if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
> +		return;
> +
> +	for (int i = 0; i < kho_scratch_cnt; i++) {
> +		unsigned long pfn = PFN_DOWN(kho_scratch[i].addr);
> +		unsigned long end_pfn = PFN_UP(kho_scratch[i].addr + kho_scratch[i].size);
> +		int nid = early_pfn_to_nid(pfn);
> +
> +		for (; pfn < end_pfn; pfn++)
> +			init_deferred_page(pfn, nid);
> +	}
> +}
> +
>  static void __init kho_release_scratch(void)
>  {
>  	phys_addr_t start, end;
>  	u64 i;
>  
> -	memmap_init_kho_scratch_pages();
> +	kho_init_scratch_pages();

This should not be required if deferred init checked whether a region is
MEMBLOCK_KHO_SCRATCH rather than calling pfn_is_kho_scratch() (see the
sketch near deferred_init_memmap_chunk() below).

>  	/*
>  	 * Mark scratch mem as CMA before we return it. That way we
> @@ -1487,6 +1519,7 @@ void __init kho_memory_init(void)
>  			kho_in.fdt_phys = 0;
>  	} else {
>  		kho_reserve_scratch();
> +		kho_init_scratch_pages();
>  	}
>  }
>  
> diff --git a/mm/memblock.c b/mm/memblock.c
> index b3ddfdec7a80..ae6a5af46bd7 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -959,28 +959,6 @@ __init void memblock_clear_kho_scratch_only(void)
>  {
>  	kho_scratch_only = false;
>  }
> -
> -__init void memmap_init_kho_scratch_pages(void)
> -{
> -	phys_addr_t start, end;
> -	unsigned long pfn;
> -	int nid;
> -	u64 i;
> -
> -	if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
> -		return;
> -
> -	/*
> -	 * Initialize struct pages for free scratch memory.
> -	 * The struct pages for reserved scratch memory will be set up in
> -	 * reserve_bootmem_region()
> -	 */
> -	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
> -			     MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) {
> -		for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++)
> -			init_deferred_page(pfn, nid);
> -	}
> -}
>  #endif
>  
>  /**
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index cec7bb758bdd..969048f9b320 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -798,7 +798,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>  	for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
>  		struct page *page = pfn_to_page(pfn);
>  
> -		__init_deferred_page(pfn, nid);
> +		if (!pfn_is_kho_scratch(pfn))
> +			__init_deferred_page(pfn, nid);

A bit unrelated, but we can move reserve_bootmem_region() to memblock
and make it static.

As for skipping the initialization, I think that
memmap_init_reserved_pages() should check whether the region to reserve
is in scratch and, if so, make reserve_bootmem_region() skip the struct
page initialization.
I believe everything that is MEMBLOCK_RSRV_KERNEL would be in scratch and
all reserved memory in scratch would be MEMBLOCK_RSRV_KERNEL, but it's
better to double-check.
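
Something like this in memmap_init_reserved_pages(), say (untested
sketch, reusing the hypothetical kho_scratch_range() from above and
glossing over the MEMBLOCK_RSRV_NOINIT handling):

	struct memblock_region *region;

	for_each_reserved_mem_region(region) {
		int nid = memblock_get_region_node(region);

		/* KHO initialized the scratch memory map early */
		if (kho_scratch_range(region->base, region->size))
			continue;

		reserve_bootmem_region(region->base,
				       region->base + region->size, nid);
	}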

Another somewhat related thing is that __init_page_from_nid() shouldn't
mess with pageblock migratetypes, but only call __init_single_page().
It's up to __init_page_from_nid()'s caller to decide which migratetype
to use, and the caller should set it explicitly.
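
I.e. roughly this at the call sites (untested; MIGRATE_MOVABLE here is
just an example, each caller would pick whatever migratetype fits):

	__init_page_from_nid(pfn, nid);
	if (pageblock_aligned(pfn))
		init_pageblock_migratetype(pfn_to_page(pfn),
					   MIGRATE_MOVABLE, false);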

>  
>  		/*
>  		 * no need for atomic set_bit because the struct
> @@ -2008,9 +2009,12 @@ static void __init deferred_free_pages(unsigned long pfn,
>  
>  	/* Free a large naturally-aligned chunk if possible */
>  	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
> -		for (i = 0; i < nr_pages; i += pageblock_nr_pages)
> +		for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
> +			if (pfn_is_kho_scratch(page_to_pfn(page + i)))
> +				continue;
>  			init_pageblock_migratetype(page + i, MIGRATE_MOVABLE,
>  					false);

We can move init_pageblock_migratetype() here and below to
deferred_init_pages() and ...
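
I.e. something like this in the deferred_init_pages() loop (untested):

		__init_single_page(page, pfn, zid, nid);
		if (pageblock_aligned(pfn))
			init_pageblock_migratetype(page, MIGRATE_MOVABLE,
						   false);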

> +		}
>  		__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
>  		return;
>  	}
> @@ -2019,7 +2023,7 @@ static void __init deferred_free_pages(unsigned long pfn,
>  	accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE);
>  
>  	for (i = 0; i < nr_pages; i++, page++, pfn++) {
> -		if (pageblock_aligned(pfn))
> +		if (pageblock_aligned(pfn) && !pfn_is_kho_scratch(pfn))
>  			init_pageblock_migratetype(page, MIGRATE_MOVABLE,
>  					false);
>  		__free_pages_core(page, 0, MEMINIT_EARLY);
> @@ -2090,9 +2094,11 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
>  			unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
>  			unsigned long chunk_end = min(mo_pfn, epfn);
>  
> -			nr_pages += deferred_init_pages(zone, spfn, chunk_end);
> -			deferred_free_pages(spfn, chunk_end - spfn);
> +			// KHO scratch is MAX_ORDER_NR_PAGES aligned.
> +			if (!pfn_is_kho_scratch(spfn))
> +				deferred_init_pages(zone, spfn, chunk_end);

skip the entire MEMBLOCK_KHO_SCRATCH regions here and only call
deferred_free_pages() for them.

Since the outer loop already walks the regions in memblock.memory, it
shouldn't be hard to query the memblock_region flags from the iterator,
or to just replace the simplified iterator with __for_each_mem_range().
  
> +			deferred_free_pages(spfn, chunk_end - spfn);
>  			spfn = chunk_end;
>  
>  			if (can_resched)
> @@ -2100,6 +2106,7 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
>  			else
>  				touch_nmi_watchdog();
>  		}
> +		nr_pages += epfn - spfn;
>  	}
>  
>  	return nr_pages;
> -- 
> 2.53.0.473.g4a7958ca14-goog
> 

-- 
Sincerely yours,
Mike.

Thread overview: 5+ messages
2026-03-11 12:55 [PATCH v6 0/2] kho: add support for deferred struct page init Michal Clapinski
2026-03-11 12:55 ` [PATCH v6 1/2] kho: fix deferred init of kho scratch Michal Clapinski
2026-03-12 12:50   ` Mike Rapoport [this message]
2026-03-13 13:58   ` Pratyush Yadav
2026-03-11 12:55 ` [PATCH v6 2/2] kho: make preserved pages compatible with deferred struct page init Michal Clapinski
