From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
david@kernel.org, willy@infradead.org, ryan.roberts@arm.com,
linux-mm@kvack.org
Cc: r@hev.cc, jack@suse.cz, ajd@linux.ibm.com, apopple@nvidia.com,
baohua@kernel.org, baolin.wang@linux.alibaba.com,
brauner@kernel.org, catalin.marinas@arm.com, dev.jain@arm.com,
kees@kernel.org, kevin.brodsky@arm.com, lance.yang@linux.dev,
Liam.Howlett@oracle.com, linux-arm-kernel@lists.infradead.org,
linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
lorenzo.stoakes@oracle.com, mhocko@suse.com, npache@redhat.com,
pasha.tatashin@soleen.com, rmclure@linux.ibm.com,
rppt@kernel.org, surenb@google.com, vbabka@kernel.org,
Al Viro <viro@zeniv.linux.org.uk>,
ziy@nvidia.com, hannes@cmpxchg.org, kas@kernel.org,
shakeel.butt@linux.dev, kernel-team@meta.com,
Usama Arif <usama.arif@linux.dev>
Subject: [PATCH v2 2/4] mm: replace exec_folio_order() with generic preferred_exec_order()
Date: Fri, 20 Mar 2026 06:58:52 -0700
Message-ID: <20260320140315.979307-3-usama.arif@linux.dev>
In-Reply-To: <20260320140315.979307-1-usama.arif@linux.dev>
Replace the arch-specific exec_folio_order() hook with a generic
preferred_exec_order() that dynamically computes the readahead folio
order for executable memory. It targets min(PMD_ORDER, 2M) as the
maximum, which yields the optimal order for contpte (arm64), PMD
mapping (x86, arm64 4K), and architectures with smaller PMDs
(s390 1M). It adapts at runtime based on:
- VMA size: caps the order so folios fit within the mapping
- Memory pressure: steps down the order when the local node's free
memory is below the high watermark for the requested order
This avoids over-allocating on memory-constrained systems while still
requesting the optimal order when memory is plentiful.
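As a rough illustration (not part of this patch), the target order
under these rules works out as follows on common configurations:
- 4K base pages (x86-64, arm64 4K): PMD_ORDER = 9 and the 2M cap is
  order 9, so the target is order 9 (2M).
- 16K base pages (arm64): PMD_ORDER = 11, 2M cap = order 7, so the
  target is order 7 (2M).
- 64K base pages (arm64): PMD_ORDER = 13, 2M cap = order 5, so the
  target is order 5 (2M).
- s390 (4K base pages, 1M PMD): PMD_ORDER = 8 is already below the 2M
  cap, so the target is order 8 (1M).
A small executable mapping lowers this further, e.g. a 256K VMA on 4K
pages (64 pages) caps the order at 6, and the watermark check can step
it down from there.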
Since exec_folio_order() is no longer needed, remove the arm64
definition and the generic default from pgtable.h.
Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
arch/arm64/include/asm/pgtable.h | 8 -----
include/linux/pgtable.h | 11 ------
mm/filemap.c | 57 ++++++++++++++++++++++++++++----
3 files changed, 51 insertions(+), 25 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b3e58735c49bd..b1e74940624d8 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1599,14 +1599,6 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
*/
#define arch_wants_old_prefaulted_pte cpu_has_hw_af
-/*
- * Request exec memory is read into pagecache in at least 64K folios. This size
- * can be contpte-mapped when 4K base pages are in use (16 pages into 1 iTLB
- * entry), and HPA can coalesce it (4 pages into 1 TLB entry) when 16K base
- * pages are in use.
- */
-#define exec_folio_order() ilog2(SZ_64K >> PAGE_SHIFT)
-
static inline bool pud_sect_supported(void)
{
return PAGE_SIZE == SZ_4K;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a50df42a893fb..874333549eb3c 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -577,17 +577,6 @@ static inline bool arch_has_hw_pte_young(void)
}
#endif
-#ifndef exec_folio_order
-/*
- * Returns preferred minimum folio order for executable file-backed memory. Must
- * be in range [0, PMD_ORDER). Default to order-0.
- */
-static inline unsigned int exec_folio_order(void)
-{
- return 0;
-}
-#endif
-
#ifndef arch_check_zapped_pte
static inline void arch_check_zapped_pte(struct vm_area_struct *vma,
pte_t pte)
diff --git a/mm/filemap.c b/mm/filemap.c
index 7d89c6b384cc4..aebfb78e487d7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3290,6 +3290,52 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
return 1;
}
+/*
+ * Compute the preferred folio order for executable memory readahead.
+ * Targets min(PMD_ORDER, 2M) as the maximum, which gives the
+ * optimal order for contpte (arm64), PMD mapping (x86, arm64 4K), and
+ * architectures with smaller PMDs (s390 1M). The 2M cap also avoids
+ * requesting excessively large folios on configurations where PMD_ORDER
+ * is much larger (32M on 16K pages, 512M on 64K pages), which would cause
+ * unnecessary memory pressure. Adapts at runtime based on:
+ *
+ * - VMA size: cap the order so folios fit within the mapping.
+ *
+ * - Memory pressure: step down the order when free memory on the local
+ * node is below the high watermark for the requested order. This
+ * avoids expensive reclaim or compaction to satisfy large folio
+ * allocations when memory is tight.
+ */
+static unsigned int preferred_exec_order(struct vm_area_struct *vma)
+{
+ int order;
+ unsigned long vma_len = vma_pages(vma);
+ struct zone *zone;
+ gfp_t gfp;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return 0;
+
+ /* Cap at min(PMD_ORDER, 2M) */
+ order = min(HPAGE_PMD_ORDER, ilog2(SZ_2M >> PAGE_SHIFT));
+
+ /* Don't request folios larger than the VMA */
+ order = min(order, ilog2(vma_len));
+
+ /* Step down under memory pressure */
+ gfp = mapping_gfp_mask(vma->vm_file->f_mapping);
+ zone = first_zones_zonelist(node_zonelist(numa_node_id(), gfp),
+ gfp_zone(gfp), NULL)->zone;
+ if (zone) {
+ while (order > 0 &&
+ !zone_watermark_ok(zone, order,
+ high_wmark_pages(zone), 0, 0))
+ order--;
+ }
+
+ return order;
+}
+
/*
* Synchronous readahead happens when we don't even find a page in the page
* cache at all. We don't want to perform IO under the mmap sem, so if we have
@@ -3363,11 +3409,10 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (vm_flags & VM_EXEC) {
/*
- * Allow arch to request a preferred minimum folio order for
- * executable memory. This can often be beneficial to
- * performance if (e.g.) arm64 can contpte-map the folio.
- * Executable memory rarely benefits from readahead, due to its
- * random access nature, so set async_size to 0.
+ * Request a preferred folio order for executable memory,
+ * dynamically adapted to VMA size and memory pressure.
+ * Executable memory rarely benefits from speculative readahead
+ * due to its random access nature, so set async_size to 0.
*
* Limit to the boundaries of the VMA to avoid reading in any
* pad that might exist between sections, which would be a waste
@@ -3378,7 +3423,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
unsigned long end = start + vma_pages(vma);
unsigned long ra_end;
- ra->order = exec_folio_order();
+ ra->order = preferred_exec_order(vma);
ra->start = round_down(vmf->pgoff, 1UL << ra->order);
ra->start = max(ra->start, start);
ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);
--
2.52.0
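For readers skimming the do_sync_mmap_readahead() hunk above, the
following standalone user-space sketch (not part of the patch) mirrors
the window-rounding arithmetic: the window start is rounded down to the
folio size and clamped to the VMA start, and the window end is rounded
up and clamped to the VMA end. The order, VMA bounds, fault offset and
ra_pages values below are made up for illustration.

/* sketch.c: illustrative only -- mirrors the rounding in the hunk above */
#include <stdio.h>

#define round_down(x, y)	((x) & ~((y) - 1))	/* y must be a power of 2 */
#define round_up(x, y)		round_down((x) + (y) - 1, (y))

int main(void)
{
	unsigned long order = 9;		/* e.g. 2M folios with 4K pages */
	unsigned long nr = 1UL << order;	/* pages per folio */
	unsigned long start = 100;		/* VMA start pgoff (example) */
	unsigned long end = 2100;		/* VMA end pgoff (example) */
	unsigned long pgoff = 1000;		/* faulting page offset (example) */
	unsigned long ra_pages = 128;		/* readahead window, in pages */
	unsigned long ra_start, ra_end;

	/* Align the start down to the folio size, but stay inside the VMA */
	ra_start = round_down(pgoff, nr);
	if (ra_start < start)
		ra_start = start;

	/* Align the end up to the folio size, but stay inside the VMA */
	ra_end = round_up(ra_start + ra_pages, nr);
	if (ra_end > end)
		ra_end = end;

	printf("readahead window: pgoff %lu..%lu (%lu pages)\n",
	       ra_start, ra_end, ra_end - ra_start);
	return 0;
}

For the example values this prints "readahead window: pgoff 512..1024
(512 pages)", i.e. one 2M folio aligned so that it covers the faulting
offset.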