public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
	david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
	ljs@kernel.org, ziy@nvidia.com
Cc: bhe@redhat.com, willy@infradead.org, youngjun.park@lge.com,
	hannes@cmpxchg.org, riel@surriel.com, shakeel.butt@linux.dev,
	alex@ghiti.fr, kas@kernel.org, baohua@kernel.org,
	dev.jain@arm.com, baolin.wang@linux.alibaba.com,
	npache@redhat.com, Liam.Howlett@oracle.com, ryan.roberts@arm.com,
	Vlastimil Babka <vbabka@kernel.org>,
	lance.yang@linux.dev, linux-kernel@vger.kernel.org,
	nphamcs@gmail.com, shikemeng@huaweicloud.com,
	kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [PATCH 07/13] mm: handle PMD swap entries in fork path
Date: Mon, 27 Apr 2026 03:01:56 -0700	[thread overview]
Message-ID: <20260427100553.2754667-8-usama.arif@linux.dev> (raw)
In-Reply-To: <20260427100553.2754667-1-usama.arif@linux.dev>

Teach copy_huge_pmd()/copy_huge_non_present_pmd() about swap entries,
mirroring copy_nonpresent_pte().

swap_dup_entry_direct() gains an nr parameter (and is renamed to
swap_dup_entries_direct()) so it can duplicate a contiguous range of
swap slots in one call, matching the existing
swap_put_entries_direct(entry, nr) API.  Existing callers pass 1.

copy_huge_non_present_pmd() "copies" PMD swap entries during fork
instead of splitting, preserving the THP.  This mirrors
copy_nonpresent_pte() which duplicates the swap slot refcount,
clears the exclusive bit on the source, and adds the destination
mm to mmlist.  If swap_dup_entries_direct() fails with -ENOMEM (the
GFP_ATOMIC table allocation failed), copy_huge_pmd() drops both PMD
locks, calls swap_retry_table_alloc() with GFP_KERNEL and retries,
matching the PTE retry in copy_pte_range().  The PMD is stable across
the retry because dup_mmap() holds the mmap_lock for write on both
mm_structs.

Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
 include/linux/swap.h |  4 ++--
 mm/huge_memory.c     | 52 +++++++++++++++++++++++++++++++++++++++-----
 mm/memory.c          |  2 +-
 mm/swapfile.c        |  7 +++---
 4 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1930f81e6be4..2f12c20baba1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -457,7 +457,7 @@ sector_t swap_folio_sector(struct folio *folio);
  * All entries must be allocated by folio_alloc_swap(). And they must have
  * a swap count > 1. See comments of folio_*_swap helpers for more info.
  */
-int swap_dup_entry_direct(swp_entry_t entry);
+int swap_dup_entries_direct(swp_entry_t entry, int nr);
 void swap_put_entries_direct(swp_entry_t entry, int nr);
 
 /*
@@ -501,7 +501,7 @@ static inline void free_swap_cache(struct folio *folio)
 {
 }
 
-static inline int swap_dup_entry_direct(swp_entry_t ent)
+static inline int swap_dup_entries_direct(swp_entry_t ent, int nr)
 {
 	return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9f67638e43c8..42887cf518cd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1867,7 +1867,7 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return false;
 }
 
-static void copy_huge_non_present_pmd(
+static int copy_huge_non_present_pmd(
 		struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
@@ -1913,14 +1913,35 @@ static void copy_huge_non_present_pmd(
 		 */
 		folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
 					    dst_vma, src_vma);
+	} else if (softleaf_is_swap(entry)) {
+		int err;
+
+		/*
+		 * PMD swap entry: duplicate swap references and clear
+		 * exclusive on source, matching copy_nonpresent_pte().
+		 */
+		err = swap_dup_entries_direct(entry, HPAGE_PMD_NR);
+		if (err < 0)
+			return err;
+
+		ensure_on_mmlist(dst_mm);
+
+		if (pmd_swp_exclusive(pmd)) {
+			pmd = pmd_swp_clear_exclusive(pmd);
+			set_pmd_at(src_mm, addr, src_pmd, pmd);
+		}
 	}
 
-	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	if (softleaf_is_swap(entry))
+		add_mm_counter(dst_mm, MM_SWAPENTS, HPAGE_PMD_NR);
+	else
+		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 	mm_inc_nr_ptes(dst_mm);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 	if (!userfaultfd_wp(dst_vma))
 		pmd = pmd_swp_clear_uffd_wp(pmd);
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+	return 0;
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1961,6 +1982,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (unlikely(!pgtable))
 		goto out;
 
+retry:
 	dst_ptl = pmd_lock(dst_mm, dst_pmd);
 	src_ptl = pmd_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1968,10 +1990,28 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	ret = -EAGAIN;
 	pmd = *src_pmd;
 
-	if (unlikely(thp_migration_supported() &&
-		     pmd_is_valid_softleaf(pmd))) {
-		copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
-					  dst_vma, src_vma, pmd, pgtable);
+	if (unlikely(pmd_is_valid_softleaf(pmd))) {
+		ret = copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+						addr, dst_vma, src_vma, pmd,
+						pgtable);
+		if (ret) {
+			spin_unlock(src_ptl);
+			spin_unlock(dst_ptl);
+			/*
+			 * For PMD swap entries -ENOMEM means the per-cluster
+			 * swap-extend table couldn't be GFP_ATOMIC-allocated.
+			 * Try the GFP_KERNEL fallback once before giving up.
+			 */
+			if (ret == -ENOMEM) {
+				softleaf_t entry = softleaf_from_pmd(pmd);
+
+				if (softleaf_is_swap(entry) &&
+				    !swap_retry_table_alloc(entry, GFP_KERNEL))
+					goto retry;
+			}
+			pte_free(dst_mm, pgtable);
+			goto out;
+		}
 		ret = 0;
 		goto out_unlock;
 	}
diff --git a/mm/memory.c b/mm/memory.c
index 33d7cc274e23..8aa90afd601a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -934,7 +934,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	struct page *page;
 
 	if (likely(softleaf_is_swap(entry))) {
-		if (swap_dup_entry_direct(entry) < 0)
+		if (swap_dup_entries_direct(entry, 1) < 0)
 			return -EIO;
 
 		ensure_on_mmlist(dst_mm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c7e173b93e11..390f191be9a6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3801,8 +3801,9 @@ void si_swapinfo(struct sysinfo *val)
 }
 
 /*
- * swap_dup_entry_direct() - Increase reference count of a swap entry by one.
+ * swap_dup_entries_direct() - Increase reference count of swap entries by one.
  * @entry: first swap entry from which we want to increase the refcount.
+ * @nr: number of contiguous swap entries to duplicate.
  *
  * Returns 0 for success, or -ENOMEM if the extend table is required
  * but could not be atomically allocated.  Returns -EINVAL if the swap
@@ -3814,7 +3815,7 @@ void si_swapinfo(struct sysinfo *val)
  * Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should
  * be used.
  */
-int swap_dup_entry_direct(swp_entry_t entry)
+int swap_dup_entries_direct(swp_entry_t entry, int nr)
 {
 	struct swap_info_struct *si;
 
@@ -3831,7 +3832,7 @@ int swap_dup_entry_direct(swp_entry_t entry)
 	 */
 	VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry));
 
-	return swap_dup_entries_cluster(si, swp_offset(entry), 1);
+	return swap_dup_entries_cluster(si, swp_offset(entry), nr);
 }
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-- 
2.52.0


  parent reply	other threads:[~2026-04-27 10:06 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-27 10:01 [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 10:01 ` [PATCH 01/13] mm: add softleaf_to_pmd() and convert existing callers Usama Arif
2026-04-27 10:01 ` [PATCH 02/13] mm: extract ensure_on_mmlist() helper Usama Arif
2026-04-27 10:01 ` [PATCH 03/13] fs/proc: use softleaf_has_pfn() in pagemap PMD walker Usama Arif
2026-04-27 10:01 ` [PATCH 04/13] mm/huge_memory: move softleaf_to_folio() inside migration branch Usama Arif
2026-04-27 10:01 ` [PATCH 05/13] mm: add PMD swap entry detection support Usama Arif
2026-04-27 10:01 ` [PATCH 06/13] mm: add PMD swap entry splitting support Usama Arif
2026-04-27 10:01 ` Usama Arif [this message]
2026-04-27 10:01 ` [PATCH 08/13] mm: swap in PMD swap entries as whole THPs during swapoff Usama Arif
2026-04-27 10:01 ` [PATCH 09/13] mm: handle PMD swap entries in non-present PMD walkers Usama Arif
2026-04-27 10:01 ` [PATCH 10/13] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-04-27 10:02 ` [PATCH 11/13] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-04-27 10:02 ` [PATCH 12/13] mm: install PMD swap entries on swap-out Usama Arif
2026-04-27 10:02 ` [PATCH 13/13] selftests/mm: add PMD swap entry tests Usama Arif
2026-04-27 13:38 ` [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 18:26 ` Zi Yan
2026-04-27 20:12   ` Usama Arif

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260427100553.2754667-8-usama.arif@linux.dev \
    --to=usama.arif@linux.dev \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=alex@ghiti.fr \
    --cc=baohua@kernel.org \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=bhe@redhat.com \
    --cc=chrisl@kernel.org \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=hannes@cmpxchg.org \
    --cc=kas@kernel.org \
    --cc=kasong@tencent.com \
    --cc=kernel-team@meta.com \
    --cc=lance.yang@linux.dev \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ljs@kernel.org \
    --cc=npache@redhat.com \
    --cc=nphamcs@gmail.com \
    --cc=riel@surriel.com \
    --cc=ryan.roberts@arm.com \
    --cc=shakeel.butt@linux.dev \
    --cc=shikemeng@huaweicloud.com \
    --cc=vbabka@kernel.org \
    --cc=willy@infradead.org \
    --cc=youngjun.park@lge.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox