All of lore.kernel.org
 help / color / mirror / Atom feed
From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
	david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
	ljs@kernel.org, ziy@nvidia.com
Cc: bhe@redhat.com, willy@infradead.org, youngjun.park@lge.com,
	hannes@cmpxchg.org, riel@surriel.com, shakeel.butt@linux.dev,
	alex@ghiti.fr, kas@kernel.org, baohua@kernel.org,
	dev.jain@arm.com, baolin.wang@linux.alibaba.com,
	npache@redhat.com, Liam.Howlett@oracle.com, ryan.roberts@arm.com,
	Vlastimil Babka <vbabka@kernel.org>,
	lance.yang@linux.dev, linux-kernel@vger.kernel.org,
	nphamcs@gmail.com, shikemeng@huaweicloud.com,
	kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [PATCH 07/13] mm: handle PMD swap entries in fork path
Date: Mon, 27 Apr 2026 03:01:56 -0700	[thread overview]
Message-ID: <20260427100553.2754667-8-usama.arif@linux.dev> (raw)
In-Reply-To: <20260427100553.2754667-1-usama.arif@linux.dev>

Teach copy_huge_pmd()/copy_huge_non_present_pmd() about swap entries,
mirroring copy_nonpresent_pte().

swap_dup_entry_direct() gains an nr parameter (and is renamed to
swap_dup_entries_direct()) so it can duplicate a contiguous range of
swap slots in one call, matching the existing
swap_put_entries_direct(entry, nr) API.  Existing callers pass 1.

copy_huge_non_present_pmd() "copies" PMD swap entries during fork
instead of splitting, preserving the THP.  This mirrors
copy_nonpresent_pte() which duplicates the swap slot refcount,
clears the exclusive bit on the source, and adds the destination
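mm to mmlist.  The destination mm is charged MM_SWAPENTS rather than
MM_ANONPAGES for such entries.  If swap_dup_entries_direct() fails with
-ENOMEM (the GFP_ATOMIC extend table allocation failed), copy_huge_pmd()
drops both PMD locks, calls swap_retry_table_alloc() with GFP_KERNEL
and, if that succeeds, retries the copy, matching the PTE retry in
copy_pte_range().  On any other failure it frees the preallocated page
table and gives up.  The PMD is stable across the retry because
dup_mmap() holds write mmap_lock on both mm_structs.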

Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
 include/linux/swap.h |  4 ++--
 mm/huge_memory.c     | 52 +++++++++++++++++++++++++++++++++++++++-----
 mm/memory.c          |  2 +-
 mm/swapfile.c        |  7 +++---
 4 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1930f81e6be4..2f12c20baba1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -457,7 +457,7 @@ sector_t swap_folio_sector(struct folio *folio);
  * All entries must be allocated by folio_alloc_swap(). And they must have
  * a swap count > 1. See comments of folio_*_swap helpers for more info.
  */
-int swap_dup_entry_direct(swp_entry_t entry);
+int swap_dup_entries_direct(swp_entry_t entry, int nr);
 void swap_put_entries_direct(swp_entry_t entry, int nr);
 
 /*
@@ -501,7 +501,7 @@ static inline void free_swap_cache(struct folio *folio)
 {
 }
 
-static inline int swap_dup_entry_direct(swp_entry_t ent)
+static inline int swap_dup_entries_direct(swp_entry_t ent, int nr)
 {
 	return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9f67638e43c8..42887cf518cd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1867,7 +1867,7 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return false;
 }
 
-static void copy_huge_non_present_pmd(
+static int copy_huge_non_present_pmd(
 		struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
@@ -1913,14 +1913,35 @@ static void copy_huge_non_present_pmd(
 		 */
 		folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
 					    dst_vma, src_vma);
+	} else if (softleaf_is_swap(entry)) {
+		int err;
+
+		/*
+		 * PMD swap entry: duplicate swap references and clear
+		 * exclusive on source, matching copy_nonpresent_pte().
+		 */
+		err = swap_dup_entries_direct(entry, HPAGE_PMD_NR);
+		if (err < 0)
+			return err;
+
+		ensure_on_mmlist(dst_mm);
+
+		if (pmd_swp_exclusive(pmd)) {
+			pmd = pmd_swp_clear_exclusive(pmd);
+			set_pmd_at(src_mm, addr, src_pmd, pmd);
+		}
 	}
 
-	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	if (softleaf_is_swap(entry))
+		add_mm_counter(dst_mm, MM_SWAPENTS, HPAGE_PMD_NR);
+	else
+		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 	mm_inc_nr_ptes(dst_mm);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 	if (!userfaultfd_wp(dst_vma))
 		pmd = pmd_swp_clear_uffd_wp(pmd);
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+	return 0;
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1961,6 +1982,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (unlikely(!pgtable))
 		goto out;
 
+retry:
 	dst_ptl = pmd_lock(dst_mm, dst_pmd);
 	src_ptl = pmd_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1968,10 +1990,28 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	ret = -EAGAIN;
 	pmd = *src_pmd;
 
-	if (unlikely(thp_migration_supported() &&
-		     pmd_is_valid_softleaf(pmd))) {
-		copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
-					  dst_vma, src_vma, pmd, pgtable);
+	if (unlikely(pmd_is_valid_softleaf(pmd))) {
+		ret = copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+						addr, dst_vma, src_vma, pmd,
+						pgtable);
+		if (ret) {
+			spin_unlock(src_ptl);
+			spin_unlock(dst_ptl);
+			/*
+			 * For PMD swap entries -ENOMEM means the per-cluster
+			 * swap-extend table couldn't be GFP_ATOMIC-allocated.
+			 * Try the GFP_KERNEL fallback once before giving up.
+			 */
+			if (ret == -ENOMEM) {
+				softleaf_t entry = softleaf_from_pmd(pmd);
+
+				if (softleaf_is_swap(entry) &&
+				    !swap_retry_table_alloc(entry, GFP_KERNEL))
+					goto retry;
+			}
+			pte_free(dst_mm, pgtable);
+			goto out;
+		}
 		ret = 0;
 		goto out_unlock;
 	}
diff --git a/mm/memory.c b/mm/memory.c
index 33d7cc274e23..8aa90afd601a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -934,7 +934,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	struct page *page;
 
 	if (likely(softleaf_is_swap(entry))) {
-		if (swap_dup_entry_direct(entry) < 0)
+		if (swap_dup_entries_direct(entry, 1) < 0)
 			return -EIO;
 
 		ensure_on_mmlist(dst_mm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c7e173b93e11..390f191be9a6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3801,8 +3801,9 @@ void si_swapinfo(struct sysinfo *val)
 }
 
 /*
- * swap_dup_entry_direct() - Increase reference count of a swap entry by one.
+ * swap_dup_entries_direct() - Increase reference count of swap entries by one.
  * @entry: first swap entry from which we want to increase the refcount.
+ * @nr: number of contiguous swap entries to duplicate.
  *
  * Returns 0 for success, or -ENOMEM if the extend table is required
  * but could not be atomically allocated.  Returns -EINVAL if the swap
@@ -3814,7 +3815,7 @@ void si_swapinfo(struct sysinfo *val)
  * Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should
  * be used.
  */
-int swap_dup_entry_direct(swp_entry_t entry)
+int swap_dup_entries_direct(swp_entry_t entry, int nr)
 {
 	struct swap_info_struct *si;
 
@@ -3831,7 +3832,7 @@ int swap_dup_entry_direct(swp_entry_t entry)
 	 */
 	VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry));
 
-	return swap_dup_entries_cluster(si, swp_offset(entry), 1);
+	return swap_dup_entries_cluster(si, swp_offset(entry), nr);
 }
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-- 
2.52.0


  parent reply	other threads:[~2026-04-27 10:06 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-27 10:01 [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 10:01 ` [PATCH 01/13] mm: add softleaf_to_pmd() and convert existing callers Usama Arif
2026-05-13 19:24   ` David Hildenbrand (Arm)
2026-05-29  7:20   ` Dev Jain
2026-05-29 14:47     ` Usama Arif
2026-04-27 10:01 ` [PATCH 02/13] mm: extract ensure_on_mmlist() helper Usama Arif
2026-05-13 13:32   ` David Hildenbrand (Arm)
2026-05-13 17:21     ` Usama Arif
2026-05-13 19:22       ` David Hildenbrand (Arm)
2026-05-29  7:42   ` Dev Jain
2026-04-27 10:01 ` [PATCH 03/13] fs/proc: use softleaf_has_pfn() in pagemap PMD walker Usama Arif
2026-05-13 13:35   ` David Hildenbrand (Arm)
2026-05-29  9:34   ` Dev Jain
2026-04-27 10:01 ` [PATCH 04/13] mm/huge_memory: move softleaf_to_folio() inside migration branch Usama Arif
2026-05-13 19:25   ` David Hildenbrand (Arm)
2026-05-29 11:31   ` Dev Jain
2026-04-27 10:01 ` [PATCH 05/13] mm: add PMD swap entry detection support Usama Arif
2026-05-30  8:06   ` Dev Jain
2026-04-27 10:01 ` [PATCH 06/13] mm: add PMD swap entry splitting support Usama Arif
2026-05-30 10:52   ` Dev Jain
2026-06-02 12:59     ` Usama Arif
2026-04-27 10:01 ` Usama Arif [this message]
2026-04-27 10:01 ` [PATCH 08/13] mm: swap in PMD swap entries as whole THPs during swapoff Usama Arif
2026-05-26 19:44   ` Alexandre Ghiti
2026-05-29 14:49     ` Usama Arif
2026-04-27 10:01 ` [PATCH 09/13] mm: handle PMD swap entries in non-present PMD walkers Usama Arif
2026-04-27 10:01 ` [PATCH 10/13] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-04-27 10:02 ` [PATCH 11/13] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-04-27 10:02 ` [PATCH 12/13] mm: install PMD swap entries on swap-out Usama Arif
2026-04-27 10:02 ` [PATCH 13/13] selftests/mm: add PMD swap entry tests Usama Arif
2026-04-27 13:38 ` [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 18:26 ` Zi Yan
2026-04-27 20:12   ` Usama Arif
2026-04-29 12:57     ` Zi Yan
2026-04-28 19:54 ` David Hildenbrand (Arm)
2026-04-29  9:39   ` Usama Arif
2026-04-29 12:52     ` Lorenzo Stoakes
2026-04-29 10:44 ` Kairui Song
2026-04-30 10:38   ` Usama Arif

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260427100553.2754667-8-usama.arif@linux.dev \
    --to=usama.arif@linux.dev \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=alex@ghiti.fr \
    --cc=baohua@kernel.org \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=bhe@redhat.com \
    --cc=chrisl@kernel.org \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=hannes@cmpxchg.org \
    --cc=kas@kernel.org \
    --cc=kasong@tencent.com \
    --cc=kernel-team@meta.com \
    --cc=lance.yang@linux.dev \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ljs@kernel.org \
    --cc=npache@redhat.com \
    --cc=nphamcs@gmail.com \
    --cc=riel@surriel.com \
    --cc=ryan.roberts@arm.com \
    --cc=shakeel.butt@linux.dev \
    --cc=shikemeng@huaweicloud.com \
    --cc=vbabka@kernel.org \
    --cc=willy@infradead.org \
    --cc=youngjun.park@lge.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.