All of lore.kernel.org
 help / color / mirror / Atom feed
From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
	david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
	ljs@kernel.org, ziy@nvidia.com
Cc: ying.huang@linux.alibaba.com, Baoquan He <baoquan.he@linux.dev>,
	willy@infradead.org, youngjun.park@lge.com, hannes@cmpxchg.org,
	riel@surriel.com, shakeel.butt@linux.dev, alex@ghiti.fr,
	kas@kernel.org, baohua@kernel.org, dev.jain@arm.com,
	baolin.wang@linux.alibaba.com, npache@redhat.com,
	Liam R. Howlett <liam@infradead.org>,
	ryan.roberts@arm.com, Vlastimil Babka <vbabka@kernel.org>,
	lance.yang@linux.dev, linux-kernel@vger.kernel.org,
	nphamcs@gmail.com, shikemeng@huaweicloud.com,
	kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [v2 09/16] mm: handle PMD swap entries in fork path
Date: Tue,  2 Jun 2026 07:24:17 -0700	[thread overview]
Message-ID: <20260602142537.198755-10-usama.arif@linux.dev> (raw)
In-Reply-To: <20260602142537.198755-1-usama.arif@linux.dev>

Teach copy_huge_pmd()/copy_huge_non_present_pmd() about swap entries,
mirroring copy_nonpresent_pte().

swap_dup_entry_direct() gains an nr parameter (and is renamed to
swap_dup_entries_direct()) so it can duplicate a contiguous range of
swap slots in one call, matching the existing
swap_put_entries_direct(entry, nr) API.  Existing callers pass 1.

copy_huge_non_present_pmd() "copies" PMD swap entries during fork
instead of splitting them, preserving the THP.  This mirrors
copy_nonpresent_pte(), which duplicates the swap slot refcount,
clears the exclusive bit on the source, and adds the destination
mm to mmlist.  The destination mm accounts the copied entries as
MM_SWAPENTS rather than MM_ANONPAGES.  If swap_dup_entries_direct()
fails with -ENOMEM (the GFP_ATOMIC table allocation failed),
copy_huge_pmd() drops both PMD locks, calls swap_retry_table_alloc()
with GFP_KERNEL and, if that succeeds, retries the copy, matching the
PTE retry in copy_pte_range().  The PMD is stable across the retry
because dup_mmap() holds the mmap_lock of both mm_structs for write.

Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
 include/linux/swap.h |  4 ++--
 mm/huge_memory.c     | 52 +++++++++++++++++++++++++++++++++++++++-----
 mm/memory.c          |  2 +-
 mm/swapfile.c        |  7 +++---
 4 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6d72778e6cc3..8a5ec5f0a7c7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -458,7 +458,7 @@ sector_t swap_folio_sector(struct folio *folio);
  * All entries must be allocated by folio_alloc_swap(). And they must have
  * a swap count > 1. See comments of folio_*_swap helpers for more info.
  */
-int swap_dup_entry_direct(swp_entry_t entry);
+int swap_dup_entries_direct(swp_entry_t entry, int nr);
 void swap_put_entries_direct(swp_entry_t entry, int nr);
 
 /*
@@ -502,7 +502,7 @@ static inline void free_swap_cache(struct folio *folio)
 {
 }
 
-static inline int swap_dup_entry_direct(swp_entry_t ent)
+static inline int swap_dup_entries_direct(swp_entry_t ent, int nr)
 {
 	return 0;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7cb1afde46e1..a525417d13f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1806,7 +1806,7 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return false;
 }
 
-static void copy_huge_non_present_pmd(
+static int copy_huge_non_present_pmd(
 		struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
@@ -1852,14 +1852,35 @@ static void copy_huge_non_present_pmd(
 		 */
 		folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
 					    dst_vma, src_vma);
+	} else if (softleaf_is_swap(entry)) {
+		int err;
+
+		/*
+		 * PMD swap entry: duplicate swap references and clear
+		 * exclusive on source, matching copy_nonpresent_pte().
+		 */
+		err = swap_dup_entries_direct(entry, HPAGE_PMD_NR);
+		if (err < 0)
+			return err;
+
+		mm_prepare_for_swap_entries(dst_mm);
+
+		if (pmd_swp_exclusive(pmd)) {
+			pmd = pmd_swp_clear_exclusive(pmd);
+			set_pmd_at(src_mm, addr, src_pmd, pmd);
+		}
 	}
 
-	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	if (softleaf_is_swap(entry))
+		add_mm_counter(dst_mm, MM_SWAPENTS, HPAGE_PMD_NR);
+	else
+		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 	mm_inc_nr_ptes(dst_mm);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 	if (!userfaultfd_wp(dst_vma))
 		pmd = pmd_swp_clear_uffd_wp(pmd);
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+	return 0;
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1900,6 +1921,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (unlikely(!pgtable))
 		goto out;
 
+retry:
 	dst_ptl = pmd_lock(dst_mm, dst_pmd);
 	src_ptl = pmd_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1907,10 +1929,28 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	ret = -EAGAIN;
 	pmd = *src_pmd;
 
-	if (unlikely(thp_migration_supported() &&
-		     pmd_is_valid_softleaf(pmd))) {
-		copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
-					  dst_vma, src_vma, pmd, pgtable);
+	if (unlikely(pmd_is_valid_softleaf(pmd))) {
+		ret = copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+						addr, dst_vma, src_vma, pmd,
+						pgtable);
+		if (ret) {
+			spin_unlock(src_ptl);
+			spin_unlock(dst_ptl);
+			/*
+			 * For PMD swap entries, -ENOMEM means the per-cluster
+			 * swap-extend table couldn't be GFP_ATOMIC-allocated.
+			 * Try the GFP_KERNEL fallback before giving up.
+			 */
+			if (ret == -ENOMEM) {
+				softleaf_t entry = softleaf_from_pmd(pmd);
+
+				if (softleaf_is_swap(entry) &&
+				    !swap_retry_table_alloc(entry, GFP_KERNEL))
+					goto retry;
+			}
+			pte_free(dst_mm, pgtable);
+			goto out;
+		}
 		ret = 0;
 		goto out_unlock;
 	}
diff --git a/mm/memory.c b/mm/memory.c
index 137f34c3fd32..5cf02e394c92 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -950,7 +950,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	struct page *page;
 
 	if (likely(softleaf_is_swap(entry))) {
-		if (swap_dup_entry_direct(entry) < 0)
+		if (swap_dup_entries_direct(entry, 1) < 0)
 			return -EIO;
 
 		mm_prepare_for_swap_entries(dst_mm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e3d126602a1e..37408905490e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3899,8 +3899,9 @@ void si_swapinfo(struct sysinfo *val)
 }
 
 /*
- * swap_dup_entry_direct() - Increase reference count of a swap entry by one.
+ * swap_dup_entries_direct() - Increase reference count of swap entries by one.
  * @entry: first swap entry from which we want to increase the refcount.
+ * @nr: number of contiguous swap entries to duplicate.
  *
  * Returns 0 for success, or -ENOMEM if the extend table is required
  * but could not be atomically allocated.  Returns -EINVAL if the swap
@@ -3912,7 +3913,7 @@ void si_swapinfo(struct sysinfo *val)
  * Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should
  * be used.
  */
-int swap_dup_entry_direct(swp_entry_t entry)
+int swap_dup_entries_direct(swp_entry_t entry, int nr)
 {
 	struct swap_info_struct *si;
 
@@ -3929,7 +3930,7 @@ int swap_dup_entry_direct(swp_entry_t entry)
 	 */
 	VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry));
 
-	return swap_dup_entries_cluster(si, swp_offset(entry), 1);
+	return swap_dup_entries_cluster(si, swp_offset(entry), nr);
 }
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-- 
2.52.0


  parent reply	other threads:[~2026-06-02 14:26 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-02 14:24 [v2 00/16] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-06-02 14:24 ` [v2 01/16] mm: add softleaf_to_pmd() and convert existing callers Usama Arif
2026-06-02 14:24 ` [v2 02/16] mm: extract mm_prepare_for_swap_entries() helper Usama Arif
2026-06-02 14:24 ` [v2 03/16] fs/proc: use softleaf_has_pfn() in pagemap PMD walker Usama Arif
2026-06-02 14:24 ` [v2 04/16] mm/huge_memory: move softleaf_to_folio() inside migration branch Usama Arif
2026-06-02 14:24 ` [v2 05/16] mm/migrate_device: move softleaf_to_folio() inside device-private branch Usama Arif
2026-06-02 14:24 ` [v2 06/16] mm: rename ARCH_ENABLE_THP_MIGRATION to ARCH_SUPPORTS_PMD_SOFTLEAF Usama Arif
2026-06-02 14:24 ` [v2 07/16] mm: add PMD swap entry detection support Usama Arif
2026-06-02 14:24 ` [v2 08/16] mm: add PMD swap entry splitting support Usama Arif
2026-06-02 14:24 ` Usama Arif [this message]
2026-06-02 14:24 ` [v2 10/16] mm: swap in PMD swap entries as whole THPs during swapoff Usama Arif
2026-06-02 14:24 ` [v2 11/16] mm: handle PMD swap entries in non-present PMD walkers Usama Arif
2026-06-02 14:24 ` [v2 12/16] mm: handle PMD swap entries in MADV_WILLNEED Usama Arif
2026-06-02 14:24 ` [v2 13/16] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-06-02 14:24 ` [v2 14/16] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-06-02 14:24 ` [v2 15/16] mm: install PMD swap entries on swap-out Usama Arif
2026-06-02 14:24 ` [v2 16/16] selftests/mm: add PMD swap entry tests Usama Arif
2026-06-09 14:29 ` [v2 00/16] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-06-10 12:24   ` David Hildenbrand (Arm)
2026-06-10 13:01     ` Lance Yang
2026-06-10 13:48       ` David Hildenbrand (Arm)
2026-06-10 14:44         ` Usama Arif

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260602142537.198755-10-usama.arif@linux.dev \
    --to=usama.arif@linux.dev \
    --cc=akpm@linux-foundation.org \
    --cc=alex@ghiti.fr \
    --cc=baohua@kernel.org \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=baoquan.he@linux.dev \
    --cc=chrisl@kernel.org \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=hannes@cmpxchg.org \
    --cc=kas@kernel.org \
    --cc=kasong@tencent.com \
    --cc=kernel-team@meta.com \
    --cc=lance.yang@linux.dev \
    --cc=liam@infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ljs@kernel.org \
    --cc=npache@redhat.com \
    --cc=nphamcs@gmail.com \
    --cc=riel@surriel.com \
    --cc=ryan.roberts@arm.com \
    --cc=shakeel.butt@linux.dev \
    --cc=shikemeng@huaweicloud.com \
    --cc=vbabka@kernel.org \
    --cc=willy@infradead.org \
    --cc=ying.huang@linux.alibaba.com \
    --cc=youngjun.park@lge.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.