From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
ljs@kernel.org, ziy@nvidia.com
Cc: bhe@redhat.com, willy@infradead.org, youngjun.park@lge.com,
hannes@cmpxchg.org, riel@surriel.com, shakeel.butt@linux.dev,
alex@ghiti.fr, kas@kernel.org, baohua@kernel.org,
dev.jain@arm.com, baolin.wang@linux.alibaba.com,
npache@redhat.com, Liam.Howlett@oracle.com, ryan.roberts@arm.com,
Vlastimil Babka <vbabka@kernel.org>,
lance.yang@linux.dev, linux-kernel@vger.kernel.org,
nphamcs@gmail.com, shikemeng@huaweicloud.com,
kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [PATCH 07/13] mm: handle PMD swap entries in fork path
Date: Mon, 27 Apr 2026 03:01:56 -0700 [thread overview]
Message-ID: <20260427100553.2754667-8-usama.arif@linux.dev> (raw)
In-Reply-To: <20260427100553.2754667-1-usama.arif@linux.dev>
Teach copy_huge_pmd()/copy_huge_non_present_pmd() about swap entries,
mirroring copy_nonpresent_pte().
swap_dup_entry_direct() gains a nr parameter (and is renamed to
swap_dup_entries_direct()) so it can duplicate a contiguous range of
swap slots in one call, matching the existing
swap_put_entries_direct(entry, nr) API. Existing callers pass 1.
copy_huge_non_present_pmd() "copies" PMD swap entries during fork
instead of splitting them, preserving the THP. This mirrors
copy_nonpresent_pte(), which takes an extra reference on each swap
slot, clears the exclusive bit on the source entry, and adds the
destination mm to the mmlist. If swap_dup_entries_direct() fails
(the per-cluster table could not be allocated with GFP_ATOMIC),
copy_huge_pmd() retries after calling swap_retry_table_alloc() with
GFP_KERNEL, matching the PTE-level retry in copy_pte_range(). The
PMD is stable across the retry because dup_mmap() holds the write
mmap_lock on both mm_structs.
Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
include/linux/swap.h | 4 ++--
mm/huge_memory.c | 52 +++++++++++++++++++++++++++++++++++++++-----
mm/memory.c | 2 +-
mm/swapfile.c | 7 +++---
4 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1930f81e6be4..2f12c20baba1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -457,7 +457,7 @@ sector_t swap_folio_sector(struct folio *folio);
* All entries must be allocated by folio_alloc_swap(). And they must have
* a swap count > 1. See comments of folio_*_swap helpers for more info.
*/
-int swap_dup_entry_direct(swp_entry_t entry);
+int swap_dup_entries_direct(swp_entry_t entry, int nr);
void swap_put_entries_direct(swp_entry_t entry, int nr);
/*
@@ -501,7 +501,7 @@ static inline void free_swap_cache(struct folio *folio)
{
}
-static inline int swap_dup_entry_direct(swp_entry_t ent)
+static inline int swap_dup_entries_direct(swp_entry_t ent, int nr)
{
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9f67638e43c8..42887cf518cd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1867,7 +1867,7 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
return false;
}
-static void copy_huge_non_present_pmd(
+static int copy_huge_non_present_pmd(
struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
@@ -1913,14 +1913,35 @@ static void copy_huge_non_present_pmd(
*/
folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
dst_vma, src_vma);
+ } else if (softleaf_is_swap(entry)) {
+ int err;
+
+ /*
+ * PMD swap entry: duplicate swap references and clear
+ * exclusive on source, matching copy_nonpresent_pte().
+ */
+ err = swap_dup_entries_direct(entry, HPAGE_PMD_NR);
+ if (err < 0)
+ return err;
+
+ ensure_on_mmlist(dst_mm);
+
+ if (pmd_swp_exclusive(pmd)) {
+ pmd = pmd_swp_clear_exclusive(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
}
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ if (softleaf_is_swap(entry))
+ add_mm_counter(dst_mm, MM_SWAPENTS, HPAGE_PMD_NR);
+ else
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
if (!userfaultfd_wp(dst_vma))
pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ return 0;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1961,6 +1982,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (unlikely(!pgtable))
goto out;
+retry:
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1968,10 +1990,28 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- if (unlikely(thp_migration_supported() &&
- pmd_is_valid_softleaf(pmd))) {
- copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
- dst_vma, src_vma, pmd, pgtable);
+ if (unlikely(pmd_is_valid_softleaf(pmd))) {
+ ret = copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+ addr, dst_vma, src_vma, pmd,
+ pgtable);
+ if (ret) {
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ /*
+ * For PMD swap entries -ENOMEM means the per-cluster
+ * swap-extend table couldn't be GFP_ATOMIC-allocated.
+ * try the GFP_KERNEL fallback once before giving up.
+ */
+ if (ret == -ENOMEM) {
+ softleaf_t entry = softleaf_from_pmd(pmd);
+
+ if (softleaf_is_swap(entry) &&
+ !swap_retry_table_alloc(entry, GFP_KERNEL))
+ goto retry;
+ }
+ pte_free(dst_mm, pgtable);
+ goto out;
+ }
ret = 0;
goto out_unlock;
}
diff --git a/mm/memory.c b/mm/memory.c
index 33d7cc274e23..8aa90afd601a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -934,7 +934,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct page *page;
if (likely(softleaf_is_swap(entry))) {
- if (swap_dup_entry_direct(entry) < 0)
+ if (swap_dup_entries_direct(entry, 1) < 0)
return -EIO;
ensure_on_mmlist(dst_mm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c7e173b93e11..390f191be9a6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3801,8 +3801,9 @@ void si_swapinfo(struct sysinfo *val)
}
/*
- * swap_dup_entry_direct() - Increase reference count of a swap entry by one.
+ * swap_dup_entries_direct() - Increase reference count of swap entries by one.
* @entry: first swap entry from which we want to increase the refcount.
+ * @nr: number of contiguous swap entries to duplicate.
*
* Returns 0 for success, or -ENOMEM if the extend table is required
* but could not be atomically allocated. Returns -EINVAL if the swap
@@ -3814,7 +3815,7 @@ void si_swapinfo(struct sysinfo *val)
* Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should
* be used.
*/
-int swap_dup_entry_direct(swp_entry_t entry)
+int swap_dup_entries_direct(swp_entry_t entry, int nr)
{
struct swap_info_struct *si;
@@ -3831,7 +3832,7 @@ int swap_dup_entry_direct(swp_entry_t entry)
*/
VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry));
- return swap_dup_entries_cluster(si, swp_offset(entry), 1);
+ return swap_dup_entries_cluster(si, swp_offset(entry), nr);
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
--
2.52.0
next prev parent reply other threads:[~2026-04-27 10:06 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-27 10:01 [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 10:01 ` [PATCH 01/13] mm: add softleaf_to_pmd() and convert existing callers Usama Arif
2026-04-27 10:01 ` [PATCH 02/13] mm: extract ensure_on_mmlist() helper Usama Arif
2026-04-27 10:01 ` [PATCH 03/13] fs/proc: use softleaf_has_pfn() in pagemap PMD walker Usama Arif
2026-04-27 10:01 ` [PATCH 04/13] mm/huge_memory: move softleaf_to_folio() inside migration branch Usama Arif
2026-04-27 10:01 ` [PATCH 05/13] mm: add PMD swap entry detection support Usama Arif
2026-04-27 10:01 ` [PATCH 06/13] mm: add PMD swap entry splitting support Usama Arif
2026-04-27 10:01 ` Usama Arif [this message]
2026-04-27 10:01 ` [PATCH 08/13] mm: swap in PMD swap entries as whole THPs during swapoff Usama Arif
2026-04-27 10:01 ` [PATCH 09/13] mm: handle PMD swap entries in non-present PMD walkers Usama Arif
2026-04-27 10:01 ` [PATCH 10/13] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-04-27 10:02 ` [PATCH 11/13] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-04-27 10:02 ` [PATCH 12/13] mm: install PMD swap entries on swap-out Usama Arif
2026-04-27 10:02 ` [PATCH 13/13] selftests/mm: add PMD swap entry tests Usama Arif
2026-04-27 13:38 ` [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 18:26 ` Zi Yan
2026-04-27 20:12 ` Usama Arif
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260427100553.2754667-8-usama.arif@linux.dev \
--to=usama.arif@linux.dev \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=alex@ghiti.fr \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bhe@redhat.com \
--cc=chrisl@kernel.org \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=hannes@cmpxchg.org \
--cc=kas@kernel.org \
--cc=kasong@tencent.com \
--cc=kernel-team@meta.com \
--cc=lance.yang@linux.dev \
--cc=linux-kernel@vger.kernel.org \
--cc=ljs@kernel.org \
--cc=npache@redhat.com \
--cc=nphamcs@gmail.com \
--cc=riel@surriel.com \
--cc=ryan.roberts@arm.com \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=vbabka@kernel.org \
--cc=willy@infradead.org \
--cc=youngjun.park@lge.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox