From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
ljs@kernel.org, ziy@nvidia.com
Cc: ying.huang@linux.alibaba.com, Baoquan He <baoquan.he@linux.dev>,
willy@infradead.org, youngjun.park@lge.com, hannes@cmpxchg.org,
riel@surriel.com, shakeel.butt@linux.dev, alex@ghiti.fr,
kas@kernel.org, baohua@kernel.org, dev.jain@arm.com,
baolin.wang@linux.alibaba.com, npache@redhat.com,
Liam R. Howlett <liam@infradead.org>,
ryan.roberts@arm.com, Vlastimil Babka <vbabka@kernel.org>,
lance.yang@linux.dev, linux-kernel@vger.kernel.org,
nphamcs@gmail.com, shikemeng@huaweicloud.com,
kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [v2 09/16] mm: handle PMD swap entries in fork path
Date: Tue, 2 Jun 2026 07:24:17 -0700 [thread overview]
Message-ID: <20260602142537.198755-10-usama.arif@linux.dev> (raw)
In-Reply-To: <20260602142537.198755-1-usama.arif@linux.dev>
Teach copy_huge_pmd()/copy_huge_non_present_pmd() about swap entries,
mirroring copy_nonpresent_pte().
swap_dup_entry_direct() gains an nr parameter (and is renamed to
swap_dup_entries_direct()) so it can duplicate a contiguous range of
swap slots in one call, matching the existing
swap_put_entries_direct(entry, nr) API. Existing callers pass 1.
copy_huge_non_present_pmd() "copies" PMD swap entries during fork
instead of splitting, preserving the THP. This mirrors
copy_nonpresent_pte() which duplicates the swap slot refcount,
clears the exclusive bit on the source, and adds the destination
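mm to mmlist. If swap_dup_entries_direct() fails with -ENOMEM (the
GFP_ATOMIC table allocation failed), copy_huge_pmd() drops both PMD
locks, calls swap_retry_table_alloc() with GFP_KERNEL and, if that
succeeds, retries, matching the PTE retry in copy_pte_range(). Any
other failure frees the preallocated page table and aborts the copy.
The PMD is stable across the retry because dup_mmap() holds write
mmap_lock on both mm_structs.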
Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
include/linux/swap.h | 4 ++--
mm/huge_memory.c | 52 +++++++++++++++++++++++++++++++++++++++-----
mm/memory.c | 2 +-
mm/swapfile.c | 7 +++---
4 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6d72778e6cc3..8a5ec5f0a7c7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -458,7 +458,7 @@ sector_t swap_folio_sector(struct folio *folio);
* All entries must be allocated by folio_alloc_swap(). And they must have
* a swap count > 1. See comments of folio_*_swap helpers for more info.
*/
-int swap_dup_entry_direct(swp_entry_t entry);
+int swap_dup_entries_direct(swp_entry_t entry, int nr);
void swap_put_entries_direct(swp_entry_t entry, int nr);
/*
@@ -502,7 +502,7 @@ static inline void free_swap_cache(struct folio *folio)
{
}
-static inline int swap_dup_entry_direct(swp_entry_t ent)
+static inline int swap_dup_entries_direct(swp_entry_t ent, int nr)
{
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7cb1afde46e1..a525417d13f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1806,7 +1806,7 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
return false;
}
-static void copy_huge_non_present_pmd(
+static int copy_huge_non_present_pmd(
struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
@@ -1852,14 +1852,35 @@ static void copy_huge_non_present_pmd(
*/
folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
dst_vma, src_vma);
+ } else if (softleaf_is_swap(entry)) {
+ int err;
+
+ /*
+ * PMD swap entry: duplicate swap references and clear
+ * exclusive on source, matching copy_nonpresent_pte().
+ */
+ err = swap_dup_entries_direct(entry, HPAGE_PMD_NR);
+ if (err < 0)
+ return err;
+
+ mm_prepare_for_swap_entries(dst_mm);
+
+ if (pmd_swp_exclusive(pmd)) {
+ pmd = pmd_swp_clear_exclusive(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
}
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ if (softleaf_is_swap(entry))
+ add_mm_counter(dst_mm, MM_SWAPENTS, HPAGE_PMD_NR);
+ else
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
if (!userfaultfd_wp(dst_vma))
pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ return 0;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1900,6 +1921,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (unlikely(!pgtable))
goto out;
+retry:
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1907,10 +1929,28 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- if (unlikely(thp_migration_supported() &&
- pmd_is_valid_softleaf(pmd))) {
- copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
- dst_vma, src_vma, pmd, pgtable);
+ if (unlikely(pmd_is_valid_softleaf(pmd))) {
+ ret = copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+ addr, dst_vma, src_vma, pmd,
+ pgtable);
+ if (ret) {
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ /*
+ * For PMD swap entries, -ENOMEM means the per-cluster
+ * swap-extend table couldn't be GFP_ATOMIC-allocated.
+ * Try the GFP_KERNEL fallback once before giving up.
+ */
+ if (ret == -ENOMEM) {
+ softleaf_t entry = softleaf_from_pmd(pmd);
+
+ if (softleaf_is_swap(entry) &&
+ !swap_retry_table_alloc(entry, GFP_KERNEL))
+ goto retry;
+ }
+ pte_free(dst_mm, pgtable);
+ goto out;
+ }
ret = 0;
goto out_unlock;
}
diff --git a/mm/memory.c b/mm/memory.c
index 137f34c3fd32..5cf02e394c92 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -950,7 +950,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct page *page;
if (likely(softleaf_is_swap(entry))) {
- if (swap_dup_entry_direct(entry) < 0)
+ if (swap_dup_entries_direct(entry, 1) < 0)
return -EIO;
mm_prepare_for_swap_entries(dst_mm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e3d126602a1e..37408905490e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3899,8 +3899,9 @@ void si_swapinfo(struct sysinfo *val)
}
/*
- * swap_dup_entry_direct() - Increase reference count of a swap entry by one.
+ * swap_dup_entries_direct() - Increase reference count of swap entries by one.
* @entry: first swap entry from which we want to increase the refcount.
+ * @nr: number of contiguous swap entries to duplicate.
*
* Returns 0 for success, or -ENOMEM if the extend table is required
* but could not be atomically allocated. Returns -EINVAL if the swap
@@ -3912,7 +3913,7 @@ void si_swapinfo(struct sysinfo *val)
* Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should
* be used.
*/
-int swap_dup_entry_direct(swp_entry_t entry)
+int swap_dup_entries_direct(swp_entry_t entry, int nr)
{
struct swap_info_struct *si;
@@ -3929,7 +3930,7 @@ int swap_dup_entry_direct(swp_entry_t entry)
*/
VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry));
- return swap_dup_entries_cluster(si, swp_offset(entry), 1);
+ return swap_dup_entries_cluster(si, swp_offset(entry), nr);
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
--
2.52.0
next prev parent reply other threads:[~2026-06-02 14:26 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-02 14:24 [v2 00/16] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-06-02 14:24 ` [v2 01/16] mm: add softleaf_to_pmd() and convert existing callers Usama Arif
2026-06-02 14:24 ` [v2 02/16] mm: extract mm_prepare_for_swap_entries() helper Usama Arif
2026-06-02 14:24 ` [v2 03/16] fs/proc: use softleaf_has_pfn() in pagemap PMD walker Usama Arif
2026-06-02 14:24 ` [v2 04/16] mm/huge_memory: move softleaf_to_folio() inside migration branch Usama Arif
2026-06-02 14:24 ` [v2 05/16] mm/migrate_device: move softleaf_to_folio() inside device-private branch Usama Arif
2026-06-02 14:24 ` [v2 06/16] mm: rename ARCH_ENABLE_THP_MIGRATION to ARCH_SUPPORTS_PMD_SOFTLEAF Usama Arif
2026-06-02 14:24 ` [v2 07/16] mm: add PMD swap entry detection support Usama Arif
2026-06-02 14:24 ` [v2 08/16] mm: add PMD swap entry splitting support Usama Arif
2026-06-02 14:24 ` Usama Arif [this message]
2026-06-02 14:24 ` [v2 10/16] mm: swap in PMD swap entries as whole THPs during swapoff Usama Arif
2026-06-02 14:24 ` [v2 11/16] mm: handle PMD swap entries in non-present PMD walkers Usama Arif
2026-06-12 6:45 ` Lance Yang
2026-06-02 14:24 ` [v2 12/16] mm: handle PMD swap entries in MADV_WILLNEED Usama Arif
2026-06-02 14:24 ` [v2 13/16] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-06-12 8:50 ` Lance Yang
2026-06-02 14:24 ` [v2 14/16] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-06-02 14:24 ` [v2 15/16] mm: install PMD swap entries on swap-out Usama Arif
2026-06-12 14:21 ` Lance Yang
2026-06-02 14:24 ` [v2 16/16] selftests/mm: add PMD swap entry tests Usama Arif
2026-06-09 14:29 ` [v2 00/16] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-06-10 12:24 ` David Hildenbrand (Arm)
2026-06-10 13:01 ` Lance Yang
2026-06-10 13:48 ` David Hildenbrand (Arm)
2026-06-10 14:44 ` Usama Arif
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260602142537.198755-10-usama.arif@linux.dev \
--to=usama.arif@linux.dev \
--cc=akpm@linux-foundation.org \
--cc=alex@ghiti.fr \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=baoquan.he@linux.dev \
--cc=chrisl@kernel.org \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=hannes@cmpxchg.org \
--cc=kas@kernel.org \
--cc=kasong@tencent.com \
--cc=kernel-team@meta.com \
--cc=lance.yang@linux.dev \
--cc=liam@infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=ljs@kernel.org \
--cc=npache@redhat.com \
--cc=nphamcs@gmail.com \
--cc=riel@surriel.com \
--cc=ryan.roberts@arm.com \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=vbabka@kernel.org \
--cc=willy@infradead.org \
--cc=ying.huang@linux.alibaba.com \
--cc=youngjun.park@lge.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox