From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
ljs@kernel.org, ziy@nvidia.com, linux-mm@kvack.org
Cc: ying.huang@linux.alibaba.com, Baoquan He <baoquan.he@linux.dev>,
willy@infradead.org, youngjun.park@lge.com, hannes@cmpxchg.org,
riel@surriel.com, shakeel.butt@linux.dev, alex@ghiti.fr,
kas@kernel.org, baohua@kernel.org, dev.jain@arm.com,
baolin.wang@linux.alibaba.com, npache@redhat.com,
Liam R. Howlett <liam@infradead.org>,
ryan.roberts@arm.com, Vlastimil Babka <vbabka@kernel.org>,
lance.yang@linux.dev, linux-kernel@vger.kernel.org,
nphamcs@gmail.com, shikemeng@huaweicloud.com,
kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [PATCH v3 03/11] mm: handle PMD swap entries in fork path
Date: Fri, 3 Jul 2026 10:38:20 -0700 [thread overview]
Message-ID: <20260703173903.3789516-4-usama.arif@linux.dev> (raw)
In-Reply-To: <20260703173903.3789516-1-usama.arif@linux.dev>
Teach copy_huge_pmd()/copy_huge_non_present_pmd() about swap entries,
mirroring copy_nonpresent_pte().
swap_dup_entry_direct() gains a nr parameter (and is renamed to
swap_dup_entries_direct()) so it can duplicate a contiguous range of
swap slots in one call, matching the existing
swap_put_entries_direct(entry, nr) API. Existing callers pass 1.
copy_huge_non_present_pmd() "copies" PMD swap entries during fork
instead of splitting, preserving the THP. This mirrors
copy_nonpresent_pte() which duplicates the swap slot refcount,
clears the exclusive bit on the source, and adds the destination
mm to mmlist. If swap_dup_entries_direct() fails with -ENOMEM (the
GFP_ATOMIC extend-table allocation failed), copy_huge_pmd() drops both
PMD locks, calls swap_retry_table_alloc() with GFP_KERNEL and retries,
matching the PTE retry in copy_pte_range(). The PMD is stable across
the retry because dup_mmap() holds write mmap_lock on both mm_structs.
Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
include/linux/swap.h | 4 ++--
mm/huge_memory.c | 53 ++++++++++++++++++++++++++++++++++++++------
mm/memory.c | 2 +-
mm/swapfile.c | 7 +++---
4 files changed, 53 insertions(+), 13 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8d19be675baf..0b1db19e6ae3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -451,7 +451,7 @@ sector_t swap_folio_sector(struct folio *folio);
* All entries must be allocated by folio_alloc_swap(). And they must have
* a swap count > 1. See comments of folio_*_swap helpers for more info.
*/
-int swap_dup_entry_direct(swp_entry_t entry);
+int swap_dup_entries_direct(swp_entry_t entry, int nr);
void swap_put_entries_direct(swp_entry_t entry, int nr);
/*
@@ -495,7 +495,7 @@ static inline void free_swap_cache(struct folio *folio)
{
}
-static inline int swap_dup_entry_direct(swp_entry_t ent)
+static inline int swap_dup_entries_direct(swp_entry_t ent, int nr)
{
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 201193ce0373..69e4e09ac1f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1805,7 +1805,7 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
return false;
}
-static void copy_huge_non_present_pmd(
+static int copy_huge_non_present_pmd(
struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
@@ -1851,14 +1851,35 @@ static void copy_huge_non_present_pmd(
*/
folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
dst_vma, src_vma);
+ } else if (softleaf_is_swap(entry)) {
+ int err;
+
+ /*
+ * PMD swap entry: duplicate swap references and clear
+ * exclusive on source, matching copy_nonpresent_pte().
+ */
+ err = swap_dup_entries_direct(entry, HPAGE_PMD_NR);
+ if (err < 0)
+ return err;
+
+ mm_prepare_for_swap_entries(dst_mm);
+
+ if (pmd_swp_exclusive(pmd)) {
+ pmd = pmd_swp_clear_exclusive(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
}
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ if (softleaf_is_swap(entry))
+ add_mm_counter(dst_mm, MM_SWAPENTS, HPAGE_PMD_NR);
+ else
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
if (!userfaultfd_wp(dst_vma))
pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ return 0;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1899,6 +1920,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (unlikely(!pgtable))
goto out;
+retry:
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1906,11 +1928,28 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- if (unlikely(thp_migration_supported() &&
- pmd_is_valid_softleaf(pmd))) {
- copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
- dst_vma, src_vma, pmd, pgtable);
- ret = 0;
+ if (unlikely(pmd_is_valid_softleaf(pmd))) {
+ ret = copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+ addr, dst_vma, src_vma, pmd,
+ pgtable);
+ if (ret) {
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ /*
+ * For PMD swap entries, -ENOMEM means the per-cluster
+ * swap-extend table couldn't be GFP_ATOMIC-allocated.
+ * Try the GFP_KERNEL fallback once before giving up.
+ */
+ if (ret == -ENOMEM) {
+ softleaf_t entry = softleaf_from_pmd(pmd);
+
+ if (softleaf_is_swap(entry) &&
+ !swap_retry_table_alloc(entry, GFP_KERNEL))
+ goto retry;
+ }
+ pte_free(dst_mm, pgtable);
+ goto out;
+ }
goto out_unlock;
}
diff --git a/mm/memory.c b/mm/memory.c
index 6637c5b13c9b..e0819a562187 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -950,7 +950,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
struct page *page;
if (likely(softleaf_is_swap(entry))) {
- if (swap_dup_entry_direct(entry) < 0)
+ if (swap_dup_entries_direct(entry, 1) < 0)
return -EIO;
mm_prepare_for_swap_entries(dst_mm);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5a69716b2052..0695dbd1a8b1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3899,8 +3899,9 @@ void si_swapinfo(struct sysinfo *val)
}
/*
- * swap_dup_entry_direct() - Increase reference count of a swap entry by one.
+ * swap_dup_entries_direct() - Increase reference count of swap entries by one.
* @entry: first swap entry from which we want to increase the refcount.
+ * @nr: number of contiguous swap entries to duplicate.
*
* Returns 0 for success, or -ENOMEM if the extend table is required
* but could not be atomically allocated. Returns -EINVAL if the swap
@@ -3912,7 +3913,7 @@ void si_swapinfo(struct sysinfo *val)
* Also the swap entry must have a count >= 1. Otherwise folio_dup_swap should
* be used.
*/
-int swap_dup_entry_direct(swp_entry_t entry)
+int swap_dup_entries_direct(swp_entry_t entry, int nr)
{
struct swap_info_struct *si;
@@ -3929,7 +3930,7 @@ int swap_dup_entry_direct(swp_entry_t entry)
*/
VM_WARN_ON_ONCE(!swap_entry_swapped(si, entry));
- return swap_dup_entries_cluster(si, swp_offset(entry), 1);
+ return swap_dup_entries_cluster(si, swp_offset(entry), nr);
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
--
2.53.0-Meta
next prev parent reply other threads:[~2026-07-03 17:39 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-07-03 17:38 [PATCH v3 00/11] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-07-03 17:38 ` [PATCH v3 01/11] mm: add PMD swap entry detection support Usama Arif
2026-07-03 17:38 ` [PATCH v3 02/11] mm: add PMD swap entry splitting support Usama Arif
2026-07-03 17:38 ` Usama Arif [this message]
2026-07-03 17:38 ` [PATCH v3 04/11] mm: zswap: add range lookup for large-folio swapin Usama Arif
2026-07-03 17:38 ` [PATCH v3 05/11] mm: swap in PMD swap entries as whole THPs during swapoff Usama Arif
2026-07-03 17:38 ` [PATCH v3 06/11] mm: handle PMD swap entries in non-present PMD walkers Usama Arif
2026-07-03 17:38 ` [PATCH v3 07/11] mm: handle PMD swap entries in MADV_WILLNEED Usama Arif
2026-07-03 17:38 ` [PATCH v3 08/11] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-07-03 17:38 ` [PATCH v3 09/11] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-07-03 17:38 ` [PATCH v3 10/11] mm: install PMD swap entries on swap-out Usama Arif
2026-07-03 17:38 ` [PATCH v3 11/11] selftests/mm: add PMD swap entry tests Usama Arif
2026-07-04 6:27 ` kernel test robot
2026-07-04 8:30 ` kernel test robot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260703173903.3789516-4-usama.arif@linux.dev \
--to=usama.arif@linux.dev \
--cc=akpm@linux-foundation.org \
--cc=alex@ghiti.fr \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=baoquan.he@linux.dev \
--cc=chrisl@kernel.org \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=hannes@cmpxchg.org \
--cc=kas@kernel.org \
--cc=kasong@tencent.com \
--cc=kernel-team@meta.com \
--cc=lance.yang@linux.dev \
--cc=liam@infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=npache@redhat.com \
--cc=nphamcs@gmail.com \
--cc=riel@surriel.com \
--cc=ryan.roberts@arm.com \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=vbabka@kernel.org \
--cc=willy@infradead.org \
--cc=ying.huang@linux.alibaba.com \
--cc=youngjun.park@lge.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox