From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
ljs@kernel.org, ziy@nvidia.com
Cc: ying.huang@linux.alibaba.com, Baoquan He <baoquan.he@linux.dev>,
willy@infradead.org, youngjun.park@lge.com, hannes@cmpxchg.org,
riel@surriel.com, shakeel.butt@linux.dev, alex@ghiti.fr,
kas@kernel.org, baohua@kernel.org, dev.jain@arm.com,
baolin.wang@linux.alibaba.com, npache@redhat.com,
Liam R. Howlett <liam@infradead.org>,
ryan.roberts@arm.com, Vlastimil Babka <vbabka@kernel.org>,
lance.yang@linux.dev, linux-kernel@vger.kernel.org,
nphamcs@gmail.com, shikemeng@huaweicloud.com,
kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [v2 10/16] mm: swap in PMD swap entries as whole THPs during swapoff
Date: Tue, 2 Jun 2026 07:24:18 -0700 [thread overview]
Message-ID: <20260602142537.198755-11-usama.arif@linux.dev> (raw)
In-Reply-To: <20260602142537.198755-1-usama.arif@linux.dev>
Add unuse_pmd() and call it from unuse_pmd_range() to swap in
PMD-level swap entries as whole THPs during swapoff. This mirrors
the existing unuse_pte_range() but operates at PMD granularity.
If the PMD-order folio cannot be allocated, the cached folio is no
longer PMD-sized (e.g. split in the swap cache by
deferred_split_scan() or memory_failure() while the PMD swap entry
was installed), or the folio is not uptodate, the PMD swap entry is
split into PTE-level entries via __split_huge_pmd() and a non-zero
error is returned so unuse_pmd_range() falls through to
unuse_pte_range(), which handles the individual entries at order-0.
Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
mm/swapfile.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 145 insertions(+)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 37408905490e..56454e486324 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -42,6 +42,7 @@
#include <linux/suspend.h>
#include <linux/zswap.h>
#include <linux/plist.h>
+#include <linux/huge_mm.h>
#include <asm/tlbflush.h>
#include <linux/leafops.h>
@@ -2641,6 +2642,138 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
return 0;
}
+/*
+ * unuse_pmd - Map a locked folio at PMD granularity during swapoff.
+ *
+ * The caller provides a locked, swapped-in folio. Returns 0 once the PMD
+ * swap entry is replaced by a mapping of @folio. Returns -EAGAIN, without
+ * splitting, if @folio no longer matches @entry or the PMD no longer holds
+ * @entry under the PMD lock (try_to_unuse will rescan). Returns -EIO if
+ * @folio is not uptodate, after splitting the PMD for unuse_pte_range().
+ */
+static int unuse_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, softleaf_t entry,
+ struct folio *folio)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+ pmd_t new_pmd, old_pmd;
+ spinlock_t *ptl;
+ rmap_t rmap_flags = RMAP_NONE;
+ bool exclusive;
+
+ if (unlikely(!folio_matches_swap_entry(folio, entry)))
+ return -EAGAIN;
+
+ if (unlikely(!folio_test_uptodate(folio))) {
+ __split_huge_pmd(vma, pmd, addr, false);
+ return -EIO;
+ }
+
+ page = folio_page(folio, 0);
+
+ ptl = pmd_lock(mm, pmd);
+ old_pmd = pmdp_get(pmd); /* re-read now that ptl is held */
+
+ if (!pmd_is_swap_entry(old_pmd) ||
+ softleaf_from_pmd(old_pmd).val != entry.val) {
+ spin_unlock(ptl);
+ return -EAGAIN;
+ }
+
+ exclusive = pmd_swp_exclusive(old_pmd);
+
+ /*
+ * Some architectures may have to restore extra metadata to the folio
+ * when reading from swap. This metadata may be indexed by swap entry
+ * so this must be called before folio_put_swap().
+ */
+ arch_swap_restore(folio_swap(entry, folio), folio);
+
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); /* swapents -> anon pages */
+ add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+
+ new_pmd = folio_mk_pmd(folio, vma->vm_page_prot);
+ new_pmd = pmd_mkold(new_pmd);
+ if (pmd_swp_soft_dirty(old_pmd)) /* keep soft-dirty from the swap entry */
+ new_pmd = pmd_mksoft_dirty(new_pmd);
+ if (pmd_swp_uffd_wp(old_pmd)) /* keep uffd-wp from the swap entry */
+ new_pmd = pmd_mkuffd_wp(new_pmd);
+
+ if (exclusive)
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ folio_get(folio);
+ if (!folio_test_anon(folio)) /* not anon yet: use the new-anon rmap path */
+ folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
+ else
+ folio_add_anon_rmap_pmd(folio, page, vma, addr, rmap_flags);
+
+ set_pmd_at(mm, addr, pmd, new_pmd);
+ folio_put_swap(folio, NULL);
+
+ spin_unlock(ptl);
+
+ folio_free_swap(folio); /* done after ptl is dropped */
+ return 0;
+}
+
+/*
+ * Try to swap in a PMD swap entry as a whole THP. Returns 0 on success.
+ * Returns -ENOMEM if no folio was found in the swap cache and swapin_sync()
+ * did not return one, or -EAGAIN if the locked folio is not HPAGE_PMD_NR
+ * pages; in both cases the PMD is split here so the caller can fall back
+ * to unuse_pte_range(). Otherwise returns whatever unuse_pmd() returned
+ * (0, -EAGAIN without a split, or -EIO with the split done there).
+ */
+static int unuse_pmd_entry(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, softleaf_t entry)
+{
+ struct folio *folio;
+ int ret;
+
+ folio = swap_cache_get_folio(entry);
+ if (!folio) { /* nothing in the swap cache for this entry */
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = addr,
+ .real_address = addr,
+ .pmd = pmd,
+ };
+
+ folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE,
+ BIT(HPAGE_PMD_ORDER), &vmf, NULL, 0);
+ if (IS_ERR_OR_NULL(folio)) { /* every failure is reported as -ENOMEM */
+ ret = -ENOMEM;
+ goto split_fallback;
+ }
+ }
+
+ folio_lock(folio);
+ folio_wait_writeback(folio);
+ /*
+ * If the cached folio is no longer PMD-sized (e.g. split in the
+ * swap cache by deferred_split_scan() or memory_failure() while
+ * the PMD swap entry was installed), the PMD swap entry no longer
+ * maps a single contiguous folio. Split the PMD swap entry so
+ * unuse_pte_range() can swap the per-slot folios in individually.
+ */
+ if (folio_nr_pages(folio) != HPAGE_PMD_NR) {
+ folio_unlock(folio);
+ folio_put(folio);
+ ret = -EAGAIN;
+ goto split_fallback;
+ }
+ ret = unuse_pmd(vma, pmd, addr, entry, folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret; /* unuse_pmd() does its own split where needed */
+
+split_fallback:
+ __split_huge_pmd(vma, pmd, addr, false);
+ return ret;
+}
+
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned int type)
@@ -2653,6 +2786,18 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
do {
cond_resched();
next = pmd_addr_end(addr, end);
+
+ pmd_t pmdval = pmdp_get(pmd);
+
+ if (pmd_is_swap_entry(pmdval)) {
+ softleaf_t sl = softleaf_from_pmd(pmdval);
+
+ if (swp_type(sl) == type) {
+ if (!unuse_pmd_entry(vma, pmd, addr, sl))
+ continue;
+ }
+ }
+
ret = unuse_pte_range(vma, pmd, addr, next, type);
if (ret)
return ret;
--
2.52.0
next prev parent reply other threads:[~2026-06-02 14:26 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-02 14:24 [v2 00/16] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-06-02 14:24 ` [v2 01/16] mm: add softleaf_to_pmd() and convert existing callers Usama Arif
2026-06-02 14:24 ` [v2 02/16] mm: extract mm_prepare_for_swap_entries() helper Usama Arif
2026-06-02 14:24 ` [v2 03/16] fs/proc: use softleaf_has_pfn() in pagemap PMD walker Usama Arif
2026-06-02 14:24 ` [v2 04/16] mm/huge_memory: move softleaf_to_folio() inside migration branch Usama Arif
2026-06-02 14:24 ` [v2 05/16] mm/migrate_device: move softleaf_to_folio() inside device-private branch Usama Arif
2026-06-02 14:24 ` [v2 06/16] mm: rename ARCH_ENABLE_THP_MIGRATION to ARCH_SUPPORTS_PMD_SOFTLEAF Usama Arif
2026-06-02 14:24 ` [v2 07/16] mm: add PMD swap entry detection support Usama Arif
2026-06-02 14:24 ` [v2 08/16] mm: add PMD swap entry splitting support Usama Arif
2026-06-02 14:24 ` [v2 09/16] mm: handle PMD swap entries in fork path Usama Arif
2026-06-02 14:24 ` Usama Arif [this message]
2026-06-02 14:24 ` [v2 11/16] mm: handle PMD swap entries in non-present PMD walkers Usama Arif
2026-06-02 14:24 ` [v2 12/16] mm: handle PMD swap entries in MADV_WILLNEED Usama Arif
2026-06-02 14:24 ` [v2 13/16] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-06-02 14:24 ` [v2 14/16] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-06-02 14:24 ` [v2 15/16] mm: install PMD swap entries on swap-out Usama Arif
2026-06-02 14:24 ` [v2 16/16] selftests/mm: add PMD swap entry tests Usama Arif
2026-06-09 14:29 ` [v2 00/16] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-06-10 12:24 ` David Hildenbrand (Arm)
2026-06-10 13:01 ` Lance Yang
2026-06-10 13:48 ` David Hildenbrand (Arm)
2026-06-10 14:44 ` Usama Arif
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260602142537.198755-11-usama.arif@linux.dev \
--to=usama.arif@linux.dev \
--cc=akpm@linux-foundation.org \
--cc=alex@ghiti.fr \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=baoquan.he@linux.dev \
--cc=chrisl@kernel.org \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=hannes@cmpxchg.org \
--cc=kas@kernel.org \
--cc=kasong@tencent.com \
--cc=kernel-team@meta.com \
--cc=lance.yang@linux.dev \
--cc=liam@infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=ljs@kernel.org \
--cc=npache@redhat.com \
--cc=nphamcs@gmail.com \
--cc=riel@surriel.com \
--cc=ryan.roberts@arm.com \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=vbabka@kernel.org \
--cc=willy@infradead.org \
--cc=ying.huang@linux.alibaba.com \
--cc=youngjun.park@lge.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.