From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
ljs@kernel.org, ziy@nvidia.com
Cc: bhe@redhat.com, willy@infradead.org, youngjun.park@lge.com,
hannes@cmpxchg.org, riel@surriel.com, shakeel.butt@linux.dev,
alex@ghiti.fr, kas@kernel.org, baohua@kernel.org,
dev.jain@arm.com, baolin.wang@linux.alibaba.com,
npache@redhat.com, Liam.Howlett@oracle.com, ryan.roberts@arm.com,
Vlastimil Babka <vbabka@kernel.org>,
lance.yang@linux.dev, linux-kernel@vger.kernel.org,
nphamcs@gmail.com, shikemeng@huaweicloud.com,
kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [PATCH 08/13] mm: swap in PMD swap entries as whole THPs during swapoff
Date: Mon, 27 Apr 2026 03:01:57 -0700 [thread overview]
Message-ID: <20260427100553.2754667-9-usama.arif@linux.dev> (raw)
In-Reply-To: <20260427100553.2754667-1-usama.arif@linux.dev>
Add unuse_pmd() and call it from unuse_pmd_range() to swap in
PMD-level swap entries as whole THPs during swapoff. This mirrors
the existing unuse_pte_range() but operates at PMD granularity.
The PMD swap entry is split into PTE-level entries via
__split_huge_pmd() and a non-zero error is returned, so that
unuse_pmd_range() falls through to unuse_pte_range() and the
individual entries are handled at order-0, in any of these cases:
the PMD-order folio cannot be allocated, charged or swapped in; the
cached folio is no longer PMD-sized (e.g. it was split in the swap
cache by deferred_split_scan() or memory_failure() while the PMD swap
entry was installed); or the folio is not uptodate.
swapin_alloc_pmd_folio() is kept as a separate function in swap_state.c
because a later patch in this series reuses it on the swap-in path.
Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
mm/swap.h | 7 +++
mm/swap_state.c | 35 +++++++++++++
mm/swapfile.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 179 insertions(+)
diff --git a/mm/swap.h b/mm/swap.h
index a77016f2423b..76752df71693 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -301,6 +301,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
struct folio *swapin_folio(swp_entry_t entry, struct folio *folio);
+struct folio *swapin_alloc_pmd_folio(swp_entry_t entry, struct mm_struct *mm);
void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr);
@@ -438,6 +439,12 @@ static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
return NULL;
}
+static inline struct folio *swapin_alloc_pmd_folio(swp_entry_t entry,
+ struct mm_struct *mm)
+{
+ return NULL; /* stub: never provides a folio; callers must handle NULL */
+}
+
static inline void swap_update_readahead(struct folio *folio,
struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1415a5c54a43..c2e8c76658f5 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -584,6 +584,41 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
return swapcache;
}
+#ifdef CONFIG_THP_SWAP
+/**
+ * swapin_alloc_pmd_folio - allocate, charge, and read a PMD-sized swap folio.
+ * @entry: starting swap entry to swap in
+ * @mm: mm to charge for the swap-in
+ *
+ * Allocate a HPAGE_PMD_ORDER folio, charge it to @mm's memcg for @entry, and
+ * issue the swap-in via swapin_folio(). Used by callers that need to map a
+ * PMD swap entry as a whole THP (PMD swapoff).
+ *
+ * Failing to obtain a PMD-order folio is an expected outcome that callers
+ * handle by splitting the PMD, so the allocation passes __GFP_NOWARN to
+ * avoid a page allocation failure report for every such fallback.
+ *
+ * On success the caller owns the reference obtained by the allocation and
+ * is responsible for dropping it with folio_put().
+ *
+ * Return: the swapped-in folio, or NULL on alloc/charge/swapin failure (in
+ * which case the caller should fall back to splitting the PMD).
+ */
+struct folio *swapin_alloc_pmd_folio(swp_entry_t entry, struct mm_struct *mm)
+{
+	struct folio *folio;
+
+	folio = folio_alloc(GFP_HIGHUSER_MOVABLE | __GFP_NOWARN,
+			    HPAGE_PMD_ORDER);
+	if (!folio)
+		return NULL;
+
+	if (mem_cgroup_swapin_charge_folio(folio, mm, GFP_KERNEL, entry))
+		goto put_folio;
+
+	/*
+	 * NOTE(review): any non-NULL return is treated as @folio itself;
+	 * confirm swapin_folio() cannot hand back a different swap cache
+	 * folio when @folio is a large folio.
+	 */
+	if (!swapin_folio(entry, folio))
+		goto put_folio;
+
+	return folio;
+
+put_folio:
+	/* Drop the allocation reference on every failure after folio_alloc(). */
+	folio_put(folio);
+	return NULL;
+}
+#endif /* CONFIG_THP_SWAP */
+
/*
* Locate a page of swap in physical memory, reserving swap cache space
* and reading the disk if it is not already cached.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 390f191be9a6..7256edf4ce66 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -42,6 +42,7 @@
#include <linux/suspend.h>
#include <linux/zswap.h>
#include <linux/plist.h>
+#include <linux/huge_mm.h>
#include <asm/tlbflush.h>
#include <linux/leafops.h>
@@ -2519,6 +2520,130 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
return 0;
}
+/*
+ * unuse_pmd - Map a locked folio at PMD granularity during swapoff.
+ * @vma: VMA the mapping belongs to; its vm_mm and vm_page_prot are used
+ * @pmd: PMD slot expected to hold the PMD swap entry
+ * @addr: address passed to the split, rmap and set_pmd_at() helpers
+ * @entry: swap entry the PMD is expected to contain
+ * @folio: folio to map in place of @entry
+ *
+ * The caller provides a locked, swapped-in folio. This function neither
+ * unlocks @folio nor drops the caller's reference; it takes one additional
+ * reference with folio_get() right before the anon rmap is added.
+ *
+ * Before taking the PMD lock:
+ *  - returns -EAGAIN, leaving the PMD untouched, if @folio does not match
+ *    @entry according to folio_matches_swap_entry();
+ *  - calls __split_huge_pmd() and returns -EIO if @folio is not uptodate,
+ *    so unuse_pte_range() can handle individual pages.
+ *
+ * Under the PMD lock:
+ *  - returns -EAGAIN if the PMD no longer holds @entry (try_to_unuse will
+ *    rescan);
+ *  - otherwise moves HPAGE_PMD_NR from MM_SWAPENTS to MM_ANONPAGES, builds
+ *    an old (pmd_mkold()) PMD for @folio that carries over the swap PMD's
+ *    soft-dirty and uffd-wp bits, adds the anon rmap (with RMAP_EXCLUSIVE
+ *    if the swap PMD was exclusive; a new anon rmap if @folio is not yet
+ *    anon), installs the PMD and calls folio_put_swap().
+ *
+ * folio_free_swap() is called after the PMD lock has been dropped.
+ * Returns 0 on success (PMD was mapped).
+ */
+static int unuse_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, softleaf_t entry,
+ struct folio *folio)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page;
+ pmd_t new_pmd, old_pmd;
+ spinlock_t *ptl;
+ rmap_t rmap_flags = RMAP_NONE;
+ bool exclusive;
+
+ if (unlikely(!folio_matches_swap_entry(folio, entry)))
+ return -EAGAIN;
+
+ if (unlikely(!folio_test_uptodate(folio))) {
+ __split_huge_pmd(vma, pmd, addr, false);
+ return -EIO;
+ }
+
+ page = folio_page(folio, 0);
+
+ ptl = pmd_lock(mm, pmd);
+ old_pmd = pmdp_get(pmd);
+
+ if (!pmd_is_swap_entry(old_pmd) ||
+ softleaf_from_pmd(old_pmd).val != entry.val) {
+ spin_unlock(ptl);
+ return -EAGAIN;
+ }
+
+ exclusive = pmd_swp_exclusive(old_pmd);
+
+ /*
+ * Some architectures may have to restore extra metadata to the folio
+ * when reading from swap. This metadata may be indexed by swap entry
+ * so this must be called before folio_put_swap().
+ */
+ arch_swap_restore(folio_swap(entry, folio), folio);
+
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+
+ new_pmd = folio_mk_pmd(folio, vma->vm_page_prot);
+ new_pmd = pmd_mkold(new_pmd);
+ if (pmd_swp_soft_dirty(old_pmd))
+ new_pmd = pmd_mksoft_dirty(new_pmd);
+ if (pmd_swp_uffd_wp(old_pmd))
+ new_pmd = pmd_mkuffd_wp(new_pmd);
+
+ if (exclusive)
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ folio_get(folio);
+ if (!folio_test_anon(folio))
+ folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
+ else
+ folio_add_anon_rmap_pmd(folio, page, vma, addr, rmap_flags);
+
+ set_pmd_at(mm, addr, pmd, new_pmd);
+ folio_put_swap(folio, NULL);
+
+ spin_unlock(ptl);
+
+ folio_free_swap(folio);
+ return 0;
+}
+
+/*
+ * unuse_pmd_entry - try to swap in a PMD swap entry as a whole THP.
+ * @vma: VMA containing the PMD swap entry
+ * @pmd: PMD slot holding the swap entry
+ * @addr: address handed on to unuse_pmd() and __split_huge_pmd()
+ * @entry: swap entry read from @pmd
+ *
+ * Looks @entry up in the swap cache, or allocates and reads in a PMD-order
+ * folio when nothing is cached, then passes the locked folio to unuse_pmd().
+ *
+ * Returns 0 if the PMD was mapped, otherwise a negative errno:
+ *   -ENOMEM  swapin_alloc_pmd_folio() returned NULL; the PMD is split here.
+ *   -EAGAIN  the folio is no longer PMD-sized and the PMD is split here,
+ *            or the value was propagated from unuse_pmd().
+ *   other    propagated unchanged from unuse_pmd() (e.g. -EIO for a folio
+ *            that is not uptodate, where unuse_pmd() has split the PMD).
+ *
+ * The caller treats any non-zero return as "fall back to unuse_pte_range()".
+ */
+static int unuse_pmd_entry(struct vm_area_struct *vma, pmd_t *pmd,
+			   unsigned long addr, softleaf_t entry)
+{
+	struct folio *folio;
+	bool pmd_sized;
+	int ret;
+
+	/* Prefer what is already in the swap cache; read it in otherwise. */
+	folio = swap_cache_get_folio(entry);
+	if (!folio)
+		folio = swapin_alloc_pmd_folio(entry, vma->vm_mm);
+	if (!folio) {
+		ret = -ENOMEM;
+		goto split_pmd;
+	}
+
+	folio_lock(folio);
+	folio_wait_writeback(folio);
+
+	/*
+	 * A cached folio that is no longer PMD-sized (e.g. split in the
+	 * swap cache by deferred_split_scan() or memory_failure() while
+	 * the PMD swap entry was installed) means the PMD swap entry no
+	 * longer maps a single contiguous folio. In that case the PMD swap
+	 * entry is split below so unuse_pte_range() can swap the per-slot
+	 * folios in individually.
+	 */
+	pmd_sized = folio_nr_pages(folio) == HPAGE_PMD_NR;
+	ret = pmd_sized ? unuse_pmd(vma, pmd, addr, entry, folio) : -EAGAIN;
+
+	/* Release the folio before any split, whichever path was taken. */
+	folio_unlock(folio);
+	folio_put(folio);
+	if (pmd_sized)
+		return ret;
+
+split_pmd:
+	__split_huge_pmd(vma, pmd, addr, false);
+	return ret;
+}
+
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned int type)
@@ -2531,6 +2656,18 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
do {
cond_resched();
next = pmd_addr_end(addr, end);
+
+ pmd_t pmdval = pmdp_get(pmd);
+
+ if (pmd_is_swap_entry(pmdval)) {
+ softleaf_t sl = softleaf_from_pmd(pmdval);
+
+ if (swp_type(sl) == type) {
+ if (!unuse_pmd_entry(vma, pmd, addr, sl))
+ continue;
+ }
+ }
+
ret = unuse_pte_range(vma, pmd, addr, next, type);
if (ret)
return ret;
--
2.52.0
next prev parent reply other threads:[~2026-04-27 10:06 UTC|newest]
Thread overview: 39+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-27 10:01 [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 10:01 ` [PATCH 01/13] mm: add softleaf_to_pmd() and convert existing callers Usama Arif
2026-05-13 19:24 ` David Hildenbrand (Arm)
2026-05-29 7:20 ` Dev Jain
2026-05-29 14:47 ` Usama Arif
2026-04-27 10:01 ` [PATCH 02/13] mm: extract ensure_on_mmlist() helper Usama Arif
2026-05-13 13:32 ` David Hildenbrand (Arm)
2026-05-13 17:21 ` Usama Arif
2026-05-13 19:22 ` David Hildenbrand (Arm)
2026-05-29 7:42 ` Dev Jain
2026-04-27 10:01 ` [PATCH 03/13] fs/proc: use softleaf_has_pfn() in pagemap PMD walker Usama Arif
2026-05-13 13:35 ` David Hildenbrand (Arm)
2026-05-29 9:34 ` Dev Jain
2026-04-27 10:01 ` [PATCH 04/13] mm/huge_memory: move softleaf_to_folio() inside migration branch Usama Arif
2026-05-13 19:25 ` David Hildenbrand (Arm)
2026-05-29 11:31 ` Dev Jain
2026-04-27 10:01 ` [PATCH 05/13] mm: add PMD swap entry detection support Usama Arif
2026-05-30 8:06 ` Dev Jain
2026-04-27 10:01 ` [PATCH 06/13] mm: add PMD swap entry splitting support Usama Arif
2026-05-30 10:52 ` Dev Jain
2026-06-02 12:59 ` Usama Arif
2026-04-27 10:01 ` [PATCH 07/13] mm: handle PMD swap entries in fork path Usama Arif
2026-04-27 10:01 ` Usama Arif [this message]
2026-05-26 19:44 ` [PATCH 08/13] mm: swap in PMD swap entries as whole THPs during swapoff Alexandre Ghiti
2026-05-29 14:49 ` Usama Arif
2026-04-27 10:01 ` [PATCH 09/13] mm: handle PMD swap entries in non-present PMD walkers Usama Arif
2026-04-27 10:01 ` [PATCH 10/13] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-04-27 10:02 ` [PATCH 11/13] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-04-27 10:02 ` [PATCH 12/13] mm: install PMD swap entries on swap-out Usama Arif
2026-04-27 10:02 ` [PATCH 13/13] selftests/mm: add PMD swap entry tests Usama Arif
2026-04-27 13:38 ` [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 18:26 ` Zi Yan
2026-04-27 20:12 ` Usama Arif
2026-04-29 12:57 ` Zi Yan
2026-04-28 19:54 ` David Hildenbrand (Arm)
2026-04-29 9:39 ` Usama Arif
2026-04-29 12:52 ` Lorenzo Stoakes
2026-04-29 10:44 ` Kairui Song
2026-04-30 10:38 ` Usama Arif
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260427100553.2754667-9-usama.arif@linux.dev \
--to=usama.arif@linux.dev \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=alex@ghiti.fr \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bhe@redhat.com \
--cc=chrisl@kernel.org \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=hannes@cmpxchg.org \
--cc=kas@kernel.org \
--cc=kasong@tencent.com \
--cc=kernel-team@meta.com \
--cc=lance.yang@linux.dev \
--cc=linux-kernel@vger.kernel.org \
--cc=ljs@kernel.org \
--cc=npache@redhat.com \
--cc=nphamcs@gmail.com \
--cc=riel@surriel.com \
--cc=ryan.roberts@arm.com \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=vbabka@kernel.org \
--cc=willy@infradead.org \
--cc=youngjun.park@lge.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.