The Linux Kernel Mailing List
 help / color / mirror / Atom feed
From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
	david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
	ljs@kernel.org, ziy@nvidia.com
Cc: ying.huang@linux.alibaba.com, Baoquan He <baoquan.he@linux.dev>,
	willy@infradead.org, youngjun.park@lge.com, hannes@cmpxchg.org,
	riel@surriel.com, shakeel.butt@linux.dev, alex@ghiti.fr,
	kas@kernel.org, baohua@kernel.org, dev.jain@arm.com,
	baolin.wang@linux.alibaba.com, npache@redhat.com,
	Liam R. Howlett <liam@infradead.org>,
	ryan.roberts@arm.com, Vlastimil Babka <vbabka@kernel.org>,
	lance.yang@linux.dev, linux-kernel@vger.kernel.org,
	nphamcs@gmail.com, shikemeng@huaweicloud.com,
	kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [v2 10/16] mm: swap in PMD swap entries as whole THPs during swapoff
Date: Tue,  2 Jun 2026 07:24:18 -0700	[thread overview]
Message-ID: <20260602142537.198755-11-usama.arif@linux.dev> (raw)
In-Reply-To: <20260602142537.198755-1-usama.arif@linux.dev>

Add unuse_pmd() and unuse_pmd_entry(), and call the latter from
unuse_pmd_range() to swap in PMD-level swap entries as whole THPs
during swapoff.  This mirrors the existing unuse_pte_range() but
operates at PMD granularity.

If the PMD-order folio cannot be allocated, the cached folio is no
longer PMD-sized (e.g. split in the swap cache by
deferred_split_scan() or memory_failure() while the PMD swap entry
was installed), or the folio is not uptodate, the PMD swap entry is
split into PTE-level entries via __split_huge_pmd() and a non-zero
error is returned so unuse_pmd_range() falls through to
unuse_pte_range(), which handles the individual entries at order-0.

Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
 mm/swapfile.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 37408905490e..56454e486324 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -42,6 +42,7 @@
 #include <linux/suspend.h>
 #include <linux/zswap.h>
 #include <linux/plist.h>
+#include <linux/huge_mm.h>
 
 #include <asm/tlbflush.h>
 #include <linux/leafops.h>
@@ -2641,6 +2642,138 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	return 0;
 }
 
+/*
+ * unuse_pmd - Map a locked folio at PMD granularity during swapoff.
+ *
+ * The caller provides a locked, swapped-in folio.  Returns 0 on success
+ * (PMD was mapped).  Returns -EAGAIN, without splitting, if the folio no
+ * longer matches the entry or the PMD, rechecked under its lock, no longer
+ * holds it (try_to_unuse will rescan).  Returns -EIO if the folio is not
+ * uptodate; the PMD is then split so unuse_pte_range() can take over.
+ */
+static int unuse_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		     unsigned long addr, softleaf_t entry,
+		     struct folio *folio)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct page *page;
+	pmd_t new_pmd, old_pmd;
+	spinlock_t *ptl;
+	rmap_t rmap_flags = RMAP_NONE;
+	bool exclusive;
+
+	if (unlikely(!folio_matches_swap_entry(folio, entry)))
+		return -EAGAIN;
+
+	if (unlikely(!folio_test_uptodate(folio))) {
+		__split_huge_pmd(vma, pmd, addr, false);
+		return -EIO;
+	}
+
+	page = folio_page(folio, 0);
+
+	ptl = pmd_lock(mm, pmd);
+	old_pmd = pmdp_get(pmd);
+
+	if (!pmd_is_swap_entry(old_pmd) ||
+	    softleaf_from_pmd(old_pmd).val != entry.val) {
+		spin_unlock(ptl);
+		return -EAGAIN;
+	}
+
+	exclusive = pmd_swp_exclusive(old_pmd);
+
+	/*
+	 * Some architectures may have to restore extra metadata to the folio
+	 * when reading from swap. This metadata may be indexed by swap entry
+	 * so this must be called before folio_put_swap().
+	 */
+	arch_swap_restore(folio_swap(entry, folio), folio);
+
+	add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+
+	new_pmd = folio_mk_pmd(folio, vma->vm_page_prot);
+	new_pmd = pmd_mkold(new_pmd);
+	if (pmd_swp_soft_dirty(old_pmd))
+		new_pmd = pmd_mksoft_dirty(new_pmd);
+	if (pmd_swp_uffd_wp(old_pmd))
+		new_pmd = pmd_mkuffd_wp(new_pmd);
+
+	if (exclusive)
+		rmap_flags |= RMAP_EXCLUSIVE;
+
+	folio_get(folio);
+	if (!folio_test_anon(folio))
+		folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
+	else
+		folio_add_anon_rmap_pmd(folio, page, vma, addr, rmap_flags);
+
+	set_pmd_at(mm, addr, pmd, new_pmd);
+	folio_put_swap(folio, NULL);
+
+	spin_unlock(ptl);
+
+	folio_free_swap(folio);
+	return 0;
+}
+
+/*
+ * Try to swap in a PMD swap entry as a whole THP.  Returns 0 on success.
+ * Returns -ENOMEM if swapin_sync() yields no PMD-order folio (any failure
+ * is reported as -ENOMEM), or -EAGAIN if the cached folio is no longer
+ * PMD-sized; in both cases the PMD is split so the caller can fall back
+ * to unuse_pte_range().  Otherwise propagates the result of unuse_pmd(),
+ * whose -EAGAIN (unlike -EIO) leaves the PMD unsplit.
+ */
+static int unuse_pmd_entry(struct vm_area_struct *vma, pmd_t *pmd,
+			   unsigned long addr, softleaf_t entry)
+{
+	struct folio *folio;
+	int ret;
+
+	folio = swap_cache_get_folio(entry);
+	if (!folio) {
+		struct vm_fault vmf = {
+			.vma = vma,
+			.address = addr,
+			.real_address = addr,
+			.pmd = pmd,
+		};
+
+		folio = swapin_sync(entry, GFP_HIGHUSER_MOVABLE,
+				    BIT(HPAGE_PMD_ORDER), &vmf, NULL, 0);
+		if (IS_ERR_OR_NULL(folio)) {
+			ret = -ENOMEM;
+			goto split_fallback;
+		}
+	}
+
+	folio_lock(folio);
+	folio_wait_writeback(folio);
+	/*
+	 * If the cached folio is no longer PMD-sized (e.g. split in the
+	 * swap cache by deferred_split_scan() or memory_failure() while
+	 * the PMD swap entry was installed), the PMD swap entry no longer
+	 * maps a single contiguous folio.  Split the PMD swap entry so
+	 * unuse_pte_range() can swap the per-slot folios in individually.
+	 */
+	if (folio_nr_pages(folio) != HPAGE_PMD_NR) {
+		folio_unlock(folio);
+		folio_put(folio);
+		ret = -EAGAIN;
+		goto split_fallback;
+	}
+	ret = unuse_pmd(vma, pmd, addr, entry, folio);
+	folio_unlock(folio);
+	folio_put(folio);
+	return ret;
+
+split_fallback:
+	__split_huge_pmd(vma, pmd, addr, false);
+	return ret;
+}
+
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 				unsigned long addr, unsigned long end,
 				unsigned int type)
@@ -2653,6 +2786,18 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 	do {
 		cond_resched();
 		next = pmd_addr_end(addr, end);
+
+		pmd_t pmdval = pmdp_get(pmd);
+
+		if (pmd_is_swap_entry(pmdval)) {
+			softleaf_t sl = softleaf_from_pmd(pmdval);
+
+			if (swp_type(sl) == type) {
+				if (!unuse_pmd_entry(vma, pmd, addr, sl))
+					continue;
+			}
+		}
+
 		ret = unuse_pte_range(vma, pmd, addr, next, type);
 		if (ret)
 			return ret;
-- 
2.52.0


  parent reply	other threads:[~2026-06-02 14:26 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-02 14:24 [v2 00/16] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-06-02 14:24 ` [v2 01/16] mm: add softleaf_to_pmd() and convert existing callers Usama Arif
2026-06-02 14:24 ` [v2 02/16] mm: extract mm_prepare_for_swap_entries() helper Usama Arif
2026-06-02 14:24 ` [v2 03/16] fs/proc: use softleaf_has_pfn() in pagemap PMD walker Usama Arif
2026-06-02 14:24 ` [v2 04/16] mm/huge_memory: move softleaf_to_folio() inside migration branch Usama Arif
2026-06-02 14:24 ` [v2 05/16] mm/migrate_device: move softleaf_to_folio() inside device-private branch Usama Arif
2026-06-02 14:24 ` [v2 06/16] mm: rename ARCH_ENABLE_THP_MIGRATION to ARCH_SUPPORTS_PMD_SOFTLEAF Usama Arif
2026-06-02 14:24 ` [v2 07/16] mm: add PMD swap entry detection support Usama Arif
2026-06-02 14:24 ` [v2 08/16] mm: add PMD swap entry splitting support Usama Arif
2026-06-02 14:24 ` [v2 09/16] mm: handle PMD swap entries in fork path Usama Arif
2026-06-02 14:24 ` Usama Arif [this message]
2026-06-02 14:24 ` [v2 11/16] mm: handle PMD swap entries in non-present PMD walkers Usama Arif
2026-06-12  6:45   ` Lance Yang
2026-06-02 14:24 ` [v2 12/16] mm: handle PMD swap entries in MADV_WILLNEED Usama Arif
2026-06-02 14:24 ` [v2 13/16] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-06-12  8:50   ` Lance Yang
2026-06-02 14:24 ` [v2 14/16] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-06-02 14:24 ` [v2 15/16] mm: install PMD swap entries on swap-out Usama Arif
2026-06-12 14:21   ` Lance Yang
2026-06-02 14:24 ` [v2 16/16] selftests/mm: add PMD swap entry tests Usama Arif
2026-06-09 14:29 ` [v2 00/16] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-06-10 12:24   ` David Hildenbrand (Arm)
2026-06-10 13:01     ` Lance Yang
2026-06-10 13:48       ` David Hildenbrand (Arm)
2026-06-10 14:44         ` Usama Arif

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260602142537.198755-11-usama.arif@linux.dev \
    --to=usama.arif@linux.dev \
    --cc=akpm@linux-foundation.org \
    --cc=alex@ghiti.fr \
    --cc=baohua@kernel.org \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=baoquan.he@linux.dev \
    --cc=chrisl@kernel.org \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=hannes@cmpxchg.org \
    --cc=kas@kernel.org \
    --cc=kasong@tencent.com \
    --cc=kernel-team@meta.com \
    --cc=lance.yang@linux.dev \
    --cc=liam@infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ljs@kernel.org \
    --cc=npache@redhat.com \
    --cc=nphamcs@gmail.com \
    --cc=riel@surriel.com \
    --cc=ryan.roberts@arm.com \
    --cc=shakeel.butt@linux.dev \
    --cc=shikemeng@huaweicloud.com \
    --cc=vbabka@kernel.org \
    --cc=willy@infradead.org \
    --cc=ying.huang@linux.alibaba.com \
    --cc=youngjun.park@lge.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox