From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
ljs@kernel.org, ziy@nvidia.com
Cc: bhe@redhat.com, willy@infradead.org, youngjun.park@lge.com,
hannes@cmpxchg.org, riel@surriel.com, shakeel.butt@linux.dev,
alex@ghiti.fr, kas@kernel.org, baohua@kernel.org,
dev.jain@arm.com, baolin.wang@linux.alibaba.com,
npache@redhat.com, Liam.Howlett@oracle.com, ryan.roberts@arm.com,
Vlastimil Babka <vbabka@kernel.org>,
lance.yang@linux.dev, linux-kernel@vger.kernel.org,
nphamcs@gmail.com, shikemeng@huaweicloud.com,
kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [PATCH 09/13] mm: handle PMD swap entries in non-present PMD walkers
Date: Mon, 27 Apr 2026 03:01:58 -0700
Message-ID: <20260427100553.2754667-10-usama.arif@linux.dev>
In-Reply-To: <20260427100553.2754667-1-usama.arif@linux.dev>

Teach the remaining non-present PMD walkers about swap entries,
mirroring the PTE-level equivalents.

smaps_pmd_entry() accounts swap and swap_pss via a new shared
smaps_account_swap() helper used by both PTE and PMD paths.
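
For example (illustrative numbers, assuming a 2 MiB PMD size): a
swapped-out THP whose swap slots are referenced by two processes shows
up in each process's smaps as

  Swap:               2048 kB
  SwapPss:            1024 kB

since smaps_account_swap() divides the PSS contribution by
swp_swapcount().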

zap_huge_pmd() frees swap slots via swap_put_entries_direct(),
matching zap_nonpresent_ptes().

change_non_present_huge_pmd() skips write-permission changes for swap
entries and only updates uffd_wp, matching change_softleaf_pte().

move_soft_dirty_pmd(), clear_soft_dirty_pmd(), make_uffd_wp_pmd(),
pagemap_pmd_range_thp() and change_huge_pmd() handle swap entries
alongside migration entries.

madvise_cold_or_pageout_pte_range() extends its non-present PMD
VM_BUG_ON to allow swap entries; without this, hitting a PMD swap
entry on a DEBUG_VM kernel would BUG().

queue_folios_pmd() in mempolicy silently skips swap entries, matching
the PTE walker which only counts migration entries as failures.
Without this, mbind(MPOL_MF_STRICT) would spuriously return -EIO on
a swapped-out THP.
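
A minimal reproducer sketch of that failure mode (illustrative only:
it assumes a 2 MiB PMD size, an active swap device, and a kernel with
the earlier patches of this series but not this one; error handling is
elided and it links with -lnuma):

  #include <numaif.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>

  #define THP_SIZE (2UL << 20)	/* assumed PMD size */

  int main(void)
  {
  	unsigned long nodemask = 1;	/* bind to node 0 */
  	char *map, *thp;

  	map = mmap(NULL, 2 * THP_SIZE, PROT_READ | PROT_WRITE,
  		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  	/* Carve out a PMD-aligned region so it can be THP-backed. */
  	thp = (char *)(((unsigned long)map + THP_SIZE - 1) & ~(THP_SIZE - 1));
  	madvise(thp, THP_SIZE, MADV_HUGEPAGE);
  	memset(thp, 1, THP_SIZE);		/* fault in the THP */
  	madvise(thp, THP_SIZE, MADV_PAGEOUT);	/* leave a PMD swap entry */

  	/* Before this patch queue_folios_pmd() counted the PMD swap
  	 * entry as a failure, so this returned -1 with errno == EIO. */
  	if (mbind(thp, THP_SIZE, MPOL_BIND, &nodemask, 2, MPOL_MF_STRICT))
  		perror("mbind");
  	return 0;
  }

Whether the THP actually reaches swap as a single PMD entry depends on
system state; the sketch only illustrates the call sequence.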

madvise_free_huge_pmd() handles PMD swap entries directly: for a
full-range MADV_FREE it clears the PMD, frees the deposited page
table, and releases the swap slots; for a partial range it splits to
PTE swap entries. Without this, MADV_FREE silently becomes a no-op
on swapped-out THPs, leaking swap slots.
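
To illustrate, continuing the hypothetical sketch above:

  madvise(thp, THP_SIZE, MADV_PAGEOUT);	/* swap the THP out */
  madvise(thp, THP_SIZE, MADV_FREE);	/* must release the swap slots */

Afterwards "Swap:" for the range in /proc/self/smaps should read 0 kB
and SwapFree in /proc/meminfo should recover by 2048 kB; without this
change MADV_FREE was a no-op here and the slots stayed allocated until
the mapping was torn down.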

hmm_vma_handle_absent_pmd() faults in PMD swap entries via
hmm_vma_fault() instead of returning -EFAULT. The first per-page
handle_mm_fault() call triggers do_huge_pmd_swap_page(), which maps
the entire folio; subsequent calls become harmless
huge_pmd_set_accessed() and the walker retries with a present PMD.

Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
fs/proc/task_mmu.c | 43 +++++++++++++++++++++-------------
mm/hmm.c | 3 ++-
mm/huge_memory.c | 58 +++++++++++++++++++++++++++++++++++-----------
mm/khugepaged.c | 6 +++++
mm/madvise.c | 5 ++--
mm/mempolicy.c | 2 ++
6 files changed, 85 insertions(+), 32 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6d9f43881e62..a6dd91d4cf24 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1015,6 +1015,23 @@ static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
#endif
}
+static void smaps_account_swap(struct mem_size_stats *mss,
+ softleaf_t entry, unsigned long size)
+{
+ int mapcount;
+
+ mss->swap += size;
+ mapcount = swp_swapcount(entry);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)size << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)size << PSS_SHIFT;
+ }
+}
+
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
@@ -1036,18 +1053,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
const softleaf_t entry = softleaf_from_pte(ptent);
if (softleaf_is_swap(entry)) {
- int mapcount;
-
- mss->swap += PAGE_SIZE;
- mapcount = swp_swapcount(entry);
- if (mapcount >= 2) {
- u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
-
- do_div(pss_delta, mapcount);
- mss->swap_pss += pss_delta;
- } else {
- mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
- }
+ smaps_account_swap(mss, entry, PAGE_SIZE);
} else if (softleaf_has_pfn(entry)) {
if (softleaf_is_device_private(entry))
present = true;
@@ -1077,9 +1083,13 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
if (pmd_present(*pmd)) {
page = vm_normal_page_pmd(vma, addr, *pmd);
present = true;
- } else if (unlikely(thp_migration_supported())) {
+ } else {
const softleaf_t entry = softleaf_from_pmd(*pmd);
+ if (softleaf_is_swap(entry)) {
+ smaps_account_swap(mss, entry, HPAGE_PMD_SIZE);
+ return;
+ }
if (softleaf_has_pfn(entry))
page = softleaf_to_page(entry);
}
@@ -1665,7 +1675,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
pmd = pmd_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
- } else if (pmd_is_migration_entry(pmd)) {
+ } else if (pmd_is_migration_entry(pmd) || pmd_is_swap_entry(pmd)) {
pmd = pmd_swp_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
@@ -2025,7 +2035,8 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
flags |= PM_UFFD_WP;
if (pm->show_pfn)
frame = pmd_pfn(pmd) + idx;
- } else if (thp_migration_supported()) {
+ } else if (pmd_is_swap_entry(pmd) ||
+ (thp_migration_supported() && pmd_is_migration_entry(pmd))) {
const softleaf_t entry = softleaf_from_pmd(pmd);
unsigned long offset;
@@ -2463,7 +2474,7 @@ static void make_uffd_wp_pmd(struct vm_area_struct *vma,
old = pmdp_invalidate_ad(vma, addr, pmdp);
pmd = pmd_mkuffd_wp(old);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
- } else if (pmd_is_migration_entry(pmd)) {
+ } else if (pmd_is_migration_entry(pmd) || pmd_is_swap_entry(pmd)) {
pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
diff --git a/mm/hmm.c b/mm/hmm.c
index 5955f2f0c83d..2bd3ebd1b8d6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -370,7 +370,8 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
npages, 0);
if (required_fault) {
- if (softleaf_is_device_private(entry))
+ if (softleaf_is_device_private(entry) ||
+ softleaf_is_swap(entry))
return hmm_vma_fault(addr, end, required_fault, walk);
else
return -EFAULT;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42887cf518cd..109e4dc4a167 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2375,6 +2375,14 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
return 0;
}
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+ pgtable_t pgtable;
+
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pte_free(mm, pgtable);
+ mm_dec_nr_ptes(mm);
+}
/*
* Return true if we do MADV_FREE successfully on entire pmd page.
* Otherwise, return false.
@@ -2399,8 +2407,23 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto out;
if (unlikely(!pmd_present(orig_pmd))) {
+ if (pmd_is_swap_entry(orig_pmd)) {
+ softleaf_t sl = softleaf_from_pmd(orig_pmd);
+
+ if (next - addr != HPAGE_PMD_SIZE) {
+ spin_unlock(ptl);
+ __split_huge_pmd(vma, pmd, addr, false);
+ goto out_unlocked;
+ }
+ pmdp_huge_get_and_clear(mm, addr, pmd);
+ zap_deposited_table(mm, pmd);
+ spin_unlock(ptl);
+ swap_put_entries_direct(sl, HPAGE_PMD_NR);
+ add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+ return true;
+ }
VM_BUG_ON(thp_migration_supported() &&
- !pmd_is_migration_entry(orig_pmd));
+ !pmd_is_migration_entry(orig_pmd));
goto out;
}
@@ -2449,15 +2472,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return ret;
}
-static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
-{
- pgtable_t pgtable;
-
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
- pte_free(mm, pgtable);
- mm_dec_nr_ptes(mm);
-}
-
static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t pmdval, struct folio *folio, bool is_present)
{
@@ -2550,6 +2564,16 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ if (pmd_is_swap_entry(orig_pmd)) {
+ softleaf_t sl = softleaf_from_pmd(orig_pmd);
+
+ zap_deposited_table(mm, pmd);
+ spin_unlock(ptl);
+ swap_put_entries_direct(sl, HPAGE_PMD_NR);
+ add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+ return true;
+ }
+
is_present = pmd_present(orig_pmd);
folio = normal_or_softleaf_folio_pmd(vma, addr, orig_pmd, is_present);
has_deposit = has_deposited_pgtable(vma, orig_pmd, folio);
@@ -2582,7 +2606,8 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
if (pgtable_supports_soft_dirty()) {
- if (unlikely(pmd_is_migration_entry(pmd)))
+ if (unlikely(pmd_is_migration_entry(pmd) ||
+ pmd_is_swap_entry(pmd)))
pmd = pmd_swp_mksoft_dirty(pmd);
else if (pmd_present(pmd))
pmd = pmd_mksoft_dirty(pmd);
@@ -2662,7 +2687,14 @@ static void change_non_present_huge_pmd(struct mm_struct *mm,
pmd_t newpmd;
VM_WARN_ON(!pmd_is_valid_softleaf(*pmd));
- if (softleaf_is_migration_write(entry)) {
+
+ /*
+ * PMD swap entries don't encode write permission in the entry type,
+ * so only uffd_wp flag changes apply. No folio lookup needed.
+ */
+ if (softleaf_is_swap(entry)) {
+ newpmd = *pmd;
+ } else if (softleaf_is_migration_write(entry)) {
const struct folio *folio = softleaf_to_folio(entry);
/*
@@ -2719,7 +2751,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (!ptl)
return 0;
- if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) {
+ if (pmd_is_valid_softleaf(*pmd)) {
change_non_present_huge_pmd(mm, addr, pmd, uffd_wp,
uffd_wp_resolve);
goto unlock;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b8452dbdb043..a7cc65c3d06a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -950,6 +950,12 @@ static inline enum scan_result check_pmd_state(pmd_t *pmd)
*/
if (pmd_is_migration_entry(pmde))
return SCAN_PMD_MAPPED;
+ /*
+ * A PMD-mapped THP that has been swapped out is still a THP from
+ * khugepaged's perspective; treat it like a present huge PMD.
+ */
+ if (pmd_is_swap_entry(pmde))
+ return SCAN_PMD_MAPPED;
if (!pmd_present(pmde))
return SCAN_NO_PTE_TABLE;
if (pmd_trans_huge(pmde))
diff --git a/mm/madvise.c b/mm/madvise.c
index 69708e953cf5..2702eb0b1134 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -390,7 +390,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (unlikely(!pmd_present(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
- !pmd_is_migration_entry(orig_pmd));
+ !pmd_is_migration_entry(orig_pmd) &&
+ !pmd_is_swap_entry(orig_pmd));
goto huge_unlock;
}
@@ -666,7 +667,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
int nr, max_nr;
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd))
+ if (pmd_trans_huge(*pmd) || pmd_is_swap_entry(*pmd))
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
return 0;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4e4421b22b59..55b38fe13a63 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -658,6 +658,8 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
qp->nr_failed++;
return;
}
+ if (unlikely(pmd_is_swap_entry(*pmd)))
+ return;
folio = pmd_folio(*pmd);
if (is_huge_zero_folio(folio)) {
walk->action = ACTION_CONTINUE;
--
2.52.0

Thread overview: 18+ messages
2026-04-27 10:01 [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 10:01 ` [PATCH 01/13] mm: add softleaf_to_pmd() and convert existing callers Usama Arif
2026-04-27 10:01 ` [PATCH 02/13] mm: extract ensure_on_mmlist() helper Usama Arif
2026-04-27 10:01 ` [PATCH 03/13] fs/proc: use softleaf_has_pfn() in pagemap PMD walker Usama Arif
2026-04-27 10:01 ` [PATCH 04/13] mm/huge_memory: move softleaf_to_folio() inside migration branch Usama Arif
2026-04-27 10:01 ` [PATCH 05/13] mm: add PMD swap entry detection support Usama Arif
2026-04-27 10:01 ` [PATCH 06/13] mm: add PMD swap entry splitting support Usama Arif
2026-04-27 10:01 ` [PATCH 07/13] mm: handle PMD swap entries in fork path Usama Arif
2026-04-27 10:01 ` [PATCH 08/13] mm: swap in PMD swap entries as whole THPs during swapoff Usama Arif
2026-04-27 10:01 ` Usama Arif [this message]
2026-04-27 10:01 ` [PATCH 10/13] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-04-27 10:02 ` [PATCH 11/13] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-04-27 10:02 ` [PATCH 12/13] mm: install PMD swap entries on swap-out Usama Arif
2026-04-27 10:02 ` [PATCH 13/13] selftests/mm: add PMD swap entry tests Usama Arif
2026-04-27 13:38 ` [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 18:26 ` Zi Yan
2026-04-27 20:12 ` Usama Arif
2026-04-28 19:54 ` David Hildenbrand (Arm)