From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
david@kernel.org, chrisl@kernel.org, kasong@tencent.com,
ljs@kernel.org, ziy@nvidia.com
Cc: bhe@redhat.com, willy@infradead.org, youngjun.park@lge.com,
hannes@cmpxchg.org, riel@surriel.com, shakeel.butt@linux.dev,
alex@ghiti.fr, kas@kernel.org, baohua@kernel.org,
dev.jain@arm.com, baolin.wang@linux.alibaba.com,
npache@redhat.com, Liam.Howlett@oracle.com, ryan.roberts@arm.com,
Vlastimil Babka <vbabka@kernel.org>,
lance.yang@linux.dev, linux-kernel@vger.kernel.org,
nphamcs@gmail.com, shikemeng@huaweicloud.com,
kernel-team@meta.com, Usama Arif <usama.arif@linux.dev>
Subject: [PATCH 09/13] mm: handle PMD swap entries in non-present PMD walkers
Date: Mon, 27 Apr 2026 03:01:58 -0700 [thread overview]
Message-ID: <20260427100553.2754667-10-usama.arif@linux.dev> (raw)
In-Reply-To: <20260427100553.2754667-1-usama.arif@linux.dev>
Teach the remaining non-present PMD walkers about swap entries,
mirroring the PTE-level equivalents.
smaps_pmd_entry() accounts swap and swap_pss via a new shared
smaps_account_swap() helper used by both PTE and PMD paths.
zap_huge_pmd() frees swap slots via swap_put_entries_direct(),
matching zap_nonpresent_ptes().
change_non_present_huge_pmd() skips write-permission changes for swap
entries and only updates uffd_wp, matching change_softleaf_pte().
move_soft_dirty_pmd(), clear_soft_dirty_pmd(), and make_uffd_wp_pmd(),
pagemap_pmd_range_thp() and change_huge_pmd() handle swap entries
alongside migration entries.
madvise_cold_or_pageout_pmd_range() extends its non-present PMD
VM_BUG_ON to allow swap entries; without this, hitting a PMD swap
entry on a DEBUG_VM kernel would BUG().
queue_folios_pmd() in mempolicy silently skips swap entries, matching
the PTE walker which only counts migration entries as failures.
Without this, mbind(MPOL_MF_STRICT) would spuriously return -EIO on
a swapped-out THP.
madvise_free_huge_pmd() handles PMD swap entries directly: for a
full-range MADV_FREE it clears the PMD, frees the deposited page
table, and releases the swap slots; for a partial range it splits to
PTE swap entries. Without this, MADV_FREE silently becomes a no-op
on swapped-out THPs, leaking swap slots.
hmm_vma_handle_absent_pmd() faults in PMD swap entries via
hmm_vma_fault() instead of returning -EFAULT. The first per-page
handle_mm_fault() call triggers do_huge_pmd_swap_page(), which maps
the entire folio; subsequent calls become harmless
huge_pmd_set_accessed() and the walker retries with a present PMD.
Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
fs/proc/task_mmu.c | 43 +++++++++++++++++++++-------------
mm/hmm.c | 3 ++-
mm/huge_memory.c | 58 +++++++++++++++++++++++++++++++++++-----------
mm/khugepaged.c | 6 +++++
mm/madvise.c | 5 ++--
mm/mempolicy.c | 2 ++
6 files changed, 85 insertions(+), 32 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6d9f43881e62..a6dd91d4cf24 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1015,6 +1015,23 @@ static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
#endif
}
+static void smaps_account_swap(struct mem_size_stats *mss,
+ softleaf_t entry, unsigned long size)
+{
+ int mapcount;
+
+ mss->swap += size;
+ mapcount = swp_swapcount(entry);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)size << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)size << PSS_SHIFT;
+ }
+}
+
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
@@ -1036,18 +1053,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
const softleaf_t entry = softleaf_from_pte(ptent);
if (softleaf_is_swap(entry)) {
- int mapcount;
-
- mss->swap += PAGE_SIZE;
- mapcount = swp_swapcount(entry);
- if (mapcount >= 2) {
- u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
-
- do_div(pss_delta, mapcount);
- mss->swap_pss += pss_delta;
- } else {
- mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
- }
+ smaps_account_swap(mss, entry, PAGE_SIZE);
} else if (softleaf_has_pfn(entry)) {
if (softleaf_is_device_private(entry))
present = true;
@@ -1077,9 +1083,13 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
if (pmd_present(*pmd)) {
page = vm_normal_page_pmd(vma, addr, *pmd);
present = true;
- } else if (unlikely(thp_migration_supported())) {
+ } else {
const softleaf_t entry = softleaf_from_pmd(*pmd);
+ if (softleaf_is_swap(entry)) {
+ smaps_account_swap(mss, entry, HPAGE_PMD_SIZE);
+ return;
+ }
if (softleaf_has_pfn(entry))
page = softleaf_to_page(entry);
}
@@ -1665,7 +1675,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
pmd = pmd_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
- } else if (pmd_is_migration_entry(pmd)) {
+ } else if (pmd_is_migration_entry(pmd) || pmd_is_swap_entry(pmd)) {
pmd = pmd_swp_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
@@ -2025,7 +2035,8 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
flags |= PM_UFFD_WP;
if (pm->show_pfn)
frame = pmd_pfn(pmd) + idx;
- } else if (thp_migration_supported()) {
+ } else if (pmd_is_swap_entry(pmd) ||
+ (thp_migration_supported() && pmd_is_migration_entry(pmd))) {
const softleaf_t entry = softleaf_from_pmd(pmd);
unsigned long offset;
@@ -2463,7 +2474,7 @@ static void make_uffd_wp_pmd(struct vm_area_struct *vma,
old = pmdp_invalidate_ad(vma, addr, pmdp);
pmd = pmd_mkuffd_wp(old);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
- } else if (pmd_is_migration_entry(pmd)) {
+ } else if (pmd_is_migration_entry(pmd) || pmd_is_swap_entry(pmd)) {
pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
diff --git a/mm/hmm.c b/mm/hmm.c
index 5955f2f0c83d..2bd3ebd1b8d6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -370,7 +370,8 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
npages, 0);
if (required_fault) {
- if (softleaf_is_device_private(entry))
+ if (softleaf_is_device_private(entry) ||
+ softleaf_is_swap(entry))
return hmm_vma_fault(addr, end, required_fault, walk);
else
return -EFAULT;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42887cf518cd..109e4dc4a167 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2375,6 +2375,14 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
return 0;
}
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+ pgtable_t pgtable;
+
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pte_free(mm, pgtable);
+ mm_dec_nr_ptes(mm);
+}
/*
* Return true if we do MADV_FREE successfully on entire pmd page.
* Otherwise, return false.
@@ -2399,8 +2407,23 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto out;
if (unlikely(!pmd_present(orig_pmd))) {
+ if (pmd_is_swap_entry(orig_pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE) {
+ spin_unlock(ptl);
+ __split_huge_pmd(vma, pmd, addr, false);
+ goto out_unlocked;
+ }
+ softleaf_t sl = softleaf_from_pmd(orig_pmd);
+
+ pmdp_huge_get_and_clear(mm, addr, pmd);
+ zap_deposited_table(mm, pmd);
+ spin_unlock(ptl);
+ swap_put_entries_direct(sl, HPAGE_PMD_NR);
+ add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+ return true;
+ }
VM_BUG_ON(thp_migration_supported() &&
- !pmd_is_migration_entry(orig_pmd));
+ !pmd_is_migration_entry(orig_pmd));
goto out;
}
@@ -2449,15 +2472,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return ret;
}
-static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
-{
- pgtable_t pgtable;
-
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
- pte_free(mm, pgtable);
- mm_dec_nr_ptes(mm);
-}
-
static void zap_huge_pmd_folio(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t pmdval, struct folio *folio, bool is_present)
{
@@ -2550,6 +2564,16 @@ bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ if (pmd_is_swap_entry(orig_pmd)) {
+ softleaf_t sl = softleaf_from_pmd(orig_pmd);
+
+ zap_deposited_table(mm, pmd);
+ spin_unlock(ptl);
+ swap_put_entries_direct(sl, HPAGE_PMD_NR);
+ add_mm_counter(mm, MM_SWAPENTS, -HPAGE_PMD_NR);
+ return true;
+ }
+
is_present = pmd_present(orig_pmd);
folio = normal_or_softleaf_folio_pmd(vma, addr, orig_pmd, is_present);
has_deposit = has_deposited_pgtable(vma, orig_pmd, folio);
@@ -2582,7 +2606,8 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
if (pgtable_supports_soft_dirty()) {
- if (unlikely(pmd_is_migration_entry(pmd)))
+ if (unlikely(pmd_is_migration_entry(pmd) ||
+ pmd_is_swap_entry(pmd)))
pmd = pmd_swp_mksoft_dirty(pmd);
else if (pmd_present(pmd))
pmd = pmd_mksoft_dirty(pmd);
@@ -2662,7 +2687,14 @@ static void change_non_present_huge_pmd(struct mm_struct *mm,
pmd_t newpmd;
VM_WARN_ON(!pmd_is_valid_softleaf(*pmd));
- if (softleaf_is_migration_write(entry)) {
+
+ /*
+ * PMD swap entries don't encode write permission in the entry type,
+ * so only uffd_wp flag changes apply. No folio lookup needed.
+ */
+ if (softleaf_is_swap(entry)) {
+ newpmd = *pmd;
+ } else if (softleaf_is_migration_write(entry)) {
const struct folio *folio = softleaf_to_folio(entry);
/*
@@ -2719,7 +2751,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (!ptl)
return 0;
- if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) {
+ if (pmd_is_valid_softleaf(*pmd)) {
change_non_present_huge_pmd(mm, addr, pmd, uffd_wp,
uffd_wp_resolve);
goto unlock;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b8452dbdb043..a7cc65c3d06a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -950,6 +950,12 @@ static inline enum scan_result check_pmd_state(pmd_t *pmd)
*/
if (pmd_is_migration_entry(pmde))
return SCAN_PMD_MAPPED;
+ /*
+ * A PMD-mapped THP that has been swapped out is still a THP from
+ * khugepaged's perspective; treat it like a present huge PMD.
+ */
+ if (pmd_is_swap_entry(pmde))
+ return SCAN_PMD_MAPPED;
if (!pmd_present(pmde))
return SCAN_NO_PTE_TABLE;
if (pmd_trans_huge(pmde))
diff --git a/mm/madvise.c b/mm/madvise.c
index 69708e953cf5..2702eb0b1134 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -390,7 +390,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (unlikely(!pmd_present(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
- !pmd_is_migration_entry(orig_pmd));
+ !pmd_is_migration_entry(orig_pmd) &&
+ !pmd_is_swap_entry(orig_pmd));
goto huge_unlock;
}
@@ -666,7 +667,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
int nr, max_nr;
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd))
+ if (pmd_trans_huge(*pmd) || pmd_is_swap_entry(*pmd))
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
return 0;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4e4421b22b59..55b38fe13a63 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -658,6 +658,8 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
qp->nr_failed++;
return;
}
+ if (unlikely(pmd_is_swap_entry(*pmd)))
+ return;
folio = pmd_folio(*pmd);
if (is_huge_zero_folio(folio)) {
walk->action = ACTION_CONTINUE;
--
2.52.0
next prev parent reply other threads:[~2026-04-27 10:06 UTC|newest]
Thread overview: 39+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-27 10:01 [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 10:01 ` [PATCH 01/13] mm: add softleaf_to_pmd() and convert existing callers Usama Arif
2026-05-13 19:24 ` David Hildenbrand (Arm)
2026-05-29 7:20 ` Dev Jain
2026-05-29 14:47 ` Usama Arif
2026-04-27 10:01 ` [PATCH 02/13] mm: extract ensure_on_mmlist() helper Usama Arif
2026-05-13 13:32 ` David Hildenbrand (Arm)
2026-05-13 17:21 ` Usama Arif
2026-05-13 19:22 ` David Hildenbrand (Arm)
2026-05-29 7:42 ` Dev Jain
2026-04-27 10:01 ` [PATCH 03/13] fs/proc: use softleaf_has_pfn() in pagemap PMD walker Usama Arif
2026-05-13 13:35 ` David Hildenbrand (Arm)
2026-05-29 9:34 ` Dev Jain
2026-04-27 10:01 ` [PATCH 04/13] mm/huge_memory: move softleaf_to_folio() inside migration branch Usama Arif
2026-05-13 19:25 ` David Hildenbrand (Arm)
2026-05-29 11:31 ` Dev Jain
2026-04-27 10:01 ` [PATCH 05/13] mm: add PMD swap entry detection support Usama Arif
2026-05-30 8:06 ` Dev Jain
2026-04-27 10:01 ` [PATCH 06/13] mm: add PMD swap entry splitting support Usama Arif
2026-05-30 10:52 ` Dev Jain
2026-06-02 12:59 ` Usama Arif
2026-04-27 10:01 ` [PATCH 07/13] mm: handle PMD swap entries in fork path Usama Arif
2026-04-27 10:01 ` [PATCH 08/13] mm: swap in PMD swap entries as whole THPs during swapoff Usama Arif
2026-05-26 19:44 ` Alexandre Ghiti
2026-05-29 14:49 ` Usama Arif
2026-04-27 10:01 ` Usama Arif [this message]
2026-04-27 10:01 ` [PATCH 10/13] mm: handle PMD swap entries in UFFDIO_MOVE Usama Arif
2026-04-27 10:02 ` [PATCH 11/13] mm: handle PMD swap entry faults on swap-in Usama Arif
2026-04-27 10:02 ` [PATCH 12/13] mm: install PMD swap entries on swap-out Usama Arif
2026-04-27 10:02 ` [PATCH 13/13] selftests/mm: add PMD swap entry tests Usama Arif
2026-04-27 13:38 ` [PATCH 00/13] mm: PMD-level swap entries for anonymous THPs Usama Arif
2026-04-27 18:26 ` Zi Yan
2026-04-27 20:12 ` Usama Arif
2026-04-29 12:57 ` Zi Yan
2026-04-28 19:54 ` David Hildenbrand (Arm)
2026-04-29 9:39 ` Usama Arif
2026-04-29 12:52 ` Lorenzo Stoakes
2026-04-29 10:44 ` Kairui Song
2026-04-30 10:38 ` Usama Arif
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260427100553.2754667-10-usama.arif@linux.dev \
--to=usama.arif@linux.dev \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=alex@ghiti.fr \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bhe@redhat.com \
--cc=chrisl@kernel.org \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=hannes@cmpxchg.org \
--cc=kas@kernel.org \
--cc=kasong@tencent.com \
--cc=kernel-team@meta.com \
--cc=lance.yang@linux.dev \
--cc=linux-kernel@vger.kernel.org \
--cc=ljs@kernel.org \
--cc=npache@redhat.com \
--cc=nphamcs@gmail.com \
--cc=riel@surriel.com \
--cc=ryan.roberts@arm.com \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=vbabka@kernel.org \
--cc=willy@infradead.org \
--cc=youngjun.park@lge.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.