* [PATCH RFC v3 4/4] mm: add PMD-level huge page support for remap_pfn_range()
2026-02-28 7:09 [PATCH RFC v3 0/4] mm: add huge pfnmap " Yin Tirui
@ 2026-02-28 7:09 ` Yin Tirui
2026-04-13 20:02 ` David Hildenbrand (Arm)
0 siblings, 1 reply; 3+ messages in thread
From: Yin Tirui @ 2026-02-28 7:09 UTC (permalink / raw)
To: linux-kernel, linux-mm, x86, linux-arm-kernel, willy, david,
catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, luto,
peterz, akpm, lorenzo.stoakes, ziy, baolin.wang, Liam.Howlett,
npache, ryan.roberts, dev.jain, baohua, lance.yang, vbabka, rppt,
surenb, mhocko, anshuman.khandual, rmclure, kevin.brodsky,
apopple, ajd, pasha.tatashin, bhe, thuth, coxu, dan.j.williams,
yu-cheng.yu, yangyicong, baolu.lu, jgross, conor.dooley,
Jonathan.Cameron, riel
Cc: wangkefeng.wang, chenjun102, yintirui
Add PMD-level huge page support to remap_pfn_range(), automatically
creating huge mappings when prerequisites are satisfied (size, alignment,
architecture support, etc.) and falling back to normal page mappings
otherwise.
Implement special huge PMD splitting by utilizing the pgtable deposit/
withdraw mechanism. When splitting is needed, the deposited pgtable is
withdrawn and populated with individual PTEs created from the original
huge mapping.
Signed-off-by: Yin Tirui <yintirui@huawei.com>
---
mm/huge_memory.c | 36 ++++++++++++++++++++++++++++++++++--
mm/memory.c | 40 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d4ca8cfd7f9d..e463d51005ee 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1857,6 +1857,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd = pmdp_get_lockless(src_pmd);
if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
!is_huge_zero_pmd(pmd))) {
+ pgtable = pte_alloc_one(dst_mm);
+ if (unlikely(!pgtable))
+ goto out;
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1870,6 +1873,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* able to wrongly write to the backend MMIO.
*/
VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
+
+ /* dax won't reach here, it will be intercepted at vma_needs_copy() */
+ VM_WARN_ON_ONCE(vma_is_dax(src_vma));
+
+ mm_inc_nr_ptes(dst_mm);
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
goto set_pmd;
}
@@ -2360,6 +2369,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
+ if (pmd_special(orig_pmd))
+ zap_deposited_table(tlb->mm, pmd);
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
@@ -3005,14 +3016,35 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (!vma_is_anonymous(vma)) {
old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
+
+ if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
+ pte_t entry;
+
+ if (!pmd_special(old_pmd)) {
+ zap_deposited_table(mm, pmd);
+ return;
+ }
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ if (unlikely(!pgtable))
+ return;
+ pmd_populate(mm, &_pmd, pgtable);
+ pte = pte_offset_map(&_pmd, haddr);
+ entry = pfn_pte(pmd_pfn(old_pmd), pmd_pgprot(old_pmd));
+ set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
+ pte_unmap(pte);
+
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ return;
+ }
+
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
- if (!vma_is_dax(vma) && vma_is_special_huge(vma))
- return;
+
if (unlikely(pmd_is_migration_entry(old_pmd))) {
const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
diff --git a/mm/memory.c b/mm/memory.c
index 07778814b4a8..affccf38cbcf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2890,6 +2890,40 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
return err;
}
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ pgtable_t pgtable;
+ spinlock_t *ptl;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(pfn, HPAGE_PMD_NR))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ pgtable = pte_alloc_one(mm);
+ if (unlikely(!pgtable))
+ return 0;
+
+ mm_inc_nr_ptes(mm);
+ ptl = pmd_lock(mm, pmd);
+ set_pmd_at(mm, addr, pmd, pmd_mkspecial(pmd_mkhuge(pfn_pmd(pfn, prot))));
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ spin_unlock(ptl);
+
+ return 1;
+}
+#endif
+
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
@@ -2905,6 +2939,12 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+ if (remap_try_huge_pmd(mm, pmd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot)) {
+ continue;
+ }
+#endif
err = remap_pte_range(mm, pmd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
--
2.22.0
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH RFC v3 4/4] mm: add PMD-level huge page support for remap_pfn_range()
2026-02-28 7:09 ` [PATCH RFC v3 4/4] mm: add PMD-level huge page " Yin Tirui
@ 2026-04-13 20:02 ` David Hildenbrand (Arm)
0 siblings, 0 replies; 3+ messages in thread
From: David Hildenbrand (Arm) @ 2026-04-13 20:02 UTC (permalink / raw)
To: Yin Tirui, linux-kernel, linux-mm, x86, linux-arm-kernel, willy,
catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, luto,
peterz, akpm, lorenzo.stoakes, ziy, baolin.wang, Liam.Howlett,
npache, ryan.roberts, dev.jain, baohua, lance.yang, vbabka, rppt,
surenb, mhocko, anshuman.khandual, rmclure, kevin.brodsky,
apopple, ajd, pasha.tatashin, bhe, thuth, coxu, dan.j.williams,
yu-cheng.yu, yangyicong, baolu.lu, jgross, conor.dooley,
Jonathan.Cameron, riel
Cc: wangkefeng.wang, chenjun102
On 2/28/26 08:09, Yin Tirui wrote:
> Add PMD-level huge page support to remap_pfn_range(), automatically
> creating huge mappings when prerequisites are satisfied (size, alignment,
> architecture support, etc.) and falling back to normal page mappings
> otherwise.
>
> Implement special huge PMD splitting by utilizing the pgtable deposit/
> withdraw mechanism. When splitting is needed, the deposited pgtable is
> withdrawn and populated with individual PTEs created from the original
> huge mapping.
>
> Signed-off-by: Yin Tirui <yintirui@huawei.com>
> ---
[...]
>
> if (!vma_is_anonymous(vma)) {
> old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
> +
> + if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
These magical vma checks are really bad. This all needs a cleanup
(Lorenzo is doing some, hoping it will look better on top of that).
> + pte_t entry;
> +
> + if (!pmd_special(old_pmd)) {
If you are using pmd_special(), you are doing something wrong.
Hint: vm_normal_page_pmd() is usually what you want.
> + zap_deposited_table(mm, pmd);
> + return;
> + }
> + pgtable = pgtable_trans_huge_withdraw(mm, pmd);
> + if (unlikely(!pgtable))
> + return;
> + pmd_populate(mm, &_pmd, pgtable);
> + pte = pte_offset_map(&_pmd, haddr);
> + entry = pfn_pte(pmd_pfn(old_pmd), pmd_pgprot(old_pmd));
> + set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
> + pte_unmap(pte);
> +
> + smp_wmb(); /* make pte visible before pmd */
> + pmd_populate(mm, pmd, pgtable);
> + return;
> + }
> +
> /*
> * We are going to unmap this huge page. So
> * just go ahead and zap it
> */
> if (arch_needs_pgtable_deposit())
> zap_deposited_table(mm, pmd);
> - if (!vma_is_dax(vma) && vma_is_special_huge(vma))
> - return;
> +
> if (unlikely(pmd_is_migration_entry(old_pmd))) {
> const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 07778814b4a8..affccf38cbcf 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2890,6 +2890,40 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
> return err;
> }
>
> +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
Why exactly do we need arch support for that in form of a Kconfig.
Usually, we guard pmd support by CONFIG_TRANSPARENT_HUGEPAGE.
And then, we must check at runtime if PMD leaves are actually supported.
Luiz is working on a cleanup series:
https://lore.kernel.org/r/cover.1775679721.git.luizcap@redhat.com
pgtable_has_pmd_leaves() is what you would want to check.
> +static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd,
> + unsigned long addr, unsigned long end,
> + unsigned long pfn, pgprot_t prot)
Use two-tab indent. (currently 3? :) )
Also, we tend to call these things now "pmd leaves". Call it
"remap_try_pmd_leaf" or something even more expressive like
"remap_try_install_pmd_leaf()"
> +{
> + pgtable_t pgtable;
> + spinlock_t *ptl;
> +
> + if ((end - addr) != PMD_SIZE)
if (end - addr != PMD_SIZE)
Should work
> + return 0;
> +
> + if (!IS_ALIGNED(addr, PMD_SIZE))
> + return 0;
> +
You could likely combine both things into a
if (!IS_ALIGNED(addr | end, PMD_SIZE))
> + if (!IS_ALIGNED(pfn, HPAGE_PMD_NR))
Another sign that you piggy-back on THP support ;)
> + return 0;
> +
> + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
> + return 0;
Ripping out a page table?! That doesn't sound right :)
Why is that required? We shouldn't be doing that here. Gah.
Especially, without any pmd locks etc.
> +
> + pgtable = pte_alloc_one(mm);
> + if (unlikely(!pgtable))
> + return 0;
> +
> + mm_inc_nr_ptes(mm);
> + ptl = pmd_lock(mm, pmd);
> + set_pmd_at(mm, addr, pmd, pmd_mkspecial(pmd_mkhuge(pfn_pmd(pfn, prot))));
> + pgtable_trans_huge_deposit(mm, pmd, pgtable);
> + spin_unlock(ptl);
> +
> + return 1;
> +}
> +#endif
> +
> static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
> unsigned long addr, unsigned long end,
> unsigned long pfn, pgprot_t prot)
> @@ -2905,6 +2939,12 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
> VM_BUG_ON(pmd_trans_huge(*pmd));
> do {
> next = pmd_addr_end(addr, end);
> +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
> + if (remap_try_huge_pmd(mm, pmd, addr, next,
> + pfn + (addr >> PAGE_SHIFT), prot)) {
Please provide a stub instead so we don't end up with ifdef in this code.
--
Cheers,
David
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH RFC v3 4/4] mm: add PMD-level huge page support for remap_pfn_range()
[not found] <5d04929b-576f-4926-9f3b-be9a41a3e010@gmail.com>
@ 2026-04-19 11:24 ` Yin Tirui
0 siblings, 0 replies; 3+ messages in thread
From: Yin Tirui @ 2026-04-19 11:24 UTC (permalink / raw)
To: David Hildenbrand (Arm), lorenzo.stoakes
Cc: linux-kernel, linux-mm, x86, linux-arm-kernel, willy, jgross,
catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, luto,
peterz, akpm, ziy, baolin.wang, Liam.Howlett, npache,
ryan.roberts, dev.jain, baohua, lance.yang, vbabka, rppt, surenb,
mhocko, anshuman.khandual, rmclure, kevin.brodsky, apopple, ajd,
pasha.tatashin, bhe, thuth, coxu, dan.j.williams, yu-cheng.yu,
yangyicong, baolu.lu, conor.dooley, Jonathan.Cameron, riel,
wangkefeng.wang, chenjun102
Hi David,
Thanks a lot for the thorough review!
On 4/14/26 04:02, David Hildenbrand (Arm) wrote:
> On 2/28/26 08:09, Yin Tirui wrote:
>> Add PMD-level huge page support to remap_pfn_range(), automatically
>> creating huge mappings when prerequisites are satisfied (size, alignment,
>> architecture support, etc.) and falling back to normal page mappings
>> otherwise.
>>
>> Implement special huge PMD splitting by utilizing the pgtable deposit/
>> withdraw mechanism. When splitting is needed, the deposited pgtable is
>> withdrawn and populated with individual PTEs created from the original
>> huge mapping.
>>
>> Signed-off-by: Yin Tirui <yintirui@huawei.com>
>> ---
>
> [...]
>
>>
>> if (!vma_is_anonymous(vma)) {
>> old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
>> +
>> + if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
>
> These magical vma checks are really bad. This all needs a cleanup
> (Lorenzo is doing some, hoping it will look better on top of that).
>
Agreed. I am following Lorenzo's recent cleanups closely.
>> + pte_t entry;
>> +
>> + if (!pmd_special(old_pmd)) {
>
> If you are using pmd_special(), you are doing something wrong.
>
> Hint: vm_normal_page_pmd() is usually what you want.
Spot on.
While looking into applying vm_normal_folio_pmd() here to avoid the
magical VMA checks, I realized that both __split_huge_pmd_locked() and
copy_huge_pmd() currently suffer from the same !vma_is_anonymous(vma)
top-level entanglement. I think these functions could benefit from a
structural refactoring similar to what Lorenzo is currently doing in
zap_huge_pmd().
My idea is to flatten both functions into a pmd_present()-driven
decision tree:
1. Branch strictly on pmd_present().
2. For present PMDs, rely exclusively on vm_normal_folio_pmd() to
determine the underlying memory type, rather than guessing from VMA flags.
3. If !folio (and not a huge zero page), it cleanly identifies special
mappings (like PFNMAPs) without relying on vma_is_special_huge(). We can
handle the split/copy directly and return early.
4. Otherwise, proceed with the normal Anon/File THP logic, or handle
non-present migration entries in the !pmd_present() branch.
I have drafted two preparation patches demonstrating this approach and
appended the diffs at the end of this email. Does this direction look
reasonable to you? If so, I will iron out the implementation details and
include these refactoring patches in my upcoming v4 series.
>
>> + zap_deposited_table(mm, pmd);
>> + return;
>> + }
>> + pgtable = pgtable_trans_huge_withdraw(mm, pmd);
>> + if (unlikely(!pgtable))
>> + return;
>> + pmd_populate(mm, &_pmd, pgtable);
>> + pte = pte_offset_map(&_pmd, haddr);
>> + entry = pfn_pte(pmd_pfn(old_pmd), pmd_pgprot(old_pmd));
>> + set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
>> + pte_unmap(pte);
>> +
>> + smp_wmb(); /* make pte visible before pmd */
>> + pmd_populate(mm, pmd, pgtable);
>> + return;
>> + }
>> +
>> /*
>> * We are going to unmap this huge page. So
>> * just go ahead and zap it
>> */
>> if (arch_needs_pgtable_deposit())
>> zap_deposited_table(mm, pmd);
>> - if (!vma_is_dax(vma) && vma_is_special_huge(vma))
>> - return;
>> +
>> if (unlikely(pmd_is_migration_entry(old_pmd))) {
>> const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 07778814b4a8..affccf38cbcf 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2890,6 +2890,40 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
>> return err;
>> }
>>
>> +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
>
> Why exactly do we need arch support for that in form of a Kconfig.
>
> Usually, we guard pmd support by CONFIG_TRANSPARENT_HUGEPAGE.
>
> And then, we must check at runtime if PMD leaves are actually supported.
>
> Luiz is working on a cleanup series:
>
> https://lore.kernel.org/r/cover.1775679721.git.luizcap@redhat.com
>
> pgtable_has_pmd_leaves() is what you would want to check.
Makes sense. This Kconfig was inherited from Peter Xu's earlier
proposal, but depending on CONFIG_TRANSPARENT_HUGEPAGE and
pgtable_has_pmd_leaves() is indeed the correct standard. I will rebase
on Luiz's series.
>
>
>> +static int remap_try_huge_pmd(struct mm_struct *mm, pmd_t *pmd,
>> + unsigned long addr, unsigned long end,
>> + unsigned long pfn, pgprot_t prot)
>
> Use two-tab indent. (currently 3? :) )
>
> Also, we tend to call these things now "pmd leaves". Call it
> "remap_try_pmd_leaf" or something even more expressive like
>
> "remap_try_install_pmd_leaf()"
>
Noted. Will fix the indentation and rename it.
>> +{
>> + pgtable_t pgtable;
>> + spinlock_t *ptl;
>> +
>> + if ((end - addr) != PMD_SIZE)
>
> if (end - addr != PMD_SIZE)
>
> Should work
Noted.
>
>> + return 0;
>> +
>> + if (!IS_ALIGNED(addr, PMD_SIZE))
>> + return 0;
>> +
>
> You could likely combine both things into a
>
> if (!IS_ALIGNED(addr | end, PMD_SIZE))
>
>> + if (!IS_ALIGNED(pfn, HPAGE_PMD_NR))
>
> Another sign that you piggy-back on THP support ;)
Indeed! :)
>
>> + return 0;
>> +
>> + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
>> + return 0;
>
> Ripping out a page table?! That doesn't sound right :)
>
> Why is that required? We shouldn't be doing that here. Gah.
>
> Especially, without any pmd locks etc.
...oops. That is indeed a silly one. Thanks for catching it.
I will fix this to:
if (!pmd_none(*pmd))
return 0;
>
>> +
>> + pgtable = pte_alloc_one(mm);
>> + if (unlikely(!pgtable))
>> + return 0;
>> +
>> + mm_inc_nr_ptes(mm);
>> + ptl = pmd_lock(mm, pmd);
>> + set_pmd_at(mm, addr, pmd, pmd_mkspecial(pmd_mkhuge(pfn_pmd(pfn, prot))));
>> + pgtable_trans_huge_deposit(mm, pmd, pgtable);
>> + spin_unlock(ptl);
>> +
>> + return 1;
>> +}
>> +#endif
>> +
>> static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
>> unsigned long addr, unsigned long end,
>> unsigned long pfn, pgprot_t prot)
>> @@ -2905,6 +2939,12 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
>> VM_BUG_ON(pmd_trans_huge(*pmd));
>> do {
>> next = pmd_addr_end(addr, end);
>> +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
>> + if (remap_try_huge_pmd(mm, pmd, addr, next,
>> + pfn + (addr >> PAGE_SHIFT), prot)) {
>
> Please provide a stub instead so we don't end up with ifdef in this code.
Will do.
>
Appendix:
Based on the mm-stable branch.
1. copy_huge_pmd()
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42c983821c03..3f8b3f15c6ba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1912,35 +1912,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm,
struct mm_struct *src_mm,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
spinlock_t *dst_ptl, *src_ptl;
- struct page *src_page;
struct folio *src_folio;
pmd_t pmd;
pgtable_t pgtable = NULL;
int ret = -ENOMEM;
- pmd = pmdp_get_lockless(src_pmd);
- if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
- !is_huge_zero_pmd(pmd))) {
- dst_ptl = pmd_lock(dst_mm, dst_pmd);
- src_ptl = pmd_lockptr(src_mm, src_pmd);
- spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
- /*
- * No need to recheck the pmd, it can't change with write
- * mmap lock held here.
- *
- * Meanwhile, making sure it's not a CoW VMA with writable
- * mapping, otherwise it means either the anon page wrongly
- * applied special bit, or we made the PRIVATE mapping be
- * able to wrongly write to the backend MMIO.
- */
- VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
- goto set_pmd;
- }
-
- /* Skip if can be re-fill on fault */
- if (!vma_is_anonymous(dst_vma))
- return 0;
-
pgtable = pte_alloc_one(dst_mm);
if (unlikely(!pgtable))
goto out;
@@ -1952,48 +1928,69 @@ int copy_huge_pmd(struct mm_struct *dst_mm,
struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- if (unlikely(thp_migration_supported() &&
- pmd_is_valid_softleaf(pmd))) {
+ if (likely(pmd_present(pmd))) {
+ src_folio = vm_normal_folio_pmd(src_vma, addr, pmd);
+ if (unlikely(!src_folio)) {
+ /*
+ * When page table lock is held, the huge zero pmd should not be
+ * under splitting since we don't split the page itself, only pmd to
+ * a page table.
+ */
+ if (is_huge_zero_pmd(pmd)) {
+ /*
+ * mm_get_huge_zero_folio() will never allocate a new
+ * folio here, since we already have a zero page to
+ * copy. It just takes a reference.
+ */
+ mm_get_huge_zero_folio(dst_mm);
+ goto out_zero_page;
+ }
+
+ /*
+ * Making sure it's not a CoW VMA with writable
+ * mapping, otherwise it means either the anon page wrongly
+ * applied special bit, or we made the PRIVATE mapping be
+ * able to wrongly write to the backend MMIO.
+ */
+ VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
+ pte_free(dst_mm, pgtable);
+ goto set_pmd;
+ }
+
+ if (!folio_test_anon(src_folio)) {
+ pte_free(dst_mm, pgtable);
+ ret = 0;
+ goto out_unlock;
+ }
+
+ folio_get(src_folio);
+ if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
dst_vma, src_vma))) {
+ /* Page maybe pinned: split and retry the fault on PTEs. */
+ folio_put(src_folio);
+ pte_free(dst_mm, pgtable);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pmd(src_vma, src_pmd, addr, false);
+ return -EAGAIN;
+ }
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+
+ } else if (unlikely(thp_migration_supported() &&
pmd_is_valid_softleaf(pmd))) {
+ if (unlikely(!vma_is_anonymous(dst_vma))) {
+ pte_free(dst_mm, pgtable);
+ ret = 0;
+ goto out_unlock;
+ }
copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
dst_vma, src_vma, pmd, pgtable);
ret = 0;
goto out_unlock;
- }
- if (unlikely(!pmd_trans_huge(pmd))) {
+ } else {
pte_free(dst_mm, pgtable);
goto out_unlock;
}
- /*
- * When page table lock is held, the huge zero pmd should not be
- * under splitting since we don't split the page itself, only pmd to
- * a page table.
- */
- if (is_huge_zero_pmd(pmd)) {
- /*
- * mm_get_huge_zero_folio() will never allocate a new
- * folio here, since we already have a zero page to
- * copy. It just takes a reference.
- */
- mm_get_huge_zero_folio(dst_mm);
- goto out_zero_page;
- }
- src_page = pmd_page(pmd);
- VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
- src_folio = page_folio(src_page);
-
- folio_get(src_folio);
- if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma,
src_vma))) {
- /* Page maybe pinned: split and retry the fault on PTEs. */
- folio_put(src_folio);
- pte_free(dst_mm, pgtable);
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
- __split_huge_pmd(src_vma, src_pmd, addr, false);
- return -EAGAIN;
- }
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
2. __split_huge_pmd_locked()
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3f8b3f15c6ba..c02c2843520f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3090,98 +3090,50 @@ static void __split_huge_pmd_locked(struct
vm_area_struct *vma, pmd_t *pmd,
count_vm_event(THP_SPLIT_PMD);
- if (!vma_is_anonymous(vma)) {
- old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
- /*
- * We are going to unmap this huge page. So
- * just go ahead and zap it
- */
- if (arch_needs_pgtable_deposit())
- zap_deposited_table(mm, pmd);
- if (vma_is_special_huge(vma))
- return;
- if (unlikely(pmd_is_migration_entry(old_pmd))) {
- const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
+ if (pmd_present(*pmd)) {
+ folio = vm_normal_folio_pmd(vma, haddr, *pmd);
- folio = softleaf_to_folio(old_entry);
- } else if (is_huge_zero_pmd(old_pmd)) {
+ if (unlikely(!folio)) {
+ /* Huge Zero Page */
+ if (is_huge_zero_pmd(*pmd))
+ /*
+ * FIXME: Do we want to invalidate secondary mmu by calling
+ * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
+ * inside __split_huge_pmd() ?
+ *
+ * We are going from a zero huge page write protected to zero
+ * small page also write protected so it does not seems useful
+ * to invalidate secondary mmu at this time.
+ */
+ return __split_huge_zero_page_pmd(vma, haddr, pmd);
+
+ /* Huge PFNMAP */
+ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(mm, pmd);
return;
- } else {
+ }
+
+ /* File/Shmem THP */
+ if (unlikely(!folio_test_anon(folio))) {
+ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(mm, pmd);
+ if (vma_is_special_huge(vma))
+ return;
+
page = pmd_page(old_pmd);
- folio = page_folio(page);
if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
folio_mark_dirty(folio);
if (!folio_test_referenced(folio) && pmd_young(old_pmd))
folio_set_referenced(folio);
folio_remove_rmap_pmd(folio, page, vma);
folio_put(folio);
+ add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
+ return;
}
- add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
- return;
- }
-
- if (is_huge_zero_pmd(*pmd)) {
- /*
- * FIXME: Do we want to invalidate secondary mmu by calling
- * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
- * inside __split_huge_pmd() ?
- *
- * We are going from a zero huge page write protected to zero
- * small page also write protected so it does not seems useful
- * to invalidate secondary mmu at this time.
- */
- return __split_huge_zero_page_pmd(vma, haddr, pmd);
- }
-
- if (pmd_is_migration_entry(*pmd)) {
- softleaf_t entry;
-
- old_pmd = *pmd;
- entry = softleaf_from_pmd(old_pmd);
- page = softleaf_to_page(entry);
- folio = page_folio(page);
-
- soft_dirty = pmd_swp_soft_dirty(old_pmd);
- uffd_wp = pmd_swp_uffd_wp(old_pmd);
-
- write = softleaf_is_migration_write(entry);
- if (PageAnon(page))
- anon_exclusive = softleaf_is_migration_read_exclusive(entry);
- young = softleaf_is_migration_young(entry);
- dirty = softleaf_is_migration_dirty(entry);
- } else if (pmd_is_device_private_entry(*pmd)) {
- softleaf_t entry;
-
- old_pmd = *pmd;
- entry = softleaf_from_pmd(old_pmd);
- page = softleaf_to_page(entry);
- folio = page_folio(page);
-
- soft_dirty = pmd_swp_soft_dirty(old_pmd);
- uffd_wp = pmd_swp_uffd_wp(old_pmd);
-
- write = softleaf_is_device_private_write(entry);
- anon_exclusive = PageAnonExclusive(page);
- /*
- * Device private THP should be treated the same as regular
- * folios w.r.t anon exclusive handling. See the comments for
- * folio handling and anon_exclusive below.
- */
- if (freeze && anon_exclusive &&
- folio_try_share_anon_rmap_pmd(folio, page))
- freeze = false;
- if (!freeze) {
- rmap_t rmap_flags = RMAP_NONE;
-
- folio_ref_add(folio, HPAGE_PMD_NR - 1);
- if (anon_exclusive)
- rmap_flags |= RMAP_EXCLUSIVE;
-
- folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
- vma, haddr, rmap_flags);
- }
- } else {
+ /* Anon THP */
/*
* Up to this point the pmd is present and huge and userland has
* the whole access to the hugepage during the split (which
@@ -3207,7 +3159,6 @@ static void __split_huge_pmd_locked(struct
vm_area_struct *vma, pmd_t *pmd,
*/
old_pmd = pmdp_invalidate(vma, haddr, pmd);
page = pmd_page(old_pmd);
- folio = page_folio(page);
if (pmd_dirty(old_pmd)) {
dirty = true;
folio_set_dirty(folio);
@@ -3218,8 +3169,6 @@ static void __split_huge_pmd_locked(struct
vm_area_struct *vma, pmd_t *pmd,
uffd_wp = pmd_uffd_wp(old_pmd);
VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
- VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
-
/*
* Without "freeze", we'll simply split the PMD, propagating the
* PageAnonExclusive() flag for each PTE by setting it for
@@ -3236,17 +3185,82 @@ static void __split_huge_pmd_locked(struct
vm_area_struct *vma, pmd_t *pmd,
* See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
*/
anon_exclusive = PageAnonExclusive(page);
- if (freeze && anon_exclusive &&
- folio_try_share_anon_rmap_pmd(folio, page))
+ if (freeze && anon_exclusive && folio_try_share_anon_rmap_pmd(folio,
page))
freeze = false;
if (!freeze) {
rmap_t rmap_flags = RMAP_NONE;
-
folio_ref_add(folio, HPAGE_PMD_NR - 1);
if (anon_exclusive)
rmap_flags |= RMAP_EXCLUSIVE;
- folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
- vma, haddr, rmap_flags);
+ folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, vma, haddr,
rmap_flags);
+ }
+ } else { /* pmd not present */
+ folio = pmd_to_softleaf_folio(*pmd);
+ if (unlikely(!folio))
+ return;
+
+ /* Migration of File/Shmem THP */
+ if (unlikely(!folio_test_anon(folio))) {
+ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
+ if (arch_needs_pgtable_deposit())
+ zap_deposited_table(mm, pmd);
+ if (vma_is_special_huge(vma))
+ return;
+ add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
+ return;
+ }
+
+ /* Migration of Anon THP or Device Private*/
+ if (pmd_is_migration_entry(*pmd)) {
+ softleaf_t entry;
+
+ old_pmd = *pmd;
+ entry = softleaf_from_pmd(old_pmd);
+ page = softleaf_to_page(entry);
+ folio = page_folio(page);
+
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+ write = softleaf_is_migration_write(entry);
+ if (PageAnon(page))
+ anon_exclusive = softleaf_is_migration_read_exclusive(entry);
+ young = softleaf_is_migration_young(entry);
+ dirty = softleaf_is_migration_dirty(entry);
+ } else if (pmd_is_device_private_entry(*pmd)) {
+ softleaf_t entry;
+
+ old_pmd = *pmd;
+ entry = softleaf_from_pmd(old_pmd);
+ page = softleaf_to_page(entry);
+
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+ write = softleaf_is_device_private_write(entry);
+ anon_exclusive = PageAnonExclusive(page);
+
+ /*
+ * Device private THP should be treated the same as regular
+ * folios w.r.t anon exclusive handling. See the comments for
+ * folio handling and anon_exclusive below.
+ */
+ if (freeze && anon_exclusive &&
+ folio_try_share_anon_rmap_pmd(folio, page))
+ freeze = false;
+ if (!freeze) {
+ rmap_t rmap_flags = RMAP_NONE;
+
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
+ if (anon_exclusive)
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
+ vma, haddr, rmap_flags);
+ }
+ } else {
+ VM_WARN_ONCE(1, "unknown situation.");
+ return;
}
}
--
2.43.0
--
Yin Tirui
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2026-04-19 11:24 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <5d04929b-576f-4926-9f3b-be9a41a3e010@gmail.com>
2026-04-19 11:24 ` [PATCH RFC v3 4/4] mm: add PMD-level huge page support for remap_pfn_range() Yin Tirui
2026-02-28 7:09 [PATCH RFC v3 0/4] mm: add huge pfnmap " Yin Tirui
2026-02-28 7:09 ` [PATCH RFC v3 4/4] mm: add PMD-level huge page " Yin Tirui
2026-04-13 20:02 ` David Hildenbrand (Arm)
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox