* [PATCH 1/7] riscv: mm: split raw and public PTE helpers
From: Yunhui Cui @ 2026-04-21 9:24 UTC
To: akpm, alex, andrew+kernel, andreyknvl, anup, aou, apopple, ardb,
atish.patra, baolin.wang, cuiyunhui, david, debug,
djordje.todorovic, dvyukov, elver, glider, ilias.apalodimas,
junhui.liu, kasan-dev, kees, kevin.brodsky, kvm-riscv, kvm,
linux-efi, linux-kernel, linux-riscv, liu.xuemei1, ljs, namcao,
osalvador, palmer, pjw, rmclure, rostedt, rppt, ryabinin.a.a,
surenb, vincenzo.frascino, vishal.moola, wangruikang,
zhangchunyan
Introduce raw PTE helpers prefixed with double underscores for callers
that need direct access to the underlying PTE encoding. These __* helpers
form the private low-level primitives, while the existing names remain the
public core-MM-facing API, which RISC-V can later extend with additional
architecture-specific semantics without exposing those details to generic
callers.
Switch kernel-internal page-table users in early boot, KASAN, EFI,
hibernate and pageattr over to the private raw helpers.
No functional change intended.
Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
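The shape of the split, condensed from the pgtable.h hunk below: the
double-underscore primitive is a plain store of the encoded entry, and the
public name stays a thin wrapper that a later patch can extend without
touching raw callers.

/* Raw primitive: stores the encoded entry as-is, no extra semantics. */
static inline void __set_pte(pte_t *ptep, pte_t pteval)
{
        WRITE_ONCE(*ptep, pteval);
}

/* Public core-MM API: a 1:1 wrapper for now, Svnapot-aware later. */
static inline void set_pte(pte_t *ptep, pte_t pteval)
{
        __set_pte(ptep, pteval);
}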
arch/riscv/include/asm/kfence.h | 4 +-
arch/riscv/include/asm/pgtable.h | 87 ++++++++++++++++++++++++++++----
arch/riscv/kernel/efi.c | 4 +-
arch/riscv/kernel/hibernate.c | 2 +-
arch/riscv/mm/fault.c | 4 +-
arch/riscv/mm/init.c | 8 +--
arch/riscv/mm/kasan_init.c | 14 ++---
arch/riscv/mm/pageattr.c | 12 ++---
arch/riscv/mm/pgtable.c | 19 ++++++-
9 files changed, 117 insertions(+), 37 deletions(-)
diff --git a/arch/riscv/include/asm/kfence.h b/arch/riscv/include/asm/kfence.h
index d08bf7fb3aee6..2bcaeff1167c6 100644
--- a/arch/riscv/include/asm/kfence.h
+++ b/arch/riscv/include/asm/kfence.h
@@ -18,9 +18,9 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect)
pte_t *pte = virt_to_kpte(addr);
if (protect)
- set_pte(pte, __pte(pte_val(ptep_get(pte)) & ~_PAGE_PRESENT));
+ __set_pte(pte, __pte(pte_val(__ptep_get(pte)) & ~_PAGE_PRESENT));
else
- set_pte(pte, __pte(pte_val(ptep_get(pte)) | _PAGE_PRESENT));
+ __set_pte(pte, __pte(pte_val(__ptep_get(pte)) | _PAGE_PRESENT));
preempt_disable();
local_flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index a1a7c6520a095..4de1f40fa77ea 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -602,11 +602,18 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)
* a page table are directly modified. Thus, the following hook is
* made available.
*/
-static inline void set_pte(pte_t *ptep, pte_t pteval)
+static inline void __set_pte(pte_t *ptep, pte_t pteval)
{
WRITE_ONCE(*ptep, pteval);
}
+#define __set_pte __set_pte
+
+static inline void set_pte(pte_t *ptep, pte_t pteval)
+{
+ __set_pte(ptep, pteval);
+}
+
void flush_icache_pte(struct mm_struct *mm, pte_t pte);
static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
@@ -619,8 +626,8 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT
-static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval, unsigned int nr)
+static inline void __set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval, unsigned int nr)
{
page_table_check_ptes_set(mm, addr, ptep, pteval, nr);
@@ -632,31 +639,61 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_val(pteval) += 1 << _PAGE_PFN_SHIFT;
}
}
-#define set_ptes set_ptes
+
+#define __set_ptes __set_ptes
+
+static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval, unsigned int nr)
+{
+ __set_ptes(mm, addr, ptep, pteval, nr);
+}
+
+static inline void __pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ __set_pte_at(mm, ptep, __pte(0));
+}
static inline void pte_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- __set_pte_at(mm, ptep, __pte(0));
+ __pte_clear(mm, addr, ptep);
+}
+
+#define __ptep_get __ptep_get
+static inline pte_t __ptep_get(pte_t *ptep)
+{
+ return READ_ONCE(*ptep);
+}
+
+#define __ptep_get_lockless __ptep_get_lockless
+static inline pte_t __ptep_get_lockless(pte_t *ptep)
+{
+ return __ptep_get(ptep);
}
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* defined in mm/pgtable.c */
extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep, pte_t entry, int dirty);
+int __ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG /* defined in mm/pgtable.c */
bool ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep);
+bool __ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
+static inline pte_t
+__ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
#ifdef CONFIG_SMP
pte_t pte = __pte(xchg(&ptep->pte, 0));
#else
pte_t pte = *ptep;
- set_pte(ptep, __pte(0));
+ __set_pte(ptep, __pte(0));
#endif
page_table_check_pte_clear(mm, address, pte);
@@ -664,9 +701,16 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
return pte;
}
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
+#define __ptep_get_and_clear __ptep_get_and_clear
+
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
+{
+ return __ptep_get_and_clear(mm, address, ptep);
+}
+
+static inline void
+__ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
pte_t read_pte = READ_ONCE(*ptep);
/*
@@ -679,6 +723,27 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
((pte_val(read_pte) & ~(unsigned long)_PAGE_WRITE) | _PAGE_READ));
}
+#define __ptep_set_wrprotect __ptep_set_wrprotect
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
+{
+ __ptep_set_wrprotect(mm, address, ptep);
+}
+
+static inline pte_t __ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *ptep)
+{
+ pte_t pte = __ptep_get_and_clear(vma->vm_mm, address, ptep);
+
+ if (pte_accessible(vma->vm_mm, pte))
+ flush_tlb_page(vma, address);
+
+ return pte;
+}
+
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
static inline bool ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
diff --git a/arch/riscv/kernel/efi.c b/arch/riscv/kernel/efi.c
index b64bf1624a052..673eca7705ba5 100644
--- a/arch/riscv/kernel/efi.c
+++ b/arch/riscv/kernel/efi.c
@@ -60,7 +60,7 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md)
static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data)
{
efi_memory_desc_t *md = data;
- pte_t pte = ptep_get(ptep);
+ pte_t pte = __ptep_get(ptep);
unsigned long val;
if (md->attribute & EFI_MEMORY_RO) {
@@ -72,7 +72,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data)
val = pte_val(pte) & ~_PAGE_EXEC;
pte = __pte(val);
}
- set_pte(ptep, pte);
+ __set_pte(ptep, pte);
return 0;
}
diff --git a/arch/riscv/kernel/hibernate.c b/arch/riscv/kernel/hibernate.c
index 982843828adb7..0360a6f3e1bf2 100644
--- a/arch/riscv/kernel/hibernate.c
+++ b/arch/riscv/kernel/hibernate.c
@@ -186,7 +186,7 @@ static int temp_pgtable_map_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long
pte_t pte = READ_ONCE(*src_ptep);
if (pte_present(pte))
- set_pte(dst_ptep, __pte(pte_val(pte) | pgprot_val(prot)));
+ __set_pte(dst_ptep, __pte(pte_val(pte) | pgprot_val(prot)));
} while (dst_ptep++, src_ptep++, start += PAGE_SIZE, start < end);
return 0;
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 04ed6f8acae4f..fe8b11a8ad143 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -69,7 +69,7 @@ static void show_pte(unsigned long addr)
if (!ptep)
goto out;
- pte = ptep_get(ptep);
+ pte = READ_ONCE(*ptep);
pr_cont(", pte=%016lx", pte_val(pte));
pte_unmap(ptep);
out:
@@ -231,7 +231,7 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a
* silently loop forever.
*/
pte_k = pte_offset_kernel(pmd_k, addr);
- if (!pte_present(ptep_get(pte_k))) {
+ if (!pte_present(__ptep_get(pte_k))) {
no_context(regs, addr);
return;
}
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index decd7df40fa42..86321b093d252 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -376,9 +376,9 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
ptep = &fixmap_pte[pte_index(addr)];
if (pgprot_val(prot))
- set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, prot));
+ __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, prot));
else
- pte_clear(&init_mm, addr, ptep);
+ __pte_clear(&init_mm, addr, ptep);
local_flush_tlb_page(addr);
}
@@ -1558,11 +1558,11 @@ static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, un
next = end;
ptep = pte_base + pte_index(addr);
- pte = ptep_get(ptep);
+ pte = __ptep_get(ptep);
if (!pte_present(*ptep))
continue;
- pte_clear(&init_mm, addr, ptep);
+ __pte_clear(&init_mm, addr, ptep);
if (is_vmemmap)
free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
}
diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index c4a2a9e5586e7..0c2f5e8e48063 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -39,9 +39,9 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned
ptep = pte_offset_kernel(pmd, vaddr);
do {
- if (pte_none(ptep_get(ptep))) {
+ if (pte_none(__ptep_get(ptep))) {
phys_addr = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
- set_pte(ptep, pfn_pte(PFN_DOWN(phys_addr), PAGE_KERNEL));
+ __set_pte(ptep, pfn_pte(PFN_DOWN(phys_addr), PAGE_KERNEL));
memset(__va(phys_addr), KASAN_SHADOW_INIT, PAGE_SIZE);
}
} while (ptep++, vaddr += PAGE_SIZE, vaddr != end);
@@ -327,8 +327,8 @@ asmlinkage void __init kasan_early_init(void)
KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT)));
for (i = 0; i < PTRS_PER_PTE; ++i)
- set_pte(kasan_early_shadow_pte + i,
- pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL));
+ __set_pte(kasan_early_shadow_pte + i,
+ pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL));
for (i = 0; i < PTRS_PER_PMD; ++i)
set_pmd(kasan_early_shadow_pmd + i,
@@ -523,9 +523,9 @@ void __init kasan_init(void)
kasan_mem_to_shadow((const void *)MODULES_VADDR + SZ_2G));
for (i = 0; i < PTRS_PER_PTE; i++)
- set_pte(&kasan_early_shadow_pte[i],
- mk_pte(virt_to_page(kasan_early_shadow_page),
- __pgprot(_PAGE_PRESENT | _PAGE_READ |
+ __set_pte(&kasan_early_shadow_pte[i],
+ mk_pte(virt_to_page(kasan_early_shadow_page),
+ __pgprot(_PAGE_PRESENT | _PAGE_READ |
_PAGE_ACCESSED)));
memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE);
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index 3f76db3d27699..e0271e2a0b295 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -68,10 +68,10 @@ static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
static int pageattr_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
- pte_t val = ptep_get(pte);
+ pte_t val = __ptep_get(pte);
val = __pte(set_pageattr_masks(pte_val(val), walk));
- set_pte(pte, val);
+ __set_pte(pte, val);
return 0;
}
@@ -121,7 +121,7 @@ static int __split_linear_mapping_pmd(pud_t *pudp,
ptep_new = (pte_t *)page_address(pte_page);
for (i = 0; i < PTRS_PER_PTE; ++i, ++ptep_new)
- set_pte(ptep_new, pfn_pte(pfn + i, prot));
+ __set_pte(ptep_new, pfn_pte(pfn + i, prot));
smp_wmb();
@@ -406,14 +406,14 @@ static int debug_pagealloc_set_page(pte_t *pte, unsigned long addr, void *data)
{
int enable = *(int *)data;
- unsigned long val = pte_val(ptep_get(pte));
+ unsigned long val = pte_val(__ptep_get(pte));
if (enable)
val |= _PAGE_PRESENT;
else
val &= ~_PAGE_PRESENT;
- set_pte(pte, __pte(val));
+ __set_pte(pte, __pte(val));
return 0;
}
@@ -466,5 +466,5 @@ bool kernel_page_present(struct page *page)
return true;
pte = pte_offset_kernel(pmd, addr);
- return pte_present(ptep_get(pte));
+ return pte_present(__ptep_get(pte));
}
diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index 9c4427d0b1874..9131a78fe15c4 100644
--- a/arch/riscv/mm/pgtable.c
+++ b/arch/riscv/mm/pgtable.c
@@ -8,6 +8,13 @@
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
+{
+ return __ptep_set_access_flags(vma, address, ptep, entry, dirty);
+}
+
+int __ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
{
if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SVVPTC)) {
if (!pte_same(ptep_get(ptep), entry)) {
@@ -32,11 +39,19 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
bool ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
- if (!pte_young(ptep_get(ptep)))
+ return __ptep_test_and_clear_young(vma, address, ptep);
+}
+EXPORT_SYMBOL_GPL(ptep_test_and_clear_young);
+
+bool __ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ if (!pte_young(__ptep_get(ptep)))
return false;
+
return test_and_clear_bit(_PAGE_ACCESSED_OFFSET, &pte_val(*ptep));
}
-EXPORT_SYMBOL_GPL(ptep_test_and_clear_young);
+EXPORT_SYMBOL_GPL(__ptep_test_and_clear_young);
#ifdef CONFIG_64BIT
pud_t *pud_offset(p4d_t *p4d, unsigned long address)
--
2.39.5
* [PATCH 2/7] riscv/kvm: use raw PTE helpers for G-stage leaf PTEs
From: Yunhui Cui @ 2026-04-21 9:24 UTC
To: akpm, alex, andrew+kernel, andreyknvl, anup, aou, apopple, ardb,
atish.patra, baolin.wang, cuiyunhui, david, debug,
djordje.todorovic, dvyukov, elver, glider, ilias.apalodimas,
junhui.liu, kasan-dev, kees, kevin.brodsky, kvm-riscv, kvm,
linux-efi, linux-kernel, linux-riscv, liu.xuemei1, ljs, namcao,
osalvador, palmer, pjw, rmclure, rostedt, rppt, ryabinin.a.a,
surenb, vincenzo.frascino, vishal.moola, wangruikang,
zhangchunyan
Use the raw RISC-V PTE helpers when KVM G-stage code needs to inspect
or update the exact leaf entry encoding. This keeps G-stage page tables
independent of the public PTE wrappers that will gain Svnapot-aware
behavior.
No functional change intended.
Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
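The G-stage walker only cares about the bits physically stored in the
table, so every read goes through the raw accessor. A condensed sketch of
the kvm_riscv_gstage_get_leaf() loop as changed below (out-parameters and
error handling omitted):

/* Descend while entries are non-zero; a leaf ends the walk. */
while (ptep && pte_val(__ptep_get(ptep))) {
        if (gstage_pte_leaf(ptep))
                return ptep;            /* exact stored leaf encoding */
        if (!current_level)
                break;                  /* bottom level reached, no leaf */
        current_level--;
        ptep = (pte_t *)gstage_pte_page_vaddr(__ptep_get(ptep));
        ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
}
return NULL;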
arch/riscv/kvm/gstage.c | 48 ++++++++++++++++++++++-------------------
arch/riscv/kvm/mmu.c | 4 ++--
2 files changed, 28 insertions(+), 24 deletions(-)
diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
index d9fe8be2a1516..fda235092533a 100644
--- a/arch/riscv/kvm/gstage.c
+++ b/arch/riscv/kvm/gstage.c
@@ -88,7 +88,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
*ptep_level = current_level;
ptep = (pte_t *)gstage->pgd;
ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
- while (ptep && pte_val(ptep_get(ptep))) {
+ while (ptep && pte_val(__ptep_get(ptep))) {
if (gstage_pte_leaf(ptep)) {
*ptep_level = current_level;
*ptepp = ptep;
@@ -98,7 +98,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
if (current_level) {
current_level--;
*ptep_level = current_level;
- ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ ptep = (pte_t *)gstage_pte_page_vaddr(__ptep_get(ptep));
ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
} else {
ptep = NULL;
@@ -138,18 +138,19 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
if (gstage_pte_leaf(ptep))
return -EEXIST;
- if (!pte_val(ptep_get(ptep))) {
+ if (!pte_val(__ptep_get(ptep))) {
if (!pcache)
return -ENOMEM;
next_ptep = kvm_mmu_memory_cache_alloc(pcache);
if (!next_ptep)
return -ENOMEM;
- set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
- __pgprot(_PAGE_TABLE)));
+ __set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(next_ptep)),
+ __pgprot(_PAGE_TABLE)));
} else {
if (gstage_pte_leaf(ptep))
return -EEXIST;
- next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ next_ptep = (pte_t *)gstage_pte_page_vaddr(__ptep_get(ptep));
}
current_level--;
@@ -157,7 +158,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
}
if (pte_val(*ptep) != pte_val(map->pte)) {
- set_pte(ptep, map->pte);
+ __set_pte(ptep, map->pte);
if (gstage_pte_leaf(ptep))
gstage_tlb_flush(gstage, current_level, map->addr);
}
@@ -170,13 +171,13 @@ static void kvm_riscv_gstage_update_pte_prot(struct kvm_gstage *gstage, u32 leve
{
pte_t new_pte;
- if (pgprot_val(pte_pgprot(ptep_get(ptep))) == pgprot_val(prot))
+ if (pgprot_val(pte_pgprot(__ptep_get(ptep))) == pgprot_val(prot))
return;
- new_pte = pfn_pte(pte_pfn(ptep_get(ptep)), prot);
+ new_pte = pfn_pte(pte_pfn(__ptep_get(ptep)), prot);
new_pte = pte_mkdirty(new_pte);
- set_pte(ptep, new_pte);
+ __set_pte(ptep, new_pte);
gstage_tlb_flush(gstage, level, addr);
}
@@ -255,7 +256,8 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
if (ptep_level > out_map->level) {
kvm_riscv_gstage_split_huge(gstage, pcache, gpa,
out_map->level, true);
- } else if (ALIGN_DOWN(PFN_PHYS(pte_pfn(ptep_get(ptep))), page_size) == hpa) {
+ } else if (ALIGN_DOWN(PFN_PHYS(pte_pfn(__ptep_get(ptep))),
+ page_size) == hpa) {
kvm_riscv_gstage_update_pte_prot(gstage, ptep_level, gpa, ptep, prot);
return 0;
}
@@ -301,16 +303,16 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
while(current_level > target_level) {
ptep = (pte_t *)&next_ptep[gstage_pte_index(gstage, addr, current_level)];
- if (!pte_val(ptep_get(ptep)))
+ if (!pte_val(__ptep_get(ptep)))
break;
if (!gstage_pte_leaf(ptep)) {
- next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ next_ptep = (pte_t *)gstage_pte_page_vaddr(__ptep_get(ptep));
current_level--;
continue;
}
- huge_pte = pte_val(ptep_get(ptep));
+ huge_pte = pte_val(__ptep_get(ptep));
ret = gstage_level_to_page_size(gstage, current_level - 1, &child_page_size);
if (ret)
@@ -322,11 +324,12 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
for (i = 0; i < PTRS_PER_PTE; i++) {
child_pte = make_child_pte(huge_pte, i, child_page_size);
- set_pte((pte_t *)&next_ptep[i], __pte(child_pte));
+ __set_pte((pte_t *)&next_ptep[i], __pte(child_pte));
}
- set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
- __pgprot(_PAGE_TABLE)));
+ __set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(next_ptep)),
+ __pgprot(_PAGE_TABLE)));
if (flush)
gstage_tlb_flush(gstage, current_level, addr);
@@ -351,18 +354,18 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
WARN_ON(addr & (page_size - 1));
- if (!pte_val(ptep_get(ptep)))
+ if (!pte_val(__ptep_get(ptep)))
return;
if (ptep_level && !gstage_pte_leaf(ptep)) {
- next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+ next_ptep = (pte_t *)gstage_pte_page_vaddr(__ptep_get(ptep));
next_ptep_level = ptep_level - 1;
ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size);
if (ret)
return;
if (op == GSTAGE_OP_CLEAR)
- set_pte(ptep, __pte(0));
+ __set_pte(ptep, __pte(0));
for (i = 0; i < PTRS_PER_PTE; i++)
kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
&next_ptep[i], next_ptep_level, op);
@@ -371,9 +374,10 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
} else {
old_pte = *ptep;
if (op == GSTAGE_OP_CLEAR)
- set_pte(ptep, __pte(0));
+ __set_pte(ptep, __pte(0));
else if (op == GSTAGE_OP_WP)
- set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
+ __set_pte(ptep,
+ __pte(pte_val(__ptep_get(ptep)) & ~_PAGE_WRITE));
if (pte_val(*ptep) != pte_val(old_pte))
gstage_tlb_flush(gstage, ptep_level, addr);
}
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 2d3def024270c..f338ef08a6d13 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -262,7 +262,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
&ptep, &ptep_level))
return false;
- return ptep_test_and_clear_young(NULL, 0, ptep);
+ return __ptep_test_and_clear_young(NULL, 0, ptep);
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -282,7 +282,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
&ptep, &ptep_level))
return false;
- return pte_young(ptep_get(ptep));
+ return pte_young(__ptep_get(ptep));
}
static bool fault_supports_gstage_huge_mapping(struct kvm_memory_slot *memslot,
--
2.39.5
* [PATCH 3/7] riscv: mm: add Svnapot-aware contiguous PTE wrappers
From: Yunhui Cui @ 2026-04-21 9:24 UTC
To: akpm, alex, andrew+kernel, andreyknvl, anup, aou, apopple, ardb,
atish.patra, baolin.wang, cuiyunhui, david, debug,
djordje.todorovic, dvyukov, elver, glider, ilias.apalodimas,
junhui.liu, kasan-dev, kees, kevin.brodsky, kvm-riscv, kvm,
linux-efi, linux-kernel, linux-riscv, liu.xuemei1, ljs, namcao,
osalvador, palmer, pjw, rmclure, rostedt, rppt, ryabinin.a.a,
surenb, vincenzo.frascino, vishal.moola, wangruikang,
zhangchunyan
Add Svnapot-aware wrappers around the public PTE helpers so core MM
callers can operate on contiguous mappings without learning the NAPOT
encoding details. Introduce contpte.c to handle folding, unfolding and
accessed/dirty state aggregation for contiguous PTE blocks.
Keep the raw __* helpers unchanged so NAPOT-aware callers can continue
to access the underlying PTE encoding directly, and centralize the
public Svnapot-aware wrappers under a single CONFIG_RISCV_ISA_SVNAPOT
block with simple alias fallbacks for the non-Svnapot case.
Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
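Folding is only legal when a naturally aligned block of 16 entries
(NAPOT_CONT64KB_ORDER) already maps an aligned, contiguous PFN range with
identical permissions; only the accessed/dirty bits may differ. A
condensed sketch of the eligibility check in __napotpte_try_fold() below,
where start points at the first entry of the aligned block:

nr = napot_pte_num(NAPOT_CONT64KB_ORDER);       /* 16 entries */
pfn = ALIGN_DOWN(pte_pfn(pte), nr);             /* block-aligned PFN base */
prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); /* attributes minus A/D */
expected = pfn_pte(pfn, prot);

for (i = 0; i < nr; i++) {
        cur = READ_ONCE(start[i]);
        /* Compare everything except the accessed/dirty bits. */
        if (pte_val(pte_mkold(pte_mkclean(cur))) != pte_val(expected))
                return;                         /* not foldable */
        pte_val(expected) += 1UL << _PAGE_PFN_SHIFT;
}
/* All 16 match: rewrite the block with one napot-encoded entry. */
napotpte_convert(mm, addr, ptep,
                 pte_mknapot(pfn_pte(pfn, prot), NAPOT_CONT64KB_ORDER));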
arch/riscv/include/asm/pgtable.h | 288 +++++++++++++++++--
arch/riscv/mm/Makefile | 1 +
arch/riscv/mm/contpte.c | 479 +++++++++++++++++++++++++++++++
arch/riscv/mm/pgtable.c | 39 ++-
4 files changed, 769 insertions(+), 38 deletions(-)
create mode 100644 arch/riscv/mm/contpte.c
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 4de1f40fa77ea..722483d4df37f 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -11,6 +11,10 @@
#include <asm/pgtable-bits.h>
+#ifndef __ASSEMBLER__
+#include <asm/cmpxchg.h>
+#endif
+
#ifndef CONFIG_MMU
#ifdef CONFIG_RELOCATABLE
#define KERNEL_LINK_ADDR UL(0)
@@ -301,6 +305,12 @@ static inline unsigned long pte_napot(pte_t pte)
return 0;
}
+static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
+{
+ (void)order;
+ return pte;
+}
+
#endif /* CONFIG_RISCV_ISA_SVNAPOT */
/* Yields the page frame number (PFN) of a page table entry */
@@ -339,6 +349,11 @@ static inline int pte_present(pte_t pte)
return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE));
}
+static inline bool pte_present_napot(pte_t pte)
+{
+ return pte_present(pte) && pte_napot(pte);
+}
+
#define pte_accessible pte_accessible
static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
{
@@ -392,6 +407,23 @@ static inline int pte_special(pte_t pte)
return pte_val(pte) & _PAGE_SPECIAL;
}
+static inline pte_t pte_mknonnapot(pte_t pte, unsigned long addr)
+{
+ unsigned long pfn;
+ unsigned long offset;
+ pgprot_t prot;
+
+ if (!pte_present_napot(pte))
+ return pte;
+
+ offset = (addr & (napot_cont_size(napot_cont_order(pte)) - 1)) >>
+ PAGE_SHIFT;
+ pfn = pte_pfn(pte) + offset;
+ prot = __pgprot((pte_val(pte) & ~_PAGE_PFN_MASK) & ~_PAGE_NAPOT);
+
+ return pfn_pte(pfn, prot);
+}
+
/* static inline pte_t pte_rdprotect(pte_t pte) */
static inline pte_t pte_wrprotect(pte_t pte)
@@ -642,24 +674,12 @@ static inline void __set_ptes(struct mm_struct *mm, unsigned long addr,
#define __set_ptes __set_ptes
-static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval, unsigned int nr)
-{
- __set_ptes(mm, addr, ptep, pteval, nr);
-}
-
static inline void __pte_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
__set_pte_at(mm, ptep, __pte(0));
}
-static inline void pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- __pte_clear(mm, addr, ptep);
-}
-
#define __ptep_get __ptep_get
static inline pte_t __ptep_get(pte_t *ptep)
{
@@ -672,6 +692,47 @@ static inline pte_t __ptep_get_lockless(pte_t *ptep)
return __ptep_get(ptep);
}
+static inline void __clear_young_dirty_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, cydp_t flags)
+{
+ pte_t old_pte;
+
+ do {
+ old_pte = pte;
+
+ if (flags & CYDP_CLEAR_YOUNG)
+ pte = pte_mkold(pte);
+ if (flags & CYDP_CLEAR_DIRTY)
+ pte = pte_mkclean(pte);
+
+ pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
+ pte_val(old_pte),
+ pte_val(pte));
+ } while (pte_val(pte) != pte_val(old_pte));
+}
+
+static inline void __clear_young_dirty_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, cydp_t flags)
+{
+ pte_t pte;
+
+ for (;;) {
+ pte = __ptep_get(ptep);
+
+ if (flags == (CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY))
+ __set_pte(ptep, pte_mkclean(pte_mkold(pte)));
+ else
+ __clear_young_dirty_pte(vma, addr, ptep, pte, flags);
+
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* defined in mm/pgtable.c */
extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep, pte_t entry, int dirty);
@@ -703,12 +764,6 @@ __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep)
#define __ptep_get_and_clear __ptep_get_and_clear
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
-{
- return __ptep_get_and_clear(mm, address, ptep);
-}
-
static inline void
__ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
@@ -725,13 +780,6 @@ __ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
#define __ptep_set_wrprotect __ptep_set_wrprotect
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
-{
- __ptep_set_wrprotect(mm, address, ptep);
-}
-
static inline pte_t __ptep_clear_flush(struct vm_area_struct *vma,
unsigned long address,
pte_t *ptep)
@@ -744,9 +792,8 @@ static inline pte_t __ptep_clear_flush(struct vm_area_struct *vma,
return pte;
}
-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-static inline bool ptep_clear_flush_young(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep)
+static inline bool __ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
{
/*
* This comment is borrowed from x86, but applies equally to RISC-V:
@@ -763,9 +810,192 @@ static inline bool ptep_clear_flush_young(struct vm_area_struct *vma,
* shouldn't really matter because there's no real memory
* pressure for swapout to react to. ]
*/
- return ptep_test_and_clear_young(vma, address, ptep);
+ return __ptep_test_and_clear_young(vma, address, ptep);
+}
+
+#define __ptep_clear_flush_young __ptep_clear_flush_young
+
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+
+/*
+ * The Svnapot helpers transparently manage napot-encoded PTEs for the public
+ * core-MM-facing API below. The napot bit is a private implementation detail
+ * of those public helpers. Callers that need direct access to the underlying
+ * PTE encoding must use the low-level __* helpers instead.
+ */
+void __napotpte_try_fold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+void __napotpte_try_unfold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+pte_t napotpte_ptep_get(pte_t *ptep, pte_t orig_pte);
+pte_t napotpte_ptep_get_lockless(pte_t *ptep);
+void napotpte_set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr);
+void napotpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, cydp_t flags);
+bool napotpte_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
+bool napotpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
+bool napotpte_ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
+
+static __always_inline bool riscv_pte_present_napot(pte_t pte)
+{
+ return riscv_has_extension_unlikely(RISCV_ISA_EXT_SVNAPOT) &&
+ pte_present_napot(pte);
+}
+
+static __always_inline void
+napotpte_try_fold(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte)
+{
+ const unsigned long contmask = napot_pte_num(NAPOT_CONT64KB_ORDER) - 1;
+ bool valign = ((addr >> PAGE_SHIFT) & contmask) == contmask;
+
+ if (unlikely(valign)) {
+ bool palign = (pte_pfn(pte) & contmask) == contmask;
+
+ if (unlikely(palign && pte_present(pte) && !pte_napot(pte) &&
+ !pte_special(pte)))
+ __napotpte_try_fold(mm, addr, ptep, pte);
+ }
+}
+
+static __always_inline void
+napotpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ pte_t pte)
+{
+ if (unlikely(pte_present_napot(pte)))
+ __napotpte_try_unfold(mm, addr, ptep, pte);
+}
+
+#define set_ptes set_ptes
+static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval, unsigned int nr)
+{
+ pteval = pte_mknonnapot(pteval, addr);
+
+ if (likely(nr == 1)) {
+ napotpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __set_ptes(mm, addr, ptep, pteval, 1);
+ napotpte_try_fold(mm, addr, ptep, pteval);
+ return;
+ }
+
+ napotpte_set_ptes(mm, addr, ptep, pteval, nr);
+}
+
+static inline void pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ napotpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __pte_clear(mm, addr, ptep);
+}
+
+#define ptep_get ptep_get
+static inline pte_t ptep_get(pte_t *ptep)
+{
+ pte_t pte = __ptep_get(ptep);
+
+ if (likely(!pte_present_napot(pte)))
+ return pte;
+
+ return napotpte_ptep_get(ptep, pte);
+}
+
+#define ptep_get_lockless ptep_get_lockless
+static inline pte_t ptep_get_lockless(pte_t *ptep)
+{
+ pte_t pte = __ptep_get_lockless(ptep);
+
+ if (likely(!pte_present_napot(pte)))
+ return pte;
+
+ return napotpte_ptep_get_lockless(ptep);
+}
+
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
+{
+ napotpte_try_unfold(mm, address, ptep, __ptep_get(ptep));
+
+ return __ptep_get_and_clear(mm, address, ptep);
+}
+
+#define clear_young_dirty_ptes clear_young_dirty_ptes
+static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, cydp_t flags)
+{
+ napotpte_clear_young_dirty_ptes(vma, addr, ptep, nr, flags);
+}
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
+{
+ __ptep_set_wrprotect(mm, address, ptep);
}
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+static inline bool ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (likely(!riscv_pte_present_napot(orig_pte)))
+ return __ptep_clear_flush_young(vma, address, ptep);
+
+ return napotpte_ptep_clear_flush_young(vma, address, ptep);
+}
+
+#else /* CONFIG_RISCV_ISA_SVNAPOT */
+
+static __always_inline bool riscv_pte_present_napot(pte_t pte)
+{
+ return false;
+}
+
+static inline bool napotpte_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *ptep, pte_t entry,
+ int dirty)
+{
+ return false;
+}
+
+static inline bool
+napotpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *ptep)
+{
+ return false;
+}
+
+static inline bool
+napotpte_ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *ptep)
+{
+ return false;
+}
+
+#define set_ptes __set_ptes
+#define pte_clear __pte_clear
+#define ptep_get __ptep_get
+#define ptep_get_lockless __ptep_get_lockless
+#define ptep_get_and_clear __ptep_get_and_clear
+#define clear_young_dirty_ptes __clear_young_dirty_ptes
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+#define ptep_set_wrprotect __ptep_set_wrprotect
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young __ptep_clear_flush_young
+
+#endif /* CONFIG_RISCV_ISA_SVNAPOT */
+
#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
{
diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile
index b916a68d324ad..5855f923b83ec 100644
--- a/arch/riscv/mm/Makefile
+++ b/arch/riscv/mm/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_MMU) += extable.o fault.o pageattr.o pgtable.o tlbflush.o
obj-y += cacheflush.o
obj-y += context.o
obj-y += pmem.o
+obj-$(CONFIG_RISCV_ISA_SVNAPOT) += contpte.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP) += ptdump.o
diff --git a/arch/riscv/mm/contpte.c b/arch/riscv/mm/contpte.c
new file mode 100644
index 0000000000000..f73af7d9b099a
--- /dev/null
+++ b/arch/riscv/mm/contpte.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/align.h>
+#include <linux/cpufeature.h>
+#include <linux/efi.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/page_table_check.h>
+#include <linux/pgtable.h>
+
+#include <asm/tlbflush.h>
+
+static inline bool napot_hw_supported(void)
+{
+ return riscv_has_extension_unlikely(RISCV_ISA_EXT_SVNAPOT);
+}
+
+static inline bool mm_is_user(struct mm_struct *mm)
+{
+ if (unlikely(mm_is_efi(mm)))
+ return false;
+
+ return mm != &init_mm;
+}
+
+static inline unsigned int napotpte_order(void)
+{
+ return NAPOT_CONT64KB_ORDER;
+}
+
+static inline unsigned long napotpte_size(void)
+{
+ return napot_cont_size(napotpte_order());
+}
+
+static inline unsigned int napotpte_pte_num(void)
+{
+ return napot_pte_num(napotpte_order());
+}
+
+static inline unsigned long napotpte_mask(void)
+{
+ return napotpte_size() - 1;
+}
+
+static inline unsigned long napot_align_addr(unsigned long addr)
+{
+ return ALIGN_DOWN(addr, napotpte_size());
+}
+
+static inline pte_t *napot_align_ptep(pte_t *ptep)
+{
+ return PTR_ALIGN_DOWN(ptep, napotpte_pte_num() * sizeof(*ptep));
+}
+
+static inline pte_t pte_mask_ad(pte_t pte)
+{
+ return pte_mkold(pte_mkclean(pte));
+}
+
+static inline unsigned long pte_protval_no_pfn_no_napot(pte_t pte)
+{
+ return (pte_val(pte) & ~_PAGE_PFN_MASK) & ~_PAGE_NAPOT;
+}
+
+static inline void napotpte_clear_young_dirty_pte(pte_t *ptep, cydp_t flags)
+{
+ pte_t old_pte, new_pte;
+ unsigned long old_val, new_val;
+
+ do {
+ old_pte = READ_ONCE(*ptep);
+ new_pte = old_pte;
+ if (flags & CYDP_CLEAR_YOUNG)
+ new_pte = pte_mkold(new_pte);
+ if (flags & CYDP_CLEAR_DIRTY)
+ new_pte = pte_mkclean(new_pte);
+
+ old_val = pte_val(old_pte);
+ new_val = pte_val(new_pte);
+ } while (cmpxchg_relaxed(&pte_val(*ptep), old_val, new_val) != old_val);
+}
+
+static inline pte_t napotpte_subpte(pte_t *ptep, pte_t pte)
+{
+ unsigned long pfn;
+ pgprot_t prot;
+
+ if (!pte_present_napot(pte))
+ return pte;
+
+ pfn = pte_pfn(pte) + (ptep - napot_align_ptep(ptep));
+ prot = __pgprot(pte_protval_no_pfn_no_napot(pte));
+
+ return pfn_pte(pfn, prot);
+}
+
+static inline pte_t
+__napot_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ pte_t pte;
+
+ pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0));
+ page_table_check_pte_clear(mm, addr, pte);
+
+ return pte;
+}
+
+static void napotpte_convert(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t target)
+{
+ unsigned long start_addr, end;
+ pte_t *start_ptep;
+ pte_t ptent, pte;
+ unsigned int i, nr;
+
+ start_addr = napot_align_addr(addr);
+ start_ptep = napot_align_ptep(ptep);
+ nr = napotpte_pte_num();
+ end = start_addr + napotpte_size();
+
+ for (i = 0; i < nr; i++) {
+ ptent = __napot_ptep_get_and_clear(mm, start_addr + i * PAGE_SIZE,
+ start_ptep + i);
+ if (pte_dirty(ptent))
+ target = pte_mkdirty(target);
+ if (pte_young(ptent))
+ target = pte_mkyoung(target);
+ }
+
+ flush_tlb_mm_range(mm, start_addr, end, PAGE_SIZE);
+
+ page_table_check_ptes_set(mm, start_addr, start_ptep, target, nr);
+ if (pte_napot(target)) {
+ for (i = 0; i < nr; i++)
+ __set_pte_at(mm, start_ptep + i, target);
+ return;
+ }
+
+ for (i = 0; i < nr; i++) {
+ pte = pfn_pte(pte_pfn(target) + i,
+ __pgprot(pte_protval_no_pfn_no_napot(target)));
+ if (pte_dirty(target))
+ pte = pte_mkdirty(pte);
+ if (pte_young(target))
+ pte = pte_mkyoung(pte);
+ __set_pte_at(mm, start_ptep + i, pte);
+ }
+}
+
+static inline bool napotpte_is_consistent(pte_t pte, pte_t orig_pte)
+{
+ return pte_present_napot(pte) &&
+ pte_val(pte_mask_ad(pte)) == pte_val(pte_mask_ad(orig_pte));
+}
+
+void __napotpte_try_fold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ struct page *page;
+ struct folio *folio;
+ unsigned long folio_start, folio_end;
+ unsigned long cont_start, cont_end;
+ unsigned long pfn;
+ pgprot_t prot;
+ pte_t expected, cur;
+ pte_t *start;
+ unsigned int i, nr;
+
+ if (!napot_hw_supported() || !mm_is_user(mm))
+ return;
+
+ if (!pte_present(pte) || pte_napot(pte) || pte_special(pte))
+ return;
+
+ page = pte_page(pte);
+ folio = page_folio(page);
+ folio_start = addr - (page - &folio->page) * PAGE_SIZE;
+ folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE;
+ cont_start = napot_align_addr(addr);
+ cont_end = cont_start + napotpte_size();
+ if (folio_start > cont_start || folio_end < cont_end)
+ return;
+
+ nr = napotpte_pte_num();
+ start = napot_align_ptep(ptep);
+
+ pfn = ALIGN_DOWN(pte_pfn(pte), nr);
+ prot = pte_pgprot(pte_mask_ad(pte));
+ expected = pfn_pte(pfn, prot);
+
+ for (i = 0; i < nr; i++) {
+ cur = READ_ONCE(start[i]);
+ if (pte_val(pte_mask_ad(cur)) != pte_val(expected))
+ return;
+ pte_val(expected) += 1UL << _PAGE_PFN_SHIFT;
+ }
+
+ expected = pte_mknapot(pfn_pte(pfn, prot), napotpte_order());
+ napotpte_convert(mm, addr, ptep, expected);
+}
+EXPORT_SYMBOL(__napotpte_try_fold);
+
+void __napotpte_try_unfold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ pte_t target;
+ pgprot_t prot;
+
+ if (!napot_hw_supported() || !mm_is_user(mm))
+ return;
+
+ prot = __pgprot(pte_protval_no_pfn_no_napot(pte));
+ target = pfn_pte(pte_pfn(pte), prot);
+
+ napotpte_convert(mm, addr, ptep, target);
+}
+EXPORT_SYMBOL(__napotpte_try_unfold);
+
+pte_t napotpte_ptep_get(pte_t *ptep, pte_t orig_pte)
+{
+ pte_t pte, cur;
+ pte_t *start;
+ unsigned int i, nr;
+
+ if (!napot_hw_supported() || !pte_present_napot(orig_pte))
+ return orig_pte;
+
+ pte = orig_pte;
+ start = napot_align_ptep(ptep);
+ nr = napotpte_pte_num();
+
+ for (i = 0; i < nr; i++) {
+ cur = READ_ONCE(start[i]);
+ if (!napotpte_is_consistent(cur, orig_pte))
+ return napotpte_subpte(ptep, orig_pte);
+ if (pte_dirty(cur))
+ pte = pte_mkdirty(pte);
+ if (pte_young(cur))
+ pte = pte_mkyoung(pte);
+ }
+
+ return napotpte_subpte(ptep, pte);
+}
+EXPORT_SYMBOL(napotpte_ptep_get);
+
+pte_t napotpte_ptep_get_lockless(pte_t *orig_ptep)
+{
+ pte_t orig_pte, pte;
+ pte_t *ptep;
+ unsigned int i, nr;
+
+ if (!napot_hw_supported())
+ return READ_ONCE(*orig_ptep);
+
+ nr = napotpte_pte_num();
+
+retry:
+ orig_pte = READ_ONCE(*orig_ptep);
+ if (!pte_present_napot(orig_pte))
+ return orig_pte;
+
+ ptep = napot_align_ptep(orig_ptep);
+
+ for (i = 0; i < nr; i++, ptep++) {
+ pte = READ_ONCE(*ptep);
+
+ if (!napotpte_is_consistent(pte, orig_pte))
+ goto retry;
+
+ if (pte_dirty(pte)) {
+ orig_pte = pte_mkdirty(orig_pte);
+ for (; i < nr; i++, ptep++) {
+ pte = READ_ONCE(*ptep);
+
+ if (!napotpte_is_consistent(pte, orig_pte))
+ goto retry;
+
+ if (pte_young(pte)) {
+ orig_pte = pte_mkyoung(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
+
+ if (pte_young(pte)) {
+ orig_pte = pte_mkyoung(orig_pte);
+ i++;
+ ptep++;
+ for (; i < nr; i++, ptep++) {
+ pte = READ_ONCE(*ptep);
+
+ if (!napotpte_is_consistent(pte, orig_pte))
+ goto retry;
+
+ if (pte_dirty(pte)) {
+ orig_pte = pte_mkdirty(orig_pte);
+ break;
+ }
+ }
+ break;
+ }
+ }
+
+ return napotpte_subpte(orig_ptep, orig_pte);
+}
+EXPORT_SYMBOL(napotpte_ptep_get_lockless);
+
+void napotpte_set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr)
+{
+ unsigned long next, end;
+ unsigned long pfn, size, boundary;
+ pgprot_t prot;
+ unsigned int chunk, i;
+ pte_t cur;
+
+ if (!napot_hw_supported() || !mm_is_user(mm)) {
+ __set_ptes(mm, addr, ptep, pte, nr);
+ return;
+ }
+
+ size = napotpte_size();
+ end = addr + ((unsigned long)nr << PAGE_SHIFT);
+ pfn = pte_pfn(pte);
+ prot = __pgprot(pte_protval_no_pfn_no_napot(pte));
+
+ do {
+ boundary = (addr + size) & ~napotpte_mask();
+ next = (boundary - 1 < end - 1) ? boundary : end;
+ chunk = (next - addr) >> PAGE_SHIFT;
+
+ cur = pfn_pte(pfn, prot);
+ if (((addr | next | (pfn << PAGE_SHIFT)) & napotpte_mask()) == 0) {
+ cur = pte_mknapot(cur, napotpte_order());
+ page_table_check_ptes_set(mm, addr, ptep, cur, chunk);
+ for (i = 0; i < chunk; i++)
+ __set_pte_at(mm, ptep + i, cur);
+ } else {
+ __set_ptes(mm, addr, ptep, cur, chunk);
+ }
+
+ addr = next;
+ ptep += chunk;
+ pfn += chunk;
+ } while (addr != end);
+}
+EXPORT_SYMBOL(napotpte_set_ptes);
+
+void napotpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, cydp_t flags)
+{
+ struct mm_struct *mm;
+ unsigned long start, end;
+ unsigned int total;
+
+ mm = vma->vm_mm;
+ if (!napot_hw_supported() || !mm_is_user(mm)) {
+ for (;;) {
+ if (flags == CYDP_CLEAR_YOUNG)
+ __ptep_test_and_clear_young(vma, addr, ptep);
+ else
+ napotpte_clear_young_dirty_pte(ptep, flags);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+ return;
+ }
+
+ start = addr;
+ end = start + nr * PAGE_SIZE;
+
+ if (pte_present_napot(READ_ONCE(*(ptep + nr - 1))))
+ end = ALIGN(end, napotpte_size());
+
+ if (pte_present_napot(READ_ONCE(*ptep))) {
+ start = napot_align_addr(start);
+ ptep = napot_align_ptep(ptep);
+ }
+
+ total = (end - start) >> PAGE_SHIFT;
+ for (; total; total--, ptep++, start += PAGE_SIZE)
+ napotpte_clear_young_dirty_pte(ptep, flags);
+}
+EXPORT_SYMBOL(napotpte_clear_young_dirty_ptes);
+
+bool napotpte_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ pte_t orig_pte, raw_pte, napot_pte;
+ pte_t *start;
+ pgprot_t prot;
+ unsigned long start_addr;
+ unsigned int i, nr;
+ bool changed;
+
+ raw_pte = READ_ONCE(*ptep);
+ if (!napot_hw_supported() || !pte_present_napot(raw_pte))
+ return false;
+
+ orig_pte = ptep_get(ptep);
+ if (pte_val(orig_pte) == pte_val(entry))
+ return false;
+
+ if (pte_write(orig_pte) != pte_write(entry)) {
+ __napotpte_try_unfold(vma->vm_mm, address, ptep, raw_pte);
+ entry = pte_mknonnapot(entry, address);
+
+ return ptep_set_access_flags(vma, address, ptep, entry, dirty);
+ }
+
+ prot = pte_pgprot(entry);
+ napot_pte = pfn_pte(pte_pfn(raw_pte), prot);
+ napot_pte = pte_mknapot(napot_pte, napotpte_order());
+
+ start = napot_align_ptep(ptep);
+ start_addr = napot_align_addr(address);
+ nr = napotpte_pte_num();
+ changed = false;
+
+ page_table_check_ptes_set(vma->vm_mm, start_addr, start, napot_pte, nr);
+ for (i = 0; i < nr; i++) {
+ if (!pte_same(READ_ONCE(start[i]), napot_pte)) {
+ __set_pte_at(vma->vm_mm, start + i, napot_pte);
+ changed = true;
+ }
+ }
+
+ if (changed)
+ flush_tlb_range(vma, start_addr, start_addr + napotpte_size());
+
+ return changed;
+}
+EXPORT_SYMBOL(napotpte_ptep_set_access_flags);
+
+bool napotpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t *start;
+ unsigned int i, nr;
+ bool young;
+
+ if (!napot_hw_supported() || !pte_present_napot(READ_ONCE(*ptep)))
+ return false;
+
+ start = napot_align_ptep(ptep);
+ nr = napotpte_pte_num();
+ young = false;
+
+ for (i = 0; i < nr; i++)
+ young |= test_and_clear_bit(_PAGE_ACCESSED_OFFSET,
+ &pte_val(start[i]));
+
+ return young;
+}
+EXPORT_SYMBOL(napotpte_ptep_test_and_clear_young);
+
+bool napotpte_ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ unsigned long start_addr;
+ bool young;
+
+ young = napotpte_ptep_test_and_clear_young(vma, address, ptep);
+ if (!young)
+ return false;
+
+ start_addr = napot_align_addr(address);
+ flush_tlb_range(vma, start_addr, start_addr + napotpte_size());
+
+ return true;
+}
+EXPORT_SYMBOL(napotpte_ptep_clear_flush_young);
diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index 9131a78fe15c4..85ff49286f91c 100644
--- a/arch/riscv/mm/pgtable.c
+++ b/arch/riscv/mm/pgtable.c
@@ -9,6 +9,14 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
+ pte_t raw_pte;
+
+ entry = pte_mknonnapot(entry, address);
+ raw_pte = READ_ONCE(*ptep);
+ if (riscv_pte_present_napot(raw_pte))
+ return napotpte_ptep_set_access_flags(vma, address, ptep, entry,
+ dirty);
+
return __ptep_set_access_flags(vma, address, ptep, entry, dirty);
}
@@ -16,19 +24,26 @@ int __ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
- if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SVVPTC)) {
- if (!pte_same(ptep_get(ptep), entry)) {
- __set_pte_at(vma->vm_mm, ptep, entry);
- /* Here only not svadu is impacted */
- flush_tlb_page(vma, address);
- return true;
- }
+ pte_t raw_pte;
+ bool changed;
+
+ entry = pte_mknonnapot(entry, address);
+ raw_pte = READ_ONCE(*ptep);
+ if (riscv_pte_present_napot(raw_pte))
+ return false;
+ changed = !pte_same(raw_pte, entry);
+ if (!changed)
return false;
+
+ __set_pte_at(vma->vm_mm, ptep, entry);
+
+ if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SVVPTC)) {
+ /* Here only not svadu is impacted */
+ flush_tlb_page(vma, address);
+ return true;
}
- if (!pte_same(ptep_get(ptep), entry))
- __set_pte_at(vma->vm_mm, ptep, entry);
/*
* update_mmu_cache will unconditionally execute, handling both
* the case that the PTE changed and the spurious fault case.
@@ -39,6 +54,12 @@ int __ptep_set_access_flags(struct vm_area_struct *vma,
bool ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
+ pte_t raw_pte;
+
+ raw_pte = READ_ONCE(*ptep);
+ if (riscv_pte_present_napot(raw_pte))
+ return napotpte_ptep_test_and_clear_young(vma, address, ptep);
+
return __ptep_test_and_clear_young(vma, address, ptep);
}
EXPORT_SYMBOL_GPL(ptep_test_and_clear_young);
--
2.39.5
* [PATCH 4/7] riscv: hugetlb: switch NAPOT mappings to raw PTE helpers
From: Yunhui Cui @ 2026-04-21 9:24 UTC
To: akpm, alex, andrew+kernel, andreyknvl, anup, aou, apopple, ardb,
atish.patra, baolin.wang, cuiyunhui, david, debug,
djordje.todorovic, dvyukov, elver, glider, ilias.apalodimas,
junhui.liu, kasan-dev, kees, kevin.brodsky, kvm-riscv, kvm,
linux-efi, linux-kernel, linux-riscv, liu.xuemei1, ljs, namcao,
osalvador, palmer, pjw, rmclure, rostedt, rppt, ryabinin.a.a,
surenb, vincenzo.frascino, vishal.moola, wangruikang,
zhangchunyan
Use raw PTE helpers in hugetlb code to operate directly on the
underlying PTE entries. This lets hugetlb manage NAPOT
folding/unfolding explicitly instead of going through Svnapot-aware
public wrappers.
Add explicit NAPOT unfolding in set_huge_pte_at() before replacing an
existing NAPOT mapping with non-NAPOT entries.
No functional change intended.
Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
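The key ordering in set_huge_pte_at(): when a non-NAPOT entry replaces a
slot that currently holds a folded mapping, the whole block must be
unfolded first, otherwise the remaining entries of the block would keep a
stale NAPOT encoding. The per-entry sequence added below is:

pteval = pte_mknonnapot(pte, addr);     /* strip any napot encoding */
orig_pte = __ptep_get(ptep);

if (pte_present_napot(orig_pte))
        __napotpte_try_unfold(mm, addr, ptep, orig_pte);

__set_ptes(mm, addr, ptep, pteval, 1);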
arch/riscv/mm/hugetlbpage.c | 55 +++++++++++++++++++++++--------------
1 file changed, 35 insertions(+), 20 deletions(-)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index a6d217112cf46..65a89b4fdad8b 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -7,7 +7,7 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
unsigned long pte_num;
int i;
- pte_t orig_pte = ptep_get(ptep);
+ pte_t orig_pte = __ptep_get(ptep);
if (!pte_present(orig_pte) || !pte_napot(orig_pte))
return orig_pte;
@@ -15,7 +15,7 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
pte_num = napot_pte_num(napot_cont_order(orig_pte));
for (i = 0; i < pte_num; i++, ptep++) {
- pte_t pte = ptep_get(ptep);
+ pte_t pte = __ptep_get(ptep);
if (pte_dirty(pte))
orig_pte = pte_mkdirty(orig_pte);
@@ -74,7 +74,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
out:
if (pte) {
- pte_t pteval = ptep_get_lockless(pte);
+ pte_t pteval = __ptep_get_lockless(pte);
WARN_ON_ONCE(pte_present(pteval) && !pte_huge(pteval));
}
@@ -153,12 +153,12 @@ static pte_t get_clear_contig(struct mm_struct *mm,
pte_t pte, tmp_pte;
bool present;
- pte = ptep_get_and_clear(mm, addr, ptep);
+ pte = __ptep_get_and_clear(mm, addr, ptep);
present = pte_present(pte);
while (--ncontig) {
ptep++;
addr += PAGE_SIZE;
- tmp_pte = ptep_get_and_clear(mm, addr, ptep);
+ tmp_pte = __ptep_get_and_clear(mm, addr, ptep);
if (present) {
if (pte_dirty(tmp_pte))
pte = pte_mkdirty(pte);
@@ -210,7 +210,7 @@ static void clear_flush(struct mm_struct *mm,
unsigned long i, saddr = addr;
for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- ptep_get_and_clear(mm, addr, ptep);
+ __ptep_get_and_clear(mm, addr, ptep);
flush_tlb_range(&vma, saddr, addr);
}
@@ -250,25 +250,40 @@ void set_huge_pte_at(struct mm_struct *mm,
unsigned long sz)
{
size_t pgsize;
+ pte_t orig_pte;
+ pte_t pteval;
int i, pte_num;
pte_num = num_contig_ptes_from_size(sz, &pgsize);
if (!pte_present(pte)) {
- for (i = 0; i < pte_num; i++, ptep++, addr += pgsize)
- set_ptes(mm, addr, ptep, pte, 1);
+ for (i = 0; i < pte_num; i++, ptep++, addr += pgsize) {
+ pteval = pte_mknonnapot(pte, addr);
+ orig_pte = __ptep_get(ptep);
+
+ if (pte_present_napot(orig_pte))
+ __napotpte_try_unfold(mm, addr, ptep, orig_pte);
+
+ __set_ptes(mm, addr, ptep, pteval, 1);
+ }
return;
}
if (!pte_napot(pte)) {
- set_ptes(mm, addr, ptep, pte, 1);
+ pteval = pte_mknonnapot(pte, addr);
+ orig_pte = __ptep_get(ptep);
+
+ if (pte_present_napot(orig_pte))
+ __napotpte_try_unfold(mm, addr, ptep, orig_pte);
+
+ __set_ptes(mm, addr, ptep, pteval, 1);
return;
}
clear_flush(mm, addr, ptep, pgsize, pte_num);
for (i = 0; i < pte_num; i++, ptep++, addr += pgsize)
- set_pte_at(mm, addr, ptep, pte);
+ __set_ptes(mm, addr, ptep, pte, 1);
}
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
@@ -283,7 +298,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
int i, pte_num;
if (!pte_napot(pte))
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+ return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);
order = napot_cont_order(pte);
pte_num = napot_pte_num(order);
@@ -307,11 +322,11 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
pte_t *ptep, unsigned long sz)
{
size_t pgsize;
- pte_t orig_pte = ptep_get(ptep);
+ pte_t orig_pte = __ptep_get(ptep);
int pte_num;
if (!pte_napot(orig_pte))
- return ptep_get_and_clear(mm, addr, ptep);
+ return __ptep_get_and_clear(mm, addr, ptep);
pte_num = num_contig_ptes_from_size(sz, &pgsize);
@@ -322,13 +337,13 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr,
pte_t *ptep)
{
- pte_t pte = ptep_get(ptep);
+ pte_t pte = __ptep_get(ptep);
unsigned long order;
pte_t orig_pte;
int i, pte_num;
if (!pte_napot(pte)) {
- ptep_set_wrprotect(mm, addr, ptep);
+ __ptep_set_wrprotect(mm, addr, ptep);
return;
}
@@ -347,11 +362,11 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep)
{
- pte_t pte = ptep_get(ptep);
+ pte_t pte = __ptep_get(ptep);
int pte_num;
if (!pte_napot(pte))
- return ptep_clear_flush(vma, addr, ptep);
+ return __ptep_clear_flush(vma, addr, ptep);
pte_num = napot_pte_num(napot_cont_order(pte));
@@ -364,18 +379,18 @@ void huge_pte_clear(struct mm_struct *mm,
unsigned long sz)
{
size_t pgsize;
- pte_t pte = ptep_get(ptep);
+ pte_t pte = __ptep_get(ptep);
int i, pte_num;
if (!pte_napot(pte)) {
- pte_clear(mm, addr, ptep);
+ __pte_clear(mm, addr, ptep);
return;
}
pte_num = num_contig_ptes_from_size(sz, &pgsize);
for (i = 0; i < pte_num; i++, addr += pgsize, ptep++)
- pte_clear(mm, addr, ptep);
+ __pte_clear(mm, addr, ptep);
}
static bool is_napot_size(unsigned long size)
--
2.39.5
* [PATCH 5/7] riscv: add contiguous PTE range clearing helpers
From: Yunhui Cui @ 2026-04-21 9:24 UTC
To: akpm, alex, andrew+kernel, andreyknvl, anup, aou, apopple, ardb,
atish.patra, baolin.wang, cuiyunhui, david, debug,
djordje.todorovic, dvyukov, elver, glider, ilias.apalodimas,
junhui.liu, kasan-dev, kees, kevin.brodsky, kvm-riscv, kvm,
linux-efi, linux-kernel, linux-riscv, liu.xuemei1, ljs, namcao,
osalvador, palmer, pjw, rmclure, rostedt, rppt, ryabinin.a.a,
surenb, vincenzo.frascino, vishal.moola, wangruikang,
zhangchunyan
Add Svnapot-aware implementations of clear_full_ptes() and
get_and_clear_full_ptes() so full PTE batches can be cleared without
losing the required unfold semantics for NAPOT mappings.
Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
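get_and_clear_full_ptes() must return a single entry that summarizes the
whole cleared range, because core MM uses it to propagate accessed/dirty
state to the folio when zapping. The non-NAPOT fallback below shows the
contract:

pte = __ptep_get_and_clear(mm, addr, ptep);
while (--nr) {
        ptep++;
        addr += PAGE_SIZE;
        tmp_pte = __ptep_get_and_clear(mm, addr, ptep);
        if (pte_dirty(tmp_pte))
                pte = pte_mkdirty(pte);         /* accumulate dirty */
        if (pte_young(tmp_pte))
                pte = pte_mkyoung(pte);         /* accumulate young */
}
return pte;     /* carries the union of the range's A/D bits */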
arch/riscv/include/asm/pgtable.h | 75 ++++++++++++++++++++++++-
arch/riscv/mm/contpte.c | 96 ++++++++++++++++++++++++++++++++
2 files changed, 170 insertions(+), 1 deletion(-)
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 722483d4df37f..3e6516b5a4587 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -657,7 +657,6 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
}
#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT
-
static inline void __set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval, unsigned int nr)
{
@@ -764,6 +763,47 @@ __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep)
#define __ptep_get_and_clear __ptep_get_and_clear
+static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ (void)full;
+
+ for (;;) {
+ __ptep_get_and_clear(mm, addr, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+
+#define __clear_full_ptes __clear_full_ptes
+
+static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep,
+ unsigned int nr,
+ int full)
+{
+ pte_t pte, tmp_pte;
+
+ (void)full;
+
+ pte = __ptep_get_and_clear(mm, addr, ptep);
+ while (--nr) {
+ ptep++;
+ addr += PAGE_SIZE;
+ tmp_pte = __ptep_get_and_clear(mm, addr, ptep);
+ if (pte_dirty(tmp_pte))
+ pte = pte_mkdirty(pte);
+ if (pte_young(tmp_pte))
+ pte = pte_mkyoung(pte);
+ }
+
+ return pte;
+}
+
+#define __get_and_clear_full_ptes __get_and_clear_full_ptes
static inline void
__ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
@@ -831,6 +871,11 @@ pte_t napotpte_ptep_get(pte_t *ptep, pte_t orig_pte);
pte_t napotpte_ptep_get_lockless(pte_t *ptep);
void napotpte_set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr);
+void napotpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full);
+pte_t napotpte_get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full);
void napotpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
unsigned int nr, cydp_t flags);
@@ -933,6 +978,32 @@ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
napotpte_clear_young_dirty_ptes(vma, addr, ptep, nr, flags);
}
+#define clear_full_ptes clear_full_ptes
+static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ if (likely(nr == 1)) {
+ napotpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __clear_full_ptes(mm, addr, ptep, nr, full);
+ return;
+ }
+
+ napotpte_clear_full_ptes(mm, addr, ptep, nr, full);
+}
+
+#define get_and_clear_full_ptes get_and_clear_full_ptes
+static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full)
+{
+ if (likely(nr == 1)) {
+ napotpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ return __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
+ }
+
+ return napotpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full);
+}
+
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long address, pte_t *ptep)
@@ -989,6 +1060,8 @@ napotpte_ptep_clear_flush_young(struct vm_area_struct *vma,
#define ptep_get_lockless __ptep_get_lockless
#define ptep_get_and_clear __ptep_get_and_clear
#define clear_young_dirty_ptes __clear_young_dirty_ptes
+#define clear_full_ptes __clear_full_ptes
+#define get_and_clear_full_ptes __get_and_clear_full_ptes
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define ptep_set_wrprotect __ptep_set_wrprotect
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
diff --git a/arch/riscv/mm/contpte.c b/arch/riscv/mm/contpte.c
index f73af7d9b099a..77c2a4dbd3dda 100644
--- a/arch/riscv/mm/contpte.c
+++ b/arch/riscv/mm/contpte.c
@@ -107,6 +107,38 @@ __napot_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep
return pte;
}
+static void __napot_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ for (;;) {
+ __napot_ptep_get_and_clear(mm, addr, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+
+static pte_t __napot_get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr)
+{
+ pte_t pte, tmp_pte;
+
+ pte = __napot_ptep_get_and_clear(mm, addr, ptep);
+ while (--nr) {
+ ptep++;
+ addr += PAGE_SIZE;
+ tmp_pte = __napot_ptep_get_and_clear(mm, addr, ptep);
+ if (pte_dirty(tmp_pte))
+ pte = pte_mkdirty(pte);
+ if (pte_young(tmp_pte))
+ pte = pte_mkyoung(pte);
+ }
+
+ return pte;
+}
+
static void napotpte_convert(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t target)
{
@@ -202,6 +234,33 @@ void __napotpte_try_fold(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(__napotpte_try_fold);
+static void napotpte_try_unfold_range(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr)
+{
+ unsigned long next;
+ pte_t pte;
+ unsigned int chunk;
+
+ while (nr) {
+ pte = READ_ONCE(*ptep);
+ if (pte_present_napot(pte)) {
+ __napotpte_try_unfold(mm, addr, ptep, pte);
+ next = napot_align_addr(addr) + napotpte_size();
+ chunk = (next - addr) >> PAGE_SHIFT;
+ } else {
+ chunk = 1;
+ }
+
+ if (chunk > nr)
+ chunk = nr;
+
+ ptep += chunk;
+ addr += chunk * PAGE_SIZE;
+ nr -= chunk;
+ }
+}
+
void __napotpte_try_unfold(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
@@ -349,6 +408,43 @@ void napotpte_set_ptes(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(napotpte_set_ptes);
+void napotpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ (void)full;
+
+ if (!napot_hw_supported() || !mm_is_user(mm)) {
+ __napot_clear_full_ptes(mm, addr, ptep, nr);
+ return;
+ }
+
+ /*
+ * Unlike arm64 contpte, a Svnapot PTE block stores identical
+ * napot-encoded entries across the whole block rather than per-page
+ * PFNs. Batch zap paths must therefore unfold the whole covered range
+ * so the core MM later sees ordinary per-page PTEs for rmap/rss/tlb
+ * batching.
+ */
+ napotpte_try_unfold_range(mm, addr, ptep, nr);
+ __napot_clear_full_ptes(mm, addr, ptep, nr);
+}
+EXPORT_SYMBOL(napotpte_clear_full_ptes);
+
+pte_t napotpte_get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full)
+{
+ (void)full;
+
+ if (!napot_hw_supported() || !mm_is_user(mm))
+ return __napot_get_and_clear_full_ptes(mm, addr, ptep, nr);
+
+ napotpte_try_unfold_range(mm, addr, ptep, nr);
+
+ return __napot_get_and_clear_full_ptes(mm, addr, ptep, nr);
+}
+EXPORT_SYMBOL(napotpte_get_and_clear_full_ptes);
+
void napotpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
unsigned int nr, cydp_t flags)
--
2.39.5
* [PATCH 6/7] riscv: batch write-protect contiguous PTE ranges
2026-04-21 9:24 [PATCH v1 0/7] riscv: add Svnapot-based contiguous PTE support Yunhui Cui
` (4 preceding siblings ...)
2026-04-21 9:24 ` [PATCH 5/7] riscv: add contiguous PTE range clearing helpers Yunhui Cui
@ 2026-04-21 9:24 ` Yunhui Cui
2026-04-21 9:24 ` [PATCH 7/7] riscv: add Svnapot-aware pte_batch_hint support Yunhui Cui
2026-04-21 11:28 ` [PATCH v1 0/7] riscv: add Svnapot-based contiguous PTE support David Hildenbrand (Arm)
7 siblings, 0 replies; 9+ messages in thread
From: Yunhui Cui @ 2026-04-21 9:24 UTC (permalink / raw)
To: akpm, alex, andrew+kernel, andreyknvl, anup, aou, apopple, ardb,
atish.patra, baolin.wang, cuiyunhui, david, debug,
djordje.todorovic, dvyukov, elver, glider, ilias.apalodimas,
junhui.liu, kasan-dev, kees, kevin.brodsky, kvm-riscv, kvm,
linux-efi, linux-kernel, linux-riscv, liu.xuemei1, ljs, namcao,
osalvador, palmer, pjw, rmclure, rostedt, rppt, ryabinin.a.a,
surenb, vincenzo.frascino, vishal.moola, wangruikang,
zhangchunyan
Hook wrprotect_ptes() into the Svnapot contpte helpers so that write
protection preserves fully covered NAPOT blocks and unfolds only the
partially covered blocks at the edges of the range.
Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
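For reference, a minimal sketch (not part of this patch) of the batch
write-protect pattern this enables, e.g. on the fork() path; the 64KiB
block size in the comment is an assumption for illustration:

/* Sketch only: write-protect 'nr' PTEs of the source mm in one call. */
static void fork_wrprotect_sketch(struct mm_struct *src_mm,
				  unsigned long addr, pte_t *ptep,
				  unsigned int nr)
{
	/*
	 * Assuming a 64KiB NAPOT block (16 base pages): if the batch
	 * fully covers the block, it stays folded and all entries lose
	 * _PAGE_WRITE in place; if the batch starts or ends inside the
	 * block, only that edge block is unfolded first.
	 */
	wrprotect_ptes(src_mm, addr, ptep, nr);
}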
arch/riscv/include/asm/pgtable.h | 38 +++++++++++++++++++++++++++--
arch/riscv/mm/contpte.c | 42 ++++++++++++++++++++++++++++++++
2 files changed, 78 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 3e6516b5a4587..db82253efb218 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -813,13 +813,30 @@ __ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
* shadow stack memory is XWR = 010 and thus clearing _PAGE_WRITE will lead to
* encoding 000b which is wrong encoding with V = 1. This should lead to page fault
* but we dont want this wrong configuration to be set in page tables.
+ * Keep the entry readable when clearing write permissions so we don't create
+ * an invalid present encoding.
*/
atomic_long_set((atomic_long_t *)ptep,
- ((pte_val(read_pte) & ~(unsigned long)_PAGE_WRITE) | _PAGE_READ));
+ (pte_val(read_pte) & ~(unsigned long)_PAGE_WRITE) |
+ _PAGE_READ);
}
#define __ptep_set_wrprotect __ptep_set_wrprotect
+static inline void __wrprotect_ptes(struct mm_struct *mm,
+ unsigned long address,
+ pte_t *ptep, unsigned int nr)
+{
+ for (;;) {
+ __ptep_set_wrprotect(mm, address, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ address += PAGE_SIZE;
+ }
+}
+
+#define __wrprotect_ptes __wrprotect_ptes
static inline pte_t __ptep_clear_flush(struct vm_area_struct *vma,
unsigned long address,
pte_t *ptep)
@@ -879,6 +896,8 @@ pte_t napotpte_get_and_clear_full_ptes(struct mm_struct *mm,
void napotpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
unsigned int nr, cydp_t flags);
+void napotpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr);
bool napotpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty);
@@ -1004,11 +1023,25 @@ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
return napotpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full);
}
+#define wrprotect_ptes wrprotect_ptes
+static inline void wrprotect_ptes(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep,
+ unsigned int nr)
+{
+ if (likely(nr == 1)) {
+ napotpte_try_unfold(mm, address, ptep, __ptep_get(ptep));
+ __ptep_set_wrprotect(mm, address, ptep);
+ return;
+ }
+
+ napotpte_wrprotect_ptes(mm, address, ptep, nr);
+}
+
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long address, pte_t *ptep)
{
- __ptep_set_wrprotect(mm, address, ptep);
+ wrprotect_ptes(mm, address, ptep, 1);
}
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
@@ -1062,6 +1095,7 @@ napotpte_ptep_clear_flush_young(struct vm_area_struct *vma,
#define clear_young_dirty_ptes __clear_young_dirty_ptes
#define clear_full_ptes __clear_full_ptes
#define get_and_clear_full_ptes __get_and_clear_full_ptes
+#define wrprotect_ptes __wrprotect_ptes
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define ptep_set_wrprotect __ptep_set_wrprotect
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
diff --git a/arch/riscv/mm/contpte.c b/arch/riscv/mm/contpte.c
index 77c2a4dbd3dda..077ffa49e89d9 100644
--- a/arch/riscv/mm/contpte.c
+++ b/arch/riscv/mm/contpte.c
@@ -261,6 +261,30 @@ static void napotpte_try_unfold_range(struct mm_struct *mm,
}
}
+static void napotpte_try_unfold_partial(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr)
+{
+ pte_t pte;
+
+ if (ptep != napot_align_ptep(ptep) || nr < napotpte_pte_num()) {
+ pte = READ_ONCE(*ptep);
+ if (pte_present_napot(pte))
+ __napotpte_try_unfold(mm, addr, ptep, pte);
+ }
+
+ if (ptep + nr != napot_align_ptep(ptep + nr)) {
+ unsigned long last_addr;
+ pte_t *last_ptep;
+
+ last_addr = addr + PAGE_SIZE * (nr - 1);
+ last_ptep = ptep + nr - 1;
+ pte = READ_ONCE(*last_ptep);
+ if (pte_present_napot(pte))
+ __napotpte_try_unfold(mm, last_addr, last_ptep, pte);
+ }
+}
+
void __napotpte_try_unfold(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
@@ -485,6 +509,24 @@ void napotpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
}
EXPORT_SYMBOL(napotpte_clear_young_dirty_ptes);
+void napotpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ unsigned int i;
+
+ if (!napot_hw_supported() || !mm_is_user(mm)) {
+ for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE)
+ __ptep_set_wrprotect(mm, addr, ptep);
+ return;
+ }
+
+ napotpte_try_unfold_partial(mm, addr, ptep, nr);
+
+ for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE)
+ __ptep_set_wrprotect(mm, addr, ptep);
+}
+EXPORT_SYMBOL(napotpte_wrprotect_ptes);
+
bool napotpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
--
2.39.5
* [PATCH 7/7] riscv: add Svnapot-aware pte_batch_hint support
2026-04-21 9:24 [PATCH v1 0/7] riscv: add Svnapot-based contiguous PTE support Yunhui Cui
` (5 preceding siblings ...)
2026-04-21 9:24 ` [PATCH 6/7] riscv: batch write-protect contiguous PTE ranges Yunhui Cui
@ 2026-04-21 9:24 ` Yunhui Cui
2026-04-21 11:28 ` [PATCH v1 0/7] riscv: add Svnapot-based contiguous PTE support David Hildenbrand (Arm)
7 siblings, 0 replies; 9+ messages in thread
From: Yunhui Cui @ 2026-04-21 9:24 UTC (permalink / raw)
To: akpm, alex, andrew+kernel, andreyknvl, anup, aou, apopple, ardb,
atish.patra, baolin.wang, cuiyunhui, david, debug,
djordje.todorovic, dvyukov, elver, glider, ilias.apalodimas,
junhui.liu, kasan-dev, kees, kevin.brodsky, kvm-riscv, kvm,
linux-efi, linux-kernel, linux-riscv, liu.xuemei1, ljs, namcao,
osalvador, palmer, pjw, rmclure, rostedt, rppt, ryabinin.a.a,
surenb, vincenzo.frascino, vishal.moola, wangruikang,
zhangchunyan
Provide a Svnapot-specific pte_batch_hint() implementation so callers can
batch over a contiguous NAPOT range without re-reading each PTE entry.
Keep the public wrapper in pgtable.h, and let the CONFIG-disabled case
fall back to the existing single-entry implementation.
Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
---
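For reference, a minimal sketch (not part of this patch) of the scan
pattern pte_batch_hint() accelerates; the per-page processing step is
elided:

/* Sketch only: walk a PTE range, skipping ahead by the hinted batch. */
static void scan_ptes_sketch(pte_t *ptep, unsigned long addr,
			     unsigned long end)
{
	while (addr < end) {
		pte_t pte = ptep_get(ptep);
		unsigned int nr = pte_batch_hint(ptep, pte);

		/* ... process 'nr' pages mapped by identical entries ... */

		ptep += nr;
		addr += nr * PAGE_SIZE;
	}
}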
arch/riscv/include/asm/pgtable.h | 19 +++++++++++++++++-
arch/riscv/mm/contpte.c | 33 ++++++++++++++++++++++++++++++++
2 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index db82253efb218..264af77392c6e 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -872,6 +872,13 @@ static inline bool __ptep_clear_flush_young(struct vm_area_struct *vma,
#define __ptep_clear_flush_young __ptep_clear_flush_young
+static inline unsigned int __pte_batch_hint(pte_t *ptep, pte_t pte)
+{
+ return 1;
+}
+
+#define __pte_batch_hint __pte_batch_hint
+
#ifdef CONFIG_RISCV_ISA_SVNAPOT
/*
@@ -886,6 +893,7 @@ void __napotpte_try_unfold(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
pte_t napotpte_ptep_get(pte_t *ptep, pte_t orig_pte);
pte_t napotpte_ptep_get_lockless(pte_t *ptep);
+unsigned int napotpte_pte_batch_hint(pte_t *ptep);
void napotpte_set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr);
void napotpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
@@ -1056,6 +1064,15 @@ static inline bool ptep_clear_flush_young(struct vm_area_struct *vma,
return napotpte_ptep_clear_flush_young(vma, address, ptep);
}
+#define pte_batch_hint pte_batch_hint
+static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
+{
+ if (!pte_present(pte))
+ return 1;
+
+ return napotpte_pte_batch_hint(ptep);
+}
+
#else /* CONFIG_RISCV_ISA_SVNAPOT */
static __always_inline bool riscv_pte_present_napot(pte_t pte)
@@ -1100,9 +1117,9 @@ napotpte_ptep_clear_flush_young(struct vm_area_struct *vma,
#define ptep_set_wrprotect __ptep_set_wrprotect
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
#define ptep_clear_flush_young __ptep_clear_flush_young
+#define pte_batch_hint __pte_batch_hint
#endif /* CONFIG_RISCV_ISA_SVNAPOT */
-
#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
{
diff --git a/arch/riscv/mm/contpte.c b/arch/riscv/mm/contpte.c
index 077ffa49e89d9..134b8c401cabc 100644
--- a/arch/riscv/mm/contpte.c
+++ b/arch/riscv/mm/contpte.c
@@ -187,6 +187,12 @@ static inline bool napotpte_is_consistent(pte_t pte, pte_t orig_pte)
pte_val(pte_mask_ad(pte)) == pte_val(pte_mask_ad(orig_pte));
}
+static inline bool napotpte_is_batch_consistent(pte_t pte, pte_t orig_pte)
+{
+ return pte_present_napot(pte) &&
+ pte_val(pte_mkold(pte)) == pte_val(pte_mkold(orig_pte));
+}
+
void __napotpte_try_fold(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
@@ -391,6 +397,33 @@ pte_t napotpte_ptep_get_lockless(pte_t *orig_ptep)
}
EXPORT_SYMBOL(napotpte_ptep_get_lockless);
+unsigned int napotpte_pte_batch_hint(pte_t *ptep)
+{
+ pte_t orig_pte, pte;
+ pte_t *start;
+ unsigned int i, nr, off;
+
+ if (!napot_hw_supported())
+ return 1;
+
+ orig_pte = READ_ONCE(*ptep);
+ if (!pte_present_napot(orig_pte))
+ return 1;
+
+ start = napot_align_ptep(ptep);
+ nr = napotpte_pte_num();
+ off = ptep - start;
+
+ for (i = off; i < nr; i++) {
+ pte = READ_ONCE(start[i]);
+ if (!napotpte_is_batch_consistent(pte, orig_pte))
+ return 1;
+ }
+
+ return nr - off;
+}
+EXPORT_SYMBOL(napotpte_pte_batch_hint);
+
void napotpte_set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr)
{
--
2.39.5
* Re: [PATCH v1 0/7] riscv: add Svnapot-based contiguous PTE support
2026-04-21 9:24 [PATCH v1 0/7] riscv: add Svnapot-based contiguous PTE support Yunhui Cui
` (6 preceding siblings ...)
2026-04-21 9:24 ` [PATCH 7/7] riscv: add Svnapot-aware pte_batch_hint support Yunhui Cui
@ 2026-04-21 11:28 ` David Hildenbrand (Arm)
7 siblings, 0 replies; 9+ messages in thread
From: David Hildenbrand (Arm) @ 2026-04-21 11:28 UTC (permalink / raw)
To: Yunhui Cui, akpm, alex, andrew+kernel, andreyknvl, anup, aou,
apopple, ardb, atish.patra, baolin.wang, debug, djordje.todorovic,
dvyukov, elver, glider, ilias.apalodimas, junhui.liu, kasan-dev,
kees, kevin.brodsky, kvm-riscv, kvm, linux-efi, linux-kernel,
linux-riscv, liu.xuemei1, ljs, namcao, osalvador, palmer, pjw,
rmclure, rostedt, rppt, ryabinin.a.a, surenb, vincenzo.frascino,
vishal.moola, wangruikang, zhangchunyan
On 4/21/26 11:24, Yunhui Cui wrote:
> Hi,
>
> First of all, thanks to Ryan Roberts for the work on mTHP and
> Contiguous PTE support on arm64. That work provides a very useful
> reference for reducing page fault overhead and TLB pressure for
> large but still PTE-mapped memory ranges.
>
> This series adds Svnapot-based contiguous PTE support for RISC-V.
>
> To achieve similar benefits on RISC-V, this series introduces a
> Contiguous-PTE-like mechanism built on top of the Svnapot extension.
> The intent is to preserve the core-MM PTE semantics while allowing
> RISC-V to transparently fold eligible base-page mappings into
> Svnapot-encoded contiguous mappings when possible.
>
> The series splits the low-level raw PTE helpers from the public
> core-MM-facing PTE helpers, so that:
>
> - the __xxx helpers expose the raw hardware PTE encoding,
> - the xxx helpers provide the semantic view expected by core MM,
> - and Svnapot-aware handling is centralized in the public wrapper layer.
Just curious, is there an opportunity to share some of this code with
arm64, factoring out helpers that handle something like
CONFIG_ARCH_HAS_CONTIG_PTES?
--
Cheers,
David