* [RFC PATCH 1/2] mm: make lazy MMU mode context-aware
2026-03-25 7:41 [RFC PATCH 0/2] s390/mm: Batch PTE updates in lazy MMU mode Alexander Gordeev
@ 2026-03-25 7:41 ` Alexander Gordeev
2026-03-25 9:55 ` David Hildenbrand (Arm)
2026-03-25 7:41 ` [RFC PATCH 2/2] s390/mm: Batch PTE updates in lazy MMU mode Alexander Gordeev
1 sibling, 1 reply; 6+ messages in thread
From: Alexander Gordeev @ 2026-03-25 7:41 UTC (permalink / raw)
To: Kevin Brodsky, David Hildenbrand, Andrew Morton, Gerald Schaefer,
Heiko Carstens, Christian Borntraeger, Vasily Gorbik
Cc: linux-s390, linux-mm, linux-kernel
Lazy MMU mode is assumed to be context-independent, in the sense
that it does not need any additional information while operating.
However, the s390 architecture benefits from knowing the exact
page table entries being modified.
Introduce lazy_mmu_mode_enable_pte(), which is provided with the
process address space and the page table being operated on. This
information is required to enable s390-specific optimizations.
The function takes parameters that are typically passed to page-
table level walkers, which implies that the span of PTE entries
never crosses a page table boundary.
Architectures that do not require such information simply do not
need to define the arch_enter_lazy_mmu_mode_pte() callback.
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
fs/proc/task_mmu.c | 2 +-
include/linux/pgtable.h | 42 +++++++++++++++++++++++++++++++++++++++++
mm/madvise.c | 8 ++++----
mm/memory.c | 8 ++++----
mm/mprotect.c | 2 +-
mm/mremap.c | 2 +-
mm/vmalloc.c | 6 +++---
7 files changed, 56 insertions(+), 14 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e091931d7ca1..4e3b1987874a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2752,7 +2752,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
return 0;
}
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(vma->vm_mm, start, end, start_pte);
if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
/* Fast path for performing exclusive WP */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a50df42a893f..481b45954800 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -271,6 +271,44 @@ static inline void lazy_mmu_mode_enable(void)
arch_enter_lazy_mmu_mode();
}
+#ifndef arch_enter_lazy_mmu_mode_pte
+static inline void arch_enter_lazy_mmu_mode_pte(struct mm_struct *mm,
+ unsigned long addr,
+ unsigned long end,
+ pte_t *ptep)
+{
+ arch_enter_lazy_mmu_mode();
+}
+#endif
+
+/**
+ * lazy_mmu_mode_enable_pte() - Enable the lazy MMU mode with parameters
+ *
+ * Enters a new lazy MMU mode section; if the mode was not already enabled,
+ * enables it and calls arch_enter_lazy_mmu_mode_pte().
+ *
+ * Must be paired with a call to lazy_mmu_mode_disable().
+ *
+ * Has no effect if called:
+ * - While paused - see lazy_mmu_mode_pause()
+ * - In interrupt context
+ */
+static inline void lazy_mmu_mode_enable_pte(struct mm_struct *mm,
+					    unsigned long addr,
+					    unsigned long end,
+					    pte_t *ptep)
+{
+	struct lazy_mmu_state *state = &current->lazy_mmu_state;
+
+	if (in_interrupt() || state->pause_count > 0)
+		return;	/* no-op in interrupt context or while paused */
+
+	VM_WARN_ON_ONCE(state->enable_count == U8_MAX);
+
+	if (state->enable_count++ == 0)	/* outermost section only */
+		arch_enter_lazy_mmu_mode_pte(mm, addr, end, ptep);
+}
+
/**
* lazy_mmu_mode_disable() - Disable the lazy MMU mode.
*
@@ -353,6 +391,10 @@ static inline void lazy_mmu_mode_resume(void)
}
#else
static inline void lazy_mmu_mode_enable(void) {}
+static inline void lazy_mmu_mode_enable_pte(struct mm_struct *mm,
+ unsigned long addr,
+ unsigned long end,
+ pte_t *ptep) {}
static inline void lazy_mmu_mode_disable(void) {}
static inline void lazy_mmu_mode_pause(void) {}
static inline void lazy_mmu_mode_resume(void) {}
diff --git a/mm/madvise.c b/mm/madvise.c
index dbb69400786d..02edc80f678b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -451,7 +451,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (!start_pte)
return 0;
flush_tlb_batched_pending(mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(mm, addr, end, start_pte);
for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
nr = 1;
ptent = ptep_get(pte);
@@ -506,7 +506,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (!start_pte)
break;
flush_tlb_batched_pending(mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(mm, addr, end, start_pte);
if (!err)
nr = 0;
continue;
@@ -673,7 +673,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (!start_pte)
return 0;
flush_tlb_batched_pending(mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(mm, addr, end, start_pte);
for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
nr = 1;
ptent = ptep_get(pte);
@@ -733,7 +733,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (!start_pte)
break;
flush_tlb_batched_pending(mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(mm, addr, end, pte);
if (!err)
nr = 0;
continue;
diff --git a/mm/memory.c b/mm/memory.c
index 2f815a34d924..43fa9965fb5f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1269,7 +1269,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(src_mm, addr, end, src_pte);
do {
nr = 1;
@@ -1917,7 +1917,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
return addr;
flush_tlb_batched_pending(mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(mm, addr, end, start_pte);
do {
bool any_skipped = false;
@@ -2875,7 +2875,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(mm, addr, end, mapped_pte);
do {
BUG_ON(!pte_none(ptep_get(pte)));
if (!pfn_modify_allowed(pfn, prot)) {
@@ -3235,7 +3235,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
return -EINVAL;
}
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(mm, addr, end, mapped_pte);
if (fn) {
do {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c0571445bef7..43a2a65b8caf 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -233,7 +233,7 @@ static long change_pte_range(struct mmu_gather *tlb,
is_private_single_threaded = vma_is_single_threaded_private(vma);
flush_tlb_batched_pending(vma->vm_mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(vma->vm_mm, addr, end, pte);
do {
nr_ptes = 1;
oldpte = ptep_get(pte);
diff --git a/mm/mremap.c b/mm/mremap.c
index 2be876a70cc0..ac7f649f3aad 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
flush_tlb_batched_pending(vma->vm_mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(mm, old_addr, old_end, old_ptep);
for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 61caa55a4402..5e702bcf03fd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -108,7 +108,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (!pte)
return -ENOMEM;
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(&init_mm, addr, end, pte);
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
@@ -371,7 +371,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(&init_mm, addr, end, pte);
do {
#ifdef CONFIG_HUGETLB_PAGE
@@ -538,7 +538,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte)
return -ENOMEM;
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_pte(&init_mm, addr, end, pte);
do {
struct page *page = pages[*nr];
--
2.51.0
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [RFC PATCH 2/2] s390/mm: Batch PTE updates in lazy MMU mode
2026-03-25 7:41 [RFC PATCH 0/2] s390/mm: Batch PTE updates in lazy MMU mode Alexander Gordeev
2026-03-25 7:41 ` [RFC PATCH 1/2] mm: make lazy MMU mode context-aware Alexander Gordeev
@ 2026-03-25 7:41 ` Alexander Gordeev
1 sibling, 0 replies; 6+ messages in thread
From: Alexander Gordeev @ 2026-03-25 7:41 UTC (permalink / raw)
To: Kevin Brodsky, David Hildenbrand, Andrew Morton, Gerald Schaefer,
Heiko Carstens, Christian Borntraeger, Vasily Gorbik
Cc: linux-s390, linux-mm, linux-kernel
Make use of the IPTE instruction's "Additional Entries" field to
invalidate multiple PTEs in one go while in lazy MMU mode. This
is the mode in which many memory-management system calls (like
mremap(), mprotect(), etc.) update memory attributes.
To achieve that, the set_pte() and ptep_get() primitives use a
per-CPU cache to store and retrieve PTE values and apply the
cached values to the real page table once lazy MMU mode is left.
The same is done for memory-management platform callbacks that
would otherwise cause intense per-PTE IPTE traffic, reducing the
number of IPTE instructions from up to PTRS_PER_PTE to a single
instruction in the best case. The average reduction is of course
smaller.
Since all existing page table iterators called in lazy MMU mode
handle one table at a time, the per-CPU cache does not need to be
larger than PTRS_PER_PTE entries. That also naturally aligns with
the IPTE instruction, which must not cross a page table boundary.
Before this change, the system calls did:
lazy_mmu_mode_enable_pte()
...
<update PTEs> // up to PTRS_PER_PTE single-IPTEs
...
lazy_mmu_mode_disable()
With this change, the system calls do:
lazy_mmu_mode_enable_pte()
...
<store new PTE values in the per-CPU cache>
...
lazy_mmu_mode_disable() // apply cache with one multi-IPTE
When applied to large memory ranges, some system calls show
significant speedups:
mprotect() ~15x
munmap() ~3x
mremap() ~28x
At the same time, fork() shows a measurable slowdown of ~1.5x.
The overall results depend on memory size and access patterns,
but the change generally does not degrade performance.
In addition to a process-wide impact, the rework affects the
whole Central Electronics Complex (CEC). Each (global) IPTE
instruction initiates a quiesce state in a CEC, so reducing
the number of IPTE calls relieves CEC-wide quiesce traffic.
In an extreme case of mprotect() continuously triggering the
quiesce state on four LPARs in parallel, measurements show
~25x fewer quiesce events.
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
arch/s390/Kconfig | 8 +
arch/s390/include/asm/pgtable.h | 209 +++++++++++++++--
arch/s390/mm/Makefile | 1 +
arch/s390/mm/ipte_batch.c | 396 ++++++++++++++++++++++++++++++++
arch/s390/mm/pgtable.c | 8 +-
5 files changed, 603 insertions(+), 19 deletions(-)
create mode 100644 arch/s390/mm/ipte_batch.c
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 7828fbe0fc42..5821d4d42d1d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -732,6 +732,14 @@ config MAX_PHYSMEM_BITS
Increasing the number of bits also increases the kernel image size.
By default 46 bits (64TB) are supported.
+config IPTE_BATCH
+ def_bool y
+ prompt "Enables Additional Entries for IPTE instruction"
+ select ARCH_HAS_LAZY_MMU_MODE
+ help
+	  This option enables use of the "Additional Entries" field of the IPTE
+	  instruction, which capitalizes on the lazy MMU mode infrastructure.
+
endmenu
menu "I/O subsystem"
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 67f5df20a57e..fd135e2a1ecf 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -39,6 +39,82 @@ enum {
extern atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX];
+#if !defined(CONFIG_IPTE_BATCH) || defined(__DECOMPRESSOR)
+static inline
+bool ipte_batch_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ int *res)
+{
+ return false;
+}
+
+static inline
+bool ipte_batch_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, pte_t *res)
+{
+ return false;
+}
+
+static inline
+bool ipte_batch_ptep_get_and_clear_full(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ int full, pte_t *res)
+{
+ return false;
+}
+
+static inline
+bool ipte_batch_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t *res)
+{
+ return false;
+}
+
+static inline
+bool ipte_batch_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ return false;
+}
+
+static inline
+bool ipte_batch_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ return false;
+}
+
+static inline bool ipte_batch_set_pte(pte_t *ptep, pte_t pte)
+{
+ return false;
+}
+
+static inline bool ipte_batch_ptep_get(pte_t *ptep, pte_t *res)
+{
+ return false;
+}
+#else
+bool ipte_batch_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ int *res);
+bool ipte_batch_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, pte_t *res);
+bool ipte_batch_ptep_get_and_clear_full(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ int full, pte_t *res);
+bool ipte_batch_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t *res);
+bool ipte_batch_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte);
+
+bool ipte_batch_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep);
+bool ipte_batch_set_pte(pte_t *ptep, pte_t pte);
+bool ipte_batch_ptep_get(pte_t *ptep, pte_t *res);
+#endif
+
static inline void update_page_count(int level, long count)
{
if (IS_ENABLED(CONFIG_PROC_FS))
@@ -978,11 +1054,32 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
WRITE_ONCE(*pmdp, pmd);
}
-static inline void set_pte(pte_t *ptep, pte_t pte)
+static inline void __set_pte(pte_t *ptep, pte_t pte)
{
WRITE_ONCE(*ptep, pte);
}
+static inline void set_pte(pte_t *ptep, pte_t pte)
+{
+ if (!ipte_batch_set_pte(ptep, pte))
+ __set_pte(ptep, pte);
+}
+
+static inline pte_t __ptep_get(pte_t *ptep)
+{
+ return READ_ONCE(*ptep);
+}
+
+#define ptep_get ptep_get
+static inline pte_t ptep_get(pte_t *ptep)
+{
+ pte_t res;
+
+ if (ipte_batch_ptep_get(ptep, &res))
+ return res;
+ return __ptep_get(ptep);
+}
+
static inline void pgd_clear(pgd_t *pgd)
{
if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1)
@@ -1149,6 +1246,26 @@ static __always_inline void __ptep_ipte_range(unsigned long address, int nr,
} while (nr != 255);
}
+#ifdef CONFIG_IPTE_BATCH
+void arch_enter_lazy_mmu_mode_pte(struct mm_struct *mm,
+ unsigned long addr, unsigned long end,
+ pte_t *pte);
+#define arch_enter_lazy_mmu_mode_pte arch_enter_lazy_mmu_mode_pte
+
+void arch_pause_lazy_mmu_mode(void);
+#define arch_pause_lazy_mmu_mode arch_pause_lazy_mmu_mode
+
+void arch_resume_lazy_mmu_mode(void);
+#define arch_resume_lazy_mmu_mode arch_resume_lazy_mmu_mode
+
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+}
+
+void arch_leave_lazy_mmu_mode(void);
+void arch_flush_lazy_mmu_mode(void);
+#endif
+
/*
* This is hard to understand. ptep_get_and_clear and ptep_clear_flush
* both clear the TLB for the unmapped pte. The reason is that
@@ -1166,8 +1283,8 @@ pte_t ptep_xchg_direct(struct mm_struct *, unsigned long, pte_t *, pte_t);
pte_t ptep_xchg_lazy(struct mm_struct *, unsigned long, pte_t *, pte_t);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
@@ -1175,6 +1292,16 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
return pte_young(pte);
}
+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int res;
+
+ if (ipte_batch_ptep_test_and_clear_young(vma, addr, ptep, &res))
+ return res;
+ return __ptep_test_and_clear_young(vma, addr, ptep);
+}
+
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
@@ -1183,8 +1310,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
}
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
+static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
pte_t res;
@@ -1192,14 +1319,49 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
/* At this point the reference through the mapping is still present */
if (mm_is_protected(mm) && pte_present(res))
WARN_ON_ONCE(uv_convert_from_secure_pte(res));
+ return res;
+}
+
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t res;
+
+ if (!ipte_batch_ptep_get_and_clear(mm, addr, ptep, &res))
+ res = __ptep_get_and_clear(mm, addr, ptep);
page_table_check_pte_clear(mm, addr, res);
return res;
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
-pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
-void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
- pte_t *, pte_t, pte_t);
+pte_t ___ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
+void ___ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
+ pte_t *, pte_t, pte_t);
+
+static inline
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t res;
+
+ if (ipte_batch_ptep_modify_prot_start(vma, addr, ptep, &res))
+ return res;
+ return ___ptep_modify_prot_start(vma, addr, ptep);
+}
+
+static inline
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+ if (!ipte_batch_ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte))
+ ___ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
+}
+
+bool ipte_batch_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t *res);
+bool ipte_batch_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte);
#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
@@ -1223,9 +1385,9 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
* full==1 and a simple pte_clear is enough. See tlb.h.
*/
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
-static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep, int full)
+static inline pte_t __ptep_get_and_clear_full(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, int full)
{
pte_t res;
@@ -1236,8 +1398,6 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
}
- page_table_check_pte_clear(mm, addr, res);
-
/* Nothing to do */
if (!mm_is_protected(mm) || !pte_present(res))
return res;
@@ -1258,9 +1418,21 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
return res;
}
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, int full)
+{
+ pte_t res;
+
+ if (!ipte_batch_ptep_get_and_clear_full(mm, addr, ptep, full, &res))
+ res = __ptep_get_and_clear_full(mm, addr, ptep, full);
+ page_table_check_pte_clear(mm, addr, res);
+ return res;
+}
+
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
+static inline void __ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
@@ -1268,6 +1440,13 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte));
}
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ if (!ipte_batch_ptep_set_wrprotect(mm, addr, ptep))
+ __ptep_set_wrprotect(mm, addr, ptep);
+}
+
/*
* Check if PTEs only differ in _PAGE_PROTECT HW bit, but also allow SW PTE
* bits in the comparison. Those might change e.g. because of dirty and young
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 193899c39ca7..0f6c6de447d4 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -11,5 +11,6 @@ obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_PFAULT) += pfault.o
+obj-$(CONFIG_IPTE_BATCH) += ipte_batch.o
obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o
diff --git a/arch/s390/mm/ipte_batch.c b/arch/s390/mm/ipte_batch.c
new file mode 100644
index 000000000000..49b166d499a9
--- /dev/null
+++ b/arch/s390/mm/ipte_batch.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pgtable.h>
+#include <asm/facility.h>
+#include <kunit/visibility.h>
+
+#define PTE_POISON 0
+
+struct ipte_batch {
+ struct mm_struct *mm;
+ unsigned long base_addr;
+ unsigned long base_end;
+ pte_t *base_pte;
+ pte_t *start_pte;
+ pte_t *end_pte;
+ pte_t cache[PTRS_PER_PTE];
+};
+
+static DEFINE_PER_CPU(struct ipte_batch, ipte_range);
+
+static int count_contiguous(pte_t *start, pte_t *end, bool *valid)
+{
+ pte_t *ptep;
+
+ *valid = !(pte_val(*start) & _PAGE_INVALID);
+
+ for (ptep = start + 1; ptep < end; ptep++) {
+ if (*valid) {
+ if (pte_val(*ptep) & _PAGE_INVALID)
+ break;
+ } else {
+ if (!(pte_val(*ptep) & _PAGE_INVALID))
+ break;
+ }
+ }
+
+ return ptep - start;
+}
+
+static void __invalidate_pte_range(struct mm_struct *mm, unsigned long addr,
+ int nr_ptes, pte_t *ptep)
+{
+ atomic_inc(&mm->context.flush_count);
+ if (cpu_has_tlb_lc() &&
+ cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+ __ptep_ipte_range(addr, nr_ptes - 1, ptep, IPTE_LOCAL);
+ else
+ __ptep_ipte_range(addr, nr_ptes - 1, ptep, IPTE_GLOBAL);
+ atomic_dec(&mm->context.flush_count);
+}
+
+static int invalidate_pte_range(struct mm_struct *mm, unsigned long addr,
+ pte_t *start, pte_t *end)
+{
+ int nr_ptes;
+ bool valid;
+
+ nr_ptes = count_contiguous(start, end, &valid);
+ if (valid)
+ __invalidate_pte_range(mm, addr, nr_ptes, start);
+
+ return nr_ptes;
+}
+
+static void set_pte_range(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t *end, pte_t *cache)
+{
+ int i, nr_ptes;
+
+ while (ptep < end) {
+ nr_ptes = invalidate_pte_range(mm, addr, ptep, end);
+
+ for (i = 0; i < nr_ptes; i++, ptep++, cache++) {
+ __set_pte(ptep, *cache);
+ *cache = __pte(PTE_POISON);
+ }
+
+ addr += nr_ptes * PAGE_SIZE;
+ }
+}
+
+static void enter_ipte_batch(struct mm_struct *mm,
+ unsigned long addr, unsigned long end, pte_t *pte)
+{
+ struct ipte_batch *ib;
+
+ ib = &get_cpu_var(ipte_range);
+
+ ib->mm = mm;
+ ib->base_addr = addr;
+ ib->base_end = end;
+ ib->base_pte = pte;
+}
+
+static void leave_ipte_batch(void)
+{
+	pte_t *ptep, *start, *start_cache, *cache;
+	unsigned long start_addr, addr;
+	struct ipte_batch *ib;
+	int start_idx;
+
+	ib = &get_cpu_var(ipte_range);
+	if (!ib->mm) {		/* no batch was entered on this CPU */
+		put_cpu_var(ipte_range);
+		return;
+	}
+	put_cpu_var(ipte_range);
+	/* Still non-preemptible: enter_ipte_batch() kept its get_cpu_var(). */
+	lockdep_assert_preemption_disabled();
+	if (!ib->start_pte)	/* nothing was cached during this batch */
+		goto done;
+	/* Write back each run of consecutive non-poison cache slots. */
+	start = ib->start_pte;
+	start_idx = ib->start_pte - ib->base_pte;
+	start_addr = ib->base_addr + start_idx * PAGE_SIZE;
+	addr = start_addr;
+	start_cache = &ib->cache[start_idx];
+	cache = start_cache;
+	for (ptep = start; ptep < ib->end_pte; ptep++, cache++, addr += PAGE_SIZE) {
+		if (pte_val(*cache) == PTE_POISON) {
+			if (start) {
+				set_pte_range(ib->mm, start_addr, start, ptep, start_cache);
+				start = NULL;
+			}
+		} else if (!start) {
+			start = ptep;
+			start_addr = addr;
+			start_cache = cache;
+		}
+	}
+	if (start)	/* no open run is left if the range ends in poison slots */
+		set_pte_range(ib->mm, start_addr, start, ptep, start_cache);
+	ib->start_pte = NULL;
+	ib->end_pte = NULL;
+
+done:
+	ib->mm = NULL;
+	ib->base_addr = 0;
+	ib->base_end = 0;
+	ib->base_pte = NULL;
+
+	put_cpu_var(ipte_range);	/* pairs with get_cpu_var() in enter_ipte_batch() */
+}
+
+static void flush_lazy_mmu_mode(void)
+{
+ unsigned long addr, end;
+ struct ipte_batch *ib;
+ struct mm_struct *mm;
+ pte_t *pte;
+
+ ib = &get_cpu_var(ipte_range);
+ if (ib->mm) {
+ mm = ib->mm;
+ addr = ib->base_addr;
+ end = ib->base_end;
+ pte = ib->base_pte;
+
+ leave_ipte_batch();
+ enter_ipte_batch(mm, addr, end, pte);
+ }
+ put_cpu_var(ipte_range);
+}
+
+void arch_enter_lazy_mmu_mode_pte(struct mm_struct *mm,
+ unsigned long addr, unsigned long end,
+ pte_t *pte)
+{
+ if (!test_facility(13))
+ return;
+ enter_ipte_batch(mm, addr, end, pte);
+}
+EXPORT_SYMBOL_IF_KUNIT(arch_enter_lazy_mmu_mode_pte);
+
+void arch_leave_lazy_mmu_mode(void)
+{
+ if (!test_facility(13))
+ return;
+ leave_ipte_batch();
+}
+EXPORT_SYMBOL_IF_KUNIT(arch_leave_lazy_mmu_mode);
+
+void arch_flush_lazy_mmu_mode(void)
+{
+ if (!test_facility(13))
+ return;
+ flush_lazy_mmu_mode();
+}
+EXPORT_SYMBOL_IF_KUNIT(arch_flush_lazy_mmu_mode);
+
+static void __ipte_batch_set_pte(struct ipte_batch *ib, pte_t *ptep, pte_t pte)
+{
+ unsigned int idx = ptep - ib->base_pte;
+
+ lockdep_assert_preemption_disabled();
+ ib->cache[idx] = pte;
+
+ if (!ib->start_pte) {
+ ib->start_pte = ptep;
+ ib->end_pte = ptep + 1;
+ } else if (ptep < ib->start_pte) {
+ ib->start_pte = ptep;
+ } else if (ptep + 1 > ib->end_pte) {
+ ib->end_pte = ptep + 1;
+ }
+}
+
+static pte_t __ipte_batch_ptep_get(struct ipte_batch *ib, pte_t *ptep)
+{
+ unsigned int idx = ptep - ib->base_pte;
+
+ lockdep_assert_preemption_disabled();
+ if (pte_val(ib->cache[idx]) == PTE_POISON)
+ return __ptep_get(ptep);
+ return ib->cache[idx];
+}
+
+static bool lazy_mmu_mode(struct ipte_batch *ib, struct mm_struct *mm, pte_t *ptep)
+{
+ unsigned int nr_ptes;
+
+ lockdep_assert_preemption_disabled();
+ if (!is_lazy_mmu_mode_active())
+ return false;
+ if (!mm)
+ return false;
+ if (!ib->mm)
+ return false;
+ if (ptep < ib->base_pte)
+ return false;
+ nr_ptes = (ib->base_end - ib->base_addr) / PAGE_SIZE;
+ if (ptep >= ib->base_pte + nr_ptes)
+ return false;
+ return true;
+}
+
+static struct ipte_batch *get_ipte_batch_nomm(pte_t *ptep)
+{
+ struct ipte_batch *ib;
+
+ ib = &get_cpu_var(ipte_range);
+ if (!lazy_mmu_mode(ib, ib->mm, ptep)) {
+ put_cpu_var(ipte_range);
+ return NULL;
+ }
+
+ return ib;
+}
+
+static struct ipte_batch *get_ipte_batch(struct mm_struct *mm, pte_t *ptep)
+{
+ struct ipte_batch *ib;
+
+ ib = &get_cpu_var(ipte_range);
+ if (!lazy_mmu_mode(ib, mm, ptep)) {
+ put_cpu_var(ipte_range);
+ return NULL;
+ }
+
+ return ib;
+}
+
+static void put_ipte_batch(struct ipte_batch *ib)
+{
+ put_cpu_var(ipte_range);
+}
+
+bool ipte_batch_set_pte(pte_t *ptep, pte_t pte)
+{
+ struct ipte_batch *ib;
+
+ ib = get_ipte_batch_nomm(ptep);
+ if (!ib)
+ return false;
+ __ipte_batch_set_pte(ib, ptep, pte);
+ put_ipte_batch(ib);
+
+ return true;
+}
+
+bool ipte_batch_ptep_get(pte_t *ptep, pte_t *res)
+{
+ struct ipte_batch *ib;
+
+ ib = get_ipte_batch_nomm(ptep);
+ if (!ib)
+ return false;
+ *res = __ipte_batch_ptep_get(ib, ptep);
+ put_ipte_batch(ib);
+
+ return true;
+}
+
+bool ipte_batch_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ int *res)
+{
+ struct ipte_batch *ib;
+ pte_t pte, old;
+
+ ib = get_ipte_batch(vma->vm_mm, ptep);
+ if (!ib)
+ return false;
+
+ old = __ipte_batch_ptep_get(ib, ptep);
+ pte = pte_mkold(old);
+ __ipte_batch_set_pte(ib, ptep, pte);
+
+ put_ipte_batch(ib);
+
+ *res = pte_young(old);
+
+ return true;
+}
+
+bool ipte_batch_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, pte_t *res)
+{
+ struct ipte_batch *ib;
+ pte_t pte, old;
+
+ ib = get_ipte_batch(mm, ptep);
+ if (!ib)
+ return false;
+
+ old = __ipte_batch_ptep_get(ib, ptep);
+ pte = __pte(_PAGE_INVALID);
+ __ipte_batch_set_pte(ib, ptep, pte);
+
+ put_ipte_batch(ib);
+
+ *res = old;
+
+ return true;
+}
+
+bool ipte_batch_ptep_get_and_clear_full(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ int full, pte_t *res)
+{
+ struct ipte_batch *ib;
+ pte_t pte, old;
+
+ ib = get_ipte_batch(mm, ptep);
+ if (!ib)
+ return false;
+
+ old = __ipte_batch_ptep_get(ib, ptep);
+ pte = __pte(_PAGE_INVALID);
+ __ipte_batch_set_pte(ib, ptep, pte);
+
+ put_ipte_batch(ib);
+
+ *res = old;
+
+ return true;
+}
+
+bool ipte_batch_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t *res)
+{
+ return ipte_batch_ptep_get_and_clear(vma->vm_mm, addr, ptep, res);
+}
+
+bool ipte_batch_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct ipte_batch *ib;
+
+ ib = get_ipte_batch(vma->vm_mm, ptep);
+ if (!ib)
+ return false;
+ __ipte_batch_set_pte(ib, ptep, pte);
+ put_ipte_batch(ib);
+
+ return true;
+}
+
+bool ipte_batch_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ struct ipte_batch *ib;
+ pte_t pte, old;
+
+ ib = get_ipte_batch(mm, ptep);
+ if (!ib)
+ return false;
+
+ old = __ipte_batch_ptep_get(ib, ptep);
+ pte = pte_wrprotect(old);
+ __ipte_batch_set_pte(ib, ptep, pte);
+
+ put_ipte_batch(ib);
+
+ return true;
+}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 4acd8b140c4b..df36523bcbbb 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -166,14 +166,14 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(ptep_xchg_lazy);
-pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep)
+pte_t ___ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep)
{
return ptep_flush_lazy(vma->vm_mm, addr, ptep, 1);
}
-void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t old_pte, pte_t pte)
+void ___ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
{
set_pte(ptep, pte);
}
--
2.51.0
^ permalink raw reply related [flat|nested] 6+ messages in thread