* [PATCH -next v4 1/4] mm: Make lazy MMU mode context-aware
2026-06-18 14:47 [PATCH -next v4 0/4] s390/mm: Batch PTE updates in lazy MMU mode Alexander Gordeev
@ 2026-06-18 14:47 ` Alexander Gordeev
2026-06-18 14:47 ` [PATCH -next v4 2/4] s390/mm: Batch PTE updates in lazy MMU mode Alexander Gordeev
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Alexander Gordeev @ 2026-06-18 14:47 UTC (permalink / raw)
To: Gerald Schaefer, Heiko Carstens, Christian Borntraeger,
Vasily Gorbik, Claudio Imbrenda
Cc: linux-s390, linux-mm, linux-kernel, Kevin Brodsky,
David Hildenbrand
Lazy MMU mode is assumed to be context-independent, in the sense
that it does not need any additional information while operating.
However, the s390 architecture benefits from knowing the exact
page table entries being modified.
Introduce lazy_mmu_mode_enable_with_ptes(), which is provided with
the process address space and the page table being operated on.
This information is required to enable s390-specific optimizations.
The function takes the parameters that are typically passed to
PTE-level page table walkers, which implies that the span of PTEs
never crosses a page table boundary.
Architectures that do not require such information simply do not
need to define the arch_enter_lazy_mmu_mode_with_ptes() callback.
Reviewed-by: Kevin Brodsky <kevin.brodsky@arm.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
fs/proc/task_mmu.c | 2 +-
include/linux/pgtable.h | 46 +++++++++++++++++++++++++++++++++++++++++
mm/madvise.c | 8 +++----
mm/memory.c | 8 +++----
mm/mprotect.c | 2 +-
mm/mremap.c | 2 +-
mm/vmalloc.c | 6 +++---
7 files changed, 60 insertions(+), 14 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index d32408f7cd5e..750f6095147f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2842,7 +2842,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
return 0;
}
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(vma->vm_mm, start, end, start_pte);
if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
/* Fast path for performing exclusive WP */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2981e386da7b..cc85daf30739 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -271,6 +271,50 @@ static inline void lazy_mmu_mode_enable(void)
arch_enter_lazy_mmu_mode();
}
+#ifndef arch_enter_lazy_mmu_mode_with_ptes
+static inline void arch_enter_lazy_mmu_mode_with_ptes(struct mm_struct *mm,
+ unsigned long addr, unsigned long end, pte_t *ptep)
+{
+ arch_enter_lazy_mmu_mode();
+}
+#endif
+
+/**
+ * lazy_mmu_mode_enable_with_ptes() - Enable the lazy MMU mode with a speedup hint.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Start address of the range.
+ * @end: End address of the range.
+ * @ptep: Page table pointer for the first entry.
+ *
+ * Enters a new lazy MMU mode section; if the mode was not already enabled,
+ * enables it and calls arch_enter_lazy_mmu_mode_with_ptes().
+ *
+ * PTEs that fall within the specified range might observe update speedups.
+ * The PTEs must belong to the specified address space and be in the same PMD.
+ *
+ * There are no requirements on the order or range completeness of PTE
+ * updates for the specified range.
+ *
+ * Must be paired with a call to lazy_mmu_mode_disable().
+ *
+ * Has no effect if called:
+ * - While paused - see lazy_mmu_mode_pause()
+ * - In interrupt context
+ */
+static inline void lazy_mmu_mode_enable_with_ptes(struct mm_struct *mm,
+ unsigned long addr, unsigned long end, pte_t *ptep)
+{
+ struct lazy_mmu_state *state = ¤t->lazy_mmu_state;
+
+ if (in_interrupt() || state->pause_count > 0)
+ return;
+
+ VM_WARN_ON_ONCE(state->enable_count == U8_MAX);
+
+ if (state->enable_count++ == 0)
+ arch_enter_lazy_mmu_mode_with_ptes(mm, addr, end, ptep);
+}
+
/**
* lazy_mmu_mode_disable() - Disable the lazy MMU mode.
*
@@ -387,6 +431,8 @@ static inline void lazy_mmu_mode_resume(void)
}
#else
static inline void lazy_mmu_mode_enable(void) {}
+static inline void lazy_mmu_mode_enable_with_ptes(struct mm_struct *mm,
+ unsigned long addr, unsigned long end, pte_t *ptep) {}
static inline void lazy_mmu_mode_disable(void) {}
static inline void lazy_mmu_mode_pause(void) {}
static inline void lazy_mmu_mode_resume(void) {}
diff --git a/mm/madvise.c b/mm/madvise.c
index cd9bb077072c..c14bd5d1828e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -453,7 +453,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (!start_pte)
return 0;
flush_tlb_batched_pending(mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(mm, addr, end, start_pte);
for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
nr = 1;
ptent = ptep_get(pte);
@@ -508,7 +508,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (!start_pte)
break;
flush_tlb_batched_pending(mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(mm, addr, end, start_pte);
if (!err)
nr = 0;
continue;
@@ -675,7 +675,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (!start_pte)
return 0;
flush_tlb_batched_pending(mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(mm, addr, end, start_pte);
for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
nr = 1;
ptent = ptep_get(pte);
@@ -735,7 +735,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (!start_pte)
break;
flush_tlb_batched_pending(mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(mm, addr, end, pte);
if (!err)
nr = 0;
continue;
diff --git a/mm/memory.c b/mm/memory.c
index ff338c2abe92..ee1770ff4a64 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1272,7 +1272,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
orig_dst_pte = dst_pte;
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(src_mm, addr, end, src_pte);
do {
nr = 1;
@@ -1922,7 +1922,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
return addr;
flush_tlb_batched_pending(mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(mm, addr, end, start_pte);
do {
bool any_skipped = false;
@@ -2919,7 +2919,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(mm, addr, end, mapped_pte);
do {
BUG_ON(!pte_none(ptep_get(pte)));
if (!pfn_modify_allowed(pfn, prot)) {
@@ -3330,7 +3330,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
return -EINVAL;
}
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(mm, addr, end, mapped_pte);
if (fn) {
do {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 9cbf932b028c..3fc26418e837 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -337,7 +337,7 @@ static long change_pte_range(struct mmu_gather *tlb,
is_private_single_threaded = vma_is_single_threaded_private(vma);
flush_tlb_batched_pending(vma->vm_mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(vma->vm_mm, addr, end, pte);
do {
nr_ptes = 1;
oldpte = ptep_get(pte);
diff --git a/mm/mremap.c b/mm/mremap.c
index e9c8b1d05832..0dfe3de39ccc 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
flush_tlb_batched_pending(vma->vm_mm);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(mm, old_addr, old_end, old_ptep);
for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1afca3568b9b..b5ed2b05771f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -108,7 +108,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (!pte)
return -ENOMEM;
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(&init_mm, addr, end, pte);
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
@@ -371,7 +371,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(&init_mm, addr, end, pte);
do {
#ifdef CONFIG_HUGETLB_PAGE
@@ -538,7 +538,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte)
return -ENOMEM;
- lazy_mmu_mode_enable();
+ lazy_mmu_mode_enable_with_ptes(&init_mm, addr, end, pte);
do {
struct page *page = pages[*nr];
--
2.53.0
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH -next v4 2/4] s390/mm: Batch PTE updates in lazy MMU mode
2026-06-18 14:47 [PATCH -next v4 0/4] s390/mm: Batch PTE updates in lazy MMU mode Alexander Gordeev
2026-06-18 14:47 ` [PATCH -next v4 1/4] mm: Make lazy MMU mode context-aware Alexander Gordeev
@ 2026-06-18 14:47 ` Alexander Gordeev
2026-06-18 14:47 ` [PATCH -next v4 3/4] mm/kasan: Introduce helpers for lazy MMU mode sanitizer Alexander Gordeev
2026-06-18 14:47 ` [PATCH -next v4 4/4] s390/mm: Lazy " Alexander Gordeev
3 siblings, 0 replies; 5+ messages in thread
From: Alexander Gordeev @ 2026-06-18 14:47 UTC (permalink / raw)
To: Gerald Schaefer, Heiko Carstens, Christian Borntraeger,
Vasily Gorbik, Claudio Imbrenda
Cc: linux-s390, linux-mm, linux-kernel, Kevin Brodsky,
David Hildenbrand
Make use of the IPTE instruction's "Additional Entries" field to
invalidate multiple PTEs in one go while in lazy MMU mode. This
is the mode in which many memory-management system calls (like
mremap(), mprotect(), etc.) update memory attributes.
To achieve that, the set_pte() and ptep_get() primitives use a
per-CPU cache to store and retrieve PTE values and apply the
cached values to the real page table once lazy MMU mode is left.
The same is done for memory-management platform callbacks that
would otherwise cause intense per-PTE IPTE traffic, reducing the
number of IPTE instructions from up to PTRS_PER_PTE to a single
instruction in the best case. The average reduction is of course
smaller.
Since all existing page table iterators called in lazy MMU mode
handle one table at a time, the per-CPU cache does not need to be
larger than PTRS_PER_PTE entries. That also naturally aligns with
the IPTE instruction, which must not cross a page table boundary.
Before this change, the system calls did:
lazy_mmu_mode_enable_with_ptes()
...
<update PTEs> // up to PTRS_PER_PTE single-IPTEs
...
lazy_mmu_mode_disable()
With this change, the system calls do:
lazy_mmu_mode_enable_with_ptes()
...
<store new PTE values in the per-CPU cache>
...
lazy_mmu_mode_disable() // apply cache with one multi-IPTE
When applied to large memory ranges, some system calls show
significant speedups:
mprotect() ~15x
munmap() ~3x
mremap() ~28x
At the same time, fork() shows a measurable slowdown of ~1.5x.
The overall results depend on memory size and access patterns,
but the change generally does not degrade performance.
In addition to a process-wide impact, the rework affects the
whole Central Electronics Complex (CEC). Each (global) IPTE
instruction initiates a quiesce state in a CEC, so reducing
the number of IPTE calls relieves CEC-wide quiesce traffic.
In an extreme case of mprotect() continuously triggering the
quiesce state on four LPARs in parallel, measurements show
~25x fewer quiesce events.
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
arch/s390/Kconfig | 1 +
arch/s390/include/asm/lazy_mmu.h | 9 +
arch/s390/include/asm/lowcore.h | 2 +-
arch/s390/include/asm/pgtable.h | 157 +++++++++++--
arch/s390/kernel/setup.c | 2 +
arch/s390/kernel/smp.c | 7 +
arch/s390/mm/Makefile | 2 +-
arch/s390/mm/lazy_mmu.c | 382 +++++++++++++++++++++++++++++++
arch/s390/mm/pgtable.c | 8 +-
9 files changed, 546 insertions(+), 24 deletions(-)
create mode 100644 arch/s390/include/asm/lazy_mmu.h
create mode 100644 arch/s390/mm/lazy_mmu.c
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 84404e6778d5..7846332dcd0a 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -97,6 +97,7 @@ config S390
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+ select ARCH_HAS_LAZY_MMU_MODE
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
diff --git a/arch/s390/include/asm/lazy_mmu.h b/arch/s390/include/asm/lazy_mmu.h
new file mode 100644
index 000000000000..98366e9de9bc
--- /dev/null
+++ b/arch/s390/include/asm/lazy_mmu.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LAZY_MMU_H
+#define __LAZY_MMU_H
+
+void lazy_mmu_online_boot_cpu(void);
+int lazy_mmu_online_cpu(gfp_t gfp, unsigned int cpu);
+void lazy_mmu_offline_cpu(unsigned int cpu);
+
+#endif /* __LAZY_MMU_H */
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 3b3ecc647993..dba236664da9 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -163,7 +163,7 @@ struct lowcore {
__s32 preempt_count; /* 0x03a8 */
__u32 spinlock_lockval; /* 0x03ac */
__u32 spinlock_index; /* 0x03b0 */
- __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */
+ __s32 lazy_mmu_count; /* 0x03b4 */
__u64 percpu_offset; /* 0x03b8 */
__u8 percpu_register; /* 0x03c0 */
__u8 pad_0x03c1[0x0400-0x03c1]; /* 0x03c1 */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index f9a8a92fa160..2b6659d61fa5 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -39,6 +39,64 @@ enum {
extern atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX];
+bool __lazy_mmu_ptep_test_and_clear_young(unsigned long addr, pte_t *ptep, int *res);
+bool __lazy_mmu_ptep_get_and_clear(unsigned long addr, pte_t *ptep, pte_t *res);
+bool __lazy_mmu_ptep_modify_prot_start(unsigned long addr, pte_t *ptep, pte_t *res);
+bool __lazy_mmu_ptep_modify_prot_commit(unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte);
+bool __lazy_mmu_ptep_set_wrprotect(unsigned long addr, pte_t *ptep);
+bool __lazy_mmu_set_pte(pte_t *ptep, pte_t pte);
+bool __lazy_mmu_ptep_get(pte_t *ptep, pte_t *res);
+
+static __always_inline bool is_lazy_mmu_active(void)
+{
+ if (__is_defined(__DECOMPRESSOR))
+ return false;
+ if (!get_lowcore()->lazy_mmu_count)
+ return false;
+ return true;
+}
+
+static inline
+bool lazy_mmu_ptep_test_and_clear_young(unsigned long addr, pte_t *ptep, int *res)
+{
+ if (!is_lazy_mmu_active())
+ return false;
+ return __lazy_mmu_ptep_test_and_clear_young(addr, ptep, res);
+}
+
+static inline
+bool lazy_mmu_ptep_get_and_clear(unsigned long addr, pte_t *ptep, pte_t *res)
+{
+ if (!is_lazy_mmu_active())
+ return false;
+ return __lazy_mmu_ptep_get_and_clear(addr, ptep, res);
+}
+
+static inline
+bool lazy_mmu_ptep_modify_prot_start(unsigned long addr, pte_t *ptep, pte_t *res)
+{
+ if (!is_lazy_mmu_active())
+ return false;
+ return __lazy_mmu_ptep_modify_prot_start(addr, ptep, res);
+}
+
+static inline
+bool lazy_mmu_ptep_modify_prot_commit(unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ if (!is_lazy_mmu_active())
+ return false;
+ return __lazy_mmu_ptep_modify_prot_commit(addr, ptep, old_pte, pte);
+}
+
+static inline
+bool lazy_mmu_ptep_set_wrprotect(unsigned long addr, pte_t *ptep)
+{
+ if (!is_lazy_mmu_active())
+ return false;
+ return __lazy_mmu_ptep_set_wrprotect(addr, ptep);
+}
+
static inline void update_page_count(int level, long count)
{
if (IS_ENABLED(CONFIG_PROC_FS))
@@ -978,15 +1036,30 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
WRITE_ONCE(*pmdp, pmd);
}
-static inline void set_pte(pte_t *ptep, pte_t pte)
+static inline void __set_pte(pte_t *ptep, pte_t pte)
{
WRITE_ONCE(*ptep, pte);
}
+static inline void set_pte(pte_t *ptep, pte_t pte)
+{
+ if (!is_lazy_mmu_active() || !__lazy_mmu_set_pte(ptep, pte))
+ __set_pte(ptep, pte);
+}
+
+static inline pte_t __ptep_get(pte_t *ptep)
+{
+ return READ_ONCE(*ptep);
+}
+
#define ptep_get ptep_get
static inline pte_t ptep_get(pte_t *ptep)
{
- return READ_ONCE(*ptep);
+ pte_t res;
+
+ if (!is_lazy_mmu_active() || !__lazy_mmu_ptep_get(ptep, &res))
+ res = __ptep_get(ptep);
+ return res;
}
#define pmdp_get pmdp_get
@@ -1179,6 +1252,15 @@ static __always_inline void __ptep_ipte_range(unsigned long address, int nr,
} while (nr != 255);
}
+void arch_enter_lazy_mmu_mode_with_ptes(struct mm_struct *mm,
+ unsigned long addr, unsigned long end,
+ pte_t *pte);
+#define arch_enter_lazy_mmu_mode_with_ptes arch_enter_lazy_mmu_mode_with_ptes
+
+void arch_enter_lazy_mmu_mode(void);
+void arch_leave_lazy_mmu_mode(void);
+void arch_flush_lazy_mmu_mode(void);
+
/*
* This is hard to understand. ptep_get_and_clear and ptep_clear_flush
* both clear the TLB for the unmapped pte. The reason is that
@@ -1199,10 +1281,16 @@ pte_t ptep_xchg_lazy(struct mm_struct *, unsigned long, pte_t *, pte_t);
static inline bool ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
- pte_t pte = ptep_get(ptep);
+ pte_t pte;
+ int res;
- pte = ptep_xchg_direct(vma->vm_mm, addr, ptep, pte_mkold(pte));
- return pte_young(pte);
+ if (!lazy_mmu_ptep_test_and_clear_young(addr, ptep, &res)) {
+ pte = __ptep_get(ptep);
+ pte = pte_mkold(pte);
+ pte = ptep_xchg_direct(vma->vm_mm, addr, ptep, pte);
+ res = pte_young(pte);
+ }
+ return res;
}
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
@@ -1218,7 +1306,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
{
pte_t res;
- res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+ if (!lazy_mmu_ptep_get_and_clear(addr, ptep, &res))
+ res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
page_table_check_pte_clear(mm, addr, res);
/* At this point the reference through the mapping is still present */
if (mm_is_protected(mm) && pte_present(res))
@@ -1227,9 +1316,34 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
-pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
-void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
- pte_t *, pte_t, pte_t);
+pte_t ___ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
+void ___ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
+ pte_t *, pte_t, pte_t);
+
+static inline
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t res;
+
+ if (!lazy_mmu_ptep_modify_prot_start(addr, ptep, &res))
+ res = ___ptep_modify_prot_start(vma, addr, ptep);
+ return res;
+}
+
+static inline
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+ if (!lazy_mmu_ptep_modify_prot_commit(addr, ptep, old_pte, pte))
+ ___ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
+}
+
+bool ipte_range_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, pte_t *res);
+bool ipte_range_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte);
#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
@@ -1259,11 +1373,13 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
{
pte_t res;
- if (full) {
- res = ptep_get(ptep);
- set_pte(ptep, __pte(_PAGE_INVALID));
- } else {
- res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+ if (!lazy_mmu_ptep_get_and_clear(addr, ptep, &res)) {
+ if (full) {
+ res = __ptep_get(ptep);
+ __set_pte(ptep, __pte(_PAGE_INVALID));
+ } else {
+ res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
+ }
}
page_table_check_pte_clear(mm, addr, res);
/* At this point the reference through the mapping is still present */
@@ -1289,10 +1405,15 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- pte_t pte = ptep_get(ptep);
+ pte_t pte;
- if (pte_write(pte))
- ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte));
+ if (!lazy_mmu_ptep_set_wrprotect(addr, ptep)) {
+ pte = __ptep_get(ptep);
+ if (pte_write(pte)) {
+ pte = pte_wrprotect(pte);
+ ptep_xchg_lazy(mm, addr, ptep, pte);
+ }
+ }
}
/*
@@ -1325,7 +1446,7 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
* PTE does not have _PAGE_PROTECT set, to avoid unnecessary overhead.
* A local RDP can be used to do the flush.
*/
- if (cpu_has_rdp() && !(pte_val(ptep_get(ptep)) & _PAGE_PROTECT))
+ if (cpu_has_rdp() && !(pte_val(__ptep_get(ptep)) & _PAGE_PROTECT))
__ptep_rdp(address, ptep, 1);
}
#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index b60284328fe3..f5a3c9e1b6b8 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -77,6 +77,7 @@
#include <asm/maccess.h>
#include <asm/uv.h>
#include <asm/asm-offsets.h>
+#include <asm/lazy_mmu.h>
#include "entry.h"
/*
@@ -1012,5 +1013,6 @@ void __init setup_arch(char **cmdline_p)
void __init arch_cpu_finalize_init(void)
{
+ lazy_mmu_online_boot_cpu();
sclp_init();
}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 0ba7f89b8161..0a826bbaf1dd 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -59,6 +59,7 @@
#include <asm/topology.h>
#include <asm/vdso.h>
#include <asm/maccess.h>
+#include <asm/lazy_mmu.h>
#include "entry.h"
enum {
@@ -866,6 +867,11 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
rc = pcpu_alloc_lowcore(pcpu, cpu);
if (rc)
return rc;
+ rc = lazy_mmu_online_cpu(GFP_KERNEL, cpu);
+ if (rc) {
+ pcpu_free_lowcore(pcpu, cpu);
+ return rc;
+ }
/*
* Make sure global control register contents do not change
* until new CPU has initialized control registers.
@@ -921,6 +927,7 @@ void __cpu_die(unsigned int cpu)
pcpu = per_cpu_ptr(&pcpu_devices, cpu);
while (!pcpu_stopped(pcpu))
cpu_relax();
+ lazy_mmu_offline_cpu(cpu);
pcpu_free_lowcore(pcpu, cpu);
cpumask_clear_cpu(cpu, mm_cpumask(&init_mm));
cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask);
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 193899c39ca7..26e9fc11543a 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -3,7 +3,7 @@
# Makefile for the linux s390-specific parts of the memory manager.
#
-obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o
+obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o lazy_mmu.o
obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o
obj-$(CONFIG_CMM) += cmm.o
diff --git a/arch/s390/mm/lazy_mmu.c b/arch/s390/mm/lazy_mmu.c
new file mode 100644
index 000000000000..d75b93d9b0de
--- /dev/null
+++ b/arch/s390/mm/lazy_mmu.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pgtable.h>
+#include <linux/kasan.h>
+#include <linux/slab.h>
+#include <asm/facility.h>
+#include <asm/lazy_mmu.h>
+#include <kunit/visibility.h>
+
+#define PTE_POISON _PAGE_LARGE
+
+struct ipte_range {
+ struct mm_struct *mm;
+ unsigned long base_addr;
+ unsigned long base_end;
+ pte_t *base_pte;
+ pte_t *start_pte;
+ pte_t *end_pte;
+ pte_t cache[PTRS_PER_PTE];
+};
+
+static DEFINE_PER_CPU(struct ipte_range *, ipte_range);
+
+static int count_contiguous(pte_t *start, pte_t *end, bool *valid)
+{
+ unsigned long page_invalid_bit;
+ pte_t *ptep, pte;
+
+ pte = __ptep_get(start);
+ page_invalid_bit = pte_val(pte) & _PAGE_INVALID;
+
+ for (ptep = start + 1; ptep < end; ptep++) {
+ pte = __ptep_get(ptep);
+ if ((pte_val(pte) & _PAGE_INVALID) != page_invalid_bit)
+ break;
+ }
+
+ *valid = !(page_invalid_bit);
+ return ptep - start;
+}
+
+static void __invalidate_pte_range(struct mm_struct *mm, unsigned long addr,
+ int nr_ptes, pte_t *ptep)
+{
+ atomic_inc(&mm->context.flush_count);
+ if (cpu_has_tlb_lc() && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+ __ptep_ipte_range(addr, nr_ptes - 1, ptep, IPTE_LOCAL);
+ else
+ __ptep_ipte_range(addr, nr_ptes - 1, ptep, IPTE_GLOBAL);
+ atomic_dec(&mm->context.flush_count);
+}
+
+static int invalidate_pte_range(struct mm_struct *mm, unsigned long addr,
+ pte_t *start, pte_t *end)
+{
+ int nr_ptes;
+ bool valid;
+
+ nr_ptes = count_contiguous(start, end, &valid);
+ if (valid)
+ __invalidate_pte_range(mm, addr, nr_ptes, start);
+
+ return nr_ptes;
+}
+
+static void set_pte_range(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t *end, pte_t *cache)
+{
+ int i, nr_ptes;
+
+ while (ptep < end) {
+ nr_ptes = invalidate_pte_range(mm, addr, ptep, end);
+
+ for (i = 0; i < nr_ptes; i++, ptep++, cache++) {
+ __set_pte(ptep, *cache);
+ *cache = __pte(PTE_POISON);
+ }
+
+ addr += nr_ptes * PAGE_SIZE;
+ }
+}
+
+static void enter_ipte_norange(void)
+{
+ struct ipte_range __maybe_unused *range;
+
+ if (!test_facility(13))
+ return;
+
+ range = get_cpu_var(ipte_range);
+ get_lowcore()->lazy_mmu_count++;
+}
+
+static void enter_ipte_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end, pte_t *pte)
+{
+ struct ipte_range *range;
+
+ if (!test_facility(13))
+ return;
+
+ range = get_cpu_var(ipte_range);
+ get_lowcore()->lazy_mmu_count++;
+
+ range->mm = mm;
+ range->base_addr = addr;
+ range->base_end = end;
+ range->base_pte = pte;
+}
+
+static void leave_ipte_range(void)
+{
+ pte_t *ptep, *start, *start_cache, *cache;
+ unsigned long start_addr, addr;
+ struct ipte_range *range;
+ int start_idx;
+
+ if (!test_facility(13))
+ return;
+
+ lockdep_assert_preemption_disabled();
+ range = this_cpu_read(ipte_range);
+ if (!range->mm)
+ goto norange;
+ if (!range->start_pte)
+ goto done;
+
+ start = range->start_pte;
+ start_idx = range->start_pte - range->base_pte;
+ start_addr = range->base_addr + start_idx * PAGE_SIZE;
+ addr = start_addr;
+ start_cache = &range->cache[start_idx];
+ cache = start_cache;
+ for (ptep = start; ptep < range->end_pte; ptep++, cache++, addr += PAGE_SIZE) {
+ if (pte_val(*cache) == PTE_POISON) {
+ if (start) {
+ set_pte_range(range->mm, start_addr, start, ptep, start_cache);
+ start = NULL;
+ }
+ } else if (!start) {
+ start = ptep;
+ start_addr = addr;
+ start_cache = cache;
+ }
+ }
+ set_pte_range(range->mm, start_addr, start, ptep, start_cache);
+
+ range->start_pte = NULL;
+ range->end_pte = NULL;
+
+done:
+ range->mm = NULL;
+ range->base_addr = 0;
+ range->base_end = 0;
+ range->base_pte = NULL;
+
+norange:
+ get_lowcore()->lazy_mmu_count--;
+ put_cpu_var(ipte_range);
+}
+
+static void flush_lazy_mmu_mode(void)
+{
+ unsigned long addr, end;
+ struct ipte_range *range;
+ struct mm_struct *mm;
+ pte_t *pte;
+
+ if (!test_facility(13))
+ return;
+
+ range = get_cpu_var(ipte_range);
+ if (range->mm) {
+ mm = range->mm;
+ addr = range->base_addr;
+ end = range->base_end;
+ pte = range->base_pte;
+
+ leave_ipte_range();
+ enter_ipte_range(mm, addr, end, pte);
+ }
+ put_cpu_var(ipte_range);
+}
+
+void arch_enter_lazy_mmu_mode(void)
+{
+ enter_ipte_norange();
+}
+EXPORT_SYMBOL_IF_KUNIT(arch_enter_lazy_mmu_mode);
+
+void arch_enter_lazy_mmu_mode_with_ptes(struct mm_struct *mm,
+ unsigned long addr, unsigned long end,
+ pte_t *pte)
+{
+ enter_ipte_range(mm, addr, end, pte);
+}
+EXPORT_SYMBOL_IF_KUNIT(arch_enter_lazy_mmu_mode_with_ptes);
+
+void arch_leave_lazy_mmu_mode(void)
+{
+ leave_ipte_range();
+}
+EXPORT_SYMBOL_IF_KUNIT(arch_leave_lazy_mmu_mode);
+
+void arch_flush_lazy_mmu_mode(void)
+{
+ flush_lazy_mmu_mode();
+}
+EXPORT_SYMBOL_IF_KUNIT(arch_flush_lazy_mmu_mode);
+
+static void __ipte_range_set_pte(struct ipte_range *range, pte_t *ptep, pte_t pte)
+{
+ unsigned int idx = ptep - range->base_pte;
+
+ lockdep_assert_preemption_disabled();
+ range->cache[idx] = pte;
+
+ if (!range->start_pte) {
+ range->start_pte = ptep;
+ range->end_pte = ptep + 1;
+ } else if (ptep < range->start_pte) {
+ range->start_pte = ptep;
+ } else if (ptep + 1 > range->end_pte) {
+ range->end_pte = ptep + 1;
+ }
+}
+
+static pte_t __ipte_range_ptep_get(struct ipte_range *range, pte_t *ptep)
+{
+ unsigned int idx = ptep - range->base_pte;
+
+ lockdep_assert_preemption_disabled();
+ if (pte_val(range->cache[idx]) == PTE_POISON)
+ return __ptep_get(ptep);
+ return range->cache[idx];
+}
+
+static struct ipte_range *this_ipte_range(pte_t *ptep)
+{
+ struct ipte_range *range;
+ unsigned int nr_ptes;
+
+ range = this_cpu_read(ipte_range);
+ if (ptep < range->base_pte)
+ return NULL;
+ nr_ptes = (range->base_end - range->base_addr) / PAGE_SIZE;
+ if (ptep >= range->base_pte + nr_ptes)
+ return NULL;
+
+ return range;
+}
+
+bool __lazy_mmu_set_pte(pte_t *ptep, pte_t pte)
+{
+ struct ipte_range *range;
+
+ range = this_ipte_range(ptep);
+ if (!range)
+ return false;
+
+ __ipte_range_set_pte(range, ptep, pte);
+
+ return true;
+}
+
+bool __lazy_mmu_ptep_get(pte_t *ptep, pte_t *res)
+{
+ struct ipte_range *range;
+
+ range = this_ipte_range(ptep);
+ if (!range)
+ return false;
+
+ *res = __ipte_range_ptep_get(range, ptep);
+
+ return true;
+}
+
+bool __lazy_mmu_ptep_test_and_clear_young(unsigned long addr, pte_t *ptep, int *res)
+{
+ struct ipte_range *range;
+ pte_t pte, old;
+
+ range = this_ipte_range(ptep);
+ if (!range)
+ return false;
+
+ old = __ipte_range_ptep_get(range, ptep);
+ pte = pte_mkold(old);
+ __ipte_range_set_pte(range, ptep, pte);
+ *res = pte_young(old);
+
+ return true;
+}
+
+bool __lazy_mmu_ptep_get_and_clear(unsigned long addr, pte_t *ptep, pte_t *res)
+{
+ struct ipte_range *range;
+ pte_t pte, old;
+
+ range = this_ipte_range(ptep);
+ if (!range)
+ return false;
+
+ old = __ipte_range_ptep_get(range, ptep);
+ pte = __pte(_PAGE_INVALID);
+ __ipte_range_set_pte(range, ptep, pte);
+ *res = old;
+
+ return true;
+}
+
+bool __lazy_mmu_ptep_modify_prot_start(unsigned long addr, pte_t *ptep, pte_t *res)
+{
+ return __lazy_mmu_ptep_get_and_clear(addr, ptep, res);
+}
+
+bool __lazy_mmu_ptep_modify_prot_commit(unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct ipte_range *range;
+
+ range = this_ipte_range(ptep);
+ if (!range)
+ return false;
+
+ __ipte_range_set_pte(range, ptep, pte);
+
+ return true;
+}
+
+bool __lazy_mmu_ptep_set_wrprotect(unsigned long addr, pte_t *ptep)
+{
+ struct ipte_range *range;
+ pte_t pte;
+
+ range = this_ipte_range(ptep);
+ if (!range)
+ return false;
+
+ pte = __ipte_range_ptep_get(range, ptep);
+ if (pte_write(pte)) {
+ pte = pte_wrprotect(pte);
+ __ipte_range_set_pte(range, ptep, pte);
+ }
+
+ return true;
+}
+
+int lazy_mmu_online_cpu(gfp_t gfp, unsigned int cpu)
+{
+ struct ipte_range *range;
+ int i;
+
+ if (!test_facility(13))
+ return 0;
+
+ range = kzalloc_obj(*range, gfp);
+ if (!range)
+ return -ENOMEM;
+ for (i = 0; i < ARRAY_SIZE(range->cache); i++)
+ range->cache[i] = __pte(PTE_POISON);
+ per_cpu(ipte_range, cpu) = range;
+
+ return 0;
+}
+
+void lazy_mmu_offline_cpu(unsigned int cpu)
+{
+ struct ipte_range *range;
+
+ if (!test_facility(13))
+ return;
+
+ range = per_cpu(ipte_range, cpu);
+ per_cpu(ipte_range, cpu) = NULL;
+ kfree(range);
+}
+
+void __init lazy_mmu_online_boot_cpu(void)
+{
+ lazy_mmu_online_cpu(GFP_ATOMIC, 0);
+}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 4acd8b140c4b..df36523bcbbb 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -166,14 +166,14 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(ptep_xchg_lazy);
-pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep)
+pte_t ___ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep)
{
return ptep_flush_lazy(vma->vm_mm, addr, ptep, 1);
}
-void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t old_pte, pte_t pte)
+void ___ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
{
set_pte(ptep, pte);
}
--
2.53.0
^ permalink raw reply related [flat|nested] 5+ messages in thread