linux-mm.kvack.org archive mirror
* [PATCH 0/4] munmap() vs unmap_mapping_range()
@ 2022-07-08  7:18 Peter Zijlstra
  2022-07-08  7:18 ` [PATCH 1/4] mmu_gather: Remove per arch tlb_{start,end}_vma() Peter Zijlstra
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Peter Zijlstra @ 2022-07-08  7:18 UTC (permalink / raw)
  To: Jann Horn, Linus Torvalds, Will Deacon
  Cc: linux-kernel, linux-mm, peterz, Dave Airlie, Daniel Vetter,
	Andrew Morton, Guo Ren, David Miller

Hi,

These 4 patches implement the suggestion by Linus.

The first patch nukes all architecture implementations of tlb_{start,end}_vma()
by adding two additional MMU_GATHER config knobs.

DaveM, please clarify the sparc64 case; it seems a little odd to have
flush_cache_range() but to explicitly not call it at unmap time. It would be
nice to remove this one special case.

The second patch isn't strictly needed for this series, but since I was looking
at all this anyway, I figured the C-SKY code didn't make a whole lot of sense.
Guo, please have a look.

The third patch cleans up the #ifdeffery and provides a single
tlb_{start,end}_vma() implementation for everyone.

The fourth, and final, patch does the force TLB-flush on VM_PFNMAP thing.

I've had the first three patches exposed to the build robots and they have (so
far) not reported any build (or other) failures.



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/4] mmu_gather: Remove per arch tlb_{start,end}_vma()
  2022-07-08  7:18 [PATCH 0/4] munmap() vs unmap_mapping_range() Peter Zijlstra
@ 2022-07-08  7:18 ` Peter Zijlstra
  2022-07-08 13:25   ` Will Deacon
  2022-07-08  7:18 ` [PATCH 2/4] csky/tlb: Remove tlb_flush() define Peter Zijlstra
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2022-07-08  7:18 UTC (permalink / raw)
  To: Jann Horn, Linus Torvalds, Will Deacon
  Cc: linux-kernel, linux-mm, peterz, Dave Airlie, Daniel Vetter,
	Andrew Morton, Guo Ren, David Miller

Scattered across the archs are 3 basic forms of tlb_{start,end}_vma().
Provide two new MMU_GATHER_* Kconfig knobs to enumerate them and remove the
per-arch tlb_{start,end}_vma() implementations.

 - MMU_GATHER_NO_FLUSH_CACHE indicates the arch has flush_cache_range()
   but does *NOT* want to call it for each VMA.

 - MMU_GATHER_MERGE_VMAS indicates the arch wants to merge the
   invalidate across multiple VMAs if possible.

With these it is possible to capture the three forms:

  1) empty stubs;
     select MMU_GATHER_NO_FLUSH_CACHE and MMU_GATHER_MERGE_VMAS

  2) start: flush_cache_range(), end: empty;
     select MMU_GATHER_MERGE_VMAS

  3) start: flush_cache_range(), end: flush_tlb_range();
     default

Obviously, if the architecture does not have flush_cache_range() then
it also doesn't need to select MMU_GATHER_NO_FLUSH_CACHE.
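
As a rough sketch (simplified from the include/asm-generic/tlb.h hunk below,
and assuming the subsequent cleanup that removes the #ifndef wrappers), the
generic helpers end up looking like this, with the two knobs selecting between
the three forms:

static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
	if (tlb->fullmm)
		return;

	tlb_update_vma_flags(tlb, vma);
#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE
	/* forms 2 and 3 keep the cache flush; form 1 compiles it out */
	flush_cache_range(vma, vma->vm_start, vma->vm_end);
#endif
}

static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
	/* forms 1 and 2 select MMU_GATHER_MERGE_VMAS and skip the per-VMA flush */
	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
		return;

	/* form 3: flush and reset the gathered range at each VMA boundary */
	tlb_flush_mmu_tlbonly(tlb);
}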

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/Kconfig                     |    7 +++++++
 arch/csky/include/asm/tlb.h      |   13 -------------
 arch/loongarch/Kconfig           |    1 +
 arch/loongarch/include/asm/tlb.h |   10 ----------
 arch/powerpc/Kconfig             |    1 +
 arch/powerpc/include/asm/tlb.h   |    2 --
 arch/s390/Kconfig                |    1 +
 arch/s390/include/asm/tlb.h      |    3 ---
 arch/sparc/Kconfig               |    2 ++
 arch/sparc/include/asm/tlb_64.h  |    2 --
 arch/x86/Kconfig                 |    1 +
 arch/x86/include/asm/tlb.h       |    3 ---
 include/asm-generic/tlb.h        |   21 +++++++++++++++++++--
 13 files changed, 32 insertions(+), 35 deletions(-)

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -438,6 +438,13 @@ config MMU_GATHER_PAGE_SIZE
 
 config MMU_GATHER_NO_RANGE
 	bool
+	select MMU_GATHER_MERGE_VMAS
+
+config MMU_GATHER_NO_FLUSH_CACHE
+	bool
+
+config MMU_GATHER_MERGE_VMAS
+	bool
 
 config MMU_GATHER_NO_GATHER
 	bool
--- a/arch/csky/include/asm/tlb.h
+++ b/arch/csky/include/asm/tlb.h
@@ -4,19 +4,6 @@
 #define __ASM_CSKY_TLB_H
 
 #include <asm/cacheflush.h>
-
-#define tlb_start_vma(tlb, vma) \
-	do { \
-		if (!(tlb)->fullmm) \
-			flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \
-	}  while (0)
-
-#define tlb_end_vma(tlb, vma) \
-	do { \
-		if (!(tlb)->fullmm) \
-			flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \
-	}  while (0)
-
 #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
 
 #include <asm-generic/tlb.h>
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -112,6 +112,7 @@ config LOONGARCH
 	select TRACE_IRQFLAGS_SUPPORT
 	select USE_PERCPU_NUMA_NODE_ID
 	select ZONE_DMA32
+	select MMU_GATHER_MERGE_VMAS if MMU
 
 config 32BIT
 	bool
--- a/arch/loongarch/include/asm/tlb.h
+++ b/arch/loongarch/include/asm/tlb.h
@@ -137,16 +137,6 @@ static inline void invtlb_all(u32 op, u3
 		);
 }
 
-/*
- * LoongArch doesn't need any special per-pte or per-vma handling, except
- * we need to flush cache for area to be unmapped.
- */
-#define tlb_start_vma(tlb, vma)					\
-	do {							\
-		if (!(tlb)->fullmm)				\
-			flush_cache_range(vma, vma->vm_start, vma->vm_end); \
-	}  while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 
 static void tlb_flush(struct mmu_gather *tlb);
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -256,6 +256,7 @@ config PPC
 	select IRQ_FORCED_THREADING
 	select MMU_GATHER_PAGE_SIZE
 	select MMU_GATHER_RCU_TABLE_FREE
+	select MMU_GATHER_MERGE_VMAS
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE		if PPC64 || NOT_COHERENT_CACHE
 	select NEED_PER_CPU_EMBED_FIRST_CHUNK	if PPC64
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -19,8 +19,6 @@
 
 #include <linux/pagemap.h>
 
-#define tlb_start_vma(tlb, vma)	do { } while (0)
-#define tlb_end_vma(tlb, vma)	do { } while (0)
 #define __tlb_remove_tlb_entry	__tlb_remove_tlb_entry
 
 #define tlb_flush tlb_flush
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -204,6 +204,7 @@ config S390
 	select IOMMU_SUPPORT		if PCI
 	select MMU_GATHER_NO_GATHER
 	select MMU_GATHER_RCU_TABLE_FREE
+	select MMU_GATHER_MERGE_VMAS
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE	if PCI
 	select NEED_SG_DMA_LENGTH	if PCI
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -27,9 +27,6 @@ static inline void tlb_flush(struct mmu_
 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
 					  struct page *page, int page_size);
 
-#define tlb_start_vma(tlb, vma)			do { } while (0)
-#define tlb_end_vma(tlb, vma)			do { } while (0)
-
 #define tlb_flush tlb_flush
 #define pte_free_tlb pte_free_tlb
 #define pmd_free_tlb pmd_free_tlb
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -67,6 +67,8 @@ config SPARC64
 	select HAVE_KRETPROBES
 	select HAVE_KPROBES
 	select MMU_GATHER_RCU_TABLE_FREE if SMP
+	select MMU_GATHER_MERGE_VMAS
+	select MMU_GATHER_NO_FLUSH_CACHE
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -22,8 +22,6 @@ void smp_flush_tlb_mm(struct mm_struct *
 void __flush_tlb_pending(unsigned long, unsigned long, unsigned long *);
 void flush_tlb_pending(void);
 
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma)	do { } while (0)
 #define tlb_flush(tlb)	flush_tlb_pending()
 
 /*
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -245,6 +245,7 @@ config X86
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE	if PARAVIRT
+	select MMU_GATHER_MERGE_VMAS
 	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if UNWINDER_ORC || STACK_VALIDATION
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -2,9 +2,6 @@
 #ifndef _ASM_X86_TLB_H
 #define _ASM_X86_TLB_H
 
-#define tlb_start_vma(tlb, vma) do { } while (0)
-#define tlb_end_vma(tlb, vma) do { } while (0)
-
 #define tlb_flush tlb_flush
 static inline void tlb_flush(struct mmu_gather *tlb);
 
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -158,9 +158,24 @@
  *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
  *  and therefore doesn't naturally serialize with software page-table walkers.
  *
+ *  MMU_GATHER_NO_FLUSH_CACHE
+ *
+ *  Indicates the architecture has flush_cache_range() but it need *NOT* be called
+ *  before unmapping a VMA.
+ *
+ *  NOTE: strictly speaking we shouldn't have this knob and instead rely on
+ *	  flush_cache_range() being a NOP, except Sparc64 seems to be
+ *	  different here.
+ *
+ *  MMU_GATHER_MERGE_VMAS
+ *
+ *  Indicates the architecture wants to merge ranges over VMAs; typical when
+ *  multiple range invalidates are more expensive than a full invalidate.
+ *
  *  MMU_GATHER_NO_RANGE
  *
- *  Use this if your architecture lacks an efficient flush_tlb_range().
+ *  Use this if your architecture lacks an efficient flush_tlb_range(). This
+ *  option implies MMU_GATHER_MERGE_VMAS above.
  *
  *  MMU_GATHER_NO_GATHER
  *
@@ -493,14 +508,16 @@ static inline void tlb_start_vma(struct
 		return;
 
 	tlb_update_vma_flags(tlb, vma);
+#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE
 	flush_cache_range(vma, vma->vm_start, vma->vm_end);
+#endif
 }
 #endif
 
 #ifndef tlb_end_vma
 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
-	if (tlb->fullmm)
+	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
 		return;
 
 	/*




^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 2/4] csky/tlb: Remove tlb_flush() define
  2022-07-08  7:18 [PATCH 0/4] munmap() vs unmap_mapping_range() Peter Zijlstra
  2022-07-08  7:18 ` [PATCH 1/4] mmu_gather: Remove per arch tlb_{start,end}_vma() Peter Zijlstra
@ 2022-07-08  7:18 ` Peter Zijlstra
  2022-07-08 13:31   ` Will Deacon
  2022-07-08  7:18 ` [PATCH 3/4] mmu_gather: Let there be one tlb_{start,end}_vma() implementation Peter Zijlstra
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2022-07-08  7:18 UTC (permalink / raw)
  To: Jann Horn, Linus Torvalds, Will Deacon
  Cc: linux-kernel, linux-mm, peterz, Dave Airlie, Daniel Vetter,
	Andrew Morton, Guo Ren, David Miller

The previous patch removed the tlb_end_vma() implementation which
used flush_tlb_range(). This means:

 - csky did double invalidates: a range invalidate per VMA and a full
   invalidate at the end

 - csky actually has range invalidates and as such the generic
   tlb_flush implementation is more efficient for it.
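
For reference, the generic tlb_flush() fallback that csky now picks up looks
roughly like this (paraphrased from include/asm-generic/tlb.h; a sketch, not
the verbatim code):

static inline void tlb_flush(struct mmu_gather *tlb)
{
	if (tlb->fullmm || tlb->need_flush_all) {
		/* what the old csky define did unconditionally */
		flush_tlb_mm(tlb->mm);
	} else if (tlb->end) {
		struct vm_area_struct vma = {
			.vm_mm    = tlb->mm,
			.vm_flags = (tlb->vma_exec ? VM_EXEC    : 0) |
				    (tlb->vma_huge ? VM_HUGETLB : 0),
		};

		/* a single range invalidate covering the gathered range */
		flush_tlb_range(&vma, tlb->start, tlb->end);
	}
}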

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/csky/include/asm/tlb.h |    2 --
 1 file changed, 2 deletions(-)

--- a/arch/csky/include/asm/tlb.h
+++ b/arch/csky/include/asm/tlb.h
@@ -4,8 +4,6 @@
 #define __ASM_CSKY_TLB_H
 
 #include <asm/cacheflush.h>
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
-
 #include <asm-generic/tlb.h>
 
 #endif /* __ASM_CSKY_TLB_H */




^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 3/4] mmu_gather: Let there be one tlb_{start,end}_vma() implementation
  2022-07-08  7:18 [PATCH 0/4] munmap() vs unmap_mapping_range() Peter Zijlstra
  2022-07-08  7:18 ` [PATCH 1/4] mmu_gather: Remove per arch tlb_{start,end}_vma() Peter Zijlstra
  2022-07-08  7:18 ` [PATCH 2/4] csky/tlb: Remove tlb_flush() define Peter Zijlstra
@ 2022-07-08  7:18 ` Peter Zijlstra
  2022-07-08 13:32   ` Will Deacon
  2022-07-08  7:18 ` [PATCH 4/4] mmu_gather: Force tlb-flush VM_PFNMAP vmas Peter Zijlstra
  2022-07-21  8:37 ` [PATCH 0/4] munmap() vs unmap_mapping_range() Peter Zijlstra
  4 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2022-07-08  7:18 UTC (permalink / raw)
  To: Jann Horn, Linus Torvalds, Will Deacon
  Cc: linux-kernel, linux-mm, peterz, Dave Airlie, Daniel Vetter,
	Andrew Morton, Guo Ren, David Miller

Now that architectures are no longer allowed to override
tlb_{start,end}_vma(), re-arrange the code so that there is only one
implementation of each of these functions.

This greatly simplifies figuring out what they actually do.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/asm-generic/tlb.h |   15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -346,8 +346,8 @@ static inline void __tlb_reset_range(str
 
 #ifdef CONFIG_MMU_GATHER_NO_RANGE
 
-#if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma)
-#error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma()
+#if defined(tlb_flush)
+#error MMU_GATHER_NO_RANGE relies on default tlb_flush()
 #endif
 
 /*
@@ -367,17 +367,10 @@ static inline void tlb_flush(struct mmu_
 static inline void
 tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
 
-#define tlb_end_vma tlb_end_vma
-static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
-
 #else /* CONFIG_MMU_GATHER_NO_RANGE */
 
 #ifndef tlb_flush
 
-#if defined(tlb_start_vma) || defined(tlb_end_vma)
-#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma()
-#endif
-
 /*
  * When an architecture does not provide its own tlb_flush() implementation
  * but does have a reasonably efficient flush_vma_range() implementation
@@ -498,7 +491,6 @@ static inline unsigned long tlb_get_unma
  * case where we're doing a full MM flush.  When we're doing a munmap,
  * the vmas are adjusted to only cover the region to be torn down.
  */
-#ifndef tlb_start_vma
 static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
 	if (tlb->fullmm)
@@ -509,9 +501,7 @@ static inline void tlb_start_vma(struct
 	flush_cache_range(vma, vma->vm_start, vma->vm_end);
 #endif
 }
-#endif
 
-#ifndef tlb_end_vma
 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
 	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
@@ -525,7 +515,6 @@ static inline void tlb_end_vma(struct mm
 	 */
 	tlb_flush_mmu_tlbonly(tlb);
 }
-#endif
 
 /*
  * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end,




^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 4/4] mmu_gather: Force tlb-flush VM_PFNMAP vmas
  2022-07-08  7:18 [PATCH 0/4] munmap() vs unmap_mapping_range() Peter Zijlstra
                   ` (2 preceding siblings ...)
  2022-07-08  7:18 ` [PATCH 3/4] mmu_gather: Let there be one tlb_{start,end}_vma() implementation Peter Zijlstra
@ 2022-07-08  7:18 ` Peter Zijlstra
  2022-07-08 13:36   ` Will Deacon
  2022-07-08 14:04   ` Jann Horn
  2022-07-21  8:37 ` [PATCH 0/4] munmap() vs unmap_mapping_range() Peter Zijlstra
  4 siblings, 2 replies; 16+ messages in thread
From: Peter Zijlstra @ 2022-07-08  7:18 UTC (permalink / raw)
  To: Jann Horn, Linus Torvalds, Will Deacon
  Cc: linux-kernel, linux-mm, peterz, Dave Airlie, Daniel Vetter,
	Andrew Morton, Guo Ren, David Miller

Jann reported a race between munmap() and unmap_mapping_range(), where
unmap_mapping_range() will no-op once unmap_vmas() has unlinked the
VMA; however munmap() will not yet have invalidated the TLBs.

Therefore unmap_mapping_range() will complete while there are still
(stale) TLB entries for the specified range.

Mitigate this by force flushing TLBs for VM_PFNMAP ranges.
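
To make the window concrete, the munmap() side does roughly the following (a
sketch of unmap_region() in mm/mmap.c, not the literal code):

	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);
	unmap_vmas(&tlb, vma, start, end);	/* PTEs zapped; TLB flush only gathered */
	free_pgtables(&tlb, vma, ...);		/* unlink_file_vma() drops the VMA from
						 * the i_mmap rmap; a concurrent
						 * unmap_mapping_range() now finds
						 * nothing to do and returns */
	tlb_finish_mmu(&tlb);			/* TLB only invalidated here */

With MMU_GATHER_MERGE_VMAS the gathered flush can be deferred all the way to
tlb_finish_mmu(); forcing it in tlb_end_vma() for VM_PFNMAP ranges makes it
happen before the VMA is unlinked, which closes the window.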

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/asm-generic/tlb.h |   33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -303,6 +303,7 @@ struct mmu_gather {
 	 */
 	unsigned int		vma_exec : 1;
 	unsigned int		vma_huge : 1;
+	unsigned int		vma_pfn  : 1;
 
 	unsigned int		batch_count;
 
@@ -373,7 +374,6 @@ tlb_update_vma_flags(struct mmu_gather *
 #else /* CONFIG_MMU_GATHER_NO_RANGE */
 
 #ifndef tlb_flush
-
 /*
  * When an architecture does not provide its own tlb_flush() implementation
  * but does have a reasonably efficient flush_vma_range() implementation
@@ -393,6 +393,9 @@ static inline void tlb_flush(struct mmu_
 		flush_tlb_range(&vma, tlb->start, tlb->end);
 	}
 }
+#endif
+
+#endif /* CONFIG_MMU_GATHER_NO_RANGE */
 
 static inline void
 tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
@@ -410,17 +413,9 @@ tlb_update_vma_flags(struct mmu_gather *
 	 */
 	tlb->vma_huge = is_vm_hugetlb_page(vma);
 	tlb->vma_exec = !!(vma->vm_flags & VM_EXEC);
+	tlb->vma_pfn  = !!(vma->vm_flags & VM_PFNMAP);
 }
 
-#else
-
-static inline void
-tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
-
-#endif
-
-#endif /* CONFIG_MMU_GATHER_NO_RANGE */
-
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
 	/*
@@ -507,16 +502,22 @@ static inline void tlb_start_vma(struct
 
 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
-	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
+	if (tlb->fullmm)
 		return;
 
 	/*
-	 * Do a TLB flush and reset the range at VMA boundaries; this avoids
-	 * the ranges growing with the unused space between consecutive VMAs,
-	 * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on
-	 * this.
+	 * VM_PFNMAP is more fragile because the core mm will not track the
+	 * page mapcount -- there might not be page-frames for these PFNs after
+	 * all. Force flush TLBs for such ranges to avoid munmap() vs
+	 * unmap_mapping_range() races.
 	 */
-	tlb_flush_mmu_tlbonly(tlb);
+	if (tlb->vma_pfn || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) {
+		/*
+		 * Do a TLB flush and reset the range at VMA boundaries; this avoids
+		 * the ranges growing with the unused space between consecutive VMAs.
+		 */
+		tlb_flush_mmu_tlbonly(tlb);
+	}
 }
 
 /*




^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] mmu_gather: Remove per arch tlb_{start,end}_vma()
  2022-07-08  7:18 ` [PATCH 1/4] mmu_gather: Remove per arch tlb_{start,end}_vma() Peter Zijlstra
@ 2022-07-08 13:25   ` Will Deacon
  0 siblings, 0 replies; 16+ messages in thread
From: Will Deacon @ 2022-07-08 13:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jann Horn, Linus Torvalds, linux-kernel, linux-mm, Dave Airlie,
	Daniel Vetter, Andrew Morton, Guo Ren, David Miller

On Fri, Jul 08, 2022 at 09:18:03AM +0200, Peter Zijlstra wrote:
> Scattered across the archs are 3 basic forms of tlb_{start,end}_vma().
> Provide two new MMU_GATHER_knobs to enumerate them and remove the per
> arch tlb_{start,end}_vma() implementations.
> 
>  - MMU_GATHER_NO_FLUSH_CACHE indicates the arch has flush_cache_range()
>    but does *NOT* want to call it for each VMA.
> 
>  - MMU_GATHER_MERGE_VMAS indicates the arch wants to merge the
>    invalidate across multiple VMAs if possible.
> 
> With these it is possible to capture the three forms:
> 
>   1) empty stubs;
>      select MMU_GATHER_NO_FLUSH_CACHE and MMU_GATHER_MERGE_VMAS
> 
>   2) start: flush_cache_range(), end: empty;
>      select MMU_GATHER_MERGE_VMAS
> 
>   3) start: flush_cache_range(), end: flush_tlb_range();
>      default
> 
> Obviously, if the architecture does not have flush_cache_range() then
> it also doesn't need to select MMU_GATHER_NO_FLUSH_CACHE.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/Kconfig                     |    7 +++++++
>  arch/csky/include/asm/tlb.h      |   13 -------------
>  arch/loongarch/Kconfig           |    1 +
>  arch/loongarch/include/asm/tlb.h |   10 ----------
>  arch/powerpc/Kconfig             |    1 +
>  arch/powerpc/include/asm/tlb.h   |    2 --
>  arch/s390/Kconfig                |    1 +
>  arch/s390/include/asm/tlb.h      |    3 ---
>  arch/sparc/Kconfig               |    2 ++
>  arch/sparc/include/asm/tlb_64.h  |    2 --
>  arch/x86/Kconfig                 |    1 +
>  arch/x86/include/asm/tlb.h       |    3 ---
>  include/asm-generic/tlb.h        |   21 +++++++++++++++++++--
>  13 files changed, 32 insertions(+), 35 deletions(-)
> 
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -438,6 +438,13 @@ config MMU_GATHER_PAGE_SIZE
>  
>  config MMU_GATHER_NO_RANGE
>  	bool
> +	select MMU_GATHER_MERGE_VMAS
> +
> +config MMU_GATHER_NO_FLUSH_CACHE
> +	bool

If this is really a sparc-special and we don't necessarily want it to
proliferate, then maybe:

	default y
	depends on SPARC

would keep it confined?

But I don't mind either way and the important bits of the patch look good:

Acked-by: Will Deacon <will@kernel.org>

Thanks,

Will


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] csky/tlb: Remove tlb_flush() define
  2022-07-08  7:18 ` [PATCH 2/4] csky/tlb: Remove tlb_flush() define Peter Zijlstra
@ 2022-07-08 13:31   ` Will Deacon
  0 siblings, 0 replies; 16+ messages in thread
From: Will Deacon @ 2022-07-08 13:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jann Horn, Linus Torvalds, linux-kernel, linux-mm, Dave Airlie,
	Daniel Vetter, Andrew Morton, Guo Ren, David Miller

On Fri, Jul 08, 2022 at 09:18:04AM +0200, Peter Zijlstra wrote:
> The previous patch removed the tlb_flush_end() implementation which
> used tlb_flush_range(). This means:
> 
>  - csky did double invalidates, a range invalidate per vma and a full
>    invalidate at the end
> 
>  - csky actually has range invalidates and as such the generic
>    tlb_flush implementation is more efficient for it.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/csky/include/asm/tlb.h |    2 --
>  1 file changed, 2 deletions(-)
> 
> --- a/arch/csky/include/asm/tlb.h
> +++ b/arch/csky/include/asm/tlb.h
> @@ -4,8 +4,6 @@
>  #define __ASM_CSKY_TLB_H
>  
>  #include <asm/cacheflush.h>
> -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)

Looks right to me, and the generic code handles the fullmm case when
!CONFIG_MMU_GATHER_NO_RANGE so:

Acked-by: Will Deacon <will@kernel.org>

Will


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] mmu_gather: Let there be one tlb_{start,end}_vma() implementation
  2022-07-08  7:18 ` [PATCH 3/4] mmu_gather: Let there be one tlb_{start,end}_vma() implementation Peter Zijlstra
@ 2022-07-08 13:32   ` Will Deacon
  0 siblings, 0 replies; 16+ messages in thread
From: Will Deacon @ 2022-07-08 13:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jann Horn, Linus Torvalds, linux-kernel, linux-mm, Dave Airlie,
	Daniel Vetter, Andrew Morton, Guo Ren, David Miller

On Fri, Jul 08, 2022 at 09:18:05AM +0200, Peter Zijlstra wrote:
> Now that architectures are no longer allowed to override
> tlb_{start,end}_vma() re-arrange code so that there is only one
> implementation for each of these functions.
> 
> This much simplifies trying to figure out what they actually do.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/asm-generic/tlb.h |   15 ++-------------
>  1 file changed, 2 insertions(+), 13 deletions(-)
> 
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -346,8 +346,8 @@ static inline void __tlb_reset_range(str
>  
>  #ifdef CONFIG_MMU_GATHER_NO_RANGE
>  
> -#if defined(tlb_flush) || defined(tlb_start_vma) || defined(tlb_end_vma)
> -#error MMU_GATHER_NO_RANGE relies on default tlb_flush(), tlb_start_vma() and tlb_end_vma()
> +#if defined(tlb_flush)
> +#error MMU_GATHER_NO_RANGE relies on default tlb_flush()
>  #endif
>  
>  /*
> @@ -367,17 +367,10 @@ static inline void tlb_flush(struct mmu_
>  static inline void
>  tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
>  
> -#define tlb_end_vma tlb_end_vma
> -static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
> -
>  #else /* CONFIG_MMU_GATHER_NO_RANGE */
>  
>  #ifndef tlb_flush
>  
> -#if defined(tlb_start_vma) || defined(tlb_end_vma)
> -#error Default tlb_flush() relies on default tlb_start_vma() and tlb_end_vma()
> -#endif
> -
>  /*
>   * When an architecture does not provide its own tlb_flush() implementation
>   * but does have a reasonably efficient flush_vma_range() implementation
> @@ -498,7 +491,6 @@ static inline unsigned long tlb_get_unma
>   * case where we're doing a full MM flush.  When we're doing a munmap,
>   * the vmas are adjusted to only cover the region to be torn down.
>   */
> -#ifndef tlb_start_vma
>  static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
>  {
>  	if (tlb->fullmm)
> @@ -509,9 +501,7 @@ static inline void tlb_start_vma(struct
>  	flush_cache_range(vma, vma->vm_start, vma->vm_end);
>  #endif
>  }
> -#endif
>  
> -#ifndef tlb_end_vma
>  static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
>  {
>  	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
> @@ -525,7 +515,6 @@ static inline void tlb_end_vma(struct mm
>  	 */
>  	tlb_flush_mmu_tlbonly(tlb);
>  }
> -#endif

Much nicer:

Acked-by: Will Deacon <will@kernel.org>

Will


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] mmu_gather: Force tlb-flush VM_PFNMAP vmas
  2022-07-08  7:18 ` [PATCH 4/4] mmu_gather: Force tlb-flush VM_PFNMAP vmas Peter Zijlstra
@ 2022-07-08 13:36   ` Will Deacon
  2022-07-08 14:03     ` Peter Zijlstra
  2022-07-08 14:04   ` Jann Horn
  1 sibling, 1 reply; 16+ messages in thread
From: Will Deacon @ 2022-07-08 13:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jann Horn, Linus Torvalds, linux-kernel, linux-mm, Dave Airlie,
	Daniel Vetter, Andrew Morton, Guo Ren, David Miller

On Fri, Jul 08, 2022 at 09:18:06AM +0200, Peter Zijlstra wrote:
> Jann reported a race between munmap() and unmap_mapping_range(), where
> unmap_mapping_range() will no-op once unmap_vmas() has unlinked the
> VMA; however munmap() will not yet have invalidated the TLBs.
> 
> Therefore unmap_mapping_range() will complete while there are still
> (stale) TLB entries for the specified range.
> 
> Mitigate this by force flushing TLBs for VM_PFNMAP ranges.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/asm-generic/tlb.h |   33 +++++++++++++++++----------------
>  1 file changed, 17 insertions(+), 16 deletions(-)
> 
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -303,6 +303,7 @@ struct mmu_gather {
>  	 */
>  	unsigned int		vma_exec : 1;
>  	unsigned int		vma_huge : 1;
> +	unsigned int		vma_pfn  : 1;
>  
>  	unsigned int		batch_count;
>  
> @@ -373,7 +374,6 @@ tlb_update_vma_flags(struct mmu_gather *
>  #else /* CONFIG_MMU_GATHER_NO_RANGE */
>  
>  #ifndef tlb_flush
> -
>  /*
>   * When an architecture does not provide its own tlb_flush() implementation
>   * but does have a reasonably efficient flush_vma_range() implementation
> @@ -393,6 +393,9 @@ static inline void tlb_flush(struct mmu_
>  		flush_tlb_range(&vma, tlb->start, tlb->end);
>  	}
>  }
> +#endif
> +
> +#endif /* CONFIG_MMU_GATHER_NO_RANGE */
>  
>  static inline void
>  tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
> @@ -410,17 +413,9 @@ tlb_update_vma_flags(struct mmu_gather *
>  	 */
>  	tlb->vma_huge = is_vm_hugetlb_page(vma);
>  	tlb->vma_exec = !!(vma->vm_flags & VM_EXEC);
> +	tlb->vma_pfn  = !!(vma->vm_flags & VM_PFNMAP);
>  }
>  
> -#else
> -
> -static inline void
> -tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
> -
> -#endif
> -
> -#endif /* CONFIG_MMU_GATHER_NO_RANGE */
> -
>  static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
>  {
>  	/*
> @@ -507,16 +502,22 @@ static inline void tlb_start_vma(struct
>  
>  static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
>  {
> -	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
> +	if (tlb->fullmm)
>  		return;
>  
>  	/*
> -	 * Do a TLB flush and reset the range at VMA boundaries; this avoids
> -	 * the ranges growing with the unused space between consecutive VMAs,
> -	 * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on
> -	 * this.
> +	 * VM_PFNMAP is more fragile because the core mm will not track the
> +	 * page mapcount -- there might not be page-frames for these PFNs after
> +	 * all. Force flush TLBs for such ranges to avoid munmap() vs
> +	 * unmap_mapping_range() races.
>  	 */
> -	tlb_flush_mmu_tlbonly(tlb);
> +	if (tlb->vma_pfn || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) {
> +		/*
> +		 * Do a TLB flush and reset the range at VMA boundaries; this avoids
> +		 * the ranges growing with the unused space between consecutive VMAs.
> +		 */
> +		tlb_flush_mmu_tlbonly(tlb);
> +	}

We already have the vma here, so I'm not sure how much the new 'vma_pfn'
field really buys us over checking the 'vm_flags', but perhaps that's
cleanup for another day.

Acked-by: Will Deacon <will@kernel.org>

Will


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] mmu_gather: Force tlb-flush VM_PFNMAP vmas
  2022-07-08 13:36   ` Will Deacon
@ 2022-07-08 14:03     ` Peter Zijlstra
  0 siblings, 0 replies; 16+ messages in thread
From: Peter Zijlstra @ 2022-07-08 14:03 UTC (permalink / raw)
  To: Will Deacon
  Cc: Jann Horn, Linus Torvalds, linux-kernel, linux-mm, Dave Airlie,
	Daniel Vetter, Andrew Morton, Guo Ren, David Miller

On Fri, Jul 08, 2022 at 02:36:06PM +0100, Will Deacon wrote:
> On Fri, Jul 08, 2022 at 09:18:06AM +0200, Peter Zijlstra wrote:

> > @@ -507,16 +502,22 @@ static inline void tlb_start_vma(struct
> >  
> >  static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
> >  {
> > -	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
> > +	if (tlb->fullmm)
> >  		return;
> >  
> >  	/*
> > -	 * Do a TLB flush and reset the range at VMA boundaries; this avoids
> > -	 * the ranges growing with the unused space between consecutive VMAs,
> > -	 * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on
> > -	 * this.
> > +	 * VM_PFNMAP is more fragile because the core mm will not track the
> > +	 * page mapcount -- there might not be page-frames for these PFNs after
> > +	 * all. Force flush TLBs for such ranges to avoid munmap() vs
> > +	 * unmap_mapping_range() races.
> >  	 */
> > -	tlb_flush_mmu_tlbonly(tlb);
> > +	if (tlb->vma_pfn || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) {
> > +		/*
> > +		 * Do a TLB flush and reset the range at VMA boundaries; this avoids
> > +		 * the ranges growing with the unused space between consecutive VMAs.
> > +		 */
> > +		tlb_flush_mmu_tlbonly(tlb);
> > +	}
> 
> We already have the vma here, so I'm not sure how much the new 'vma_pfn'
> field really buys us over checking the 'vm_flags', but perhaps that's
> cleanup for another day.

Duh, that's just me being daft again. For some raisin I was convinced
(and failed to check) that we only had the vma at start.

I can easily respin this to not need the extra variable.

How's this then?

---
Subject: mmu_gather: Force tlb-flush VM_PFNMAP vmas
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Jul 7 11:51:16 CEST 2022

Jann reported a race between munmap() and unmap_mapping_range(), where
unmap_mapping_range() will no-op once unmap_vmas() has unlinked the
VMA; however munmap() will not yet have invalidated the TLBs.

Therefore unmap_mapping_range() will complete while there are still
(stale) TLB entries for the specified range.

Mitigate this by force flushing TLBs for VM_PFNMAP ranges.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/asm-generic/tlb.h |   18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -507,16 +507,22 @@ static inline void tlb_start_vma(struct
 
 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
-	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
+	if (tlb->fullmm)
 		return;
 
 	/*
-	 * Do a TLB flush and reset the range at VMA boundaries; this avoids
-	 * the ranges growing with the unused space between consecutive VMAs,
-	 * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on
-	 * this.
+	 * VM_PFNMAP is more fragile because the core mm will not track the
+	 * page mapcount -- there might not be page-frames for these PFNs after
+	 * all. Force flush TLBs for such ranges to avoid munmap() vs
+	 * unmap_mapping_range() races.
 	 */
-	tlb_flush_mmu_tlbonly(tlb);
+	if ((vma->vm_flags & VM_PFNMAP) || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) {
+		/*
+		 * Do a TLB flush and reset the range at VMA boundaries; this avoids
+		 * the ranges growing with the unused space between consecutive VMAs.
+		 */
+		tlb_flush_mmu_tlbonly(tlb);
+	}
 }
 
 /*


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] mmu_gather: Force tlb-flush VM_PFNMAP vmas
  2022-07-08  7:18 ` [PATCH 4/4] mmu_gather: Force tlb-flush VM_PFNMAP vmas Peter Zijlstra
  2022-07-08 13:36   ` Will Deacon
@ 2022-07-08 14:04   ` Jann Horn
  2022-07-09  8:38     ` Peter Zijlstra
  1 sibling, 1 reply; 16+ messages in thread
From: Jann Horn @ 2022-07-08 14:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Will Deacon, linux-kernel, linux-mm, Dave Airlie,
	Daniel Vetter, Andrew Morton, Guo Ren, David Miller

On Fri, Jul 8, 2022 at 9:19 AM Peter Zijlstra <peterz@infradead.org> wrote:
> Jann reported a race between munmap() and unmap_mapping_range(), where
> unmap_mapping_range() will no-op once unmap_vmas() has unlinked the
> VMA; however munmap() will not yet have invalidated the TLBs.
>
> Therefore unmap_mapping_range() will complete while there are still
> (stale) TLB entries for the specified range.
>
> Mitigate this by force flushing TLBs for VM_PFNMAP ranges.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/asm-generic/tlb.h |   33 +++++++++++++++++----------------
>  1 file changed, 17 insertions(+), 16 deletions(-)
>
> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -303,6 +303,7 @@ struct mmu_gather {
>          */
>         unsigned int            vma_exec : 1;
>         unsigned int            vma_huge : 1;
> +       unsigned int            vma_pfn  : 1;
>
>         unsigned int            batch_count;
>
> @@ -373,7 +374,6 @@ tlb_update_vma_flags(struct mmu_gather *
>  #else /* CONFIG_MMU_GATHER_NO_RANGE */
>
>  #ifndef tlb_flush
> -
>  /*
>   * When an architecture does not provide its own tlb_flush() implementation
>   * but does have a reasonably efficient flush_vma_range() implementation
> @@ -393,6 +393,9 @@ static inline void tlb_flush(struct mmu_
>                 flush_tlb_range(&vma, tlb->start, tlb->end);
>         }
>  }
> +#endif
> +
> +#endif /* CONFIG_MMU_GATHER_NO_RANGE */
>
>  static inline void
>  tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
> @@ -410,17 +413,9 @@ tlb_update_vma_flags(struct mmu_gather *
>          */
>         tlb->vma_huge = is_vm_hugetlb_page(vma);
>         tlb->vma_exec = !!(vma->vm_flags & VM_EXEC);
> +       tlb->vma_pfn  = !!(vma->vm_flags & VM_PFNMAP);

We should probably handle VM_MIXEDMAP the same way as VM_PFNMAP here,
I think? Conceptually I think the same issue can happen with
device-owned pages that aren't managed by the kernel's page allocator,
and for those, VM_MIXEDMAP is the same as VM_PFNMAP.

>  }
>
> -#else
> -
> -static inline void
> -tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
> -
> -#endif
> -
> -#endif /* CONFIG_MMU_GATHER_NO_RANGE */
> -
>  static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
>  {
>         /*
> @@ -507,16 +502,22 @@ static inline void tlb_start_vma(struct
>
>  static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
>  {
> -       if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
> +       if (tlb->fullmm)
>                 return;

Is this correct, or would there still be a race between MM teardown
(which sets ->fullmm, see exit_mmap()->tlb_gather_mmu_fullmm()) and
unmap_mapping_range()? My understanding is that ->fullmm only
guarantees a flush at tlb_finish_mmu(), but here we're trying to
ensure a flush before unlink_file_vma().

>         /*
> -        * Do a TLB flush and reset the range at VMA boundaries; this avoids
> -        * the ranges growing with the unused space between consecutive VMAs,
> -        * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on
> -        * this.
> +        * VM_PFNMAP is more fragile because the core mm will not track the
> +        * page mapcount -- there might not be page-frames for these PFNs after
> +        * all. Force flush TLBs for such ranges to avoid munmap() vs
> +        * unmap_mapping_range() races.

Maybe add: "We do *not* guarantee that after munmap() has passed
through tlb_end_vma(), there are no more stale TLB entries for this
VMA; there could be a parallel PTE-zapping operation that has zapped
PTEs before we looked at them but hasn't done the corresponding TLB
flush yet. However, such a parallel zap can't be done through the
mm_struct (we've unlinked the VMA), so it would have to be done under
the ->i_mmap_sem in read mode, which we synchronize against in
unlink_file_vma()."

I'm not convinced it's particularly nice to do a flush in
tlb_end_vma() when we can't make guarantees about the TLB state wrt
parallel invalidations, and when we only really care about having a
flush between unmap_vmas() and free_pgtables(), but I guess it works?

>          */
> -       tlb_flush_mmu_tlbonly(tlb);
> +       if (tlb->vma_pfn || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) {
> +               /*
> +                * Do a TLB flush and reset the range at VMA boundaries; this avoids
> +                * the ranges growing with the unused space between consecutive VMAs.
> +                */
> +               tlb_flush_mmu_tlbonly(tlb);
> +       }
>  }
>
>  /*
>
>


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] mmu_gather: Force tlb-flush VM_PFNMAP vmas
  2022-07-08 14:04   ` Jann Horn
@ 2022-07-09  8:38     ` Peter Zijlstra
  2022-07-11 15:04       ` Jann Horn
  0 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2022-07-09  8:38 UTC (permalink / raw)
  To: Jann Horn
  Cc: Linus Torvalds, Will Deacon, linux-kernel, linux-mm, Dave Airlie,
	Daniel Vetter, Andrew Morton, Guo Ren, David Miller

On Fri, Jul 08, 2022 at 04:04:38PM +0200, Jann Horn wrote:
> On Fri, Jul 8, 2022 at 9:19 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > Jann reported a race between munmap() and unmap_mapping_range(), where
> > unmap_mapping_range() will no-op once unmap_vmas() has unlinked the
> > VMA; however munmap() will not yet have invalidated the TLBs.
> >
> > Therefore unmap_mapping_range() will complete while there are still
> > (stale) TLB entries for the specified range.
> >
> > Mitigate this by force flushing TLBs for VM_PFNMAP ranges.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> >  include/asm-generic/tlb.h |   33 +++++++++++++++++----------------
> >  1 file changed, 17 insertions(+), 16 deletions(-)
> >
> > --- a/include/asm-generic/tlb.h
> > +++ b/include/asm-generic/tlb.h
> > @@ -303,6 +303,7 @@ struct mmu_gather {
> >          */
> >         unsigned int            vma_exec : 1;
> >         unsigned int            vma_huge : 1;
> > +       unsigned int            vma_pfn  : 1;
> >
> >         unsigned int            batch_count;
> >
> > @@ -373,7 +374,6 @@ tlb_update_vma_flags(struct mmu_gather *
> >  #else /* CONFIG_MMU_GATHER_NO_RANGE */
> >
> >  #ifndef tlb_flush
> > -
> >  /*
> >   * When an architecture does not provide its own tlb_flush() implementation
> >   * but does have a reasonably efficient flush_vma_range() implementation
> > @@ -393,6 +393,9 @@ static inline void tlb_flush(struct mmu_
> >                 flush_tlb_range(&vma, tlb->start, tlb->end);
> >         }
> >  }
> > +#endif
> > +
> > +#endif /* CONFIG_MMU_GATHER_NO_RANGE */
> >
> >  static inline void
> >  tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
> > @@ -410,17 +413,9 @@ tlb_update_vma_flags(struct mmu_gather *
> >          */
> >         tlb->vma_huge = is_vm_hugetlb_page(vma);
> >         tlb->vma_exec = !!(vma->vm_flags & VM_EXEC);
> > +       tlb->vma_pfn  = !!(vma->vm_flags & VM_PFNMAP);
> 
> We should probably handle VM_MIXEDMAP the same way as VM_PFNMAP here,
> I think? Conceptually I think the same issue can happen with
> device-owned pages that aren't managed by the kernel's page allocator,
> and for those, VM_MIXEDMAP is the same as VM_PFNMAP.

Hmm, yeah, that seems to make sense.

> >  }
> >
> > -#else
> > -
> > -static inline void
> > -tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
> > -
> > -#endif
> > -
> > -#endif /* CONFIG_MMU_GATHER_NO_RANGE */
> > -
> >  static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
> >  {
> >         /*
> > @@ -507,16 +502,22 @@ static inline void tlb_start_vma(struct
> >
> >  static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
> >  {
> > -       if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
> > +       if (tlb->fullmm)
> >                 return;
> 
> Is this correct, or would there still be a race between MM teardown
> (which sets ->fullmm, see exit_mmap()->tlb_gather_mmu_fullmm()) and
> unmap_mapping_range()? My understanding is that ->fullmm only
> guarantees a flush at tlb_finish_mmu(), but here we're trying to
> ensure a flush before unlink_file_vma().

fullmm is when the last user of the mm goes away; there should not be
any races on the address space at that time. Also see the comment with
tlb_gather_mmu_fullmm() and its users.
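
(For context, a rough sketch of that exit path, paraphrased from mm/mmap.c and
not the literal code:)

	/* exit_mmap(): the last user of the mm is gone */
	mmap_write_lock(mm);			/* synchronizes against remote reaping */
	tlb_gather_mmu_fullmm(&tlb, mm);
	unmap_vmas(&tlb, vma, 0, -1);		/* zap every VMA */
	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
	tlb_finish_mmu(&tlb);			/* one full flush at the very end */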

> >         /*
> > -        * Do a TLB flush and reset the range at VMA boundaries; this avoids
> > -        * the ranges growing with the unused space between consecutive VMAs,
> > -        * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on
> > -        * this.
> > +        * VM_PFNMAP is more fragile because the core mm will not track the
> > +        * page mapcount -- there might not be page-frames for these PFNs after
> > +        * all. Force flush TLBs for such ranges to avoid munmap() vs
> > +        * unmap_mapping_range() races.
> 
> Maybe add: "We do *not* guarantee that after munmap() has passed
                         ^ otherwise?
> through tlb_end_vma(), there are no more stale TLB entries for this
> VMA; there could be a parallel PTE-zapping operation that has zapped
> PTEs before we looked at them but hasn't done the corresponding TLB
> flush yet. However, such a parallel zap can't be done through the
> mm_struct (we've unlinked the VMA), so it would have to be done under
> the ->i_mmap_sem in read mode, which we synchronize against in
> unlink_file_vma()."

Done.

> I'm not convinced it's particularly nice to do a flush in
> tlb_end_vma() when we can't make guarantees about the TLB state wrt
> parallel invalidations, and when we only really care about having a
> flush between unmap_vmas() and free_pgtables(), but I guess it works?

Yeah, none of this is pretty. I despise this whole parallel invalidation
stuff with a passion; we've had ever so many bugs because of it :-(

We could add another mmu_gather callback and place it between
unmap_vmas() and free_pgtables(), but it's a much larger patch and I'm
not entirely sure it's worth the complexity.

OTOH having a callback between freeing pages and freeing page-tables
might not be the worst idea. Let me ponder that for a bit.

Meanwhile; updated patch below.

---
Subject: mmu_gather: Force TLB-flush VM_PFNMAP|VM_MIXEDMAP vmas
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Jul 7 11:51:16 CEST 2022

Jann reported a race between munmap() and unmap_mapping_range(), where
unmap_mapping_range() will no-op once unmap_vmas() has unlinked the
VMA; however munmap() will not yet have invalidated the TLBs.

Therefore unmap_mapping_range() will complete while there are still
(stale) TLB entries for the specified range.

Mitigate this by force flushing TLBs for VM_PFNMAP and VM_MIXEDMAP ranges.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/asm-generic/tlb.h |   28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -507,16 +507,32 @@ static inline void tlb_start_vma(struct
 
 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 {
-	if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
+	if (tlb->fullmm)
 		return;
 
 	/*
-	 * Do a TLB flush and reset the range at VMA boundaries; this avoids
-	 * the ranges growing with the unused space between consecutive VMAs,
-	 * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on
-	 * this.
+	 * VM_PFNMAP|VM_MIXEDMAP is more fragile because the core mm will not
+	 * track the page mapcount -- there might not be page-frames for these
+	 * PFNs after all. Force flush TLBs for such ranges to avoid munmap()
+	 * vs unmap_mapping_range() races.
+	 *
+	 * We do *NOT* guarantee that after munmap() has passed through
+	 * tlb_end_vma(), there are no more stale TLB entries for this VMA;
+	 * there could be a parallel PTE-zapping operation that has zapped PTEs
+	 * before we looked at them but hasn't done the corresponding TLB flush
+	 * yet. However, such a parallel zap can't be done through the
+	 * mm_struct (we've unlinked the VMA), so it would have to be done under
+	 * the ->i_mmap_sem in read mode, which we synchronize against in
+	 * unlink_file_vma().
 	 */
-	tlb_flush_mmu_tlbonly(tlb);
+	if ((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ||
+	    !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) {
+		/*
+		 * Do a TLB flush and reset the range at VMA boundaries; this avoids
+		 * the ranges growing with the unused space between consecutive VMAs.
+		 */
+		tlb_flush_mmu_tlbonly(tlb);
+	}
 }
 
 /*


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] mmu_gather: Force tlb-flush VM_PFNMAP vmas
  2022-07-09  8:38     ` Peter Zijlstra
@ 2022-07-11 15:04       ` Jann Horn
  0 siblings, 0 replies; 16+ messages in thread
From: Jann Horn @ 2022-07-11 15:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Will Deacon, linux-kernel, linux-mm, Dave Airlie,
	Daniel Vetter, Andrew Morton, Guo Ren, David Miller

On Sat, Jul 9, 2022 at 10:38 AM Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Jul 08, 2022 at 04:04:38PM +0200, Jann Horn wrote:
> > On Fri, Jul 8, 2022 at 9:19 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > > @@ -507,16 +502,22 @@ static inline void tlb_start_vma(struct
> > >
> > >  static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
> > >  {
> > > -       if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
> > > +       if (tlb->fullmm)
> > >                 return;
> >
> > Is this correct, or would there still be a race between MM teardown
> > (which sets ->fullmm, see exit_mmap()->tlb_gather_mmu_fullmm()) and
> > unmap_mapping_range()? My understanding is that ->fullmm only
> > guarantees a flush at tlb_finish_mmu(), but here we're trying to
> > ensure a flush before unlink_file_vma().
>
> fullmm is when the last user of the mm goes away, there should not be

(FWIW, there also seems to be an error path in write_ldt ->
free_ldt_pgtables -> tlb_gather_mmu_fullmm where ->fullmm can be set
for a TLB shootdown in a live process, but that's irrelevant for this
patch.)

> any races on the address space at that time. Also see the comment with
> tlb_gather_mmu_fullmm() and its users.

Ah, right, aside from the LDT weirdness, fullmm is only used in
exit_mmap, and at that point there can be no more parallel access to
the MM except for remote memory reaping (which is synchronized against
using mmap_write_lock()) and rmap walks...

> Subject: mmu_gather: Force TLB-flush VM_PFNMAP|VM_MIXEDMAP vmas
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Thu Jul 7 11:51:16 CEST 2022
>
> Jann reported a race between munmap() and unmap_mapping_range(), where
> unmap_mapping_range() will no-op once unmap_vmas() has unlinked the
> VMA; however munmap() will not yet have invalidated the TLBs.
>
> Therefore unmap_mapping_range() will complete while there are still
> (stale) TLB entries for the specified range.
>
> Mitigate this by force flushing TLBs for VM_PFNMAP ranges.
>
> Reported-by: Jann Horn <jannh@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Looks good to me.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4] munmap() vs unmap_mapping_range()
  2022-07-08  7:18 [PATCH 0/4] munmap() vs unmap_mapping_range() Peter Zijlstra
                   ` (3 preceding siblings ...)
  2022-07-08  7:18 ` [PATCH 4/4] mmu_gather: Force tlb-flush VM_PFNMAP vmas Peter Zijlstra
@ 2022-07-21  8:37 ` Peter Zijlstra
  2022-07-21 17:46   ` Linus Torvalds
  4 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2022-07-21  8:37 UTC (permalink / raw)
  To: Jann Horn, Linus Torvalds, Will Deacon
  Cc: linux-kernel, linux-mm, Dave Airlie, Daniel Vetter, Andrew Morton,
	Guo Ren, David Miller

On Fri, Jul 08, 2022 at 09:18:02AM +0200, Peter Zijlstra wrote:
> Hi,
> 
> These 4 patches implement the suggestion by Linus.
> 
> The first patch nukes all architecture implementations of tlb_{start,end}_vma()
> by adding two additional MMU_GATHER config knobs.
> 
> DaveM, please clarify the sparc64 case, it seems a little odd to have
> flush_cache_range() but to explicitly not call it at unmap time. It would be
> nice to remove this one special case.
> 
> The second patch isn't strictly needed for this series but since I was looking
> at all this, I figured C-SKY didn't make a whole lot of sense. Guo, please
> have a look.
> 
> The third patch cleans up the #ifdeffery and provides a single
> tlb_{start,end}_vma() implementation for everyone.
> 
> The fourth, and final, patch does the force TLB-flush on VM_PFNMAP thing.
> 
> I've had the first three patches exposed to the robots and they've (so far) not
> reported build (or other) fail.
> 

What are we doing with these patches? Andrew will you pick them up, or
will I smuggle them in x86/mm or something ?


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4] munmap() vs unmap_mapping_range()
  2022-07-21  8:37 ` [PATCH 0/4] munmap() vs unmap_mapping_range() Peter Zijlstra
@ 2022-07-21 17:46   ` Linus Torvalds
  2022-07-21 17:52     ` Linus Torvalds
  0 siblings, 1 reply; 16+ messages in thread
From: Linus Torvalds @ 2022-07-21 17:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jann Horn, Will Deacon, Linux Kernel Mailing List, Linux-MM,
	Dave Airlie, Daniel Vetter, Andrew Morton, Guo Ren, David Miller

On Thu, Jul 21, 2022 at 1:37 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> What are we doing with these patches? Andrew will you pick them up, or
> will I smuggle them in x86/mm or something ?

I'm picking up other random things right now anyway as people have
emailed me, so I'll just do that with this series too.

           Linus


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4] munmap() vs unmap_mapping_range()
  2022-07-21 17:46   ` Linus Torvalds
@ 2022-07-21 17:52     ` Linus Torvalds
  0 siblings, 0 replies; 16+ messages in thread
From: Linus Torvalds @ 2022-07-21 17:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jann Horn, Will Deacon, Linux Kernel Mailing List, Linux-MM,
	Dave Airlie, Daniel Vetter, Andrew Morton, Guo Ren, David Miller

On Thu, Jul 21, 2022 at 10:46 AM Linus Torvalds
<torvalds@linuxfoundation.org> wrote:
>
> I'm picking up other random things right now anyway as people have
> emailed me, so I'll just do that with this series too.

.. and I took the suggestion from Jann to make that last patch also
check VM_MIXEDMAP in addition to VM_PFNMAP.

                Linus


^ permalink raw reply	[flat|nested] 16+ messages in thread
