LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [RFC PATCH 6/7] lazy tlb: allow lazy tlb mm switching to be configurable
From: Nicholas Piggin @ 2020-07-10  1:56 UTC (permalink / raw)
  To: linux-arch
  Cc: Arnd Bergmann, Peter Zijlstra, x86, linux-kernel, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20200710015646.2020871-1-npiggin@gmail.com>

NOMMU systems could easily go without this and save a bit of code
and the mm refcounting, because their mm switch is a no-op. I haven't
flipped them over because I haven't yet audited all the arch code to
convert it over to using the _lazy_tlb refcounting.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig             |  7 +++++
 include/linux/sched/mm.h | 12 ++++++---
 kernel/sched/core.c      | 55 +++++++++++++++++++++++++++-------------
 kernel/sched/sched.h     |  4 ++-
 4 files changed, 55 insertions(+), 23 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 8cc35dc556c7..2daf8fe6146a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -411,6 +411,13 @@ config MMU_GATHER_NO_GATHER
 	bool
 	depends on MMU_GATHER_TABLE_FREE
 
+# Would like to make this depend on MMU, because there is little use for lazy mm switching
+# with NOMMU, but have to audit NOMMU architecture code first.
+config MMU_LAZY_TLB
+	def_bool y
+	help
+	  Enable "lazy TLB" mmu context switching for kernel threads.
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 110d4ad21de6..2c2b20e2ccc7 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -53,18 +53,22 @@ void mmdrop(struct mm_struct *mm);
 /* Helpers for lazy TLB mm refcounting */
 static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 {
-	mmgrab(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
+		mmgrab(mm);
 }
 
 static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 {
-	mmdrop(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
+		mmdrop(mm);
 }
 
 static inline void mmdrop_lazy_tlb_smp_mb(struct mm_struct *mm)
 {
-	/* This depends on mmdrop providing a full smp_mb() */
-	mmdrop(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
+		mmdrop(mm); /* This depends on mmdrop providing a full smp_mb() */
+	else
+		smp_mb();
 }
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d19f2f517f6c..14b4fae6f6e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3253,7 +3253,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
+	struct mm_struct *mm = NULL;
 	long prev_state;
 
 	/*
@@ -3272,7 +3272,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		      current->comm, current->pid, preempt_count()))
 		preempt_count_set(FORK_PREEMPT_COUNT);
 
-	rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB
+	mm = rq->prev_lazy_mm;
+	rq->prev_lazy_mm = NULL;
+#endif
 
 	/*
 	 * A task struct has one reference for the use as "current".
@@ -3393,22 +3396,11 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	calculate_sigpending();
 }
 
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static __always_inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
-	       struct task_struct *next, struct rq_flags *rf)
+static __always_inline void
+context_switch_mm(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
 {
-	prepare_task_switch(rq, prev, next);
-
-	/*
-	 * For paravirt, this is coupled with an exit in switch_to to
-	 * combine the page table reload and the switch backend into
-	 * one hypercall.
-	 */
-	arch_start_context_switch(prev);
-
+#ifdef CONFIG_MMU_LAZY_TLB
 	/*
 	 * kernel -> kernel   lazy + transfer active
 	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
@@ -3440,10 +3432,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
 			exit_lazy_tlb(prev->active_mm, next);
 
 			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
-			rq->prev_mm = prev->active_mm;
+			rq->prev_lazy_mm = prev->active_mm;
 			prev->active_mm = NULL;
 		}
 	}
+#else
+	if (!next->mm)
+		next->active_mm = &init_mm;
+	membarrier_switch_mm(rq, prev->active_mm, next->active_mm);
+	switch_mm_irqs_off(prev->active_mm, next->active_mm, next);
+	if (!prev->mm)
+		prev->active_mm = NULL;
+#endif
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next, struct rq_flags *rf)
+{
+	prepare_task_switch(rq, prev, next);
+
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	context_switch_mm(rq, prev, next);
 
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 877fb08eb1b0..b196dd885d33 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -929,7 +929,9 @@ struct rq {
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
-	struct mm_struct	*prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB
+	struct mm_struct	*prev_lazy_mm;
+#endif
 
 	unsigned int		clock_update_flags;
 	u64			clock;
-- 
2.23.0


^ permalink raw reply related

* [RFC PATCH 5/7] lazy tlb: introduce lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-07-10  1:56 UTC (permalink / raw)
  To: linux-arch
  Cc: Arnd Bergmann, Peter Zijlstra, x86, linux-kernel, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20200710015646.2020871-1-npiggin@gmail.com>

Add explicit _lazy_tlb annotated functions for lazy mm refcounting.
This makes things a bit more explicit, and allows explicit refcounting
to be removed if it is not used.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/smp.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 ++--
 fs/exec.c                            |  2 +-
 include/linux/sched/mm.h             | 17 +++++++++++++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/kthread.c                     | 11 +++++++----
 kernel/sched/core.c                  | 13 +++++++------
 8 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 73199470c265..ad95812d2a3f 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1253,7 +1253,7 @@ void start_secondary(void *unused)
 	unsigned int cpu = smp_processor_id();
 	struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
 
-	mmgrab(&init_mm);
+	mmgrab(&init_mm); /* XXX: where is the mmput for this? */
 	current->active_mm = &init_mm;
 
 	smp_store_cpu_info(cpu);
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b5cc9b23cf02..52730629b3eb 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -652,10 +652,10 @@ static void do_exit_flush_lazy_tlb(void *arg)
 		 * Must be a kernel thread because sender is single-threaded.
 		 */
 		BUG_ON(current->mm);
-		mmgrab(&init_mm);
+		mmgrab_lazy_tlb(&init_mm);
 		switch_mm(mm, &init_mm, current);
 		current->active_mm = &init_mm;
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
 	_tlbiel_pid(pid, RIC_FLUSH_ALL);
 }
diff --git a/fs/exec.c b/fs/exec.c
index e2ab71e88293..3a01b2751ea9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1119,7 +1119,7 @@ static int exec_mmap(struct mm_struct *mm)
 		mmput(old_mm);
 	} else {
 		exit_lazy_tlb(active_mm, tsk);
-		mmdrop(active_mm);
+		mmdrop_lazy_tlb(active_mm);
 	}
 	return 0;
 }
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 9b026264b445..110d4ad21de6 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -50,6 +50,23 @@ static inline void mmdrop(struct mm_struct *mm)
 
 void mmdrop(struct mm_struct *mm);
 
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+	mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+	mmdrop(mm);
+}
+
+static inline void mmdrop_lazy_tlb_smp_mb(struct mm_struct *mm)
+{
+	/* This depends on mmdrop providing a full smp_mb() */
+	mmdrop(mm);
+}
+
 /*
  * This has to be called after a get_task_mm()/mmget_not_zero()
  * followed by taking the mmap_lock for writing before modifying the
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 134688d79589..ff9fcbc4e76b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -578,7 +578,7 @@ static int finish_cpu(unsigned int cpu)
 	 */
 	if (mm != &init_mm)
 		idle->active_mm = &init_mm;
-	mmdrop(mm);
+	mmdrop_lazy_tlb(mm);
 	return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 727150f28103..d535da9fd2f8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -470,7 +470,7 @@ static void exit_mm(void)
 		__set_current_state(TASK_RUNNING);
 		mmap_read_lock(mm);
 	}
-	mmgrab(mm);
+	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(current);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6f93c649aa97..a7133cc2ddaf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1238,12 +1238,12 @@ void kthread_use_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
 	WARN_ON_ONCE(tsk->mm);
 
+	mmgrab(mm);
+
 	task_lock(tsk);
 	active_mm = tsk->active_mm;
-	if (active_mm != mm) {
-		mmgrab(mm);
+	if (active_mm != mm)
 		tsk->active_mm = mm;
-	}
 	tsk->mm = mm;
 	switch_mm(active_mm, mm, tsk);
 	task_unlock(tsk);
@@ -1253,7 +1253,7 @@ void kthread_use_mm(struct mm_struct *mm)
 
 	exit_lazy_tlb(active_mm, tsk);
 	if (active_mm != mm)
-		mmdrop(active_mm);
+		mmdrop_lazy_tlb(active_mm);
 
 	to_kthread(tsk)->oldfs = get_fs();
 	set_fs(USER_DS);
@@ -1276,9 +1276,12 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	task_lock(tsk);
 	sync_mm_rss(mm);
 	tsk->mm = NULL;
+	mmgrab_lazy_tlb(mm);
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
 	task_unlock(tsk);
+
+	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(kthread_unuse_mm);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 31e22c79826c..d19f2f517f6c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3302,10 +3302,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * schedule between user->kernel->user threads without passing though
 	 * switch_mm(). Membarrier requires a full barrier after storing to
 	 * rq->curr, before returning to userspace, for
-	 * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by mmdrop().
+	 * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by
+	 * mmdrop_lazy_tlb_smp_mb().
 	 */
 	if (mm)
-		mmdrop(mm);
+		mmdrop_lazy_tlb_smp_mb(mm);
 
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
@@ -3410,9 +3411,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	/*
 	 * kernel -> kernel   lazy + transfer active
-	 *   user -> kernel   lazy + mmgrab() active
+	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
 	 *
-	 * kernel ->   user   switch + mmdrop() active
+	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
 	 *   user ->   user   switch
 	 */
 	if (!next->mm) {                                // to kernel
@@ -3420,7 +3421,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 		next->active_mm = prev->active_mm;
 		if (prev->mm)                           // from user
-			mmgrab(prev->active_mm);
+			mmgrab_lazy_tlb(prev->active_mm);
 		else
 			prev->active_mm = NULL;
 	} else {                                        // to user
@@ -3438,7 +3439,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		if (!prev->mm) {                        // from kernel
 			exit_lazy_tlb(prev->active_mm, next);
 
-			/* will mmdrop() in finish_task_switch(). */
+			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
 		}
-- 
2.23.0


^ permalink raw reply related

* [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-07-10  1:56 UTC (permalink / raw)
  To: linux-arch
  Cc: Arnd Bergmann, Peter Zijlstra, x86, linux-kernel, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20200710015646.2020871-1-npiggin@gmail.com>

And get rid of the generic sync_core_before_usermode facility.

This helper is the wrong way around, I think. The idea that membarrier
state requires a core sync before returning to user is the easy one
that does not need hiding behind membarrier calls. The gap in core
synchronization due to x86's sysret/sysexit and lazy tlb mode is the
tricky detail that is better put in x86 lazy tlb code.

Consider if an arch did not synchronize core in switch_mm either: then
membarrier_mm_sync_core_before_usermode would be in the wrong place,
but arch specific mmu context functions would still be the right place.
There is also an exit_lazy_tlb case that is not covered by this call, which
could be a bug (a kthread does kthread_use_mm() on the membarrier process's
mm, then context switches back to the process without an mm switch or a
lazy mm switch).

This makes lazy tlb code a bit more modular.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 .../membarrier-sync-core/arch-support.txt     |  6 +++-
 arch/x86/include/asm/mmu_context.h            | 35 +++++++++++++++++++
 arch/x86/include/asm/sync_core.h              | 28 ---------------
 include/linux/sched/mm.h                      | 14 --------
 include/linux/sync_core.h                     | 21 -----------
 kernel/cpu.c                                  |  4 ++-
 kernel/kthread.c                              |  2 +-
 kernel/sched/core.c                           | 16 ++++-----
 8 files changed, 51 insertions(+), 75 deletions(-)
 delete mode 100644 arch/x86/include/asm/sync_core.h
 delete mode 100644 include/linux/sync_core.h

diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index 52ad74a25f54..bd43fb1f5986 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -5,6 +5,10 @@
 #
 # Architecture requirements
 #
+# If your architecture returns to user-space through non-core-serializing
+# instructions, you need to issue a core serializing instruction in switch_mm
+# and exit_lazy_tlb (if lazy tlb switching is implemented).
+#
 # * arm/arm64/powerpc
 #
 # Rely on implicit context synchronization as a result of exception return
@@ -24,7 +28,7 @@
 # instead on write_cr3() performed by switch_mm() to provide core serialization
 # after changing the current mm, and deal with the special case of kthread ->
 # uthread (temporarily keeping current mm into active_mm) by issuing a
-# sync_core_before_usermode() in that specific case.
+# serializing instruction in exit_lazy_tlb() in that specific case.
 #
     -----------------------
     |         arch |status|
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 255750548433..5263863a9be8 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -6,6 +6,7 @@
 #include <linux/atomic.h>
 #include <linux/mm_types.h>
 #include <linux/pkeys.h>
+#include <linux/sched/mm.h>
 
 #include <trace/events/tlb.h>
 
@@ -95,6 +96,40 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 #define enter_lazy_tlb enter_lazy_tlb
 extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 
+#ifdef CONFIG_MEMBARRIER
+/*
+ * Ensure that a core serializing instruction is issued before returning
+ * to user-mode, if a SYNC_CORE was requested. x86 implements return to
+ * user-space through sysexit, sysretl, and sysretq, which are not core
+ * serializing.
+ *
+ * See the membarrier comment in finish_task_switch as to why this is done
+ * in exit_lazy_tlb.
+ */
+#define exit_lazy_tlb exit_lazy_tlb
+static inline void exit_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+	/* Switching mm is serializing with write_cr3 */
+        if (tsk->mm != mm)
+                return;
+
+        if (likely(!(atomic_read(&mm->membarrier_state) &
+                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
+                return;
+
+	/* With PTI, we unconditionally serialize before running user code. */
+	if (static_cpu_has(X86_FEATURE_PTI))
+		return;
+	/*
+	 * Return from interrupt and NMI is done through iret, which is core
+	 * serializing.
+	 */
+	if (in_irq() || in_nmi())
+		return;
+	sync_core();
+}
+#endif
+
 /*
  * Init a new mm.  Used on mm copies, like at fork()
  * and on mm's that are brand-new, like at execve().
diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
deleted file mode 100644
index c67caafd3381..000000000000
--- a/arch/x86/include/asm/sync_core.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_SYNC_CORE_H
-#define _ASM_X86_SYNC_CORE_H
-
-#include <linux/preempt.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-
-/*
- * Ensure that a core serializing instruction is issued before returning
- * to user-mode. x86 implements return to user-space through sysexit,
- * sysrel, and sysretq, which are not core serializing.
- */
-static inline void sync_core_before_usermode(void)
-{
-	/* With PTI, we unconditionally serialize before running user code. */
-	if (static_cpu_has(X86_FEATURE_PTI))
-		return;
-	/*
-	 * Return from interrupt and NMI is done through iret, which is core
-	 * serializing.
-	 */
-	if (in_irq() || in_nmi())
-		return;
-	sync_core();
-}
-
-#endif /* _ASM_X86_SYNC_CORE_H */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 480a4d1b7dd8..9b026264b445 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -7,7 +7,6 @@
 #include <linux/sched.h>
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
-#include <linux/sync_core.h>
 
 /*
  * Routines for handling mm_structs
@@ -364,16 +363,6 @@ enum {
 #include <asm/membarrier.h>
 #endif
 
-static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
-{
-	if (current->mm != mm)
-		return;
-	if (likely(!(atomic_read(&mm->membarrier_state) &
-		     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
-		return;
-	sync_core_before_usermode();
-}
-
 extern void membarrier_exec_mmap(struct mm_struct *mm);
 
 #else
@@ -387,9 +376,6 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
 static inline void membarrier_exec_mmap(struct mm_struct *mm)
 {
 }
-static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
-{
-}
 #endif
 
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h
deleted file mode 100644
index 013da4b8b327..000000000000
--- a/include/linux/sync_core.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SYNC_CORE_H
-#define _LINUX_SYNC_CORE_H
-
-#ifdef CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
-#include <asm/sync_core.h>
-#else
-/*
- * This is a dummy sync_core_before_usermode() implementation that can be used
- * on all architectures which return to user-space through core serializing
- * instructions.
- * If your architecture returns to user-space through non-core-serializing
- * instructions, you need to write your own functions.
- */
-static inline void sync_core_before_usermode(void)
-{
-}
-#endif
-
-#endif /* _LINUX_SYNC_CORE_H */
-
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ff2578ecf17..134688d79589 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -572,7 +572,9 @@ static int finish_cpu(unsigned int cpu)
 
 	/*
 	 * idle_task_exit() will have switched to &init_mm, now
-	 * clean up any remaining active_mm state.
+	 * clean up any remaining active_mm state. exit_lazy_tlb
+	 * is not done; if an arch did any accounting in these
+	 * functions it would have to be added.
 	 */
 	if (mm != &init_mm)
 		idle->active_mm = &init_mm;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e813d92f2eab..6f93c649aa97 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1251,9 +1251,9 @@ void kthread_use_mm(struct mm_struct *mm)
 	finish_arch_post_lock_switch();
 #endif
 
+	exit_lazy_tlb(active_mm, tsk);
 	if (active_mm != mm)
 		mmdrop(active_mm);
-	exit_lazy_tlb(active_mm, tsk);
 
 	to_kthread(tsk)->oldfs = get_fs();
 	set_fs(USER_DS);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index debc917bc69b..31e22c79826c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3294,22 +3294,19 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	kcov_finish_switch(current);
 
 	fire_sched_in_preempt_notifiers(current);
+
 	/*
 	 * When switching through a kernel thread, the loop in
 	 * membarrier_{private,global}_expedited() may have observed that
 	 * kernel thread and not issued an IPI. It is therefore possible to
 	 * schedule between user->kernel->user threads without passing though
-	 * switch_mm(). Membarrier requires a barrier after storing to
-	 * rq->curr, before returning to userspace, so provide them here:
-	 *
-	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
-	 *   provided by mmdrop(),
-	 * - a sync_core for SYNC_CORE.
+	 * switch_mm(). Membarrier requires a full barrier after storing to
+	 * rq->curr, before returning to userspace, for
+	 * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by mmdrop().
 	 */
-	if (mm) {
-		membarrier_mm_sync_core_before_usermode(mm);
+	if (mm)
 		mmdrop(mm);
-	}
+
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
@@ -6292,6 +6289,7 @@ void idle_task_exit(void)
 	BUG_ON(current != this_rq()->idle);
 
 	if (mm != &init_mm) {
+		/* enter_lazy_tlb is not done because we're about to go down */
 		switch_mm(mm, &init_mm, current);
 		finish_arch_post_lock_switch();
 	}
-- 
2.23.0


^ permalink raw reply related

* [RFC PATCH 3/7] mm: introduce exit_lazy_tlb
From: Nicholas Piggin @ 2020-07-10  1:56 UTC (permalink / raw)
  To: linux-arch
  Cc: Arnd Bergmann, Peter Zijlstra, x86, linux-kernel, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20200710015646.2020871-1-npiggin@gmail.com>

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 fs/exec.c                         |  5 +++--
 include/asm-generic/mmu_context.h | 20 ++++++++++++++++++++
 kernel/kthread.c                  |  1 +
 kernel/sched/core.c               |  2 ++
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index e6e8a9a70327..e2ab71e88293 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1117,9 +1117,10 @@ static int exec_mmap(struct mm_struct *mm)
 		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
 		mm_update_next_owner(old_mm);
 		mmput(old_mm);
-		return 0;
+	} else {
+		exit_lazy_tlb(active_mm, tsk);
+		mmdrop(active_mm);
 	}
-	mmdrop(active_mm);
 	return 0;
 }
 
diff --git a/include/asm-generic/mmu_context.h b/include/asm-generic/mmu_context.h
index 86cea80a50df..3fc4c3879b79 100644
--- a/include/asm-generic/mmu_context.h
+++ b/include/asm-generic/mmu_context.h
@@ -24,6 +24,26 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 }
 #endif
 
+/*
+ * exit_lazy_tlb - Called after switching away from a lazy TLB mode mm.
+ *
+ * mm:  the lazy mm context that was switched away from
+ * tsk: the task that was switched to non-lazy mm
+ *
+ * tsk->mm will not be NULL.
+ *
+ * Note this is not symmetrical to enter_lazy_tlb: this is not
+ * called when tasks switch into the lazy mm; it's called after the
+ * lazy mm becomes non-lazy (either switched to a different mm or the
+ * owner of the mm returns).
+ */
+#ifndef exit_lazy_tlb
+static inline void exit_lazy_tlb(struct mm_struct *mm,
+			struct task_struct *tsk)
+{
+}
+#endif
+
 /**
  * init_new_context - Initialize context of a new mm_struct.
  * @tsk: task struct for the mm
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 132f84a5fde3..e813d92f2eab 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1253,6 +1253,7 @@ void kthread_use_mm(struct mm_struct *mm)
 
 	if (active_mm != mm)
 		mmdrop(active_mm);
+	exit_lazy_tlb(active_mm, tsk);
 
 	to_kthread(tsk)->oldfs = get_fs();
 	set_fs(USER_DS);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca5db40392d4..debc917bc69b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3439,6 +3439,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
+			exit_lazy_tlb(prev->active_mm, next);
+
 			/* will mmdrop() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
-- 
2.23.0


^ permalink raw reply related

* [RFC PATCH 2/7] arch: use asm-generic mmu context for no-op implementations
From: Nicholas Piggin @ 2020-07-10  1:56 UTC (permalink / raw)
  To: linux-arch
  Cc: Arnd Bergmann, Peter Zijlstra, x86, linux-kernel, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20200710015646.2020871-1-npiggin@gmail.com>

This patch bunches all architectures together. If the general idea is
accepted I will split them individually. Some architectures can go
further e.g., with consolidating switch_mm and activate_mm but I
only did the more obvious ones.
---
 arch/alpha/include/asm/mmu_context.h         | 12 ++---
 arch/arc/include/asm/mmu_context.h           | 16 +++----
 arch/arm/include/asm/mmu_context.h           | 26 ++---------
 arch/arm64/include/asm/mmu_context.h         |  7 ++-
 arch/csky/include/asm/mmu_context.h          |  8 ++--
 arch/hexagon/include/asm/mmu_context.h       | 33 +++-----------
 arch/ia64/include/asm/mmu_context.h          | 17 ++-----
 arch/m68k/include/asm/mmu_context.h          | 47 ++++----------------
 arch/microblaze/include/asm/mmu_context_mm.h |  8 ++--
 arch/microblaze/include/asm/processor.h      |  3 --
 arch/mips/include/asm/mmu_context.h          | 11 ++---
 arch/nds32/include/asm/mmu_context.h         | 10 +----
 arch/nios2/include/asm/mmu_context.h         | 21 ++-------
 arch/nios2/mm/mmu_context.c                  |  1 +
 arch/openrisc/include/asm/mmu_context.h      |  8 ++--
 arch/openrisc/mm/tlb.c                       |  2 +
 arch/parisc/include/asm/mmu_context.h        | 12 ++---
 arch/powerpc/include/asm/mmu_context.h       | 22 +++------
 arch/riscv/include/asm/mmu_context.h         | 22 +--------
 arch/s390/include/asm/mmu_context.h          |  9 ++--
 arch/sh/include/asm/mmu_context.h            |  5 +--
 arch/sh/include/asm/mmu_context_32.h         |  9 ----
 arch/sparc/include/asm/mmu_context_32.h      | 10 ++---
 arch/sparc/include/asm/mmu_context_64.h      | 10 ++---
 arch/um/include/asm/mmu_context.h            | 12 +++--
 arch/unicore32/include/asm/mmu_context.h     | 24 ++--------
 arch/x86/include/asm/mmu_context.h           |  6 +++
 arch/xtensa/include/asm/mmu_context.h        | 11 ++---
 arch/xtensa/include/asm/nommu_context.h      | 26 +----------
 29 files changed, 106 insertions(+), 302 deletions(-)

diff --git a/arch/alpha/include/asm/mmu_context.h b/arch/alpha/include/asm/mmu_context.h
index 6d7d9bc1b4b8..4eea7c616992 100644
--- a/arch/alpha/include/asm/mmu_context.h
+++ b/arch/alpha/include/asm/mmu_context.h
@@ -214,8 +214,6 @@ ev4_activate_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm)
 	tbiap();
 }
 
-#define deactivate_mm(tsk,mm)	do { } while (0)
-
 #ifdef CONFIG_ALPHA_GENERIC
 # define switch_mm(a,b,c)	alpha_mv.mv_switch_mm((a),(b),(c))
 # define activate_mm(x,y)	alpha_mv.mv_activate_mm((x),(y))
@@ -229,6 +227,7 @@ ev4_activate_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm)
 # endif
 #endif
 
+#define init_new_context init_new_context
 static inline int
 init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
@@ -242,12 +241,7 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }
 
-extern inline void
-destroy_context(struct mm_struct *mm)
-{
-	/* Nothing to do.  */
-}
-
+#define enter_lazy_tlb enter_lazy_tlb
 static inline void
 enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
@@ -255,6 +249,8 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 	  = ((unsigned long)mm->pgd - IDENT_ADDR) >> PAGE_SHIFT;
 }
 
+#include <asm-generic/mmu_context.h>
+
 #ifdef __MMU_EXTERN_INLINE
 #undef __EXTERN_INLINE
 #undef __MMU_EXTERN_INLINE
diff --git a/arch/arc/include/asm/mmu_context.h b/arch/arc/include/asm/mmu_context.h
index 3a5e6a5b9ed6..586d31902a99 100644
--- a/arch/arc/include/asm/mmu_context.h
+++ b/arch/arc/include/asm/mmu_context.h
@@ -102,6 +102,7 @@ static inline void get_new_mmu_context(struct mm_struct *mm)
  * Initialize the context related info for a new mm_struct
  * instance.
  */
+#define init_new_context init_new_context
 static inline int
 init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
@@ -113,6 +114,7 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }
 
+#define destroy_context destroy_context
 static inline void destroy_context(struct mm_struct *mm)
 {
 	unsigned long flags;
@@ -153,13 +155,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 }
 
 /*
- * Called at the time of execve() to get a new ASID
- * Note the subtlety here: get_new_mmu_context() behaves differently here
- * vs. in switch_mm(). Here it always returns a new ASID, because mm has
- * an unallocated "initial" value, while in latter, it moves to a new ASID,
- * only if it was unallocated
+ * activate_mm defaults to switch_mm and is called at the time of execve() to
+ * get a new ASID. Note the subtlety here: get_new_mmu_context() behaves
+ * differently here vs. in switch_mm(). Here it always returns a new ASID,
+ * because mm has an unallocated "initial" value, while in latter, it moves to
+ * a new ASID, only if it was unallocated
  */
-#define activate_mm(prev, next)		switch_mm(prev, next, NULL)
 
 /* it seemed that deactivate_mm( ) is a reasonable place to do book-keeping
  * for retiring-mm. However destroy_context( ) still needs to do that because
@@ -168,8 +169,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
  * there is a good chance that task gets sched-out/in, making it's ASID valid
  * again (this teased me for a whole day).
  */
-#define deactivate_mm(tsk, mm)   do { } while (0)
 
-#define enter_lazy_tlb(mm, tsk)
+#include <asm-generic/mmu_context.h>
 
 #endif /* __ASM_ARC_MMU_CONTEXT_H */
diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index f99ed524fe41..84e58956fcab 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -26,6 +26,8 @@ void __check_vmalloc_seq(struct mm_struct *mm);
 #ifdef CONFIG_CPU_HAS_ASID
 
 void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk);
+
+#define init_new_context init_new_context
 static inline int
 init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
@@ -92,32 +94,10 @@ static inline void finish_arch_post_lock_switch(void)
 
 #endif	/* CONFIG_MMU */
 
-static inline int
-init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-	return 0;
-}
-
-
 #endif	/* CONFIG_CPU_HAS_ASID */
 
-#define destroy_context(mm)		do { } while(0)
 #define activate_mm(prev,next)		switch_mm(prev, next, NULL)
 
-/*
- * This is called when "tsk" is about to enter lazy TLB mode.
- *
- * mm:  describes the currently active mm context
- * tsk: task which is entering lazy tlb
- * cpu: cpu number which is entering lazy tlb
- *
- * tsk->mm will be NULL
- */
-static inline void
-enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 /*
  * This is the actual mm switch as far as the scheduler
  * is concerned.  No registers are touched.  We avoid
@@ -149,6 +129,6 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #endif
 }
 
-#define deactivate_mm(tsk,mm)	do { } while (0)
+#include <asm-generic/mmu_context.h>
 
 #endif
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index b0bd9b55594c..0f5e351f586a 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -174,7 +174,6 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
  * Setting a reserved TTBR0 or EPD0 would work, but it all gets ugly when you
  * take CPU migration into account.
  */
-#define destroy_context(mm)		do { } while(0)
 void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
 
 #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; })
@@ -202,6 +201,7 @@ static inline void update_saved_ttbr0(struct task_struct *tsk,
 }
 #endif
 
+#define enter_lazy_tlb enter_lazy_tlb
 static inline void
 enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
@@ -244,12 +244,11 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	update_saved_ttbr0(tsk, next);
 }
 
-#define deactivate_mm(tsk,mm)	do { } while (0)
-#define activate_mm(prev,next)	switch_mm(prev, next, current)
-
 void verify_cpu_asid_bits(void);
 void post_ttbr_update_workaround(void);
 
+#include <asm-generic/mmu_context.h>
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* !__ASM_MMU_CONTEXT_H */
diff --git a/arch/csky/include/asm/mmu_context.h b/arch/csky/include/asm/mmu_context.h
index abdf1f1cb6ec..b227d29393a8 100644
--- a/arch/csky/include/asm/mmu_context.h
+++ b/arch/csky/include/asm/mmu_context.h
@@ -24,11 +24,6 @@
 #define cpu_asid(mm)		(atomic64_read(&mm->context.asid) & ASID_MASK)
 
 #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.asid, 0); 0; })
-#define activate_mm(prev,next)		switch_mm(prev, next, current)
-
-#define destroy_context(mm)		do {} while (0)
-#define enter_lazy_tlb(mm, tsk)		do {} while (0)
-#define deactivate_mm(tsk, mm)		do {} while (0)
 
 void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
 
@@ -46,4 +41,7 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 
 	flush_icache_deferred(next);
 }
+
+#include <asm-generic/mmu_context.h>
+
 #endif /* __ASM_CSKY_MMU_CONTEXT_H */
diff --git a/arch/hexagon/include/asm/mmu_context.h b/arch/hexagon/include/asm/mmu_context.h
index cdc4adc0300a..81947764c47d 100644
--- a/arch/hexagon/include/asm/mmu_context.h
+++ b/arch/hexagon/include/asm/mmu_context.h
@@ -15,39 +15,13 @@
 #include <asm/pgalloc.h>
 #include <asm/mem-layout.h>
 
-static inline void destroy_context(struct mm_struct *mm)
-{
-}
-
 /*
  * VM port hides all TLB management, so "lazy TLB" isn't very
  * meaningful.  Even for ports to architectures with visble TLBs,
  * this is almost invariably a null function.
+ *
+ * mm->context is set up by pgd_alloc, so no init_new_context required.
  */
-static inline void enter_lazy_tlb(struct mm_struct *mm,
-	struct task_struct *tsk)
-{
-}
-
-/*
- * Architecture-specific actions, if any, for memory map deactivation.
- */
-static inline void deactivate_mm(struct task_struct *tsk,
-	struct mm_struct *mm)
-{
-}
-
-/**
- * init_new_context - initialize context related info for new mm_struct instance
- * @tsk: pointer to a task struct
- * @mm: pointer to a new mm struct
- */
-static inline int init_new_context(struct task_struct *tsk,
-					struct mm_struct *mm)
-{
-	/* mm->context is set up by pgd_alloc */
-	return 0;
-}
 
 /*
  *  Switch active mm context
@@ -74,6 +48,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 /*
  *  Activate new memory map for task
  */
+#define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
 	unsigned long flags;
@@ -86,4 +61,6 @@ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 /*  Generic hooks for arch_dup_mmap and arch_exit_mmap  */
 #include <asm-generic/mm_hooks.h>
 
+#include <asm-generic/mmu_context.h>
+
 #endif
diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h
index 2da0e2eb036b..87a0d5bc11ef 100644
--- a/arch/ia64/include/asm/mmu_context.h
+++ b/arch/ia64/include/asm/mmu_context.h
@@ -49,11 +49,6 @@ DECLARE_PER_CPU(u8, ia64_need_tlb_flush);
 extern void mmu_context_init (void);
 extern void wrap_mmu_context (struct mm_struct *mm);
 
-static inline void
-enter_lazy_tlb (struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 /*
  * When the context counter wraps around all TLBs need to be flushed because
  * an old context number might have been reused. This is signalled by the
@@ -116,6 +111,7 @@ get_mmu_context (struct mm_struct *mm)
  * Initialize context number to some sane value.  MM is guaranteed to be a
  * brand-new address-space, so no TLB flushing is needed, ever.
  */
+#define init_new_context init_new_context
 static inline int
 init_new_context (struct task_struct *p, struct mm_struct *mm)
 {
@@ -123,12 +119,6 @@ init_new_context (struct task_struct *p, struct mm_struct *mm)
 	return 0;
 }
 
-static inline void
-destroy_context (struct mm_struct *mm)
-{
-	/* Nothing to do.  */
-}
-
 static inline void
 reload_context (nv_mm_context_t context)
 {
@@ -178,11 +168,10 @@ activate_context (struct mm_struct *mm)
 	} while (unlikely(context != mm->context));
 }
 
-#define deactivate_mm(tsk,mm)	do { } while (0)
-
 /*
  * Switch from address space PREV to address space NEXT.
  */
+#define activate_mm activate_mm
 static inline void
 activate_mm (struct mm_struct *prev, struct mm_struct *next)
 {
@@ -196,5 +185,7 @@ activate_mm (struct mm_struct *prev, struct mm_struct *next)
 
 #define switch_mm(prev_mm,next_mm,next_task)	activate_mm(prev_mm, next_mm)
 
+#include <asm-generic/mmu_context.h>
+
 # endif /* ! __ASSEMBLY__ */
 #endif /* _ASM_IA64_MMU_CONTEXT_H */
diff --git a/arch/m68k/include/asm/mmu_context.h b/arch/m68k/include/asm/mmu_context.h
index cac9f289d1f6..56ae27322178 100644
--- a/arch/m68k/include/asm/mmu_context.h
+++ b/arch/m68k/include/asm/mmu_context.h
@@ -5,10 +5,6 @@
 #include <asm-generic/mm_hooks.h>
 #include <linux/mm_types.h>
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 #ifdef CONFIG_MMU
 
 #if defined(CONFIG_COLDFIRE)
@@ -58,6 +54,7 @@ static inline void get_mmu_context(struct mm_struct *mm)
 /*
  * We're finished using the context for an address space.
  */
+#define destroy_context destroy_context
 static inline void destroy_context(struct mm_struct *mm)
 {
 	if (mm->context != NO_CONTEXT) {
@@ -79,19 +76,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	set_context(tsk->mm->context, next->pgd);
 }
 
-/*
- * After we have set current->mm to a new value, this activates
- * the context for the new mm so we see the new mappings.
- */
-static inline void activate_mm(struct mm_struct *active_mm,
-	struct mm_struct *mm)
-{
-	get_mmu_context(mm);
-	set_context(mm->context, mm->pgd);
-}
-
-#define deactivate_mm(tsk, mm) do { } while (0)
-
 #define prepare_arch_switch(next) load_ksp_mmu(next)
 
 static inline void load_ksp_mmu(struct task_struct *task)
@@ -176,6 +160,7 @@ extern unsigned long get_free_context(struct mm_struct *mm);
 extern void clear_context(unsigned long context);
 
 /* set the context for a new task to unmapped */
+#define init_new_context init_new_context
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
@@ -210,8 +195,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	activate_context(tsk->mm);
 }
 
-#define deactivate_mm(tsk, mm)	do { } while (0)
-
+#define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *prev_mm,
 			       struct mm_struct *next_mm)
 {
@@ -224,6 +208,7 @@ static inline void activate_mm(struct mm_struct *prev_mm,
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 
+#define init_new_context init_new_context
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
@@ -231,8 +216,6 @@ static inline int init_new_context(struct task_struct *tsk,
 	return 0;
 }
 
-#define destroy_context(mm)		do { } while(0)
-
 static inline void switch_mm_0230(struct mm_struct *mm)
 {
 	unsigned long crp[2] = {
@@ -300,8 +283,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, str
 	}
 }
 
-#define deactivate_mm(tsk,mm)	do { } while (0)
-
+#define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *prev_mm,
 			       struct mm_struct *next_mm)
 {
@@ -315,24 +297,11 @@ static inline void activate_mm(struct mm_struct *prev_mm,
 
 #endif
 
-#else /* !CONFIG_MMU */
+#include <asm-generic/mmu_context.h>
 
-static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-	return 0;
-}
-
-
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk)
-{
-}
-
-#define destroy_context(mm)	do { } while (0)
-#define deactivate_mm(tsk,mm)	do { } while (0)
+#else /* !CONFIG_MMU */
 
-static inline void activate_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm)
-{
-}
+#include <asm-generic/nommu_context.h>
 
 #endif /* CONFIG_MMU */
 #endif /* __M68K_MMU_CONTEXT_H */
diff --git a/arch/microblaze/include/asm/mmu_context_mm.h b/arch/microblaze/include/asm/mmu_context_mm.h
index a1c7dd48454c..c2c77f708455 100644
--- a/arch/microblaze/include/asm/mmu_context_mm.h
+++ b/arch/microblaze/include/asm/mmu_context_mm.h
@@ -33,10 +33,6 @@
    to represent all kernel pages as shared among all contexts.
  */
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 # define NO_CONTEXT	256
 # define LAST_CONTEXT	255
 # define FIRST_CONTEXT	1
@@ -105,6 +101,7 @@ static inline void get_mmu_context(struct mm_struct *mm)
 /*
  * We're finished using the context for an address space.
  */
+#define destroy_context destroy_context
 static inline void destroy_context(struct mm_struct *mm)
 {
 	if (mm->context != NO_CONTEXT) {
@@ -126,6 +123,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
  * After we have set current->mm to a new value, this activates
  * the context for the new mm so we see the new mappings.
  */
+#define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *active_mm,
 			struct mm_struct *mm)
 {
@@ -136,5 +134,7 @@ static inline void activate_mm(struct mm_struct *active_mm,
 
 extern void mmu_context_init(void);
 
+#include <asm-generic/mmu_context.h>
+
 # endif /* __KERNEL__ */
 #endif /* _ASM_MICROBLAZE_MMU_CONTEXT_H */
diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h
index 1ff5a82b76b6..616211871a6e 100644
--- a/arch/microblaze/include/asm/processor.h
+++ b/arch/microblaze/include/asm/processor.h
@@ -122,9 +122,6 @@ unsigned long get_wchan(struct task_struct *p);
 #  define KSTK_EIP(task)	(task_pc(task))
 #  define KSTK_ESP(task)	(task_sp(task))
 
-/* FIXME */
-#  define deactivate_mm(tsk, mm)	do { } while (0)
-
 #  define STACK_TOP	TASK_SIZE
 #  define STACK_TOP_MAX	STACK_TOP
 
diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h
index cddead91acd4..ed9f2d748f63 100644
--- a/arch/mips/include/asm/mmu_context.h
+++ b/arch/mips/include/asm/mmu_context.h
@@ -124,10 +124,6 @@ static inline void set_cpu_context(unsigned int cpu,
 #define cpu_asid(cpu, mm) \
 	(cpu_context((cpu), (mm)) & cpu_asid_mask(&cpu_data[cpu]))
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 extern void get_new_mmu_context(struct mm_struct *mm);
 extern void check_mmu_context(struct mm_struct *mm);
 extern void check_switch_mmu_context(struct mm_struct *mm);
@@ -136,6 +132,7 @@ extern void check_switch_mmu_context(struct mm_struct *mm);
  * Initialize the context related info for a new mm_struct
  * instance.
  */
+#define init_new_context init_new_context
 static inline int
 init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
@@ -180,14 +177,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
  * Destroy context related info for an mm_struct that is about
  * to be put to rest.
  */
+#define destroy_context destroy_context
 static inline void destroy_context(struct mm_struct *mm)
 {
 	dsemul_mm_cleanup(mm);
 }
 
-#define activate_mm(prev, next)	switch_mm(prev, next, current)
-#define deactivate_mm(tsk, mm)	do { } while (0)
-
 static inline void
 drop_mmu_context(struct mm_struct *mm)
 {
@@ -237,4 +232,6 @@ drop_mmu_context(struct mm_struct *mm)
 	local_irq_restore(flags);
 }
 
+#include <asm-generic/mmu_context.h>
+
 #endif /* _ASM_MMU_CONTEXT_H */
diff --git a/arch/nds32/include/asm/mmu_context.h b/arch/nds32/include/asm/mmu_context.h
index b8fd3d189fdc..c651bc8cacdc 100644
--- a/arch/nds32/include/asm/mmu_context.h
+++ b/arch/nds32/include/asm/mmu_context.h
@@ -9,6 +9,7 @@
 #include <asm/proc-fns.h>
 #include <asm-generic/mm_hooks.h>
 
+#define init_new_context init_new_context
 static inline int
 init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
@@ -16,8 +17,6 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }
 
-#define destroy_context(mm)	do { } while(0)
-
 #define CID_BITS	9
 extern spinlock_t cid_lock;
 extern unsigned int cpu_last_cid;
@@ -47,10 +46,6 @@ static inline void check_context(struct mm_struct *mm)
 		__new_context(mm);
 }
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			     struct task_struct *tsk)
 {
@@ -62,7 +57,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	}
 }
 
-#define deactivate_mm(tsk,mm)	do { } while (0)
-#define activate_mm(prev,next)	switch_mm(prev, next, NULL)
+#include <asm-generic/mmu_context.h>
 
 #endif
diff --git a/arch/nios2/include/asm/mmu_context.h b/arch/nios2/include/asm/mmu_context.h
index 78ab3dacf579..4f99ed09b5a7 100644
--- a/arch/nios2/include/asm/mmu_context.h
+++ b/arch/nios2/include/asm/mmu_context.h
@@ -26,16 +26,13 @@ extern unsigned long get_pid_from_context(mm_context_t *ctx);
  */
 extern pgd_t *pgd_current;
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 /*
  * Initialize the context related info for a new mm_struct instance.
  *
  * Set all new contexts to 0, that way the generation will never match
  * the currently running generation when this context is switched in.
  */
+#define init_new_context init_new_context
 static inline int init_new_context(struct task_struct *tsk,
 					struct mm_struct *mm)
 {
@@ -43,26 +40,16 @@ static inline int init_new_context(struct task_struct *tsk,
 	return 0;
 }
 
-/*
- * Destroy context related info for an mm_struct that is about
- * to be put to rest.
- */
-static inline void destroy_context(struct mm_struct *mm)
-{
-}
-
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		struct task_struct *tsk);
 
-static inline void deactivate_mm(struct task_struct *tsk,
-				struct mm_struct *mm)
-{
-}
-
 /*
  * After we have set current->mm to a new value, this activates
  * the context for the new mm so we see the new mappings.
  */
+#define activate_mm activate_mm
 void activate_mm(struct mm_struct *prev, struct mm_struct *next);
 
+#include <asm-generic/mmu_context.h>
+
 #endif /* _ASM_NIOS2_MMU_CONTEXT_H */
diff --git a/arch/nios2/mm/mmu_context.c b/arch/nios2/mm/mmu_context.c
index 45d6b9c58d67..d77aa542deb2 100644
--- a/arch/nios2/mm/mmu_context.c
+++ b/arch/nios2/mm/mmu_context.c
@@ -103,6 +103,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
  * After we have set current->mm to a new value, this activates
  * the context for the new mm so we see the new mappings.
  */
+#define activate_mm activate_mm
 void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
 	next->context = get_new_context();
diff --git a/arch/openrisc/include/asm/mmu_context.h b/arch/openrisc/include/asm/mmu_context.h
index ced577542e29..a6702384c77d 100644
--- a/arch/openrisc/include/asm/mmu_context.h
+++ b/arch/openrisc/include/asm/mmu_context.h
@@ -17,13 +17,13 @@
 
 #include <asm-generic/mm_hooks.h>
 
+#define init_new_context init_new_context
 extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+#define destroy_context destroy_context
 extern void destroy_context(struct mm_struct *mm);
 extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		      struct task_struct *tsk);
 
-#define deactivate_mm(tsk, mm)	do { } while (0)
-
 #define activate_mm(prev, next) switch_mm((prev), (next), NULL)
 
 /* current active pgd - this is similar to other processors pgd
@@ -32,8 +32,6 @@ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 
 extern volatile pgd_t *current_pgd[]; /* defined in arch/openrisc/mm/fault.c */
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
+#include <asm-generic/mmu_context.h>
 
 #endif
diff --git a/arch/openrisc/mm/tlb.c b/arch/openrisc/mm/tlb.c
index 4b680aed8f5f..821aab4cf3be 100644
--- a/arch/openrisc/mm/tlb.c
+++ b/arch/openrisc/mm/tlb.c
@@ -159,6 +159,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
  * instance.
  */
 
+#define init_new_context init_new_context
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
 	mm->context = NO_CONTEXT;
@@ -170,6 +171,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
  * drops it.
  */
 
+#define destroy_context destroy_context
 void destroy_context(struct mm_struct *mm)
 {
 	flush_tlb_mm(mm);
diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h
index 07b89c74abeb..71f8a3679b83 100644
--- a/arch/parisc/include/asm/mmu_context.h
+++ b/arch/parisc/include/asm/mmu_context.h
@@ -8,16 +8,13 @@
 #include <asm/pgalloc.h>
 #include <asm-generic/mm_hooks.h>
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 /* on PA-RISC, we actually have enough contexts to justify an allocator
  * for them.  prumpf */
 
 extern unsigned long alloc_sid(void);
 extern void free_sid(unsigned long);
 
+#define init_new_context init_new_context
 static inline int
 init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 {
@@ -27,6 +24,7 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	return 0;
 }
 
+#define destroy_context destroy_context
 static inline void
 destroy_context(struct mm_struct *mm)
 {
@@ -72,8 +70,7 @@ static inline void switch_mm(struct mm_struct *prev,
 }
 #define switch_mm_irqs_off switch_mm_irqs_off
 
-#define deactivate_mm(tsk,mm)	do { } while (0)
-
+#define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
 	/*
@@ -91,4 +88,7 @@ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 
 	switch_mm(prev,next,current);
 }
+
+#include <asm-generic/mmu_context.h>
+
 #endif
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 1a474f6b1992..242bd987247b 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -14,7 +14,9 @@
 /*
  * Most if the context management is out of line
  */
+#define init_new_context init_new_context
 extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+#define destroy_context destroy_context
 extern void destroy_context(struct mm_struct *mm);
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 struct mm_iommu_table_group_mem_t;
@@ -237,27 +239,15 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 }
 #define switch_mm_irqs_off switch_mm_irqs_off
 
-
-#define deactivate_mm(tsk,mm)	do { } while (0)
-
-/*
- * After we have set current->mm to a new value, this activates
- * the context for the new mm so we see the new mappings.
- */
-static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
-{
-	switch_mm(prev, next, current);
-}
-
-/* We don't currently use enter_lazy_tlb() for anything */
+#ifdef CONFIG_PPC_BOOK3E_64
+#define enter_lazy_tlb enter_lazy_tlb
 static inline void enter_lazy_tlb(struct mm_struct *mm,
 				  struct task_struct *tsk)
 {
 	/* 64-bit Book3E keeps track of current PGD in the PACA */
-#ifdef CONFIG_PPC_BOOK3E_64
 	get_paca()->pgd = NULL;
-#endif
 }
+#endif
 
 extern void arch_exit_mmap(struct mm_struct *mm);
 
@@ -300,5 +290,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
 	return 0;
 }
 
+#include <asm-generic/mmu_context.h>
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/riscv/include/asm/mmu_context.h b/arch/riscv/include/asm/mmu_context.h
index 67c463812e2d..250defa06f3a 100644
--- a/arch/riscv/include/asm/mmu_context.h
+++ b/arch/riscv/include/asm/mmu_context.h
@@ -13,34 +13,16 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 
-static inline void enter_lazy_tlb(struct mm_struct *mm,
-	struct task_struct *task)
-{
-}
-
-/* Initialize context-related info for a new mm_struct */
-static inline int init_new_context(struct task_struct *task,
-	struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void destroy_context(struct mm_struct *mm)
-{
-}
-
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	struct task_struct *task);
 
+#define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *prev,
 			       struct mm_struct *next)
 {
 	switch_mm(prev, next, NULL);
 }
 
-static inline void deactivate_mm(struct task_struct *task,
-	struct mm_struct *mm)
-{
-}
+#include <asm-generic/mmu_context.h>
 
 #endif /* _ASM_RISCV_MMU_CONTEXT_H */
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index c9f3d8a52756..66f9cf0a07e3 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,6 +15,7 @@
 #include <asm/ctl_reg.h>
 #include <asm-generic/mm_hooks.h>
 
+#define init_new_context init_new_context
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
@@ -69,8 +70,6 @@ static inline int init_new_context(struct task_struct *tsk,
 	return 0;
 }
 
-#define destroy_context(mm)             do { } while (0)
-
 static inline void set_user_asce(struct mm_struct *mm)
 {
 	S390_lowcore.user_asce = mm->context.asce;
@@ -125,9 +124,7 @@ static inline void finish_arch_post_lock_switch(void)
 	set_fs(current->thread.mm_segment);
 }
 
-#define enter_lazy_tlb(mm,tsk)	do { } while (0)
-#define deactivate_mm(tsk,mm)	do { } while (0)
-
+#define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *prev,
                                struct mm_struct *next)
 {
@@ -136,4 +133,6 @@ static inline void activate_mm(struct mm_struct *prev,
 	set_user_asce(next);
 }
 
+#include <asm-generic/mmu_context.h>
+
 #endif /* __S390_MMU_CONTEXT_H */
diff --git a/arch/sh/include/asm/mmu_context.h b/arch/sh/include/asm/mmu_context.h
index 9470d17c71c2..ce40147d4a7d 100644
--- a/arch/sh/include/asm/mmu_context.h
+++ b/arch/sh/include/asm/mmu_context.h
@@ -85,6 +85,7 @@ static inline void get_mmu_context(struct mm_struct *mm, unsigned int cpu)
  * Initialize the context related info for a new mm_struct
  * instance.
  */
+#define init_new_context init_new_context
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
@@ -121,9 +122,7 @@ static inline void switch_mm(struct mm_struct *prev,
 			activate_context(next, cpu);
 }
 
-#define activate_mm(prev, next)		switch_mm((prev),(next),NULL)
-#define deactivate_mm(tsk,mm)		do { } while (0)
-#define enter_lazy_tlb(mm,tsk)		do { } while (0)
+#include <asm-generic/mmu_context.h>
 
 #else
 
diff --git a/arch/sh/include/asm/mmu_context_32.h b/arch/sh/include/asm/mmu_context_32.h
index 71bf12ef1f65..bc5034fa6249 100644
--- a/arch/sh/include/asm/mmu_context_32.h
+++ b/arch/sh/include/asm/mmu_context_32.h
@@ -2,15 +2,6 @@
 #ifndef __ASM_SH_MMU_CONTEXT_32_H
 #define __ASM_SH_MMU_CONTEXT_32_H
 
-/*
- * Destroy context related info for an mm_struct that is about
- * to be put to rest.
- */
-static inline void destroy_context(struct mm_struct *mm)
-{
-	/* Do nothing */
-}
-
 #ifdef CONFIG_CPU_HAS_PTEAEX
 static inline void set_asid(unsigned long asid)
 {
diff --git a/arch/sparc/include/asm/mmu_context_32.h b/arch/sparc/include/asm/mmu_context_32.h
index 7ddcb8badf70..509043f81560 100644
--- a/arch/sparc/include/asm/mmu_context_32.h
+++ b/arch/sparc/include/asm/mmu_context_32.h
@@ -6,13 +6,10 @@
 
 #include <asm-generic/mm_hooks.h>
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 /* Initialize a new mmu context.  This is invoked when a new
  * address space instance (unique or shared) is instantiated.
  */
+#define init_new_context init_new_context
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
 
 /* Destroy a dead context.  This occurs when mmput drops the
@@ -20,17 +17,18 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
  * all the page tables have been flushed.  Our job is to destroy
  * any remaining processor-specific state.
  */
+#define destroy_context destroy_context
 void destroy_context(struct mm_struct *mm);
 
 /* Switch the current MM context. */
 void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm,
 	       struct task_struct *tsk);
 
-#define deactivate_mm(tsk,mm)	do { } while (0)
-
 /* Activate a new MM instance for the current task. */
 #define activate_mm(active_mm, mm) switch_mm((active_mm), (mm), NULL)
 
+#include <asm-generic/mmu_context.h>
+
 #endif /* !(__ASSEMBLY__) */
 
 #endif /* !(__SPARC_MMU_CONTEXT_H) */
diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
index 312fcee8df2b..7a8380c63aab 100644
--- a/arch/sparc/include/asm/mmu_context_64.h
+++ b/arch/sparc/include/asm/mmu_context_64.h
@@ -16,17 +16,16 @@
 #include <asm-generic/mm_hooks.h>
 #include <asm/percpu.h>
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 extern spinlock_t ctx_alloc_lock;
 extern unsigned long tlb_context_cache;
 extern unsigned long mmu_context_bmap[];
 
 DECLARE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm);
 void get_new_mmu_context(struct mm_struct *mm);
+
+#define init_new_context init_new_context
 int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+#define destroy_context destroy_context
 void destroy_context(struct mm_struct *mm);
 
 void __tsb_context_switch(unsigned long pgd_pa,
@@ -136,7 +135,6 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
 	spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
-#define deactivate_mm(tsk,mm)	do { } while (0)
 #define activate_mm(active_mm, mm) switch_mm(active_mm, mm, NULL)
 
 #define  __HAVE_ARCH_START_CONTEXT_SWITCH
@@ -187,6 +185,8 @@ static inline void finish_arch_post_lock_switch(void)
 	}
 }
 
+#include <asm-generic/mmu_context.h>
+
 #endif /* !(__ASSEMBLY__) */
 
 #endif /* !(__SPARC64_MMU_CONTEXT_H) */
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index 17ddd4edf875..f8a100770691 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -37,10 +37,9 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
  * end asm-generic/mm_hooks.h functions
  */
 
-#define deactivate_mm(tsk,mm)	do { } while (0)
-
 extern void force_flush_all(void);
 
+#define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *old, struct mm_struct *new)
 {
 	/*
@@ -66,13 +65,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	}
 }
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, 
-				  struct task_struct *tsk)
-{
-}
-
+#define init_new_context init_new_context
 extern int init_new_context(struct task_struct *task, struct mm_struct *mm);
 
+#define destroy_context destroy_context
 extern void destroy_context(struct mm_struct *mm);
 
+#include <asm-generic/mmu_context.h>
+
 #endif
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index 388c0c811c68..e1751cb5439c 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -18,24 +18,6 @@
 #include <asm/cacheflush.h>
 #include <asm/cpu-single.h>
 
-#define init_new_context(tsk, mm)	0
-
-#define destroy_context(mm)		do { } while (0)
-
-/*
- * This is called when "tsk" is about to enter lazy TLB mode.
- *
- * mm:  describes the currently active mm context
- * tsk: task which is entering lazy tlb
- * cpu: cpu number which is entering lazy tlb
- *
- * tsk->mm will be NULL
- */
-static inline void
-enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
 /*
  * This is the actual mm switch as far as the scheduler
  * is concerned.  No registers are touched.  We avoid
@@ -52,9 +34,6 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		cpu_switch_mm(next->pgd, next);
 }
 
-#define deactivate_mm(tsk, mm)	do { } while (0)
-#define activate_mm(prev, next)	switch_mm(prev, next, NULL)
-
 /*
  * We are inserting a "fake" vma for the user-accessible vector page so
  * gdb and friends can get to it through ptrace and /proc/<pid>/mem.
@@ -95,4 +74,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	/* by default, allow everything */
 	return true;
 }
+
+#include <asm-generic/mmu_context.h>
+
 #endif
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 47562147e70b..255750548433 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -92,12 +92,14 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 }
 #endif
 
+#define enter_lazy_tlb enter_lazy_tlb
 extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 
 /*
  * Init a new mm.  Used on mm copies, like at fork()
  * and on mm's that are brand-new, like at execve().
  */
+#define init_new_context init_new_context
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
@@ -117,6 +119,8 @@ static inline int init_new_context(struct task_struct *tsk,
 	init_new_context_ldt(mm);
 	return 0;
 }
+
+#define destroy_context destroy_context
 static inline void destroy_context(struct mm_struct *mm)
 {
 	destroy_context_ldt(mm);
@@ -215,4 +219,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 
 unsigned long __get_current_cr3_fast(void);
 
+#include <asm-generic/mmu_context.h>
+
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/xtensa/include/asm/mmu_context.h b/arch/xtensa/include/asm/mmu_context.h
index 74923ef3b228..e337ba9686e9 100644
--- a/arch/xtensa/include/asm/mmu_context.h
+++ b/arch/xtensa/include/asm/mmu_context.h
@@ -111,6 +111,7 @@ static inline void activate_context(struct mm_struct *mm, unsigned int cpu)
  * to -1 says the process has never run on any core.
  */
 
+#define init_new_context init_new_context
 static inline int init_new_context(struct task_struct *tsk,
 		struct mm_struct *mm)
 {
@@ -136,24 +137,18 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		activate_context(next, cpu);
 }
 
-#define activate_mm(prev, next)	switch_mm((prev), (next), NULL)
-#define deactivate_mm(tsk, mm)	do { } while (0)
-
 /*
  * Destroy context related info for an mm_struct that is about
  * to be put to rest.
  */
+#define destroy_context destroy_context
 static inline void destroy_context(struct mm_struct *mm)
 {
 	invalidate_page_directory();
 }
 
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-	/* Nothing to do. */
-
-}
+#include <asm-generic/mmu_context.h>
 
 #endif /* CONFIG_MMU */
 #endif /* _XTENSA_MMU_CONTEXT_H */
diff --git a/arch/xtensa/include/asm/nommu_context.h b/arch/xtensa/include/asm/nommu_context.h
index 37251b2ef871..7c9d1918dc41 100644
--- a/arch/xtensa/include/asm/nommu_context.h
+++ b/arch/xtensa/include/asm/nommu_context.h
@@ -7,28 +7,4 @@ static inline void init_kio(void)
 {
 }
 
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
-static inline int init_new_context(struct task_struct *tsk,struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void destroy_context(struct mm_struct *mm)
-{
-}
-
-static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
-{
-}
-
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
-				struct task_struct *tsk)
-{
-}
-
-static inline void deactivate_mm(struct task_struct *tsk, struct mm_struct *mm)
-{
-}
+#include <asm-generic/nommu_context.h>
-- 
2.23.0


^ permalink raw reply related

* [RFC PATCH 1/7] asm-generic: add generic MMU versions of mmu context functions
From: Nicholas Piggin @ 2020-07-10  1:56 UTC (permalink / raw)
  To: linux-arch
  Cc: Arnd Bergmann, Peter Zijlstra, x86, linux-kernel, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, Remis Lima Baima, linuxppc-dev
In-Reply-To: <20200710015646.2020871-1-npiggin@gmail.com>

Many of these are no-ops on many architectures, so extend mmu_context.h
to cover MMU and NOMMU, and split the NOMMU bits out to nommu_context.h

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Remis Lima Baima <remis.developer@googlemail.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/microblaze/include/asm/mmu_context.h |  2 +-
 arch/sh/include/asm/mmu_context.h         |  2 +-
 include/asm-generic/mmu_context.h         | 57 +++++++++++++++++------
 include/asm-generic/nommu_context.h       | 19 ++++++++
 4 files changed, 64 insertions(+), 16 deletions(-)
 create mode 100644 include/asm-generic/nommu_context.h

diff --git a/arch/microblaze/include/asm/mmu_context.h b/arch/microblaze/include/asm/mmu_context.h
index f74f9da07fdc..34004efb3def 100644
--- a/arch/microblaze/include/asm/mmu_context.h
+++ b/arch/microblaze/include/asm/mmu_context.h
@@ -2,5 +2,5 @@
 #ifdef CONFIG_MMU
 # include <asm/mmu_context_mm.h>
 #else
-# include <asm-generic/mmu_context.h>
+# include <asm-generic/nommu_context.h>
 #endif
diff --git a/arch/sh/include/asm/mmu_context.h b/arch/sh/include/asm/mmu_context.h
index 48e67d544d53..9470d17c71c2 100644
--- a/arch/sh/include/asm/mmu_context.h
+++ b/arch/sh/include/asm/mmu_context.h
@@ -134,7 +134,7 @@ static inline void switch_mm(struct mm_struct *prev,
 #define set_TTB(pgd)			do { } while (0)
 #define get_TTB()			(0)
 
-#include <asm-generic/mmu_context.h>
+#include <asm-generic/nommu_context.h>
 
 #endif /* CONFIG_MMU */
 
diff --git a/include/asm-generic/mmu_context.h b/include/asm-generic/mmu_context.h
index 6be9106fb6fb..86cea80a50df 100644
--- a/include/asm-generic/mmu_context.h
+++ b/include/asm-generic/mmu_context.h
@@ -3,44 +3,73 @@
 #define __ASM_GENERIC_MMU_CONTEXT_H
 
 /*
- * Generic hooks for NOMMU architectures, which do not need to do
- * anything special here.
+ * Generic hooks to implement no-op functionality.
  */
 
-#include <asm-generic/mm_hooks.h>
-
 struct task_struct;
 struct mm_struct;
 
+/*
+ * enter_lazy_tlb - Called when "tsk" is about to enter lazy TLB mode.
+ *
+ * @mm:  the currently active mm context which is becoming lazy
+ * @tsk: task which is entering lazy tlb
+ *
+ * tsk->mm will be NULL
+ */
+#ifndef enter_lazy_tlb
 static inline void enter_lazy_tlb(struct mm_struct *mm,
 			struct task_struct *tsk)
 {
 }
+#endif
 
+/**
+ * init_new_context - Initialize context of a new mm_struct.
+ * @tsk: task struct for the mm
+ * @mm:  the new mm struct
+ */
+#ifndef init_new_context
 static inline int init_new_context(struct task_struct *tsk,
 			struct mm_struct *mm)
 {
 	return 0;
 }
+#endif
 
+/**
+ * destroy_context - Undo init_new_context when the mm is going away
+ * @mm: old mm struct
+ */
+#ifndef destroy_context
 static inline void destroy_context(struct mm_struct *mm)
 {
 }
+#endif
 
-static inline void deactivate_mm(struct task_struct *task,
-			struct mm_struct *mm)
-{
-}
-
-static inline void switch_mm(struct mm_struct *prev,
-			struct mm_struct *next,
-			struct task_struct *tsk)
+/**
+ * activate_mm - called after exec switches the current task to a new mm, to switch to it
+ * @prev_mm: previous mm of this task
+ * @next_mm: new mm
+ */
+#ifndef activate_mm
+static inline void activate_mm(struct mm_struct *prev_mm,
+			       struct mm_struct *next_mm)
 {
+	switch_mm(prev_mm, next_mm, current);
 }
+#endif
 
-static inline void activate_mm(struct mm_struct *prev_mm,
-			       struct mm_struct *next_mm)
+/**
+ * deactivate_mm - called when an mm is released after exit or exec switches away from it
+ * @tsk: the task
+ * @mm:  the old mm
+ */
+#ifndef deactivate_mm
+static inline void deactivate_mm(struct task_struct *tsk,
+			struct mm_struct *mm)
 {
 }
+#endif
 
 #endif /* __ASM_GENERIC_MMU_CONTEXT_H */
diff --git a/include/asm-generic/nommu_context.h b/include/asm-generic/nommu_context.h
new file mode 100644
index 000000000000..72b8d8b1d81e
--- /dev/null
+++ b/include/asm-generic/nommu_context.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_GENERIC_NOMMU_H
+#define __ASM_GENERIC_NOMMU_H
+
+/*
+ * Generic hooks for NOMMU architectures, which do not need to do
+ * anything special here.
+ */
+
+#include <asm-generic/mm_hooks.h>
+#include <asm-generic/mmu_context.h>
+
+static inline void switch_mm(struct mm_struct *prev,
+			struct mm_struct *next,
+			struct task_struct *tsk)
+{
+}
+
+#endif /* __ASM_GENERIC_NOMMU_H */
-- 
2.23.0


^ permalink raw reply related

* [RFC PATCH 0/7] mmu context cleanup, lazy tlb cleanup,
From: Nicholas Piggin @ 2020-07-10  1:56 UTC (permalink / raw)
  To: linux-arch
  Cc: Arnd Bergmann, Peter Zijlstra, x86, linux-kernel, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev

This blew up a bit bigger than I thought, so I'd like to get some
comments as to whether people agree with the direction it's going.

The patches aren't cleanly split out by arch, but as it is now it's
probably easier to get a quick overview of the changes at a glance
anyway.

So there's a few different things here.

1. Clean up and use asm-generic for no-op mmu context functions (so
   not just for nommu architectures). This should be functionally a
   no-op for everybody. This allows exit_lazy_tlb to easily be added.

2. Add exit_lazy_tlb and use it for x86, so this is x86 and membarrier
   specific changes. I _may_ have spotted a small membarrier / core sync
   bug here when adding exit_lazy_tlb.

3. Tidy up lazy tlb a little bit, have its own refcount function and
   allow it to be selected out. We can audit the nommu archs and
   deselect it for those.

4. Add a non-refcounting lazy mmu mode, to help scalability when the
   same mm is used for a lot of lazy mmu switching.

Comments, questions on anything would be much appreciated.

Thanks,
Nick

Nicholas Piggin (7):
  asm-generic: add generic MMU versions of mmu context functions
  arch: use asm-generic mmu context for no-op implementations
  mm: introduce exit_lazy_tlb
  x86: use exit_lazy_tlb rather than
    membarrier_mm_sync_core_before_usermode
  lazy tlb: introduce lazy mm refcount helper functions
  lazy tlb: allow lazy tlb mm switching to be configurable
  lazy tlb: shoot lazies, a non-refcounting lazy tlb option

 .../membarrier-sync-core/arch-support.txt     |  6 +-
 arch/Kconfig                                  | 23 +++++
 arch/alpha/include/asm/mmu_context.h          | 12 +--
 arch/arc/include/asm/mmu_context.h            | 16 ++--
 arch/arm/include/asm/mmu_context.h            | 26 +-----
 arch/arm64/include/asm/mmu_context.h          |  7 +-
 arch/csky/include/asm/mmu_context.h           |  8 +-
 arch/hexagon/include/asm/mmu_context.h        | 33 ++------
 arch/ia64/include/asm/mmu_context.h           | 17 +---
 arch/m68k/include/asm/mmu_context.h           | 47 ++---------
 arch/microblaze/include/asm/mmu_context.h     |  2 +-
 arch/microblaze/include/asm/mmu_context_mm.h  |  8 +-
 arch/microblaze/include/asm/processor.h       |  3 -
 arch/mips/include/asm/mmu_context.h           | 11 +--
 arch/nds32/include/asm/mmu_context.h          | 10 +--
 arch/nios2/include/asm/mmu_context.h          | 21 +----
 arch/nios2/mm/mmu_context.c                   |  1 +
 arch/openrisc/include/asm/mmu_context.h       |  8 +-
 arch/openrisc/mm/tlb.c                        |  2 +
 arch/parisc/include/asm/mmu_context.h         | 12 +--
 arch/powerpc/Kconfig                          |  1 +
 arch/powerpc/include/asm/mmu_context.h        | 22 ++---
 arch/powerpc/kernel/smp.c                     |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c          |  4 +-
 arch/riscv/include/asm/mmu_context.h          | 22 +----
 arch/s390/include/asm/mmu_context.h           |  9 +-
 arch/sh/include/asm/mmu_context.h             |  7 +-
 arch/sh/include/asm/mmu_context_32.h          |  9 --
 arch/sparc/include/asm/mmu_context_32.h       | 10 +--
 arch/sparc/include/asm/mmu_context_64.h       | 10 +--
 arch/um/include/asm/mmu_context.h             | 12 ++-
 arch/unicore32/include/asm/mmu_context.h      | 24 +-----
 arch/x86/include/asm/mmu_context.h            | 41 +++++++++
 arch/x86/include/asm/sync_core.h              | 28 -------
 arch/xtensa/include/asm/mmu_context.h         | 11 +--
 arch/xtensa/include/asm/nommu_context.h       | 26 +-----
 fs/exec.c                                     |  5 +-
 include/asm-generic/mmu_context.h             | 77 +++++++++++++----
 include/asm-generic/nommu_context.h           | 19 +++++
 include/linux/sched/mm.h                      | 35 ++++----
 include/linux/sync_core.h                     | 21 -----
 kernel/cpu.c                                  |  6 +-
 kernel/exit.c                                 |  2 +-
 kernel/fork.c                                 | 39 +++++++++
 kernel/kthread.c                              | 12 ++-
 kernel/sched/core.c                           | 84 ++++++++++++-------
 kernel/sched/sched.h                          |  4 +-
 47 files changed, 388 insertions(+), 427 deletions(-)
 delete mode 100644 arch/x86/include/asm/sync_core.h
 create mode 100644 include/asm-generic/nommu_context.h
 delete mode 100644 include/linux/sync_core.h

-- 
2.23.0


^ permalink raw reply

* Re: [RFC][PATCH] avoid refcounting the lazy tlb mm struct
From: Anton Blanchard @ 2020-07-10  0:45 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linux-arch, linux-mm, linuxppc-dev
In-Reply-To: <1594019787.286knc5cet.astroid@bobo.none>

Hi Nick,

> On big systems, the mm refcount can become highly contended when doing
> a lot of context switching with threaded applications (particularly
> switching between the idle thread and an application thread).
> 
> Not doing lazy tlb at all slows switching down quite a bit, so I
> wonder if we can avoid the refcount for the lazy tlb, but have
> __mmdrop() IPI all CPUs that might be using this mm lazily.
> 
> This patch has only had light testing so far, but seems to work okay.

I tested this patch on a large POWER8 system with 1536 hardware threads.
I can create a worst case situation for mm refcounting by using
the threaded context switch test in will-it-scale set to half the
number of available CPUs (768).

With that workload the patch improves the context switch rate by 118x!

Tested-by: Anton Blanchard <anton@ozlabs.org>

Thanks,
Anton

> diff --git a/arch/Kconfig b/arch/Kconfig
> index 8cc35dc556c7..69ea7172db3d 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -411,6 +411,16 @@ config MMU_GATHER_NO_GATHER
>  	bool
>  	depends on MMU_GATHER_TABLE_FREE
>  
> +config MMU_LAZY_TLB_SHOOTDOWN
> +	bool
> +	help
> +	  Instead of refcounting the "lazy tlb" mm struct, which can
> cause
> +	  contention with multi-threaded apps on large
> multiprocessor systems,
> +	  this option causes __mmdrop to IPI all CPUs in the
> mm_cpumask and
> +	  switch to init_mm if they were using the to-be-freed mm as
> the lazy
> +	  tlb. Architectures which do not track all possible lazy
> tlb CPUs in
> +	  mm_cpumask can not use this (without modification).
> +
>  config ARCH_HAVE_NMI_SAFE_CMPXCHG
>  	bool
>  
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 920c4e3ca4ef..24ac85c868db 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -225,6 +225,7 @@ config PPC
>  	select HAVE_PERF_USER_STACK_DUMP
>  	select MMU_GATHER_RCU_TABLE_FREE
>  	select MMU_GATHER_PAGE_SIZE
> +	select MMU_LAZY_TLB_SHOOTDOWN
>  	select HAVE_REGS_AND_STACK_ACCESS_API
>  	select HAVE_RELIABLE_STACKTRACE		if
> PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN select HAVE_SYSCALL_TRACEPOINTS
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c
> b/arch/powerpc/mm/book3s64/radix_tlb.c index
> b5cc9b23cf02..52730629b3eb 100644 ---
> a/arch/powerpc/mm/book3s64/radix_tlb.c +++
> b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -652,10 +652,10 @@ static
> void do_exit_flush_lazy_tlb(void *arg)
>  		 * Must be a kernel thread because sender is
> single-threaded. */
>  		BUG_ON(current->mm);
> -		mmgrab(&init_mm);
> +		mmgrab_lazy_tlb(&init_mm);
>  		switch_mm(mm, &init_mm, current);
>  		current->active_mm = &init_mm;
> -		mmdrop(mm);
> +		mmdrop_lazy_tlb(mm);
>  	}
>  	_tlbiel_pid(pid, RIC_FLUSH_ALL);
>  }
> diff --git a/fs/exec.c b/fs/exec.c
> index e6e8a9a70327..6c96c8feba1f 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1119,7 +1119,7 @@ static int exec_mmap(struct mm_struct *mm)
>  		mmput(old_mm);
>  		return 0;
>  	}
> -	mmdrop(active_mm);
> +	mmdrop_lazy_tlb(active_mm);
>  	return 0;
>  }
>  
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index 480a4d1b7dd8..ef28059086a1 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -51,6 +51,25 @@ static inline void mmdrop(struct mm_struct *mm)
>  
>  void mmdrop(struct mm_struct *mm);
>  
> +static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
> +{
> +	if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
> +		mmgrab(mm);
> +}
> +
> +static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
> +{
> +	if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
> +		mmdrop(mm);
> +}
> +
> +static inline void mmdrop_lazy_tlb_smp_mb(struct mm_struct *mm)
> +{
> +	mmdrop_lazy_tlb(mm);
> +	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
> +		smp_mb();
> +}
> +
>  /*
>   * This has to be called after a get_task_mm()/mmget_not_zero()
>   * followed by taking the mmap_lock for writing before modifying the
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 142b23645d82..e3f1039cee9f 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -685,6 +685,34 @@ static void check_mm(struct mm_struct *mm)
>  #define allocate_mm()	(kmem_cache_alloc(mm_cachep,
> GFP_KERNEL)) #define free_mm(mm)	(kmem_cache_free(mm_cachep,
> (mm))) 
> +static void do_shoot_lazy_tlb(void *arg)
> +{
> +	struct mm_struct *mm = arg;
> +
> +	if (current->active_mm == mm) {
> +		BUG_ON(current->mm);
> +		switch_mm(mm, &init_mm, current);
> +		current->active_mm = &init_mm;
> +	}
> +}
> +
> +static void do_check_lazy_tlb(void *arg)
> +{
> +	struct mm_struct *mm = arg;
> +
> +	BUG_ON(current->active_mm == mm);
> +}
> +
> +void shoot_lazy_tlbs(struct mm_struct *mm)
> +{
> +	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
> +		smp_call_function_many(mm_cpumask(mm),
> do_shoot_lazy_tlb, (void *)mm, 1);
> +		do_shoot_lazy_tlb(mm);
> +	}
> +	smp_call_function(do_check_lazy_tlb, (void *)mm, 1);
> +	do_check_lazy_tlb(mm);
> +}
> +
>  /*
>   * Called when the last reference to the mm
>   * is dropped: either by a lazy thread or by
> @@ -692,6 +720,7 @@ static void check_mm(struct mm_struct *mm)
>   */
>  void __mmdrop(struct mm_struct *mm)
>  {
> +	shoot_lazy_tlbs(mm);
>  	BUG_ON(mm == &init_mm);
>  	WARN_ON_ONCE(mm == current->mm);
>  	WARN_ON_ONCE(mm == current->active_mm);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ca5db40392d4..4d615e0be9e0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3308,7 +3308,7 @@ static struct rq *finish_task_switch(struct
> task_struct *prev) */
>  	if (mm) {
>  		membarrier_mm_sync_core_before_usermode(mm);
> -		mmdrop(mm);
> +		mmdrop_lazy_tlb_smp_mb(mm);
>  	}
>  	if (unlikely(prev_state == TASK_DEAD)) {
>  		if (prev->sched_class->task_dead)
> @@ -3413,9 +3413,9 @@ context_switch(struct rq *rq, struct
> task_struct *prev, 
>  	/*
>  	 * kernel -> kernel   lazy + transfer active
> -	 *   user -> kernel   lazy + mmgrab() active
> +	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
>  	 *
> -	 * kernel ->   user   switch + mmdrop() active
> +	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
>  	 *   user ->   user   switch
>  	 */
>  	if (!next->mm) {                                // to kernel
> @@ -3423,7 +3423,7 @@ context_switch(struct rq *rq, struct
> task_struct *prev, 
>  		next->active_mm = prev->active_mm;
>  		if (prev->mm)                           // from user
> -			mmgrab(prev->active_mm);
> +			mmgrab_lazy_tlb(prev->active_mm);
>  		else
>  			prev->active_mm = NULL;
>  	} else {                                        // to user
> @@ -3439,7 +3439,7 @@ context_switch(struct rq *rq, struct
> task_struct *prev, switch_mm_irqs_off(prev->active_mm, next->mm,
> next); 
>  		if (!prev->mm) {                        // from
> kernel
> -			/* will mmdrop() in finish_task_switch(). */
> +			/* will mmdrop_lazy_tlb() in
> finish_task_switch(). */ rq->prev_mm = prev->active_mm;
>  			prev->active_mm = NULL;
>  		}
> 


^ permalink raw reply

* RE: [PATCH 1/2] powerpc/vas: Report proper error for address translation failure
From: Bulent Abali @ 2020-07-09 20:12 UTC (permalink / raw)
  To: Haren Myneni; +Cc: tulioqm, Haren Myneni, Linuxppc-dev, linuxppc-dev, rzinsly
In-Reply-To: <OFC54F205D.A4C093B7-ON002585A0.006C5930-882585A0.006DFE69@LocalDomain>

[-- Attachment #1: Type: text/plain, Size: 5015 bytes --]

copied verbatim from P9 DD2 Nest Accelerators Workbook Version 3.2

Table 4-36. CSB Non-zero CC Reported Error Types

CC=5, Error Type: Translation, 
Comment: Unused, defined by RFC02130 (footnote:  DMA controller uses this 
CC internally in translation fault handling. Do not reuse for other 
purposes.)

CC=240 through 251, reserved for future firmware use, 
Comment: Error codes 240 - 255 (0xF0 - 0xFF) are reserved for firmware use 
and are not signalled by the hardware. 
These CCs are written in the CSB by hypervisor to alert the partition to 
error conditions detected by the hypervisor. 
These codes have been used in past processors for this purpose and ought 
not be relocated.





From:   Haren Myneni/Beaverton/IBM
To:     Michael Ellerman <mpe@ellerman.id.au>
Cc:     abali@us.ibm.com, Haren Myneni <haren@linux.ibm.com>, 
linuxppc-dev@lists.ozlabs.org, 
"Linuxppc-dev"<linuxppc-dev-bounces+hbabu=us.ibm.com@lists.ozlabs.org>, 
rzinsly@linux.ibm.com, tulioqm@br.ibm.com, Haren 
Myneni/Beaverton/IBM@IBMUS
Date:   07/09/2020 04:01 PM
Subject:        Re: [EXTERNAL] Re: [PATCH 1/2] powerpc/vas: Report proper 
error for address translation failure




"Linuxppc-dev" <linuxppc-dev-bounces+hbabu=us.ibm.com@lists.ozlabs.org> 
wrote on 07/09/2020 04:22:10 AM:

> From: Michael Ellerman <mpe@ellerman.id.au>
> To: Haren Myneni <haren@linux.ibm.com>
> Cc: tulioqm@br.ibm.com, abali@us.ibm.com, linuxppc-
> dev@lists.ozlabs.org, rzinsly@linux.ibm.com
> Date: 07/09/2020 04:21 AM
> Subject: [EXTERNAL] Re: [PATCH 1/2] powerpc/vas: Report proper error
> for address translation failure
> Sent by: "Linuxppc-dev" <linuxppc-dev-bounces
> +hbabu=us.ibm.com@lists.ozlabs.org>
> 
> Haren Myneni <haren@linux.ibm.com> writes:
> > DMA controller uses CC=5 internally for translation fault handling. So
> > OS should be using CC=250 and should report this error to the user 
space
> > when NX encounters address translation failure on the request buffer.
> 
> That doesn't really explain *why* the OS must use CC=250.
> 
> Is it documented somewhere that 5 is for hardware use, and 250 is for
> software?

Yes, mentioned in Table 4-36. CSB Non-zero CC Reported Error Types (P9 NX 
DD2 work book). Also footnote for CC=5 says "DMA controller uses this CC 
internally in translation fault handling. Do not reuse for other purposes"

I will add documentation reference for CC=250 comment. 

> 
> > This patch defines CSB_CC_ADDRESS_TRANSLATION(250) and updates
> > CSB.CC with this proper error code for user space.
> 
> We still have:
> 
> #define CSB_CC_TRANSLATION   (5)
> 
> And it's very unclear where one or the other should be used.
> 
> Can one or the other get a name that makes the distinction clear.

CSB_CC_TRANSLATION is added in 842 driver (nx-common-powernv.c) when NX is 
introduced (P7+). NX will not see faults on kernel requests (cc=250) and 
even CC=5. 

Table 4-36: 
For CC=5: says Translation
CC=250:    says "Address Translation Fault"

So I can say CRB_CC_ADDRESS_TRANSLATION_FAULT or CRN_CC_TRANSLATION_FAULT. 
This code path (also CRBs) should be generic, so should not use like 
CRB_CC_NX_FAULT. 

Thanks
Haren

> 
> cheers
> 
> 
> > diff --git a/Documentation/powerpc/vas-api.rst b/Documentation/
> powerpc/vas-api.rst
> > index 1217c2f..78627cc 100644
> > --- a/Documentation/powerpc/vas-api.rst
> > +++ b/Documentation/powerpc/vas-api.rst
> > @@ -213,7 +213,7 @@ request buffers are not in memory. The 
> operating system handles the fault by
> >  updating CSB with the following data:
> > 
> >     csb.flags = CSB_V;
> > -   csb.cc = CSB_CC_TRANSLATION;
> > +   csb.cc = CSB_CC_ADDRESS_TRANSLATION;
> >     csb.ce = CSB_CE_TERMINATION;
> >     csb.address = fault_address;
> > 
> > diff --git a/arch/powerpc/include/asm/icswx.h b/arch/powerpc/
> include/asm/icswx.h
> > index 965b1f3..b1c9a57 100644
> > --- a/arch/powerpc/include/asm/icswx.h
> > +++ b/arch/powerpc/include/asm/icswx.h
> > @@ -77,6 +77,8 @@ struct coprocessor_completion_block {
> >  #define CSB_CC_CHAIN      (37)
> >  #define CSB_CC_SEQUENCE      (38)
> >  #define CSB_CC_HW      (39)
> > +/* User space address translation failure */
> > +#define   CSB_CC_ADDRESS_TRANSLATION   (250)
> > 
> >  #define CSB_SIZE      (0x10)
> >  #define CSB_ALIGN      CSB_SIZE
> > diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/
> powerpc/platforms/powernv/vas-fault.c
> > index 266a6ca..33e89d4 100644
> > --- a/arch/powerpc/platforms/powernv/vas-fault.c
> > +++ b/arch/powerpc/platforms/powernv/vas-fault.c
> > @@ -79,7 +79,7 @@ static void update_csb(struct vas_window *window,
> >     csb_addr = (void __user *)be64_to_cpu(crb->csb_addr);
> > 
> >     memset(&csb, 0, sizeof(csb));
> > -   csb.cc = CSB_CC_TRANSLATION;
> > +   csb.cc = CSB_CC_ADDRESS_TRANSLATION;
> >     csb.ce = CSB_CE_TERMINATION;
> >     csb.cs = 0;
> >     csb.count = 0;
> > -- 
> > 1.8.3.1
> 





[-- Attachment #2: Type: text/html, Size: 8505 bytes --]

^ permalink raw reply

* RE: [PATCH 1/2] powerpc/vas: Report proper error for address translation failure
From: Haren Myneni @ 2020-07-09 20:01 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: tulioqm, Haren Myneni, Linuxppc-dev, abali, linuxppc-dev, rzinsly
In-Reply-To: <87y2ntue59.fsf@mpe.ellerman.id.au>

[-- Attachment #1: Type: text/plain, Size: 3783 bytes --]




"Linuxppc-dev" <linuxppc-dev-bounces+hbabu=us.ibm.com@lists.ozlabs.org>
wrote on 07/09/2020 04:22:10 AM:

> From: Michael Ellerman <mpe@ellerman.id.au>
> To: Haren Myneni <haren@linux.ibm.com>
> Cc: tulioqm@br.ibm.com, abali@us.ibm.com, linuxppc-
> dev@lists.ozlabs.org, rzinsly@linux.ibm.com
> Date: 07/09/2020 04:21 AM
> Subject: [EXTERNAL] Re: [PATCH 1/2] powerpc/vas: Report proper error
> for address translation failure
> Sent by: "Linuxppc-dev" <linuxppc-dev-bounces
> +hbabu=us.ibm.com@lists.ozlabs.org>
>
> Haren Myneni <haren@linux.ibm.com> writes:
> > DMA controller uses CC=5 internally for translation fault handling. So
> > OS should be using CC=250 and should report this error to the user
space
> > when NX encounters address translation failure on the request buffer.
>
> That doesn't really explain *why* the OS must use CC=250.
>
> Is it documented somewhere that 5 is for hardware use, and 250 is for
> software?

Yes, mentioned in Table 4-36. CSB Non-zero CC Reported Error Types (P9 NX
DD2 work book). Also footnote for CC=5 says "DMA controller uses this CC
internally in translation fault handling. Do not reuse for other purposes"

I will add documentation reference for CC=250 comment.

>
> > This patch defines CSB_CC_ADDRESS_TRANSLATION(250) and updates
> > CSB.CC with this proper error code for user space.
>
> We still have:
>
> #define CSB_CC_TRANSLATION   (5)
>
> And it's very unclear where one or the other should be used.
>
> Can one or the other get a name that makes the distinction clear.

CSB_CC_TRANSLATION is added in 842 driver (nx-common-powernv.c) when NX is
introduced (P7+). NX will not see faults on kernel requests (cc=250) and
even CC=5.

Table 4-36:
For CC=5: says Translation
CC=250:    says "Address Translation Fault"

So I can say CRB_CC_ADDRESS_TRANSLATION_FAULT or CRN_CC_TRANSLATION_FAULT.
This code path (also CRBs) should be generic, so should not use like
CRB_CC_NX_FAULT.

Thanks
Haren

>
> cheers
>
>
> > diff --git a/Documentation/powerpc/vas-api.rst b/Documentation/
> powerpc/vas-api.rst
> > index 1217c2f..78627cc 100644
> > --- a/Documentation/powerpc/vas-api.rst
> > +++ b/Documentation/powerpc/vas-api.rst
> > @@ -213,7 +213,7 @@ request buffers are not in memory. The
> operating system handles the fault by
> >  updating CSB with the following data:
> >
> >     csb.flags = CSB_V;
> > -   csb.cc = CSB_CC_TRANSLATION;
> > +   csb.cc = CSB_CC_ADDRESS_TRANSLATION;
> >     csb.ce = CSB_CE_TERMINATION;
> >     csb.address = fault_address;
> >
> > diff --git a/arch/powerpc/include/asm/icswx.h b/arch/powerpc/
> include/asm/icswx.h
> > index 965b1f3..b1c9a57 100644
> > --- a/arch/powerpc/include/asm/icswx.h
> > +++ b/arch/powerpc/include/asm/icswx.h
> > @@ -77,6 +77,8 @@ struct coprocessor_completion_block {
> >  #define CSB_CC_CHAIN      (37)
> >  #define CSB_CC_SEQUENCE      (38)
> >  #define CSB_CC_HW      (39)
> > +/* User space address translation failure */
> > +#define   CSB_CC_ADDRESS_TRANSLATION   (250)
> >
> >  #define CSB_SIZE      (0x10)
> >  #define CSB_ALIGN      CSB_SIZE
> > diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/
> powerpc/platforms/powernv/vas-fault.c
> > index 266a6ca..33e89d4 100644
> > --- a/arch/powerpc/platforms/powernv/vas-fault.c
> > +++ b/arch/powerpc/platforms/powernv/vas-fault.c
> > @@ -79,7 +79,7 @@ static void update_csb(struct vas_window *window,
> >     csb_addr = (void __user *)be64_to_cpu(crb->csb_addr);
> >
> >     memset(&csb, 0, sizeof(csb));
> > -   csb.cc = CSB_CC_TRANSLATION;
> > +   csb.cc = CSB_CC_ADDRESS_TRANSLATION;
> >     csb.ce = CSB_CE_TERMINATION;
> >     csb.cs = 0;
> >     csb.count = 0;
> > --
> > 1.8.3.1
>

[-- Attachment #2: Type: text/html, Size: 5461 bytes --]

^ permalink raw reply

* [PATCH v5] ima: move APPRAISE_BOOTPARAM dependency on ARCH_POLICY to runtime
From: Bruno Meneguele @ 2020-07-09 16:46 UTC (permalink / raw)
  To: linux-kernel, x86, linuxppc-dev, linux-s390, linux-integrity
  Cc: erichte, Bruno Meneguele, nayna, stable, zohar

APPRAISE_BOOTPARAM has been marked as dependent on !ARCH_POLICY at compile
time, enforcing the appraisal whenever the kernel had the arch policy option
enabled.

However it breaks systems where the option is set but the system didn't
boot on a "secure boot" platform. In this scenario, anytime an appraisal
policy (i.e. ima_policy=appraise_tcb) is used it will be forced, without
giving the user the opportunity to label the filesystem, before enforcing
integrity.

Considering the ARCH_POLICY is only effective when secure boot is actually
enabled, this patch removes the compile time dependency and moves it to a
runtime decision, based on the secure boot state of that platform.

With this patch:

- x86-64 with secure boot enabled

[    0.004305] Secure boot enabled
...
[    0.015651] Kernel command line: <...> ima_policy=appraise_tcb ima_appraise=fix
[    0.015682] ima: appraise boot param ignored: secure boot enabled

- powerpc with secure boot disabled

[    0.000000] Kernel command line: <...> ima_policy=appraise_tcb ima_appraise=fix
[    0.000000] Secure boot mode disabled
...
< nothing about boot param ignored >

System working fine without secure boot and with both options set:

CONFIG_IMA_APPRAISE_BOOTPARAM=y
CONFIG_IMA_ARCH_POLICY=y

Audit logs pointing to "missing-hash" but still being able to execute due to
ima_appraise=fix:

type=INTEGRITY_DATA msg=audit(07/09/2020 12:30:27.778:1691) : pid=4976
uid=root auid=root ses=2
subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 op=appraise_data
cause=missing-hash comm=bash name=/usr/bin/evmctl dev="dm-0" ino=493150
res=no

Cc: stable@vger.kernel.org
Fixes: d958083a8f64 ("x86/ima: define arch_get_ima_policy() for x86")
Signed-off-by: Bruno Meneguele <bmeneg@redhat.com>
---
Changelog:
v5:
  - add pr_info() to inform user the ima_appraise= boot param is being
	ignored due to secure boot enabled (Nayna)
  - add some testing results to commit log
v4:
  - instead of changing arch_policy loading code, check secure boot state at
	"ima_appraise=" parameter handler (Mimi)
v3:
  - extend secure boot arch checker to also consider trusted boot
  - enforce IMA appraisal when secure boot is effectively enabled (Nayna)
  - fix ima_appraise flag assignment by or'ing it (Mimi)
v2:
  - pr_info() message prefix correction

 security/integrity/ima/Kconfig        | 2 +-
 security/integrity/ima/ima_appraise.c | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig
index edde88dbe576..62dc11a5af01 100644
--- a/security/integrity/ima/Kconfig
+++ b/security/integrity/ima/Kconfig
@@ -232,7 +232,7 @@ config IMA_APPRAISE_REQUIRE_POLICY_SIGS
 
 config IMA_APPRAISE_BOOTPARAM
 	bool "ima_appraise boot parameter"
-	depends on IMA_APPRAISE && !IMA_ARCH_POLICY
+	depends on IMA_APPRAISE
 	default y
 	help
 	  This option enables the different "ima_appraise=" modes
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index a9649b04b9f1..884de471b38a 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -19,6 +19,11 @@
 static int __init default_appraise_setup(char *str)
 {
 #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM
+	if (arch_ima_get_secureboot()) {
+		pr_info("appraise boot param ignored: secure boot enabled");
+		return 1;
+	}
+
 	if (strncmp(str, "off", 3) == 0)
 		ima_appraise = 0;
 	else if (strncmp(str, "log", 3) == 0)
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH 2/2] PCI/AER: Log correctable errors as warning, not error
From: Bjorn Helgaas @ 2020-07-09 22:06 UTC (permalink / raw)
  To: Matt Jolly
  Cc: Sam Bobroff, linux-pci, linux-kernel, Oliver O'Halloran,
	Bjorn Helgaas, linuxppc-dev
In-Reply-To: <20200708001401.405749-2-helgaas@kernel.org>

On Tue, Jul 07, 2020 at 07:14:01PM -0500, Bjorn Helgaas wrote:
> From: Matt Jolly <Kangie@footclan.ninja>
> 
> PCIe correctable errors are recovered by hardware with no need for software
> intervention (PCIe r5.0, sec 6.2.2.1).
> 
> Reduce the log level of correctable errors from KERN_ERR to KERN_WARNING.
> 
> The bug reports below are for correctable error logging.  This doesn't fix
> the cause of those reports, but it may make the messages less alarming.
> 
> [bhelgaas: commit log, use pci_printk() to avoid code duplication]
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=201517
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=196183
> Link: https://lore.kernel.org/r/20200618155511.16009-1-Kangie@footclan.ninja
> Signed-off-by: Matt Jolly <Kangie@footclan.ninja>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>

I applied both of these to pci/error for v5.9.

> ---
>  drivers/pci/pcie/aer.c | 25 +++++++++++++++----------
>  1 file changed, 15 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 9176c8a968b9..ca886bf91fd9 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -673,20 +673,23 @@ static void __aer_print_error(struct pci_dev *dev,
>  {
>  	const char **strings;
>  	unsigned long status = info->status & ~info->mask;
> -	const char *errmsg;
> +	const char *level, *errmsg;
>  	int i;
>  
> -	if (info->severity == AER_CORRECTABLE)
> +	if (info->severity == AER_CORRECTABLE) {
>  		strings = aer_correctable_error_string;
> -	else
> +		level = KERN_WARNING;
> +	} else {
>  		strings = aer_uncorrectable_error_string;
> +		level = KERN_ERR;
> +	}
>  
>  	for_each_set_bit(i, &status, 32) {
>  		errmsg = strings[i];
>  		if (!errmsg)
>  			errmsg = "Unknown Error Bit";
>  
> -		pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
> +		pci_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
>  				info->first_error == i ? " (First)" : "");
>  	}
>  	pci_dev_aer_stats_incr(dev, info);
> @@ -696,6 +699,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  {
>  	int layer, agent;
>  	int id = ((dev->bus->number << 8) | dev->devfn);
> +	const char *level;
>  
>  	if (!info->status) {
>  		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> @@ -706,13 +710,14 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
>  	agent = AER_GET_AGENT(info->severity, info->status);
>  
> -	pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> -		aer_error_severity_string[info->severity],
> -		aer_error_layer[layer], aer_agent_string[agent]);
> +	level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
> +
> +	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> +		   aer_error_severity_string[info->severity],
> +		   aer_error_layer[layer], aer_agent_string[agent]);
>  
> -	pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> -		dev->vendor, dev->device,
> -		info->status, info->mask);
> +	pci_printk(level, dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> +		   dev->vendor, dev->device, info->status, info->mask);
>  
>  	__aer_print_error(dev, info);
>  
> -- 
> 2.25.1
> 

^ permalink raw reply

* Re: /sys/kernel/debug/kmemleak empty despite kmemleak reports
From: Paul Menzel @ 2020-07-09 21:08 UTC (permalink / raw)
  To: Catalin Marinas; +Cc: linuxppc-dev
In-Reply-To: <20200709175705.GD6579@gaia>

Dear Catalin,


Am 09.07.20 um 19:57 schrieb Catalin Marinas:
> On Thu, Jul 09, 2020 at 04:37:10PM +0200, Paul Menzel wrote:
>> Despite Linux 5.8-rc4 reporting memory leaks on the IBM POWER 8 S822LC, the
>> file does not contain more information.
>>
>>> $ dmesg
>>> […] > [48662.953323] perf: interrupt took too long (2570 > 2500), lowering kernel.perf_event_max_sample_rate to 77750
>>> [48854.810636] perf: interrupt took too long (3216 > 3212), lowering kernel.perf_event_max_sample_rate to 62000
>>> [52300.044518] perf: interrupt took too long (4244 > 4020), lowering kernel.perf_event_max_sample_rate to 47000
>>> [52751.373083] perf: interrupt took too long (5373 > 5305), lowering kernel.perf_event_max_sample_rate to 37000
>>> [53354.000363] perf: interrupt took too long (6793 > 6716), lowering kernel.perf_event_max_sample_rate to 29250
>>> [53850.215606] perf: interrupt took too long (8672 > 8491), lowering kernel.perf_event_max_sample_rate to 23000
>>> [57542.266099] perf: interrupt took too long (10940 > 10840), lowering kernel.perf_event_max_sample_rate to 18250
>>> [57559.645404] perf: interrupt took too long (13714 > 13675), lowering kernel.perf_event_max_sample_rate to 14500
>>> [61608.697728] Can't find PMC that caused IRQ
>>> [71774.463111] kmemleak: 12 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>> [92372.044785] process '@/usr/bin/gnatmake-5' started with executable stack
>>> [92849.380672] FS-Cache: Loaded
>>> [92849.417269] FS-Cache: Netfs 'nfs' registered for caching
>>> [92849.595974] NFS: Registering the id_resolver key type
>>> [92849.596000] Key type id_resolver registered
>>> [92849.596000] Key type id_legacy registered
>>> [101808.079143] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>> [106904.323471] Can't find PMC that caused IRQ
>>> [129416.391456] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>> [158171.604221] kmemleak: 34 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>> $ sudo cat /sys/kernel/debug/kmemleak
> 
> When they are no longer present, they are most likely false positives.

How can this be? Shouldn’t the false positive also be logged in 
`/sys/kernel/debug/kmemleak`?

> Was this triggered during boot? Or under some workload?

From the timestamps, it looks like it happened under some load.


Kind regards,

Paul

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Mathieu Desnoyers @ 2020-07-09 20:57 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <20200709204609.GQ3598@gate.crashing.org>

----- On Jul 9, 2020, at 4:46 PM, Segher Boessenkool segher@kernel.crashing.org wrote:

> On Thu, Jul 09, 2020 at 01:56:19PM -0400, Mathieu Desnoyers wrote:
>> > Just to make sure I understand your recommendation. So rather than
>> > hard coding r17 as the temporary registers, we could explicitly
>> > declare the temporary register as a C variable, pass it as an
>> > input operand to the inline asm, and then refer to it by operand
>> > name in the macros using it. This way the compiler would be free
>> > to perform its own register allocation.
>> > 
>> > If that is what you have in mind, then yes, I think it makes a
>> > lot of sense.
>> 
>> Except that asm goto have this limitation with gcc: those cannot
>> have any output operand, only inputs, clobbers and target labels.
>> We cannot modify a temporary register received as input operand. So I don't
>> see how to get a temporary register allocated by the compiler considering
>> this limitation.
> 
> Heh, yet another reason not to obfuscate your inline asm: it didn't
> register this is asm goto.
> 
> A clobber is one way, yes (those *are* allowed in asm goto).  Another
> way is to not actually change that register: move the original value
> back into there at the end of the asm!  (That isn't always easy to do,
> it depends on your code).  So something like
> 
>	long start = ...;
>	long tmp = start;
>	asm("stuff that modifies %0; ...; mr %0,%1" : : "r"(tmp), "r"(start));
> 
> is just fine: %0 isn't actually modified at all, as far as GCC is
> concerned, and this isn't lying to it!

It appears to be at the cost of adding one extra instruction on the fast-path
to restore the register to its original value. I'll leave Boqun, who authored
the original rseq-ppc code, to figure out what works best performance-wise
(when he finds time).

Thanks for the pointers!

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Segher Boessenkool @ 2020-07-09 20:46 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <1682947575.7422.1594317379612.JavaMail.zimbra@efficios.com>

On Thu, Jul 09, 2020 at 01:56:19PM -0400, Mathieu Desnoyers wrote:
> > Just to make sure I understand your recommendation. So rather than
> > hard coding r17 as the temporary registers, we could explicitly
> > declare the temporary register as a C variable, pass it as an
> > input operand to the inline asm, and then refer to it by operand
> > name in the macros using it. This way the compiler would be free
> > to perform its own register allocation.
> > 
> > If that is what you have in mind, then yes, I think it makes a
> > lot of sense.
> 
> Except that asm goto have this limitation with gcc: those cannot
> have any output operand, only inputs, clobbers and target labels.
> We cannot modify a temporary register received as input operand. So I don't
> see how to get a temporary register allocated by the compiler considering
> this limitation.

Heh, yet another reason not to obfuscate your inline asm: it didn't
register this is asm goto.

A clobber is one way, yes (those *are* allowed in asm goto).  Another
way is to not actually change that register: move the original value
back into there at the end of the asm!  (That isn't always easy to do,
it depends on your code).  So something like

	long start = ...;
	long tmp = start;
	asm("stuff that modifies %0; ...; mr %0,%1" : : "r"(tmp), "r"(start));

is just fine: %0 isn't actually modified at all, as far as GCC is
concerned, and this isn't lying to it!


Segher

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Segher Boessenkool @ 2020-07-09 20:31 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <1584179170.7410.1594316576293.JavaMail.zimbra@efficios.com>

On Thu, Jul 09, 2020 at 01:42:56PM -0400, Mathieu Desnoyers wrote:
> > That works fine then, for a testcase.  Using r17 is not a great idea for
> > performance (it increases the active register footprint, and causes more
> > registers to be saved in the prologue of the functions, esp. on older
> > compilers), and it is easier to just let the compiler choose a good
> > register to use.  But maybe you want to see r17 in the generated
> > testcases, as eyecatcher or something, dunno :-)
> 
> Just to make sure I understand your recommendation. So rather than
> hard coding r17 as the temporary registers, we could explicitly
> declare the temporary register as a C variable, pass it as an
> input operand to the inline asm, and then refer to it by operand
> name in the macros using it. This way the compiler would be free
> to perform its own register allocation.
> 
> If that is what you have in mind, then yes, I think it makes a
> lot of sense.

You write to it as well, so an inout register ("+r" or such).  And yes,
you use a local var for it (like "long tmp;").  And then you can refer
to it like anything else in your asm, like "%3" or like
"%[a_long_name]"; and the compiler sees it as any other register,
exactly.


Segher

^ permalink raw reply

* Re: [PATCH 11/20] Documentation: leds/ledtrig-transient: eliminate duplicated word
From: Jacek Anaszewski @ 2020-07-09 20:01 UTC (permalink / raw)
  To: Randy Dunlap, linux-kernel
  Cc: kvm, linux-doc, David Airlie, kgdb-bugreport, linux-fpga,
	Liviu Dudau, dri-devel, linux-mips, Paul Cercueil, keyrings,
	Paul Mackerras, linux-i2c, Pavel Machek, Srinivas Pandruvada,
	Mihail Atanassov, linux-leds, linux-s390, Daniel Thompson,
	linux-scsi, Jonathan Corbet, Masahiro Yamada, Matthew Wilcox,
	Halil Pasic, Jarkko Sakkinen, James Wang, linux-input,
	Mali DP Maintainers, Derek Kiernan, Dragan Cvetic, Wu Hao,
	Tony Krowiak, linux-kbuild, James E.J. Bottomley, Jiri Kosina,
	Hannes Reinecke, linux-block, Thomas Bogendoerfer, Dan Murphy,
	linux-mm, Dan Williams, Andrew Morton, Mimi Zohar, Jens Axboe,
	Michal Marek, Martin K. Petersen, Pierre Morel, Douglas Anderson,
	Wolfram Sang, Daniel Vetter, Jason Wessel, Paolo Bonzini,
	linux-integrity, linuxppc-dev, Mike Rapoport
In-Reply-To: <20200707180414.10467-12-rdunlap@infradead.org>

On 7/7/20 8:04 PM, Randy Dunlap wrote:
> Drop the doubled word "for".
> 
> Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
> Cc: Jonathan Corbet <corbet@lwn.net>
> Cc: linux-doc@vger.kernel.org
> Cc: Jacek Anaszewski <jacek.anaszewski@gmail.com>
> Cc: Pavel Machek <pavel@ucw.cz>
> Cc: Dan Murphy <dmurphy@ti.com>
> Cc: linux-leds@vger.kernel.org
> ---
>   Documentation/leds/ledtrig-transient.rst |    2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> --- linux-next-20200701.orig/Documentation/leds/ledtrig-transient.rst
> +++ linux-next-20200701/Documentation/leds/ledtrig-transient.rst
> @@ -157,7 +157,7 @@ repeat the following step as needed::
>   	echo 1 > activate - start timer = duration to run once
>   	echo none > trigger
>   
> -This trigger is intended to be used for for the following example use cases:
> +This trigger is intended to be used for the following example use cases:
>   
>    - Control of vibrate (phones, tablets etc.) hardware by user space app.
>    - Use of LED by user space app as activity indicator.
> 

Acked-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>

-- 
Best regards,
Jacek Anaszewski

^ permalink raw reply

* Re: /sys/kernel/debug/kmemleak empty despite kmemleak reports
From: Catalin Marinas @ 2020-07-09 17:57 UTC (permalink / raw)
  To: Paul Menzel; +Cc: linuxppc-dev
In-Reply-To: <070dd6b7-1ee6-8090-8973-1eb0240f6948@molgen.mpg.de>

On Thu, Jul 09, 2020 at 04:37:10PM +0200, Paul Menzel wrote:
> Despite Linux 5.8-rc4 reporting memory leaks on the IBM POWER 8 S822LC, the
> file does not contain more information.
> 
> > $ dmesg
> > […]
> > [48662.953323] perf: interrupt took too long (2570 > 2500), lowering kernel.perf_event_max_sample_rate to 77750
> > [48854.810636] perf: interrupt took too long (3216 > 3212), lowering kernel.perf_event_max_sample_rate to 62000
> > [52300.044518] perf: interrupt took too long (4244 > 4020), lowering kernel.perf_event_max_sample_rate to 47000
> > [52751.373083] perf: interrupt took too long (5373 > 5305), lowering kernel.perf_event_max_sample_rate to 37000
> > [53354.000363] perf: interrupt took too long (6793 > 6716), lowering kernel.perf_event_max_sample_rate to 29250
> > [53850.215606] perf: interrupt took too long (8672 > 8491), lowering kernel.perf_event_max_sample_rate to 23000
> > [57542.266099] perf: interrupt took too long (10940 > 10840), lowering kernel.perf_event_max_sample_rate to 18250
> > [57559.645404] perf: interrupt took too long (13714 > 13675), lowering kernel.perf_event_max_sample_rate to 14500
> > [61608.697728] Can't find PMC that caused IRQ
> > [71774.463111] kmemleak: 12 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > [92372.044785] process '@/usr/bin/gnatmake-5' started with executable stack
> > [92849.380672] FS-Cache: Loaded
> > [92849.417269] FS-Cache: Netfs 'nfs' registered for caching
> > [92849.595974] NFS: Registering the id_resolver key type
> > [92849.596000] Key type id_resolver registered
> > [92849.596000] Key type id_legacy registered
> > [101808.079143] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > [106904.323471] Can't find PMC that caused IRQ
> > [129416.391456] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > [158171.604221] kmemleak: 34 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > $ sudo cat /sys/kernel/debug/kmemleak

When they are no longer present, they are most likely false positives.
Was this triggered during boot? Or under some workload?

-- 
Catalin

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Mathieu Desnoyers @ 2020-07-09 17:56 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <1584179170.7410.1594316576293.JavaMail.zimbra@efficios.com>

----- On Jul 9, 2020, at 1:42 PM, Mathieu Desnoyers mathieu.desnoyers@efficios.com wrote:

> ----- On Jul 9, 2020, at 1:37 PM, Segher Boessenkool segher@kernel.crashing.org
> wrote:
> 
>> On Thu, Jul 09, 2020 at 09:43:47AM -0400, Mathieu Desnoyers wrote:
>>> > What protects r17 *after* this asm statement?
>>> 
>>> As discussed in the other leg of the thread (with the code example),
>>> r17 is in the clobber list of all asm statements using this macro, and
>>> is used as a temporary register within each inline asm.
>> 
>> That works fine then, for a testcase.  Using r17 is not a great idea for
>> performance (it increases the active register footprint, and causes more
>> registers to be saved in the prologue of the functions, esp. on older
>> compilers), and it is easier to just let the compiler choose a good
>> register to use.  But maybe you want to see r17 in the generated
>> testcases, as eyecatcher or something, dunno :-)
> 
> Just to make sure I understand your recommendation. So rather than
> hard coding r17 as the temporary registers, we could explicitly
> declare the temporary register as a C variable, pass it as an
> input operand to the inline asm, and then refer to it by operand
> name in the macros using it. This way the compiler would be free
> to perform its own register allocation.
> 
> If that is what you have in mind, then yes, I think it makes a
> lot of sense.

Except that asm goto have this limitation with gcc: those cannot
have any output operand, only inputs, clobbers and target labels.
We cannot modify a temporary register received as input operand. So I don't
see how to get a temporary register allocated by the compiler considering
this limitation.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* [Bug 208197] OF: /pci@f2000000/mac-io@17/gpio@50/...: could not find phandle
From: bugzilla-daemon @ 2020-07-09 17:50 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-208197-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=208197

--- Comment #9 from Erhard F. (erhard_f@mailbox.org) ---
(In reply to Michael Ellerman from comment #7)
> I couldn't really make sense of your bisect log, it doesn't have any
> good/bad commits in it.
> 
> Can you attach the output of "git bisect log".
Yea sorry, my fault... I thought e.g. with "git bisect bad | tee -a
~/bisect.log" bisect.log generated via tee would have the same output as with
"git bisect log" but I was wrong. 

Please find the correct one attached now. I had to restart the bisect as I
had already reset the original one.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* [Bug 208197] OF: /pci@f2000000/mac-io@17/gpio@50/...: could not find phandle
From: bugzilla-daemon @ 2020-07-09 17:44 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-208197-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=208197

Erhard F. (erhard_f@mailbox.org) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
 Attachment #290097|0                           |1
        is obsolete|                            |

--- Comment #8 from Erhard F. (erhard_f@mailbox.org) ---
Created attachment 290191
  --> https://bugzilla.kernel.org/attachment.cgi?id=290191&action=edit
bisect.log

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Mathieu Desnoyers @ 2020-07-09 17:42 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <20200709173712.GL3598@gate.crashing.org>

----- On Jul 9, 2020, at 1:37 PM, Segher Boessenkool segher@kernel.crashing.org wrote:

> On Thu, Jul 09, 2020 at 09:43:47AM -0400, Mathieu Desnoyers wrote:
>> > What protects r17 *after* this asm statement?
>> 
>> As discussed in the other leg of the thread (with the code example),
>> r17 is in the clobber list of all asm statements using this macro, and
>> is used as a temporary register within each inline asm.
> 
> That works fine then, for a testcase.  Using r17 is not a great idea for
> performance (it increases the active register footprint, and causes more
> registers to be saved in the prologue of the functions, esp. on older
> compilers), and it is easier to just let the compiler choose a good
> register to use.  But maybe you want to see r17 in the generated
> testcases, as eyecatcher or something, dunno :-)

Just to make sure I understand your recommendation. So rather than
hard coding r17 as the temporary registers, we could explicitly
declare the temporary register as a C variable, pass it as an
input operand to the inline asm, and then refer to it by operand
name in the macros using it. This way the compiler would be free
to perform its own register allocation.

If that is what you have in mind, then yes, I think it makes a
lot of sense.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Segher Boessenkool @ 2020-07-09 17:37 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <1769596686.6365.1594302227962.JavaMail.zimbra@efficios.com>

On Thu, Jul 09, 2020 at 09:43:47AM -0400, Mathieu Desnoyers wrote:
> > What protects r17 *after* this asm statement?
> 
> As discussed in the other leg of the thread (with the code example),
> r17 is in the clobber list of all asm statements using this macro, and
> is used as a temporary register within each inline asm.

That works fine then, for a testcase.  Using r17 is not a great idea for
performance (it increases the active register footprint, and causes more
registers to be saved in the prologue of the functions, esp. on older
compilers), and it is easier to just let the compiler choose a good
register to use.  But maybe you want to see r17 in the generated
testcases, as eyecatcher or something, dunno :-)

Segher

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Segher Boessenkool @ 2020-07-09 17:31 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <429958629.6348.1594301598584.JavaMail.zimbra@efficios.com>

On Thu, Jul 09, 2020 at 09:33:18AM -0400, Mathieu Desnoyers wrote:
> > The way this all uses r17 will likely not work reliably.
> 
> r17 is only used as a temporary register within the inline assembler, and it is
> in the clobber list. In which scenario would it not work reliably ?

This isn't clear at all, that is the problem.

> > The way multiple asm statements are used seems to have missing
> > dependencies between the statements.
> 
> I'm not sure I follow here. Note that we are injecting the CPP macros into
> a single inline asm statement as strings.

Yeah...  more trickiness.

> > And don't macro-mess this, you want to be able to debug it, and you need
> > other people to be able to read it!
> 
> I understand that looking at macros can be cumbersome from the perspective
> of a reviewer only interested in a single architecture,

No, from the perspective of *any* reviewer.

> However, from my perspective, as a maintainer who must maintain similar code
> for x86 32/64, powerpc 32/64, arm, aarch64, s390, s390x, mips 32/64, and likely
> other architectures in the future, the macros abstracting 32-bit and 64-bit
> allow to eliminate code duplication for each architecture with 32-bit and 64-bit
> variants, which is better for maintainability.

IMNSHO it is MUCH better to just have simple separate implementations
for each.  They differ in *all* details.

Or have static inline functions, with proper dependencies, instead of
nasty text macros.

But it's your code, do what you want :-)


Segher

^ permalink raw reply

* [PATCH v2] powerpc/pseries: Avoid using addr_to_pfn in realmode
From: Ganesh Goudar @ 2020-07-09 16:39 UTC (permalink / raw)
  To: mpe, linuxppc-dev; +Cc: mahesh, Ganesh Goudar, npiggin, aneesh.kumar

When a UE or memory error exception is encountered, the MCE handler
tries to find the pfn using addr_to_pfn(), which takes an effective
address as an argument. The pfn is later used to poison the page where
the memory error occurred. A recent rework in this area made
addr_to_pfn() run in realmode, which can be fatal as it may try to
access memory outside the RMO region.

To fix this use addr_to_pfn after switching to virtual mode.

Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
---
V2: Leave bare metal code and save_mce_event as is.
---
 arch/powerpc/platforms/pseries/ras.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index f3736fcd98fc..def875815e92 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -610,16 +610,8 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 		if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED)
 			eaddr = be64_to_cpu(mce_log->effective_address);
 
-		if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
+		if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED)
 			paddr = be64_to_cpu(mce_log->logical_address);
-		} else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
-			unsigned long pfn;
-
-			pfn = addr_to_pfn(regs, eaddr);
-			if (pfn != ULONG_MAX)
-				paddr = pfn << PAGE_SHIFT;
-		}
-
 		break;
 	case MC_ERROR_TYPE_SLB:
 		mce_err.error_type = MCE_ERROR_TYPE_SLB;
@@ -725,6 +717,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 	 *       SLB multihit is done by now.
 	 */
 	mtmsr(mfmsr() | MSR_IR | MSR_DR);
+
+	/* Use addr_to_pfn after switching to virtual mode */
+	if (!paddr && error_type == MC_ERROR_TYPE_UE &&
+	    mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
+		unsigned long pfn;
+
+		pfn = addr_to_pfn(regs, eaddr);
+		if (pfn != ULONG_MAX)
+			paddr = pfn << PAGE_SHIFT;
+	}
 	save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
 			&mce_err, regs->nip, eaddr, paddr);
 
-- 
2.17.2


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox