LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH] powerpc/kup: Mark the kuap/kuep function non __init
From: Christophe Leroy @ 2020-12-14  7:22 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev, mpe
In-Reply-To: <20201214071306.346399-1-aneesh.kumar@linux.ibm.com>



Le 14/12/2020 à 08:13, Aneesh Kumar K.V a écrit :
> The kernel calls these functions on cpu online and hence they should
> not be marked __init.

This is PPC64 only.

See commit 
https://github.com/linuxppc/linux/commit/67d53f30e23ec66aa7bbdd1592d5e64d46876190#diff-9799ddc8e77e666295031a560afc2a754d2f5fa0ddfb96335495b26a07511ad4

Christophe

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>   arch/powerpc/mm/book3s32/mmu.c   | 4 ++--
>   arch/powerpc/mm/book3s64/pkeys.c | 4 ++--
>   arch/powerpc/mm/init-common.c    | 2 +-
>   arch/powerpc/mm/nohash/8xx.c     | 4 ++--
>   4 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
> index 23f60e97196e..8d9e90a99b51 100644
> --- a/arch/powerpc/mm/book3s32/mmu.c
> +++ b/arch/powerpc/mm/book3s32/mmu.c
> @@ -449,7 +449,7 @@ void __init print_system_hash_info(void)
>   }
>   
>   #ifdef CONFIG_PPC_KUEP
> -void __init setup_kuep(bool disabled)
> +void setup_kuep(bool disabled)
>   {
>   	pr_info("Activating Kernel Userspace Execution Prevention\n");
>   
> @@ -459,7 +459,7 @@ void __init setup_kuep(bool disabled)
>   #endif
>   
>   #ifdef CONFIG_PPC_KUAP
> -void __init setup_kuap(bool disabled)
> +void setup_kuap(bool disabled)
>   {
>   	pr_info("Activating Kernel Userspace Access Protection\n");
>   
> diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
> index 2b7ded396db4..f1c6f264ed91 100644
> --- a/arch/powerpc/mm/book3s64/pkeys.c
> +++ b/arch/powerpc/mm/book3s64/pkeys.c
> @@ -251,7 +251,7 @@ void __init pkey_early_init_devtree(void)
>   }
>   
>   #ifdef CONFIG_PPC_KUEP
> -void __init setup_kuep(bool disabled)
> +void setup_kuep(bool disabled)
>   {
>   	if (disabled)
>   		return;
> @@ -277,7 +277,7 @@ void __init setup_kuep(bool disabled)
>   #endif
>   
>   #ifdef CONFIG_PPC_KUAP
> -void __init setup_kuap(bool disabled)
> +void setup_kuap(bool disabled)
>   {
>   	if (disabled)
>   		return;
> diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
> index afdebb95bcae..c71af3978496 100644
> --- a/arch/powerpc/mm/init-common.c
> +++ b/arch/powerpc/mm/init-common.c
> @@ -47,7 +47,7 @@ static int __init parse_nosmap(char *p)
>   }
>   early_param("nosmap", parse_nosmap);
>   
> -void __ref setup_kup(void)
> +void setup_kup(void)
>   {
>   	setup_kuep(disable_kuep);
>   	setup_kuap(disable_kuap);
> diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
> index 231ca95f9ffb..9fba29b95b5a 100644
> --- a/arch/powerpc/mm/nohash/8xx.c
> +++ b/arch/powerpc/mm/nohash/8xx.c
> @@ -245,7 +245,7 @@ void set_context(unsigned long id, pgd_t *pgd)
>   }
>   
>   #ifdef CONFIG_PPC_KUEP
> -void __init setup_kuep(bool disabled)
> +void setup_kuep(bool disabled)
>   {
>   	if (disabled)
>   		return;
> @@ -257,7 +257,7 @@ void __init setup_kuep(bool disabled)
>   #endif
>   
>   #ifdef CONFIG_PPC_KUAP
> -void __init setup_kuap(bool disabled)
> +void setup_kuap(bool disabled)
>   {
>   	pr_info("Activating Kernel Userspace Access Protection\n");
>   
> 

^ permalink raw reply

* [PATCH] powerpc/kup: Mark the kuap/kuep function non __init
From: Aneesh Kumar K.V @ 2020-12-14  7:13 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V

The kernel calls these functions on cpu online and hence they should
not be marked __init.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/book3s32/mmu.c   | 4 ++--
 arch/powerpc/mm/book3s64/pkeys.c | 4 ++--
 arch/powerpc/mm/init-common.c    | 2 +-
 arch/powerpc/mm/nohash/8xx.c     | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 23f60e97196e..8d9e90a99b51 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -449,7 +449,7 @@ void __init print_system_hash_info(void)
 }
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
 	pr_info("Activating Kernel Userspace Execution Prevention\n");
 
@@ -459,7 +459,7 @@ void __init setup_kuep(bool disabled)
 #endif
 
 #ifdef CONFIG_PPC_KUAP
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
 {
 	pr_info("Activating Kernel Userspace Access Protection\n");
 
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 2b7ded396db4..f1c6f264ed91 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -251,7 +251,7 @@ void __init pkey_early_init_devtree(void)
 }
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
 	if (disabled)
 		return;
@@ -277,7 +277,7 @@ void __init setup_kuep(bool disabled)
 #endif
 
 #ifdef CONFIG_PPC_KUAP
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
 {
 	if (disabled)
 		return;
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index afdebb95bcae..c71af3978496 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -47,7 +47,7 @@ static int __init parse_nosmap(char *p)
 }
 early_param("nosmap", parse_nosmap);
 
-void __ref setup_kup(void)
+void setup_kup(void)
 {
 	setup_kuep(disable_kuep);
 	setup_kuap(disable_kuap);
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 231ca95f9ffb..9fba29b95b5a 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -245,7 +245,7 @@ void set_context(unsigned long id, pgd_t *pgd)
 }
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
 	if (disabled)
 		return;
@@ -257,7 +257,7 @@ void __init setup_kuep(bool disabled)
 #endif
 
 #ifdef CONFIG_PPC_KUAP
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
 {
 	pr_info("Activating Kernel Userspace Access Protection\n");
 
-- 
2.28.0


^ permalink raw reply related

* Re: [PATCH v2 3/5] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Randy Dunlap @ 2020-12-14  7:04 UTC (permalink / raw)
  To: Nicholas Piggin, linux-kernel
  Cc: linux-arch, linux-mm, linuxppc-dev, Andy Lutomirski
In-Reply-To: <20201214065312.270062-4-npiggin@gmail.com>

On 12/13/20 10:53 PM, Nicholas Piggin wrote:
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 84faaba66364..e69c974369cc 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -443,9 +443,22 @@ config MMU_LAZY_TLB
>  config MMU_LAZY_TLB_REFCOUNT
>  	def_bool y
>  	depends on MMU_LAZY_TLB
> +	depends on !MMU_LAZY_TLB_SHOOTDOWN
>  	help
> -	  This must be enabled if MMU_LAZY_TLB is enabled until the next
> -	  patch.
> +	  This refcounts the mm that is used as the lazy TLB mm when switching
> +	  switching to a kernel thread.

duplicate "switching".

> +
> +config MMU_LAZY_TLB_SHOOTDOWN
> +	bool
> +	depends on MMU_LAZY_TLB
> +	help
> +	  Instead of refcounting the "lazy tlb" mm struct, which can cause
> +	  contention with multi-threaded apps on large multiprocessor systems,
> +	  this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
> +	  switch to init_mm if they were using the to-be-freed mm as the lazy
> +	  tlb. To implement this, architectures must use _lazy_tlb variants of
> +	  mm refcounting, and mm_cpumask must include at least all possible
> +	  CPUs in which mm might be lazy.
>  
>  config ARCH_HAVE_NMI_SAFE_CMPXCHG
>  	bool


-- 


^ permalink raw reply

* [PATCH v2 5/5] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>

On a 16-socket 192-core POWER8 system, a context switching benchmark
with as many software threads as CPUs (so each switch will go in and
out of idle), upstream can achieve a rate of about 1 million context
switches per second. After this patch it goes up to 118 million.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5181872f9452..356138bdb5bb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -232,6 +232,7 @@ config PPC
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE
 	select MMU_GATHER_PAGE_SIZE
+	select MMU_LAZY_TLB_SHOOTDOWN		if PPC_BOOK3S_64
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
 	select HAVE_SYSCALL_TRACEPOINTS
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 4/5] powerpc: use lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>

Use _lazy_tlb functions for lazy mm refcounting in powerpc, to prepare
to move to MMU_LAZY_TLB_SHOOTDOWN.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c2857cbd960..93c0eaa6f4bf 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1395,7 +1395,7 @@ void start_secondary(void *unused)
 {
 	unsigned int cpu = raw_smp_processor_id();
 
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	current->active_mm = &init_mm;
 
 	smp_store_cpu_info(cpu);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 3/5] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>

On big systems, the mm refcount can become highly contended when doing
a lot of context switching with threaded applications (particularly
switching between the idle thread and an application thread).

Abandoning lazy tlb slows switching down quite a bit in the important
user->idle->user cases, so instead implement a non-refcounted scheme
that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
any remaining lazy ones.

Shootdown IPIs are some concern, but they have not been observed to be
a big problem with this scheme (the powerpc implementation generated
314 additional interrupts on a 144 CPU system during a kernel compile).
There are a number of strategies that could be employed to reduce IPIs
if they turn out to be a problem for some workload.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig  | 17 +++++++++++++++--
 kernel/fork.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 84faaba66364..e69c974369cc 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -443,9 +443,22 @@ config MMU_LAZY_TLB
 config MMU_LAZY_TLB_REFCOUNT
 	def_bool y
 	depends on MMU_LAZY_TLB
+	depends on !MMU_LAZY_TLB_SHOOTDOWN
 	help
-	  This must be enabled if MMU_LAZY_TLB is enabled until the next
-	  patch.
+	  This refcounts the mm that is used as the lazy TLB mm when switching
+	  switching to a kernel thread.
+
+config MMU_LAZY_TLB_SHOOTDOWN
+	bool
+	depends on MMU_LAZY_TLB
+	help
+	  Instead of refcounting the "lazy tlb" mm struct, which can cause
+	  contention with multi-threaded apps on large multiprocessor systems,
+	  this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
+	  switch to init_mm if they were using the to-be-freed mm as the lazy
+	  tlb. To implement this, architectures must use _lazy_tlb variants of
+	  mm refcounting, and mm_cpumask must include at least all possible
+	  CPUs in which mm might be lazy.
 
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d266388d380..74b972d2d8a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -669,6 +669,53 @@ static void check_mm(struct mm_struct *mm)
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
+static void do_shoot_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		WARN_ON_ONCE(current->mm);
+		current->active_mm = &init_mm;
+		switch_mm(mm, &init_mm, current);
+	}
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+		/*
+		 * IPI overheads have not been found to be expensive, but they could
+		 * be reduced in a number of possible ways, for example (in
+		 * roughly increasing order of complexity):
+		 * - A batch of mms requiring IPIs could be gathered and freed
+		 *   at once.
+		 * - CPUs could store their active mm somewhere that can be
+		 *   remotely checked without a lock, to filter out
+		 *   false-positives in the cpumask.
+		 * - After mm_users or mm_count reaches zero, switching away
+		 *   from the mm could clear mm_cpumask to reduce some IPIs
+		 *   (some batching or delaying would help).
+		 * - A delayed freeing and RCU-like quiescing sequence based on
+		 *   mm switching to avoid IPIs completely.
+		 */
+		on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+		if (IS_ENABLED(CONFIG_DEBUG_VM))
+			on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+	} else {
+		/*
+		 * In this case, lazy tlb mms are refcounted and would not reach
+		 * __mmdrop until all CPUs have switched away and mmdrop()ed.
+		 */
+	}
+}
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -678,7 +725,12 @@ void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
+
+	/* Ensure no CPUs are using this as their lazy tlb mm */
+	shoot_lazy_tlbs(mm);
+
 	WARN_ON_ONCE(mm == current->active_mm);
+
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_subscriptions_destroy(mm);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 2/5] lazy tlb: allow lazy tlb mm switching to be configurable
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>

Add CONFIG_MMU_LAZY_TLB which can be configured out to disable
the lazy tlb mechanism entirely, and switches to init_mm when
switching to a kernel thread.

NOMMU systems could easily go without this and save a bit of code
and the refcount atomics, because their mm switch is a no-op. They
have not been switched over by default because the arch code needs
to be audited and tested for lazy tlb mm refcounting and converted
to _lazy_tlb refcounting if necessary.

CONFIG_MMU_LAZY_TLB_REFCOUNT is also added, but it must always
be enabled if CONFIG_MMU_LAZY_TLB is enabled until the next patch
which provides an alternate scheme.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig             | 17 +++++++++
 include/linux/sched/mm.h | 13 +++++--
 kernel/sched/core.c      | 75 ++++++++++++++++++++++++++++++----------
 kernel/sched/sched.h     |  4 ++-
 4 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index ba4e966484ab..84faaba66364 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -430,6 +430,23 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	  irqs disabled over activate_mm. Architectures that do IPI based TLB
 	  shootdowns should enable this.
 
+# Should make this depend on MMU, because there is little use for lazy mm switching
+# with NOMMU. Must audit NOMMU architecture code for lazy mm refcounting first.
+config MMU_LAZY_TLB
+	def_bool y
+	help
+	  Enable "lazy TLB" mmu context switching for kernel threads.
+	  If this is disabled then switching to a kernel thread always
+	  switches to init_mm. If mm switches are inexpensive or free
+	  (in the case of NOMMU) then this could be disabled.
+
+config MMU_LAZY_TLB_REFCOUNT
+	def_bool y
+	depends on MMU_LAZY_TLB
+	help
+	  This must be enabled if MMU_LAZY_TLB is enabled until the next
+	  patch.
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 94a117160083..5edf8e942c84 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -52,12 +52,21 @@ static inline void mmdrop(struct mm_struct *mm)
 /* Helpers for lazy TLB mm refcounting */
 static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 {
-	mmgrab(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+		mmgrab(mm);
 }
 
 static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 {
-	mmdrop(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+		mmdrop(mm);
+	} else {
+		/*
+		 * mmdrop_lazy_tlb must provide a full memory barrier, see the
+		 * membarrier comment in finish_task_switch(), which relies on this.
+		 */
+		smp_mb();
+	}
 }
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c2f8ea43d29b..9c1dc9406e4b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3579,7 +3579,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
+	struct mm_struct *mm = NULL;
 	long prev_state;
 
 	/*
@@ -3598,7 +3598,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		      current->comm, current->pid, preempt_count()))
 		preempt_count_set(FORK_PREEMPT_COUNT);
 
-	rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	mm = rq->prev_lazy_mm;
+	rq->prev_lazy_mm = NULL;
+#endif
 
 	/*
 	 * A task struct has one reference for the use as "current".
@@ -3722,22 +3725,10 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	calculate_sigpending();
 }
 
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static __always_inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
-	       struct task_struct *next, struct rq_flags *rf)
+static __always_inline void
+context_switch_mm(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
 {
-	prepare_task_switch(rq, prev, next);
-
-	/*
-	 * For paravirt, this is coupled with an exit in switch_to to
-	 * combine the page table reload and the switch backend into
-	 * one hypercall.
-	 */
-	arch_start_context_switch(prev);
-
 	/*
 	 * kernel -> kernel   lazy + transfer active
 	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
@@ -3766,11 +3757,57 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
-			rq->prev_mm = prev->active_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+			/* Will mmdrop_lazy_tlb() in finish_task_switch(). */
+			rq->prev_lazy_mm = prev->active_mm;
 			prev->active_mm = NULL;
+#else
+			/*
+			 * Without MMU_LAZY_TLB_REFCOUNT there is no lazy
+			 * tracking (because no rq->prev_lazy_mm) in
+			 * finish_task_switch, so no mmdrop_lazy_tlb(),
+			 * so no memory barrier for membarrier (see the
+			 * membarrier comment in finish_task_switch()).
+			 * Do it here.
+			 */
+			smp_mb();
+#endif
 		}
 	}
+}
+
+static __always_inline void
+context_switch_mm_nolazy(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
+{
+	if (!next->mm)
+		next->active_mm = &init_mm;
+	membarrier_switch_mm(rq, prev->active_mm, next->active_mm);
+	switch_mm_irqs_off(prev->active_mm, next->active_mm, next);
+	if (!prev->mm)
+		prev->active_mm = NULL;
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next, struct rq_flags *rf)
+{
+	prepare_task_switch(rq, prev, next);
+
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
+		context_switch_mm(rq, prev, next);
+	else
+		context_switch_mm_nolazy(rq, prev, next);
 
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df80bfcea92e..3b72aec5a2f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -950,7 +950,9 @@ struct rq {
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
-	struct mm_struct	*prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	struct mm_struct	*prev_lazy_mm;
+#endif
 
 	unsigned int		clock_update_flags;
 	u64			clock;
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 1/5] lazy tlb: introduce lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>

Add explicit _lazy_tlb annotated functions for lazy mm refcounting.
This makes things a bit more explicit, and allows explicit refcounting
to be removed if it is not used.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 ++--
 fs/exec.c                            |  4 ++--
 include/linux/sched/mm.h             | 11 +++++++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/kthread.c                     | 11 +++++++----
 kernel/sched/core.c                  | 15 ++++++++-------
 8 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 827b50f1c73e..1b4a41aad793 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,7 +253,7 @@ static int ecard_init_mm(void)
 	current->mm = mm;
 	current->active_mm = mm;
 	activate_mm(active_mm, mm);
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 	ecard_init_pgtables(mm);
 	return 0;
 }
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b487b489d4b6..74708aef333e 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -658,10 +658,10 @@ static void do_exit_flush_lazy_tlb(void *arg)
 	if (current->active_mm == mm) {
 		WARN_ON_ONCE(current->mm != NULL);
 		/* Is a kernel thread and is using mm as the lazy tlb */
-		mmgrab(&init_mm);
+		mmgrab_lazy_tlb(&init_mm);
 		current->active_mm = &init_mm;
 		switch_mm_irqs_off(mm, &init_mm, current);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
 
 	atomic_dec(&mm->context.active_cpus);
diff --git a/fs/exec.c b/fs/exec.c
index 547a2390baf5..56fc23dcbe4d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1028,9 +1028,9 @@ static int exec_mmap(struct mm_struct *mm)
 		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
 		mm_update_next_owner(old_mm);
 		mmput(old_mm);
-		return 0;
+	} else {
+		mmdrop_lazy_tlb(active_mm);
 	}
-	mmdrop(active_mm);
 	return 0;
 }
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index d5ece7a9a403..94a117160083 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+	mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+	mmdrop(mm);
+}
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2b8d7a5db383..a54cdfa08d71 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -576,7 +576,7 @@ static int finish_cpu(unsigned int cpu)
 	 */
 	if (mm != &init_mm)
 		idle->active_mm = &init_mm;
-	mmdrop(mm);
+	mmdrop_lazy_tlb(mm);
 	return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 1f236ed375f8..3711a74fcf4a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -474,7 +474,7 @@ static void exit_mm(void)
 		__set_current_state(TASK_RUNNING);
 		mmap_read_lock(mm);
 	}
-	mmgrab(mm);
+	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(current);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 933a625621b8..da189e0d26ed 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1240,14 +1240,14 @@ void kthread_use_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
 	WARN_ON_ONCE(tsk->mm);
 
+	mmgrab(mm);
+
 	task_lock(tsk);
 	/* Hold off tlb flush IPIs while switching mm's */
 	local_irq_disable();
 	active_mm = tsk->active_mm;
-	if (active_mm != mm) {
-		mmgrab(mm);
+	if (active_mm != mm)
 		tsk->active_mm = mm;
-	}
 	tsk->mm = mm;
 	switch_mm_irqs_off(active_mm, mm, tsk);
 	local_irq_enable();
@@ -1257,7 +1257,7 @@ void kthread_use_mm(struct mm_struct *mm)
 #endif
 
 	if (active_mm != mm)
-		mmdrop(active_mm);
+		mmdrop_lazy_tlb(active_mm);
 
 	to_kthread(tsk)->oldfs = force_uaccess_begin();
 }
@@ -1280,10 +1280,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	sync_mm_rss(mm);
 	local_irq_disable();
 	tsk->mm = NULL;
+	mmgrab_lazy_tlb(mm);
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
 	local_irq_enable();
 	task_unlock(tsk);
+
+	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(kthread_unuse_mm);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7e453492cff..c2f8ea43d29b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3629,13 +3629,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * rq->curr, before returning to userspace, so provide them here:
 	 *
 	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
-	 *   provided by mmdrop(),
+	 *   provided by mmdrop_lazy_tlb(),
 	 * - a sync_core for SYNC_CORE.
 	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
+
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
@@ -3739,9 +3740,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	/*
 	 * kernel -> kernel   lazy + transfer active
-	 *   user -> kernel   lazy + mmgrab() active
+	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
 	 *
-	 * kernel ->   user   switch + mmdrop() active
+	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
 	 *   user ->   user   switch
 	 */
 	if (!next->mm) {                                // to kernel
@@ -3749,7 +3750,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 		next->active_mm = prev->active_mm;
 		if (prev->mm)                           // from user
-			mmgrab(prev->active_mm);
+			mmgrab_lazy_tlb(prev->active_mm);
 		else
 			prev->active_mm = NULL;
 	} else {                                        // to user
@@ -3765,7 +3766,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop() in finish_task_switch(). */
+			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
 		}
@@ -7206,7 +7207,7 @@ void __init sched_init(void)
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	enter_lazy_tlb(&init_mm, current);
 
 	/*
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 0/5] shoot lazy tlbs
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev

This is another rebase, on top of mainline now (don't need the
asm-generic tree), and without any x86 or membarrier changes.
This makes the series far smaller and more manageable and
without the controversial bits.

Thanks,
Nick

Nicholas Piggin (5):
  lazy tlb: introduce lazy mm refcount helper functions
  lazy tlb: allow lazy tlb mm switching to be configurable
  lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  powerpc: use lazy mm refcount helper functions
  powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN

 arch/Kconfig                         | 30 ++++++++++
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/Kconfig                 |  1 +
 arch/powerpc/kernel/smp.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 +-
 fs/exec.c                            |  4 +-
 include/linux/sched/mm.h             | 20 +++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/fork.c                        | 52 ++++++++++++++++
 kernel/kthread.c                     | 11 ++--
 kernel/sched/core.c                  | 88 ++++++++++++++++++++--------
 kernel/sched/sched.h                 |  4 +-
 13 files changed, 184 insertions(+), 38 deletions(-)

-- 
2.23.0


^ permalink raw reply

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-12-14  5:53 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <1607918323.6muyu2l982.astroid@bobo.none>

Excerpts from Nicholas Piggin's message of December 14, 2020 2:07 pm:
> Excerpts from Andy Lutomirski's message of December 11, 2020 10:11 am:
>>> On Dec 5, 2020, at 7:59 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>>
>> 
>>> I'm still going to pursue shoot-lazies for the merge window. As you
>>> see it's about a dozen lines and a if (IS_ENABLED(... in core code.
>>> Your change is common code, but a significant complexity (which
>>> affects all archs) so needs a lot more review and testing at this
>>> point.
>> 
>> I don't think it's ready for this merge window.
> 
> Yes next one I meant (aka this one for development perspective :)).
> 
>> I read the early
>> patches again, and I think they make the membarrier code worse, not
>> better.
> 
> Mathieu and I disagree, so we are at an impasse.

Well actually not really, I went and cut out the exit_lazy_tlb stuff
from the patch series, those are better to be untangled anyway. I think 
an earlier version had something in exit_lazy_tlb for the mm refcounting 
change but it's not required now anyway.

I'll split them out and just work on the shoot lazies series for now, I
might revisit exit_lazy_tlb after the dust settles from that and the
current membarrier changes. I'll test and repost shortly.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/64s: Trim offlined CPUs from mm_cpumasks
From: Nicholas Piggin @ 2020-12-14  4:15 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Peter Zijlstra, Aneesh Kumar K.V, Linux Kernel Mailing List,
	Anton Vorontsov, Thomas Gleixner, linuxppc-dev
In-Reply-To: <CAMuHMdUdorW03=mipgm92SXNPBZO5owW1Wp6_SacRDZ7fOe9gw@mail.gmail.com>

Excerpts from Geert Uytterhoeven's message of December 10, 2020 7:06 pm:
> Hi Nicholas,
> 
> On Fri, Nov 20, 2020 at 4:01 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> When offlining a CPU, powerpc/64s does not flush TLBs, rather it just
>> leaves the CPU set in mm_cpumasks, so it continues to receive TLBIEs
>> to manage its TLBs.
>>
>> However the exit_flush_lazy_tlbs() function expects that after
>> returning, all CPUs (except self) have flushed TLBs for that mm, in
>> which case TLBIEL can be used for this flush. This breaks for offline
>> CPUs because they don't get the IPI to flush their TLB. This can lead
>> to stale translations.
>>
>> Fix this by clearing the CPU from mm_cpumasks, then flushing all TLBs
>> before going offline.
>>
>> These offlined CPU bits stuck in the cpumask also prevents the cpumask
>> from being trimmed back to local mode, which means continual broadcast
>> IPIs or TLBIEs are needed for TLB flushing. This patch prevents that
>> situation too.
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> 
> Thanks for your patch!
> 
>> --- a/arch/powerpc/platforms/powermac/smp.c
>> +++ b/arch/powerpc/platforms/powermac/smp.c
>> @@ -911,6 +911,8 @@ static int smp_core99_cpu_disable(void)
>>
>>         mpic_cpu_set_priority(0xf);
>>
>> +       cleanup_cpu_mmu_context();
>> +
> 
> I guess this change broke pmac32_defconfig+SMP in v5.10-rc7?
> 
> arch/powerpc/platforms/powermac/smp.c: error: implicit
> declaration of function 'cleanup_cpu_mmu_context'
> [-Werror=implicit-function-declaration]:  => 914:2
> 
> http://kisskb.ellerman.id.au/kisskb/buildresult/14423174/

Hey, yeah it does thanks for catching it. This patch fixes it for me

---
From a9b5ec92ffac975e81c6d7db6ff2b1486b2723f7 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 14 Dec 2020 13:52:39 +1000
Subject: [PATCH] powerpc/32s: Fix cleanup_cpu_mmu_context() compile bug

32s has no tlbiel_all() defined, so just disable the cleanup with a
comment.

Fixes: 01b0f0eae081 ("powerpc/64s: Trim offlined CPUs from mm_cpumasks")
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/platforms/powermac/smp.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index adae2a6712e1..66ef5f8f4445 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -911,7 +911,16 @@ static int smp_core99_cpu_disable(void)
 
 	mpic_cpu_set_priority(0xf);
 
+	/*
+	 * Would be nice for consistency if all platforms clear mm_cpumask and
+	 * flush TLBs on unplug, but the TLB invalidation bug described in
+	 * commit 01b0f0eae081 ("powerpc/64s: Trim offlined CPUs from
+	 * mm_cpumasks") only applies to 64s and for now we only have the TLB
+	 * flush code for that platform.
+	 */
+#ifdef CONFIG_PPC64
 	cleanup_cpu_mmu_context();
+#endif
 
 	return 0;
 }
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-12-14  4:07 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <CALCETrV5BzXuUYm5YAoEKPZZPfLrbHckvwBHzWKrxZS8hqzHEg@mail.gmail.com>

Excerpts from Andy Lutomirski's message of December 11, 2020 10:11 am:
>> On Dec 5, 2020, at 7:59 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>
> 
>> I'm still going to pursue shoot-lazies for the merge window. As you
>> see it's about a dozen lines and a if (IS_ENABLED(... in core code.
>> Your change is common code, but a significant complexity (which
>> affects all archs) so needs a lot more review and testing at this
>> point.
> 
> I don't think it's ready for this merge window.

Yes next one I meant (aka this one for development perspective :)).

> I read the early
> patches again, and I think they make the membarrier code worse, not
> better.

Mathieu and I disagree, so we are at an impasse. I addressed your 
comment about not being able to do the additional core sync avoidance 
from the exit tlb call (you can indeed do so in your arch code) and 
about exit_lazy_tlb being a call into the scheduler (it's not) and
about the arch code not being able to reconcile lazy tlb mm with the
core scheduler code (you can).

I fundamentally think the core sync is an issue with what the membarrier
/ arch specifics are doing with lazy tlb mm switching, and not something
the core scheduler needs to know about at all. I don't see the big
problem with essentially moving it from an explicit call to 
exit_lazy_tlb (which from scheduler POV describes better what it is 
doing, not how).

> I'm not fundamentally opposed to the shoot-lazies concept,
> but it needs more thought and it needs a cleaner foundation.

Well shoot lazies actually doesn't really rely on that membarrier
change at all, it just came as a nice looking cleanup so that part
can be dropped from the series. It's not really foundational.

Thanks,
Nick

^ permalink raw reply

* [powerpc:next] BUILD SUCCESS WITH WARNING dddc4ef92d1ce92987da1d6926cdfa99e8acb622
From: kernel test robot @ 2020-12-14  2:54 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  next
branch HEAD: dddc4ef92d1ce92987da1d6926cdfa99e8acb622  KVM: PPC: Book3S HV: XIVE: Add a comment regarding VP numbering

Warning reports:

https://lore.kernel.org/linuxppc-dev/202012042220.zO7hSFT2-lkp@intel.com

Warning in current branch:

arch/powerpc/kernel/vdso32/vgettimeofday.c:13:5: warning: no previous prototype for function '__c_kernel_clock_gettime64' [-Wmissing-prototypes]

Warning ids grouped by kconfigs:

clang_recent_errors
`-- powerpc64-randconfig-r025-20201213
    `-- arch-powerpc-kernel-vdso32-vgettimeofday.c:warning:no-previous-prototype-for-function-__c_kernel_clock_gettime64

elapsed time: 1767m

configs tested: 138
configs skipped: 48

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
nios2                            alldefconfig
sh                          polaris_defconfig
m68k                        mvme16x_defconfig
openrisc                            defconfig
mips                        qi_lb60_defconfig
arm                           tegra_defconfig
mips                           ip27_defconfig
s390                             alldefconfig
mips                       bmips_be_defconfig
arm                          exynos_defconfig
arm                          collie_defconfig
arc                          axs101_defconfig
sh                          urquell_defconfig
sh                        dreamcast_defconfig
mips                          rm200_defconfig
sh                               j2_defconfig
powerpc                     pseries_defconfig
mips                        bcm63xx_defconfig
mips                        nlm_xlp_defconfig
arm                        shmobile_defconfig
powerpc                 linkstation_defconfig
sh                        apsh4ad0a_defconfig
arc                           tb10x_defconfig
riscv                             allnoconfig
arm                        multi_v5_defconfig
sh                                  defconfig
powerpc                      pcm030_defconfig
m68k                          atari_defconfig
mips                      loongson3_defconfig
arm                           efm32_defconfig
arm                      footbridge_defconfig
powerpc                      ppc6xx_defconfig
powerpc                    amigaone_defconfig
powerpc                       ebony_defconfig
x86_64                              defconfig
alpha                            allyesconfig
mips                      maltasmvp_defconfig
mips                malta_kvm_guest_defconfig
c6x                        evmc6678_defconfig
powerpc                     rainier_defconfig
powerpc                          g5_defconfig
mips                            e55_defconfig
powerpc                      pmac32_defconfig
sh                          lboxre2_defconfig
sh                           se7206_defconfig
i386                             alldefconfig
xtensa                  audio_kc705_defconfig
arc                 nsimosci_hs_smp_defconfig
powerpc                 mpc832x_mds_defconfig
powerpc                   currituck_defconfig
arm                          pxa910_defconfig
arm                          imote2_defconfig
h8300                       h8s-sim_defconfig
microblaze                      mmu_defconfig
sh                           se7724_defconfig
m68k                         apollo_defconfig
mips                       capcella_defconfig
powerpc                mpc7448_hpc2_defconfig
m68k                          hp300_defconfig
powerpc                    klondike_defconfig
xtensa                    smp_lx200_defconfig
sparc64                             defconfig
ia64                         bigsur_defconfig
parisc                generic-32bit_defconfig
csky                                defconfig
arm                       multi_v4t_defconfig
sh                         ecovec24_defconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
c6x                              allyesconfig
nds32                               defconfig
nios2                            allyesconfig
alpha                               defconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
parisc                           allyesconfig
s390                                defconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                               tinyconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a003-20201213
x86_64               randconfig-a006-20201213
x86_64               randconfig-a002-20201213
x86_64               randconfig-a005-20201213
x86_64               randconfig-a004-20201213
x86_64               randconfig-a001-20201213
i386                 randconfig-a001-20201213
i386                 randconfig-a004-20201213
i386                 randconfig-a003-20201213
i386                 randconfig-a002-20201213
i386                 randconfig-a005-20201213
i386                 randconfig-a006-20201213
i386                 randconfig-a014-20201213
i386                 randconfig-a013-20201213
i386                 randconfig-a012-20201213
i386                 randconfig-a011-20201213
i386                 randconfig-a016-20201213
i386                 randconfig-a015-20201213
riscv                    nommu_k210_defconfig
riscv                            allyesconfig
riscv                    nommu_virt_defconfig
riscv                               defconfig
riscv                          rv32_defconfig
riscv                            allmodconfig
x86_64                                   rhel
x86_64                           allyesconfig
x86_64                    rhel-7.6-kselftests
x86_64                               rhel-8.3
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-a016-20201213
x86_64               randconfig-a012-20201213
x86_64               randconfig-a013-20201213
x86_64               randconfig-a015-20201213
x86_64               randconfig-a014-20201213
x86_64               randconfig-a011-20201213

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* Re: [PATCH v12 00/31] Speculative page faults
From: Joel Fernandes @ 2020-12-14  2:03 UTC (permalink / raw)
  To: Chinwen Chang
  Cc: jack, sergey.senozhatsky.work, peterz, Will Deacon, mhocko,
	linux-mm, paulus, Punit Agrawal, hpa, Michel Lespinasse,
	Alexei Starovoitov, Andrea Arcangeli, ak, Minchan Kim,
	aneesh.kumar, x86, Matthew Wilcox, Daniel Jordan, Ingo Molnar,
	zhong jiang, David Rientjes, paulmck, npiggin, sj38.park,
	Jerome Glisse, dave, kemi.wang, kirill, Thomas Gleixner,
	Laurent Dufour, Haiyan Song, Ganesh Mahendran, Yang Shi,
	Mike Rapoport, linuxppc-dev, linux-kernel, Sergey Senozhatsky,
	miles.chen, vinayak menon, akpm, Tim Chen, haren
In-Reply-To: <1594099897.30360.58.camel@mtkswgap22>

On Tue, Jul 07, 2020 at 01:31:37PM +0800, Chinwen Chang wrote:
[..]
> > > Hi Laurent,
> > > 
> > > We merged SPF v11 and some patches from v12 into our platforms. After
> > > several experiments, we observed SPF has obvious improvements on the
> > > launch time of applications, especially for those high-TLP ones,
> > > 
> > > # launch time of applications(s):
> > > 
> > > package           version      w/ SPF      w/o SPF      improve(%)
> > > ------------------------------------------------------------------
> > > Baidu maps        10.13.3      0.887       0.98         9.49
> > > Taobao            8.4.0.35     1.227       1.293        5.10
> > > Meituan           9.12.401     1.107       1.543        28.26
> > > WeChat            7.0.3        2.353       2.68         12.20
> > > Honor of Kings    1.43.1.6     6.63        6.713        1.24
> > 
> > That's great news, thanks for reporting this!
> > 
> > > 
> > > By the way, we have verified our platforms with those patches and
> > > achieved the goal of mass production.
> > 
> > Another good news!
> > For my information, what is your targeted hardware?
> > 
> > Cheers,
> > Laurent.
> 
> Hi Laurent,
> 
> Our targeted hardware belongs to ARM64 multi-core series.

Hello!

I was trying to develop an intuition about why SPF gives an improvement for
you on small CPU systems. This is just a high-level theory but:

1. Assume the improvement is because of elimination of "blocking" on
mmap_sem.
Could it be that the mmap_sem is acquired in write-mode unnecessarily in some
places, thus causing blocking on mmap_sem in other paths? If so, is it
feasible to convert such usages to acquiring them in read-mode?

2. Assume the improvement is because of lesser read-side contention on
mmap_sem.
On small CPU systems, I would not expect reducing cache-line bouncing to give
such a dramatic improvement in performance as you are seeing.

Thanks for any insight on this!

- Joel


^ permalink raw reply

* [PATCH] powerpc/ps3: use dma_mapping_error()
From: Vincent Stehlé @ 2020-12-13 18:26 UTC (permalink / raw)
  To: linuxppc-dev, linux-kernel
  Cc: Geoff Levand, Geert Uytterhoeven, Vincent Stehlé

The DMA address returned by dma_map_single() should be checked with
dma_mapping_error(). Fix the ps3stor_setup() function accordingly.

Fixes: 80071802cb9c ("[POWERPC] PS3: Storage Driver Core")
Signed-off-by: Vincent Stehlé <vincent.stehle@laposte.net>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
---
 drivers/ps3/ps3stor_lib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ps3/ps3stor_lib.c b/drivers/ps3/ps3stor_lib.c
index 333ba83006e48..a12a1ad9b5fe3 100644
--- a/drivers/ps3/ps3stor_lib.c
+++ b/drivers/ps3/ps3stor_lib.c
@@ -189,7 +189,7 @@ int ps3stor_setup(struct ps3_storage_device *dev, irq_handler_t handler)
 	dev->bounce_lpar = ps3_mm_phys_to_lpar(__pa(dev->bounce_buf));
 	dev->bounce_dma = dma_map_single(&dev->sbd.core, dev->bounce_buf,
 					 dev->bounce_size, DMA_BIDIRECTIONAL);
-	if (!dev->bounce_dma) {
+	if (dma_mapping_error(&dev->sbd.core, dev->bounce_dma)) {
 		dev_err(&dev->sbd.core, "%s:%u: map DMA region failed\n",
 			__func__, __LINE__);
 		error = -ENODEV;
-- 
2.29.2


^ permalink raw reply related

* [PATCH] powerpc: fix alignment bug within the init sections
From: Ariel Marcovitch @ 2020-12-13 18:35 UTC (permalink / raw)
  To: mpe; +Cc: paulus, linuxppc-dev, Ariel Marcovitch, linux-kernel

This is a bug that can cause early crashes in configurations with a
.exit.text section smaller than a page and a .init.text section that
ends in the beginning of a physical page (this is kinda random, which
might explain why this wasn't really encountered before).

The init sections are ordered like this:
	.init.text
	.exit.text
	.init.data

Currently, these sections aren't page aligned.

Because the init code is mapped read-only at runtime and because the
.init.text section can potentially reside on the same physical page as
.init.data, the beginning of .init.data might be mapped read-only along
with .init.text.

Then when the kernel tries to modify a variable in .init.data (like
kthreadd_done, used in kernel_init()) the kernel panics.

To avoid this, I made these sections page aligned.

Fixes: 060ef9d89d18 ("powerpc32: PAGE_EXEC required for inittext")
Signed-off-by: Ariel Marcovitch <ariel.marcovitch@gmail.com>
---
 arch/powerpc/kernel/vmlinux.lds.S | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 326e113d2e45..e3a7c90c03f4 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -179,6 +179,11 @@ SECTIONS
 #endif
 	} :text
 
+	/* .init.text is made RO and .exit.text is not, so we must
+	 * ensure these sections reside in separate physical pages.
+	 */
+	. = ALIGN(PAGE_SIZE);
+
 	/* .exit.text is discarded at runtime, not link time,
 	 * to deal with references from __bug_table
 	 */
@@ -186,6 +191,8 @@ SECTIONS
 		EXIT_TEXT
 	}
 
+	. = ALIGN(PAGE_SIZE);
+
 	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
 		INIT_DATA
 	}

base-commit: 1398820fee515873379809a6415930ad0764b2f6
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH] powerpc/ps3: use dma_mapping_error()
From: Geert Uytterhoeven @ 2020-12-13 19:39 UTC (permalink / raw)
  To: Vincent Stehlé
  Cc: Geoff Levand, Geert Uytterhoeven, linuxppc-dev,
	Linux Kernel Mailing List
In-Reply-To: <20201213182622.23047-1-vincent.stehle@laposte.net>

On Sun, Dec 13, 2020 at 8:06 PM Vincent Stehlé
<vincent.stehle@laposte.net> wrote:
> The DMA address returned by dma_map_single() should be checked with
> dma_mapping_error(). Fix the ps3stor_setup() function accordingly.
>
> Fixes: 80071802cb9c ("[POWERPC] PS3: Storage Driver Core")
> Signed-off-by: Vincent Stehlé <vincent.stehle@laposte.net>

Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH] powerpc/ps3: use dma_mapping_error()
From: Geoff Levand @ 2020-12-13 19:38 UTC (permalink / raw)
  To: Vincent Stehlé, linuxppc-dev, linux-kernel; +Cc: Geert Uytterhoeven
In-Reply-To: <20201213182622.23047-1-vincent.stehle@laposte.net>

On 12/13/20 10:26 AM, Vincent Stehlé wrote:
> The DMA address returned by dma_map_single() should be checked with
> dma_mapping_error(). Fix the ps3stor_setup() function accordingly.
> 
> Fixes: 80071802cb9c ("[POWERPC] PS3: Storage Driver Core")
> Signed-off-by: Vincent Stehlé <vincent.stehle@laposte.net>
> Cc: Geoff Levand <geoff@infradead.org>
> Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
> ---
>  drivers/ps3/ps3stor_lib.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Looks good.  Thanks for submitting.

Acked-by: Geoff Levand <geoff@infradead.org>

^ permalink raw reply

* Re: [PATCH AUTOSEL 5.9 27/39] sched/idle: Fix arch_cpu_idle() vs tracing
From: Sasha Levin @ 2020-12-13 14:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mark Rutland, uclinux-h8-devel, linux-ia64, linux-parisc,
	linux-s390, linux-hexagon, Heiko Carstens, linux-sh, linux-um,
	linux-kernel, stable, linux-mips, openrisc, linux-csky,
	Sven Schnelle, linux-alpha, sparclinux, linux-riscv, linuxppc-dev,
	linux-arm-kernel
In-Reply-To: <20201203171015.GN2414@hirez.programming.kicks-ass.net>

On Thu, Dec 03, 2020 at 06:10:15PM +0100, Peter Zijlstra wrote:
>On Thu, Dec 03, 2020 at 03:54:42PM +0100, Heiko Carstens wrote:
>> On Thu, Dec 03, 2020 at 08:28:21AM -0500, Sasha Levin wrote:
>> > From: Peter Zijlstra <peterz@infradead.org>
>> >
>> > [ Upstream commit 58c644ba512cfbc2e39b758dd979edd1d6d00e27 ]
>> >
>> > We call arch_cpu_idle() with RCU disabled, but then use
>> > local_irq_{en,dis}able(), which invokes tracing, which relies on RCU.
>> >
>> > Switch all arch_cpu_idle() implementations to use
>> > raw_local_irq_{en,dis}able() and carefully manage the
>> > lockdep,rcu,tracing state like we do in entry.
>> >
>> > (XXX: we really should change arch_cpu_idle() to not return with
>> > interrupts enabled)
>> >
>> > Reported-by: Sven Schnelle <svens@linux.ibm.com>
>> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> > Reviewed-by: Mark Rutland <mark.rutland@arm.com>
>> > Tested-by: Mark Rutland <mark.rutland@arm.com>
>> > Link: https://lkml.kernel.org/r/20201120114925.594122626@infradead.org
>> > Signed-off-by: Sasha Levin <sashal@kernel.org>
>>
>> This patch broke s390 irq state tracing. A patch to fix this is
>> scheduled to be merged upstream today (hopefully).
>> Therefore I think this patch should not yet go into 5.9 stable.
>
>Agreed.

I'll also grab b1cae1f84a0f ("s390: fix irq state tracing"). Thanks!

-- 
Thanks,
Sasha

^ permalink raw reply

* [PATCH 2/3] kbuild: LD_VERSION redenomination
From: Masahiro Yamada @ 2020-12-12 16:54 UTC (permalink / raw)
  To: linux-kbuild
  Cc: Thomas Bogendoerfer, Dominique Martinet, linuxppc-dev,
	linux-kernel, Jiaxun Yang, linux-mips, Paul Mackerras,
	Catalin Marinas, Huacai Chen, Will Deacon, Masahiro Yamada,
	linux-arm-kernel
In-Reply-To: <20201212165431.150750-1-masahiroy@kernel.org>

Commit ccbef1674a15 ("Kbuild, lto: add ld-version and ld-ifversion
macros") introduced scripts/ld-version.sh for GCC LTO.

At that time, this script handled 5 version fields because GCC LTO
needed the downstream binutils. (https://lkml.org/lkml/2014/4/8/272)

The code snippet from the submitted patch was as follows:

    # We need HJ Lu's Linux binutils because mainline binutils does not
    # support mixing assembler and LTO code in the same ld -r object.
    # XXX check if the gcc plugin ld is the expected one too
    # XXX some Fedora binutils should also support it. How to check for that?
    ifeq ($(call ld-ifversion,-ge,22710001,y),y)
        ...

However, GCC LTO was not merged into the mainline after all.
(https://lkml.org/lkml/2014/4/8/272)

So, the 4th and 5th fields were never used, and finally removed by
commit 0d61ed17dd30 ("ld-version: Drop the 4th and 5th version
components").

Since then, the last 4 digits returned by this script have always been zeros.

Remove the meaningless last 4 digits. This makes the version format
consistent with GCC_VERSION, CLANG_VERSION, LLD_VERSION.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---

 arch/arm64/Kconfig            | 2 +-
 arch/mips/loongson64/Platform | 2 +-
 arch/mips/vdso/Kconfig        | 2 +-
 arch/powerpc/Makefile         | 2 +-
 arch/powerpc/lib/Makefile     | 2 +-
 scripts/ld-version.sh         | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a6b5b7ef40ae..69d56b21a6ec 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1499,7 +1499,7 @@ config ARM64_PTR_AUTH
 	depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC
 	# Modern compilers insert a .note.gnu.property section note for PAC
 	# which is only understood by binutils starting with version 2.33.1.
-	depends on LD_IS_LLD || LD_VERSION >= 233010000 || (CC_IS_GCC && GCC_VERSION < 90100)
+	depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100)
 	depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE
 	depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
 	help
diff --git a/arch/mips/loongson64/Platform b/arch/mips/loongson64/Platform
index ec42c5085905..cc0b9c87f9ad 100644
--- a/arch/mips/loongson64/Platform
+++ b/arch/mips/loongson64/Platform
@@ -35,7 +35,7 @@ cflags-$(CONFIG_CPU_LOONGSON64)	+= $(call as-option,-Wa$(comma)-mno-fix-loongson
 # can't easily be used safely within the kbuild framework.
 #
 ifeq ($(call cc-ifversion, -ge, 0409, y), y)
-  ifeq ($(call ld-ifversion, -ge, 225000000, y), y)
+  ifeq ($(call ld-ifversion, -ge, 22500, y), y)
     cflags-$(CONFIG_CPU_LOONGSON64)  += \
       $(call cc-option,-march=loongson3a -U_MIPS_ISA -D_MIPS_ISA=_MIPS_ISA_MIPS64)
   else
diff --git a/arch/mips/vdso/Kconfig b/arch/mips/vdso/Kconfig
index 7aec721398d5..a665f6108cb5 100644
--- a/arch/mips/vdso/Kconfig
+++ b/arch/mips/vdso/Kconfig
@@ -12,7 +12,7 @@
 # the lack of relocations. As such, we disable the VDSO for microMIPS builds.
 
 config MIPS_LD_CAN_LINK_VDSO
-	def_bool LD_VERSION >= 225000000 || LD_IS_LLD
+	def_bool LD_VERSION >= 22500 || LD_IS_LLD
 
 config MIPS_DISABLE_VDSO
 	def_bool CPU_MICROMIPS || (!CPU_MIPSR6 && !MIPS_LD_CAN_LINK_VDSO)
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 5c8c06215dd4..6a9a852c3d56 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -65,7 +65,7 @@ UTS_MACHINE := $(subst $(space),,$(machine-y))
 ifdef CONFIG_PPC32
 KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
 else
-ifeq ($(call ld-ifversion, -ge, 225000000, y),y)
+ifeq ($(call ld-ifversion, -ge, 22500, y),y)
 # Have the linker provide sfpr if possible.
 # There is a corresponding test in arch/powerpc/lib/Makefile
 KBUILD_LDFLAGS_MODULE += --save-restore-funcs
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 69a91b571845..d4efc182662a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -31,7 +31,7 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION)	+= error-inject.o
 # 64-bit linker creates .sfpr on demand for final link (vmlinux),
 # so it is only needed for modules, and only for older linkers which
 # do not support --save-restore-funcs
-ifeq ($(call ld-ifversion, -lt, 225000000, y),y)
+ifeq ($(call ld-ifversion, -lt, 22500, y),y)
 extra-$(CONFIG_PPC64)	+= crtsavres.o
 endif
 
diff --git a/scripts/ld-version.sh b/scripts/ld-version.sh
index f2be0ff9a738..0f8a2c0f9502 100755
--- a/scripts/ld-version.sh
+++ b/scripts/ld-version.sh
@@ -6,6 +6,6 @@
 	gsub(".*version ", "");
 	gsub("-.*", "");
 	split($1,a, ".");
-	print a[1]*100000000 + a[2]*1000000 + a[3]*10000;
+	print a[1]*10000 + a[2]*100 + a[3];
 	exit
 	}
-- 
2.27.0


^ permalink raw reply related

* [PATCH 2/2] dmaengine: fsldma: Fix a resource leak in an error handling path of the probe function
From: Christophe JAILLET @ 2020-12-12 16:06 UTC (permalink / raw)
  To: leoyang.li, zw, vkoul, dan.j.williams, iws
  Cc: dmaengine, kernel-janitors, Christophe JAILLET, linuxppc-dev,
	linux-kernel

In case of error, the previous 'fsl_dma_chan_probe()' calls must be undone
by some 'fsl_dma_chan_remove()', as already done in the remove function.

It was added in the remove function in commit 77cd62e8082b ("fsldma: allow
Freescale Elo DMA driver to be compiled as a module")

Fixes: d3f620b2c4fe ("fsldma: simplify IRQ probing and handling")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
Patch provided as-is.
I don't have the configuration to compile test this patch
---
 drivers/dma/fsldma.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 554f70a0c18c..f8459cc5315d 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -1214,6 +1214,7 @@ static int fsldma_of_probe(struct platform_device *op)
 {
 	struct fsldma_device *fdev;
 	struct device_node *child;
+	unsigned int i;
 	int err;
 
 	fdev = kzalloc(sizeof(*fdev), GFP_KERNEL);
@@ -1292,6 +1293,10 @@ static int fsldma_of_probe(struct platform_device *op)
 	return 0;
 
 out_free_fdev:
+	for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) {
+		if (fdev->chan[i])
+			fsl_dma_chan_remove(fdev->chan[i]);
+	}
 	irq_dispose_mapping(fdev->irq);
 	iounmap(fdev->regs);
 out_free:
-- 
2.27.0


^ permalink raw reply related

* [PATCH 1/2] dmaengine: fsldma: Fix a resource leak in the remove function
From: Christophe JAILLET @ 2020-12-12 16:05 UTC (permalink / raw)
  To: leoyang.li, zw, vkoul, dan.j.williams, timur
  Cc: dmaengine, kernel-janitors, Christophe JAILLET, linuxppc-dev,
	linux-kernel

A 'irq_dispose_mapping()' call is missing in the remove function.
Add it.

This is needed to undo the 'irq_of_parse_and_map()' call from the probe
function and already part of the error handling path of the probe function.

It was added in the probe function only in commit d3f620b2c4fe ("fsldma:
simplify IRQ probing and handling")

Fixes: 77cd62e8082b ("fsldma: allow Freescale Elo DMA driver to be compiled as a module")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
Patch provided as-is.
I don't have the configuration to compile test this patch
---
 drivers/dma/fsldma.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 0feb323bae1e..554f70a0c18c 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -1314,6 +1314,7 @@ static int fsldma_of_remove(struct platform_device *op)
 		if (fdev->chan[i])
 			fsl_dma_chan_remove(fdev->chan[i]);
 	}
+	irq_dispose_mapping(fdev->irq);
 
 	iounmap(fdev->regs);
 	kfree(fdev);
-- 
2.27.0


^ permalink raw reply related

* [PATCH] powerpc/vas: Fix IRQ name allocation
From: Cédric Le Goater @ 2020-12-12 14:27 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Sukadev Bhattiprolu, Haren Myneni, Cédric Le Goater

The VAS device allocates a generic interrupt to handle page faults but
the IRQ name doesn't show under /proc. This is because the name string
is built in a buffer on the stack. Allocate the name instead.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---

 I didn't understand this part in init_vas_instance() :

	if (vinst->virq) {
		rc = vas_irq_fault_window_setup(vinst);
		/*
		 * Fault window is used only for user space send windows.
		 * So if vinst->virq is NULL, tx_win_open returns -ENODEV
		 * for user space.
		 */
		if (rc)
			vinst->virq = 0;
	}

 If the IRQ cannot be requested, the device probing should fail but
 it's not today. The use of 'vinst->virq' is suspicious.

 arch/powerpc/platforms/powernv/vas.h |  1 +
 arch/powerpc/platforms/powernv/vas.c | 11 ++++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 70f793e8f6cc..c7db3190baca 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -340,6 +340,7 @@ struct vas_instance {
 	struct vas_window *rxwin[VAS_COP_TYPE_MAX];
 	struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
 
+	char *name;
 	char *dbgname;
 	struct dentry *dbgdir;
 };
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
index 598e4cd563fb..b65256a63e87 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -28,12 +28,10 @@ static DEFINE_PER_CPU(int, cpu_vas_id);
 
 static int vas_irq_fault_window_setup(struct vas_instance *vinst)
 {
-	char devname[64];
 	int rc = 0;
 
-	snprintf(devname, sizeof(devname), "vas-%d", vinst->vas_id);
 	rc = request_threaded_irq(vinst->virq, vas_fault_handler,
-				vas_fault_thread_fn, 0, devname, vinst);
+				vas_fault_thread_fn, 0, vinst->name, vinst);
 
 	if (rc) {
 		pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n",
@@ -80,6 +78,12 @@ static int init_vas_instance(struct platform_device *pdev)
 	if (!vinst)
 		return -ENOMEM;
 
+	vinst->name = kasprintf(GFP_KERNEL, "vas-%d", vasid);
+	if (!vinst->name) {
+		kfree(vinst);
+		return -ENOMEM;
+	}
+
 	INIT_LIST_HEAD(&vinst->node);
 	ida_init(&vinst->ida);
 	mutex_init(&vinst->mutex);
@@ -162,6 +166,7 @@ static int init_vas_instance(struct platform_device *pdev)
 	return 0;
 
 free_vinst:
+	kfree(vinst->name);
 	kfree(vinst);
 	return -ENODEV;
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH] powerpc/mm: Fix hugetlb_free_pmd_range() and hugetlb_free_pud_range()
From: Christophe Leroy @ 2020-12-12 13:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, qcai
  Cc: linuxppc-dev, linux-kernel

Commit 7bfe54b5f165 ("powerpc/mm: Refactor the floor/ceiling check in
hugetlb range freeing functions") inadvertently removed the mask
applied to start parameter in those two functions, leading to the
following crash on power9.

[ 7703.114640][T58070] LTP: starting hugemmap05_1 (hugemmap05 -m)
[ 7703.157792][   C99] ------------[ cut here ]------------
[ 7703.158279][   C99] kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:387!
[ 7703.158306][   C99] Oops: Exception in kernel mode, sig: 5 [#1]
[ 7703.158330][   C99] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=256 NUMA PowerNV
[ 7703.158343][   C99] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_spapr_tce vfio vfio_spapr_eeh loop kvm_hv kvm ip_tables x_tables sd_mod ahci libahci tg3 libata firmware_class libphy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: dummy_del_mod]
[ 7703.158435][   C99] CPU: 99 PID: 308 Comm: ksoftirqd/99 Tainted: G           O      5.10.0-rc7-next-20201211 #1
[ 7703.158464][   C99] NIP:  c00000000005dbec LR: c0000000003352f4 CTR: 0000000000000000
[ 7703.158489][   C99] REGS: c00020000bb6f830 TRAP: 0700   Tainted: G           O       (5.10.0-rc7-next-20201211)
[ 7703.158528][   C99] MSR:  900000000282b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 24002284  XER: 20040000
[ 7703.158570][   C99] GPR00: c0000000003352f4 c00020000bb6fad0 c000000007f70b00 c0002000385b3ff0
[ 7703.158570][   C99] GPR04: 0000000000000000 0000000000000003 c00020000bb6f8b4 0000000000000001
[ 7703.158570][   C99] GPR08: 0000000000000001 0000000000000009 0000000000000008 0000000000000002
[ 7703.158570][   C99] GPR12: 0000000024002488 c000201fff649c00 c000000007f2a20c 0000000000000000
[ 7703.158570][   C99] GPR16: 0000000000000007 0000000000000000 c000000000194d10 c000000000194d10
[ 7703.158570][   C99] GPR24: 0000000000000014 0000000000000015 c000201cc6e72398 c000000007fac4b4
[ 7703.158570][   C99] GPR28: c000000007f2bf80 c000000007fac2f8 0000000000000008 c000200033870000
[ 7703.158766][   C99] NIP [c00000000005dbec] __tlb_remove_table+0x1dc/0x1e0
pgtable_free at arch/powerpc/mm/book3s64/pgtable.c:387
(inlined by) __tlb_remove_table at arch/powerpc/mm/book3s64/pgtable.c:405
[ 7703.158805][   C99] LR [c0000000003352f4] tlb_remove_table_rcu+0x54/0xa0
[ 7703.158853][   C99] Call Trace:
[ 7703.158872][   C99] [c00020000bb6fad0] [c00000000005db4c] __tlb_remove_table+0x13c/0x1e0 (unreliable)
[ 7703.158890][   C99] [c00020000bb6fb00] [c0000000003352f4] tlb_remove_table_rcu+0x54/0xa0
__tlb_remove_table_free at mm/mmu_gather.c:101
(inlined by) tlb_remove_table_rcu at mm/mmu_gather.c:156
[ 7703.158927][   C99] [c00020000bb6fb30] [c000000000194d7c] rcu_core+0x35c/0xbb0
rcu_do_batch at kernel/rcu/tree.c:2502
(inlined by) rcu_core at kernel/rcu/tree.c:2737
[ 7703.158966][   C99] [c00020000bb6fbf0] [c00000000095a3d0] __do_softirq+0x480/0x704
[ 7703.159006][   C99] [c00020000bb6fd10] [c0000000000cc1f4] run_ksoftirqd+0x74/0xd0
run_ksoftirqd at kernel/softirq.c:651
(inlined by) run_ksoftirqd at kernel/softirq.c:642
[ 7703.159046][   C99] [c00020000bb6fd30] [c0000000001040c8] smpboot_thread_fn+0x278/0x320
[ 7703.159096][   C99] [c00020000bb6fda0] [c0000000000fc8a4] kthread+0x1c4/0x1d0
[ 7703.159145][   C99] [c00020000bb6fe10] [c00000000000d9fc] ret_from_kernel_thread+0x5c/0x80
[ 7703.159183][   C99] Instruction dump:
[ 7703.159204][   C99] 60000000 7c0802a6 3c82f8b4 7fe3fb78 38847470 f8010040 482b4fc5 60000000
[ 7703.159248][   C99] 0fe00000 7c0802a6 fbe10028 f8010040 <0fe00000> 3c4c07f1 38422f10 7c0802a6
[ 7703.159293][   C99] ---[ end trace 1d92a5231ba6a0d5 ]---

Properly apply the masks before calling pmd_free_tlb() and
pud_free_tlb() respectively.

Reported-by: Qian Cai <qcai@redhat.com>
Fixes: 7bfe54b5f165 ("powerpc/mm: Refactor the floor/ceiling check in hugetlb range freeing functions")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/mm/hugetlbpage.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f8d8a4988e15..8b3cc4d688e8 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -396,9 +396,9 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 	if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK))
 		return;
 
-	pmd = pmd_offset(pud, start);
+	pmd = pmd_offset(pud, start & PUD_MASK);
 	pud_clear(pud);
-	pmd_free_tlb(tlb, pmd, start);
+	pmd_free_tlb(tlb, pmd, start & PUD_MASK);
 	mm_dec_nr_pmds(tlb->mm);
 }
 
@@ -439,9 +439,9 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 	if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK))
 		return;
 
-	pud = pud_offset(p4d, start);
+	pud = pud_offset(p4d, start & PGDIR_MASK);
 	p4d_clear(p4d);
-	pud_free_tlb(tlb, pud, start);
+	pud_free_tlb(tlb, pud, start & PGDIR_MASK);
 	mm_dec_nr_puds(tlb->mm);
 }
 
-- 
2.25.0


^ permalink raw reply related

* Re: [powerpc/merge] System crash during cpu offline/online operation
From: Sachin Sant @ 2020-12-12 10:07 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <F4809152-C7F5-4ED8-B071-85A9115BD29D@linux.vnet.ibm.com>



> On 11-Dec-2020, at 3:47 PM, Sachin Sant <sachinp@linux.vnet.ibm.com> wrote:
> 
> I am observing system crash during a cpu offline/online operation
> with latest merge branch code running in a PowerVM LPAR (P8 onwards)
> 
> # uname -r
> 5.10.0-rc7-01792-g244569c777ca
> # ppc64_cpu --smt=1
> [  244.205194] cpu 1 (hwid 1) Ready to die…
> ………
> ……...
> [  247.015113] cpu 30 (hwid 30) Ready to die...
> [  247.104973] cpu 31 (hwid 31) Ready to die…
> # ppc64_cpu --smt=8
> 
> At this point the LPAR reboots instantly without any trace message.

Git bisect leads me to the following commit:

3b47b7549ead0719e94022c6742199333c7c8d9f is the first bad commit
commit 3b47b7549ead0719e94022c6742199333c7c8d9f
Author: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Date:   Fri Nov 27 10:14:07 2020 +0530

    powerpc/book3s64/kuap: Move KUAP related function outside radix

Here is the bisect log:

# git bisect log
git bisect start
# bad: [244569c777ca638b08c75db88fe035bdec52ef80] Automatic merge of 'next' into merge (2020-12-10 00:34)
git bisect bad 244569c777ca638b08c75db88fe035bdec52ef80
# good: [9acd775e4579bde0a6d937d72f9669e418aa87ad] Automatic merge of 'master' into merge (2020-12-05 22:54)
git bisect good 9acd775e4579bde0a6d937d72f9669e418aa87ad
# good: [ab91292cb3e9f43d9c6839d7572d17b35bc21710] Merge tag 'char-misc-5.10-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc
git bisect good ab91292cb3e9f43d9c6839d7572d17b35bc21710
# bad: [19b311ca51e108b6d8d679496af8635fdc1984a8] ocxl: Initiate a TLB invalidate command
git bisect bad 19b311ca51e108b6d8d679496af8635fdc1984a8
# bad: [d94b827e89dc3f92cd871d10f4992a6bd3c861e5] powerpc/book3s64/kuap: Use Key 3 for kernel mapping with hash translation
git bisect bad d94b827e89dc3f92cd871d10f4992a6bd3c861e5
# good: [1d15ffdfc94127d75e04a88344ee1ce8c79f05fd] KVM: PPC: Book3S HV: Ratelimit machine check messages coming from guests
git bisect good 1d15ffdfc94127d75e04a88344ee1ce8c79f05fd
# good: [9f378b9f007cc94beadea40df83cc62a76975c6f] KVM: PPC: BOOK3S: PR: Ignore UAMOR SPR
git bisect good 9f378b9f007cc94beadea40df83cc62a76975c6f
# bad: [3b47b7549ead0719e94022c6742199333c7c8d9f] powerpc/book3s64/kuap: Move KUAP related function outside radix
git bisect bad 3b47b7549ead0719e94022c6742199333c7c8d9f
# good: [39df17bc20059c84ddc6f91831fce2e2cc79a6f3] powerpc/book3s64/kuap/kuep: Move uamor setup to pkey init
git bisect good 39df17bc20059c84ddc6f91831fce2e2cc79a6f3
# first bad commit: [3b47b7549ead0719e94022c6742199333c7c8d9f] powerpc/book3s64/kuap: Move KUAP related function outside radix

Thanks
-Sachin

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox