LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 3/8] x86: remove ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20201128160141.1003903-1-npiggin@gmail.com>

Switch remaining x86-specific users to asm/sync_core.h, remove the
linux/sync_core.h header and ARCH_ option.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/x86/Kconfig                    |  1 -
 arch/x86/kernel/alternative.c       |  2 +-
 arch/x86/kernel/cpu/mce/core.c      |  2 +-
 drivers/misc/sgi-gru/grufault.c     |  2 +-
 drivers/misc/sgi-gru/gruhandles.c   |  2 +-
 drivers/misc/sgi-gru/grukservices.c |  2 +-
 include/linux/sync_core.h           | 21 ---------------------
 init/Kconfig                        |  3 ---
 8 files changed, 5 insertions(+), 30 deletions(-)
 delete mode 100644 include/linux/sync_core.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f6946b81f74a..160d3ad90507 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -80,7 +80,6 @@ config X86
 	select ARCH_HAS_SET_DIRECT_MAP
 	select ARCH_HAS_STRICT_KERNEL_RWX
 	select ARCH_HAS_STRICT_MODULE_RWX
-	select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAS_DEBUG_WX
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2400ad62f330..9a7ab08f4157 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -17,7 +17,7 @@
 #include <linux/kprobes.h>
 #include <linux/mmu_context.h>
 #include <linux/bsearch.h>
-#include <linux/sync_core.h>
+#include <asm/sync_core.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4102b866e7c0..282ea9942829 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -41,12 +41,12 @@
 #include <linux/irq_work.h>
 #include <linux/export.h>
 #include <linux/set_memory.h>
-#include <linux/sync_core.h>
 #include <linux/task_work.h>
 #include <linux/hardirq.h>
 
 #include <asm/intel-family.h>
 #include <asm/processor.h>
+#include <asm/sync_core.h>
 #include <asm/traps.h>
 #include <asm/tlbflush.h>
 #include <asm/mce.h>
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
index 723825524ea0..48fd5b101de1 100644
--- a/drivers/misc/sgi-gru/grufault.c
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -20,8 +20,8 @@
 #include <linux/io.h>
 #include <linux/uaccess.h>
 #include <linux/security.h>
-#include <linux/sync_core.h>
 #include <linux/prefetch.h>
+#include <asm/sync_core.h>
 #include "gru.h"
 #include "grutables.h"
 #include "grulib.h"
diff --git a/drivers/misc/sgi-gru/gruhandles.c b/drivers/misc/sgi-gru/gruhandles.c
index 1d75d5e540bc..c8cba1c1b00f 100644
--- a/drivers/misc/sgi-gru/gruhandles.c
+++ b/drivers/misc/sgi-gru/gruhandles.c
@@ -16,7 +16,7 @@
 #define GRU_OPERATION_TIMEOUT	(((cycles_t) local_cpu_data->itc_freq)*10)
 #define CLKS2NSEC(c)		((c) *1000000000 / local_cpu_data->itc_freq)
 #else
-#include <linux/sync_core.h>
+#include <asm/sync_core.h>
 #include <asm/tsc.h>
 #define GRU_OPERATION_TIMEOUT	((cycles_t) tsc_khz*10*1000)
 #define CLKS2NSEC(c)		((c) * 1000000 / tsc_khz)
diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c
index 0ea923fe6371..860aea9deb45 100644
--- a/drivers/misc/sgi-gru/grukservices.c
+++ b/drivers/misc/sgi-gru/grukservices.c
@@ -16,11 +16,11 @@
 #include <linux/miscdevice.h>
 #include <linux/proc_fs.h>
 #include <linux/interrupt.h>
-#include <linux/sync_core.h>
 #include <linux/uaccess.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <asm/io_apic.h>
+#include <asm/sync_core.h>
 #include "gru.h"
 #include "grulib.h"
 #include "grutables.h"
diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h
deleted file mode 100644
index 013da4b8b327..000000000000
--- a/include/linux/sync_core.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SYNC_CORE_H
-#define _LINUX_SYNC_CORE_H
-
-#ifdef CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
-#include <asm/sync_core.h>
-#else
-/*
- * This is a dummy sync_core_before_usermode() implementation that can be used
- * on all architectures which return to user-space through core serializing
- * instructions.
- * If your architecture returns to user-space through non-core-serializing
- * instructions, you need to write your own functions.
- */
-static inline void sync_core_before_usermode(void)
-{
-}
-#endif
-
-#endif /* _LINUX_SYNC_CORE_H */
-
diff --git a/init/Kconfig b/init/Kconfig
index 02d13ae27abb..82f9b5c937cb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -2334,9 +2334,6 @@ source "kernel/Kconfig.locks"
 config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	bool
 
-config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
-	bool
-
 # It may be useful for an architecture to override the definitions of the
 # SYSCALL_DEFINE() and __SYSCALL_DEFINEx() macros in <linux/syscalls.h>
 # and the COMPAT_ variants in <linux/compat.h>, in particular to use a
-- 
2.23.0


^ permalink raw reply related

* [PATCH 4/8] lazy tlb: introduce lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20201128160141.1003903-1-npiggin@gmail.com>

Add explicit _lazy_tlb annotated functions for lazy mm refcounting.
This makes things a bit more explicit, and allows explicit refcounting
to be removed if it is not used.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 ++--
 fs/exec.c                            |  2 +-
 include/linux/sched/mm.h             | 11 +++++++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/kthread.c                     | 11 +++++++----
 kernel/sched/core.c                  | 15 ++++++++-------
 8 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 43eb1bfba466..a75938702c58 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -254,7 +254,7 @@ static int ecard_init_mm(void)
 	current->active_mm = mm;
 	activate_mm(active_mm, mm);
 	exit_lazy_tlb(active_mm, current);
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 	ecard_init_pgtables(mm);
 	return 0;
 }
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index ac3fec03926a..e66606ef2a3d 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -658,11 +658,11 @@ static void do_exit_flush_lazy_tlb(void *arg)
 	if (current->active_mm == mm) {
 		WARN_ON_ONCE(current->mm != NULL);
 		/* Is a kernel thread and is using mm as the lazy tlb */
-		mmgrab(&init_mm);
+		mmgrab_lazy_tlb(&init_mm);
 		current->active_mm = &init_mm;
 		switch_mm_irqs_off(mm, &init_mm, current);
 		exit_lazy_tlb(mm, current);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
 
 	atomic_dec(&mm->context.active_cpus);
diff --git a/fs/exec.c b/fs/exec.c
index 4b4dea1bb7ba..0a1461bb62e2 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1031,7 +1031,7 @@ static int exec_mmap(struct mm_struct *mm)
 		mm_update_next_owner(old_mm);
 		mmput(old_mm);
 	} else {
-		mmdrop(active_mm);
+		mmdrop_lazy_tlb(active_mm);
 	}
 	return 0;
 }
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2c6bcdf76d99..7157c0f6fef8 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -48,6 +48,17 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+	mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+	mmdrop(mm);
+}
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 134688d79589..ff9fcbc4e76b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -578,7 +578,7 @@ static int finish_cpu(unsigned int cpu)
 	 */
 	if (mm != &init_mm)
 		idle->active_mm = &init_mm;
-	mmdrop(mm);
+	mmdrop_lazy_tlb(mm);
 	return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 1f236ed375f8..3711a74fcf4a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -474,7 +474,7 @@ static void exit_mm(void)
 		__set_current_state(TASK_RUNNING);
 		mmap_read_lock(mm);
 	}
-	mmgrab(mm);
+	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(current);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e380302aac13..f1241e19327e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1240,14 +1240,14 @@ void kthread_use_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
 	WARN_ON_ONCE(tsk->mm);
 
+	mmgrab(mm);
+
 	task_lock(tsk);
 	/* Hold off tlb flush IPIs while switching mm's */
 	local_irq_disable();
 	active_mm = tsk->active_mm;
-	if (active_mm != mm) {
-		mmgrab(mm);
+	if (active_mm != mm)
 		tsk->active_mm = mm;
-	}
 	tsk->mm = mm;
 	switch_mm_irqs_off(active_mm, mm, tsk);
 	exit_lazy_tlb(active_mm, tsk);
@@ -1258,7 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm)
 #endif
 
 	if (active_mm != mm)
-		mmdrop(active_mm);
+		mmdrop_lazy_tlb(active_mm);
 
 	to_kthread(tsk)->oldfs = force_uaccess_begin();
 }
@@ -1281,10 +1281,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	sync_mm_rss(mm);
 	local_irq_disable();
 	tsk->mm = NULL;
+	mmgrab_lazy_tlb(mm);
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
 	local_irq_enable();
 	task_unlock(tsk);
+
+	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(kthread_unuse_mm);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e4e8cebd82e2..e372b613d514 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3628,10 +3628,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * schedule between user->kernel->user threads without passing though
 	 * switch_mm(). Membarrier requires a full barrier after storing to
 	 * rq->curr, before returning to userspace, for
-	 * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by mmdrop().
+	 * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by
+	 * mmdrop_lazy_tlb().
 	 */
 	if (mm)
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
@@ -3736,9 +3737,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	/*
 	 * kernel -> kernel   lazy + transfer active
-	 *   user -> kernel   lazy + mmgrab() active
+	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
 	 *
-	 * kernel ->   user   switch + mmdrop() active
+	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
 	 *   user ->   user   switch
 	 */
 	if (!next->mm) {                                // to kernel
@@ -3746,7 +3747,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 		next->active_mm = prev->active_mm;
 		if (prev->mm)                           // from user
-			mmgrab(prev->active_mm);
+			mmgrab_lazy_tlb(prev->active_mm);
 		else
 			prev->active_mm = NULL;
 	} else {                                        // to user
@@ -3764,7 +3765,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		if (!prev->mm) {                        // from kernel
 			exit_lazy_tlb(prev->active_mm, next);
 
-			/* will mmdrop() in finish_task_switch(). */
+			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
 		}
@@ -7206,7 +7207,7 @@ void __init sched_init(void)
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	enter_lazy_tlb(&init_mm, current);
 
 	/*
-- 
2.23.0


^ permalink raw reply related

* [PATCH 5/8] lazy tlb: allow lazy tlb mm switching to be configurable
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20201128160141.1003903-1-npiggin@gmail.com>

NOMMU systems could easily go without this and save a bit of code
and the refcount atomics, because their mm switch is a no-op. I
haven't flipped them over because I haven't audited all arch code to
convert over to using the _lazy_tlb refcounting.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig             | 11 +++++++
 include/linux/sched/mm.h | 13 ++++++--
 kernel/sched/core.c      | 68 +++++++++++++++++++++++++++++-----------
 kernel/sched/sched.h     |  4 ++-
 4 files changed, 75 insertions(+), 21 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 56b6ccc0e32d..596bf589d74b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -430,6 +430,17 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	  irqs disabled over activate_mm. Architectures that do IPI based TLB
 	  shootdowns should enable this.
 
+# Should make this depend on MMU, because there is little use for lazy mm switching
+# with NOMMU. Must audit NOMMU architecture code for lazy mm refcounting first.
+config MMU_LAZY_TLB
+	def_bool y
+	help
+	  Enable "lazy TLB" mmu context switching for kernel threads.
+
+config MMU_LAZY_TLB_REFCOUNT
+	def_bool y
+	depends on MMU_LAZY_TLB
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 7157c0f6fef8..bd0f27402d4b 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -51,12 +51,21 @@ static inline void mmdrop(struct mm_struct *mm)
 /* Helpers for lazy TLB mm refcounting */
 static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 {
-	mmgrab(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+		mmgrab(mm);
 }
 
 static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 {
-	mmdrop(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+		mmdrop(mm);
+	} else {
+		/*
+		 * mmdrop_lazy_tlb must provide a full memory barrier, see the
+		 * membarrier comment in finish_task_switch().
+		 */
+		smp_mb();
+	}
 }
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e372b613d514..3b79c6cc3a37 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3579,7 +3579,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
+	struct mm_struct *mm = NULL;
 	long prev_state;
 
 	/*
@@ -3598,7 +3598,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		      current->comm, current->pid, preempt_count()))
 		preempt_count_set(FORK_PREEMPT_COUNT);
 
-	rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	mm = rq->prev_lazy_mm;
+	rq->prev_lazy_mm = NULL;
+#endif
 
 	/*
 	 * A task struct has one reference for the use as "current".
@@ -3630,6 +3633,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * rq->curr, before returning to userspace, for
 	 * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by
 	 * mmdrop_lazy_tlb().
+	 *
+	 * This same issue applies to other places that mmdrop_lazy_tlb().
 	 */
 	if (mm)
 		mmdrop_lazy_tlb(mm);
@@ -3719,22 +3724,10 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	calculate_sigpending();
 }
 
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static __always_inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
-	       struct task_struct *next, struct rq_flags *rf)
+static __always_inline void
+context_switch_mm(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
 {
-	prepare_task_switch(rq, prev, next);
-
-	/*
-	 * For paravirt, this is coupled with an exit in switch_to to
-	 * combine the page table reload and the switch backend into
-	 * one hypercall.
-	 */
-	arch_start_context_switch(prev);
-
 	/*
 	 * kernel -> kernel   lazy + transfer active
 	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
@@ -3765,11 +3758,50 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		if (!prev->mm) {                        // from kernel
 			exit_lazy_tlb(prev->active_mm, next);
 
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
 			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
-			rq->prev_mm = prev->active_mm;
+			rq->prev_lazy_mm = prev->active_mm;
 			prev->active_mm = NULL;
+#else
+			/* See membarrier comment in finish_task_switch(). */
+			smp_mb();
+#endif
 		}
 	}
+}
+
+static __always_inline void
+context_switch_mm_nolazy(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
+{
+	if (!next->mm)
+		next->active_mm = &init_mm;
+	membarrier_switch_mm(rq, prev->active_mm, next->active_mm);
+	switch_mm_irqs_off(prev->active_mm, next->active_mm, next);
+	if (!prev->mm)
+		prev->active_mm = NULL;
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next, struct rq_flags *rf)
+{
+	prepare_task_switch(rq, prev, next);
+
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
+		context_switch_mm(rq, prev, next);
+	else
+		context_switch_mm_nolazy(rq, prev, next);
 
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df80bfcea92e..3b72aec5a2f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -950,7 +950,9 @@ struct rq {
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
-	struct mm_struct	*prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	struct mm_struct	*prev_lazy_mm;
+#endif
 
 	unsigned int		clock_update_flags;
 	u64			clock;
-- 
2.23.0


^ permalink raw reply related

* [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20201128160141.1003903-1-npiggin@gmail.com>

On big systems, the mm refcount can become highly contended when doing
a lot of context switching with threaded applications (particularly
switching between the idle thread and an application thread).

Abandoning lazy tlb slows switching down quite a bit in the important
user->idle->user cases, so instead implement a non-refcounted scheme
that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
any remaining lazy ones.

Shootdown IPIs are some concern, but they have not been observed to be
a big problem with this scheme (the powerpc implementation generated
314 additional interrupts on a 144 CPU system during a kernel compile).
There are a number of strategies that could be employed to reduce IPIs
if they turn out to be a problem for some workload.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig  | 13 +++++++++++++
 kernel/fork.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index 596bf589d74b..540e43aeefa4 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -440,6 +440,19 @@ config MMU_LAZY_TLB
 config MMU_LAZY_TLB_REFCOUNT
 	def_bool y
 	depends on MMU_LAZY_TLB
+	depends on !MMU_LAZY_TLB_SHOOTDOWN
+
+config MMU_LAZY_TLB_SHOOTDOWN
+	bool
+	depends on MMU_LAZY_TLB
+	help
+	  Instead of refcounting the "lazy tlb" mm struct, which can cause
+	  contention with multi-threaded apps on large multiprocessor systems,
+	  this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
+	  switch to init_mm if they were using the to-be-freed mm as the lazy
+	  tlb. To implement this, architectures must use _lazy_tlb variants of
+	  mm refcounting, and mm_cpumask must include at least all possible
+	  CPUs in which mm might be lazy.
 
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d266388d380..e47312c2b48b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -669,6 +669,54 @@ static void check_mm(struct mm_struct *mm)
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
+static void do_shoot_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		WARN_ON_ONCE(current->mm);
+		current->active_mm = &init_mm;
+		switch_mm(mm, &init_mm, current);
+		exit_lazy_tlb(mm, current);
+	}
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+		/*
+		 * IPI overheads have not been found to be expensive, but they
+		 * could be reduced in a number of possible ways, for example
+		 * (in roughly increasing order of complexity):
+		 * - A batch of mms requiring IPIs could be gathered and freed
+		 *   at once.
+		 * - CPUs could store their active mm somewhere that can be
+		 *   remotely checked without a lock, to filter out
+		 *   false-positives in the cpumask.
+		 * - After mm_users or mm_count reaches zero, switching away
+		 *   from the mm could clear mm_cpumask to reduce some IPIs
+		 *   (some batching or delaying would help).
+		 * - A delayed freeing and RCU-like quiescing sequence based on
+		 *   mm switching to avoid IPIs completely.
+		 */
+		on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+		if (IS_ENABLED(CONFIG_DEBUG_VM))
+			on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+	} else {
+		/*
+		 * In this case, lazy tlb mms are refcounted and would not reach
+		 * __mmdrop until all CPUs have switched away and mmdrop()ed.
+		 */
+	}
+}
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -678,7 +726,12 @@ void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
+
+	/* Ensure no CPUs are using this as their lazy tlb mm */
+	shoot_lazy_tlbs(mm);
+
 	WARN_ON_ONCE(mm == current->active_mm);
+
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_subscriptions_destroy(mm);
-- 
2.23.0


^ permalink raw reply related

* [PATCH 7/8] powerpc: use lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20201128160141.1003903-1-npiggin@gmail.com>

Use _lazy_tlb functions for lazy mm refcounting in powerpc, to prepare
to move to MMU_LAZY_TLB_SHOOTDOWN.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c2857cbd960..93c0eaa6f4bf 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1395,7 +1395,7 @@ void start_secondary(void *unused)
 {
 	unsigned int cpu = raw_smp_processor_id();
 
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	current->active_mm = &init_mm;
 
 	smp_store_cpu_info(cpu);
-- 
2.23.0


^ permalink raw reply related

* [PATCH 8/8] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN
From: Nicholas Piggin @ 2020-11-28 16:01 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, Nicholas Piggin,
	linux-mm, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20201128160141.1003903-1-npiggin@gmail.com>

On a 16-socket 192-core POWER8 system, a context switching benchmark
with as many software threads as CPUs (so each switch will go in and
out of idle), upstream can achieve a rate of about 1 million context
switches per second. After this patch it goes up to 118 million.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e9f13fe08492..d4793c0229d2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -231,6 +231,7 @@ config PPC
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE
 	select MMU_GATHER_PAGE_SIZE
+	select MMU_LAZY_TLB_SHOOTDOWN		if PPC_BOOK3S_64
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
 	select HAVE_SYSCALL_TRACEPOINTS
-- 
2.23.0


^ permalink raw reply related

* [Bug 204789] Boot failure with more than 256G of memory on Power9 with 4K pages & Hash MMU
From: bugzilla-daemon @ 2020-11-28 16:48 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-204789-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=204789

Cameron (cam@neo-zeon.de) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|ASSIGNED                    |RESOLVED
         Resolution|---                         |CODE_FIX

--- Comment #13 from Cameron (cam@neo-zeon.de) ---
This was resolved some time back by Aneesh and the patches made into mainline a
long time ago. Marking resolved.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [PATCH v8 11/12] mm/vmalloc: Hugepage vmalloc mappings
From: kernel test robot @ 2020-11-28 17:07 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm, Andrew Morton
  Cc: linux-arch, kbuild-all, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Linux Memory Management List, Zefan Li,
	Jonathan Cameron, linuxppc-dev
In-Reply-To: <20201128152559.999540-12-npiggin@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1741 bytes --]

Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on arm64/for-next/core linus/master v5.10-rc5]
[cannot apply to hnaz-linux-mm/master next-20201127]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/huge-vmalloc-mappings/20201128-232946
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: microblaze-randconfig-r035-20201128 (attached as .config)
compiler: microblaze-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/95e5da88c21d305af971ed4f00112f0576c2b94f
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Nicholas-Piggin/huge-vmalloc-mappings/20201128-232946
        git checkout 95e5da88c21d305af971ed4f00112f0576c2b94f
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=microblaze 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   microblaze-linux-ld: mm/page_alloc.o: in function `alloc_large_system_hash':
>> (.init.text+0x4144): undefined reference to `find_vm_area'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 22316 bytes --]

^ permalink raw reply

* Re: [PATCH v8 11/12] mm/vmalloc: Hugepage vmalloc mappings
From: kernel test robot @ 2020-11-28 17:41 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm, Andrew Morton
  Cc: linux-arch, kbuild-all, linux-kernel, Nicholas Piggin,
	Christoph Hellwig, Linux Memory Management List, Zefan Li,
	Jonathan Cameron, linuxppc-dev
In-Reply-To: <20201128152559.999540-12-npiggin@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1848 bytes --]

Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on arm64/for-next/core linus/master v5.10-rc5]
[cannot apply to hnaz-linux-mm/master next-20201127]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Nicholas-Piggin/huge-vmalloc-mappings/20201128-232946
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: h8300-randconfig-r032-20201128 (attached as .config)
compiler: h8300-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/95e5da88c21d305af971ed4f00112f0576c2b94f
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Nicholas-Piggin/huge-vmalloc-mappings/20201128-232946
        git checkout 95e5da88c21d305af971ed4f00112f0576c2b94f
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=h8300 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   h8300-linux-ld: arch/h8300/kernel/entry.o: in function `resume_kernel':
   (.text+0x29e): undefined reference to `TI_PRE_COUNT'
   h8300-linux-ld: mm/page_alloc.o: in function `.L1614':
>> page_alloc.c:(.init.text+0x2187): undefined reference to `find_vm_area'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 18551 bytes --]

^ permalink raw reply

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Andy Lutomirski @ 2020-11-28 17:55 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20201128160141.1003903-3-npiggin@gmail.com>

On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> And get rid of the generic sync_core_before_usermode facility. This is
> functionally a no-op in the core scheduler code, but it also catches
>
> This helper is the wrong way around I think. The idea that membarrier
> state requires a core sync before returning to user is the easy one
> that does not need hiding behind membarrier calls. The gap in core
> synchronization due to x86's sysret/sysexit and lazy tlb mode, is the
> tricky detail that is better put in x86 lazy tlb code.
>
> Consider if an arch did not synchronize core in switch_mm either, then
> membarrier_mm_sync_core_before_usermode would be in the wrong place
> but arch specific mmu context functions would still be the right place.
> There is also an exit_lazy_tlb case that is not covered by this call, which
> could be a bug (a kthread uses the membarrier process's mm, then context
> switches back to the process without an mm switch or lazy mm switch).
>
> This makes lazy tlb code a bit more modular.

I have a couple of membarrier fixes that I want to send out today or
tomorrow, and they might eliminate the need for this patch.  Let me
think about this a little bit.  I'll cc you.  The existing code is way
too subtle and the comments are far too confusing for me to be quickly
confident about any of my conclusions :)

^ permalink raw reply

* Re: [PATCH v5] PCI: Unify ECAM constants in native PCI Express drivers
From: Bjorn Helgaas @ 2020-11-28 18:35 UTC (permalink / raw)
  To: Krzysztof Wilczyński
  Cc: Heiko Stuebner, Shawn Lin, Paul Mackerras, Thomas Petazzoni,
	Jonathan Chocron, Toan Le, Will Deacon, Rob Herring,
	Lorenzo Pieralisi, Michal Simek, linux-rockchip,
	bcm-kernel-feedback-list, linux-arm-kernel, linux-pci, Ray Jui,
	Florian Fainelli, linux-rpi-kernel, Jonathan Cameron,
	Bjorn Helgaas, Jonathan Derrick, Scott Branden, Zhou Wang,
	Robert Richter, linuxppc-dev, Nicolas Saenz Julienne
In-Reply-To: <20201127104626.3979165-1-kw@linux.com>

On Fri, Nov 27, 2020 at 10:46:26AM +0000, Krzysztof Wilczyński wrote:
> Unify ECAM-related constants into a single set of standard constants
> defining memory address shift values for the byte-level address that can
> be used when accessing the PCI Express Configuration Space, and then
> move native PCI Express controller drivers to use newly introduced
> definitions retiring any driver-specific ones.
> 
> The ECAM ("Enhanced Configuration Access Mechanism") is defined by the
> PCI Express specification (see PCI Express Base Specification, Revision
> 5.0, Version 1.0, Section 7.2.2, p. 676), thus most hardware should
> implement it the same way.  Most of the native PCI Express controller
> drivers define their ECAM-related constants, many of these could be
> shared, or use open-coded values when setting the .bus_shift field of
> the struct pci_ecam_ops.
> 
> All of the newly added constants should remove ambiguity and reduce the
> number of open-coded values, and also correlate more strongly with the
> descriptions in the aforementioned specification (see Table 7-1
> "Enhanced Configuration Address Mapping", p. 677).
> 
> There is no change to functionality.
> 
> Suggested-by: Bjorn Helgaas <bhelgaas@google.com>
> Signed-off-by: Krzysztof Wilczyński <kw@linux.com>

Beautiful.  This should probably go via Lorenzo's tree, so he may have
comments, too.  Could apply this as-is; I had a few trivial notes
below.

It's ironic that we don't use PCIE_ECAM_OFFSET in drivers/pci/ecam.c.
We could do something like this, which would also let us drop
.bus_shift completely in all the conforming implementations.  It also
closes the hole that we didn't limit "where" to 4K for
pci_ecam_map_bus() users.

  if (per_bus_mapping) {
    base = cfg->winp[busn];
    busn = 0;
  } else {
    base = cfg->win;
  }

  if (cfg->ops->bus_shift) {
    u32 bus_offset = (busn & 0xff) << cfg->ops->bus_shift;
    u32 devfn_offset = (devfn & 0xff) << (cfg->ops->bus_shift - 8);

    where &= 0xfff;

    return base + (bus_offset | devfn_offset | where);
  }

  return base + PCIE_ECAM_OFFSET(busn, devfn, where);

Reviewed-by: Bjorn Helgaas <bhelgaas@google.com>

>  static void __iomem *ppc4xx_pciex_get_config_base(struct ppc4xx_pciex_port *port,
>  						  struct pci_bus *bus,
> -						  unsigned int devfn)
> +						  unsigned int devfn,
> +						  int offset)

The interface change (to add "offset") could be a preparatory patch by
itself.

But I'm actually not sure it's worth even touching this file.  This is
the only place outside drivers/pci that includes linux/pci-ecam.h.  I
think I might rather put PCIE_ECAM_OFFSET() and related things in
drivers/pci/pci.h and keep it all inside drivers/pci.

>  static const struct pci_ecam_ops pci_thunder_pem_ops = {
> -	.bus_shift	= 24,
> +	.bus_shift	= THUNDER_PCIE_ECAM_BUS_SHIFT,
>  	.init		= thunder_pem_platform_init,
>  	.pci_ops	= {
>  		.map_bus	= pci_ecam_map_bus,

This could be split to its own patch, no big deal either way.

>  const struct pci_ecam_ops xgene_v2_pcie_ecam_ops = {
> -	.bus_shift	= 16,
>  	.init		= xgene_v2_pcie_ecam_init,
>  	.pci_ops	= {
>  		.map_bus	= xgene_pcie_map_bus,

Thanks for mentioning this change in the cover letter.  It could also
be split off to a preparatory patch, since it's not related to
PCIE_ECAM_OFFSET(), which is the main point of this patch.

>  static void __iomem *iproc_pcie_map_ep_cfg_reg(struct iproc_pcie *pcie,
>  					       unsigned int busno,
> -					       unsigned int slot,
> -					       unsigned int fn,
> +					       unsigned int devfn,

This interface change *could* be a separate preparatory patch, too,
but I'm starting to feel even more OCD than usual :)

> @@ -94,7 +95,7 @@ struct vmd_dev {
>  	struct pci_dev		*dev;
>  
>  	spinlock_t		cfg_lock;
> -	char __iomem		*cfgbar;
> +	void __iomem		*cfgbar;

This type change might be worth pushing to a separate patch since the
casting issues are not completely trivial.

^ permalink raw reply

* Re: [PATCH] powerpc: Allow relative pointers in bug table entries
From: Christophe Leroy @ 2020-11-28 19:00 UTC (permalink / raw)
  To: Jordan Niethe, linuxppc-dev
In-Reply-To: <20201127030238.763-1-jniethe5@gmail.com>



Le 27/11/2020 à 04:02, Jordan Niethe a écrit :
> This enables GENERIC_BUG_RELATIVE_POINTERS on Power so that 32-bit
> offsets are stored in the bug entries rather than 64-bit pointers.
> 
> Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
> ---
>   arch/powerpc/Kconfig           |  4 ++++
>   arch/powerpc/include/asm/bug.h | 37 ++++++++++++++++++++++++++++++++--
>   arch/powerpc/xmon/xmon.c       | 17 ++++++++++++++--
>   3 files changed, 54 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index e9f13fe08492..294108e0e5c6 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -311,6 +311,10 @@ config GENERIC_BUG
>   	default y
>   	depends on BUG
>   
> +config GENERIC_BUG_RELATIVE_POINTERS
> +	def_bool y
> +	depends on GENERIC_BUG
> +
>   config SYS_SUPPORTS_APM_EMULATION
>   	default y if PMAC_APM_EMU
>   	bool
> diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
> index 338f36cd9934..d03d834042a1 100644
> --- a/arch/powerpc/include/asm/bug.h
> +++ b/arch/powerpc/include/asm/bug.h
> @@ -12,7 +12,11 @@
>   #ifdef CONFIG_DEBUG_BUGVERBOSE
>   .macro EMIT_BUG_ENTRY addr,file,line,flags
>   	 .section __bug_table,"aw"
> +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS

As far as I understand, as soon as CONFIG_BUG is selected, GENERIC_BUG is automatically selected, so 
GENERIC_BUG_RELATIVE_POINTERS is selected as well. Therefore this #ifndef branch can never be taken.

>   5001:	 PPC_LONG \addr, 5002f
> +#else
> +5001:	 .4byte \addr - 5001b, 5002f - 5001b
> +#endif /* CONFIG_GENERIC_BUG_RELATIVE_POINTERS */
>   	 .short \line, \flags
>   	 .org 5001b+BUG_ENTRY_SIZE
>   	 .previous
> @@ -23,7 +27,11 @@
>   #else
>   .macro EMIT_BUG_ENTRY addr,file,line,flags
>   	 .section __bug_table,"aw"
> +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS

Same

>   5001:	 PPC_LONG \addr
> +#else
> +5001:	 .4byte \addr - 5001b
> +#endif /* CONFIG_GENERIC_BUG_RELATIVE_POINTERS */
>   	 .short \flags
>   	 .org 5001b+BUG_ENTRY_SIZE
>   	 .previous
> @@ -34,20 +42,45 @@
>   /* _EMIT_BUG_ENTRY expects args %0,%1,%2,%3 to be FILE, LINE, flags and
>      sizeof(struct bug_entry), respectively */
>   #ifdef CONFIG_DEBUG_BUGVERBOSE
> +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS

Same

>   #define _EMIT_BUG_ENTRY				\
>   	".section __bug_table,\"aw\"\n"		\
>   	"2:\t" PPC_LONG "1b, %0\n"		\
>   	"\t.short %1, %2\n"			\
>   	".org 2b+%3\n"				\
>   	".previous\n"
> -#else
> +
> +#else /* relative pointers */
> +
> +#define _EMIT_BUG_ENTRY				\
> +	".section __bug_table,\"aw\"\n"		\
> +	"2:\t.4byte 1b - 2b, %0 - 2b\n"		\
> +	"\t.short %1, %2\n"			\
> +	".org 2b+%3\n"				\
> +	".previous\n"
> +#endif /* relative pointers */
> +
> +#else /* verbose */
> +
> +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS

Same

>   #define _EMIT_BUG_ENTRY				\
>   	".section __bug_table,\"aw\"\n"		\
>   	"2:\t" PPC_LONG "1b\n"			\
>   	"\t.short %2\n"				\
>   	".org 2b+%3\n"				\
>   	".previous\n"
> -#endif
> +
> +#else /* relative pointers */
> +
> +#define _EMIT_BUG_ENTRY				\
> +	".section __bug_table,\"aw\"\n"		\
> +	"2:\t.4byte 1b - 2b\n"		\
> +	"\t.short %2\n"				\
> +	".org 2b+%3\n"				\
> +	".previous\n"
> +
> +#endif /* relative pointers */
> +#endif /* verbose */
>   
>   #define BUG_ENTRY(insn, flags, ...)			\
>   	__asm__ __volatile__(				\
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 55c43a6c9111..5f7cf7e95767 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -1731,6 +1731,9 @@ static void print_bug_trap(struct pt_regs *regs)
>   #ifdef CONFIG_BUG
>   	const struct bug_entry *bug;
>   	unsigned long addr;
> +#ifdef CONFIG_DEBUG_BUGVERBOSE
> +	char *file;
> +#endif
>   
>   	if (regs->msr & MSR_PR)
>   		return;		/* not in kernel */
> @@ -1744,10 +1747,20 @@ static void print_bug_trap(struct pt_regs *regs)
>   		return;
>   
>   #ifdef CONFIG_DEBUG_BUGVERBOSE
> +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS

Same

> +	file = bug->file;
> +#else /* relative pointers */
> +	file = (char *)bug + bug->file_disp;
> +#endif /* relative pointers */
>   	printf("kernel BUG at %s:%u!\n",
> -	       bug->file, bug->line);
> +	       file, bug->line);
>   #else
> -	printf("kernel BUG at %px!\n", (void *)bug->bug_addr);
> +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS

Same

> +	addr = bug->addr;
> +#else /* relative pointers */
> +	addr = (unsigned long)bug + bug->bug_addr_disp;
> +#endif /* relative pointers */
> +	printf("kernel BUG at %px!\n", (void *)addr);
>   #endif
>   #endif /* CONFIG_BUG */
>   }
> 

Christophe

^ permalink raw reply

* Re: [PATCH] powerpc: fix the allyesconfig build
From: Jakub Kicinski @ 2020-11-28 19:36 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Salil Mehta, Geert Uytterhoeven, Stephen Boyd, Michael Turquette,
	linux-kernel, Nicholas Piggin, linux-clk, linux-renesas-soc,
	Huazhong Tan, Yunsheng Lin, Yisen Zhuang, Joel Stanley, netdev,
	PowerPC, David S. Miller, Daniel Axtens
In-Reply-To: <20201128162054.575aea29@canb.auug.org.au>

On Sat, 28 Nov 2020 16:20:54 +1100 Stephen Rothwell wrote:
> On Fri, 27 Nov 2020 17:56:42 -0800 Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > What's the offending structure in hisilicon? I'd rather have a look;
> > packing structs with pointers in 'em sounds questionable.
> > 
> > I only see these two:
> > 
> > $ git grep packed drivers/net/ethernet/hisilicon/
> > drivers/net/ethernet/hisilicon/hns/hnae.h:struct __packed hnae_desc {
> > drivers/net/ethernet/hisilicon/hns3/hns3_enet.h:struct __packed hns3_desc {  
> 
> struct hclge_dbg_reg_type_info which is 28 bytes long due to the
> included struct hclge_dbg_reg_common_msg (which is 12 bytes
> long).  They are surrounded by #pragma pack(1)/pack().
> 
> This forces the 2 pointers in each second array element of
> hclge_dbg_reg_info[] to be 4 byte aligned (where pointers are 8 bytes
> long on PPC64).

Ah! Thanks, I don't see a reason for these to be packed. 
Looks like an accident; there is no reason to pack anything
past struct hclge_dbg_reg_common_msg AFAICT.

Huawei folks, would you mind sending a fix if the analysis is correct?

^ permalink raw reply

* Re: [PATCH net v3 0/9] ibmvnic: assorted bug fixes
From: Jakub Kicinski @ 2020-11-28 21:32 UTC (permalink / raw)
  To: Dany Madden; +Cc: netdev, sukadev, ljp, linuxppc-dev
In-Reply-To: <20201126000432.29897-1-drt@linux.ibm.com>

On Wed, 25 Nov 2020 18:04:23 -0600 Dany Madden wrote:
> Assorted fixes for ibmvnic originated from "[PATCH net 00/15] ibmvnic:
> assorted bug fixes" sent by Lijun Pan.

Applied, thanks!

^ permalink raw reply

* Re: [PATCH 5/8] lazy tlb: allow lazy tlb mm switching to be configurable
From: Andy Lutomirski @ 2020-11-29  0:36 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20201128160141.1003903-6-npiggin@gmail.com>

On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> NOMMU systems could easily go without this and save a bit of code
> and the refcount atomics, because their mm switch is a no-op. I
> haven't flipped them over because I haven't audited all arch code to
> convert over to using the _lazy_tlb refcounting.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/Kconfig             | 11 +++++++
>  include/linux/sched/mm.h | 13 ++++++--
>  kernel/sched/core.c      | 68 +++++++++++++++++++++++++++++-----------
>  kernel/sched/sched.h     |  4 ++-
>  4 files changed, 75 insertions(+), 21 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 56b6ccc0e32d..596bf589d74b 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -430,6 +430,17 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
>           irqs disabled over activate_mm. Architectures that do IPI based TLB
>           shootdowns should enable this.
>
> +# Should make this depend on MMU, because there is little use for lazy mm switching
> +# with NOMMU. Must audit NOMMU architecture code for lazy mm refcounting first.
> +config MMU_LAZY_TLB
> +       def_bool y
> +       help
> +         Enable "lazy TLB" mmu context switching for kernel threads.
> +
> +config MMU_LAZY_TLB_REFCOUNT
> +       def_bool y
> +       depends on MMU_LAZY_TLB
> +

This could use some documentation as to what "no" means.

>  config ARCH_HAVE_NMI_SAFE_CMPXCHG
>         bool
>
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index 7157c0f6fef8..bd0f27402d4b 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -51,12 +51,21 @@ static inline void mmdrop(struct mm_struct *mm)
>  /* Helpers for lazy TLB mm refcounting */
>  static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
>  {
> -       mmgrab(mm);
> +       if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
> +               mmgrab(mm);
>  }
>
>  static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
>  {
> -       mmdrop(mm);
> +       if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
> +               mmdrop(mm);
> +       } else {
> +               /*
> +                * mmdrop_lazy_tlb must provide a full memory barrier, see the
> +                * membarrier comment finish_task_switch.

"membarrier comment in finish_task_switch()", perhaps?

> +                */
> +               smp_mb();
> +       }
>  }
>
>  /**
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e372b613d514..3b79c6cc3a37 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3579,7 +3579,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>         __releases(rq->lock)
>  {
>         struct rq *rq = this_rq();
> -       struct mm_struct *mm = rq->prev_mm;
> +       struct mm_struct *mm = NULL;
>         long prev_state;
>
>         /*
> @@ -3598,7 +3598,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>                       current->comm, current->pid, preempt_count()))
>                 preempt_count_set(FORK_PREEMPT_COUNT);
>
> -       rq->prev_mm = NULL;
> +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
> +       mm = rq->prev_lazy_mm;
> +       rq->prev_lazy_mm = NULL;
> +#endif
>
>         /*
>          * A task struct has one reference for the use as "current".
> @@ -3630,6 +3633,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>          * rq->curr, before returning to userspace, for
>          * {PRIVATE,GLOBAL}_EXPEDITED. This is implicitly provided by
>          * mmdrop_lazy_tlb().
> +        *
> +        * This same issue applies to other places that mmdrop_lazy_tlb().
>          */
>         if (mm)
>                 mmdrop_lazy_tlb(mm);
> @@ -3719,22 +3724,10 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
>         calculate_sigpending();
>  }
>
> -/*
> - * context_switch - switch to the new MM and the new thread's register state.
> - */
> -static __always_inline struct rq *
> -context_switch(struct rq *rq, struct task_struct *prev,
> -              struct task_struct *next, struct rq_flags *rf)
> +static __always_inline void
> +context_switch_mm(struct rq *rq, struct task_struct *prev,
> +              struct task_struct *next)
>  {
> -       prepare_task_switch(rq, prev, next);
> -
> -       /*
> -        * For paravirt, this is coupled with an exit in switch_to to
> -        * combine the page table reload and the switch backend into
> -        * one hypercall.
> -        */
> -       arch_start_context_switch(prev);
> -
>         /*
>          * kernel -> kernel   lazy + transfer active
>          *   user -> kernel   lazy + mmgrab_lazy_tlb() active
> @@ -3765,11 +3758,50 @@ context_switch(struct rq *rq, struct task_struct *prev,
>                 if (!prev->mm) {                        // from kernel
>                         exit_lazy_tlb(prev->active_mm, next);
>
> +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
>                         /* will mmdrop_lazy_tlb() in finish_task_switch(). */
> -                       rq->prev_mm = prev->active_mm;
> +                       rq->prev_lazy_mm = prev->active_mm;
>                         prev->active_mm = NULL;
> +#else
> +                       /* See membarrier comment in finish_task_switch(). */
> +                       smp_mb();
> +#endif
>                 }
>         }
> +}
> +

Comment here describing what this does, please.


> +static __always_inline void
> +context_switch_mm_nolazy(struct rq *rq, struct task_struct *prev,
> +              struct task_struct *next)
> +{
> +       if (!next->mm)
> +               next->active_mm = &init_mm;
> +       membarrier_switch_mm(rq, prev->active_mm, next->active_mm);
> +       switch_mm_irqs_off(prev->active_mm, next->active_mm, next);
> +       if (!prev->mm)
> +               prev->active_mm = NULL;
> +}
> +
> +/*
> + * context_switch - switch to the new MM and the new thread's register state.
> + */
> +static __always_inline struct rq *
> +context_switch(struct rq *rq, struct task_struct *prev,
> +              struct task_struct *next, struct rq_flags *rf)
> +{
> +       prepare_task_switch(rq, prev, next);
> +
> +       /*
> +        * For paravirt, this is coupled with an exit in switch_to to
> +        * combine the page table reload and the switch backend into
> +        * one hypercall.
> +        */
> +       arch_start_context_switch(prev);
> +
> +       if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
> +               context_switch_mm(rq, prev, next);
> +       else
> +               context_switch_mm_nolazy(rq, prev, next);
>
>         rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index df80bfcea92e..3b72aec5a2f2 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -950,7 +950,9 @@ struct rq {
>         struct task_struct      *idle;
>         struct task_struct      *stop;
>         unsigned long           next_balance;
> -       struct mm_struct        *prev_mm;
> +#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
> +       struct mm_struct        *prev_lazy_mm;
> +#endif
>
>         unsigned int            clock_update_flags;
>         u64                     clock;
> --
> 2.23.0
>

^ permalink raw reply

* Re: [PATCH 1/8] lazy tlb: introduce exit_lazy_tlb
From: Andy Lutomirski @ 2020-11-29  0:38 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20201128160141.1003903-2-npiggin@gmail.com>

On Sat, Nov 28, 2020 at 8:01 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> This is called at points where a lazy mm is switched away or made not
> lazy (by its owner switching back).
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/arm/mach-rpc/ecard.c            |  1 +
>  arch/powerpc/mm/book3s64/radix_tlb.c |  1 +
>  fs/exec.c                            |  6 ++++--
>  include/asm-generic/mmu_context.h    | 21 +++++++++++++++++++++
>  kernel/kthread.c                     |  1 +
>  kernel/sched/core.c                  |  2 ++
>  6 files changed, 30 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
> index 827b50f1c73e..43eb1bfba466 100644
> --- a/arch/arm/mach-rpc/ecard.c
> +++ b/arch/arm/mach-rpc/ecard.c
> @@ -253,6 +253,7 @@ static int ecard_init_mm(void)
>         current->mm = mm;
>         current->active_mm = mm;
>         activate_mm(active_mm, mm);
> +       exit_lazy_tlb(active_mm, current);
>         mmdrop(active_mm);
>         ecard_init_pgtables(mm);
>         return 0;
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index b487b489d4b6..ac3fec03926a 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -661,6 +661,7 @@ static void do_exit_flush_lazy_tlb(void *arg)
>                 mmgrab(&init_mm);
>                 current->active_mm = &init_mm;
>                 switch_mm_irqs_off(mm, &init_mm, current);
> +               exit_lazy_tlb(mm, current);
>                 mmdrop(mm);
>         }
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 547a2390baf5..4b4dea1bb7ba 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1017,6 +1017,8 @@ static int exec_mmap(struct mm_struct *mm)
>         if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
>                 local_irq_enable();
>         activate_mm(active_mm, mm);
> +       if (!old_mm)
> +               exit_lazy_tlb(active_mm, tsk);
>         if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
>                 local_irq_enable();
>         tsk->mm->vmacache_seqnum = 0;
> @@ -1028,9 +1030,9 @@ static int exec_mmap(struct mm_struct *mm)
>                 setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
>                 mm_update_next_owner(old_mm);
>                 mmput(old_mm);
> -               return 0;
> +       } else {
> +               mmdrop(active_mm);
>         }
> -       mmdrop(active_mm);

This looks like an unrelated change.

>         return 0;
>  }
>
> diff --git a/include/asm-generic/mmu_context.h b/include/asm-generic/mmu_context.h
> index 91727065bacb..4626d0020e65 100644
> --- a/include/asm-generic/mmu_context.h
> +++ b/include/asm-generic/mmu_context.h
> @@ -24,6 +24,27 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
>  }
>  #endif
>
> +/*
> + * exit_lazy_tlb - Called after switching away from a lazy TLB mode mm.
> + *
> + * mm:  the lazy mm context that was switched
> + * tsk: the task that was switched to (with a non-lazy mm)
> + *
> + * mm may equal tsk->mm.
> + * mm and tsk->mm will not be NULL.
> + *
> + * Note this is not symmetrical to enter_lazy_tlb, this is not
> + * called when tasks switch into the lazy mm, it's called after the
> + * lazy mm becomes non-lazy (either switched to a different mm or the
> + * owner of the mm returns).
> + */
> +#ifndef exit_lazy_tlb
> +static inline void exit_lazy_tlb(struct mm_struct *mm,

Maybe name this parameter prev_lazy_mm?

^ permalink raw reply

* Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Andy Lutomirski @ 2020-11-29  3:54 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20201128160141.1003903-7-npiggin@gmail.com>

On Sat, Nov 28, 2020 at 8:02 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> On big systems, the mm refcount can become highly contended when doing
> a lot of context switching with threaded applications (particularly
> switching between the idle thread and an application thread).
>
> Abandoning lazy tlb slows switching down quite a bit in the important
> user->idle->user cases, so instead implement a non-refcounted scheme
> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
> any remaining lazy ones.
>
> Shootdown IPIs are some concern, but they have not been observed to be
> a big problem with this scheme (the powerpc implementation generated
> 314 additional interrupts on a 144 CPU system during a kernel compile).
> There are a number of strategies that could be employed to reduce IPIs
> if they turn out to be a problem for some workload.

I'm still wondering whether we can do even better.

The IPIs you're doing aren't really necessary -- we don't
fundamentally need to free the pagetables immediately when all
non-lazy users are done with them (and current kernels don't) -- what
we need to do is to synchronize all the bookkeeping.  So, with
adequate locking (famous last words), a couple of alternative schemes
ought to be possible.

a) Instead of sending an IPI, increment mm_count on behalf of the
remote CPU and do something to make sure that the remote CPU knows we
did this on its behalf.  Then free the mm when mm_count hits zero.

b) Treat mm_cpumask as part of the refcount.  Add one to mm_count when
an mm is created.  Once mm_users hits zero, whoever clears the last
bit in mm_cpumask is responsible for decrementing a single reference
from mm_count, and whoever sets it to zero frees the mm.

Version (b) seems fairly straightforward to implement -- add RCU
protection and an atomic_t special_ref_cleared (initially 0) to struct
mm_struct itself.  After anyone clears a bit in mm_cpumask (which is
already a barrier), they read mm_users.  If it's zero, then they scan
mm_cpumask and see if it's empty.  If it is, they atomically swap
special_ref_cleared to 1.  If it was zero before the swap, they do
mmdrop().  I can imagine some tweaks that could make this a bit
faster, at least in the limit of a huge number of CPUs.

Version (a) seems a bit harder to reason about.  Maybe it could be
done like this.  Add a percpu variable mm_with_extra_count.  This
variable can be NULL, but it can also be an mm that has an extra
reference on behalf of the cpu in question.

__mmput scans mm_cpumask and, for each cpu in the mask, mmgrabs the mm
and cmpxchgs that cpu's mm_with_extra_count from NULL to mm.  If it
succeeds, then we win.  If it fails, further thought is required, and
maybe we have to send an IPI, although maybe some other cleverness is
possible.  Any time a CPU switches mms, it atomically swaps
mm_with_extra_count to NULL and mmdrops whatever the mm was.  (Maybe
it needs to check the mm isn't equal to the new mm, although it would
be quite bizarre for this to happen.)  Other than these mmgrab and
mmdrop calls, the mm switching code doesn't mmgrab or mmdrop at all.

Version (a) seems like it could have excellent performance.

*However*, I think we should consider whether we want to do something
even bigger first.  Even with any of these changes, we still need to
maintain mm_cpumask(), and that itself can be a scalability problem.
I wonder if we can solve this problem too.  Perhaps the switch_mm()
paths could only ever set mm_cpumask bits, and anyone who would send
an IPI because a bit is set in mm_cpumask would first check some
percpu variable (cpu_rq(cpu)->something?  an entirely new variable) to
see if the bit in mm_cpumask is spurious.  Or perhaps mm_cpumask could
be split up across multiple cachelines, one per node.

We should keep the recent lessons from Apple in mind, though: x86 is a
dinosaur.  The future of atomics is going to look a lot more like
ARM's LSE than x86's rather anemic set.  This means that mm_cpumask
operations won't need to be full barriers forever, and we might not
want to take the implied full barriers in set_bit() and clear_bit()
for granted.

--Andy

^ permalink raw reply

* [Bug 209869] Kernel 5.10-rc1 fails to boot on a PowerMac G4 3,6 at an early stage
From: bugzilla-daemon @ 2020-11-29  6:11 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-209869-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=209869

--- Comment #14 from Erhard F. (erhard_f@mailbox.org) ---
Thanks!

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: Kernel 5.10-rc4 doesn't boot
From: Christophe Leroy @ 2020-11-29  7:37 UTC (permalink / raw)
  To: Stan Johnson, debian-powerpc
  Cc: linuxppc-dev@lists.ozlabs.org, Elimar Riesebieter
In-Reply-To: <228955cd-e014-b533-b3bb-4d4e04baa9de@yahoo.com>



Le 29/11/2020 à 02:33, Stan Johnson a écrit :
> 
>>
>>
>> On 11/25/20 5:52 AM, Elimar Riesebieter wrote:
>>>
>>> * John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>[2020-11-25
>>> 13:04 +0100]:
>>>>
>>>> On 11/25/20 12:40 PM, Elimar Riesebieter wrote:
>>>>>
>>>>> I tried to boot linux-image-5.10.0-rc4-powerpc on my PowerBook5,8.
>>>>> It seems the ramdisk isn't loaded as it gets stuck at the white
>>>>> firmware boot. A custom build fails as well. I have no idea how to
>>>>> debug that. I am booting via yaboot.
>>>>
>>>> FWIW, if you think the initrd isn't being loaded, you may have run
>>>> into a limitation with Yaboot. You have to keep in mind that Yaboot
>>>> isn't being actively maintained anymore, at least not as it used to
>>>> be. So in case of an incompatible change on the kernel side, you may
>>>> run into such problems with the bootloader. You could try switching
>>>> to GRUB.
>>>
>>> What is the preferred way to switch to grub2? Elimar
>>
>>
>> Hello Elimar,
>>
>> I can confirm that Linux 5.10.0-rc5 also does not work on a PowerBook
>> Lombard, hanging at the "found display ... opening ..." screen after
>> loading the initrd.img file.
>>
>> The 5.10.0-rc5 initrd.img can be used with an older 5.8.4 kernel;
>> however, the 5.8.4 initrd.img does not work with the 5.10.0-rc5
>> kernel, so the problem appears to be with the 5.10.0-rc5 kernel.
>>
>> ...
>>
>> -Stan Johnson
>>
> 
> The Wallstreet also won't boot any 5.10.0-rc* kernel (it hangs at the
> BootX screen).  The last v5.9 mainline kernel (v5.9.11) boots on both
> the Wallstreet and Lombard (so it's likely the same problem).
> 
> I saw during one of the boots that PPC 601 support has been removed; I
> thought that there was still an effort not to do that (but that's a
> separate issue).

PPC 601 has been broken for several kernels, and nobody noticed so we concluded it was unused with 
new kernels and finally removed it completely in order to kick out all 601 particularities that were 
a nightmare.

> 
> A git bisect going from v5.9 to v5.10.0-rc1 resulted in the following,
> after about 13 kernels:
> 
> -----
> 69a1593abdbcf03a76367320d929a8ae7a5e3d71 is the first bad commit
> -----
> 
> I'm cc'ing Christophe Leroy, since I don't know what to do next.

Try the following commit:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=01776f070ffcbf336be3bf1672bd3c589548d6c4

Christophe

^ permalink raw reply

* Re: Kernel 5.10-rc4 doesn't boot
From: Stan Johnson @ 2020-11-29 16:40 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: debian-powerpc, linuxppc-dev@lists.ozlabs.org, Elimar Riesebieter
In-Reply-To: <20201129122727.42zbknbaz26dedcg@toy.home.lxtec.de>

I can confirm that applying the patch to 5.10-rc5 also fixes the boot
problem on the Wallstreet and Lombard PowerBooks.

thanks

-Stan

-----

On 11/29/20 5:27 AM, Elimar Riesebieter wrote:
> Hi all,
>
> * Christophe Leroy <christophe.leroy@csgroup.eu> [2020-11-29 08:37 +0100]:
>
> [...]
>
>> Try following commit: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=01776f070ffcbf336be3bf1672bd3c589548d6c4
> I can confirm that this commit patched at 5.10-rc5 made the kernel
> boot again :-)
>
> Thanks for bisecting and providing the patch!
>
> Elimar


^ permalink raw reply

* Re: Kernel 5.10-rc4 doesn't boot
From: Elimar Riesebieter @ 2020-11-29 12:27 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: debian-powerpc, linuxppc-dev@lists.ozlabs.org, Stan Johnson
In-Reply-To: <79a8734e-da7f-6563-2730-b1014ccb35c7@csgroup.eu>

Hi all,

* Christophe Leroy <christophe.leroy@csgroup.eu> [2020-11-29 08:37 +0100]:

[...]

> Try following commit: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=01776f070ffcbf336be3bf1672bd3c589548d6c4

I can confirm that this commit patched at 5.10-rc5 made the kernel
boot again :-)

Thanks for bisecting and providing the patch!

Elimar
-- 
  The path to source is always uphill!
                                -unknown-


^ permalink raw reply

* Re: [PATCH 2/2] powerpc/ps3: make system bus's remove and shutdown callbacks return void
From: Uwe Kleine-König @ 2020-11-29 17:31 UTC (permalink / raw)
  To: Takashi Iwai, Michael Ellerman
  Cc: alsa-devel, linux-fbdev, dri-devel, Jaroslav Kysela,
	Paul Mackerras, linux-scsi, Alan Stern, Jakub Kicinski,
	Arnd Bergmann, Bartlomiej Zolnierkiewicz, James E.J. Bottomley,
	linux-block, Jens Axboe, Martin K. Petersen, Geoff Levand,
	Greg Kroah-Hartman, linux-usb, Takashi Iwai, Jim Paris, netdev,
	linuxppc-dev, David S. Miller
In-Reply-To: <s5hv9dphnoh.wl-tiwai@suse.de>

[-- Attachment #1: Type: text/plain, Size: 1334 bytes --]

Hello Michael,

On Sat, Nov 28, 2020 at 09:48:30AM +0100, Takashi Iwai wrote:
> On Thu, 26 Nov 2020 17:59:50 +0100,
> Uwe Kleine-König wrote:
> > 
> > The driver core ignores the return value of struct device_driver::remove
> > because there is only little that can be done. For the shutdown callback
> > it's ps3_system_bus_shutdown() which ignores the return value.
> > 
> > To simplify the quest to make struct device_driver::remove return void,
> > let struct ps3_system_bus_driver::remove return void, too. All users
> > already unconditionally return 0, this commit makes it obvious that
> > returning an error code is a bad idea and ensures future users behave
> > accordingly.
> > 
> > Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
> 
> For the sound bit:
> Acked-by: Takashi Iwai <tiwai@suse.de>

assuming that you are the one who will apply this patch: Note that it
depends on patch 1 that Takashi already applied to his tree. So you
either have to wait until patch 1 appears in some tree that you merge
before applying, or you have to take patch 1, too. (With Takashi
optionally dropping it then.)

Best regards
Uwe

-- 
Pengutronix e.K.                           | Uwe Kleine-König            |
Industrial Linux Solutions                 | https://www.pengutronix.de/ |

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH 5/8] net: ethernet: ibm: ibmvnic: Fix some kernel-doc misdemeanours
From: Andrew Lunn @ 2020-11-29 18:43 UTC (permalink / raw)
  To: Lee Jones
  Cc: Thomas Falcon, John Allen, linux-kernel, Santiago Leon,
	Jakub Kicinski, netdev, Lijun Pan, Dany Madden, Paul Mackerras,
	Sukadev Bhattiprolu, linuxppc-dev, David S. Miller
In-Reply-To: <20201126133853.3213268-6-lee.jones@linaro.org>

Hi Lee

>  /**
>   * build_hdr_data - creates L2/L3/L4 header data buffer
> - * @hdr_field - bitfield determining needed headers
> - * @skb - socket buffer
> - * @hdr_len - array of header lengths
> - * @tot_len - total length of data
> + * @hdr_field: bitfield determining needed headers
> + * @skb: socket buffer
> + * @hdr_len: array of header lengths
> + * @tot_len: total length of data
>   *
>   * Reads hdr_field to determine which headers are needed by firmware.
>   * Builds a buffer containing these headers.  Saves individual header

The code is:

static int build_hdr_data(u8 hdr_field, struct sk_buff *skb,
                          int *hdr_len, u8 *hdr_data)
{

What about hdr_data? 

>  /**
>   * create_hdr_descs - create header and header extension descriptors
> - * @hdr_field - bitfield determining needed headers
> - * @data - buffer containing header data
> - * @len - length of data buffer
> - * @hdr_len - array of individual header lengths
> - * @scrq_arr - descriptor array
> + * @hdr_field: bitfield determining needed headers
> + * @data: buffer containing header data
> + * @len: length of data buffer
> + * @hdr_len: array of individual header lengths
> + * @scrq_arr: descriptor array

static int create_hdr_descs(u8 hdr_field, u8 *hdr_data, int len, int *hdr_len,
                            union sub_crq *scrq_arr)

There is no data parameter.

It looks like you just changed - to :, but did not validate that the
parameters are actually correct.

	   Andrew

^ permalink raw reply

* Re: [PATCH 6/8] net: ethernet: toshiba: ps3_gelic_net: Fix some kernel-doc misdemeanours
From: Andrew Lunn @ 2020-11-29 18:45 UTC (permalink / raw)
  To: Lee Jones
  Cc: Geoff Levand, linux-kernel, Jens Osterkamp, netdev,
	Paul Mackerras, Utz Bacher, Jakub Kicinski, linuxppc-dev,
	David S. Miller
In-Reply-To: <20201126133853.3213268-7-lee.jones@linaro.org>

On Thu, Nov 26, 2020 at 01:38:51PM +0000, Lee Jones wrote:
> Fixes the following W=1 kernel build warning(s):
> 
>  drivers/net/ethernet/toshiba/ps3_gelic_net.c:1107: warning: Function parameter or member 'irq' not described in 'gelic_card_interrupt'
>  drivers/net/ethernet/toshiba/ps3_gelic_net.c:1107: warning: Function parameter or member 'ptr' not described in 'gelic_card_interrupt'
>  drivers/net/ethernet/toshiba/ps3_gelic_net.c:1407: warning: Function parameter or member 'txqueue' not described in 'gelic_net_tx_timeout'
>  drivers/net/ethernet/toshiba/ps3_gelic_net.c:1439: warning: Function parameter or member 'napi' not described in 'gelic_ether_setup_netdev_ops'
>  drivers/net/ethernet/toshiba/ps3_gelic_net.c:1639: warning: Function parameter or member 'dev' not described in 'ps3_gelic_driver_probe'
>  drivers/net/ethernet/toshiba/ps3_gelic_net.c:1795: warning: Function parameter or member 'dev' not described in 'ps3_gelic_driver_remove'
> 
> Cc: Geoff Levand <geoff@infradead.org>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Jakub Kicinski <kuba@kernel.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Utz Bacher <utz.bacher@de.ibm.com>
> Cc: Jens Osterkamp <Jens.Osterkamp@de.ibm.com>
> Cc: netdev@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Lee Jones <lee.jones@linaro.org>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH 8/8] net: ethernet: ibm: ibmvnic: Fix some kernel-doc issues
From: Andrew Lunn @ 2020-11-29 19:10 UTC (permalink / raw)
  To: Lee Jones
  Cc: Thomas Falcon, John Allen, linux-kernel, Santiago Leon,
	Jakub Kicinski, netdev, Lijun Pan, Dany Madden, Paul Mackerras,
	Sukadev Bhattiprolu, linuxppc-dev, David S. Miller
In-Reply-To: <20201126133853.3213268-9-lee.jones@linaro.org>

On Thu, Nov 26, 2020 at 01:38:53PM +0000, Lee Jones wrote:
> Fixes the following W=1 kernel build warning(s):
> 
>  from drivers/net/ethernet/ibm/ibmvnic.c:35:
>  inlined from ‘handle_vpd_rsp’ at drivers/net/ethernet/ibm/ibmvnic.c:4124:3:
>  drivers/net/ethernet/ibm/ibmvnic.c:1362: warning: Function parameter or member 'hdr_data' not described in 'build_hdr_data'
>  drivers/net/ethernet/ibm/ibmvnic.c:1362: warning: Excess function parameter 'tot_len' description in 'build_hdr_data'
>  drivers/net/ethernet/ibm/ibmvnic.c:1423: warning: Function parameter or member 'hdr_data' not described in 'create_hdr_descs'
>  drivers/net/ethernet/ibm/ibmvnic.c:1423: warning: Excess function parameter 'data' description in 'create_hdr_descs'
>  drivers/net/ethernet/ibm/ibmvnic.c:1474: warning: Function parameter or member 'txbuff' not described in 'build_hdr_descs_arr'
>  drivers/net/ethernet/ibm/ibmvnic.c:1474: warning: Excess function parameter 'skb' description in 'build_hdr_descs_arr'
>  drivers/net/ethernet/ibm/ibmvnic.c:1474: warning: Excess function parameter 'subcrq' description in 'build_hdr_descs_arr'

Hi Lee

It looks like this should be squashed into the previous patch to this
file.

	Andrew

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox