LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 07/52] powerpc/time: add API for KVM to re-arm the host timer/decrementer
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>

Rather than have KVM look up the host timer and fiddle with the
irq-work internal details, have the powerpc/time.c code provide a
function for KVM to re-arm the Linux timer code when exiting a
guest.

This implementation improves on the existing code by marking a
decrementer interrupt as soft-pending if a timer has expired, rather
than setting DEC to a -ve value, which tended to cause host timers to
take two interrupts (first hdec to exit the guest, then the immediate
dec).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/time.h | 16 +++-------
 arch/powerpc/kernel/time.c      | 52 +++++++++++++++++++++++++++------
 arch/powerpc/kvm/book3s_hv.c    |  7 ++---
 3 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 69b6be617772..924b2157882f 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 extern void secondary_cpu_time_init(void);
 extern void __init time_init(void);
 
-#ifdef CONFIG_PPC64
-static inline unsigned long test_irq_work_pending(void)
-{
-	unsigned long x;
-
-	asm volatile("lbz %0,%1(13)"
-		: "=r" (x)
-		: "i" (offsetof(struct paca_struct, irq_work_pending)));
-	return x;
-}
-#endif
-
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
 static inline u64 timer_get_next_tb(void)
@@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void)
 	return __this_cpu_read(decrementers_next_tb);
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void timer_rearm_host_dec(u64 now);
+#endif
+
 /* Convert timebase ticks to nanoseconds */
 unsigned long long tb_to_ns(unsigned long long tb_ticks);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 6ce40d2ac201..2a6c118a43fb 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -498,6 +498,16 @@ EXPORT_SYMBOL(profile_pc);
  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
  */
 #ifdef CONFIG_PPC64
+static inline unsigned long test_irq_work_pending(void)
+{
+	unsigned long x;
+
+	asm volatile("lbz %0,%1(13)"
+		: "=r" (x)
+		: "i" (offsetof(struct paca_struct, irq_work_pending)));
+	return x;
+}
+
 static inline void set_irq_work_pending_flag(void)
 {
 	asm volatile("stb %0,%1(13)" : :
@@ -541,13 +551,44 @@ void arch_irq_work_raise(void)
 	preempt_enable();
 }
 
+static void set_dec_or_work(u64 val)
+{
+	set_dec(val);
+	/* We may have raced with new irq work */
+	if (unlikely(test_irq_work_pending()))
+		set_dec(1);
+}
+
 #else  /* CONFIG_IRQ_WORK */
 
 #define test_irq_work_pending()	0
 #define clear_irq_work_pending()
 
+static void set_dec_or_work(u64 val)
+{
+	set_dec(val);
+}
 #endif /* CONFIG_IRQ_WORK */
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void timer_rearm_host_dec(u64 now)
+{
+	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+
+	WARN_ON_ONCE(!arch_irqs_disabled());
+	WARN_ON_ONCE(mfmsr() & MSR_EE);
+
+	if (now >= *next_tb) {
+		local_paca->irq_happened |= PACA_IRQ_DEC;
+	} else {
+		now = *next_tb - now;
+		if (now <= decrementer_max)
+			set_dec_or_work(now);
+	}
+}
+EXPORT_SYMBOL_GPL(timer_rearm_host_dec);
+#endif
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
@@ -608,10 +649,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
 	} else {
 		now = *next_tb - now;
 		if (now <= decrementer_max)
-			set_dec(now);
-		/* We may have raced with new irq work */
-		if (test_irq_work_pending())
-			set_dec(1);
+			set_dec_or_work(now);
 		__this_cpu_inc(irq_stat.timer_irqs_others);
 	}
 
@@ -853,11 +891,7 @@ static int decrementer_set_next_event(unsigned long evt,
 				      struct clock_event_device *dev)
 {
 	__this_cpu_write(decrementers_next_tb, get_tb() + evt);
-	set_dec(evt);
-
-	/* We may have raced with new irq work */
-	if (test_irq_work_pending())
-		set_dec(1);
+	set_dec_or_work(evt);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e4482bf546ed..e83c7aa7dbba 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4049,11 +4049,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->entry_exit_map = 0x101;
 	vc->in_guest = 0;
 
-	next_timer = timer_get_next_tb();
-	set_dec(next_timer - tb);
-	/* We may have raced with new irq work */
-	if (test_irq_work_pending())
-		set_dec(1);
+	timer_rearm_host_dec(tb);
+
 	mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
 
 	kvmhv_load_host_pmu();
-- 
2.23.0


^ permalink raw reply related

* [PATCH v3 06/52] KVM: PPC: Book3S HV P9: Reduce mftb per guest entry/exit
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin, Fabiano Rosas
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>

mftb is serialising (dispatch next-to-complete) so it is heavy weight
for a mfspr. Avoid reading it multiple times in the entry or exit paths.
A small number of cycles delay to timers is tolerable.

Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_hv.c          | 4 ++--
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 30d400bf161b..e4482bf546ed 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3927,7 +3927,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	 *
 	 * XXX: Another day's problem.
 	 */
-	mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
+	mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb);
 
 	if (kvmhv_on_pseries()) {
 		/*
@@ -4050,7 +4050,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->in_guest = 0;
 
 	next_timer = timer_get_next_tb();
-	set_dec(next_timer - mftb());
+	set_dec(next_timer - tb);
 	/* We may have raced with new irq work */
 	if (test_irq_work_pending())
 		set_dec(1);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 0ff9ddb5e7ca..bd8cf0a65ce8 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -203,7 +203,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 	unsigned long host_dawr1;
 	unsigned long host_dawrx1;
 
-	hdec = time_limit - mftb();
+	tb = mftb();
+	hdec = time_limit - tb;
 	if (hdec < 0)
 		return BOOK3S_INTERRUPT_HV_DECREMENTER;
 
@@ -215,7 +216,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 	vcpu->arch.ceded = 0;
 
 	if (vc->tb_offset) {
-		u64 new_tb = mftb() + vc->tb_offset;
+		u64 new_tb = tb + vc->tb_offset;
 		mtspr(SPRN_TBU40, new_tb);
 		tb = mftb();
 		if ((tb & 0xffffff) < (new_tb & 0xffffff))
-- 
2.23.0


^ permalink raw reply related

* [PATCH v3 05/52] KVM: PPC: Book3S HV P9: Use large decrementer for HDEC
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: Alexey Kardashevskiy, Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>

On processors that don't suppress the HDEC exceptions when LPCR[HDICE]=0,
this could help reduce needless guest exits due to leftover exceptions on
entering the guest.

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/time.h       | 2 ++
 arch/powerpc/kernel/time.c            | 1 +
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index fd09b4797fd7..69b6be617772 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -18,6 +18,8 @@
 #include <asm/vdso/timebase.h>
 
 /* time.c */
+extern u64 decrementer_max;
+
 extern unsigned long tb_ticks_per_jiffy;
 extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e84a087223ce..6ce40d2ac201 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -88,6 +88,7 @@ static struct clocksource clocksource_timebase = {
 
 #define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
 u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
+EXPORT_SYMBOL_GPL(decrementer_max); /* for KVM HDEC */
 
 static int decrementer_set_next_event(unsigned long evt,
 				      struct clock_event_device *dev);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 961b3d70483c..0ff9ddb5e7ca 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -504,7 +504,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc
 		vc->tb_offset_applied = 0;
 	}
 
-	mtspr(SPRN_HDEC, 0x7fffffff);
+	/* HDEC must be at least as large as DEC, so decrementer_max fits */
+	mtspr(SPRN_HDEC, decrementer_max);
 
 	save_clear_guest_mmu(kvm, vcpu);
 	switch_mmu_to_host(kvm, host_pidr);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v3 04/52] KVM: PPC: Book3S HV P9: Use host timer accounting to avoid decrementer read
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>

There is no need to save away the host DEC value, as it is derived
from the host timer subsystem which maintains the next timer time,
so it can be restored from there.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/time.h |  5 +++++
 arch/powerpc/kernel/time.c      |  1 +
 arch/powerpc/kvm/book3s_hv.c    | 14 +++++++-------
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 8c2c3dd4ddba..fd09b4797fd7 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -111,6 +111,11 @@ static inline unsigned long test_irq_work_pending(void)
 
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
+static inline u64 timer_get_next_tb(void)
+{
+	return __this_cpu_read(decrementers_next_tb);
+}
+
 /* Convert timebase ticks to nanoseconds */
 unsigned long long tb_to_ns(unsigned long long tb_ticks);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 934d8ae66cc6..e84a087223ce 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -107,6 +107,7 @@ struct clock_event_device decrementer_clockevent = {
 EXPORT_SYMBOL(decrementer_clockevent);
 
 DEFINE_PER_CPU(u64, decrementers_next_tb);
+EXPORT_SYMBOL_GPL(decrementers_next_tb);
 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
 
 #define XSEC_PER_SEC (1024*1024)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6a07a79f07d8..30d400bf161b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3860,18 +3860,17 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 	struct p9_host_os_sprs host_os_sprs;
 	s64 dec;
-	u64 tb;
+	u64 tb, next_timer;
 	int trap, save_pmu;
 
 	WARN_ON_ONCE(vcpu->arch.ceded);
 
-	dec = mfspr(SPRN_DEC);
 	tb = mftb();
-	if (dec < 0)
+	next_timer = timer_get_next_tb();
+	if (tb >= next_timer)
 		return BOOK3S_INTERRUPT_HV_DECREMENTER;
-	local_paca->kvm_hstate.dec_expires = dec + tb;
-	if (local_paca->kvm_hstate.dec_expires < time_limit)
-		time_limit = local_paca->kvm_hstate.dec_expires;
+	if (next_timer < time_limit)
+		time_limit = next_timer;
 
 	save_p9_host_os_sprs(&host_os_sprs);
 
@@ -4050,7 +4049,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->entry_exit_map = 0x101;
 	vc->in_guest = 0;
 
-	set_dec(local_paca->kvm_hstate.dec_expires - mftb());
+	next_timer = timer_get_next_tb();
+	set_dec(next_timer - mftb());
 	/* We may have raced with new irq work */
 	if (test_irq_work_pending())
 		set_dec(1);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v3 03/52] KVM: PPC: Book3S HV P9: Use set_dec to set decrementer to host
From: Nicholas Piggin @ 2021-10-04 16:00 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: Alexey Kardashevskiy, Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>

The host Linux timer code arms the decrementer with the value
'decrementers_next_tb - current_tb' using set_dec(), which stores
val - 1 on Book3S-64, which is not quite the same as what KVM does
to re-arm the host decrementer when exiting the guest.

This shouldn't be a significant change, but it makes the logic match
and avoids this small extra change being brought into the next patch.

Suggested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_hv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f4a779fffd18..6a07a79f07d8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4050,7 +4050,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->entry_exit_map = 0x101;
 	vc->in_guest = 0;
 
-	mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+	set_dec(local_paca->kvm_hstate.dec_expires - mftb());
 	/* We may have raced with new irq work */
 	if (test_irq_work_pending())
 		set_dec(1);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v3 02/52] powerpc/64s: guard optional TIDR SPR with CPU ftr test
From: Nicholas Piggin @ 2021-10-04 15:59 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>

The TIDR SPR only exists on POWER9. Avoid accessing it when the
feature bit for it is not set.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kvm/book3s_hv.c | 12 ++++++++----
 arch/powerpc/xmon/xmon.c     | 10 ++++++++--
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2acb1c96cfaf..f4a779fffd18 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3767,7 +3767,8 @@ static void load_spr_state(struct kvm_vcpu *vcpu)
 	mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
 	mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
 	mtspr(SPRN_BESCR, vcpu->arch.bescr);
-	mtspr(SPRN_TIDR, vcpu->arch.tid);
+	if (cpu_has_feature(CPU_FTR_P9_TIDR))
+		mtspr(SPRN_TIDR, vcpu->arch.tid);
 	mtspr(SPRN_AMR, vcpu->arch.amr);
 	mtspr(SPRN_UAMOR, vcpu->arch.uamor);
 
@@ -3793,7 +3794,8 @@ static void store_spr_state(struct kvm_vcpu *vcpu)
 	vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
 	vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
 	vcpu->arch.bescr = mfspr(SPRN_BESCR);
-	vcpu->arch.tid = mfspr(SPRN_TIDR);
+	if (cpu_has_feature(CPU_FTR_P9_TIDR))
+		vcpu->arch.tid = mfspr(SPRN_TIDR);
 	vcpu->arch.amr = mfspr(SPRN_AMR);
 	vcpu->arch.uamor = mfspr(SPRN_UAMOR);
 	vcpu->arch.dscr = mfspr(SPRN_DSCR);
@@ -3813,7 +3815,8 @@ struct p9_host_os_sprs {
 static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
 {
 	host_os_sprs->dscr = mfspr(SPRN_DSCR);
-	host_os_sprs->tidr = mfspr(SPRN_TIDR);
+	if (cpu_has_feature(CPU_FTR_P9_TIDR))
+		host_os_sprs->tidr = mfspr(SPRN_TIDR);
 	host_os_sprs->iamr = mfspr(SPRN_IAMR);
 	host_os_sprs->amr = mfspr(SPRN_AMR);
 	host_os_sprs->fscr = mfspr(SPRN_FSCR);
@@ -3827,7 +3830,8 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
 	mtspr(SPRN_UAMOR, 0);
 
 	mtspr(SPRN_DSCR, host_os_sprs->dscr);
-	mtspr(SPRN_TIDR, host_os_sprs->tidr);
+	if (cpu_has_feature(CPU_FTR_P9_TIDR))
+		mtspr(SPRN_TIDR, host_os_sprs->tidr);
 	mtspr(SPRN_IAMR, host_os_sprs->iamr);
 
 	if (host_os_sprs->amr != vcpu->arch.amr)
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index dd8241c009e5..7958e5aae844 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2107,8 +2107,14 @@ static void dump_300_sprs(void)
 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
 		return;
 
-	printf("pidr   = %.16lx  tidr  = %.16lx\n",
-		mfspr(SPRN_PID), mfspr(SPRN_TIDR));
+	if (cpu_has_feature(CPU_FTR_P9_TIDR)) {
+		printf("pidr   = %.16lx  tidr  = %.16lx\n",
+			mfspr(SPRN_PID), mfspr(SPRN_TIDR));
+	} else {
+		printf("pidr   = %.16lx\n",
+			mfspr(SPRN_PID));
+	}
+
 	printf("psscr  = %.16lx\n",
 		hv ? mfspr(SPRN_PSSCR) : mfspr(SPRN_PSSCR_PR));
 
-- 
2.23.0


^ permalink raw reply related

* [PATCH v3 01/52] powerpc/64s: Remove WORT SPR from POWER9/10 (take 2)
From: Nicholas Piggin @ 2021-10-04 15:59 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20211004160049.1338837-1-npiggin@gmail.com>

This removes a missed remnant of the WORT SPR.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/platforms/powernv/idle.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index e3ffdc8e8567..86e787502e42 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -589,7 +589,6 @@ struct p9_sprs {
 	u64 purr;
 	u64 spurr;
 	u64 dscr;
-	u64 wort;
 	u64 ciabr;
 
 	u64 mmcra;
-- 
2.23.0


^ permalink raw reply related

* [PATCH v3 00/52] KVM: PPC: Book3S HV P9: entry/exit optimisations
From: Nicholas Piggin @ 2021-10-04 15:59 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin

This reduces radix guest full entry/exit latency on POWER9 and POWER10
by 2x.

Nested HV guests should see smaller improvements in their L1 entry/exit,
but this is also combined with most L0 speedups also applying to nested
entry. nginx localhost throughput test in a SMP nested guest is improved
about 10% (in a direct guest it doesn't change much because it uses XIVE
for IPIs) when L0 and L1 are patched.

It does this in several main ways:

- Rearrange code to optimise SPR accesses. Mainly, avoid scoreboard
  stalls.

- Test SPR values to avoid mtSPRs where possible. mtSPRs are expensive.

- Reduce mftb. mftb is expensive.

- Demand fault certain facilities to avoid saving and/or restoring them
  (at the cost of fault when they are used, but this is mitigated over
  a number of entries, like the facilities when context switching 
  processes). PM, TM, and EBB so far.

- Defer some sequences that are made just in case a guest is interrupted
  in the middle of a critical section to the case where the guest is
  scheduled on a different CPU, rather than every time (at the cost of
  an extra IPI in this case). Namely the tlbsync sequence for radix with
  GTSE, which is very expensive.

- Reduce locking, barriers, atomics related to the vcpus-per-vcore > 1
  handling that the P9 path does not require.

Changes since v2:
- Rebased, several patches from the series were merged in the previous
  merge window.
- Fixed some compile errors noticed by kernel test robot.
- Added RB from Athira for the PMU stuff (thanks!)
- Split TIDR ftr check (patch 2) out into its own patch.
- Added a missed license tag on new file.

Changes since v1:
- Verified DPDES changes still work with msgsndp SMT emulation.
- Fixed HMI handling bug.
- Split softpatch handling fixes into smaller pieces.
- Rebased with Fabiano's latest HV sanitising patches.
- Fix TM demand faulting bug causing nested guest TM tests to TM Bad
  Thing the host in rare cases.
- Re-name new "pmu=" command line option to "pmu_override=" and update
  documentation wording.
- Add default=y config option rather than unconditionally removing the
  L0 nested PMU workaround.
- Remove unnecessary MSR[RI] updates in entry/exit. Down to about 4700
  cycles now.
- Another bugfix from Alexey's testing.

Changes since RFC:
- Rebased with Fabiano's HV sanitising patches at the front.
- Several demand faulting bug fixes mostly relating to nested guests.
- Removed facility demand-faulting from L0 nested entry/exit handler.
  Demand faulting is still done in the L1, but not the L0. The reason
  is to reduce complexity (although it's only a small amount of
  complexity), reduce demand faulting overhead that may require several

Thanks,
Nick

Nicholas Piggin (52):
  powerpc/64s: Remove WORT SPR from POWER9/10 (take 2)
  powerpc/64s: guard optional TIDR SPR with CPU ftr test
  KVM: PPC: Book3S HV P9: Use set_dec to set decrementer to host
  KVM: PPC: Book3S HV P9: Use host timer accounting to avoid decrementer
    read
  KVM: PPC: Book3S HV P9: Use large decrementer for HDEC
  KVM: PPC: Book3S HV P9: Reduce mftb per guest entry/exit
  powerpc/time: add API for KVM to re-arm the host timer/decrementer
  KVM: PPC: Book3S HV: POWER10 enable HAIL when running radix guests
  powerpc/64s: Keep AMOR SPR a constant ~0 at runtime
  KVM: PPC: Book3S HV: Don't always save PMU for guest capable of
    nesting
  powerpc/64s: Always set PMU control registers to frozen/disabled when
    not in use
  powerpc/64s: Implement PMU override command line option
  KVM: PPC: Book3S HV P9: Implement PMU save/restore in C
  KVM: PPC: Book3S HV P9: Factor PMU save/load into context switch
    functions
  KVM: PPC: Book3S HV P9: Demand fault PMU SPRs when marked not inuse
  KVM: PPC: Book3S HV P9: Factor out yield_count increment
  KVM: PPC: Book3S HV: CTRL SPR does not require read-modify-write
  KVM: PPC: Book3S HV P9: Move SPRG restore to restore_p9_host_os_sprs
  KVM: PPC: Book3S HV P9: Reduce mtmsrd instructions required to save
    host SPRs
  KVM: PPC: Book3S HV P9: Improve mtmsrd scheduling by delaying MSR[EE]
    disable
  KVM: PPC: Book3S HV P9: Add kvmppc_stop_thread to match
    kvmppc_start_thread
  KVM: PPC: Book3S HV: Change dec_expires to be relative to guest
    timebase
  KVM: PPC: Book3S HV P9: Move TB updates
  KVM: PPC: Book3S HV P9: Optimise timebase reads
  KVM: PPC: Book3S HV P9: Avoid SPR scoreboard stalls
  KVM: PPC: Book3S HV P9: Only execute mtSPR if the value changed
  KVM: PPC: Book3S HV P9: Juggle SPR switching around
  KVM: PPC: Book3S HV P9: Move vcpu register save/restore into functions
  KVM: PPC: Book3S HV P9: Move host OS save/restore functions to
    built-in
  KVM: PPC: Book3S HV P9: Move nested guest entry into its own function
  KVM: PPC: Book3S HV P9: Move remaining SPR and MSR access into low
    level entry
  KVM: PPC: Book3S HV P9: Implement TM fastpath for guest entry/exit
  KVM: PPC: Book3S HV P9: Switch PMU to guest as late as possible
  KVM: PPC: Book3S HV P9: Restrict DSISR canary workaround to processors
    that require it
  KVM: PPC: Book3S HV P9: More SPR speed improvements
  KVM: PPC: Book3S HV P9: Demand fault EBB facility registers
  KVM: PPC: Book3S HV P9: Demand fault TM facility registers
  KVM: PPC: Book3S HV P9: Use Linux SPR save/restore to manage some host
    SPRs
  KVM: PPC: Book3S HV P9: Comment and fix MMU context switching code
  KVM: PPC: Book3S HV P9: Test dawr_enabled() before saving host DAWR
    SPRs
  KVM: PPC: Book3S HV P9: Don't restore PSSCR if not needed
  KVM: PPC: Book3S HV P9: Avoid tlbsync sequence on radix guest exit
  KVM: PPC: Book3S HV Nested: Avoid extra mftb() in nested entry
  KVM: PPC: Book3S HV P9: Improve mfmsr performance on entry
  KVM: PPC: Book3S HV P9: Optimise hash guest SLB saving
  KVM: PPC: Book3S HV P9: Avoid changing MSR[RI] in entry and exit
  KVM: PPC: Book3S HV P9: Add unlikely annotation for !mmu_ready
  KVM: PPC: Book3S HV P9: Avoid cpu_in_guest atomics on entry and exit
  KVM: PPC: Book3S HV P9: Remove most of the vcore logic
  KVM: PPC: Book3S HV P9: Tidy kvmppc_create_dtl_entry
  KVM: PPC: Book3S HV P9: Stop using vc->dpdes
  KVM: PPC: Book3S HV P9: Remove subcore HMI handling

 .../admin-guide/kernel-parameters.txt         |   8 +
 arch/powerpc/include/asm/asm-prototypes.h     |   5 -
 arch/powerpc/include/asm/kvm_asm.h            |   1 +
 arch/powerpc/include/asm/kvm_book3s.h         |   6 +
 arch/powerpc/include/asm/kvm_book3s_64.h      |   5 +-
 arch/powerpc/include/asm/kvm_host.h           |   7 +-
 arch/powerpc/include/asm/kvm_ppc.h            |   1 +
 arch/powerpc/include/asm/switch_to.h          |   3 +
 arch/powerpc/include/asm/time.h               |  19 +-
 arch/powerpc/kernel/cpu_setup_power.c         |  12 +-
 arch/powerpc/kernel/dt_cpu_ftrs.c             |   8 +-
 arch/powerpc/kernel/process.c                 |  34 +
 arch/powerpc/kernel/time.c                    |  54 +-
 arch/powerpc/kvm/Kconfig                      |  15 +
 arch/powerpc/kvm/book3s_64_entry.S            |  11 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c        |   4 +
 arch/powerpc/kvm/book3s_hv.c                  | 836 +++++++++---------
 arch/powerpc/kvm/book3s_hv.h                  |  42 +
 arch/powerpc/kvm/book3s_hv_builtin.c          |   2 +
 arch/powerpc/kvm/book3s_hv_hmi.c              |   7 +-
 arch/powerpc/kvm/book3s_hv_interrupts.S       |  13 +-
 arch/powerpc/kvm/book3s_hv_nested.c           |   8 +-
 arch/powerpc/kvm/book3s_hv_p9_entry.c         | 821 ++++++++++++++---
 arch/powerpc/kvm/book3s_hv_ras.c              |  54 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S       |  73 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c      |  15 -
 arch/powerpc/perf/core-book3s.c               |  35 +
 arch/powerpc/platforms/powernv/idle.c         |   9 +-
 arch/powerpc/xmon/xmon.c                      |  10 +-
 29 files changed, 1459 insertions(+), 659 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv.h

-- 
2.23.0


^ permalink raw reply

* Re: [PATCH 5/5] powerpc/64s: Fix unrecoverable MCE calling async handler from NMI
From: Cédric Le Goater @ 2021-10-04 15:28 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Ganesh Goudar
In-Reply-To: <20211004145642.1331214-6-npiggin@gmail.com>

On 10/4/21 16:56, Nicholas Piggin wrote:
> The machine check handler is not considered NMI on 64s. The early
> handler is the true NMI handler, and then it schedules the
> machine_check_exception handler to run when interrupts are enabled.
> 
> This works fine except the case of an unrecoverable MCE, where the true
> NMI is taken when MSR[RI] is clear, it can not recover, so it calls
> machine_check_exception directly so something might be done about it.
> 
> Calling an async handler from NMI context can result in irq state and
> other things getting corrupted. This can also trigger the BUG at
>    arch/powerpc/include/asm/interrupt.h:168
>    BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));

I was hitting this problem when I rebooted a P8 tuleta system and
this series fixes it.

Tested-by: Cédric Le Goater <clg@kaod.org>

Thanks,

C.
  
> Fix this by making an _async version of the handler which is called
> in the normal case, and a NMI version that is called for unrecoverable
> interrupts.
> 
> Fixes: 2b43dd7653cc ("powerpc/64: enable MSR[EE] in irq replay pt_regs")
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   arch/powerpc/include/asm/interrupt.h |  5 ++---
>   arch/powerpc/kernel/exceptions-64s.S |  8 +++++--
>   arch/powerpc/kernel/traps.c          | 31 ++++++++++++++++------------
>   3 files changed, 26 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
> index b894b7169706..a1d238255f07 100644
> --- a/arch/powerpc/include/asm/interrupt.h
> +++ b/arch/powerpc/include/asm/interrupt.h
> @@ -528,10 +528,9 @@ static __always_inline long ____##func(struct pt_regs *regs)
>   /* kernel/traps.c */
>   DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception);
>   #ifdef CONFIG_PPC_BOOK3S_64
> -DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception);
> -#else
> -DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
> +DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async);
>   #endif
> +DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
>   DECLARE_INTERRUPT_HANDLER(SMIException);
>   DECLARE_INTERRUPT_HANDLER(handle_hmi_exception);
>   DECLARE_INTERRUPT_HANDLER(unknown_exception);
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 024d9231f88c..eaf1f72131a1 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -1243,7 +1243,7 @@ EXC_COMMON_BEGIN(machine_check_common)
>   	li	r10,MSR_RI
>   	mtmsrd 	r10,1
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
> -	bl	machine_check_exception
> +	bl	machine_check_exception_async
>   	b	interrupt_return_srr
>   
>   
> @@ -1303,7 +1303,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
>   	subi	r12,r12,1
>   	sth	r12,PACA_IN_MCE(r13)
>   
> -	/* Invoke machine_check_exception to print MCE event and panic. */
> +	/*
> +	 * Invoke machine_check_exception to print MCE event and panic.
> +	 * This is the NMI version of the handler because we are called from
> +	 * the early handler which is a true NMI.
> +	 */
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	machine_check_exception
>   
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index e453b666613b..11741703d26e 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -796,24 +796,22 @@ void die_mce(const char *str, struct pt_regs *regs, long err)
>   	 * do_exit() checks for in_interrupt() and panics in that case, so
>   	 * exit the irq/nmi before calling die.
>   	 */
> -	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
> -		irq_exit();
> -	else
> +	if (in_nmi())
>   		nmi_exit();
> +	else
> +		irq_exit();
>   	die(str, regs, err);
>   }
>   
>   /*
> - * BOOK3S_64 does not call this handler as a non-maskable interrupt
> + * BOOK3S_64 does not usually call this handler as a non-maskable interrupt
>    * (it uses its own early real-mode handler to handle the MCE proper
>    * and then raises irq_work to call this handler when interrupts are
> - * enabled).
> + * enabled). The only time when this is not true is if the early handler
> + * is unrecoverable, then it does call this directly to try to get a
> + * message out.
>    */
> -#ifdef CONFIG_PPC_BOOK3S_64
> -DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception)
> -#else
> -DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
> -#endif
> +static void __machine_check_exception(struct pt_regs *regs)
>   {
>   	int recover = 0;
>   
> @@ -847,12 +845,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
>   	/* Must die if the interrupt is not recoverable */
>   	if (regs_is_unrecoverable(regs))
>   		die_mce("Unrecoverable Machine check", regs, SIGBUS);
> +}
>   
>   #ifdef CONFIG_PPC_BOOK3S_64
> -	return;
> -#else
> -	return 0;
> +DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async)
> +{
> +	__machine_check_exception(regs);
> +}
>   #endif
> +DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
> +{
> +	__machine_check_exception(regs);
> +
> +	return 0;
>   }
>   
>   DEFINE_INTERRUPT_HANDLER(SMIException) /* async? */
> 


^ permalink raw reply

* Re: [PATCH 1/3] fixup mmu_features immediately after getting cpu pa features.
From: Aneesh Kumar K.V @ 2021-10-04 15:32 UTC (permalink / raw)
  To: Sourabh Jain, mpe
  Cc: linuxppc-dev, mahesh, Mahesh Salgaonkar, linux-kernel,
	Abdul haleem, hbathini
In-Reply-To: <20211004151142.256251-2-sourabhjain@linux.ibm.com>

On 10/4/21 20:41, Sourabh Jain wrote:
> From: Mahesh Salgaonkar <mahesh@linux.ibm.com>
> 
> On systems with radix support available, early_radix_enabled() starts
> returning true for a small window (until mmu_early_init_devtree() is
> called) even when radix mode is disabled on the kernel command line. This
> causes ppc64_bolted_size() to return ULONG_MAX in HPT mode instead of the
> supported segment size during boot CPU paca allocation.
> 
> With kernel command line = "... disable_radix":
> 
> early_init_devtree:			  <- early_radix_enabled() = false
>    early_init_dt_scan_cpus:		  <- early_radix_enabled() = false
>        ...
>        check_cpu_pa_features:		  <- early_radix_enabled() = false
>        ...				^ <- early_radix_enabled() = TRUE
>        allocate_paca:			| <- early_radix_enabled() = TRUE
>            ...                           |
>            ppc64_bolted_size:		| <- early_radix_enabled() = TRUE
>                if (early_radix_enabled())| <- early_radix_enabled() = TRUE
>                    return ULONG_MAX;     |
>        ...                               |
>    ...					| <- early_radix_enabled() = TRUE
>    ...					| <- early_radix_enabled() = TRUE
>    mmu_early_init_devtree()              V
>    ...					  <- early_radix_enabled() = false
> 
> So far we have not seen any issue because allocate_paca() takes minimum of
> ppc64_bolted_size and rma_size while allocating paca. However it is better
> to close this window by fixing up the mmu features as early as possible.
> This fixes early_radix_enabled() and ppc64_bolted_size() to return valid
> values in radix disable mode. This patch will help subsequent patch to
> depend on early_radix_enabled() check while detecting supported segment
> size in HPT mode.
> 
> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
> Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
> Reported-and-tested-by: Abdul haleem <abdhalee@linux.vnet.ibm.com>
> ---
>   arch/powerpc/include/asm/book3s/64/mmu.h | 1 +
>   arch/powerpc/include/asm/mmu.h           | 1 +
>   arch/powerpc/kernel/prom.c               | 1 +
>   arch/powerpc/mm/init_64.c                | 5 ++++-
>   4 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
> index c02f42d1031e..69a89fa1330d 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> @@ -197,6 +197,7 @@ extern int mmu_vmemmap_psize;
>   extern int mmu_io_psize;
>   
>   /* MMU initialization */
> +void mmu_cpu_feature_fixup(void);
>   void mmu_early_init_devtree(void);
>   void hash__early_init_devtree(void);
>   void radix__early_init_devtree(void);
> diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
> index 8abe8e42e045..c8eafd401fe9 100644
> --- a/arch/powerpc/include/asm/mmu.h
> +++ b/arch/powerpc/include/asm/mmu.h
> @@ -401,6 +401,7 @@ extern void early_init_mmu(void);
>   extern void early_init_mmu_secondary(void);
>   extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
>   				       phys_addr_t first_memblock_size);
> +static inline void mmu_cpu_feature_fixup(void) { }
>   static inline void mmu_early_init_devtree(void) { }
>   
>   static inline void pkey_early_init_devtree(void) {}
> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index 2e67588f6f6e..1727a3abe6c1 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -380,6 +380,7 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
>   		check_cpu_pa_features(node);
>   	}
>   
> +	mmu_cpu_feature_fixup();

can you do that call inside check_cpu_pa_features? or is it because we 
have the same issue with baremetal platforms?

Can we also rename this to indicate we are sanitizing the feature flag 
based on kernel command line.  Something like

/* Update cpu features based on kernel command line */
update_cpu_features();

>   	identical_pvr_fixup(node);
>   	init_mmu_slb_size(node);
>   
> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
> index 386be136026e..9ed452605a2c 100644
> --- a/arch/powerpc/mm/init_64.c
> +++ b/arch/powerpc/mm/init_64.c
> @@ -437,12 +437,15 @@ static void __init early_check_vec5(void)
>   	}
>   }
>   
> -void __init mmu_early_init_devtree(void)
> +void __init mmu_cpu_feature_fixup(void)
>   {
>   	/* Disable radix mode based on kernel command line. */
>   	if (disable_radix)
>   		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
> +}
>   
> +void __init mmu_early_init_devtree(void)
> +{
>   	/*
>   	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
>   	 * When running bare-metal, we can use radix if we like
> 


^ permalink raw reply

* [PATCH 3/3] powerpc: Set crashkernel offset to mid of RMA region
From: Sourabh Jain @ 2021-10-04 15:11 UTC (permalink / raw)
  To: mpe
  Cc: linuxppc-dev, mahesh, linux-kernel, Abdul haleem, aneesh.kumar,
	hbathini
In-Reply-To: <20211004151142.256251-1-sourabhjain@linux.ibm.com>

On large config LPARs (having 192 or more cores), Linux fails to boot
due to insufficient memory in the first memory block. This is because
the reserved crashkernel area starts at a 128MB offset by default, which
doesn't leave enough space in the first memory block to accommodate
memory for other essential system resources.

Given that the RMA region size can be 512MB or more, setting the
crashkernel offset to mid of RMA size will leave enough space for the
kernel to allocate memory for other system resources in the first
memory block.

Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Reported-and-tested-by: Abdul haleem <abdhalee@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/rtas.c |  3 +++
 arch/powerpc/kexec/core.c  | 13 +++++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index ff80bbad22a5..ce5e62bb4d8e 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1235,6 +1235,9 @@ int __init early_init_dt_scan_rtas(unsigned long node,
 	entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL);
 	sizep  = of_get_flat_dt_prop(node, "rtas-size", NULL);
 
+	if (of_get_flat_dt_prop(node, "ibm,hypertas-functions", NULL))
+		powerpc_firmware_features |= FW_FEATURE_LPAR;
+
 	if (basep && entryp && sizep) {
 		rtas.base = *basep;
 		rtas.entry = *entryp;
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index 48525e8b5730..f69cf3e370ec 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -147,11 +147,16 @@ void __init reserve_crashkernel(void)
 	if (!crashk_res.start) {
 #ifdef CONFIG_PPC64
 		/*
-		 * On 64bit we split the RMO in half but cap it at half of
-		 * a small SLB (128MB) since the crash kernel needs to place
-		 * itself and some stacks to be in the first segment.
+		 * The crash kernel needs to be placed in the first segment. On
+		 * LPAR, setting crash kernel start to mid of RMA size (512MB or
+		 * more) helps the primary kernel boot properly on large config
+		 * LPARs (with core count 192 or more); for the rest, keep the
+		 * crash kernel start capped at a 128MB offset.
 		 */
-		crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
+		if (firmware_has_feature(FW_FEATURE_LPAR))
+			crashk_res.start = ppc64_rma_size / 2;
+		else
+			crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2));
 #else
 		crashk_res.start = KDUMP_KERNELBASE;
 #endif
-- 
2.31.1


^ permalink raw reply related

* [PATCH 2/3] Remove 256MB limit restriction for boot cpu paca allocation
From: Sourabh Jain @ 2021-10-04 15:11 UTC (permalink / raw)
  To: mpe
  Cc: Abdul haleem, mahesh, Sourabh Jain, Mahesh Salgaonkar,
	linux-kernel, linuxppc-dev, aneesh.kumar, hbathini
In-Reply-To: <20211004151142.256251-1-sourabhjain@linux.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.ibm.com>

At the time when we detect and allocate paca for boot cpu, we haven't yet
detected the mmu feature of 1T segments support (not until the
mmu_early_init_devtree() call). This causes ppc64_bolted_size() to return
256MB as the limit, forcing boot cpu paca allocation below 256MB always.

This works fine for kdump kernel boot as long as crashkernel reservation is
at offset below 256MB. But when we move kdump offset to 256MB or above,
kdump kernel fails to allocate paca for boot cpu below 256MB and crashes in
allocate_paca().

Moving the detection of segment sizes just before paca allocation for boot
cpu removes this restriction of 256MB limit. This allows kdump kernel to
successfully boot and capture vmcore.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Reported-and-tested-by: Abdul haleem <abdhalee@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 1 +
 arch/powerpc/kernel/prom.c               | 4 ++++
 arch/powerpc/mm/book3s64/hash_utils.c    | 5 ++++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 69a89fa1330d..f43070581f11 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -199,6 +199,7 @@ extern int mmu_io_psize;
 /* MMU initialization */
 void mmu_cpu_feature_fixup(void);
 void mmu_early_init_devtree(void);
+void hash__early_detect_seg_size(void);
 void hash__early_init_devtree(void);
 void radix__early_init_devtree(void);
 #ifdef CONFIG_PPC_PKEY
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 1727a3abe6c1..68397f335caf 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -384,6 +384,10 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 	identical_pvr_fixup(node);
 	init_mmu_slb_size(node);
 
+	/* Initialize segment sizes */
+	if (!early_radix_enabled())
+		hash__early_detect_seg_size();
+
 #ifdef CONFIG_PPC64
 	if (nthreads == 1)
 		cur_cpu_spec->cpu_features &= ~CPU_FTR_SMT;
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index c145776d3ae5..ef4fc6bb1b30 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1020,11 +1020,14 @@ static void __init htab_initialize(void)
 #undef KB
 #undef MB
 
-void __init hash__early_init_devtree(void)
+void __init hash__early_detect_seg_size(void)
 {
 	/* Initialize segment sizes */
 	of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL);
+}
 
+void __init hash__early_init_devtree(void)
+{
 	/* Initialize page sizes */
 	htab_scan_page_sizes();
 }
-- 
2.31.1


^ permalink raw reply related

* [PATCH 1/3] fixup mmu_features immediately after getting cpu pa features.
From: Sourabh Jain @ 2021-10-04 15:11 UTC (permalink / raw)
  To: mpe
  Cc: Abdul haleem, mahesh, Mahesh Salgaonkar, linux-kernel,
	linuxppc-dev, aneesh.kumar, hbathini
In-Reply-To: <20211004151142.256251-1-sourabhjain@linux.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.ibm.com>

On systems with radix support available, early_radix_enabled() starts
returning true for a small window (until mmu_early_init_devtree() is
called) even when radix mode is disabled on the kernel command line. This
causes ppc64_bolted_size() to return ULONG_MAX in HPT mode instead of the
supported segment size, during boot cpu paca allocation.

With kernel command line = "... disable_radix":

early_init_devtree:			  <- early_radix_enabled() = false
  early_init_dt_scan_cpus:		  <- early_radix_enabled() = false
      ...
      check_cpu_pa_features:		  <- early_radix_enabled() = false
      ...				^ <- early_radix_enabled() = TRUE
      allocate_paca:			| <- early_radix_enabled() = TRUE
          ...                           |
          ppc64_bolted_size:		| <- early_radix_enabled() = TRUE
              if (early_radix_enabled())| <- early_radix_enabled() = TRUE
                  return ULONG_MAX;     |
      ...                               |
  ...					| <- early_radix_enabled() = TRUE
  ...					| <- early_radix_enabled() = TRUE
  mmu_early_init_devtree()              V
  ...					  <- early_radix_enabled() = false

So far we have not seen any issue because allocate_paca() takes the
minimum of ppc64_bolted_size and rma_size while allocating paca. However
it is better to close this window by fixing up the mmu features as early
as possible. This fixes early_radix_enabled() and ppc64_bolted_size() to
return valid values in radix-disabled mode. This patch will help a
subsequent patch depend on the early_radix_enabled() check while detecting
the supported segment size in HPT mode.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Reported-and-tested-by: Abdul haleem <abdhalee@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 1 +
 arch/powerpc/include/asm/mmu.h           | 1 +
 arch/powerpc/kernel/prom.c               | 1 +
 arch/powerpc/mm/init_64.c                | 5 ++++-
 4 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index c02f42d1031e..69a89fa1330d 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -197,6 +197,7 @@ extern int mmu_vmemmap_psize;
 extern int mmu_io_psize;
 
 /* MMU initialization */
+void mmu_cpu_feature_fixup(void);
 void mmu_early_init_devtree(void);
 void hash__early_init_devtree(void);
 void radix__early_init_devtree(void);
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 8abe8e42e045..c8eafd401fe9 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -401,6 +401,7 @@ extern void early_init_mmu(void);
 extern void early_init_mmu_secondary(void);
 extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 				       phys_addr_t first_memblock_size);
+static inline void mmu_cpu_feature_fixup(void) { }
 static inline void mmu_early_init_devtree(void) { }
 
 static inline void pkey_early_init_devtree(void) {}
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 2e67588f6f6e..1727a3abe6c1 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -380,6 +380,7 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
 		check_cpu_pa_features(node);
 	}
 
+	mmu_cpu_feature_fixup();
 	identical_pvr_fixup(node);
 	init_mmu_slb_size(node);
 
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 386be136026e..9ed452605a2c 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -437,12 +437,15 @@ static void __init early_check_vec5(void)
 	}
 }
 
-void __init mmu_early_init_devtree(void)
+void __init mmu_cpu_feature_fixup(void)
 {
 	/* Disable radix mode based on kernel command line. */
 	if (disable_radix)
 		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+}
 
+void __init mmu_early_init_devtree(void)
+{
 	/*
 	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
 	 * When running bare-metal, we can use radix if we like
-- 
2.31.1


^ permalink raw reply related

* [PATCH 0/3] Update crashkernel offset to allow kernel to boot on large config LPARs
From: Sourabh Jain @ 2021-10-04 15:11 UTC (permalink / raw)
  To: mpe; +Cc: mahesh, aneesh.kumar, linux-kernel, hbathini, linuxppc-dev

As the crashkernel reserves memory at a 128MB offset in the first memory
block, it leaves less than 128MB of memory to accommodate other essential
system resources that need memory reservation in the same block. This
causes a kernel boot failure on large config LPARs having a core count
greater than 192.

Setting the crashkernel to mid of RMA size which can be 512MB or more
instead of capping it to 128MB by default leaves enough space to allocate
memory to another system resource in the first memory block.

Keeping the crashkernel at mid of RMA size works fine for the primary
kernel but causes a boot failure for the kdump kernel when the crashkernel
reservation start offset crosses 256MB. The reason is that, in early boot,
the MMU feature of 1T segment support is not yet detected, which restricts
the paca allocation for the boot CPU to below 256MB. When the crashkernel
itself starts at a 256MB offset, the attempt to allocate the paca below
256MB leads to the kdump kernel boot failure.

Moving the detection of segment sizes before identifying the boot CPU
removes the restriction of 256MB limit for boot CPU paca allocation
which allows the kdump kernel to successfully boot and capture vmcore.

While allocating the paca for the boot CPU we found that there is a small
window during kernel boot where early_radix_enabled() returns true even
though radix is disabled on the command line. This leads to an invalid
bolted size calculation, on which the paca limit of the boot CPU depends.
Patch 0001 closes that window by fixing the radix bit in mmu_features.

Mahesh Salgaonkar (2):
  fixup mmu_features immediately after getting cpu pa features.
  Remove 256MB limit restriction for boot cpu paca allocation

Sourabh Jain (1):
  powerpc: Set crashkernel offset to mid of RMA region

 arch/powerpc/include/asm/book3s/64/mmu.h |  2 ++
 arch/powerpc/include/asm/mmu.h           |  1 +
 arch/powerpc/kernel/prom.c               |  5 +++++
 arch/powerpc/kernel/rtas.c               |  3 +++
 arch/powerpc/kexec/core.c                | 13 +++++++++----
 arch/powerpc/mm/book3s64/hash_utils.c    |  5 ++++-
 arch/powerpc/mm/init_64.c                |  5 ++++-
 7 files changed, 28 insertions(+), 6 deletions(-)

-- 
2.31.1


^ permalink raw reply

* [PATCH] KVM: PPC: Book3S HV: H_ENTER filter out reserved HPTE[B] value
From: Nicholas Piggin @ 2021-10-04 14:57 UTC (permalink / raw)
  To: kvm-ppc, linuxppc-dev; +Cc: Nicholas Piggin

The HPTE B field is a 2-bit field with values 0b10 and 0b11 reserved.
This field is also taken from the HPTE and used when KVM executes
TLBIEs to set the B field of those instructions.

Disallow the guest setting B to a reserved value with H_ENTER by
rejecting it. This is the same approach already taken for rejecting
reserved (unsupported) LLP values. This prevents the guest from being
able to induce the host to execute TLBIE with reserved values, which
is not known to be a problem with current processors but in theory it
could prevent the TLBIE from working correctly in a future processor.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 4 ++++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 19b6942c6969..fff391b9b97b 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -378,6 +378,10 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 		rb |= 1;		/* L field */
 		rb |= r & 0xff000 & ((1ul << a_pgshift) - 1); /* LP field */
 	}
+	/*
+	 * This sets both bits of the B field in the PTE. 0b1x values are
+	 * reserved, but those will have been filtered by kvmppc_do_h_enter.
+	 */
 	rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8;	/* B field */
 	return rb;
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 632b2545072b..2c1f3c6e72d1 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -207,6 +207,15 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 	if (kvm_is_radix(kvm))
 		return H_FUNCTION;
+	/*
+	 * The HPTE gets used by compute_tlbie_rb() to set TLBIE bits, so
+	 * these functions should work together -- must ensure a guest can not
+	 * cause problems with the TLBIE that KVM executes.
+	 */
+	if ((pteh >> HPTE_V_SSIZE_SHIFT) & 0x2) {
+		/* B=0b1x is a reserved value, disallow it. */
+		return H_PARAMETER;
+	}
 	psize = kvmppc_actual_pgsz(pteh, ptel);
 	if (!psize)
 		return H_PARAMETER;
-- 
2.23.0


^ permalink raw reply related

* [PATCH 5/5] powerpc/64s: Fix unrecoverable MCE calling async handler from NMI
From: Nicholas Piggin @ 2021-10-04 14:56 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Ganesh Goudar, Nicholas Piggin
In-Reply-To: <20211004145642.1331214-1-npiggin@gmail.com>

The machine check handler is not considered NMI on 64s. The early
handler is the true NMI handler, and then it schedules the
machine_check_exception handler to run when interrupts are enabled.

This works fine except the case of an unrecoverable MCE, where the true
NMI is taken when MSR[RI] is clear, it can not recover, so it calls
machine_check_exception directly so something might be done about it.

Calling an async handler from NMI context can result in irq state and
other things getting corrupted. This can also trigger the BUG at
  arch/powerpc/include/asm/interrupt.h:168
  BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));

Fix this by making an _async version of the handler which is called
in the normal case, and a NMI version that is called for unrecoverable
interrupts.

Fixes: 2b43dd7653cc ("powerpc/64: enable MSR[EE] in irq replay pt_regs")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/interrupt.h |  5 ++---
 arch/powerpc/kernel/exceptions-64s.S |  8 +++++--
 arch/powerpc/kernel/traps.c          | 31 ++++++++++++++++------------
 3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index b894b7169706..a1d238255f07 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -528,10 +528,9 @@ static __always_inline long ____##func(struct pt_regs *regs)
 /* kernel/traps.c */
 DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception);
 #ifdef CONFIG_PPC_BOOK3S_64
-DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception);
-#else
-DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
+DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async);
 #endif
+DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
 DECLARE_INTERRUPT_HANDLER(SMIException);
 DECLARE_INTERRUPT_HANDLER(handle_hmi_exception);
 DECLARE_INTERRUPT_HANDLER(unknown_exception);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 024d9231f88c..eaf1f72131a1 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1243,7 +1243,7 @@ EXC_COMMON_BEGIN(machine_check_common)
 	li	r10,MSR_RI
 	mtmsrd 	r10,1
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	machine_check_exception
+	bl	machine_check_exception_async
 	b	interrupt_return_srr
 
 
@@ -1303,7 +1303,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	subi	r12,r12,1
 	sth	r12,PACA_IN_MCE(r13)
 
-	/* Invoke machine_check_exception to print MCE event and panic. */
+	/*
+	 * Invoke machine_check_exception to print MCE event and panic.
+	 * This is the NMI version of the handler because we are called from
+	 * the early handler which is a true NMI.
+	 */
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	machine_check_exception
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index e453b666613b..11741703d26e 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -796,24 +796,22 @@ void die_mce(const char *str, struct pt_regs *regs, long err)
 	 * do_exit() checks for in_interrupt() and panics in that case, so
 	 * exit the irq/nmi before calling die.
 	 */
-	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
-		irq_exit();
-	else
+	if (in_nmi())
 		nmi_exit();
+	else
+		irq_exit();
 	die(str, regs, err);
 }
 
 /*
- * BOOK3S_64 does not call this handler as a non-maskable interrupt
+ * BOOK3S_64 does not usually call this handler as a non-maskable interrupt
  * (it uses its own early real-mode handler to handle the MCE proper
  * and then raises irq_work to call this handler when interrupts are
- * enabled).
+ * enabled). The only time when this is not true is if the early handler
+ * is unrecoverable, then it does call this directly to try to get a
+ * message out.
  */
-#ifdef CONFIG_PPC_BOOK3S_64
-DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception)
-#else
-DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
-#endif
+static void __machine_check_exception(struct pt_regs *regs)
 {
 	int recover = 0;
 
@@ -847,12 +845,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
 	/* Must die if the interrupt is not recoverable */
 	if (regs_is_unrecoverable(regs))
 		die_mce("Unrecoverable Machine check", regs, SIGBUS);
+}
 
 #ifdef CONFIG_PPC_BOOK3S_64
-	return;
-#else
-	return 0;
+DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async)
+{
+	__machine_check_exception(regs);
+}
 #endif
+DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
+{
+	__machine_check_exception(regs);
+
+	return 0;
 }
 
 DEFINE_INTERRUPT_HANDLER(SMIException) /* async? */
-- 
2.23.0


^ permalink raw reply related

* [PATCH 4/5] powerpc/64/interrupt: Reconcile soft-mask state in NMI and fix false BUG
From: Nicholas Piggin @ 2021-10-04 14:56 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Ganesh Goudar, Nicholas Piggin
In-Reply-To: <20211004145642.1331214-1-npiggin@gmail.com>

If a NMI hits early in an interrupt handler before the irq soft-mask
state is reconciled, that can cause a false-positive BUG with a
CONFIG_PPC_IRQ_SOFT_MASK_DEBUG assertion.

Remove that assertion and instead check the case that if regs->msr has
EE clear, then regs->softe should be marked as disabled so the irq state
looks correct to NMI handlers, the same as how it's fixed up in the
case it was implicit soft-masked.

This doesn't fix a known problem -- the change that was fixed by commit
4ec5feec1ad02 ("powerpc/64s: Make NMI record implicitly soft-masked code
as irqs disabled") was the addition of a warning in the soft-nmi
watchdog interrupt which can never actually fire when MSR[EE]=0. However
it may be important if NMI handlers grow more code, and it's less
surprising to anything using 'regs' - (I tripped over this when working
in the area).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/interrupt.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index 6b800d3e2681..b894b7169706 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -265,13 +265,16 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte
 	local_paca->irq_soft_mask = IRQS_ALL_DISABLED;
 	local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
 
-	if (is_implicit_soft_masked(regs)) {
-		// Adjust regs->softe soft implicit soft-mask, so
-		// arch_irq_disabled_regs(regs) behaves as expected.
+	if (!(regs->msr & MSR_EE) || is_implicit_soft_masked(regs)) {
+		/*
+		 * Adjust regs->softe to be soft-masked if it had not been
+		 * reconciled (e.g., interrupt entry with MSR[EE]=0 but softe
+		 * not yet set disabled), or if it was in an implicit soft
+		 * masked state. This makes arch_irq_disabled_regs(regs)
+		 * behave as expected.
+		 */
 		regs->softe = IRQS_ALL_DISABLED;
 	}
-	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
-		BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));
 
 	/* Don't do any per-CPU operations until interrupt state is fixed */
 
-- 
2.23.0


^ permalink raw reply related

* [PATCH 3/5] powerpc/64: warn if local irqs are enabled in NMI or hardirq context
From: Nicholas Piggin @ 2021-10-04 14:56 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Ganesh Goudar, Nicholas Piggin
In-Reply-To: <20211004145642.1331214-1-npiggin@gmail.com>

This can help catch bugs such as the one fixed by the previous change
to prevent _exception() from enabling irqs.

ppc32 could have a similar warning but it has no good config option to
debug this stuff (the test may be overkill to add for production
kernels).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/irq.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 551b653228c4..c4f1d6b7d992 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -229,6 +229,9 @@ notrace void arch_local_irq_restore(unsigned long mask)
 		return;
 	}
 
+	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+		WARN_ON_ONCE(in_nmi() || in_hardirq());
+
 	/*
 	 * After the stb, interrupts are unmasked and there are no interrupts
 	 * pending replay. The restart sequence makes this atomic with
@@ -321,6 +324,9 @@ notrace void arch_local_irq_restore(unsigned long mask)
 	if (mask)
 		return;
 
+	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
+		WARN_ON_ONCE(in_nmi() || in_hardirq());
+
 	/*
 	 * From this point onward, we can take interrupts, preempt,
 	 * etc... unless we got hard-disabled. We check if an event
-- 
2.23.0


^ permalink raw reply related

* [PATCH 2/5] powerpc/traps: do not enable irqs in _exception
From: Nicholas Piggin @ 2021-10-04 14:56 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Ganesh Goudar, Nicholas Piggin
In-Reply-To: <20211004145642.1331214-1-npiggin@gmail.com>

_exception can be called by machine check handlers when the MCE hits
user code (e.g., pseries and powernv). This will enable local irqs,
which is a dicey thing to do in NMI or hard irq context.

This seemed to work out okay because a userspace MCE can basically be
treated like a synchronous interrupt (after async / imprecise MCEs are
filtered out). Since NMI and hard irq handlers have started growing
nmi_enter / irq_enter, and more irq state sanity checks, this has
started to cause problems (or at least trigger warnings).

The Fixes tag points to the commit which introduced this, rather than
trying to work out exactly which commit was the first that could possibly
cause a problem, because that may be difficult to prove.

Fixes: 9f2f79e3a3c1 ("powerpc: Disable interrupts in 64-bit kernel FP and vector faults")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/traps.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index aac8c0412ff9..e453b666613b 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -340,10 +340,16 @@ static bool exception_common(int signr, struct pt_regs *regs, int code,
 		return false;
 	}
 
-	show_signal_msg(signr, regs, code, addr);
+	/*
+	 * Must not enable interrupts even for user-mode exception, because
+	 * this can be called from machine check, which may be a NMI or IRQ
+	 * which don't like interrupts being enabled. Could check for
+	 * in_hardirq || in_nmi perhaps, but there doesn't seem to be a good
+	 * reason why _exception() should enable irqs for an exception handler,
+	 * the handlers themselves do that directly.
+	 */
 
-	if (arch_irqs_disabled())
-		interrupt_cond_local_irq_enable(regs);
+	show_signal_msg(signr, regs, code, addr);
 
 	current->thread.trap_nr = code;
 
-- 
2.23.0


^ permalink raw reply related

* [PATCH 0/5] powerpc: various interrupt handling fixes
From: Nicholas Piggin @ 2021-10-04 14:56 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Ganesh Goudar, Nicholas Piggin

This fixes a number of bugs found mostly while looking at an MCE handler
issue, which should be fixed in patch 5 of the series. The previous
attempt, which Ganesh found to be wrong, is here:

https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210922020247.209409-1-npiggin@gmail.com/

I didn't increment to patch v2 because it's a different approach (so I
gave it a different title).

Thanks,
Nick

Nicholas Piggin (5):
  powerpc/64s: fix program check interrupt emergency stack path
  powerpc/traps: do not enable irqs in _exception
  powerpc/64: warn if local irqs are enabled in NMI or hardirq context
  powerpc/64/interrupt: Reconcile soft-mask state in NMI and fix false
    BUG
  powerpc/64s: Fix unrecoverable MCE calling async handler from NMI

 arch/powerpc/include/asm/interrupt.h | 18 ++++++------
 arch/powerpc/kernel/exceptions-64s.S | 25 ++++++++++------
 arch/powerpc/kernel/irq.c            |  6 ++++
 arch/powerpc/kernel/traps.c          | 43 +++++++++++++++++-----------
 4 files changed, 59 insertions(+), 33 deletions(-)

-- 
2.23.0


^ permalink raw reply

* [PATCH 1/5] powerpc/64s: fix program check interrupt emergency stack path
From: Nicholas Piggin @ 2021-10-04 14:56 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Ganesh Goudar, Nicholas Piggin
In-Reply-To: <20211004145642.1331214-1-npiggin@gmail.com>

Emergency stack path was jumping into a 3: label inside the
__GEN_COMMON_BODY macro for the normal path after it had finished,
rather than jumping over it. By a small miracle this is the correct
place to build up a new interrupt frame with the existing stack
pointer, so things basically worked okay with an added weird looking
700 trap frame on top (which had the wrong ->nip so it didn't decode
bug messages either).

Fix this by avoiding using numeric labels when jumping over non-trivial
macros.

Before:

 LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
 Modules linked in:
 CPU: 0 PID: 88 Comm: sh Not tainted 5.15.0-rc2-00034-ge057cdade6e5 #2637
 NIP:  7265677368657265 LR: c00000000006c0c8 CTR: c0000000000097f0
 REGS: c0000000fffb3a50 TRAP: 0700   Not tainted
 MSR:  9000000000021031 <SF,HV,ME,IR,DR,LE>  CR: 00000700  XER: 20040000
 CFAR: c0000000000098b0 IRQMASK: 0
 GPR00: c00000000006c964 c0000000fffb3cf0 c000000001513800 0000000000000000
 GPR04: 0000000048ab0778 0000000042000000 0000000000000000 0000000000001299
 GPR08: 000001e447c718ec 0000000022424282 0000000000002710 c00000000006bee8
 GPR12: 9000000000009033 c0000000016b0000 00000000000000b0 0000000000000001
 GPR16: 0000000000000000 0000000000000002 0000000000000000 0000000000000ff8
 GPR20: 0000000000001fff 0000000000000007 0000000000000080 00007fff89d90158
 GPR24: 0000000002000000 0000000002000000 0000000000000255 0000000000000300
 GPR28: c000000001270000 0000000042000000 0000000048ab0778 c000000080647e80
 NIP [7265677368657265] 0x7265677368657265
 LR [c00000000006c0c8] ___do_page_fault+0x3f8/0xb10
 Call Trace:
 [c0000000fffb3cf0] [c00000000000bdac] soft_nmi_common+0x13c/0x1d0 (unreliable)
 --- interrupt: 700 at decrementer_common_virt+0xb8/0x230
 NIP:  c0000000000098b8 LR: c00000000006c0c8 CTR: c0000000000097f0
 REGS: c0000000fffb3d60 TRAP: 0700   Not tainted
 MSR:  9000000000021031 <SF,HV,ME,IR,DR,LE>  CR: 22424282  XER: 20040000
 CFAR: c0000000000098b0 IRQMASK: 0
 GPR00: c00000000006c964 0000000000002400 c000000001513800 0000000000000000
 GPR04: 0000000048ab0778 0000000042000000 0000000000000000 0000000000001299
 GPR08: 000001e447c718ec 0000000022424282 0000000000002710 c00000000006bee8
 GPR12: 9000000000009033 c0000000016b0000 00000000000000b0 0000000000000001
 GPR16: 0000000000000000 0000000000000002 0000000000000000 0000000000000ff8
 GPR20: 0000000000001fff 0000000000000007 0000000000000080 00007fff89d90158
 GPR24: 0000000002000000 0000000002000000 0000000000000255 0000000000000300
 GPR28: c000000001270000 0000000042000000 0000000048ab0778 c000000080647e80
 NIP [c0000000000098b8] decrementer_common_virt+0xb8/0x230
 LR [c00000000006c0c8] ___do_page_fault+0x3f8/0xb10
 --- interrupt: 700
 Instruction dump:
 XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 ---[ end trace 6d28218e0cc3c949 ]---

After:

 ------------[ cut here ]------------
 kernel BUG at arch/powerpc/kernel/exceptions-64s.S:491!
 Oops: Exception in kernel mode, sig: 5 [#1]
 LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
 Modules linked in:
 CPU: 0 PID: 88 Comm: login Not tainted 5.15.0-rc2-00034-ge057cdade6e5-dirty #2638
 NIP:  c0000000000098b8 LR: c00000000006bf04 CTR: c0000000000097f0
 REGS: c0000000fffb3d60 TRAP: 0700   Not tainted
 MSR:  9000000000021031 <SF,HV,ME,IR,DR,LE>  CR: 24482227  XER: 00040000
 CFAR: c0000000000098b0 IRQMASK: 0
 GPR00: c00000000006bf04 0000000000002400 c000000001513800 c000000001271868
 GPR04: 00000000100f0d29 0000000042000000 0000000000000007 0000000000000009
 GPR08: 00000000100f0d29 0000000024482227 0000000000002710 c000000000181b3c
 GPR12: 9000000000009033 c0000000016b0000 00000000100f0d29 c000000005b22f00
 GPR16: 00000000ffff0000 0000000000000001 0000000000000009 00000000100eed90
 GPR20: 00000000100eed90 0000000010000000 000000001000a49c 00000000100f1430
 GPR24: c000000001271868 0000000002000000 0000000000000215 0000000000000300
 GPR28: c000000001271800 0000000042000000 00000000100f0d29 c000000080647860
 NIP [c0000000000098b8] decrementer_common_virt+0xb8/0x230
 LR [c00000000006bf04] ___do_page_fault+0x234/0xb10
 Call Trace:
 Instruction dump:
 4182000c 39400001 48000008 894d0932 714a0001 39400008 408225fc 718a4000
 7c2a0b78 3821fcf0 41c20008 e82d0910 <0981fcf0> f92101a0 f9610170 f9810178
 ---[ end trace a5dbd1f5ea4ccc51 ]---

Fixes: 0a882e28468f4 ("powerpc/64s/exception: remove bad stack branch")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/exceptions-64s.S | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 37859e62a8dc..024d9231f88c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1665,27 +1665,30 @@ EXC_COMMON_BEGIN(program_check_common)
 	 */
 
 	andi.	r10,r12,MSR_PR
-	bne	2f			/* If userspace, go normal path */
+	bne	.Lnormal_stack		/* If userspace, go normal path */
 
 	andis.	r10,r12,(SRR1_PROGTM)@h
-	bne	1f			/* If TM, emergency		*/
+	bne	.Lemergency_stack	/* If TM, emergency		*/
 
 	cmpdi	r1,-INT_FRAME_SIZE	/* check if r1 is in userspace	*/
-	blt	2f			/* normal path if not		*/
+	blt	.Lnormal_stack		/* normal path if not		*/
 
 	/* Use the emergency stack					*/
-1:	andi.	r10,r12,MSR_PR		/* Set CR0 correctly for label	*/
+.Lemergency_stack:
+	andi.	r10,r12,MSR_PR		/* Set CR0 correctly for label	*/
 					/* 3 in EXCEPTION_PROLOG_COMMON	*/
 	mr	r10,r1			/* Save r1			*/
 	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
 	__ISTACK(program_check)=0
 	__GEN_COMMON_BODY program_check
-	b 3f
-2:
+	b .Ldo_program_check
+
+.Lnormal_stack:
 	__ISTACK(program_check)=1
 	__GEN_COMMON_BODY program_check
-3:
+
+.Ldo_program_check:
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	program_check_exception
 	REST_NVGPRS(r1) /* instruction emulation may change GPRs */
-- 
2.23.0


^ permalink raw reply related

* [PATCH v6 10/11] PCI: Replace pci_dev::driver usage by pci_dev::dev.driver
From: Uwe Kleine-König @ 2021-10-04 12:59 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Mark Rutland, Peter Zijlstra, linux-pci, Oliver O'Halloran,
	H. Peter Anvin, Jiri Olsa, Boris Ostrovsky, Stefano Stabellini,
	Mathias Nyman, x86, Alexander Shishkin, Ingo Molnar, xen-devel,
	Andrew Donnellan, Arnd Bergmann, Konrad Rzeszutek Wilk,
	Arnaldo Carvalho de Melo, Borislav Petkov, Bjorn Helgaas,
	Namhyung Kim, Thomas Gleixner, Juergen Gross, Greg Kroah-Hartman,
	linux-usb, linux-perf-users, kernel, Frederic Barrat,
	Paul Mackerras, linuxppc-dev
In-Reply-To: <20211004125935.2300113-1-u.kleine-koenig@pengutronix.de>

struct pci_dev::driver contains (apart from a constant offset) the same
data as struct pci_dev::dev.driver. Replace all remaining users of the
former pointer with the latter to allow removing the former.

Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 arch/powerpc/kernel/eeh_driver.c | 10 ++---
 arch/x86/events/intel/uncore.c   |  2 +-
 arch/x86/kernel/probe_roms.c     | 10 +++--
 drivers/misc/cxl/guest.c         | 24 +++++-----
 drivers/misc/cxl/pci.c           | 30 ++++++++-----
 drivers/pci/iov.c                | 33 ++++++++++----
 drivers/pci/pci-driver.c         | 76 +++++++++++++++++++-------------
 drivers/pci/pci.c                |  4 +-
 drivers/pci/pcie/err.c           | 36 ++++++++-------
 drivers/pci/xen-pcifront.c       |  4 +-
 drivers/usb/host/xhci-pci.c      |  2 +-
 11 files changed, 140 insertions(+), 91 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 3eff6a4888e7..350dab18e137 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -104,13 +104,13 @@ static bool eeh_edev_actionable(struct eeh_dev *edev)
  */
 static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
 {
-	if (!pdev || !pdev->driver)
+	if (!pdev || !pdev->dev.driver)
 		return NULL;
 
-	if (!try_module_get(pdev->driver->driver.owner))
+	if (!try_module_get(pdev->dev.driver->owner))
 		return NULL;
 
-	return pdev->driver;
+	return to_pci_driver(pdev->dev.driver);
 }
 
 /**
@@ -122,10 +122,10 @@ static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
  */
 static inline void eeh_pcid_put(struct pci_dev *pdev)
 {
-	if (!pdev || !pdev->driver)
+	if (!pdev || !pdev->dev.driver)
 		return;
 
-	module_put(pdev->driver->driver.owner);
+	module_put(pdev->dev.driver->owner);
 }
 
 /**
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index c72e368dd164..f1ba6ab2e97e 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1187,7 +1187,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
 	 * PCI slot and func to indicate the uncore box.
 	 */
 	if (id->driver_data & ~0xffff) {
-		struct pci_driver *pci_drv = pdev->driver;
+		struct pci_driver *pci_drv = to_pci_driver(pdev->dev.driver);
 
 		pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table);
 		if (pmu == NULL)
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 9e1def3744f2..deaaef6efe34 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -80,15 +80,17 @@ static struct resource video_rom_resource = {
  */
 static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
 {
-	struct pci_driver *drv = pdev->driver;
 	const struct pci_device_id *id;
 
 	if (pdev->vendor == vendor && pdev->device == device)
 		return true;
 
-	for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
-		if (id->vendor == vendor && id->device == device)
-			break;
+	if (pdev->dev.driver) {
+		struct pci_driver *drv = to_pci_driver(pdev->dev.driver);
+		for (id = drv->id_table; id && id->vendor; id++)
+			if (id->vendor == vendor && id->device == device)
+				break;
+	}
 
 	return id && id->vendor;
 }
diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
index 186308f1f8eb..d997c9c3ebb5 100644
--- a/drivers/misc/cxl/guest.c
+++ b/drivers/misc/cxl/guest.c
@@ -25,28 +25,32 @@ static void pci_error_handlers(struct cxl_afu *afu,
 		return;
 
 	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
-		if (!afu_dev->driver)
+		struct pci_driver *afu_drv;
+
+		if (!afu_dev->dev.driver)
 			continue;
 
+		afu_drv = to_pci_driver(afu_dev->dev.driver);
+
 		switch (bus_error_event) {
 		case CXL_ERROR_DETECTED_EVENT:
 			afu_dev->error_state = state;
 
-			if (afu_dev->driver->err_handler &&
-			    afu_dev->driver->err_handler->error_detected)
-				afu_dev->driver->err_handler->error_detected(afu_dev, state);
+			if (afu_drv->err_handler &&
+			    afu_drv->err_handler->error_detected)
+				afu_drv->err_handler->error_detected(afu_dev, state);
 		break;
 		case CXL_SLOT_RESET_EVENT:
 			afu_dev->error_state = state;
 
-			if (afu_dev->driver->err_handler &&
-			    afu_dev->driver->err_handler->slot_reset)
-				afu_dev->driver->err_handler->slot_reset(afu_dev);
+			if (afu_drv->err_handler &&
+			    afu_drv->err_handler->slot_reset)
+				afu_drv->err_handler->slot_reset(afu_dev);
 		break;
 		case CXL_RESUME_EVENT:
-			if (afu_dev->driver->err_handler &&
-			    afu_dev->driver->err_handler->resume)
-				afu_dev->driver->err_handler->resume(afu_dev);
+			if (afu_drv->err_handler &&
+			    afu_drv->err_handler->resume)
+				afu_drv->err_handler->resume(afu_dev);
 		break;
 		}
 	}
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 2ba899f5659f..7e7545d01e27 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1805,14 +1805,16 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
 		return result;
 
 	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
-		if (!afu_dev->driver)
+		struct pci_driver *afu_drv;
+		if (!afu_dev->dev.driver)
 			continue;
 
+		afu_drv = to_pci_driver(afu_dev->dev.driver);
+
 		afu_dev->error_state = state;
 
-		if (afu_dev->driver->err_handler)
-			afu_result = afu_dev->driver->err_handler->error_detected(afu_dev,
-										  state);
+		if (afu_drv->err_handler)
+			afu_result = afu_drv->err_handler->error_detected(afu_dev, state);
 		/* Disconnect trumps all, NONE trumps NEED_RESET */
 		if (afu_result == PCI_ERS_RESULT_DISCONNECT)
 			result = PCI_ERS_RESULT_DISCONNECT;
@@ -2003,6 +2005,8 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 			continue;
 
 		list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
+			struct pci_driver *afu_drv;
+
 			/* Reset the device context.
 			 * TODO: make this less disruptive
 			 */
@@ -2028,12 +2032,14 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 			 * shouldn't start new work until we call
 			 * their resume function.
 			 */
-			if (!afu_dev->driver)
+			if (!afu_dev->dev.driver)
 				continue;
 
-			if (afu_dev->driver->err_handler &&
-			    afu_dev->driver->err_handler->slot_reset)
-				afu_result = afu_dev->driver->err_handler->slot_reset(afu_dev);
+			afu_drv = to_pci_driver(afu_dev->dev.driver);
+
+			if (afu_drv->err_handler &&
+			    afu_drv->err_handler->slot_reset)
+				afu_result = afu_drv->err_handler->slot_reset(afu_dev);
 
 			if (afu_result == PCI_ERS_RESULT_DISCONNECT)
 				result = PCI_ERS_RESULT_DISCONNECT;
@@ -2074,9 +2080,11 @@ static void cxl_pci_resume(struct pci_dev *pdev)
 			continue;
 
 		list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
-			if (afu_dev->driver && afu_dev->driver->err_handler &&
-			    afu_dev->driver->err_handler->resume)
-				afu_dev->driver->err_handler->resume(afu_dev);
+			struct pci_driver *afu_drv;
+			if (afu_dev->dev.driver &&
+			    (afu_drv = to_pci_driver(afu_dev->dev.driver))->err_handler &&
+			    afu_drv->err_handler->resume)
+				afu_drv->err_handler->resume(afu_dev);
 		}
 	}
 	spin_unlock(&adapter->afu_list_lock);
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index dafdc652fcd0..0d0a34347868 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -164,13 +164,18 @@ static ssize_t sriov_vf_total_msix_show(struct device *dev,
 					char *buf)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_driver *pdrv;
 	u32 vf_total_msix = 0;
 
 	device_lock(dev);
-	if (!pdev->driver || !pdev->driver->sriov_get_vf_total_msix)
+	if (!dev->driver)
 		goto unlock;
 
-	vf_total_msix = pdev->driver->sriov_get_vf_total_msix(pdev);
+	pdrv = to_pci_driver(dev->driver);
+	if (!pdrv->sriov_get_vf_total_msix)
+		goto unlock;
+
+	vf_total_msix = pdrv->sriov_get_vf_total_msix(pdev);
 unlock:
 	device_unlock(dev);
 	return sysfs_emit(buf, "%u\n", vf_total_msix);
@@ -183,6 +188,7 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev,
 {
 	struct pci_dev *vf_dev = to_pci_dev(dev);
 	struct pci_dev *pdev = pci_physfn(vf_dev);
+	struct pci_driver *pdrv;
 	int val, ret;
 
 	ret = kstrtoint(buf, 0, &val);
@@ -193,13 +199,19 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev,
 		return -EINVAL;
 
 	device_lock(&pdev->dev);
-	if (!pdev->driver || !pdev->driver->sriov_set_msix_vec_count) {
+	if (!pdev->dev.driver) {
+		ret = -EOPNOTSUPP;
+		goto err_pdev;
+	}
+
+	pdrv = to_pci_driver(pdev->dev.driver);
+	if (!pdrv->sriov_set_msix_vec_count) {
 		ret = -EOPNOTSUPP;
 		goto err_pdev;
 	}
 
 	device_lock(&vf_dev->dev);
-	if (vf_dev->driver) {
+	if (vf_dev->dev.driver) {
 		/*
 		 * A driver is already attached to this VF and has configured
 		 * itself based on the current MSI-X vector count. Changing
@@ -209,7 +221,7 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev,
 		goto err_dev;
 	}
 
-	ret = pdev->driver->sriov_set_msix_vec_count(vf_dev, val);
+	ret = pdrv->sriov_set_msix_vec_count(vf_dev, val);
 
 err_dev:
 	device_unlock(&vf_dev->dev);
@@ -376,6 +388,7 @@ static ssize_t sriov_numvfs_store(struct device *dev,
 				  const char *buf, size_t count)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
+	struct pci_driver *pdrv;
 	int ret;
 	u16 num_vfs;
 
@@ -392,14 +405,16 @@ static ssize_t sriov_numvfs_store(struct device *dev,
 		goto exit;
 
 	/* is PF driver loaded */
-	if (!pdev->driver) {
+	if (!pdev->dev.driver) {
 		pci_info(pdev, "no driver bound to device; cannot configure SR-IOV\n");
 		ret = -ENOENT;
 		goto exit;
 	}
 
+	pdrv = to_pci_driver(pdev->dev.driver);
+
 	/* is PF driver loaded w/callback */
-	if (!pdev->driver->sriov_configure) {
+	if (!pdrv->sriov_configure) {
 		pci_info(pdev, "driver does not support SR-IOV configuration via sysfs\n");
 		ret = -ENOENT;
 		goto exit;
@@ -407,7 +422,7 @@ static ssize_t sriov_numvfs_store(struct device *dev,
 
 	if (num_vfs == 0) {
 		/* disable VFs */
-		ret = pdev->driver->sriov_configure(pdev, 0);
+		ret = pdrv->sriov_configure(pdev, 0);
 		goto exit;
 	}
 
@@ -419,7 +434,7 @@ static ssize_t sriov_numvfs_store(struct device *dev,
 		goto exit;
 	}
 
-	ret = pdev->driver->sriov_configure(pdev, num_vfs);
+	ret = pdrv->sriov_configure(pdev, num_vfs);
 	if (ret < 0)
 		goto exit;
 
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 50449ec622a3..8654fe70cd66 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -457,7 +457,7 @@ static int pci_device_probe(struct device *dev)
 static void pci_device_remove(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct pci_driver *drv = pci_dev->driver;
+	struct pci_driver *drv = to_pci_driver(pci_dev->dev.driver);
 
 	if (drv->remove) {
 		pm_runtime_get_sync(dev);
@@ -493,12 +493,15 @@ static void pci_device_remove(struct device *dev)
 static void pci_device_shutdown(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct pci_driver *drv = pci_dev->driver;
 
 	pm_runtime_resume(dev);
 
-	if (drv && drv->shutdown)
-		drv->shutdown(pci_dev);
+	if (pci_dev->dev.driver) {
+		struct pci_driver *drv = to_pci_driver(pci_dev->dev.driver);
+
+		if (drv->shutdown)
+			drv->shutdown(pci_dev);
+	}
 
 	/*
 	 * If this is a kexec reboot, turn off Bus Master bit on the
@@ -589,22 +592,25 @@ static int pci_pm_reenable_device(struct pci_dev *pci_dev)
 static int pci_legacy_suspend(struct device *dev, pm_message_t state)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct pci_driver *drv = pci_dev->driver;
 
-	if (drv && drv->suspend) {
-		pci_power_t prev = pci_dev->current_state;
-		int error;
+	if (dev->driver) {
+		struct pci_driver *drv = to_pci_driver(dev->driver);
 
-		error = drv->suspend(pci_dev, state);
-		suspend_report_result(drv->suspend, error);
-		if (error)
-			return error;
+		if (drv->suspend) {
+			pci_power_t prev = pci_dev->current_state;
+			int error;
 
-		if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0
-		    && pci_dev->current_state != PCI_UNKNOWN) {
-			pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev,
-				      "PCI PM: Device state not saved by %pS\n",
-				      drv->suspend);
+			error = drv->suspend(pci_dev, state);
+			suspend_report_result(drv->suspend, error);
+			if (error)
+				return error;
+
+			if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0
+			    && pci_dev->current_state != PCI_UNKNOWN) {
+				pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev,
+					      "PCI PM: Device state not saved by %pS\n",
+					      drv->suspend);
+			}
 		}
 	}
 
@@ -630,12 +636,17 @@ static int pci_legacy_suspend_late(struct device *dev, pm_message_t state)
 static int pci_legacy_resume(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
-	struct pci_driver *drv = pci_dev->driver;
 
 	pci_fixup_device(pci_fixup_resume, pci_dev);
 
-	return drv && drv->resume ?
-			drv->resume(pci_dev) : pci_pm_reenable_device(pci_dev);
+	if (pci_dev->dev.driver) {
+		struct pci_driver *drv = to_pci_driver(pci_dev->dev.driver);
+
+		if (drv->resume)
+			return drv->resume(pci_dev);
+	}
+
+	return pci_pm_reenable_device(pci_dev);
 }
 
 /* Auxiliary functions used by the new power management framework */
@@ -649,8 +660,14 @@ static void pci_pm_default_suspend(struct pci_dev *pci_dev)
 
 static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev)
 {
-	struct pci_driver *drv = pci_dev->driver;
-	bool ret = drv && (drv->suspend || drv->resume);
+	struct pci_driver *drv;
+	bool ret;
+
+	if (!pci_dev->dev.driver)
+		return false;
+
+	drv = to_pci_driver(pci_dev->dev.driver);
+	ret = drv && (drv->suspend || drv->resume);
 
 	/*
 	 * Legacy PM support is used by default, so warn if the new framework is
@@ -1242,11 +1259,11 @@ static int pci_pm_runtime_suspend(struct device *dev)
 	int error;
 
 	/*
-	 * If pci_dev->driver is not set (unbound), we leave the device in D0,
+	 * If pci_dev->dev.driver is not set (unbound), we leave the device in D0,
 	 * but it may go to D3cold when the bridge above it runtime suspends.
 	 * Save its config space in case that happens.
 	 */
-	if (!pci_dev->driver) {
+	if (!pci_dev->dev.driver) {
 		pci_save_state(pci_dev);
 		return 0;
 	}
@@ -1303,7 +1320,7 @@ static int pci_pm_runtime_resume(struct device *dev)
 	 */
 	pci_restore_standard_config(pci_dev);
 
-	if (!pci_dev->driver)
+	if (!dev->driver)
 		return 0;
 
 	pci_fixup_device(pci_fixup_resume_early, pci_dev);
@@ -1322,14 +1339,13 @@ static int pci_pm_runtime_resume(struct device *dev)
 
 static int pci_pm_runtime_idle(struct device *dev)
 {
-	struct pci_dev *pci_dev = to_pci_dev(dev);
 	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
 	/*
-	 * If pci_dev->driver is not set (unbound), the device should
+	 * If dev->driver is not set (unbound), the device should
 	 * always remain in D0 regardless of the runtime PM status
 	 */
-	if (!pci_dev->driver)
+	if (!dev->driver)
 		return 0;
 
 	if (!pm)
@@ -1436,8 +1452,8 @@ static struct pci_driver pci_compat_driver = {
  */
 struct pci_driver *pci_dev_driver(const struct pci_dev *dev)
 {
-	if (dev->driver)
-		return dev->driver;
+	if (dev->dev.driver)
+		return to_pci_driver(dev->dev.driver);
 	else {
 		int i;
 		for (i = 0; i <= PCI_ROM_RESOURCE; i++)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index ce2ab62b64cf..ccecf740de59 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5089,7 +5089,7 @@ EXPORT_SYMBOL_GPL(pci_dev_unlock);
 static void pci_dev_save_and_disable(struct pci_dev *dev)
 {
 	const struct pci_error_handlers *err_handler =
-			dev->driver ? dev->driver->err_handler : NULL;
+			dev->dev.driver ? to_pci_driver(dev->dev.driver)->err_handler : NULL;
 
 	/*
 	 * dev->driver->err_handler->reset_prepare() is protected against
@@ -5120,7 +5120,7 @@ static void pci_dev_save_and_disable(struct pci_dev *dev)
 static void pci_dev_restore(struct pci_dev *dev)
 {
 	const struct pci_error_handlers *err_handler =
-			dev->driver ? dev->driver->err_handler : NULL;
+			dev->dev.driver ? to_pci_driver(dev->dev.driver)->err_handler : NULL;
 
 	pci_restore_state(dev);
 
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index b576aa890c76..b314b54f7821 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -49,14 +49,15 @@ static int report_error_detected(struct pci_dev *dev,
 				 pci_channel_state_t state,
 				 enum pci_ers_result *result)
 {
+	struct pci_driver *pdrv;
 	pci_ers_result_t vote;
 	const struct pci_error_handlers *err_handler;
 
 	device_lock(&dev->dev);
 	if (!pci_dev_set_io_state(dev, state) ||
-		!dev->driver ||
-		!dev->driver->err_handler ||
-		!dev->driver->err_handler->error_detected) {
+		!dev->dev.driver ||
+		!(pdrv = to_pci_driver(dev->dev.driver))->err_handler ||
+		!pdrv->err_handler->error_detected) {
 		/*
 		 * If any device in the subtree does not have an error_detected
 		 * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent
@@ -70,7 +71,7 @@ static int report_error_detected(struct pci_dev *dev,
 			vote = PCI_ERS_RESULT_NONE;
 		}
 	} else {
-		err_handler = dev->driver->err_handler;
+		err_handler = pdrv->err_handler;
 		vote = err_handler->error_detected(dev, state);
 	}
 	pci_uevent_ers(dev, vote);
@@ -92,15 +93,16 @@ static int report_normal_detected(struct pci_dev *dev, void *data)
 static int report_mmio_enabled(struct pci_dev *dev, void *data)
 {
 	pci_ers_result_t vote, *result = data;
+	struct pci_driver *pdrv;
 	const struct pci_error_handlers *err_handler;
 
 	device_lock(&dev->dev);
-	if (!dev->driver ||
-		!dev->driver->err_handler ||
-		!dev->driver->err_handler->mmio_enabled)
+	if (!dev->dev.driver ||
+		!(pdrv = to_pci_driver(dev->dev.driver))->err_handler ||
+		!pdrv->err_handler->mmio_enabled)
 		goto out;
 
-	err_handler = dev->driver->err_handler;
+	err_handler = pdrv->err_handler;
 	vote = err_handler->mmio_enabled(dev);
 	*result = merge_result(*result, vote);
 out:
@@ -112,14 +114,15 @@ static int report_slot_reset(struct pci_dev *dev, void *data)
 {
 	pci_ers_result_t vote, *result = data;
 	const struct pci_error_handlers *err_handler;
+	struct pci_driver *pdrv;
 
 	device_lock(&dev->dev);
-	if (!dev->driver ||
-		!dev->driver->err_handler ||
-		!dev->driver->err_handler->slot_reset)
+	if (!dev->dev.driver ||
+		!(pdrv = to_pci_driver(dev->dev.driver))->err_handler ||
+		!pdrv->err_handler->slot_reset)
 		goto out;
 
-	err_handler = dev->driver->err_handler;
+	err_handler = pdrv->err_handler;
 	vote = err_handler->slot_reset(dev);
 	*result = merge_result(*result, vote);
 out:
@@ -130,15 +133,16 @@ static int report_slot_reset(struct pci_dev *dev, void *data)
 static int report_resume(struct pci_dev *dev, void *data)
 {
 	const struct pci_error_handlers *err_handler;
+	struct pci_driver *pdrv;
 
 	device_lock(&dev->dev);
 	if (!pci_dev_set_io_state(dev, pci_channel_io_normal) ||
-		!dev->driver ||
-		!dev->driver->err_handler ||
-		!dev->driver->err_handler->resume)
+		!dev->dev.driver ||
+		!(pdrv = to_pci_driver(dev->dev.driver))->err_handler ||
+		!pdrv->err_handler->resume)
 		goto out;
 
-	err_handler = dev->driver->err_handler;
+	err_handler = pdrv->err_handler;
 	err_handler->resume(dev);
 out:
 	pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index f2d7f70a7a10..73831fb87a1e 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -601,12 +601,12 @@ static pci_ers_result_t pcifront_common_process(int cmd,
 	result = PCI_ERS_RESULT_NONE;
 
 	pcidev = pci_get_domain_bus_and_slot(domain, bus, devfn);
-	if (!pcidev || !pcidev->driver) {
+	if (!pcidev || !pcidev->dev.driver) {
 		dev_err(&pdev->xdev->dev, "device or AER driver is NULL\n");
 		pci_dev_put(pcidev);
 		return result;
 	}
-	pdrv = pcidev->driver;
+	pdrv = to_pci_driver(pcidev->dev.driver);
 
 	if (pdrv->err_handler && pdrv->err_handler->error_detected) {
 		pci_dbg(pcidev, "trying to call AER service\n");
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 2c9f25ca8edd..2f4729f4f1e0 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -103,7 +103,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
 	struct xhci_driver_data         *driver_data;
 	const struct pci_device_id      *id;
 
-	id = pci_match_id(pdev->driver->id_table, pdev);
+	id = pci_match_id(to_pci_driver(pdev->dev.driver)->id_table, pdev);
 
 	if (id && id->driver_data) {
 		driver_data = (struct xhci_driver_data *)id->driver_data;
-- 
2.30.2


^ permalink raw reply related

* [PATCH v6 07/11] PCI: Replace pci_dev::driver usage that gets the driver name
From: Uwe Kleine-König @ 2021-10-04 12:59 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: linux-pci, Alexander Duyck, oss-drivers, Paul Mackerras,
	Herbert Xu, Rafał Miłecki, Jesse Brandeburg,
	Ido Schimmel, Jakub Kicinski, Yisen Zhuang, Vadym Kochan,
	Michael Buesch, Jiri Pirko, Salil Mehta, netdev, linux-wireless,
	linux-kernel, Taras Chornyi, Zhou Wang, linux-crypto, kernel,
	Simon Horman, Oliver O'Halloran, linuxppc-dev,
	David S. Miller
In-Reply-To: <20211004125935.2300113-1-u.kleine-koenig@pengutronix.de>

struct pci_dev::driver holds (apart from a constant offset) the same
data as struct pci_dev::dev.driver. With the goal of removing struct
pci_dev::driver to get rid of data duplication, replace getting the
driver name with dev_driver_string(), which implicitly makes use of
struct pci_dev::dev.driver.

Acked-by: Simon Horman <simon.horman@corigine.com> (for NFP)
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/crypto/hisilicon/qm.c                        | 2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c   | 2 +-
 drivers/net/ethernet/marvell/prestera/prestera_pci.c | 2 +-
 drivers/net/ethernet/mellanox/mlxsw/pci.c            | 2 +-
 drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 3 ++-
 5 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 369562d34d66..8f361e54e524 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -3085,7 +3085,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
 	};
 	int ret;
 
-	ret = strscpy(interface.name, pdev->driver->name,
+	ret = strscpy(interface.name, dev_driver_string(&pdev->dev),
 		      sizeof(interface.name));
 	if (ret < 0)
 		return -ENAMETOOLONG;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 7ea511d59e91..f279edfce3f1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -606,7 +606,7 @@ static void hns3_get_drvinfo(struct net_device *netdev,
 		return;
 	}
 
-	strncpy(drvinfo->driver, h->pdev->driver->name,
+	strncpy(drvinfo->driver, dev_driver_string(&h->pdev->dev),
 		sizeof(drvinfo->driver));
 	drvinfo->driver[sizeof(drvinfo->driver) - 1] = '\0';
 
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_pci.c b/drivers/net/ethernet/marvell/prestera/prestera_pci.c
index a250d394da38..a8f007f6dad2 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_pci.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_pci.c
@@ -720,7 +720,7 @@ static int prestera_fw_load(struct prestera_fw *fw)
 static int prestera_pci_probe(struct pci_dev *pdev,
 			      const struct pci_device_id *id)
 {
-	const char *driver_name = pdev->driver->name;
+	const char *driver_name = dev_driver_string(&pdev->dev);
 	struct prestera_fw *fw;
 	int err;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 13b0259f7ea6..8f306364f7bf 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1876,7 +1876,7 @@ static void mlxsw_pci_cmd_fini(struct mlxsw_pci *mlxsw_pci)
 
 static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
-	const char *driver_name = pdev->driver->name;
+	const char *driver_name = dev_driver_string(&pdev->dev);
 	struct mlxsw_pci *mlxsw_pci;
 	int err;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 0685ece1f155..1de076f55740 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -202,7 +202,8 @@ nfp_get_drvinfo(struct nfp_app *app, struct pci_dev *pdev,
 {
 	char nsp_version[ETHTOOL_FWVERS_LEN] = {};
 
-	strlcpy(drvinfo->driver, pdev->driver->name, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->driver, dev_driver_string(&pdev->dev),
+		sizeof(drvinfo->driver));
 	nfp_net_get_nspinfo(app, nsp_version);
 	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
 		 "%s %s %s %s", vnic_version, nsp_version,
-- 
2.30.2


^ permalink raw reply related

* [PATCH v6 05/11] powerpc/eeh: Don't use driver member of struct pci_dev and further cleanups
From: Uwe Kleine-König @ 2021-10-04 12:59 UTC (permalink / raw)
  To: Bjorn Helgaas, Michael Ellerman
  Cc: linux-pci, Oliver O'Halloran, kernel, Paul Mackerras,
	linuxppc-dev
In-Reply-To: <20211004125935.2300113-1-u.kleine-koenig@pengutronix.de>

The driver member of struct pci_dev is to be removed as it duplicates
information already tracked by the driver core. So replace
pdev->driver->name with dev_driver_string() for the corresponding struct
device.

Also move the function nearer to its only user and, instead of the ?:
operator, use a normal if, which is more readable.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 arch/powerpc/include/asm/ppc-pci.h | 5 -----
 arch/powerpc/kernel/eeh.c          | 8 ++++++++
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 2b9edbf6e929..f6cf0159024e 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -55,11 +55,6 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode);
 void eeh_sysfs_add_device(struct pci_dev *pdev);
 void eeh_sysfs_remove_device(struct pci_dev *pdev);
 
-static inline const char *eeh_driver_name(struct pci_dev *pdev)
-{
-	return (pdev && pdev->driver) ? pdev->driver->name : "<null>";
-}
-
 #endif /* CONFIG_EEH */
 
 #define PCI_BUSNO(bdfn) ((bdfn >> 8) & 0xff)
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index e9b597ed423c..4b08881c4a1e 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -399,6 +399,14 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
 	return ret;
 }
 
+static inline const char *eeh_driver_name(struct pci_dev *pdev)
+{
+	if (pdev)
+		return dev_driver_string(&pdev->dev);
+
+	return "<null>";
+}
+
 /**
  * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
  * @edev: eeh device
-- 
2.30.2


^ permalink raw reply related

* Re: Add Apple M1 support to PASemi i2c driver
From: Arnd Bergmann @ 2021-10-04 11:20 UTC (permalink / raw)
  To: Wolfram Sang, Christian Zigotzky, Sven Peter, Michael Ellerman,
	Benjamin Herrenschmidt, Paul Mackerras, Olof Johansson,
	Arnd Bergmann, Hector Martin, Mohamed Mediouni, Stan Skowronek,
	Mark Kettenis, Linux ARM, Alyssa Rosenzweig, linuxppc-dev,
	Linux I2C, Linux Kernel Mailing List, R.T.Dickinson,
	Darren Stevens, Matthew Leaman, R.T.Dickinson
In-Reply-To: <YVrPf4yVFm184LEG@shikoro>

On Mon, Oct 4, 2021 at 11:55 AM Wolfram Sang <wsa@kernel.org> wrote:
>
>
> > i2c-8 i2c             PA Semi SMBus adapter at 0x(____ptrval____)     I2C adapter
> > i2c-9 i2c             PA Semi SMBus adapter at 0x(____ptrval____)     I2C adapter
> > i2c-10        i2c             PA Semi SMBus adapter at 0x(____ptrval____)     I2C adapter
>
> As Sven correctly switched from %lx to %p, this is intended behaviour.
> Run 'i2cdetect' as root to see the values again.

I think the address could just get removed here, as this is clearly not helpful.
It used to be the port number, which is somewhat useful for identifying the
device; now it's either the pointless string or the virtual address that the
device is mapped to, which is not helpful either and potentially leaks
information about kernel internal structures.

       Arnd

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox