public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH x86/nmi 1/2] x86/nmi: Accumulate NMI-progress evidence in exc_nmi()
@ 2023-01-05  1:15 Paul E. McKenney
  2023-01-05  1:15 ` [PATCH x86/nmi 2/2] x86/nmi: Print reasons why backtrace NMIs are ignored Paul E. McKenney
  2023-01-09 16:19 ` [PATCH x86/nmi 1/2] x86/nmi: Accumulate NMI-progress evidence in exc_nmi() Peter Zijlstra
  0 siblings, 2 replies; 6+ messages in thread
From: Paul E. McKenney @ 2023-01-05  1:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: kernel-team, Paul E. McKenney, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin, x86

CPUs ignoring NMIs is often a sign of those CPUs going bad, but there
are quite a few other reasons why a CPU might ignore NMIs.  Therefore,
accumulate evidence within exc_nmi() as to what might be preventing a
given CPU from responding to an NMI.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
---
 arch/x86/kernel/nmi.c | 31 ++++++++++++++++++++++++++++++-
 lib/Kconfig.debug     | 11 +++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index cec0bfa3bc04f..4f1651dc65b3a 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -69,6 +69,15 @@ struct nmi_stats {
 	unsigned int unknown;
 	unsigned int external;
 	unsigned int swallow;
+	unsigned long recv_jiffies;
+	unsigned long idt_seq;
+	unsigned long idt_nmi_seq;
+	unsigned long idt_ignored;
+	atomic_long_t idt_calls;
+	unsigned long idt_seq_snap;
+	unsigned long idt_nmi_seq_snap;
+	unsigned long idt_ignored_snap;
+	long idt_calls_snap;
 };
 
 static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
@@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
 DEFINE_IDTENTRY_RAW(exc_nmi)
 {
 	irqentry_state_t irq_state;
+	struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
 
 	/*
 	 * Re-enable NMIs right here when running as an SEV-ES guest. This might
 	 * cause nested NMIs, but those can be handled safely.
 	 */
 	sev_es_nmi_complete();
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
+		arch_atomic_long_inc(&nsp->idt_calls);
 
 	if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
 		return;
@@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 	}
 	this_cpu_write(nmi_state, NMI_EXECUTING);
 	this_cpu_write(nmi_cr2, read_cr2());
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+		WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+		WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
+		WRITE_ONCE(nsp->recv_jiffies, jiffies);
+	}
 nmi_restart:
 
 	/*
@@ -509,8 +526,15 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 
 	inc_irq_stat(__nmi_count);
 
-	if (!ignore_nmis)
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
+		WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
+	} else if (!ignore_nmis) {
+		WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+		WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
 		default_do_nmi(regs);
+		WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+		WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
+	}
 
 	irqentry_nmi_exit(regs, irq_state);
 
@@ -525,6 +549,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 
 	if (user_mode(regs))
 		mds_user_clear_cpu_buffers();
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+		WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+		WARN_ON_ONCE(nsp->idt_seq & 0x1);
+		WRITE_ONCE(nsp->recv_jiffies, jiffies);
+	}
 }
 
 #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 881c3f84e88a3..dad99197d2695 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1552,6 +1552,17 @@ config TRACE_IRQFLAGS_NMI
 	depends on TRACE_IRQFLAGS
 	depends on TRACE_IRQFLAGS_NMI_SUPPORT
 
+config NMI_CHECK_CPU
+	bool "Debugging for CPUs failing to respond to backtrace requests"
+	depends on DEBUG_KERNEL
+	depends on X86
+	default n
+	help
+	  Enables debug prints when a CPU fails to respond to a given
+	  backtrace NMI.  These prints provide some reasons why a CPU
+	  might legitimately be failing to respond, for example, if it
+	  is offline or if ignore_nmis is set.
+
 config DEBUG_IRQFLAGS
 	bool "Debug IRQ flag manipulation"
 	help
-- 
2.31.1.189.g2e36527f23


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-01-09 17:06 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-05  1:15 [PATCH x86/nmi 1/2] x86/nmi: Accumulate NMI-progress evidence in exc_nmi() Paul E. McKenney
2023-01-05  1:15 ` [PATCH x86/nmi 2/2] x86/nmi: Print reasons why backtrace NMIs are ignored Paul E. McKenney
2023-01-05 10:40   ` Ingo Molnar
2023-01-05 19:24     ` Paul E. McKenney
2023-01-09 16:19 ` [PATCH x86/nmi 1/2] x86/nmi: Accumulate NMI-progress evidence in exc_nmi() Peter Zijlstra
2023-01-09 16:59   ` Paul E. McKenney

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox