* [PATCH 1/4] firmware: arm_sdei: add SDEI_EVENT_SIGNAL support
2026-06-03 14:36 [PATCH 0/4] arm64: cross-CPU NMI via SDEI Kiryl Shutsemau
@ 2026-06-03 14:36 ` Kiryl Shutsemau
2026-06-03 14:36 ` [PATCH 2/4] drivers/firmware: add SDEI cross-CPU NMI service for arm64 Kiryl Shutsemau
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Kiryl Shutsemau @ 2026-06-03 14:36 UTC (permalink / raw)
To: Catalin Marinas, Will Deacon, James Morse
Cc: Mark Rutland, Marc Zyngier, Doug Anderson, Petr Mladek,
Thomas Gleixner, Andrew Morton, Baoquan He, Puranjay Mohan,
Usama Arif, Breno Leitao, Julien Thierry, Lecopzer Chen,
Sumit Garg, kernel-team, kexec, linux-arm-kernel, linux-kernel,
Kiryl Shutsemau (Meta)
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Add sdei_event_signal(), a thin wrapper over the SDEI_EVENT_SIGNAL call
(DEN0054) that makes the software-signalled event (event 0) pending on a
target PE -- delivered NMI-like even when that PE has interrupts masked.
It takes no locks, so it is safe to call from NMI / crash context.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
---
drivers/firmware/arm_sdei.c | 12 ++++++++++++
include/linux/arm_sdei.h | 6 ++++++
include/uapi/linux/arm_sdei.h | 1 +
3 files changed, 19 insertions(+)
diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index f39ed7ba3a38..e3fd604d9894 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -339,6 +339,18 @@ static void _ipi_unmask_cpu(void *ignored)
sdei_unmask_local_cpu();
}
+/*
+ * Signal @event_num (event 0 is the software-signalled event) to the PE
+ * identified by @mpidr. The wrapper only forwards to invoke_sdei_fn() -- no
+ * locks, no event lookup here -- so NMI / crash-context callers can use it.
+ */
+int sdei_event_signal(u32 event_num, u64 mpidr)
+{
+ return invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_SIGNAL, event_num,
+ mpidr, 0, 0, 0, NULL);
+}
+NOKPROBE_SYMBOL(sdei_event_signal);
+
static void _ipi_private_reset(void *ignored)
{
int err;
diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h
index f652a5028b59..3f3ec01155e8 100644
--- a/include/linux/arm_sdei.h
+++ b/include/linux/arm_sdei.h
@@ -37,6 +37,12 @@ int sdei_event_unregister(u32 event_num);
int sdei_event_enable(u32 event_num);
int sdei_event_disable(u32 event_num);
+/*
+ * Signal @event_num (the software-signalled event is event 0) to another PE,
+ * NMI-like. @mpidr is the target's MPIDR affinity.
+ */
+int sdei_event_signal(u32 event_num, u64 mpidr);
+
/* GHES register/unregister helpers */
int sdei_register_ghes(struct ghes *ghes, sdei_event_callback *normal_cb,
sdei_event_callback *critical_cb);
diff --git a/include/uapi/linux/arm_sdei.h b/include/uapi/linux/arm_sdei.h
index af0630ba5437..22eb61612673 100644
--- a/include/uapi/linux/arm_sdei.h
+++ b/include/uapi/linux/arm_sdei.h
@@ -22,6 +22,7 @@
#define SDEI_1_0_FN_SDEI_PE_UNMASK SDEI_1_0_FN(0x0C)
#define SDEI_1_0_FN_SDEI_INTERRUPT_BIND SDEI_1_0_FN(0x0D)
#define SDEI_1_0_FN_SDEI_INTERRUPT_RELEASE SDEI_1_0_FN(0x0E)
+#define SDEI_1_0_FN_SDEI_EVENT_SIGNAL SDEI_1_0_FN(0x0F)
#define SDEI_1_0_FN_SDEI_PRIVATE_RESET SDEI_1_0_FN(0x11)
#define SDEI_1_0_FN_SDEI_SHARED_RESET SDEI_1_0_FN(0x12)
--
2.54.0
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH 2/4] drivers/firmware: add SDEI cross-CPU NMI service for arm64
2026-06-03 14:36 [PATCH 0/4] arm64: cross-CPU NMI via SDEI Kiryl Shutsemau
2026-06-03 14:36 ` [PATCH 1/4] firmware: arm_sdei: add SDEI_EVENT_SIGNAL support Kiryl Shutsemau
@ 2026-06-03 14:36 ` Kiryl Shutsemau
2026-06-03 14:36 ` [PATCH 3/4] arm64: wire SDEI NMI into the hardlockup watchdog Kiryl Shutsemau
2026-06-03 14:36 ` [PATCH 4/4] arm64: route crash_smp_send_stop() last resort through SDEI Kiryl Shutsemau
3 siblings, 0 replies; 5+ messages in thread
From: Kiryl Shutsemau @ 2026-06-03 14:36 UTC (permalink / raw)
To: Catalin Marinas, Will Deacon, James Morse
Cc: Mark Rutland, Marc Zyngier, Doug Anderson, Petr Mladek,
Thomas Gleixner, Andrew Morton, Baoquan He, Puranjay Mohan,
Usama Arif, Breno Leitao, Julien Thierry, Lecopzer Chen,
Sumit Garg, kernel-team, kexec, linux-arm-kernel, linux-kernel,
Kiryl Shutsemau (Meta)
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Deliver an NMI-like event to an interrupt-masked arm64 CPU via the
standard SDEI software-signalled event (event 0), without the pseudo-NMI
hot-path cost: register a handler for event 0 and poke a target with
sdei_event_signal(0, mpidr).
First user is arch_trigger_cpumask_backtrace() (sysrq-l, RCU stalls,
hung-task/soft-lockup dumps), which otherwise rides an IPI that can't
reach a masked CPU. Falls back to the IPI path when SDEI is absent; no
watchdog backend yet, so the stock detector is untouched.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
---
arch/arm64/include/asm/nmi.h | 24 ++++++
arch/arm64/kernel/smp.c | 9 +++
drivers/firmware/Kconfig | 19 +++++
drivers/firmware/Makefile | 1 +
drivers/firmware/sdei_nmi.c | 147 +++++++++++++++++++++++++++++++++++
5 files changed, 200 insertions(+)
create mode 100644 arch/arm64/include/asm/nmi.h
create mode 100644 drivers/firmware/sdei_nmi.c
diff --git a/arch/arm64/include/asm/nmi.h b/arch/arm64/include/asm/nmi.h
new file mode 100644
index 000000000000..ccdb75692e9d
--- /dev/null
+++ b/arch/arm64/include/asm/nmi.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_NMI_H
+#define __ASM_NMI_H
+
+#include <linux/cpumask.h>
+
+/*
+ * Cross-CPU NMI provider hooks, consulted by the arm64 arch code before
+ * its regular-IRQ / pseudo-NMI IPI paths. The SDEI provider in
+ * drivers/firmware/sdei_nmi.c implements them when active; a future
+ * FEAT_NMI provider could slot in here too. The stubs let callers stay
+ * unconditional when ARM_SDEI_NMI is off.
+ */
+#ifdef CONFIG_ARM_SDEI_NMI
+bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu);
+#else
+static inline bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
+ int exclude_cpu)
+{
+ return false;
+}
+#endif
+
+#endif /* __ASM_NMI_H */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 1aa324104afb..656b8417af72 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -45,6 +45,7 @@
#include <asm/daifflags.h>
#include <asm/kvm_mmu.h>
#include <asm/mmu_context.h>
+#include <asm/nmi.h>
#include <asm/numa.h>
#include <asm/processor.h>
#include <asm/smp_plat.h>
@@ -928,11 +929,19 @@ static void arm64_backtrace_ipi(cpumask_t *mask)
void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
{
/*
+ * Prefer the SDEI cross-CPU NMI provider when active: firmware
+ * dispatches the event out of EL3 and reaches CPUs that have
+ * interrupts locally masked, without the per-IRQ-mask cost that
+ * pseudo-NMI pays for the same reach. The plain IPI path below
+ * can't reach such a CPU unless pseudo-NMI is enabled.
+ *
* NOTE: though nmi_trigger_cpumask_backtrace() has "nmi_" in the name,
* nothing about it truly needs to be implemented using an NMI, it's
* just that it's _allowed_ to work with NMIs. If ipi_should_be_nmi()
* returned false our backtrace attempt will just use a regular IPI.
*/
+ if (sdei_nmi_trigger_cpumask_backtrace(mask, exclude_cpu))
+ return;
nmi_trigger_cpumask_backtrace(mask, exclude_cpu, arm64_backtrace_ipi);
}
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index bbd2155d8483..6501087ff90d 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -36,6 +36,25 @@ config ARM_SDE_INTERFACE
standard for registering callbacks from the platform firmware
into the OS. This is typically used to implement RAS notifications.
+config ARM_SDEI_NMI
+ bool "SDEI-based cross-CPU NMI service (arm64)"
+ depends on ARM64 && ARM_SDE_INTERFACE
+ help
+ Provides SDEI-based cross-CPU NMI delivery for hooks that need
+ to reach interrupt-masked CPUs on silicon that lacks FEAT_NMI:
+
+ - arch_trigger_cpumask_backtrace() (sysrq-l, RCU stalls,
+ hardlockup_all_cpu_backtrace, soft-lockup secondary dumps,
+ hung-task auxiliary dumps)
+
+ The driver registers a handler for the SDEI software-signalled
+ event (event 0) and reaches a target CPU by signalling it with
+ SDEI_EVENT_SIGNAL. Firmware delivers the event out of EL3
+ regardless of the target's PSTATE.DAIF -- forced delivery into a
+ CPU wedged with interrupts locally masked.
+
+ If unsure, say N.
+
config EDD
tristate "BIOS Enhanced Disk Drive calls determine boot disk"
depends on X86
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 4ddec2820c96..48221fb8b385 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -4,6 +4,7 @@
#
obj-$(CONFIG_ARM_SCPI_PROTOCOL) += arm_scpi.o
obj-$(CONFIG_ARM_SDE_INTERFACE) += arm_sdei.o
+obj-$(CONFIG_ARM_SDEI_NMI) += sdei_nmi.o
obj-$(CONFIG_DMI) += dmi_scan.o
obj-$(CONFIG_DMI_SYSFS) += dmi-sysfs.o
obj-$(CONFIG_EDD) += edd.o
diff --git a/drivers/firmware/sdei_nmi.c b/drivers/firmware/sdei_nmi.c
new file mode 100644
index 000000000000..e5c3f28b3991
--- /dev/null
+++ b/drivers/firmware/sdei_nmi.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arm64 SDEI-based cross-CPU NMI service.
+ *
+ * Delivering an "NMI-shaped" event to an EL1 context that has locally
+ * masked interrupts, on silicon without FEAT_NMI, can be done two ways:
+ *
+ * - pseudo-NMI: mask "interrupts" via the GIC priority register
+ * (ICC_PMR_EL1) instead of PSTATE.DAIF, leaving a high-priority band
+ * deliverable. Functionally this works -- but it reimplements every
+ * local_irq_disable()/enable() and exception entry/exit as a PMR
+ * write plus synchronisation, a cost paid on that hot path forever,
+ * whether or not an NMI is ever delivered.
+ *
+ * - SDEI: leave interrupt masking as the cheap PSTATE.DAIF operation
+ * and have the firmware bounce an EL3-routed Group-0 SGI back to
+ * NS-EL1 as an event callback. The cost is a firmware round-trip,
+ * but only at the rare moment delivery is actually needed.
+ *
+ * This driver takes the second path: it keeps the IRQ-mask hot path
+ * free and pays only when it fires, which is what makes cross-CPU NMI
+ * affordable on hardware where the pseudo-NMI tax isn't, until FEAT_NMI
+ * makes NMI masking cheap in the architecture itself.
+ *
+ * Capabilities provided:
+ *
+ * - sdei_nmi_trigger_cpumask_backtrace() — override for arm64's
+ * arch_trigger_cpumask_backtrace(), so sysrq-l, RCU stall dumps,
+ * hardlockup_all_cpu_backtrace, soft-lockup/hung-task secondary
+ * dumps all reach interrupt-masked CPUs.
+ *
+ * Delivery uses the standard SDEI software-signalled event (event 0) and
+ * SDEI_EVENT_SIGNAL. We register a handler for event 0, enable it, and
+ * poke a target CPU with sdei_event_signal(0, mpidr): firmware makes
+ * event 0 pending on that PE and dispatches the handler NMI-like,
+ * regardless of the target's DAIF.
+ * Availability is simply whether event 0 registers and enables -- if SDEI
+ * and its software-signalled event are present we use it, otherwise the
+ * driver stays inert.
+ */
+
+#define pr_fmt(fmt) "sdei_nmi: " fmt
+
+#include <linux/arm_sdei.h>
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/nmi.h>
+#include <linux/printk.h>
+#include <linux/ptrace.h>
+#include <linux/smp.h>
+#include <linux/types.h>
+
+#include <asm/nmi.h>
+#include <asm/smp_plat.h>
+
+static bool sdei_nmi_available;
+
+#define SDEI_NMI_EVENT 0
+
+static int sdei_nmi_handler(u32 event, struct pt_regs *regs, void *arg)
+{
+ /*
+ * nmi_cpu_backtrace() no-ops unless this CPU's bit is set in the
+ * global backtrace mask (driven by nmi_trigger_cpumask_backtrace()),
+ * so a fire that reaches a CPU not being backtraced is harmless.
+ */
+ nmi_cpu_backtrace(regs);
+ return SDEI_EV_HANDLED;
+}
+
+static void sdei_nmi_fire(unsigned int target_cpu)
+{ /* Signal SDEI_NMI_EVENT at @target_cpu, addressed via cpu_logical_map(). */
+ int err = sdei_event_signal(SDEI_NMI_EVENT, cpu_logical_map(target_cpu));
+
+ if (err) /* best effort: a failed signal is only logged, not retried */
+ pr_warn("SDEI_EVENT_SIGNAL to CPU %u failed: %d\n",
+ target_cpu, err);
+}
+
+/*
+ * Raise callback for nmi_trigger_cpumask_backtrace(): signal event 0
+ * at every CPU set in @mask. The local CPU is not filtered here; the
+ * framework is expected to have dropped it from @mask already.
+ */
+static void sdei_nmi_raise_backtrace(cpumask_t *mask)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ sdei_nmi_fire(cpu);
+}
+
+/*
+ * Override hook for arch_trigger_cpumask_backtrace() (see
+ * arch/arm64/kernel/smp.c). Returns true once the request has been handed
+ * to nmi_trigger_cpumask_backtrace() with the SDEI raise callback, i.e.
+ * whenever sdei_nmi_available is set; on false the arch falls back to its
+ * regular-IRQ (or pseudo-NMI, if enabled) IPI.
+ *
+ * Without pseudo-NMI (the usual case for this driver's target) the IPI
+ * can't reach a CPU that has interrupts masked, so the backtrace of the
+ * one CPU you care about comes back empty. SDEI is dispatched out of EL3
+ * and lands regardless of DAIF, without taxing the IRQ-mask path.
+ */
+bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
+{
+ if (!sdei_nmi_available)
+ return false;
+
+ nmi_trigger_cpumask_backtrace(mask, exclude_cpu,
+ sdei_nmi_raise_backtrace);
+ return true;
+}
+
+/*
+ * device_initcall (after arch_initcall(sdei_init), so the SDEI subsystem
+ * is up): probe the firmware, register the event, and turn on the
+ * cross-CPU service. If the probe fails the driver stays inert and the
+ * override hooks decline, leaving the arch's own paths in place.
+ */
+static int __init sdei_nmi_init(void)
+{
+ int err;
+
+ err = sdei_event_register(SDEI_NMI_EVENT, sdei_nmi_handler, NULL);
+ if (err) { /* NOTE(review): presumably also hit on hosts without SDEI; pr_err may be loud */
+ pr_err("sdei_event_register(%u) failed: %d\n",
+ SDEI_NMI_EVENT, err);
+ return 0; /* not an initcall failure: sdei_nmi_available stays false */
+ }
+
+ err = sdei_event_enable(SDEI_NMI_EVENT);
+ if (err) {
+ pr_err("sdei_event_enable(%u) failed: %d\n",
+ SDEI_NMI_EVENT, err);
+ sdei_event_unregister(SDEI_NMI_EVENT); /* undo the registration above */
+ return 0; /* still inert, still not an initcall failure */
+ }
+
+ sdei_nmi_available = true;
+ pr_info("using SDEI cross-CPU NMI (SDEI_EVENT_SIGNAL, event %u)\n",
+ SDEI_NMI_EVENT);
+
+ return 0;
+}
+device_initcall(sdei_nmi_init);
--
2.54.0
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH 3/4] arm64: wire SDEI NMI into the hardlockup watchdog
2026-06-03 14:36 [PATCH 0/4] arm64: cross-CPU NMI via SDEI Kiryl Shutsemau
2026-06-03 14:36 ` [PATCH 1/4] firmware: arm_sdei: add SDEI_EVENT_SIGNAL support Kiryl Shutsemau
2026-06-03 14:36 ` [PATCH 2/4] drivers/firmware: add SDEI cross-CPU NMI service for arm64 Kiryl Shutsemau
@ 2026-06-03 14:36 ` Kiryl Shutsemau
2026-06-03 14:36 ` [PATCH 4/4] arm64: route crash_smp_send_stop() last resort through SDEI Kiryl Shutsemau
3 siblings, 0 replies; 5+ messages in thread
From: Kiryl Shutsemau @ 2026-06-03 14:36 UTC (permalink / raw)
To: Catalin Marinas, Will Deacon, James Morse
Cc: Mark Rutland, Marc Zyngier, Doug Anderson, Petr Mladek,
Thomas Gleixner, Andrew Morton, Baoquan He, Puranjay Mohan,
Usama Arif, Breno Leitao, Julien Thierry, Lecopzer Chen,
Sumit Garg, kernel-team, kexec, linux-arm-kernel, linux-kernel,
Kiryl Shutsemau (Meta)
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Select HAVE_HARDLOCKUP_DETECTOR_ARCH so the framework takes its backend
from this driver. A per-CPU hrtimer checks its buddy's heartbeat and
signals event 0 at a stalled CPU, which runs watchdog_hardlockup_check()
NMI-like.
The source is chosen at boot: SDEI if firmware provides it, otherwise a
perf-NMI counter (pseudo-NMI) fallback -- one image covers both.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
---
arch/arm64/Kconfig | 1 +
drivers/firmware/Kconfig | 3 +
drivers/firmware/sdei_nmi.c | 247 +++++++++++++++++++++++++++++++++++-
3 files changed, 248 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index fe60738e5943..ebefe1e20806 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -205,6 +205,7 @@ config ARM64
select HAVE_FUNCTION_GRAPH_FREGS
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_GCC_PLUGINS
+ select HAVE_HARDLOCKUP_DETECTOR_ARCH if ARM_SDEI_NMI
select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && \
HW_PERF_EVENTS && HAVE_PERF_EVENTS_NMI
select HAVE_HW_BREAKPOINT if PERF_EVENTS
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 6501087ff90d..552eff7b9bc3 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -39,6 +39,7 @@ config ARM_SDE_INTERFACE
config ARM_SDEI_NMI
bool "SDEI-based cross-CPU NMI service (arm64)"
depends on ARM64 && ARM_SDE_INTERFACE
+ select HARDLOCKUP_DETECTOR_COUNTS_HRTIMER if HARDLOCKUP_DETECTOR
help
Provides SDEI-based cross-CPU NMI delivery for hooks that need
to reach interrupt-masked CPUs on silicon that lacks FEAT_NMI:
@@ -46,6 +47,8 @@ config ARM_SDEI_NMI
- arch_trigger_cpumask_backtrace() (sysrq-l, RCU stalls,
hardlockup_all_cpu_backtrace, soft-lockup secondary dumps,
hung-task auxiliary dumps)
+ - the hardlockup watchdog backend, when HARDLOCKUP_DETECTOR is
+ also enabled
The driver registers a handler for the SDEI software-signalled
event (event 0) and reaches a target CPU by signalling it with
diff --git a/drivers/firmware/sdei_nmi.c b/drivers/firmware/sdei_nmi.c
index e5c3f28b3991..51e220d4083d 100644
--- a/drivers/firmware/sdei_nmi.c
+++ b/drivers/firmware/sdei_nmi.c
@@ -29,6 +29,14 @@
* hardlockup_all_cpu_backtrace, soft-lockup/hung-task secondary
* dumps all reach interrupt-masked CPUs.
*
+ * - the hardlockup-detector backend (watchdog_hardlockup_enable/
+ * disable/probe()), when CONFIG_HARDLOCKUP_DETECTOR is also on.
+ * ARM_SDEI_NMI selects HAVE_HARDLOCKUP_DETECTOR_ARCH, so the
+ * framework picks this backend. The detection source is chosen at
+ * boot: SDEI when the firmware has it, otherwise a perf-PMU NMI
+ * counter if one is available (pseudo-NMI enabled). One kernel image
+ * thus serves SDEI and non-SDEI hosts.
+ *
* Delivery uses the standard SDEI software-signalled event (event 0) and
* SDEI_EVENT_SIGNAL. We register a handler for event 0, enable it, and
* poke a target CPU with sdei_event_signal(0, mpidr): firmware makes
@@ -42,12 +50,18 @@
#define pr_fmt(fmt) "sdei_nmi: " fmt
#include <linux/arm_sdei.h>
+#include <linux/cpufreq.h>
#include <linux/cpumask.h>
+#include <linux/hrtimer.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/nmi.h>
+#include <linux/percpu-defs.h>
+#include <linux/perf_event.h>
+#include <linux/perf/arm_pmu.h>
#include <linux/printk.h>
#include <linux/ptrace.h>
+#include <linux/sched/clock.h>
#include <linux/smp.h>
#include <linux/types.h>
@@ -61,11 +75,17 @@ static bool sdei_nmi_available;
static int sdei_nmi_handler(u32 event, struct pt_regs *regs, void *arg)
{
/*
- * nmi_cpu_backtrace() no-ops unless this CPU's bit is set in the
- * global backtrace mask (driven by nmi_trigger_cpumask_backtrace()),
- * so a fire that reaches a CPU not being backtraced is harmless.
+ * Both consumers no-op on a CPU that wasn't actually requested:
+ * nmi_cpu_backtrace() unless this CPU's bit is set in the global
+ * backtrace mask, and watchdog_hardlockup_check() unless this CPU's
+ * hrtimer_interrupts counter has stalled. The latter is only
+ * declared when the watchdog backend is built in (COUNTS_HRTIMER,
+ * pulled by ARM_SDEI_NMI when HARDLOCKUP_DETECTOR is enabled).
*/
nmi_cpu_backtrace(regs);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER
+ watchdog_hardlockup_check(smp_processor_id(), regs);
+#endif
return SDEI_EV_HANDLED;
}
@@ -113,6 +133,220 @@ bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
return true;
}
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER
+
+/*
+ * SDEI watchdog source: a per-CPU hrtimer pets its own heartbeat and
+ * checks its buddy's; on a stall it signals event 0 at the buddy,
+ * whose SDEI handler then runs watchdog_hardlockup_check().
+ */
+#define SDEI_NMI_WATCHDOG_TICK_MS 1000
+
+static cpumask_t __read_mostly sdei_nmi_watchdog_cpus;
+static DEFINE_PER_CPU(struct hrtimer, sdei_nmi_watchdog_hrtimer);
+static DEFINE_PER_CPU(u64, sdei_nmi_watchdog_heartbeat_ns);
+
+static unsigned int sdei_nmi_watchdog_next_cpu(unsigned int cpu)
+{ /* Buddy of @cpu: the next CPU in sdei_nmi_watchdog_cpus, wrapping around. */
+ unsigned int next = cpumask_next_wrap(cpu, &sdei_nmi_watchdog_cpus);
+
+ if (next == cpu) /* wrapped back onto @cpu: nobody else to watch */
+ return nr_cpu_ids; /* the hrtimer callback tests for >= nr_cpu_ids */
+ return next;
+}
+
+static enum hrtimer_restart sdei_nmi_watchdog_hrtimer_fn(struct hrtimer *t)
+{ /* Per-CPU tick: pet our own heartbeat, then signal the buddy if its is stale. */
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int buddy;
+ u64 now = local_clock(); /* NOTE(review): compared with the buddy's stamp -- confirm cross-CPU comparable */
+ u64 buddy_hb, thresh_ns;
+
+ this_cpu_write(sdei_nmi_watchdog_heartbeat_ns, now);
+
+ buddy = sdei_nmi_watchdog_next_cpu(this_cpu);
+ if (buddy >= nr_cpu_ids) /* no other CPU in the mask: just re-arm */
+ goto restart;
+
+ /* pair with smp_wmb() in sdei_nmi_watchdog_enable()/_disable() */
+ smp_rmb();
+
+ buddy_hb = per_cpu(sdei_nmi_watchdog_heartbeat_ns, buddy);
+ thresh_ns = (u64)watchdog_thresh * NSEC_PER_SEC;
+
+ if (now > buddy_hb + thresh_ns) {
+ /*
+ * Fire every tick while the buddy looks stale:
+ * watchdog_hardlockup_check() only declares a lockup on the
+ * second consecutive call (the first updates
+ * hrtimer_interrupts_saved), so one-shot firing would never
+ * detect. An extra SMC per second is negligible by comparison.
+ * NOTE(review): by the same rule two unrelated event-0
+ * deliveries with no counter tick between them (e.g. back-to-back
+ * backtraces) may read as a lockup -- confirm.
+ */
+ pr_warn_ratelimited("watchdog: CPU %u no heartbeat for %llu ms (thresh %us), firing NMI from CPU %u\n",
+ buddy,
+ (now - buddy_hb) / NSEC_PER_MSEC,
+ watchdog_thresh, this_cpu);
+ sdei_nmi_fire(buddy);
+ }
+
+restart:
+ hrtimer_forward_now(t, ms_to_ktime(SDEI_NMI_WATCHDOG_TICK_MS));
+ return HRTIMER_RESTART;
+}
+
+static void sdei_nmi_watchdog_enable(unsigned int cpu)
+{ /* Arm the watchdog tick for @cpu; uses this-CPU accessors, so run on @cpu. */
+ struct hrtimer *t = this_cpu_ptr(&sdei_nmi_watchdog_hrtimer);
+
+ if (cpumask_test_cpu(cpu, &sdei_nmi_watchdog_cpus)) /* already armed */
+ return;
+
+ this_cpu_write(sdei_nmi_watchdog_heartbeat_ns, local_clock()); /* start from a fresh stamp */
+
+ hrtimer_setup(t, sdei_nmi_watchdog_hrtimer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED);
+
+ /* order heartbeat before mask bit; pairs with smp_rmb() in the hrtimer callback */
+ smp_wmb();
+ cpumask_set_cpu(cpu, &sdei_nmi_watchdog_cpus);
+
+ hrtimer_start(t, ms_to_ktime(SDEI_NMI_WATCHDOG_TICK_MS),
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void sdei_nmi_watchdog_disable(unsigned int cpu)
+{ /* Drop @cpu from the buddy mask and cancel its tick; run on @cpu (this_cpu_ptr). */
+ if (!cpumask_test_cpu(cpu, &sdei_nmi_watchdog_cpus)) /* not armed: nothing to undo */
+ return;
+
+ cpumask_clear_cpu(cpu, &sdei_nmi_watchdog_cpus);
+ /* pair with smp_rmb() in the hrtimer callback */
+ smp_wmb();
+
+ hrtimer_cancel(this_cpu_ptr(&sdei_nmi_watchdog_hrtimer));
+}
+
+/*
+ * Perf-NMI fallback source, used when SDEI is absent but the PMU IRQ is
+ * a (pseudo-)NMI. A per-CPU cycle counter overflows into the same
+ * watchdog_hardlockup_check(). This is the stock arm64 perf hardlockup
+ * detector, minimal-copied here because the framework's
+ * HARDLOCKUP_DETECTOR_PERF is compile-excluded once we select
+ * HAVE_HARDLOCKUP_DETECTOR_ARCH (it would otherwise provide a second
+ * definition of these same hooks).
+ */
+static struct perf_event_attr perf_wd_attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 1,
+};
+
+static DEFINE_PER_CPU(struct perf_event *, perf_wd_event);
+
+static u64 perf_wd_period(int cpu)
+{ /* Sample period for @cpu's watchdog counter: cycle rate times watchdog_thresh. */
+ /* 5 GHz safe max when cpufreq is unavailable, as in watchdog_hld.c. */
+ u64 hz = cpufreq_get_hw_max_freq(cpu) * 1000UL; /* presumably kHz -> Hz; TODO confirm */
+
+ return (hz ? hz : 5000000000UL) * watchdog_thresh; /* hz == 0 selects the fallback */
+}
+
+static void perf_wd_overflow(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{ /* Overflow callback: run the lockup check on this CPU; @event and @data are unused. */
+ watchdog_hardlockup_check(smp_processor_id(), regs);
+}
+
+static void perf_wd_enable(unsigned int cpu)
+{ /* Create and enable the pinned cycle-counter event for @cpu, at most once. */
+ struct perf_event *evt;
+
+ if (this_cpu_read(perf_wd_event)) /* already set up on this CPU */
+ return;
+
+ perf_wd_attr.sample_period = perf_wd_period(cpu); /* shared static attr, rewritten on every call */
+ evt = perf_event_create_kernel_counter(&perf_wd_attr, cpu, NULL,
+ perf_wd_overflow, NULL);
+ if (IS_ERR(evt)) { /* perf_wd_event stays NULL: no perf watchdog on this CPU */
+ pr_warn_once("perf event create on CPU %u failed: %ld\n",
+ cpu, PTR_ERR(evt));
+ return;
+ }
+
+ this_cpu_write(perf_wd_event, evt);
+ perf_event_enable(evt); /* attr is created with .disabled = 1, so start it explicitly */
+}
+
+static void perf_wd_disable(unsigned int cpu)
+{ /* Tear down this CPU's watchdog event; @cpu itself is not consulted. */
+ struct perf_event *evt = this_cpu_read(perf_wd_event);
+
+ if (!evt) /* never created, or creation failed */
+ return;
+
+ perf_event_disable(evt);
+ perf_event_release_kernel(evt);
+ this_cpu_write(perf_wd_event, NULL); /* lets perf_wd_enable() create a new event later */
+}
+
+/* Set by the late_initcall below once the perf fallback is chosen. */
+static bool perf_wd_active;
+
+void watchdog_hardlockup_enable(unsigned int cpu)
+{ /* Backend hook: start whichever source was selected (SDEI first, else perf). */
+ WARN_ON_ONCE(cpu != smp_processor_id()); /* both sources use this-CPU accessors */
+
+ if (sdei_nmi_available)
+ sdei_nmi_watchdog_enable(cpu);
+ else if (perf_wd_active) /* neither flag set: silently do nothing */
+ perf_wd_enable(cpu);
+}
+
+void watchdog_hardlockup_disable(unsigned int cpu)
+{ /* Backend hook: stop the source that watchdog_hardlockup_enable() would start. */
+ WARN_ON_ONCE(cpu != smp_processor_id()); /* both sources use this-CPU accessors */
+
+ if (sdei_nmi_available)
+ sdei_nmi_watchdog_disable(cpu);
+ else if (perf_wd_active) /* neither flag set: silently do nothing */
+ perf_wd_disable(cpu);
+}
+
+int __init watchdog_hardlockup_probe(void)
+{ /* 0 once either source has been selected, -ENODEV until then. */
+ return (sdei_nmi_available || perf_wd_active) ? 0 : -ENODEV;
+}
+
+/*
+ * Phase 2 of init, at late_initcall so it runs after both our own
+ * device_initcall (SDEI decision) and armv8_pmuv3's (which is what makes
+ * arm_pmu_irq_is_nmi() read true). If SDEI didn't claim the watchdog and
+ * the PMU IRQ is a (pseudo-)NMI, take the perf fallback. Deciding here,
+ * after both device_initcalls, keeps the choice deterministic -- no race
+ * over which initcall ran first, and no flip from perf to SDEI.
+ */
+static int __init perf_wd_init(void)
+{
+ if (sdei_nmi_available)
+ return 0; /* SDEI already owns the watchdog */
+
+ if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && arm_pmu_irq_is_nmi()) {
+ perf_wd_active = true; /* set first: watchdog_hardlockup_probe() reads it */
+ pr_info("no SDEI firmware; using perf-NMI watchdog fallback\n");
+ lockup_detector_retry_init(); /* re-probe now that a source exists */
+ }
+ return 0; /* no source at all: probe keeps returning -ENODEV */
+}
+late_initcall(perf_wd_init);
+
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */
+
/*
* device_initcall (after arch_initcall(sdei_init), so the SDEI subsystem
* is up): probe the firmware, register the event, and turn on the
@@ -142,6 +376,13 @@ static int __init sdei_nmi_init(void)
pr_info("using SDEI cross-CPU NMI (SDEI_EVENT_SIGNAL, event %u)\n",
SDEI_NMI_EVENT);
+ /*
+ * lockup_detector_init() ran in early init and found no hardlockup
+ * backend yet; re-probe now that SDEI owns the watchdog.
+ */
+ if (IS_ENABLED(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER))
+ lockup_detector_retry_init();
+
return 0;
}
device_initcall(sdei_nmi_init);
--
2.54.0
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH 4/4] arm64: route crash_smp_send_stop() last resort through SDEI
2026-06-03 14:36 [PATCH 0/4] arm64: cross-CPU NMI via SDEI Kiryl Shutsemau
` (2 preceding siblings ...)
2026-06-03 14:36 ` [PATCH 3/4] arm64: wire SDEI NMI into the hardlockup watchdog Kiryl Shutsemau
@ 2026-06-03 14:36 ` Kiryl Shutsemau
3 siblings, 0 replies; 5+ messages in thread
From: Kiryl Shutsemau @ 2026-06-03 14:36 UTC (permalink / raw)
To: Catalin Marinas, Will Deacon, James Morse
Cc: Mark Rutland, Marc Zyngier, Doug Anderson, Petr Mladek,
Thomas Gleixner, Andrew Morton, Baoquan He, Puranjay Mohan,
Usama Arif, Breno Leitao, Julien Thierry, Lecopzer Chen,
Sumit Garg, kernel-team, kexec, linux-arm-kernel, linux-kernel,
Kiryl Shutsemau (Meta)
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Add SDEI as the final rung after the normal stop IPI (and the pseudo-NMI
IPI, if enabled): signal event 0 at the CPUs still online, whose handler
runs crash_save_cpu() on the wedged context and parks them. It only ever
touches CPUs the normal path couldn't reach.
SDEI is last because a CPU parked in the handler never completes the
event, so it is less recoverable -- a cost paid only when nothing else
worked.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
---
arch/arm64/include/asm/nmi.h | 6 ++
arch/arm64/kernel/smp.c | 24 ++++++
drivers/firmware/Kconfig | 1 +
drivers/firmware/sdei_nmi.c | 137 ++++++++++++++++++++++++++++++++++-
4 files changed, 167 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/nmi.h b/arch/arm64/include/asm/nmi.h
index ccdb75692e9d..e3edfb24fc08 100644
--- a/arch/arm64/include/asm/nmi.h
+++ b/arch/arm64/include/asm/nmi.h
@@ -13,12 +13,18 @@
*/
#ifdef CONFIG_ARM_SDEI_NMI
bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu);
+bool sdei_nmi_crash_smp_send_stop(void);
#else
static inline bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
int exclude_cpu)
{
return false;
}
+
+static inline bool sdei_nmi_crash_smp_send_stop(void)
+{
+ return false;
+}
#endif
#endif /* __ASM_NMI_H */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 656b8417af72..386ddd526b48 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1288,8 +1288,32 @@ void crash_smp_send_stop(void)
return;
crash_stop = 1;
+ /*
+ * Stop the normal way first: IPI_CPU_STOP escalating to a pseudo-NMI
+ * IPI. Every CPU that responds saves its state via crash_save_cpu()
+ * and parks in cpu_park_loop() with its online bit cleared -- the
+ * standard kdump stop, identical to a kernel without SDEI. Crucially
+ * those CPUs stay in a clean, potentially-reusable state.
+ */
smp_send_stop();
+ /*
+ * Whatever is still online didn't respond -- typically a CPU wedged
+ * with interrupts masked. The plain IPI can't reach it, and a fleet
+ * that declines the pseudo-NMI hot-path cost has no NMI IPI to
+ * escalate to. Hit only the survivors with the SDEI cross-CPU NMI
+ * (no-op if SDEI isn't active, or if everything already stopped):
+ * firmware delivers out of EL3 regardless of PSTATE.DAIF, and the
+ * handler captures crash_save_cpu() state from the wedged context
+ * before parking the CPU.
+ *
+ * SDEI is deliberately last: an SDEI-stopped CPU never completes its
+ * event (it parks inside the handler, so EL3 retains its dispatch
+ * slot until reset), which is strictly less recoverable than a normal
+ * stop. We pay that only for CPUs that left no other way to reach them.
+ */
+ sdei_nmi_crash_smp_send_stop();
+
sdei_handler_abort();
}
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 552eff7b9bc3..84aead609406 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -49,6 +49,7 @@ config ARM_SDEI_NMI
hung-task auxiliary dumps)
- the hardlockup watchdog backend, when HARDLOCKUP_DETECTOR is
also enabled
+ - crash_smp_send_stop() (panic / kdump path)
The driver registers a handler for the SDEI software-signalled
event (event 0) and reaches a target CPU by signalling it with
diff --git a/drivers/firmware/sdei_nmi.c b/drivers/firmware/sdei_nmi.c
index 51e220d4083d..ad8fbb1c90a6 100644
--- a/drivers/firmware/sdei_nmi.c
+++ b/drivers/firmware/sdei_nmi.c
@@ -29,6 +29,11 @@
* hardlockup_all_cpu_backtrace, soft-lockup/hung-task secondary
* dumps all reach interrupt-masked CPUs.
*
+ * - sdei_nmi_crash_smp_send_stop() — override for arm64's
+ * crash_smp_send_stop(); the panic/kdump last resort for CPUs that
+ * didn't answer the normal stop IPI, capturing the wedged context
+ * into the vmcore before parking the CPU.
+ *
* - the hardlockup-detector backend (watchdog_hardlockup_enable/
* disable/probe()), when CONFIG_HARDLOCKUP_DETECTOR is also on.
* ARM_SDEI_NMI selects HAVE_HARDLOCKUP_DETECTOR_ARCH, so the
@@ -50,11 +55,15 @@
#define pr_fmt(fmt) "sdei_nmi: " fmt
#include <linux/arm_sdei.h>
+#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/kexec.h>
#include <linux/nmi.h>
#include <linux/percpu-defs.h>
#include <linux/perf_event.h>
@@ -72,8 +81,66 @@ static bool sdei_nmi_available;
#define SDEI_NMI_EVENT 0
+/*
+ * Crash-stop dispatch lives on the same SDEI event 0 as everything else.
+ * The requesting CPU sets sdei_nmi_crash_stop_requested for each target
+ * before signalling event 0; the target's handler clears it, saves crash
+ * state, marks itself offline, sets sdei_nmi_crash_stop_acked so the
+ * requester knows the target is down, and only then parks for good.
+ *
+ * Using a per-CPU flag rather than a separate SDEI event avoids needing
+ * extra registrations from firmware. The SDEI_EVENT_SIGNAL SMC is itself
+ * a write barrier, so a WRITE_ONCE() before the signal is sufficient
+ * ordering against the handler's READ_ONCE() on the target.
+ */
+static DEFINE_PER_CPU(unsigned long, sdei_nmi_crash_stop_requested);
+static DEFINE_PER_CPU(unsigned long, sdei_nmi_crash_stop_acked);
+
static int sdei_nmi_handler(u32 event, struct pt_regs *regs, void *arg)
{
+ int cpu = smp_processor_id();
+
+ if (READ_ONCE(*this_cpu_ptr(&sdei_nmi_crash_stop_requested))) {
+ WRITE_ONCE(*this_cpu_ptr(&sdei_nmi_crash_stop_requested), 0);
+
+ /*
+ * Capture the wedged context for kdump while pt_regs still
+ * points at the interrupted PC. This is the main motivation
+ * for using SDEI here: the plain IPI stop path can't reach an
+ * interrupt-masked CPU (and the fleet declines pseudo-NMI to
+ * keep the IRQ-mask hot path cheap), so crash_save_cpu() for
+ * that CPU would otherwise record nothing useful.
+ */
+ crash_save_cpu(regs, cpu);
+ set_cpu_online(cpu, false);
+
+ /* publish the crash state/offline before the requester sees the ack */
+ smp_wmb();
+ WRITE_ONCE(*this_cpu_ptr(&sdei_nmi_crash_stop_acked), 1);
+
+ /*
+ * Park forever from within the SDEI handler. We deliberately
+ * do NOT issue SDEI_EVENT_COMPLETE: the framework's return
+ * path restores firmware's saved interrupted context, which
+ * would land the CPU back wherever it was running (often
+ * do_idle, which then notices cpu_is_offline=true and BUGs
+ * at cpuhp_report_idle_dead). Returning the modified pt_regs
+ * doesn't help -- arch/arm64/kernel/sdei.c::do_sdei_event
+ * only honours a PC override via its IRQ-state heuristic
+ * and otherwise hands EL3 its own saved-context slot back.
+ *
+ * Trade-off: EL3 firmware retains ~one saved-context slot
+ * per parked CPU until the next hardware reset (~hundreds of
+ * bytes per CPU). The CPU itself is parked in cpu_park_loop
+ * exactly as if IPI_CPU_STOP had stopped it; recoverability
+ * is unchanged versus the existing path (neither is
+ * recoverable without hardware reset, since PSCI sees the
+ * CPU as ALREADY_ON in both cases).
+ */
+ cpu_park_loop();
+ /* unreachable */
+ }
+
/*
* Both consumers no-op on a CPU that wasn't actually requested:
* nmi_cpu_backtrace() unless this CPU's bit is set in the global
@@ -84,7 +151,7 @@ static int sdei_nmi_handler(u32 event, struct pt_regs *regs, void *arg)
*/
nmi_cpu_backtrace(regs);
#ifdef CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER
- watchdog_hardlockup_check(smp_processor_id(), regs);
+ watchdog_hardlockup_check(cpu, regs);
#endif
return SDEI_EV_HANDLED;
}
@@ -133,6 +200,74 @@ bool sdei_nmi_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
return true;
}
+/*
+ * Last-resort half of arm64's crash_smp_send_stop() (see
+ * arch/arm64/kernel/smp.c). The caller runs the normal IPI / pseudo-NMI
+ * stop first; whatever is left in cpu_online_mask by the time we're
+ * called are the CPUs that didn't respond -- wedged with interrupts
+ * masked, unreachable by those paths. We snapshot that residual mask,
+ * set each survivor's per-CPU crash-stop request flag, signal event 0
+ * at it, and poll for acks. The handler captures crash_save_cpu() state
+ * and parks the CPU (without completing the SDEI event, see
+ * sdei_nmi_handler()).
+ *
+ * Because an SDEI-stopped CPU is parked without ever completing its SDEI
+ * event, this is intentionally the fallback, not the first choice -- it
+ * only ever runs against CPUs the normal path already gave up on.
+ *
+ * Returns true when SDEI was active and this path ran (even if some CPU
+ * failed to ack within the timeout, or there were no survivors to stop);
+ * false when SDEI isn't active, leaving the caller's normal-path result
+ * as the final word.
+ */
+bool sdei_nmi_crash_smp_send_stop(void)
+{
+	unsigned int this_cpu, cpu, remaining;
+	unsigned long timeout;
+	cpumask_t mask;
+
+	if (!sdei_nmi_available)
+		return false;
+
+	this_cpu = smp_processor_id();
+	cpumask_copy(&mask, cpu_online_mask);
+	cpumask_clear_cpu(this_cpu, &mask);
+	if (cpumask_empty(&mask))
+		return true;
+
+	for_each_cpu(cpu, &mask) {
+		WRITE_ONCE(per_cpu(sdei_nmi_crash_stop_acked, cpu), 0);
+		WRITE_ONCE(per_cpu(sdei_nmi_crash_stop_requested, cpu), 1);
+	}
+	/* Publish the flags before signalling the targets that read them. */
+	smp_wmb();
+
+	for_each_cpu(cpu, &mask)
+		sdei_nmi_fire(cpu);
+
+	/*
+	 * Poll up to 100ms: the existing pseudo-NMI stop wait (10ms) plus
+	 * headroom for the SDEI round-trip on slow firmware. The acquire
+	 * load pairs with smp_wmb() in sdei_nmi_handler(), so a CPU seen as
+	 * acked has also published its crash state and offline bit.
+	 */
+	timeout = USEC_PER_MSEC * 100;
+	while (timeout--) {
+		remaining = 0;
+		for_each_cpu(cpu, &mask)
+			if (!smp_load_acquire(&per_cpu(sdei_nmi_crash_stop_acked, cpu)))
+				remaining++;
+		if (!remaining)
+			break;
+		udelay(1);
+	}
+
+	if (remaining)
+		pr_warn("crash_stop: %u CPU(s) did not ack within 100ms\n", remaining);
+
+	return true;
+}
+
#ifdef CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER
/*
--
2.54.0
^ permalink raw reply related [flat|nested] 5+ messages in thread