Re: [PATCH 3/3] riscv: crash: use NMI to stop the CPU

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Conor Dooley <conor@kernel.org>
To: Yunhui Cui <cuiyunhui@bytedance.com>
Cc: paul.walmsley@sifive.com, palmer@dabbelt.com,
	aou@eecs.berkeley.edu, alex@ghiti.fr, luxu.kernel@bytedance.com,
	atishp@rivosinc.com, cleger@rivosinc.com,
	ajones@ventanamicro.com, apatel@ventanamicro.com,
	linux-kernel@vger.kernel.org, linux-riscv@lists.infradead.org,
	songshuaishuai@tinylab.org, bjorn@rivosinc.com,
	charlie@rivosinc.com, masahiroy@kernel.org,
	valentina.fernandezalanis@microchip.com,
	jassisinghbrar@gmail.com, conor.dooley@microchip.com
Subject: Re: [PATCH 3/3] riscv: crash: use NMI to stop the CPU
Date: Tue, 28 Oct 2025 10:42:12 +0000	[thread overview]
Message-ID: <20251028-scallion-list-c8aa5f350286@spud> (raw)
In-Reply-To: <20251027133431.15321-4-cuiyunhui@bytedance.com>

[-- Attachment #1: Type: text/plain, Size: 7582 bytes --]

On Mon, Oct 27, 2025 at 09:34:31PM +0800, Yunhui Cui wrote:
> NMI is more robust than IPI for stopping CPUs during crashes,
> especially with interrupts disabled. Add SBI_SSE_EVENT_LOCAL_CRASH_NMI
> eventid to implement NMI for stopping CPUs.
> 
> Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
> ---
>  arch/riscv/include/asm/crash.h   |  1 +
>  arch/riscv/include/asm/sbi.h     |  1 +
>  arch/riscv/kernel/crash.c        | 31 +++++++++++++-
>  drivers/firmware/riscv/sse_nmi.c | 71 +++++++++++++++++++++++++++++++-
>  include/linux/sse_nmi.h          |  8 ++++
>  5 files changed, 109 insertions(+), 3 deletions(-)
>  create mode 100644 include/linux/sse_nmi.h
> 
> diff --git a/arch/riscv/include/asm/crash.h b/arch/riscv/include/asm/crash.h
> index b64df919277d4..5076f297cbc15 100644
> --- a/arch/riscv/include/asm/crash.h
> +++ b/arch/riscv/include/asm/crash.h
> @@ -5,6 +5,7 @@
>  
>  #ifdef CONFIG_KEXEC_CORE
>  void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs);
> +void cpu_crash_stop(unsigned int cpu, struct pt_regs *regs);
>  #else
>  static inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
>  {
> diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
> index 52d3fdf2d4cc1..65cce85237879 100644
> --- a/arch/riscv/include/asm/sbi.h
> +++ b/arch/riscv/include/asm/sbi.h
> @@ -487,6 +487,7 @@ enum sbi_sse_attr_id {
>  #define SBI_SSE_EVENT_GLOBAL_LOW_PRIO_RAS	0x00108000
>  #define SBI_SSE_EVENT_LOCAL_SOFTWARE_INJECTED	0xffff0000
>  #define SBI_SSE_EVENT_LOCAL_UNKNOWN_NMI		0xffff0001
> +#define SBI_SSE_EVENT_LOCAL_CRASH_NMI		0xffff0002
>  #define SBI_SSE_EVENT_GLOBAL_SOFTWARE_INJECTED	0xffff8000
>  
>  #define SBI_SSE_EVENT_PLATFORM		BIT(14)
> diff --git a/arch/riscv/kernel/crash.c b/arch/riscv/kernel/crash.c
> index 12598bbc2df04..9f3f0becfdd95 100644
> --- a/arch/riscv/kernel/crash.c
> +++ b/arch/riscv/kernel/crash.c
> @@ -3,14 +3,16 @@
>  #include <linux/cpu.h>
>  #include <linux/delay.h>
>  #include <linux/kexec.h>
> +#include <linux/sse_nmi.h>
>  #include <linux/smp.h>
>  #include <linux/sched.h>
>  
> +#include <asm/crash.h>
>  #include <asm/cpu_ops.h>
>  
>  static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0);
>  
> -inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> +void cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
>  {
>  	crash_save_cpu(regs, cpu);
>  
> @@ -27,6 +29,11 @@ inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
>  		wait_for_interrupt();
>  }
>  
> +inline void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
> +{
> +	cpu_crash_stop(cpu, regs);
> +}
> +
>  /*
>   * The number of CPUs online, not counting this CPU (which may not be
>   * fully online and so not counted in num_online_cpus()).
> @@ -38,6 +45,24 @@ static inline unsigned int num_other_online_cpus(void)
>  	return num_online_cpus() - this_cpu_online;
>  }
>  
> +#ifdef CONFIG_RISCV_SSE_NMI
> +static int send_nmi_stop_cpu(cpumask_t *mask)
> +{
> +	unsigned int cpu;
> +	int ret = 0;
> +
> +	for_each_cpu(cpu, mask)
> +		ret += carsh_nmi_stop_cpu(cpu);

+= ? I don't really get why this sort of overcomplication is needed. Why
not just return immediately here with a real error code, since you're
going to have to go to the IPI fallback anyway?

> +
> +	return ret;
> +}
> +#else
> +static inline int send_nmi_stop_cpu(cpumask_t *mask)
> +{
> +	return -EOPNOTSUPP;
> +}
> +#endif
> +
>  void crash_smp_send_stop(void)
>  {
>  	static int cpus_stopped;
> @@ -66,7 +91,9 @@ void crash_smp_send_stop(void)
>  	atomic_set(&waiting_for_crash_ipi, num_other_online_cpus());
>  
>  	pr_crit("SMP: stopping secondary CPUs\n");
> -	send_ipi_mask(&mask, IPI_CPU_CRASH_STOP);
> +
> +	if (send_nmi_stop_cpu(&mask))
> +		send_ipi_mask(&mask, IPI_CPU_CRASH_STOP);
>  
>  	/* Wait up to one second for other CPUs to stop */
>  	timeout = USEC_PER_SEC;
> diff --git a/drivers/firmware/riscv/sse_nmi.c b/drivers/firmware/riscv/sse_nmi.c
> index 2c1eaea2bbabc..152d787075345 100644
> --- a/drivers/firmware/riscv/sse_nmi.c
> +++ b/drivers/firmware/riscv/sse_nmi.c
> @@ -4,13 +4,16 @@
>  
>  #include <linux/nmi.h>
>  #include <linux/riscv_sbi_sse.h>
> +#include <linux/sse_nmi.h>
>  #include <linux/sysctl.h>
>  
> +#include <asm/crash.h>
>  #include <asm/irq_regs.h>
>  #include <asm/sbi.h>
>  
>  int unknown_nmi_panic;
>  static struct sse_event *unknown_nmi_evt;
> +static struct sse_event *crash_nmi_evt;
>  static struct ctl_table_header *unknown_nmi_sysctl_header;
>  
>  static int __init setup_unknown_nmi_panic(char *str)
> @@ -32,6 +35,12 @@ const struct ctl_table unknown_nmi_table[] = {
>  	},
>  };
>  
> +static inline struct sbiret sbi_sse_ecall(int fid, unsigned long arg0,
> +					  unsigned long arg1)
> +{
> +	return sbi_ecall(SBI_EXT_SSE, fid, arg0, arg1, 0, 0, 0, 0);
> +}
> +
>  static int unknown_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
>  {
>  	pr_emerg("NMI received for unknown on CPU %d.\n", smp_processor_id());
> @@ -73,9 +82,69 @@ static int unknown_nmi_init(void)
>  	return ret;
>  }
>  
> +#ifdef CONFIG_KEXEC_CORE
> +int carsh_nmi_stop_cpu(unsigned int cpu)

typo: "carsh" should be "crash"

> +{
> +	unsigned int hart_id = cpuid_to_hartid_map(cpu);
> +	u32 evt = SBI_SSE_EVENT_LOCAL_CRASH_NMI;
> +	struct sbiret ret;
> +
> +	ret = sbi_sse_ecall(SBI_SSE_EVENT_INJECT, evt, hart_id);
> +	if (ret.error) {
> +		pr_err("Failed to signal event %x, error %ld\n", evt, ret.error);

Isn't this going to emit pointless (and maybe confusing) error messages
on systems that enable the option but don't support SSE? And it's going
to be one for each secondary CPU too.

> +		return sbi_err_map_linux_errno(ret.error);
> +	}
> +
> +	return 0;
> +}
> +
> +static int crash_nmi_handler(u32 evt, void *arg, struct pt_regs *regs)
> +{
> +	cpu_crash_stop(smp_processor_id(), regs);
> +
> +	return 0;
> +}
> +
> +static int crash_nmi_init(void)
> +{
> +	int ret;
> +
> +	crash_nmi_evt = sse_event_register(SBI_SSE_EVENT_LOCAL_CRASH_NMI, 0,
> +				 crash_nmi_handler, NULL);
> +	if (IS_ERR(crash_nmi_evt))
> +		return PTR_ERR(crash_nmi_evt);
> +
> +	ret = sse_event_enable(crash_nmi_evt);
> +	if (ret) {
> +		sse_event_unregister(crash_nmi_evt);
> +		return ret;
> +	}
> +
> +	pr_info("Using SSE for crash NMI event delivery\n");
> +
> +	return 0;
> +}
> +#endif
> +
>  static int __init sse_nmi_init(void)
>  {
> -	return unknown_nmi_init();
> +	int ret;
> +
> +	ret = unknown_nmi_init();
> +	if (ret) {
> +		pr_err("Unknown_nmi_init failed with error %d\n", ret);
> +		return ret;
> +	}

This change looks like it shouldn't be in this patch. If you want it to
print an error, just do that from the start?

> +
> +#ifdef CONFIG_KEXEC_CORE

Can this be IS_ENABLED() or does crash_nmi_init() not have a stub?

> +	ret = crash_nmi_init();
> +	if (ret) {
> +		pr_err("Crash_nmi_init failed with error %d\n", ret);
> +		return ret;
> +	}
> +#endif
> +
> +	return 0;
>  }
>  
>  late_initcall(sse_nmi_init);
> diff --git a/include/linux/sse_nmi.h b/include/linux/sse_nmi.h
> new file mode 100644
> index 0000000000000..548a348ac0a46
> --- /dev/null
> +++ b/include/linux/sse_nmi.h
> @@ -0,0 +1,8 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __LINUX_RISCV_SSE_NMI_H
> +#define __LINUX_RISCV_SSE_NMI_H
> +
> +int carsh_nmi_stop_cpu(unsigned int cpu);
> +
> +#endif
> -- 
> 2.39.5
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

next prev parent reply	other threads:[~2025-10-28 10:42 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-27 13:34 [PATCH 0/3] Add NMI Support to RISC-V via SSE Yunhui Cui
2025-10-27 13:34 ` [PATCH 1/3] drivers: firmware: riscv: add SSE NMI support Yunhui Cui
2025-10-28 10:53   ` Conor Dooley
2025-10-27 13:34 ` [PATCH 2/3] riscv: crash: move IPI crash handling logic to crash.c Yunhui Cui
2025-10-27 13:34 ` [PATCH 3/3] riscv: crash: use NMI to stop the CPU Yunhui Cui
2025-10-28 10:42   ` Conor Dooley [this message]
2025-10-28 12:36     ` Radim Krčmář
2025-11-03 14:10       ` [External] " yunhui cui
2025-11-03 17:23         ` Radim Krčmář
2025-11-03 13:36     ` yunhui cui
2025-10-30  8:46   ` Atish Patra
2025-10-31  1:24     ` Bagas Sanjaya

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251028-scallion-list-c8aa5f350286@spud \
    --to=conor@kernel.org \
    --cc=ajones@ventanamicro.com \
    --cc=alex@ghiti.fr \
    --cc=aou@eecs.berkeley.edu \
    --cc=apatel@ventanamicro.com \
    --cc=atishp@rivosinc.com \
    --cc=bjorn@rivosinc.com \
    --cc=charlie@rivosinc.com \
    --cc=cleger@rivosinc.com \
    --cc=conor.dooley@microchip.com \
    --cc=cuiyunhui@bytedance.com \
    --cc=jassisinghbrar@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-riscv@lists.infradead.org \
    --cc=luxu.kernel@bytedance.com \
    --cc=masahiroy@kernel.org \
    --cc=palmer@dabbelt.com \
    --cc=paul.walmsley@sifive.com \
    --cc=songshuaishuai@tinylab.org \
    --cc=valentina.fernandezalanis@microchip.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox