Linux-RISC-V Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Ruidong Tian <tianruidong@linux.alibaba.com>
To: pjw@kernel.org, palmer@dabbelt.com, aou@eecs.berkeley.edu,
	alex@ghiti.fr, rafael@kernel.org, tony.luck@intel.com,
	bp@alien8.de, guohanjun@huawei.com, mchehab@kernel.org,
	xueshuai@linux.alibaba.com, lenb@kernel.org,
	saket.dumbre@intel.com
Cc: linux-riscv@lists.infradead.org, linux-kernel@vger.kernel.org,
	linux-acpi@vger.kernel.org, acpica-devel@lists.linux.dev,
	Ruidong Tian <tianruidong@linux.alibaba.com>
Subject: [PATCH 3/3] riscv: collect hardware error information via APEI on HEE
Date: Fri,  8 May 2026 16:20:20 +0800	[thread overview]
Message-ID: <20260508082020.3368109-4-tianruidong@linux.alibaba.com> (raw)
In-Reply-To: <20260508082020.3368109-1-tianruidong@linux.alibaba.com>

RISC-V already dispatches Hardware Error Exceptions through
do_trap_hardware_error(), but the trap handler currently has no way to
learn *what* went wrong: the user sees the offending task killed, or
the kernel panic, with no diagnostic about the underlying hardware
fault. No error record is logged, and the subsequent memory_failure()
handling has no input.

There are two principal ways to obtain that information on HEE:

  1. Have firmware parse the platform error registers and hand the
     kernel a CPER record through APEI / GHES.
  2. Have the kernel read the error registers directly.

Option (2) is not yet viable on RISC-V: the architecture does not
define a unified, mandatory layout for hardware error status
registers across implementations, so there is nothing stable for
common code to read. This patch therefore only implements option (1):
collect hardware error information on HEE through the existing APEI /
GHES path, mirroring how arm64 treats SEA.

Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
---
 arch/riscv/include/asm/acpi.h |  2 ++
 arch/riscv/kernel/acpi.c      | 54 +++++++++++++++++++++++++++++++++++
 arch/riscv/kernel/traps.c     | 35 +++++++++++++++++++++--
 3 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/include/asm/acpi.h b/arch/riscv/include/asm/acpi.h
index aa889093f531..e4d18421063e 100644
--- a/arch/riscv/include/asm/acpi.h
+++ b/arch/riscv/include/asm/acpi.h
@@ -87,6 +87,7 @@ int acpi_get_riscv_isa(struct acpi_table_header *table,
 
 void acpi_get_cbo_block_size(struct acpi_table_header *table, u32 *cbom_size,
 			     u32 *cboz_size, u32 *cbop_size);
+int apei_claim_hee(struct pt_regs *regs);
 #else
 static inline void acpi_init_rintc_map(void) { }
 static inline struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu)
@@ -104,6 +105,7 @@ static inline void acpi_get_cbo_block_size(struct acpi_table_header *table,
 					   u32 *cbom_size, u32 *cboz_size,
 					   u32 *cbop_size) { }
 
+static inline int apei_claim_hee(struct pt_regs *regs) { return -ENOENT; }
 #endif /* CONFIG_ACPI */
 
 #ifdef CONFIG_ACPI_NUMA
diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c
index 068e0b404b6f..77ad1e18a092 100644
--- a/arch/riscv/kernel/acpi.c
+++ b/arch/riscv/kernel/acpi.c
@@ -21,6 +21,9 @@
 #include <linux/of_fdt.h>
 #include <linux/pci.h>
 #include <linux/serial_core.h>
+#include <linux/irq_work.h>
+#include <linux/nmi.h>
+#include <acpi/ghes.h>
 
 int acpi_noirq = 1;		/* skip ACPI IRQ initialization */
 int acpi_disabled = 1;
@@ -353,3 +356,54 @@ int acpi_get_cpu_uid(unsigned int cpu, u32 *uid)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(acpi_get_cpu_uid);
+
+/*
+ * Claim a Hardware Error Exception (HEE) as a firmware-first notification.
+ * Returns 0 if ghes_notify_hee() succeeded and the queued irq_work was run,
+ * -EINPROGRESS if it was left pending, -ENOENT without GHES, else GHES' error.
+ */
+int apei_claim_hee(struct pt_regs *regs)
+{
+	int err = -ENOENT;
+	unsigned long flags;
+	bool return_to_irqs_enabled;
+	bool need_nmi_ctx = !in_nmi();
+
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES))
+		return err;
+
+	local_irq_save(flags);
+
+	/*
+	 * Record whether the trapped context had IRQs enabled (a NULL regs is
+	 * treated as masked); only then may the irq_work be run below.
+	 */
+	return_to_irqs_enabled = false;
+	if (regs)
+		return_to_irqs_enabled = !regs_irqs_disabled(regs);
+
+	if (need_nmi_ctx)
+		nmi_enter();
+	err = ghes_notify_hee();
+	if (need_nmi_ctx)
+		nmi_exit();
+
+	/*
+	 * APEI NMI-like notifications are deferred to irq_work; run it now unless
+	 * the trapped code had IRQs masked. NOTE(review): verify when in_nmi().
+	 */
+	if (!err) {
+		if (return_to_irqs_enabled) {
+			__irq_enter();
+			irq_work_run();
+			__irq_exit();
+		} else {
+			pr_warn_ratelimited("APEI work queued but not completed\n");
+			err = -EINPROGRESS;
+		}
+	}
+
+	local_irq_restore(flags);
+	return err;
+}
+EXPORT_SYMBOL(apei_claim_hee);
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 8c62c771a656..5ee0ac8b0745 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -22,6 +22,7 @@
 #include <linux/irq.h>
 #include <linux/kexec.h>
 #include <linux/entry-common.h>
+#include <linux/acpi.h>
 
 #include <asm/asm-prototypes.h>
 #include <asm/bug.h>
@@ -161,8 +162,6 @@ asmlinkage __visible __trap_section void name(struct pt_regs *regs)		\
 
 DO_ERROR_INFO(do_trap_unknown,
 	SIGILL, ILL_ILLTRP, "unknown exception");
-DO_ERROR_INFO(do_trap_hardware_error,
-	SIGBUS, BUS_MCEERR_AR, "hardware error");
 DO_ERROR_INFO(do_trap_insn_misaligned,
 	SIGBUS, BUS_ADRALN, "instruction address misaligned");
 DO_ERROR_INFO(do_trap_insn_fault,
@@ -484,3 +483,35 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs)
 		wait_for_interrupt();
 }
 #endif
+
+static int claim_hardware_error(struct pt_regs *regs)
+{
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_HEE))
+		return -ENOENT;	/* CONFIG_ACPI_APEI_HEE is off */
+	return apei_claim_hee(regs);
+}
+
+asmlinkage __visible __trap_section void do_trap_hardware_error(struct pt_regs *regs)
+{
+	if (!user_mode(regs)) {
+		/* Kernel-mode trap: bracketed by irqentry_nmi_enter()/exit(). */
+		irqentry_state_t nmi_state = irqentry_nmi_enter(regs);
+
+		claim_hardware_error(regs);
+		if (!fixup_exception(regs))
+			die(regs, "Hardware Error Exception");
+
+		irqentry_nmi_exit(regs, nmi_state);
+	} else {
+		irqentry_enter_from_user_mode(regs);
+		local_irq_enable();
+
+		/* Fall back to do_trap_error(SIGBUS) only if the claim failed. */
+		if (claim_hardware_error(regs) != 0)
+			do_trap_error(regs, SIGBUS, BUS_MCEERR_AR,
+				      regs->badaddr, "Hardware Error Exception");
+
+		local_irq_disable();
+		irqentry_exit_to_user_mode(regs);
+	}
+}
-- 
2.51.2.612.gdc70283dfc


_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

      parent reply	other threads:[~2026-05-08  8:20 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-08  8:20 [PATCH 0/3] riscv: log Hardware Error Exception via APEI Ruidong Tian
2026-05-08  8:20 ` [PATCH 1/3] acpi: Introduce HEE in HEST notification types Ruidong Tian
2026-05-08  8:20 ` [PATCH 2/3] riscv: Introduce HEST HEE notification handlers for APEI Ruidong Tian
2026-05-08  8:20 ` Ruidong Tian [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260508082020.3368109-4-tianruidong@linux.alibaba.com \
    --to=tianruidong@linux.alibaba.com \
    --cc=acpica-devel@lists.linux.dev \
    --cc=alex@ghiti.fr \
    --cc=aou@eecs.berkeley.edu \
    --cc=bp@alien8.de \
    --cc=guohanjun@huawei.com \
    --cc=lenb@kernel.org \
    --cc=linux-acpi@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-riscv@lists.infradead.org \
    --cc=mchehab@kernel.org \
    --cc=palmer@dabbelt.com \
    --cc=pjw@kernel.org \
    --cc=rafael@kernel.org \
    --cc=saket.dumbre@intel.com \
    --cc=tony.luck@intel.com \
    --cc=xueshuai@linux.alibaba.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox