From: Ruidong Tian <tianruidong@linux.alibaba.com>
To: pjw@kernel.org, palmer@dabbelt.com, aou@eecs.berkeley.edu,
alex@ghiti.fr, rafael@kernel.org, tony.luck@intel.com,
bp@alien8.de, guohanjun@huawei.com, mchehab@kernel.org,
xueshuai@linux.alibaba.com, lenb@kernel.org,
saket.dumbre@intel.com
Cc: linux-riscv@lists.infradead.org, linux-kernel@vger.kernel.org,
linux-acpi@vger.kernel.org, acpica-devel@lists.linux.dev,
Ruidong Tian <tianruidong@linux.alibaba.com>
Subject: [PATCH 3/3] riscv: collect hardware error information via APEI on HEE
Date: Fri, 8 May 2026 16:20:20 +0800 [thread overview]
Message-ID: <20260508082020.3368109-4-tianruidong@linux.alibaba.com> (raw)
In-Reply-To: <20260508082020.3368109-1-tianruidong@linux.alibaba.com>
RISC-V already dispatches Hardware Error Exceptions through
do_trap_hardware_error(), but the trap handler currently has no way to
learn *what* went wrong: the user sees the offending task killed, or
the kernel panics, with no diagnostic about the underlying hardware
fault. No error record is logged, and the subsequent memory_failure()
handling has no input.
There are two principal ways to obtain that information on HEE:
1. Have firmware parse the platform error registers and hand the
kernel a CPER record through APEI / GHES.
2. Have the kernel read the error registers directly.
Option (2) is not yet viable on RISC-V: the architecture does not
define a unified, mandatory layout for hardware error status
registers across implementations, so there is nothing stable for
common code to read. This patch therefore only implements option (1):
collect hardware error information on HEE through the existing APEI /
GHES path, mirroring how arm64 treats SEA.
Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
---
arch/riscv/include/asm/acpi.h | 2 ++
arch/riscv/kernel/acpi.c | 54 +++++++++++++++++++++++++++++++++++
arch/riscv/kernel/traps.c | 35 +++++++++++++++++++++--
3 files changed, 89 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/include/asm/acpi.h b/arch/riscv/include/asm/acpi.h
index aa889093f531..e4d18421063e 100644
--- a/arch/riscv/include/asm/acpi.h
+++ b/arch/riscv/include/asm/acpi.h
@@ -87,6 +87,7 @@ int acpi_get_riscv_isa(struct acpi_table_header *table,
void acpi_get_cbo_block_size(struct acpi_table_header *table, u32 *cbom_size,
u32 *cboz_size, u32 *cbop_size);
+int apei_claim_hee(struct pt_regs *regs);
#else
static inline void acpi_init_rintc_map(void) { }
static inline struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu)
@@ -104,6 +105,7 @@ static inline void acpi_get_cbo_block_size(struct acpi_table_header *table,
u32 *cbom_size, u32 *cboz_size,
u32 *cbop_size) { }
+/* !CONFIG_ACPI stub: always returns -ENOENT. */
+static inline int apei_claim_hee(struct pt_regs *regs) { return -ENOENT; }
#endif /* CONFIG_ACPI */
#ifdef CONFIG_ACPI_NUMA
diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c
index 068e0b404b6f..77ad1e18a092 100644
--- a/arch/riscv/kernel/acpi.c
+++ b/arch/riscv/kernel/acpi.c
@@ -21,6 +21,9 @@
#include <linux/of_fdt.h>
#include <linux/pci.h>
#include <linux/serial_core.h>
+#include <linux/irq_work.h>
+#include <linux/nmi.h>
+#include <acpi/ghes.h>
int acpi_noirq = 1; /* skip ACPI IRQ initialization */
int acpi_disabled = 1;
@@ -353,3 +356,54 @@ int acpi_get_cpu_uid(unsigned int cpu, u32 *uid)
return 0;
}
EXPORT_SYMBOL_GPL(acpi_get_cpu_uid);
+
+/*
+ * Claim Hardware Error Exception as a firmware first notification.
+ *
+ * Used by RISC-V exception handler for hardware error processing.
+ *
+ * @regs: register state of the interrupted context. May be NULL, in which
+ *        case the context is treated as having had IRQs disabled.
+ *
+ * Local IRQs are masked for the duration of the call. ghes_notify_hee() is
+ * called between nmi_enter()/nmi_exit() unless in_nmi() is already true on
+ * entry.
+ *
+ * Return:
+ *   -ENOENT      if CONFIG_ACPI_APEI_GHES is not enabled;
+ *   non-zero     value from ghes_notify_hee(), passed through unchanged;
+ *   0            if ghes_notify_hee() returned 0 and the pending irq_work
+ *                was run here;
+ *   -EINPROGRESS if ghes_notify_hee() returned 0 but the interrupted context
+ *                had IRQs disabled, so the irq_work was not run here.
+ */
+int apei_claim_hee(struct pt_regs *regs)
+{
+	int err = -ENOENT;
+	unsigned long flags;
+	bool return_to_irqs_enabled;
+	bool need_nmi_ctx = !in_nmi();
+
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES))
+		return err;
+
+	local_irq_save(flags);
+
+	/*
+	 * Determine whether the interrupted context had IRQs enabled.
+	 * This decides if we can run irq_work immediately after.
+	 */
+	return_to_irqs_enabled = false;
+	if (regs)
+		return_to_irqs_enabled = !regs_irqs_disabled(regs);
+
+	/* Only enter NMI context here if the caller has not already done so. */
+	if (need_nmi_ctx)
+		nmi_enter();
+	err = ghes_notify_hee();
+	if (need_nmi_ctx)
+		nmi_exit();
+
+	/*
+	 * APEI NMI-like notifications are deferred to irq_work. Unless
+	 * we interrupted irqs-masked code, we can do that now.
+	 */
+	if (!err) {
+		if (return_to_irqs_enabled) {
+			__irq_enter();
+			irq_work_run();
+			__irq_exit();
+		} else {
+			/* Terminate the message with a newline like other printk users. */
+			pr_warn_ratelimited("APEI work queued but not completed\n");
+			err = -EINPROGRESS;
+		}
+	}
+
+	local_irq_restore(flags);
+	return err;
+}
+EXPORT_SYMBOL(apei_claim_hee);
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 8c62c771a656..5ee0ac8b0745 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -22,6 +22,7 @@
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/entry-common.h>
+#include <linux/acpi.h>
#include <asm/asm-prototypes.h>
#include <asm/bug.h>
@@ -161,8 +162,6 @@ asmlinkage __visible __trap_section void name(struct pt_regs *regs) \
DO_ERROR_INFO(do_trap_unknown,
SIGILL, ILL_ILLTRP, "unknown exception");
-DO_ERROR_INFO(do_trap_hardware_error,
- SIGBUS, BUS_MCEERR_AR, "hardware error");
DO_ERROR_INFO(do_trap_insn_misaligned,
SIGBUS, BUS_ADRALN, "instruction address misaligned");
DO_ERROR_INFO(do_trap_insn_fault,
@@ -484,3 +483,35 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs)
wait_for_interrupt();
}
#endif
+
+/*
+ * Try to hand a hardware error over to APEI.
+ *
+ * Returns the result of apei_claim_hee() when CONFIG_ACPI_APEI_HEE is
+ * enabled, and -ENOENT otherwise.
+ */
+static int claim_hardware_error(struct pt_regs *regs)
+{
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_HEE))
+		return -ENOENT;
+
+	return apei_claim_hee(regs);
+}
+
+/*
+ * Trap handler for the Hardware Error Exception.
+ *
+ * Kernel mode: the trap is handled between irqentry_nmi_enter() and
+ * irqentry_nmi_exit(). The claim is attempted but its result is not used;
+ * if fixup_exception() finds no fixup, the kernel dies.
+ *
+ * User mode: the claim is attempted with IRQs enabled. If it returns
+ * non-zero, do_trap_error() is called with SIGBUS / BUS_MCEERR_AR and
+ * regs->badaddr.
+ */
+asmlinkage __visible __trap_section void do_trap_hardware_error(struct pt_regs *regs)
+{
+	irqentry_state_t state;
+	int unclaimed;
+
+	if (!user_mode(regs)) {
+		state = irqentry_nmi_enter(regs);
+
+		/* Return value intentionally unused on the kernel path. */
+		claim_hardware_error(regs);
+
+		if (!fixup_exception(regs))
+			die(regs, "Hardware Error Exception");
+
+		irqentry_nmi_exit(regs, state);
+		return;
+	}
+
+	irqentry_enter_from_user_mode(regs);
+	local_irq_enable();
+
+	unclaimed = claim_hardware_error(regs);
+	if (unclaimed)
+		do_trap_error(regs, SIGBUS, BUS_MCEERR_AR,
+			      regs->badaddr,
+			      "Hardware Error Exception");
+
+	local_irq_disable();
+	irqentry_exit_to_user_mode(regs);
+}
--
2.51.2.612.gdc70283dfc
_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv
prev parent reply other threads:[~2026-05-08 8:20 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-08 8:20 [PATCH 0/3] riscv: log Hardware Error Exception via APEI Ruidong Tian
2026-05-08 8:20 ` [PATCH 1/3] acpi: Introduce HEE in HEST notification types Ruidong Tian
2026-05-08 8:20 ` [PATCH 2/3] riscv: Introduce HEST HEE notification handlers for APEI Ruidong Tian
2026-05-08 8:20 ` Ruidong Tian [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260508082020.3368109-4-tianruidong@linux.alibaba.com \
--to=tianruidong@linux.alibaba.com \
--cc=acpica-devel@lists.linux.dev \
--cc=alex@ghiti.fr \
--cc=aou@eecs.berkeley.edu \
--cc=bp@alien8.de \
--cc=guohanjun@huawei.com \
--cc=lenb@kernel.org \
--cc=linux-acpi@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-riscv@lists.infradead.org \
--cc=mchehab@kernel.org \
--cc=palmer@dabbelt.com \
--cc=pjw@kernel.org \
--cc=rafael@kernel.org \
--cc=saket.dumbre@intel.com \
--cc=tony.luck@intel.com \
--cc=xueshuai@linux.alibaba.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox