* [PATCH 1/3] acpi: Introduce HEE in HEST notification types
2026-05-08 8:20 [PATCH 0/3] riscv: log Hardware Error Exception via APEI Ruidong Tian
@ 2026-05-08 8:20 ` Ruidong Tian
2026-05-08 8:20 ` [PATCH 2/3] riscv: Introduce HEST HEE notification handlers for APEI Ruidong Tian
2026-05-08 8:20 ` [PATCH 3/3] riscv: collect hardware error information via APEI on HEE Ruidong Tian
2 siblings, 0 replies; 4+ messages in thread
From: Ruidong Tian @ 2026-05-08 8:20 UTC (permalink / raw)
To: pjw, palmer, aou, alex, rafael, tony.luck, bp, guohanjun, mchehab,
xueshuai, lenb, saket.dumbre
Cc: linux-riscv, linux-kernel, linux-acpi, acpica-devel, Ruidong Tian
Introduce a new HEST notification type for the RISC-V Hardware
Error Exception (HEE). The notification structure of a GHES entry
specifies the notification type to be used for a given error source.
Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
---
include/acpi/actbl1.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index 14924383e2d0..2c1ad1d8587b 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -1793,7 +1793,8 @@ enum acpi_hest_notify_types {
ACPI_HEST_NOTIFY_GSIV = 10, /* ACPI 6.1 */
ACPI_HEST_NOTIFY_SOFTWARE_DELEGATED = 11, /* ACPI 6.2 */
ACPI_HEST_NOTIFY_SSE = 12, /* RISCV SSE */
- ACPI_HEST_NOTIFY_RESERVED = 13 /* 13 and greater are reserved */
+ ACPI_HEST_NOTIFY_HEE = 13, /* RISCV Hardware Error Exception */
+ ACPI_HEST_NOTIFY_RESERVED = 14 /* 14 and greater are reserved */
};
/* Values for config_write_enable bitfield above */
--
2.51.2.612.gdc70283dfc
^ permalink raw reply related [flat|nested] 4+ messages in thread* [PATCH 2/3] riscv: Introduce HEST HEE notification handlers for APEI
2026-05-08 8:20 [PATCH 0/3] riscv: log Hardware Error Exception via APEI Ruidong Tian
2026-05-08 8:20 ` [PATCH 1/3] acpi: Introduce HEE in HEST notification types Ruidong Tian
@ 2026-05-08 8:20 ` Ruidong Tian
2026-05-08 8:20 ` [PATCH 3/3] riscv: collect hardware error information via APEI on HEE Ruidong Tian
2 siblings, 0 replies; 4+ messages in thread
From: Ruidong Tian @ 2026-05-08 8:20 UTC (permalink / raw)
To: pjw, palmer, aou, alex, rafael, tony.luck, bp, guohanjun, mchehab,
xueshuai, lenb, saket.dumbre
Cc: linux-riscv, linux-kernel, linux-acpi, acpica-devel, Ruidong Tian
Add functions to register a GHES entry that uses the HEE notification
type, allowing the OS to receive hardware error notifications from
firmware through standardized ACPI interfaces.
Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
---
arch/riscv/include/asm/fixmap.h | 3 ++
drivers/acpi/apei/Kconfig | 12 ++++++
drivers/acpi/apei/ghes.c | 68 ++++++++++++++++++++++++++++++++-
include/acpi/ghes.h | 6 +++
4 files changed, 87 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index e874fd952286..9b3d5bbfda24 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -45,6 +45,9 @@ enum fixed_addresses {
FIX_APEI_GHES_SSE_LOW_PRIORITY,
FIX_APEI_GHES_SSE_HIGH_PRIORITY,
#endif /* CONFIG_RISCV_SBI_SSE */
+#ifdef CONFIG_ACPI_APEI_HEE
+ FIX_APEI_GHES_HEE,
+#endif /* CONFIG_ACPI_APEI_HEE */
#endif /* CONFIG_ACPI_APEI_GHES */
__end_of_permanent_fixed_addresses,
/*
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index 895a843d0e36..ff487ab28a65 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -51,6 +51,18 @@ config ACPI_APEI_SSE
depends on RISCV && RISCV_SBI_SSE && ACPI_APEI_GHES
default y
+config ACPI_APEI_HEE
+ bool "APEI Hardware Error Exception support"
+ depends on RISCV && ACPI_APEI_GHES
+ default y
+ help
+ Enable support for the RISC-V Hardware Error Exception (HEE)
+ notification type in the ACPI Platform Error Interface (APEI). This
+ allows firmware to report hardware errors through the RISC-V exception mechanism.
+
+ Say Y if you want to support firmware-first error handling
+ on RISC-V platforms with ACPI.
+
config ACPI_APEI_MEMORY_FAILURE
bool "APEI memory error recovering support"
depends on ACPI_APEI && MEMORY_FAILURE
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index f228640d3f25..e0a5db80e554 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -125,7 +125,8 @@ static inline bool is_hest_sync_notify(struct ghes *ghes)
{
u8 notify_type = ghes->generic->notify.type;
- return notify_type == ACPI_HEST_NOTIFY_SEA;
+ return notify_type == ACPI_HEST_NOTIFY_SEA ||
+ notify_type == ACPI_HEST_NOTIFY_HEE;
}
/*
@@ -1404,7 +1405,8 @@ static int ghes_in_nmi_queue_one_entry(struct ghes *ghes,
return rc;
}
-#if defined(CONFIG_HAVE_ACPI_APEI_NMI) || defined(CONFIG_ACPI_APEI_SEA)
+#if defined(CONFIG_HAVE_ACPI_APEI_NMI) || defined(CONFIG_ACPI_APEI_SEA) || \
+ defined(CONFIG_ACPI_APEI_HEE)
static int ghes_in_nmi_spool_from_list(struct list_head *rcu_list,
enum fixed_addresses fixmap_idx)
{
@@ -1540,6 +1542,53 @@ static inline int ghes_sea_add(struct ghes *ghes) { return -EINVAL; }
static inline void ghes_sea_remove(struct ghes *ghes) { }
#endif /* CONFIG_ACPI_APEI_SEA */
+#ifdef CONFIG_ACPI_APEI_HEE
+static LIST_HEAD(ghes_hee);
+
+/*
+ * Return 0 only if one of the HEE error sources successfully reported an error
+ * record sent from the firmware.
+ *
+ * The ghes_hee list and the FIX_APEI_GHES_HEE fixmap index are handed to
+ * ghes_in_nmi_spool_from_list(), whose result is returned unchanged. A
+ * function-local raw spinlock is held around that call.
+ */
+int ghes_notify_hee(void)
+{
+ static DEFINE_RAW_SPINLOCK(ghes_notify_lock_hee);
+ int rv;
+
+ raw_spin_lock(&ghes_notify_lock_hee);
+ rv = ghes_in_nmi_spool_from_list(&ghes_hee, FIX_APEI_GHES_HEE);
+ raw_spin_unlock(&ghes_notify_lock_hee);
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(ghes_notify_hee);
+
+/*
+ * Call ghes_map_error_status() for @ghes and, only if that succeeds, add
+ * @ghes to the ghes_hee RCU list while holding ghes_list_mutex.
+ *
+ * Returns 0 on success or the error from ghes_map_error_status().
+ */
+static int ghes_hee_add(struct ghes *ghes)
+{
+	int err = ghes_map_error_status(ghes);
+
+	if (err)
+		return err;
+
+	mutex_lock(&ghes_list_mutex);
+	list_add_rcu(&ghes->list, &ghes_hee);
+	mutex_unlock(&ghes_list_mutex);
+
+	return 0;
+}
+
+/*
+ * Unlink @ghes from the ghes_hee RCU list under ghes_list_mutex, then call
+ * synchronize_rcu() before returning.
+ *
+ * NOTE(review): nothing here undoes the ghes_map_error_status() call made in
+ * ghes_hee_add() -- confirm the caller releases that mapping.
+ */
+static void ghes_hee_remove(struct ghes *ghes)
+{
+ mutex_lock(&ghes_list_mutex);
+ list_del_rcu(&ghes->list);
+ mutex_unlock(&ghes_list_mutex);
+ synchronize_rcu();
+}
+#else /* CONFIG_ACPI_APEI_HEE */
+/* Returns int like ghes_sea_add(): ghes_probe() stores the result in rc. */
+static inline int ghes_hee_add(struct ghes *ghes) { return -EINVAL; }
+static inline void ghes_hee_remove(struct ghes *ghes) { }
+#endif /* CONFIG_ACPI_APEI_HEE */
+
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
/*
* NMI may be triggered on any CPU, so ghes_in_nmi is used for
@@ -1754,6 +1803,13 @@ static int ghes_probe(struct platform_device *ghes_dev)
goto err;
}
break;
+ case ACPI_HEST_NOTIFY_HEE:
+ if (!IS_ENABLED(CONFIG_ACPI_APEI_HEE)) {
+ pr_warn(GHES_PFX "Generic hardware error source: %d notified via HEE is not supported\n",
+ generic->header.source_id);
+ goto err;
+ }
+ break;
case ACPI_HEST_NOTIFY_NMI:
if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) {
pr_warn(GHES_PFX "Generic hardware error source: %d notified via NMI interrupt is not supported!\n",
@@ -1837,6 +1893,11 @@ static int ghes_probe(struct platform_device *ghes_dev)
if (rc)
goto err;
break;
+ case ACPI_HEST_NOTIFY_HEE:
+ rc = ghes_hee_add(ghes);
+ if (rc)
+ goto err;
+ break;
case ACPI_HEST_NOTIFY_NMI:
rc = ghes_nmi_add(ghes);
if (rc)
@@ -1917,6 +1978,9 @@ static void ghes_remove(struct platform_device *ghes_dev)
case ACPI_HEST_NOTIFY_SEA:
ghes_sea_remove(ghes);
break;
+ case ACPI_HEST_NOTIFY_HEE:
+ ghes_hee_remove(ghes);
+ break;
case ACPI_HEST_NOTIFY_NMI:
ghes_nmi_remove(ghes);
break;
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 8d7e5caef3f1..bf4f6077ca39 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -140,6 +140,12 @@ int ghes_notify_sea(void);
static inline int ghes_notify_sea(void) { return -ENOENT; }
#endif
+/*
+ * ghes_notify_hee() is implemented in drivers/acpi/apei/ghes.c when
+ * CONFIG_ACPI_APEI_HEE is set; otherwise this stub returns -ENOENT.
+ */
+#ifdef CONFIG_ACPI_APEI_HEE
+int ghes_notify_hee(void);
+#else
+static inline int ghes_notify_hee(void) { return -ENOENT; }
+#endif
+
struct notifier_block;
extern void ghes_register_report_chain(struct notifier_block *nb);
extern void ghes_unregister_report_chain(struct notifier_block *nb);
--
2.51.2.612.gdc70283dfc
^ permalink raw reply related [flat|nested] 4+ messages in thread* [PATCH 3/3] riscv: collect hardware error information via APEI on HEE
2026-05-08 8:20 [PATCH 0/3] riscv: log Hardware Error Exception via APEI Ruidong Tian
2026-05-08 8:20 ` [PATCH 1/3] acpi: Introduce HEE in HEST notification types Ruidong Tian
2026-05-08 8:20 ` [PATCH 2/3] riscv: Introduce HEST HEE notification handlers for APEI Ruidong Tian
@ 2026-05-08 8:20 ` Ruidong Tian
2 siblings, 0 replies; 4+ messages in thread
From: Ruidong Tian @ 2026-05-08 8:20 UTC (permalink / raw)
To: pjw, palmer, aou, alex, rafael, tony.luck, bp, guohanjun, mchehab,
xueshuai, lenb, saket.dumbre
Cc: linux-riscv, linux-kernel, linux-acpi, acpica-devel, Ruidong Tian
RISC-V already dispatches Hardware Error Exceptions through
do_trap_hardware_error(), but the trap handler currently has no way to
learn *what* went wrong: the user sees the offending task killed, or
the kernel panics, with no diagnostic about the underlying hardware
fault. No error record is logged, and the subsequent memory_failure()
handling has no input.
There are two principal ways to obtain that information on HEE:
1. Have firmware parse the platform error registers and hand the
kernel a CPER record through APEI / GHES.
2. Have the kernel read the error registers directly.
Option (2) is not yet viable on RISC-V: the architecture does not
define a unified, mandatory layout for hardware error status
registers across implementations, so there is nothing stable for
common code to read. This patch therefore only implements option (1):
collect hardware error information on HEE through the existing APEI /
GHES path, mirroring how arm64 treats SEA.
Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
---
arch/riscv/include/asm/acpi.h | 2 ++
arch/riscv/kernel/acpi.c | 54 +++++++++++++++++++++++++++++++++++
arch/riscv/kernel/traps.c | 35 +++++++++++++++++++++--
3 files changed, 89 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/include/asm/acpi.h b/arch/riscv/include/asm/acpi.h
index aa889093f531..e4d18421063e 100644
--- a/arch/riscv/include/asm/acpi.h
+++ b/arch/riscv/include/asm/acpi.h
@@ -87,6 +87,7 @@ int acpi_get_riscv_isa(struct acpi_table_header *table,
void acpi_get_cbo_block_size(struct acpi_table_header *table, u32 *cbom_size,
u32 *cboz_size, u32 *cbop_size);
+int apei_claim_hee(struct pt_regs *regs);
#else
static inline void acpi_init_rintc_map(void) { }
static inline struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu)
@@ -104,6 +105,7 @@ static inline void acpi_get_cbo_block_size(struct acpi_table_header *table,
u32 *cbom_size, u32 *cboz_size,
u32 *cbop_size) { }
+static inline int apei_claim_hee(struct pt_regs *regs) { return -ENOENT; }
#endif /* CONFIG_ACPI */
#ifdef CONFIG_ACPI_NUMA
diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c
index 068e0b404b6f..77ad1e18a092 100644
--- a/arch/riscv/kernel/acpi.c
+++ b/arch/riscv/kernel/acpi.c
@@ -21,6 +21,9 @@
#include <linux/of_fdt.h>
#include <linux/pci.h>
#include <linux/serial_core.h>
+#include <linux/irq_work.h>
+#include <linux/nmi.h>
+#include <acpi/ghes.h>
int acpi_noirq = 1; /* skip ACPI IRQ initialization */
int acpi_disabled = 1;
@@ -353,3 +356,54 @@ int acpi_get_cpu_uid(unsigned int cpu, u32 *uid)
return 0;
}
EXPORT_SYMBOL_GPL(acpi_get_cpu_uid);
+
+/*
+ * Claim a Hardware Error Exception as a firmware-first notification.
+ *
+ * Used by the RISC-V exception handler for hardware error processing.
+ *
+ * @regs: register state of the interrupted context. May be NULL, in which
+ *        case the context is treated as having had IRQs disabled.
+ *
+ * Return: -ENOENT when CONFIG_ACPI_APEI_GHES is disabled. Otherwise the
+ * result of ghes_notify_hee(), except that a 0 result becomes -EINPROGRESS
+ * when the interrupted context had IRQs disabled (or @regs is NULL) and the
+ * queued irq_work therefore cannot be run from here.
+ */
+int apei_claim_hee(struct pt_regs *regs)
+{
+	int err = -ENOENT;
+	unsigned long flags;
+	bool return_to_irqs_enabled;
+	bool need_nmi_ctx = !in_nmi();
+
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES))
+		return err;
+
+	local_irq_save(flags);
+
+	/*
+	 * Determine whether the interrupted context had IRQs enabled.
+	 * This decides if we can run irq_work immediately after.
+	 */
+	return_to_irqs_enabled = regs && !regs_irqs_disabled(regs);
+
+	/* Call ghes_notify_hee() inside an NMI section, entering one if needed. */
+	if (need_nmi_ctx)
+		nmi_enter();
+	err = ghes_notify_hee();
+	if (need_nmi_ctx)
+		nmi_exit();
+
+	/*
+	 * APEI NMI-like notifications are deferred to irq_work. Unless
+	 * we interrupted irqs-masked code, we can do that now.
+	 */
+	if (!err) {
+		if (return_to_irqs_enabled) {
+			__irq_enter();
+			irq_work_run();
+			__irq_exit();
+		} else {
+			/* Terminate the message so the log record is complete. */
+			pr_warn_ratelimited("APEI work queued but not completed\n");
+			err = -EINPROGRESS;
+		}
+	}
+
+	local_irq_restore(flags);
+	return err;
+}
+EXPORT_SYMBOL(apei_claim_hee);
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 8c62c771a656..5ee0ac8b0745 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -22,6 +22,7 @@
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/entry-common.h>
+#include <linux/acpi.h>
#include <asm/asm-prototypes.h>
#include <asm/bug.h>
@@ -161,8 +162,6 @@ asmlinkage __visible __trap_section void name(struct pt_regs *regs) \
DO_ERROR_INFO(do_trap_unknown,
SIGILL, ILL_ILLTRP, "unknown exception");
-DO_ERROR_INFO(do_trap_hardware_error,
- SIGBUS, BUS_MCEERR_AR, "hardware error");
DO_ERROR_INFO(do_trap_insn_misaligned,
SIGBUS, BUS_ADRALN, "instruction address misaligned");
DO_ERROR_INFO(do_trap_insn_fault,
@@ -484,3 +483,35 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs)
wait_for_interrupt();
}
#endif
+
+/*
+ * Forward the trap to apei_claim_hee() when CONFIG_ACPI_APEI_HEE is enabled;
+ * otherwise report -ENOENT so the caller treats the error as unclaimed.
+ */
+static int claim_hardware_error(struct pt_regs *regs)
+{
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_HEE))
+		return -ENOENT;
+
+	return apei_claim_hee(regs);
+}
+
+/*
+ * Trap handler for the RISC-V Hardware Error Exception.
+ *
+ * User mode: try to claim the error through claim_hardware_error(); if that
+ * returns non-zero, fall back to do_trap_error() with SIGBUS/BUS_MCEERR_AR
+ * and regs->badaddr.
+ *
+ * Kernel mode: run between irqentry_nmi_enter() and irqentry_nmi_exit(). The
+ * claim result is ignored; die() is called when fixup_exception() returns 0.
+ */
+asmlinkage __visible __trap_section void do_trap_hardware_error(struct pt_regs *regs)
+{
+ if (user_mode(regs)) {
+ irqentry_enter_from_user_mode(regs);
+ local_irq_enable();
+
+ /* A zero return means the error was claimed; no signal is raised here. */
+ if (claim_hardware_error(regs))
+ do_trap_error(regs, SIGBUS, BUS_MCEERR_AR,
+ regs->badaddr,
+ "Hardware Error Exception");
+
+ local_irq_disable();
+ irqentry_exit_to_user_mode(regs);
+ } else {
+ irqentry_state_t state = irqentry_nmi_enter(regs);
+
+ /* Return value intentionally unused on the kernel-mode path. */
+ claim_hardware_error(regs);
+
+ if (!fixup_exception(regs))
+ die(regs, "Hardware Error Exception");
+
+ irqentry_nmi_exit(regs, state);
+ }
+}
--
2.51.2.612.gdc70283dfc
^ permalink raw reply related [flat|nested] 4+ messages in thread