public inbox for linux-acpi@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] acpi/apei: Add NVIDIA GHES vendor CPER record handler
@ 2026-03-16 10:50 Kai-Heng Feng
  2026-03-16 17:07 ` Jonathan Cameron
  0 siblings, 1 reply; 3+ messages in thread
From: Kai-Heng Feng @ 2026-03-16 10:50 UTC (permalink / raw)
  To: rafael
  Cc: Kai-Heng Feng, Shiju Jose, Tony Luck, Borislav Petkov, Hanjun Guo,
	Mauro Carvalho Chehab, Shuai Xue, Len Brown, Huang Yiwei,
	Will Deacon, Gavin Shan, Jonathan Cameron, Fabio M. De Francesco,
	Nathan Chancellor, linux-kernel, linux-acpi

Add support for decoding NVIDIA-specific CPER sections delivered via
the APEI GHES vendor record notifier chain. NVIDIA hardware generates
vendor-specific CPER sections containing error signatures and diagnostic
register dumps. This implementation registers a notifier_block with the
GHES vendor record notifier and decodes these sections, printing error
details via dev_info().

The driver binds to ACPI device NVDA2012, present on NVIDIA server
platforms. The NVIDIA CPER section contains a fixed header with error
metadata (signature, error type, severity, socket) followed by
variable-length register address-value pairs for hardware diagnostics.

This work is based on libcper [0].

Example output:
nvidia-ghes NVDA2012:00: NVIDIA CPER section, error_data_length: 544
nvidia-ghes NVDA2012:00: signature: CMET-INFO
nvidia-ghes NVDA2012:00: error_type: 0
nvidia-ghes NVDA2012:00: error_instance: 0
nvidia-ghes NVDA2012:00: severity: 3
nvidia-ghes NVDA2012:00: socket: 0
nvidia-ghes NVDA2012:00: number_regs: 32
nvidia-ghes NVDA2012:00: instance_base: 0x0000000000000000
nvidia-ghes NVDA2012:00: register[0]: address=0x8000000100000000 value=0x0000000100000000

[0] https://github.com/openbmc/libcper/commit/683e055061ce
Cc: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Kai-Heng Feng <kaihengf@nvidia.com>
---
 MAINTAINERS                     |   6 ++
 drivers/acpi/apei/Kconfig       |  14 +++
 drivers/acpi/apei/Makefile      |   1 +
 drivers/acpi/apei/nvidia-ghes.c | 168 ++++++++++++++++++++++++++++++++
 4 files changed, 189 insertions(+)
 create mode 100644 drivers/acpi/apei/nvidia-ghes.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 837db4f7bcca..db12d7ffb1f9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18910,6 +18910,12 @@ S:	Maintained
 F:	drivers/video/fbdev/nvidia/
 F:	drivers/video/fbdev/riva/
 
+NVIDIA GHES VENDOR CPER RECORD HANDLER
+M:	Kai-Heng Feng <kaihengf@nvidia.com>
+L:	linux-acpi@vger.kernel.org
+S:	Maintained
+F:	drivers/acpi/apei/nvidia-ghes.c
+
 NVIDIA VRS RTC DRIVER
 M:	Shubhi Garg <shgarg@nvidia.com>
 L:	linux-tegra@vger.kernel.org
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index 070c07d68dfb..7dc49f14f223 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -74,6 +74,20 @@ config ACPI_APEI_EINJ_CXL
 
 	  If unsure say 'n'
 
+config ACPI_APEI_NVIDIA_GHES
+	tristate "NVIDIA GHES vendor record handler"
+	depends on ACPI_APEI_GHES
+	help
+	  Support for decoding NVIDIA-specific CPER sections delivered via
+	  the APEI GHES vendor record notifier chain. Registers a handler
+	  for the NVIDIA section GUID and logs error signatures, severity,
+	  socket, and diagnostic register address-value pairs.
+
+	  Enable on NVIDIA server platforms (e.g. DGX, HGX) that expose
+	  ACPI device NVDA2012 in their firmware tables.
+
+	  If unsure, say N.
+
 config ACPI_APEI_ERST_DEBUG
 	tristate "APEI Error Record Serialization Table (ERST) Debug Support"
 	depends on ACPI_APEI
diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
index 1a0b85923cd4..4a883f67d698 100644
--- a/drivers/acpi/apei/Makefile
+++ b/drivers/acpi/apei/Makefile
@@ -10,5 +10,6 @@ obj-$(CONFIG_ACPI_APEI_EINJ)	+= einj.o
 einj-y				:= einj-core.o
 einj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o
 obj-$(CONFIG_ACPI_APEI_ERST_DEBUG) += erst-dbg.o
+obj-$(CONFIG_ACPI_APEI_NVIDIA_GHES) += nvidia-ghes.o
 
 apei-y := apei-base.o hest.o erst.o bert.o
diff --git a/drivers/acpi/apei/nvidia-ghes.c b/drivers/acpi/apei/nvidia-ghes.c
new file mode 100644
index 000000000000..0e866f536a7a
--- /dev/null
+++ b/drivers/acpi/apei/nvidia-ghes.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * NVIDIA GHES vendor record handler
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#include <linux/acpi.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/unaligned.h>
+#include <acpi/ghes.h>
+
+static const guid_t nvidia_sec_guid =
+	GUID_INIT(0x6d5244f2, 0x2712, 0x11ec,
+		  0xbe, 0xa7, 0xcb, 0x3f, 0xdb, 0x95, 0xc7, 0x86);
+
+#define NVIDIA_CPER_REG_PAIR_SIZE	16	/* address + value, each u64 */
+
+struct cper_sec_nvidia {
+	char	signature[16];
+	__le16	error_type;
+	__le16	error_instance;
+	u8	severity;
+	u8	socket;
+	u8	number_regs;
+	u8	reserved;
+	__le64	instance_base;
+} __packed;
+
+struct nvidia_ghes_private {
+	struct notifier_block	nb;
+	struct device		*dev;
+};
+
+static void nvidia_ghes_print_error(struct device *dev,
+				    const struct cper_sec_nvidia *nvidia_err,
+				    size_t error_data_length, bool fatal)
+{
+	const char *level = fatal ? KERN_ERR : KERN_INFO;
+	const u8 *reg_data;
+	size_t min_size;
+	int i;
+
+	dev_printk(level, dev, "signature: %.16s\n", nvidia_err->signature);
+	dev_printk(level, dev, "error_type: %u\n", le16_to_cpu(nvidia_err->error_type));
+	dev_printk(level, dev, "error_instance: %u\n", le16_to_cpu(nvidia_err->error_instance));
+	dev_printk(level, dev, "severity: %u\n", nvidia_err->severity);
+	dev_printk(level, dev, "socket: %u\n", nvidia_err->socket);
+	dev_printk(level, dev, "number_regs: %u\n", nvidia_err->number_regs);
+	dev_printk(level, dev, "instance_base: 0x%016llx\n",
+		   (unsigned long long)le64_to_cpu(nvidia_err->instance_base));
+
+	if (nvidia_err->number_regs == 0)
+		return;
+
+	/*
+	 * Validate that all registers fit within error_data_length.
+	 * Each register pair is NVIDIA_CPER_REG_PAIR_SIZE bytes (two u64s).
+	 */
+	min_size = sizeof(struct cper_sec_nvidia) +
+		   (size_t)nvidia_err->number_regs * NVIDIA_CPER_REG_PAIR_SIZE;
+	if (error_data_length < min_size) {
+		dev_err(dev, "Invalid number_regs %u (section size %zu, need %zu)\n",
+			nvidia_err->number_regs, error_data_length, min_size);
+		return;
+	}
+
+	/*
+	 * Registers are stored as address-value pairs immediately
+	 * following the fixed header.  Each pair is two little-endian u64s.
+	 */
+	reg_data = (const u8 *)(nvidia_err + 1);
+	for (i = 0; i < nvidia_err->number_regs; i++) {
+		u64 addr = get_unaligned_le64(reg_data + i * NVIDIA_CPER_REG_PAIR_SIZE);
+		u64 val = get_unaligned_le64(reg_data + i * NVIDIA_CPER_REG_PAIR_SIZE + 8);
+
+		dev_printk(level, dev, "register[%d]: address=0x%016llx value=0x%016llx\n",
+			   i, (unsigned long long)addr, (unsigned long long)val);
+	}
+}
+
+static int nvidia_ghes_notify(struct notifier_block *nb,
+			      unsigned long event, void *data)
+{
+	struct acpi_hest_generic_data *gdata = data;
+	struct nvidia_ghes_private *priv;
+	const struct cper_sec_nvidia *nvidia_err;
+	guid_t sec_guid;
+
+	import_guid(&sec_guid, gdata->section_type);
+	if (!guid_equal(&sec_guid, &nvidia_sec_guid))
+		return NOTIFY_DONE;
+
+	priv = container_of(nb, struct nvidia_ghes_private, nb);
+
+	if (acpi_hest_get_error_length(gdata) < sizeof(struct cper_sec_nvidia)) {
+		dev_err(priv->dev, "Section too small (%u < %zu)\n",
+			acpi_hest_get_error_length(gdata), sizeof(struct cper_sec_nvidia));
+		return NOTIFY_OK;
+	}
+
+	nvidia_err = acpi_hest_get_payload(gdata);
+
+	if (event >= GHES_SEV_RECOVERABLE)
+		dev_err(priv->dev, "NVIDIA CPER section, error_data_length: %u\n",
+			acpi_hest_get_error_length(gdata));
+	else
+		dev_info(priv->dev, "NVIDIA CPER section, error_data_length: %u\n",
+			 acpi_hest_get_error_length(gdata));
+
+	nvidia_ghes_print_error(priv->dev, nvidia_err, acpi_hest_get_error_length(gdata),
+				event >= GHES_SEV_RECOVERABLE);
+
+	return NOTIFY_OK;
+}
+
+static int nvidia_ghes_probe(struct platform_device *pdev)
+{
+	struct nvidia_ghes_private *priv;
+	int ret;
+
+	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->nb.notifier_call = nvidia_ghes_notify;
+	priv->dev = &pdev->dev;
+
+	ret = ghes_register_vendor_record_notifier(&priv->nb);
+	if (ret) {
+		dev_err(&pdev->dev,
+			"Failed to register NVIDIA GHES vendor record notifier: %d\n", ret);
+		return ret;
+	}
+
+	platform_set_drvdata(pdev, priv);
+
+	return 0;
+}
+
+static void nvidia_ghes_remove(struct platform_device *pdev)
+{
+	struct nvidia_ghes_private *priv = platform_get_drvdata(pdev);
+
+	ghes_unregister_vendor_record_notifier(&priv->nb);
+}
+
+static const struct acpi_device_id nvidia_ghes_acpi_match[] = {
+	{ "NVDA2012", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(acpi, nvidia_ghes_acpi_match);
+
+static struct platform_driver nvidia_ghes_driver = {
+	.driver = {
+		.name		= "nvidia-ghes",
+		.acpi_match_table = nvidia_ghes_acpi_match,
+	},
+	.probe	= nvidia_ghes_probe,
+	.remove	= nvidia_ghes_remove,
+};
+module_platform_driver(nvidia_ghes_driver);
+
+MODULE_AUTHOR("Kai-Heng Feng <kaihengf@nvidia.com>");
+MODULE_DESCRIPTION("NVIDIA GHES vendor CPER record handler");
+MODULE_LICENSE("GPL");
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-03-17 13:09 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-16 10:50 [PATCH] acpi/apei: Add NVIDIA GHES vendor CPER record handler Kai-Heng Feng
2026-03-16 17:07 ` Jonathan Cameron
2026-03-17 13:09   ` Kai-Heng Feng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox