From: Kai-Heng Feng <kaihengf@nvidia.com>
To: rafael@kernel.org, shuah@kernel.org, kees@kernel.org
Cc: julianbraha@gmail.com, linux-kernel@vger.kernel.org,
linux-acpi@vger.kernel.org, linux-kselftest@vger.kernel.org,
linux-hardening@vger.kernel.org, csoto@nvidia.com,
mochs@nvidia.com, Kai-Heng Feng <kaihengf@nvidia.com>
Subject: [PATCH v2 2/4] ACPI: APEI: GHES: Add NVIDIA Vera decoder
Date: Tue, 16 Jun 2026 11:44:08 +0800 [thread overview]
Message-ID: <20260616034410.70675-3-kaihengf@nvidia.com> (raw)
In-Reply-To: <20260616034410.70675-1-kaihengf@nvidia.com>
Vera is NVIDIA's next-generation server SoC. Its CPER section uses a
different GUID and a different binary layout from Grace, so it needs
its own decoder. Without this, firmware-reported hardware errors on
Vera platforms are received but not decoded.
Signed-off-by: Kai-Heng Feng <kaihengf@nvidia.com>
---
v2:
- No change.
drivers/acpi/apei/ghes-nvidia.c | 368 ++++++++++++++++++++++++++++++--
drivers/acpi/apei/ghes-nvidia.h | 29 ++-
2 files changed, 382 insertions(+), 15 deletions(-)
diff --git a/drivers/acpi/apei/ghes-nvidia.c b/drivers/acpi/apei/ghes-nvidia.c
index af445152def0..c74c155dd2ba 100644
--- a/drivers/acpi/apei/ghes-nvidia.c
+++ b/drivers/acpi/apei/ghes-nvidia.c
@@ -7,18 +7,27 @@
#include <linux/acpi.h>
#include <linux/module.h>
+#include <linux/overflow.h>
#include <linux/platform_device.h>
+#include <linux/slab.h>
#include <linux/types.h>
+#include <linux/unaligned.h>
#include <linux/uuid.h>
+#include <kunit/visibility.h>
#include <acpi/ghes.h>
-#include <kunit/visibility.h>
#include "ghes-nvidia.h"
+/* Only event header version that nvidia_ghes_decode_vera() accepts. */
+#define NVIDIA_GHES_VERA_VERSION 1
+
static const guid_t nvidia_grace_sec_guid =
GUID_INIT(0x6d5244f2, 0x2712, 0x11ec,
0xbe, 0xa7, 0xcb, 0x3f, 0xdb, 0x95, 0xc7, 0x86);
+/* CPER section type GUID that nvidia_ghes_format_from_guid() maps to the Vera format. */
+static const guid_t nvidia_vera_sec_guid =
+ GUID_INIT(0x9068e568, 0x6ca0, 0x11f0,
+ 0xae, 0xaf, 0x15, 0x93, 0x43, 0x59, 0x1e, 0xac);
+
struct cper_sec_nvidia {
char signature[16];
__le16 error_type;
@@ -31,11 +40,51 @@ struct cper_sec_nvidia {
struct nvidia_ghes_grace_reg regs[] __counted_by(number_regs);
};
+/*
+ * Event header found at offset 0 of a Vera CPER section, as parsed by
+ * nvidia_ghes_decode_vera(). Multi-byte fields are little-endian and are
+ * read with get_unaligned_le*().
+ */
+struct cper_sec_nvidia_vera_event {
+ u8 version; /* must equal NVIDIA_GHES_VERA_VERSION */
+ u8 event_context_count; /* context records to walk; at most NVIDIA_GHES_MAX_CONTEXTS */
+ u8 source_device_type; /* only 0 is decoded; other values get -EOPNOTSUPP */
+ u8 reserved;
+ __le16 event_type;
+ __le16 event_sub_type;
+ __le64 event_link_id;
+ char source_module_signature[16]; /* the decoder copies all 16 bytes and appends the NUL itself */
+} __packed;
+
+/*
+ * CPU info block that nvidia_ghes_decode_vera() expects directly after the
+ * event header.
+ */
+struct cper_sec_nvidia_vera_cpu_info {
+ __le16 info_version; /* not examined by nvidia_ghes_decode_vera() */
+ u8 info_size; /* bytes to skip to reach the first context; must be >= sizeof(this struct) */
+ u8 socket_number;
+ __le32 architecture;
+ u8 chip_serial_number[16];
+ __le64 instance_base;
+} __packed;
+
+/*
+ * Header of one context record; data_size bytes of payload start right
+ * after it. nvidia_ghes_decode_vera() accepts two conventions for
+ * context_size: exactly sizeof(this header), or a value that covers the
+ * header plus at least data_size bytes.
+ */
+struct cper_sec_nvidia_vera_context {
+ __le32 context_size; /* must be >= sizeof(this header) */
+ __le16 context_version;
+ __le16 reserved;
+ __le16 data_format_type; /* 0..4 are accepted; larger values get -EOPNOTSUPP */
+ __le16 data_format_version;
+ __le32 data_size; /* payload bytes; multiple of 16/8/8/4 for format types 1/2/3/4 */
+} __packed;
+
+/*
+ * Driver state: nvidia_ghes_notify() recovers it from the notifier block
+ * with container_of() and logs against dev.
+ */
struct nvidia_ghes_private {
struct notifier_block nb;
struct device *dev;
};
+/*
+ * Map a CPER section type GUID onto the NVIDIA record layout it announces.
+ * Anything other than the Grace or Vera GUID yields
+ * NVIDIA_GHES_FORMAT_UNKNOWN.
+ */
+VISIBLE_IF_KUNIT
+enum nvidia_ghes_format nvidia_ghes_format_from_guid(const guid_t *guid)
+{
+	enum nvidia_ghes_format format = NVIDIA_GHES_FORMAT_UNKNOWN;
+
+	if (guid_equal(guid, &nvidia_grace_sec_guid))
+		format = NVIDIA_GHES_FORMAT_GRACE;
+	else if (guid_equal(guid, &nvidia_vera_sec_guid))
+		format = NVIDIA_GHES_FORMAT_VERA;
+
+	return format;
+}
+EXPORT_SYMBOL_IF_KUNIT(nvidia_ghes_format_from_guid);
+
VISIBLE_IF_KUNIT
int nvidia_ghes_decode_grace(struct device *dev, const void *buf,
size_t len,
@@ -81,7 +130,7 @@ EXPORT_SYMBOL_IF_KUNIT(nvidia_ghes_decode_grace);
VISIBLE_IF_KUNIT
int nvidia_ghes_grace_reg_pair(const struct nvidia_ghes_decoded *decoded,
- unsigned int index, u64 *addr, u64 *val)
+ unsigned int index, u64 *addr, u64 *val)
{
const struct nvidia_ghes_grace_reg *regs;
@@ -98,6 +147,220 @@ int nvidia_ghes_grace_reg_pair(const struct nvidia_ghes_decoded *decoded,
}
EXPORT_SYMBOL_IF_KUNIT(nvidia_ghes_grace_reg_pair);
+/*
+ * Check that a context payload length fits its data format.
+ *
+ * Format 0 accepts any length. Formats 1, 2, 3 and 4 require data_size to
+ * be a whole multiple of 16, 8, 8 and 4 bytes respectively.
+ *
+ * Return: 0 if acceptable, -EINVAL if data_size is not a multiple of the
+ * unit for the format, -EOPNOTSUPP for any other format type.
+ */
+static int nvidia_ghes_vera_validate_context_data(u16 data_format_type,
+						  u32 data_size)
+{
+	u32 unit;
+
+	switch (data_format_type) {
+	case 0:
+		/* No length constraint for this format. */
+		return 0;
+	case 1:
+		unit = 16;
+		break;
+	case 2:
+	case 3:
+		unit = 8;
+		break;
+	case 4:
+		unit = 4;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (data_size % unit)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * nvidia_ghes_decode_vera() - validate and decode a Vera CPER section.
+ * @dev: device used for diagnostics; may be NULL, in which case nothing is
+ * logged.
+ * @buf: start of the section payload.
+ * @len: number of bytes available at @buf.
+ * @decoded: output. It is zeroed and filled only once the event header, the
+ * CPU info block and the context count have passed validation.
+ *
+ * The payload is walked as: event header, CPU info block (skipped using its
+ * own info_size), then event_context_count context records. The data
+ * pointers stored in @decoded->contexts[] point into @buf, so @buf has to
+ * stay valid for as long as @decoded is used.
+ *
+ * Return: 0 on success; -EINVAL for NULL arguments or inconsistent sizes;
+ * -ENODATA when a structure runs past @len; -EOPNOTSUPP for an unknown
+ * version, source device type or data format; -E2BIG for more than
+ * NVIDIA_GHES_MAX_CONTEXTS contexts; -EOVERFLOW if data_size plus the context
+ * header does not fit in size_t. A failure inside the context loop leaves
+ * @decoded partially filled.
+ */
+VISIBLE_IF_KUNIT
+int nvidia_ghes_decode_vera(struct device *dev, const void *buf,
+ size_t len,
+ struct nvidia_ghes_decoded *decoded)
+{
+ const struct cper_sec_nvidia_vera_event *event = buf;
+ const struct cper_sec_nvidia_vera_cpu_info *cpu_info;
+ const struct cper_sec_nvidia_vera_context *context;
+ const u8 *bytes = buf;
+ size_t data_end_advance;
+ size_t advance;
+ size_t offset;
+ int ret;
+
+ if (!buf || !decoded)
+ return -EINVAL;
+ if (len < sizeof(*event)) {
+ if (dev)
+ dev_err_ratelimited(dev, "Vera event header truncated (%zu < %zu)\n",
+ len, sizeof(*event));
+ return -ENODATA;
+ }
+ /* Unknown versions and device types are rejected without logging here. */
+ if (event->version != NVIDIA_GHES_VERA_VERSION)
+ return -EOPNOTSUPP;
+ if (event->source_device_type != 0)
+ return -EOPNOTSUPP;
+
+ /* offset never exceeds len from here on, so len - offset cannot wrap. */
+ offset = sizeof(*event);
+ if (len - offset < sizeof(*cpu_info)) {
+ if (dev)
+ dev_err_ratelimited(dev, "Vera CPU info truncated (%zu < %zu)\n",
+ len - offset, sizeof(*cpu_info));
+ return -ENODATA;
+ }
+
+ cpu_info = (const void *)(bytes + offset);
+ if (cpu_info->info_size < sizeof(*cpu_info)) {
+ if (dev)
+ dev_err_ratelimited(dev, "Vera CPU info size %u smaller than header %zu\n",
+ cpu_info->info_size, sizeof(*cpu_info));
+ return -EINVAL;
+ }
+ if (len - offset < cpu_info->info_size) {
+ if (dev)
+ dev_err_ratelimited(dev, "Vera CPU info extends past section (%u > %zu)\n",
+ cpu_info->info_size, len - offset);
+ return -ENODATA;
+ }
+
+ /* Skip by the size the block reports, which may exceed sizeof(*cpu_info). */
+ offset += cpu_info->info_size;
+ if (event->event_context_count > NVIDIA_GHES_MAX_CONTEXTS) {
+ if (dev)
+ dev_err_ratelimited(dev, "Vera context count %u exceeds maximum %u\n",
+ event->event_context_count,
+ NVIDIA_GHES_MAX_CONTEXTS);
+ return -E2BIG;
+ }
+
+ /* Fixed-size parts are valid: start from a clean result and copy them. */
+ memset(decoded, 0, sizeof(*decoded));
+ decoded->format = NVIDIA_GHES_FORMAT_VERA;
+ memcpy(decoded->signature, event->source_module_signature,
+ sizeof(event->source_module_signature));
+ decoded->signature[sizeof(event->source_module_signature)] = '\0';
+ decoded->event_context_count = event->event_context_count;
+ decoded->source_device_type = event->source_device_type;
+ decoded->event_type = get_unaligned_le16(&event->event_type);
+ decoded->event_sub_type = get_unaligned_le16(&event->event_sub_type);
+ decoded->event_link_id = get_unaligned_le64(&event->event_link_id);
+ decoded->socket = cpu_info->socket_number;
+ decoded->architecture = get_unaligned_le32(&cpu_info->architecture);
+ memcpy(decoded->chip_serial_number, cpu_info->chip_serial_number,
+ sizeof(cpu_info->chip_serial_number));
+ decoded->instance_base = get_unaligned_le64(&cpu_info->instance_base);
+
+ /* Walk the context records; each one is validated before it is stored. */
+ for (int i = 0; i < event->event_context_count; i++) {
+ struct nvidia_ghes_vera_context *decoded_context = &decoded->contexts[i];
+ u32 context_size;
+ u32 data_size;
+ u16 data_format_type;
+
+ if (len - offset < sizeof(*context)) {
+ if (dev)
+ dev_err_ratelimited(dev, "Vera context[%d] header truncated (%zu < %zu)\n",
+ i, len - offset, sizeof(*context));
+ return -ENODATA;
+ }
+
+ context = (const void *)(bytes + offset);
+ context_size = get_unaligned_le32(&context->context_size);
+ data_format_type = get_unaligned_le16(&context->data_format_type);
+ data_size = get_unaligned_le32(&context->data_size);
+
+ if (context_size < sizeof(*context)) {
+ if (dev)
+ dev_err_ratelimited(dev,
+ "Vera context[%d] size %u smaller than header %zu\n",
+ i, context_size, sizeof(*context));
+ return -EINVAL;
+ }
+ /* Unknown formats are reported at debug level only. */
+ if (data_format_type > 4) {
+ if (dev)
+ dev_dbg(dev,
+ "Vera context[%d] unsupported data format %u\n",
+ i, data_format_type);
+ return -EOPNOTSUPP;
+ }
+ /* u32 plus the header size can only overflow where size_t is 32 bits wide. */
+ if (check_add_overflow((size_t)data_size, sizeof(*context),
+ &data_end_advance)) {
+ if (dev)
+ dev_err_ratelimited(dev,
+ "Vera context[%d] data_size %u overflows section accounting\n",
+ i, data_size);
+ return -EOVERFLOW;
+ }
+
+ /* Header plus payload must lie inside what is left of the section. */
+ if (data_end_advance > len - offset) {
+ if (dev)
+ dev_err_ratelimited(dev,
+ "Vera context[%d] data extends past section (%zu > %zu)\n",
+ i, data_end_advance, len - offset);
+ return -ENODATA;
+ }
+
+ /*
+ * Some Vera payloads use only the header size here and
+ * place the format-specific payload immediately after it.
+ */
+ if (context_size == sizeof(*context))
+ advance = data_end_advance;
+ else if (data_size <= context_size - sizeof(*context))
+ advance = context_size;
+ else {
+ if (dev)
+ dev_err_ratelimited(dev,
+ "Vera context[%d] data_size %u exceeds context_size %u\n",
+ i, data_size, context_size);
+ return -EINVAL;
+ }
+
+ /* context_size may be larger than header plus payload; bound it as well. */
+ if (advance > len - offset) {
+ if (dev)
+ dev_err_ratelimited(dev,
+ "Vera context[%d] advance %zu extends past section (%zu)\n",
+ i, advance, len - offset);
+ return -ENODATA;
+ }
+
+ ret = nvidia_ghes_vera_validate_context_data(data_format_type, data_size);
+ if (ret) {
+ if (dev)
+ dev_err_ratelimited(dev,
+ "Vera context[%d] format %u rejected data_size %u (ret=%d)\n",
+ i, data_format_type, data_size, ret);
+ return ret;
+ }
+
+ decoded_context->context_size = context_size;
+ decoded_context->context_version =
+ get_unaligned_le16(&context->context_version);
+ decoded_context->data_format_type = data_format_type;
+ decoded_context->data_format_version =
+ get_unaligned_le16(&context->data_format_version);
+ decoded_context->data_size = data_size;
+ /* Not a copy: the payload pointer refers to the caller's buffer. */
+ decoded_context->data = bytes + offset + sizeof(*context);
+ offset += advance;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_IF_KUNIT(nvidia_ghes_decode_vera);
+
+/*
+ * Number of fixed-size entries held by a decoded Vera context.
+ *
+ * Format 0 always reports 0 entries. Formats 1, 2, 3 and 4 report data_size
+ * divided by 16, 8, 8 and 4 respectively.
+ *
+ * Return: the entry count, -EINVAL for a NULL context or an unknown format
+ * type, -EOVERFLOW if data_size is larger than INT_MAX.
+ */
+VISIBLE_IF_KUNIT
+int nvidia_ghes_vera_context_entry_count(const struct nvidia_ghes_vera_context *ctx)
+{
+	u32 unit;
+
+	if (!ctx)
+		return -EINVAL;
+	if (ctx->data_size > INT_MAX)
+		return -EOVERFLOW;
+
+	switch (ctx->data_format_type) {
+	case 0:
+		return 0;
+	case 1:
+		unit = 16;
+		break;
+	case 2:
+	case 3:
+		unit = 8;
+		break;
+	case 4:
+		unit = 4;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* data_size <= INT_MAX was checked above, so the quotient fits in int. */
+	return ctx->data_size / unit;
+}
+EXPORT_SYMBOL_IF_KUNIT(nvidia_ghes_vera_context_entry_count);
+
static void nvidia_ghes_print_grace(struct device *dev,
const struct nvidia_ghes_decoded *decoded,
bool fatal)
@@ -111,7 +374,8 @@ static void nvidia_ghes_print_grace(struct device *dev,
dev_printk(level, dev, "severity: %u\n", decoded->severity);
dev_printk(level, dev, "socket: %u\n", decoded->socket);
dev_printk(level, dev, "number_regs: %u\n", decoded->number_regs);
- dev_printk(level, dev, "instance_base: 0x%016llx\n", decoded->instance_base);
+ dev_printk(level, dev, "instance_base: 0x%016llx\n",
+ decoded->instance_base);
for (int i = 0; i < decoded->number_regs; i++) {
if (nvidia_ghes_grace_reg_pair(decoded, i, &addr, &val))
@@ -121,12 +385,52 @@ static void nvidia_ghes_print_grace(struct device *dev,
}
}
+/*
+ * Log a decoded Vera record against @dev, one field per line.
+ *
+ * Lines are emitted at KERN_ERR when @fatal is set and at KERN_INFO
+ * otherwise. @ghes_severity is printed as given. For each context a summary
+ * line is printed, followed by either a hex dump of at most the first 16
+ * payload bytes (format 0 with a non-empty payload) or the entry count.
+ */
+static void nvidia_ghes_print_vera(struct device *dev,
+				   const struct nvidia_ghes_decoded *decoded,
+				   bool fatal, unsigned long ghes_severity)
+{
+	const char *lvl = KERN_INFO;
+
+	if (fatal)
+		lvl = KERN_ERR;
+
+	dev_printk(lvl, dev, "signature: %s\n", decoded->signature);
+	dev_printk(lvl, dev, "event_type: %u\n", decoded->event_type);
+	dev_printk(lvl, dev, "event_sub_type: %u\n", decoded->event_sub_type);
+	dev_printk(lvl, dev, "ghes_severity: %lu\n", ghes_severity);
+	dev_printk(lvl, dev, "event_link_id: 0x%016llx\n",
+		   decoded->event_link_id);
+	dev_printk(lvl, dev, "socket: %u\n", decoded->socket);
+	dev_printk(lvl, dev, "architecture: 0x%x\n", decoded->architecture);
+	dev_printk(lvl, dev, "chip_serial_number: %*phN\n",
+		   (int)sizeof(decoded->chip_serial_number),
+		   decoded->chip_serial_number);
+	dev_printk(lvl, dev, "instance_base: 0x%016llx\n",
+		   decoded->instance_base);
+	dev_printk(lvl, dev, "event_context_count: %u\n",
+		   decoded->event_context_count);
+
+	for (int idx = 0; idx < decoded->event_context_count; idx++) {
+		const struct nvidia_ghes_vera_context *c = &decoded->contexts[idx];
+		int nr_entries;
+
+		dev_printk(lvl, dev,
+			   "context[%d]: version=%u format=%u format_version=%u context_size=%u data_size=%u\n",
+			   idx, c->context_version, c->data_format_type,
+			   c->data_format_version, c->context_size,
+			   c->data_size);
+
+		/* Format 0 payloads are shown as a short hex prefix only. */
+		if (c->data_format_type == 0 && c->data_size > 0) {
+			int shown = 16;
+
+			if (c->data_size <= 16)
+				shown = c->data_size;
+
+			dev_printk(lvl, dev,
+				   "context[%d]_opaque_prefix: %*phN\n",
+				   idx, shown, c->data);
+			continue;
+		}
+
+		/* A negative count means the helper rejected the context. */
+		nr_entries = nvidia_ghes_vera_context_entry_count(c);
+		if (nr_entries >= 0)
+			dev_printk(lvl, dev, "context[%d]_entries: %d\n",
+				   idx, nr_entries);
+	}
+}
+
static int nvidia_ghes_notify(struct notifier_block *nb,
unsigned long event, void *data)
{
struct acpi_hest_generic_data *gdata = data;
- struct nvidia_ghes_decoded decoded;
+ struct nvidia_ghes_decoded *decoded;
struct nvidia_ghes_private *priv;
+ enum nvidia_ghes_format format;
const void *payload;
guid_t sec_guid;
u32 len;
@@ -134,26 +438,64 @@ static int nvidia_ghes_notify(struct notifier_block *nb,
bool fatal;
import_guid(&sec_guid, gdata->section_type);
- if (!guid_equal(&sec_guid, &nvidia_grace_sec_guid))
+ format = nvidia_ghes_format_from_guid(&sec_guid);
+ if (format == NVIDIA_GHES_FORMAT_UNKNOWN)
return NOTIFY_DONE;
priv = container_of(nb, struct nvidia_ghes_private, nb);
len = acpi_hest_get_error_length(gdata);
+
payload = acpi_hest_get_payload(gdata);
fatal = event >= GHES_SEV_RECOVERABLE;
+ decoded = kzalloc_obj(*decoded);
+ if (!decoded) {
+ dev_err_ratelimited(priv->dev,
+ "Failed to allocate NVIDIA CPER decode buffer\n");
+ return NOTIFY_OK;
+ }
+
+ switch (format) {
+ case NVIDIA_GHES_FORMAT_GRACE:
+ ret = nvidia_ghes_decode_grace(priv->dev, payload, len, decoded);
+ break;
+ case NVIDIA_GHES_FORMAT_VERA:
+ ret = nvidia_ghes_decode_vera(priv->dev, payload, len, decoded);
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ break;
+ }
- ret = nvidia_ghes_decode_grace(priv->dev, payload, len, &decoded);
if (ret) {
- dev_err(priv->dev,
- "Malformed NVIDIA CPER section, error_data_length: %u, ret: %d\n",
- len, ret);
- return NOTIFY_OK;
+ if (ret == -EOPNOTSUPP && format == NVIDIA_GHES_FORMAT_VERA)
+ dev_info(priv->dev,
+ "Unsupported NVIDIA Vera CPER section, error_data_length: %u, ret: %d\n",
+ len, ret);
+ else if (format == NVIDIA_GHES_FORMAT_GRACE)
+ dev_err(priv->dev,
+ "Malformed NVIDIA Grace CPER section, error_data_length: %u, ret: %d\n",
+ len, ret);
+ else
+ dev_err(priv->dev,
+ "Malformed NVIDIA Vera CPER section, error_data_length: %u, ret: %d\n",
+ len, ret);
+ goto out;
}
- dev_printk(fatal ? KERN_ERR : KERN_INFO, priv->dev,
- "NVIDIA CPER section, error_data_length: %u\n", len);
- nvidia_ghes_print_grace(priv->dev, &decoded, fatal);
+ if (format == NVIDIA_GHES_FORMAT_GRACE)
+ dev_printk(fatal ? KERN_ERR : KERN_INFO, priv->dev,
+ "NVIDIA Grace CPER section, error_data_length: %u\n", len);
+ else
+ dev_printk(fatal ? KERN_ERR : KERN_INFO, priv->dev,
+ "NVIDIA Vera CPER section, error_data_length: %u\n", len);
+
+ if (format == NVIDIA_GHES_FORMAT_VERA)
+ nvidia_ghes_print_vera(priv->dev, decoded, fatal, event);
+ else
+ nvidia_ghes_print_grace(priv->dev, decoded, fatal);
+out:
+ kfree(decoded);
return NOTIFY_OK;
}
diff --git a/drivers/acpi/apei/ghes-nvidia.h b/drivers/acpi/apei/ghes-nvidia.h
index f0592fa41abf..7fff088e1dc1 100644
--- a/drivers/acpi/apei/ghes-nvidia.h
+++ b/drivers/acpi/apei/ghes-nvidia.h
@@ -3,36 +3,61 @@
#define GHES_NVIDIA_H
#include <linux/types.h>
+#include <linux/uuid.h>
#include <kunit/visibility.h>
+/* Forward declaration: the prototypes below only take struct device pointers. */
struct device;

+/* Record layouts that nvidia_ghes_format_from_guid() tells apart by section type GUID. */
enum nvidia_ghes_format {
NVIDIA_GHES_FORMAT_UNKNOWN,
NVIDIA_GHES_FORMAT_GRACE,
+ NVIDIA_GHES_FORMAT_VERA,
};
+/* Most context records accepted per Vera event; also sizes nvidia_ghes_decoded.contexts[]. */
+#define NVIDIA_GHES_MAX_CONTEXTS 16
+
struct nvidia_ghes_grace_reg {
__le64 addr;
__le64 val;
};
+/*
+ * One Vera context record after decoding: header fields converted to host
+ * byte order by nvidia_ghes_decode_vera().
+ */
+struct nvidia_ghes_vera_context {
+ u32 context_size;
+ u16 context_version;
+ u16 data_format_type;
+ u16 data_format_version;
+ u32 data_size; /* number of payload bytes at data */
+ const u8 *data; /* points into the buffer passed to the decoder; not a copy */
+};
+
+/*
+ * Decoded NVIDIA CPER record; format says which layout it came from.
+ * nvidia_ghes_decode_vera() fills format, signature, the event_* fields,
+ * source_device_type, event_context_count, socket, architecture,
+ * chip_serial_number, instance_base and contexts[], and zeroes the rest.
+ * The remaining fields are presumably set by the Grace decoder, whose body
+ * is not part of this hunk.
+ */
struct nvidia_ghes_decoded {
enum nvidia_ghes_format format;
char signature[17];
u16 error_type;
u16 error_instance;
+ u16 event_type;
+ u16 event_sub_type;
u8 severity;
u8 socket;
u8 number_regs;
+ u8 source_device_type;
+ u8 event_context_count; /* number of valid entries in contexts[] */
+ u32 architecture;
+ u64 event_link_id;
u64 instance_base;
+ u8 chip_serial_number[16];
const struct nvidia_ghes_grace_reg *grace_regs;
+ struct nvidia_ghes_vera_context contexts[NVIDIA_GHES_MAX_CONTEXTS];
};
+VISIBLE_IF_KUNIT enum nvidia_ghes_format nvidia_ghes_format_from_guid(const guid_t *guid);
VISIBLE_IF_KUNIT int nvidia_ghes_decode_grace(struct device *dev, const void *buf,
size_t len,
struct nvidia_ghes_decoded *decoded);
VISIBLE_IF_KUNIT int nvidia_ghes_grace_reg_pair(const struct nvidia_ghes_decoded *decoded,
unsigned int index, u64 *addr, u64 *val);
+VISIBLE_IF_KUNIT int nvidia_ghes_decode_vera(struct device *dev, const void *buf,
+ size_t len,
+ struct nvidia_ghes_decoded *decoded);
+VISIBLE_IF_KUNIT
+int nvidia_ghes_vera_context_entry_count(const struct nvidia_ghes_vera_context *ctx);
#endif
--
2.50.1 (Apple Git-155)
next prev parent reply other threads:[~2026-06-16 3:44 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-16 3:44 [PATCH v2 0/4] ACPI: APEI: GHES: Add NVIDIA Vera CPER decoder and tests Kai-Heng Feng
2026-06-16 3:44 ` [PATCH v2 1/4] ACPI: APEI: GHES: Refactor Grace decoder helpers Kai-Heng Feng
2026-06-16 3:44 ` Kai-Heng Feng [this message]
2026-06-16 3:44 ` [PATCH v2 3/4] ACPI: APEI: GHES: Add Grace and Vera KUnit coverage Kai-Heng Feng
2026-06-16 3:44 ` [PATCH v2 4/4] selftests: firmware: Add NVIDIA GHES EINJ selftest Kai-Heng Feng
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260616034410.70675-3-kaihengf@nvidia.com \
--to=kaihengf@nvidia.com \
--cc=csoto@nvidia.com \
--cc=julianbraha@gmail.com \
--cc=kees@kernel.org \
--cc=linux-acpi@vger.kernel.org \
--cc=linux-hardening@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=mochs@nvidia.com \
--cc=rafael@kernel.org \
--cc=shuah@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox