Linux PCI subsystem development
 help / color / mirror / Atom feed
From: Nathan Lynch via B4 Relay <devnull+nathan.lynch.amd.com@kernel.org>
To: Vinod Koul <vkoul@kernel.org>
Cc: Wei Huang <wei.huang2@amd.com>,
	 Mario Limonciello <mario.limonciello@amd.com>,
	 Bjorn Helgaas <bhelgaas@google.com>,
	linux-pci@vger.kernel.org,  linux-kernel@vger.kernel.org,
	dmaengine@vger.kernel.org
Subject: [PATCH RFC 06/13] dmaengine: sdxi: Add error reporting support
Date: Fri, 05 Sep 2025 13:48:29 -0500	[thread overview]
Message-ID: <20250905-sdxi-base-v1-6-d0341a1292ba@amd.com> (raw)
In-Reply-To: <20250905-sdxi-base-v1-0-d0341a1292ba@amd.com>

From: Nathan Lynch <nathan.lynch@amd.com>

SDXI implementations provide software with detailed information about
error conditions using a per-device ring buffer in system memory. When
an error condition is signaled via interrupt, the driver retrieves any
pending error log entries and reports them to the kernel log.

Co-developed-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Wei Huang <wei.huang2@amd.com>
Signed-off-by: Nathan Lynch <nathan.lynch@amd.com>
---
 drivers/dma/sdxi/error.c | 340 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/dma/sdxi/error.h |  16 +++
 2 files changed, 356 insertions(+)

diff --git a/drivers/dma/sdxi/error.c b/drivers/dma/sdxi/error.c
new file mode 100644
index 0000000000000000000000000000000000000000..c5e33f5989250352f6b081a3049b3b1f972c85a6
--- /dev/null
+++ b/drivers/dma/sdxi/error.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SDXI error reporting.
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irqreturn.h>
+#include <linux/packing.h>
+#include <linux/types.h>
+
+#include "error.h"
+#include "mmio.h"
+#include "sdxi.h"
+
+/*
+ * The error log ring buffer size is configurable, but for now we fix
+ * it to 64 entries (which is the spec minimum).
+ */
+#define ERROR_LOG_ENTRIES 64
+#define ERROR_LOG_SZ (ERROR_LOG_ENTRIES * sizeof(struct sdxi_errlog_hd_ent))
+
+/*
+ * The "unpacked" counterpart to ERRLOG_HD_ENT: host-native fields
+ * filled in by unpack_fields() from the packed ring buffer entry whose
+ * bit layout is described by errlog_hd_ent_fields[] below.
+ */
+struct errlog_entry {
+	u64 dsc_index;	/* descriptor index; meaningful only when div is set */
+	u16 cxt_num;	/* context number; meaningful only when cv is set */
+	u16 err_class;	/* error class code */
+	u16 type;	/* entry type; expected to be OP_TYPE_ERRLOG */
+	u8 step;	/* processing step that flagged the error (enum errv_step) */
+	u8 buf;		/* data buffer number; meaningful only when bv is set */
+	u8 sub_step;	/* failure sub-step (enum errv_sub_step) */
+	u8 re;		/* function's reaction to the error (enum fn_reaction) */
+	bool vl;	/* entry valid */
+	bool cv;	/* cxt_num valid */
+	bool div;	/* dsc_index valid */
+	bool bv;	/* buf valid */
+};
+
+/* Declare one packed bit-field (bits hi_..lo_) of ERRLOG_HD_ENT. */
+#define ERRLOG_ENTRY_FIELD(hi_, lo_, name_)				\
+	PACKED_FIELD(hi_, lo_, struct errlog_entry, name_)
+/* A single-bit flag field: hi == lo == nr_. */
+#define ERRLOG_ENTRY_FLAG(nr_, name_) \
+	ERRLOG_ENTRY_FIELD(nr_, nr_, name_)
+
+/* Refer to "Error Log Header Entry (ERRLOG_HD_ENT)" */
+static const struct packed_field_u16 errlog_hd_ent_fields[] = {
+	ERRLOG_ENTRY_FLAG(0, vl),
+	ERRLOG_ENTRY_FIELD(13, 8, step),
+	ERRLOG_ENTRY_FIELD(26, 16, type),
+	ERRLOG_ENTRY_FLAG(32, cv),
+	ERRLOG_ENTRY_FLAG(33, div),
+	ERRLOG_ENTRY_FLAG(34, bv),
+	ERRLOG_ENTRY_FIELD(38, 36, buf),
+	ERRLOG_ENTRY_FIELD(43, 40, sub_step),
+	ERRLOG_ENTRY_FIELD(46, 44, re),
+	ERRLOG_ENTRY_FIELD(63, 48, cxt_num),
+	ERRLOG_ENTRY_FIELD(127, 64, dsc_index),
+	ERRLOG_ENTRY_FIELD(367, 352, err_class),
+};
+
+/* Packing quirks used when unpacking error log entries. */
+enum {
+	SDXI_PACKING_QUIRKS = QUIRK_LITTLE_ENDIAN | QUIRK_LSW32_IS_FIRST,
+};
+
+/*
+ * Refer to "(Flagged) Processing Step" and
+ * "Error Log Header Entry (ERRLOG_HD_ENT)", subfield "step"
+ */
+enum errv_step {
+	ERRV_INT         = 1,
+	ERRV_CXT_L2      = 2,
+	ERRV_CXT_L1      = 3,
+	ERRV_CXT_CTL     = 4,
+	ERRV_CXT_STS     = 5,
+	ERRV_WRT_IDX     = 6,
+	ERRV_DSC_GEN     = 7,
+	ERRV_DSC_CSB     = 8,
+	ERRV_ATOMIC      = 9,
+	ERRV_DSC_BUF     = 10,
+	ERRV_DSC_AKEY    = 11,
+	ERRV_FN_RKEY     = 12,
+};
+
+/* Human-readable descriptions, indexed by enum errv_step. */
+static const char *const processing_steps[] = {
+	[ERRV_INT]        = "Internal Error",
+	[ERRV_CXT_L2]     = "Context Level 2 Table Entry - Translate, Read, Validate",
+	[ERRV_CXT_L1]     = "Context Level 1 Table Entry - Translate, Read, Validate",
+	[ERRV_CXT_CTL]    = "Context Control - Translate, Read, Validate",
+	[ERRV_CXT_STS]    = "Context Status - Translate, Access, Validate",
+	[ERRV_WRT_IDX]    = "Write_Index - Translate, Read, Validate",
+	[ERRV_DSC_GEN]    = "Descriptor Entry - Translate, Access, Validate",
+	[ERRV_DSC_CSB]    = "Descriptor CST_BLK - Translate, Access, Validate",
+	[ERRV_ATOMIC]     = "Atomic Return Data - Translate, Access",
+	[ERRV_DSC_BUF]    = "Descriptor: Data Buffer - Translate, Access",
+	[ERRV_DSC_AKEY]   = "Descriptor AKey Lookup - Translate, Access, Validate",
+	[ERRV_FN_RKEY]    = "Function RKey Lookup - Translate, Read, Validate",
+};
+
+/*
+ * Map a "step" field value to its description.  Values outside the
+ * named range (including 0) decode as "reserved".
+ */
+static const char *step_str(enum errv_step step)
+{
+	const char *str = "reserved";
+
+	switch (step) {
+	case ERRV_INT ... ERRV_FN_RKEY:
+		str = processing_steps[step];
+		break;
+	}
+
+	return str;
+}
+
+/* Refer to "Error Log Header Entry (ERRLOG_HD_ENT)", subfield "sub_step" */
+enum errv_sub_step {
+	SUB_STEP_OTHER    = 0,
+	SUB_STEP_ATF      = 1,
+	SUB_STEP_DAF      = 2,
+	SUB_STEP_DVF      = 3,
+};
+
+/* Human-readable descriptions, indexed by enum errv_sub_step. */
+static const char * const processing_sub_steps[] = {
+	[SUB_STEP_OTHER]    = "Other/unknown",
+	[SUB_STEP_ATF]      = "Address Translation Failure",
+	[SUB_STEP_DAF]      = "Data Access Failure",
+	[SUB_STEP_DVF]      = "Data Validation Failure",
+};
+
+/*
+ * Map a "sub_step" field value to its description.  Values outside the
+ * named range decode as "reserved".
+ */
+static const char *sub_step_str(enum errv_sub_step sub_step)
+{
+	const char *str = "reserved";
+
+	switch (sub_step) {
+	case SUB_STEP_OTHER ... SUB_STEP_DVF:
+		str = processing_sub_steps[sub_step];
+		break;
+	}
+
+	return str;
+}
+
+/* Refer to "Error Log Header Entry (ERRLOG_HD_ENT)", subfield "re" */
+enum fn_reaction {
+	FN_REACT_INFORM      = 0,
+	FN_REACT_CXT_STOP    = 1,
+	FN_REACT_FN_STOP     = 2,
+};
+
+/* Human-readable descriptions, indexed by enum fn_reaction. */
+static const char * const fn_reactions[] = {
+	[FN_REACT_INFORM]      = "Informative, nothing stopped",
+	[FN_REACT_CXT_STOP]    = "Context stopped",
+	[FN_REACT_FN_STOP]     = "Function stopped",
+};
+
+/*
+ * Map an "re" (reaction) field value to its description.  Values
+ * outside the named range decode as "reserved".
+ */
+static const char *reaction_str(enum fn_reaction reaction)
+{
+	const char *str = "reserved";
+
+	switch (reaction) {
+	case FN_REACT_INFORM ... FN_REACT_FN_STOP:
+		str = fn_reactions[reaction];
+		break;
+	}
+
+	return str;
+}
+
+/*
+ * Decode and log one error log entry.
+ *
+ * @err_rd is the running read index, not a ring slot; the slot is
+ * err_rd % ERROR_LOG_ENTRIES.  Entries with vl clear or an unexpected
+ * type are reported (ratelimited) and otherwise skipped.
+ */
+static void sdxi_print_err(struct sdxi_dev *sdxi, u64 err_rd)
+{
+	struct errlog_entry ent;
+	size_t index;
+
+	index = err_rd % ERROR_LOG_ENTRIES;
+
+	/* Convert the packed hardware layout into host-native fields. */
+	unpack_fields(&sdxi->err_log[index], sizeof(sdxi->err_log[0]),
+		      &ent, errlog_hd_ent_fields, SDXI_PACKING_QUIRKS);
+
+	if (!ent.vl) {
+		dev_err_ratelimited(sdxi_to_dev(sdxi),
+				    "Ignoring error log entry with vl=0\n");
+		return;
+	}
+
+	if (ent.type != OP_TYPE_ERRLOG) {
+		dev_err_ratelimited(sdxi_to_dev(sdxi),
+				    "Ignoring error log entry with type=%#x\n",
+				    ent.type);
+		return;
+	}
+
+	sdxi_err(sdxi, "error log entry[%zu], MMIO_ERR_RD=%#llx:\n",
+		 index, err_rd);
+	sdxi_err(sdxi, "  re: %#x (%s)\n", ent.re, reaction_str(ent.re));
+	sdxi_err(sdxi, "  step: %#x (%s)\n", ent.step, step_str(ent.step));
+	sdxi_err(sdxi, "  sub_step: %#x (%s)\n",
+		 ent.sub_step, sub_step_str(ent.sub_step));
+	sdxi_err(sdxi, "  cv: %u div: %u bv: %u\n", ent.cv, ent.div, ent.bv);
+	/* buf/cxt_num/dsc_index are only meaningful when the matching flag is set. */
+	if (ent.bv)
+		sdxi_err(sdxi, "  buf: %u\n", ent.buf);
+	if (ent.cv)
+		sdxi_err(sdxi, "  cxt_num: %#x\n", ent.cxt_num);
+	if (ent.div)
+		sdxi_err(sdxi, "  dsc_index: %#llx\n", ent.dsc_index);
+	sdxi_err(sdxi, "  err_class: %#x\n", ent.err_class);
+}
+
+/* Refer to "Error Log Processing by Software" */
+/*
+ * Threaded handler for the error interrupt: drains every pending error
+ * log entry between MMIO_ERR_RD and MMIO_ERR_WRT, acknowledges the
+ * RW1C status bits, and publishes the new read index back to hardware.
+ * Always returns IRQ_HANDLED, including on the log-not-functional path.
+ */
+static irqreturn_t sdxi_irq_thread(int irq, void *data)
+{
+	struct sdxi_dev *sdxi = data;
+	u64 write_index;
+	u64 read_index;
+	u64 err_sts;
+
+	/* 1. Check MMIO_ERR_STS and perform any required remediation. */
+	err_sts = sdxi_read64(sdxi, SDXI_MMIO_ERR_STS);
+	if (!(err_sts & SDXI_MMIO_ERR_STS_STS_BIT))
+		return IRQ_HANDLED;
+
+	if (err_sts & SDXI_MMIO_ERR_STS_ERR_BIT) {
+		/*
+		 * Assume this isn't recoverable; e.g. the error log
+		 * isn't configured correctly. Don't clear
+		 * SDXI_MMIO_ERR_STS before returning.
+		 */
+		sdxi_err(sdxi, "attempted but failed to log errors\n");
+		sdxi_err(sdxi, "error log not functional\n");
+		return IRQ_HANDLED;
+	}
+
+	if (err_sts & SDXI_MMIO_ERR_STS_OVF_BIT)
+		sdxi_err(sdxi, "error log overflow, some entries lost\n");
+
+	/* 2. If MMIO_ERR_STS.sts is 1, then compute read_index. */
+	read_index = sdxi_read64(sdxi, SDXI_MMIO_ERR_RD);
+
+	/* 3. Clear MMIO_ERR_STS. The flags in this register are RW1C. */
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_STS,
+		     SDXI_MMIO_ERR_STS_STS_BIT |
+		     SDXI_MMIO_ERR_STS_OVF_BIT |
+		     SDXI_MMIO_ERR_STS_ERR_BIT);
+
+	/* 4. Compute write_index. */
+	write_index = sdxi_read64(sdxi, SDXI_MMIO_ERR_WRT);
+
+	/* 5. If the indexes are equal then exit. */
+	if (read_index == write_index)
+		return IRQ_HANDLED;
+
+	/*
+	 * 6. While read_index < write_index...
+	 *
+	 * NOTE(review): the "<" comparison assumes the 64-bit indexes
+	 * never wrap mid-drain -- confirm against the spec's index
+	 * semantics.
+	 */
+	while (read_index < write_index) {
+
+		/*
+		 * 7. and 8. Compute the real ring buffer index from
+		 * read_index and process the entry.
+		 */
+		sdxi_print_err(sdxi, read_index);
+
+		/* 9. Advance read_index. */
+		++read_index;
+
+		/* 10. Return to step 6. */
+	}
+
+	/* 11. Write read_index to MMIO_ERR_RD. */
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_RD, read_index);
+
+	return IRQ_HANDLED;
+}
+
+/* Refer to "Error Log Initialization" */
+/**
+ * sdxi_error_init() - Allocate the error log and install the error IRQ.
+ * @sdxi: device being initialized; sdxi->error_irq must already be set.
+ *
+ * Follows the spec's numbered "Error Log Initialization" sequence.
+ * Error interrupt generation is possible only after MMIO_ERR_CFG.en is
+ * set in the final step, so the handler is registered just before that.
+ *
+ * Return: 0 on success, -ENOMEM if the log buffer cannot be allocated,
+ * or the error from request_threaded_irq().
+ */
+int sdxi_error_init(struct sdxi_dev *sdxi)
+{
+	u64 reg;
+	int err;
+
+	/* 1. Clear MMIO_ERR_CFG. Error interrupts are inhibited until step 6. */
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_CFG, 0);
+
+	/* 2. Clear MMIO_ERR_STS. The flags in this register are RW1C. */
+	reg = FIELD_PREP(SDXI_MMIO_ERR_STS_STS_BIT, 1) |
+	      FIELD_PREP(SDXI_MMIO_ERR_STS_OVF_BIT, 1) |
+	      FIELD_PREP(SDXI_MMIO_ERR_STS_ERR_BIT, 1);
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_STS, reg);
+
+	/* 3. Allocate memory for the error log ring buffer, initialize to zero. */
+	sdxi->err_log = dma_alloc_coherent(sdxi_to_dev(sdxi), ERROR_LOG_SZ,
+					   &sdxi->err_log_dma, GFP_KERNEL);
+	if (!sdxi->err_log)
+		return -ENOMEM;
+
+	/*
+	 * 4. Set MMIO_ERR_CTL.intr_en to 1 if interrupts on
+	 * context-level errors are desired.
+	 */
+	reg = sdxi_read64(sdxi, SDXI_MMIO_ERR_CTL);
+	FIELD_MODIFY(SDXI_MMIO_ERR_CTL_EN, &reg, 1);
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_CTL, reg);
+
+	/*
+	 * The spec is not explicit about when to do this, but this
+	 * seems like the right time: enable interrupt on
+	 * function-level transition to error state.
+	 */
+	reg = sdxi_read64(sdxi, SDXI_MMIO_CTL0);
+	FIELD_MODIFY(SDXI_MMIO_CTL0_FN_ERR_INTR_EN, &reg, 1);
+	sdxi_write64(sdxi, SDXI_MMIO_CTL0, reg);
+
+	/* 5. Clear MMIO_ERR_WRT and MMIO_ERR_RD. */
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_WRT, 0);
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_RD, 0);
+
+	/*
+	 * Error interrupts can be generated once MMIO_ERR_CFG.en is
+	 * set in step 6, so set up the handler now.
+	 */
+	err = request_threaded_irq(sdxi->error_irq, NULL, sdxi_irq_thread,
+				   IRQF_TRIGGER_NONE, "SDXI error", sdxi);
+	if (err)
+		goto free_errlog;
+
+	/*
+	 * 6. Program MMIO_ERR_CFG.
+	 *
+	 * NOTE(review): the >> 12 presumably means ERR_CFG.ptr holds a
+	 * 4 KiB-aligned frame number, and >> 6 that ERR_CFG.sz is in
+	 * units of 64 entries -- confirm both against the spec.
+	 */
+	reg = FIELD_PREP(SDXI_MMIO_ERR_CFG_PTR, sdxi->err_log_dma >> 12) |
+	      FIELD_PREP(SDXI_MMIO_ERR_CFG_SZ, ERROR_LOG_ENTRIES >> 6) |
+	      FIELD_PREP(SDXI_MMIO_ERR_CFG_EN, 1);
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_CFG, reg);
+
+	return 0;
+
+free_errlog:
+	/*
+	 * NOTE(review): MMIO_ERR_CTL.intr_en and CTL0.fn_err_intr_en
+	 * remain set on this path; presumably harmless while
+	 * MMIO_ERR_CFG.en is still 0 -- confirm.
+	 */
+	dma_free_coherent(sdxi_to_dev(sdxi), ERROR_LOG_SZ,
+			  sdxi->err_log, sdxi->err_log_dma);
+	return err;
+}
+
+/**
+ * sdxi_error_exit() - Tear down error reporting.
+ * @sdxi: device being torn down.
+ *
+ * Disables the error log (MMIO_ERR_CFG = 0) before releasing the IRQ
+ * and freeing the ring buffer, so no further logging or interrupts
+ * target memory that is about to be freed.
+ */
+void sdxi_error_exit(struct sdxi_dev *sdxi)
+{
+	sdxi_write64(sdxi, SDXI_MMIO_ERR_CFG, 0);
+	free_irq(sdxi->error_irq, sdxi);
+	dma_free_coherent(sdxi_to_dev(sdxi), ERROR_LOG_SZ,
+			  sdxi->err_log, sdxi->err_log_dma);
+}
diff --git a/drivers/dma/sdxi/error.h b/drivers/dma/sdxi/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..50019d9811184464227ae13baa509101a2a3aacc
--- /dev/null
+++ b/drivers/dma/sdxi/error.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SDXI error handling entry points.
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#ifndef DMA_SDXI_ERROR_H
+#define DMA_SDXI_ERROR_H
+
+struct sdxi_dev;
+
+int sdxi_error_init(struct sdxi_dev *sdxi);
+void sdxi_error_exit(struct sdxi_dev *sdxi);
+
+#endif  /* DMA_SDXI_ERROR_H */

-- 
2.39.5



  parent reply	other threads:[~2025-09-05 18:48 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-09-05 18:48 [PATCH RFC 00/13] dmaengine: Smart Data Accelerator Interface (SDXI) basic support Nathan Lynch via B4 Relay
2025-09-05 18:48 ` [PATCH RFC 01/13] PCI: Add SNIA SDXI accelerator sub-class Nathan Lynch via B4 Relay
2025-09-15 17:25   ` Bjorn Helgaas
2025-09-15 20:17     ` Nathan Lynch
2025-09-05 18:48 ` [PATCH RFC 02/13] dmaengine: sdxi: Add control structure definitions Nathan Lynch via B4 Relay
2025-09-05 18:48 ` [PATCH RFC 03/13] dmaengine: sdxi: Add descriptor encoding and unit tests Nathan Lynch via B4 Relay
2025-09-15 11:52   ` Jonathan Cameron
2025-09-15 19:30     ` Nathan Lynch
2025-09-16 14:20       ` Jonathan Cameron
2025-09-16 19:06         ` Nathan Lynch
2025-09-05 18:48 ` [PATCH RFC 04/13] dmaengine: sdxi: Add MMIO register definitions Nathan Lynch via B4 Relay
2025-09-05 18:48 ` [PATCH RFC 05/13] dmaengine: sdxi: Add software data structures Nathan Lynch via B4 Relay
2025-09-15 11:59   ` Jonathan Cameron
2025-09-16 19:07     ` Nathan Lynch
2025-09-16  9:38   ` Markus Elfring
2025-09-05 18:48 ` Nathan Lynch via B4 Relay [this message]
2025-09-15 12:11   ` [PATCH RFC 06/13] dmaengine: sdxi: Add error reporting support Jonathan Cameron
2025-09-15 20:42     ` Nathan Lynch
2025-09-16 14:23       ` Jonathan Cameron
2025-09-05 18:48 ` [PATCH RFC 07/13] dmaengine: sdxi: Import descriptor enqueue code from spec Nathan Lynch via B4 Relay
2025-09-15 12:18   ` Jonathan Cameron
2025-09-16 17:05   ` [External] : " ALOK TIWARI
2025-09-05 18:48 ` [PATCH RFC 08/13] dmaengine: sdxi: Context creation/removal, descriptor submission Nathan Lynch via B4 Relay
2025-09-15 14:12   ` Jonathan Cameron
2025-09-16 20:40     ` Nathan Lynch
2025-09-17 13:34       ` Jonathan Cameron
2025-09-15 19:42   ` Markus Elfring
2025-09-05 18:48 ` [PATCH RFC 09/13] dmaengine: sdxi: Add core device management code Nathan Lynch via B4 Relay
2025-09-15 14:23   ` Jonathan Cameron
2025-09-16 21:23     ` Nathan Lynch
2025-09-05 18:48 ` [PATCH RFC 10/13] dmaengine: sdxi: Add PCI driver support Nathan Lynch via B4 Relay
2025-09-05 19:14   ` Mario Limonciello
2025-09-10 15:25     ` Nathan Lynch
2025-09-05 20:05   ` Bjorn Helgaas
2025-09-10 15:28     ` Nathan Lynch
2025-09-15 15:03   ` Jonathan Cameron
2025-09-16 16:43   ` [External] : " ALOK TIWARI
2025-09-05 18:48 ` [PATCH RFC 11/13] dmaengine: sdxi: Add DMA engine provider Nathan Lynch via B4 Relay
2025-09-15 15:16   ` Jonathan Cameron
2025-09-05 18:48 ` [PATCH RFC 12/13] dmaengine: sdxi: Add Kconfig and Makefile Nathan Lynch via B4 Relay
2025-09-15 15:08   ` Jonathan Cameron
2025-09-15 16:44     ` Nathan Lynch
2025-09-05 18:48 ` [PATCH RFC 13/13] MAINTAINERS: Add entry for SDXI driver Nathan Lynch via B4 Relay

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250905-sdxi-base-v1-6-d0341a1292ba@amd.com \
    --to=devnull+nathan.lynch.amd.com@kernel.org \
    --cc=bhelgaas@google.com \
    --cc=dmaengine@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=mario.limonciello@amd.com \
    --cc=nathan.lynch@amd.com \
    --cc=vkoul@kernel.org \
    --cc=wei.huang2@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox