public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Terry Bowman <terry.bowman@amd.com>
To: <linux-cxl@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<linux-pci@vger.kernel.org>, <nifan.cxl@gmail.com>,
	<dave@stgolabs.net>, <jonathan.cameron@huawei.com>,
	<dave.jiang@intel.com>, <alison.schofield@intel.com>,
	<vishal.l.verma@intel.com>, <dan.j.williams@intel.com>,
	<bhelgaas@google.com>, <mahesh@linux.ibm.com>,
	<ira.weiny@intel.com>, <oohall@gmail.com>,
	<Benjamin.Cheatham@amd.com>, <rrichter@amd.com>,
	<nathan.fontenot@amd.com>, <terry.bowman@amd.com>,
	<Smita.KoralahalliChannabasappa@amd.com>, <lukas@wunner.de>,
	<ming.li@zohomail.com>, <PradeepVineshReddy.Kodamati@amd.com>
Subject: [PATCH v7 14/17] cxl/pci: Update CXL Port RAS logging to also display PCIe SBDF
Date: Tue, 11 Feb 2025 13:24:41 -0600	[thread overview]
Message-ID: <20250211192444.2292833-15-terry.bowman@amd.com> (raw)
In-Reply-To: <20250211192444.2292833-1-terry.bowman@amd.com>

CXL RAS errors are currently logged using the associated CXL port's name
returned from dev_name(). Ports are typically named 'port1', 'port2',
etc. to indicate their hierarchical location in the CXL topology. However,
this doesn't clearly indicate the CXL card or slot reporting the error.

Update the logging to also log the corresponding PCIe device name. This
will be a PCIe SBDF or, in the case of a CXL host bridge (HB), an ACPI
object name. This gives users the details needed to identify which
physical slot and card reported the error.

Below is example output after making these changes.

Correctable error example output:
cxl_port_aer_correctable_error: device=port1 (0000:0c:00.0) parent=root0 (pci0000:0c) status='Received Error From Physical Layer'

Uncorrectable error example output:
cxl_port_aer_uncorrectable_error: device=port1 (0000:0c:00.0) parent=root0 (pci0000:0c) status: 'Memory Byte Enable Parity Error' first_error: 'Memory Byte Enable Parity Error'

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/cxl/core/pci.c   | 39 +++++++++++++++++++------------------
 drivers/cxl/core/trace.h | 42 +++++++++++++++++++++++++---------------
 2 files changed, 46 insertions(+), 35 deletions(-)

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 9a3090dae46a..f154dcf6dfda 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -652,14 +652,14 @@ void read_cdat_data(struct cxl_port *port)
 }
 EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
 
-static void __cxl_handle_cor_ras(struct device *dev,
+static void __cxl_handle_cor_ras(struct device *cxl_dev, struct device *pcie_dev,
 				 void __iomem *ras_base)
 {
 	void __iomem *addr;
 	u32 status;
 
 	if (!ras_base) {
-		dev_warn_once(dev, "CXL RAS register block is not mapped");
+		dev_warn_once(cxl_dev, "CXL RAS register block is not mapped");
 		return;
 	}
 
@@ -669,15 +669,15 @@ static void __cxl_handle_cor_ras(struct device *dev,
 		return;
 	writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
 
-	if (is_cxl_memdev(dev))
-		trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status);
-	else if (is_cxl_port(dev))
-		trace_cxl_port_aer_correctable_error(dev, status);
+	if (is_cxl_memdev(cxl_dev))
+		trace_cxl_aer_correctable_error(to_cxl_memdev(cxl_dev), status);
+	else if (is_cxl_port(cxl_dev))
+		trace_cxl_port_aer_correctable_error(cxl_dev, pcie_dev, status);
 }
 
 static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds)
 {
-	return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+	return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, NULL, cxlds->regs.ras);
 }
 
 /* CXL spec rev3.0 8.2.4.16.1 */
@@ -701,7 +701,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
  * Log the state of the RAS status registers and prepare them to log the
  * next error status. Return 1 if reset needed.
  */
-static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
+static pci_ers_result_t __cxl_handle_ras(struct device *cxl_dev, struct device *pcie_dev,
+					 void __iomem *ras_base)
 {
 	u32 hl[CXL_HEADERLOG_SIZE_U32];
 	void __iomem *addr;
@@ -709,7 +710,7 @@ static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_b
 	u32 fe;
 
 	if (!ras_base) {
-		dev_warn_once(dev, "CXL RAS register block is not mapped");
+		dev_warn_once(cxl_dev, "CXL RAS register block is not mapped");
 		return PCI_ERS_RESULT_NONE;
 	}
 
@@ -730,10 +731,10 @@ static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_b
 	}
 
 	header_log_copy(ras_base, hl);
-	if (is_cxl_memdev(dev))
-		trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl);
-	else if (is_cxl_port(dev))
-		trace_cxl_port_aer_uncorrectable_error(dev, status, fe, hl);
+	if (is_cxl_memdev(cxl_dev))
+		trace_cxl_aer_uncorrectable_error(to_cxl_memdev(cxl_dev), status, fe, hl);
+	else if (is_cxl_port(cxl_dev))
+		trace_cxl_port_aer_uncorrectable_error(cxl_dev, pcie_dev, status, fe, hl);
 
 	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
 
@@ -742,7 +743,7 @@ static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_b
 
 static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
 {
-	return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras);
+	return __cxl_handle_ras(&cxlds->cxlmd->dev, NULL, cxlds->regs.ras);
 }
 
 #ifdef CONFIG_PCIEAER_CXL
@@ -814,7 +815,7 @@ static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev, struct device **dev)
 		struct cxl_dport *dport = NULL;
 
 		port = find_cxl_port(&pdev->dev, &dport);
-		if (!port) {
+		if (!port || !is_cxl_port(&port->dev)) {
 			pci_err(pdev, "Failed to find root/dport in CXL topology\n");
 			return NULL;
 		}
@@ -848,7 +849,7 @@ static void cxl_port_cor_error_detected(struct pci_dev *pdev)
 	struct device *dev;
 	void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev);
 
-	__cxl_handle_cor_ras(dev, ras_base);
+	__cxl_handle_cor_ras(dev, &pdev->dev, ras_base);
 }
 
 static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev)
@@ -856,7 +857,7 @@ static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev)
 	struct device *dev;
 	void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev);
 
-	return __cxl_handle_ras(dev, ras_base);
+	return __cxl_handle_ras(dev, &pdev->dev, ras_base);
 }
 
 void cxl_uport_init_ras_reporting(struct cxl_port *port)
@@ -909,13 +910,13 @@ EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
 static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
 					  struct cxl_dport *dport)
 {
-	return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+	return __cxl_handle_cor_ras(&cxlds->cxlmd->dev, NULL, dport->regs.ras);
 }
 
 static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds,
 				       struct cxl_dport *dport)
 {
-	return __cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras);
+	return __cxl_handle_ras(&cxlds->cxlmd->dev, NULL, dport->regs.ras);
 }
 
 /*
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index b536233ac210..a74803f4aa22 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -49,18 +49,22 @@
 )
 
 TRACE_EVENT(cxl_port_aer_uncorrectable_error,
-	TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
-	TP_ARGS(dev, status, fe, hl),
+	TP_PROTO(struct device *cxl_dev, struct device *pcie_dev, u32 status, u32 fe, u32 *hl),
+	TP_ARGS(cxl_dev, pcie_dev, status, fe, hl),
 	TP_STRUCT__entry(
-		__string(devname, dev_name(dev))
-		__string(parent, dev_name(dev->parent))
+		__string(cxl_name, dev_name(cxl_dev))
+		__string(cxl_parent_name, dev_name(cxl_dev->parent))
+		__string(pcie_name, dev_name(pcie_dev))
+		__string(pcie_parent_name, dev_name(pcie_dev->parent))
 		__field(u32, status)
 		__field(u32, first_error)
 		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
 	),
 	TP_fast_assign(
-		__assign_str(devname);
-		__assign_str(parent);
+		__assign_str(cxl_name);
+		__assign_str(cxl_parent_name);
+		__assign_str(pcie_name);
+		__assign_str(pcie_parent_name);
 		__entry->status = status;
 		__entry->first_error = fe;
 		/*
@@ -69,8 +73,9 @@ TRACE_EVENT(cxl_port_aer_uncorrectable_error,
 		 */
 		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
 	),
-	TP_printk("device=%s parent=%s status: '%s' first_error: '%s'",
-		__get_str(devname), __get_str(parent),
+	TP_printk("device=%s (%s) parent=%s (%s) status: '%s' first_error: '%s'",
+		__get_str(cxl_name), __get_str(pcie_name),
+		__get_str(cxl_parent_name), __get_str(pcie_parent_name),
 		show_uc_errs(__entry->status),
 		show_uc_errs(__entry->first_error)
 	)
@@ -125,20 +130,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
 )
 
 TRACE_EVENT(cxl_port_aer_correctable_error,
-	TP_PROTO(struct device *dev, u32 status),
-	TP_ARGS(dev, status),
+	TP_PROTO(struct device *cxl_dev, struct device *pcie_dev, u32 status),
+	TP_ARGS(cxl_dev, pcie_dev, status),
 	TP_STRUCT__entry(
-		__string(devname, dev_name(dev))
-		__string(parent, dev_name(dev->parent))
+		__string(cxl_name, dev_name(cxl_dev))
+		__string(cxl_parent_name, dev_name(cxl_dev->parent))
+		__string(pcie_name, dev_name(pcie_dev))
+		__string(pcie_parent_name, dev_name(pcie_dev->parent))
 		__field(u32, status)
 	),
 	TP_fast_assign(
-		__assign_str(devname);
-		__assign_str(parent);
+		__assign_str(cxl_name);
+		__assign_str(cxl_parent_name);
+		__assign_str(pcie_name);
+		__assign_str(pcie_parent_name);
 		__entry->status = status;
 	),
-	TP_printk("device=%s parent=%s status='%s'",
-		__get_str(devname), __get_str(parent),
+	TP_printk("device=%s (%s) parent=%s (%s) status='%s'",
+		__get_str(cxl_name), __get_str(pcie_name),
+		__get_str(cxl_parent_name), __get_str(pcie_parent_name),
 		show_ce_errs(__entry->status)
 	)
 );
-- 
2.34.1


  parent reply	other threads:[~2025-02-11 19:27 UTC|newest]

Thread overview: 94+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-11 19:24 [PATCH v7 00/17] Enable CXL PCIe port protocol error handling and logging Terry Bowman
2025-02-11 19:24 ` [PATCH v7 01/17] PCI/AER: Introduce 'struct cxl_err_handlers' and add to 'struct pci_driver' Terry Bowman
2025-02-11 20:25   ` Bjorn Helgaas
2025-02-11 20:42   ` Dan Williams
2025-02-11 19:24 ` [PATCH v7 02/17] PCI/AER: Rename AER driver's interfaces to also indicate CXL PCIe Port support Terry Bowman
2025-02-11 20:26   ` Bjorn Helgaas
2025-02-11 20:44   ` Dan Williams
2025-02-11 19:24 ` [PATCH v7 03/17] CXL/PCI: Introduce PCIe helper functions pcie_is_cxl() and pcie_is_cxl_port() Terry Bowman
2025-02-11 20:28   ` Bjorn Helgaas
2025-02-11 20:38     ` Bowman, Terry
2025-02-11 22:33   ` Dan Williams
2025-02-12 19:07     ` Bowman, Terry
2025-02-12 19:51       ` Dan Williams
2025-03-04 19:11   ` Ira Weiny
2025-02-11 19:24 ` [PATCH v7 04/17] PCI/AER: Modify AER driver logging to report CXL or PCIe bus error type Terry Bowman
2025-02-11 20:28   ` Bjorn Helgaas
2025-02-11 23:47   ` Dan Williams
2025-02-12 19:15     ` Bowman, Terry
2025-02-12 19:57       ` Dan Williams
2025-02-12 21:08         ` Bowman, Terry
2025-02-12 21:17           ` Lukas Wunner
2025-02-11 19:24 ` [PATCH v7 05/17] PCI/AER: Add CXL PCIe Port correctable error support in AER service driver Terry Bowman
2025-02-11 20:28   ` Bjorn Helgaas
2025-02-11 23:58   ` Dan Williams
2025-02-12 21:52     ` Bowman, Terry
2025-02-11 19:24 ` [PATCH v7 06/17] PCI/AER: Add CXL PCIe Port uncorrectable error recovery " Terry Bowman
2025-02-11 20:29   ` Bjorn Helgaas
2025-02-11 21:59   ` Dave Jiang
2025-02-12  0:02   ` Gregory Price
2025-02-12  0:24   ` Dan Williams
2025-02-14 19:36     ` Bowman, Terry
2025-02-14 15:11   ` Jonathan Cameron
2025-02-18 15:43     ` Bowman, Terry
2025-02-14 17:36   ` Fan Ni
2025-02-11 19:24 ` [PATCH v7 07/17] cxl/pci: Map CXL PCIe Root Port and Downstream Switch Port RAS registers Terry Bowman
2025-02-11 22:40   ` Dave Jiang
2025-02-12  1:23   ` Dan Williams
2025-02-13 15:43     ` Bowman, Terry
2025-02-14 21:24       ` Dan Williams
2025-02-14 22:23         ` Bowman, Terry
2025-02-14 22:42           ` Dan Williams
2025-02-12 22:28   ` Alison Schofield
2025-02-12 22:37     ` Bowman, Terry
2025-02-11 19:24 ` [PATCH v7 08/17] cxl/pci: Map CXL PCIe Upstream " Terry Bowman
2025-02-11 23:02   ` Dave Jiang
2025-02-12  2:00   ` Dan Williams
2025-02-14 19:46     ` Bowman, Terry
2025-02-14 21:29       ` Dan Williams
2025-02-14 15:15   ` Jonathan Cameron
2025-02-14 19:50     ` Bowman, Terry
2025-02-11 19:24 ` [PATCH v7 09/17] cxl/pci: Update RAS handler interfaces to also support CXL PCIe Ports Terry Bowman
2025-02-11 23:26   ` Dave Jiang
2025-02-14 15:19   ` Jonathan Cameron
2025-02-11 19:24 ` [PATCH v7 10/17] cxl/pci: Add log message and add type check in existing RAS handlers Terry Bowman
2025-02-11 23:28   ` Dave Jiang
2025-02-12 22:59   ` Dan Williams
2025-02-13  0:08     ` Bowman, Terry
2025-02-14 15:28       ` Jonathan Cameron
2025-02-11 19:24 ` [PATCH v7 11/17] cxl/pci: Change find_cxl_port() to non-static Terry Bowman
2025-02-11 23:42   ` Dave Jiang
2025-02-13 23:15   ` Dan Williams
2025-02-11 19:24 ` [PATCH v7 12/17] cxl/pci: Add error handler for CXL PCIe Port RAS errors Terry Bowman
2025-02-12  0:11   ` Dave Jiang
2025-02-12 16:34     ` Bowman, Terry
2025-02-14  2:18   ` Dan Williams
2025-02-14 21:43     ` Bowman, Terry
2025-02-15  0:20       ` Dan Williams
2025-02-18 15:33         ` Bowman, Terry
2025-02-18 17:15           ` Dan Williams
2025-03-05  0:22   ` Ira Weiny
2025-03-06 13:50     ` Bowman, Terry
2025-03-06 13:50     ` Bowman, Terry
2025-02-11 19:24 ` [PATCH v7 13/17] cxl/pci: Add trace logging " Terry Bowman
2025-02-12  0:11   ` Gregory Price
2025-02-12  0:17   ` Dave Jiang
2025-02-12  0:19     ` Dave Jiang
2025-02-12 16:23     ` Bowman, Terry
2025-02-14  2:21   ` Dan Williams
2025-02-14 15:34     ` Jonathan Cameron
2025-02-11 19:24 ` Terry Bowman [this message]
2025-02-12  0:21   ` [PATCH v7 14/17] cxl/pci: Update CXL Port RAS logging to also display PCIe SBDF Dave Jiang
2025-02-12 16:20     ` Bowman, Terry
2025-02-12 23:30   ` Alison Schofield
2025-02-12 23:34     ` Bowman, Terry
2025-02-11 19:24 ` [PATCH v7 15/17] cxl/pci: Add support to assign and clear pci_driver::cxl_err_handlers Terry Bowman
2025-02-12  0:38   ` Dave Jiang
2025-02-14  2:29   ` Dan Williams
2025-02-11 19:24 ` [PATCH v7 16/17] PCI/AER: Enable internal errors for CXL Upstream and Downstream Switch Ports Terry Bowman
2025-02-11 20:25   ` Bjorn Helgaas
2025-02-11 20:40     ` Bowman, Terry
2025-02-12  0:40   ` Dave Jiang
2025-02-14  2:35   ` Dan Williams
2025-02-11 19:24 ` [PATCH v7 17/17] cxl/pci: Handle CXL Endpoint and RCH Protocol Errors separately from PCIe errors Terry Bowman
2025-02-14  2:43   ` Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250211192444.2292833-15-terry.bowman@amd.com \
    --to=terry.bowman@amd.com \
    --cc=Benjamin.Cheatham@amd.com \
    --cc=PradeepVineshReddy.Kodamati@amd.com \
    --cc=Smita.KoralahalliChannabasappa@amd.com \
    --cc=alison.schofield@intel.com \
    --cc=bhelgaas@google.com \
    --cc=dan.j.williams@intel.com \
    --cc=dave.jiang@intel.com \
    --cc=dave@stgolabs.net \
    --cc=ira.weiny@intel.com \
    --cc=jonathan.cameron@huawei.com \
    --cc=linux-cxl@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=lukas@wunner.de \
    --cc=mahesh@linux.ibm.com \
    --cc=ming.li@zohomail.com \
    --cc=nathan.fontenot@amd.com \
    --cc=nifan.cxl@gmail.com \
    --cc=oohall@gmail.com \
    --cc=rrichter@amd.com \
    --cc=vishal.l.verma@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox