* [PATCH v8 1/2] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors
2025-03-10 22:38 [PATCH v8 0/2] acpi/ghes, cper, cxl: Process CXL CPER Protocol errors Smita Koralahalli
@ 2025-03-10 22:38 ` Smita Koralahalli
2025-03-12 17:56 ` Ira Weiny
` (2 more replies)
2025-03-10 22:38 ` [PATCH v8 2/2] cxl/pci: Add trace logging for CXL PCIe Port RAS errors Smita Koralahalli
` (2 subsequent siblings)
3 siblings, 3 replies; 10+ messages in thread
From: Smita Koralahalli @ 2025-03-10 22:38 UTC (permalink / raw)
To: linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Terry Bowman,
Smita Koralahalli
When PCIe AER is in FW-First, OS should process CXL Protocol errors from
CPER records. Introduce support for handling and logging CXL Protocol
errors.
The defined trace events cxl_aer_uncorrectable_error and
cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
to trace FW-First Protocol errors.
Since the CXL code is required to be called from process context and
GHES is in interrupt context, use workqueues for processing.
Similar to CXL CPER event handling, use kfifo to handle errors as it
simplifies queue processing by providing lock free fifo operations.
Add the ability for the CXL sub-system to register a workqueue to
process CXL CPER protocol errors.
Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
drivers/acpi/apei/ghes.c | 49 ++++++++++++++++++++++
drivers/cxl/core/Makefile | 1 +
drivers/cxl/core/core.h | 3 ++
drivers/cxl/core/port.c | 7 ++++
drivers/cxl/core/ras.c | 86 +++++++++++++++++++++++++++++++++++++++
include/cxl/event.h | 15 +++++++
tools/testing/cxl/Kbuild | 1 +
7 files changed, 162 insertions(+)
create mode 100644 drivers/cxl/core/ras.c
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 4d725d988c43..289e365f84b2 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -674,6 +674,15 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
schedule_work(&entry->work);
}
+/* Room for 8 entries */
+#define CXL_CPER_PROT_ERR_FIFO_DEPTH 8
+static DEFINE_KFIFO(cxl_cper_prot_err_fifo, struct cxl_cper_prot_err_work_data,
+ CXL_CPER_PROT_ERR_FIFO_DEPTH);
+
+/* Synchronize schedule_work() with cxl_cper_prot_err_work changes */
+static DEFINE_SPINLOCK(cxl_cper_prot_err_work_lock);
+struct work_struct *cxl_cper_prot_err_work;
+
static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
int severity)
{
@@ -700,6 +709,11 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER))
pr_warn(FW_WARN "CXL CPER no device serial number\n");
+ guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock);
+
+ if (!cxl_cper_prot_err_work)
+ return;
+
switch (prot_err->agent_type) {
case RCD:
case DEVICE:
@@ -721,9 +735,44 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err,
prot_err->agent_type);
return;
}
+
+ if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) {
+ pr_err_ratelimited("CXL CPER kfifo overflow\n");
+ return;
+ }
+
+ schedule_work(cxl_cper_prot_err_work);
#endif
}
+int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+ if (cxl_cper_prot_err_work)
+ return -EINVAL;
+
+ guard(spinlock)(&cxl_cper_prot_err_work_lock);
+ cxl_cper_prot_err_work = work;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_register_prot_err_work, "CXL");
+
+int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+ if (cxl_cper_prot_err_work != work)
+ return -EINVAL;
+
+ guard(spinlock)(&cxl_cper_prot_err_work_lock);
+ cxl_cper_prot_err_work = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_prot_err_work, "CXL");
+
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+ return kfifo_get(&cxl_cper_prot_err_fifo, wd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_prot_err_kfifo_get, "CXL");
+
/* Room for 8 entries for each of the 4 event log queues */
#define CXL_CPER_FIFO_DEPTH 32
DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH);
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index e1d591e52d4b..139b349b3a52 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -15,6 +15,7 @@ cxl_core-y += hdm.o
cxl_core-y += pmu.o
cxl_core-y += cdat.o
cxl_core-y += acpi.o
+cxl_core-y += ras.o
cxl_core-$(CONFIG_TRACING) += trace.o
cxl_core-$(CONFIG_CXL_REGION) += region.o
cxl_core-$(CONFIG_CXL_FEATURES) += features.o
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 3d3b00835446..1803aedb25ca 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -119,6 +119,9 @@ int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port);
+int cxl_ras_init(void);
+void cxl_ras_exit(void);
+
#ifdef CONFIG_CXL_FEATURES
size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
enum cxl_get_feat_selection selection,
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 6a44b6dad3c7..0fd6646c1a2e 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -2348,8 +2348,14 @@ static __init int cxl_core_init(void)
if (rc)
goto err_region;
+ rc = cxl_ras_init();
+ if (rc)
+ goto err_ras;
+
return 0;
+err_ras:
+ cxl_region_exit();
err_region:
bus_unregister(&cxl_bus_type);
err_bus:
@@ -2361,6 +2367,7 @@ static __init int cxl_core_init(void)
static void cxl_core_exit(void)
{
+ cxl_ras_exit();
cxl_region_exit();
bus_unregister(&cxl_bus_type);
destroy_workqueue(cxl_bus_wq);
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
new file mode 100644
index 000000000000..9b1773273e23
--- /dev/null
+++ b/drivers/cxl/core/ras.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
+
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include <cxl/event.h>
+#include <cxlmem.h>
+#include "trace.h"
+
+static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+ struct cxl_dev_state *cxlds;
+
+ cxlds = pci_get_drvdata(pdev);
+ if (!cxlds)
+ return;
+
+ trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+}
+
+static void cxl_cper_trace_uncorr_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+ struct cxl_dev_state *cxlds;
+ u32 fe;
+
+ cxlds = pci_get_drvdata(pdev);
+ if (!cxlds)
+ return;
+
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ ras_cap.cap_control));
+ else
+ fe = status;
+
+ trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe,
+ ras_cap.header_log);
+}
+
+static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
+{
+ unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
+ data->prot_err.agent_addr.function);
+ struct pci_dev *pdev __free(pci_dev_put) =
+ pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
+ data->prot_err.agent_addr.bus,
+ devfn);
+
+ if (!pdev)
+ return;
+
+ guard(device)(&pdev->dev);
+
+ if (data->severity == AER_CORRECTABLE)
+ cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
+ else
+ cxl_cper_trace_uncorr_prot_err(pdev, data->ras_cap);
+}
+
+static void cxl_cper_prot_err_work_fn(struct work_struct *work)
+{
+ struct cxl_cper_prot_err_work_data wd;
+
+ while (cxl_cper_prot_err_kfifo_get(&wd))
+ cxl_cper_handle_prot_err(&wd);
+}
+static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
+
+int cxl_ras_init(void)
+{
+ int rc;
+
+ rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
+
+ return rc;
+}
+
+void cxl_ras_exit(void)
+{
+ cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
+ cancel_work_sync(&cxl_cper_prot_err_work);
+}
diff --git a/include/cxl/event.h b/include/cxl/event.h
index 8381a07052d0..f9ae1796da85 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -254,6 +254,9 @@ struct cxl_cper_prot_err_work_data {
int cxl_cper_register_work(struct work_struct *work);
int cxl_cper_unregister_work(struct work_struct *work);
int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
+int cxl_cper_register_prot_err_work(struct work_struct *work);
+int cxl_cper_unregister_prot_err_work(struct work_struct *work);
+int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd);
#else
static inline int cxl_cper_register_work(struct work_struct *work)
{
@@ -268,6 +271,18 @@ static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
{
return 0;
}
+static inline int cxl_cper_register_prot_err_work(struct work_struct *work)
+{
+ return 0;
+}
+static inline int cxl_cper_unregister_prot_err_work(struct work_struct *work)
+{
+ return 0;
+}
+static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data *wd)
+{
+ return 0;
+}
#endif
#endif /* _LINUX_CXL_EVENT_H */
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index ef10a896a384..4efcc0606bd6 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -62,6 +62,7 @@ cxl_core-y += $(CXL_CORE_SRC)/hdm.o
cxl_core-y += $(CXL_CORE_SRC)/pmu.o
cxl_core-y += $(CXL_CORE_SRC)/cdat.o
cxl_core-y += $(CXL_CORE_SRC)/acpi.o
+cxl_core-y += $(CXL_CORE_SRC)/ras.o
cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o
--
2.17.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH v8 1/2] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors
2025-03-10 22:38 ` [PATCH v8 1/2] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors Smita Koralahalli
@ 2025-03-12 17:56 ` Ira Weiny
2025-03-12 18:58 ` Alison Schofield
2025-03-13 1:03 ` Li Ming
2 siblings, 0 replies; 10+ messages in thread
From: Ira Weiny @ 2025-03-12 17:56 UTC (permalink / raw)
To: Smita Koralahalli, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Terry Bowman,
Smita Koralahalli
Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records. Introduce support for handling and logging CXL Protocol
> errors.
>
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
> to trace FW-First Protocol errors.
>
> Since the CXL code is required to be called from process context and
> GHES is in interrupt context, use workqueues for processing.
>
> Similar to CXL CPER event handling, use kfifo to handle errors as it
> simplifies queue processing by providing lock free fifo operations.
>
> Add the ability for the CXL sub-system to register a workqueue to
> process CXL CPER protocol errors.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
[snip]
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v8 1/2] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors
2025-03-10 22:38 ` [PATCH v8 1/2] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors Smita Koralahalli
2025-03-12 17:56 ` Ira Weiny
@ 2025-03-12 18:58 ` Alison Schofield
2025-03-13 1:03 ` Li Ming
2 siblings, 0 replies; 10+ messages in thread
From: Alison Schofield @ 2025-03-12 18:58 UTC (permalink / raw)
To: Smita Koralahalli
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel, Vishal Verma,
Ira Weiny, Dan Williams, Jonathan Cameron, Yazen Ghannam,
Terry Bowman
On Mon, Mar 10, 2025 at 10:38:38PM +0000, Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records. Introduce support for handling and logging CXL Protocol
> errors.
>
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
> to trace FW-First Protocol errors.
>
> Since the CXL code is required to be called from process context and
> GHES is in interrupt context, use workqueues for processing.
>
> Similar to CXL CPER event handling, use kfifo to handle errors as it
> simplifies queue processing by providing lock free fifo operations.
>
> Add the ability for the CXL sub-system to register a workqueue to
> process CXL CPER protocol errors.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
snip
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v8 1/2] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors
2025-03-10 22:38 ` [PATCH v8 1/2] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors Smita Koralahalli
2025-03-12 17:56 ` Ira Weiny
2025-03-12 18:58 ` Alison Schofield
@ 2025-03-13 1:03 ` Li Ming
2 siblings, 0 replies; 10+ messages in thread
From: Li Ming @ 2025-03-13 1:03 UTC (permalink / raw)
To: Smita Koralahalli
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Terry Bowman,
linux-efi, linux-cxl, linux-kernel
On 3/11/2025 6:38 AM, Smita Koralahalli wrote:
> When PCIe AER is in FW-First, OS should process CXL Protocol errors from
> CPER records. Introduce support for handling and logging CXL Protocol
> errors.
>
> The defined trace events cxl_aer_uncorrectable_error and
> cxl_aer_correctable_error trace native CXL AER endpoint errors. Reuse them
> to trace FW-First Protocol errors.
>
> Since the CXL code is required to be called from process context and
> GHES is in interrupt context, use workqueues for processing.
>
> Similar to CXL CPER event handling, use kfifo to handle errors as it
> simplifies queue processing by providing lock free fifo operations.
>
> Add the ability for the CXL sub-system to register a workqueue to
> process CXL CPER protocol errors.
>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
[snip]
> +int cxl_ras_init(void)
> +{
> + int rc;
> +
> + rc = cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
> +
> + return rc;
> +}
Just one minor comment.
This rc is not needed, can return cxl_cper_register_prot_err_work() directly.
Reviewed-by: Li Ming <ming.li@zohomail.com>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v8 2/2] cxl/pci: Add trace logging for CXL PCIe Port RAS errors
2025-03-10 22:38 [PATCH v8 0/2] acpi/ghes, cper, cxl: Process CXL CPER Protocol errors Smita Koralahalli
2025-03-10 22:38 ` [PATCH v8 1/2] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors Smita Koralahalli
@ 2025-03-10 22:38 ` Smita Koralahalli
2025-03-12 18:59 ` Alison Schofield
2025-03-13 1:03 ` Li Ming
2025-03-13 19:38 ` [PATCH v8 0/2] acpi/ghes, cper, cxl: Process CXL CPER Protocol errors Luck, Tony
2025-03-13 19:58 ` Dave Jiang
3 siblings, 2 replies; 10+ messages in thread
From: Smita Koralahalli @ 2025-03-10 22:38 UTC (permalink / raw)
To: linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Terry Bowman,
Smita Koralahalli
The CXL drivers use kernel trace functions for logging endpoint and
Restricted CXL host (RCH) Downstream Port RAS errors. Similar functionality
is required for CXL Root Ports, CXL Downstream Switch Ports, and CXL
Upstream Switch Ports.
Introduce trace logging functions for both RAS correctable and
uncorrectable errors specific to CXL PCIe Ports. Use them to trace
FW-First Protocol errors.
Co-developed-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
---
drivers/cxl/core/ras.c | 37 +++++++++++++++++++++++++++++++
drivers/cxl/core/trace.h | 47 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 84 insertions(+)
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
index 9b1773273e23..7617c186891d 100644
--- a/drivers/cxl/core/ras.c
+++ b/drivers/cxl/core/ras.c
@@ -7,6 +7,30 @@
#include <cxlmem.h>
#include "trace.h"
+static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+
+ trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+}
+
+static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+ u32 fe;
+
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ ras_cap.cap_control));
+ else
+ fe = status;
+
+ trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
+ ras_cap.header_log);
+}
+
static void cxl_cper_trace_corr_prot_err(struct pci_dev *pdev,
struct cxl_ras_capability_regs ras_cap)
{
@@ -49,12 +73,25 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
data->prot_err.agent_addr.bus,
devfn);
+ int port_type;
if (!pdev)
return;
guard(device)(&pdev->dev);
+ port_type = pci_pcie_type(pdev);
+ if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
+ port_type == PCI_EXP_TYPE_DOWNSTREAM ||
+ port_type == PCI_EXP_TYPE_UPSTREAM) {
+ if (data->severity == AER_CORRECTABLE)
+ cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap);
+ else
+ cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap);
+
+ return;
+ }
+
if (data->severity == AER_CORRECTABLE)
cxl_cper_trace_corr_prot_err(pdev, data->ras_cap);
else
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index e3f842dcdf1d..25ebfbc1616c 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -48,6 +48,34 @@
{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
)
+TRACE_EVENT(cxl_port_aer_uncorrectable_error,
+ TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
+ TP_ARGS(dev, status, fe, hl),
+ TP_STRUCT__entry(
+ __string(device, dev_name(dev))
+ __string(host, dev_name(dev->parent))
+ __field(u32, status)
+ __field(u32, first_error)
+ __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
+ ),
+ TP_fast_assign(
+ __assign_str(device);
+ __assign_str(host);
+ __entry->status = status;
+ __entry->first_error = fe;
+ /*
+ * Embed the 512B headerlog data for user app retrieval and
+ * parsing, but no need to print this in the trace buffer.
+ */
+ memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
+ ),
+ TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
+ __get_str(device), __get_str(host),
+ show_uc_errs(__entry->status),
+ show_uc_errs(__entry->first_error)
+ )
+);
+
TRACE_EVENT(cxl_aer_uncorrectable_error,
TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
TP_ARGS(cxlmd, status, fe, hl),
@@ -96,6 +124,25 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \
)
+TRACE_EVENT(cxl_port_aer_correctable_error,
+ TP_PROTO(struct device *dev, u32 status),
+ TP_ARGS(dev, status),
+ TP_STRUCT__entry(
+ __string(device, dev_name(dev))
+ __string(host, dev_name(dev->parent))
+ __field(u32, status)
+ ),
+ TP_fast_assign(
+ __assign_str(device);
+ __assign_str(host);
+ __entry->status = status;
+ ),
+ TP_printk("device=%s host=%s status='%s'",
+ __get_str(device), __get_str(host),
+ show_ce_errs(__entry->status)
+ )
+);
+
TRACE_EVENT(cxl_aer_correctable_error,
TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
TP_ARGS(cxlmd, status),
--
2.17.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH v8 2/2] cxl/pci: Add trace logging for CXL PCIe Port RAS errors
2025-03-10 22:38 ` [PATCH v8 2/2] cxl/pci: Add trace logging for CXL PCIe Port RAS errors Smita Koralahalli
@ 2025-03-12 18:59 ` Alison Schofield
2025-03-13 1:03 ` Li Ming
1 sibling, 0 replies; 10+ messages in thread
From: Alison Schofield @ 2025-03-12 18:59 UTC (permalink / raw)
To: Smita Koralahalli
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel, Vishal Verma,
Ira Weiny, Dan Williams, Jonathan Cameron, Yazen Ghannam,
Terry Bowman
On Mon, Mar 10, 2025 at 10:38:39PM +0000, Smita Koralahalli wrote:
> The CXL drivers use kernel trace functions for logging endpoint and
> Restricted CXL host (RCH) Downstream Port RAS errors. Similar functionality
> is required for CXL Root Ports, CXL Downstream Switch Ports, and CXL
> Upstream Switch Ports.
>
> Introduce trace logging functions for both RAS correctable and
> uncorrectable errors specific to CXL PCIe Ports. Use them to trace
> FW-First Protocol errors.
>
> Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v8 2/2] cxl/pci: Add trace logging for CXL PCIe Port RAS errors
2025-03-10 22:38 ` [PATCH v8 2/2] cxl/pci: Add trace logging for CXL PCIe Port RAS errors Smita Koralahalli
2025-03-12 18:59 ` Alison Schofield
@ 2025-03-13 1:03 ` Li Ming
1 sibling, 0 replies; 10+ messages in thread
From: Li Ming @ 2025-03-13 1:03 UTC (permalink / raw)
To: Smita Koralahalli
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Terry Bowman,
linux-efi, linux-kernel, linux-cxl
On 3/11/2025 6:38 AM, Smita Koralahalli wrote:
> The CXL drivers use kernel trace functions for logging endpoint and
> Restricted CXL host (RCH) Downstream Port RAS errors. Similar functionality
> is required for CXL Root Ports, CXL Downstream Switch Ports, and CXL
> Upstream Switch Ports.
>
> Introduce trace logging functions for both RAS correctable and
> uncorrectable errors specific to CXL PCIe Ports. Use them to trace
> FW-First Protocol errors.
>
> Co-developed-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Li Ming <ming.li@zohomail.com>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v8 0/2] acpi/ghes, cper, cxl: Process CXL CPER Protocol errors
2025-03-10 22:38 [PATCH v8 0/2] acpi/ghes, cper, cxl: Process CXL CPER Protocol errors Smita Koralahalli
2025-03-10 22:38 ` [PATCH v8 1/2] acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors Smita Koralahalli
2025-03-10 22:38 ` [PATCH v8 2/2] cxl/pci: Add trace logging for CXL PCIe Port RAS errors Smita Koralahalli
@ 2025-03-13 19:38 ` Luck, Tony
2025-03-13 19:58 ` Dave Jiang
3 siblings, 0 replies; 10+ messages in thread
From: Luck, Tony @ 2025-03-13 19:38 UTC (permalink / raw)
To: Smita Koralahalli
Cc: linux-efi, linux-kernel, linux-cxl, Ard Biesheuvel,
Alison Schofield, Vishal Verma, Ira Weiny, Dan Williams,
Jonathan Cameron, Yazen Ghannam, Terry Bowman
On Mon, Mar 10, 2025 at 10:38:37PM +0000, Smita Koralahalli wrote:
> This patchset adds logging support for CXL CPER endpoint and port Protocol
> errors.
>
> Based on top of cxl-next.
Seems like a lot of stuff happening in patch 1, but I don't see an
obvious place to split it.
Reviewed-by: Tony Luck <tony.luck@intel.com>
-Tony
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v8 0/2] acpi/ghes, cper, cxl: Process CXL CPER Protocol errors
2025-03-10 22:38 [PATCH v8 0/2] acpi/ghes, cper, cxl: Process CXL CPER Protocol errors Smita Koralahalli
` (2 preceding siblings ...)
2025-03-13 19:38 ` [PATCH v8 0/2] acpi/ghes, cper, cxl: Process CXL CPER Protocol errors Luck, Tony
@ 2025-03-13 19:58 ` Dave Jiang
3 siblings, 0 replies; 10+ messages in thread
From: Dave Jiang @ 2025-03-13 19:58 UTC (permalink / raw)
To: Smita Koralahalli, linux-efi, linux-kernel, linux-cxl
Cc: Ard Biesheuvel, Alison Schofield, Vishal Verma, Ira Weiny,
Dan Williams, Jonathan Cameron, Yazen Ghannam, Terry Bowman,
Luck, Tony, Li Ming
On 3/10/25 3:38 PM, Smita Koralahalli wrote:
> This patchset adds logging support for CXL CPER endpoint and port Protocol
> errors.
>
> Based on top of cxl-next.
Applied to cxl/next with change suggested by Ming.
>
> Link to v7:
> https://lore.kernel.org/linux-cxl/20250226221157.149406-1-Smita.KoralahalliChannabasappa@amd.com
>
> Changes in v7 -> v8:
> [Yazen]: Moved guard() after !pdev check.
> [Ira]: Included ras.o in test build file.
> [Alison]: Naming consistency: devname -> device, parent -> host.
>
> Changes in v6 -> v7:
> Reworked to move registration and protocol error handling into a new
> file inside CXL core. (cxl/core/ras.c).
>
> Changes in v5 -> v6:
> [Dave, Jonathan, Ira]: Reviewed-by tags.
> [Dave]: Check for cxlds before assigning fe.
> Merge one of the patches (Port error trace logging) from Terry's Port
> error handling.
> Rename host -> parent.
>
> Changes in v4 -> v5:
> [Dave]: Reviewed-by tags.
> [Jonathan]: Remove blank line.
> [Jonathan, Ira]: Change CXL -> "CXL".
> [Ira]: Fix build error for CONFIG_ACPI_APEI_PCIEAER.
>
> Changes in v3 -> v4:
> [Ira]: Use memcpy() for RAS Cap struct.
> [Jonathan]: Commit description edits.
> [Jonathan]: Use separate work registration functions for protocol and
> component errors.
> [Jonathan, Ira]: Replace flags with separate functions for port and
> device errors.
> [Jonathan]: Use goto for register and unregister calls.
>
> Changes in v2 -> v3:
> [Dan]: Define a new workqueue for CXL CPER Protocol errors and avoid
> reusing existing workqueue which handles CXL CPER events.
> [Dan] Update function and struct names.
> [Ira] Don't define common function get_cxl_devstate().
> [Dan] Use switch cases rather than defining array of structures.
> [Dan] Pass the entire cxl_cper_prot_err struct for CXL subsystem.
> [Dan] Use pr_err_ratelimited().
> [Dan] Use AER_ severities directly. Don't define CXL_ severities.
> [Dan] Limit either to Device ID or Agent Info check.
> [Dan] Validate size of RAS field matches expectations.
>
> Changes in v1 -> v2:
> [Jonathan] Refactor code for trace support. Rename get_cxl_dev()
> to get_cxl_devstate().
> [Jonathan] Cleanups for get_cxl_devstate().
> [Alison, Jonathan]: Define array of structures for Device ID and Serial
> number comparison.
> [Dave] p_err -> rec/p_rec.
> [Jonathan] Remove pr_warn.
>
> Smita Koralahalli (2):
> acpi/ghes, cxl/pci: Process CXL CPER Protocol Errors
> cxl/pci: Add trace logging for CXL PCIe Port RAS errors
>
> drivers/acpi/apei/ghes.c | 49 +++++++++++++++
> drivers/cxl/core/Makefile | 1 +
> drivers/cxl/core/core.h | 3 +
> drivers/cxl/core/port.c | 7 +++
> drivers/cxl/core/ras.c | 123 ++++++++++++++++++++++++++++++++++++++
> drivers/cxl/core/trace.h | 47 +++++++++++++++
> include/cxl/event.h | 15 +++++
> tools/testing/cxl/Kbuild | 1 +
> 8 files changed, 246 insertions(+)
> create mode 100644 drivers/cxl/core/ras.c
>
^ permalink raw reply [flat|nested] 10+ messages in thread