* [PATCH] PCI/AER: Consolidate CXL and native AER reporting paths
@ 2025-03-17 10:14 Karolina Stolarek
2025-03-19 22:21 ` Bjorn Helgaas
0 siblings, 1 reply; 7+ messages in thread
From: Karolina Stolarek @ 2025-03-17 10:14 UTC (permalink / raw)
To: linux-pci; +Cc: Bjorn Helgaas, Jon Pan-Doh, Terry Bowman
Make CXL devices use aer_print_error() when reporting AER errors.
Add a helper function to populate aer_err_info struct before logging
an error. Move struct aer_err_info definition to the aer.h header
to make it visible to CXL.
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
---
The patch was tested on the top of Terry Bowman's series[1], with
a setup as outlined in the cover letter, and rebased on the top
of pci-next, with no functional changes.
[1] -
https://lore.kernel.org/linux-pci/20250211192444.2292833-1-terry.bowman@amd.com
drivers/cxl/core/pci.c | 5 +++-
drivers/pci/pci.h | 23 ----------------
drivers/pci/pcie/aer.c | 60 ++++++++++++++++++------------------------
include/linux/aer.h | 25 ++++++++++++++++--
4 files changed, 52 insertions(+), 61 deletions(-)
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 013b869b66cb..217f13c30bde 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -871,6 +871,7 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
{
struct pci_dev *pdev = to_pci_dev(cxlds->dev);
struct aer_capability_regs aer_regs;
+ struct aer_err_info info;
struct cxl_dport *dport;
int severity;
@@ -885,7 +886,9 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
return;
- pci_print_aer(pdev, severity, &aer_regs);
+ memset(&info, 0, sizeof(info));
+ populate_aer_err_info(&info, severity, &aer_regs);
+ aer_print_error(pdev, &info);
if (severity == AER_CORRECTABLE)
cxl_handle_rdport_cor_ras(cxlds, dport);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9e0378fa30ac..b799c2ff7b85 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -561,30 +561,7 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev)
#ifdef CONFIG_PCIEAER
#include <linux/aer.h>
-#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
-
-struct aer_err_info {
- struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
- int error_dev_num;
-
- unsigned int id:16;
-
- unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
- unsigned int __pad1:5;
- unsigned int multi_error_valid:1;
-
- unsigned int first_error:5;
- unsigned int __pad2:2;
- unsigned int tlp_header_valid:1;
-
- unsigned int status; /* COR/UNCOR Error Status */
- unsigned int mask; /* COR/UNCOR Error Mask */
- struct pcie_tlp_log tlp; /* TLP Header */
-};
-
int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
-void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
-
int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
unsigned int tlp_len, bool flit,
struct pcie_tlp_log *log);
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index a1cf8c7ef628..411450ff981e 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -760,47 +760,33 @@ int cper_severity_to_aer(int cper_severity)
EXPORT_SYMBOL_GPL(cper_severity_to_aer);
#endif
-void pci_print_aer(struct pci_dev *dev, int aer_severity,
- struct aer_capability_regs *aer)
+void populate_aer_err_info(struct aer_err_info *info, int aer_severity,
+ struct aer_capability_regs *regs)
{
- int layer, agent, tlp_header_valid = 0;
- u32 status, mask;
- struct aer_err_info info;
+ int tlp_header_valid;
+
+ info->severity = aer_severity;
+ info->first_error = PCI_ERR_CAP_FEP(regs->cap_control);
if (aer_severity == AER_CORRECTABLE) {
- status = aer->cor_status;
- mask = aer->cor_mask;
+ info->id = regs->cor_err_source;
+ info->status = regs->cor_status;
+ info->mask = regs->cor_mask;
} else {
- status = aer->uncor_status;
- mask = aer->uncor_mask;
- tlp_header_valid = status & AER_LOG_TLP_MASKS;
+ info->id = regs->uncor_err_source;
+ info->status = regs->uncor_status;
+ info->mask = regs->uncor_mask;
+ tlp_header_valid = info->status & AER_LOG_TLP_MASKS;
+
+ if (tlp_header_valid) {
+ info->tlp_header_valid = tlp_header_valid;
+ info->tlp = regs->header_log;
+ }
}
+}
+EXPORT_SYMBOL_NS_GPL(populate_aer_err_info, "CXL");
- layer = AER_GET_LAYER_ERROR(aer_severity, status);
- agent = AER_GET_AGENT(aer_severity, status);
-
- memset(&info, 0, sizeof(info));
- info.severity = aer_severity;
- info.status = status;
- info.mask = mask;
- info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
-
- pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
- __aer_print_error(dev, &info);
- pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
- aer_error_layer[layer], aer_agent_string[agent]);
-
- if (aer_severity != AER_CORRECTABLE)
- pci_err(dev, "aer_uncor_severity: 0x%08x\n",
- aer->uncor_severity);
-
- if (tlp_header_valid)
- pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" "));
- trace_aer_event(dev_name(&dev->dev), (status & ~mask),
- aer_severity, tlp_header_valid, &aer->header_log);
-}
-EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
/**
* add_error_device - list device to be handled
@@ -1136,6 +1122,7 @@ static void aer_recover_work_func(struct work_struct *work)
{
struct aer_recover_entry entry;
struct pci_dev *pdev;
+ struct aer_err_info info;
while (kfifo_get(&aer_recover_ring, &entry)) {
pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
@@ -1146,7 +1133,10 @@ static void aer_recover_work_func(struct work_struct *work)
PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
continue;
}
- pci_print_aer(pdev, entry.severity, entry.regs);
+
+ memset(&info, 0, sizeof(info));
+ populate_aer_err_info(&info, entry.severity, entry.regs);
+ aer_print_error(pdev, &info);
/*
* Memory for aer_capability_regs(entry.regs) is being
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 02940be66324..ab408ec18e85 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -53,6 +53,26 @@ struct aer_capability_regs {
u16 uncor_err_source;
};
+#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
+struct aer_err_info {
+ struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
+ int error_dev_num;
+
+ unsigned int id:16;
+
+ unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
+ unsigned int __pad1:5;
+ unsigned int multi_error_valid:1;
+
+ unsigned int first_error:5;
+ unsigned int __pad2:2;
+ unsigned int tlp_header_valid:1;
+
+ unsigned int status; /* COR/UNCOR Error Status */
+ unsigned int mask; /* COR/UNCOR Error Mask */
+ struct pcie_tlp_log tlp; /* TLP Header */
+};
+
#if defined(CONFIG_PCIEAER)
int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
int pcie_aer_is_native(struct pci_dev *dev);
@@ -64,8 +84,9 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
#endif
-void pci_print_aer(struct pci_dev *dev, int aer_severity,
- struct aer_capability_regs *aer);
+void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
+void populate_aer_err_info(struct aer_err_info *info, int aer_severity,
+ struct aer_capability_regs *regs);
int cper_severity_to_aer(int cper_severity);
void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
int severity, struct aer_capability_regs *aer_regs);
--
2.43.5
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH] PCI/AER: Consolidate CXL and native AER reporting paths
2025-03-17 10:14 [PATCH] PCI/AER: Consolidate CXL and native AER reporting paths Karolina Stolarek
@ 2025-03-19 22:21 ` Bjorn Helgaas
2025-03-20 15:14 ` Karolina Stolarek
0 siblings, 1 reply; 7+ messages in thread
From: Bjorn Helgaas @ 2025-03-19 22:21 UTC (permalink / raw)
To: Karolina Stolarek
Cc: linux-pci, Bjorn Helgaas, Jon Pan-Doh, Terry Bowman, Len Brown,
James Morse, Tony Luck, Borislav Petkov, Ben Cheatham, Ira Weiny,
Shuai Xue, Liu Xinpeng, Darren Hart, Dan Williams
[+cc ACPI APEI reviewers and more CXL folks]
On Mon, Mar 17, 2025 at 10:14:46AM +0000, Karolina Stolarek wrote:
> Make CXL devices use aer_print_error() when reporting AER errors.
> Add a helper function to populate aer_err_info struct before logging
> an error. Move struct aer_err_info definition to the aer.h header
> to make it visible to CXL.
Previously, pci_print_aer() was used by both CXL (via
cxl_handle_rdport_errors()) and by ACPI GHES via aer_recover_queue()
and aer_recover_work_func(), right?
And after this patch, they would use aer_print_error() like native
AER, native DPC, and the ACPI EDR DPC path?
If it changes the GHES path, I think we should mention that in the
subject and commit log as well.
I think this consolidation is a good thing, because I don't think we
should log errors differently just because we learned about them via a
different path.
But I think this also changes the text we put in dmesg, which is
potentially disruptive to users and scripts that consume it, so I
think we should include a comparison of the previous and new text in
the commit log.
Eventually would like an ack from the CXL and GHES folks before
merging, but I have a couple more questions below.
> Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
> ---
> The patch was tested on the top of Terry Bowman's series[1], with
> a setup as outlined in the cover letter, and rebased on the top
> of pci-next, with no functional changes.
>
> [1] -
> https://lore.kernel.org/linux-pci/20250211192444.2292833-1-terry.bowman@amd.com
>
> drivers/cxl/core/pci.c | 5 +++-
> drivers/pci/pci.h | 23 ----------------
> drivers/pci/pcie/aer.c | 60 ++++++++++++++++++------------------------
> include/linux/aer.h | 25 ++++++++++++++++--
> 4 files changed, 52 insertions(+), 61 deletions(-)
>
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 013b869b66cb..217f13c30bde 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -871,6 +871,7 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
> {
> struct pci_dev *pdev = to_pci_dev(cxlds->dev);
> struct aer_capability_regs aer_regs;
> + struct aer_err_info info;
> struct cxl_dport *dport;
> int severity;
>
> @@ -885,7 +886,9 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
> if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
> return;
>
> - pci_print_aer(pdev, severity, &aer_regs);
> + memset(&info, 0, sizeof(info));
> + populate_aer_err_info(&info, severity, &aer_regs);
Maybe the memset() should go inside populate_aer_err_info() to avoid
potential error in the callers? I'm guessing we don't envision a case
where "info" already contains data that needs to be preserved?
Both GHES and CXL start with struct aer_capability_regs from
include/linux/aer.h, so instead of also exposing struct aer_err_info
to the world, maybe we should have a logging interface like
aer_recover_queue() that takes a struct aer_capability_regs pointer
that CXL could use? Or maybe it could use aer_recover_queue()
directly?
> + aer_print_error(pdev, &info);
>
> if (severity == AER_CORRECTABLE)
> cxl_handle_rdport_cor_ras(cxlds, dport);
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 9e0378fa30ac..b799c2ff7b85 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -561,30 +561,7 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev)
> #ifdef CONFIG_PCIEAER
> #include <linux/aer.h>
>
> -#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
> -
> -struct aer_err_info {
> - struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
> - int error_dev_num;
> -
> - unsigned int id:16;
> -
> - unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
> - unsigned int __pad1:5;
> - unsigned int multi_error_valid:1;
> -
> - unsigned int first_error:5;
> - unsigned int __pad2:2;
> - unsigned int tlp_header_valid:1;
> -
> - unsigned int status; /* COR/UNCOR Error Status */
> - unsigned int mask; /* COR/UNCOR Error Mask */
> - struct pcie_tlp_log tlp; /* TLP Header */
> -};
> -
> int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
> -void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
> -
> int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
> unsigned int tlp_len, bool flit,
> struct pcie_tlp_log *log);
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index a1cf8c7ef628..411450ff981e 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -760,47 +760,33 @@ int cper_severity_to_aer(int cper_severity)
> EXPORT_SYMBOL_GPL(cper_severity_to_aer);
> #endif
>
> -void pci_print_aer(struct pci_dev *dev, int aer_severity,
> - struct aer_capability_regs *aer)
> +void populate_aer_err_info(struct aer_err_info *info, int aer_severity,
> + struct aer_capability_regs *regs)
> {
> - int layer, agent, tlp_header_valid = 0;
> - u32 status, mask;
> - struct aer_err_info info;
> + int tlp_header_valid;
> +
> + info->severity = aer_severity;
> + info->first_error = PCI_ERR_CAP_FEP(regs->cap_control);
>
> if (aer_severity == AER_CORRECTABLE) {
> - status = aer->cor_status;
> - mask = aer->cor_mask;
> + info->id = regs->cor_err_source;
> + info->status = regs->cor_status;
> + info->mask = regs->cor_mask;
> } else {
> - status = aer->uncor_status;
> - mask = aer->uncor_mask;
> - tlp_header_valid = status & AER_LOG_TLP_MASKS;
> + info->id = regs->uncor_err_source;
> + info->status = regs->uncor_status;
> + info->mask = regs->uncor_mask;
> + tlp_header_valid = info->status & AER_LOG_TLP_MASKS;
> +
> + if (tlp_header_valid) {
> + info->tlp_header_valid = tlp_header_valid;
> + info->tlp = regs->header_log;
> + }
> }
> +}
> +EXPORT_SYMBOL_NS_GPL(populate_aer_err_info, "CXL");
>
> - layer = AER_GET_LAYER_ERROR(aer_severity, status);
> - agent = AER_GET_AGENT(aer_severity, status);
> -
> - memset(&info, 0, sizeof(info));
> - info.severity = aer_severity;
> - info.status = status;
> - info.mask = mask;
> - info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
> -
> - pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> - __aer_print_error(dev, &info);
> - pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
> - aer_error_layer[layer], aer_agent_string[agent]);
> -
> - if (aer_severity != AER_CORRECTABLE)
> - pci_err(dev, "aer_uncor_severity: 0x%08x\n",
> - aer->uncor_severity);
> -
> - if (tlp_header_valid)
> - pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" "));
>
> - trace_aer_event(dev_name(&dev->dev), (status & ~mask),
> - aer_severity, tlp_header_valid, &aer->header_log);
> -}
> -EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
>
> /**
> * add_error_device - list device to be handled
> @@ -1136,6 +1122,7 @@ static void aer_recover_work_func(struct work_struct *work)
> {
> struct aer_recover_entry entry;
> struct pci_dev *pdev;
> + struct aer_err_info info;
>
> while (kfifo_get(&aer_recover_ring, &entry)) {
> pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
> @@ -1146,7 +1133,10 @@ static void aer_recover_work_func(struct work_struct *work)
> PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
> continue;
> }
> - pci_print_aer(pdev, entry.severity, entry.regs);
> +
> + memset(&info, 0, sizeof(info));
> + populate_aer_err_info(&info, entry.severity, entry.regs);
> + aer_print_error(pdev, &info);
>
> /*
> * Memory for aer_capability_regs(entry.regs) is being
> diff --git a/include/linux/aer.h b/include/linux/aer.h
> index 02940be66324..ab408ec18e85 100644
> --- a/include/linux/aer.h
> +++ b/include/linux/aer.h
> @@ -53,6 +53,26 @@ struct aer_capability_regs {
> u16 uncor_err_source;
> };
>
> +#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
> +struct aer_err_info {
> + struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
> + int error_dev_num;
> +
> + unsigned int id:16;
> +
> + unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
> + unsigned int __pad1:5;
> + unsigned int multi_error_valid:1;
> +
> + unsigned int first_error:5;
> + unsigned int __pad2:2;
> + unsigned int tlp_header_valid:1;
> +
> + unsigned int status; /* COR/UNCOR Error Status */
> + unsigned int mask; /* COR/UNCOR Error Mask */
> + struct pcie_tlp_log tlp; /* TLP Header */
> +};
> +
> #if defined(CONFIG_PCIEAER)
> int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
> int pcie_aer_is_native(struct pci_dev *dev);
> @@ -64,8 +84,9 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
> static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
> #endif
>
> -void pci_print_aer(struct pci_dev *dev, int aer_severity,
> - struct aer_capability_regs *aer);
> +void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
> +void populate_aer_err_info(struct aer_err_info *info, int aer_severity,
> + struct aer_capability_regs *regs);
> int cper_severity_to_aer(int cper_severity);
> void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
> int severity, struct aer_capability_regs *aer_regs);
> --
> 2.43.5
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] PCI/AER: Consolidate CXL and native AER reporting paths
2025-03-19 22:21 ` Bjorn Helgaas
@ 2025-03-20 15:14 ` Karolina Stolarek
2025-03-20 18:17 ` Bjorn Helgaas
0 siblings, 1 reply; 7+ messages in thread
From: Karolina Stolarek @ 2025-03-20 15:14 UTC (permalink / raw)
To: Bjorn Helgaas
Cc: linux-pci, Bjorn Helgaas, Jon Pan-Doh, Terry Bowman, Len Brown,
James Morse, Tony Luck, Borislav Petkov, Ben Cheatham, Ira Weiny,
Shuai Xue, Liu Xinpeng, Darren Hart, Dan Williams
On 19/03/2025 23:21, Bjorn Helgaas wrote:
> [+cc ACPI APEI reviewers and more CXL folks]
Thanks, it's good to get more eyeballs on this patch :)
> On Mon, Mar 17, 2025 at 10:14:46AM +0000, Karolina Stolarek wrote:
>> Make CXL devices use aer_print_error() when reporting AER errors.
>> Add a helper function to populate aer_err_info struct before logging
>> an error. Move struct aer_err_info definition to the aer.h header
>> to make it visible to CXL.
>
> Previously, pci_print_aer() was used by both CXL (via
> cxl_handle_rdport_errors()) and by ACPI GHES via aer_recover_queue()
> and aer_recover_work_func(), right?
>
> And after this patch, they would use aer_print_error() like native
> AER, native DPC, and the ACPI EDR DPC path?
That is correct.
> If it changes the GHES path, I think we should mention that in the
> subject and commit log as well.
OK, I'll make it clearer in v2.
> I think this consolidation is a good thing, because I don't think we
> should log errors differently just because we learned about them via a
> different path.
>
> But I think this also changes the text we put in dmesg, which is
> potentially disruptive to users and scripts that consume it, so I
> think we should include a comparison of the previous and new text in
> the commit log.
Like I said in a comment to the patch, I tested CXL error reporting in
QEMU with and without my patch, and the output is the same:
[ 152.090666] pcieport 0000:0c:00.0: aer_inject: Injecting errors
00004000/00000000 into device 0000:0c:00.0
[ 152.095743] pcieport 0000:0c:00.0: AER: Correctable error message
received from 0000:0c:00.0
[ 152.098676] pcieport 0000:0c:00.0: CXL Bus Error:
severity=Correctable, type=Transaction Layer, (Receiver ID)
[ 152.101576] pcieport 0000:0c:00.0: device [8086:7075] error
status/mask=00004000/0000a000
[ 152.104012] pcieport 0000:0c:00.0: [14] CorrIntErr
Please mind that this is coming from a kernel with Terry's patches
applied, as I wasn't sure if and how I can inject CXL errors without them.
> Eventually would like an ack from the CXL and GHES folks before
> merging, but I have a couple more questions below.
Absolutely, many thanks for taking a look at the patch.
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index 013b869b66cb..217f13c30bde 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -885,7 +886,9 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
>> if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
>> return;
>>
>> - pci_print_aer(pdev, severity, &aer_regs);
>> + memset(&info, 0, sizeof(info));
>> + populate_aer_err_info(&info, severity, &aer_regs);
>
> Maybe the memset() should go inside populate_aer_err_info() to avoid
> potential error in the callers? I'm guessing we don't envision a case
> where "info" already contains data that needs to be preserved?
That's a good call, I can move it there. We always want to start with a
blank slate when reporting errors.
> Both GHES and CXL start with struct aer_capability_regs from
> include/linux/aer.h, so instead of also exposing struct aer_err_info
> to the world, maybe we should have a logging interface like
> aer_recover_queue() that takes a struct aer_capability_regs pointer
> that CXL could use? Or maybe it could use aer_recover_queue()
> directly?
Hmm, sounds good, I can try implementing it this way.
All the best,
Karolina
>
>> + aer_print_error(pdev, &info);
>>
>> if (severity == AER_CORRECTABLE)
>> cxl_handle_rdport_cor_ras(cxlds, dport);
>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>> index 9e0378fa30ac..b799c2ff7b85 100644
>> --- a/drivers/pci/pci.h
>> +++ b/drivers/pci/pci.h
>> @@ -561,30 +561,7 @@ static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev)
>> #ifdef CONFIG_PCIEAER
>> #include <linux/aer.h>
>>
>> -#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
>> -
>> -struct aer_err_info {
>> - struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
>> - int error_dev_num;
>> -
>> - unsigned int id:16;
>> -
>> - unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
>> - unsigned int __pad1:5;
>> - unsigned int multi_error_valid:1;
>> -
>> - unsigned int first_error:5;
>> - unsigned int __pad2:2;
>> - unsigned int tlp_header_valid:1;
>> -
>> - unsigned int status; /* COR/UNCOR Error Status */
>> - unsigned int mask; /* COR/UNCOR Error Mask */
>> - struct pcie_tlp_log tlp; /* TLP Header */
>> -};
>> -
>> int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
>> -void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
>> -
>> int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
>> unsigned int tlp_len, bool flit,
>> struct pcie_tlp_log *log);
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index a1cf8c7ef628..411450ff981e 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -760,47 +760,33 @@ int cper_severity_to_aer(int cper_severity)
>> EXPORT_SYMBOL_GPL(cper_severity_to_aer);
>> #endif
>>
>> -void pci_print_aer(struct pci_dev *dev, int aer_severity,
>> - struct aer_capability_regs *aer)
>> +void populate_aer_err_info(struct aer_err_info *info, int aer_severity,
>> + struct aer_capability_regs *regs)
>> {
>> - int layer, agent, tlp_header_valid = 0;
>> - u32 status, mask;
>> - struct aer_err_info info;
>> + int tlp_header_valid;
>> +
>> + info->severity = aer_severity;
>> + info->first_error = PCI_ERR_CAP_FEP(regs->cap_control);
>>
>> if (aer_severity == AER_CORRECTABLE) {
>> - status = aer->cor_status;
>> - mask = aer->cor_mask;
>> + info->id = regs->cor_err_source;
>> + info->status = regs->cor_status;
>> + info->mask = regs->cor_mask;
>> } else {
>> - status = aer->uncor_status;
>> - mask = aer->uncor_mask;
>> - tlp_header_valid = status & AER_LOG_TLP_MASKS;
>> + info->id = regs->uncor_err_source;
>> + info->status = regs->uncor_status;
>> + info->mask = regs->uncor_mask;
>> + tlp_header_valid = info->status & AER_LOG_TLP_MASKS;
>> +
>> + if (tlp_header_valid) {
>> + info->tlp_header_valid = tlp_header_valid;
>> + info->tlp = regs->header_log;
>> + }
>> }
>> +}
>> +EXPORT_SYMBOL_NS_GPL(populate_aer_err_info, "CXL");
>>
>> - layer = AER_GET_LAYER_ERROR(aer_severity, status);
>> - agent = AER_GET_AGENT(aer_severity, status);
>> -
>> - memset(&info, 0, sizeof(info));
>> - info.severity = aer_severity;
>> - info.status = status;
>> - info.mask = mask;
>> - info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
>> -
>> - pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
>> - __aer_print_error(dev, &info);
>> - pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
>> - aer_error_layer[layer], aer_agent_string[agent]);
>> -
>> - if (aer_severity != AER_CORRECTABLE)
>> - pci_err(dev, "aer_uncor_severity: 0x%08x\n",
>> - aer->uncor_severity);
>> -
>> - if (tlp_header_valid)
>> - pcie_print_tlp_log(dev, &aer->header_log, dev_fmt(" "));
>>
>> - trace_aer_event(dev_name(&dev->dev), (status & ~mask),
>> - aer_severity, tlp_header_valid, &aer->header_log);
>> -}
>> -EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL");
>>
>> /**
>> * add_error_device - list device to be handled
>> @@ -1136,6 +1122,7 @@ static void aer_recover_work_func(struct work_struct *work)
>> {
>> struct aer_recover_entry entry;
>> struct pci_dev *pdev;
>> + struct aer_err_info info;
>>
>> while (kfifo_get(&aer_recover_ring, &entry)) {
>> pdev = pci_get_domain_bus_and_slot(entry.domain, entry.bus,
>> @@ -1146,7 +1133,10 @@ static void aer_recover_work_func(struct work_struct *work)
>> PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn));
>> continue;
>> }
>> - pci_print_aer(pdev, entry.severity, entry.regs);
>> +
>> + memset(&info, 0, sizeof(info));
>> + populate_aer_err_info(&info, entry.severity, entry.regs);
>> + aer_print_error(pdev, &info);
>>
>> /*
>> * Memory for aer_capability_regs(entry.regs) is being
>> diff --git a/include/linux/aer.h b/include/linux/aer.h
>> index 02940be66324..ab408ec18e85 100644
>> --- a/include/linux/aer.h
>> +++ b/include/linux/aer.h
>> @@ -53,6 +53,26 @@ struct aer_capability_regs {
>> u16 uncor_err_source;
>> };
>>
>> +#define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */
>> +struct aer_err_info {
>> + struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
>> + int error_dev_num;
>> +
>> + unsigned int id:16;
>> +
>> + unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */
>> + unsigned int __pad1:5;
>> + unsigned int multi_error_valid:1;
>> +
>> + unsigned int first_error:5;
>> + unsigned int __pad2:2;
>> + unsigned int tlp_header_valid:1;
>> +
>> + unsigned int status; /* COR/UNCOR Error Status */
>> + unsigned int mask; /* COR/UNCOR Error Mask */
>> + struct pcie_tlp_log tlp; /* TLP Header */
>> +};
>> +
>> #if defined(CONFIG_PCIEAER)
>> int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
>> int pcie_aer_is_native(struct pci_dev *dev);
>> @@ -64,8 +84,9 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>> static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
>> #endif
>>
>> -void pci_print_aer(struct pci_dev *dev, int aer_severity,
>> - struct aer_capability_regs *aer);
>> +void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
>> +void populate_aer_err_info(struct aer_err_info *info, int aer_severity,
>> + struct aer_capability_regs *regs);
>> int cper_severity_to_aer(int cper_severity);
>> void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
>> int severity, struct aer_capability_regs *aer_regs);
>> --
>> 2.43.5
>>
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] PCI/AER: Consolidate CXL and native AER reporting paths
2025-03-20 15:14 ` Karolina Stolarek
@ 2025-03-20 18:17 ` Bjorn Helgaas
2025-03-21 13:56 ` Karolina Stolarek
0 siblings, 1 reply; 7+ messages in thread
From: Bjorn Helgaas @ 2025-03-20 18:17 UTC (permalink / raw)
To: Karolina Stolarek
Cc: linux-pci, Bjorn Helgaas, Jon Pan-Doh, Terry Bowman, Len Brown,
James Morse, Tony Luck, Borislav Petkov, Ben Cheatham, Ira Weiny,
Shuai Xue, Liu Xinpeng, Darren Hart, Dan Williams
On Thu, Mar 20, 2025 at 04:14:04PM +0100, Karolina Stolarek wrote:
> On 19/03/2025 23:21, Bjorn Helgaas wrote:
> > On Mon, Mar 17, 2025 at 10:14:46AM +0000, Karolina Stolarek wrote:
> > > Make CXL devices use aer_print_error() when reporting AER errors.
> > > Add a helper function to populate aer_err_info struct before logging
> > > an error. Move struct aer_err_info definition to the aer.h header
> > > to make it visible to CXL.
> >
> > Previously, pci_print_aer() was used by both CXL (via
> > cxl_handle_rdport_errors()) and by ACPI GHES via aer_recover_queue()
> > and aer_recover_work_func(), right?
> >
> > And after this patch, they would use aer_print_error() like native
> > AER, native DPC, and the ACPI EDR DPC path?
>
> That is correct.
> > I think this consolidation is a good thing, because I don't think we
> > should log errors differently just because we learned about them via a
> > different path.
> >
> > But I think this also changes the text we put in dmesg, which is
> > potentially disruptive to users and scripts that consume it, so I
> > think we should include a comparison of the previous and new text in
> > the commit log.
>
> Like I said in a comment to the patch, I tested CXL error reporting in QEMU
> with and without my patch, and the output is the same:
>
> pcieport 0000:0c:00.0: aer_inject: Injecting errors 00004000/00000000 into device 0000:0c:00.0
> pcieport 0000:0c:00.0: AER: Correctable error message received from 0000:0c:00.0
> pcieport 0000:0c:00.0: CXL Bus Error: severity=Correctable, type=Transaction Layer, (Receiver ID)
> pcieport 0000:0c:00.0: device [8086:7075] error status/mask=00004000/0000a000
> pcieport 0000:0c:00.0: [14] CorrIntErr
Maybe there's CXL magic that I missed. It looks like Terry's series
changes some of this path. And GHES also currently uses
pci_print_aer(). Some sample logs at [1,2].
Looking at v6.14-rc1, only aer_print_error() logs the "error status"
string, and only pci_print_aer() logs "aer_status", "aer_layer", etc.
The previous path is:
pci_print_aer
pci_err("aer_status: 0x%08x, aer_mask: 0x%08x\n") <--
__aer_print_error
pci_err("aer_layer=%s, aer_agent=%s\n") <--
pcie_print_tlp_log
New path is:
aer_print_error
pci_printk("PCIe Bus Error: severity=%s, type=%s, (%s)\n")
pci_printk(" device [%04x:%04x] error status/mask=%08x/%08x\n)
__aer_print_error
pcie_print_tlp_log
So I expected that the lines I marked in pci_print_aer() would be
different.
Bjorn
[1] https://lore.kernel.org/lkml/2149597.8uJZFlvqrj@xrated/T/
[2] https://lore.kernel.org/all/e8a58616-aeae-ad78-d496-6dfcef4ddcaa@arm.com/T/
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] PCI/AER: Consolidate CXL and native AER reporting paths
2025-03-20 18:17 ` Bjorn Helgaas
@ 2025-03-21 13:56 ` Karolina Stolarek
2025-03-21 15:06 ` Bjorn Helgaas
0 siblings, 1 reply; 7+ messages in thread
From: Karolina Stolarek @ 2025-03-21 13:56 UTC (permalink / raw)
To: Bjorn Helgaas
Cc: linux-pci, Bjorn Helgaas, Jon Pan-Doh, Terry Bowman, Len Brown,
James Morse, Tony Luck, Borislav Petkov, Ben Cheatham, Ira Weiny,
Shuai Xue, Liu Xinpeng, Darren Hart, Dan Williams
On 20/03/2025 19:17, Bjorn Helgaas wrote:
>
> Maybe there's CXL magic that I missed. It looks like Terry's series
> changes some of this path. And GHES also currently uses
> pci_print_aer(). Some sample logs at [1,2].
Maybe that's it, thanks a lot for the pointers.
> Looking at v6.14-rc1, only aer_print_error() logs the "error status"
> string, and only pci_print_aer() logs "aer_status", "aer_layer", etc.
>
> The previous path is:
>
> pci_print_aer
> pci_err("aer_status: 0x%08x, aer_mask: 0x%08x\n") <--
> __aer_print_error
> pci_err("aer_layer=%s, aer_agent=%s\n") <--
> pcie_print_tlp_log
>
> New path is:
>
> aer_print_error
> pci_printk("PCIe Bus Error: severity=%s, type=%s, (%s)\n")
> pci_printk(" device [%04x:%04x] error status/mask=%08x/%08x\n)
> __aer_print_error
> pcie_print_tlp_log
>
> So I expected that the lines I marked in pci_print_aer() would be
> different.
Hmm, that seems to be the case. But still, the question is if going with
the new format that matches what's in AER a bad or disruptive thing. I'd
like to try going in the direction of using one way of reporting AER
errors, if possible.
I will send v2 on Monday (with the memset move) and we can keep
discussing other changes in the patch.
All the best,
Karolina
>
> Bjorn
>
> [1] https://lore.kernel.org/lkml/2149597.8uJZFlvqrj@xrated/T/
> [2] https://lore.kernel.org/all/e8a58616-aeae-ad78-d496-6dfcef4ddcaa@arm.com/T/
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] PCI/AER: Consolidate CXL and native AER reporting paths
2025-03-21 13:56 ` Karolina Stolarek
@ 2025-03-21 15:06 ` Bjorn Helgaas
2025-03-24 19:31 ` Karolina Stolarek
0 siblings, 1 reply; 7+ messages in thread
From: Bjorn Helgaas @ 2025-03-21 15:06 UTC (permalink / raw)
To: Karolina Stolarek
Cc: linux-pci, Bjorn Helgaas, Jon Pan-Doh, Terry Bowman, Len Brown,
James Morse, Tony Luck, Borislav Petkov, Ben Cheatham, Ira Weiny,
Shuai Xue, Liu Xinpeng, Darren Hart, Dan Williams
On Fri, Mar 21, 2025 at 02:56:16PM +0100, Karolina Stolarek wrote:
> On 20/03/2025 19:17, Bjorn Helgaas wrote:
> > Maybe there's CXL magic that I missed. It looks like Terry's series
> > changes some of this path. And GHES also currently uses
> > pci_print_aer().
> ... But still, the question is if going with the
> new format that matches what's in AER a bad or disruptive thing. I'd like to
> try going in the direction of using one way of reporting AER errors, if
> possible.
Absolutely, I agree 100%. The choice between native AER and GHES is
made by the platform, not by the OS, so I think it's crazy that we log
them differently. We just need to include information about any log
format changes we make to help users adapt to them.
Bjorn
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] PCI/AER: Consolidate CXL and native AER reporting paths
2025-03-21 15:06 ` Bjorn Helgaas
@ 2025-03-24 19:31 ` Karolina Stolarek
0 siblings, 0 replies; 7+ messages in thread
From: Karolina Stolarek @ 2025-03-24 19:31 UTC (permalink / raw)
To: Bjorn Helgaas, Jonathan Cameron
Cc: linux-pci, Bjorn Helgaas, Jon Pan-Doh, Terry Bowman, Len Brown,
James Morse, Tony Luck, Borislav Petkov, Ben Cheatham, Ira Weiny,
Shuai Xue, Liu Xinpeng, Darren Hart, Dan Williams
[+Jonathan to ask about CXL error injection in qemu]
On 21/03/2025 16:06, Bjorn Helgaas wrote:
> On Fri, Mar 21, 2025 at 02:56:16PM +0100, Karolina Stolarek wrote:
>>
>> ... But still, the question is if going with the
>> new format that matches what's in AER a bad or disruptive thing. I'd like to
>> try going in the direction of using one way of reporting AER errors, if
>> possible.
>
> Absolutely, I agree 100%. The choice between native AER and GHES is
> made by the platform, not by the OS, so I think it's crazy that we log
> them differently. We just need to include information about any log
> format changes we make to help users adapt to them.
I agree, we should highlight the difference. I'm trying to get
before-after logs for CXL but I can't get this to work on pci-next
kernel (i.e., without Terry's patchset).
Jonathan, I'm sorry for dragging you into the thread, but I thought you
could provide suggestions on what might be wrong with my setup.
I tried to test this patch by injecting a CXL Correctable Error via
QMP[0], following these instructions[1].
qmp_cxl_inject_correctable_error() finishes with no issues (it calls
pcie_aer_inject_error()) but I see no AER log or whatsoever. This is the
command that I used[2], with QEMU v7.1.0-rc2-21892-gd9a4282c4b and the
pci-next kernel.
Is there something else I need to enable or provide some parameters to
get this working?
All the best,
Karolina
------------------------------------------------------------------------
[0]:
{ "execute": "qmp_capabilities" }
...
{ "execute": "cxl-inject-correctable-error",
"arguments": {
"path": "/machine/peripheral/cxl-vmem0",
"type": "physical"
}
}
[1] -
https://gitlab.com/jic23/qemu/-/commit/b3488ff7ee6ebfe247c9af751f44f2990babd4a7
[2]:
$ qemu-system-x86_64 -enable-kvm -cpu host -smp 4 -m 16G -rtc base=utc \
-M q35,cxl=on -nographic -serial mon:stdio \
-drive
file=/.../ovmf_code.pure-efi.fd,index=0,if=pflash,format=raw,readonly=on \
-drive file=/.../ovmf-vars.base_image.fd,index=1,if=pflash,format=raw \
-object
memory-backend-file,id=cxl-mem0,share=on,mem-path=/tmp/cxltest.raw,size=256M
\
-object
memory-backend-file,id=cxl-lsa0,share=on,mem-path=/tmp/lsa0.raw,size=256M \
-device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
-device cxl-rp,port=0,bus=cxl.1,id=root_port0,chassis=0,slot=0 \
-device cxl-upstream,bus=root_port0,id=us0 \
-device cxl-downstream,port=0,bus=us0,id=swport0,chassis=0,slot=4 \
-device
cxl-type3,bus=swport0,volatile-memdev=cxl-mem0,lsa=cxl-lsa0,id=cxl-vmem0 \
-qmp tcp:localhost:4444,server,wait=off \
-M
cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=4k
\
-hda disk1.qcow2
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2025-03-24 19:31 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-17 10:14 [PATCH] PCI/AER: Consolidate CXL and native AER reporting paths Karolina Stolarek
2025-03-19 22:21 ` Bjorn Helgaas
2025-03-20 15:14 ` Karolina Stolarek
2025-03-20 18:17 ` Bjorn Helgaas
2025-03-21 13:56 ` Karolina Stolarek
2025-03-21 15:06 ` Bjorn Helgaas
2025-03-24 19:31 ` Karolina Stolarek
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox