From: "Bowman, Terry" <terry.bowman@amd.com>
To: "Cheatham, Benjamin" <benjamin.cheatham@amd.com>
Cc: linux-kernel@vger.kernel.org, linux-pci@vger.kernel.org,
dave@stgolabs.net, jonathan.cameron@huawei.com,
dave.jiang@intel.com, alison.schofield@intel.com,
dan.j.williams@intel.com, bhelgaas@google.com,
shiju.jose@huawei.com, ming.li@zohomail.com,
Smita.KoralahalliChannabasappa@amd.com, rrichter@amd.com,
dan.carpenter@linaro.org, PradeepVineshReddy.Kodamati@amd.com,
lukas@wunner.de, sathyanarayanan.kuppuswamy@linux.intel.com,
linux-cxl@vger.kernel.org, alucerop@amd.com, ira.weiny@intel.com
Subject: Re: [PATCH v12 09/25] PCI/AER: Report CXL or PCIe bus error type in trace logging
Date: Mon, 6 Oct 2025 14:59:14 -0500 [thread overview]
Message-ID: <16e5e21e-e4dc-4e8e-85a9-e2b236f1251c@amd.com> (raw)
In-Reply-To: <fd73dd2f-4988-423f-bceb-cd1a831a2a78@amd.com>
On 10/3/2025 3:11 PM, Cheatham, Benjamin wrote:
> [snip]
>
>> +/**
>> + * struct aer_err_info - AER Error Information
>> + * @dev: Devices reporting error
>> + * @ratelimit_print: Flag to log or not log the devices' error. 0=NotLog/1=Log
>> + * @error_devnum: Number of devices reporting an error
>> + * @level: printk level to use in logging
>> + * @id: Value from register PCI_ERR_ROOT_ERR_SRC
>> + * @severity: AER severity, 0-UNCOR Non-fatal, 1-UNCOR fatal, 2-COR
>> + * @root_ratelimit_print: Flag to log or not log the root's error. 0=NotLog/1=Log
>> + * @multi_error_valid: If multiple errors are reported
>> + * @first_error: First reported error
>> + * @is_cxl: Bus type error: 0-PCI Bus error, 1-CXL Bus error
>> + * @tlp_header_valid: Indicates if TLP field contains error information
>> + * @status: COR/UNCOR error status
>> + * @mask: COR/UNCOR mask
>> + * @tlp: Transaction packet information
>> + */
>> struct aer_err_info {
>> struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
>> int ratelimit_print[AER_MAX_MULTI_ERR_DEVICES];
>> @@ -621,7 +638,8 @@ struct aer_err_info {
>> unsigned int multi_error_valid:1;
>>
>> unsigned int first_error:5;
>> - unsigned int __pad2:2;
>> + unsigned int __pad2:1;
>> + bool is_cxl:1; /* CXL or PCI bus error? */
>> unsigned int tlp_header_valid:1;
>>
>> unsigned int status; /* COR/UNCOR Error Status */
> I'd get rid of the comments after the members since it's the exact same thing as the kernel
> doc above the struct.
Good idea.
>> @@ -632,6 +650,11 @@ struct aer_err_info {
>> int aer_get_device_error_info(struct aer_err_info *info, int i);
>> void aer_print_error(struct aer_err_info *info, int i);
>>
>> +static inline const char *aer_err_bus(struct aer_err_info *info)
>> +{
>> + return info->is_cxl ? "CXL" : "PCIe";
>> +}
>> +
>> int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
>> unsigned int tlp_len, bool flit,
>> struct pcie_tlp_log *log);
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index 6e5c9efe2920..befa73ace9bb 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -837,6 +837,7 @@ void aer_print_error(struct aer_err_info *info, int i)
>> struct pci_dev *dev;
>> int layer, agent, id;
>> const char *level = info->level;
>> + const char *bus_type = aer_err_bus(info);
>>
>> if (WARN_ON_ONCE(i >= AER_MAX_MULTI_ERR_DEVICES))
>> return;
>> @@ -845,23 +846,23 @@ void aer_print_error(struct aer_err_info *info, int i)
>> id = pci_dev_id(dev);
>>
>> pci_dev_aer_stats_incr(dev, info);
>> - trace_aer_event(pci_name(dev), (info->status & ~info->mask),
>> + trace_aer_event(pci_name(dev), bus_type, (info->status & ~info->mask),
>> info->severity, info->tlp_header_valid, &info->tlp);
>>
>> if (!info->ratelimit_print[i])
>> return;
>>
>> if (!info->status) {
>> - pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
>> - aer_error_severity_string[info->severity]);
>> + pci_err(dev, "%s Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
>> + bus_type, aer_error_severity_string[info->severity]);
>> goto out;
>> }
>>
>> layer = AER_GET_LAYER_ERROR(info->severity, info->status);
>> agent = AER_GET_AGENT(info->severity, info->status);
>>
>> - aer_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
>> - aer_error_severity_string[info->severity],
>> + aer_printk(level, dev, "%s Bus Error: severity=%s, type=%s, (%s)\n",
>> + bus_type, aer_error_severity_string[info->severity],
>> aer_error_layer[layer], aer_agent_string[agent]);
>>
>> aer_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n",
>> @@ -895,6 +896,7 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer);
>> void pci_print_aer(struct pci_dev *dev, int aer_severity,
>> struct aer_capability_regs *aer)
>> {
>> + const char *bus_type;
>> int layer, agent, tlp_header_valid = 0;
>> u32 status, mask;
>> struct aer_err_info info = {
>> @@ -915,9 +917,12 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
>>
>> info.status = status;
>> info.mask = mask;
>> + info.is_cxl = pcie_is_cxl(dev);
>> +
>> + bus_type = aer_err_bus(&info);
>>
>> pci_dev_aer_stats_incr(dev, &info);
>> - trace_aer_event(pci_name(dev), (status & ~mask),
>> + trace_aer_event(pci_name(dev), bus_type, (status & ~mask),
>> aer_severity, tlp_header_valid, &aer->header_log);
>>
>> if (!aer_ratelimit(dev, info.severity))
>> @@ -1278,6 +1283,7 @@ int aer_get_device_error_info(struct aer_err_info *info, int i)
>> /* Must reset in this function */
>> info->status = 0;
>> info->tlp_header_valid = 0;
>> + info->is_cxl = pcie_is_cxl(dev);
>>
> So am I right in assuming every AER error that occurs while the link is trained
> as a CXL link will be reported as a CXL error? Sorry if this is a stupid question,
> but is it possible for a PCIe error to occur or does CXL.io just replace the PCIe
> protocol once the link is trained as CXL?
Correct. Any PCI bus protocol errors reported while the link is trained as CXL will be
reported as CXL errors.
In your example, a "PCIe error" will be detected as a CXL.io error, and the AER driver
will log the extended AER register status. The device's CXL RAS will also be logged
if it is a CXL bus error.
> If so, do we not care if the error is a PCIe-level error and just report it as
> a CXL error anyway?
We can't access CXL RAS if it's not a CXL error and not a device.
> Sorry if you've already hashed all of this out, but I figured I'd ask just to make sure.
Terry
next prev parent reply other threads:[~2025-10-06 19:59 UTC|newest]
Thread overview: 92+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-09-25 22:34 [PATCH v12 00/25] Enable CXL PCIe Port Protocol Error handling and logging Terry Bowman
2025-09-25 22:34 ` [PATCH v12 01/25] cxl/pci: Remove unnecessary CXL Endpoint handling helper functions Terry Bowman
2025-09-25 22:34 ` [PATCH v12 02/25] cxl/pci: Remove unnecessary CXL RCH " Terry Bowman
2025-10-01 15:09 ` Jonathan Cameron
2025-09-25 22:34 ` [PATCH v12 03/25] cxl: Remove ifdef blocks of CONFIG_PCIEAER_CXL from core/pci.c Terry Bowman
2025-10-03 20:11 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 04/25] CXL/AER: Remove CONFIG_PCIEAER_CXL and replace with CONFIG_CXL_RAS Terry Bowman
2025-09-25 23:17 ` Dave Jiang
2025-10-01 15:11 ` Jonathan Cameron
2025-10-03 20:11 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 05/25] cxl: Move CXL driver RCH error handling into CONFIG_CXL_RCH_RAS conditional block Terry Bowman
2025-09-25 23:31 ` Dave Jiang
2025-10-01 15:23 ` Jonathan Cameron
2025-10-03 20:11 ` Cheatham, Benjamin
2025-10-06 18:52 ` Bowman, Terry
2025-09-25 22:34 ` [PATCH v12 06/25] CXL/AER: Introduce aer_cxl_rch.c into AER driver for handling CXL RCH errors Terry Bowman
2025-09-25 23:36 ` Dave Jiang
2025-09-26 12:32 ` kernel test robot
2025-10-01 15:42 ` Jonathan Cameron
2025-10-03 20:11 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 07/25] CXL/PCI: Move CXL DVSEC definitions into uapi/linux/pci_regs.h Terry Bowman
2025-09-25 23:53 ` Dave Jiang
2025-10-01 15:58 ` Jonathan Cameron
2025-10-02 15:25 ` Bowman, Terry
2025-10-03 20:11 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 08/25] PCI/CXL: Introduce pcie_is_cxl() Terry Bowman
2025-10-03 20:11 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 09/25] PCI/AER: Report CXL or PCIe bus error type in trace logging Terry Bowman
2025-10-03 20:11 ` Cheatham, Benjamin
2025-10-06 19:59 ` Bowman, Terry [this message]
2025-09-25 22:34 ` [PATCH v12 10/25] CXL/AER: Update PCI class code check to use FIELD_GET() Terry Bowman
2025-09-26 0:02 ` Dave Jiang
2025-10-01 16:12 ` Jonathan Cameron
2025-10-02 7:40 ` Lukas Wunner
2025-10-30 17:16 ` Bowman, Terry
2025-10-31 5:30 ` Lukas Wunner
2025-09-25 22:34 ` [PATCH v12 11/25] cxl/pci: Update RAS handler interfaces to also support CXL Ports Terry Bowman
2025-10-03 20:11 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 12/25] cxl/pci: Log message if RAS registers are unmapped Terry Bowman
2025-10-03 20:11 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 13/25] cxl/pci: Unify CXL trace logging for CXL Endpoints and CXL Ports Terry Bowman
2025-09-26 20:44 ` Dave Jiang
2025-09-25 22:34 ` [PATCH v12 14/25] cxl/pci: Update cxl_handle_cor_ras() to return early if no RAS errors Terry Bowman
2025-10-03 20:11 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 15/25] cxl/pci: Map CXL Endpoint Port and CXL Switch Port RAS registers Terry Bowman
2025-09-26 21:10 ` Dave Jiang
2025-10-24 10:25 ` Alejandro Lucero Palau
2025-10-24 17:15 ` Dave Jiang
2025-10-24 19:40 ` Bowman, Terry
2025-10-27 16:33 ` Alejandro Lucero Palau
2025-09-25 22:34 ` [PATCH v12 16/25] CXL/PCI: Introduce PCI_ERS_RESULT_PANIC Terry Bowman
2025-09-26 21:26 ` Dave Jiang
2025-10-01 16:14 ` Jonathan Cameron
2025-10-03 20:11 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 17/25] cxl/pci: Introduce CXL Endpoint protocol error handlers Terry Bowman
2025-09-26 22:04 ` Dave Jiang
2025-09-30 14:06 ` Bowman, Terry
2025-09-30 16:09 ` Dave Jiang
2025-10-03 20:12 ` Cheatham, Benjamin
2025-10-06 21:07 ` Bowman, Terry
2025-09-25 22:34 ` [PATCH v12 18/25] CXL/AER: Introduce aer_cxl_vh.c in AER driver for forwarding CXL errors Terry Bowman
2025-09-26 22:56 ` Dave Jiang
2025-10-03 20:12 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 19/25] cxl: Introduce cxl_pci_drv_bound() to check for bound driver Terry Bowman
2025-09-26 23:02 ` Dave Jiang
2025-10-02 12:27 ` Jonathan Cameron
2025-10-03 20:12 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 20/25] PCI/AER: Dequeue forwarded CXL error Terry Bowman
2025-09-26 23:26 ` Dave Jiang
2025-10-03 20:12 ` Cheatham, Benjamin
2025-10-06 20:17 ` Dave Jiang
2025-09-25 22:34 ` [PATCH v12 21/25] CXL/PCI: Introduce CXL Port protocol error handlers Terry Bowman
2025-09-29 23:32 ` Dave Jiang
2025-10-03 20:12 ` Cheatham, Benjamin
2025-10-06 21:28 ` Bowman, Terry
2025-09-25 22:34 ` [PATCH v12 22/25] CXL/PCI: Export and rename merge_result() to pci_ers_merge_result() Terry Bowman
2025-09-26 15:01 ` kernel test robot
2025-09-26 18:10 ` kernel test robot
2025-09-25 22:34 ` [PATCH v12 23/25] CXL/PCI: Introduce CXL uncorrectable protocol error recovery Terry Bowman
2025-09-30 0:26 ` Dave Jiang
2025-09-30 14:38 ` Bowman, Terry
2025-09-30 16:13 ` Dave Jiang
2025-09-30 16:43 ` Bowman, Terry
2025-09-30 16:46 ` Dave Jiang
2025-10-01 13:58 ` Bowman, Terry
2025-10-01 15:33 ` Dave Jiang
2025-10-03 20:12 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 24/25] CXL/PCI: Enable CXL protocol errors during CXL Port probe Terry Bowman
2025-09-30 0:28 ` Dave Jiang
2025-10-03 20:12 ` Cheatham, Benjamin
2025-09-25 22:34 ` [PATCH v12 25/25] CXL/PCI: Disable CXL protocol error interrupts during CXL Port cleanup Terry Bowman
2025-10-03 20:12 ` Cheatham, Benjamin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=16e5e21e-e4dc-4e8e-85a9-e2b236f1251c@amd.com \
--to=terry.bowman@amd.com \
--cc=PradeepVineshReddy.Kodamati@amd.com \
--cc=Smita.KoralahalliChannabasappa@amd.com \
--cc=alison.schofield@intel.com \
--cc=alucerop@amd.com \
--cc=benjamin.cheatham@amd.com \
--cc=bhelgaas@google.com \
--cc=dan.carpenter@linaro.org \
--cc=dan.j.williams@intel.com \
--cc=dave.jiang@intel.com \
--cc=dave@stgolabs.net \
--cc=ira.weiny@intel.com \
--cc=jonathan.cameron@huawei.com \
--cc=linux-cxl@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-pci@vger.kernel.org \
--cc=lukas@wunner.de \
--cc=ming.li@zohomail.com \
--cc=rrichter@amd.com \
--cc=sathyanarayanan.kuppuswamy@linux.intel.com \
--cc=shiju.jose@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.