From: "Wang, Qingshun" <qingshun.wang@linux.intel.com>
To: linux-pci@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
linux-acpi@vger.kernel.org
Cc: Miaohe Lin <linmiaohe@huawei.com>,
Alison Schofield <alison.schofield@intel.com>,
"Rafael J. Wysocki" <rafael@kernel.org>,
erwin.tsaur@intel.com,
Kuppuswamy Sathyanarayanan
<sathyanarayanan.kuppuswamy@linux.intel.com>,
linux-cxl@vger.kernel.org, linux-kernel@vger.kernel.org,
Oliver O'Halloran <oohall@gmail.com>,
chao.p.peng@linux.intel.com, Ira Weiny <ira.weiny@intel.com>,
Davidlohr Bueso <dave@stgolabs.net>,
Dave Jiang <dave.jiang@intel.com>,
Vishal Verma <vishal.l.verma@intel.com>,
Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>,
Bjorn Helgaas <helgaas@kernel.org>, Len Brown <lenb@kernel.org>,
Robert Richter <rrichter@amd.com>, Borislav Petkov <bp@alien8.de>,
"Wang, Qingshun" <qingshun.wang@linux.intel.com>,
Jonathan Cameron <jonathan.cameron@huawei.com>,
Bjorn Helgaas <bhelgaas@google.com>,
Dan Williams <dan.j.williams@intel.com>,
linux-edac@vger.kernel.org, Tony Luck <tony.luck@intel.com>,
feiting.wanyan@intel.com, Adam Preble <adam.c.preble@intel.com>,
Mahesh J Salgaonkar <mahesh@linux.ibm.com>,
Li Yang <leoyang.li@nxp.com>, Lukas Wunner <lukas@wunner.de>,
James Morse <james.morse@arm.com>,
qingshun.wang@intel.com, Shiju Jose <shiju.jose@huawei.com>
Subject: [PATCH v2 4/4] RAS: Trace more information in aer_event
Date: Thu, 25 Jan 2024 14:28:02 +0800 [thread overview]
Message-ID: <20240125062802.50819-5-qingshun.wang@linux.intel.com> (raw)
In-Reply-To: <20240125062802.50819-1-qingshun.wang@linux.intel.com>
Add the following fields to aer_event to better understand Advisory
Non-Fatal and other errors for external observation:
- cor_status (Correctable Error Status)
- cor_mask (Correctable Error Mask)
- uncor_status (Uncorrectable Error Status)
- uncor_severity (Uncorrectable Error Severity)
- uncor_mask (Uncorrectable Error Mask)
- aer_cap_ctrl (AER Capabilities and Control)
- link_status (Link Status)
- device_status (Device Status)
- device_control_2 (Device Control 2)
In addition to the raw register values, the values of the following fields are
extracted and logged for better observability:
- "First Error Pointer" and "Completion Timeout Prefix/Header Log
Capable" from "AER Capabilities and Control"
- "Completion Timeout Value" and "Completion Timeout Disable"
from "Device Control 2"
Signed-off-by: "Wang, Qingshun" <qingshun.wang@linux.intel.com>
---
drivers/pci/pcie/aer.c | 17 +++++++++++--
include/ras/ras_event.h | 48 ++++++++++++++++++++++++++++++++---
include/uapi/linux/pci_regs.h | 1 +
3 files changed, 60 insertions(+), 6 deletions(-)
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index eec3406f727a..2f5639f6c40f 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -757,6 +757,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
int layer, agent;
int id = pci_dev_id(dev);
const char *level;
+ struct aer_capability_regs aer_caps;
if (info->severity == AER_CORRECTABLE) {
status = info->cor_status;
@@ -793,8 +794,18 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
if (info->id && info->error_dev_num > 1 && info->id == id)
pci_err(dev, " Error of this Agent is reported first\n");
+ aer_caps = (struct aer_capability_regs) {
+ .cor_status = info->cor_status,
+ .cor_mask = info->cor_mask,
+ .uncor_status = info->uncor_status,
+ .uncor_severity = info->uncor_severity,
+ .uncor_mask = info->uncor_mask,
+ .cap_control = info->aer_cap_ctrl
+ };
trace_aer_event(dev_name(&dev->dev), (status & ~mask),
- info->severity, info->tlp_header_valid, &info->tlp);
+ info->severity, info->tlp_header_valid, &info->tlp,
+ &aer_caps, info->link_status,
+ info->device_status, info->device_control_2);
}
static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
@@ -870,7 +881,9 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
__print_tlp_header(dev, &aer->header_log);
trace_aer_event(dev_name(&dev->dev), (status & ~mask),
- aer_severity, tlp_header_valid, &aer->header_log);
+ aer_severity, tlp_header_valid, &aer->header_log,
+ aer, info.link_status,
+ info.device_status, info.device_control_2);
}
EXPORT_SYMBOL_NS_GPL(pci_print_aer, CXL);
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index cbd3ddd7c33d..a94997073d90 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -300,9 +300,14 @@ TRACE_EVENT(aer_event,
const u32 status,
const u8 severity,
const u8 tlp_header_valid,
- struct aer_header_log_regs *tlp),
+ struct aer_header_log_regs *tlp,
+ struct aer_capability_regs *aer_caps,
+ const u16 link_status,
+ const u16 device_status,
+ const u16 device_control_2),
- TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp),
+ TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp,
+ aer_caps, link_status, device_status, device_control_2),
TP_STRUCT__entry(
__string( dev_name, dev_name )
@@ -310,6 +315,10 @@ TRACE_EVENT(aer_event,
__field( u8, severity )
__field( u8, tlp_header_valid)
__array( u32, tlp_header, 4 )
+ __field_struct(struct aer_capability_regs, aer_caps)
+ __field( u16, link_status )
+ __field( u16, device_status )
+ __field( u16, device_control_2)
),
TP_fast_assign(
@@ -317,6 +326,10 @@ TRACE_EVENT(aer_event,
__entry->status = status;
__entry->severity = severity;
__entry->tlp_header_valid = tlp_header_valid;
+ __entry->aer_caps = *aer_caps;
+ __entry->link_status = link_status;
+ __entry->device_status = device_status;
+ __entry->device_control_2 = device_control_2;
if (tlp_header_valid) {
__entry->tlp_header[0] = tlp->dw0;
__entry->tlp_header[1] = tlp->dw1;
@@ -325,7 +338,20 @@ TRACE_EVENT(aer_event,
}
),
- TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n",
+ TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s, "
+ "Correctable Error Status=0x%08x, "
+ "Correctable Error Mask=0x%08x, "
+ "Uncorrectable Error Status=0x%08x, "
+ "Uncorrectable Error Severity=0x%08x, "
+ "Uncorrectable Error Mask=0x%08x, "
+ "AER Capability and Control=0x%08x, "
+ "First Error Pointer=0x%x, "
+ "Completion Timeout Prefix/Header Log Capable=%s, "
+ "Link Status=0x%04x, "
+ "Device Status=0x%04x, "
+ "Device Control 2=0x%04x, "
+ "Completion Timeout Value=0x%x, "
+ "Completion Timeout Disable=%s\n",
__get_str(dev_name),
__entry->severity == AER_CORRECTABLE ? "Corrected" :
__entry->severity == AER_FATAL ?
@@ -335,7 +361,21 @@ TRACE_EVENT(aer_event,
__print_flags(__entry->status, "|", aer_uncorrectable_errors),
__entry->tlp_header_valid ?
__print_array(__entry->tlp_header, 4, 4) :
- "Not available")
+ "Not available",
+ __entry->aer_caps.cor_status,
+ __entry->aer_caps.cor_mask,
+ __entry->aer_caps.uncor_status,
+ __entry->aer_caps.uncor_severity,
+ __entry->aer_caps.uncor_mask,
+ __entry->aer_caps.cap_control,
+ PCI_ERR_CAP_FEP(__entry->aer_caps.cap_control),
+ __entry->aer_caps.cap_control & PCI_ERR_CAP_CTO_LOGC ? "True" : "False",
+ __entry->link_status,
+ __entry->device_status,
+ __entry->device_control_2,
+ __entry->device_control_2 & PCI_EXP_DEVCTL2_COMP_TIMEOUT,
+ __entry->device_control_2 & PCI_EXP_DEVCTL2_COMP_TMOUT_DIS ?
+ "True" : "False")
);
/*
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index a39193213ff2..54160ed2a8c9 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -787,6 +787,7 @@
#define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */
#define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */
#define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */
+#define PCI_ERR_CAP_CTO_LOGC 0x00001000 /* Completion Timeout Prefix/Header Log Capable */
#define PCI_ERR_HEADER_LOG 0x1c /* Header Log Register (16 bytes) */
#define PCI_ERR_ROOT_COMMAND 0x2c /* Root Error Command */
#define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 /* Correctable Err Reporting Enable */
--
2.42.0
prev parent reply other threads:[~2024-01-25 11:05 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-01-25 6:27 [PATCH v2 0/4] PCI/AER: Handle Advisory Non-Fatal properly Wang, Qingshun
2024-01-25 6:27 ` [PATCH v2 1/4] PCI/AER: Store more information in aer_err_info Wang, Qingshun
2024-01-31 2:26 ` Kuppuswamy Sathyanarayanan
2024-01-31 8:04 ` Wang, Qingshun
2024-02-05 23:12 ` Bjorn Helgaas
2024-02-06 16:41 ` Wang, Qingshun
2024-02-06 17:23 ` Bjorn Helgaas
2024-02-08 16:16 ` Wang, Qingshun
2024-01-25 6:28 ` [PATCH v2 2/4] PCI/AER: Handle Advisory Non-Fatal properly Wang, Qingshun
2024-02-05 23:26 ` Bjorn Helgaas
2024-02-06 16:46 ` Wang, Qingshun
2024-01-25 6:28 ` [PATCH v2 3/4] PCI/AER: Fetch information for FTrace Wang, Qingshun
2024-02-02 18:01 ` Dan Williams
2024-02-03 4:59 ` Wang, Qingshun
2024-01-25 6:28 ` Wang, Qingshun [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240125062802.50819-5-qingshun.wang@linux.intel.com \
--to=qingshun.wang@linux.intel.com \
--cc=Smita.KoralahalliChannabasappa@amd.com \
--cc=adam.c.preble@intel.com \
--cc=alison.schofield@intel.com \
--cc=bhelgaas@google.com \
--cc=bp@alien8.de \
--cc=chao.p.peng@linux.intel.com \
--cc=dan.j.williams@intel.com \
--cc=dave.jiang@intel.com \
--cc=dave@stgolabs.net \
--cc=erwin.tsaur@intel.com \
--cc=feiting.wanyan@intel.com \
--cc=helgaas@kernel.org \
--cc=ira.weiny@intel.com \
--cc=james.morse@arm.com \
--cc=jonathan.cameron@huawei.com \
--cc=lenb@kernel.org \
--cc=leoyang.li@nxp.com \
--cc=linmiaohe@huawei.com \
--cc=linux-acpi@vger.kernel.org \
--cc=linux-cxl@vger.kernel.org \
--cc=linux-edac@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-pci@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=lukas@wunner.de \
--cc=mahesh@linux.ibm.com \
--cc=oohall@gmail.com \
--cc=qingshun.wang@intel.com \
--cc=rafael@kernel.org \
--cc=rrichter@amd.com \
--cc=sathyanarayanan.kuppuswamy@linux.intel.com \
--cc=shiju.jose@huawei.com \
--cc=tony.luck@intel.com \
--cc=vishal.l.verma@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).