* [RFC 1/4] PCI/AER: Use the same log level for all messages
2024-12-12 14:27 [RFC 0/4] Rate limit PCIe Correctable Errors Karolina Stolarek
@ 2024-12-12 14:27 ` Karolina Stolarek
2024-12-12 14:27 ` [RFC 2/4] PCI/AER: Add Correctable Errors rate limiting Karolina Stolarek
` (3 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: Karolina Stolarek @ 2024-12-12 14:27 UTC (permalink / raw)
To: linux-pci; +Cc: Bjorn Helgaas
When reporting an AER error, we check its type multiple times
to determine the log level for each message. Do this check only
in the top-level function and propagate the result down the call
chain. Make aer_print_port_info output to match the level of the
reported error.
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
---
drivers/pci/pci.h | 2 +-
drivers/pci/pcie/aer.c | 64 ++++++++++++++++++++++--------------------
drivers/pci/pcie/dpc.c | 2 +-
3 files changed, 36 insertions(+), 32 deletions(-)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 2e40fc63ba31..139ea4f01448 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -546,7 +546,7 @@ struct aer_err_info {
};
int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
-void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
+void aer_print_error(struct pci_dev *dev, struct aer_err_info *info, const char *level);
#endif /* CONFIG_PCIEAER */
#ifdef CONFIG_PCIEPORTBUS
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 80c5ba8d8296..b13690fd172f 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -672,20 +672,18 @@ static void __print_tlp_header(struct pci_dev *dev, struct pcie_tlp_log *t)
}
static void __aer_print_error(struct pci_dev *dev,
- struct aer_err_info *info)
+ struct aer_err_info *info,
+ const char *level)
{
const char **strings;
unsigned long status = info->status & ~info->mask;
- const char *level, *errmsg;
+ const char *errmsg;
int i;
- if (info->severity == AER_CORRECTABLE) {
+ if (info->severity == AER_CORRECTABLE)
strings = aer_correctable_error_string;
- level = KERN_WARNING;
- } else {
+ else
strings = aer_uncorrectable_error_string;
- level = KERN_ERR;
- }
for_each_set_bit(i, &status, 32) {
errmsg = strings[i];
@@ -698,11 +696,11 @@ static void __aer_print_error(struct pci_dev *dev,
pci_dev_aer_stats_incr(dev, info);
}
-void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
+void aer_print_error(struct pci_dev *dev, struct aer_err_info *info,
+ const char *level)
{
int layer, agent;
int id = pci_dev_id(dev);
- const char *level;
if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
@@ -713,8 +711,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
layer = AER_GET_LAYER_ERROR(info->severity, info->status);
agent = AER_GET_AGENT(info->severity, info->status);
- level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
-
pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
aer_error_severity_string[info->severity],
aer_error_layer[layer], aer_agent_string[agent]);
@@ -722,7 +718,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
pci_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n",
dev->vendor, dev->device, info->status, info->mask);
- __aer_print_error(dev, info);
+ __aer_print_error(dev, info, level);
if (info->tlp_header_valid)
__print_tlp_header(dev, &info->tlp);
@@ -735,16 +731,17 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
info->severity, info->tlp_header_valid, &info->tlp);
}
-static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
+static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info,
+ const char *level)
{
u8 bus = info->id >> 8;
u8 devfn = info->id & 0xff;
- pci_info(dev, "%s%s error message received from %04x:%02x:%02x.%d\n",
- info->multi_error_valid ? "Multiple " : "",
- aer_error_severity_string[info->severity],
- pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
- PCI_FUNC(devfn));
+ pci_printk(level, dev, "%s%s error message received from %04x:%02x:%02x.%d\n",
+ info->multi_error_valid ? "Multiple " : "",
+ aer_error_severity_string[info->severity],
+ pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn),
+ PCI_FUNC(devfn));
}
#ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -767,15 +764,18 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
{
int layer, agent, tlp_header_valid = 0;
u32 status, mask;
+ const char *level;
struct aer_err_info info;
if (aer_severity == AER_CORRECTABLE) {
status = aer->cor_status;
mask = aer->cor_mask;
+ level = KERN_WARNING;
} else {
status = aer->uncor_status;
mask = aer->uncor_mask;
tlp_header_valid = status & AER_LOG_TLP_MASKS;
+ level = KERN_ERR;
}
layer = AER_GET_LAYER_ERROR(aer_severity, status);
@@ -787,14 +787,14 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
info.mask = mask;
info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
- pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
- __aer_print_error(dev, &info);
- pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
- aer_error_layer[layer], aer_agent_string[agent]);
+ pci_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
+ __aer_print_error(dev, &info, level);
+ pci_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
+ aer_error_layer[layer], aer_agent_string[agent]);
if (aer_severity != AER_CORRECTABLE)
- pci_err(dev, "aer_uncor_severity: 0x%08x\n",
- aer->uncor_severity);
+ pci_printk(level, dev, "aer_uncor_severity: 0x%08x\n",
+ aer->uncor_severity);
if (tlp_header_valid)
__print_tlp_header(dev, &aer->header_log);
@@ -1255,14 +1255,15 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
return 1;
}
-static inline void aer_process_err_devices(struct aer_err_info *e_info)
+static inline void aer_process_err_devices(struct aer_err_info *e_info,
+ const char *level)
{
int i;
/* Report all before handle them, not to lost records by reset etc. */
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
if (aer_get_device_error_info(e_info->dev[i], e_info))
- aer_print_error(e_info->dev[i], e_info);
+ aer_print_error(e_info->dev[i], e_info, level);
}
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
if (aer_get_device_error_info(e_info->dev[i], e_info))
@@ -1280,6 +1281,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
{
struct pci_dev *pdev = rpc->rpd;
struct aer_err_info e_info;
+ const char *level;
pci_rootport_aer_stats_incr(pdev, e_src);
@@ -1290,19 +1292,21 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
if (e_src->status & PCI_ERR_ROOT_COR_RCV) {
e_info.id = ERR_COR_ID(e_src->id);
e_info.severity = AER_CORRECTABLE;
+ level = KERN_WARNING;
if (e_src->status & PCI_ERR_ROOT_MULTI_COR_RCV)
e_info.multi_error_valid = 1;
else
e_info.multi_error_valid = 0;
- aer_print_port_info(pdev, &e_info);
+ aer_print_port_info(pdev, &e_info, level);
if (find_source_device(pdev, &e_info))
- aer_process_err_devices(&e_info);
+ aer_process_err_devices(&e_info, level);
}
if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
e_info.id = ERR_UNCOR_ID(e_src->id);
+ level = KERN_ERR;
if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
e_info.severity = AER_FATAL;
@@ -1314,10 +1318,10 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
else
e_info.multi_error_valid = 0;
- aer_print_port_info(pdev, &e_info);
+ aer_print_port_info(pdev, &e_info, level);
if (find_source_device(pdev, &e_info))
- aer_process_err_devices(&e_info);
+ aer_process_err_devices(&e_info, level);
}
}
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 2b6ef7efa3c1..9e48d571d9e7 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -291,7 +291,7 @@ void dpc_process_error(struct pci_dev *pdev)
else if (reason == PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR &&
dpc_get_aer_uncorrect_severity(pdev, &info) &&
aer_get_device_error_info(pdev, &info)) {
- aer_print_error(pdev, &info);
+ aer_print_error(pdev, &info, KERN_ERR);
pci_aer_clear_nonfatal_status(pdev);
pci_aer_clear_fatal_status(pdev);
}
--
2.43.5
^ permalink raw reply related [flat|nested] 7+ messages in thread* [RFC 2/4] PCI/AER: Add Correctable Errors rate limiting
2024-12-12 14:27 [RFC 0/4] Rate limit PCIe Correctable Errors Karolina Stolarek
2024-12-12 14:27 ` [RFC 1/4] PCI/AER: Use the same log level for all messages Karolina Stolarek
@ 2024-12-12 14:27 ` Karolina Stolarek
2024-12-12 14:27 ` [RFC 3/4] PCI/AER: Increase the rate limit interval after threshold Karolina Stolarek
` (2 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: Karolina Stolarek @ 2024-12-12 14:27 UTC (permalink / raw)
To: linux-pci; +Cc: Bjorn Helgaas
In the case of a compromised Link integrity, we may see excessive
logging of Correctable Errors. This kind of errors is handled by
the hardware, so the messages are purely informational. It should
suffice to report the error once in a while, and inform how many
messages were suppressed over that time.
Add a ratelimit_state to control the number of printed Correctable
Errors per Root Port and check it each time a Correctable Error is
to be reported.
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
---
drivers/pci/pcie/aer.c | 44 ++++++++++++++++++++++++++++--------------
include/linux/pci.h | 1 +
2 files changed, 31 insertions(+), 14 deletions(-)
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index b13690fd172f..5c34cc2b5bf3 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -40,6 +40,8 @@
#define AER_MAX_TYPEOF_COR_ERRS 16 /* as per PCI_ERR_COR_STATUS */
#define AER_MAX_TYPEOF_UNCOR_ERRS 27 /* as per PCI_ERR_UNCOR_STATUS*/
+#define AER_COR_ERR_INTERVAL (2 * HZ)
+
struct aer_err_source {
u32 status; /* PCI_ERR_ROOT_STATUS */
u32 id; /* PCI_ERR_ROOT_ERR_SRC */
@@ -375,6 +377,9 @@ void pci_aer_init(struct pci_dev *dev)
dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
+ /* Allow Root Port to report a Correctable Error message every 2 seconds */
+ ratelimit_state_init(&dev->cor_rs, AER_COR_ERR_INTERVAL, 1);
+
/*
* We save/restore PCI_ERR_UNCOR_MASK, PCI_ERR_UNCOR_SEVER,
* PCI_ERR_COR_MASK, and PCI_ERR_CAP. Root and Root Complex Event
@@ -766,11 +771,13 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
u32 status, mask;
const char *level;
struct aer_err_info info;
+ bool no_ratelimit = true;
if (aer_severity == AER_CORRECTABLE) {
status = aer->cor_status;
mask = aer->cor_mask;
level = KERN_WARNING;
+ no_ratelimit = __ratelimit(&dev->cor_rs);
} else {
status = aer->uncor_status;
mask = aer->uncor_mask;
@@ -787,17 +794,20 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
info.mask = mask;
info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
- pci_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
- __aer_print_error(dev, &info, level);
- pci_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
- aer_error_layer[layer], aer_agent_string[agent]);
+ if (no_ratelimit) {
+ pci_printk(level, dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
+ status, mask);
+ __aer_print_error(dev, &info, level);
+ pci_printk(level, dev, "aer_layer=%s, aer_agent=%s\n",
+ aer_error_layer[layer], aer_agent_string[agent]);
- if (aer_severity != AER_CORRECTABLE)
- pci_printk(level, dev, "aer_uncor_severity: 0x%08x\n",
- aer->uncor_severity);
+ if (aer_severity != AER_CORRECTABLE)
+ pci_printk(level, dev, "aer_uncor_severity: 0x%08x\n",
+ aer->uncor_severity);
- if (tlp_header_valid)
- __print_tlp_header(dev, &aer->header_log);
+ if (tlp_header_valid)
+ __print_tlp_header(dev, &aer->header_log);
+ }
trace_aer_event(dev_name(&dev->dev), (status & ~mask),
aer_severity, tlp_header_valid, &aer->header_log);
@@ -1256,13 +1266,14 @@ int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
}
static inline void aer_process_err_devices(struct aer_err_info *e_info,
- const char *level)
+ const char *level,
+ bool no_ratelimit)
{
int i;
/* Report all before handle them, not to lost records by reset etc. */
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
- if (aer_get_device_error_info(e_info->dev[i], e_info))
+ if (aer_get_device_error_info(e_info->dev[i], e_info) && no_ratelimit)
aer_print_error(e_info->dev[i], e_info, level);
}
for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
@@ -1282,6 +1293,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
struct pci_dev *pdev = rpc->rpd;
struct aer_err_info e_info;
const char *level;
+ bool no_ratelimit = true;
pci_rootport_aer_stats_incr(pdev, e_src);
@@ -1298,10 +1310,14 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
e_info.multi_error_valid = 1;
else
e_info.multi_error_valid = 0;
- aer_print_port_info(pdev, &e_info, level);
+
+ no_ratelimit = __ratelimit(&pdev->cor_rs);
+
+ if (no_ratelimit)
+ aer_print_port_info(pdev, &e_info, level);
if (find_source_device(pdev, &e_info))
- aer_process_err_devices(&e_info, level);
+ aer_process_err_devices(&e_info, level, no_ratelimit);
}
if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
@@ -1321,7 +1337,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
aer_print_port_info(pdev, &e_info, level);
if (find_source_device(pdev, &e_info))
- aer_process_err_devices(&e_info, level);
+ aer_process_err_devices(&e_info, level, no_ratelimit);
}
}
diff --git a/include/linux/pci.h b/include/linux/pci.h
index db9b47ce3eef..3dfa2aac31b4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -347,6 +347,7 @@ struct pci_dev {
#ifdef CONFIG_PCIEAER
u16 aer_cap; /* AER capability offset */
struct aer_stats *aer_stats; /* AER stats for this device */
+ struct ratelimit_state cor_rs; /* Correctable Errors Ratelimit */
#endif
#ifdef CONFIG_PCIEPORTBUS
struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */
--
2.43.5
^ permalink raw reply related [flat|nested] 7+ messages in thread* [RFC 3/4] PCI/AER: Increase the rate limit interval after threshold
2024-12-12 14:27 [RFC 0/4] Rate limit PCIe Correctable Errors Karolina Stolarek
2024-12-12 14:27 ` [RFC 1/4] PCI/AER: Use the same log level for all messages Karolina Stolarek
2024-12-12 14:27 ` [RFC 2/4] PCI/AER: Add Correctable Errors rate limiting Karolina Stolarek
@ 2024-12-12 14:27 ` Karolina Stolarek
2024-12-12 14:27 ` [RFC 4/4] PCI: Add 'cor_err_reporting_enable' attribute Karolina Stolarek
2024-12-16 10:44 ` [RFC 0/4] Rate limit PCIe Correctable Errors Jonathan Cameron
4 siblings, 0 replies; 7+ messages in thread
From: Karolina Stolarek @ 2024-12-12 14:27 UTC (permalink / raw)
To: linux-pci; +Cc: Bjorn Helgaas
In extreme circumstances, the default rate limit might not
be enough and a longer timeout is needed. To avoid spamming
the logs, update the interval to 30 seconds for the specific
Root Port after it observes over 1000 Correctable Errors.
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
---
drivers/pci/pcie/aer.c | 22 +++++++++++++++++++++-
include/linux/pci.h | 1 +
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 5c34cc2b5bf3..98bf8bbadc07 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -40,7 +40,9 @@
#define AER_MAX_TYPEOF_COR_ERRS 16 /* as per PCI_ERR_COR_STATUS */
#define AER_MAX_TYPEOF_UNCOR_ERRS 27 /* as per PCI_ERR_UNCOR_STATUS*/
+#define AER_COR_ERR_THRESHOLD 1000
#define AER_COR_ERR_INTERVAL (2 * HZ)
+#define AER_COR_ERR_LONG_INTERVAL (30 * HZ)
struct aer_err_source {
u32 status; /* PCI_ERR_ROOT_STATUS */
@@ -670,6 +672,24 @@ static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
}
}
+static bool report_aer_cor_err(struct pci_dev *pdev)
+{
+ struct ratelimit_state *rs = &pdev->cor_rs;
+ struct aer_stats *aer_stats = pdev->aer_stats;
+ unsigned int total_cor_errs = aer_stats->rootport_total_cor_errs;
+
+ /* A significant number of errors reported, increase the rate limit */
+ if (total_cor_errs > AER_COR_ERR_THRESHOLD && !pdev->cor_err_throttled) {
+ pci_warn(pdev,
+ "Over %d Correctable Errors reported, increasing the rate limit",
+ AER_COR_ERR_THRESHOLD);
+ rs->interval = AER_COR_ERR_LONG_INTERVAL;
+ pdev->cor_err_throttled = 1;
+ }
+
+ return __ratelimit(&pdev->cor_rs);
+}
+
static void __print_tlp_header(struct pci_dev *dev, struct pcie_tlp_log *t)
{
pci_err(dev, " TLP Header: %08x %08x %08x %08x\n",
@@ -1311,7 +1331,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
else
e_info.multi_error_valid = 0;
- no_ratelimit = __ratelimit(&pdev->cor_rs);
+ no_ratelimit = report_aer_cor_err(pdev);
if (no_ratelimit)
aer_print_port_info(pdev, &e_info, level);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 3dfa2aac31b4..b01bfb339e4e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -348,6 +348,7 @@ struct pci_dev {
u16 aer_cap; /* AER capability offset */
struct aer_stats *aer_stats; /* AER stats for this device */
struct ratelimit_state cor_rs; /* Correctable Errors Ratelimit */
+ unsigned int cor_err_throttled:1;
#endif
#ifdef CONFIG_PCIEPORTBUS
struct rcec_ea *rcec_ea; /* RCEC cached endpoint association */
--
2.43.5
^ permalink raw reply related [flat|nested] 7+ messages in thread* [RFC 4/4] PCI: Add 'cor_err_reporting_enable' attribute
2024-12-12 14:27 [RFC 0/4] Rate limit PCIe Correctable Errors Karolina Stolarek
` (2 preceding siblings ...)
2024-12-12 14:27 ` [RFC 3/4] PCI/AER: Increase the rate limit interval after threshold Karolina Stolarek
@ 2024-12-12 14:27 ` Karolina Stolarek
2024-12-16 10:44 ` [RFC 0/4] Rate limit PCIe Correctable Errors Jonathan Cameron
4 siblings, 0 replies; 7+ messages in thread
From: Karolina Stolarek @ 2024-12-12 14:27 UTC (permalink / raw)
To: linux-pci; +Cc: Bjorn Helgaas
In some cases, the number of Correctable Error messages is overwhelming,
and even with the rate limit imposed, they fill up the logs. The system
cannot do much about such errors, so a user might wish to silence them
completely.
Add a sysfs attribute to control reporting of the Correctable Error
Messages per device.
Signed-off-by: Karolina Stolarek <karolina.stolarek@oracle.com>
---
Documentation/ABI/testing/sysfs-bus-pci | 7 +++++
drivers/pci/pci-sysfs.c | 42 +++++++++++++++++++++++++
2 files changed, 49 insertions(+)
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 5da6a14dc326..dba72ee37ce4 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -479,6 +479,13 @@ Description:
The file is writable if the PF is bound to a driver that
implements ->sriov_set_msix_vec_count().
+What: /sys/bus/pci/devices/.../cor_err_reporting_enable
+Date: December 2024
+Contact: Linux PCI developers <linux-pci@vger.kernel.org>
+Description:
+ This file exposes a bit to control sending of Correctable Error
+ Messages. The value comes from the Device Control register.
+
What: /sys/bus/pci/devices/.../resourceN_resize
Date: September 2022
Contact: Alex Williamson <alex.williamson@redhat.com>
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 6f1bb7514efb..f7f0d7971ad7 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -186,6 +186,47 @@ static ssize_t resource_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(resource);
+static ssize_t cor_err_reporting_enable_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ u16 reg;
+ int err;
+
+ err = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, ®);
+
+ if (err)
+ return pcibios_err_to_errno(err);
+
+ return sysfs_emit(buf, "%u\n", reg & PCI_EXP_DEVCTL_CERE);
+}
+
+static ssize_t cor_err_reporting_enable_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ u16 reg;
+ u8 val;
+ int err;
+
+ if (kstrtou8(buf, 0, &val) < 0)
+ return -EINVAL;
+
+ pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, ®);
+
+ reg &= ~PCI_EXP_DEVCTL_CERE;
+ reg |= val;
+ err = pcie_capability_write_word(pdev, PCI_EXP_DEVCTL, reg);
+
+ if (err)
+ return pcibios_err_to_errno(err);
+
+ return count;
+}
+static DEVICE_ATTR_RW(cor_err_reporting_enable);
+
static ssize_t max_link_speed_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -659,6 +700,7 @@ static struct attribute *pcie_dev_attrs[] = {
&dev_attr_current_link_width.attr,
&dev_attr_max_link_width.attr,
&dev_attr_max_link_speed.attr,
+ &dev_attr_cor_err_reporting_enable.attr,
NULL,
};
--
2.43.5
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [RFC 0/4] Rate limit PCIe Correctable Errors
2024-12-12 14:27 [RFC 0/4] Rate limit PCIe Correctable Errors Karolina Stolarek
` (3 preceding siblings ...)
2024-12-12 14:27 ` [RFC 4/4] PCI: Add 'cor_err_reporting_enable' attribute Karolina Stolarek
@ 2024-12-16 10:44 ` Jonathan Cameron
2024-12-17 8:34 ` Karolina Stolarek
4 siblings, 1 reply; 7+ messages in thread
From: Jonathan Cameron @ 2024-12-16 10:44 UTC (permalink / raw)
To: Karolina Stolarek; +Cc: linux-pci, Bjorn Helgaas
On Thu, 12 Dec 2024 14:27:28 +0000
Karolina Stolarek <karolina.stolarek@oracle.com> wrote:
> TL;DR
> ====
>
> We are getting multiple reports about excessive logging of Correctable
> Errors with no clear common root cause. As these errors are already
> corrected by hardware, it makes sense to limit them. Introduce
> a ratelimit state definition to pci_dev to control the number of
> messages reported by a Root Port within a specified time interval.
> The series adds other improvements in the area, as outlined in the
> Proposal section.
Hi Karolina,
Just to check, this doesn't affect tracepoints? From a quick read
of the patches they look like they will still be triggered so monitoring
tools will see the correctable errors. That's definitely the right
option even if we limit prints to the kernel log.
Assuming I read it right, change the series title to make it clear this
is just the prints to the kernel log that you are touching.
Thanks,
Jonathan
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [RFC 0/4] Rate limit PCIe Correctable Errors
2024-12-16 10:44 ` [RFC 0/4] Rate limit PCIe Correctable Errors Jonathan Cameron
@ 2024-12-17 8:34 ` Karolina Stolarek
0 siblings, 0 replies; 7+ messages in thread
From: Karolina Stolarek @ 2024-12-17 8:34 UTC (permalink / raw)
To: Jonathan Cameron; +Cc: linux-pci, Bjorn Helgaas
Hi Jonathan,
Many thanks for taking a look at the patches.
On 16/12/2024 11:44, Jonathan Cameron wrote:
>
> Hi Karolina,
>
> Just to check, this doesn't affect tracepoints? From a quick read
> of the patches they look like they will still be triggered so monitoring
> tools will see the correctable errors. That's definitely the right
> option even if we limit prints to the kernel log.
The tracing seems to be working -- rasdaemon recorded every correctable
error despite that we didn't print all of them to the kernel log.
> Assuming I read it right, change the series title to make it clear this
> is just the prints to the kernel log that you are touching.
OK, will do that in the next version.
All the best,
Karolina
>
> Thanks,
>
> Jonathan
^ permalink raw reply [flat|nested] 7+ messages in thread